1/*
2 * Copyright (c) 1998-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/filedesc.h>
73#include <sys/proc.h>
74#include <sys/proc_internal.h>
75#include <sys/kauth.h>
76#include <sys/file_internal.h>
77#include <sys/fcntl.h>
78#include <sys/malloc.h>
79#include <sys/mbuf.h>
80#include <sys/domain.h>
81#include <sys/kernel.h>
82#include <sys/event.h>
83#include <sys/poll.h>
84#include <sys/protosw.h>
85#include <sys/socket.h>
86#include <sys/socketvar.h>
87#include <sys/resourcevar.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
90#include <sys/syslog.h>
91#include <sys/uio.h>
92#include <sys/uio_internal.h>
93#include <sys/ev.h>
94#include <sys/kdebug.h>
95#include <sys/un.h>
96#include <sys/user.h>
97#include <sys/priv.h>
98#include <sys/kern_event.h>
99#include <net/route.h>
100#include <net/init.h>
101#include <net/net_api_stats.h>
102#include <net/ntstat.h>
103#include <net/content_filter.h>
104#include <netinet/in.h>
105#include <netinet/in_pcb.h>
106#include <netinet/in_tclass.h>
107#include <netinet/tcp_var.h>
108#include <netinet/ip6.h>
109#include <netinet6/ip6_var.h>
110#include <netinet/flow_divert.h>
111#include <kern/zalloc.h>
112#include <kern/locks.h>
113#include <machine/limits.h>
114#include <libkern/OSAtomic.h>
115#include <pexpert/pexpert.h>
116#include <kern/assert.h>
117#include <kern/task.h>
118#include <kern/policy_internal.h>
119
120#include <sys/kpi_mbuf.h>
121#include <sys/mcache.h>
122#include <sys/unpcb.h>
123#include <libkern/section_keywords.h>
124
125#if CONFIG_MACF
126#include <security/mac_framework.h>
127#endif /* MAC */
128
129#if MULTIPATH
130#include <netinet/mp_pcb.h>
131#include <netinet/mptcp_var.h>
132#endif /* MULTIPATH */
133
134#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
135
136#if DEBUG || DEVELOPMENT
137#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
138#else
139#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
140#endif
141
142/* TODO: this should be in a header file somewhere */
143extern char *proc_name_address(void *p);
144
145static u_int32_t so_cache_hw; /* High water mark for socache */
146static u_int32_t so_cache_timeouts; /* number of timeouts */
147static u_int32_t so_cache_max_freed; /* max freed per timeout */
148static u_int32_t cached_sock_count = 0;
149STAILQ_HEAD(, socket) so_cache_head;
150int max_cached_sock_count = MAX_CACHED_SOCKETS;
151static u_int32_t so_cache_time;
152static int socketinit_done;
153static struct zone *so_cache_zone;
154
155static lck_grp_t *so_cache_mtx_grp;
156static lck_attr_t *so_cache_mtx_attr;
157static lck_grp_attr_t *so_cache_mtx_grp_attr;
158static lck_mtx_t *so_cache_mtx;
159
160#include <machine/limits.h>
161
162static int filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
163static void filt_sordetach(struct knote *kn);
164static int filt_soread(struct knote *kn, long hint);
165static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
166static int filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
167
168static int filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
169static void filt_sowdetach(struct knote *kn);
170static int filt_sowrite(struct knote *kn, long hint);
171static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
172static int filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
173
174static int filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
175static void filt_sockdetach(struct knote *kn);
176static int filt_sockev(struct knote *kn, long hint);
177static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
178static int filt_sockprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev);
179
180static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
181static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
182
183SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
184 .f_isfd = 1,
185 .f_attach = filt_sorattach,
186 .f_detach = filt_sordetach,
187 .f_event = filt_soread,
188 .f_touch = filt_sortouch,
189 .f_process = filt_sorprocess,
190};
191
192SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
193 .f_isfd = 1,
194 .f_attach = filt_sowattach,
195 .f_detach = filt_sowdetach,
196 .f_event = filt_sowrite,
197 .f_touch = filt_sowtouch,
198 .f_process = filt_sowprocess,
199};
200
201SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
202 .f_isfd = 1,
203 .f_attach = filt_sockattach,
204 .f_detach = filt_sockdetach,
205 .f_event = filt_sockev,
206 .f_touch = filt_socktouch,
207 .f_process = filt_sockprocess,
208};
209
210SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
211 .f_isfd = 1,
212 .f_attach = filt_sorattach,
213 .f_detach = filt_sordetach,
214 .f_event = filt_soread,
215 .f_touch = filt_sortouch,
216 .f_process = filt_sorprocess,
217};
218
219SYSCTL_DECL(_kern_ipc);
220
221#define EVEN_MORE_LOCKING_DEBUG 0
222
223int socket_debug = 0;
224SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
225 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
226
227static unsigned long sodefunct_calls = 0;
228SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
229 &sodefunct_calls, "");
230
231static int socket_zone = M_SOCKET;
232so_gen_t so_gencnt; /* generation count for sockets */
233
234MALLOC_DEFINE(M_SONAME, "soname", "socket name");
235MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236
237#define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
238#define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
239#define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
240#define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
241#define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
242#define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
243#define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244#define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
245#define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246
247#define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248
249int somaxconn = SOMAXCONN;
250SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
252
253/* Should we get a maximum also ??? */
254static int sosendmaxchain = 65536;
255static int sosendminchain = 16384;
256static int sorecvmincopy = 16384;
257SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
259SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
261
262/*
263 * Set to enable jumbo clusters (if available) for large writes when
264 * the socket is marked with SOF_MULTIPAGES; see below.
265 */
266int sosendjcl = 1;
267SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
269
270/*
271 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272 * writes on the socket for all protocols on any network interfaces,
273 * depending upon sosendjcl above. Be extra careful when setting this
274 * to 1, because sending down packets that cross physical pages down to
275 * broken drivers (those that falsely assume that the physical pages
276 * are contiguous) might lead to system panics or silent data corruption.
277 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
278 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279 * capable. Set this to 1 only for testing/debugging purposes.
280 */
281int sosendjcl_ignore_capab = 0;
282SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
284
285/*
286 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
287 * writes on the socket for all protocols on any network interfaces.
288 * Be extra careful when setting this to 1, because sending down packets with
289 * clusters larger that 2 KB might lead to system panics or data corruption.
290 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
291 * on the outgoing interface
292 * Set this to 1 for testing/debugging purposes only.
293 */
294int sosendbigcl_ignore_capab = 0;
295SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
297
298int sodefunctlog = 0;
299SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
300 &sodefunctlog, 0, "");
301
302int sothrottlelog = 0;
303SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
304 &sothrottlelog, 0, "");
305
306int sorestrictrecv = 1;
307SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
308 &sorestrictrecv, 0, "Enable inbound interface restrictions");
309
310int sorestrictsend = 1;
311SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
312 &sorestrictsend, 0, "Enable outbound interface restrictions");
313
314int soreserveheadroom = 1;
315SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
316 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
317
318#if (DEBUG || DEVELOPMENT)
319int so_notsent_lowat_check = 1;
320SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW|CTLFLAG_LOCKED,
321 &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
322#endif /* DEBUG || DEVELOPMENT */
323
324int so_accept_list_waits = 0;
325#if (DEBUG || DEVELOPMENT)
326SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW|CTLFLAG_LOCKED,
327 &so_accept_list_waits, 0, "number of waits for listener incomp list");
328#endif /* DEBUG || DEVELOPMENT */
329
330extern struct inpcbinfo tcbinfo;
331
332/* TODO: these should be in header file */
333extern int get_inpcb_str_size(void);
334extern int get_tcp_str_size(void);
335
336vm_size_t so_cache_zone_element_size;
337
338static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
339 user_ssize_t *);
340static void cached_sock_alloc(struct socket **, int);
341static void cached_sock_free(struct socket *);
342
343/*
344 * Maximum of extended background idle sockets per process
345 * Set to zero to disable further setting of the option
346 */
347
348#define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
349#define SO_IDLE_BK_IDLE_TIME 600
350#define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351
352struct soextbkidlestat soextbkidlestat;
353
354SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
356 "Maximum of extended background idle sockets per process");
357
358SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
359 &soextbkidlestat.so_xbkidle_time, 0,
360 "Time in seconds to keep extended background idle sockets");
361
362SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
363 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
364 "High water mark for extended background idle sockets");
365
366SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
367 &soextbkidlestat, soextbkidlestat, "");
368
369int so_set_extended_bk_idle(struct socket *, int);
370
371
372/*
373 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
374 * setting the DSCP code on the packet based on the service class; see
375 * <rdar://problem/11277343> for details.
376 */
377__private_extern__ u_int32_t sotcdb = 0;
378SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
379 &sotcdb, 0, "");
380
381void
382socketinit(void)
383{
384 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
385 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
386
387#ifdef __LP64__
388 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
389 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
390 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
391 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
392 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
393 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
394#else
395 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
396 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
397 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
398 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
399 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
400 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
401#endif
402
403 if (socketinit_done) {
404 printf("socketinit: already called...\n");
405 return;
406 }
407 socketinit_done = 1;
408
409 PE_parse_boot_argn("socket_debug", &socket_debug,
410 sizeof (socket_debug));
411
412 /*
413 * allocate lock group attribute and group for socket cache mutex
414 */
415 so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
416 so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
417 so_cache_mtx_grp_attr);
418
419 /*
420 * allocate the lock attribute for socket cache mutex
421 */
422 so_cache_mtx_attr = lck_attr_alloc_init();
423
424 /* cached sockets mutex */
425 so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
426 if (so_cache_mtx == NULL) {
427 panic("%s: unable to allocate so_cache_mtx\n", __func__);
428 /* NOTREACHED */
429 }
430 STAILQ_INIT(&so_cache_head);
431
432 so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + 4
433 + get_inpcb_str_size() + 4 + get_tcp_str_size());
434
435 so_cache_zone = zinit(so_cache_zone_element_size,
436 (120000 * so_cache_zone_element_size), 8192, "socache zone");
437 zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
438 zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
439
440 bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
441 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
442 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
443 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
444
445 in_pcbinit();
446 sflt_init();
447 socket_tclass_init();
448#if MULTIPATH
449 mp_pcbinit();
450#endif /* MULTIPATH */
451}
452
453static void
454cached_sock_alloc(struct socket **so, int waitok)
455{
456 caddr_t temp;
457 uintptr_t offset;
458
459 lck_mtx_lock(so_cache_mtx);
460
461 if (!STAILQ_EMPTY(&so_cache_head)) {
462 VERIFY(cached_sock_count > 0);
463
464 *so = STAILQ_FIRST(&so_cache_head);
465 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
466 STAILQ_NEXT((*so), so_cache_ent) = NULL;
467
468 cached_sock_count--;
469 lck_mtx_unlock(so_cache_mtx);
470
471 temp = (*so)->so_saved_pcb;
472 bzero((caddr_t)*so, sizeof (struct socket));
473
474 (*so)->so_saved_pcb = temp;
475 } else {
476
477 lck_mtx_unlock(so_cache_mtx);
478
479 if (waitok)
480 *so = (struct socket *)zalloc(so_cache_zone);
481 else
482 *so = (struct socket *)zalloc_noblock(so_cache_zone);
483
484 if (*so == NULL)
485 return;
486
487 bzero((caddr_t)*so, sizeof (struct socket));
488
489 /*
490 * Define offsets for extra structures into our
491 * single block of memory. Align extra structures
492 * on longword boundaries.
493 */
494
495 offset = (uintptr_t)*so;
496 offset += sizeof (struct socket);
497
498 offset = ALIGN(offset);
499
500 (*so)->so_saved_pcb = (caddr_t)offset;
501 offset += get_inpcb_str_size();
502
503 offset = ALIGN(offset);
504
505 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
506 (caddr_t)offset;
507 }
508
509 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
510}
511
512static void
513cached_sock_free(struct socket *so)
514{
515
516 lck_mtx_lock(so_cache_mtx);
517
518 so_cache_time = net_uptime();
519 if (++cached_sock_count > max_cached_sock_count) {
520 --cached_sock_count;
521 lck_mtx_unlock(so_cache_mtx);
522 zfree(so_cache_zone, so);
523 } else {
524 if (so_cache_hw < cached_sock_count)
525 so_cache_hw = cached_sock_count;
526
527 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
528
529 so->cache_timestamp = so_cache_time;
530 lck_mtx_unlock(so_cache_mtx);
531 }
532}
533
534void
535so_update_last_owner_locked(struct socket *so, proc_t self)
536{
537 if (so->last_pid != 0) {
538 /*
539 * last_pid and last_upid should remain zero for sockets
540 * created using sock_socket. The check above achieves that
541 */
542 if (self == PROC_NULL)
543 self = current_proc();
544
545 if (so->last_upid != proc_uniqueid(self) ||
546 so->last_pid != proc_pid(self)) {
547 so->last_upid = proc_uniqueid(self);
548 so->last_pid = proc_pid(self);
549 proc_getexecutableuuid(self, so->last_uuid,
550 sizeof (so->last_uuid));
551 }
552 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
553 }
554}
555
556void
557so_update_policy(struct socket *so)
558{
559 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
560 (void) inp_update_policy(sotoinpcb(so));
561}
562
#if NECP
/*
 * Refresh the NECP policy of an Internet (PF_INET/PF_INET6) socket,
 * passing the optional override addresses through to
 * inp_update_necp_policy().  Sockets in any other domain are ignored.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	switch (SOCK_DOM(so)) {
	case PF_INET:
	case PF_INET6:
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
		break;
	default:
		break;
	}
}
#endif /* NECP */
573
574boolean_t
575so_cache_timer(void)
576{
577 struct socket *p;
578 int n_freed = 0;
579 boolean_t rc = FALSE;
580
581 lck_mtx_lock(so_cache_mtx);
582 so_cache_timeouts++;
583 so_cache_time = net_uptime();
584
585 while (!STAILQ_EMPTY(&so_cache_head)) {
586 VERIFY(cached_sock_count > 0);
587 p = STAILQ_FIRST(&so_cache_head);
588 if ((so_cache_time - p->cache_timestamp) <
589 SO_CACHE_TIME_LIMIT)
590 break;
591
592 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
593 --cached_sock_count;
594
595 zfree(so_cache_zone, p);
596
597 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
598 so_cache_max_freed++;
599 break;
600 }
601 }
602
603 /* Schedule again if there is more to cleanup */
604 if (!STAILQ_EMPTY(&so_cache_head))
605 rc = TRUE;
606
607 lck_mtx_unlock(so_cache_mtx);
608 return (rc);
609}
610
611/*
612 * Get a socket structure from our zone, and initialize it.
613 * We don't implement `waitok' yet (see comments in uipc_domain.c).
614 * Note that it would probably be better to allocate socket
615 * and PCB at the same time, but I'm not convinced that all
616 * the protocols can be easily modified to do this.
617 */
618struct socket *
619soalloc(int waitok, int dom, int type)
620{
621 struct socket *so;
622
623 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
624 cached_sock_alloc(&so, waitok);
625 } else {
626 MALLOC_ZONE(so, struct socket *, sizeof (*so), socket_zone,
627 M_WAITOK);
628 if (so != NULL)
629 bzero(so, sizeof (*so));
630 }
631 if (so != NULL) {
632 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
633 so->so_zone = socket_zone;
634
635 /*
636 * Increment the socket allocation statistics
637 */
638 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
639
640#if CONFIG_MACF_SOCKET
641 /* Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. */
642 if (mac_socket_label_init(so, !waitok) != 0) {
643 sodealloc(so);
644 return (NULL);
645 }
646#endif /* MAC_SOCKET */
647 }
648
649 return (so);
650}
651
652int
653socreate_internal(int dom, struct socket **aso, int type, int proto,
654 struct proc *p, uint32_t flags, struct proc *ep)
655{
656 struct protosw *prp;
657 struct socket *so;
658 int error = 0;
659
660#if TCPDEBUG
661 extern int tcpconsdebug;
662#endif
663
664 VERIFY(aso != NULL);
665 *aso = NULL;
666
667 if (proto != 0)
668 prp = pffindproto(dom, proto, type);
669 else
670 prp = pffindtype(dom, type);
671
672 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
673 if (pffinddomain(dom) == NULL)
674 return (EAFNOSUPPORT);
675 if (proto != 0) {
676 if (pffindprotonotype(dom, proto) != NULL)
677 return (EPROTOTYPE);
678 }
679 return (EPROTONOSUPPORT);
680 }
681 if (prp->pr_type != type)
682 return (EPROTOTYPE);
683 so = soalloc(1, dom, type);
684 if (so == NULL)
685 return (ENOBUFS);
686
687 switch (dom) {
688 case PF_LOCAL:
689 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
690 break;
691 case PF_INET:
692 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
693 if (type == SOCK_STREAM) {
694 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
695 } else {
696 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
697 }
698 break;
699 case PF_ROUTE:
700 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
701 break;
702 case PF_NDRV:
703 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
704 break;
705 case PF_KEY:
706 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
707 break;
708 case PF_INET6:
709 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
710 if (type == SOCK_STREAM) {
711 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
712 } else {
713 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
714 }
715 break;
716 case PF_SYSTEM:
717 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
718 break;
719 case PF_MULTIPATH:
720 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
721 break;
722 default:
723 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
724 break;
725 }
726
727 if (flags & SOCF_ASYNC)
728 so->so_state |= SS_NBIO;
729
730 TAILQ_INIT(&so->so_incomp);
731 TAILQ_INIT(&so->so_comp);
732 so->so_type = type;
733 so->last_upid = proc_uniqueid(p);
734 so->last_pid = proc_pid(p);
735 proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
736 proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
737
738 if (ep != PROC_NULL && ep != p) {
739 so->e_upid = proc_uniqueid(ep);
740 so->e_pid = proc_pid(ep);
741 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
742 so->so_flags |= SOF_DELEGATED;
743 }
744
745 so->so_cred = kauth_cred_proc_ref(p);
746 if (!suser(kauth_cred_get(), NULL))
747 so->so_state |= SS_PRIV;
748
749 so->so_proto = prp;
750 so->so_rcv.sb_flags |= SB_RECV;
751 so->so_rcv.sb_so = so->so_snd.sb_so = so;
752 so->next_lock_lr = 0;
753 so->next_unlock_lr = 0;
754
755#if CONFIG_MACF_SOCKET
756 mac_socket_label_associate(kauth_cred_get(), so);
757#endif /* MAC_SOCKET */
758
759 /*
760 * Attachment will create the per pcb lock if necessary and
761 * increase refcount for creation, make sure it's done before
762 * socket is inserted in lists.
763 */
764 so->so_usecount++;
765
766 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
767 if (error != 0) {
768 /*
769 * Warning:
770 * If so_pcb is not zero, the socket will be leaked,
771 * so protocol attachment handler must be coded carefuly
772 */
773 so->so_state |= SS_NOFDREF;
774 VERIFY(so->so_usecount > 0);
775 so->so_usecount--;
776 sofreelastref(so, 1); /* will deallocate the socket */
777 return (error);
778 }
779
780 atomic_add_32(&prp->pr_domain->dom_refs, 1);
781 TAILQ_INIT(&so->so_evlist);
782
783 /* Attach socket filters for this protocol */
784 sflt_initsock(so);
785#if TCPDEBUG
786 if (tcpconsdebug == 2)
787 so->so_options |= SO_DEBUG;
788#endif
789 so_set_default_traffic_class(so);
790
791 /*
792 * If this thread or task is marked to create backgrounded sockets,
793 * mark the socket as background.
794 */
795 if (proc_get_effective_thread_policy(current_thread(),
796 TASK_POLICY_NEW_SOCKETS_BG)) {
797 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
798 so->so_background_thread = current_thread();
799 }
800
801 switch (dom) {
802 /*
803 * Don't mark Unix domain, system or multipath sockets as
804 * eligible for defunct by default.
805 */
806 case PF_LOCAL:
807 case PF_SYSTEM:
808 case PF_MULTIPATH:
809 so->so_flags |= SOF_NODEFUNCT;
810 break;
811 default:
812 break;
813 }
814
815 /*
816 * Entitlements can't be checked at socket creation time except if the
817 * application requested a feature guarded by a privilege (c.f., socket
818 * delegation).
819 * The priv(9) and the Sandboxing APIs are designed with the idea that
820 * a privilege check should only be triggered by a userland request.
821 * A privilege check at socket creation time is time consuming and
822 * could trigger many authorisation error messages from the security
823 * APIs.
824 */
825
826 *aso = so;
827
828 return (0);
829}
830
831/*
832 * Returns: 0 Success
833 * EAFNOSUPPORT
834 * EPROTOTYPE
835 * EPROTONOSUPPORT
836 * ENOBUFS
837 * <pru_attach>:ENOBUFS[AF_UNIX]
838 * <pru_attach>:ENOBUFS[TCP]
839 * <pru_attach>:ENOMEM[TCP]
840 * <pru_attach>:??? [other protocol families, IPSEC]
841 */
842int
843socreate(int dom, struct socket **aso, int type, int proto)
844{
845 return (socreate_internal(dom, aso, type, proto, current_proc(), 0,
846 PROC_NULL));
847}
848
849int
850socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
851{
852 int error = 0;
853 struct proc *ep = PROC_NULL;
854
855 if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
856 error = ESRCH;
857 goto done;
858 }
859
860 error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
861
862 /*
863 * It might not be wise to hold the proc reference when calling
864 * socreate_internal since it calls soalloc with M_WAITOK
865 */
866done:
867 if (ep != PROC_NULL)
868 proc_rele(ep);
869
870 return (error);
871}
872
873/*
874 * Returns: 0 Success
875 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
876 * <pru_bind>:EAFNOSUPPORT Address family not supported
877 * <pru_bind>:EADDRNOTAVAIL Address not available.
878 * <pru_bind>:EINVAL Invalid argument
879 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
880 * <pru_bind>:EACCES Permission denied
881 * <pru_bind>:EADDRINUSE Address in use
882 * <pru_bind>:EAGAIN Resource unavailable, try again
883 * <pru_bind>:EPERM Operation not permitted
884 * <pru_bind>:???
885 * <sf_bind>:???
886 *
887 * Notes: It's not possible to fully enumerate the return codes above,
888 * since socket filter authors and protocol family authors may
889 * not choose to limit their error returns to those listed, even
890 * though this may result in some software operating incorrectly.
891 *
892 * The error codes which are enumerated above are those known to
893 * be returned by the tcp_usr_bind function supplied.
894 */
895int
896sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
897{
898 struct proc *p = current_proc();
899 int error = 0;
900
901 if (dolock)
902 socket_lock(so, 1);
903
904 so_update_last_owner_locked(so, p);
905 so_update_policy(so);
906
907#if NECP
908 so_update_necp_policy(so, nam, NULL);
909#endif /* NECP */
910
911 /*
912 * If this is a bind request on a socket that has been marked
913 * as inactive, reject it now before we go any further.
914 */
915 if (so->so_flags & SOF_DEFUNCT) {
916 error = EINVAL;
917 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
918 __func__, proc_pid(p), proc_best_name(p),
919 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
920 SOCK_DOM(so), SOCK_TYPE(so), error);
921 goto out;
922 }
923
924 /* Socket filter */
925 error = sflt_bind(so, nam);
926
927 if (error == 0)
928 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
929out:
930 if (dolock)
931 socket_unlock(so, 1);
932
933 if (error == EJUSTRETURN)
934 error = 0;
935
936 return (error);
937}
938
939void
940sodealloc(struct socket *so)
941{
942 kauth_cred_unref(&so->so_cred);
943
944 /* Remove any filters */
945 sflt_termsock(so);
946
947#if CONTENT_FILTER
948 cfil_sock_detach(so);
949#endif /* CONTENT_FILTER */
950
951 /* Delete the state allocated for msg queues on a socket */
952 if (so->so_flags & SOF_ENABLE_MSGS) {
953 FREE(so->so_msg_state, M_TEMP);
954 so->so_msg_state = NULL;
955 }
956 VERIFY(so->so_msg_state == NULL);
957
958 so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
959
960#if CONFIG_MACF_SOCKET
961 mac_socket_label_destroy(so);
962#endif /* MAC_SOCKET */
963
964 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
965 cached_sock_free(so);
966 } else {
967 FREE_ZONE(so, sizeof (*so), so->so_zone);
968 }
969}
970
971/*
972 * Returns: 0 Success
973 * EINVAL
974 * EOPNOTSUPP
975 * <pru_listen>:EINVAL[AF_UNIX]
976 * <pru_listen>:EINVAL[TCP]
977 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
978 * <pru_listen>:EINVAL[TCP] Invalid argument
979 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
980 * <pru_listen>:EACCES[TCP] Permission denied
981 * <pru_listen>:EADDRINUSE[TCP] Address in use
982 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
983 * <pru_listen>:EPERM[TCP] Operation not permitted
984 * <sf_listen>:???
985 *
986 * Notes: Other <pru_listen> returns depend on the protocol family; all
987 * <sf_listen> returns depend on what the filter author causes
988 * their filter to return.
989 */
990int
991solisten(struct socket *so, int backlog)
992{
993 struct proc *p = current_proc();
994 int error = 0;
995
996 socket_lock(so, 1);
997
998 so_update_last_owner_locked(so, p);
999 so_update_policy(so);
1000
1001#if NECP
1002 so_update_necp_policy(so, NULL, NULL);
1003#endif /* NECP */
1004
1005 if (so->so_proto == NULL) {
1006 error = EINVAL;
1007 goto out;
1008 }
1009 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1010 error = EOPNOTSUPP;
1011 goto out;
1012 }
1013
1014 /*
1015 * If the listen request is made on a socket that is not fully
1016 * disconnected, or on a socket that has been marked as inactive,
1017 * reject the request now.
1018 */
1019 if ((so->so_state &
1020 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) ||
1021 (so->so_flags & SOF_DEFUNCT)) {
1022 error = EINVAL;
1023 if (so->so_flags & SOF_DEFUNCT) {
1024 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1025 "(%d)\n", __func__, proc_pid(p),
1026 proc_best_name(p),
1027 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1028 SOCK_DOM(so), SOCK_TYPE(so), error);
1029 }
1030 goto out;
1031 }
1032
1033 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1034 error = EPERM;
1035 goto out;
1036 }
1037
1038 error = sflt_listen(so);
1039 if (error == 0)
1040 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1041
1042 if (error) {
1043 if (error == EJUSTRETURN)
1044 error = 0;
1045 goto out;
1046 }
1047
1048 if (TAILQ_EMPTY(&so->so_comp))
1049 so->so_options |= SO_ACCEPTCONN;
1050 /*
1051 * POSIX: The implementation may have an upper limit on the length of
1052 * the listen queue-either global or per accepting socket. If backlog
1053 * exceeds this limit, the length of the listen queue is set to the
1054 * limit.
1055 *
1056 * If listen() is called with a backlog argument value that is less
1057 * than 0, the function behaves as if it had been called with a backlog
1058 * argument value of 0.
1059 *
1060 * A backlog argument of 0 may allow the socket to accept connections,
1061 * in which case the length of the listen queue may be set to an
1062 * implementation-defined minimum value.
1063 */
1064 if (backlog <= 0 || backlog > somaxconn)
1065 backlog = somaxconn;
1066
1067 so->so_qlimit = backlog;
1068out:
1069 socket_unlock(so, 1);
1070 return (error);
1071}
1072
1073/*
1074 * The "accept list lock" protects the fields related to the listener queues
1075 * because we can unlock a socket to respect the lock ordering between
1076 * the listener socket and its clients sockets. The lock ordering is first to
1077 * acquire the client socket before the listener socket.
1078 *
1079 * The accept list lock serializes access to the following fields:
1080 * - of the listener socket:
1081 * - so_comp
1082 * - so_incomp
1083 * - so_qlen
1084 * - so_inqlen
1085 * - of client sockets that are in so_comp or so_incomp:
1086 * - so_head
1087 * - so_list
1088 *
1089 * As one can see the accept list lock protects the consistent of the
1090 * linkage of the client sockets.
1091 *
1092 * Note that those fields may be read without holding the accept list lock
1093 * for a preflight provided the accept list lock is taken when committing
1094 * to take an action based on the result of the preflight. The preflight
1095 * saves the cost of doing the unlock/lock dance.
1096 */
1097void
1098so_acquire_accept_list(struct socket *head, struct socket *so)
1099{
1100 lck_mtx_t *mutex_held;
1101
1102 if (head->so_proto->pr_getlock == NULL) {
1103 return;
1104 }
1105 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1106 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1107
1108 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1109 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1110 return;
1111 }
1112 if (so != NULL) {
1113 socket_unlock(so, 0);
1114 }
1115 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1116 so_accept_list_waits += 1;
1117 msleep((caddr_t)&head->so_incomp, mutex_held,
1118 PSOCK | PCATCH, __func__, NULL);
1119 }
1120 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1121 if (so != NULL) {
1122 socket_unlock(head, 0);
1123 socket_lock(so, 0);
1124 socket_lock(head, 0);
1125 }
1126}
1127
1128void
1129so_release_accept_list(struct socket *head)
1130{
1131 if (head->so_proto->pr_getlock != NULL) {
1132 lck_mtx_t *mutex_held;
1133
1134 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1135 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1136
1137 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1138 wakeup((caddr_t)&head->so_incomp);
1139 }
1140}
1141
1142void
1143sofreelastref(struct socket *so, int dealloc)
1144{
1145 struct socket *head = so->so_head;
1146
1147 /* Assume socket is locked */
1148
1149 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1150 selthreadclear(&so->so_snd.sb_sel);
1151 selthreadclear(&so->so_rcv.sb_sel);
1152 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1153 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1154 so->so_event = sonullevent;
1155 return;
1156 }
1157 if (head != NULL) {
1158 /*
1159 * Need to lock the listener when the protocol has
1160 * per socket locks
1161 */
1162 if (head->so_proto->pr_getlock != NULL) {
1163 socket_lock(head, 1);
1164 so_acquire_accept_list(head, so);
1165 }
1166 if (so->so_state & SS_INCOMP) {
1167 so->so_state &= ~SS_INCOMP;
1168 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1169 head->so_incqlen--;
1170 head->so_qlen--;
1171 so->so_head = NULL;
1172
1173 if (head->so_proto->pr_getlock != NULL) {
1174 so_release_accept_list(head);
1175 socket_unlock(head, 1);
1176 }
1177 } else if (so->so_state & SS_COMP) {
1178 if (head->so_proto->pr_getlock != NULL) {
1179 so_release_accept_list(head);
1180 socket_unlock(head, 1);
1181 }
1182 /*
1183 * We must not decommission a socket that's
1184 * on the accept(2) queue. If we do, then
1185 * accept(2) may hang after select(2) indicated
1186 * that the listening socket was ready.
1187 */
1188 selthreadclear(&so->so_snd.sb_sel);
1189 selthreadclear(&so->so_rcv.sb_sel);
1190 so->so_rcv.sb_flags &= ~(SB_SEL|SB_UPCALL);
1191 so->so_snd.sb_flags &= ~(SB_SEL|SB_UPCALL);
1192 so->so_event = sonullevent;
1193 return;
1194 } else {
1195 if (head->so_proto->pr_getlock != NULL) {
1196 so_release_accept_list(head);
1197 socket_unlock(head, 1);
1198 }
1199 printf("sofree: not queued\n");
1200 }
1201 }
1202 sowflush(so);
1203 sorflush(so);
1204
1205#if FLOW_DIVERT
1206 if (so->so_flags & SOF_FLOW_DIVERT) {
1207 flow_divert_detach(so);
1208 }
1209#endif /* FLOW_DIVERT */
1210
1211 /* 3932268: disable upcall */
1212 so->so_rcv.sb_flags &= ~SB_UPCALL;
1213 so->so_snd.sb_flags &= ~(SB_UPCALL|SB_SNDBYTE_CNT);
1214 so->so_event = sonullevent;
1215
1216 if (dealloc)
1217 sodealloc(so);
1218}
1219
1220void
1221soclose_wait_locked(struct socket *so)
1222{
1223 lck_mtx_t *mutex_held;
1224
1225 if (so->so_proto->pr_getlock != NULL)
1226 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1227 else
1228 mutex_held = so->so_proto->pr_domain->dom_mtx;
1229 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1230
1231 /*
1232 * Double check here and return if there's no outstanding upcall;
1233 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1234 */
1235 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1236 return;
1237 so->so_rcv.sb_flags &= ~SB_UPCALL;
1238 so->so_snd.sb_flags &= ~SB_UPCALL;
1239 so->so_flags |= SOF_CLOSEWAIT;
1240
1241 (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
1242 "soclose_wait_locked", NULL);
1243 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1244 so->so_flags &= ~SOF_CLOSEWAIT;
1245}
1246
1247/*
1248 * Close a socket on last file table reference removal.
1249 * Initiate disconnect if connected.
1250 * Free socket when disconnect complete.
1251 */
1252int
1253soclose_locked(struct socket *so)
1254{
1255 int error = 0;
1256 struct timespec ts;
1257
1258 if (so->so_usecount == 0) {
1259 panic("soclose: so=%p refcount=0\n", so);
1260 /* NOTREACHED */
1261 }
1262
1263 sflt_notify(so, sock_evt_closing, NULL);
1264
1265 if (so->so_upcallusecount)
1266 soclose_wait_locked(so);
1267
1268#if CONTENT_FILTER
1269 /*
1270 * We have to wait until the content filters are done
1271 */
1272 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1273 cfil_sock_close_wait(so);
1274 cfil_sock_is_closed(so);
1275 cfil_sock_detach(so);
1276 }
1277#endif /* CONTENT_FILTER */
1278
1279 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1280 soresume(current_proc(), so, 1);
1281 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1282 }
1283
1284 if ((so->so_options & SO_ACCEPTCONN)) {
1285 struct socket *sp, *sonext;
1286 int persocklock = 0;
1287 int incomp_overflow_only;
1288
1289 /*
1290 * We do not want new connection to be added
1291 * to the connection queues
1292 */
1293 so->so_options &= ~SO_ACCEPTCONN;
1294
1295 /*
1296 * We can drop the lock on the listener once
1297 * we've acquired the incoming list
1298 */
1299 if (so->so_proto->pr_getlock != NULL) {
1300 persocklock = 1;
1301 so_acquire_accept_list(so, NULL);
1302 socket_unlock(so, 0);
1303 }
1304again:
1305 incomp_overflow_only = 1;
1306
1307 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1308 /*
1309 * Radar 5350314
1310 * skip sockets thrown away by tcpdropdropblreq
1311 * they will get cleanup by the garbage collection.
1312 * otherwise, remove the incomp socket from the queue
1313 * and let soabort trigger the appropriate cleanup.
1314 */
1315 if (sp->so_flags & SOF_OVERFLOW)
1316 continue;
1317
1318 if (persocklock != 0)
1319 socket_lock(sp, 1);
1320
1321 /*
1322 * Radar 27945981
1323 * The extra reference for the list insure the
1324 * validity of the socket pointer when we perform the
1325 * unlock of the head above
1326 */
1327 if (sp->so_state & SS_INCOMP) {
1328 sp->so_state &= ~SS_INCOMP;
1329 sp->so_head = NULL;
1330 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1331 so->so_incqlen--;
1332 so->so_qlen--;
1333
1334 (void) soabort(sp);
1335 } else {
1336 panic("%s sp %p in so_incomp but !SS_INCOMP",
1337 __func__, sp);
1338 }
1339
1340 if (persocklock != 0)
1341 socket_unlock(sp, 1);
1342 }
1343
1344 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1345 /* Dequeue from so_comp since sofree() won't do it */
1346 if (persocklock != 0)
1347 socket_lock(sp, 1);
1348
1349 if (sp->so_state & SS_COMP) {
1350 sp->so_state &= ~SS_COMP;
1351 sp->so_head = NULL;
1352 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1353 so->so_qlen--;
1354
1355 (void) soabort(sp);
1356 } else {
1357 panic("%s sp %p in so_comp but !SS_COMP",
1358 __func__, sp);
1359 }
1360
1361 if (persocklock)
1362 socket_unlock(sp, 1);
1363 }
1364
1365 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1366#if (DEBUG|DEVELOPMENT)
1367 panic("%s head %p so_comp not empty\n", __func__, so);
1368#endif /* (DEVELOPMENT || DEBUG) */
1369
1370 goto again;
1371 }
1372
1373 if (!TAILQ_EMPTY(&so->so_comp)) {
1374#if (DEBUG|DEVELOPMENT)
1375 panic("%s head %p so_comp not empty\n", __func__, so);
1376#endif /* (DEVELOPMENT || DEBUG) */
1377
1378 goto again;
1379 }
1380
1381 if (persocklock) {
1382 socket_lock(so, 0);
1383 so_release_accept_list(so);
1384 }
1385 }
1386 if (so->so_pcb == NULL) {
1387 /* 3915887: mark the socket as ready for dealloc */
1388 so->so_flags |= SOF_PCBCLEARING;
1389 goto discard;
1390 }
1391 if (so->so_state & SS_ISCONNECTED) {
1392 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1393 error = sodisconnectlocked(so);
1394 if (error)
1395 goto drop;
1396 }
1397 if (so->so_options & SO_LINGER) {
1398 lck_mtx_t *mutex_held;
1399
1400 if ((so->so_state & SS_ISDISCONNECTING) &&
1401 (so->so_state & SS_NBIO))
1402 goto drop;
1403 if (so->so_proto->pr_getlock != NULL)
1404 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1405 else
1406 mutex_held = so->so_proto->pr_domain->dom_mtx;
1407 while (so->so_state & SS_ISCONNECTED) {
1408 ts.tv_sec = (so->so_linger/100);
1409 ts.tv_nsec = (so->so_linger % 100) *
1410 NSEC_PER_USEC * 1000 * 10;
1411 error = msleep((caddr_t)&so->so_timeo,
1412 mutex_held, PSOCK | PCATCH, "soclose", &ts);
1413 if (error) {
1414 /*
1415 * It's OK when the time fires,
1416 * don't report an error
1417 */
1418 if (error == EWOULDBLOCK)
1419 error = 0;
1420 break;
1421 }
1422 }
1423 }
1424 }
1425drop:
1426 if (so->so_usecount == 0) {
1427 panic("soclose: usecount is zero so=%p\n", so);
1428 /* NOTREACHED */
1429 }
1430 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1431 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1432 if (error == 0)
1433 error = error2;
1434 }
1435 if (so->so_usecount <= 0) {
1436 panic("soclose: usecount is zero so=%p\n", so);
1437 /* NOTREACHED */
1438 }
1439discard:
1440 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1441 (so->so_state & SS_NOFDREF)) {
1442 panic("soclose: NOFDREF");
1443 /* NOTREACHED */
1444 }
1445 so->so_state |= SS_NOFDREF;
1446
1447 if ((so->so_flags & SOF_KNOTE) != 0)
1448 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1449
1450 atomic_add_32(&so->so_proto->pr_domain->dom_refs, -1);
1451 evsofree(so);
1452
1453 VERIFY(so->so_usecount > 0);
1454 so->so_usecount--;
1455 sofree(so);
1456 return (error);
1457}
1458
1459int
1460soclose(struct socket *so)
1461{
1462 int error = 0;
1463 socket_lock(so, 1);
1464
1465 if (so->so_retaincnt == 0) {
1466 error = soclose_locked(so);
1467 } else {
1468 /*
1469 * if the FD is going away, but socket is
1470 * retained in kernel remove its reference
1471 */
1472 so->so_usecount--;
1473 if (so->so_usecount < 2)
1474 panic("soclose: retaincnt non null and so=%p "
1475 "usecount=%d\n", so, so->so_usecount);
1476 }
1477 socket_unlock(so, 1);
1478 return (error);
1479}
1480
1481/*
1482 * Must be called at splnet...
1483 */
1484/* Should already be locked */
1485int
1486soabort(struct socket *so)
1487{
1488 int error;
1489
1490#ifdef MORE_LOCKING_DEBUG
1491 lck_mtx_t *mutex_held;
1492
1493 if (so->so_proto->pr_getlock != NULL)
1494 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1495 else
1496 mutex_held = so->so_proto->pr_domain->dom_mtx;
1497 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1498#endif
1499
1500 if ((so->so_flags & SOF_ABORTED) == 0) {
1501 so->so_flags |= SOF_ABORTED;
1502 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1503 if (error) {
1504 sofree(so);
1505 return (error);
1506 }
1507 }
1508 return (0);
1509}
1510
1511int
1512soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1513{
1514 int error;
1515
1516 if (dolock)
1517 socket_lock(so, 1);
1518
1519 so_update_last_owner_locked(so, PROC_NULL);
1520 so_update_policy(so);
1521#if NECP
1522 so_update_necp_policy(so, NULL, NULL);
1523#endif /* NECP */
1524
1525 if ((so->so_state & SS_NOFDREF) == 0)
1526 panic("soaccept: !NOFDREF");
1527 so->so_state &= ~SS_NOFDREF;
1528 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1529
1530 if (dolock)
1531 socket_unlock(so, 1);
1532 return (error);
1533}
1534
/*
 * Locking version of soacceptlock(): always takes the socket lock.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	int error;

	error = soacceptlock(so, nam, 1);
	return error;
}
1540
1541int
1542soacceptfilter(struct socket *so, struct socket *head)
1543{
1544 struct sockaddr *local = NULL, *remote = NULL;
1545 int error = 0;
1546
1547 /*
1548 * Hold the lock even if this socket has not been made visible
1549 * to the filter(s). For sockets with global locks, this protects
1550 * against the head or peer going away
1551 */
1552 socket_lock(so, 1);
1553 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1554 sogetaddr_locked(so, &local, 0) != 0) {
1555 so->so_state &= ~SS_NOFDREF;
1556 socket_unlock(so, 1);
1557 soclose(so);
1558 /* Out of resources; try it again next time */
1559 error = ECONNABORTED;
1560 goto done;
1561 }
1562
1563 error = sflt_accept(head, so, local, remote);
1564
1565 /*
1566 * If we get EJUSTRETURN from one of the filters, mark this socket
1567 * as inactive and return it anyway. This newly accepted socket
1568 * will be disconnected later before we hand it off to the caller.
1569 */
1570 if (error == EJUSTRETURN) {
1571 error = 0;
1572 (void) sosetdefunct(current_proc(), so,
1573 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1574 }
1575
1576 if (error != 0) {
1577 /*
1578 * This may seem like a duplication to the above error
1579 * handling part when we return ECONNABORTED, except
1580 * the following is done while holding the lock since
1581 * the socket has been exposed to the filter(s) earlier.
1582 */
1583 so->so_state &= ~SS_NOFDREF;
1584 socket_unlock(so, 1);
1585 soclose(so);
1586 /* Propagate socket filter's error code to the caller */
1587 } else {
1588 socket_unlock(so, 1);
1589 }
1590done:
1591 /* Callee checks for NULL pointer */
1592 sock_freeaddr(remote);
1593 sock_freeaddr(local);
1594 return (error);
1595}
1596
1597/*
1598 * Returns: 0 Success
1599 * EOPNOTSUPP Operation not supported on socket
1600 * EISCONN Socket is connected
1601 * <pru_connect>:EADDRNOTAVAIL Address not available.
1602 * <pru_connect>:EINVAL Invalid argument
1603 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1604 * <pru_connect>:EACCES Permission denied
1605 * <pru_connect>:EADDRINUSE Address in use
1606 * <pru_connect>:EAGAIN Resource unavailable, try again
1607 * <pru_connect>:EPERM Operation not permitted
1608 * <sf_connect_out>:??? [anything a filter writer might set]
1609 */
1610int
1611soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1612{
1613 int error;
1614 struct proc *p = current_proc();
1615
1616 if (dolock)
1617 socket_lock(so, 1);
1618
1619 so_update_last_owner_locked(so, p);
1620 so_update_policy(so);
1621
1622#if NECP
1623 so_update_necp_policy(so, NULL, nam);
1624#endif /* NECP */
1625
1626 /*
1627 * If this is a listening socket or if this is a previously-accepted
1628 * socket that has been marked as inactive, reject the connect request.
1629 */
1630 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1631 error = EOPNOTSUPP;
1632 if (so->so_flags & SOF_DEFUNCT) {
1633 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1634 "(%d)\n", __func__, proc_pid(p),
1635 proc_best_name(p),
1636 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1637 SOCK_DOM(so), SOCK_TYPE(so), error);
1638 }
1639 if (dolock)
1640 socket_unlock(so, 1);
1641 return (error);
1642 }
1643
1644 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1645 if (dolock)
1646 socket_unlock(so, 1);
1647 return (EPERM);
1648 }
1649
1650 /*
1651 * If protocol is connection-based, can only connect once.
1652 * Otherwise, if connected, try to disconnect first.
1653 * This allows user to disconnect by connecting to, e.g.,
1654 * a null address.
1655 */
1656 if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
1657 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1658 (error = sodisconnectlocked(so)))) {
1659 error = EISCONN;
1660 } else {
1661 /*
1662 * Run connect filter before calling protocol:
1663 * - non-blocking connect returns before completion;
1664 */
1665 error = sflt_connectout(so, nam);
1666 if (error != 0) {
1667 if (error == EJUSTRETURN)
1668 error = 0;
1669 } else {
1670 error = (*so->so_proto->pr_usrreqs->pru_connect)
1671 (so, nam, p);
1672 }
1673 }
1674 if (dolock)
1675 socket_unlock(so, 1);
1676 return (error);
1677}
1678
/*
 * Locking version of soconnectlock(): always takes the socket lock.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	int error;

	error = soconnectlock(so, nam, 1);
	return error;
}
1684
1685/*
1686 * Returns: 0 Success
1687 * <pru_connect2>:EINVAL[AF_UNIX]
1688 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1689 * <pru_connect2>:??? [other protocol families]
1690 *
1691 * Notes: <pru_connect2> is not supported by [TCP].
1692 */
1693int
1694soconnect2(struct socket *so1, struct socket *so2)
1695{
1696 int error;
1697
1698 socket_lock(so1, 1);
1699 if (so2->so_proto->pr_lock)
1700 socket_lock(so2, 1);
1701
1702 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1703
1704 socket_unlock(so1, 1);
1705 if (so2->so_proto->pr_lock)
1706 socket_unlock(so2, 1);
1707 return (error);
1708}
1709
1710int
1711soconnectxlocked(struct socket *so, struct sockaddr *src,
1712 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1713 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1714 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1715{
1716 int error;
1717
1718 so_update_last_owner_locked(so, p);
1719 so_update_policy(so);
1720
1721 /*
1722 * If this is a listening socket or if this is a previously-accepted
1723 * socket that has been marked as inactive, reject the connect request.
1724 */
1725 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1726 error = EOPNOTSUPP;
1727 if (so->so_flags & SOF_DEFUNCT) {
1728 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1729 "(%d)\n", __func__, proc_pid(p),
1730 proc_best_name(p),
1731 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1732 SOCK_DOM(so), SOCK_TYPE(so), error);
1733 }
1734 return (error);
1735 }
1736
1737 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0)
1738 return (EPERM);
1739
1740 /*
1741 * If protocol is connection-based, can only connect once
1742 * unless PR_MULTICONN is set. Otherwise, if connected,
1743 * try to disconnect first. This allows user to disconnect
1744 * by connecting to, e.g., a null address.
1745 */
1746 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) &&
1747 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1748 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1749 (error = sodisconnectlocked(so)) != 0)) {
1750 error = EISCONN;
1751 } else {
1752 /*
1753 * Run connect filter before calling protocol:
1754 * - non-blocking connect returns before completion;
1755 */
1756 error = sflt_connectout(so, dst);
1757 if (error != 0) {
1758 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1759 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1760 if (error == EJUSTRETURN)
1761 error = 0;
1762 } else {
1763 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1764 (so, src, dst, p, ifscope, aid, pcid,
1765 flags, arg, arglen, auio, bytes_written);
1766 }
1767 }
1768
1769 return (error);
1770}
1771
1772int
1773sodisconnectlocked(struct socket *so)
1774{
1775 int error;
1776
1777 if ((so->so_state & SS_ISCONNECTED) == 0) {
1778 error = ENOTCONN;
1779 goto bad;
1780 }
1781 if (so->so_state & SS_ISDISCONNECTING) {
1782 error = EALREADY;
1783 goto bad;
1784 }
1785
1786 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1787 if (error == 0)
1788 sflt_notify(so, sock_evt_disconnected, NULL);
1789
1790bad:
1791 return (error);
1792}
1793
/*
 * Locking version of sodisconnectlocked(): takes the socket lock
 * around the call and returns its result.
 */
int
sodisconnect(struct socket *so)
{
	int rv;

	socket_lock(so, 1);
	rv = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return rv;
}
1805
1806int
1807sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1808{
1809 int error;
1810
1811 /*
1812 * Call the protocol disconnectx handler; let it handle all
1813 * matters related to the connection state of this session.
1814 */
1815 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1816 if (error == 0) {
1817 /*
1818 * The event applies only for the session, not for
1819 * the disconnection of individual subflows.
1820 */
1821 if (so->so_state & (SS_ISDISCONNECTING|SS_ISDISCONNECTED))
1822 sflt_notify(so, sock_evt_disconnected, NULL);
1823 }
1824 return (error);
1825}
1826
1827int
1828sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1829{
1830 int error;
1831
1832 socket_lock(so, 1);
1833 error = sodisconnectxlocked(so, aid, cid);
1834 socket_unlock(so, 1);
1835 return (error);
1836}
1837
1838#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1839
1840/*
1841 * sosendcheck will lock the socket buffer if it isn't locked and
1842 * verify that there is space for the data being inserted.
1843 *
1844 * Returns: 0 Success
1845 * EPIPE
1846 * sblock:EWOULDBLOCK
1847 * sblock:EINTR
1848 * sbwait:EBADF
1849 * sbwait:EINTR
1850 * [so_error]:???
1851 */
1852int
1853sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1854 int32_t clen, int32_t atomic, int flags, int *sblocked,
1855 struct mbuf *control)
1856{
1857 int error = 0;
1858 int32_t space;
1859 int assumelock = 0;
1860
1861restart:
1862 if (*sblocked == 0) {
1863 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
1864 so->so_send_filt_thread != 0 &&
1865 so->so_send_filt_thread == current_thread()) {
1866 /*
1867 * We're being called recursively from a filter,
1868 * allow this to continue. Radar 4150520.
1869 * Don't set sblocked because we don't want
1870 * to perform an unlock later.
1871 */
1872 assumelock = 1;
1873 } else {
1874 error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1875 if (error) {
1876 if (so->so_flags & SOF_DEFUNCT)
1877 goto defunct;
1878 return (error);
1879 }
1880 *sblocked = 1;
1881 }
1882 }
1883
1884 /*
1885 * If a send attempt is made on a socket that has been marked
1886 * as inactive (disconnected), reject the request.
1887 */
1888 if (so->so_flags & SOF_DEFUNCT) {
1889defunct:
1890 error = EPIPE;
1891 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1892 __func__, proc_selfpid(), proc_best_name(current_proc()),
1893 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1894 SOCK_DOM(so), SOCK_TYPE(so), error);
1895 return (error);
1896 }
1897
1898 if (so->so_state & SS_CANTSENDMORE) {
1899#if CONTENT_FILTER
1900 /*
1901 * Can re-inject data of half closed connections
1902 */
1903 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
1904 so->so_snd.sb_cfil_thread == current_thread() &&
1905 cfil_sock_data_pending(&so->so_snd) != 0)
1906 CFIL_LOG(LOG_INFO,
1907 "so %llx ignore SS_CANTSENDMORE",
1908 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1909 else
1910#endif /* CONTENT_FILTER */
1911 return (EPIPE);
1912 }
1913 if (so->so_error) {
1914 error = so->so_error;
1915 so->so_error = 0;
1916 return (error);
1917 }
1918
1919 if ((so->so_state & SS_ISCONNECTED) == 0) {
1920 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
1921 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
1922 (resid != 0 || clen == 0) &&
1923 !(so->so_flags1 & SOF1_PRECONNECT_DATA))
1924 return (ENOTCONN);
1925
1926 } else if (addr == 0 && !(flags&MSG_HOLD)) {
1927 return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1928 ENOTCONN : EDESTADDRREQ);
1929 }
1930 }
1931
1932 if (so->so_flags & SOF_ENABLE_MSGS)
1933 space = msgq_sbspace(so, control);
1934 else
1935 space = sbspace(&so->so_snd);
1936
1937 if (flags & MSG_OOB)
1938 space += 1024;
1939 if ((atomic && resid > so->so_snd.sb_hiwat) ||
1940 clen > so->so_snd.sb_hiwat)
1941 return (EMSGSIZE);
1942
1943 if ((space < resid + clen &&
1944 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
1945 space < clen)) ||
1946 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1947 /*
1948 * don't block the connectx call when there's more data
1949 * than can be copied.
1950 */
1951 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1952 if (space == 0) {
1953 return (EWOULDBLOCK);
1954 }
1955 if (space < (int32_t)so->so_snd.sb_lowat) {
1956 return (0);
1957 }
1958 }
1959 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
1960 assumelock) {
1961 return (EWOULDBLOCK);
1962 }
1963 sbunlock(&so->so_snd, TRUE); /* keep socket locked */
1964 *sblocked = 0;
1965 error = sbwait(&so->so_snd);
1966 if (error) {
1967 if (so->so_flags & SOF_DEFUNCT)
1968 goto defunct;
1969 return (error);
1970 }
1971 goto restart;
1972 }
1973 return (0);
1974}
1975
1976/*
1977 * Send on a socket.
1978 * If send must go all at once and message is larger than
1979 * send buffering, then hard error.
1980 * Lock against other senders.
1981 * If must go all at once and not enough room now, then
1982 * inform user that this would block and do nothing.
1983 * Otherwise, if nonblocking, send as much as possible.
1984 * The data to be sent is described by "uio" if nonzero,
1985 * otherwise by the mbuf chain "top" (which must be null
1986 * if uio is not). Data provided in mbuf chain must be small
1987 * enough to send all at once.
1988 *
1989 * Returns nonzero on error, timeout or signal; callers
1990 * must check for short counts if EINTR/ERESTART are returned.
1991 * Data and control buffers are freed on return.
1992 * Experiment:
1993 * MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1994 * MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1995 * point at the mbuf chain being constructed and go from there.
1996 *
1997 * Returns: 0 Success
1998 * EOPNOTSUPP
1999 * EINVAL
2000 * ENOBUFS
2001 * uiomove:EFAULT
2002 * sosendcheck:EPIPE
2003 * sosendcheck:EWOULDBLOCK
2004 * sosendcheck:EINTR
2005 * sosendcheck:EBADF
2006 * sosendcheck:EINTR
2007 * sosendcheck:??? [value from so_error]
2008 * <pru_send>:ECONNRESET[TCP]
2009 * <pru_send>:EINVAL[TCP]
2010 * <pru_send>:ENOBUFS[TCP]
2011 * <pru_send>:EADDRINUSE[TCP]
2012 * <pru_send>:EADDRNOTAVAIL[TCP]
2013 * <pru_send>:EAFNOSUPPORT[TCP]
2014 * <pru_send>:EACCES[TCP]
2015 * <pru_send>:EAGAIN[TCP]
2016 * <pru_send>:EPERM[TCP]
2017 * <pru_send>:EMSGSIZE[TCP]
2018 * <pru_send>:EHOSTUNREACH[TCP]
2019 * <pru_send>:ENETUNREACH[TCP]
2020 * <pru_send>:ENETDOWN[TCP]
2021 * <pru_send>:ENOMEM[TCP]
2022 * <pru_send>:ENOBUFS[TCP]
2023 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2024 * <pru_send>:EINVAL[AF_UNIX]
2025 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2026 * <pru_send>:EPIPE[AF_UNIX]
2027 * <pru_send>:ENOTCONN[AF_UNIX]
2028 * <pru_send>:EISCONN[AF_UNIX]
2029 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2030 * <sf_data_out>:??? [whatever a filter author chooses]
2031 *
2032 * Notes: Other <pru_send> returns depend on the protocol family; all
2033 * <sf_data_out> returns depend on what the filter author causes
2034 * their filter to return.
2035 */
2036int
2037sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2038 struct mbuf *top, struct mbuf *control, int flags)
2039{
2040 struct mbuf **mp;
2041 struct mbuf *m, *freelist = NULL;
2042 user_ssize_t space, len, resid, orig_resid;
2043 int clen = 0, error, dontroute, mlen, sendflags;
2044 int atomic = sosendallatonce(so) || top;
2045 int sblocked = 0;
2046 struct proc *p = current_proc();
2047 struct mbuf *control_copy = NULL;
2048 uint16_t headroom = 0;
2049 boolean_t en_tracing = FALSE;
2050
2051 if (uio != NULL)
2052 resid = uio_resid(uio);
2053 else
2054 resid = top->m_pkthdr.len;
2055
2056 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2057 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2058
2059 socket_lock(so, 1);
2060
2061 /*
2062 * trace if tracing & network (vs. unix) sockets & and
2063 * non-loopback
2064 */
2065 if (ENTR_SHOULDTRACE &&
2066 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2067 struct inpcb *inp = sotoinpcb(so);
2068 if (inp->inp_last_outifp != NULL &&
2069 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2070 en_tracing = TRUE;
2071 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2072 VM_KERNEL_ADDRPERM(so),
2073 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2074 (int64_t)resid);
2075 orig_resid = resid;
2076 }
2077 }
2078
2079 /*
2080 * Re-injection should not affect process accounting
2081 */
2082 if ((flags & MSG_SKIPCFIL) == 0) {
2083 so_update_last_owner_locked(so, p);
2084 so_update_policy(so);
2085
2086#if NECP
2087 so_update_necp_policy(so, NULL, addr);
2088#endif /* NECP */
2089 }
2090
2091 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2092 error = EOPNOTSUPP;
2093 goto out_locked;
2094 }
2095
2096 /*
2097 * In theory resid should be unsigned.
2098 * However, space must be signed, as it might be less than 0
2099 * if we over-committed, and we must use a signed comparison
2100 * of space and resid. On the other hand, a negative resid
2101 * causes us to loop sending 0-length segments to the protocol.
2102 *
2103 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2104 * But it will be used by sockets doing message delivery.
2105 *
2106 * Note: We limit resid to be a positive int value as we use
2107 * imin() to set bytes_to_copy -- radr://14558484
2108 */
2109 if (resid < 0 || resid > INT_MAX || (so->so_type == SOCK_STREAM &&
2110 !(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2111 error = EINVAL;
2112 goto out_locked;
2113 }
2114
2115 dontroute = (flags & MSG_DONTROUTE) &&
2116 (so->so_options & SO_DONTROUTE) == 0 &&
2117 (so->so_proto->pr_flags & PR_ATOMIC);
2118 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2119
2120 if (control != NULL)
2121 clen = control->m_len;
2122
2123 if (soreserveheadroom != 0)
2124 headroom = so->so_pktheadroom;
2125
2126 do {
2127 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2128 &sblocked, control);
2129 if (error)
2130 goto out_locked;
2131
2132 mp = &top;
2133 if (so->so_flags & SOF_ENABLE_MSGS)
2134 space = msgq_sbspace(so, control);
2135 else
2136 space = sbspace(&so->so_snd) - clen;
2137 space += ((flags & MSG_OOB) ? 1024 : 0);
2138
2139 do {
2140 if (uio == NULL) {
2141 /*
2142 * Data is prepackaged in "top".
2143 */
2144 resid = 0;
2145 if (flags & MSG_EOR)
2146 top->m_flags |= M_EOR;
2147 } else {
2148 int chainlength;
2149 int bytes_to_copy;
2150 boolean_t jumbocl;
2151 boolean_t bigcl;
2152 int bytes_to_alloc;
2153
2154 bytes_to_copy = imin(resid, space);
2155
2156 bytes_to_alloc = bytes_to_copy;
2157 if (top == NULL)
2158 bytes_to_alloc += headroom;
2159
2160 if (sosendminchain > 0)
2161 chainlength = 0;
2162 else
2163 chainlength = sosendmaxchain;
2164
2165 /*
2166 * Use big 4 KB cluster when the outgoing interface
2167 * does not prefer 2 KB clusters
2168 */
2169 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2170 sosendbigcl_ignore_capab;
2171
2172 /*
2173 * Attempt to use larger than system page-size
2174 * clusters for large writes only if there is
2175 * a jumbo cluster pool and if the socket is
2176 * marked accordingly.
2177 */
2178 jumbocl = sosendjcl && njcl > 0 &&
2179 ((so->so_flags & SOF_MULTIPAGES) ||
2180 sosendjcl_ignore_capab) &&
2181 bigcl;
2182
2183 socket_unlock(so, 0);
2184
2185 do {
2186 int num_needed;
2187 int hdrs_needed = (top == NULL) ? 1 : 0;
2188
2189 /*
2190 * try to maintain a local cache of mbuf
2191 * clusters needed to complete this
2192 * write the list is further limited to
2193 * the number that are currently needed
2194 * to fill the socket this mechanism
2195 * allows a large number of mbufs/
2196 * clusters to be grabbed under a single
2197 * mbuf lock... if we can't get any
2198 * clusters, than fall back to trying
2199 * for mbufs if we fail early (or
2200 * miscalcluate the number needed) make
2201 * sure to release any clusters we
2202 * haven't yet consumed.
2203 */
2204 if (freelist == NULL &&
2205 bytes_to_alloc > MBIGCLBYTES &&
2206 jumbocl) {
2207 num_needed =
2208 bytes_to_alloc / M16KCLBYTES;
2209
2210 if ((bytes_to_alloc -
2211 (num_needed * M16KCLBYTES))
2212 >= MINCLSIZE)
2213 num_needed++;
2214
2215 freelist =
2216 m_getpackets_internal(
2217 (unsigned int *)&num_needed,
2218 hdrs_needed, M_WAIT, 0,
2219 M16KCLBYTES);
2220 /*
2221 * Fall back to 4K cluster size
2222 * if allocation failed
2223 */
2224 }
2225
2226 if (freelist == NULL &&
2227 bytes_to_alloc > MCLBYTES &&
2228 bigcl) {
2229 num_needed =
2230 bytes_to_alloc / MBIGCLBYTES;
2231
2232 if ((bytes_to_alloc -
2233 (num_needed * MBIGCLBYTES)) >=
2234 MINCLSIZE)
2235 num_needed++;
2236
2237 freelist =
2238 m_getpackets_internal(
2239 (unsigned int *)&num_needed,
2240 hdrs_needed, M_WAIT, 0,
2241 MBIGCLBYTES);
2242 /*
2243 * Fall back to cluster size
2244 * if allocation failed
2245 */
2246 }
2247
2248 /*
2249 * Allocate a cluster as we want to
2250 * avoid to split the data in more
2251 * that one segment and using MINCLSIZE
2252 * would lead us to allocate two mbufs
2253 */
2254 if (soreserveheadroom != 0 &&
2255 freelist == NULL &&
2256 ((top == NULL &&
2257 bytes_to_alloc > _MHLEN) ||
2258 bytes_to_alloc > _MLEN)) {
2259 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2260 MCLBYTES;
2261 freelist =
2262 m_getpackets_internal(
2263 (unsigned int *)&num_needed,
2264 hdrs_needed, M_WAIT, 0,
2265 MCLBYTES);
2266 /*
2267 * Fall back to a single mbuf
2268 * if allocation failed
2269 */
2270 } else if (freelist == NULL &&
2271 bytes_to_alloc > MINCLSIZE) {
2272 num_needed =
2273 bytes_to_alloc / MCLBYTES;
2274
2275 if ((bytes_to_alloc -
2276 (num_needed * MCLBYTES)) >=
2277 MINCLSIZE)
2278 num_needed++;
2279
2280 freelist =
2281 m_getpackets_internal(
2282 (unsigned int *)&num_needed,
2283 hdrs_needed, M_WAIT, 0,
2284 MCLBYTES);
2285 /*
2286 * Fall back to a single mbuf
2287 * if allocation failed
2288 */
2289 }
2290 /*
2291 * For datagram protocols, leave
2292 * headroom for protocol headers
2293 * in the first cluster of the chain
2294 */
2295 if (freelist != NULL && atomic &&
2296 top == NULL && headroom > 0) {
2297 freelist->m_data += headroom;
2298 }
2299
2300 /*
2301 * Fall back to regular mbufs without
2302 * reserving the socket headroom
2303 */
2304 if (freelist == NULL) {
2305 if (top == NULL)
2306 MGETHDR(freelist,
2307 M_WAIT, MT_DATA);
2308 else
2309 MGET(freelist,
2310 M_WAIT, MT_DATA);
2311
2312 if (freelist == NULL) {
2313 error = ENOBUFS;
2314 socket_lock(so, 0);
2315 goto out_locked;
2316 }
2317 /*
2318 * For datagram protocols,
2319 * leave room for protocol
2320 * headers in first mbuf.
2321 */
2322 if (atomic && top == NULL &&
2323 bytes_to_copy < MHLEN) {
2324 MH_ALIGN(freelist,
2325 bytes_to_copy);
2326 }
2327 }
2328 m = freelist;
2329 freelist = m->m_next;
2330 m->m_next = NULL;
2331
2332 if ((m->m_flags & M_EXT))
2333 mlen = m->m_ext.ext_size -
2334 M_LEADINGSPACE(m);
2335 else if ((m->m_flags & M_PKTHDR))
2336 mlen =
2337 MHLEN - M_LEADINGSPACE(m);
2338 else
2339 mlen = MLEN - M_LEADINGSPACE(m);
2340 len = imin(mlen, bytes_to_copy);
2341
2342 chainlength += len;
2343
2344 space -= len;
2345
2346 error = uiomove(mtod(m, caddr_t),
2347 len, uio);
2348
2349 resid = uio_resid(uio);
2350
2351 m->m_len = len;
2352 *mp = m;
2353 top->m_pkthdr.len += len;
2354 if (error)
2355 break;
2356 mp = &m->m_next;
2357 if (resid <= 0) {
2358 if (flags & MSG_EOR)
2359 top->m_flags |= M_EOR;
2360 break;
2361 }
2362 bytes_to_copy = min(resid, space);
2363
2364 } while (space > 0 &&
2365 (chainlength < sosendmaxchain || atomic ||
2366 resid < MINCLSIZE));
2367
2368 socket_lock(so, 0);
2369
2370 if (error)
2371 goto out_locked;
2372 }
2373
2374 if (flags & (MSG_HOLD|MSG_SEND)) {
2375 /* Enqueue for later, go away if HOLD */
2376 struct mbuf *mb1;
2377 if (so->so_temp && (flags & MSG_FLUSH)) {
2378 m_freem(so->so_temp);
2379 so->so_temp = NULL;
2380 }
2381 if (so->so_temp)
2382 so->so_tail->m_next = top;
2383 else
2384 so->so_temp = top;
2385 mb1 = top;
2386 while (mb1->m_next)
2387 mb1 = mb1->m_next;
2388 so->so_tail = mb1;
2389 if (flags & MSG_HOLD) {
2390 top = NULL;
2391 goto out_locked;
2392 }
2393 top = so->so_temp;
2394 }
2395 if (dontroute)
2396 so->so_options |= SO_DONTROUTE;
2397
2398 /*
2399 * Compute flags here, for pru_send and NKEs
2400 *
2401 * If the user set MSG_EOF, the protocol
2402 * understands this flag and nothing left to
2403 * send then use PRU_SEND_EOF instead of PRU_SEND.
2404 */
2405 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2406 ((flags & MSG_EOF) &&
2407 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2408 (resid <= 0)) ? PRUS_EOF :
2409 /* If there is more to send set PRUS_MORETOCOME */
2410 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2411
2412 if ((flags & MSG_SKIPCFIL) == 0) {
2413 /*
2414 * Socket filter processing
2415 */
2416 error = sflt_data_out(so, addr, &top,
2417 &control, (sendflags & MSG_OOB) ?
2418 sock_data_filt_flag_oob : 0);
2419 if (error) {
2420 if (error == EJUSTRETURN) {
2421 error = 0;
2422 clen = 0;
2423 control = NULL;
2424 top = NULL;
2425 }
2426 goto out_locked;
2427 }
2428#if CONTENT_FILTER
2429 /*
2430 * Content filter processing
2431 */
2432 error = cfil_sock_data_out(so, addr, top,
2433 control, sendflags);
2434 if (error) {
2435 if (error == EJUSTRETURN) {
2436 error = 0;
2437 clen = 0;
2438 control = NULL;
2439 top = NULL;
2440 }
2441 goto out_locked;
2442 }
2443#endif /* CONTENT_FILTER */
2444 }
2445 if (so->so_flags & SOF_ENABLE_MSGS) {
2446 /*
2447 * Make a copy of control mbuf,
2448 * so that msg priority can be
2449 * passed to subsequent mbufs.
2450 */
2451 control_copy = m_dup(control, M_NOWAIT);
2452 }
2453 error = (*so->so_proto->pr_usrreqs->pru_send)
2454 (so, sendflags, top, addr, control, p);
2455
2456 if (flags & MSG_SEND)
2457 so->so_temp = NULL;
2458
2459 if (dontroute)
2460 so->so_options &= ~SO_DONTROUTE;
2461
2462 clen = 0;
2463 control = control_copy;
2464 control_copy = NULL;
2465 top = NULL;
2466 mp = &top;
2467 if (error)
2468 goto out_locked;
2469 } while (resid && space > 0);
2470 } while (resid);
2471
2472out_locked:
2473 if (sblocked)
2474 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2475 else
2476 socket_unlock(so, 1);
2477 if (top != NULL)
2478 m_freem(top);
2479 if (control != NULL)
2480 m_freem(control);
2481 if (freelist != NULL)
2482 m_freem_list(freelist);
2483 if (control_copy != NULL)
2484 m_freem(control_copy);
2485
2486 soclearfastopen(so);
2487
2488 if (en_tracing) {
2489 /* resid passed here is the bytes left in uio */
2490 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2491 VM_KERNEL_ADDRPERM(so),
2492 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2493 (int64_t)(orig_resid - resid));
2494 }
2495 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2496 so->so_snd.sb_cc, space, error);
2497
2498 return (error);
2499}
2500
2501int
2502sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2503{
2504 struct mbuf *m0, *control_end;
2505
2506 socket_lock_assert_owned(so);
2507
2508 /*
2509 * top must points to mbuf chain to be sent.
2510 * If control is not NULL, top must be packet header
2511 */
2512 VERIFY(top != NULL &&
2513 (control == NULL || top->m_flags & M_PKTHDR));
2514
2515 /*
2516 * If control is not passed in, see if we can get it
2517 * from top.
2518 */
2519 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2520 // Locate start of control if present and start of data
2521 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2522 if (m0->m_flags & M_PKTHDR) {
2523 top = m0;
2524 break;
2525 } else if (m0->m_type == MT_CONTROL) {
2526 if (control == NULL) {
2527 // Found start of control
2528 control = m0;
2529 }
2530 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2531 // Found end of control
2532 control_end = m0;
2533 }
2534 }
2535 }
2536 if (control_end != NULL)
2537 control_end->m_next = NULL;
2538 }
2539
2540 int error = (*so->so_proto->pr_usrreqs->pru_send)
2541 (so, sendflags, top, addr, control, current_proc());
2542
2543 return error;
2544}
2545
2546/*
2547 * Supported only connected sockets (no address) without ancillary data
2548 * (control mbuf) for atomic protocols
2549 */
2550int
2551sosend_list(struct socket *so, struct uio **uioarray, u_int uiocnt, int flags)
2552{
2553 struct mbuf *m, *freelist = NULL;
2554 user_ssize_t len, resid;
2555 int error, dontroute, mlen;
2556 int atomic = sosendallatonce(so);
2557 int sblocked = 0;
2558 struct proc *p = current_proc();
2559 u_int uiofirst = 0;
2560 u_int uiolast = 0;
2561 struct mbuf *top = NULL;
2562 uint16_t headroom = 0;
2563 boolean_t bigcl;
2564
2565 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt,
2566 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2567
2568 if (so->so_type != SOCK_DGRAM) {
2569 error = EINVAL;
2570 goto out;
2571 }
2572 if (atomic == 0) {
2573 error = EINVAL;
2574 goto out;
2575 }
2576 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2577 error = EPROTONOSUPPORT;
2578 goto out;
2579 }
2580 if (flags & ~(MSG_DONTWAIT | MSG_NBIO)) {
2581 error = EINVAL;
2582 goto out;
2583 }
2584 resid = uio_array_resid(uioarray, uiocnt);
2585
2586 /*
2587 * In theory resid should be unsigned.
2588 * However, space must be signed, as it might be less than 0
2589 * if we over-committed, and we must use a signed comparison
2590 * of space and resid. On the other hand, a negative resid
2591 * causes us to loop sending 0-length segments to the protocol.
2592 *
2593 * Note: We limit resid to be a positive int value as we use
2594 * imin() to set bytes_to_copy -- radr://14558484
2595 */
2596 if (resid < 0 || resid > INT_MAX) {
2597 error = EINVAL;
2598 goto out;
2599 }
2600
2601 socket_lock(so, 1);
2602 so_update_last_owner_locked(so, p);
2603 so_update_policy(so);
2604
2605#if NECP
2606 so_update_necp_policy(so, NULL, NULL);
2607#endif /* NECP */
2608
2609 dontroute = (flags & MSG_DONTROUTE) &&
2610 (so->so_options & SO_DONTROUTE) == 0 &&
2611 (so->so_proto->pr_flags & PR_ATOMIC);
2612 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2613
2614 error = sosendcheck(so, NULL, resid, 0, atomic, flags,
2615 &sblocked, NULL);
2616 if (error)
2617 goto release;
2618
2619 /*
2620 * Use big 4 KB clusters when the outgoing interface does not prefer
2621 * 2 KB clusters
2622 */
2623 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || sosendbigcl_ignore_capab;
2624
2625 if (soreserveheadroom != 0)
2626 headroom = so->so_pktheadroom;
2627
2628 do {
2629 int i;
2630 int num_needed = 0;
2631 int chainlength;
2632 size_t maxpktlen = 0;
2633 int bytes_to_alloc;
2634
2635 if (sosendminchain > 0)
2636 chainlength = 0;
2637 else
2638 chainlength = sosendmaxchain;
2639
2640 socket_unlock(so, 0);
2641
2642 /*
2643 * Find a set of uio that fit in a reasonable number
2644 * of mbuf packets
2645 */
2646 for (i = uiofirst; i < uiocnt; i++) {
2647 struct uio *auio = uioarray[i];
2648
2649 len = uio_resid(auio);
2650
2651 /* Do nothing for empty messages */
2652 if (len == 0)
2653 continue;
2654
2655 num_needed += 1;
2656 uiolast += 1;
2657
2658 if (len > maxpktlen)
2659 maxpktlen = len;
2660
2661 chainlength += len;
2662 if (chainlength > sosendmaxchain)
2663 break;
2664 }
2665 /*
2666 * Nothing left to send
2667 */
2668 if (num_needed == 0) {
2669 socket_lock(so, 0);
2670 break;
2671 }
2672 /*
2673 * Allocate buffer large enough to include headroom space for
2674 * network and link header
2675 *
2676 */
2677 bytes_to_alloc = maxpktlen + headroom;
2678
2679 /*
2680 * Allocate a single contiguous buffer of the smallest available
2681 * size when possible
2682 */
2683 if (bytes_to_alloc > MCLBYTES &&
2684 bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2685 freelist = m_getpackets_internal(
2686 (unsigned int *)&num_needed,
2687 num_needed, M_WAIT, 1,
2688 MBIGCLBYTES);
2689 } else if (bytes_to_alloc > _MHLEN &&
2690 bytes_to_alloc <= MCLBYTES) {
2691 freelist = m_getpackets_internal(
2692 (unsigned int *)&num_needed,
2693 num_needed, M_WAIT, 1,
2694 MCLBYTES);
2695 } else {
2696 freelist = m_allocpacket_internal(
2697 (unsigned int *)&num_needed,
2698 bytes_to_alloc, NULL, M_WAIT, 1, 0);
2699 }
2700
2701 if (freelist == NULL) {
2702 socket_lock(so, 0);
2703 error = ENOMEM;
2704 goto release;
2705 }
2706 /*
2707 * Copy each uio of the set into its own mbuf packet
2708 */
2709 for (i = uiofirst, m = freelist;
2710 i < uiolast && m != NULL;
2711 i++) {
2712 int bytes_to_copy;
2713 struct mbuf *n;
2714 struct uio *auio = uioarray[i];
2715
2716 bytes_to_copy = uio_resid(auio);
2717
2718 /* Do nothing for empty messages */
2719 if (bytes_to_copy == 0)
2720 continue;
2721 /*
2722 * Leave headroom for protocol headers
2723 * in the first mbuf of the chain
2724 */
2725 m->m_data += headroom;
2726
2727 for (n = m; n != NULL; n = n->m_next) {
2728 if ((m->m_flags & M_EXT))
2729 mlen = m->m_ext.ext_size -
2730 M_LEADINGSPACE(m);
2731 else if ((m->m_flags & M_PKTHDR))
2732 mlen =
2733 MHLEN - M_LEADINGSPACE(m);
2734 else
2735 mlen = MLEN - M_LEADINGSPACE(m);
2736 len = imin(mlen, bytes_to_copy);
2737
2738 /*
2739 * Note: uiomove() decrements the iovec
2740 * length
2741 */
2742 error = uiomove(mtod(n, caddr_t),
2743 len, auio);
2744 if (error != 0)
2745 break;
2746 n->m_len = len;
2747 m->m_pkthdr.len += len;
2748
2749 VERIFY(m->m_pkthdr.len <= maxpktlen);
2750
2751 bytes_to_copy -= len;
2752 resid -= len;
2753 }
2754 if (m->m_pkthdr.len == 0) {
2755 printf(
2756 "%s:%d so %llx pkt %llx type %u len null\n",
2757 __func__, __LINE__,
2758 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2759 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2760 m->m_type);
2761 }
2762 if (error != 0)
2763 break;
2764 m = m->m_nextpkt;
2765 }
2766
2767 socket_lock(so, 0);
2768
2769 if (error)
2770 goto release;
2771 top = freelist;
2772 freelist = NULL;
2773
2774 if (dontroute)
2775 so->so_options |= SO_DONTROUTE;
2776
2777 if ((flags & MSG_SKIPCFIL) == 0) {
2778 struct mbuf **prevnextp = NULL;
2779
2780 for (i = uiofirst, m = top;
2781 i < uiolast && m != NULL;
2782 i++) {
2783 struct mbuf *nextpkt = m->m_nextpkt;
2784
2785 /*
2786 * Socket filter processing
2787 */
2788 error = sflt_data_out(so, NULL, &m,
2789 NULL, 0);
2790 if (error != 0 && error != EJUSTRETURN)
2791 goto release;
2792
2793#if CONTENT_FILTER
2794 if (error == 0) {
2795 /*
2796 * Content filter processing
2797 */
2798 error = cfil_sock_data_out(so, NULL, m,
2799 NULL, 0);
2800 if (error != 0 && error != EJUSTRETURN)
2801 goto release;
2802 }
2803#endif /* CONTENT_FILTER */
2804 /*
2805 * Remove packet from the list when
2806 * swallowed by a filter
2807 */
2808 if (error == EJUSTRETURN) {
2809 error = 0;
2810 if (prevnextp != NULL)
2811 *prevnextp = nextpkt;
2812 else
2813 top = nextpkt;
2814 }
2815
2816 m = nextpkt;
2817 if (m != NULL)
2818 prevnextp = &m->m_nextpkt;
2819 }
2820 }
2821 if (top != NULL)
2822 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2823 (so, 0, top, NULL, NULL, p);
2824
2825 if (dontroute)
2826 so->so_options &= ~SO_DONTROUTE;
2827
2828 top = NULL;
2829 uiofirst = uiolast;
2830 } while (resid > 0 && error == 0);
2831release:
2832 if (sblocked)
2833 sbunlock(&so->so_snd, FALSE); /* will unlock socket */
2834 else
2835 socket_unlock(so, 1);
2836out:
2837 if (top != NULL)
2838 m_freem(top);
2839 if (freelist != NULL)
2840 m_freem_list(freelist);
2841
2842 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid,
2843 so->so_snd.sb_cc, 0, error);
2844
2845 return (error);
2846}
2847
/*
 * Consume (or, with MSG_PEEK, step over) the MT_SONAME mbuf at the head
 * of the current receive record.
 *
 * On entry *mp is the MT_SONAME mbuf and *nextrecordp the record that
 * follows it; both are written back on every exit path.  When psa is not
 * NULL it receives a copy of the address made with dup_sockaddr().
 *
 * Returns 0 on success, or:
 *	ERESTART	packet was dropped by the MAC policy check
 *	ENOTCONN	socket was found defunct after re-taking the lock
 *	EWOULDBLOCK	address could not be duplicated and MSG_NEEDSA is set
 *
 * May return ERESTART when packet is dropped by MAC policy check
 */
static int
soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
    int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
{
	int error = 0;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;

	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
#if CONFIG_MACF_SOCKET_SUBSET
	/*
	 * Call the MAC framework for policy checking if we're in
	 * the user process context and the socket isn't connected.
	 */
	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
		struct mbuf *m0 = m;	/* head of the record being checked */
		/*
		 * Dequeue this record (temporarily) from the receive
		 * list since we're about to drop the socket's lock
		 * where a new record may arrive and be appended to
		 * the list.  Upon MAC policy failure, the record
		 * will be freed.  Otherwise, we'll add it back to
		 * the head of the list.  We cannot rely on SB_LOCK
		 * because append operation uses the socket's lock.
		 */
		do {
			m->m_nextpkt = NULL;
			sbfree(&so->so_rcv, m);
			m = m->m_next;
		} while (m != NULL);
		m = m0;
		so->so_rcv.sb_mb = nextrecord;
		SB_EMPTY_FIXUP(&so->so_rcv);
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
		socket_unlock(so, 0);

		if (mac_socket_check_received(proc_ucred(p), so,
		    mtod(m, struct sockaddr *)) != 0) {
			/*
			 * MAC policy failure; free this record and
			 * process the next record (or block until
			 * one is available).  We have adjusted sb_cc
			 * and sb_mbcnt above so there is no need to
			 * call sbfree() again.
			 */
			m_freem(m);
			/*
			 * Clear SB_LOCK but don't unlock the socket.
			 * Process the next record or wait for one.
			 */
			socket_lock(so, 0);
			sbunlock(&so->so_rcv, TRUE); /* stay locked */
			error = ERESTART;
			/*
			 * NOTE(review): m was freed just above and is still
			 * stored into *mp at "done"; presumably callers do
			 * not use *mp when an error is returned -- confirm.
			 */
			goto done;
		}
		socket_lock(so, 0);
		/*
		 * If the socket has been defunct'd, drop it.
		 */
		if (so->so_flags & SOF_DEFUNCT) {
			m_freem(m);
			error = ENOTCONN;
			/* NOTE(review): same freed-m remark as above */
			goto done;
		}
		/*
		 * Re-adjust the socket receive list and re-enqueue
		 * the record in front of any packets which may have
		 * been appended while we dropped the lock.
		 */
		for (m = m0; m->m_next != NULL; m = m->m_next)
			sballoc(&so->so_rcv, m);
		/* the loop stops on the last mbuf without accounting for it */
		sballoc(&so->so_rcv, m);
		if (so->so_rcv.sb_mb == NULL) {
			/* nothing arrived meanwhile: this is the only record */
			so->so_rcv.sb_lastrecord = m0;
			so->so_rcv.sb_mbtail = m;
		}
		m = m0;
		nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
		so->so_rcv.sb_mb = m;
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
	}
#endif /* CONFIG_MACF_SOCKET_SUBSET */
	if (psa != NULL) {
		*psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
		if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
			error = EWOULDBLOCK;
			goto done;
		}
	}
	if (flags & MSG_PEEK) {
		/* peeking: leave the address mbuf queued, just step past it */
		m = m->m_next;
	} else {
		sbfree(&so->so_rcv, m);
		if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
			panic("%s: about to create invalid socketbuf",
			    __func__);
			/* NOTREACHED */
		}
		/* free the address mbuf; its successor becomes sb_mb */
		MFREE(m, so->so_rcv.sb_mb);
		m = so->so_rcv.sb_mb;
		if (m != NULL) {
			m->m_nextpkt = nextrecord;
		} else {
			so->so_rcv.sb_mb = nextrecord;
			SB_EMPTY_FIXUP(&so->so_rcv);
		}
	}
done:
	*mp = m;
	*nextrecordp = nextrecord;

	return (error);
}
2966
/*
 * Process one or more MT_CONTROL mbufs present before any data mbufs
 * in the first mbuf chain on the socket buffer.  If MSG_PEEK, we
 * just copy the data; if !MSG_PEEK, we call into the protocol to
 * perform externalization.
 *
 * On entry *mp is the first MT_CONTROL mbuf and *nextrecordp the record
 * that follows; both are written back on exit.  When controlp is not
 * NULL the control mbufs (copies when peeking) are chained onto it.
 *
 * Returns 0, ENOBUFS when a copy for MSG_PEEK cannot be allocated, or
 * the error of the last dom_externalize call made.
 */
static int
soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
    struct mbuf **mp, struct mbuf **nextrecordp)
{
	int error = 0;
	struct mbuf *cm = NULL, *cmn;	/* unlinked control chain / its next */
	struct mbuf **cme = &cm;	/* tail link of the cm chain */
	struct sockbuf *sb_rcv = &so->so_rcv;
	struct mbuf **msgpcm = NULL;
	struct mbuf *m = *mp;
	struct mbuf *nextrecord = *nextrecordp;
	struct protosw *pr = so->so_proto;

	/*
	 * Externalizing the control messages would require us to
	 * drop the socket's lock below.  Once we re-acquire the
	 * lock, the mbuf chain might change.  In order to preserve
	 * consistency, we unlink all control messages from the
	 * first mbuf chain in one shot and link them separately
	 * onto a different chain.
	 */
	do {
		if (flags & MSG_PEEK) {
			if (controlp != NULL) {
				/*
				 * NOTE(review): controlp is advanced to the
				 * m_next link of the newest copy on each
				 * pass, so this test looks like it can also
				 * be true on later passes, in which case
				 * msgpcm would no longer address the first
				 * copy -- confirm the intended behavior.
				 */
				if (*controlp == NULL) {
					msgpcm = controlp;
				}
				*controlp = m_copy(m, 0, m->m_len);

				/*
				 * If we failed to allocate an mbuf,
				 * release any previously allocated
				 * mbufs for control data. Return
				 * an error. Keep the mbufs in the
				 * socket as this is using
				 * MSG_PEEK flag.
				 */
				if (*controlp == NULL) {
					m_freem(*msgpcm);
					error = ENOBUFS;
					goto done;
				}
				controlp = &(*controlp)->m_next;
			}
			m = m->m_next;
		} else {
			/* move m from the socket buffer onto the cm chain */
			m->m_nextpkt = NULL;
			sbfree(sb_rcv, m);
			sb_rcv->sb_mb = m->m_next;
			m->m_next = NULL;
			*cme = m;
			cme = &(*cme)->m_next;
			m = sb_rcv->sb_mb;
		}
	} while (m != NULL && m->m_type == MT_CONTROL);

	if (!(flags & MSG_PEEK)) {
		/* re-attach what is left of the record to the record list */
		if (sb_rcv->sb_mb != NULL) {
			sb_rcv->sb_mb->m_nextpkt = nextrecord;
		} else {
			sb_rcv->sb_mb = nextrecord;
			SB_EMPTY_FIXUP(sb_rcv);
		}
		if (nextrecord == NULL)
			sb_rcv->sb_lastrecord = m;
	}

	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");

	/* Hand each unlinked control mbuf to the caller, or free it */
	while (cm != NULL) {
		int cmsg_type;

		cmn = cm->m_next;
		cm->m_next = NULL;
		cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;

		/*
		 * Call the protocol to externalize SCM_RIGHTS message
		 * and return the modified message to the caller upon
		 * success.  Otherwise, all other control messages are
		 * returned unmodified to the caller.  Note that we
		 * only get into this loop if MSG_PEEK is not set.
		 */
		if (pr->pr_domain->dom_externalize != NULL &&
		    cmsg_type == SCM_RIGHTS) {
			/*
			 * Release socket lock: see 3903171.  This
			 * would also allow more records to be appended
			 * to the socket buffer.  We still have SB_LOCK
			 * set on it, so we can be sure that the head
			 * of the mbuf chain won't change.
			 */
			socket_unlock(so, 0);
			error = (*pr->pr_domain->dom_externalize)(cm);
			socket_lock(so, 0);
		} else {
			error = 0;
		}

		if (controlp != NULL && error == 0) {
			*controlp = cm;
			controlp = &(*controlp)->m_next;
		} else {
			/* caller does not want it, or externalize failed */
			(void) m_free(cm);
		}
		cm = cmn;
	}
	/*
	 * Update the value of nextrecord in case we received new
	 * records when the socket was unlocked above for
	 * externalizing SCM_RIGHTS.
	 */
	if (m != NULL)
		nextrecord = sb_rcv->sb_mb->m_nextpkt;
	else
		nextrecord = sb_rcv->sb_mb;

done:
	*mp = m;
	*nextrecordp = nextrecord;

	return (error);
}
3097
3098/*
3099 * Implement receive operations on a socket.
3100 * We depend on the way that records are added to the sockbuf
3101 * by sbappend*. In particular, each record (mbufs linked through m_next)
3102 * must begin with an address if the protocol so specifies,
3103 * followed by an optional mbuf or mbufs containing ancillary data,
3104 * and then zero or more mbufs of data.
3105 * In order to avoid blocking network interrupts for the entire time here,
3106 * we splx() while doing the actual copy to user space.
3107 * Although the sockbuf is locked, new data may still be appended,
3108 * and thus we must maintain consistency of the sockbuf during that time.
3109 *
3110 * The caller may receive the data as a single mbuf chain by supplying
3111 * an mbuf **mp0 for use in returning the chain. The uio is then used
3112 * only for the count in uio_resid.
3113 *
3114 * Returns: 0 Success
3115 * ENOBUFS
3116 * ENOTCONN
3117 * EWOULDBLOCK
3118 * uiomove:EFAULT
3119 * sblock:EWOULDBLOCK
3120 * sblock:EINTR
3121 * sbwait:EBADF
3122 * sbwait:EINTR
3123 * sodelayed_copy:EFAULT
3124 * <pru_rcvoob>:EINVAL[TCP]
3125 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3126 * <pru_rcvoob>:???
3127 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3128 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3129 * <pr_domain->dom_externalize>:???
3130 *
3131 * Notes: Additional return values from calls through <pru_rcvoob> and
3132 * <pr_domain->dom_externalize> depend on protocols other than
3133 * TCP or AF_UNIX, which are documented above.
3134 */
3135int
3136soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3137 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3138{
3139 struct mbuf *m, **mp, *ml = NULL;
3140 struct mbuf *nextrecord, *free_list;
3141 int flags, error, offset;
3142 user_ssize_t len;
3143 struct protosw *pr = so->so_proto;
3144 int moff, type = 0;
3145 user_ssize_t orig_resid = uio_resid(uio);
3146 user_ssize_t delayed_copy_len;
3147 int can_delay;
3148 int need_event;
3149 struct proc *p = current_proc();
3150 boolean_t en_tracing = FALSE;
3151
3152 /*
3153 * Sanity check on the length passed by caller as we are making 'int'
3154 * comparisons
3155 */
3156 if (orig_resid < 0 || orig_resid > INT_MAX)
3157 return (EINVAL);
3158
3159 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3160 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3161 so->so_rcv.sb_hiwat);
3162
3163 socket_lock(so, 1);
3164 so_update_last_owner_locked(so, p);
3165 so_update_policy(so);
3166
3167#ifdef MORE_LOCKING_DEBUG
3168 if (so->so_usecount == 1) {
3169 panic("%s: so=%x no other reference on socket\n", __func__, so);
3170 /* NOTREACHED */
3171 }
3172#endif
3173 mp = mp0;
3174 if (psa != NULL)
3175 *psa = NULL;
3176 if (controlp != NULL)
3177 *controlp = NULL;
3178 if (flagsp != NULL)
3179 flags = *flagsp &~ MSG_EOR;
3180 else
3181 flags = 0;
3182
3183 /*
3184 * If a recv attempt is made on a previously-accepted socket
3185 * that has been marked as inactive (disconnected), reject
3186 * the request.
3187 */
3188 if (so->so_flags & SOF_DEFUNCT) {
3189 struct sockbuf *sb = &so->so_rcv;
3190
3191 error = ENOTCONN;
3192 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3193 __func__, proc_pid(p), proc_best_name(p),
3194 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3195 SOCK_DOM(so), SOCK_TYPE(so), error);
3196 /*
3197 * This socket should have been disconnected and flushed
3198 * prior to being returned from sodefunct(); there should
3199 * be no data on its receive list, so panic otherwise.
3200 */
3201 if (so->so_state & SS_DEFUNCT)
3202 sb_empty_assert(sb, __func__);
3203 socket_unlock(so, 1);
3204 return (error);
3205 }
3206
3207 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3208 pr->pr_usrreqs->pru_preconnect) {
3209 /*
3210 * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3211 * calling write() right after this. *If* the app calls a read
3212 * we do not want to block this read indefinetely. Thus,
3213 * we trigger a connect so that the session gets initiated.
3214 */
3215 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3216
3217 if (error) {
3218 socket_unlock(so, 1);
3219 return (error);
3220 }
3221 }
3222
3223 if (ENTR_SHOULDTRACE &&
3224 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3225 /*
3226 * enable energy tracing for inet sockets that go over
3227 * non-loopback interfaces only.
3228 */
3229 struct inpcb *inp = sotoinpcb(so);
3230 if (inp->inp_last_outifp != NULL &&
3231 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3232 en_tracing = TRUE;
3233 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3234 VM_KERNEL_ADDRPERM(so),
3235 ((so->so_state & SS_NBIO) ?
3236 kEnTrFlagNonBlocking : 0),
3237 (int64_t)orig_resid);
3238 }
3239 }
3240
3241 /*
3242 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3243 * regardless of the flags argument. Here is the case were
3244 * out-of-band data is not inline.
3245 */
3246 if ((flags & MSG_OOB) ||
3247 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3248 (so->so_options & SO_OOBINLINE) == 0 &&
3249 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3250 m = m_get(M_WAIT, MT_DATA);
3251 if (m == NULL) {
3252 socket_unlock(so, 1);
3253 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3254 ENOBUFS, 0, 0, 0, 0);
3255 return (ENOBUFS);
3256 }
3257 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3258 if (error)
3259 goto bad;
3260 socket_unlock(so, 0);
3261 do {
3262 error = uiomove(mtod(m, caddr_t),
3263 imin(uio_resid(uio), m->m_len), uio);
3264 m = m_free(m);
3265 } while (uio_resid(uio) && error == 0 && m != NULL);
3266 socket_lock(so, 0);
3267bad:
3268 if (m != NULL)
3269 m_freem(m);
3270
3271 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3272 if (error == EWOULDBLOCK || error == EINVAL) {
3273 /*
3274 * Let's try to get normal data:
3275 * EWOULDBLOCK: out-of-band data not
3276 * receive yet. EINVAL: out-of-band data
3277 * already read.
3278 */
3279 error = 0;
3280 goto nooob;
3281 } else if (error == 0 && flagsp != NULL) {
3282 *flagsp |= MSG_OOB;
3283 }
3284 }
3285 socket_unlock(so, 1);
3286 if (en_tracing) {
3287 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3288 VM_KERNEL_ADDRPERM(so), 0,
3289 (int64_t)(orig_resid - uio_resid(uio)));
3290 }
3291 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3292 0, 0, 0, 0);
3293
3294 return (error);
3295 }
3296nooob:
3297 if (mp != NULL)
3298 *mp = NULL;
3299
3300 if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3301 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3302 }
3303
3304 free_list = NULL;
3305 delayed_copy_len = 0;
3306restart:
3307#ifdef MORE_LOCKING_DEBUG
3308 if (so->so_usecount <= 1)
3309 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3310 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3311#endif
3312 /*
3313 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3314 * and if so just return to the caller. This could happen when
3315 * soreceive() is called by a socket upcall function during the
3316 * time the socket is freed. The socket buffer would have been
3317 * locked across the upcall, therefore we cannot put this thread
3318 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3319 * we may livelock), because the lock on the socket buffer will
3320 * only be released when the upcall routine returns to its caller.
3321 * Because the socket has been officially closed, there can be
3322 * no further read on it.
3323 *
3324 * A multipath subflow socket would have its SS_NOFDREF set by
3325 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3326 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3327 */
3328 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3329 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3330 socket_unlock(so, 1);
3331 return (0);
3332 }
3333
3334 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3335 if (error) {
3336 socket_unlock(so, 1);
3337 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3338 0, 0, 0, 0);
3339 if (en_tracing) {
3340 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3341 VM_KERNEL_ADDRPERM(so), 0,
3342 (int64_t)(orig_resid - uio_resid(uio)));
3343 }
3344 return (error);
3345 }
3346
3347 m = so->so_rcv.sb_mb;
3348 /*
3349 * If we have less data than requested, block awaiting more
3350 * (subject to any timeout) if:
3351 * 1. the current count is less than the low water mark, or
3352 * 2. MSG_WAITALL is set, and it is possible to do the entire
3353 * receive operation at once if we block (resid <= hiwat).
3354 * 3. MSG_DONTWAIT is not set
3355 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3356 * we have to do the receive in sections, and thus risk returning
3357 * a short count if a timeout or signal occurs after we start.
3358 */
3359 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
3360 so->so_rcv.sb_cc < uio_resid(uio)) &&
3361 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
3362 ((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3363 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0)) {
3364 /*
3365 * Panic if we notice inconsistencies in the socket's
3366 * receive list; both sb_mb and sb_cc should correctly
3367 * reflect the contents of the list, otherwise we may
3368 * end up with false positives during select() or poll()
3369 * which could put the application in a bad state.
3370 */
3371 SB_MB_CHECK(&so->so_rcv);
3372
3373 if (so->so_error) {
3374 if (m != NULL)
3375 goto dontblock;
3376 error = so->so_error;
3377 if ((flags & MSG_PEEK) == 0)
3378 so->so_error = 0;
3379 goto release;
3380 }
3381 if (so->so_state & SS_CANTRCVMORE) {
3382#if CONTENT_FILTER
3383 /*
3384 * Deal with half closed connections
3385 */
3386 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3387 cfil_sock_data_pending(&so->so_rcv) != 0)
3388 CFIL_LOG(LOG_INFO,
3389 "so %llx ignore SS_CANTRCVMORE",
3390 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3391 else
3392#endif /* CONTENT_FILTER */
3393 if (m != NULL)
3394 goto dontblock;
3395 else
3396 goto release;
3397 }
3398 for (; m != NULL; m = m->m_next)
3399 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3400 m = so->so_rcv.sb_mb;
3401 goto dontblock;
3402 }
3403 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
3404 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3405 error = ENOTCONN;
3406 goto release;
3407 }
3408 if (uio_resid(uio) == 0)
3409 goto release;
3410
3411 if ((so->so_state & SS_NBIO) ||
3412 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
3413 error = EWOULDBLOCK;
3414 goto release;
3415 }
3416 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3417 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3418 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3419#if EVEN_MORE_LOCKING_DEBUG
3420 if (socket_debug)
3421 printf("Waiting for socket data\n");
3422#endif
3423
3424 error = sbwait(&so->so_rcv);
3425#if EVEN_MORE_LOCKING_DEBUG
3426 if (socket_debug)
3427 printf("SORECEIVE - sbwait returned %d\n", error);
3428#endif
3429 if (so->so_usecount < 1) {
3430 panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3431 __func__, so, so->so_usecount);
3432 /* NOTREACHED */
3433 }
3434 if (error) {
3435 socket_unlock(so, 1);
3436 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3437 0, 0, 0, 0);
3438 if (en_tracing) {
3439 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3440 VM_KERNEL_ADDRPERM(so), 0,
3441 (int64_t)(orig_resid - uio_resid(uio)));
3442 }
3443 return (error);
3444 }
3445 goto restart;
3446 }
3447dontblock:
3448 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3449 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3450 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3451 nextrecord = m->m_nextpkt;
3452
3453 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3454 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3455 mp0 == NULL);
3456 if (error == ERESTART)
3457 goto restart;
3458 else if (error != 0)
3459 goto release;
3460 orig_resid = 0;
3461 }
3462
3463 /*
3464 * Process one or more MT_CONTROL mbufs present before any data mbufs
3465 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3466 * just copy the data; if !MSG_PEEK, we call into the protocol to
3467 * perform externalization.
3468 */
3469 if (m != NULL && m->m_type == MT_CONTROL) {
3470 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3471 if (error != 0)
3472 goto release;
3473 orig_resid = 0;
3474 }
3475
3476 /*
3477 * If the socket is a TCP socket with message delivery
3478 * enabled, then create a control msg to deliver the
3479 * relative TCP sequence number for this data. Waiting
3480 * until this point will protect against failures to
3481 * allocate an mbuf for control msgs.
3482 */
3483 if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3484 (so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3485 struct mbuf *seq_cm;
3486
3487 seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3488 sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3489 if (seq_cm == NULL) {
3490 /* unable to allocate a control mbuf */
3491 error = ENOBUFS;
3492 goto release;
3493 }
3494 *controlp = seq_cm;
3495 controlp = &seq_cm->m_next;
3496 }
3497
3498 if (m != NULL) {
3499 if (!(flags & MSG_PEEK)) {
3500 /*
3501 * We get here because m points to an mbuf following
3502 * any MT_SONAME or MT_CONTROL mbufs which have been
3503 * processed above. In any case, m should be pointing
3504 * to the head of the mbuf chain, and the nextrecord
3505 * should be either NULL or equal to m->m_nextpkt.
3506 * See comments above about SB_LOCK.
3507 */
3508 if (m != so->so_rcv.sb_mb ||
3509 m->m_nextpkt != nextrecord) {
3510 panic("%s: post-control !sync so=%p m=%p "
3511 "nextrecord=%p\n", __func__, so, m,
3512 nextrecord);
3513 /* NOTREACHED */
3514 }
3515 if (nextrecord == NULL)
3516 so->so_rcv.sb_lastrecord = m;
3517 }
3518 type = m->m_type;
3519 if (type == MT_OOBDATA)
3520 flags |= MSG_OOB;
3521 } else {
3522 if (!(flags & MSG_PEEK)) {
3523 SB_EMPTY_FIXUP(&so->so_rcv);
3524 }
3525 }
3526 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3527 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3528
3529 moff = 0;
3530 offset = 0;
3531
3532 if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3533 can_delay = 1;
3534 else
3535 can_delay = 0;
3536
3537 need_event = 0;
3538
3539 while (m != NULL &&
3540 (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3541 if (m->m_type == MT_OOBDATA) {
3542 if (type != MT_OOBDATA)
3543 break;
3544 } else if (type == MT_OOBDATA) {
3545 break;
3546 }
3547 /*
3548 * Make sure to allways set MSG_OOB event when getting
3549 * out of band data inline.
3550 */
3551 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3552 (so->so_options & SO_OOBINLINE) != 0 &&
3553 (so->so_state & SS_RCVATMARK) != 0) {
3554 flags |= MSG_OOB;
3555 }
3556 so->so_state &= ~SS_RCVATMARK;
3557 len = uio_resid(uio) - delayed_copy_len;
3558 if (so->so_oobmark && len > so->so_oobmark - offset)
3559 len = so->so_oobmark - offset;
3560 if (len > m->m_len - moff)
3561 len = m->m_len - moff;
3562 /*
3563 * If mp is set, just pass back the mbufs.
3564 * Otherwise copy them out via the uio, then free.
3565 * Sockbuf must be consistent here (points to current mbuf,
3566 * it points to next record) when we drop priority;
3567 * we must note any additions to the sockbuf when we
3568 * block interrupts again.
3569 */
3570 if (mp == NULL) {
3571 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3572 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3573 if (can_delay && len == m->m_len) {
3574 /*
3575 * only delay the copy if we're consuming the
3576 * mbuf and we're NOT in MSG_PEEK mode
3577 * and we have enough data to make it worthwile
3578 * to drop and retake the lock... can_delay
3579 * reflects the state of the 2 latter
3580 * constraints moff should always be zero
3581 * in these cases
3582 */
3583 delayed_copy_len += len;
3584 } else {
3585 if (delayed_copy_len) {
3586 error = sodelayed_copy(so, uio,
3587 &free_list, &delayed_copy_len);
3588
3589 if (error) {
3590 goto release;
3591 }
3592 /*
3593 * can only get here if MSG_PEEK is not
3594 * set therefore, m should point at the
3595 * head of the rcv queue; if it doesn't,
3596 * it means something drastically
3597 * changed while we were out from behind
3598 * the lock in sodelayed_copy. perhaps
3599 * a RST on the stream. in any event,
3600 * the stream has been interrupted. it's
3601 * probably best just to return whatever
3602 * data we've moved and let the caller
3603 * sort it out...
3604 */
3605 if (m != so->so_rcv.sb_mb) {
3606 break;
3607 }
3608 }
3609 socket_unlock(so, 0);
3610 error = uiomove(mtod(m, caddr_t) + moff,
3611 (int)len, uio);
3612 socket_lock(so, 0);
3613
3614 if (error)
3615 goto release;
3616 }
3617 } else {
3618 uio_setresid(uio, (uio_resid(uio) - len));
3619 }
3620 if (len == m->m_len - moff) {
3621 if (m->m_flags & M_EOR)
3622 flags |= MSG_EOR;
3623 if (flags & MSG_PEEK) {
3624 m = m->m_next;
3625 moff = 0;
3626 } else {
3627 nextrecord = m->m_nextpkt;
3628 sbfree(&so->so_rcv, m);
3629 m->m_nextpkt = NULL;
3630
3631 /*
3632 * If this packet is an unordered packet
3633 * (indicated by M_UNORDERED_DATA flag), remove
3634 * the additional bytes added to the
3635 * receive socket buffer size.
3636 */
3637 if ((so->so_flags & SOF_ENABLE_MSGS) &&
3638 m->m_len &&
3639 (m->m_flags & M_UNORDERED_DATA) &&
3640 sbreserve(&so->so_rcv,
3641 so->so_rcv.sb_hiwat - m->m_len)) {
3642 if (so->so_msg_state->msg_uno_bytes >
3643 m->m_len) {
3644 so->so_msg_state->
3645 msg_uno_bytes -= m->m_len;
3646 } else {
3647 so->so_msg_state->
3648 msg_uno_bytes = 0;
3649 }
3650 m->m_flags &= ~M_UNORDERED_DATA;
3651 }
3652
3653 if (mp != NULL) {
3654 *mp = m;
3655 mp = &m->m_next;
3656 so->so_rcv.sb_mb = m = m->m_next;
3657 *mp = NULL;
3658 } else {
3659 if (free_list == NULL)
3660 free_list = m;
3661 else
3662 ml->m_next = m;
3663 ml = m;
3664 so->so_rcv.sb_mb = m = m->m_next;
3665 ml->m_next = NULL;
3666 }
3667 if (m != NULL) {
3668 m->m_nextpkt = nextrecord;
3669 if (nextrecord == NULL)
3670 so->so_rcv.sb_lastrecord = m;
3671 } else {
3672 so->so_rcv.sb_mb = nextrecord;
3673 SB_EMPTY_FIXUP(&so->so_rcv);
3674 }
3675 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3676 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3677 }
3678 } else {
3679 if (flags & MSG_PEEK) {
3680 moff += len;
3681 } else {
3682 if (mp != NULL) {
3683 int copy_flag;
3684
3685 if (flags & MSG_DONTWAIT)
3686 copy_flag = M_DONTWAIT;
3687 else
3688 copy_flag = M_WAIT;
3689 *mp = m_copym(m, 0, len, copy_flag);
3690 /*
3691 * Failed to allocate an mbuf?
3692 * Adjust uio_resid back, it was
3693 * adjusted down by len bytes which
3694 * we didn't copy over.
3695 */
3696 if (*mp == NULL) {
3697 uio_setresid(uio,
3698 (uio_resid(uio) + len));
3699 break;
3700 }
3701 }
3702 m->m_data += len;
3703 m->m_len -= len;
3704 so->so_rcv.sb_cc -= len;
3705 }
3706 }
3707 if (so->so_oobmark) {
3708 if ((flags & MSG_PEEK) == 0) {
3709 so->so_oobmark -= len;
3710 if (so->so_oobmark == 0) {
3711 so->so_state |= SS_RCVATMARK;
3712 /*
3713 * delay posting the actual event until
3714 * after any delayed copy processing
3715 * has finished
3716 */
3717 need_event = 1;
3718 break;
3719 }
3720 } else {
3721 offset += len;
3722 if (offset == so->so_oobmark)
3723 break;
3724 }
3725 }
3726 if (flags & MSG_EOR)
3727 break;
3728 /*
3729 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3730 * (for non-atomic socket), we must not quit until
3731 * "uio->uio_resid == 0" or an error termination.
3732 * If a signal/timeout occurs, return with a short
3733 * count but without error. Keep sockbuf locked
3734 * against other readers.
3735 */
3736 while (flags & (MSG_WAITALL|MSG_WAITSTREAM) && m == NULL &&
3737 (uio_resid(uio) - delayed_copy_len) > 0 &&
3738 !sosendallatonce(so) && !nextrecord) {
3739 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3740#if CONTENT_FILTER
3741 && cfil_sock_data_pending(&so->so_rcv) == 0
3742#endif /* CONTENT_FILTER */
3743 ))
3744 goto release;
3745
3746 /*
3747 * Depending on the protocol (e.g. TCP), the following
3748 * might cause the socket lock to be dropped and later
3749 * be reacquired, and more data could have arrived and
3750 * have been appended to the receive socket buffer by
3751 * the time it returns. Therefore, we only sleep in
3752 * sbwait() below if and only if the socket buffer is
3753 * empty, in order to avoid a false sleep.
3754 */
3755 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3756 (((struct inpcb *)so->so_pcb)->inp_state !=
3757 INPCB_STATE_DEAD))
3758 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3759
3760 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3761 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3762
3763 if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3764 error = 0;
3765 goto release;
3766 }
3767 /*
3768 * have to wait until after we get back from the sbwait
3769 * to do the copy because we will drop the lock if we
3770 * have enough data that has been delayed... by dropping
3771 * the lock we open up a window allowing the netisr
3772 * thread to process the incoming packets and to change
3773 * the state of this socket... we're issuing the sbwait
3774 * because the socket is empty and we're expecting the
3775 * netisr thread to wake us up when more packets arrive;
3776 * if we allow that processing to happen and then sbwait
3777 * we could stall forever with packets sitting in the
3778 * socket if no further packets arrive from the remote
3779 * side.
3780 *
3781 * we want to copy before we've collected all the data
3782 * to satisfy this request to allow the copy to overlap
3783 * the incoming packet processing on an MP system
3784 */
3785 if (delayed_copy_len > sorecvmincopy &&
3786 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3787 error = sodelayed_copy(so, uio,
3788 &free_list, &delayed_copy_len);
3789
3790 if (error)
3791 goto release;
3792 }
3793 m = so->so_rcv.sb_mb;
3794 if (m != NULL) {
3795 nextrecord = m->m_nextpkt;
3796 }
3797 SB_MB_CHECK(&so->so_rcv);
3798 }
3799 }
3800#ifdef MORE_LOCKING_DEBUG
3801 if (so->so_usecount <= 1) {
3802 panic("%s: after big while so=%p ref=%d on socket\n",
3803 __func__, so, so->so_usecount);
3804 /* NOTREACHED */
3805 }
3806#endif
3807
3808 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3809 if (so->so_options & SO_DONTTRUNC) {
3810 flags |= MSG_RCVMORE;
3811 } else {
3812 flags |= MSG_TRUNC;
3813 if ((flags & MSG_PEEK) == 0)
3814 (void) sbdroprecord(&so->so_rcv);
3815 }
3816 }
3817
3818 /*
3819 * pru_rcvd below (for TCP) may cause more data to be received
3820 * if the socket lock is dropped prior to sending the ACK; some
3821 * legacy OpenTransport applications don't handle this well
3822 * (if it receives less data than requested while MSG_HAVEMORE
3823 * is set), and so we set the flag now based on what we know
3824 * prior to calling pru_rcvd.
3825 */
3826 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
3827 flags |= MSG_HAVEMORE;
3828
3829 if ((flags & MSG_PEEK) == 0) {
3830 if (m == NULL) {
3831 so->so_rcv.sb_mb = nextrecord;
3832 /*
3833 * First part is an inline SB_EMPTY_FIXUP(). Second
3834 * part makes sure sb_lastrecord is up-to-date if
3835 * there is still data in the socket buffer.
3836 */
3837 if (so->so_rcv.sb_mb == NULL) {
3838 so->so_rcv.sb_mbtail = NULL;
3839 so->so_rcv.sb_lastrecord = NULL;
3840 } else if (nextrecord->m_nextpkt == NULL) {
3841 so->so_rcv.sb_lastrecord = nextrecord;
3842 }
3843 SB_MB_CHECK(&so->so_rcv);
3844 }
3845 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3846 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3847 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3848 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3849 }
3850
3851 if (delayed_copy_len) {
3852 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3853 if (error)
3854 goto release;
3855 }
3856 if (free_list != NULL) {
3857 m_freem_list(free_list);
3858 free_list = NULL;
3859 }
3860 if (need_event)
3861 postevent(so, 0, EV_OOB);
3862
3863 if (orig_resid == uio_resid(uio) && orig_resid &&
3864 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
3865 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
3866 goto restart;
3867 }
3868
3869 if (flagsp != NULL)
3870 *flagsp |= flags;
3871release:
3872#ifdef MORE_LOCKING_DEBUG
3873 if (so->so_usecount <= 1) {
3874 panic("%s: release so=%p ref=%d on socket\n", __func__,
3875 so, so->so_usecount);
3876 /* NOTREACHED */
3877 }
3878#endif
3879 if (delayed_copy_len)
3880 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3881
3882 if (free_list != NULL)
3883 m_freem_list(free_list);
3884
3885 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
3886
3887 if (en_tracing) {
3888 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3889 VM_KERNEL_ADDRPERM(so),
3890 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
3891 (int64_t)(orig_resid - uio_resid(uio)));
3892 }
3893 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
3894 so->so_rcv.sb_cc, 0, error);
3895
3896 return (error);
3897}
3898
3899/*
3900 * Returns: 0 Success
3901 * uiomove:EFAULT
3902 */
3903static int
3904sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
3905 user_ssize_t *resid)
3906{
3907 int error = 0;
3908 struct mbuf *m;
3909
3910 m = *free_list;
3911
3912 socket_unlock(so, 0);
3913
3914 while (m != NULL && error == 0) {
3915 error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3916 m = m->m_next;
3917 }
3918 m_freem_list(*free_list);
3919
3920 *free_list = NULL;
3921 *resid = 0;
3922
3923 socket_lock(so, 0);
3924
3925 return (error);
3926}
3927
3928static int
3929sodelayed_copy_list(struct socket *so, struct recv_msg_elem *msgarray,
3930 u_int uiocnt, struct mbuf **free_list, user_ssize_t *resid)
3931{
3932#pragma unused(so)
3933 int error = 0;
3934 struct mbuf *ml, *m;
3935 int i = 0;
3936 struct uio *auio;
3937
3938 for (ml = *free_list, i = 0; ml != NULL && i < uiocnt;
3939 ml = ml->m_nextpkt, i++) {
3940 auio = msgarray[i].uio;
3941 for (m = ml; m != NULL; m = m->m_next) {
3942 error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3943 if (error != 0)
3944 goto out;
3945 }
3946 }
3947out:
3948 m_freem_list(*free_list);
3949
3950 *free_list = NULL;
3951 *resid = 0;
3952
3953 return (error);
3954}
3955
3956int
3957soreceive_list(struct socket *so, struct recv_msg_elem *msgarray, u_int uiocnt,
3958 int *flagsp)
3959{
3960 struct mbuf *m;
3961 struct mbuf *nextrecord;
3962 struct mbuf *ml = NULL, *free_list = NULL, *free_tail = NULL;
3963 int error;
3964 user_ssize_t len, pktlen, delayed_copy_len = 0;
3965 struct protosw *pr = so->so_proto;
3966 user_ssize_t resid;
3967 struct proc *p = current_proc();
3968 struct uio *auio = NULL;
3969 int npkts = 0;
3970 int sblocked = 0;
3971 struct sockaddr **psa = NULL;
3972 struct mbuf **controlp = NULL;
3973 int can_delay;
3974 int flags;
3975 struct mbuf *free_others = NULL;
3976
3977 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START,
3978 so, uiocnt,
3979 so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3980
3981 /*
3982 * Sanity checks:
3983 * - Only supports don't wait flags
3984 * - Only support datagram sockets (could be extended to raw)
3985 * - Must be atomic
3986 * - Protocol must support packet chains
3987 * - The uio array is NULL (should we panic?)
3988 */
3989 if (flagsp != NULL)
3990 flags = *flagsp;
3991 else
3992 flags = 0;
3993 if (flags & ~(MSG_PEEK | MSG_WAITALL | MSG_DONTWAIT | MSG_NEEDSA |
3994 MSG_NBIO)) {
3995 printf("%s invalid flags 0x%x\n", __func__, flags);
3996 error = EINVAL;
3997 goto out;
3998 }
3999 if (so->so_type != SOCK_DGRAM) {
4000 error = EINVAL;
4001 goto out;
4002 }
4003 if (sosendallatonce(so) == 0) {
4004 error = EINVAL;
4005 goto out;
4006 }
4007 if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4008 error = EPROTONOSUPPORT;
4009 goto out;
4010 }
4011 if (msgarray == NULL) {
4012 printf("%s uioarray is NULL\n", __func__);
4013 error = EINVAL;
4014 goto out;
4015 }
4016 if (uiocnt == 0) {
4017 printf("%s uiocnt is 0\n", __func__);
4018 error = EINVAL;
4019 goto out;
4020 }
4021 /*
4022 * Sanity check on the length passed by caller as we are making 'int'
4023 * comparisons
4024 */
4025 resid = recv_msg_array_resid(msgarray, uiocnt);
4026 if (resid < 0 || resid > INT_MAX) {
4027 error = EINVAL;
4028 goto out;
4029 }
4030
4031 if (!(flags & MSG_PEEK) && sorecvmincopy > 0)
4032 can_delay = 1;
4033 else
4034 can_delay = 0;
4035
4036 socket_lock(so, 1);
4037 so_update_last_owner_locked(so, p);
4038 so_update_policy(so);
4039
4040#if NECP
4041 so_update_necp_policy(so, NULL, NULL);
4042#endif /* NECP */
4043
4044 /*
4045 * If a recv attempt is made on a previously-accepted socket
4046 * that has been marked as inactive (disconnected), reject
4047 * the request.
4048 */
4049 if (so->so_flags & SOF_DEFUNCT) {
4050 struct sockbuf *sb = &so->so_rcv;
4051
4052 error = ENOTCONN;
4053 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4054 __func__, proc_pid(p), proc_best_name(p),
4055 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4056 SOCK_DOM(so), SOCK_TYPE(so), error);
4057 /*
4058 * This socket should have been disconnected and flushed
4059 * prior to being returned from sodefunct(); there should
4060 * be no data on its receive list, so panic otherwise.
4061 */
4062 if (so->so_state & SS_DEFUNCT)
4063 sb_empty_assert(sb, __func__);
4064 goto release;
4065 }
4066
4067next:
4068 /*
4069 * The uio may be empty
4070 */
4071 if (npkts >= uiocnt) {
4072 error = 0;
4073 goto release;
4074 }
4075restart:
4076 /*
4077 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4078 * and if so just return to the caller. This could happen when
4079 * soreceive() is called by a socket upcall function during the
4080 * time the socket is freed. The socket buffer would have been
4081 * locked across the upcall, therefore we cannot put this thread
4082 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4083 * we may livelock), because the lock on the socket buffer will
4084 * only be released when the upcall routine returns to its caller.
4085 * Because the socket has been officially closed, there can be
4086 * no further read on it.
4087 */
4088 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4089 (SS_NOFDREF | SS_CANTRCVMORE)) {
4090 error = 0;
4091 goto release;
4092 }
4093
4094 error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4095 if (error) {
4096 goto release;
4097 }
4098 sblocked = 1;
4099
4100 m = so->so_rcv.sb_mb;
4101 /*
4102 * Block awaiting more datagram if needed
4103 */
4104 if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
4105 (so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
4106 ((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4107 /*
4108 * Panic if we notice inconsistencies in the socket's
4109 * receive list; both sb_mb and sb_cc should correctly
4110 * reflect the contents of the list, otherwise we may
4111 * end up with false positives during select() or poll()
4112 * which could put the application in a bad state.
4113 */
4114 SB_MB_CHECK(&so->so_rcv);
4115
4116 if (so->so_error) {
4117 error = so->so_error;
4118 if ((flags & MSG_PEEK) == 0)
4119 so->so_error = 0;
4120 goto release;
4121 }
4122 if (so->so_state & SS_CANTRCVMORE) {
4123 goto release;
4124 }
4125 if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
4126 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4127 error = ENOTCONN;
4128 goto release;
4129 }
4130 if ((so->so_state & SS_NBIO) ||
4131 (flags & (MSG_DONTWAIT|MSG_NBIO))) {
4132 error = EWOULDBLOCK;
4133 goto release;
4134 }
4135 /*
4136 * Do not block if we got some data
4137 */
4138 if (free_list != NULL) {
4139 error = 0;
4140 goto release;
4141 }
4142
4143 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4144 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4145
4146 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4147 sblocked = 0;
4148
4149 error = sbwait(&so->so_rcv);
4150 if (error) {
4151 goto release;
4152 }
4153 goto restart;
4154 }
4155
4156 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4157 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4158 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4159
4160 /*
4161 * Consume the current uio index as we have a datagram
4162 */
4163 auio = msgarray[npkts].uio;
4164 resid = uio_resid(auio);
4165 msgarray[npkts].which |= SOCK_MSG_DATA;
4166 psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4167 &msgarray[npkts].psa : NULL;
4168 controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4169 &msgarray[npkts].controlp : NULL;
4170 npkts += 1;
4171 nextrecord = m->m_nextpkt;
4172
4173 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4174 error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, 1);
4175 if (error == ERESTART)
4176 goto restart;
4177 else if (error != 0)
4178 goto release;
4179 }
4180
4181 if (m != NULL && m->m_type == MT_CONTROL) {
4182 error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4183 if (error != 0)
4184 goto release;
4185 }
4186
4187 if (m->m_pkthdr.len == 0) {
4188 printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4189 __func__, __LINE__,
4190 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4191 (uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4192 m->m_type);
4193 }
4194
4195 /*
4196 * Loop to copy the mbufs of the current record
4197 * Support zero length packets
4198 */
4199 ml = NULL;
4200 pktlen = 0;
4201 while (m != NULL && (len = resid - pktlen) >= 0 && error == 0) {
4202 if (m->m_len == 0)
4203 panic("%p m_len zero", m);
4204 if (m->m_type == 0)
4205 panic("%p m_type zero", m);
4206 /*
4207 * Clip to the residual length
4208 */
4209 if (len > m->m_len)
4210 len = m->m_len;
4211 pktlen += len;
4212 /*
4213 * Copy the mbufs via the uio or delay the copy
4214 * Sockbuf must be consistent here (points to current mbuf,
4215 * it points to next record) when we drop priority;
4216 * we must note any additions to the sockbuf when we
4217 * block interrupts again.
4218 */
4219 if (len > 0 && can_delay == 0) {
4220 socket_unlock(so, 0);
4221 error = uiomove(mtod(m, caddr_t), (int)len, auio);
4222 socket_lock(so, 0);
4223 if (error)
4224 goto release;
4225 } else {
4226 delayed_copy_len += len;
4227 }
4228
4229 if (len == m->m_len) {
4230 /*
4231 * m was entirely copied
4232 */
4233 sbfree(&so->so_rcv, m);
4234 nextrecord = m->m_nextpkt;
4235 m->m_nextpkt = NULL;
4236
4237 /*
4238 * Set the first packet to the head of the free list
4239 */
4240 if (free_list == NULL)
4241 free_list = m;
4242 /*
4243 * Link current packet to tail of free list
4244 */
4245 if (ml == NULL) {
4246 if (free_tail != NULL)
4247 free_tail->m_nextpkt = m;
4248 free_tail = m;
4249 }
4250 /*
4251 * Link current mbuf to last mbuf of current packet
4252 */
4253 if (ml != NULL)
4254 ml->m_next = m;
4255 ml = m;
4256
4257 /*
4258 * Move next buf to head of socket buffer
4259 */
4260 so->so_rcv.sb_mb = m = ml->m_next;
4261 ml->m_next = NULL;
4262
4263 if (m != NULL) {
4264 m->m_nextpkt = nextrecord;
4265 if (nextrecord == NULL)
4266 so->so_rcv.sb_lastrecord = m;
4267 } else {
4268 so->so_rcv.sb_mb = nextrecord;
4269 SB_EMPTY_FIXUP(&so->so_rcv);
4270 }
4271 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4272 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4273 } else {
4274 /*
4275 * Stop the loop on partial copy
4276 */
4277 break;
4278 }
4279 }
4280#ifdef MORE_LOCKING_DEBUG
4281 if (so->so_usecount <= 1) {
4282 panic("%s: after big while so=%llx ref=%d on socket\n",
4283 __func__,
4284 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4285 /* NOTREACHED */
4286 }
4287#endif
4288 /*
4289 * Tell the caller we made a partial copy
4290 */
4291 if (m != NULL) {
4292 if (so->so_options & SO_DONTTRUNC) {
4293 /*
4294 * Copyout first the freelist then the partial mbuf
4295 */
4296 socket_unlock(so, 0);
4297 if (delayed_copy_len)
4298 error = sodelayed_copy_list(so, msgarray,
4299 uiocnt, &free_list, &delayed_copy_len);
4300
4301 if (error == 0) {
4302 error = uiomove(mtod(m, caddr_t), (int)len,
4303 auio);
4304 }
4305 socket_lock(so, 0);
4306 if (error)
4307 goto release;
4308
4309 m->m_data += len;
4310 m->m_len -= len;
4311 so->so_rcv.sb_cc -= len;
4312 flags |= MSG_RCVMORE;
4313 } else {
4314 (void) sbdroprecord(&so->so_rcv);
4315 nextrecord = so->so_rcv.sb_mb;
4316 m = NULL;
4317 flags |= MSG_TRUNC;
4318 }
4319 }
4320
4321 if (m == NULL) {
4322 so->so_rcv.sb_mb = nextrecord;
4323 /*
4324 * First part is an inline SB_EMPTY_FIXUP(). Second
4325 * part makes sure sb_lastrecord is up-to-date if
4326 * there is still data in the socket buffer.
4327 */
4328 if (so->so_rcv.sb_mb == NULL) {
4329 so->so_rcv.sb_mbtail = NULL;
4330 so->so_rcv.sb_lastrecord = NULL;
4331 } else if (nextrecord->m_nextpkt == NULL) {
4332 so->so_rcv.sb_lastrecord = nextrecord;
4333 }
4334 SB_MB_CHECK(&so->so_rcv);
4335 }
4336 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4337 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4338
4339 /*
4340 * We can continue to the next packet as long as:
4341 * - We haven't exhausted the uio array
4342 * - There was no error
4343 * - A packet was not truncated
4344 * - We can still receive more data
4345 */
4346 if (npkts < uiocnt && error == 0 &&
4347 (flags & (MSG_RCVMORE | MSG_TRUNC)) == 0 &&
4348 (so->so_state & SS_CANTRCVMORE) == 0) {
4349 sbunlock(&so->so_rcv, TRUE); /* keep socket locked */
4350 sblocked = 0;
4351
4352 goto next;
4353 }
4354 if (flagsp != NULL)
4355 *flagsp |= flags;
4356
4357release:
4358 /*
4359 * pru_rcvd may cause more data to be received if the socket lock
4360 * is dropped so we set MSG_HAVEMORE now based on what we know.
4361 * That way the caller won't be surprised if it receives less data
4362 * than requested.
4363 */
4364 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0)
4365 flags |= MSG_HAVEMORE;
4366
4367 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4368 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4369
4370 if (sblocked)
4371 sbunlock(&so->so_rcv, FALSE); /* will unlock socket */
4372 else
4373 socket_unlock(so, 1);
4374
4375 if (delayed_copy_len)
4376 error = sodelayed_copy_list(so, msgarray, uiocnt,
4377 &free_list, &delayed_copy_len);
4378out:
4379 /*
4380 * Amortize the cost of freeing the mbufs
4381 */
4382 if (free_list != NULL)
4383 m_freem_list(free_list);
4384 if (free_others != NULL)
4385 m_freem_list(free_others);
4386
4387 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4388 0, 0, 0, 0);
4389 return (error);
4390}
4391
4392/*
4393 * Returns: 0 Success
4394 * EINVAL
4395 * ENOTCONN
4396 * <pru_shutdown>:EINVAL
4397 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4398 * <pru_shutdown>:ENOBUFS[TCP]
4399 * <pru_shutdown>:EMSGSIZE[TCP]
4400 * <pru_shutdown>:EHOSTUNREACH[TCP]
4401 * <pru_shutdown>:ENETUNREACH[TCP]
4402 * <pru_shutdown>:ENETDOWN[TCP]
4403 * <pru_shutdown>:ENOMEM[TCP]
4404 * <pru_shutdown>:EACCES[TCP]
4405 * <pru_shutdown>:EMSGSIZE[TCP]
4406 * <pru_shutdown>:ENOBUFS[TCP]
4407 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4408 * <pru_shutdown>:??? [other protocol families]
4409 */
4410int
4411soshutdown(struct socket *so, int how)
4412{
4413 int error;
4414
4415 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4416
4417 switch (how) {
4418 case SHUT_RD:
4419 case SHUT_WR:
4420 case SHUT_RDWR:
4421 socket_lock(so, 1);
4422 if ((so->so_state &
4423 (SS_ISCONNECTED|SS_ISCONNECTING|SS_ISDISCONNECTING)) == 0) {
4424 error = ENOTCONN;
4425 } else {
4426 error = soshutdownlock(so, how);
4427 }
4428 socket_unlock(so, 1);
4429 break;
4430 default:
4431 error = EINVAL;
4432 break;
4433 }
4434
4435 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4436
4437 return (error);
4438}
4439
4440int
4441soshutdownlock_final(struct socket *so, int how)
4442{
4443 struct protosw *pr = so->so_proto;
4444 int error = 0;
4445
4446 sflt_notify(so, sock_evt_shutdown, &how);
4447
4448 if (how != SHUT_WR) {
4449 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4450 /* read already shut down */
4451 error = ENOTCONN;
4452 goto done;
4453 }
4454 sorflush(so);
4455 postevent(so, 0, EV_RCLOSED);
4456 }
4457 if (how != SHUT_RD) {
4458 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4459 /* write already shut down */
4460 error = ENOTCONN;
4461 goto done;
4462 }
4463 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4464 postevent(so, 0, EV_WCLOSED);
4465 }
4466done:
4467 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4468 return (error);
4469}
4470
/*
 * soshutdownlock
 *	Shutdown entry point for callers that already hold the socket lock.
 *	When a content filter is attached it is consulted first; otherwise
 *	(or when it lets the request through) soshutdownlock_final() does
 *	the work.
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		int cfil_rc = cfil_sock_shutdown(so, &how);

		/* EJUSTRETURN: nothing more to do now; report success */
		if (cfil_rc == EJUSTRETURN)
			return (0);
		if (cfil_rc != 0)
			return (cfil_rc);
	}
#endif /* CONTENT_FILTER */

	return (soshutdownlock_final(so, how));
}
4497
4498void
4499sowflush(struct socket *so)
4500{
4501 struct sockbuf *sb = &so->so_snd;
4502
4503 /*
4504 * Obtain lock on the socket buffer (SB_LOCK). This is required
4505 * to prevent the socket buffer from being unexpectedly altered
4506 * while it is used by another thread in socket send/receive.
4507 *
4508 * sblock() must not fail here, hence the assertion.
4509 */
4510 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4511 VERIFY(sb->sb_flags & SB_LOCK);
4512
4513 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4514 sb->sb_flags |= SB_DROP;
4515 sb->sb_upcall = NULL;
4516 sb->sb_upcallarg = NULL;
4517
4518 sbunlock(sb, TRUE); /* keep socket locked */
4519
4520 selthreadclear(&sb->sb_sel);
4521 sbrelease(sb);
4522}
4523
4524void
4525sorflush(struct socket *so)
4526{
4527 struct sockbuf *sb = &so->so_rcv;
4528 struct protosw *pr = so->so_proto;
4529 struct sockbuf asb;
4530#ifdef notyet
4531 lck_mtx_t *mutex_held;
4532 /*
4533 * XXX: This code is currently commented out, because we may get here
4534 * as part of sofreelastref(), and at that time, pr_getlock() may no
4535 * longer be able to return us the lock; this will be fixed in future.
4536 */
4537 if (so->so_proto->pr_getlock != NULL)
4538 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4539 else
4540 mutex_held = so->so_proto->pr_domain->dom_mtx;
4541
4542 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4543#endif /* notyet */
4544
4545 sflt_notify(so, sock_evt_flush_read, NULL);
4546
4547 socantrcvmore(so);
4548
4549 /*
4550 * Obtain lock on the socket buffer (SB_LOCK). This is required
4551 * to prevent the socket buffer from being unexpectedly altered
4552 * while it is used by another thread in socket send/receive.
4553 *
4554 * sblock() must not fail here, hence the assertion.
4555 */
4556 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4557 VERIFY(sb->sb_flags & SB_LOCK);
4558
4559 /*
4560 * Copy only the relevant fields from "sb" to "asb" which we
4561 * need for sbrelease() to function. In particular, skip
4562 * sb_sel as it contains the wait queue linkage, which would
4563 * wreak havoc if we were to issue selthreadclear() on "asb".
4564 * Make sure to not carry over SB_LOCK in "asb", as we need
4565 * to acquire it later as part of sbrelease().
4566 */
4567 bzero(&asb, sizeof (asb));
4568 asb.sb_cc = sb->sb_cc;
4569 asb.sb_hiwat = sb->sb_hiwat;
4570 asb.sb_mbcnt = sb->sb_mbcnt;
4571 asb.sb_mbmax = sb->sb_mbmax;
4572 asb.sb_ctl = sb->sb_ctl;
4573 asb.sb_lowat = sb->sb_lowat;
4574 asb.sb_mb = sb->sb_mb;
4575 asb.sb_mbtail = sb->sb_mbtail;
4576 asb.sb_lastrecord = sb->sb_lastrecord;
4577 asb.sb_so = sb->sb_so;
4578 asb.sb_flags = sb->sb_flags;
4579 asb.sb_flags &= ~(SB_LOCK|SB_SEL|SB_KNOTE|SB_UPCALL);
4580 asb.sb_flags |= SB_DROP;
4581
4582 /*
4583 * Ideally we'd bzero() these and preserve the ones we need;
4584 * but to do that we'd need to shuffle things around in the
4585 * sockbuf, and we can't do it now because there are KEXTS
4586 * that are directly referring to the socket structure.
4587 *
4588 * Setting SB_DROP acts as a barrier to prevent further appends.
4589 * Clearing SB_SEL is done for selthreadclear() below.
4590 */
4591 sb->sb_cc = 0;
4592 sb->sb_hiwat = 0;
4593 sb->sb_mbcnt = 0;
4594 sb->sb_mbmax = 0;
4595 sb->sb_ctl = 0;
4596 sb->sb_lowat = 0;
4597 sb->sb_mb = NULL;
4598 sb->sb_mbtail = NULL;
4599 sb->sb_lastrecord = NULL;
4600 sb->sb_timeo.tv_sec = 0;
4601 sb->sb_timeo.tv_usec = 0;
4602 sb->sb_upcall = NULL;
4603 sb->sb_upcallarg = NULL;
4604 sb->sb_flags &= ~(SB_SEL|SB_UPCALL);
4605 sb->sb_flags |= SB_DROP;
4606
4607 sbunlock(sb, TRUE); /* keep socket locked */
4608
4609 /*
4610 * Note that selthreadclear() is called on the original "sb" and
4611 * not the local "asb" because of the way wait queue linkage is
4612 * implemented. Given that selwakeup() may be triggered, SB_SEL
4613 * should no longer be set (cleared above.)
4614 */
4615 selthreadclear(&sb->sb_sel);
4616
4617 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4618 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4619
4620 sbrelease(&asb);
4621}
4622
4623/*
4624 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4625 * an additional variant to handle the case where the option value needs
4626 * to be some kind of integer, but not a specific size.
4627 * In addition to their use here, these functions are also called by the
4628 * protocol-level pr_ctloutput() routines.
4629 *
4630 * Returns: 0 Success
4631 * EINVAL
4632 * copyin:EFAULT
4633 */
4634int
4635sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4636{
4637 size_t valsize;
4638
4639 /*
4640 * If the user gives us more than we wanted, we ignore it,
4641 * but if we don't get the minimum length the caller
4642 * wants, we return EINVAL. On success, sopt->sopt_valsize
4643 * is set to however much we actually retrieved.
4644 */
4645 if ((valsize = sopt->sopt_valsize) < minlen)
4646 return (EINVAL);
4647 if (valsize > len)
4648 sopt->sopt_valsize = valsize = len;
4649
4650 if (sopt->sopt_p != kernproc)
4651 return (copyin(sopt->sopt_val, buf, valsize));
4652
4653 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4654 return (0);
4655}
4656
4657/*
4658 * sooptcopyin_timeval
4659 * Copy in a timeval value into tv_p, and take into account whether the
4660 * the calling process is 64-bit or 32-bit. Moved the sanity checking
4661 * code here so that we can verify the 64-bit tv_sec value before we lose
4662 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4663 */
4664static int
4665sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4666{
4667 int error;
4668
4669 if (proc_is64bit(sopt->sopt_p)) {
4670 struct user64_timeval tv64;
4671
4672 if (sopt->sopt_valsize < sizeof (tv64))
4673 return (EINVAL);
4674
4675 sopt->sopt_valsize = sizeof (tv64);
4676 if (sopt->sopt_p != kernproc) {
4677 error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4678 if (error != 0)
4679 return (error);
4680 } else {
4681 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4682 sizeof (tv64));
4683 }
4684 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4685 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000)
4686 return (EDOM);
4687
4688 tv_p->tv_sec = tv64.tv_sec;
4689 tv_p->tv_usec = tv64.tv_usec;
4690 } else {
4691 struct user32_timeval tv32;
4692
4693 if (sopt->sopt_valsize < sizeof (tv32))
4694 return (EINVAL);
4695
4696 sopt->sopt_valsize = sizeof (tv32);
4697 if (sopt->sopt_p != kernproc) {
4698 error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4699 if (error != 0) {
4700 return (error);
4701 }
4702 } else {
4703 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4704 sizeof (tv32));
4705 }
4706#ifndef __LP64__
4707 /*
4708 * K64todo "comparison is always false due to
4709 * limited range of data type"
4710 */
4711 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4712 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000)
4713 return (EDOM);
4714#endif
4715 tv_p->tv_sec = tv32.tv_sec;
4716 tv_p->tv_usec = tv32.tv_usec;
4717 }
4718 return (0);
4719}
4720
4721int
4722soopt_cred_check(struct socket *so, int priv, boolean_t allow_root)
4723{
4724 kauth_cred_t cred = NULL;
4725 proc_t ep = PROC_NULL;
4726 uid_t uid;
4727 int error = 0;
4728
4729 if (so->so_flags & SOF_DELEGATED) {
4730 ep = proc_find(so->e_pid);
4731 if (ep)
4732 cred = kauth_cred_proc_ref(ep);
4733 }
4734
4735 uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4736
4737 /* uid is 0 for root */
4738 if (uid != 0 || !allow_root)
4739 error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4740 if (cred)
4741 kauth_cred_unref(&cred);
4742 if (ep != PROC_NULL)
4743 proc_rele(ep);
4744
4745 return (error);
4746}
4747
/*
 * sosetoptlock
 *	Set a socket option.  When "dolock" is non-zero the socket lock is
 *	taken on entry and dropped before returning; otherwise the lock is
 *	not touched here.
 *
 *	Socket filters see the request first.  Options at a level other
 *	than SOL_SOCKET are handed to the protocol's pr_ctloutput().
 *	SOL_SOCKET options are handled by the switch below after the
 *	protocol's pru_socheckopt() has accepted them; on success the
 *	protocol's pr_ctloutput() is also called and its result ignored.
 *
 * Returns:	0			Success
 *		EINVAL
 *		ENOPROTOOPT
 *		ENOBUFS
 *		EDOM
 *		EPERM
 *		EOPNOTSUPP
 *		EBADF
 *	sooptcopyin:EINVAL
 *	sooptcopyin:EFAULT
 *	sooptcopyin_timeval:EINVAL
 *	sooptcopyin_timeval:EFAULT
 *	sooptcopyin_timeval:EDOM
 *	<pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
 *	<pr_ctloutput>:???
 *	sflt_attach_internal:???	[whatever a filter author chooses]
 *	<sf_setoption>:???		[whatever a filter author chooses]
 *
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
 *		<sf_setoption> returns depend on what the filter author causes
 *		their filter to return.
 */
int
sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
{
	int error, optval;
	struct linger l;
	struct timeval tv;
#if CONFIG_MACF_SOCKET
	struct mac extmac;
#endif /* CONFIG_MACF_SOCKET */

	/* Force the request direction to "set" regardless of the caller */
	if (sopt->sopt_dir != SOPT_SET)
		sopt->sopt_dir = SOPT_SET;

	if (dolock)
		socket_lock(so, 1);

	if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
	    (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
	    (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
		/* the socket has been shutdown, no more sockopt's */
		error = EINVAL;
		goto out;
	}

	/*
	 * Give socket filters first refusal.  Any non-zero result stops
	 * processing here; EJUSTRETURN is reported to the caller as success.
	 */
	error = sflt_setsockopt(so, sopt);
	if (error != 0) {
		if (error == EJUSTRETURN)
			error = 0;
		goto out;
	}

	if (sopt->sopt_level != SOL_SOCKET) {
		/* Not a socket-level option: entirely up to the protocol */
		if (so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			error = (*so->so_proto->pr_ctloutput)(so, sopt);
			goto out;
		}
		error = ENOPROTOOPT;
	} else {
		/*
		 * Allow socket-level (SOL_SOCKET) options to be filtered by
		 * the protocol layer, if needed.  A zero value returned from
		 * the handler means use default socket-level processing as
		 * done by the rest of this routine.  Otherwise, any other
		 * return value indicates that the option is unsupported.
		 */
		if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
		    pru_socheckopt(so, sopt)) != 0)
			goto out;

		error = 0;
		switch (sopt->sopt_name) {
		case SO_LINGER:
		case SO_LINGER_SEC:
			error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
			if (error != 0)
				goto out;

			/*
			 * SO_LINGER stores l_linger as given; SO_LINGER_SEC
			 * scales it by hz first.
			 *
			 * NOTE(review): neither the multiplication nor the
			 * assignment is range-checked here -- confirm that
			 * so_linger can hold the result for large l_linger.
			 */
			so->so_linger = (sopt->sopt_name == SO_LINGER) ?
			    l.l_linger : l.l_linger * hz;
			if (l.l_onoff != 0)
				so->so_options |= SO_LINGER;
			else
				so->so_options &= ~SO_LINGER;
			break;

		/*
		 * Boolean options whose option name is used directly as the
		 * bit to set or clear in so_options.
		 */
		case SO_DEBUG:
		case SO_KEEPALIVE:
		case SO_DONTROUTE:
		case SO_USELOOPBACK:
		case SO_BROADCAST:
		case SO_REUSEADDR:
		case SO_REUSEPORT:
		case SO_OOBINLINE:
		case SO_TIMESTAMP:
		case SO_TIMESTAMP_MONOTONIC:
		case SO_TIMESTAMP_CONTINUOUS:
		case SO_DONTTRUNC:
		case SO_WANTMORE:
		case SO_WANTOOBFLAG:
		case SO_NOWAKEFROMSLEEP:
		case SO_NOAPNFALLBK:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval)
				so->so_options |= sopt->sopt_name;
			else
				so->so_options &= ~sopt->sopt_name;
			break;

		case SO_SNDBUF:
		case SO_RCVBUF:
		case SO_SNDLOWAT:
		case SO_RCVLOWAT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;

			/*
			 * Values < 1 make no sense for any of these
			 * options, so disallow them.
			 */
			if (optval < 1) {
				error = EINVAL;
				goto out;
			}

			switch (sopt->sopt_name) {
			case SO_SNDBUF:
			case SO_RCVBUF: {
				struct sockbuf *sb =
				    (sopt->sopt_name == SO_SNDBUF) ?
				    &so->so_snd : &so->so_rcv;
				if (sbreserve(sb, (u_int32_t)optval) == 0) {
					error = ENOBUFS;
					goto out;
				}
				/*
				 * Record that the size was chosen by the
				 * user and turn automatic sizing off.
				 */
				sb->sb_flags |= SB_USRSIZE;
				sb->sb_flags &= ~SB_AUTOSIZE;
				sb->sb_idealsize = (u_int32_t)optval;
				break;
			}
			/*
			 * Make sure the low-water is never greater than
			 * the high-water.
			 */
			case SO_SNDLOWAT: {
				int space = sbspace(&so->so_snd);
				u_int32_t hiwat = so->so_snd.sb_hiwat;

				/*
				 * For SB_UNIX send buffers the peer's unp_cc
				 * is added to the high-water mark used as
				 * the clamp.
				 */
				if (so->so_snd.sb_flags & SB_UNIX) {
					struct unpcb *unp =
					    (struct unpcb *)(so->so_pcb);
					if (unp != NULL &&
					    unp->unp_conn != NULL) {
						hiwat += unp->unp_conn->unp_cc;
					}
				}

				so->so_snd.sb_lowat =
				    (optval > hiwat) ?
				    hiwat : optval;

				/*
				 * Wake writers if the space sampled above
				 * already satisfies the new low-water mark.
				 */
				if (space >= so->so_snd.sb_lowat) {
					sowwakeup(so);
				}
				break;
			}
			case SO_RCVLOWAT: {
				int64_t data_len;
				so->so_rcv.sb_lowat =
				    (optval > so->so_rcv.sb_hiwat) ?
				    so->so_rcv.sb_hiwat : optval;
				/*
				 * Wake readers if the buffered bytes, less
				 * sb_ctl, already reach the new low-water
				 * mark.
				 */
				data_len = so->so_rcv.sb_cc
				    - so->so_rcv.sb_ctl;
				if (data_len >= so->so_rcv.sb_lowat)
					sorwakeup(so);
				break;
			}
			}
			break;

		case SO_SNDTIMEO:
		case SO_RCVTIMEO:
			error = sooptcopyin_timeval(sopt, &tv);
			if (error != 0)
				goto out;

			switch (sopt->sopt_name) {
			case SO_SNDTIMEO:
				so->so_snd.sb_timeo = tv;
				break;
			case SO_RCVTIMEO:
				so->so_rcv.sb_timeo = tv;
				break;
			}
			break;

		case SO_NKE: {
			struct so_nke nke;

			error = sooptcopyin(sopt, &nke, sizeof (nke),
			    sizeof (nke));
			if (error != 0)
				goto out;

			error = sflt_attach_internal(so, nke.nke_handle);
			break;
		}

		case SO_NOSIGPIPE:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOSIGPIPE;
			else
				so->so_flags &= ~SOF_NOSIGPIPE;
			break;

		case SO_NOADDRERR:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOADDRAVAIL;
			else
				so->so_flags &= ~SOF_NOADDRAVAIL;
			break;

		case SO_REUSESHAREUID:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_REUSESHAREUID;
			else
				so->so_flags &= ~SOF_REUSESHAREUID;
			break;

		case SO_NOTIFYCONFLICT:
			/* Superuser only; checked before the value is read */
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_NOTIFYCONFLICT;
			else
				so->so_flags &= ~SOF_NOTIFYCONFLICT;
			break;

		case SO_RESTRICTIONS:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;

			error = so_set_restrictions(so, optval);
			break;

		case SO_AWDL_UNRESTRICTED:
			if (SOCK_DOM(so) != PF_INET &&
			    SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			/*
			 * Setting requires PRIV_NET_RESTRICTED_AWDL (root is
			 * not exempt); clearing is unconditional.
			 */
			if (optval != 0) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_AWDL, false);
				if (error == 0)
					inp_set_awdl_unrestricted(
					    sotoinpcb(so));
			} else
				inp_clear_awdl_unrestricted(sotoinpcb(so));
			break;
		case SO_INTCOPROC_ALLOW:
			if (SOCK_DOM(so) != PF_INET6) {
				error = EOPNOTSUPP;
				goto out;
			}
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			/*
			 * The privilege check is only performed when the
			 * allowance is not already in place.
			 */
			if (optval != 0 &&
			    inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
				error = soopt_cred_check(so,
				    PRIV_NET_RESTRICTED_INTCOPROC, false);
				if (error == 0)
					inp_set_intcoproc_allowed(
					    sotoinpcb(so));
			} else if (optval == 0)
				inp_clear_intcoproc_allowed(sotoinpcb(so));
			break;

		case SO_LABEL:
#if CONFIG_MACF_SOCKET
			if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
			    sizeof (extmac))) != 0)
				goto out;

			error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
			    so, &extmac);
#else
			error = EOPNOTSUPP;
#endif /* CONFIG_MACF_SOCKET */
			break;

		case SO_UPCALLCLOSEWAIT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_UPCALLCLOSEWAIT;
			else
				so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
			break;

		case SO_RANDOMPORT:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval != 0)
				so->so_flags |= SOF_BINDRANDOMPORT;
			else
				so->so_flags &= ~SOF_BINDRANDOMPORT;
			break;

		case SO_NP_EXTENSIONS: {
			struct so_np_extensions sonpx;

			error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
			    sizeof (sonpx));
			if (error != 0)
				goto out;
			if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
				error = EINVAL;
				goto out;
			}
			/*
			 * Only one bit defined for now
			 */
			if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
				if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
					so->so_flags |= SOF_NPX_SETOPTSHUT;
				else
					so->so_flags &= ~SOF_NPX_SETOPTSHUT;
			}
			break;
		}

		case SO_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			/*
			 * Values at or above SO_TC_NET_SERVICE_OFFSET encode
			 * a network service type.  This path leaves through
			 * "out", so the trailing pr_ctloutput() call at the
			 * end of the switch is skipped for it.
			 */
			if (optval >= SO_TC_NET_SERVICE_OFFSET) {
				int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
				error = so_set_net_service_type(so, netsvc);
				goto out;
			}
			error = so_set_traffic_class(so, optval);
			if (error != 0)
				goto out;
			/* A plain traffic class resets the service type */
			so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
			so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
			break;
		}

		case SO_RECV_TRAFFIC_CLASS: {
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
			else
				so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
			break;
		}

#if (DEVELOPMENT || DEBUG)
		case SO_TRAFFIC_CLASS_DBG: {
			struct so_tcdbg so_tcdbg;

			error = sooptcopyin(sopt, &so_tcdbg,
			    sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
			if (error != 0)
				goto out;
			error = so_set_tcdbg(so, &so_tcdbg);
			if (error != 0)
				goto out;
			break;
		}
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_PRIVILEGED_TRAFFIC_CLASS:
			/* Privilege is checked before the value is read */
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, 0);
			if (error != 0)
				goto out;
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
			else
				so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
			break;

#if (DEVELOPMENT || DEBUG)
		case SO_DEFUNCTIT:
			error = sosetdefunct(current_proc(), so, 0, FALSE);
			if (error == 0)
				error = sodefunct(current_proc(), so, 0);

			break;
#endif /* (DEVELOPMENT || DEBUG) */

		case SO_DEFUNCTOK:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			/* A socket that is already defunct yields EBADF */
			if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
				if (error == 0)
					error = EBADF;
				goto out;
			}
			/*
			 * Any process can set SO_DEFUNCTOK (clear
			 * SOF_NODEFUNCT), but only root can clear
			 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
			 */
			if (optval == 0 &&
			    kauth_cred_issuser(kauth_cred_get()) == 0) {
				error = EPERM;
				goto out;
			}
			if (optval)
				so->so_flags &= ~SOF_NODEFUNCT;
			else
				so->so_flags |= SOF_NODEFUNCT;

			/* The rest of this case only logs the new state */
			if (SOCK_DOM(so) == PF_INET ||
			    SOCK_DOM(so) == PF_INET6) {
				char s[MAX_IPv6_STR_LEN];
				char d[MAX_IPv6_STR_LEN];
				struct inpcb *inp = sotoinpcb(so);

				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
				    "[%s %s:%d -> %s:%d] is now marked "
				    "as %seligible for "
				    "defunct\n", __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    (SOCK_TYPE(so) == SOCK_STREAM) ?
				    "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
				    ((SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_laddr.s_addr :
				    (void *)&inp->in6p_laddr), s, sizeof (s)),
				    ntohs(inp->in6p_lport),
				    inet_ntop(SOCK_DOM(so),
				    (SOCK_DOM(so) == PF_INET) ?
				    (void *)&inp->inp_faddr.s_addr :
				    (void *)&inp->in6p_faddr, d, sizeof (d)),
				    ntohs(inp->in6p_fport),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			} else {
				SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
				    "is now marked as %seligible for "
				    "defunct\n",
				    __func__, proc_selfpid(),
				    proc_best_name(current_proc()),
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
				    SOCK_DOM(so), SOCK_TYPE(so),
				    (so->so_flags & SOF_NODEFUNCT) ?
				    "not " : "");
			}
			break;

		case SO_ISDEFUNCT:
			/* This option is not settable */
			error = EINVAL;
			break;

		case SO_OPPORTUNISTIC:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error == 0)
				error = so_set_opportunistic(so, optval);
			break;

		case SO_FLUSH:
			/* This option is handled by lower layer(s) */
			error = 0;
			break;

		case SO_RECV_ANYIF:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error == 0)
				error = so_set_recv_anyif(so, optval);
			break;

		case SO_TRAFFIC_MGT_BACKGROUND: {
			/* This option is handled by lower layer(s) */
			error = 0;
			break;
		}

#if FLOW_DIVERT
		case SO_FLOW_DIVERT_TOKEN:
			error = flow_divert_token_set(so, sopt);
			break;
#endif	/* FLOW_DIVERT */


		case SO_DELEGATED:
			if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval))) != 0)
				break;

			error = so_set_effective_pid(so, optval, sopt->sopt_p);
			break;

		case SO_DELEGATED_UUID: {
			uuid_t euuid;

			if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
			    sizeof (euuid))) != 0)
				break;

			error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
			break;
		}

#if NECP
		case SO_NECP_ATTRIBUTES:
			error = necp_set_socket_attributes(so, sopt);
			break;

		case SO_NECP_CLIENTUUID:
			if (SOCK_DOM(so) == PF_MULTIPATH) {
				/* Handled by MPTCP itself */
				break;
			}

			if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
				error = EINVAL;
				goto out;
			}

			struct inpcb *inp = sotoinpcb(so);
			if (!uuid_is_null(inp->necp_client_uuid)) {
				// Clear out the old client UUID if present
				necp_inpcb_remove_cb(inp);
			}

			/*
			 * The new UUID is copied straight into the inpcb, so
			 * the failure paths below leave through "out" after
			 * the old callback has already been removed.
			 */
			error = sooptcopyin(sopt, &inp->necp_client_uuid,
			    sizeof(uuid_t), sizeof(uuid_t));
			if (error != 0) {
				goto out;
			}

			if (uuid_is_null(inp->necp_client_uuid)) {
				error = EINVAL;
				goto out;
			}

			error = necp_client_register_socket_flow(so->last_pid,
			    inp->necp_client_uuid, inp);
			if (error != 0) {
				uuid_clear(inp->necp_client_uuid);
				goto out;
			}

			if (inp->inp_lport != 0) {
				// There is bound local port, so this is not
				// a fresh socket. Assign to the client.
				necp_client_assign_from_socket(so->last_pid, inp->necp_client_uuid, inp);
			}

			break;
#endif /* NECP */

		case SO_EXTENDED_BK_IDLE:
			error = sooptcopyin(sopt, &optval, sizeof (optval),
			    sizeof (optval));
			if (error == 0)
				error = so_set_extended_bk_idle(so, optval);
			break;

		case SO_MARK_CELLFALLBACK:
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			if (optval < 0) {
				error = EINVAL;
				goto out;
			}
			if (optval == 0)
				so->so_flags1 &= ~SOF1_CELLFALLBACK;
			else
				so->so_flags1 |= SOF1_CELLFALLBACK;
			break;

		case SO_NET_SERVICE_TYPE: {
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			error = so_set_net_service_type(so, optval);
			break;
		}

		case SO_QOSMARKING_POLICY_OVERRIDE:
			/* Privilege is checked before the value is read */
			error = priv_check_cred(kauth_cred_get(),
			    PRIV_NET_QOSMARKING_POLICY_OVERRIDE, 0);
			if (error != 0)
				goto out;
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
			if (error != 0)
				goto out;
			if (optval == 0)
				so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
			else
				so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
			break;

		default:
			error = ENOPROTOOPT;
			break;
		}
		/*
		 * After a successful socket-level update, also pass the
		 * request to the protocol; its return value is deliberately
		 * ignored.  Cases that left the switch via "goto out" do not
		 * reach this point.
		 */
		if (error == 0 && so->so_proto != NULL &&
		    so->so_proto->pr_ctloutput != NULL) {
			(void) so->so_proto->pr_ctloutput(so, sopt);
		}
	}
out:
	if (dolock)
		socket_unlock(so, 1);
	return (error);
}
5410
5411/* Helper routines for getsockopt */
5412int
5413sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5414{
5415 int error;
5416 size_t valsize;
5417
5418 error = 0;
5419
5420 /*
5421 * Documented get behavior is that we always return a value,
5422 * possibly truncated to fit in the user's buffer.
5423 * Traditional behavior is that we always tell the user
5424 * precisely how much we copied, rather than something useful
5425 * like the total amount we had available for her.
5426 * Note that this interface is not idempotent; the entire answer must
5427 * generated ahead of time.
5428 */
5429 valsize = min(len, sopt->sopt_valsize);
5430 sopt->sopt_valsize = valsize;
5431 if (sopt->sopt_val != USER_ADDR_NULL) {
5432 if (sopt->sopt_p != kernproc)
5433 error = copyout(buf, sopt->sopt_val, valsize);
5434 else
5435 bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5436 }
5437 return (error);
5438}
5439
5440static int
5441sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5442{
5443 int error;
5444 size_t len;
5445 struct user64_timeval tv64 = {};
5446 struct user32_timeval tv32 = {};
5447 const void * val;
5448 size_t valsize;
5449
5450 error = 0;
5451 if (proc_is64bit(sopt->sopt_p)) {
5452 len = sizeof (tv64);
5453 tv64.tv_sec = tv_p->tv_sec;
5454 tv64.tv_usec = tv_p->tv_usec;
5455 val = &tv64;
5456 } else {
5457 len = sizeof (tv32);
5458 tv32.tv_sec = tv_p->tv_sec;
5459 tv32.tv_usec = tv_p->tv_usec;
5460 val = &tv32;
5461 }
5462 valsize = min(len, sopt->sopt_valsize);
5463 sopt->sopt_valsize = valsize;
5464 if (sopt->sopt_val != USER_ADDR_NULL) {
5465 if (sopt->sopt_p != kernproc)
5466 error = copyout(val, sopt->sopt_val, valsize);
5467 else
5468 bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5469 }
5470 return (error);
5471}
5472
5473/*
5474 * Return: 0 Success
5475 * ENOPROTOOPT
5476 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5477 * <pr_ctloutput>:???
5478 * <sf_getoption>:???
5479 */
5480int
5481sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5482{
5483 int error, optval;
5484 struct linger l;
5485 struct timeval tv;
5486#if CONFIG_MACF_SOCKET
5487 struct mac extmac;
5488#endif /* MAC_SOCKET */
5489
5490 if (sopt->sopt_dir != SOPT_GET)
5491 sopt->sopt_dir = SOPT_GET;
5492
5493 if (dolock)
5494 socket_lock(so, 1);
5495
5496 error = sflt_getsockopt(so, sopt);
5497 if (error != 0) {
5498 if (error == EJUSTRETURN)
5499 error = 0;
5500 goto out;
5501 }
5502
5503 if (sopt->sopt_level != SOL_SOCKET) {
5504 if (so->so_proto != NULL &&
5505 so->so_proto->pr_ctloutput != NULL) {
5506 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5507 goto out;
5508 }
5509 error = ENOPROTOOPT;
5510 } else {
5511 /*
5512 * Allow socket-level (SOL_SOCKET) options to be filtered by
5513 * the protocol layer, if needed. A zero value returned from
5514 * the handler means use default socket-level processing as
5515 * done by the rest of this routine. Otherwise, any other
5516 * return value indicates that the option is unsupported.
5517 */
5518 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5519 pru_socheckopt(so, sopt)) != 0)
5520 goto out;
5521
5522 error = 0;
5523 switch (sopt->sopt_name) {
5524 case SO_LINGER:
5525 case SO_LINGER_SEC:
5526 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5527 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5528 so->so_linger : so->so_linger / hz;
5529 error = sooptcopyout(sopt, &l, sizeof (l));
5530 break;
5531
5532 case SO_USELOOPBACK:
5533 case SO_DONTROUTE:
5534 case SO_DEBUG:
5535 case SO_KEEPALIVE:
5536 case SO_REUSEADDR:
5537 case SO_REUSEPORT:
5538 case SO_BROADCAST:
5539 case SO_OOBINLINE:
5540 case SO_TIMESTAMP:
5541 case SO_TIMESTAMP_MONOTONIC:
5542 case SO_TIMESTAMP_CONTINUOUS:
5543 case SO_DONTTRUNC:
5544 case SO_WANTMORE:
5545 case SO_WANTOOBFLAG:
5546 case SO_NOWAKEFROMSLEEP:
5547 case SO_NOAPNFALLBK:
5548 optval = so->so_options & sopt->sopt_name;
5549integer:
5550 error = sooptcopyout(sopt, &optval, sizeof (optval));
5551 break;
5552
5553 case SO_TYPE:
5554 optval = so->so_type;
5555 goto integer;
5556
5557 case SO_NREAD:
5558 if (so->so_proto->pr_flags & PR_ATOMIC) {
5559 int pkt_total;
5560 struct mbuf *m1;
5561
5562 pkt_total = 0;
5563 m1 = so->so_rcv.sb_mb;
5564 while (m1 != NULL) {
5565 if (m1->m_type == MT_DATA ||
5566 m1->m_type == MT_HEADER ||
5567 m1->m_type == MT_OOBDATA)
5568 pkt_total += m1->m_len;
5569 m1 = m1->m_next;
5570 }
5571 optval = pkt_total;
5572 } else {
5573 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5574 }
5575 goto integer;
5576
5577 case SO_NUMRCVPKT:
5578 if (so->so_proto->pr_flags & PR_ATOMIC) {
5579 int cnt = 0;
5580 struct mbuf *m1;
5581
5582 m1 = so->so_rcv.sb_mb;
5583 while (m1 != NULL) {
5584 if (m1->m_type == MT_DATA ||
5585 m1->m_type == MT_HEADER ||
5586 m1->m_type == MT_OOBDATA)
5587 cnt += 1;
5588 m1 = m1->m_nextpkt;
5589 }
5590 optval = cnt;
5591 goto integer;
5592 } else {
5593 error = EINVAL;
5594 break;
5595 }
5596
5597 case SO_NWRITE:
5598 optval = so->so_snd.sb_cc;
5599 goto integer;
5600
5601 case SO_ERROR:
5602 optval = so->so_error;
5603 so->so_error = 0;
5604 goto integer;
5605
5606 case SO_SNDBUF: {
5607 u_int32_t hiwat = so->so_snd.sb_hiwat;
5608
5609 if (so->so_snd.sb_flags & SB_UNIX) {
5610 struct unpcb *unp =
5611 (struct unpcb *)(so->so_pcb);
5612 if (unp != NULL && unp->unp_conn != NULL) {
5613 hiwat += unp->unp_conn->unp_cc;
5614 }
5615 }
5616
5617 optval = hiwat;
5618 goto integer;
5619 }
5620 case SO_RCVBUF:
5621 optval = so->so_rcv.sb_hiwat;
5622 goto integer;
5623
5624 case SO_SNDLOWAT:
5625 optval = so->so_snd.sb_lowat;
5626 goto integer;
5627
5628 case SO_RCVLOWAT:
5629 optval = so->so_rcv.sb_lowat;
5630 goto integer;
5631
5632 case SO_SNDTIMEO:
5633 case SO_RCVTIMEO:
5634 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5635 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5636
5637 error = sooptcopyout_timeval(sopt, &tv);
5638 break;
5639
5640 case SO_NOSIGPIPE:
5641 optval = (so->so_flags & SOF_NOSIGPIPE);
5642 goto integer;
5643
5644 case SO_NOADDRERR:
5645 optval = (so->so_flags & SOF_NOADDRAVAIL);
5646 goto integer;
5647
5648 case SO_REUSESHAREUID:
5649 optval = (so->so_flags & SOF_REUSESHAREUID);
5650 goto integer;
5651
5652
5653 case SO_NOTIFYCONFLICT:
5654 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5655 goto integer;
5656
5657 case SO_RESTRICTIONS:
5658 optval = so_get_restrictions(so);
5659 goto integer;
5660
5661 case SO_AWDL_UNRESTRICTED:
5662 if (SOCK_DOM(so) == PF_INET ||
5663 SOCK_DOM(so) == PF_INET6) {
5664 optval = inp_get_awdl_unrestricted(
5665 sotoinpcb(so));
5666 goto integer;
5667 } else
5668 error = EOPNOTSUPP;
5669 break;
5670
5671 case SO_INTCOPROC_ALLOW:
5672 if (SOCK_DOM(so) == PF_INET6) {
5673 optval = inp_get_intcoproc_allowed(
5674 sotoinpcb(so));
5675 goto integer;
5676 } else
5677 error = EOPNOTSUPP;
5678 break;
5679
5680 case SO_LABEL:
5681#if CONFIG_MACF_SOCKET
5682 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5683 sizeof (extmac))) != 0 ||
5684 (error = mac_socket_label_get(proc_ucred(
5685 sopt->sopt_p), so, &extmac)) != 0)
5686 break;
5687
5688 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5689#else
5690 error = EOPNOTSUPP;
5691#endif /* MAC_SOCKET */
5692 break;
5693
5694 case SO_PEERLABEL:
5695#if CONFIG_MACF_SOCKET
5696 if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5697 sizeof (extmac))) != 0 ||
5698 (error = mac_socketpeer_label_get(proc_ucred(
5699 sopt->sopt_p), so, &extmac)) != 0)
5700 break;
5701
5702 error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5703#else
5704 error = EOPNOTSUPP;
5705#endif /* MAC_SOCKET */
5706 break;
5707
5708#ifdef __APPLE_API_PRIVATE
5709 case SO_UPCALLCLOSEWAIT:
5710 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5711 goto integer;
5712#endif
5713 case SO_RANDOMPORT:
5714 optval = (so->so_flags & SOF_BINDRANDOMPORT);
5715 goto integer;
5716
5717 case SO_NP_EXTENSIONS: {
5718 struct so_np_extensions sonpx = {};
5719
5720 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5721 SONPX_SETOPTSHUT : 0;
5722 sonpx.npx_mask = SONPX_MASK_VALID;
5723
5724 error = sooptcopyout(sopt, &sonpx,
5725 sizeof (struct so_np_extensions));
5726 break;
5727 }
5728
5729 case SO_TRAFFIC_CLASS:
5730 optval = so->so_traffic_class;
5731 goto integer;
5732
5733 case SO_RECV_TRAFFIC_CLASS:
5734 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5735 goto integer;
5736
5737 case SO_TRAFFIC_CLASS_STATS:
5738 error = sooptcopyout(sopt, &so->so_tc_stats,
5739 sizeof (so->so_tc_stats));
5740 break;
5741
5742#if (DEVELOPMENT || DEBUG)
5743 case SO_TRAFFIC_CLASS_DBG:
5744 error = sogetopt_tcdbg(so, sopt);
5745 break;
5746#endif /* (DEVELOPMENT || DEBUG) */
5747
5748 case SO_PRIVILEGED_TRAFFIC_CLASS:
5749 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5750 goto integer;
5751
5752 case SO_DEFUNCTOK:
5753 optval = !(so->so_flags & SOF_NODEFUNCT);
5754 goto integer;
5755
5756 case SO_ISDEFUNCT:
5757 optval = (so->so_flags & SOF_DEFUNCT);
5758 goto integer;
5759
5760 case SO_OPPORTUNISTIC:
5761 optval = so_get_opportunistic(so);
5762 goto integer;
5763
5764 case SO_FLUSH:
5765 /* This option is not gettable */
5766 error = EINVAL;
5767 break;
5768
5769 case SO_RECV_ANYIF:
5770 optval = so_get_recv_anyif(so);
5771 goto integer;
5772
5773 case SO_TRAFFIC_MGT_BACKGROUND:
5774 /* This option is handled by lower layer(s) */
5775 if (so->so_proto != NULL &&
5776 so->so_proto->pr_ctloutput != NULL) {
5777 (void) so->so_proto->pr_ctloutput(so, sopt);
5778 }
5779 break;
5780
5781#if FLOW_DIVERT
5782 case SO_FLOW_DIVERT_TOKEN:
5783 error = flow_divert_token_get(so, sopt);
5784 break;
5785#endif /* FLOW_DIVERT */
5786
5787#if NECP
5788 case SO_NECP_ATTRIBUTES:
5789 error = necp_get_socket_attributes(so, sopt);
5790 break;
5791
5792 case SO_NECP_CLIENTUUID:
5793 {
5794 uuid_t *ncu;
5795
5796 if (SOCK_DOM(so) == PF_MULTIPATH) {
5797 ncu = &mpsotomppcb(so)->necp_client_uuid;
5798 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
5799 ncu = &sotoinpcb(so)->necp_client_uuid;
5800 } else {
5801 error = EINVAL;
5802 goto out;
5803 }
5804
5805 error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
5806 break;
5807 }
5808#endif /* NECP */
5809
5810#if CONTENT_FILTER
5811 case SO_CFIL_SOCK_ID: {
5812 cfil_sock_id_t sock_id;
5813
5814 sock_id = cfil_sock_id_from_socket(so);
5815
5816 error = sooptcopyout(sopt, &sock_id,
5817 sizeof(cfil_sock_id_t));
5818 break;
5819 }
5820#endif /* CONTENT_FILTER */
5821
5822 case SO_EXTENDED_BK_IDLE:
5823 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5824 goto integer;
5825 case SO_MARK_CELLFALLBACK:
5826 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
5827 ? 1 : 0;
5828 goto integer;
5829 case SO_NET_SERVICE_TYPE: {
5830 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5831 optval = so->so_netsvctype;
5832 else
5833 optval = NET_SERVICE_TYPE_BE;
5834 goto integer;
5835 }
5836 case SO_NETSVC_MARKING_LEVEL:
5837 optval = so_get_netsvc_marking_level(so);
5838 goto integer;
5839
5840 default:
5841 error = ENOPROTOOPT;
5842 break;
5843 }
5844 }
5845out:
5846 if (dolock)
5847 socket_unlock(so, 1);
5848 return (error);
5849}
5850
5851/*
5852 * The size limits on our soopt_getm is different from that on FreeBSD.
5853 * We limit the size of options to MCLBYTES. This will have to change
5854 * if we need to define options that need more space than MCLBYTES.
5855 */
5856int
5857soopt_getm(struct sockopt *sopt, struct mbuf **mp)
5858{
5859 struct mbuf *m, *m_prev;
5860 int sopt_size = sopt->sopt_valsize;
5861 int how;
5862
5863 if (sopt_size <= 0 || sopt_size > MCLBYTES)
5864 return (EMSGSIZE);
5865
5866 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5867 MGET(m, how, MT_DATA);
5868 if (m == NULL)
5869 return (ENOBUFS);
5870 if (sopt_size > MLEN) {
5871 MCLGET(m, how);
5872 if ((m->m_flags & M_EXT) == 0) {
5873 m_free(m);
5874 return (ENOBUFS);
5875 }
5876 m->m_len = min(MCLBYTES, sopt_size);
5877 } else {
5878 m->m_len = min(MLEN, sopt_size);
5879 }
5880 sopt_size -= m->m_len;
5881 *mp = m;
5882 m_prev = m;
5883
5884 while (sopt_size > 0) {
5885 MGET(m, how, MT_DATA);
5886 if (m == NULL) {
5887 m_freem(*mp);
5888 return (ENOBUFS);
5889 }
5890 if (sopt_size > MLEN) {
5891 MCLGET(m, how);
5892 if ((m->m_flags & M_EXT) == 0) {
5893 m_freem(*mp);
5894 m_freem(m);
5895 return (ENOBUFS);
5896 }
5897 m->m_len = min(MCLBYTES, sopt_size);
5898 } else {
5899 m->m_len = min(MLEN, sopt_size);
5900 }
5901 sopt_size -= m->m_len;
5902 m_prev->m_next = m;
5903 m_prev = m;
5904 }
5905 return (0);
5906}
5907
5908/* copyin sopt data into mbuf chain */
5909int
5910soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
5911{
5912 struct mbuf *m0 = m;
5913
5914 if (sopt->sopt_val == USER_ADDR_NULL)
5915 return (0);
5916 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5917 if (sopt->sopt_p != kernproc) {
5918 int error;
5919
5920 error = copyin(sopt->sopt_val, mtod(m, char *),
5921 m->m_len);
5922 if (error != 0) {
5923 m_freem(m0);
5924 return (error);
5925 }
5926 } else {
5927 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5928 mtod(m, char *), m->m_len);
5929 }
5930 sopt->sopt_valsize -= m->m_len;
5931 sopt->sopt_val += m->m_len;
5932 m = m->m_next;
5933 }
5934 /* should be allocated enoughly at ip6_sooptmcopyin() */
5935 if (m != NULL) {
5936 panic("soopt_mcopyin");
5937 /* NOTREACHED */
5938 }
5939 return (0);
5940}
5941
5942/* copyout mbuf chain data into soopt */
5943int
5944soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
5945{
5946 struct mbuf *m0 = m;
5947 size_t valsize = 0;
5948
5949 if (sopt->sopt_val == USER_ADDR_NULL)
5950 return (0);
5951 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5952 if (sopt->sopt_p != kernproc) {
5953 int error;
5954
5955 error = copyout(mtod(m, char *), sopt->sopt_val,
5956 m->m_len);
5957 if (error != 0) {
5958 m_freem(m0);
5959 return (error);
5960 }
5961 } else {
5962 bcopy(mtod(m, char *),
5963 CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5964 }
5965 sopt->sopt_valsize -= m->m_len;
5966 sopt->sopt_val += m->m_len;
5967 valsize += m->m_len;
5968 m = m->m_next;
5969 }
5970 if (m != NULL) {
5971 /* enough soopt buffer should be given from user-land */
5972 m_freem(m0);
5973 return (EINVAL);
5974 }
5975 sopt->sopt_valsize = valsize;
5976 return (0);
5977}
5978
5979void
5980sohasoutofband(struct socket *so)
5981{
5982 if (so->so_pgid < 0)
5983 gsignal(-so->so_pgid, SIGURG);
5984 else if (so->so_pgid > 0)
5985 proc_signal(so->so_pgid, SIGURG);
5986 selwakeup(&so->so_rcv.sb_sel);
5987 if (so->so_rcv.sb_flags & SB_KNOTE) {
5988 KNOTE(&so->so_rcv.sb_sel.si_note,
5989 (NOTE_OOB | SO_FILT_HINT_LOCKED));
5990 }
5991}
5992
5993int
5994sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
5995{
5996#pragma unused(cred)
5997 struct proc *p = current_proc();
5998 int revents = 0;
5999
6000 socket_lock(so, 1);
6001 so_update_last_owner_locked(so, PROC_NULL);
6002 so_update_policy(so);
6003
6004 if (events & (POLLIN | POLLRDNORM))
6005 if (soreadable(so))
6006 revents |= events & (POLLIN | POLLRDNORM);
6007
6008 if (events & (POLLOUT | POLLWRNORM))
6009 if (sowriteable(so))
6010 revents |= events & (POLLOUT | POLLWRNORM);
6011
6012 if (events & (POLLPRI | POLLRDBAND))
6013 if (so->so_oobmark || (so->so_state & SS_RCVATMARK))
6014 revents |= events & (POLLPRI | POLLRDBAND);
6015
6016 if (revents == 0) {
6017 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6018 /*
6019 * Darwin sets the flag first,
6020 * BSD calls selrecord first
6021 */
6022 so->so_rcv.sb_flags |= SB_SEL;
6023 selrecord(p, &so->so_rcv.sb_sel, wql);
6024 }
6025
6026 if (events & (POLLOUT | POLLWRNORM)) {
6027 /*
6028 * Darwin sets the flag first,
6029 * BSD calls selrecord first
6030 */
6031 so->so_snd.sb_flags |= SB_SEL;
6032 selrecord(p, &so->so_snd.sb_sel, wql);
6033 }
6034 }
6035
6036 socket_unlock(so, 1);
6037 return (revents);
6038}
6039
6040int
6041soo_kqfilter(struct fileproc *fp, struct knote *kn,
6042 struct kevent_internal_s *kev, vfs_context_t ctx)
6043{
6044#pragma unused(fp)
6045#if !CONFIG_MACF_SOCKET
6046#pragma unused(ctx)
6047#endif /* MAC_SOCKET */
6048 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6049 int result;
6050
6051 socket_lock(so, 1);
6052 so_update_last_owner_locked(so, PROC_NULL);
6053 so_update_policy(so);
6054
6055#if CONFIG_MACF_SOCKET
6056 if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
6057 kn, so) != 0) {
6058 socket_unlock(so, 1);
6059 kn->kn_flags = EV_ERROR;
6060 kn->kn_data = EPERM;
6061 return 0;
6062 }
6063#endif /* MAC_SOCKET */
6064
6065 switch (kn->kn_filter) {
6066 case EVFILT_READ:
6067 kn->kn_filtid = EVFILTID_SOREAD;
6068 break;
6069 case EVFILT_WRITE:
6070 kn->kn_filtid = EVFILTID_SOWRITE;
6071 break;
6072 case EVFILT_SOCK:
6073 kn->kn_filtid = EVFILTID_SCK;
6074 break;
6075 case EVFILT_EXCEPT:
6076 kn->kn_filtid = EVFILTID_SOEXCEPT;
6077 break;
6078 default:
6079 socket_unlock(so, 1);
6080 kn->kn_flags = EV_ERROR;
6081 kn->kn_data = EINVAL;
6082 return 0;
6083 }
6084
6085 /*
6086 * call the appropriate sub-filter attach
6087 * with the socket still locked
6088 */
6089 result = knote_fops(kn)->f_attach(kn, kev);
6090
6091 socket_unlock(so, 1);
6092
6093 return result;
6094}
6095
6096static int
6097filt_soread_common(struct knote *kn, struct socket *so)
6098{
6099 if (so->so_options & SO_ACCEPTCONN) {
6100 int is_not_empty;
6101
6102 /*
6103 * Radar 6615193 handle the listen case dynamically
6104 * for kqueue read filter. This allows to call listen()
6105 * after registering the kqueue EVFILT_READ.
6106 */
6107
6108 kn->kn_data = so->so_qlen;
6109 is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
6110
6111 return (is_not_empty);
6112 }
6113
6114 /* socket isn't a listener */
6115 /*
6116 * NOTE_LOWAT specifies new low water mark in data, i.e.
6117 * the bytes of protocol data. We therefore exclude any
6118 * control bytes.
6119 */
6120 kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6121
6122 if (kn->kn_sfflags & NOTE_OOB) {
6123 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6124 kn->kn_fflags |= NOTE_OOB;
6125 kn->kn_data -= so->so_oobmark;
6126 return (1);
6127 }
6128 }
6129
6130 if ((so->so_state & SS_CANTRCVMORE)
6131#if CONTENT_FILTER
6132 && cfil_sock_data_pending(&so->so_rcv) == 0
6133#endif /* CONTENT_FILTER */
6134 ) {
6135 kn->kn_flags |= EV_EOF;
6136 kn->kn_fflags = so->so_error;
6137 return (1);
6138 }
6139
6140 if (so->so_error) { /* temporary udp error */
6141 return (1);
6142 }
6143
6144 int64_t lowwat = so->so_rcv.sb_lowat;
6145 /*
6146 * Ensure that when NOTE_LOWAT is used, the derived
6147 * low water mark is bounded by socket's rcv buf's
6148 * high and low water mark values.
6149 */
6150 if (kn->kn_sfflags & NOTE_LOWAT) {
6151 if (kn->kn_sdata > so->so_rcv.sb_hiwat)
6152 lowwat = so->so_rcv.sb_hiwat;
6153 else if (kn->kn_sdata > lowwat)
6154 lowwat = kn->kn_sdata;
6155 }
6156
6157 /*
6158 * The order below is important. Since NOTE_LOWAT
6159 * overrides sb_lowat, check for NOTE_LOWAT case
6160 * first.
6161 */
6162 if (kn->kn_sfflags & NOTE_LOWAT)
6163 return (kn->kn_data >= lowwat);
6164
6165 return (so->so_rcv.sb_cc >= lowwat);
6166}
6167
6168static int
6169filt_sorattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6170{
6171 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6172
6173 /* socket locked */
6174
6175 /*
6176 * If the caller explicitly asked for OOB results (e.g. poll())
6177 * from EVFILT_READ, then save that off in the hookid field
6178 * and reserve the kn_flags EV_OOBAND bit for output only.
6179 */
6180 if (kn->kn_filter == EVFILT_READ &&
6181 kn->kn_flags & EV_OOBAND) {
6182 kn->kn_flags &= ~EV_OOBAND;
6183 kn->kn_hookid = EV_OOBAND;
6184 } else {
6185 kn->kn_hookid = 0;
6186 }
6187 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
6188 so->so_rcv.sb_flags |= SB_KNOTE;
6189
6190 /* indicate if event is already fired */
6191 return filt_soread_common(kn, so);
6192}
6193
6194static void
6195filt_sordetach(struct knote *kn)
6196{
6197 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6198
6199 socket_lock(so, 1);
6200 if (so->so_rcv.sb_flags & SB_KNOTE)
6201 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6202 so->so_rcv.sb_flags &= ~SB_KNOTE;
6203 socket_unlock(so, 1);
6204}
6205
6206/*ARGSUSED*/
6207static int
6208filt_soread(struct knote *kn, long hint)
6209{
6210 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6211 int retval;
6212
6213 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6214 socket_lock(so, 1);
6215
6216 retval = filt_soread_common(kn, so);
6217
6218 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6219 socket_unlock(so, 1);
6220
6221 return retval;
6222}
6223
6224static int
6225filt_sortouch(struct knote *kn, struct kevent_internal_s *kev)
6226{
6227 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6228 int retval;
6229
6230 socket_lock(so, 1);
6231
6232 /* save off the new input fflags and data */
6233 kn->kn_sfflags = kev->fflags;
6234 kn->kn_sdata = kev->data;
6235
6236 /* determine if changes result in fired events */
6237 retval = filt_soread_common(kn, so);
6238
6239 socket_unlock(so, 1);
6240
6241 return retval;
6242}
6243
6244static int
6245filt_sorprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6246{
6247#pragma unused(data)
6248 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6249 int retval;
6250
6251 socket_lock(so, 1);
6252 retval = filt_soread_common(kn, so);
6253 if (retval) {
6254 *kev = kn->kn_kevent;
6255 if (kn->kn_flags & EV_CLEAR) {
6256 kn->kn_fflags = 0;
6257 kn->kn_data = 0;
6258 }
6259 }
6260 socket_unlock(so, 1);
6261
6262 return retval;
6263}
6264
6265int
6266so_wait_for_if_feedback(struct socket *so)
6267{
6268 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6269 (so->so_state & SS_ISCONNECTED)) {
6270 struct inpcb *inp = sotoinpcb(so);
6271 if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6272 return (1);
6273 }
6274 return (0);
6275}
6276
6277static int
6278filt_sowrite_common(struct knote *kn, struct socket *so)
6279{
6280 int ret = 0;
6281
6282 kn->kn_data = sbspace(&so->so_snd);
6283 if (so->so_state & SS_CANTSENDMORE) {
6284 kn->kn_flags |= EV_EOF;
6285 kn->kn_fflags = so->so_error;
6286 return 1;
6287 }
6288 if (so->so_error) { /* temporary udp error */
6289 return 1;
6290 }
6291 if (!socanwrite(so)) {
6292 return 0;
6293 }
6294 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6295 return 1;
6296 }
6297 int64_t lowwat = so->so_snd.sb_lowat;
6298 if (kn->kn_sfflags & NOTE_LOWAT) {
6299 if (kn->kn_sdata > so->so_snd.sb_hiwat)
6300 lowwat = so->so_snd.sb_hiwat;
6301 else if (kn->kn_sdata > lowwat)
6302 lowwat = kn->kn_sdata;
6303 }
6304 if (kn->kn_data >= lowwat) {
6305 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6306#if (DEBUG || DEVELOPMENT)
6307 && so_notsent_lowat_check == 1
6308#endif /* DEBUG || DEVELOPMENT */
6309 ) {
6310 if ((SOCK_DOM(so) == PF_INET ||
6311 SOCK_DOM(so) == PF_INET6) &&
6312 so->so_type == SOCK_STREAM) {
6313 ret = tcp_notsent_lowat_check(so);
6314 }
6315#if MPTCP
6316 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6317 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6318 ret = mptcp_notsent_lowat_check(so);
6319 }
6320#endif
6321 else {
6322 return 1;
6323 }
6324 } else {
6325 ret = 1;
6326 }
6327 }
6328 if (so_wait_for_if_feedback(so))
6329 ret = 0;
6330 return (ret);
6331}
6332
6333static int
6334filt_sowattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6335{
6336 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6337
6338 /* socket locked */
6339 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6340 so->so_snd.sb_flags |= SB_KNOTE;
6341
6342 /* determine if its already fired */
6343 return filt_sowrite_common(kn, so);
6344}
6345
6346static void
6347filt_sowdetach(struct knote *kn)
6348{
6349 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6350 socket_lock(so, 1);
6351
6352 if (so->so_snd.sb_flags & SB_KNOTE)
6353 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6354 so->so_snd.sb_flags &= ~SB_KNOTE;
6355 socket_unlock(so, 1);
6356}
6357
6358/*ARGSUSED*/
6359static int
6360filt_sowrite(struct knote *kn, long hint)
6361{
6362 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6363 int ret;
6364
6365 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6366 socket_lock(so, 1);
6367
6368 ret = filt_sowrite_common(kn, so);
6369
6370 if ((hint & SO_FILT_HINT_LOCKED) == 0)
6371 socket_unlock(so, 1);
6372
6373 return ret;
6374}
6375
6376static int
6377filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev)
6378{
6379 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6380 int ret;
6381
6382 socket_lock(so, 1);
6383
6384 /*save off the new input fflags and data */
6385 kn->kn_sfflags = kev->fflags;
6386 kn->kn_sdata = kev->data;
6387
6388 /* determine if these changes result in a triggered event */
6389 ret = filt_sowrite_common(kn, so);
6390
6391 socket_unlock(so, 1);
6392
6393 return ret;
6394}
6395
6396static int
6397filt_sowprocess(struct knote *kn, struct filt_process_s *data, struct kevent_internal_s *kev)
6398{
6399#pragma unused(data)
6400 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6401 int ret;
6402
6403 socket_lock(so, 1);
6404 ret = filt_sowrite_common(kn, so);
6405 if (ret) {
6406 *kev = kn->kn_kevent;
6407 if (kn->kn_flags & EV_CLEAR) {
6408 kn->kn_fflags = 0;
6409 kn->kn_data = 0;
6410 }
6411 }
6412 socket_unlock(so, 1);
6413 return ret;
6414}
6415
6416static int
6417filt_sockev_common(struct knote *kn, struct socket *so, long ev_hint)
6418{
6419 int ret = 0;
6420 uint32_t level_trigger = 0;
6421
6422 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6423 kn->kn_fflags |= NOTE_CONNRESET;
6424 }
6425 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6426 kn->kn_fflags |= NOTE_TIMEOUT;
6427 }
6428 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6429 kn->kn_fflags |= NOTE_NOSRCADDR;
6430 }
6431 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6432 kn->kn_fflags |= NOTE_IFDENIED;
6433 }
6434 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6435 kn->kn_fflags |= NOTE_KEEPALIVE;
6436 }
6437 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6438 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6439 }
6440 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6441 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6442 }
6443 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6444 (so->so_state & SS_ISCONNECTED)) {
6445 kn->kn_fflags |= NOTE_CONNECTED;
6446 level_trigger |= NOTE_CONNECTED;
6447 }
6448 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6449 (so->so_state & SS_ISDISCONNECTED)) {
6450 kn->kn_fflags |= NOTE_DISCONNECTED;
6451 level_trigger |= NOTE_DISCONNECTED;
6452 }
6453 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6454 if (so->so_proto != NULL &&
6455 (so->so_proto->pr_flags & PR_EVCONNINFO))
6456 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6457 }
6458
6459 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6460 tcp_notify_ack_active(so)) {
6461 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6462 }
6463
6464 if ((so->so_state & SS_CANTRCVMORE)
6465#if CONTENT_FILTER
6466 && cfil_sock_data_pending(&so->so_rcv) == 0
6467#endif /* CONTENT_FILTER */
6468 ) {
6469 kn->kn_fflags |= NOTE_READCLOSED;
6470 level_trigger |= NOTE_READCLOSED;
6471 }
6472
6473 if (so->so_state & SS_CANTSENDMORE) {
6474 kn->kn_fflags |= NOTE_WRITECLOSED;
6475 level_trigger |= NOTE_WRITECLOSED;
6476 }
6477
6478 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6479 (so->so_flags & SOF_SUSPENDED)) {
6480 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6481
6482 /* If resume event was delivered before, reset it */
6483 kn->kn_hookid &= ~NOTE_RESUME;
6484
6485 kn->kn_fflags |= NOTE_SUSPEND;
6486 level_trigger |= NOTE_SUSPEND;
6487 }
6488
6489 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6490 (so->so_flags & SOF_SUSPENDED) == 0) {
6491 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6492
6493 /* If suspend event was delivered before, reset it */
6494 kn->kn_hookid &= ~NOTE_SUSPEND;
6495
6496 kn->kn_fflags |= NOTE_RESUME;
6497 level_trigger |= NOTE_RESUME;
6498 }
6499
6500 if (so->so_error != 0) {
6501 ret = 1;
6502 kn->kn_data = so->so_error;
6503 kn->kn_flags |= EV_EOF;
6504 } else {
6505 get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6506 }
6507
6508 /* Reset any events that are not requested on this knote */
6509 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6510 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6511
6512 /* Find the level triggerred events that are already delivered */
6513 level_trigger &= kn->kn_hookid;
6514 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6515
6516 /* Do not deliver level triggerred events more than once */
6517 if ((kn->kn_fflags & ~level_trigger) != 0)
6518 ret = 1;
6519
6520 return (ret);
6521}
6522
6523static int
6524filt_sockattach(struct knote *kn, __unused struct kevent_internal_s *kev)
6525{
6526 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6527
6528 /* socket locked */
6529 kn->kn_hookid = 0;
6530 if (KNOTE_ATTACH(&so->so_klist, kn))
6531 so->so_flags |= SOF_KNOTE;
6532
6533 /* determine if event already fired */
6534 return filt_sockev_common(kn, so, 0);
6535}
6536
6537static void
6538filt_sockdetach(struct knote *kn)
6539{
6540 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6541 socket_lock(so, 1);
6542
6543 if ((so->so_flags & SOF_KNOTE) != 0)
6544 if (KNOTE_DETACH(&so->so_klist, kn))
6545 so->so_flags &= ~SOF_KNOTE;
6546 socket_unlock(so, 1);
6547}
6548
6549static int
6550filt_sockev(struct knote *kn, long hint)
6551{
6552 int ret = 0, locked = 0;
6553 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6554 long ev_hint = (hint & SO_FILT_HINT_EV);
6555
6556 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6557 socket_lock(so, 1);
6558 locked = 1;
6559 }
6560
6561 ret = filt_sockev_common(kn, so, ev_hint);
6562
6563 if (locked)
6564 socket_unlock(so, 1);
6565
6566 return ret;
6567}
6568
6569
6570
6571/*
6572 * filt_socktouch - update event state
6573 */
6574static int
6575filt_socktouch(
6576 struct knote *kn,
6577 struct kevent_internal_s *kev)
6578{
6579 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6580 uint32_t changed_flags;
6581 int ret;
6582
6583 socket_lock(so, 1);
6584
6585 /* save off the [result] data and fflags */
6586 changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6587
6588 /* save off the new input fflags and data */
6589 kn->kn_sfflags = kev->fflags;
6590 kn->kn_sdata = kev->data;
6591
6592 /* restrict the current results to the (smaller?) set of new interest */
6593 /*
6594 * For compatibility with previous implementations, we leave kn_fflags
6595 * as they were before.
6596 */
6597 //kn->kn_fflags &= kev->fflags;
6598
6599 /*
6600 * Since we keep track of events that are already
6601 * delivered, if any of those events are not requested
6602 * anymore the state related to them can be reset
6603 */
6604 kn->kn_hookid &=
6605 ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6606
6607 /* determine if we have events to deliver */
6608 ret = filt_sockev_common(kn, so, 0);
6609
6610 socket_unlock(so, 1);
6611
6612 return ret;
6613}
6614
6615/*
6616 * filt_sockprocess - query event fired state and return data
6617 */
6618static int
6619filt_sockprocess(
6620 struct knote *kn,
6621 struct filt_process_s *data,
6622 struct kevent_internal_s *kev)
6623{
6624#pragma unused(data)
6625
6626 struct socket *so = (struct socket *)kn->kn_fp->f_fglob->fg_data;
6627 int ret = 0;
6628
6629 socket_lock(so, 1);
6630
6631 ret = filt_sockev_common(kn, so, 0);
6632 if (ret) {
6633 *kev = kn->kn_kevent;
6634
6635 /*
6636 * Store the state of the events being delivered. This
6637 * state can be used to deliver level triggered events
6638 * ateast once and still avoid waking up the application
6639 * multiple times as long as the event is active.
6640 */
6641 if (kn->kn_fflags != 0)
6642 kn->kn_hookid |= (kn->kn_fflags &
6643 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6644
6645 /*
6646 * NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6647 * only one of them and remember the last one that was
6648 * delivered last
6649 */
6650 if (kn->kn_fflags & NOTE_SUSPEND)
6651 kn->kn_hookid &= ~NOTE_RESUME;
6652 if (kn->kn_fflags & NOTE_RESUME)
6653 kn->kn_hookid &= ~NOTE_SUSPEND;
6654
6655 if (kn->kn_flags & EV_CLEAR) {
6656 kn->kn_data = 0;
6657 kn->kn_fflags = 0;
6658 }
6659 }
6660
6661 socket_unlock(so, 1);
6662
6663 return ret;
6664}
6665
6666void
6667get_sockev_state(struct socket *so, u_int32_t *statep)
6668{
6669 u_int32_t state = *(statep);
6670
6671 /*
6672 * If the state variable is already used by a previous event,
6673 * reset it.
6674 */
6675 if (state != 0)
6676 return;
6677
6678 if (so->so_state & SS_ISCONNECTED)
6679 state |= SOCKEV_CONNECTED;
6680 else
6681 state &= ~(SOCKEV_CONNECTED);
6682 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
6683 *(statep) = state;
6684}
6685
6686#define SO_LOCK_HISTORY_STR_LEN \
6687 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6688
6689__private_extern__ const char *
6690solockhistory_nr(struct socket *so)
6691{
6692 size_t n = 0;
6693 int i;
6694 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6695
6696 bzero(lock_history_str, sizeof (lock_history_str));
6697 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
6698 n += snprintf(lock_history_str + n,
6699 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6700 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6701 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6702 }
6703 return (lock_history_str);
6704}
6705
6706void
6707socket_lock(struct socket *so, int refcount)
6708{
6709 void *lr_saved;
6710
6711 lr_saved = __builtin_return_address(0);
6712
6713 if (so->so_proto->pr_lock) {
6714 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
6715 } else {
6716#ifdef MORE_LOCKING_DEBUG
6717 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
6718 LCK_MTX_ASSERT_NOTOWNED);
6719#endif
6720 lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6721 if (refcount)
6722 so->so_usecount++;
6723 so->lock_lr[so->next_lock_lr] = lr_saved;
6724 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
6725 }
6726}
6727
6728void
6729socket_lock_assert_owned(struct socket *so)
6730{
6731 lck_mtx_t *mutex_held;
6732
6733 if (so->so_proto->pr_getlock != NULL)
6734 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6735 else
6736 mutex_held = so->so_proto->pr_domain->dom_mtx;
6737
6738 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6739}
6740
6741int
6742socket_try_lock(struct socket *so)
6743{
6744 lck_mtx_t *mtx;
6745
6746 if (so->so_proto->pr_getlock != NULL)
6747 mtx = (*so->so_proto->pr_getlock)(so, 0);
6748 else
6749 mtx = so->so_proto->pr_domain->dom_mtx;
6750
6751 return (lck_mtx_try_lock(mtx));
6752}
6753
6754void
6755socket_unlock(struct socket *so, int refcount)
6756{
6757 void *lr_saved;
6758 lck_mtx_t *mutex_held;
6759
6760 lr_saved = __builtin_return_address(0);
6761
6762 if (so->so_proto == NULL) {
6763 panic("%s: null so_proto so=%p\n", __func__, so);
6764 /* NOTREACHED */
6765 }
6766
6767 if (so && so->so_proto->pr_unlock) {
6768 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6769 } else {
6770 mutex_held = so->so_proto->pr_domain->dom_mtx;
6771#ifdef MORE_LOCKING_DEBUG
6772 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6773#endif
6774 so->unlock_lr[so->next_unlock_lr] = lr_saved;
6775 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
6776
6777 if (refcount) {
6778 if (so->so_usecount <= 0) {
6779 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6780 "lrh=%s", __func__, so->so_usecount, so,
6781 SOCK_DOM(so), so->so_type,
6782 SOCK_PROTO(so), solockhistory_nr(so));
6783 /* NOTREACHED */
6784 }
6785
6786 so->so_usecount--;
6787 if (so->so_usecount == 0)
6788 sofreelastref(so, 1);
6789 }
6790 lck_mtx_unlock(mutex_held);
6791 }
6792}
6793
6794/* Called with socket locked, will unlock socket */
6795void
6796sofree(struct socket *so)
6797{
6798 lck_mtx_t *mutex_held;
6799
6800 if (so->so_proto->pr_getlock != NULL)
6801 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
6802 else
6803 mutex_held = so->so_proto->pr_domain->dom_mtx;
6804 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6805
6806 sofreelastref(so, 0);
6807}
6808
/*
 * Take one use-count reference on a socket that is not currently locked
 * by the caller: the lock is acquired with a reference and then released
 * without giving the reference back.
 */
void
soreference(struct socket *so)
{
	/* lock and gain a reference ... */
	socket_lock(so, 1);
	/* ... then unlock while keeping it */
	socket_unlock(so, 0);
}
6815
/*
 * Drop one use-count reference on a socket that is not currently locked
 * by the caller: the lock is acquired without a new reference and then
 * released while giving one reference back.
 */
void
sodereference(struct socket *so)
{
	/* lock only ... */
	socket_lock(so, 0);
	/* ... then unlock and drop a reference */
	socket_unlock(so, 1);
}
6822
6823/*
6824 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6825 * possibility of using jumbo clusters. Caller must ensure to hold
6826 * the socket lock.
6827 */
6828void
6829somultipages(struct socket *so, boolean_t set)
6830{
6831 if (set)
6832 so->so_flags |= SOF_MULTIPAGES;
6833 else
6834 so->so_flags &= ~SOF_MULTIPAGES;
6835}
6836
6837void
6838soif2kcl(struct socket *so, boolean_t set)
6839{
6840 if (set)
6841 so->so_flags1 |= SOF1_IF_2KCL;
6842 else
6843 so->so_flags1 &= ~SOF1_IF_2KCL;
6844}
6845
6846int
6847so_isdstlocal(struct socket *so) {
6848
6849 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6850
6851 if (SOCK_DOM(so) == PF_INET)
6852 return (inaddr_local(inp->inp_faddr));
6853 else if (SOCK_DOM(so) == PF_INET6)
6854 return (in6addr_local(&inp->in6p_faddr));
6855
6856 return (0);
6857}
6858
6859int
6860sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
6861{
6862 struct sockbuf *rcv, *snd;
6863 int err = 0, defunct;
6864
6865 rcv = &so->so_rcv;
6866 snd = &so->so_snd;
6867
6868 defunct = (so->so_flags & SOF_DEFUNCT);
6869 if (defunct) {
6870 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6871 panic("%s: SB_DROP not set", __func__);
6872 /* NOTREACHED */
6873 }
6874 goto done;
6875 }
6876
6877 if (so->so_flags & SOF_NODEFUNCT) {
6878 if (noforce) {
6879 err = EOPNOTSUPP;
6880 if (p != PROC_NULL) {
6881 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6882 "name %s level %d) so 0x%llx [%d,%d] "
6883 "is not eligible for defunct "
6884 "(%d)\n", __func__, proc_selfpid(),
6885 proc_best_name(current_proc()), proc_pid(p),
6886 proc_best_name(p), level,
6887 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6888 SOCK_DOM(so), SOCK_TYPE(so), err);
6889 }
6890 return (err);
6891 }
6892 so->so_flags &= ~SOF_NODEFUNCT;
6893 if (p != PROC_NULL) {
6894 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6895 "name %s level %d) so 0x%llx [%d,%d] "
6896 "defunct by force "
6897 "(%d)\n", __func__, proc_selfpid(),
6898 proc_best_name(current_proc()), proc_pid(p),
6899 proc_best_name(p), level,
6900 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6901 SOCK_DOM(so), SOCK_TYPE(so), err);
6902 }
6903 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6904 struct inpcb *inp = (struct inpcb *)so->so_pcb;
6905 struct ifnet *ifp = inp->inp_last_outifp;
6906
6907 if (ifp && IFNET_IS_CELLULAR(ifp)) {
6908 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6909 } else if (so->so_flags & SOF_DELEGATED) {
6910 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6911 } else if (soextbkidlestat.so_xbkidle_time == 0) {
6912 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6913 } else if (noforce && p != PROC_NULL) {
6914 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6915
6916 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
6917 so->so_extended_bk_start = net_uptime();
6918 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6919
6920 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6921
6922 err = EOPNOTSUPP;
6923 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6924 "name %s level %d) so 0x%llx [%d,%d] "
6925 "extend bk idle "
6926 "(%d)\n", __func__, proc_selfpid(),
6927 proc_best_name(current_proc()), proc_pid(p),
6928 proc_best_name(p), level,
6929 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6930 SOCK_DOM(so), SOCK_TYPE(so), err);
6931 return (err);
6932 } else {
6933 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6934 }
6935 }
6936
6937 so->so_flags |= SOF_DEFUNCT;
6938
6939 /* Prevent further data from being appended to the socket buffers */
6940 snd->sb_flags |= SB_DROP;
6941 rcv->sb_flags |= SB_DROP;
6942
6943 /* Flush any existing data in the socket buffers */
6944 if (rcv->sb_cc != 0) {
6945 rcv->sb_flags &= ~SB_SEL;
6946 selthreadclear(&rcv->sb_sel);
6947 sbrelease(rcv);
6948 }
6949 if (snd->sb_cc != 0) {
6950 snd->sb_flags &= ~SB_SEL;
6951 selthreadclear(&snd->sb_sel);
6952 sbrelease(snd);
6953 }
6954
6955done:
6956 if (p != PROC_NULL) {
6957 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6958 "so 0x%llx [%d,%d] %s defunct%s\n", __func__,
6959 proc_selfpid(), proc_best_name(current_proc()),
6960 proc_pid(p), proc_best_name(p), level,
6961 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6962 SOCK_TYPE(so), defunct ? "is already" : "marked as",
6963 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
6964 " extbkidle" : "");
6965 }
6966 return (err);
6967}
6968
6969int
6970sodefunct(struct proc *p, struct socket *so, int level)
6971{
6972 struct sockbuf *rcv, *snd;
6973
6974 if (!(so->so_flags & SOF_DEFUNCT)) {
6975 panic("%s improperly called", __func__);
6976 /* NOTREACHED */
6977 }
6978 if (so->so_state & SS_DEFUNCT)
6979 goto done;
6980
6981 rcv = &so->so_rcv;
6982 snd = &so->so_snd;
6983
6984 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6985 char s[MAX_IPv6_STR_LEN];
6986 char d[MAX_IPv6_STR_LEN];
6987 struct inpcb *inp = sotoinpcb(so);
6988
6989 if (p != PROC_NULL) {
6990 SODEFUNCTLOG(
6991 "%s[%d, %s]: (target pid %d name %s level %d) "
6992 "so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6993 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
6994 " snd_fl 0x%x]\n", __func__,
6995 proc_selfpid(), proc_best_name(current_proc()),
6996 proc_pid(p), proc_best_name(p), level,
6997 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6998 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6999 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7000 (void *)&inp->inp_laddr.s_addr :
7001 (void *)&inp->in6p_laddr),
7002 s, sizeof (s)), ntohs(inp->in6p_lport),
7003 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7004 (void *)&inp->inp_faddr.s_addr :
7005 (void *)&inp->in6p_faddr,
7006 d, sizeof (d)), ntohs(inp->in6p_fport),
7007 (uint32_t)rcv->sb_sel.si_flags,
7008 (uint32_t)snd->sb_sel.si_flags,
7009 rcv->sb_flags, snd->sb_flags);
7010 }
7011 } else if (p != PROC_NULL) {
7012 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7013 "so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7014 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7015 proc_selfpid(), proc_best_name(current_proc()),
7016 proc_pid(p), proc_best_name(p), level,
7017 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7018 SOCK_DOM(so), SOCK_TYPE(so),
7019 (uint32_t)rcv->sb_sel.si_flags,
7020 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7021 snd->sb_flags);
7022 }
7023
7024 /*
7025 * Unwedge threads blocked on sbwait() and sb_lock().
7026 */
7027 sbwakeup(rcv);
7028 sbwakeup(snd);
7029
7030 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7031 if (rcv->sb_flags & SB_LOCK)
7032 sbunlock(rcv, TRUE); /* keep socket locked */
7033 if (snd->sb_flags & SB_LOCK)
7034 sbunlock(snd, TRUE); /* keep socket locked */
7035
7036 /*
7037 * Flush the buffers and disconnect. We explicitly call shutdown
7038 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7039 * states are set for the socket. This would also flush out data
7040 * hanging off the receive list of this socket.
7041 */
7042 (void) soshutdownlock_final(so, SHUT_RD);
7043 (void) soshutdownlock_final(so, SHUT_WR);
7044 (void) sodisconnectlocked(so);
7045
7046 /*
7047 * Explicitly handle connectionless-protocol disconnection
7048 * and release any remaining data in the socket buffers.
7049 */
7050 if (!(so->so_state & SS_ISDISCONNECTED))
7051 (void) soisdisconnected(so);
7052
7053 if (so->so_error == 0)
7054 so->so_error = EBADF;
7055
7056 if (rcv->sb_cc != 0) {
7057 rcv->sb_flags &= ~SB_SEL;
7058 selthreadclear(&rcv->sb_sel);
7059 sbrelease(rcv);
7060 }
7061 if (snd->sb_cc != 0) {
7062 snd->sb_flags &= ~SB_SEL;
7063 selthreadclear(&snd->sb_sel);
7064 sbrelease(snd);
7065 }
7066 so->so_state |= SS_DEFUNCT;
7067 OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7068
7069done:
7070 return (0);
7071}
7072
7073int
7074soresume(struct proc *p, struct socket *so, int locked)
7075{
7076 if (locked == 0)
7077 socket_lock(so, 1);
7078
7079 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7080 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7081 "[%d,%d] resumed from bk idle\n",
7082 __func__, proc_selfpid(), proc_best_name(current_proc()),
7083 proc_pid(p), proc_best_name(p),
7084 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7085 SOCK_DOM(so), SOCK_TYPE(so));
7086
7087 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7088 so->so_extended_bk_start = 0;
7089 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7090
7091 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7092 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7093 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7094 }
7095 if (locked == 0)
7096 socket_unlock(so, 1);
7097
7098 return (0);
7099}
7100
7101/*
7102 * Does not attempt to account for sockets that are delegated from
7103 * the current process
7104 */
7105int
7106so_set_extended_bk_idle(struct socket *so, int optval)
7107{
7108 int error = 0;
7109
7110 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7111 SOCK_PROTO(so) != IPPROTO_TCP) {
7112 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7113 error = EOPNOTSUPP;
7114 } else if (optval == 0) {
7115 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7116
7117 soresume(current_proc(), so, 1);
7118 } else {
7119 struct proc *p = current_proc();
7120 int i;
7121 struct filedesc *fdp;
7122 int count = 0;
7123
7124 /*
7125 * Unlock socket to avoid lock ordering issue with
7126 * the proc fd table lock
7127 */
7128 socket_unlock(so, 0);
7129
7130 proc_fdlock(p);
7131
7132 fdp = p->p_fd;
7133 for (i = 0; i < fdp->fd_nfiles; i++) {
7134 struct fileproc *fp = fdp->fd_ofiles[i];
7135 struct socket *so2;
7136
7137 if (fp == NULL ||
7138 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7139 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7140 continue;
7141
7142 so2 = (struct socket *)fp->f_fglob->fg_data;
7143 if (so != so2 &&
7144 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
7145 count++;
7146 if (count >= soextbkidlestat.so_xbkidle_maxperproc)
7147 break;
7148 }
7149 proc_fdunlock(p);
7150
7151 socket_lock(so, 0);
7152
7153 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7154 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7155 error = EBUSY;
7156 } else if (so->so_flags & SOF_DELEGATED) {
7157 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7158 error = EBUSY;
7159 } else {
7160 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7161 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7162 }
7163 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7164 "%s marked for extended bk idle\n",
7165 __func__, proc_selfpid(), proc_best_name(current_proc()),
7166 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7167 SOCK_DOM(so), SOCK_TYPE(so),
7168 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7169 "is" : "not");
7170 }
7171
7172 return (error);
7173}
7174
7175static void
7176so_stop_extended_bk_idle(struct socket *so)
7177{
7178 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7179 so->so_extended_bk_start = 0;
7180
7181 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7182 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7183 /*
7184 * Force defunct
7185 */
7186 sosetdefunct(current_proc(), so,
7187 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7188 if (so->so_flags & SOF_DEFUNCT) {
7189 sodefunct(current_proc(), so,
7190 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7191 }
7192}
7193
7194void
7195so_drain_extended_bk_idle(struct socket *so)
7196{
7197 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7198 /*
7199 * Only penalize sockets that have outstanding data
7200 */
7201 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7202 so_stop_extended_bk_idle(so);
7203
7204 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7205 }
7206 }
7207}
7208
7209/*
7210 * Return values tells if socket is still in extended background idle
7211 */
7212int
7213so_check_extended_bk_idle_time(struct socket *so)
7214{
7215 int ret = 1;
7216
7217 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7218 SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7219 __func__, proc_selfpid(), proc_best_name(current_proc()),
7220 (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7221 SOCK_DOM(so), SOCK_TYPE(so));
7222 if (net_uptime() - so->so_extended_bk_start >
7223 soextbkidlestat.so_xbkidle_time) {
7224 so_stop_extended_bk_idle(so);
7225
7226 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7227
7228 ret = 0;
7229 } else {
7230 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7231
7232 inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7233 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7234 }
7235 }
7236
7237 return (ret);
7238}
7239
7240void
7241resume_proc_sockets(proc_t p)
7242{
7243 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7244 struct filedesc *fdp;
7245 int i;
7246
7247 proc_fdlock(p);
7248 fdp = p->p_fd;
7249 for (i = 0; i < fdp->fd_nfiles; i++) {
7250 struct fileproc *fp;
7251 struct socket *so;
7252
7253 fp = fdp->fd_ofiles[i];
7254 if (fp == NULL ||
7255 (fdp->fd_ofileflags[i] & UF_RESERVED) != 0 ||
7256 FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7257 continue;
7258
7259 so = (struct socket *)fp->f_fglob->fg_data;
7260 (void) soresume(p, so, 0);
7261 }
7262 proc_fdunlock(p);
7263
7264 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7265 }
7266}
7267
7268__private_extern__ int
7269so_set_recv_anyif(struct socket *so, int optval)
7270{
7271 int ret = 0;
7272
7273#if INET6
7274 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7275#else
7276 if (SOCK_DOM(so) == PF_INET) {
7277#endif /* !INET6 */
7278 if (optval)
7279 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7280 else
7281 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7282 }
7283
7284
7285 return (ret);
7286}
7287
7288__private_extern__ int
7289so_get_recv_anyif(struct socket *so)
7290{
7291 int ret = 0;
7292
7293#if INET6
7294 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7295#else
7296 if (SOCK_DOM(so) == PF_INET) {
7297#endif /* !INET6 */
7298 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7299 }
7300
7301 return (ret);
7302}
7303
7304int
7305so_set_restrictions(struct socket *so, uint32_t vals)
7306{
7307 int nocell_old, nocell_new;
7308 int noexpensive_old, noexpensive_new;
7309
7310 /*
7311 * Deny-type restrictions are trapdoors; once set they cannot be
7312 * unset for the lifetime of the socket. This allows them to be
7313 * issued by a framework on behalf of the application without
7314 * having to worry that they can be undone.
7315 *
7316 * Note here that socket-level restrictions overrides any protocol
7317 * level restrictions. For instance, SO_RESTRICT_DENY_CELLULAR
7318 * socket restriction issued on the socket has a higher precendence
7319 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7320 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7321 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7322 */
7323 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7324 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7325 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7326 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7327 SO_RESTRICT_DENY_EXPENSIVE));
7328 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7329 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7330
7331 /* we can only set, not clear restrictions */
7332 if ((nocell_new - nocell_old) == 0 &&
7333 (noexpensive_new - noexpensive_old) == 0)
7334 return (0);
7335#if INET6
7336 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7337#else
7338 if (SOCK_DOM(so) == PF_INET) {
7339#endif /* !INET6 */
7340 if (nocell_new - nocell_old != 0) {
7341 /*
7342 * if deny cellular is now set, do what's needed
7343 * for INPCB
7344 */
7345 inp_set_nocellular(sotoinpcb(so));
7346 }
7347 if (noexpensive_new - noexpensive_old != 0) {
7348 inp_set_noexpensive(sotoinpcb(so));
7349 }
7350 }
7351
7352 if (SOCK_DOM(so) == PF_MULTIPATH)
7353 mptcp_set_restrictions(so);
7354
7355 return (0);
7356}
7357
7358uint32_t
7359so_get_restrictions(struct socket *so)
7360{
7361 return (so->so_restrictions & (SO_RESTRICT_DENY_IN |
7362 SO_RESTRICT_DENY_OUT |
7363 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE));
7364}
7365
7366int
7367so_set_effective_pid(struct socket *so, int epid, struct proc *p)
7368{
7369 struct proc *ep = PROC_NULL;
7370 int error = 0;
7371
7372 /* pid 0 is reserved for kernel */
7373 if (epid == 0) {
7374 error = EINVAL;
7375 goto done;
7376 }
7377
7378 /*
7379 * If this is an in-kernel socket, prevent its delegate
7380 * association from changing unless the socket option is
7381 * coming from within the kernel itself.
7382 */
7383 if (so->last_pid == 0 && p != kernproc) {
7384 error = EACCES;
7385 goto done;
7386 }
7387
7388 /*
7389 * If this is issued by a process that's recorded as the
7390 * real owner of the socket, or if the pid is the same as
7391 * the process's own pid, then proceed. Otherwise ensure
7392 * that the issuing process has the necessary privileges.
7393 */
7394 if (epid != so->last_pid || epid != proc_pid(p)) {
7395 if ((error = priv_check_cred(kauth_cred_get(),
7396 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7397 error = EACCES;
7398 goto done;
7399 }
7400 }
7401
7402 /* Find the process that corresponds to the effective pid */
7403 if ((ep = proc_find(epid)) == PROC_NULL) {
7404 error = ESRCH;
7405 goto done;
7406 }
7407
7408 /*
7409 * If a process tries to delegate the socket to itself, then
7410 * there's really nothing to do; treat it as a way for the
7411 * delegate association to be cleared. Note that we check
7412 * the passed-in proc rather than calling proc_selfpid(),
7413 * as we need to check the process issuing the socket option
7414 * which could be kernproc. Given that we don't allow 0 for
7415 * effective pid, it means that a delegated in-kernel socket
7416 * stays delegated during its lifetime (which is probably OK.)
7417 */
7418 if (epid == proc_pid(p)) {
7419 so->so_flags &= ~SOF_DELEGATED;
7420 so->e_upid = 0;
7421 so->e_pid = 0;
7422 uuid_clear(so->e_uuid);
7423 } else {
7424 so->so_flags |= SOF_DELEGATED;
7425 so->e_upid = proc_uniqueid(ep);
7426 so->e_pid = proc_pid(ep);
7427 proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7428 }
7429done:
7430 if (error == 0 && net_io_policy_log) {
7431 uuid_string_t buf;
7432
7433 uuid_unparse(so->e_uuid, buf);
7434 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7435 "euuid %s%s\n", __func__, proc_name_address(p),
7436 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7437 SOCK_DOM(so), SOCK_TYPE(so),
7438 so->e_pid, proc_name_address(ep), buf,
7439 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7440 } else if (error != 0 && net_io_policy_log) {
7441 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7442 "ERROR (%d)\n", __func__, proc_name_address(p),
7443 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7444 SOCK_DOM(so), SOCK_TYPE(so),
7445 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7446 proc_name_address(ep), error);
7447 }
7448
7449 /* Update this socket's policy upon success */
7450 if (error == 0) {
7451 so->so_policy_gencnt *= -1;
7452 so_update_policy(so);
7453#if NECP
7454 so_update_necp_policy(so, NULL, NULL);
7455#endif /* NECP */
7456 }
7457
7458 if (ep != PROC_NULL)
7459 proc_rele(ep);
7460
7461 return (error);
7462}
7463
7464int
7465so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p)
7466{
7467 uuid_string_t buf;
7468 uuid_t uuid;
7469 int error = 0;
7470
7471 /* UUID must not be all-zeroes (reserved for kernel) */
7472 if (uuid_is_null(euuid)) {
7473 error = EINVAL;
7474 goto done;
7475 }
7476
7477 /*
7478 * If this is an in-kernel socket, prevent its delegate
7479 * association from changing unless the socket option is
7480 * coming from within the kernel itself.
7481 */
7482 if (so->last_pid == 0 && p != kernproc) {
7483 error = EACCES;
7484 goto done;
7485 }
7486
7487 /* Get the UUID of the issuing process */
7488 proc_getexecutableuuid(p, uuid, sizeof (uuid));
7489
7490 /*
7491 * If this is issued by a process that's recorded as the
7492 * real owner of the socket, or if the uuid is the same as
7493 * the process's own uuid, then proceed. Otherwise ensure
7494 * that the issuing process has the necessary privileges.
7495 */
7496 if (uuid_compare(euuid, so->last_uuid) != 0 ||
7497 uuid_compare(euuid, uuid) != 0) {
7498 if ((error = priv_check_cred(kauth_cred_get(),
7499 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7500 error = EACCES;
7501 goto done;
7502 }
7503 }
7504
7505 /*
7506 * If a process tries to delegate the socket to itself, then
7507 * there's really nothing to do; treat it as a way for the
7508 * delegate association to be cleared. Note that we check
7509 * the uuid of the passed-in proc rather than that of the
7510 * current process, as we need to check the process issuing
7511 * the socket option which could be kernproc itself. Given
7512 * that we don't allow 0 for effective uuid, it means that
7513 * a delegated in-kernel socket stays delegated during its
7514 * lifetime (which is okay.)
7515 */
7516 if (uuid_compare(euuid, uuid) == 0) {
7517 so->so_flags &= ~SOF_DELEGATED;
7518 so->e_upid = 0;
7519 so->e_pid = 0;
7520 uuid_clear(so->e_uuid);
7521 } else {
7522 so->so_flags |= SOF_DELEGATED;
7523 /*
7524 * Unlike so_set_effective_pid(), we only have the UUID
7525 * here and the process ID is not known. Inherit the
7526 * real {pid,upid} of the socket.
7527 */
7528 so->e_upid = so->last_upid;
7529 so->e_pid = so->last_pid;
7530 uuid_copy(so->e_uuid, euuid);
7531 }
7532
7533done:
7534 if (error == 0 && net_io_policy_log) {
7535 uuid_unparse(so->e_uuid, buf);
7536 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7537 "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7538 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7539 SOCK_TYPE(so), so->e_pid, buf,
7540 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7541 } else if (error != 0 && net_io_policy_log) {
7542 uuid_unparse(euuid, buf);
7543 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7544 "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7545 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7546 SOCK_TYPE(so), buf, error);
7547 }
7548
7549 /* Update this socket's policy upon success */
7550 if (error == 0) {
7551 so->so_policy_gencnt *= -1;
7552 so_update_policy(so);
7553#if NECP
7554 so_update_necp_policy(so, NULL, NULL);
7555#endif /* NECP */
7556 }
7557
7558 return (error);
7559}
7560
7561void
7562netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7563 uint32_t ev_datalen)
7564{
7565 struct kev_msg ev_msg;
7566
7567 /*
7568 * A netpolicy event always starts with a netpolicy_event_data
7569 * structure, but the caller can provide for a longer event
7570 * structure to post, depending on the event code.
7571 */
7572 VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7573
7574 bzero(&ev_msg, sizeof (ev_msg));
7575 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7576 ev_msg.kev_class = KEV_NETWORK_CLASS;
7577 ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7578 ev_msg.event_code = ev_code;
7579
7580 ev_msg.dv[0].data_ptr = ev_data;
7581 ev_msg.dv[0].data_length = ev_datalen;
7582
7583 kev_post_msg(&ev_msg);
7584}
7585
7586void
7587socket_post_kev_msg(uint32_t ev_code,
7588 struct kev_socket_event_data *ev_data,
7589 uint32_t ev_datalen)
7590{
7591 struct kev_msg ev_msg;
7592
7593 bzero(&ev_msg, sizeof(ev_msg));
7594 ev_msg.vendor_code = KEV_VENDOR_APPLE;
7595 ev_msg.kev_class = KEV_NETWORK_CLASS;
7596 ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7597 ev_msg.event_code = ev_code;
7598
7599 ev_msg.dv[0].data_ptr = ev_data;
7600 ev_msg.dv[0]. data_length = ev_datalen;
7601
7602 kev_post_msg(&ev_msg);
7603}
7604
7605void
7606socket_post_kev_msg_closed(struct socket *so)
7607{
7608 struct kev_socket_closed ev;
7609 struct sockaddr *socksa = NULL, *peersa = NULL;
7610 int err;
7611 bzero(&ev, sizeof(ev));
7612 err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7613 if (err == 0) {
7614 err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7615 &peersa);
7616 if (err == 0) {
7617 memcpy(&ev.ev_data.kev_sockname, socksa,
7618 min(socksa->sa_len,
7619 sizeof (ev.ev_data.kev_sockname)));
7620 memcpy(&ev.ev_data.kev_peername, peersa,
7621 min(peersa->sa_len,
7622 sizeof (ev.ev_data.kev_peername)));
7623 socket_post_kev_msg(KEV_SOCKET_CLOSED,
7624 &ev.ev_data, sizeof (ev));
7625 }
7626 }
7627 if (socksa != NULL)
7628 FREE(socksa, M_SONAME);
7629 if (peersa != NULL)
7630 FREE(peersa, M_SONAME);
7631}
7632