uipc_socket.c source code [xnu/bsd/kern/uipc_socket.c]

1	/*
2	* Copyright (c) 1998-2018 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29	/*
30	* Copyright (c) 1982, 1986, 1988, 1990, 1993
31	* The Regents of the University of California. All rights reserved.
32	*
33	* Redistribution and use in source and binary forms, with or without
34	* modification, are permitted provided that the following conditions
35	* are met:
36	* 1. Redistributions of source code must retain the above copyright
37	* notice, this list of conditions and the following disclaimer.
38	* 2. Redistributions in binary form must reproduce the above copyright
39	* notice, this list of conditions and the following disclaimer in the
40	* documentation and/or other materials provided with the distribution.
41	* 3. All advertising materials mentioning features or use of this software
42	* must display the following acknowledgement:
43	* This product includes software developed by the University of
44	* California, Berkeley and its contributors.
45	* 4. Neither the name of the University nor the names of its contributors
46	* may be used to endorse or promote products derived from this software
47	* without specific prior written permission.
48	*
49	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59	* SUCH DAMAGE.
60	*
61	* @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62	*/
63	/*
64	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65	* support for mandatory and extensible security protections. This notice
66	* is included in support of clause 2.2 (b) of the Apple Public License,
67	* Version 2.0.
68	*/
69
70	#include <sys/param.h>
71	#include <sys/systm.h>
72	#include <sys/filedesc.h>
73	#include <sys/proc.h>
74	#include <sys/proc_internal.h>
75	#include <sys/kauth.h>
76	#include <sys/file_internal.h>
77	#include <sys/fcntl.h>
78	#include <sys/malloc.h>
79	#include <sys/mbuf.h>
80	#include <sys/domain.h>
81	#include <sys/kernel.h>
82	#include <sys/event.h>
83	#include <sys/poll.h>
84	#include <sys/protosw.h>
85	#include <sys/socket.h>
86	#include <sys/socketvar.h>
87	#include <sys/resourcevar.h>
88	#include <sys/signalvar.h>
89	#include <sys/sysctl.h>
90	#include <sys/syslog.h>
91	#include <sys/uio.h>
92	#include <sys/uio_internal.h>
93	#include <sys/ev.h>
94	#include <sys/kdebug.h>
95	#include <sys/un.h>
96	#include <sys/user.h>
97	#include <sys/priv.h>
98	#include <sys/kern_event.h>
99	#include <net/route.h>
100	#include <net/init.h>
101	#include <net/net_api_stats.h>
102	#include <net/ntstat.h>
103	#include <net/content_filter.h>
104	#include <netinet/in.h>
105	#include <netinet/in_pcb.h>
106	#include <netinet/in_tclass.h>
107	#include <netinet/tcp_var.h>
108	#include <netinet/ip6.h>
109	#include <netinet6/ip6_var.h>
110	#include <netinet/flow_divert.h>
111	#include <kern/zalloc.h>
112	#include <kern/locks.h>
113	#include <machine/limits.h>
114	#include <libkern/OSAtomic.h>
115	#include <pexpert/pexpert.h>
116	#include <kern/assert.h>
117	#include <kern/task.h>
118	#include <kern/policy_internal.h>
119
120	#include <sys/kpi_mbuf.h>
121	#include <sys/mcache.h>
122	#include <sys/unpcb.h>
123	#include <libkern/section_keywords.h>
124
125	#if CONFIG_MACF
126	#include <security/mac_framework.h>
127	#endif /* MAC */
128
129	#if MULTIPATH
130	#include <netinet/mp_pcb.h>
131	#include <netinet/mptcp_var.h>
132	#endif /* MULTIPATH */
133
134	#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
135
136	#if DEBUG \|\| DEVELOPMENT
137	#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
138	#else
139	#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
140	#endif
141
142	/ TODO: this should be in a header file somewhere /
143	extern char proc_name_address(void* *p);
144
145	static u_int32_t so_cache_hw; / High water mark for socache /
146	static u_int32_t so_cache_timeouts; / number of timeouts /
147	static u_int32_t so_cache_max_freed; / max freed per timeout /
148	static u_int32_t cached_sock_count = `0`;
149	STAILQ_HEAD(, socket) so_cache_head;
150	int max_cached_sock_count = MAX_CACHED_SOCKETS;
151	static u_int32_t so_cache_time;
152	static int socketinit_done;
153	static struct zone *so_cache_zone;
154
155	static lck_grp_t *so_cache_mtx_grp;
156	static lck_attr_t *so_cache_mtx_attr;
157	static lck_grp_attr_t *so_cache_mtx_grp_attr;
158	static lck_mtx_t *so_cache_mtx;
159
160	#include <machine/limits.h>
161
/* kevent filter callbacks for the socket read (and exception) filter */
static int filt_sorattach(struct knote *kn, struct kevent_internal_s *kev);
static void filt_sordetach(struct knote *kn);
static int filt_soread(struct knote *kn, long hint);
static int filt_sortouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_sorprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev);

/* kevent filter callbacks for the socket write filter */
static int filt_sowattach(struct knote *kn, struct kevent_internal_s *kev);
static void filt_sowdetach(struct knote *kn);
static int filt_sowrite(struct knote *kn, long hint);
static int filt_sowtouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_sowprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev);

/* kevent filter callbacks for the socket event filter */
static int filt_sockattach(struct knote *kn, struct kevent_internal_s *kev);
static void filt_sockdetach(struct knote *kn);
static int filt_sockev(struct knote *kn, long hint);
static int filt_socktouch(struct knote *kn, struct kevent_internal_s *kev);
static int filt_sockprocess(struct knote *kn, struct filt_process_s *data,
    struct kevent_internal_s *kev);

/* helpers that move a struct timeval in and out of a sockopt */
static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
182
183	SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
184	.f_isfd = `1`,
185	.f_attach = filt_sorattach,
186	.f_detach = filt_sordetach,
187	.f_event = filt_soread,
188	.f_touch = filt_sortouch,
189	.f_process = filt_sorprocess,
190	};
191
192	SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
193	.f_isfd = `1`,
194	.f_attach = filt_sowattach,
195	.f_detach = filt_sowdetach,
196	.f_event = filt_sowrite,
197	.f_touch = filt_sowtouch,
198	.f_process = filt_sowprocess,
199	};
200
201	SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
202	.f_isfd = `1`,
203	.f_attach = filt_sockattach,
204	.f_detach = filt_sockdetach,
205	.f_event = filt_sockev,
206	.f_touch = filt_socktouch,
207	.f_process = filt_sockprocess,
208	};
209
210	SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
211	.f_isfd = `1`,
212	.f_attach = filt_sorattach,
213	.f_detach = filt_sordetach,
214	.f_event = filt_soread,
215	.f_touch = filt_sortouch,
216	.f_process = filt_sorprocess,
217	};
218
219	SYSCTL_DECL(_kern_ipc);
220
221	#define EVEN_MORE_LOCKING_DEBUG 0
222
223	int socket_debug = `0`;
224	SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
225	CTLFLAG_RW \| CTLFLAG_LOCKED, &socket_debug, `0`, "");
226
227	static unsigned long sodefunct_calls = `0`;
228	SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
229	&sodefunct_calls, "");
230
231	static int socket_zone = M_SOCKET;
232	so_gen_t so_gencnt; / generation count for sockets /
233
234	MALLOC_DEFINE(M_SONAME, "soname", "socket name");
235	MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
236
237	#define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
238	#define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
239	#define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
240	#define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
241	#define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) \| 1)
242	#define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) \| 3)
243	#define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
244	#define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) \| 3)
245	#define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
246
247	#define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
248
249	int somaxconn = SOMAXCONN;
250	SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
251	CTLFLAG_RW \| CTLFLAG_LOCKED, &somaxconn, `0`, "");
252
253	/ Should we get a maximum also ??? /
254	static int sosendmaxchain = `65536`;
255	static int sosendminchain = `16384`;
256	static int sorecvmincopy = `16384`;
257	SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
258	CTLFLAG_RW \| CTLFLAG_LOCKED, &sosendminchain, `0`, "");
259	SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
260	CTLFLAG_RW \| CTLFLAG_LOCKED, &sorecvmincopy, `0`, "");
261
262	/*
263	* Set to enable jumbo clusters (if available) for large writes when
264	* the socket is marked with SOF_MULTIPAGES; see below.
265	*/
266	int sosendjcl = `1`;
267	SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
268	CTLFLAG_RW \| CTLFLAG_LOCKED, &sosendjcl, `0`, "");
269
270	/*
271	* Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
272	* writes on the socket for all protocols on any network interfaces,
273	* depending upon sosendjcl above. Be extra careful when setting this
274	* to 1, because sending down packets that cross physical pages down to
275	* broken drivers (those that falsely assume that the physical pages
276	* are contiguous) might lead to system panics or silent data corruption.
277	* When set to 0, the system will respect SOF_MULTIPAGES, which is set
278	* only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
279	* capable. Set this to 1 only for testing/debugging purposes.
280	*/
281	int sosendjcl_ignore_capab = `0`;
282	SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
283	CTLFLAG_RW \| CTLFLAG_LOCKED, &sosendjcl_ignore_capab, `0`, "");
284
285	/*
286	* Set this to ignore SOF1_IF_2KCL and use big clusters for large
287	* writes on the socket for all protocols on any network interfaces.
288	* Be extra careful when setting this to 1, because sending down packets with
289	* clusters larger that 2 KB might lead to system panics or data corruption.
290	* When set to 0, the system will respect SOF1_IF_2KCL, which is set
291	* on the outgoing interface
292	* Set this to 1 for testing/debugging purposes only.
293	*/
294	int sosendbigcl_ignore_capab = `0`;
295	SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
296	CTLFLAG_RW \| CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, `0`, "");
297
298	int sodefunctlog = `0`;
299	SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW \| CTLFLAG_LOCKED,
300	&sodefunctlog, `0`, "");
301
302	int sothrottlelog = `0`;
303	SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW \| CTLFLAG_LOCKED,
304	&sothrottlelog, `0`, "");
305
306	int sorestrictrecv = `1`;
307	SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW \| CTLFLAG_LOCKED,
308	&sorestrictrecv, `0`, "Enable inbound interface restrictions");
309
310	int sorestrictsend = `1`;
311	SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW \| CTLFLAG_LOCKED,
312	&sorestrictsend, `0`, "Enable outbound interface restrictions");
313
314	int soreserveheadroom = `1`;
315	SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW \| CTLFLAG_LOCKED,
316	&soreserveheadroom, `0`, "To allocate contiguous datagram buffers");
317
318	#if (DEBUG \|\| DEVELOPMENT)
319	int so_notsent_lowat_check = `1`;
320	SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW\|CTLFLAG_LOCKED,
321	&so_notsent_lowat_check, `0`, "enable/disable notsnet lowat check");
322	#endif /* DEBUG \|\| DEVELOPMENT */
323
324	int so_accept_list_waits = `0`;
325	#if (DEBUG \|\| DEVELOPMENT)
326	SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW\|CTLFLAG_LOCKED,
327	&so_accept_list_waits, `0`, "number of waits for listener incomp list");
328	#endif /* DEBUG \|\| DEVELOPMENT */
329
330	extern struct inpcbinfo tcbinfo;
331
332	/ TODO: these should be in header file /
333	extern int get_inpcb_str_size(void);
334	extern int get_tcp_str_size(void);
335
336	vm_size_t so_cache_zone_element_size;
337
338	static int sodelayed_copy(struct socket , struct* uio , struct* mbuf **,
339	user_ssize_t *);
340	static void cached_sock_alloc(struct socket *, int*);
341	static void cached_sock_free(struct socket *);
342
343	/*
344	* Maximum of extended background idle sockets per process
345	* Set to zero to disable further setting of the option
346	*/
347
348	#define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
349	#define SO_IDLE_BK_IDLE_TIME 600
350	#define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
351
352	struct soextbkidlestat soextbkidlestat;
353
354	SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
355	CTLFLAG_RW \| CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, `0`,
356	"Maximum of extended background idle sockets per process");
357
358	SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW \| CTLFLAG_LOCKED,
359	&soextbkidlestat.so_xbkidle_time, `0`,
360	"Time in seconds to keep extended background idle sockets");
361
362	SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW \| CTLFLAG_LOCKED,
363	&soextbkidlestat.so_xbkidle_rcvhiwat, `0`,
364	"High water mark for extended background idle sockets");
365
366	SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD \| CTLFLAG_LOCKED,
367	&soextbkidlestat, soextbkidlestat, "");
368
369	int so_set_extended_bk_idle(struct socket , int*);
370
371
372	/*
373	* SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
374	* setting the DSCP code on the packet based on the service class; see
375	* <rdar://problem/11277343> for details.
376	*/
377	__private_extern__ u_int32_t sotcdb = `0`;
378	SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW \| CTLFLAG_LOCKED,
379	&sotcdb, `0`, "");
380
381	void
382	socketinit(void)
383	{
384	_CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
385	VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
386
387	#ifdef __LP64__
388	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
389	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
390	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
391	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
392	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
393	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
394	#else
395	_CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
396	_CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
397	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
398	_CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
399	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
400	_CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
401	#endif
402
403	if (socketinit_done) {
404	printf("socketinit: already called...\n");
405	return;
406	}
407	socketinit_done = `1`;
408
409	PE_parse_boot_argn("socket_debug", &socket_debug,
410	sizeof (socket_debug));
411
412	/*
413	* allocate lock group attribute and group for socket cache mutex
414	*/
415	so_cache_mtx_grp_attr = lck_grp_attr_alloc_init();
416	so_cache_mtx_grp = lck_grp_alloc_init("so_cache",
417	so_cache_mtx_grp_attr);
418
419	/*
420	* allocate the lock attribute for socket cache mutex
421	*/
422	so_cache_mtx_attr = lck_attr_alloc_init();
423
424	/ cached sockets mutex /
425	so_cache_mtx = lck_mtx_alloc_init(so_cache_mtx_grp, so_cache_mtx_attr);
426	if (so_cache_mtx == NULL) {
427	panic("%s: unable to allocate so_cache_mtx\n", __func__);
428	/ NOTREACHED /
429	}
430	STAILQ_INIT(&so_cache_head);
431
432	so_cache_zone_element_size = (vm_size_t)(sizeof (struct socket) + `4`
433	+ get_inpcb_str_size() + `4` + get_tcp_str_size());
434
435	so_cache_zone = zinit(so_cache_zone_element_size,
436	(`120000` * so_cache_zone_element_size), `8192`, "socache zone");
437	zone_change(so_cache_zone, Z_CALLERACCT, FALSE);
438	zone_change(so_cache_zone, Z_NOENCRYPT, TRUE);
439
440	bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
441	soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
442	soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
443	soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
444
445	in_pcbinit();
446	sflt_init();
447	socket_tclass_init();
448	#if MULTIPATH
449	mp_pcbinit();
450	#endif /* MULTIPATH */
451	}
452
453	static void
454	cached_sock_alloc(struct socket *so, int* waitok)
455	{
456	caddr_t temp;
457	uintptr_t offset;
458
459	lck_mtx_lock(so_cache_mtx);
460
461	if (!STAILQ_EMPTY(&so_cache_head)) {
462	VERIFY(cached_sock_count > `0`);
463
464	*so = STAILQ_FIRST(&so_cache_head);
465	STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
466	STAILQ_NEXT((*so), so_cache_ent) = NULL;
467
468	cached_sock_count--;
469	lck_mtx_unlock(so_cache_mtx);
470
471	temp = (*so)->so_saved_pcb;
472	bzero((caddr_t)so, sizeof* (struct socket));
473
474	(*so)->so_saved_pcb = temp;
475	} else {
476
477	lck_mtx_unlock(so_cache_mtx);
478
479	if (waitok)
480	so = (struct* socket *)zalloc(so_cache_zone);
481	else
482	so = (struct* socket *)zalloc_noblock(so_cache_zone);
483
484	if (*so == NULL)
485	return;
486
487	bzero((caddr_t)so, sizeof* (struct socket));
488
489	/*
490	* Define offsets for extra structures into our
491	* single block of memory. Align extra structures
492	* on longword boundaries.
493	*/
494
495	offset = (uintptr_t)*so;
496	offset += sizeof (struct socket);
497
498	offset = ALIGN(offset);
499
500	(*so)->so_saved_pcb = (caddr_t)offset;
501	offset += get_inpcb_str_size();
502
503	offset = ALIGN(offset);
504
505	((struct inpcb )(void* )(so)->so_saved_pcb)->inp_saved_ppcb =
506	(caddr_t)offset;
507	}
508
509	OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
510	}
511
512	static void
513	cached_sock_free(struct socket *so)
514	{
515
516	lck_mtx_lock(so_cache_mtx);
517
518	so_cache_time = net_uptime();
519	if (++cached_sock_count > max_cached_sock_count) {
520	--cached_sock_count;
521	lck_mtx_unlock(so_cache_mtx);
522	zfree(so_cache_zone, so);
523	} else {
524	if (so_cache_hw < cached_sock_count)
525	so_cache_hw = cached_sock_count;
526
527	STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
528
529	so->cache_timestamp = so_cache_time;
530	lck_mtx_unlock(so_cache_mtx);
531	}
532	}
533
534	void
535	so_update_last_owner_locked(struct socket *so, proc_t self)
536	{
537	if (so->last_pid != `0`) {
538	/*
539	* last_pid and last_upid should remain zero for sockets
540	* created using sock_socket. The check above achieves that
541	*/
542	if (self == PROC_NULL)
543	self = current_proc();
544
545	if (so->last_upid != proc_uniqueid(self) \|\|
546	so->last_pid != proc_pid(self)) {
547	so->last_upid = proc_uniqueid(self);
548	so->last_pid = proc_pid(self);
549	proc_getexecutableuuid(self, so->last_uuid,
550	sizeof (so->last_uuid));
551	}
552	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
553	}
554	}
555
556	void
557	so_update_policy(struct socket *so)
558	{
559	if (SOCK_DOM(so) == PF_INET \|\| SOCK_DOM(so) == PF_INET6)
560	(void) inp_update_policy(sotoinpcb(so));
561	}
562
#if NECP
/*
 * Call inp_update_necp_policy() on the socket's inpcb, passing the
 * optional local/remote address overrides through.  Only PF_INET and
 * PF_INET6 sockets are handled.
 */
static void
so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
    struct sockaddr *override_remote_addr)
{
	if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6)
		inp_update_necp_policy(sotoinpcb(so), override_local_addr,
		    override_remote_addr, 0);
}
#endif /* NECP */
573
574	boolean_t
575	so_cache_timer(void)
576	{
577	struct socket *p;
578	int n_freed = `0`;
579	boolean_t rc = FALSE;
580
581	lck_mtx_lock(so_cache_mtx);
582	so_cache_timeouts++;
583	so_cache_time = net_uptime();
584
585	while (!STAILQ_EMPTY(&so_cache_head)) {
586	VERIFY(cached_sock_count > `0`);
587	p = STAILQ_FIRST(&so_cache_head);
588	if ((so_cache_time - p->cache_timestamp) <
589	SO_CACHE_TIME_LIMIT)
590	break;
591
592	STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
593	--cached_sock_count;
594
595	zfree(so_cache_zone, p);
596
597	if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
598	so_cache_max_freed++;
599	break;
600	}
601	}
602
603	/ Schedule again if there is more to cleanup /
604	if (!STAILQ_EMPTY(&so_cache_head))
605	rc = TRUE;
606
607	lck_mtx_unlock(so_cache_mtx);
608	return (rc);
609	}
610
611	/*
612	* Get a socket structure from our zone, and initialize it.
613	* We don't implement `waitok' yet (see comments in uipc_domain.c).
614	* Note that it would probably be better to allocate socket
615	* and PCB at the same time, but I'm not convinced that all
616	* the protocols can be easily modified to do this.
617	*/
618	struct socket *
619	soalloc(int waitok, int dom, int type)
620	{
621	struct socket *so;
622
623	if ((dom == PF_INET) && (type == SOCK_STREAM)) {
624	cached_sock_alloc(&so, waitok);
625	} else {
626	MALLOC_ZONE(so, struct socket , sizeof* (*so), socket_zone,
627	M_WAITOK);
628	if (so != NULL)
629	bzero(so, sizeof (*so));
630	}
631	if (so != NULL) {
632	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
633	so->so_zone = socket_zone;
634
635	/*
636	* Increment the socket allocation statistics
637	*/
638	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
639
640	#if CONFIG_MACF_SOCKET
641	/ Convert waitok to M_WAITOK/M_NOWAIT for MAC Framework. /
642	if (mac_socket_label_init(so, !waitok) != `0`) {
643	sodealloc(so);
644	return (NULL);
645	}
646	#endif /* MAC_SOCKET */
647	}
648
649	return (so);
650	}
651
652	int
653	socreate_internal(int dom, struct socket *aso, int* type, int proto,
654	struct proc p, uint32_t flags, struct* proc *ep)
655	{
656	struct protosw *prp;
657	struct socket *so;
658	int error = `0`;
659
660	#if TCPDEBUG
661	extern int tcpconsdebug;
662	#endif
663
664	VERIFY(aso != NULL);
665	*aso = NULL;
666
667	if (proto != `0`)
668	prp = pffindproto(dom, proto, type);
669	else
670	prp = pffindtype(dom, type);
671
672	if (prp == NULL \|\| prp->pr_usrreqs->pru_attach == NULL) {
673	if (pffinddomain(dom) == NULL)
674	return (EAFNOSUPPORT);
675	if (proto != `0`) {
676	if (pffindprotonotype(dom, proto) != NULL)
677	return (EPROTOTYPE);
678	}
679	return (EPROTONOSUPPORT);
680	}
681	if (prp->pr_type != type)
682	return (EPROTOTYPE);
683	so = soalloc(`1`, dom, type);
684	if (so == NULL)
685	return (ENOBUFS);
686
687	switch (dom) {
688	case PF_LOCAL:
689	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
690	break;
691	case PF_INET:
692	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
693	if (type == SOCK_STREAM) {
694	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
695	} else {
696	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
697	}
698	break;
699	case PF_ROUTE:
700	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
701	break;
702	case PF_NDRV:
703	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
704	break;
705	case PF_KEY:
706	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
707	break;
708	case PF_INET6:
709	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
710	if (type == SOCK_STREAM) {
711	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
712	} else {
713	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
714	}
715	break;
716	case PF_SYSTEM:
717	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
718	break;
719	case PF_MULTIPATH:
720	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
721	break;
722	default:
723	INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
724	break;
725	}
726
727	if (flags & SOCF_ASYNC)
728	so->so_state \|= SS_NBIO;
729
730	TAILQ_INIT(&so->so_incomp);
731	TAILQ_INIT(&so->so_comp);
732	so->so_type = type;
733	so->last_upid = proc_uniqueid(p);
734	so->last_pid = proc_pid(p);
735	proc_getexecutableuuid(p, so->last_uuid, sizeof (so->last_uuid));
736	proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
737
738	if (ep != PROC_NULL && ep != p) {
739	so->e_upid = proc_uniqueid(ep);
740	so->e_pid = proc_pid(ep);
741	proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
742	so->so_flags \|= SOF_DELEGATED;
743	}
744
745	so->so_cred = kauth_cred_proc_ref(p);
746	if (!suser(kauth_cred_get(), NULL))
747	so->so_state \|= SS_PRIV;
748
749	so->so_proto = prp;
750	so->so_rcv.sb_flags \|= SB_RECV;
751	so->so_rcv.sb_so = so->so_snd.sb_so = so;
752	so->next_lock_lr = `0`;
753	so->next_unlock_lr = `0`;
754
755	#if CONFIG_MACF_SOCKET
756	mac_socket_label_associate(kauth_cred_get(), so);
757	#endif /* MAC_SOCKET */
758
759	/*
760	* Attachment will create the per pcb lock if necessary and
761	* increase refcount for creation, make sure it's done before
762	* socket is inserted in lists.
763	*/
764	so->so_usecount++;
765
766	error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
767	if (error != `0`) {
768	/*
769	* Warning:
770	* If so_pcb is not zero, the socket will be leaked,
771	* so protocol attachment handler must be coded carefuly
772	*/
773	so->so_state \|= SS_NOFDREF;
774	VERIFY(so->so_usecount > `0`);
775	so->so_usecount--;
776	sofreelastref(so, `1`); / will deallocate the socket /
777	return (error);
778	}
779
780	atomic_add_32(&prp->pr_domain->dom_refs, `1`);
781	TAILQ_INIT(&so->so_evlist);
782
783	/ Attach socket filters for this protocol /
784	sflt_initsock(so);
785	#if TCPDEBUG
786	if (tcpconsdebug == `2`)
787	so->so_options \|= SO_DEBUG;
788	#endif
789	so_set_default_traffic_class(so);
790
791	/*
792	* If this thread or task is marked to create backgrounded sockets,
793	* mark the socket as background.
794	*/
795	if (proc_get_effective_thread_policy(current_thread(),
796	TASK_POLICY_NEW_SOCKETS_BG)) {
797	socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
798	so->so_background_thread = current_thread();
799	}
800
801	switch (dom) {
802	/*
803	* Don't mark Unix domain, system or multipath sockets as
804	* eligible for defunct by default.
805	*/
806	case PF_LOCAL:
807	case PF_SYSTEM:
808	case PF_MULTIPATH:
809	so->so_flags \|= SOF_NODEFUNCT;
810	break;
811	default:
812	break;
813	}
814
815	/*
816	* Entitlements can't be checked at socket creation time except if the
817	* application requested a feature guarded by a privilege (c.f., socket
818	* delegation).
819	* The priv(9) and the Sandboxing APIs are designed with the idea that
820	* a privilege check should only be triggered by a userland request.
821	* A privilege check at socket creation time is time consuming and
822	* could trigger many authorisation error messages from the security
823	* APIs.
824	*/
825
826	*aso = so;
827
828	return (`0`);
829	}
830
831	/*
832	* Returns: 0 Success
833	* EAFNOSUPPORT
834	* EPROTOTYPE
835	* EPROTONOSUPPORT
836	* ENOBUFS
837	* <pru_attach>:ENOBUFS[AF_UNIX]
838	* <pru_attach>:ENOBUFS[TCP]
839	* <pru_attach>:ENOMEM[TCP]
840	* <pru_attach>:??? [other protocol families, IPSEC]
841	*/
842	int
843	socreate(int dom, struct socket *aso, int* type, int proto)
844	{
845	return (socreate_internal(dom, aso, type, proto, current_proc(), `0`,
846	PROC_NULL));
847	}
848
849	int
850	socreate_delegate(int dom, struct socket *aso, int* type, int proto, pid_t epid)
851	{
852	int error = `0`;
853	struct proc *ep = PROC_NULL;
854
855	if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
856	error = ESRCH;
857	goto done;
858	}
859
860	error = socreate_internal(dom, aso, type, proto, current_proc(), `0`, ep);
861
862	/*
863	* It might not be wise to hold the proc reference when calling
864	* socreate_internal since it calls soalloc with M_WAITOK
865	*/
866	done:
867	if (ep != PROC_NULL)
868	proc_rele(ep);
869
870	return (error);
871	}
872
873	/*
874	* Returns: 0 Success
875	* <pru_bind>:EINVAL Invalid argument [COMMON_START]
876	* <pru_bind>:EAFNOSUPPORT Address family not supported
877	* <pru_bind>:EADDRNOTAVAIL Address not available.
878	* <pru_bind>:EINVAL Invalid argument
879	* <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
880	* <pru_bind>:EACCES Permission denied
881	* <pru_bind>:EADDRINUSE Address in use
882	* <pru_bind>:EAGAIN Resource unavailable, try again
883	* <pru_bind>:EPERM Operation not permitted
884	* <pru_bind>:???
885	* <sf_bind>:???
886	*
887	* Notes: It's not possible to fully enumerate the return codes above,
888	* since socket filter authors and protocol family authors may
889	* not choose to limit their error returns to those listed, even
890	* though this may result in some software operating incorrectly.
891	*
892	* The error codes which are enumerated above are those known to
893	* be returned by the tcp_usr_bind function supplied.
894	*/
895	int
896	sobindlock(struct socket so, struct* sockaddr nam, int* dolock)
897	{
898	struct proc *p = current_proc();
899	int error = `0`;
900
901	if (dolock)
902	socket_lock(so, `1`);
903
904	so_update_last_owner_locked(so, p);
905	so_update_policy(so);
906
907	#if NECP
908	so_update_necp_policy(so, nam, NULL);
909	#endif /* NECP */
910
911	/*
912	* If this is a bind request on a socket that has been marked
913	* as inactive, reject it now before we go any further.
914	*/
915	if (so->so_flags & SOF_DEFUNCT) {
916	error = EINVAL;
917	SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
918	__func__, proc_pid(p), proc_best_name(p),
919	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
920	SOCK_DOM(so), SOCK_TYPE(so), error);
921	goto out;
922	}
923
924	/ Socket filter /
925	error = sflt_bind(so, nam);
926
927	if (error == `0`)
928	error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
929	out:
930	if (dolock)
931	socket_unlock(so, `1`);
932
933	if (error == EJUSTRETURN)
934	error = `0`;
935
936	return (error);
937	}
938
939	void
940	sodealloc(struct socket *so)
941	{
942	kauth_cred_unref(&so->so_cred);
943
944	/ Remove any filters /
945	sflt_termsock(so);
946
947	#if CONTENT_FILTER
948	cfil_sock_detach(so);
949	#endif /* CONTENT_FILTER */
950
951	/ Delete the state allocated for msg queues on a socket /
952	if (so->so_flags & SOF_ENABLE_MSGS) {
953	FREE(so->so_msg_state, M_TEMP);
954	so->so_msg_state = NULL;
955	}
956	VERIFY(so->so_msg_state == NULL);
957
958	so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
959
960	#if CONFIG_MACF_SOCKET
961	mac_socket_label_destroy(so);
962	#endif /* MAC_SOCKET */
963
964	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
965	cached_sock_free(so);
966	} else {
967	FREE_ZONE(so, sizeof (*so), so->so_zone);
968	}
969	}
970
971	/*
972	* Returns: 0 Success
973	* EINVAL
974	* EOPNOTSUPP
975	* <pru_listen>:EINVAL[AF_UNIX]
976	* <pru_listen>:EINVAL[TCP]
977	* <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
978	* <pru_listen>:EINVAL[TCP] Invalid argument
979	* <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
980	* <pru_listen>:EACCES[TCP] Permission denied
981	* <pru_listen>:EADDRINUSE[TCP] Address in use
982	* <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
983	* <pru_listen>:EPERM[TCP] Operation not permitted
984	* <sf_listen>:???
985	*
986	* Notes: Other <pru_listen> returns depend on the protocol family; all
987	* <sf_listen> returns depend on what the filter author causes
988	* their filter to return.
989	*/
990	int
991	solisten(struct socket so, int* backlog)
992	{
993	struct proc *p = current_proc();
994	int error = `0`;
995
996	socket_lock(so, `1`);
997
998	so_update_last_owner_locked(so, p);
999	so_update_policy(so);
1000
1001	#if NECP
1002	so_update_necp_policy(so, NULL, NULL);
1003	#endif /* NECP */
1004
1005	if (so->so_proto == NULL) {
1006	error = EINVAL;
1007	goto out;
1008	}
1009	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == `0`) {
1010	error = EOPNOTSUPP;
1011	goto out;
1012	}
1013
1014	/*
1015	* If the listen request is made on a socket that is not fully
1016	* disconnected, or on a socket that has been marked as inactive,
1017	* reject the request now.
1018	*/
1019	if ((so->so_state &
1020	(SS_ISCONNECTED\|SS_ISCONNECTING\|SS_ISDISCONNECTING)) \|\|
1021	(so->so_flags & SOF_DEFUNCT)) {
1022	error = EINVAL;
1023	if (so->so_flags & SOF_DEFUNCT) {
1024	SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1025	"(%d)\n", __func__, proc_pid(p),
1026	proc_best_name(p),
1027	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1028	SOCK_DOM(so), SOCK_TYPE(so), error);
1029	}
1030	goto out;
1031	}
1032
1033	if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != `0`) {
1034	error = EPERM;
1035	goto out;
1036	}
1037
1038	error = sflt_listen(so);
1039	if (error == `0`)
1040	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1041
1042	if (error) {
1043	if (error == EJUSTRETURN)
1044	error = `0`;
1045	goto out;
1046	}
1047
1048	if (TAILQ_EMPTY(&so->so_comp))
1049	so->so_options \|= SO_ACCEPTCONN;
1050	/*
1051	* POSIX: The implementation may have an upper limit on the length of
1052	* the listen queue-either global or per accepting socket. If backlog
1053	* exceeds this limit, the length of the listen queue is set to the
1054	* limit.
1055	*
1056	* If listen() is called with a backlog argument value that is less
1057	* than 0, the function behaves as if it had been called with a backlog
1058	* argument value of 0.
1059	*
1060	* A backlog argument of 0 may allow the socket to accept connections,
1061	* in which case the length of the listen queue may be set to an
1062	* implementation-defined minimum value.
1063	*/
1064	if (backlog <= `0` \|\| backlog > somaxconn)
1065	backlog = somaxconn;
1066
1067	so->so_qlimit = backlog;
1068	out:
1069	socket_unlock(so, `1`);
1070	return (error);
1071	}
1072
/*
 * The "accept list lock" protects the fields related to the listener queues
 * because we can unlock a socket to respect the lock ordering between
 * the listener socket and its client sockets. The lock ordering is first to
 * acquire the client socket before the listener socket.
 *
 * The accept list lock serializes access to the following fields:
 * - of the listener socket:
 *   - so_comp
 *   - so_incomp
 *   - so_qlen
 *   - so_incqlen
 * - of client sockets that are in so_comp or so_incomp:
 *   - so_head
 *   - so_list
 *
 * As one can see the accept list lock protects the consistency of the
 * linkage of the client sockets.
 *
 * Note that those fields may be read without holding the accept list lock
 * for a preflight provided the accept list lock is taken when committing
 * to take an action based on the result of the preflight. The preflight
 * saves the cost of doing the unlock/lock dance.
 */
1097	void
1098	so_acquire_accept_list(struct socket head, struct* socket *so)
1099	{
1100	lck_mtx_t *mutex_held;
1101
1102	if (head->so_proto->pr_getlock == NULL) {
1103	return;
1104	}
1105	mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1106	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1107
1108	if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1109	head->so_flags1 \|= SOF1_ACCEPT_LIST_HELD;
1110	return;
1111	}
1112	if (so != NULL) {
1113	socket_unlock(so, `0`);
1114	}
1115	while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1116	so_accept_list_waits += `1`;
1117	msleep((caddr_t)&head->so_incomp, mutex_held,
1118	PSOCK \| PCATCH, __func__, NULL);
1119	}
1120	head->so_flags1 \|= SOF1_ACCEPT_LIST_HELD;
1121	if (so != NULL) {
1122	socket_unlock(head, `0`);
1123	socket_lock(so, `0`);
1124	socket_lock(head, `0`);
1125	}
1126	}
1127
1128	void
1129	so_release_accept_list(struct socket *head)
1130	{
1131	if (head->so_proto->pr_getlock != NULL) {
1132	lck_mtx_t *mutex_held;
1133
1134	mutex_held = (*head->so_proto->pr_getlock)(head, `0`);
1135	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1136
1137	head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1138	wakeup((caddr_t)&head->so_incomp);
1139	}
1140	}
1141
1142	void
1143	sofreelastref(struct socket so, int* dealloc)
1144	{
1145	struct socket *head = so->so_head;
1146
1147	/ Assume socket is locked /
1148
1149	if (!(so->so_flags & SOF_PCBCLEARING) \|\| !(so->so_state & SS_NOFDREF)) {
1150	selthreadclear(&so->so_snd.sb_sel);
1151	selthreadclear(&so->so_rcv.sb_sel);
1152	so->so_rcv.sb_flags &= ~(SB_SEL\|SB_UPCALL);
1153	so->so_snd.sb_flags &= ~(SB_SEL\|SB_UPCALL);
1154	so->so_event = sonullevent;
1155	return;
1156	}
1157	if (head != NULL) {
1158	/*
1159	* Need to lock the listener when the protocol has
1160	* per socket locks
1161	*/
1162	if (head->so_proto->pr_getlock != NULL) {
1163	socket_lock(head, `1`);
1164	so_acquire_accept_list(head, so);
1165	}
1166	if (so->so_state & SS_INCOMP) {
1167	so->so_state &= ~SS_INCOMP;
1168	TAILQ_REMOVE(&head->so_incomp, so, so_list);
1169	head->so_incqlen--;
1170	head->so_qlen--;
1171	so->so_head = NULL;
1172
1173	if (head->so_proto->pr_getlock != NULL) {
1174	so_release_accept_list(head);
1175	socket_unlock(head, `1`);
1176	}
1177	} else if (so->so_state & SS_COMP) {
1178	if (head->so_proto->pr_getlock != NULL) {
1179	so_release_accept_list(head);
1180	socket_unlock(head, `1`);
1181	}
1182	/*
1183	* We must not decommission a socket that's
1184	* on the accept(2) queue. If we do, then
1185	* accept(2) may hang after select(2) indicated
1186	* that the listening socket was ready.
1187	*/
1188	selthreadclear(&so->so_snd.sb_sel);
1189	selthreadclear(&so->so_rcv.sb_sel);
1190	so->so_rcv.sb_flags &= ~(SB_SEL\|SB_UPCALL);
1191	so->so_snd.sb_flags &= ~(SB_SEL\|SB_UPCALL);
1192	so->so_event = sonullevent;
1193	return;
1194	} else {
1195	if (head->so_proto->pr_getlock != NULL) {
1196	so_release_accept_list(head);
1197	socket_unlock(head, `1`);
1198	}
1199	printf("sofree: not queued\n");
1200	}
1201	}
1202	sowflush(so);
1203	sorflush(so);
1204
1205	#if FLOW_DIVERT
1206	if (so->so_flags & SOF_FLOW_DIVERT) {
1207	flow_divert_detach(so);
1208	}
1209	#endif /* FLOW_DIVERT */
1210
1211	/ 3932268: disable upcall /
1212	so->so_rcv.sb_flags &= ~SB_UPCALL;
1213	so->so_snd.sb_flags &= ~(SB_UPCALL\|SB_SNDBYTE_CNT);
1214	so->so_event = sonullevent;
1215
1216	if (dealloc)
1217	sodealloc(so);
1218	}
1219
1220	void
1221	soclose_wait_locked(struct socket *so)
1222	{
1223	lck_mtx_t *mutex_held;
1224
1225	if (so->so_proto->pr_getlock != NULL)
1226	mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1227	else
1228	mutex_held = so->so_proto->pr_domain->dom_mtx;
1229	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1230
1231	/*
1232	* Double check here and return if there's no outstanding upcall;
1233	* otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1234	*/
1235	if (!so->so_upcallusecount \|\| !(so->so_flags & SOF_UPCALLCLOSEWAIT))
1236	return;
1237	so->so_rcv.sb_flags &= ~SB_UPCALL;
1238	so->so_snd.sb_flags &= ~SB_UPCALL;
1239	so->so_flags \|= SOF_CLOSEWAIT;
1240
1241	(void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - `1`),
1242	"soclose_wait_locked", NULL);
1243	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1244	so->so_flags &= ~SOF_CLOSEWAIT;
1245	}
1246
1247	/*
1248	* Close a socket on last file table reference removal.
1249	* Initiate disconnect if connected.
1250	* Free socket when disconnect complete.
1251	*/
1252	int
1253	soclose_locked(struct socket *so)
1254	{
1255	int error = `0`;
1256	struct timespec ts;
1257
1258	if (so->so_usecount == `0`) {
1259	panic("soclose: so=%p refcount=0\n", so);
1260	/ NOTREACHED /
1261	}
1262
1263	sflt_notify(so, sock_evt_closing, NULL);
1264
1265	if (so->so_upcallusecount)
1266	soclose_wait_locked(so);
1267
1268	#if CONTENT_FILTER
1269	/*
1270	* We have to wait until the content filters are done
1271	*/
1272	if ((so->so_flags & SOF_CONTENT_FILTER) != `0`) {
1273	cfil_sock_close_wait(so);
1274	cfil_sock_is_closed(so);
1275	cfil_sock_detach(so);
1276	}
1277	#endif /* CONTENT_FILTER */
1278
1279	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1280	soresume(current_proc(), so, `1`);
1281	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1282	}
1283
1284	if ((so->so_options & SO_ACCEPTCONN)) {
1285	struct socket sp, sonext;
1286	int persocklock = `0`;
1287	int incomp_overflow_only;
1288
1289	/*
1290	* We do not want new connection to be added
1291	* to the connection queues
1292	*/
1293	so->so_options &= ~SO_ACCEPTCONN;
1294
1295	/*
1296	* We can drop the lock on the listener once
1297	* we've acquired the incoming list
1298	*/
1299	if (so->so_proto->pr_getlock != NULL) {
1300	persocklock = `1`;
1301	so_acquire_accept_list(so, NULL);
1302	socket_unlock(so, `0`);
1303	}
1304	again:
1305	incomp_overflow_only = `1`;
1306
1307	TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1308	/*
1309	* Radar 5350314
1310	* skip sockets thrown away by tcpdropdropblreq
1311	* they will get cleanup by the garbage collection.
1312	* otherwise, remove the incomp socket from the queue
1313	* and let soabort trigger the appropriate cleanup.
1314	*/
1315	if (sp->so_flags & SOF_OVERFLOW)
1316	continue;
1317
1318	if (persocklock != `0`)
1319	socket_lock(sp, `1`);
1320
1321	/*
1322	* Radar 27945981
1323	* The extra reference for the list insure the
1324	* validity of the socket pointer when we perform the
1325	* unlock of the head above
1326	*/
1327	if (sp->so_state & SS_INCOMP) {
1328	sp->so_state &= ~SS_INCOMP;
1329	sp->so_head = NULL;
1330	TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1331	so->so_incqlen--;
1332	so->so_qlen--;
1333
1334	(void) soabort(sp);
1335	} else {
1336	panic("%s sp %p in so_incomp but !SS_INCOMP",
1337	__func__, sp);
1338	}
1339
1340	if (persocklock != `0`)
1341	socket_unlock(sp, `1`);
1342	}
1343
1344	TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1345	/ Dequeue from so_comp since sofree() won't do it /
1346	if (persocklock != `0`)
1347	socket_lock(sp, `1`);
1348
1349	if (sp->so_state & SS_COMP) {
1350	sp->so_state &= ~SS_COMP;
1351	sp->so_head = NULL;
1352	TAILQ_REMOVE(&so->so_comp, sp, so_list);
1353	so->so_qlen--;
1354
1355	(void) soabort(sp);
1356	} else {
1357	panic("%s sp %p in so_comp but !SS_COMP",
1358	__func__, sp);
1359	}
1360
1361	if (persocklock)
1362	socket_unlock(sp, `1`);
1363	}
1364
1365	if (incomp_overflow_only == `0` && !TAILQ_EMPTY(&so->so_incomp)) {
1366	#if (DEBUG\|DEVELOPMENT)
1367	panic("%s head %p so_comp not empty\n", __func__, so);
1368	#endif /* (DEVELOPMENT \|\| DEBUG) */
1369
1370	goto again;
1371	}
1372
1373	if (!TAILQ_EMPTY(&so->so_comp)) {
1374	#if (DEBUG\|DEVELOPMENT)
1375	panic("%s head %p so_comp not empty\n", __func__, so);
1376	#endif /* (DEVELOPMENT \|\| DEBUG) */
1377
1378	goto again;
1379	}
1380
1381	if (persocklock) {
1382	socket_lock(so, `0`);
1383	so_release_accept_list(so);
1384	}
1385	}
1386	if (so->so_pcb == NULL) {
1387	/ 3915887: mark the socket as ready for dealloc /
1388	so->so_flags \|= SOF_PCBCLEARING;
1389	goto discard;
1390	}
1391	if (so->so_state & SS_ISCONNECTED) {
1392	if ((so->so_state & SS_ISDISCONNECTING) == `0`) {
1393	error = sodisconnectlocked(so);
1394	if (error)
1395	goto drop;
1396	}
1397	if (so->so_options & SO_LINGER) {
1398	lck_mtx_t *mutex_held;
1399
1400	if ((so->so_state & SS_ISDISCONNECTING) &&
1401	(so->so_state & SS_NBIO))
1402	goto drop;
1403	if (so->so_proto->pr_getlock != NULL)
1404	mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1405	else
1406	mutex_held = so->so_proto->pr_domain->dom_mtx;
1407	while (so->so_state & SS_ISCONNECTED) {
1408	ts.tv_sec = (so->so_linger/`100`);
1409	ts.tv_nsec = (so->so_linger % `100`) *
1410	NSEC_PER_USEC * `1000` * `10`;
1411	error = msleep((caddr_t)&so->so_timeo,
1412	mutex_held, PSOCK \| PCATCH, "soclose", &ts);
1413	if (error) {
1414	/*
1415	* It's OK when the time fires,
1416	* don't report an error
1417	*/
1418	if (error == EWOULDBLOCK)
1419	error = `0`;
1420	break;
1421	}
1422	}
1423	}
1424	}
1425	drop:
1426	if (so->so_usecount == `0`) {
1427	panic("soclose: usecount is zero so=%p\n", so);
1428	/ NOTREACHED /
1429	}
1430	if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1431	int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1432	if (error == `0`)
1433	error = error2;
1434	}
1435	if (so->so_usecount <= `0`) {
1436	panic("soclose: usecount is zero so=%p\n", so);
1437	/ NOTREACHED /
1438	}
1439	discard:
1440	if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1441	(so->so_state & SS_NOFDREF)) {
1442	panic("soclose: NOFDREF");
1443	/ NOTREACHED /
1444	}
1445	so->so_state \|= SS_NOFDREF;
1446
1447	if ((so->so_flags & SOF_KNOTE) != `0`)
1448	KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1449
1450	atomic_add_32(&so->so_proto->pr_domain->dom_refs, -`1`);
1451	evsofree(so);
1452
1453	VERIFY(so->so_usecount > `0`);
1454	so->so_usecount--;
1455	sofree(so);
1456	return (error);
1457	}
1458
1459	int
1460	soclose(struct socket *so)
1461	{
1462	int error = `0`;
1463	socket_lock(so, `1`);
1464
1465	if (so->so_retaincnt == `0`) {
1466	error = soclose_locked(so);
1467	} else {
1468	/*
1469	* if the FD is going away, but socket is
1470	* retained in kernel remove its reference
1471	*/
1472	so->so_usecount--;
1473	if (so->so_usecount < `2`)
1474	panic("soclose: retaincnt non null and so=%p "
1475	"usecount=%d\n", so, so->so_usecount);
1476	}
1477	socket_unlock(so, `1`);
1478	return (error);
1479	}
1480
1481	/*
1482	* Must be called at splnet...
1483	*/
1484	/ Should already be locked /
1485	int
1486	soabort(struct socket *so)
1487	{
1488	int error;
1489
1490	#ifdef MORE_LOCKING_DEBUG
1491	lck_mtx_t *mutex_held;
1492
1493	if (so->so_proto->pr_getlock != NULL)
1494	mutex_held = (*so->so_proto->pr_getlock)(so, `0`);
1495	else
1496	mutex_held = so->so_proto->pr_domain->dom_mtx;
1497	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1498	#endif
1499
1500	if ((so->so_flags & SOF_ABORTED) == `0`) {
1501	so->so_flags \|= SOF_ABORTED;
1502	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1503	if (error) {
1504	sofree(so);
1505	return (error);
1506	}
1507	}
1508	return (`0`);
1509	}
1510
1511	int
1512	soacceptlock(struct socket so, struct* sockaddr *nam, int* dolock)
1513	{
1514	int error;
1515
1516	if (dolock)
1517	socket_lock(so, `1`);
1518
1519	so_update_last_owner_locked(so, PROC_NULL);
1520	so_update_policy(so);
1521	#if NECP
1522	so_update_necp_policy(so, NULL, NULL);
1523	#endif /* NECP */
1524
1525	if ((so->so_state & SS_NOFDREF) == `0`)
1526	panic("soaccept: !NOFDREF");
1527	so->so_state &= ~SS_NOFDREF;
1528	error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1529
1530	if (dolock)
1531	socket_unlock(so, `1`);
1532	return (error);
1533	}
1534
/*
 * Locking version of soacceptlock(): same operation with the socket lock
 * taken and released around it.
 */
int
soaccept(struct socket *so, struct sockaddr **nam)
{
	return (soacceptlock(so, nam, 1));
}
1540
1541	int
1542	soacceptfilter(struct socket so, struct* socket *head)
1543	{
1544	struct sockaddr local = NULL, remote = NULL;
1545	int error = `0`;
1546
1547	/*
1548	* Hold the lock even if this socket has not been made visible
1549	* to the filter(s). For sockets with global locks, this protects
1550	* against the head or peer going away
1551	*/
1552	socket_lock(so, `1`);
1553	if (sogetaddr_locked(so, &remote, `1`) != `0` \|\|
1554	sogetaddr_locked(so, &local, `0`) != `0`) {
1555	so->so_state &= ~SS_NOFDREF;
1556	socket_unlock(so, `1`);
1557	soclose(so);
1558	/ Out of resources; try it again next time /
1559	error = ECONNABORTED;
1560	goto done;
1561	}
1562
1563	error = sflt_accept(head, so, local, remote);
1564
1565	/*
1566	* If we get EJUSTRETURN from one of the filters, mark this socket
1567	* as inactive and return it anyway. This newly accepted socket
1568	* will be disconnected later before we hand it off to the caller.
1569	*/
1570	if (error == EJUSTRETURN) {
1571	error = `0`;
1572	(void) sosetdefunct(current_proc(), so,
1573	SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1574	}
1575
1576	if (error != `0`) {
1577	/*
1578	* This may seem like a duplication to the above error
1579	* handling part when we return ECONNABORTED, except
1580	* the following is done while holding the lock since
1581	* the socket has been exposed to the filter(s) earlier.
1582	*/
1583	so->so_state &= ~SS_NOFDREF;
1584	socket_unlock(so, `1`);
1585	soclose(so);
1586	/ Propagate socket filter's error code to the caller /
1587	} else {
1588	socket_unlock(so, `1`);
1589	}
1590	done:
1591	/ Callee checks for NULL pointer /
1592	sock_freeaddr(remote);
1593	sock_freeaddr(local);
1594	return (error);
1595	}
1596
1597	/*
1598	* Returns: 0 Success
1599	* EOPNOTSUPP Operation not supported on socket
1600	* EISCONN Socket is connected
1601	* <pru_connect>:EADDRNOTAVAIL Address not available.
1602	* <pru_connect>:EINVAL Invalid argument
1603	* <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1604	* <pru_connect>:EACCES Permission denied
1605	* <pru_connect>:EADDRINUSE Address in use
1606	* <pru_connect>:EAGAIN Resource unavailable, try again
1607	* <pru_connect>:EPERM Operation not permitted
1608	* <sf_connect_out>:??? [anything a filter writer might set]
1609	*/
1610	int
1611	soconnectlock(struct socket so, struct* sockaddr nam, int* dolock)
1612	{
1613	int error;
1614	struct proc *p = current_proc();
1615
1616	if (dolock)
1617	socket_lock(so, `1`);
1618
1619	so_update_last_owner_locked(so, p);
1620	so_update_policy(so);
1621
1622	#if NECP
1623	so_update_necp_policy(so, NULL, nam);
1624	#endif /* NECP */
1625
1626	/*
1627	* If this is a listening socket or if this is a previously-accepted
1628	* socket that has been marked as inactive, reject the connect request.
1629	*/
1630	if ((so->so_options & SO_ACCEPTCONN) \|\| (so->so_flags & SOF_DEFUNCT)) {
1631	error = EOPNOTSUPP;
1632	if (so->so_flags & SOF_DEFUNCT) {
1633	SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1634	"(%d)\n", __func__, proc_pid(p),
1635	proc_best_name(p),
1636	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1637	SOCK_DOM(so), SOCK_TYPE(so), error);
1638	}
1639	if (dolock)
1640	socket_unlock(so, `1`);
1641	return (error);
1642	}
1643
1644	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != `0`) {
1645	if (dolock)
1646	socket_unlock(so, `1`);
1647	return (EPERM);
1648	}
1649
1650	/*
1651	* If protocol is connection-based, can only connect once.
1652	* Otherwise, if connected, try to disconnect first.
1653	* This allows user to disconnect by connecting to, e.g.,
1654	* a null address.
1655	*/
1656	if (so->so_state & (SS_ISCONNECTED\|SS_ISCONNECTING) &&
1657	((so->so_proto->pr_flags & PR_CONNREQUIRED) \|\|
1658	(error = sodisconnectlocked(so)))) {
1659	error = EISCONN;
1660	} else {
1661	/*
1662	* Run connect filter before calling protocol:
1663	* - non-blocking connect returns before completion;
1664	*/
1665	error = sflt_connectout(so, nam);
1666	if (error != `0`) {
1667	if (error == EJUSTRETURN)
1668	error = `0`;
1669	} else {
1670	error = (*so->so_proto->pr_usrreqs->pru_connect)
1671	(so, nam, p);
1672	}
1673	}
1674	if (dolock)
1675	socket_unlock(so, `1`);
1676	return (error);
1677	}
1678
/*
 * Locking version of soconnectlock(): same operation with the socket
 * lock taken and released around it.
 */
int
soconnect(struct socket *so, struct sockaddr *nam)
{
	return (soconnectlock(so, nam, 1));
}
1684
1685	/*
1686	* Returns: 0 Success
1687	* <pru_connect2>:EINVAL[AF_UNIX]
1688	* <pru_connect2>:EPROTOTYPE[AF_UNIX]
1689	* <pru_connect2>:??? [other protocol families]
1690	*
1691	* Notes: <pru_connect2> is not supported by [TCP].
1692	*/
1693	int
1694	soconnect2(struct socket so1, struct* socket *so2)
1695	{
1696	int error;
1697
1698	socket_lock(so1, `1`);
1699	if (so2->so_proto->pr_lock)
1700	socket_lock(so2, `1`);
1701
1702	error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1703
1704	socket_unlock(so1, `1`);
1705	if (so2->so_proto->pr_lock)
1706	socket_unlock(so2, `1`);
1707	return (error);
1708	}
1709
1710	int
1711	soconnectxlocked(struct socket so, struct* sockaddr *src,
1712	struct sockaddr dst, struct* proc *p, uint32_t ifscope,
1713	sae_associd_t aid, sae_connid_t pcid, uint32_t flags, void* *arg,
1714	uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1715	{
1716	int error;
1717
1718	so_update_last_owner_locked(so, p);
1719	so_update_policy(so);
1720
1721	/*
1722	* If this is a listening socket or if this is a previously-accepted
1723	* socket that has been marked as inactive, reject the connect request.
1724	*/
1725	if ((so->so_options & SO_ACCEPTCONN) \|\| (so->so_flags & SOF_DEFUNCT)) {
1726	error = EOPNOTSUPP;
1727	if (so->so_flags & SOF_DEFUNCT) {
1728	SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
1729	"(%d)\n", __func__, proc_pid(p),
1730	proc_best_name(p),
1731	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1732	SOCK_DOM(so), SOCK_TYPE(so), error);
1733	}
1734	return (error);
1735	}
1736
1737	if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != `0`)
1738	return (EPERM);
1739
1740	/*
1741	* If protocol is connection-based, can only connect once
1742	* unless PR_MULTICONN is set. Otherwise, if connected,
1743	* try to disconnect first. This allows user to disconnect
1744	* by connecting to, e.g., a null address.
1745	*/
1746	if ((so->so_state & (SS_ISCONNECTED\|SS_ISCONNECTING)) &&
1747	!(so->so_proto->pr_flags & PR_MULTICONN) &&
1748	((so->so_proto->pr_flags & PR_CONNREQUIRED) \|\|
1749	(error = sodisconnectlocked(so)) != `0`)) {
1750	error = EISCONN;
1751	} else {
1752	/*
1753	* Run connect filter before calling protocol:
1754	* - non-blocking connect returns before completion;
1755	*/
1756	error = sflt_connectout(so, dst);
1757	if (error != `0`) {
1758	/ Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. /
1759	so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1760	if (error == EJUSTRETURN)
1761	error = `0`;
1762	} else {
1763	error = (*so->so_proto->pr_usrreqs->pru_connectx)
1764	(so, src, dst, p, ifscope, aid, pcid,
1765	flags, arg, arglen, auio, bytes_written);
1766	}
1767	}
1768
1769	return (error);
1770	}
1771
1772	int
1773	sodisconnectlocked(struct socket *so)
1774	{
1775	int error;
1776
1777	if ((so->so_state & SS_ISCONNECTED) == `0`) {
1778	error = ENOTCONN;
1779	goto bad;
1780	}
1781	if (so->so_state & SS_ISDISCONNECTING) {
1782	error = EALREADY;
1783	goto bad;
1784	}
1785
1786	error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1787	if (error == `0`)
1788	sflt_notify(so, sock_evt_disconnected, NULL);
1789
1790	bad:
1791	return (error);
1792	}
1793
/*
 * Locking version of sodisconnectlocked(): takes the socket lock,
 * disconnects, releases the lock and returns the result.
 */
int
sodisconnect(struct socket *so)
{
	int rv;

	socket_lock(so, 1);
	rv = sodisconnectlocked(so);
	socket_unlock(so, 1);

	return (rv);
}
1805
1806	int
1807	sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1808	{
1809	int error;
1810
1811	/*
1812	* Call the protocol disconnectx handler; let it handle all
1813	* matters related to the connection state of this session.
1814	*/
1815	error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1816	if (error == `0`) {
1817	/*
1818	* The event applies only for the session, not for
1819	* the disconnection of individual subflows.
1820	*/
1821	if (so->so_state & (SS_ISDISCONNECTING\|SS_ISDISCONNECTED))
1822	sflt_notify(so, sock_evt_disconnected, NULL);
1823	}
1824	return (error);
1825	}
1826
1827	int
1828	sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1829	{
1830	int error;
1831
1832	socket_lock(so, `1`);
1833	error = sodisconnectxlocked(so, aid, cid);
1834	socket_unlock(so, `1`);
1835	return (error);
1836	}
1837
1838	#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1839
1840	/*
1841	* sosendcheck will lock the socket buffer if it isn't locked and
1842	* verify that there is space for the data being inserted.
1843	*
1844	* Returns: 0 Success
1845	* EPIPE
1846	* sblock:EWOULDBLOCK
1847	* sblock:EINTR
1848	* sbwait:EBADF
1849	* sbwait:EINTR
1850	* [so_error]:???
1851	*/
1852	int
1853	sosendcheck(struct socket so, struct* sockaddr *addr, user_ssize_t resid,
1854	int32_t clen, int32_t atomic, int flags, int *sblocked,
1855	struct mbuf *control)
1856	{
1857	int error = `0`;
1858	int32_t space;
1859	int assumelock = `0`;
1860
1861	restart:
1862	if (*sblocked == `0`) {
1863	if ((so->so_snd.sb_flags & SB_LOCK) != `0` &&
1864	so->so_send_filt_thread != `0` &&
1865	so->so_send_filt_thread == current_thread()) {
1866	/*
1867	* We're being called recursively from a filter,
1868	* allow this to continue. Radar 4150520.
1869	* Don't set sblocked because we don't want
1870	* to perform an unlock later.
1871	*/
1872	assumelock = `1`;
1873	} else {
1874	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
1875	if (error) {
1876	if (so->so_flags & SOF_DEFUNCT)
1877	goto defunct;
1878	return (error);
1879	}
1880	*sblocked = `1`;
1881	}
1882	}
1883
1884	/*
1885	* If a send attempt is made on a socket that has been marked
1886	* as inactive (disconnected), reject the request.
1887	*/
1888	if (so->so_flags & SOF_DEFUNCT) {
1889	defunct:
1890	error = EPIPE;
1891	SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
1892	__func__, proc_selfpid(), proc_best_name(current_proc()),
1893	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
1894	SOCK_DOM(so), SOCK_TYPE(so), error);
1895	return (error);
1896	}
1897
1898	if (so->so_state & SS_CANTSENDMORE) {
1899	#if CONTENT_FILTER
1900	/*
1901	* Can re-inject data of half closed connections
1902	*/
1903	if ((so->so_state & SS_ISDISCONNECTED) == `0` &&
1904	so->so_snd.sb_cfil_thread == current_thread() &&
1905	cfil_sock_data_pending(&so->so_snd) != `0`)
1906	CFIL_LOG(LOG_INFO,
1907	"so %llx ignore SS_CANTSENDMORE",
1908	(uint64_t)DEBUG_KERNEL_ADDRPERM(so));
1909	else
1910	#endif /* CONTENT_FILTER */
1911	return (EPIPE);
1912	}
1913	if (so->so_error) {
1914	error = so->so_error;
1915	so->so_error = `0`;
1916	return (error);
1917	}
1918
1919	if ((so->so_state & SS_ISCONNECTED) == `0`) {
1920	if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != `0`) {
1921	if (((so->so_state & SS_ISCONFIRMING) == `0`) &&
1922	(resid != `0` \|\| clen == `0`) &&
1923	!(so->so_flags1 & SOF1_PRECONNECT_DATA))
1924	return (ENOTCONN);
1925
1926	} else if (addr == `0` && !(flags&MSG_HOLD)) {
1927	return ((so->so_proto->pr_flags & PR_CONNREQUIRED) ?
1928	ENOTCONN : EDESTADDRREQ);
1929	}
1930	}
1931
1932	if (so->so_flags & SOF_ENABLE_MSGS)
1933	space = msgq_sbspace(so, control);
1934	else
1935	space = sbspace(&so->so_snd);
1936
1937	if (flags & MSG_OOB)
1938	space += `1024`;
1939	if ((atomic && resid > so->so_snd.sb_hiwat) \|\|
1940	clen > so->so_snd.sb_hiwat)
1941	return (EMSGSIZE);
1942
1943	if ((space < resid + clen &&
1944	(atomic \|\| (space < (int32_t)so->so_snd.sb_lowat) \|\|
1945	space < clen)) \|\|
1946	(so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
1947	/*
1948	* don't block the connectx call when there's more data
1949	* than can be copied.
1950	*/
1951	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
1952	if (space == `0`) {
1953	return (EWOULDBLOCK);
1954	}
1955	if (space < (int32_t)so->so_snd.sb_lowat) {
1956	return (`0`);
1957	}
1958	}
1959	if ((so->so_state & SS_NBIO) \|\| (flags & MSG_NBIO) \|\|
1960	assumelock) {
1961	return (EWOULDBLOCK);
1962	}
1963	sbunlock(&so->so_snd, TRUE); / keep socket locked /
1964	*sblocked = `0`;
1965	error = sbwait(&so->so_snd);
1966	if (error) {
1967	if (so->so_flags & SOF_DEFUNCT)
1968	goto defunct;
1969	return (error);
1970	}
1971	goto restart;
1972	}
1973	return (`0`);
1974	}
1975
1976	/*
1977	* Send on a socket.
1978	* If send must go all at once and message is larger than
1979	* send buffering, then hard error.
1980	* Lock against other senders.
1981	* If must go all at once and not enough room now, then
1982	* inform user that this would block and do nothing.
1983	* Otherwise, if nonblocking, send as much as possible.
1984	* The data to be sent is described by "uio" if nonzero,
1985	* otherwise by the mbuf chain "top" (which must be null
1986	* if uio is not). Data provided in mbuf chain must be small
1987	* enough to send all at once.
1988	*
1989	* Returns nonzero on error, timeout or signal; callers
1990	* must check for short counts if EINTR/ERESTART are returned.
1991	* Data and control buffers are freed on return.
1992	* Experiment:
1993	* MSG_HOLD: go thru most of sosend(), but just enqueue the mbuf
1994	* MSG_SEND: go thru as for MSG_HOLD on current fragment, then
1995	* point at the mbuf chain being constructed and go from there.
1996	*
1997	* Returns: 0 Success
1998	* EOPNOTSUPP
1999	* EINVAL
2000	* ENOBUFS
2001	* uiomove:EFAULT
2002	* sosendcheck:EPIPE
2003	* sosendcheck:EWOULDBLOCK
2004	* sosendcheck:EINTR
2005	* sosendcheck:EBADF
2006	* sosendcheck:EINTR
2007	* sosendcheck:??? [value from so_error]
2008	* <pru_send>:ECONNRESET[TCP]
2009	* <pru_send>:EINVAL[TCP]
2010	* <pru_send>:ENOBUFS[TCP]
2011	* <pru_send>:EADDRINUSE[TCP]
2012	* <pru_send>:EADDRNOTAVAIL[TCP]
2013	* <pru_send>:EAFNOSUPPORT[TCP]
2014	* <pru_send>:EACCES[TCP]
2015	* <pru_send>:EAGAIN[TCP]
2016	* <pru_send>:EPERM[TCP]
2017	* <pru_send>:EMSGSIZE[TCP]
2018	* <pru_send>:EHOSTUNREACH[TCP]
2019	* <pru_send>:ENETUNREACH[TCP]
2020	* <pru_send>:ENETDOWN[TCP]
2021	* <pru_send>:ENOMEM[TCP]
2022	* <pru_send>:ENOBUFS[TCP]
2023	* <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2024	* <pru_send>:EINVAL[AF_UNIX]
2025	* <pru_send>:EOPNOTSUPP[AF_UNIX]
2026	* <pru_send>:EPIPE[AF_UNIX]
2027	* <pru_send>:ENOTCONN[AF_UNIX]
2028	* <pru_send>:EISCONN[AF_UNIX]
2029	* <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2030	* <sf_data_out>:??? [whatever a filter author chooses]
2031	*
2032	* Notes: Other <pru_send> returns depend on the protocol family; all
2033	* <sf_data_out> returns depend on what the filter author causes
2034	* their filter to return.
2035	*/
2036	int
2037	sosend(struct socket so, struct* sockaddr addr, struct* uio *uio,
2038	struct mbuf top, struct* mbuf control, int* flags)
2039	{
2040	struct mbuf **mp;
2041	struct mbuf m, freelist = NULL;
2042	user_ssize_t space, len, resid, orig_resid;
2043	int clen = `0`, error, dontroute, mlen, sendflags;
2044	int atomic = sosendallatonce(so) \|\| top;
2045	int sblocked = `0`;
2046	struct proc *p = current_proc();
2047	struct mbuf *control_copy = NULL;
2048	uint16_t headroom = `0`;
2049	boolean_t en_tracing = FALSE;
2050
2051	if (uio != NULL)
2052	resid = uio_resid(uio);
2053	else
2054	resid = top->m_pkthdr.len;
2055
2056	KERNEL_DEBUG((DBG_FNC_SOSEND \| DBG_FUNC_START), so, resid,
2057	so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2058
2059	socket_lock(so, `1`);
2060
2061	/*
2062	* trace if tracing & network (vs. unix) sockets & and
2063	* non-loopback
2064	*/
2065	if (ENTR_SHOULDTRACE &&
2066	(SOCK_CHECK_DOM(so, AF_INET) \|\| SOCK_CHECK_DOM(so, AF_INET6))) {
2067	struct inpcb *inp = sotoinpcb(so);
2068	if (inp->inp_last_outifp != NULL &&
2069	!(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2070	en_tracing = TRUE;
2071	KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2072	VM_KERNEL_ADDRPERM(so),
2073	((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : `0`),
2074	(int64_t)resid);
2075	orig_resid = resid;
2076	}
2077	}
2078
2079	/*
2080	* Re-injection should not affect process accounting
2081	*/
2082	if ((flags & MSG_SKIPCFIL) == `0`) {
2083	so_update_last_owner_locked(so, p);
2084	so_update_policy(so);
2085
2086	#if NECP
2087	so_update_necp_policy(so, NULL, addr);
2088	#endif /* NECP */
2089	}
2090
2091	if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != `0`) {
2092	error = EOPNOTSUPP;
2093	goto out_locked;
2094	}
2095
2096	/*
2097	* In theory resid should be unsigned.
2098	* However, space must be signed, as it might be less than 0
2099	* if we over-committed, and we must use a signed comparison
2100	* of space and resid. On the other hand, a negative resid
2101	* causes us to loop sending 0-length segments to the protocol.
2102	*
2103	* Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2104	* But it will be used by sockets doing message delivery.
2105	*
2106	* Note: We limit resid to be a positive int value as we use
2107	* imin() to set bytes_to_copy -- radr://14558484
2108	*/
2109	if (resid < `0` \|\| resid > INT_MAX \|\| (so->so_type == SOCK_STREAM &&
2110	!(so->so_flags & SOF_ENABLE_MSGS) && (flags & MSG_EOR))) {
2111	error = EINVAL;
2112	goto out_locked;
2113	}
2114
2115	dontroute = (flags & MSG_DONTROUTE) &&
2116	(so->so_options & SO_DONTROUTE) == `0` &&
2117	(so->so_proto->pr_flags & PR_ATOMIC);
2118	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2119
2120	if (control != NULL)
2121	clen = control->m_len;
2122
2123	if (soreserveheadroom != `0`)
2124	headroom = so->so_pktheadroom;
2125
2126	do {
2127	error = sosendcheck(so, addr, resid, clen, atomic, flags,
2128	&sblocked, control);
2129	if (error)
2130	goto out_locked;
2131
2132	mp = &top;
2133	if (so->so_flags & SOF_ENABLE_MSGS)
2134	space = msgq_sbspace(so, control);
2135	else
2136	space = sbspace(&so->so_snd) - clen;
2137	space += ((flags & MSG_OOB) ? `1024` : `0`);
2138
2139	do {
2140	if (uio == NULL) {
2141	/*
2142	* Data is prepackaged in "top".
2143	*/
2144	resid = `0`;
2145	if (flags & MSG_EOR)
2146	top->m_flags \|= M_EOR;
2147	} else {
2148	int chainlength;
2149	int bytes_to_copy;
2150	boolean_t jumbocl;
2151	boolean_t bigcl;
2152	int bytes_to_alloc;
2153
2154	bytes_to_copy = imin(resid, space);
2155
2156	bytes_to_alloc = bytes_to_copy;
2157	if (top == NULL)
2158	bytes_to_alloc += headroom;
2159
2160	if (sosendminchain > `0`)
2161	chainlength = `0`;
2162	else
2163	chainlength = sosendmaxchain;
2164
2165	/*
2166	* Use big 4 KB cluster when the outgoing interface
2167	* does not prefer 2 KB clusters
2168	*/
2169	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) \|\|
2170	sosendbigcl_ignore_capab;
2171
2172	/*
2173	* Attempt to use larger than system page-size
2174	* clusters for large writes only if there is
2175	* a jumbo cluster pool and if the socket is
2176	* marked accordingly.
2177	*/
2178	jumbocl = sosendjcl && njcl > `0` &&
2179	((so->so_flags & SOF_MULTIPAGES) \|\|
2180	sosendjcl_ignore_capab) &&
2181	bigcl;
2182
2183	socket_unlock(so, `0`);
2184
2185	do {
2186	int num_needed;
2187	int hdrs_needed = (top == NULL) ? `1` : `0`;
2188
2189	/*
2190	* try to maintain a local cache of mbuf
2191	* clusters needed to complete this
2192	* write the list is further limited to
2193	* the number that are currently needed
2194	* to fill the socket this mechanism
2195	* allows a large number of mbufs/
2196	* clusters to be grabbed under a single
2197	* mbuf lock... if we can't get any
2198	* clusters, than fall back to trying
2199	* for mbufs if we fail early (or
2200	* miscalcluate the number needed) make
2201	* sure to release any clusters we
2202	* haven't yet consumed.
2203	*/
2204	if (freelist == NULL &&
2205	bytes_to_alloc > MBIGCLBYTES &&
2206	jumbocl) {
2207	num_needed =
2208	bytes_to_alloc / M16KCLBYTES;
2209
2210	if ((bytes_to_alloc -
2211	(num_needed * M16KCLBYTES))
2212	>= MINCLSIZE)
2213	num_needed++;
2214
2215	freelist =
2216	m_getpackets_internal(
2217	(unsigned int *)&num_needed,
2218	hdrs_needed, M_WAIT, `0`,
2219	M16KCLBYTES);
2220	/*
2221	* Fall back to 4K cluster size
2222	* if allocation failed
2223	*/
2224	}
2225
2226	if (freelist == NULL &&
2227	bytes_to_alloc > MCLBYTES &&
2228	bigcl) {
2229	num_needed =
2230	bytes_to_alloc / MBIGCLBYTES;
2231
2232	if ((bytes_to_alloc -
2233	(num_needed * MBIGCLBYTES)) >=
2234	MINCLSIZE)
2235	num_needed++;
2236
2237	freelist =
2238	m_getpackets_internal(
2239	(unsigned int *)&num_needed,
2240	hdrs_needed, M_WAIT, `0`,
2241	MBIGCLBYTES);
2242	/*
2243	* Fall back to cluster size
2244	* if allocation failed
2245	*/
2246	}
2247
2248	/*
2249	* Allocate a cluster as we want to
2250	* avoid to split the data in more
2251	* that one segment and using MINCLSIZE
2252	* would lead us to allocate two mbufs
2253	*/
2254	if (soreserveheadroom != `0` &&
2255	freelist == NULL &&
2256	((top == NULL &&
2257	bytes_to_alloc > _MHLEN) \|\|
2258	bytes_to_alloc > _MLEN)) {
2259	num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2260	MCLBYTES;
2261	freelist =
2262	m_getpackets_internal(
2263	(unsigned int *)&num_needed,
2264	hdrs_needed, M_WAIT, `0`,
2265	MCLBYTES);
2266	/*
2267	* Fall back to a single mbuf
2268	* if allocation failed
2269	*/
2270	} else if (freelist == NULL &&
2271	bytes_to_alloc > MINCLSIZE) {
2272	num_needed =
2273	bytes_to_alloc / MCLBYTES;
2274
2275	if ((bytes_to_alloc -
2276	(num_needed * MCLBYTES)) >=
2277	MINCLSIZE)
2278	num_needed++;
2279
2280	freelist =
2281	m_getpackets_internal(
2282	(unsigned int *)&num_needed,
2283	hdrs_needed, M_WAIT, `0`,
2284	MCLBYTES);
2285	/*
2286	* Fall back to a single mbuf
2287	* if allocation failed
2288	*/
2289	}
2290	/*
2291	* For datagram protocols, leave
2292	* headroom for protocol headers
2293	* in the first cluster of the chain
2294	*/
2295	if (freelist != NULL && atomic &&
2296	top == NULL && headroom > `0`) {
2297	freelist->m_data += headroom;
2298	}
2299
2300	/*
2301	* Fall back to regular mbufs without
2302	* reserving the socket headroom
2303	*/
2304	if (freelist == NULL) {
2305	if (top == NULL)
2306	MGETHDR(freelist,
2307	M_WAIT, MT_DATA);
2308	else
2309	MGET(freelist,
2310	M_WAIT, MT_DATA);
2311
2312	if (freelist == NULL) {
2313	error = ENOBUFS;
2314	socket_lock(so, `0`);
2315	goto out_locked;
2316	}
2317	/*
2318	* For datagram protocols,
2319	* leave room for protocol
2320	* headers in first mbuf.
2321	*/
2322	if (atomic && top == NULL &&
2323	bytes_to_copy < MHLEN) {
2324	MH_ALIGN(freelist,
2325	bytes_to_copy);
2326	}
2327	}
2328	m = freelist;
2329	freelist = m->m_next;
2330	m->m_next = NULL;
2331
2332	if ((m->m_flags & M_EXT))
2333	mlen = m->m_ext.ext_size -
2334	M_LEADINGSPACE(m);
2335	else if ((m->m_flags & M_PKTHDR))
2336	mlen =
2337	MHLEN - M_LEADINGSPACE(m);
2338	else
2339	mlen = MLEN - M_LEADINGSPACE(m);
2340	len = imin(mlen, bytes_to_copy);
2341
2342	chainlength += len;
2343
2344	space -= len;
2345
2346	error = uiomove(mtod(m, caddr_t),
2347	len, uio);
2348
2349	resid = uio_resid(uio);
2350
2351	m->m_len = len;
2352	*mp = m;
2353	top->m_pkthdr.len += len;
2354	if (error)
2355	break;
2356	mp = &m->m_next;
2357	if (resid <= `0`) {
2358	if (flags & MSG_EOR)
2359	top->m_flags \|= M_EOR;
2360	break;
2361	}
2362	bytes_to_copy = min(resid, space);
2363
2364	} while (space > `0` &&
2365	(chainlength < sosendmaxchain \|\| atomic \|\|
2366	resid < MINCLSIZE));
2367
2368	socket_lock(so, `0`);
2369
2370	if (error)
2371	goto out_locked;
2372	}
2373
2374	if (flags & (MSG_HOLD\|MSG_SEND)) {
2375	/ Enqueue for later, go away if HOLD /
2376	struct mbuf *mb1;
2377	if (so->so_temp && (flags & MSG_FLUSH)) {
2378	m_freem(so->so_temp);
2379	so->so_temp = NULL;
2380	}
2381	if (so->so_temp)
2382	so->so_tail->m_next = top;
2383	else
2384	so->so_temp = top;
2385	mb1 = top;
2386	while (mb1->m_next)
2387	mb1 = mb1->m_next;
2388	so->so_tail = mb1;
2389	if (flags & MSG_HOLD) {
2390	top = NULL;
2391	goto out_locked;
2392	}
2393	top = so->so_temp;
2394	}
2395	if (dontroute)
2396	so->so_options \|= SO_DONTROUTE;
2397
2398	/*
2399	* Compute flags here, for pru_send and NKEs
2400	*
2401	* If the user set MSG_EOF, the protocol
2402	* understands this flag and nothing left to
2403	* send then use PRU_SEND_EOF instead of PRU_SEND.
2404	*/
2405	sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2406	((flags & MSG_EOF) &&
2407	(so->so_proto->pr_flags & PR_IMPLOPCL) &&
2408	(resid <= `0`)) ? PRUS_EOF :
2409	/ If there is more to send set PRUS_MORETOCOME /
2410	(resid > `0` && space > `0`) ? PRUS_MORETOCOME : `0`;
2411
2412	if ((flags & MSG_SKIPCFIL) == `0`) {
2413	/*
2414	* Socket filter processing
2415	*/
2416	error = sflt_data_out(so, addr, &top,
2417	&control, (sendflags & MSG_OOB) ?
2418	sock_data_filt_flag_oob : `0`);
2419	if (error) {
2420	if (error == EJUSTRETURN) {
2421	error = `0`;
2422	clen = `0`;
2423	control = NULL;
2424	top = NULL;
2425	}
2426	goto out_locked;
2427	}
2428	#if CONTENT_FILTER
2429	/*
2430	* Content filter processing
2431	*/
2432	error = cfil_sock_data_out(so, addr, top,
2433	control, sendflags);
2434	if (error) {
2435	if (error == EJUSTRETURN) {
2436	error = `0`;
2437	clen = `0`;
2438	control = NULL;
2439	top = NULL;
2440	}
2441	goto out_locked;
2442	}
2443	#endif /* CONTENT_FILTER */
2444	}
2445	if (so->so_flags & SOF_ENABLE_MSGS) {
2446	/*
2447	* Make a copy of control mbuf,
2448	* so that msg priority can be
2449	* passed to subsequent mbufs.
2450	*/
2451	control_copy = m_dup(control, M_NOWAIT);
2452	}
2453	error = (*so->so_proto->pr_usrreqs->pru_send)
2454	(so, sendflags, top, addr, control, p);
2455
2456	if (flags & MSG_SEND)
2457	so->so_temp = NULL;
2458
2459	if (dontroute)
2460	so->so_options &= ~SO_DONTROUTE;
2461
2462	clen = `0`;
2463	control = control_copy;
2464	control_copy = NULL;
2465	top = NULL;
2466	mp = &top;
2467	if (error)
2468	goto out_locked;
2469	} while (resid && space > `0`);
2470	} while (resid);
2471
2472	out_locked:
2473	if (sblocked)
2474	sbunlock(&so->so_snd, FALSE); / will unlock socket /
2475	else
2476	socket_unlock(so, `1`);
2477	if (top != NULL)
2478	m_freem(top);
2479	if (control != NULL)
2480	m_freem(control);
2481	if (freelist != NULL)
2482	m_freem_list(freelist);
2483	if (control_copy != NULL)
2484	m_freem(control_copy);
2485
2486	soclearfastopen(so);
2487
2488	if (en_tracing) {
2489	/ resid passed here is the bytes left in uio /
2490	KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2491	VM_KERNEL_ADDRPERM(so),
2492	((error == EWOULDBLOCK) ? kEnTrFlagNoWork : `0`),
2493	(int64_t)(orig_resid - resid));
2494	}
2495	KERNEL_DEBUG(DBG_FNC_SOSEND \| DBG_FUNC_END, so, resid,
2496	so->so_snd.sb_cc, space, error);
2497
2498	return (error);
2499	}
2500
2501	int
2502	sosend_reinject(struct socket so, struct* sockaddr addr, struct* mbuf top, struct* mbuf *control, uint32_t sendflags)
2503	{
2504	struct mbuf m0, control_end;
2505
2506	socket_lock_assert_owned(so);
2507
2508	/*
2509	* top must points to mbuf chain to be sent.
2510	* If control is not NULL, top must be packet header
2511	*/
2512	VERIFY(top != NULL &&
2513	(control == NULL \|\| top->m_flags & M_PKTHDR));
2514
2515	/*
2516	* If control is not passed in, see if we can get it
2517	* from top.
2518	*/
2519	if (control == NULL && (top->m_flags & M_PKTHDR) == `0`) {
2520	// Locate start of control if present and start of data
2521	for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2522	if (m0->m_flags & M_PKTHDR) {
2523	top = m0;
2524	break;
2525	} else if (m0->m_type == MT_CONTROL) {
2526	if (control == NULL) {
2527	// Found start of control
2528	control = m0;
2529	}
2530	if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2531	// Found end of control
2532	control_end = m0;
2533	}
2534	}
2535	}
2536	if (control_end != NULL)
2537	control_end->m_next = NULL;
2538	}
2539
2540	int error = (*so->so_proto->pr_usrreqs->pru_send)
2541	(so, sendflags, top, addr, control, current_proc());
2542
2543	return error;
2544	}
2545
2546	/*
2547	* Supported only connected sockets (no address) without ancillary data
2548	* (control mbuf) for atomic protocols
2549	*/
2550	int
2551	sosend_list(struct socket so, struct* uio *uioarray, u_int uiocnt, int* flags)
2552	{
2553	struct mbuf m, freelist = NULL;
2554	user_ssize_t len, resid;
2555	int error, dontroute, mlen;
2556	int atomic = sosendallatonce(so);
2557	int sblocked = `0`;
2558	struct proc *p = current_proc();
2559	u_int uiofirst = `0`;
2560	u_int uiolast = `0`;
2561	struct mbuf *top = NULL;
2562	uint16_t headroom = `0`;
2563	boolean_t bigcl;
2564
2565	KERNEL_DEBUG((DBG_FNC_SOSEND_LIST \| DBG_FUNC_START), so, uiocnt,
2566	so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2567
2568	if (so->so_type != SOCK_DGRAM) {
2569	error = EINVAL;
2570	goto out;
2571	}
2572	if (atomic == `0`) {
2573	error = EINVAL;
2574	goto out;
2575	}
2576	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
2577	error = EPROTONOSUPPORT;
2578	goto out;
2579	}
2580	if (flags & ~(MSG_DONTWAIT \| MSG_NBIO)) {
2581	error = EINVAL;
2582	goto out;
2583	}
2584	resid = uio_array_resid(uioarray, uiocnt);
2585
2586	/*
2587	* In theory resid should be unsigned.
2588	* However, space must be signed, as it might be less than 0
2589	* if we over-committed, and we must use a signed comparison
2590	* of space and resid. On the other hand, a negative resid
2591	* causes us to loop sending 0-length segments to the protocol.
2592	*
2593	* Note: We limit resid to be a positive int value as we use
2594	* imin() to set bytes_to_copy -- radr://14558484
2595	*/
2596	if (resid < `0` \|\| resid > INT_MAX) {
2597	error = EINVAL;
2598	goto out;
2599	}
2600
2601	socket_lock(so, `1`);
2602	so_update_last_owner_locked(so, p);
2603	so_update_policy(so);
2604
2605	#if NECP
2606	so_update_necp_policy(so, NULL, NULL);
2607	#endif /* NECP */
2608
2609	dontroute = (flags & MSG_DONTROUTE) &&
2610	(so->so_options & SO_DONTROUTE) == `0` &&
2611	(so->so_proto->pr_flags & PR_ATOMIC);
2612	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
2613
2614	error = sosendcheck(so, NULL, resid, `0`, atomic, flags,
2615	&sblocked, NULL);
2616	if (error)
2617	goto release;
2618
2619	/*
2620	* Use big 4 KB clusters when the outgoing interface does not prefer
2621	* 2 KB clusters
2622	*/
2623	bigcl = !(so->so_flags1 & SOF1_IF_2KCL) \|\| sosendbigcl_ignore_capab;
2624
2625	if (soreserveheadroom != `0`)
2626	headroom = so->so_pktheadroom;
2627
2628	do {
2629	int i;
2630	int num_needed = `0`;
2631	int chainlength;
2632	size_t maxpktlen = `0`;
2633	int bytes_to_alloc;
2634
2635	if (sosendminchain > `0`)
2636	chainlength = `0`;
2637	else
2638	chainlength = sosendmaxchain;
2639
2640	socket_unlock(so, `0`);
2641
2642	/*
2643	* Find a set of uio that fit in a reasonable number
2644	* of mbuf packets
2645	*/
2646	for (i = uiofirst; i < uiocnt; i++) {
2647	struct uio *auio = uioarray[i];
2648
2649	len = uio_resid(auio);
2650
2651	/ Do nothing for empty messages /
2652	if (len == `0`)
2653	continue;
2654
2655	num_needed += `1`;
2656	uiolast += `1`;
2657
2658	if (len > maxpktlen)
2659	maxpktlen = len;
2660
2661	chainlength += len;
2662	if (chainlength > sosendmaxchain)
2663	break;
2664	}
2665	/*
2666	* Nothing left to send
2667	*/
2668	if (num_needed == `0`) {
2669	socket_lock(so, `0`);
2670	break;
2671	}
2672	/*
2673	* Allocate buffer large enough to include headroom space for
2674	* network and link header
2675	*
2676	*/
2677	bytes_to_alloc = maxpktlen + headroom;
2678
2679	/*
2680	* Allocate a single contiguous buffer of the smallest available
2681	* size when possible
2682	*/
2683	if (bytes_to_alloc > MCLBYTES &&
2684	bytes_to_alloc <= MBIGCLBYTES && bigcl) {
2685	freelist = m_getpackets_internal(
2686	(unsigned int *)&num_needed,
2687	num_needed, M_WAIT, `1`,
2688	MBIGCLBYTES);
2689	} else if (bytes_to_alloc > _MHLEN &&
2690	bytes_to_alloc <= MCLBYTES) {
2691	freelist = m_getpackets_internal(
2692	(unsigned int *)&num_needed,
2693	num_needed, M_WAIT, `1`,
2694	MCLBYTES);
2695	} else {
2696	freelist = m_allocpacket_internal(
2697	(unsigned int *)&num_needed,
2698	bytes_to_alloc, NULL, M_WAIT, `1`, `0`);
2699	}
2700
2701	if (freelist == NULL) {
2702	socket_lock(so, `0`);
2703	error = ENOMEM;
2704	goto release;
2705	}
2706	/*
2707	* Copy each uio of the set into its own mbuf packet
2708	*/
2709	for (i = uiofirst, m = freelist;
2710	i < uiolast && m != NULL;
2711	i++) {
2712	int bytes_to_copy;
2713	struct mbuf *n;
2714	struct uio *auio = uioarray[i];
2715
2716	bytes_to_copy = uio_resid(auio);
2717
2718	/ Do nothing for empty messages /
2719	if (bytes_to_copy == `0`)
2720	continue;
2721	/*
2722	* Leave headroom for protocol headers
2723	* in the first mbuf of the chain
2724	*/
2725	m->m_data += headroom;
2726
2727	for (n = m; n != NULL; n = n->m_next) {
2728	if ((m->m_flags & M_EXT))
2729	mlen = m->m_ext.ext_size -
2730	M_LEADINGSPACE(m);
2731	else if ((m->m_flags & M_PKTHDR))
2732	mlen =
2733	MHLEN - M_LEADINGSPACE(m);
2734	else
2735	mlen = MLEN - M_LEADINGSPACE(m);
2736	len = imin(mlen, bytes_to_copy);
2737
2738	/*
2739	* Note: uiomove() decrements the iovec
2740	* length
2741	*/
2742	error = uiomove(mtod(n, caddr_t),
2743	len, auio);
2744	if (error != `0`)
2745	break;
2746	n->m_len = len;
2747	m->m_pkthdr.len += len;
2748
2749	VERIFY(m->m_pkthdr.len <= maxpktlen);
2750
2751	bytes_to_copy -= len;
2752	resid -= len;
2753	}
2754	if (m->m_pkthdr.len == `0`) {
2755	printf(
2756	"%s:%d so %llx pkt %llx type %u len null\n",
2757	__func__, __LINE__,
2758	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
2759	(uint64_t)DEBUG_KERNEL_ADDRPERM(m),
2760	m->m_type);
2761	}
2762	if (error != `0`)
2763	break;
2764	m = m->m_nextpkt;
2765	}
2766
2767	socket_lock(so, `0`);
2768
2769	if (error)
2770	goto release;
2771	top = freelist;
2772	freelist = NULL;
2773
2774	if (dontroute)
2775	so->so_options \|= SO_DONTROUTE;
2776
2777	if ((flags & MSG_SKIPCFIL) == `0`) {
2778	struct mbuf **prevnextp = NULL;
2779
2780	for (i = uiofirst, m = top;
2781	i < uiolast && m != NULL;
2782	i++) {
2783	struct mbuf *nextpkt = m->m_nextpkt;
2784
2785	/*
2786	* Socket filter processing
2787	*/
2788	error = sflt_data_out(so, NULL, &m,
2789	NULL, `0`);
2790	if (error != `0` && error != EJUSTRETURN)
2791	goto release;
2792
2793	#if CONTENT_FILTER
2794	if (error == `0`) {
2795	/*
2796	* Content filter processing
2797	*/
2798	error = cfil_sock_data_out(so, NULL, m,
2799	NULL, `0`);
2800	if (error != `0` && error != EJUSTRETURN)
2801	goto release;
2802	}
2803	#endif /* CONTENT_FILTER */
2804	/*
2805	* Remove packet from the list when
2806	* swallowed by a filter
2807	*/
2808	if (error == EJUSTRETURN) {
2809	error = `0`;
2810	if (prevnextp != NULL)
2811	*prevnextp = nextpkt;
2812	else
2813	top = nextpkt;
2814	}
2815
2816	m = nextpkt;
2817	if (m != NULL)
2818	prevnextp = &m->m_nextpkt;
2819	}
2820	}
2821	if (top != NULL)
2822	error = (*so->so_proto->pr_usrreqs->pru_send_list)
2823	(so, `0`, top, NULL, NULL, p);
2824
2825	if (dontroute)
2826	so->so_options &= ~SO_DONTROUTE;
2827
2828	top = NULL;
2829	uiofirst = uiolast;
2830	} while (resid > `0` && error == `0`);
2831	release:
2832	if (sblocked)
2833	sbunlock(&so->so_snd, FALSE); / will unlock socket /
2834	else
2835	socket_unlock(so, `1`);
2836	out:
2837	if (top != NULL)
2838	m_freem(top);
2839	if (freelist != NULL)
2840	m_freem_list(freelist);
2841
2842	KERNEL_DEBUG(DBG_FNC_SOSEND_LIST \| DBG_FUNC_END, so, resid,
2843	so->so_snd.sb_cc, `0`, error);
2844
2845	return (error);
2846	}
2847
2848	/*
2849	* May return ERESTART when packet is dropped by MAC policy check
2850	*/
2851	static int
2852	soreceive_addr(struct proc p, struct* socket so, struct* sockaddr **psa,
2853	int flags, struct mbuf mp, struct mbuf nextrecordp, int canwait)
2854	{
2855	int error = `0`;
2856	struct mbuf m = mp;
2857	struct mbuf nextrecord = nextrecordp;
2858
2859	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2860	#if CONFIG_MACF_SOCKET_SUBSET
2861	/*
2862	* Call the MAC framework for policy checking if we're in
2863	* the user process context and the socket isn't connected.
2864	*/
2865	if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2866	struct mbuf *m0 = m;
2867	/*
2868	* Dequeue this record (temporarily) from the receive
2869	* list since we're about to drop the socket's lock
2870	* where a new record may arrive and be appended to
2871	* the list. Upon MAC policy failure, the record
2872	* will be freed. Otherwise, we'll add it back to
2873	* the head of the list. We cannot rely on SB_LOCK
2874	* because append operation uses the socket's lock.
2875	*/
2876	do {
2877	m->m_nextpkt = NULL;
2878	sbfree(&so->so_rcv, m);
2879	m = m->m_next;
2880	} while (m != NULL);
2881	m = m0;
2882	so->so_rcv.sb_mb = nextrecord;
2883	SB_EMPTY_FIXUP(&so->so_rcv);
2884	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2885	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2886	socket_unlock(so, `0`);
2887
2888	if (mac_socket_check_received(proc_ucred(p), so,
2889	mtod(m, struct sockaddr *)) != `0`) {
2890	/*
2891	* MAC policy failure; free this record and
2892	* process the next record (or block until
2893	* one is available). We have adjusted sb_cc
2894	* and sb_mbcnt above so there is no need to
2895	* call sbfree() again.
2896	*/
2897	m_freem(m);
2898	/*
2899	* Clear SB_LOCK but don't unlock the socket.
2900	* Process the next record or wait for one.
2901	*/
2902	socket_lock(so, `0`);
2903	sbunlock(&so->so_rcv, TRUE); / stay locked /
2904	error = ERESTART;
2905	goto done;
2906	}
2907	socket_lock(so, `0`);
2908	/*
2909	* If the socket has been defunct'd, drop it.
2910	*/
2911	if (so->so_flags & SOF_DEFUNCT) {
2912	m_freem(m);
2913	error = ENOTCONN;
2914	goto done;
2915	}
2916	/*
2917	* Re-adjust the socket receive list and re-enqueue
2918	* the record in front of any packets which may have
2919	* been appended while we dropped the lock.
2920	*/
2921	for (m = m0; m->m_next != NULL; m = m->m_next)
2922	sballoc(&so->so_rcv, m);
2923	sballoc(&so->so_rcv, m);
2924	if (so->so_rcv.sb_mb == NULL) {
2925	so->so_rcv.sb_lastrecord = m0;
2926	so->so_rcv.sb_mbtail = m;
2927	}
2928	m = m0;
2929	nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
2930	so->so_rcv.sb_mb = m;
2931	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
2932	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
2933	}
2934	#endif /* CONFIG_MACF_SOCKET_SUBSET */
2935	if (psa != NULL) {
2936	psa = dup_sockaddr(mtod(m, struct* sockaddr *), canwait);
2937	if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
2938	error = EWOULDBLOCK;
2939	goto done;
2940	}
2941	}
2942	if (flags & MSG_PEEK) {
2943	m = m->m_next;
2944	} else {
2945	sbfree(&so->so_rcv, m);
2946	if (m->m_next == NULL && so->so_rcv.sb_cc != `0`) {
2947	panic("%s: about to create invalid socketbuf",
2948	__func__);
2949	/ NOTREACHED /
2950	}
2951	MFREE(m, so->so_rcv.sb_mb);
2952	m = so->so_rcv.sb_mb;
2953	if (m != NULL) {
2954	m->m_nextpkt = nextrecord;
2955	} else {
2956	so->so_rcv.sb_mb = nextrecord;
2957	SB_EMPTY_FIXUP(&so->so_rcv);
2958	}
2959	}
2960	done:
2961	*mp = m;
2962	*nextrecordp = nextrecord;
2963
2964	return (error);
2965	}
2966
2967	/*
2968	* Process one or more MT_CONTROL mbufs present before any data mbufs
2969	* in the first mbuf chain on the socket buffer. If MSG_PEEK, we
2970	* just copy the data; if !MSG_PEEK, we call into the protocol to
2971	* perform externalization.
2972	*/
2973	static int
2974	soreceive_ctl(struct socket so, struct* mbuf *controlp, int* flags,
2975	struct mbuf mp, struct mbuf nextrecordp)
2976	{
2977	int error = `0`;
2978	struct mbuf cm = NULL, cmn;
2979	struct mbuf **cme = &cm;
2980	struct sockbuf *sb_rcv = &so->so_rcv;
2981	struct mbuf **msgpcm = NULL;
2982	struct mbuf m = mp;
2983	struct mbuf nextrecord = nextrecordp;
2984	struct protosw *pr = so->so_proto;
2985
2986	/*
2987	* Externalizing the control messages would require us to
2988	* drop the socket's lock below. Once we re-acquire the
2989	* lock, the mbuf chain might change. In order to preserve
2990	* consistency, we unlink all control messages from the
2991	* first mbuf chain in one shot and link them separately
2992	* onto a different chain.
2993	*/
2994	do {
2995	if (flags & MSG_PEEK) {
2996	if (controlp != NULL) {
2997	if (*controlp == NULL) {
2998	msgpcm = controlp;
2999	}
3000	*controlp = m_copy(m, `0`, m->m_len);
3001
3002	/*
3003	* If we failed to allocate an mbuf,
3004	* release any previously allocated
3005	* mbufs for control data. Return
3006	* an error. Keep the mbufs in the
3007	* socket as this is using
3008	* MSG_PEEK flag.
3009	*/
3010	if (*controlp == NULL) {
3011	m_freem(*msgpcm);
3012	error = ENOBUFS;
3013	goto done;
3014	}
3015	controlp = &(*controlp)->m_next;
3016	}
3017	m = m->m_next;
3018	} else {
3019	m->m_nextpkt = NULL;
3020	sbfree(sb_rcv, m);
3021	sb_rcv->sb_mb = m->m_next;
3022	m->m_next = NULL;
3023	*cme = m;
3024	cme = &(*cme)->m_next;
3025	m = sb_rcv->sb_mb;
3026	}
3027	} while (m != NULL && m->m_type == MT_CONTROL);
3028
3029	if (!(flags & MSG_PEEK)) {
3030	if (sb_rcv->sb_mb != NULL) {
3031	sb_rcv->sb_mb->m_nextpkt = nextrecord;
3032	} else {
3033	sb_rcv->sb_mb = nextrecord;
3034	SB_EMPTY_FIXUP(sb_rcv);
3035	}
3036	if (nextrecord == NULL)
3037	sb_rcv->sb_lastrecord = m;
3038	}
3039
3040	SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3041	SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3042
3043	while (cm != NULL) {
3044	int cmsg_type;
3045
3046	cmn = cm->m_next;
3047	cm->m_next = NULL;
3048	cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3049
3050	/*
3051	* Call the protocol to externalize SCM_RIGHTS message
3052	* and return the modified message to the caller upon
3053	* success. Otherwise, all other control messages are
3054	* returned unmodified to the caller. Note that we
3055	* only get into this loop if MSG_PEEK is not set.
3056	*/
3057	if (pr->pr_domain->dom_externalize != NULL &&
3058	cmsg_type == SCM_RIGHTS) {
3059	/*
3060	* Release socket lock: see 3903171. This
3061	* would also allow more records to be appended
3062	* to the socket buffer. We still have SB_LOCK
3063	* set on it, so we can be sure that the head
3064	* of the mbuf chain won't change.
3065	*/
3066	socket_unlock(so, `0`);
3067	error = (*pr->pr_domain->dom_externalize)(cm);
3068	socket_lock(so, `0`);
3069	} else {
3070	error = `0`;
3071	}
3072
3073	if (controlp != NULL && error == `0`) {
3074	*controlp = cm;
3075	controlp = &(*controlp)->m_next;
3076	} else {
3077	(void) m_free(cm);
3078	}
3079	cm = cmn;
3080	}
3081	/*
3082	* Update the value of nextrecord in case we received new
3083	* records when the socket was unlocked above for
3084	* externalizing SCM_RIGHTS.
3085	*/
3086	if (m != NULL)
3087	nextrecord = sb_rcv->sb_mb->m_nextpkt;
3088	else
3089	nextrecord = sb_rcv->sb_mb;
3090
3091	done:
3092	*mp = m;
3093	*nextrecordp = nextrecord;
3094
3095	return (error);
3096	}
3097
3098	/*
3099	* Implement receive operations on a socket.
3100	* We depend on the way that records are added to the sockbuf
3101	* by sbappend*. In particular, each record (mbufs linked through m_next)
3102	* must begin with an address if the protocol so specifies,
3103	* followed by an optional mbuf or mbufs containing ancillary data,
3104	* and then zero or more mbufs of data.
3105	* In order to avoid blocking network interrupts for the entire time here,
3106	* we splx() while doing the actual copy to user space.
3107	* Although the sockbuf is locked, new data may still be appended,
3108	* and thus we must maintain consistency of the sockbuf during that time.
3109	*
3110	* The caller may receive the data as a single mbuf chain by supplying
3111	* an mbuf **mp0 for use in returning the chain. The uio is then used
3112	* only for the count in uio_resid.
3113	*
3114	* Returns: 0 Success
3115	* ENOBUFS
3116	* ENOTCONN
3117	* EWOULDBLOCK
3118	* uiomove:EFAULT
3119	* sblock:EWOULDBLOCK
3120	* sblock:EINTR
3121	* sbwait:EBADF
3122	* sbwait:EINTR
3123	* sodelayed_copy:EFAULT
3124	* <pru_rcvoob>:EINVAL[TCP]
3125	* <pru_rcvoob>:EWOULDBLOCK[TCP]
3126	* <pru_rcvoob>:???
3127	* <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3128	* <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3129	* <pr_domain->dom_externalize>:???
3130	*
3131	* Notes: Additional return values from calls through <pru_rcvoob> and
3132	* <pr_domain->dom_externalize> depend on protocols other than
3133	* TCP or AF_UNIX, which are documented above.
3134	*/
3135	int
3136	soreceive(struct socket so, struct* sockaddr psa, struct** uio *uio,
3137	struct mbuf mp0, struct mbuf controlp, int *flagsp)
3138	{
3139	struct mbuf m, mp, ml = NULL;
3140	struct mbuf nextrecord, free_list;
3141	int flags, error, offset;
3142	user_ssize_t len;
3143	struct protosw *pr = so->so_proto;
3144	int moff, type = `0`;
3145	user_ssize_t orig_resid = uio_resid(uio);
3146	user_ssize_t delayed_copy_len;
3147	int can_delay;
3148	int need_event;
3149	struct proc *p = current_proc();
3150	boolean_t en_tracing = FALSE;
3151
3152	/*
3153	* Sanity check on the length passed by caller as we are making 'int'
3154	* comparisons
3155	*/
3156	if (orig_resid < `0` \|\| orig_resid > INT_MAX)
3157	return (EINVAL);
3158
3159	KERNEL_DEBUG(DBG_FNC_SORECEIVE \| DBG_FUNC_START, so,
3160	uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3161	so->so_rcv.sb_hiwat);
3162
3163	socket_lock(so, `1`);
3164	so_update_last_owner_locked(so, p);
3165	so_update_policy(so);
3166
3167	#ifdef MORE_LOCKING_DEBUG
3168	if (so->so_usecount == `1`) {
3169	panic("%s: so=%x no other reference on socket\n", __func__, so);
3170	/ NOTREACHED /
3171	}
3172	#endif
3173	mp = mp0;
3174	if (psa != NULL)
3175	*psa = NULL;
3176	if (controlp != NULL)
3177	*controlp = NULL;
3178	if (flagsp != NULL)
3179	flags = *flagsp &~ MSG_EOR;
3180	else
3181	flags = `0`;
3182
3183	/*
3184	* If a recv attempt is made on a previously-accepted socket
3185	* that has been marked as inactive (disconnected), reject
3186	* the request.
3187	*/
3188	if (so->so_flags & SOF_DEFUNCT) {
3189	struct sockbuf *sb = &so->so_rcv;
3190
3191	error = ENOTCONN;
3192	SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
3193	__func__, proc_pid(p), proc_best_name(p),
3194	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
3195	SOCK_DOM(so), SOCK_TYPE(so), error);
3196	/*
3197	* This socket should have been disconnected and flushed
3198	* prior to being returned from sodefunct(); there should
3199	* be no data on its receive list, so panic otherwise.
3200	*/
3201	if (so->so_state & SS_DEFUNCT)
3202	sb_empty_assert(sb, __func__);
3203	socket_unlock(so, `1`);
3204	return (error);
3205	}
3206
3207	if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3208	pr->pr_usrreqs->pru_preconnect) {
3209	/*
3210	* A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not
3211	* calling write() right after this. If the app calls a read
3212	* we do not want to block this read indefinetely. Thus,
3213	* we trigger a connect so that the session gets initiated.
3214	*/
3215	error = (*pr->pr_usrreqs->pru_preconnect)(so);
3216
3217	if (error) {
3218	socket_unlock(so, `1`);
3219	return (error);
3220	}
3221	}
3222
3223	if (ENTR_SHOULDTRACE &&
3224	(SOCK_CHECK_DOM(so, AF_INET) \|\| SOCK_CHECK_DOM(so, AF_INET6))) {
3225	/*
3226	* enable energy tracing for inet sockets that go over
3227	* non-loopback interfaces only.
3228	*/
3229	struct inpcb *inp = sotoinpcb(so);
3230	if (inp->inp_last_outifp != NULL &&
3231	!(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3232	en_tracing = TRUE;
3233	KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3234	VM_KERNEL_ADDRPERM(so),
3235	((so->so_state & SS_NBIO) ?
3236	kEnTrFlagNonBlocking : `0`),
3237	(int64_t)orig_resid);
3238	}
3239	}
3240
3241	/*
3242	* When SO_WANTOOBFLAG is set we try to get out-of-band data
3243	* regardless of the flags argument. This is the case where
3244	* out-of-band data is not inline.
3245	*/
3246	if ((flags & MSG_OOB) \|\|
3247	((so->so_options & SO_WANTOOBFLAG) != `0` &&
3248	(so->so_options & SO_OOBINLINE) == `0` &&
3249	(so->so_oobmark \|\| (so->so_state & SS_RCVATMARK)))) {
3250	m = m_get(M_WAIT, MT_DATA);
3251	if (m == NULL) {
3252	socket_unlock(so, `1`);
3253	KERNEL_DEBUG(DBG_FNC_SORECEIVE \| DBG_FUNC_END,
3254	ENOBUFS, `0`, `0`, `0`, `0`);
3255	return (ENOBUFS);
3256	}
3257	error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3258	if (error)
3259	goto bad;
3260	socket_unlock(so, `0`);
3261	do {
3262	error = uiomove(mtod(m, caddr_t),
3263	imin(uio_resid(uio), m->m_len), uio);
3264	m = m_free(m);
3265	} while (uio_resid(uio) && error == `0` && m != NULL);
3266	socket_lock(so, `0`);
3267	bad:
3268	if (m != NULL)
3269	m_freem(m);
3270
3271	if ((so->so_options & SO_WANTOOBFLAG) != `0`) {
3272	if (error == EWOULDBLOCK \|\| error == EINVAL) {
3273	/*
3274	* Let's try to get normal data:
3275	* EWOULDBLOCK: out-of-band data not
3276	* received yet. EINVAL: out-of-band data
3277	* already read.
3278	*/
3279	error = `0`;
3280	goto nooob;
3281	} else if (error == `0` && flagsp != NULL) {
3282	*flagsp \|= MSG_OOB;
3283	}
3284	}
3285	socket_unlock(so, `1`);
3286	if (en_tracing) {
3287	KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3288	VM_KERNEL_ADDRPERM(so), `0`,
3289	(int64_t)(orig_resid - uio_resid(uio)));
3290	}
3291	KERNEL_DEBUG(DBG_FNC_SORECEIVE \| DBG_FUNC_END, error,
3292	`0`, `0`, `0`, `0`);
3293
3294	return (error);
3295	}
3296	nooob:
3297	if (mp != NULL)
3298	*mp = NULL;
3299
3300	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3301	(*pr->pr_usrreqs->pru_rcvd)(so, `0`);
3302	}
3303
3304	free_list = NULL;
3305	delayed_copy_len = `0`;
3306	restart:
3307	#ifdef MORE_LOCKING_DEBUG
3308	if (so->so_usecount <= `1`)
3309	printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3310	(uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3311	#endif
3312	/*
3313	* See if the socket has been closed (SS_NOFDREF\|SS_CANTRCVMORE)
3314	* and if so just return to the caller. This could happen when
3315	* soreceive() is called by a socket upcall function during the
3316	* time the socket is freed. The socket buffer would have been
3317	* locked across the upcall, therefore we cannot put this thread
3318	* to sleep (else we will deadlock) or return EWOULDBLOCK (else
3319	* we may livelock), because the lock on the socket buffer will
3320	* only be released when the upcall routine returns to its caller.
3321	* Because the socket has been officially closed, there can be
3322	* no further read on it.
3323	*
3324	* A multipath subflow socket would have its SS_NOFDREF set by
3325	* default, so check for SOF_MP_SUBFLOW socket flag; when the
3326	* socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3327	*/
3328	if ((so->so_state & (SS_NOFDREF \| SS_CANTRCVMORE)) ==
3329	(SS_NOFDREF \| SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3330	socket_unlock(so, `1`);
3331	return (`0`);
3332	}
3333
3334	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
3335	if (error) {
3336	socket_unlock(so, `1`);
3337	KERNEL_DEBUG(DBG_FNC_SORECEIVE \| DBG_FUNC_END, error,
3338	`0`, `0`, `0`, `0`);
3339	if (en_tracing) {
3340	KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3341	VM_KERNEL_ADDRPERM(so), `0`,
3342	(int64_t)(orig_resid - uio_resid(uio)));
3343	}
3344	return (error);
3345	}
3346
3347	m = so->so_rcv.sb_mb;
3348	/*
3349	* If we have less data than requested, block awaiting more
3350	* (subject to any timeout) if:
3351	* 1. the current count is less than the low water mark, or
3352	* 2. MSG_WAITALL is set, and it is possible to do the entire
3353	* receive operation at once if we block (resid <= hiwat).
3354	* 3. MSG_DONTWAIT is not set
3355	* If MSG_WAITALL is set but resid is larger than the receive buffer,
3356	* we have to do the receive in sections, and thus risk returning
3357	* a short count if a timeout or signal occurs after we start.
3358	*/
3359	if (m == NULL \|\| (((flags & MSG_DONTWAIT) == `0` &&
3360	so->so_rcv.sb_cc < uio_resid(uio)) &&
3361	(so->so_rcv.sb_cc < so->so_rcv.sb_lowat \|\|
3362	((flags & MSG_WAITALL) && uio_resid(uio) <= so->so_rcv.sb_hiwat)) &&
3363	m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == `0`)) {
3364	/*
3365	* Panic if we notice inconsistencies in the socket's
3366	* receive list; both sb_mb and sb_cc should correctly
3367	* reflect the contents of the list, otherwise we may
3368	* end up with false positives during select() or poll()
3369	* which could put the application in a bad state.
3370	*/
3371	SB_MB_CHECK(&so->so_rcv);
3372
3373	if (so->so_error) {
3374	if (m != NULL)
3375	goto dontblock;
3376	error = so->so_error;
3377	if ((flags & MSG_PEEK) == `0`)
3378	so->so_error = `0`;
3379	goto release;
3380	}
3381	if (so->so_state & SS_CANTRCVMORE) {
3382	#if CONTENT_FILTER
3383	/*
3384	* Deal with half closed connections
3385	*/
3386	if ((so->so_state & SS_ISDISCONNECTED) == `0` &&
3387	cfil_sock_data_pending(&so->so_rcv) != `0`)
3388	CFIL_LOG(LOG_INFO,
3389	"so %llx ignore SS_CANTRCVMORE",
3390	(uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3391	else
3392	#endif /* CONTENT_FILTER */
3393	if (m != NULL)
3394	goto dontblock;
3395	else
3396	goto release;
3397	}
3398	for (; m != NULL; m = m->m_next)
3399	if (m->m_type == MT_OOBDATA \|\| (m->m_flags & M_EOR)) {
3400	m = so->so_rcv.sb_mb;
3401	goto dontblock;
3402	}
3403	if ((so->so_state & (SS_ISCONNECTED\|SS_ISCONNECTING)) == `0` &&
3404	(so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3405	error = ENOTCONN;
3406	goto release;
3407	}
3408	if (uio_resid(uio) == `0`)
3409	goto release;
3410
3411	if ((so->so_state & SS_NBIO) \|\|
3412	(flags & (MSG_DONTWAIT\|MSG_NBIO))) {
3413	error = EWOULDBLOCK;
3414	goto release;
3415	}
3416	SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3417	SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3418	sbunlock(&so->so_rcv, TRUE); / keep socket locked /
3419	#if EVEN_MORE_LOCKING_DEBUG
3420	if (socket_debug)
3421	printf("Waiting for socket data\n");
3422	#endif
3423
3424	error = sbwait(&so->so_rcv);
3425	#if EVEN_MORE_LOCKING_DEBUG
3426	if (socket_debug)
3427	printf("SORECEIVE - sbwait returned %d\n", error);
3428	#endif
3429	if (so->so_usecount < `1`) {
3430	panic("%s: after 2nd sblock so=%p ref=%d on socket\n",
3431	__func__, so, so->so_usecount);
3432	/ NOTREACHED /
3433	}
3434	if (error) {
3435	socket_unlock(so, `1`);
3436	KERNEL_DEBUG(DBG_FNC_SORECEIVE \| DBG_FUNC_END, error,
3437	`0`, `0`, `0`, `0`);
3438	if (en_tracing) {
3439	KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3440	VM_KERNEL_ADDRPERM(so), `0`,
3441	(int64_t)(orig_resid - uio_resid(uio)));
3442	}
3443	return (error);
3444	}
3445	goto restart;
3446	}
3447	dontblock:
3448	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
3449	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3450	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3451	nextrecord = m->m_nextpkt;
3452
3453	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3454	error = soreceive_addr(p, so, psa, flags, &m, &nextrecord,
3455	mp0 == NULL);
3456	if (error == ERESTART)
3457	goto restart;
3458	else if (error != `0`)
3459	goto release;
3460	orig_resid = `0`;
3461	}
3462
3463	/*
3464	* Process one or more MT_CONTROL mbufs present before any data mbufs
3465	* in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3466	* just copy the data; if !MSG_PEEK, we call into the protocol to
3467	* perform externalization.
3468	*/
3469	if (m != NULL && m->m_type == MT_CONTROL) {
3470	error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3471	if (error != `0`)
3472	goto release;
3473	orig_resid = `0`;
3474	}
3475
3476	/*
3477	* If the socket is a TCP socket with message delivery
3478	* enabled, then create a control msg to deliver the
3479	* relative TCP sequence number for this data. Waiting
3480	* until this point will protect against failures to
3481	* allocate an mbuf for control msgs.
3482	*/
3483	if (so->so_type == SOCK_STREAM && SOCK_PROTO(so) == IPPROTO_TCP &&
3484	(so->so_flags & SOF_ENABLE_MSGS) && controlp != NULL) {
3485	struct mbuf *seq_cm;
3486
3487	seq_cm = sbcreatecontrol((caddr_t)&m->m_pkthdr.msg_seq,
3488	sizeof (uint32_t), SCM_SEQNUM, SOL_SOCKET);
3489	if (seq_cm == NULL) {
3490	/ unable to allocate a control mbuf /
3491	error = ENOBUFS;
3492	goto release;
3493	}
3494	*controlp = seq_cm;
3495	controlp = &seq_cm->m_next;
3496	}
3497
3498	if (m != NULL) {
3499	if (!(flags & MSG_PEEK)) {
3500	/*
3501	* We get here because m points to an mbuf following
3502	* any MT_SONAME or MT_CONTROL mbufs which have been
3503	* processed above. In any case, m should be pointing
3504	* to the head of the mbuf chain, and the nextrecord
3505	* should be either NULL or equal to m->m_nextpkt.
3506	* See comments above about SB_LOCK.
3507	*/
3508	if (m != so->so_rcv.sb_mb \|\|
3509	m->m_nextpkt != nextrecord) {
3510	panic("%s: post-control !sync so=%p m=%p "
3511	"nextrecord=%p\n", __func__, so, m,
3512	nextrecord);
3513	/ NOTREACHED /
3514	}
3515	if (nextrecord == NULL)
3516	so->so_rcv.sb_lastrecord = m;
3517	}
3518	type = m->m_type;
3519	if (type == MT_OOBDATA)
3520	flags \|= MSG_OOB;
3521	} else {
3522	if (!(flags & MSG_PEEK)) {
3523	SB_EMPTY_FIXUP(&so->so_rcv);
3524	}
3525	}
3526	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3527	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3528
3529	moff = `0`;
3530	offset = `0`;
3531
3532	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy)
3533	can_delay = `1`;
3534	else
3535	can_delay = `0`;
3536
3537	need_event = `0`;
3538
3539	while (m != NULL &&
3540	(uio_resid(uio) - delayed_copy_len) > `0` && error == `0`) {
3541	if (m->m_type == MT_OOBDATA) {
3542	if (type != MT_OOBDATA)
3543	break;
3544	} else if (type == MT_OOBDATA) {
3545	break;
3546	}
3547	/*
3548	* Make sure to always set the MSG_OOB event when getting
3549	* out-of-band data inline.
3550	*/
3551	if ((so->so_options & SO_WANTOOBFLAG) != `0` &&
3552	(so->so_options & SO_OOBINLINE) != `0` &&
3553	(so->so_state & SS_RCVATMARK) != `0`) {
3554	flags \|= MSG_OOB;
3555	}
3556	so->so_state &= ~SS_RCVATMARK;
3557	len = uio_resid(uio) - delayed_copy_len;
3558	if (so->so_oobmark && len > so->so_oobmark - offset)
3559	len = so->so_oobmark - offset;
3560	if (len > m->m_len - moff)
3561	len = m->m_len - moff;
3562	/*
3563	* If mp is set, just pass back the mbufs.
3564	* Otherwise copy them out via the uio, then free.
3565	* Sockbuf must be consistent here (points to current mbuf,
3566	* it points to next record) when we drop priority;
3567	* we must note any additions to the sockbuf when we
3568	* block interrupts again.
3569	*/
3570	if (mp == NULL) {
3571	SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3572	SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3573	if (can_delay && len == m->m_len) {
3574	/*
3575	* only delay the copy if we're consuming the
3576	* mbuf and we're NOT in MSG_PEEK mode
3577	* and we have enough data to make it worthwhile
3578	* to drop and retake the lock... can_delay
3579	* reflects the state of the 2 latter
3580	* constraints; moff should always be zero
3581	* in these cases
3582	*/
3583	delayed_copy_len += len;
3584	} else {
3585	if (delayed_copy_len) {
3586	error = sodelayed_copy(so, uio,
3587	&free_list, &delayed_copy_len);
3588
3589	if (error) {
3590	goto release;
3591	}
3592	/*
3593	* can only get here if MSG_PEEK is not
3594	* set therefore, m should point at the
3595	* head of the rcv queue; if it doesn't,
3596	* it means something drastically
3597	* changed while we were out from behind
3598	* the lock in sodelayed_copy. perhaps
3599	* a RST on the stream. in any event,
3600	* the stream has been interrupted. it's
3601	* probably best just to return whatever
3602	* data we've moved and let the caller
3603	* sort it out...
3604	*/
3605	if (m != so->so_rcv.sb_mb) {
3606	break;
3607	}
3608	}
3609	socket_unlock(so, `0`);
3610	error = uiomove(mtod(m, caddr_t) + moff,
3611	(int)len, uio);
3612	socket_lock(so, `0`);
3613
3614	if (error)
3615	goto release;
3616	}
3617	} else {
3618	uio_setresid(uio, (uio_resid(uio) - len));
3619	}
3620	if (len == m->m_len - moff) {
3621	if (m->m_flags & M_EOR)
3622	flags \|= MSG_EOR;
3623	if (flags & MSG_PEEK) {
3624	m = m->m_next;
3625	moff = `0`;
3626	} else {
3627	nextrecord = m->m_nextpkt;
3628	sbfree(&so->so_rcv, m);
3629	m->m_nextpkt = NULL;
3630
3631	/*
3632	* If this packet is an unordered packet
3633	* (indicated by M_UNORDERED_DATA flag), remove
3634	* the additional bytes added to the
3635	* receive socket buffer size.
3636	*/
3637	if ((so->so_flags & SOF_ENABLE_MSGS) &&
3638	m->m_len &&
3639	(m->m_flags & M_UNORDERED_DATA) &&
3640	sbreserve(&so->so_rcv,
3641	so->so_rcv.sb_hiwat - m->m_len)) {
3642	if (so->so_msg_state->msg_uno_bytes >
3643	m->m_len) {
3644	so->so_msg_state->
3645	msg_uno_bytes -= m->m_len;
3646	} else {
3647	so->so_msg_state->
3648	msg_uno_bytes = `0`;
3649	}
3650	m->m_flags &= ~M_UNORDERED_DATA;
3651	}
3652
3653	if (mp != NULL) {
3654	*mp = m;
3655	mp = &m->m_next;
3656	so->so_rcv.sb_mb = m = m->m_next;
3657	*mp = NULL;
3658	} else {
3659	if (free_list == NULL)
3660	free_list = m;
3661	else
3662	ml->m_next = m;
3663	ml = m;
3664	so->so_rcv.sb_mb = m = m->m_next;
3665	ml->m_next = NULL;
3666	}
3667	if (m != NULL) {
3668	m->m_nextpkt = nextrecord;
3669	if (nextrecord == NULL)
3670	so->so_rcv.sb_lastrecord = m;
3671	} else {
3672	so->so_rcv.sb_mb = nextrecord;
3673	SB_EMPTY_FIXUP(&so->so_rcv);
3674	}
3675	SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3676	SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3677	}
3678	} else {
3679	if (flags & MSG_PEEK) {
3680	moff += len;
3681	} else {
3682	if (mp != NULL) {
3683	int copy_flag;
3684
3685	if (flags & MSG_DONTWAIT)
3686	copy_flag = M_DONTWAIT;
3687	else
3688	copy_flag = M_WAIT;
3689	*mp = m_copym(m, `0`, len, copy_flag);
3690	/*
3691	* Failed to allocate an mbuf?
3692	* Adjust uio_resid back, it was
3693	* adjusted down by len bytes which
3694	* we didn't copy over.
3695	*/
3696	if (*mp == NULL) {
3697	uio_setresid(uio,
3698	(uio_resid(uio) + len));
3699	break;
3700	}
3701	}
3702	m->m_data += len;
3703	m->m_len -= len;
3704	so->so_rcv.sb_cc -= len;
3705	}
3706	}
3707	if (so->so_oobmark) {
3708	if ((flags & MSG_PEEK) == `0`) {
3709	so->so_oobmark -= len;
3710	if (so->so_oobmark == `0`) {
3711	so->so_state \|= SS_RCVATMARK;
3712	/*
3713	* delay posting the actual event until
3714	* after any delayed copy processing
3715	* has finished
3716	*/
3717	need_event = `1`;
3718	break;
3719	}
3720	} else {
3721	offset += len;
3722	if (offset == so->so_oobmark)
3723	break;
3724	}
3725	}
3726	if (flags & MSG_EOR)
3727	break;
3728	/*
3729	* If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3730	* (for non-atomic socket), we must not quit until
3731	* "uio->uio_resid == 0" or an error termination.
3732	* If a signal/timeout occurs, return with a short
3733	* count but without error. Keep sockbuf locked
3734	* against other readers.
3735	*/
3736	while (flags & (MSG_WAITALL\|MSG_WAITSTREAM) && m == NULL &&
3737	(uio_resid(uio) - delayed_copy_len) > `0` &&
3738	!sosendallatonce(so) && !nextrecord) {
3739	if (so->so_error \|\| ((so->so_state & SS_CANTRCVMORE)
3740	#if CONTENT_FILTER
3741	&& cfil_sock_data_pending(&so->so_rcv) == `0`
3742	#endif /* CONTENT_FILTER */
3743	))
3744	goto release;
3745
3746	/*
3747	* Depending on the protocol (e.g. TCP), the following
3748	* might cause the socket lock to be dropped and later
3749	* be reacquired, and more data could have arrived and
3750	* have been appended to the receive socket buffer by
3751	* the time it returns. Therefore, we only sleep in
3752	* sbwait() below if and only if the socket buffer is
3753	* empty, in order to avoid a false sleep.
3754	*/
3755	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb &&
3756	(((struct inpcb *)so->so_pcb)->inp_state !=
3757	INPCB_STATE_DEAD))
3758	(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3759
3760	SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3761	SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3762
3763	if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3764	error = `0`;
3765	goto release;
3766	}
3767	/*
3768	* have to wait until after we get back from the sbwait
3769	* to do the copy because we will drop the lock if we
3770	* have enough data that has been delayed... by dropping
3771	* the lock we open up a window allowing the netisr
3772	* thread to process the incoming packets and to change
3773	* the state of this socket... we're issuing the sbwait
3774	* because the socket is empty and we're expecting the
3775	* netisr thread to wake us up when more packets arrive;
3776	* if we allow that processing to happen and then sbwait
3777	* we could stall forever with packets sitting in the
3778	* socket if no further packets arrive from the remote
3779	* side.
3780	*
3781	* we want to copy before we've collected all the data
3782	* to satisfy this request to allow the copy to overlap
3783	* the incoming packet processing on an MP system
3784	*/
3785	if (delayed_copy_len > sorecvmincopy &&
3786	(delayed_copy_len > (so->so_rcv.sb_hiwat / `2`))) {
3787	error = sodelayed_copy(so, uio,
3788	&free_list, &delayed_copy_len);
3789
3790	if (error)
3791	goto release;
3792	}
3793	m = so->so_rcv.sb_mb;
3794	if (m != NULL) {
3795	nextrecord = m->m_nextpkt;
3796	}
3797	SB_MB_CHECK(&so->so_rcv);
3798	}
3799	}
3800	#ifdef MORE_LOCKING_DEBUG
3801	if (so->so_usecount <= `1`) {
3802	panic("%s: after big while so=%p ref=%d on socket\n",
3803	__func__, so, so->so_usecount);
3804	/ NOTREACHED /
3805	}
3806	#endif
3807
3808	if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3809	if (so->so_options & SO_DONTTRUNC) {
3810	flags \|= MSG_RCVMORE;
3811	} else {
3812	flags \|= MSG_TRUNC;
3813	if ((flags & MSG_PEEK) == `0`)
3814	(void) sbdroprecord(&so->so_rcv);
3815	}
3816	}
3817
3818	/*
3819	* pru_rcvd below (for TCP) may cause more data to be received
3820	* if the socket lock is dropped prior to sending the ACK; some
3821	* legacy OpenTransport applications don't handle this well
3822	* (if it receives less data than requested while MSG_HAVEMORE
3823	* is set), and so we set the flag now based on what we know
3824	* prior to calling pru_rcvd.
3825	*/
3826	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > `0`)
3827	flags \|= MSG_HAVEMORE;
3828
3829	if ((flags & MSG_PEEK) == `0`) {
3830	if (m == NULL) {
3831	so->so_rcv.sb_mb = nextrecord;
3832	/*
3833	* First part is an inline SB_EMPTY_FIXUP(). Second
3834	* part makes sure sb_lastrecord is up-to-date if
3835	* there is still data in the socket buffer.
3836	*/
3837	if (so->so_rcv.sb_mb == NULL) {
3838	so->so_rcv.sb_mbtail = NULL;
3839	so->so_rcv.sb_lastrecord = NULL;
3840	} else if (nextrecord->m_nextpkt == NULL) {
3841	so->so_rcv.sb_lastrecord = nextrecord;
3842	}
3843	SB_MB_CHECK(&so->so_rcv);
3844	}
3845	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3846	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3847	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
3848	(*pr->pr_usrreqs->pru_rcvd)(so, flags);
3849	}
3850
3851	if (delayed_copy_len) {
3852	error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3853	if (error)
3854	goto release;
3855	}
3856	if (free_list != NULL) {
3857	m_freem_list(free_list);
3858	free_list = NULL;
3859	}
3860	if (need_event)
3861	postevent(so, `0`, EV_OOB);
3862
3863	if (orig_resid == uio_resid(uio) && orig_resid &&
3864	(flags & MSG_EOR) == `0` && (so->so_state & SS_CANTRCVMORE) == `0`) {
3865	sbunlock(&so->so_rcv, TRUE); / keep socket locked /
3866	goto restart;
3867	}
3868
3869	if (flagsp != NULL)
3870	*flagsp \|= flags;
3871	release:
3872	#ifdef MORE_LOCKING_DEBUG
3873	if (so->so_usecount <= `1`) {
3874	panic("%s: release so=%p ref=%d on socket\n", __func__,
3875	so, so->so_usecount);
3876	/ NOTREACHED /
3877	}
3878	#endif
3879	if (delayed_copy_len)
3880	error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
3881
3882	if (free_list != NULL)
3883	m_freem_list(free_list);
3884
3885	sbunlock(&so->so_rcv, FALSE); / will unlock socket /
3886
3887	if (en_tracing) {
3888	KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3889	VM_KERNEL_ADDRPERM(so),
3890	((error == EWOULDBLOCK) ? kEnTrFlagNoWork : `0`),
3891	(int64_t)(orig_resid - uio_resid(uio)));
3892	}
3893	KERNEL_DEBUG(DBG_FNC_SORECEIVE \| DBG_FUNC_END, so, uio_resid(uio),
3894	so->so_rcv.sb_cc, `0`, error);
3895
3896	return (error);
3897	}
3898
3899	/*
3900	* Returns: 0 Success
3901	* uiomove:EFAULT
3902	*/
3903	static int
3904	sodelayed_copy(struct socket so, struct* uio uio, struct* mbuf **free_list,
3905	user_ssize_t *resid)
3906	{
3907	int error = `0`;
3908	struct mbuf *m;
3909
3910	m = *free_list;
3911
3912	socket_unlock(so, `0`);
3913
3914	while (m != NULL && error == `0`) {
3915	error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
3916	m = m->m_next;
3917	}
3918	m_freem_list(*free_list);
3919
3920	*free_list = NULL;
3921	*resid = `0`;
3922
3923	socket_lock(so, `0`);
3924
3925	return (error);
3926	}
3927
3928	static int
3929	sodelayed_copy_list(struct socket so, struct* recv_msg_elem *msgarray,
3930	u_int uiocnt, struct mbuf *free_list, user_ssize_t resid)
3931	{
3932	#pragma unused(so)
3933	int error = `0`;
3934	struct mbuf ml, m;
3935	int i = `0`;
3936	struct uio *auio;
3937
3938	for (ml = *free_list, i = `0`; ml != NULL && i < uiocnt;
3939	ml = ml->m_nextpkt, i++) {
3940	auio = msgarray[i].uio;
3941	for (m = ml; m != NULL; m = m->m_next) {
3942	error = uiomove(mtod(m, caddr_t), m->m_len, auio);
3943	if (error != `0`)
3944	goto out;
3945	}
3946	}
3947	out:
3948	m_freem_list(*free_list);
3949
3950	*free_list = NULL;
3951	*resid = `0`;
3952
3953	return (error);
3954	}
3955
3956	int
3957	soreceive_list(struct socket so, struct* recv_msg_elem *msgarray, u_int uiocnt,
3958	int *flagsp)
3959	{
3960	struct mbuf *m;
3961	struct mbuf *nextrecord;
3962	struct mbuf ml = NULL, free_list = NULL, *free_tail = NULL;
3963	int error;
3964	user_ssize_t len, pktlen, delayed_copy_len = `0`;
3965	struct protosw *pr = so->so_proto;
3966	user_ssize_t resid;
3967	struct proc *p = current_proc();
3968	struct uio *auio = NULL;
3969	int npkts = `0`;
3970	int sblocked = `0`;
3971	struct sockaddr **psa = NULL;
3972	struct mbuf **controlp = NULL;
3973	int can_delay;
3974	int flags;
3975	struct mbuf *free_others = NULL;
3976
3977	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST \| DBG_FUNC_START,
3978	so, uiocnt,
3979	so->so_rcv.sb_cc, so->so_rcv.sb_lowat, so->so_rcv.sb_hiwat);
3980
3981	/*
3982	* Sanity checks:
3983	* - Only supports don't wait flags
3984	* - Only support datagram sockets (could be extended to raw)
3985	* - Must be atomic
3986	* - Protocol must support packet chains
3987	* - The uio array is NULL (should we panic?)
3988	*/
3989	if (flagsp != NULL)
3990	flags = *flagsp;
3991	else
3992	flags = `0`;
3993	if (flags & ~(MSG_PEEK \| MSG_WAITALL \| MSG_DONTWAIT \| MSG_NEEDSA \|
3994	MSG_NBIO)) {
3995	printf("%s invalid flags 0x%x\n", __func__, flags);
3996	error = EINVAL;
3997	goto out;
3998	}
3999	if (so->so_type != SOCK_DGRAM) {
4000	error = EINVAL;
4001	goto out;
4002	}
4003	if (sosendallatonce(so) == `0`) {
4004	error = EINVAL;
4005	goto out;
4006	}
4007	if (so->so_proto->pr_usrreqs->pru_send_list == NULL) {
4008	error = EPROTONOSUPPORT;
4009	goto out;
4010	}
4011	if (msgarray == NULL) {
4012	printf("%s uioarray is NULL\n", __func__);
4013	error = EINVAL;
4014	goto out;
4015	}
4016	if (uiocnt == `0`) {
4017	printf("%s uiocnt is 0\n", __func__);
4018	error = EINVAL;
4019	goto out;
4020	}
4021	/*
4022	* Sanity check on the length passed by caller as we are making 'int'
4023	* comparisons
4024	*/
4025	resid = recv_msg_array_resid(msgarray, uiocnt);
4026	if (resid < `0` \|\| resid > INT_MAX) {
4027	error = EINVAL;
4028	goto out;
4029	}
4030
4031	if (!(flags & MSG_PEEK) && sorecvmincopy > `0`)
4032	can_delay = `1`;
4033	else
4034	can_delay = `0`;
4035
4036	socket_lock(so, `1`);
4037	so_update_last_owner_locked(so, p);
4038	so_update_policy(so);
4039
4040	#if NECP
4041	so_update_necp_policy(so, NULL, NULL);
4042	#endif /* NECP */
4043
4044	/*
4045	* If a recv attempt is made on a previously-accepted socket
4046	* that has been marked as inactive (disconnected), reject
4047	* the request.
4048	*/
4049	if (so->so_flags & SOF_DEFUNCT) {
4050	struct sockbuf *sb = &so->so_rcv;
4051
4052	error = ENOTCONN;
4053	SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4054	__func__, proc_pid(p), proc_best_name(p),
4055	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4056	SOCK_DOM(so), SOCK_TYPE(so), error);
4057	/*
4058	* This socket should have been disconnected and flushed
4059	* prior to being returned from sodefunct(); there should
4060	* be no data on its receive list, so panic otherwise.
4061	*/
4062	if (so->so_state & SS_DEFUNCT)
4063	sb_empty_assert(sb, __func__);
4064	goto release;
4065	}
4066
4067	next:
4068	/*
4069	* The uio may be empty
4070	*/
4071	if (npkts >= uiocnt) {
4072	error = `0`;
4073	goto release;
4074	}
4075	restart:
4076	/*
4077	* See if the socket has been closed (SS_NOFDREF\|SS_CANTRCVMORE)
4078	* and if so just return to the caller. This could happen when
4079	* soreceive() is called by a socket upcall function during the
4080	* time the socket is freed. The socket buffer would have been
4081	* locked across the upcall, therefore we cannot put this thread
4082	* to sleep (else we will deadlock) or return EWOULDBLOCK (else
4083	* we may livelock), because the lock on the socket buffer will
4084	* only be released when the upcall routine returns to its caller.
4085	* Because the socket has been officially closed, there can be
4086	* no further read on it.
4087	*/
4088	if ((so->so_state & (SS_NOFDREF \| SS_CANTRCVMORE)) ==
4089	(SS_NOFDREF \| SS_CANTRCVMORE)) {
4090	error = `0`;
4091	goto release;
4092	}
4093
4094	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4095	if (error) {
4096	goto release;
4097	}
4098	sblocked = `1`;
4099
4100	m = so->so_rcv.sb_mb;
4101	/*
4102	* Block awaiting more datagram if needed
4103	*/
4104	if (m == NULL \|\| (((flags & MSG_DONTWAIT) == `0` &&
4105	(so->so_rcv.sb_cc < so->so_rcv.sb_lowat \|\|
4106	((flags & MSG_WAITALL) && npkts < uiocnt))))) {
4107	/*
4108	* Panic if we notice inconsistencies in the socket's
4109	* receive list; both sb_mb and sb_cc should correctly
4110	* reflect the contents of the list, otherwise we may
4111	* end up with false positives during select() or poll()
4112	* which could put the application in a bad state.
4113	*/
4114	SB_MB_CHECK(&so->so_rcv);
4115
4116	if (so->so_error) {
4117	error = so->so_error;
4118	if ((flags & MSG_PEEK) == `0`)
4119	so->so_error = `0`;
4120	goto release;
4121	}
4122	if (so->so_state & SS_CANTRCVMORE) {
4123	goto release;
4124	}
4125	if ((so->so_state & (SS_ISCONNECTED\|SS_ISCONNECTING)) == `0` &&
4126	(so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4127	error = ENOTCONN;
4128	goto release;
4129	}
4130	if ((so->so_state & SS_NBIO) \|\|
4131	(flags & (MSG_DONTWAIT\|MSG_NBIO))) {
4132	error = EWOULDBLOCK;
4133	goto release;
4134	}
4135	/*
4136	* Do not block if we got some data
4137	*/
4138	if (free_list != NULL) {
4139	error = `0`;
4140	goto release;
4141	}
4142
4143	SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4144	SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4145
4146	sbunlock(&so->so_rcv, TRUE); / keep socket locked /
4147	sblocked = `0`;
4148
4149	error = sbwait(&so->so_rcv);
4150	if (error) {
4151	goto release;
4152	}
4153	goto restart;
4154	}
4155
4156	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
4157	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4158	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4159
4160	/*
4161	* Consume the current uio index as we have a datagram
4162	*/
4163	auio = msgarray[npkts].uio;
4164	resid = uio_resid(auio);
4165	msgarray[npkts].which \|= SOCK_MSG_DATA;
4166	psa = (msgarray[npkts].which & SOCK_MSG_SA) ?
4167	&msgarray[npkts].psa : NULL;
4168	controlp = (msgarray[npkts].which & SOCK_MSG_CONTROL) ?
4169	&msgarray[npkts].controlp : NULL;
4170	npkts += `1`;
4171	nextrecord = m->m_nextpkt;
4172
4173	if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4174	error = soreceive_addr(p, so, psa, flags, &m, &nextrecord, `1`);
4175	if (error == ERESTART)
4176	goto restart;
4177	else if (error != `0`)
4178	goto release;
4179	}
4180
4181	if (m != NULL && m->m_type == MT_CONTROL) {
4182	error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
4183	if (error != `0`)
4184	goto release;
4185	}
4186
4187	if (m->m_pkthdr.len == `0`) {
4188	printf("%s:%d so %llx pkt %llx type %u pktlen null\n",
4189	__func__, __LINE__,
4190	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
4191	(uint64_t)DEBUG_KERNEL_ADDRPERM(m),
4192	m->m_type);
4193	}
4194
4195	/*
4196	* Loop to copy the mbufs of the current record
4197	* Support zero length packets
4198	*/
4199	ml = NULL;
4200	pktlen = `0`;
4201	while (m != NULL && (len = resid - pktlen) >= `0` && error == `0`) {
4202	if (m->m_len == `0`)
4203	panic("%p m_len zero", m);
4204	if (m->m_type == `0`)
4205	panic("%p m_type zero", m);
4206	/*
4207	* Clip to the residual length
4208	*/
4209	if (len > m->m_len)
4210	len = m->m_len;
4211	pktlen += len;
4212	/*
4213	* Copy the mbufs via the uio or delay the copy
4214	* Sockbuf must be consistent here (points to current mbuf,
4215	* it points to next record) when we drop priority;
4216	* we must note any additions to the sockbuf when we
4217	* block interrupts again.
4218	*/
4219	if (len > `0` && can_delay == `0`) {
4220	socket_unlock(so, `0`);
4221	error = uiomove(mtod(m, caddr_t), (int)len, auio);
4222	socket_lock(so, `0`);
4223	if (error)
4224	goto release;
4225	} else {
4226	delayed_copy_len += len;
4227	}
4228
4229	if (len == m->m_len) {
4230	/*
4231	* m was entirely copied
4232	*/
4233	sbfree(&so->so_rcv, m);
4234	nextrecord = m->m_nextpkt;
4235	m->m_nextpkt = NULL;
4236
4237	/*
4238	* Set the first packet to the head of the free list
4239	*/
4240	if (free_list == NULL)
4241	free_list = m;
4242	/*
4243	* Link current packet to tail of free list
4244	*/
4245	if (ml == NULL) {
4246	if (free_tail != NULL)
4247	free_tail->m_nextpkt = m;
4248	free_tail = m;
4249	}
4250	/*
4251	* Link current mbuf to last mbuf of current packet
4252	*/
4253	if (ml != NULL)
4254	ml->m_next = m;
4255	ml = m;
4256
4257	/*
4258	* Move next buf to head of socket buffer
4259	*/
4260	so->so_rcv.sb_mb = m = ml->m_next;
4261	ml->m_next = NULL;
4262
4263	if (m != NULL) {
4264	m->m_nextpkt = nextrecord;
4265	if (nextrecord == NULL)
4266	so->so_rcv.sb_lastrecord = m;
4267	} else {
4268	so->so_rcv.sb_mb = nextrecord;
4269	SB_EMPTY_FIXUP(&so->so_rcv);
4270	}
4271	SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
4272	SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
4273	} else {
4274	/*
4275	* Stop the loop on partial copy
4276	*/
4277	break;
4278	}
4279	}
4280	#ifdef MORE_LOCKING_DEBUG
4281	if (so->so_usecount <= `1`) {
4282	panic("%s: after big while so=%llx ref=%d on socket\n",
4283	__func__,
4284	(uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
4285	/ NOTREACHED /
4286	}
4287	#endif
4288	/*
4289	* Tell the caller we made a partial copy
4290	*/
4291	if (m != NULL) {
4292	if (so->so_options & SO_DONTTRUNC) {
4293	/*
4294	* Copyout first the freelist then the partial mbuf
4295	*/
4296	socket_unlock(so, `0`);
4297	if (delayed_copy_len)
4298	error = sodelayed_copy_list(so, msgarray,
4299	uiocnt, &free_list, &delayed_copy_len);
4300
4301	if (error == `0`) {
4302	error = uiomove(mtod(m, caddr_t), (int)len,
4303	auio);
4304	}
4305	socket_lock(so, `0`);
4306	if (error)
4307	goto release;
4308
4309	m->m_data += len;
4310	m->m_len -= len;
4311	so->so_rcv.sb_cc -= len;
4312	flags \|= MSG_RCVMORE;
4313	} else {
4314	(void) sbdroprecord(&so->so_rcv);
4315	nextrecord = so->so_rcv.sb_mb;
4316	m = NULL;
4317	flags \|= MSG_TRUNC;
4318	}
4319	}
4320
4321	if (m == NULL) {
4322	so->so_rcv.sb_mb = nextrecord;
4323	/*
4324	* First part is an inline SB_EMPTY_FIXUP(). Second
4325	* part makes sure sb_lastrecord is up-to-date if
4326	* there is still data in the socket buffer.
4327	*/
4328	if (so->so_rcv.sb_mb == NULL) {
4329	so->so_rcv.sb_mbtail = NULL;
4330	so->so_rcv.sb_lastrecord = NULL;
4331	} else if (nextrecord->m_nextpkt == NULL) {
4332	so->so_rcv.sb_lastrecord = nextrecord;
4333	}
4334	SB_MB_CHECK(&so->so_rcv);
4335	}
4336	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4337	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4338
4339	/*
4340	* We can continue to the next packet as long as:
4341	* - We haven't exhausted the uio array
4342	* - There was no error
4343	* - A packet was not truncated
4344	* - We can still receive more data
4345	*/
4346	if (npkts < uiocnt && error == `0` &&
4347	(flags & (MSG_RCVMORE \| MSG_TRUNC)) == `0` &&
4348	(so->so_state & SS_CANTRCVMORE) == `0`) {
4349	sbunlock(&so->so_rcv, TRUE); / keep socket locked /
4350	sblocked = `0`;
4351
4352	goto next;
4353	}
4354	if (flagsp != NULL)
4355	*flagsp \|= flags;
4356
4357	release:
4358	/*
4359	* pru_rcvd may cause more data to be received if the socket lock
4360	* is dropped so we set MSG_HAVEMORE now based on what we know.
4361	* That way the caller won't be surprised if it receives less data
4362	* than requested.
4363	*/
4364	if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > `0`)
4365	flags \|= MSG_HAVEMORE;
4366
4367	if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
4368	(*pr->pr_usrreqs->pru_rcvd)(so, flags);
4369
4370	if (sblocked)
4371	sbunlock(&so->so_rcv, FALSE); / will unlock socket /
4372	else
4373	socket_unlock(so, `1`);
4374
4375	if (delayed_copy_len)
4376	error = sodelayed_copy_list(so, msgarray, uiocnt,
4377	&free_list, &delayed_copy_len);
4378	out:
4379	/*
4380	* Amortize the cost of freeing the mbufs
4381	*/
4382	if (free_list != NULL)
4383	m_freem_list(free_list);
4384	if (free_others != NULL)
4385	m_freem_list(free_others);
4386
4387	KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST \| DBG_FUNC_END, error,
4388	`0`, `0`, `0`, `0`);
4389	return (error);
4390	}
4391
4392	/*
4393	* Returns: 0 Success
4394	* EINVAL
4395	* ENOTCONN
4396	* <pru_shutdown>:EINVAL
4397	* <pru_shutdown>:EADDRNOTAVAIL[TCP]
4398	* <pru_shutdown>:ENOBUFS[TCP]
4399	* <pru_shutdown>:EMSGSIZE[TCP]
4400	* <pru_shutdown>:EHOSTUNREACH[TCP]
4401	* <pru_shutdown>:ENETUNREACH[TCP]
4402	* <pru_shutdown>:ENETDOWN[TCP]
4403	* <pru_shutdown>:ENOMEM[TCP]
4404	* <pru_shutdown>:EACCES[TCP]
4405	* <pru_shutdown>:EMSGSIZE[TCP]
4406	* <pru_shutdown>:ENOBUFS[TCP]
4407	* <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4408	* <pru_shutdown>:??? [other protocol families]
4409	*/
4410	int
4411	soshutdown(struct socket so, int* how)
4412	{
4413	int error;
4414
4415	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN \| DBG_FUNC_START, how, `0`, `0`, `0`, `0`);
4416
4417	switch (how) {
4418	case SHUT_RD:
4419	case SHUT_WR:
4420	case SHUT_RDWR:
4421	socket_lock(so, `1`);
4422	if ((so->so_state &
4423	(SS_ISCONNECTED\|SS_ISCONNECTING\|SS_ISDISCONNECTING)) == `0`) {
4424	error = ENOTCONN;
4425	} else {
4426	error = soshutdownlock(so, how);
4427	}
4428	socket_unlock(so, `1`);
4429	break;
4430	default:
4431	error = EINVAL;
4432	break;
4433	}
4434
4435	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN \| DBG_FUNC_END, how, error, `0`, `0`, `0`);
4436
4437	return (error);
4438	}
4439
4440	int
4441	soshutdownlock_final(struct socket so, int* how)
4442	{
4443	struct protosw *pr = so->so_proto;
4444	int error = `0`;
4445
4446	sflt_notify(so, sock_evt_shutdown, &how);
4447
4448	if (how != SHUT_WR) {
4449	if ((so->so_state & SS_CANTRCVMORE) != `0`) {
4450	/ read already shut down /
4451	error = ENOTCONN;
4452	goto done;
4453	}
4454	sorflush(so);
4455	postevent(so, `0`, EV_RCLOSED);
4456	}
4457	if (how != SHUT_RD) {
4458	if ((so->so_state & SS_CANTSENDMORE) != `0`) {
4459	/ write already shut down /
4460	error = ENOTCONN;
4461	goto done;
4462	}
4463	error = (*pr->pr_usrreqs->pru_shutdown)(so);
4464	postevent(so, `0`, EV_WCLOSED);
4465	}
4466	done:
4467	KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, `1`, `0`, `0`, `0`);
4468	return (error);
4469	}
4470
/*
 * soshutdownlock
 *
 * Shut down a socket.  When content filtering is compiled in and the
 * socket carries SOF_CONTENT_FILTER, the content filter is consulted
 * first; otherwise, or once the filter lets the request through, the
 * work is done by soshutdownlock_final().
 *
 * Returns:	0				Success
 *		cfil_sock_shutdown:???		(EJUSTRETURN is mapped to 0)
 *		soshutdownlock_final:???
 */
int
soshutdownlock(struct socket *so, int how)
{
#if CONTENT_FILTER
	/*
	 * A content filter may delay the actual shutdown until it
	 * has processed the pending data
	 */
	if (so->so_flags & SOF_CONTENT_FILTER) {
		int error = cfil_sock_shutdown(so, &how);

		if (error == EJUSTRETURN) {
			/* The filter took over the request: report success. */
			return (0);
		}
		if (error != 0) {
			return (error);
		}
	}
#endif /* CONTENT_FILTER */

	return (soshutdownlock_final(so, how));
}
4497
4498	void
4499	sowflush(struct socket *so)
4500	{
4501	struct sockbuf *sb = &so->so_snd;
4502
4503	/*
4504	* Obtain lock on the socket buffer (SB_LOCK). This is required
4505	* to prevent the socket buffer from being unexpectedly altered
4506	* while it is used by another thread in socket send/receive.
4507	*
4508	* sblock() must not fail here, hence the assertion.
4509	*/
4510	(void) sblock(sb, SBL_WAIT \| SBL_NOINTR \| SBL_IGNDEFUNCT);
4511	VERIFY(sb->sb_flags & SB_LOCK);
4512
4513	sb->sb_flags &= ~(SB_SEL\|SB_UPCALL);
4514	sb->sb_flags \|= SB_DROP;
4515	sb->sb_upcall = NULL;
4516	sb->sb_upcallarg = NULL;
4517
4518	sbunlock(sb, TRUE); / keep socket locked /
4519
4520	selthreadclear(&sb->sb_sel);
4521	sbrelease(sb);
4522	}
4523
4524	void
4525	sorflush(struct socket *so)
4526	{
4527	struct sockbuf *sb = &so->so_rcv;
4528	struct protosw *pr = so->so_proto;
4529	struct sockbuf asb;
4530	#ifdef notyet
4531	lck_mtx_t *mutex_held;
4532	/*
4533	* XXX: This code is currently commented out, because we may get here
4534	* as part of sofreelastref(), and at that time, pr_getlock() may no
4535	* longer be able to return us the lock; this will be fixed in future.
4536	*/
4537	if (so->so_proto->pr_getlock != NULL)
4538	mutex_held = (*so->so_proto->pr_getlock)(so, `0`);
4539	else
4540	mutex_held = so->so_proto->pr_domain->dom_mtx;
4541
4542	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4543	#endif /* notyet */
4544
4545	sflt_notify(so, sock_evt_flush_read, NULL);
4546
4547	socantrcvmore(so);
4548
4549	/*
4550	* Obtain lock on the socket buffer (SB_LOCK). This is required
4551	* to prevent the socket buffer from being unexpectedly altered
4552	* while it is used by another thread in socket send/receive.
4553	*
4554	* sblock() must not fail here, hence the assertion.
4555	*/
4556	(void) sblock(sb, SBL_WAIT \| SBL_NOINTR \| SBL_IGNDEFUNCT);
4557	VERIFY(sb->sb_flags & SB_LOCK);
4558
4559	/*
4560	* Copy only the relevant fields from "sb" to "asb" which we
4561	* need for sbrelease() to function. In particular, skip
4562	* sb_sel as it contains the wait queue linkage, which would
4563	* wreak havoc if we were to issue selthreadclear() on "asb".
4564	* Make sure to not carry over SB_LOCK in "asb", as we need
4565	* to acquire it later as part of sbrelease().
4566	*/
4567	bzero(&asb, sizeof (asb));
4568	asb.sb_cc = sb->sb_cc;
4569	asb.sb_hiwat = sb->sb_hiwat;
4570	asb.sb_mbcnt = sb->sb_mbcnt;
4571	asb.sb_mbmax = sb->sb_mbmax;
4572	asb.sb_ctl = sb->sb_ctl;
4573	asb.sb_lowat = sb->sb_lowat;
4574	asb.sb_mb = sb->sb_mb;
4575	asb.sb_mbtail = sb->sb_mbtail;
4576	asb.sb_lastrecord = sb->sb_lastrecord;
4577	asb.sb_so = sb->sb_so;
4578	asb.sb_flags = sb->sb_flags;
4579	asb.sb_flags &= ~(SB_LOCK\|SB_SEL\|SB_KNOTE\|SB_UPCALL);
4580	asb.sb_flags \|= SB_DROP;
4581
4582	/*
4583	* Ideally we'd bzero() these and preserve the ones we need;
4584	* but to do that we'd need to shuffle things around in the
4585	* sockbuf, and we can't do it now because there are KEXTS
4586	* that are directly referring to the socket structure.
4587	*
4588	* Setting SB_DROP acts as a barrier to prevent further appends.
4589	* Clearing SB_SEL is done for selthreadclear() below.
4590	*/
4591	sb->sb_cc = `0`;
4592	sb->sb_hiwat = `0`;
4593	sb->sb_mbcnt = `0`;
4594	sb->sb_mbmax = `0`;
4595	sb->sb_ctl = `0`;
4596	sb->sb_lowat = `0`;
4597	sb->sb_mb = NULL;
4598	sb->sb_mbtail = NULL;
4599	sb->sb_lastrecord = NULL;
4600	sb->sb_timeo.tv_sec = `0`;
4601	sb->sb_timeo.tv_usec = `0`;
4602	sb->sb_upcall = NULL;
4603	sb->sb_upcallarg = NULL;
4604	sb->sb_flags &= ~(SB_SEL\|SB_UPCALL);
4605	sb->sb_flags \|= SB_DROP;
4606
4607	sbunlock(sb, TRUE); / keep socket locked /
4608
4609	/*
4610	* Note that selthreadclear() is called on the original "sb" and
4611	* not the local "asb" because of the way wait queue linkage is
4612	* implemented. Given that selwakeup() may be triggered, SB_SEL
4613	* should no longer be set (cleared above.)
4614	*/
4615	selthreadclear(&sb->sb_sel);
4616
4617	if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose)
4618	(*pr->pr_domain->dom_dispose)(asb.sb_mb);
4619
4620	sbrelease(&asb);
4621	}
4622
4623	/*
4624	* Perhaps this routine, and sooptcopyout(), below, ought to come in
4625	* an additional variant to handle the case where the option value needs
4626	* to be some kind of integer, but not a specific size.
4627	* In addition to their use here, these functions are also called by the
4628	* protocol-level pr_ctloutput() routines.
4629	*
4630	* Returns: 0 Success
4631	* EINVAL
4632	* copyin:EFAULT
4633	*/
4634	int
4635	sooptcopyin(struct sockopt sopt, void* *buf, size_t len, size_t minlen)
4636	{
4637	size_t valsize;
4638
4639	/*
4640	* If the user gives us more than we wanted, we ignore it,
4641	* but if we don't get the minimum length the caller
4642	* wants, we return EINVAL. On success, sopt->sopt_valsize
4643	* is set to however much we actually retrieved.
4644	*/
4645	if ((valsize = sopt->sopt_valsize) < minlen)
4646	return (EINVAL);
4647	if (valsize > len)
4648	sopt->sopt_valsize = valsize = len;
4649
4650	if (sopt->sopt_p != kernproc)
4651	return (copyin(sopt->sopt_val, buf, valsize));
4652
4653	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4654	return (`0`);
4655	}
4656
4657	/*
4658	* sooptcopyin_timeval
4659	* Copy in a timeval value into tv_p, and take into account whether the
4660	* the calling process is 64-bit or 32-bit. Moved the sanity checking
4661	* code here so that we can verify the 64-bit tv_sec value before we lose
4662	* the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4663	*/
4664	static int
4665	sooptcopyin_timeval(struct sockopt sopt, struct* timeval *tv_p)
4666	{
4667	int error;
4668
4669	if (proc_is64bit(sopt->sopt_p)) {
4670	struct user64_timeval tv64;
4671
4672	if (sopt->sopt_valsize < sizeof (tv64))
4673	return (EINVAL);
4674
4675	sopt->sopt_valsize = sizeof (tv64);
4676	if (sopt->sopt_p != kernproc) {
4677	error = copyin(sopt->sopt_val, &tv64, sizeof (tv64));
4678	if (error != `0`)
4679	return (error);
4680	} else {
4681	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
4682	sizeof (tv64));
4683	}
4684	if (tv64.tv_sec < `0` \|\| tv64.tv_sec > LONG_MAX \|\|
4685	tv64.tv_usec < `0` \|\| tv64.tv_usec >= `1000000`)
4686	return (EDOM);
4687
4688	tv_p->tv_sec = tv64.tv_sec;
4689	tv_p->tv_usec = tv64.tv_usec;
4690	} else {
4691	struct user32_timeval tv32;
4692
4693	if (sopt->sopt_valsize < sizeof (tv32))
4694	return (EINVAL);
4695
4696	sopt->sopt_valsize = sizeof (tv32);
4697	if (sopt->sopt_p != kernproc) {
4698	error = copyin(sopt->sopt_val, &tv32, sizeof (tv32));
4699	if (error != `0`) {
4700	return (error);
4701	}
4702	} else {
4703	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
4704	sizeof (tv32));
4705	}
4706	#ifndef __LP64__
4707	/*
4708	* K64todo "comparison is always false due to
4709	* limited range of data type"
4710	*/
4711	if (tv32.tv_sec < `0` \|\| tv32.tv_sec > LONG_MAX \|\|
4712	tv32.tv_usec < `0` \|\| tv32.tv_usec >= `1000000`)
4713	return (EDOM);
4714	#endif
4715	tv_p->tv_sec = tv32.tv_sec;
4716	tv_p->tv_usec = tv32.tv_usec;
4717	}
4718	return (`0`);
4719	}
4720
4721	int
4722	soopt_cred_check(struct socket so, int* priv, boolean_t allow_root)
4723	{
4724	kauth_cred_t cred = NULL;
4725	proc_t ep = PROC_NULL;
4726	uid_t uid;
4727	int error = `0`;
4728
4729	if (so->so_flags & SOF_DELEGATED) {
4730	ep = proc_find(so->e_pid);
4731	if (ep)
4732	cred = kauth_cred_proc_ref(ep);
4733	}
4734
4735	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4736
4737	/ uid is 0 for root /
4738	if (uid != `0` \|\| !allow_root)
4739	error = priv_check_cred(cred ? cred : so->so_cred, priv, `0`);
4740	if (cred)
4741	kauth_cred_unref(&cred);
4742	if (ep != PROC_NULL)
4743	proc_rele(ep);
4744
4745	return (error);
4746	}
4747
4748	/*
4749	* Returns: 0 Success
4750	* EINVAL
4751	* ENOPROTOOPT
4752	* ENOBUFS
4753	* EDOM
4754	* sooptcopyin:EINVAL
4755	* sooptcopyin:EFAULT
4756	* sooptcopyin_timeval:EINVAL
4757	* sooptcopyin_timeval:EFAULT
4758	* sooptcopyin_timeval:EDOM
4759	* <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4760	* <pr_ctloutput>:???w
4761	* sflt_attach_private:??? [whatever a filter author chooses]
4762	* <sf_setoption>:??? [whatever a filter author chooses]
4763	*
4764	* Notes: Other <pru_listen> returns depend on the protocol family; all
4765	* <sf_listen> returns depend on what the filter author causes
4766	* their filter to return.
4767	*/
4768	int
4769	sosetoptlock(struct socket so, struct* sockopt sopt, int* dolock)
4770	{
4771	int error, optval;
4772	struct linger l;
4773	struct timeval tv;
4774	#if CONFIG_MACF_SOCKET
4775	struct mac extmac;
4776	#endif /* MAC_SOCKET */
4777
4778	if (sopt->sopt_dir != SOPT_SET)
4779	sopt->sopt_dir = SOPT_SET;
4780
4781	if (dolock)
4782	socket_lock(so, `1`);
4783
4784	if ((so->so_state & (SS_CANTRCVMORE \| SS_CANTSENDMORE)) ==
4785	(SS_CANTRCVMORE \| SS_CANTSENDMORE) &&
4786	(so->so_flags & SOF_NPX_SETOPTSHUT) == `0`) {
4787	/ the socket has been shutdown, no more sockopt's /
4788	error = EINVAL;
4789	goto out;
4790	}
4791
4792	error = sflt_setsockopt(so, sopt);
4793	if (error != `0`) {
4794	if (error == EJUSTRETURN)
4795	error = `0`;
4796	goto out;
4797	}
4798
4799	if (sopt->sopt_level != SOL_SOCKET) {
4800	if (so->so_proto != NULL &&
4801	so->so_proto->pr_ctloutput != NULL) {
4802	error = (*so->so_proto->pr_ctloutput)(so, sopt);
4803	goto out;
4804	}
4805	error = ENOPROTOOPT;
4806	} else {
4807	/*
4808	* Allow socket-level (SOL_SOCKET) options to be filtered by
4809	* the protocol layer, if needed. A zero value returned from
4810	* the handler means use default socket-level processing as
4811	* done by the rest of this routine. Otherwise, any other
4812	* return value indicates that the option is unsupported.
4813	*/
4814	if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4815	pru_socheckopt(so, sopt)) != `0`)
4816	goto out;
4817
4818	error = `0`;
4819	switch (sopt->sopt_name) {
4820	case SO_LINGER:
4821	case SO_LINGER_SEC:
4822	error = sooptcopyin(sopt, &l, sizeof (l), sizeof (l));
4823	if (error != `0`)
4824	goto out;
4825
4826	so->so_linger = (sopt->sopt_name == SO_LINGER) ?
4827	l.l_linger : l.l_linger * hz;
4828	if (l.l_onoff != `0`)
4829	so->so_options \|= SO_LINGER;
4830	else
4831	so->so_options &= ~SO_LINGER;
4832	break;
4833
4834	case SO_DEBUG:
4835	case SO_KEEPALIVE:
4836	case SO_DONTROUTE:
4837	case SO_USELOOPBACK:
4838	case SO_BROADCAST:
4839	case SO_REUSEADDR:
4840	case SO_REUSEPORT:
4841	case SO_OOBINLINE:
4842	case SO_TIMESTAMP:
4843	case SO_TIMESTAMP_MONOTONIC:
4844	case SO_TIMESTAMP_CONTINUOUS:
4845	case SO_DONTTRUNC:
4846	case SO_WANTMORE:
4847	case SO_WANTOOBFLAG:
4848	case SO_NOWAKEFROMSLEEP:
4849	case SO_NOAPNFALLBK:
4850	error = sooptcopyin(sopt, &optval, sizeof (optval),
4851	sizeof (optval));
4852	if (error != `0`)
4853	goto out;
4854	if (optval)
4855	so->so_options \|= sopt->sopt_name;
4856	else
4857	so->so_options &= ~sopt->sopt_name;
4858	break;
4859
4860	case SO_SNDBUF:
4861	case SO_RCVBUF:
4862	case SO_SNDLOWAT:
4863	case SO_RCVLOWAT:
4864	error = sooptcopyin(sopt, &optval, sizeof (optval),
4865	sizeof (optval));
4866	if (error != `0`)
4867	goto out;
4868
4869	/*
4870	* Values < 1 make no sense for any of these
4871	* options, so disallow them.
4872	*/
4873	if (optval < `1`) {
4874	error = EINVAL;
4875	goto out;
4876	}
4877
4878	switch (sopt->sopt_name) {
4879	case SO_SNDBUF:
4880	case SO_RCVBUF: {
4881	struct sockbuf *sb =
4882	(sopt->sopt_name == SO_SNDBUF) ?
4883	&so->so_snd : &so->so_rcv;
4884	if (sbreserve(sb, (u_int32_t)optval) == `0`) {
4885	error = ENOBUFS;
4886	goto out;
4887	}
4888	sb->sb_flags \|= SB_USRSIZE;
4889	sb->sb_flags &= ~SB_AUTOSIZE;
4890	sb->sb_idealsize = (u_int32_t)optval;
4891	break;
4892	}
4893	/*
4894	* Make sure the low-water is never greater than
4895	* the high-water.
4896	*/
4897	case SO_SNDLOWAT: {
4898	int space = sbspace(&so->so_snd);
4899	u_int32_t hiwat = so->so_snd.sb_hiwat;
4900
4901	if (so->so_snd.sb_flags & SB_UNIX) {
4902	struct unpcb *unp =
4903	(struct unpcb *)(so->so_pcb);
4904	if (unp != NULL &&
4905	unp->unp_conn != NULL) {
4906	hiwat += unp->unp_conn->unp_cc;
4907	}
4908	}
4909
4910	so->so_snd.sb_lowat =
4911	(optval > hiwat) ?
4912	hiwat : optval;
4913
4914	if (space >= so->so_snd.sb_lowat) {
4915	sowwakeup(so);
4916	}
4917	break;
4918	}
4919	case SO_RCVLOWAT: {
4920	int64_t data_len;
4921	so->so_rcv.sb_lowat =
4922	(optval > so->so_rcv.sb_hiwat) ?
4923	so->so_rcv.sb_hiwat : optval;
4924	data_len = so->so_rcv.sb_cc
4925	- so->so_rcv.sb_ctl;
4926	if (data_len >= so->so_rcv.sb_lowat)
4927	sorwakeup(so);
4928	break;
4929	}
4930	}
4931	break;
4932
4933	case SO_SNDTIMEO:
4934	case SO_RCVTIMEO:
4935	error = sooptcopyin_timeval(sopt, &tv);
4936	if (error != `0`)
4937	goto out;
4938
4939	switch (sopt->sopt_name) {
4940	case SO_SNDTIMEO:
4941	so->so_snd.sb_timeo = tv;
4942	break;
4943	case SO_RCVTIMEO:
4944	so->so_rcv.sb_timeo = tv;
4945	break;
4946	}
4947	break;
4948
4949	case SO_NKE: {
4950	struct so_nke nke;
4951
4952	error = sooptcopyin(sopt, &nke, sizeof (nke),
4953	sizeof (nke));
4954	if (error != `0`)
4955	goto out;
4956
4957	error = sflt_attach_internal(so, nke.nke_handle);
4958	break;
4959	}
4960
4961	case SO_NOSIGPIPE:
4962	error = sooptcopyin(sopt, &optval, sizeof (optval),
4963	sizeof (optval));
4964	if (error != `0`)
4965	goto out;
4966	if (optval != `0`)
4967	so->so_flags \|= SOF_NOSIGPIPE;
4968	else
4969	so->so_flags &= ~SOF_NOSIGPIPE;
4970	break;
4971
4972	case SO_NOADDRERR:
4973	error = sooptcopyin(sopt, &optval, sizeof (optval),
4974	sizeof (optval));
4975	if (error != `0`)
4976	goto out;
4977	if (optval != `0`)
4978	so->so_flags \|= SOF_NOADDRAVAIL;
4979	else
4980	so->so_flags &= ~SOF_NOADDRAVAIL;
4981	break;
4982
4983	case SO_REUSESHAREUID:
4984	error = sooptcopyin(sopt, &optval, sizeof (optval),
4985	sizeof (optval));
4986	if (error != `0`)
4987	goto out;
4988	if (optval != `0`)
4989	so->so_flags \|= SOF_REUSESHAREUID;
4990	else
4991	so->so_flags &= ~SOF_REUSESHAREUID;
4992	break;
4993
4994	case SO_NOTIFYCONFLICT:
4995	if (kauth_cred_issuser(kauth_cred_get()) == `0`) {
4996	error = EPERM;
4997	goto out;
4998	}
4999	error = sooptcopyin(sopt, &optval, sizeof (optval),
5000	sizeof (optval));
5001	if (error != `0`)
5002	goto out;
5003	if (optval != `0`)
5004	so->so_flags \|= SOF_NOTIFYCONFLICT;
5005	else
5006	so->so_flags &= ~SOF_NOTIFYCONFLICT;
5007	break;
5008
5009	case SO_RESTRICTIONS:
5010	error = sooptcopyin(sopt, &optval, sizeof (optval),
5011	sizeof (optval));
5012	if (error != `0`)
5013	goto out;
5014
5015	error = so_set_restrictions(so, optval);
5016	break;
5017
5018	case SO_AWDL_UNRESTRICTED:
5019	if (SOCK_DOM(so) != PF_INET &&
5020	SOCK_DOM(so) != PF_INET6) {
5021	error = EOPNOTSUPP;
5022	goto out;
5023	}
5024	error = sooptcopyin(sopt, &optval, sizeof(optval),
5025	sizeof(optval));
5026	if (error != `0`)
5027	goto out;
5028	if (optval != `0`) {
5029	error = soopt_cred_check(so,
5030	PRIV_NET_RESTRICTED_AWDL, false);
5031	if (error == `0`)
5032	inp_set_awdl_unrestricted(
5033	sotoinpcb(so));
5034	} else
5035	inp_clear_awdl_unrestricted(sotoinpcb(so));
5036	break;
5037	case SO_INTCOPROC_ALLOW:
5038	if (SOCK_DOM(so) != PF_INET6) {
5039	error = EOPNOTSUPP;
5040	goto out;
5041	}
5042	error = sooptcopyin(sopt, &optval, sizeof(optval),
5043	sizeof(optval));
5044	if (error != `0`)
5045	goto out;
5046	if (optval != `0` &&
5047	inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5048	error = soopt_cred_check(so,
5049	PRIV_NET_RESTRICTED_INTCOPROC, false);
5050	if (error == `0`)
5051	inp_set_intcoproc_allowed(
5052	sotoinpcb(so));
5053	} else if (optval == `0`)
5054	inp_clear_intcoproc_allowed(sotoinpcb(so));
5055	break;
5056
5057	case SO_LABEL:
5058	#if CONFIG_MACF_SOCKET
5059	if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5060	sizeof (extmac))) != `0`)
5061	goto out;
5062
5063	error = mac_setsockopt_label(proc_ucred(sopt->sopt_p),
5064	so, &extmac);
5065	#else
5066	error = EOPNOTSUPP;
5067	#endif /* MAC_SOCKET */
5068	break;
5069
5070	case SO_UPCALLCLOSEWAIT:
5071	error = sooptcopyin(sopt, &optval, sizeof (optval),
5072	sizeof (optval));
5073	if (error != `0`)
5074	goto out;
5075	if (optval != `0`)
5076	so->so_flags \|= SOF_UPCALLCLOSEWAIT;
5077	else
5078	so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5079	break;
5080
5081	case SO_RANDOMPORT:
5082	error = sooptcopyin(sopt, &optval, sizeof (optval),
5083	sizeof (optval));
5084	if (error != `0`)
5085	goto out;
5086	if (optval != `0`)
5087	so->so_flags \|= SOF_BINDRANDOMPORT;
5088	else
5089	so->so_flags &= ~SOF_BINDRANDOMPORT;
5090	break;
5091
5092	case SO_NP_EXTENSIONS: {
5093	struct so_np_extensions sonpx;
5094
5095	error = sooptcopyin(sopt, &sonpx, sizeof (sonpx),
5096	sizeof (sonpx));
5097	if (error != `0`)
5098	goto out;
5099	if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5100	error = EINVAL;
5101	goto out;
5102	}
5103	/*
5104	* Only one bit defined for now
5105	*/
5106	if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5107	if ((sonpx.npx_flags & SONPX_SETOPTSHUT))
5108	so->so_flags \|= SOF_NPX_SETOPTSHUT;
5109	else
5110	so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5111	}
5112	break;
5113	}
5114
5115	case SO_TRAFFIC_CLASS: {
5116	error = sooptcopyin(sopt, &optval, sizeof (optval),
5117	sizeof (optval));
5118	if (error != `0`)
5119	goto out;
5120	if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5121	int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5122	error = so_set_net_service_type(so, netsvc);
5123	goto out;
5124	}
5125	error = so_set_traffic_class(so, optval);
5126	if (error != `0`)
5127	goto out;
5128	so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5129	so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5130	break;
5131	}
5132
5133	case SO_RECV_TRAFFIC_CLASS: {
5134	error = sooptcopyin(sopt, &optval, sizeof (optval),
5135	sizeof (optval));
5136	if (error != `0`)
5137	goto out;
5138	if (optval == `0`)
5139	so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5140	else
5141	so->so_flags \|= SOF_RECV_TRAFFIC_CLASS;
5142	break;
5143	}
5144
5145	#if (DEVELOPMENT \|\| DEBUG)
5146	case SO_TRAFFIC_CLASS_DBG: {
5147	struct so_tcdbg so_tcdbg;
5148
5149	error = sooptcopyin(sopt, &so_tcdbg,
5150	sizeof (struct so_tcdbg), sizeof (struct so_tcdbg));
5151	if (error != `0`)
5152	goto out;
5153	error = so_set_tcdbg(so, &so_tcdbg);
5154	if (error != `0`)
5155	goto out;
5156	break;
5157	}
5158	#endif /* (DEVELOPMENT \|\| DEBUG) */
5159
5160	case SO_PRIVILEGED_TRAFFIC_CLASS:
5161	error = priv_check_cred(kauth_cred_get(),
5162	PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, `0`);
5163	if (error != `0`)
5164	goto out;
5165	error = sooptcopyin(sopt, &optval, sizeof (optval),
5166	sizeof (optval));
5167	if (error != `0`)
5168	goto out;
5169	if (optval == `0`)
5170	so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5171	else
5172	so->so_flags \|= SOF_PRIVILEGED_TRAFFIC_CLASS;
5173	break;
5174
5175	#if (DEVELOPMENT \|\| DEBUG)
5176	case SO_DEFUNCTIT:
5177	error = sosetdefunct(current_proc(), so, `0`, FALSE);
5178	if (error == `0`)
5179	error = sodefunct(current_proc(), so, `0`);
5180
5181	break;
5182	#endif /* (DEVELOPMENT \|\| DEBUG) */
5183
5184	case SO_DEFUNCTOK:
5185	error = sooptcopyin(sopt, &optval, sizeof (optval),
5186	sizeof (optval));
5187	if (error != `0` \|\| (so->so_flags & SOF_DEFUNCT)) {
5188	if (error == `0`)
5189	error = EBADF;
5190	goto out;
5191	}
5192	/*
5193	* Any process can set SO_DEFUNCTOK (clear
5194	* SOF_NODEFUNCT), but only root can clear
5195	* SO_DEFUNCTOK (set SOF_NODEFUNCT).
5196	*/
5197	if (optval == `0` &&
5198	kauth_cred_issuser(kauth_cred_get()) == `0`) {
5199	error = EPERM;
5200	goto out;
5201	}
5202	if (optval)
5203	so->so_flags &= ~SOF_NODEFUNCT;
5204	else
5205	so->so_flags \|= SOF_NODEFUNCT;
5206
5207	if (SOCK_DOM(so) == PF_INET \|\|
5208	SOCK_DOM(so) == PF_INET6) {
5209	char s[MAX_IPv6_STR_LEN];
5210	char d[MAX_IPv6_STR_LEN];
5211	struct inpcb *inp = sotoinpcb(so);
5212
5213	SODEFUNCTLOG("%s[%d, %s]: so 0x%llx "
5214	"[%s %s:%d -> %s:%d] is now marked "
5215	"as %seligible for "
5216	"defunct\n", __func__, proc_selfpid(),
5217	proc_best_name(current_proc()),
5218	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5219	(SOCK_TYPE(so) == SOCK_STREAM) ?
5220	"TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5221	((SOCK_DOM(so) == PF_INET) ?
5222	(void *)&inp->inp_laddr.s_addr :
5223	(void )&inp->in6p_laddr), s, sizeof* (s)),
5224	ntohs(inp->in6p_lport),
5225	inet_ntop(SOCK_DOM(so),
5226	(SOCK_DOM(so) == PF_INET) ?
5227	(void *)&inp->inp_faddr.s_addr :
5228	(void )&inp->in6p_faddr, d, sizeof* (d)),
5229	ntohs(inp->in6p_fport),
5230	(so->so_flags & SOF_NODEFUNCT) ?
5231	"not " : "");
5232	} else {
5233	SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
5234	"is now marked as %seligible for "
5235	"defunct\n",
5236	__func__, proc_selfpid(),
5237	proc_best_name(current_proc()),
5238	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
5239	SOCK_DOM(so), SOCK_TYPE(so),
5240	(so->so_flags & SOF_NODEFUNCT) ?
5241	"not " : "");
5242	}
5243	break;
5244
5245	case SO_ISDEFUNCT:
5246	/ This option is not settable /
5247	error = EINVAL;
5248	break;
5249
5250	case SO_OPPORTUNISTIC:
5251	error = sooptcopyin(sopt, &optval, sizeof (optval),
5252	sizeof (optval));
5253	if (error == `0`)
5254	error = so_set_opportunistic(so, optval);
5255	break;
5256
5257	case SO_FLUSH:
5258	/ This option is handled by lower layer(s) /
5259	error = `0`;
5260	break;
5261
5262	case SO_RECV_ANYIF:
5263	error = sooptcopyin(sopt, &optval, sizeof (optval),
5264	sizeof (optval));
5265	if (error == `0`)
5266	error = so_set_recv_anyif(so, optval);
5267	break;
5268
5269	case SO_TRAFFIC_MGT_BACKGROUND: {
5270	/ This option is handled by lower layer(s) /
5271	error = `0`;
5272	break;
5273	}
5274
5275	#if FLOW_DIVERT
5276	case SO_FLOW_DIVERT_TOKEN:
5277	error = flow_divert_token_set(so, sopt);
5278	break;
5279	#endif /* FLOW_DIVERT */
5280
5281
5282	case SO_DELEGATED:
5283	if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
5284	sizeof (optval))) != `0`)
5285	break;
5286
5287	error = so_set_effective_pid(so, optval, sopt->sopt_p);
5288	break;
5289
5290	case SO_DELEGATED_UUID: {
5291	uuid_t euuid;
5292
5293	if ((error = sooptcopyin(sopt, &euuid, sizeof (euuid),
5294	sizeof (euuid))) != `0`)
5295	break;
5296
5297	error = so_set_effective_uuid(so, euuid, sopt->sopt_p);
5298	break;
5299	}
5300
5301	#if NECP
5302	case SO_NECP_ATTRIBUTES:
5303	error = necp_set_socket_attributes(so, sopt);
5304	break;
5305
5306	case SO_NECP_CLIENTUUID:
5307	if (SOCK_DOM(so) == PF_MULTIPATH) {
5308	/ Handled by MPTCP itself /
5309	break;
5310	}
5311
5312	if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5313	error = EINVAL;
5314	goto out;
5315	}
5316
5317	struct inpcb *inp = sotoinpcb(so);
5318	if (!uuid_is_null(inp->necp_client_uuid)) {
5319	// Clear out the old client UUID if present
5320	necp_inpcb_remove_cb(inp);
5321	}
5322
5323	error = sooptcopyin(sopt, &inp->necp_client_uuid,
5324	sizeof(uuid_t), sizeof(uuid_t));
5325	if (error != `0`) {
5326	goto out;
5327	}
5328
5329	if (uuid_is_null(inp->necp_client_uuid)) {
5330	error = EINVAL;
5331	goto out;
5332	}
5333
5334	error = necp_client_register_socket_flow(so->last_pid,
5335	inp->necp_client_uuid, inp);
5336	if (error != `0`) {
5337	uuid_clear(inp->necp_client_uuid);
5338	goto out;
5339	}
5340
5341	if (inp->inp_lport != `0`) {
5342	// There is bound local port, so this is not
5343	// a fresh socket. Assign to the client.
5344	necp_client_assign_from_socket(so->last_pid, inp->necp_client_uuid, inp);
5345	}
5346
5347	break;
5348	#endif /* NECP */
5349
5350	case SO_EXTENDED_BK_IDLE:
5351	error = sooptcopyin(sopt, &optval, sizeof (optval),
5352	sizeof (optval));
5353	if (error == `0`)
5354	error = so_set_extended_bk_idle(so, optval);
5355	break;
5356
5357	case SO_MARK_CELLFALLBACK:
5358	error = sooptcopyin(sopt, &optval, sizeof(optval),
5359	sizeof(optval));
5360	if (error != `0`)
5361	goto out;
5362	if (optval < `0`) {
5363	error = EINVAL;
5364	goto out;
5365	}
5366	if (optval == `0`)
5367	so->so_flags1 &= ~SOF1_CELLFALLBACK;
5368	else
5369	so->so_flags1 \|= SOF1_CELLFALLBACK;
5370	break;
5371
5372	case SO_NET_SERVICE_TYPE: {
5373	error = sooptcopyin(sopt, &optval, sizeof(optval),
5374	sizeof(optval));
5375	if (error != `0`)
5376	goto out;
5377	error = so_set_net_service_type(so, optval);
5378	break;
5379	}
5380
5381	case SO_QOSMARKING_POLICY_OVERRIDE:
5382	error = priv_check_cred(kauth_cred_get(),
5383	PRIV_NET_QOSMARKING_POLICY_OVERRIDE, `0`);
5384	if (error != `0`)
5385	goto out;
5386	error = sooptcopyin(sopt, &optval, sizeof(optval),
5387	sizeof(optval));
5388	if (error != `0`)
5389	goto out;
5390	if (optval == `0`)
5391	so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5392	else
5393	so->so_flags1 \|= SOF1_QOSMARKING_POLICY_OVERRIDE;
5394	break;
5395
5396	default:
5397	error = ENOPROTOOPT;
5398	break;
5399	}
5400	if (error == `0` && so->so_proto != NULL &&
5401	so->so_proto->pr_ctloutput != NULL) {
5402	(void) so->so_proto->pr_ctloutput(so, sopt);
5403	}
5404	}
5405	out:
5406	if (dolock)
5407	socket_unlock(so, `1`);
5408	return (error);
5409	}
5410
5411	/ Helper routines for getsockopt /
5412	int
5413	sooptcopyout(struct sockopt sopt, void* *buf, size_t len)
5414	{
5415	int error;
5416	size_t valsize;
5417
5418	error = `0`;
5419
5420	/*
5421	* Documented get behavior is that we always return a value,
5422	* possibly truncated to fit in the user's buffer.
5423	* Traditional behavior is that we always tell the user
5424	* precisely how much we copied, rather than something useful
5425	* like the total amount we had available for her.
5426	* Note that this interface is not idempotent; the entire answer must
5427	* generated ahead of time.
5428	*/
5429	valsize = min(len, sopt->sopt_valsize);
5430	sopt->sopt_valsize = valsize;
5431	if (sopt->sopt_val != USER_ADDR_NULL) {
5432	if (sopt->sopt_p != kernproc)
5433	error = copyout(buf, sopt->sopt_val, valsize);
5434	else
5435	bcopy(buf, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5436	}
5437	return (error);
5438	}
5439
5440	static int
5441	sooptcopyout_timeval(struct sockopt sopt, const* struct timeval *tv_p)
5442	{
5443	int error;
5444	size_t len;
5445	struct user64_timeval tv64 = {};
5446	struct user32_timeval tv32 = {};
5447	const void * val;
5448	size_t valsize;
5449
5450	error = `0`;
5451	if (proc_is64bit(sopt->sopt_p)) {
5452	len = sizeof (tv64);
5453	tv64.tv_sec = tv_p->tv_sec;
5454	tv64.tv_usec = tv_p->tv_usec;
5455	val = &tv64;
5456	} else {
5457	len = sizeof (tv32);
5458	tv32.tv_sec = tv_p->tv_sec;
5459	tv32.tv_usec = tv_p->tv_usec;
5460	val = &tv32;
5461	}
5462	valsize = min(len, sopt->sopt_valsize);
5463	sopt->sopt_valsize = valsize;
5464	if (sopt->sopt_val != USER_ADDR_NULL) {
5465	if (sopt->sopt_p != kernproc)
5466	error = copyout(val, sopt->sopt_val, valsize);
5467	else
5468	bcopy(val, CAST_DOWN(caddr_t, sopt->sopt_val), valsize);
5469	}
5470	return (error);
5471	}
5472
5473	/*
5474	* Return: 0 Success
5475	* ENOPROTOOPT
5476	* <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5477	* <pr_ctloutput>:???
5478	* <sf_getoption>:???
5479	*/
5480	int
5481	sogetoptlock(struct socket so, struct* sockopt sopt, int* dolock)
5482	{
5483	int error, optval;
5484	struct linger l;
5485	struct timeval tv;
5486	#if CONFIG_MACF_SOCKET
5487	struct mac extmac;
5488	#endif /* MAC_SOCKET */
5489
5490	if (sopt->sopt_dir != SOPT_GET)
5491	sopt->sopt_dir = SOPT_GET;
5492
5493	if (dolock)
5494	socket_lock(so, `1`);
5495
5496	error = sflt_getsockopt(so, sopt);
5497	if (error != `0`) {
5498	if (error == EJUSTRETURN)
5499	error = `0`;
5500	goto out;
5501	}
5502
5503	if (sopt->sopt_level != SOL_SOCKET) {
5504	if (so->so_proto != NULL &&
5505	so->so_proto->pr_ctloutput != NULL) {
5506	error = (*so->so_proto->pr_ctloutput)(so, sopt);
5507	goto out;
5508	}
5509	error = ENOPROTOOPT;
5510	} else {
5511	/*
5512	* Allow socket-level (SOL_SOCKET) options to be filtered by
5513	* the protocol layer, if needed. A zero value returned from
5514	* the handler means use default socket-level processing as
5515	* done by the rest of this routine. Otherwise, any other
5516	* return value indicates that the option is unsupported.
5517	*/
5518	if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5519	pru_socheckopt(so, sopt)) != `0`)
5520	goto out;
5521
5522	error = `0`;
5523	switch (sopt->sopt_name) {
5524	case SO_LINGER:
5525	case SO_LINGER_SEC:
5526	l.l_onoff = ((so->so_options & SO_LINGER) ? `1` : `0`);
5527	l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5528	so->so_linger : so->so_linger / hz;
5529	error = sooptcopyout(sopt, &l, sizeof (l));
5530	break;
5531
5532	case SO_USELOOPBACK:
5533	case SO_DONTROUTE:
5534	case SO_DEBUG:
5535	case SO_KEEPALIVE:
5536	case SO_REUSEADDR:
5537	case SO_REUSEPORT:
5538	case SO_BROADCAST:
5539	case SO_OOBINLINE:
5540	case SO_TIMESTAMP:
5541	case SO_TIMESTAMP_MONOTONIC:
5542	case SO_TIMESTAMP_CONTINUOUS:
5543	case SO_DONTTRUNC:
5544	case SO_WANTMORE:
5545	case SO_WANTOOBFLAG:
5546	case SO_NOWAKEFROMSLEEP:
5547	case SO_NOAPNFALLBK:
5548	optval = so->so_options & sopt->sopt_name;
5549	integer:
5550	error = sooptcopyout(sopt, &optval, sizeof (optval));
5551	break;
5552
5553	case SO_TYPE:
5554	optval = so->so_type;
5555	goto integer;
5556
5557	case SO_NREAD:
5558	if (so->so_proto->pr_flags & PR_ATOMIC) {
5559	int pkt_total;
5560	struct mbuf *m1;
5561
5562	pkt_total = `0`;
5563	m1 = so->so_rcv.sb_mb;
5564	while (m1 != NULL) {
5565	if (m1->m_type == MT_DATA \|\|
5566	m1->m_type == MT_HEADER \|\|
5567	m1->m_type == MT_OOBDATA)
5568	pkt_total += m1->m_len;
5569	m1 = m1->m_next;
5570	}
5571	optval = pkt_total;
5572	} else {
5573	optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5574	}
5575	goto integer;
5576
5577	case SO_NUMRCVPKT:
5578	if (so->so_proto->pr_flags & PR_ATOMIC) {
5579	int cnt = `0`;
5580	struct mbuf *m1;
5581
5582	m1 = so->so_rcv.sb_mb;
5583	while (m1 != NULL) {
5584	if (m1->m_type == MT_DATA \|\|
5585	m1->m_type == MT_HEADER \|\|
5586	m1->m_type == MT_OOBDATA)
5587	cnt += `1`;
5588	m1 = m1->m_nextpkt;
5589	}
5590	optval = cnt;
5591	goto integer;
5592	} else {
5593	error = EINVAL;
5594	break;
5595	}
5596
5597	case SO_NWRITE:
5598	optval = so->so_snd.sb_cc;
5599	goto integer;
5600
5601	case SO_ERROR:
5602	optval = so->so_error;
5603	so->so_error = `0`;
5604	goto integer;
5605
5606	case SO_SNDBUF: {
5607	u_int32_t hiwat = so->so_snd.sb_hiwat;
5608
5609	if (so->so_snd.sb_flags & SB_UNIX) {
5610	struct unpcb *unp =
5611	(struct unpcb *)(so->so_pcb);
5612	if (unp != NULL && unp->unp_conn != NULL) {
5613	hiwat += unp->unp_conn->unp_cc;
5614	}
5615	}
5616
5617	optval = hiwat;
5618	goto integer;
5619	}
5620	case SO_RCVBUF:
5621	optval = so->so_rcv.sb_hiwat;
5622	goto integer;
5623
5624	case SO_SNDLOWAT:
5625	optval = so->so_snd.sb_lowat;
5626	goto integer;
5627
5628	case SO_RCVLOWAT:
5629	optval = so->so_rcv.sb_lowat;
5630	goto integer;
5631
5632	case SO_SNDTIMEO:
5633	case SO_RCVTIMEO:
5634	tv = (sopt->sopt_name == SO_SNDTIMEO ?
5635	so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5636
5637	error = sooptcopyout_timeval(sopt, &tv);
5638	break;
5639
5640	case SO_NOSIGPIPE:
5641	optval = (so->so_flags & SOF_NOSIGPIPE);
5642	goto integer;
5643
5644	case SO_NOADDRERR:
5645	optval = (so->so_flags & SOF_NOADDRAVAIL);
5646	goto integer;
5647
5648	case SO_REUSESHAREUID:
5649	optval = (so->so_flags & SOF_REUSESHAREUID);
5650	goto integer;
5651
5652
5653	case SO_NOTIFYCONFLICT:
5654	optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5655	goto integer;
5656
5657	case SO_RESTRICTIONS:
5658	optval = so_get_restrictions(so);
5659	goto integer;
5660
5661	case SO_AWDL_UNRESTRICTED:
5662	if (SOCK_DOM(so) == PF_INET \|\|
5663	SOCK_DOM(so) == PF_INET6) {
5664	optval = inp_get_awdl_unrestricted(
5665	sotoinpcb(so));
5666	goto integer;
5667	} else
5668	error = EOPNOTSUPP;
5669	break;
5670
5671	case SO_INTCOPROC_ALLOW:
5672	if (SOCK_DOM(so) == PF_INET6) {
5673	optval = inp_get_intcoproc_allowed(
5674	sotoinpcb(so));
5675	goto integer;
5676	} else
5677	error = EOPNOTSUPP;
5678	break;
5679
5680	case SO_LABEL:
5681	#if CONFIG_MACF_SOCKET
5682	if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5683	sizeof (extmac))) != `0` \|\|
5684	(error = mac_socket_label_get(proc_ucred(
5685	sopt->sopt_p), so, &extmac)) != `0`)
5686	break;
5687
5688	error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5689	#else
5690	error = EOPNOTSUPP;
5691	#endif /* MAC_SOCKET */
5692	break;
5693
5694	case SO_PEERLABEL:
5695	#if CONFIG_MACF_SOCKET
5696	if ((error = sooptcopyin(sopt, &extmac, sizeof (extmac),
5697	sizeof (extmac))) != `0` \|\|
5698	(error = mac_socketpeer_label_get(proc_ucred(
5699	sopt->sopt_p), so, &extmac)) != `0`)
5700	break;
5701
5702	error = sooptcopyout(sopt, &extmac, sizeof (extmac));
5703	#else
5704	error = EOPNOTSUPP;
5705	#endif /* MAC_SOCKET */
5706	break;
5707
5708	#ifdef __APPLE_API_PRIVATE
5709	case SO_UPCALLCLOSEWAIT:
5710	optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
5711	goto integer;
5712	#endif
5713	case SO_RANDOMPORT:
5714	optval = (so->so_flags & SOF_BINDRANDOMPORT);
5715	goto integer;
5716
5717	case SO_NP_EXTENSIONS: {
5718	struct so_np_extensions sonpx = {};
5719
5720	sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
5721	SONPX_SETOPTSHUT : `0`;
5722	sonpx.npx_mask = SONPX_MASK_VALID;
5723
5724	error = sooptcopyout(sopt, &sonpx,
5725	sizeof (struct so_np_extensions));
5726	break;
5727	}
5728
5729	case SO_TRAFFIC_CLASS:
5730	optval = so->so_traffic_class;
5731	goto integer;
5732
5733	case SO_RECV_TRAFFIC_CLASS:
5734	optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
5735	goto integer;
5736
5737	case SO_TRAFFIC_CLASS_STATS:
5738	error = sooptcopyout(sopt, &so->so_tc_stats,
5739	sizeof (so->so_tc_stats));
5740	break;
5741
5742	#if (DEVELOPMENT \|\| DEBUG)
5743	case SO_TRAFFIC_CLASS_DBG:
5744	error = sogetopt_tcdbg(so, sopt);
5745	break;
5746	#endif /* (DEVELOPMENT \|\| DEBUG) */
5747
5748	case SO_PRIVILEGED_TRAFFIC_CLASS:
5749	optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
5750	goto integer;
5751
5752	case SO_DEFUNCTOK:
5753	optval = !(so->so_flags & SOF_NODEFUNCT);
5754	goto integer;
5755
5756	case SO_ISDEFUNCT:
5757	optval = (so->so_flags & SOF_DEFUNCT);
5758	goto integer;
5759
5760	case SO_OPPORTUNISTIC:
5761	optval = so_get_opportunistic(so);
5762	goto integer;
5763
5764	case SO_FLUSH:
5765	/ This option is not gettable /
5766	error = EINVAL;
5767	break;
5768
5769	case SO_RECV_ANYIF:
5770	optval = so_get_recv_anyif(so);
5771	goto integer;
5772
5773	case SO_TRAFFIC_MGT_BACKGROUND:
5774	/ This option is handled by lower layer(s) /
5775	if (so->so_proto != NULL &&
5776	so->so_proto->pr_ctloutput != NULL) {
5777	(void) so->so_proto->pr_ctloutput(so, sopt);
5778	}
5779	break;
5780
5781	#if FLOW_DIVERT
5782	case SO_FLOW_DIVERT_TOKEN:
5783	error = flow_divert_token_get(so, sopt);
5784	break;
5785	#endif /* FLOW_DIVERT */
5786
5787	#if NECP
5788	case SO_NECP_ATTRIBUTES:
5789	error = necp_get_socket_attributes(so, sopt);
5790	break;
5791
5792	case SO_NECP_CLIENTUUID:
5793	{
5794	uuid_t *ncu;
5795
5796	if (SOCK_DOM(so) == PF_MULTIPATH) {
5797	ncu = &mpsotomppcb(so)->necp_client_uuid;
5798	} else if (SOCK_DOM(so) == PF_INET \|\| SOCK_DOM(so) == PF_INET6) {
5799	ncu = &sotoinpcb(so)->necp_client_uuid;
5800	} else {
5801	error = EINVAL;
5802	goto out;
5803	}
5804
5805	error = sooptcopyout(sopt, ncu, sizeof(uuid_t));
5806	break;
5807	}
5808	#endif /* NECP */
5809
5810	#if CONTENT_FILTER
5811	case SO_CFIL_SOCK_ID: {
5812	cfil_sock_id_t sock_id;
5813
5814	sock_id = cfil_sock_id_from_socket(so);
5815
5816	error = sooptcopyout(sopt, &sock_id,
5817	sizeof(cfil_sock_id_t));
5818	break;
5819	}
5820	#endif /* CONTENT_FILTER */
5821
5822	case SO_EXTENDED_BK_IDLE:
5823	optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
5824	goto integer;
5825	case SO_MARK_CELLFALLBACK:
5826	optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > `0`)
5827	? `1` : `0`;
5828	goto integer;
5829	case SO_NET_SERVICE_TYPE: {
5830	if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE))
5831	optval = so->so_netsvctype;
5832	else
5833	optval = NET_SERVICE_TYPE_BE;
5834	goto integer;
5835	}
5836	case SO_NETSVC_MARKING_LEVEL:
5837	optval = so_get_netsvc_marking_level(so);
5838	goto integer;
5839
5840	default:
5841	error = ENOPROTOOPT;
5842	break;
5843	}
5844	}
5845	out:
5846	if (dolock)
5847	socket_unlock(so, `1`);
5848	return (error);
5849	}
5850
5851	/*
5852	* The size limits on our soopt_getm is different from that on FreeBSD.
5853	* We limit the size of options to MCLBYTES. This will have to change
5854	* if we need to define options that need more space than MCLBYTES.
5855	*/
5856	int
5857	soopt_getm(struct sockopt sopt, struct* mbuf **mp)
5858	{
5859	struct mbuf m, m_prev;
5860	int sopt_size = sopt->sopt_valsize;
5861	int how;
5862
5863	if (sopt_size <= `0` \|\| sopt_size > MCLBYTES)
5864	return (EMSGSIZE);
5865
5866	how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
5867	MGET(m, how, MT_DATA);
5868	if (m == NULL)
5869	return (ENOBUFS);
5870	if (sopt_size > MLEN) {
5871	MCLGET(m, how);
5872	if ((m->m_flags & M_EXT) == `0`) {
5873	m_free(m);
5874	return (ENOBUFS);
5875	}
5876	m->m_len = min(MCLBYTES, sopt_size);
5877	} else {
5878	m->m_len = min(MLEN, sopt_size);
5879	}
5880	sopt_size -= m->m_len;
5881	*mp = m;
5882	m_prev = m;
5883
5884	while (sopt_size > `0`) {
5885	MGET(m, how, MT_DATA);
5886	if (m == NULL) {
5887	m_freem(*mp);
5888	return (ENOBUFS);
5889	}
5890	if (sopt_size > MLEN) {
5891	MCLGET(m, how);
5892	if ((m->m_flags & M_EXT) == `0`) {
5893	m_freem(*mp);
5894	m_freem(m);
5895	return (ENOBUFS);
5896	}
5897	m->m_len = min(MCLBYTES, sopt_size);
5898	} else {
5899	m->m_len = min(MLEN, sopt_size);
5900	}
5901	sopt_size -= m->m_len;
5902	m_prev->m_next = m;
5903	m_prev = m;
5904	}
5905	return (`0`);
5906	}
5907
5908	/ copyin sopt data into mbuf chain /
5909	int
5910	soopt_mcopyin(struct sockopt sopt, struct* mbuf *m)
5911	{
5912	struct mbuf *m0 = m;
5913
5914	if (sopt->sopt_val == USER_ADDR_NULL)
5915	return (`0`);
5916	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5917	if (sopt->sopt_p != kernproc) {
5918	int error;
5919
5920	error = copyin(sopt->sopt_val, mtod(m, char *),
5921	m->m_len);
5922	if (error != `0`) {
5923	m_freem(m0);
5924	return (error);
5925	}
5926	} else {
5927	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
5928	mtod(m, char *), m->m_len);
5929	}
5930	sopt->sopt_valsize -= m->m_len;
5931	sopt->sopt_val += m->m_len;
5932	m = m->m_next;
5933	}
5934	/ should be allocated enoughly at ip6_sooptmcopyin() /
5935	if (m != NULL) {
5936	panic("soopt_mcopyin");
5937	/ NOTREACHED /
5938	}
5939	return (`0`);
5940	}
5941
5942	/ copyout mbuf chain data into soopt /
5943	int
5944	soopt_mcopyout(struct sockopt sopt, struct* mbuf *m)
5945	{
5946	struct mbuf *m0 = m;
5947	size_t valsize = `0`;
5948
5949	if (sopt->sopt_val == USER_ADDR_NULL)
5950	return (`0`);
5951	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
5952	if (sopt->sopt_p != kernproc) {
5953	int error;
5954
5955	error = copyout(mtod(m, char *), sopt->sopt_val,
5956	m->m_len);
5957	if (error != `0`) {
5958	m_freem(m0);
5959	return (error);
5960	}
5961	} else {
5962	bcopy(mtod(m, char *),
5963	CAST_DOWN(caddr_t, sopt->sopt_val), m->m_len);
5964	}
5965	sopt->sopt_valsize -= m->m_len;
5966	sopt->sopt_val += m->m_len;
5967	valsize += m->m_len;
5968	m = m->m_next;
5969	}
5970	if (m != NULL) {
5971	/ enough soopt buffer should be given from user-land /
5972	m_freem(m0);
5973	return (EINVAL);
5974	}
5975	sopt->sopt_valsize = valsize;
5976	return (`0`);
5977	}
5978
5979	void
5980	sohasoutofband(struct socket *so)
5981	{
5982	if (so->so_pgid < `0`)
5983	gsignal(-so->so_pgid, SIGURG);
5984	else if (so->so_pgid > `0`)
5985	proc_signal(so->so_pgid, SIGURG);
5986	selwakeup(&so->so_rcv.sb_sel);
5987	if (so->so_rcv.sb_flags & SB_KNOTE) {
5988	KNOTE(&so->so_rcv.sb_sel.si_note,
5989	(NOTE_OOB \| SO_FILT_HINT_LOCKED));
5990	}
5991	}
5992
5993	int
5994	sopoll(struct socket so, int* events, kauth_cred_t cred, void * wql)
5995	{
5996	#pragma unused(cred)
5997	struct proc *p = current_proc();
5998	int revents = `0`;
5999
6000	socket_lock(so, `1`);
6001	so_update_last_owner_locked(so, PROC_NULL);
6002	so_update_policy(so);
6003
6004	if (events & (POLLIN \| POLLRDNORM))
6005	if (soreadable(so))
6006	revents \|= events & (POLLIN \| POLLRDNORM);
6007
6008	if (events & (POLLOUT \| POLLWRNORM))
6009	if (sowriteable(so))
6010	revents \|= events & (POLLOUT \| POLLWRNORM);
6011
6012	if (events & (POLLPRI \| POLLRDBAND))
6013	if (so->so_oobmark \|\| (so->so_state & SS_RCVATMARK))
6014	revents \|= events & (POLLPRI \| POLLRDBAND);
6015
6016	if (revents == `0`) {
6017	if (events & (POLLIN \| POLLPRI \| POLLRDNORM \| POLLRDBAND)) {
6018	/*
6019	* Darwin sets the flag first,
6020	* BSD calls selrecord first
6021	*/
6022	so->so_rcv.sb_flags \|= SB_SEL;
6023	selrecord(p, &so->so_rcv.sb_sel, wql);
6024	}
6025
6026	if (events & (POLLOUT \| POLLWRNORM)) {
6027	/*
6028	* Darwin sets the flag first,
6029	* BSD calls selrecord first
6030	*/
6031	so->so_snd.sb_flags \|= SB_SEL;
6032	selrecord(p, &so->so_snd.sb_sel, wql);
6033	}
6034	}
6035
6036	socket_unlock(so, `1`);
6037	return (revents);
6038	}
6039
6040	int
6041	soo_kqfilter(struct fileproc fp, struct* knote *kn,
6042	struct kevent_internal_s *kev, vfs_context_t ctx)
6043	{
6044	#pragma unused(fp)
6045	#if !CONFIG_MACF_SOCKET
6046	#pragma unused(ctx)
6047	#endif /* MAC_SOCKET */
6048	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6049	int result;
6050
6051	socket_lock(so, `1`);
6052	so_update_last_owner_locked(so, PROC_NULL);
6053	so_update_policy(so);
6054
6055	#if CONFIG_MACF_SOCKET
6056	if (mac_socket_check_kqfilter(proc_ucred(vfs_context_proc(ctx)),
6057	kn, so) != `0`) {
6058	socket_unlock(so, `1`);
6059	kn->kn_flags = EV_ERROR;
6060	kn->kn_data = EPERM;
6061	return `0`;
6062	}
6063	#endif /* MAC_SOCKET */
6064
6065	switch (kn->kn_filter) {
6066	case EVFILT_READ:
6067	kn->kn_filtid = EVFILTID_SOREAD;
6068	break;
6069	case EVFILT_WRITE:
6070	kn->kn_filtid = EVFILTID_SOWRITE;
6071	break;
6072	case EVFILT_SOCK:
6073	kn->kn_filtid = EVFILTID_SCK;
6074	break;
6075	case EVFILT_EXCEPT:
6076	kn->kn_filtid = EVFILTID_SOEXCEPT;
6077	break;
6078	default:
6079	socket_unlock(so, `1`);
6080	kn->kn_flags = EV_ERROR;
6081	kn->kn_data = EINVAL;
6082	return `0`;
6083	}
6084
6085	/*
6086	* call the appropriate sub-filter attach
6087	* with the socket still locked
6088	*/
6089	result = knote_fops(kn)->f_attach(kn, kev);
6090
6091	socket_unlock(so, `1`);
6092
6093	return result;
6094	}
6095
6096	static int
6097	filt_soread_common(struct knote kn, struct* socket *so)
6098	{
6099	if (so->so_options & SO_ACCEPTCONN) {
6100	int is_not_empty;
6101
6102	/*
6103	* Radar 6615193 handle the listen case dynamically
6104	* for kqueue read filter. This allows to call listen()
6105	* after registering the kqueue EVFILT_READ.
6106	*/
6107
6108	kn->kn_data = so->so_qlen;
6109	is_not_empty = ! TAILQ_EMPTY(&so->so_comp);
6110
6111	return (is_not_empty);
6112	}
6113
6114	/ socket isn't a listener /
6115	/*
6116	* NOTE_LOWAT specifies new low water mark in data, i.e.
6117	* the bytes of protocol data. We therefore exclude any
6118	* control bytes.
6119	*/
6120	kn->kn_data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6121
6122	if (kn->kn_sfflags & NOTE_OOB) {
6123	if (so->so_oobmark \|\| (so->so_state & SS_RCVATMARK)) {
6124	kn->kn_fflags \|= NOTE_OOB;
6125	kn->kn_data -= so->so_oobmark;
6126	return (`1`);
6127	}
6128	}
6129
6130	if ((so->so_state & SS_CANTRCVMORE)
6131	#if CONTENT_FILTER
6132	&& cfil_sock_data_pending(&so->so_rcv) == `0`
6133	#endif /* CONTENT_FILTER */
6134	) {
6135	kn->kn_flags \|= EV_EOF;
6136	kn->kn_fflags = so->so_error;
6137	return (`1`);
6138	}
6139
6140	if (so->so_error) { / temporary udp error /
6141	return (`1`);
6142	}
6143
6144	int64_t lowwat = so->so_rcv.sb_lowat;
6145	/*
6146	* Ensure that when NOTE_LOWAT is used, the derived
6147	* low water mark is bounded by socket's rcv buf's
6148	* high and low water mark values.
6149	*/
6150	if (kn->kn_sfflags & NOTE_LOWAT) {
6151	if (kn->kn_sdata > so->so_rcv.sb_hiwat)
6152	lowwat = so->so_rcv.sb_hiwat;
6153	else if (kn->kn_sdata > lowwat)
6154	lowwat = kn->kn_sdata;
6155	}
6156
6157	/*
6158	* The order below is important. Since NOTE_LOWAT
6159	* overrides sb_lowat, check for NOTE_LOWAT case
6160	* first.
6161	*/
6162	if (kn->kn_sfflags & NOTE_LOWAT)
6163	return (kn->kn_data >= lowwat);
6164
6165	return (so->so_rcv.sb_cc >= lowwat);
6166	}
6167
6168	static int
6169	filt_sorattach(struct knote kn, __unused struct* kevent_internal_s *kev)
6170	{
6171	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6172
6173	/ socket locked /
6174
6175	/*
6176	* If the caller explicitly asked for OOB results (e.g. poll())
6177	* from EVFILT_READ, then save that off in the hookid field
6178	* and reserve the kn_flags EV_OOBAND bit for output only.
6179	*/
6180	if (kn->kn_filter == EVFILT_READ &&
6181	kn->kn_flags & EV_OOBAND) {
6182	kn->kn_flags &= ~EV_OOBAND;
6183	kn->kn_hookid = EV_OOBAND;
6184	} else {
6185	kn->kn_hookid = `0`;
6186	}
6187	if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn))
6188	so->so_rcv.sb_flags \|= SB_KNOTE;
6189
6190	/ indicate if event is already fired /
6191	return filt_soread_common(kn, so);
6192	}
6193
6194	static void
6195	filt_sordetach(struct knote *kn)
6196	{
6197	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6198
6199	socket_lock(so, `1`);
6200	if (so->so_rcv.sb_flags & SB_KNOTE)
6201	if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn))
6202	so->so_rcv.sb_flags &= ~SB_KNOTE;
6203	socket_unlock(so, `1`);
6204	}
6205
6206	/ARGSUSED/
6207	static int
6208	filt_soread(struct knote kn, long* hint)
6209	{
6210	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6211	int retval;
6212
6213	if ((hint & SO_FILT_HINT_LOCKED) == `0`)
6214	socket_lock(so, `1`);
6215
6216	retval = filt_soread_common(kn, so);
6217
6218	if ((hint & SO_FILT_HINT_LOCKED) == `0`)
6219	socket_unlock(so, `1`);
6220
6221	return retval;
6222	}
6223
6224	static int
6225	filt_sortouch(struct knote kn, struct* kevent_internal_s *kev)
6226	{
6227	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6228	int retval;
6229
6230	socket_lock(so, `1`);
6231
6232	/ save off the new input fflags and data /
6233	kn->kn_sfflags = kev->fflags;
6234	kn->kn_sdata = kev->data;
6235
6236	/ determine if changes result in fired events /
6237	retval = filt_soread_common(kn, so);
6238
6239	socket_unlock(so, `1`);
6240
6241	return retval;
6242	}
6243
6244	static int
6245	filt_sorprocess(struct knote kn, struct* filt_process_s data, struct* kevent_internal_s *kev)
6246	{
6247	#pragma unused(data)
6248	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6249	int retval;
6250
6251	socket_lock(so, `1`);
6252	retval = filt_soread_common(kn, so);
6253	if (retval) {
6254	*kev = kn->kn_kevent;
6255	if (kn->kn_flags & EV_CLEAR) {
6256	kn->kn_fflags = `0`;
6257	kn->kn_data = `0`;
6258	}
6259	}
6260	socket_unlock(so, `1`);
6261
6262	return retval;
6263	}
6264
6265	int
6266	so_wait_for_if_feedback(struct socket *so)
6267	{
6268	if ((SOCK_DOM(so) == PF_INET \|\| SOCK_DOM(so) == PF_INET6) &&
6269	(so->so_state & SS_ISCONNECTED)) {
6270	struct inpcb *inp = sotoinpcb(so);
6271	if (INP_WAIT_FOR_IF_FEEDBACK(inp))
6272	return (`1`);
6273	}
6274	return (`0`);
6275	}
6276
6277	static int
6278	filt_sowrite_common(struct knote kn, struct* socket *so)
6279	{
6280	int ret = `0`;
6281
6282	kn->kn_data = sbspace(&so->so_snd);
6283	if (so->so_state & SS_CANTSENDMORE) {
6284	kn->kn_flags \|= EV_EOF;
6285	kn->kn_fflags = so->so_error;
6286	return `1`;
6287	}
6288	if (so->so_error) { / temporary udp error /
6289	return `1`;
6290	}
6291	if (!socanwrite(so)) {
6292	return `0`;
6293	}
6294	if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6295	return `1`;
6296	}
6297	int64_t lowwat = so->so_snd.sb_lowat;
6298	if (kn->kn_sfflags & NOTE_LOWAT) {
6299	if (kn->kn_sdata > so->so_snd.sb_hiwat)
6300	lowwat = so->so_snd.sb_hiwat;
6301	else if (kn->kn_sdata > lowwat)
6302	lowwat = kn->kn_sdata;
6303	}
6304	if (kn->kn_data >= lowwat) {
6305	if ((so->so_flags & SOF_NOTSENT_LOWAT)
6306	#if (DEBUG \|\| DEVELOPMENT)
6307	&& so_notsent_lowat_check == `1`
6308	#endif /* DEBUG \|\| DEVELOPMENT */
6309	) {
6310	if ((SOCK_DOM(so) == PF_INET \|\|
6311	SOCK_DOM(so) == PF_INET6) &&
6312	so->so_type == SOCK_STREAM) {
6313	ret = tcp_notsent_lowat_check(so);
6314	}
6315	#if MPTCP
6316	else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6317	(SOCK_PROTO(so) == IPPROTO_TCP)) {
6318	ret = mptcp_notsent_lowat_check(so);
6319	}
6320	#endif
6321	else {
6322	return `1`;
6323	}
6324	} else {
6325	ret = `1`;
6326	}
6327	}
6328	if (so_wait_for_if_feedback(so))
6329	ret = `0`;
6330	return (ret);
6331	}
6332
6333	static int
6334	filt_sowattach(struct knote kn, __unused struct* kevent_internal_s *kev)
6335	{
6336	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6337
6338	/ socket locked /
6339	if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn))
6340	so->so_snd.sb_flags \|= SB_KNOTE;
6341
6342	/ determine if its already fired /
6343	return filt_sowrite_common(kn, so);
6344	}
6345
6346	static void
6347	filt_sowdetach(struct knote *kn)
6348	{
6349	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6350	socket_lock(so, `1`);
6351
6352	if (so->so_snd.sb_flags & SB_KNOTE)
6353	if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn))
6354	so->so_snd.sb_flags &= ~SB_KNOTE;
6355	socket_unlock(so, `1`);
6356	}
6357
6358	/ARGSUSED/
6359	static int
6360	filt_sowrite(struct knote kn, long* hint)
6361	{
6362	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6363	int ret;
6364
6365	if ((hint & SO_FILT_HINT_LOCKED) == `0`)
6366	socket_lock(so, `1`);
6367
6368	ret = filt_sowrite_common(kn, so);
6369
6370	if ((hint & SO_FILT_HINT_LOCKED) == `0`)
6371	socket_unlock(so, `1`);
6372
6373	return ret;
6374	}
6375
6376	static int
6377	filt_sowtouch(struct knote kn, struct* kevent_internal_s *kev)
6378	{
6379	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6380	int ret;
6381
6382	socket_lock(so, `1`);
6383
6384	/save off the new input fflags and data /
6385	kn->kn_sfflags = kev->fflags;
6386	kn->kn_sdata = kev->data;
6387
6388	/ determine if these changes result in a triggered event /
6389	ret = filt_sowrite_common(kn, so);
6390
6391	socket_unlock(so, `1`);
6392
6393	return ret;
6394	}
6395
6396	static int
6397	filt_sowprocess(struct knote kn, struct* filt_process_s data, struct* kevent_internal_s *kev)
6398	{
6399	#pragma unused(data)
6400	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6401	int ret;
6402
6403	socket_lock(so, `1`);
6404	ret = filt_sowrite_common(kn, so);
6405	if (ret) {
6406	*kev = kn->kn_kevent;
6407	if (kn->kn_flags & EV_CLEAR) {
6408	kn->kn_fflags = `0`;
6409	kn->kn_data = `0`;
6410	}
6411	}
6412	socket_unlock(so, `1`);
6413	return ret;
6414	}
6415
6416	static int
6417	filt_sockev_common(struct knote kn, struct* socket so, long* ev_hint)
6418	{
6419	int ret = `0`;
6420	uint32_t level_trigger = `0`;
6421
6422	if (ev_hint & SO_FILT_HINT_CONNRESET) {
6423	kn->kn_fflags \|= NOTE_CONNRESET;
6424	}
6425	if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6426	kn->kn_fflags \|= NOTE_TIMEOUT;
6427	}
6428	if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6429	kn->kn_fflags \|= NOTE_NOSRCADDR;
6430	}
6431	if (ev_hint & SO_FILT_HINT_IFDENIED) {
6432	kn->kn_fflags \|= NOTE_IFDENIED;
6433	}
6434	if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6435	kn->kn_fflags \|= NOTE_KEEPALIVE;
6436	}
6437	if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6438	kn->kn_fflags \|= NOTE_ADAPTIVE_WTIMO;
6439	}
6440	if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6441	kn->kn_fflags \|= NOTE_ADAPTIVE_RTIMO;
6442	}
6443	if ((ev_hint & SO_FILT_HINT_CONNECTED) \|\|
6444	(so->so_state & SS_ISCONNECTED)) {
6445	kn->kn_fflags \|= NOTE_CONNECTED;
6446	level_trigger \|= NOTE_CONNECTED;
6447	}
6448	if ((ev_hint & SO_FILT_HINT_DISCONNECTED) \|\|
6449	(so->so_state & SS_ISDISCONNECTED)) {
6450	kn->kn_fflags \|= NOTE_DISCONNECTED;
6451	level_trigger \|= NOTE_DISCONNECTED;
6452	}
6453	if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6454	if (so->so_proto != NULL &&
6455	(so->so_proto->pr_flags & PR_EVCONNINFO))
6456	kn->kn_fflags \|= NOTE_CONNINFO_UPDATED;
6457	}
6458
6459	if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) \|\|
6460	tcp_notify_ack_active(so)) {
6461	kn->kn_fflags \|= NOTE_NOTIFY_ACK;
6462	}
6463
6464	if ((so->so_state & SS_CANTRCVMORE)
6465	#if CONTENT_FILTER
6466	&& cfil_sock_data_pending(&so->so_rcv) == `0`
6467	#endif /* CONTENT_FILTER */
6468	) {
6469	kn->kn_fflags \|= NOTE_READCLOSED;
6470	level_trigger \|= NOTE_READCLOSED;
6471	}
6472
6473	if (so->so_state & SS_CANTSENDMORE) {
6474	kn->kn_fflags \|= NOTE_WRITECLOSED;
6475	level_trigger \|= NOTE_WRITECLOSED;
6476	}
6477
6478	if ((ev_hint & SO_FILT_HINT_SUSPEND) \|\|
6479	(so->so_flags & SOF_SUSPENDED)) {
6480	kn->kn_fflags &= ~(NOTE_SUSPEND \| NOTE_RESUME);
6481
6482	/ If resume event was delivered before, reset it /
6483	kn->kn_hookid &= ~NOTE_RESUME;
6484
6485	kn->kn_fflags \|= NOTE_SUSPEND;
6486	level_trigger \|= NOTE_SUSPEND;
6487	}
6488
6489	if ((ev_hint & SO_FILT_HINT_RESUME) \|\|
6490	(so->so_flags & SOF_SUSPENDED) == `0`) {
6491	kn->kn_fflags &= ~(NOTE_SUSPEND \| NOTE_RESUME);
6492
6493	/ If suspend event was delivered before, reset it /
6494	kn->kn_hookid &= ~NOTE_SUSPEND;
6495
6496	kn->kn_fflags \|= NOTE_RESUME;
6497	level_trigger \|= NOTE_RESUME;
6498	}
6499
6500	if (so->so_error != `0`) {
6501	ret = `1`;
6502	kn->kn_data = so->so_error;
6503	kn->kn_flags \|= EV_EOF;
6504	} else {
6505	get_sockev_state(so, (u_int32_t *)&(kn->kn_data));
6506	}
6507
6508	/ Reset any events that are not requested on this knote /
6509	kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6510	level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6511
6512	/ Find the level triggerred events that are already delivered /
6513	level_trigger &= kn->kn_hookid;
6514	level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6515
6516	/ Do not deliver level triggerred events more than once /
6517	if ((kn->kn_fflags & ~level_trigger) != `0`)
6518	ret = `1`;
6519
6520	return (ret);
6521	}
6522
6523	static int
6524	filt_sockattach(struct knote kn, __unused struct* kevent_internal_s *kev)
6525	{
6526	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6527
6528	/ socket locked /
6529	kn->kn_hookid = `0`;
6530	if (KNOTE_ATTACH(&so->so_klist, kn))
6531	so->so_flags \|= SOF_KNOTE;
6532
6533	/ determine if event already fired /
6534	return filt_sockev_common(kn, so, `0`);
6535	}
6536
6537	static void
6538	filt_sockdetach(struct knote *kn)
6539	{
6540	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6541	socket_lock(so, `1`);
6542
6543	if ((so->so_flags & SOF_KNOTE) != `0`)
6544	if (KNOTE_DETACH(&so->so_klist, kn))
6545	so->so_flags &= ~SOF_KNOTE;
6546	socket_unlock(so, `1`);
6547	}
6548
6549	static int
6550	filt_sockev(struct knote kn, long* hint)
6551	{
6552	int ret = `0`, locked = `0`;
6553	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6554	long ev_hint = (hint & SO_FILT_HINT_EV);
6555
6556	if ((hint & SO_FILT_HINT_LOCKED) == `0`) {
6557	socket_lock(so, `1`);
6558	locked = `1`;
6559	}
6560
6561	ret = filt_sockev_common(kn, so, ev_hint);
6562
6563	if (locked)
6564	socket_unlock(so, `1`);
6565
6566	return ret;
6567	}
6568
6569
6570
6571	/*
6572	* filt_socktouch - update event state
6573	*/
6574	static int
6575	filt_socktouch(
6576	struct knote *kn,
6577	struct kevent_internal_s *kev)
6578	{
6579	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6580	uint32_t changed_flags;
6581	int ret;
6582
6583	socket_lock(so, `1`);
6584
6585	/ save off the [result] data and fflags /
6586	changed_flags = (kn->kn_sfflags ^ kn->kn_hookid);
6587
6588	/ save off the new input fflags and data /
6589	kn->kn_sfflags = kev->fflags;
6590	kn->kn_sdata = kev->data;
6591
6592	/ restrict the current results to the (smaller?) set of new interest /
6593	/*
6594	* For compatibility with previous implementations, we leave kn_fflags
6595	* as they were before.
6596	*/
6597	//kn->kn_fflags &= kev->fflags;
6598
6599	/*
6600	* Since we keep track of events that are already
6601	* delivered, if any of those events are not requested
6602	* anymore the state related to them can be reset
6603	*/
6604	kn->kn_hookid &=
6605	~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6606
6607	/ determine if we have events to deliver /
6608	ret = filt_sockev_common(kn, so, `0`);
6609
6610	socket_unlock(so, `1`);
6611
6612	return ret;
6613	}
6614
6615	/*
6616	* filt_sockprocess - query event fired state and return data
6617	*/
6618	static int
6619	filt_sockprocess(
6620	struct knote *kn,
6621	struct filt_process_s *data,
6622	struct kevent_internal_s *kev)
6623	{
6624	#pragma unused(data)
6625
6626	struct socket so = (struct* socket *)kn->kn_fp->f_fglob->fg_data;
6627	int ret = `0`;
6628
6629	socket_lock(so, `1`);
6630
6631	ret = filt_sockev_common(kn, so, `0`);
6632	if (ret) {
6633	*kev = kn->kn_kevent;
6634
6635	/*
6636	* Store the state of the events being delivered. This
6637	* state can be used to deliver level triggered events
6638	* ateast once and still avoid waking up the application
6639	* multiple times as long as the event is active.
6640	*/
6641	if (kn->kn_fflags != `0`)
6642	kn->kn_hookid \|= (kn->kn_fflags &
6643	EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6644
6645	/*
6646	* NOTE_RESUME and NOTE_SUSPEND are an exception, deliver
6647	* only one of them and remember the last one that was
6648	* delivered last
6649	*/
6650	if (kn->kn_fflags & NOTE_SUSPEND)
6651	kn->kn_hookid &= ~NOTE_RESUME;
6652	if (kn->kn_fflags & NOTE_RESUME)
6653	kn->kn_hookid &= ~NOTE_SUSPEND;
6654
6655	if (kn->kn_flags & EV_CLEAR) {
6656	kn->kn_data = `0`;
6657	kn->kn_fflags = `0`;
6658	}
6659	}
6660
6661	socket_unlock(so, `1`);
6662
6663	return ret;
6664	}
6665
6666	void
6667	get_sockev_state(struct socket so, u_int32_t statep)
6668	{
6669	u_int32_t state = *(statep);
6670
6671	/*
6672	* If the state variable is already used by a previous event,
6673	* reset it.
6674	*/
6675	if (state != `0`)
6676	return;
6677
6678	if (so->so_state & SS_ISCONNECTED)
6679	state \|= SOCKEV_CONNECTED;
6680	else
6681	state &= ~(SOCKEV_CONNECTED);
6682	state \|= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : `0`);
6683	*(statep) = state;
6684	}
6685
6686	#define SO_LOCK_HISTORY_STR_LEN \
6687	(2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
6688
6689	__private_extern__ const char *
6690	solockhistory_nr(struct socket *so)
6691	{
6692	size_t n = `0`;
6693	int i;
6694	static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
6695
6696	bzero(lock_history_str, sizeof (lock_history_str));
6697	for (i = SO_LCKDBG_MAX - `1`; i >= `0`; i--) {
6698	n += snprintf(lock_history_str + n,
6699	SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
6700	so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
6701	so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
6702	}
6703	return (lock_history_str);
6704	}
6705
6706	void
6707	socket_lock(struct socket so, int* refcount)
6708	{
6709	void *lr_saved;
6710
6711	lr_saved = __builtin_return_address(`0`);
6712
6713	if (so->so_proto->pr_lock) {
6714	(*so->so_proto->pr_lock)(so, refcount, lr_saved);
6715	} else {
6716	#ifdef MORE_LOCKING_DEBUG
6717	LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
6718	LCK_MTX_ASSERT_NOTOWNED);
6719	#endif
6720	lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
6721	if (refcount)
6722	so->so_usecount++;
6723	so->lock_lr[so->next_lock_lr] = lr_saved;
6724	so->next_lock_lr = (so->next_lock_lr+`1`) % SO_LCKDBG_MAX;
6725	}
6726	}
6727
6728	void
6729	socket_lock_assert_owned(struct socket *so)
6730	{
6731	lck_mtx_t *mutex_held;
6732
6733	if (so->so_proto->pr_getlock != NULL)
6734	mutex_held = (*so->so_proto->pr_getlock)(so, `0`);
6735	else
6736	mutex_held = so->so_proto->pr_domain->dom_mtx;
6737
6738	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6739	}
6740
6741	int
6742	socket_try_lock(struct socket *so)
6743	{
6744	lck_mtx_t *mtx;
6745
6746	if (so->so_proto->pr_getlock != NULL)
6747	mtx = (*so->so_proto->pr_getlock)(so, `0`);
6748	else
6749	mtx = so->so_proto->pr_domain->dom_mtx;
6750
6751	return (lck_mtx_try_lock(mtx));
6752	}
6753
6754	void
6755	socket_unlock(struct socket so, int* refcount)
6756	{
6757	void *lr_saved;
6758	lck_mtx_t *mutex_held;
6759
6760	lr_saved = __builtin_return_address(`0`);
6761
6762	if (so->so_proto == NULL) {
6763	panic("%s: null so_proto so=%p\n", __func__, so);
6764	/ NOTREACHED /
6765	}
6766
6767	if (so && so->so_proto->pr_unlock) {
6768	(*so->so_proto->pr_unlock)(so, refcount, lr_saved);
6769	} else {
6770	mutex_held = so->so_proto->pr_domain->dom_mtx;
6771	#ifdef MORE_LOCKING_DEBUG
6772	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6773	#endif
6774	so->unlock_lr[so->next_unlock_lr] = lr_saved;
6775	so->next_unlock_lr = (so->next_unlock_lr+`1`) % SO_LCKDBG_MAX;
6776
6777	if (refcount) {
6778	if (so->so_usecount <= `0`) {
6779	panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
6780	"lrh=%s", __func__, so->so_usecount, so,
6781	SOCK_DOM(so), so->so_type,
6782	SOCK_PROTO(so), solockhistory_nr(so));
6783	/ NOTREACHED /
6784	}
6785
6786	so->so_usecount--;
6787	if (so->so_usecount == `0`)
6788	sofreelastref(so, `1`);
6789	}
6790	lck_mtx_unlock(mutex_held);
6791	}
6792	}
6793
6794	/ Called with socket locked, will unlock socket /
6795	void
6796	sofree(struct socket *so)
6797	{
6798	lck_mtx_t *mutex_held;
6799
6800	if (so->so_proto->pr_getlock != NULL)
6801	mutex_held = (*so->so_proto->pr_getlock)(so, `0`);
6802	else
6803	mutex_held = so->so_proto->pr_domain->dom_mtx;
6804	LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
6805
6806	sofreelastref(so, `0`);
6807	}
6808
/*
 * Take one reference on a socket: lock it with the refcount argument
 * set, then unlock it without dropping the reference again.
 */
void
soreference(struct socket *so)
{
	/* Lock and take one reference on the socket. */
	socket_lock(so, 1);
	/* Unlock only; the reference is kept. */
	socket_unlock(so, 0);
}
6815
/*
 * Drop one reference on a socket: lock it without taking a reference,
 * then unlock it with the refcount argument set.
 */
void
sodereference(struct socket *so)
{
	/* Lock only; no reference is taken. */
	socket_lock(so, 0);
	/* Unlock and release one reference. */
	socket_unlock(so, 1);
}
6822
6823	/*
6824	* Set or clear SOF_MULTIPAGES on the socket to enable or disable the
6825	* possibility of using jumbo clusters. Caller must ensure to hold
6826	* the socket lock.
6827	*/
6828	void
6829	somultipages(struct socket *so, boolean_t set)
6830	{
6831	if (set)
6832	so->so_flags \|= SOF_MULTIPAGES;
6833	else
6834	so->so_flags &= ~SOF_MULTIPAGES;
6835	}
6836
6837	void
6838	soif2kcl(struct socket *so, boolean_t set)
6839	{
6840	if (set)
6841	so->so_flags1 \|= SOF1_IF_2KCL;
6842	else
6843	so->so_flags1 &= ~SOF1_IF_2KCL;
6844	}
6845
6846	int
6847	so_isdstlocal(struct socket *so) {
6848
6849	struct inpcb inp = (struct* inpcb *)so->so_pcb;
6850
6851	if (SOCK_DOM(so) == PF_INET)
6852	return (inaddr_local(inp->inp_faddr));
6853	else if (SOCK_DOM(so) == PF_INET6)
6854	return (in6addr_local(&inp->in6p_faddr));
6855
6856	return (`0`);
6857	}
6858
6859	int
6860	sosetdefunct(struct proc p, struct* socket so, int* level, boolean_t noforce)
6861	{
6862	struct sockbuf rcv, snd;
6863	int err = `0`, defunct;
6864
6865	rcv = &so->so_rcv;
6866	snd = &so->so_snd;
6867
6868	defunct = (so->so_flags & SOF_DEFUNCT);
6869	if (defunct) {
6870	if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
6871	panic("%s: SB_DROP not set", __func__);
6872	/ NOTREACHED /
6873	}
6874	goto done;
6875	}
6876
6877	if (so->so_flags & SOF_NODEFUNCT) {
6878	if (noforce) {
6879	err = EOPNOTSUPP;
6880	if (p != PROC_NULL) {
6881	SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6882	"name %s level %d) so 0x%llx [%d,%d] "
6883	"is not eligible for defunct "
6884	"(%d)\n", __func__, proc_selfpid(),
6885	proc_best_name(current_proc()), proc_pid(p),
6886	proc_best_name(p), level,
6887	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6888	SOCK_DOM(so), SOCK_TYPE(so), err);
6889	}
6890	return (err);
6891	}
6892	so->so_flags &= ~SOF_NODEFUNCT;
6893	if (p != PROC_NULL) {
6894	SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6895	"name %s level %d) so 0x%llx [%d,%d] "
6896	"defunct by force "
6897	"(%d)\n", __func__, proc_selfpid(),
6898	proc_best_name(current_proc()), proc_pid(p),
6899	proc_best_name(p), level,
6900	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6901	SOCK_DOM(so), SOCK_TYPE(so), err);
6902	}
6903	} else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
6904	struct inpcb inp = (struct* inpcb *)so->so_pcb;
6905	struct ifnet *ifp = inp->inp_last_outifp;
6906
6907	if (ifp && IFNET_IS_CELLULAR(ifp)) {
6908	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
6909	} else if (so->so_flags & SOF_DELEGATED) {
6910	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
6911	} else if (soextbkidlestat.so_xbkidle_time == `0`) {
6912	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
6913	} else if (noforce && p != PROC_NULL) {
6914	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
6915
6916	so->so_flags1 \|= SOF1_EXTEND_BK_IDLE_INPROG;
6917	so->so_extended_bk_start = net_uptime();
6918	OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
6919
6920	inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
6921
6922	err = EOPNOTSUPP;
6923	SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
6924	"name %s level %d) so 0x%llx [%d,%d] "
6925	"extend bk idle "
6926	"(%d)\n", __func__, proc_selfpid(),
6927	proc_best_name(current_proc()), proc_pid(p),
6928	proc_best_name(p), level,
6929	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6930	SOCK_DOM(so), SOCK_TYPE(so), err);
6931	return (err);
6932	} else {
6933	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
6934	}
6935	}
6936
6937	so->so_flags \|= SOF_DEFUNCT;
6938
6939	/ Prevent further data from being appended to the socket buffers /
6940	snd->sb_flags \|= SB_DROP;
6941	rcv->sb_flags \|= SB_DROP;
6942
6943	/ Flush any existing data in the socket buffers /
6944	if (rcv->sb_cc != `0`) {
6945	rcv->sb_flags &= ~SB_SEL;
6946	selthreadclear(&rcv->sb_sel);
6947	sbrelease(rcv);
6948	}
6949	if (snd->sb_cc != `0`) {
6950	snd->sb_flags &= ~SB_SEL;
6951	selthreadclear(&snd->sb_sel);
6952	sbrelease(snd);
6953	}
6954
6955	done:
6956	if (p != PROC_NULL) {
6957	SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
6958	"so 0x%llx [%d,%d] %s defunct%s\n", __func__,
6959	proc_selfpid(), proc_best_name(current_proc()),
6960	proc_pid(p), proc_best_name(p), level,
6961	(uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
6962	SOCK_TYPE(so), defunct ? "is already" : "marked as",
6963	(so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
6964	" extbkidle" : "");
6965	}
6966	return (err);
6967	}
6968
6969	int
6970	sodefunct(struct proc p, struct* socket so, int* level)
6971	{
6972	struct sockbuf rcv, snd;
6973
6974	if (!(so->so_flags & SOF_DEFUNCT)) {
6975	panic("%s improperly called", __func__);
6976	/ NOTREACHED /
6977	}
6978	if (so->so_state & SS_DEFUNCT)
6979	goto done;
6980
6981	rcv = &so->so_rcv;
6982	snd = &so->so_snd;
6983
6984	if (SOCK_DOM(so) == PF_INET \|\| SOCK_DOM(so) == PF_INET6) {
6985	char s[MAX_IPv6_STR_LEN];
6986	char d[MAX_IPv6_STR_LEN];
6987	struct inpcb *inp = sotoinpcb(so);
6988
6989	if (p != PROC_NULL) {
6990	SODEFUNCTLOG(
6991	"%s[%d, %s]: (target pid %d name %s level %d) "
6992	"so 0x%llx [%s %s:%d -> %s:%d] is now defunct "
6993	"[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
6994	" snd_fl 0x%x]\n", __func__,
6995	proc_selfpid(), proc_best_name(current_proc()),
6996	proc_pid(p), proc_best_name(p), level,
6997	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
6998	(SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
6999	inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7000	(void *)&inp->inp_laddr.s_addr :
7001	(void *)&inp->in6p_laddr),
7002	s, sizeof (s)), ntohs(inp->in6p_lport),
7003	inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7004	(void *)&inp->inp_faddr.s_addr :
7005	(void *)&inp->in6p_faddr,
7006	d, sizeof (d)), ntohs(inp->in6p_fport),
7007	(uint32_t)rcv->sb_sel.si_flags,
7008	(uint32_t)snd->sb_sel.si_flags,
7009	rcv->sb_flags, snd->sb_flags);
7010	}
7011	} else if (p != PROC_NULL) {
7012	SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7013	"so 0x%llx [%d,%d] is now defunct [rcv_si 0x%x, "
7014	"snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7015	proc_selfpid(), proc_best_name(current_proc()),
7016	proc_pid(p), proc_best_name(p), level,
7017	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7018	SOCK_DOM(so), SOCK_TYPE(so),
7019	(uint32_t)rcv->sb_sel.si_flags,
7020	(uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7021	snd->sb_flags);
7022	}
7023
7024	/*
7025	* Unwedge threads blocked on sbwait() and sb_lock().
7026	*/
7027	sbwakeup(rcv);
7028	sbwakeup(snd);
7029
7030	so->so_flags1 \|= SOF1_DEFUNCTINPROG;
7031	if (rcv->sb_flags & SB_LOCK)
7032	sbunlock(rcv, TRUE); / keep socket locked /
7033	if (snd->sb_flags & SB_LOCK)
7034	sbunlock(snd, TRUE); / keep socket locked /
7035
7036	/*
7037	* Flush the buffers and disconnect. We explicitly call shutdown
7038	* on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7039	* states are set for the socket. This would also flush out data
7040	* hanging off the receive list of this socket.
7041	*/
7042	(void) soshutdownlock_final(so, SHUT_RD);
7043	(void) soshutdownlock_final(so, SHUT_WR);
7044	(void) sodisconnectlocked(so);
7045
7046	/*
7047	* Explicitly handle connectionless-protocol disconnection
7048	* and release any remaining data in the socket buffers.
7049	*/
7050	if (!(so->so_state & SS_ISDISCONNECTED))
7051	(void) soisdisconnected(so);
7052
7053	if (so->so_error == `0`)
7054	so->so_error = EBADF;
7055
7056	if (rcv->sb_cc != `0`) {
7057	rcv->sb_flags &= ~SB_SEL;
7058	selthreadclear(&rcv->sb_sel);
7059	sbrelease(rcv);
7060	}
7061	if (snd->sb_cc != `0`) {
7062	snd->sb_flags &= ~SB_SEL;
7063	selthreadclear(&snd->sb_sel);
7064	sbrelease(snd);
7065	}
7066	so->so_state \|= SS_DEFUNCT;
7067	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7068
7069	done:
7070	return (`0`);
7071	}
7072
7073	int
7074	soresume(struct proc p, struct* socket so, int* locked)
7075	{
7076	if (locked == `0`)
7077	socket_lock(so, `1`);
7078
7079	if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7080	SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llx "
7081	"[%d,%d] resumed from bk idle\n",
7082	__func__, proc_selfpid(), proc_best_name(current_proc()),
7083	proc_pid(p), proc_best_name(p),
7084	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7085	SOCK_DOM(so), SOCK_TYPE(so));
7086
7087	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7088	so->so_extended_bk_start = `0`;
7089	OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7090
7091	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7092	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7093	VERIFY(soextbkidlestat.so_xbkidle_active >= `0`);
7094	}
7095	if (locked == `0`)
7096	socket_unlock(so, `1`);
7097
7098	return (`0`);
7099	}
7100
7101	/*
7102	* Does not attempt to account for sockets that are delegated from
7103	* the current process
7104	*/
7105	int
7106	so_set_extended_bk_idle(struct socket so, int* optval)
7107	{
7108	int error = `0`;
7109
7110	if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) \|\|
7111	SOCK_PROTO(so) != IPPROTO_TCP) {
7112	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7113	error = EOPNOTSUPP;
7114	} else if (optval == `0`) {
7115	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7116
7117	soresume(current_proc(), so, `1`);
7118	} else {
7119	struct proc *p = current_proc();
7120	int i;
7121	struct filedesc *fdp;
7122	int count = `0`;
7123
7124	/*
7125	* Unlock socket to avoid lock ordering issue with
7126	* the proc fd table lock
7127	*/
7128	socket_unlock(so, `0`);
7129
7130	proc_fdlock(p);
7131
7132	fdp = p->p_fd;
7133	for (i = `0`; i < fdp->fd_nfiles; i++) {
7134	struct fileproc *fp = fdp->fd_ofiles[i];
7135	struct socket *so2;
7136
7137	if (fp == NULL \|\|
7138	(fdp->fd_ofileflags[i] & UF_RESERVED) != `0` \|\|
7139	FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7140	continue;
7141
7142	so2 = (struct socket *)fp->f_fglob->fg_data;
7143	if (so != so2 &&
7144	so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED)
7145	count++;
7146	if (count >= soextbkidlestat.so_xbkidle_maxperproc)
7147	break;
7148	}
7149	proc_fdunlock(p);
7150
7151	socket_lock(so, `0`);
7152
7153	if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7154	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7155	error = EBUSY;
7156	} else if (so->so_flags & SOF_DELEGATED) {
7157	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7158	error = EBUSY;
7159	} else {
7160	so->so_flags1 \|= SOF1_EXTEND_BK_IDLE_WANTED;
7161	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7162	}
7163	SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d] "
7164	"%s marked for extended bk idle\n",
7165	__func__, proc_selfpid(), proc_best_name(current_proc()),
7166	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7167	SOCK_DOM(so), SOCK_TYPE(so),
7168	(so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7169	"is" : "not");
7170	}
7171
7172	return (error);
7173	}
7174
7175	static void
7176	so_stop_extended_bk_idle(struct socket *so)
7177	{
7178	so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7179	so->so_extended_bk_start = `0`;
7180
7181	OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7182	VERIFY(soextbkidlestat.so_xbkidle_active >= `0`);
7183	/*
7184	* Force defunct
7185	*/
7186	sosetdefunct(current_proc(), so,
7187	SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7188	if (so->so_flags & SOF_DEFUNCT) {
7189	sodefunct(current_proc(), so,
7190	SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7191	}
7192	}
7193
7194	void
7195	so_drain_extended_bk_idle(struct socket *so)
7196	{
7197	if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7198	/*
7199	* Only penalize sockets that have outstanding data
7200	*/
7201	if (so->so_rcv.sb_cc \|\| so->so_snd.sb_cc) {
7202	so_stop_extended_bk_idle(so);
7203
7204	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7205	}
7206	}
7207	}
7208
7209	/*
7210	* Return values tells if socket is still in extended background idle
7211	*/
7212	int
7213	so_check_extended_bk_idle_time(struct socket *so)
7214	{
7215	int ret = `1`;
7216
7217	if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7218	SODEFUNCTLOG("%s[%d, %s]: so 0x%llx [%d,%d]\n",
7219	__func__, proc_selfpid(), proc_best_name(current_proc()),
7220	(uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7221	SOCK_DOM(so), SOCK_TYPE(so));
7222	if (net_uptime() - so->so_extended_bk_start >
7223	soextbkidlestat.so_xbkidle_time) {
7224	so_stop_extended_bk_idle(so);
7225
7226	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7227
7228	ret = `0`;
7229	} else {
7230	struct inpcb inp = (struct* inpcb *)so->so_pcb;
7231
7232	inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7233	OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7234	}
7235	}
7236
7237	return (ret);
7238	}
7239
7240	void
7241	resume_proc_sockets(proc_t p)
7242	{
7243	if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7244	struct filedesc *fdp;
7245	int i;
7246
7247	proc_fdlock(p);
7248	fdp = p->p_fd;
7249	for (i = `0`; i < fdp->fd_nfiles; i++) {
7250	struct fileproc *fp;
7251	struct socket *so;
7252
7253	fp = fdp->fd_ofiles[i];
7254	if (fp == NULL \|\|
7255	(fdp->fd_ofileflags[i] & UF_RESERVED) != `0` \|\|
7256	FILEGLOB_DTYPE(fp->f_fglob) != DTYPE_SOCKET)
7257	continue;
7258
7259	so = (struct socket *)fp->f_fglob->fg_data;
7260	(void) soresume(p, so, `0`);
7261	}
7262	proc_fdunlock(p);
7263
7264	OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7265	}
7266	}
7267
7268	__private_extern__ int
7269	so_set_recv_anyif(struct socket so, int* optval)
7270	{
7271	int ret = `0`;
7272
7273	#if INET6
7274	if (SOCK_DOM(so) == PF_INET \|\| SOCK_DOM(so) == PF_INET6) {
7275	#else
7276	if (SOCK_DOM(so) == PF_INET) {
7277	#endif /* !INET6 */
7278	if (optval)
7279	sotoinpcb(so)->inp_flags \|= INP_RECV_ANYIF;
7280	else
7281	sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7282	}
7283
7284
7285	return (ret);
7286	}
7287
7288	__private_extern__ int
7289	so_get_recv_anyif(struct socket *so)
7290	{
7291	int ret = `0`;
7292
7293	#if INET6
7294	if (SOCK_DOM(so) == PF_INET \|\| SOCK_DOM(so) == PF_INET6) {
7295	#else
7296	if (SOCK_DOM(so) == PF_INET) {
7297	#endif /* !INET6 */
7298	ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? `1` : `0`;
7299	}
7300
7301	return (ret);
7302	}
7303
7304	int
7305	so_set_restrictions(struct socket *so, uint32_t vals)
7306	{
7307	int nocell_old, nocell_new;
7308	int noexpensive_old, noexpensive_new;
7309
7310	/*
7311	* Deny-type restrictions are trapdoors; once set they cannot be
7312	* unset for the lifetime of the socket. This allows them to be
7313	* issued by a framework on behalf of the application without
7314	* having to worry that they can be undone.
7315	*
7316	* Note here that socket-level restrictions overrides any protocol
7317	* level restrictions. For instance, SO_RESTRICT_DENY_CELLULAR
7318	* socket restriction issued on the socket has a higher precendence
7319	* than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7320	* policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7321	* i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7322	*/
7323	nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7324	noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7325	so->so_restrictions \|= (vals & (SO_RESTRICT_DENY_IN \|
7326	SO_RESTRICT_DENY_OUT \| SO_RESTRICT_DENY_CELLULAR \|
7327	SO_RESTRICT_DENY_EXPENSIVE));
7328	nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7329	noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7330
7331	/ we can only set, not clear restrictions /
7332	if ((nocell_new - nocell_old) == `0` &&
7333	(noexpensive_new - noexpensive_old) == `0`)
7334	return (`0`);
7335	#if INET6
7336	if (SOCK_DOM(so) == PF_INET \|\| SOCK_DOM(so) == PF_INET6) {
7337	#else
7338	if (SOCK_DOM(so) == PF_INET) {
7339	#endif /* !INET6 */
7340	if (nocell_new - nocell_old != `0`) {
7341	/*
7342	* if deny cellular is now set, do what's needed
7343	* for INPCB
7344	*/
7345	inp_set_nocellular(sotoinpcb(so));
7346	}
7347	if (noexpensive_new - noexpensive_old != `0`) {
7348	inp_set_noexpensive(sotoinpcb(so));
7349	}
7350	}
7351
7352	if (SOCK_DOM(so) == PF_MULTIPATH)
7353	mptcp_set_restrictions(so);
7354
7355	return (`0`);
7356	}
7357
7358	uint32_t
7359	so_get_restrictions(struct socket *so)
7360	{
7361	return (so->so_restrictions & (SO_RESTRICT_DENY_IN \|
7362	SO_RESTRICT_DENY_OUT \|
7363	SO_RESTRICT_DENY_CELLULAR \| SO_RESTRICT_DENY_EXPENSIVE));
7364	}
7365
7366	int
7367	so_set_effective_pid(struct socket so, int* epid, struct proc *p)
7368	{
7369	struct proc *ep = PROC_NULL;
7370	int error = `0`;
7371
7372	/ pid 0 is reserved for kernel /
7373	if (epid == `0`) {
7374	error = EINVAL;
7375	goto done;
7376	}
7377
7378	/*
7379	* If this is an in-kernel socket, prevent its delegate
7380	* association from changing unless the socket option is
7381	* coming from within the kernel itself.
7382	*/
7383	if (so->last_pid == `0` && p != kernproc) {
7384	error = EACCES;
7385	goto done;
7386	}
7387
7388	/*
7389	* If this is issued by a process that's recorded as the
7390	* real owner of the socket, or if the pid is the same as
7391	* the process's own pid, then proceed. Otherwise ensure
7392	* that the issuing process has the necessary privileges.
7393	*/
7394	if (epid != so->last_pid \|\| epid != proc_pid(p)) {
7395	if ((error = priv_check_cred(kauth_cred_get(),
7396	PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, `0`))) {
7397	error = EACCES;
7398	goto done;
7399	}
7400	}
7401
7402	/ Find the process that corresponds to the effective pid /
7403	if ((ep = proc_find(epid)) == PROC_NULL) {
7404	error = ESRCH;
7405	goto done;
7406	}
7407
7408	/*
7409	* If a process tries to delegate the socket to itself, then
7410	* there's really nothing to do; treat it as a way for the
7411	* delegate association to be cleared. Note that we check
7412	* the passed-in proc rather than calling proc_selfpid(),
7413	* as we need to check the process issuing the socket option
7414	* which could be kernproc. Given that we don't allow 0 for
7415	* effective pid, it means that a delegated in-kernel socket
7416	* stays delegated during its lifetime (which is probably OK.)
7417	*/
7418	if (epid == proc_pid(p)) {
7419	so->so_flags &= ~SOF_DELEGATED;
7420	so->e_upid = `0`;
7421	so->e_pid = `0`;
7422	uuid_clear(so->e_uuid);
7423	} else {
7424	so->so_flags \|= SOF_DELEGATED;
7425	so->e_upid = proc_uniqueid(ep);
7426	so->e_pid = proc_pid(ep);
7427	proc_getexecutableuuid(ep, so->e_uuid, sizeof (so->e_uuid));
7428	}
7429	done:
7430	if (error == `0` && net_io_policy_log) {
7431	uuid_string_t buf;
7432
7433	uuid_unparse(so->e_uuid, buf);
7434	log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7435	"euuid %s%s\n", __func__, proc_name_address(p),
7436	proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7437	SOCK_DOM(so), SOCK_TYPE(so),
7438	so->e_pid, proc_name_address(ep), buf,
7439	((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7440	} else if (error != `0` && net_io_policy_log) {
7441	log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7442	"ERROR (%d)\n", __func__, proc_name_address(p),
7443	proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7444	SOCK_DOM(so), SOCK_TYPE(so),
7445	epid, (ep == PROC_NULL) ? "PROC_NULL" :
7446	proc_name_address(ep), error);
7447	}
7448
7449	/ Update this socket's policy upon success /
7450	if (error == `0`) {
7451	so->so_policy_gencnt *= -`1`;
7452	so_update_policy(so);
7453	#if NECP
7454	so_update_necp_policy(so, NULL, NULL);
7455	#endif /* NECP */
7456	}
7457
7458	if (ep != PROC_NULL)
7459	proc_rele(ep);
7460
7461	return (error);
7462	}
7463
7464	int
7465	so_set_effective_uuid(struct socket so, uuid_t euuid, struct* proc *p)
7466	{
7467	uuid_string_t buf;
7468	uuid_t uuid;
7469	int error = `0`;
7470
7471	/ UUID must not be all-zeroes (reserved for kernel) /
7472	if (uuid_is_null(euuid)) {
7473	error = EINVAL;
7474	goto done;
7475	}
7476
7477	/*
7478	* If this is an in-kernel socket, prevent its delegate
7479	* association from changing unless the socket option is
7480	* coming from within the kernel itself.
7481	*/
7482	if (so->last_pid == `0` && p != kernproc) {
7483	error = EACCES;
7484	goto done;
7485	}
7486
7487	/ Get the UUID of the issuing process /
7488	proc_getexecutableuuid(p, uuid, sizeof (uuid));
7489
7490	/*
7491	* If this is issued by a process that's recorded as the
7492	* real owner of the socket, or if the uuid is the same as
7493	* the process's own uuid, then proceed. Otherwise ensure
7494	* that the issuing process has the necessary privileges.
7495	*/
7496	if (uuid_compare(euuid, so->last_uuid) != `0` \|\|
7497	uuid_compare(euuid, uuid) != `0`) {
7498	if ((error = priv_check_cred(kauth_cred_get(),
7499	PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, `0`))) {
7500	error = EACCES;
7501	goto done;
7502	}
7503	}
7504
7505	/*
7506	* If a process tries to delegate the socket to itself, then
7507	* there's really nothing to do; treat it as a way for the
7508	* delegate association to be cleared. Note that we check
7509	* the uuid of the passed-in proc rather than that of the
7510	* current process, as we need to check the process issuing
7511	* the socket option which could be kernproc itself. Given
7512	* that we don't allow 0 for effective uuid, it means that
7513	* a delegated in-kernel socket stays delegated during its
7514	* lifetime (which is okay.)
7515	*/
7516	if (uuid_compare(euuid, uuid) == `0`) {
7517	so->so_flags &= ~SOF_DELEGATED;
7518	so->e_upid = `0`;
7519	so->e_pid = `0`;
7520	uuid_clear(so->e_uuid);
7521	} else {
7522	so->so_flags \|= SOF_DELEGATED;
7523	/*
7524	* Unlike so_set_effective_pid(), we only have the UUID
7525	* here and the process ID is not known. Inherit the
7526	* real {pid,upid} of the socket.
7527	*/
7528	so->e_upid = so->last_upid;
7529	so->e_pid = so->last_pid;
7530	uuid_copy(so->e_uuid, euuid);
7531	}
7532
7533	done:
7534	if (error == `0` && net_io_policy_log) {
7535	uuid_unparse(so->e_uuid, buf);
7536	log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
7537	"euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
7538	(uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7539	SOCK_TYPE(so), so->e_pid, buf,
7540	((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7541	} else if (error != `0` && net_io_policy_log) {
7542	uuid_unparse(euuid, buf);
7543	log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
7544	"ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
7545	(uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
7546	SOCK_TYPE(so), buf, error);
7547	}
7548
7549	/ Update this socket's policy upon success /
7550	if (error == `0`) {
7551	so->so_policy_gencnt *= -`1`;
7552	so_update_policy(so);
7553	#if NECP
7554	so_update_necp_policy(so, NULL, NULL);
7555	#endif /* NECP */
7556	}
7557
7558	return (error);
7559	}
7560
7561	void
7562	netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
7563	uint32_t ev_datalen)
7564	{
7565	struct kev_msg ev_msg;
7566
7567	/*
7568	* A netpolicy event always starts with a netpolicy_event_data
7569	* structure, but the caller can provide for a longer event
7570	* structure to post, depending on the event code.
7571	*/
7572	VERIFY(ev_data != NULL && ev_datalen >= sizeof (*ev_data));
7573
7574	bzero(&ev_msg, sizeof (ev_msg));
7575	ev_msg.vendor_code = KEV_VENDOR_APPLE;
7576	ev_msg.kev_class = KEV_NETWORK_CLASS;
7577	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
7578	ev_msg.event_code = ev_code;
7579
7580	ev_msg.dv[`0`].data_ptr = ev_data;
7581	ev_msg.dv[`0`].data_length = ev_datalen;
7582
7583	kev_post_msg(&ev_msg);
7584	}
7585
7586	void
7587	socket_post_kev_msg(uint32_t ev_code,
7588	struct kev_socket_event_data *ev_data,
7589	uint32_t ev_datalen)
7590	{
7591	struct kev_msg ev_msg;
7592
7593	bzero(&ev_msg, sizeof(ev_msg));
7594	ev_msg.vendor_code = KEV_VENDOR_APPLE;
7595	ev_msg.kev_class = KEV_NETWORK_CLASS;
7596	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
7597	ev_msg.event_code = ev_code;
7598
7599	ev_msg.dv[`0`].data_ptr = ev_data;
7600	ev_msg.dv[`0`]. data_length = ev_datalen;
7601
7602	kev_post_msg(&ev_msg);
7603	}
7604
7605	void
7606	socket_post_kev_msg_closed(struct socket *so)
7607	{
7608	struct kev_socket_closed ev;
7609	struct sockaddr socksa = NULL, peersa = NULL;
7610	int err;
7611	bzero(&ev, sizeof(ev));
7612	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
7613	if (err == `0`) {
7614	err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
7615	&peersa);
7616	if (err == `0`) {
7617	memcpy(&ev.ev_data.kev_sockname, socksa,
7618	min(socksa->sa_len,
7619	sizeof (ev.ev_data.kev_sockname)));
7620	memcpy(&ev.ev_data.kev_peername, peersa,
7621	min(peersa->sa_len,
7622	sizeof (ev.ev_data.kev_peername)));
7623	socket_post_kev_msg(KEV_SOCKET_CLOSED,
7624	&ev.ev_data, sizeof (ev));
7625	}
7626	}
7627	if (socksa != NULL)
7628	FREE(socksa, M_SONAME);
7629	if (peersa != NULL)
7630	FREE(peersa, M_SONAME);
7631	}
7632

Browse the source code of xnu/bsd/kern/uipc_socket.c