1 | /* |
2 | * Copyright (c) 1998-2022 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | /* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */ |
29 | /* |
30 | * Copyright (c) 1982, 1986, 1988, 1990, 1993 |
31 | * The Regents of the University of California. All rights reserved. |
32 | * |
33 | * Redistribution and use in source and binary forms, with or without |
34 | * modification, are permitted provided that the following conditions |
35 | * are met: |
36 | * 1. Redistributions of source code must retain the above copyright |
37 | * notice, this list of conditions and the following disclaimer. |
38 | * 2. Redistributions in binary form must reproduce the above copyright |
39 | * notice, this list of conditions and the following disclaimer in the |
40 | * documentation and/or other materials provided with the distribution. |
41 | * 3. All advertising materials mentioning features or use of this software |
42 | * must display the following acknowledgement: |
43 | * This product includes software developed by the University of |
44 | * California, Berkeley and its contributors. |
45 | * 4. Neither the name of the University nor the names of its contributors |
46 | * may be used to endorse or promote products derived from this software |
47 | * without specific prior written permission. |
48 | * |
49 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
50 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
51 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
52 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
53 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
54 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
55 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
56 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
57 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
58 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
59 | * SUCH DAMAGE. |
60 | * |
61 | * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 |
62 | */ |
63 | /* |
64 | * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce |
65 | * support for mandatory and extensible security protections. This notice |
66 | * is included in support of clause 2.2 (b) of the Apple Public License, |
67 | * Version 2.0. |
68 | */ |
69 | |
70 | #include <sys/param.h> |
71 | #include <sys/systm.h> |
72 | #include <sys/filedesc.h> |
73 | #include <sys/proc.h> |
74 | #include <sys/proc_internal.h> |
75 | #include <sys/kauth.h> |
76 | #include <sys/file_internal.h> |
77 | #include <sys/fcntl.h> |
78 | #include <sys/malloc.h> |
79 | #include <sys/mbuf.h> |
80 | #include <sys/domain.h> |
81 | #include <sys/kernel.h> |
82 | #include <sys/event.h> |
83 | #include <sys/poll.h> |
84 | #include <sys/protosw.h> |
85 | #include <sys/socket.h> |
86 | #include <sys/socketvar.h> |
87 | #include <sys/resourcevar.h> |
88 | #include <sys/signalvar.h> |
89 | #include <sys/sysctl.h> |
90 | #include <sys/syslog.h> |
91 | #include <sys/uio.h> |
92 | #include <sys/uio_internal.h> |
93 | #include <sys/ev.h> |
94 | #include <sys/kdebug.h> |
95 | #include <sys/un.h> |
96 | #include <sys/user.h> |
97 | #include <sys/priv.h> |
98 | #include <sys/kern_event.h> |
99 | #include <sys/persona.h> |
100 | #include <net/route.h> |
101 | #include <net/init.h> |
102 | #include <net/net_api_stats.h> |
103 | #include <net/ntstat.h> |
104 | #include <net/content_filter.h> |
105 | #include <netinet/in.h> |
106 | #include <netinet/in_pcb.h> |
107 | #include <netinet/in_tclass.h> |
108 | #include <netinet/in_var.h> |
109 | #include <netinet/tcp_var.h> |
110 | #include <netinet/ip6.h> |
111 | #include <netinet6/ip6_var.h> |
112 | #include <netinet/flow_divert.h> |
113 | #include <kern/zalloc.h> |
114 | #include <kern/locks.h> |
115 | #include <machine/limits.h> |
116 | #include <libkern/OSAtomic.h> |
117 | #include <pexpert/pexpert.h> |
118 | #include <kern/assert.h> |
119 | #include <kern/task.h> |
120 | #include <kern/policy_internal.h> |
121 | |
122 | #include <sys/kpi_mbuf.h> |
123 | #include <sys/mcache.h> |
124 | #include <sys/unpcb.h> |
125 | #include <libkern/section_keywords.h> |
126 | |
127 | #include <os/log.h> |
128 | |
129 | #if CONFIG_MACF |
130 | #include <security/mac_framework.h> |
131 | #endif /* MAC */ |
132 | |
133 | #if MULTIPATH |
134 | #include <netinet/mp_pcb.h> |
135 | #include <netinet/mptcp_var.h> |
136 | #endif /* MULTIPATH */ |
137 | |
138 | #define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1))) |
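/*
 * ROUNDUP(a, b) rounds (a) up to the next multiple of (b); (b) must be a
 * power of two, e.g. ROUNDUP(10, 8) == 16 and ROUNDUP(16, 8) == 16.
 */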
139 | |
140 | #if DEBUG || DEVELOPMENT |
141 | #define DEBUG_KERNEL_ADDRPERM(_v) (_v) |
142 | #else |
143 | #define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v) |
144 | #endif |
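/*
 * DEBUG_KERNEL_ADDRPERM() passes pointers through unchanged on DEBUG or
 * DEVELOPMENT kernels and obfuscates them with VM_KERNEL_ADDRPERM() on
 * release kernels.
 */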
145 | |
146 | /* TODO: this should be in a header file somewhere */ |
147 | extern char *proc_name_address(void *p); |
148 | |
149 | static u_int32_t so_cache_hw; /* High water mark for socache */ |
150 | static u_int32_t so_cache_timeouts; /* number of timeouts */ |
151 | static u_int32_t so_cache_max_freed; /* max freed per timeout */ |
152 | static u_int32_t cached_sock_count = 0; |
153 | STAILQ_HEAD(, socket) so_cache_head; |
154 | int max_cached_sock_count = MAX_CACHED_SOCKETS; |
155 | static uint64_t so_cache_time; |
156 | static int socketinit_done; |
157 | static struct zone *so_cache_zone; |
158 | |
static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
160 | static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp); |
161 | |
162 | #include <machine/limits.h> |
163 | |
164 | static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev); |
165 | static void filt_sordetach(struct knote *kn); |
166 | static int filt_soread(struct knote *kn, long hint); |
167 | static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev); |
168 | static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev); |
169 | |
170 | static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev); |
171 | static void filt_sowdetach(struct knote *kn); |
172 | static int filt_sowrite(struct knote *kn, long hint); |
173 | static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev); |
174 | static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev); |
175 | |
176 | static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev); |
177 | static void filt_sockdetach(struct knote *kn); |
178 | static int filt_sockev(struct knote *kn, long hint); |
179 | static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev); |
180 | static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev); |
181 | |
182 | static int sooptcopyin_timeval(struct sockopt *, struct timeval *); |
183 | static int sooptcopyout_timeval(struct sockopt *, const struct timeval *); |
184 | |
185 | SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = { |
186 | .f_isfd = 1, |
187 | .f_attach = filt_sorattach, |
188 | .f_detach = filt_sordetach, |
189 | .f_event = filt_soread, |
190 | .f_touch = filt_sortouch, |
191 | .f_process = filt_sorprocess, |
192 | }; |
193 | |
194 | SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = { |
195 | .f_isfd = 1, |
196 | .f_attach = filt_sowattach, |
197 | .f_detach = filt_sowdetach, |
198 | .f_event = filt_sowrite, |
199 | .f_touch = filt_sowtouch, |
200 | .f_process = filt_sowprocess, |
201 | }; |
202 | |
203 | SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = { |
204 | .f_isfd = 1, |
205 | .f_attach = filt_sockattach, |
206 | .f_detach = filt_sockdetach, |
207 | .f_event = filt_sockev, |
208 | .f_touch = filt_socktouch, |
209 | .f_process = filt_sockprocess, |
210 | }; |
211 | |
212 | SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = { |
213 | .f_isfd = 1, |
214 | .f_attach = filt_sorattach, |
215 | .f_detach = filt_sordetach, |
216 | .f_event = filt_soread, |
217 | .f_touch = filt_sortouch, |
218 | .f_process = filt_sorprocess, |
219 | }; |
220 | |
221 | SYSCTL_DECL(_kern_ipc); |
222 | |
223 | #define EVEN_MORE_LOCKING_DEBUG 0 |
224 | |
225 | int socket_debug = 0; |
226 | SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug, |
    CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
228 | |
229 | #if (DEBUG || DEVELOPMENT) |
230 | #define DEFAULT_SOSEND_ASSERT_PANIC 1 |
231 | #else |
232 | #define DEFAULT_SOSEND_ASSERT_PANIC 0 |
233 | #endif /* (DEBUG || DEVELOPMENT) */ |
234 | |
235 | int sosend_assert_panic = 0; |
236 | SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic, |
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");
238 | |
239 | static unsigned long sodefunct_calls = 0; |
240 | SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED, |
    &sodefunct_calls, "");
242 | |
ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
244 | so_gen_t so_gencnt; /* generation count for sockets */ |
245 | |
MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
247 | |
248 | #define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0) |
249 | #define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2) |
250 | #define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1) |
251 | #define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3) |
252 | #define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1) |
253 | #define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3) |
254 | #define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8)) |
255 | #define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3) |
256 | #define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8)) |
257 | |
258 | #define MAX_SOOPTGETM_SIZE (128 * MCLBYTES) |
259 | |
260 | int somaxconn = SOMAXCONN; |
261 | SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, |
    CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
263 | |
264 | /* Should we get a maximum also ??? */ |
265 | static int sosendmaxchain = 65536; |
266 | static int sosendminchain = 16384; |
267 | static int sorecvmincopy = 16384; |
268 | SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain, |
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
    CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
272 | |
273 | /* |
274 | * Set to enable jumbo clusters (if available) for large writes when |
275 | * the socket is marked with SOF_MULTIPAGES; see below. |
276 | */ |
277 | int sosendjcl = 1; |
278 | SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl, |
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
280 | |
281 | /* |
282 | * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large |
283 | * writes on the socket for all protocols on any network interfaces, |
284 | * depending upon sosendjcl above. Be extra careful when setting this |
 * to 1, because sending packets that cross physical pages down to
286 | * broken drivers (those that falsely assume that the physical pages |
287 | * are contiguous) might lead to system panics or silent data corruption. |
288 | * When set to 0, the system will respect SOF_MULTIPAGES, which is set |
289 | * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES |
290 | * capable. Set this to 1 only for testing/debugging purposes. |
291 | */ |
292 | int sosendjcl_ignore_capab = 0; |
293 | SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab, |
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
295 | |
296 | /* |
297 | * Set this to ignore SOF1_IF_2KCL and use big clusters for large |
298 | * writes on the socket for all protocols on any network interfaces. |
299 | * Be extra careful when setting this to 1, because sending down packets with |
 * clusters larger than 2 KB might lead to system panics or data corruption.
 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
 * on the outgoing interface.
303 | * Set this to 1 for testing/debugging purposes only. |
304 | */ |
305 | int sosendbigcl_ignore_capab = 0; |
306 | SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab, |
    CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
308 | |
309 | int sodefunctlog = 0; |
310 | SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED, |
    &sodefunctlog, 0, "");
312 | |
313 | int sothrottlelog = 0; |
314 | SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED, |
    &sothrottlelog, 0, "");
316 | |
317 | int sorestrictrecv = 1; |
318 | SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED, |
    &sorestrictrecv, 0, "Enable inbound interface restrictions");
320 | |
321 | int sorestrictsend = 1; |
322 | SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED, |
    &sorestrictsend, 0, "Enable outbound interface restrictions");
324 | |
325 | int soreserveheadroom = 1; |
326 | SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED, |
    &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
328 | |
329 | #if (DEBUG || DEVELOPMENT) |
330 | int so_notsent_lowat_check = 1; |
331 | SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED, |
    &so_notsent_lowat_check, 0, "enable/disable notsent lowat check");
333 | #endif /* DEBUG || DEVELOPMENT */ |
334 | |
335 | int so_accept_list_waits = 0; |
336 | #if (DEBUG || DEVELOPMENT) |
337 | SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED, |
    &so_accept_list_waits, 0, "number of waits for listener incomp list");
339 | #endif /* DEBUG || DEVELOPMENT */ |
340 | |
341 | extern struct inpcbinfo tcbinfo; |
342 | |
343 | /* TODO: these should be in header file */ |
344 | extern int get_inpcb_str_size(void); |
345 | extern int get_tcp_str_size(void); |
346 | |
347 | vm_size_t so_cache_zone_element_size; |
348 | |
349 | static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **, |
350 | user_ssize_t *); |
351 | static void cached_sock_alloc(struct socket **, zalloc_flags_t); |
352 | static void cached_sock_free(struct socket *); |
353 | |
354 | /* |
 * Maximum number of extended background idle sockets per process.
 * Set to zero to disable further setting of the option.
357 | */ |
358 | |
359 | #define SO_IDLE_BK_IDLE_MAX_PER_PROC 1 |
360 | #define SO_IDLE_BK_IDLE_TIME 600 |
361 | #define SO_IDLE_BK_IDLE_RCV_HIWAT 131072 |
362 | |
363 | struct soextbkidlestat soextbkidlestat; |
364 | |
365 | SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc, |
366 | CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0, |
367 | "Maximum of extended background idle sockets per process" ); |
368 | |
369 | SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED, |
370 | &soextbkidlestat.so_xbkidle_time, 0, |
371 | "Time in seconds to keep extended background idle sockets" ); |
372 | |
373 | SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED, |
374 | &soextbkidlestat.so_xbkidle_rcvhiwat, 0, |
375 | "High water mark for extended background idle sockets" ); |
376 | |
377 | SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED, |
    &soextbkidlestat, soextbkidlestat, "");
379 | |
380 | int so_set_extended_bk_idle(struct socket *, int); |
381 | |
382 | #define SO_MAX_MSG_X 1024 |
383 | |
384 | /* |
385 | * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from |
386 | * setting the DSCP code on the packet based on the service class; see |
387 | * <rdar://problem/11277343> for details. |
388 | */ |
389 | __private_extern__ u_int32_t sotcdb = 0; |
390 | SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED, |
    &sotcdb, 0, "");
392 | |
393 | void |
394 | socketinit(void) |
395 | { |
396 | _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t)); |
397 | VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t))); |
398 | |
399 | #ifdef __LP64__ |
400 | _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints)); |
401 | _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif)); |
402 | _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr)); |
403 | _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen)); |
404 | _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr)); |
405 | _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen)); |
406 | #else |
407 | _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints)); |
408 | _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif)); |
409 | _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr)); |
410 | _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen)); |
411 | _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr)); |
412 | _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen)); |
413 | #endif |
414 | |
415 | if (socketinit_done) { |
416 | printf("socketinit: already called...\n" ); |
417 | return; |
418 | } |
419 | socketinit_done = 1; |
420 | |
    PE_parse_boot_argn("socket_debug", &socket_debug,
        sizeof(socket_debug));

    PE_parse_boot_argn("sosend_assert_panic", &sosend_assert_panic,
        sizeof(sosend_assert_panic));
426 | |
427 | STAILQ_INIT(&so_cache_head); |
428 | |
429 | so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4 |
430 | + get_inpcb_str_size() + 4 + get_tcp_str_size()); |
431 | |
    so_cache_zone = zone_create("socache zone", so_cache_zone_element_size,
        ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);
434 | |
    bzero(&soextbkidlestat, sizeof(struct soextbkidlestat));
436 | soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC; |
437 | soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME; |
438 | soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT; |
439 | |
440 | in_pcbinit(); |
441 | } |
442 | |
443 | static void |
444 | cached_sock_alloc(struct socket **so, zalloc_flags_t how) |
445 | { |
446 | caddr_t temp; |
447 | uintptr_t offset; |
448 | |
    lck_mtx_lock(&so_cache_mtx);
450 | |
451 | if (!STAILQ_EMPTY(&so_cache_head)) { |
452 | VERIFY(cached_sock_count > 0); |
453 | |
454 | *so = STAILQ_FIRST(&so_cache_head); |
455 | STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent); |
456 | STAILQ_NEXT((*so), so_cache_ent) = NULL; |
457 | |
458 | cached_sock_count--; |
        lck_mtx_unlock(&so_cache_mtx);

        temp = (*so)->so_saved_pcb;
        bzero((caddr_t)*so, sizeof(struct socket));
463 | |
464 | (*so)->so_saved_pcb = temp; |
465 | } else { |
        lck_mtx_unlock(&so_cache_mtx);
467 | |
468 | *so = zalloc_flags(so_cache_zone, how | Z_ZERO); |
469 | |
470 | /* |
471 | * Define offsets for extra structures into our |
472 | * single block of memory. Align extra structures |
473 | * on longword boundaries. |
474 | */ |
475 | |
476 | offset = (uintptr_t)*so; |
477 | offset += sizeof(struct socket); |
478 | |
479 | offset = ALIGN(offset); |
480 | |
481 | (*so)->so_saved_pcb = (caddr_t)offset; |
482 | offset += get_inpcb_str_size(); |
483 | |
484 | offset = ALIGN(offset); |
485 | |
486 | ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb = |
487 | (caddr_t)offset; |
488 | } |
489 | |
490 | OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1); |
491 | } |
492 | |
493 | static void |
494 | cached_sock_free(struct socket *so) |
495 | { |
    lck_mtx_lock(&so_cache_mtx);
497 | |
498 | so_cache_time = net_uptime(); |
499 | if (++cached_sock_count > max_cached_sock_count) { |
500 | --cached_sock_count; |
        lck_mtx_unlock(&so_cache_mtx);
502 | zfree(so_cache_zone, so); |
503 | } else { |
504 | if (so_cache_hw < cached_sock_count) { |
505 | so_cache_hw = cached_sock_count; |
506 | } |
507 | |
508 | STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent); |
509 | |
510 | so->cache_timestamp = so_cache_time; |
        lck_mtx_unlock(&so_cache_mtx);
512 | } |
513 | } |
514 | |
515 | void |
516 | so_update_last_owner_locked(struct socket *so, proc_t self) |
517 | { |
518 | if (so->last_pid != 0) { |
519 | /* |
520 | * last_pid and last_upid should remain zero for sockets |
521 | * created using sock_socket. The check above achieves that |
522 | */ |
523 | if (self == PROC_NULL) { |
524 | self = current_proc(); |
525 | } |
526 | |
527 | if (so->last_upid != proc_uniqueid(self) || |
528 | so->last_pid != proc_pid(self)) { |
529 | so->last_upid = proc_uniqueid(self); |
530 | so->last_pid = proc_pid(self); |
531 | proc_getexecutableuuid(self, so->last_uuid, |
532 | sizeof(so->last_uuid)); |
533 | if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { |
534 | (*so->so_proto->pr_update_last_owner)(so, self, NULL); |
535 | } |
536 | } |
        proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));
538 | } |
539 | } |
540 | |
541 | void |
542 | so_update_policy(struct socket *so) |
543 | { |
544 | if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { |
545 | (void) inp_update_policy(sotoinpcb(so)); |
546 | } |
547 | } |
548 | |
549 | #if NECP |
550 | static void |
551 | so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr, |
552 | struct sockaddr *override_remote_addr) |
553 | { |
554 | if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { |
555 | inp_update_necp_policy(sotoinpcb(so), override_local_addr, |
556 | override_remote_addr, 0); |
557 | } |
558 | } |
559 | #endif /* NECP */ |
560 | |
561 | boolean_t |
562 | so_cache_timer(void) |
563 | { |
564 | struct socket *p; |
565 | int n_freed = 0; |
566 | boolean_t rc = FALSE; |
567 | |
    lck_mtx_lock(&so_cache_mtx);
569 | so_cache_timeouts++; |
570 | so_cache_time = net_uptime(); |
571 | |
572 | while (!STAILQ_EMPTY(&so_cache_head)) { |
573 | VERIFY(cached_sock_count > 0); |
574 | p = STAILQ_FIRST(&so_cache_head); |
575 | if ((so_cache_time - p->cache_timestamp) < |
576 | SO_CACHE_TIME_LIMIT) { |
577 | break; |
578 | } |
579 | |
580 | STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent); |
581 | --cached_sock_count; |
582 | |
583 | zfree(so_cache_zone, p); |
584 | |
585 | if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) { |
586 | so_cache_max_freed++; |
587 | break; |
588 | } |
589 | } |
590 | |
591 | /* Schedule again if there is more to cleanup */ |
592 | if (!STAILQ_EMPTY(&so_cache_head)) { |
593 | rc = TRUE; |
594 | } |
595 | |
    lck_mtx_unlock(&so_cache_mtx);
597 | return rc; |
598 | } |
599 | |
600 | /* |
601 | * Get a socket structure from our zone, and initialize it. |
602 | * We don't implement `waitok' yet (see comments in uipc_domain.c). |
603 | * Note that it would probably be better to allocate socket |
604 | * and PCB at the same time, but I'm not convinced that all |
605 | * the protocols can be easily modified to do this. |
606 | */ |
607 | struct socket * |
608 | soalloc(int waitok, int dom, int type) |
609 | { |
610 | zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT; |
611 | struct socket *so; |
612 | |
613 | if ((dom == PF_INET) && (type == SOCK_STREAM)) { |
        cached_sock_alloc(&so, how);
615 | } else { |
616 | so = zalloc_flags(socket_zone, how | Z_ZERO); |
617 | } |
618 | if (so != NULL) { |
        so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
620 | |
621 | /* |
622 | * Increment the socket allocation statistics |
623 | */ |
624 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total); |
625 | } |
626 | |
627 | return so; |
628 | } |
629 | |
630 | int |
631 | socreate_internal(int dom, struct socket **aso, int type, int proto, |
632 | struct proc *p, uint32_t flags, struct proc *ep) |
633 | { |
634 | struct protosw *prp; |
635 | struct socket *so; |
636 | int error = 0; |
637 | pid_t rpid = -1; |
638 | |
639 | #if TCPDEBUG |
640 | extern int tcpconsdebug; |
641 | #endif |
642 | |
643 | VERIFY(aso != NULL); |
644 | *aso = NULL; |
645 | |
646 | if (proto != 0) { |
        prp = pffindproto(dom, proto, type);
648 | } else { |
649 | prp = pffindtype(dom, type); |
650 | } |
651 | |
652 | if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) { |
653 | if (pffinddomain(dom) == NULL) { |
654 | return EAFNOSUPPORT; |
655 | } |
656 | if (proto != 0) { |
657 | if (pffindprotonotype(dom, proto) != NULL) { |
658 | return EPROTOTYPE; |
659 | } |
660 | } |
661 | return EPROTONOSUPPORT; |
662 | } |
663 | if (prp->pr_type != type) { |
664 | return EPROTOTYPE; |
665 | } |
    so = soalloc(1, dom, type);
667 | if (so == NULL) { |
668 | return ENOBUFS; |
669 | } |
670 | |
671 | switch (dom) { |
672 | case PF_LOCAL: |
673 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total); |
674 | break; |
675 | case PF_INET: |
676 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total); |
677 | if (type == SOCK_STREAM) { |
678 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total); |
679 | } else { |
680 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total); |
681 | } |
682 | break; |
683 | case PF_ROUTE: |
684 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total); |
685 | break; |
686 | case PF_NDRV: |
687 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total); |
688 | break; |
689 | case PF_KEY: |
690 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total); |
691 | break; |
692 | case PF_INET6: |
693 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total); |
694 | if (type == SOCK_STREAM) { |
695 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total); |
696 | } else { |
697 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total); |
698 | } |
699 | break; |
700 | case PF_SYSTEM: |
701 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total); |
702 | break; |
703 | case PF_MULTIPATH: |
704 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total); |
705 | break; |
706 | default: |
707 | INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total); |
708 | break; |
709 | } |
710 | |
711 | if (flags & SOCF_MPTCP) { |
712 | so->so_state |= SS_NBIO; |
713 | } |
714 | |
715 | TAILQ_INIT(&so->so_incomp); |
716 | TAILQ_INIT(&so->so_comp); |
717 | so->so_type = (short)type; |
718 | so->so_family = prp->pr_domain->dom_family; |
719 | so->so_protocol = prp->pr_protocol; |
720 | so->last_upid = proc_uniqueid(p); |
721 | so->last_pid = proc_pid(p); |
722 | proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid)); |
    proc_pidoriginatoruuid(so->so_vuuid, sizeof(so->so_vuuid));

    so->so_rpid = -1;
    uuid_clear(so->so_ruuid);
727 | |
728 | if (ep != PROC_NULL && ep != p) { |
729 | so->e_upid = proc_uniqueid(ep); |
730 | so->e_pid = proc_pid(ep); |
731 | proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid)); |
732 | so->so_flags |= SOF_DELEGATED; |
733 | if (ep->p_responsible_pid != so->e_pid) { |
734 | rpid = ep->p_responsible_pid; |
735 | so->so_rpid = rpid; |
736 | proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid)); |
737 | } |
738 | } |
739 | |
740 | if (rpid < 0 && p->p_responsible_pid != so->last_pid) { |
741 | rpid = p->p_responsible_pid; |
742 | so->so_rpid = rpid; |
743 | proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid)); |
744 | } |
745 | |
    so->so_cred = kauth_cred_proc_ref(p);
    if (!suser(kauth_cred_get(), NULL)) {
748 | so->so_state |= SS_PRIV; |
749 | } |
750 | |
751 | so->so_persona_id = current_persona_get_id(); |
752 | so->so_proto = prp; |
753 | so->so_rcv.sb_flags |= SB_RECV; |
754 | so->so_rcv.sb_so = so->so_snd.sb_so = so; |
755 | so->next_lock_lr = 0; |
756 | so->next_unlock_lr = 0; |
757 | |
758 | /* |
     * Attachment will create the per-pcb lock if necessary and
     * increase the refcount for creation; make sure this is done
     * before the socket is inserted in any lists.
762 | */ |
763 | so->so_usecount++; |
764 | |
765 | error = (*prp->pr_usrreqs->pru_attach)(so, proto, p); |
766 | if (error != 0) { |
767 | /* |
768 | * Warning: |
769 | * If so_pcb is not zero, the socket will be leaked, |
         * so the protocol attachment handler must be coded carefully.
771 | */ |
772 | if (so->so_pcb != NULL) { |
773 | os_log_error(OS_LOG_DEFAULT, |
774 | "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d" , |
775 | error, dom, proto, type); |
776 | } |
777 | /* |
778 | * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket |
779 | */ |
780 | so->so_state |= SS_NOFDREF; |
781 | so->so_flags |= SOF_PCBCLEARING; |
782 | VERIFY(so->so_usecount > 0); |
783 | so->so_usecount--; |
784 | sofreelastref(so, 1); /* will deallocate the socket */ |
785 | return error; |
786 | } |
787 | |
788 | /* |
789 | * Note: needs so_pcb to be set after pru_attach |
790 | */ |
791 | if (prp->pr_update_last_owner != NULL) { |
792 | (*prp->pr_update_last_owner)(so, p, ep); |
793 | } |
794 | |
795 | os_atomic_inc(&prp->pr_domain->dom_refs, relaxed); |
796 | |
797 | /* Attach socket filters for this protocol */ |
798 | sflt_initsock(so); |
799 | #if TCPDEBUG |
800 | if (tcpconsdebug == 2) { |
801 | so->so_options |= SO_DEBUG; |
802 | } |
803 | #endif |
804 | so_set_default_traffic_class(so); |
805 | |
806 | /* |
807 | * If this thread or task is marked to create backgrounded sockets, |
808 | * mark the socket as background. |
809 | */ |
810 | if (!(flags & SOCF_MPTCP) && |
        proc_get_effective_thread_policy(current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
812 | socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND); |
813 | so->so_background_thread = current_thread(); |
814 | } |
815 | |
816 | switch (dom) { |
817 | /* |
818 | * Don't mark Unix domain or system |
819 | * eligible for defunct by default. |
820 | */ |
821 | case PF_LOCAL: |
822 | case PF_SYSTEM: |
823 | so->so_flags |= SOF_NODEFUNCT; |
824 | break; |
825 | default: |
826 | break; |
827 | } |
828 | |
829 | /* |
830 | * Entitlements can't be checked at socket creation time except if the |
     * application requested a feature guarded by a privilege (cf. socket
     * delegation).
     * The priv(9) and the Sandboxing APIs are designed with the idea that
     * a privilege check should only be triggered by a userland request.
     * A privilege check at socket creation time is time-consuming and
836 | * could trigger many authorisation error messages from the security |
837 | * APIs. |
838 | */ |
839 | |
840 | *aso = so; |
841 | |
842 | return 0; |
843 | } |
844 | |
845 | /* |
846 | * Returns: 0 Success |
847 | * EAFNOSUPPORT |
848 | * EPROTOTYPE |
849 | * EPROTONOSUPPORT |
850 | * ENOBUFS |
851 | * <pru_attach>:ENOBUFS[AF_UNIX] |
852 | * <pru_attach>:ENOBUFS[TCP] |
853 | * <pru_attach>:ENOMEM[TCP] |
854 | * <pru_attach>:??? [other protocol families, IPSEC] |
855 | */ |
856 | int |
857 | socreate(int dom, struct socket **aso, int type, int proto) |
858 | { |
    return socreate_internal(dom, aso, type, proto, current_proc(), 0,
        PROC_NULL);
861 | } |
862 | |
863 | int |
864 | socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid) |
865 | { |
866 | int error = 0; |
867 | struct proc *ep = PROC_NULL; |
868 | |
    if ((proc_selfpid() != epid) && ((ep = proc_find(epid)) == PROC_NULL)) {
870 | error = ESRCH; |
871 | goto done; |
872 | } |
873 | |
    error = socreate_internal(dom, aso, type, proto, current_proc(), 0, ep);
875 | |
876 | /* |
877 | * It might not be wise to hold the proc reference when calling |
878 | * socreate_internal since it calls soalloc with M_WAITOK |
879 | */ |
880 | done: |
881 | if (ep != PROC_NULL) { |
        proc_rele(ep);
883 | } |
884 | |
885 | return error; |
886 | } |
887 | |
888 | /* |
889 | * Returns: 0 Success |
890 | * <pru_bind>:EINVAL Invalid argument [COMMON_START] |
891 | * <pru_bind>:EAFNOSUPPORT Address family not supported |
892 | * <pru_bind>:EADDRNOTAVAIL Address not available. |
893 | * <pru_bind>:EINVAL Invalid argument |
894 | * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef] |
895 | * <pru_bind>:EACCES Permission denied |
896 | * <pru_bind>:EADDRINUSE Address in use |
897 | * <pru_bind>:EAGAIN Resource unavailable, try again |
898 | * <pru_bind>:EPERM Operation not permitted |
899 | * <pru_bind>:??? |
900 | * <sf_bind>:??? |
901 | * |
902 | * Notes: It's not possible to fully enumerate the return codes above, |
903 | * since socket filter authors and protocol family authors may |
904 | * not choose to limit their error returns to those listed, even |
905 | * though this may result in some software operating incorrectly. |
906 | * |
907 | * The error codes which are enumerated above are those known to |
908 | * be returned by the tcp_usr_bind function supplied. |
909 | */ |
910 | int |
911 | sobindlock(struct socket *so, struct sockaddr *nam, int dolock) |
912 | { |
913 | struct proc *p = current_proc(); |
914 | int error = 0; |
915 | |
916 | if (dolock) { |
        socket_lock(so, 1);
918 | } |
919 | |
    so_update_last_owner_locked(so, p);
    so_update_policy(so);

#if NECP
    so_update_necp_policy(so, nam, NULL);
925 | #endif /* NECP */ |
926 | |
927 | /* |
928 | * If this is a bind request on a socket that has been marked |
929 | * as inactive, reject it now before we go any further. |
930 | */ |
931 | if (so->so_flags & SOF_DEFUNCT) { |
932 | error = EINVAL; |
933 | SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n" , |
934 | __func__, proc_pid(p), proc_best_name(p), |
935 | so->so_gencnt, |
936 | SOCK_DOM(so), SOCK_TYPE(so), error); |
937 | goto out; |
938 | } |
939 | |
940 | /* Socket filter */ |
941 | error = sflt_bind(so, nam); |
942 | |
943 | if (error == 0) { |
944 | error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); |
945 | } |
946 | out: |
947 | if (dolock) { |
        socket_unlock(so, 1);
949 | } |
950 | |
951 | if (error == EJUSTRETURN) { |
952 | error = 0; |
953 | } |
954 | |
955 | return error; |
956 | } |
957 | |
958 | void |
959 | sodealloc(struct socket *so) |
960 | { |
961 | kauth_cred_unref(&so->so_cred); |
962 | |
963 | /* Remove any filters */ |
964 | sflt_termsock(so); |
965 | |
    so->so_gencnt = OSIncrementAtomic64((SInt64 *)&so_gencnt);
967 | |
968 | if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) { |
969 | cached_sock_free(so); |
970 | } else { |
971 | zfree(socket_zone, so); |
972 | } |
973 | } |
974 | |
975 | /* |
976 | * Returns: 0 Success |
977 | * EINVAL |
978 | * EOPNOTSUPP |
979 | * <pru_listen>:EINVAL[AF_UNIX] |
980 | * <pru_listen>:EINVAL[TCP] |
981 | * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available. |
982 | * <pru_listen>:EINVAL[TCP] Invalid argument |
983 | * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef] |
984 | * <pru_listen>:EACCES[TCP] Permission denied |
985 | * <pru_listen>:EADDRINUSE[TCP] Address in use |
986 | * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again |
987 | * <pru_listen>:EPERM[TCP] Operation not permitted |
988 | * <sf_listen>:??? |
989 | * |
990 | * Notes: Other <pru_listen> returns depend on the protocol family; all |
991 | * <sf_listen> returns depend on what the filter author causes |
992 | * their filter to return. |
993 | */ |
994 | int |
995 | solisten(struct socket *so, int backlog) |
996 | { |
997 | struct proc *p = current_proc(); |
998 | int error = 0; |
999 | |
    socket_lock(so, 1);

    so_update_last_owner_locked(so, p);
1003 | so_update_policy(so); |
1004 | |
1005 | if (TAILQ_EMPTY(&so->so_comp)) { |
1006 | so->so_options |= SO_ACCEPTCONN; |
1007 | } |
1008 | |
1009 | #if NECP |
1010 | so_update_necp_policy(so, NULL, NULL); |
1011 | #endif /* NECP */ |
1012 | |
1013 | if (so->so_proto == NULL) { |
1014 | error = EINVAL; |
1015 | so->so_options &= ~SO_ACCEPTCONN; |
1016 | goto out; |
1017 | } |
1018 | if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) { |
1019 | error = EOPNOTSUPP; |
1020 | so->so_options &= ~SO_ACCEPTCONN; |
1021 | goto out; |
1022 | } |
1023 | |
1024 | /* |
1025 | * If the listen request is made on a socket that is not fully |
1026 | * disconnected, or on a socket that has been marked as inactive, |
1027 | * reject the request now. |
1028 | */ |
1029 | if ((so->so_state & |
1030 | (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) || |
1031 | (so->so_flags & SOF_DEFUNCT)) { |
1032 | error = EINVAL; |
1033 | if (so->so_flags & SOF_DEFUNCT) { |
1034 | SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] " |
1035 | "(%d)\n" , __func__, proc_pid(p), |
1036 | proc_best_name(p), |
1037 | so->so_gencnt, |
1038 | SOCK_DOM(so), SOCK_TYPE(so), error); |
1039 | } |
1040 | so->so_options &= ~SO_ACCEPTCONN; |
1041 | goto out; |
1042 | } |
1043 | |
1044 | if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) { |
1045 | error = EPERM; |
1046 | so->so_options &= ~SO_ACCEPTCONN; |
1047 | goto out; |
1048 | } |
1049 | |
1050 | error = sflt_listen(so); |
1051 | if (error == 0) { |
1052 | error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); |
1053 | } |
1054 | |
1055 | if (error) { |
1056 | if (error == EJUSTRETURN) { |
1057 | error = 0; |
1058 | } |
1059 | so->so_options &= ~SO_ACCEPTCONN; |
1060 | goto out; |
1061 | } |
1062 | |
1063 | /* |
1064 | * POSIX: The implementation may have an upper limit on the length of |
     * the listen queue, either global or per accepting socket. If backlog
1066 | * exceeds this limit, the length of the listen queue is set to the |
1067 | * limit. |
1068 | * |
1069 | * If listen() is called with a backlog argument value that is less |
1070 | * than 0, the function behaves as if it had been called with a backlog |
1071 | * argument value of 0. |
1072 | * |
1073 | * A backlog argument of 0 may allow the socket to accept connections, |
1074 | * in which case the length of the listen queue may be set to an |
1075 | * implementation-defined minimum value. |
1076 | */ |
1077 | if (backlog <= 0 || backlog > somaxconn) { |
1078 | backlog = somaxconn; |
1079 | } |
1080 | |
1081 | so->so_qlimit = (short)backlog; |
1082 | out: |
    socket_unlock(so, 1);
1084 | return error; |
1085 | } |
1086 | |
1087 | /* |
1088 | * The "accept list lock" protects the fields related to the listener queues |
1089 | * because we can unlock a socket to respect the lock ordering between |
 * the listener socket and its client sockets. The lock ordering requires
 * acquiring a client socket before its listener socket.
1092 | * |
1093 | * The accept list lock serializes access to the following fields: |
1094 | * - of the listener socket: |
1095 | * - so_comp |
1096 | * - so_incomp |
1097 | * - so_qlen |
1098 | * - so_inqlen |
1099 | * - of client sockets that are in so_comp or so_incomp: |
1100 | * - so_head |
1101 | * - so_list |
1102 | * |
 * As one can see, the accept list lock protects the consistency of the
 * linkage of the client sockets.
1105 | * |
1106 | * Note that those fields may be read without holding the accept list lock |
1107 | * for a preflight provided the accept list lock is taken when committing |
1108 | * to take an action based on the result of the preflight. The preflight |
1109 | * saves the cost of doing the unlock/lock dance. |
1110 | */ |
1111 | void |
1112 | so_acquire_accept_list(struct socket *head, struct socket *so) |
1113 | { |
1114 | lck_mtx_t *mutex_held; |
1115 | |
1116 | if (head->so_proto->pr_getlock == NULL) { |
1117 | return; |
1118 | } |
1119 | mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK); |
1120 | LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); |
1121 | |
1122 | if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) { |
1123 | head->so_flags1 |= SOF1_ACCEPT_LIST_HELD; |
1124 | return; |
1125 | } |
1126 | if (so != NULL) { |
        socket_unlock(so, 0);
1128 | } |
1129 | while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) { |
1130 | so_accept_list_waits += 1; |
        msleep((caddr_t)&head->so_incomp, mutex_held,
            PSOCK | PCATCH, __func__, NULL);
1133 | } |
1134 | head->so_flags1 |= SOF1_ACCEPT_LIST_HELD; |
1135 | if (so != NULL) { |
        socket_unlock(head, 0);
        socket_lock(so, 0);
        socket_lock(head, 0);
1139 | } |
1140 | } |
1141 | |
1142 | void |
1143 | so_release_accept_list(struct socket *head) |
1144 | { |
1145 | if (head->so_proto->pr_getlock != NULL) { |
1146 | lck_mtx_t *mutex_held; |
1147 | |
1148 | mutex_held = (*head->so_proto->pr_getlock)(head, 0); |
1149 | LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); |
1150 | |
1151 | head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD; |
        wakeup((caddr_t)&head->so_incomp);
1153 | } |
1154 | } |
1155 | |
1156 | void |
1157 | sofreelastref(struct socket *so, int dealloc) |
1158 | { |
1159 | struct socket *head = so->so_head; |
1160 | |
1161 | /* Assume socket is locked */ |
1162 | |
1163 | #if FLOW_DIVERT |
1164 | if (so->so_flags & SOF_FLOW_DIVERT) { |
1165 | flow_divert_detach(so); |
1166 | } |
1167 | #endif /* FLOW_DIVERT */ |
1168 | |
1169 | #if CONTENT_FILTER |
1170 | if ((so->so_flags & SOF_CONTENT_FILTER) != 0) { |
1171 | cfil_sock_detach(so); |
1172 | } |
1173 | #endif /* CONTENT_FILTER */ |
1174 | |
1175 | if (NEED_DGRAM_FLOW_TRACKING(so)) { |
1176 | soflow_detach(so); |
1177 | } |
1178 | |
1179 | if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) { |
1180 | selthreadclear(&so->so_snd.sb_sel); |
1181 | selthreadclear(&so->so_rcv.sb_sel); |
1182 | so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL); |
1183 | so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL); |
1184 | so->so_event = sonullevent; |
1185 | return; |
1186 | } |
1187 | if (head != NULL) { |
1188 | /* |
1189 | * Need to lock the listener when the protocol has |
1190 | * per socket locks |
1191 | */ |
1192 | if (head->so_proto->pr_getlock != NULL) { |
            socket_lock(head, 1);
1194 | so_acquire_accept_list(head, so); |
1195 | } |
1196 | if (so->so_state & SS_INCOMP) { |
1197 | so->so_state &= ~SS_INCOMP; |
1198 | TAILQ_REMOVE(&head->so_incomp, so, so_list); |
1199 | head->so_incqlen--; |
1200 | head->so_qlen--; |
1201 | so->so_head = NULL; |
1202 | |
1203 | if (head->so_proto->pr_getlock != NULL) { |
1204 | so_release_accept_list(head); |
                socket_unlock(head, 1);
1206 | } |
1207 | } else if (so->so_state & SS_COMP) { |
1208 | if (head->so_proto->pr_getlock != NULL) { |
1209 | so_release_accept_list(head); |
                socket_unlock(head, 1);
1211 | } |
1212 | /* |
1213 | * We must not decommission a socket that's |
1214 | * on the accept(2) queue. If we do, then |
1215 | * accept(2) may hang after select(2) indicated |
1216 | * that the listening socket was ready. |
1217 | */ |
1218 | selthreadclear(&so->so_snd.sb_sel); |
1219 | selthreadclear(&so->so_rcv.sb_sel); |
1220 | so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL); |
1221 | so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL); |
1222 | so->so_event = sonullevent; |
1223 | return; |
1224 | } else { |
1225 | if (head->so_proto->pr_getlock != NULL) { |
1226 | so_release_accept_list(head); |
                socket_unlock(head, 1);
            }
            printf("sofree: not queued\n");
1230 | } |
1231 | } |
1232 | sowflush(so); |
1233 | sorflush(so); |
1234 | |
1235 | /* 3932268: disable upcall */ |
1236 | so->so_rcv.sb_flags &= ~SB_UPCALL; |
1237 | so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT); |
1238 | so->so_event = sonullevent; |
1239 | |
1240 | if (dealloc) { |
1241 | sodealloc(so); |
1242 | } |
1243 | } |
1244 | |
1245 | void |
1246 | soclose_wait_locked(struct socket *so) |
1247 | { |
1248 | lck_mtx_t *mutex_held; |
1249 | |
1250 | if (so->so_proto->pr_getlock != NULL) { |
1251 | mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK); |
1252 | } else { |
1253 | mutex_held = so->so_proto->pr_domain->dom_mtx; |
1254 | } |
1255 | LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); |
1256 | |
1257 | /* |
1258 | * Double check here and return if there's no outstanding upcall; |
1259 | * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set. |
1260 | */ |
1261 | if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) { |
1262 | return; |
1263 | } |
1264 | so->so_rcv.sb_flags &= ~SB_UPCALL; |
1265 | so->so_snd.sb_flags &= ~SB_UPCALL; |
1266 | so->so_flags |= SOF_CLOSEWAIT; |
1267 | |
    (void) msleep((caddr_t)&so->so_upcallusecount, mutex_held, (PZERO - 1),
        "soclose_wait_locked", NULL);
1270 | LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); |
1271 | so->so_flags &= ~SOF_CLOSEWAIT; |
1272 | } |
1273 | |
1274 | /* |
1275 | * Close a socket on last file table reference removal. |
1276 | * Initiate disconnect if connected. |
1277 | * Free socket when disconnect complete. |
1278 | */ |
1279 | int |
1280 | soclose_locked(struct socket *so) |
1281 | { |
1282 | int error = 0; |
1283 | struct timespec ts; |
1284 | |
1285 | if (so->so_usecount == 0) { |
1286 | panic("soclose: so=%p refcount=0" , so); |
1287 | /* NOTREACHED */ |
1288 | } |
1289 | |
    sflt_notify(so, sock_evt_closing, NULL);
1291 | |
1292 | if (so->so_upcallusecount) { |
1293 | soclose_wait_locked(so); |
1294 | } |
1295 | |
1296 | #if CONTENT_FILTER |
1297 | /* |
1298 | * We have to wait until the content filters are done |
1299 | */ |
1300 | if ((so->so_flags & SOF_CONTENT_FILTER) != 0) { |
1301 | cfil_sock_close_wait(so); |
1302 | cfil_sock_is_closed(so); |
1303 | cfil_sock_detach(so); |
1304 | } |
1305 | #endif /* CONTENT_FILTER */ |
1306 | |
1307 | if (NEED_DGRAM_FLOW_TRACKING(so)) { |
1308 | soflow_detach(so); |
1309 | } |
1310 | |
1311 | if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) { |
1312 | soresume(current_proc(), so, 1); |
1313 | so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED; |
1314 | } |
1315 | |
1316 | if ((so->so_options & SO_ACCEPTCONN)) { |
1317 | struct socket *sp, *sonext; |
1318 | int persocklock = 0; |
1319 | int incomp_overflow_only; |
1320 | |
1321 | /* |
         * We do not want new connections to be added
         * to the connection queues
1324 | */ |
1325 | so->so_options &= ~SO_ACCEPTCONN; |
1326 | |
1327 | /* |
1328 | * We can drop the lock on the listener once |
1329 | * we've acquired the incoming list |
1330 | */ |
1331 | if (so->so_proto->pr_getlock != NULL) { |
1332 | persocklock = 1; |
            so_acquire_accept_list(so, NULL);
            socket_unlock(so, 0);
1335 | } |
1336 | again: |
1337 | incomp_overflow_only = 1; |
1338 | |
1339 | TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) { |
1340 | /* |
1341 | * Radar 5350314 |
             * skip sockets thrown away by tcp_dropdropablreq();
             * they will get cleaned up by the garbage collection.
1344 | * otherwise, remove the incomp socket from the queue |
1345 | * and let soabort trigger the appropriate cleanup. |
1346 | */ |
1347 | if (sp->so_flags & SOF_OVERFLOW) { |
1348 | continue; |
1349 | } |
1350 | |
1351 | if (persocklock != 0) { |
                socket_lock(sp, 1);
1353 | } |
1354 | |
1355 | /* |
1356 | * Radar 27945981 |
             * The extra reference for the list ensures the
             * validity of the socket pointer when we perform the
             * unlock of the head above.
1360 | */ |
1361 | if (sp->so_state & SS_INCOMP) { |
1362 | sp->so_state &= ~SS_INCOMP; |
1363 | sp->so_head = NULL; |
1364 | TAILQ_REMOVE(&so->so_incomp, sp, so_list); |
1365 | so->so_incqlen--; |
1366 | so->so_qlen--; |
1367 | |
                (void) soabort(sp);
            } else {
                panic("%s sp %p in so_incomp but !SS_INCOMP",
                    __func__, sp);
            }

            if (persocklock != 0) {
                socket_unlock(sp, 1);
1376 | } |
1377 | } |
1378 | |
1379 | TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) { |
1380 | /* Dequeue from so_comp since sofree() won't do it */ |
1381 | if (persocklock != 0) { |
                socket_lock(sp, 1);
1383 | } |
1384 | |
1385 | if (sp->so_state & SS_COMP) { |
1386 | sp->so_state &= ~SS_COMP; |
1387 | sp->so_head = NULL; |
1388 | TAILQ_REMOVE(&so->so_comp, sp, so_list); |
1389 | so->so_qlen--; |
1390 | |
                (void) soabort(sp);
            } else {
                panic("%s sp %p in so_comp but !SS_COMP",
                    __func__, sp);
            }

            if (persocklock) {
                socket_unlock(sp, 1);
1399 | } |
1400 | } |
1401 | |
1402 | if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) { |
1403 | #if (DEBUG | DEVELOPMENT) |
1404 | panic("%s head %p so_comp not empty" , __func__, so); |
1405 | #endif /* (DEVELOPMENT || DEBUG) */ |
1406 | |
1407 | goto again; |
1408 | } |
1409 | |
1410 | if (!TAILQ_EMPTY(&so->so_comp)) { |
1411 | #if (DEBUG | DEVELOPMENT) |
1412 | panic("%s head %p so_comp not empty" , __func__, so); |
1413 | #endif /* (DEVELOPMENT || DEBUG) */ |
1414 | |
1415 | goto again; |
1416 | } |
1417 | |
1418 | if (persocklock) { |
            socket_lock(so, 0);
            so_release_accept_list(so);
1421 | } |
1422 | } |
1423 | if (so->so_pcb == NULL) { |
1424 | /* 3915887: mark the socket as ready for dealloc */ |
1425 | so->so_flags |= SOF_PCBCLEARING; |
1426 | goto discard; |
1427 | } |
1428 | |
1429 | if (so->so_state & SS_ISCONNECTED) { |
1430 | if ((so->so_state & SS_ISDISCONNECTING) == 0) { |
1431 | error = sodisconnectlocked(so); |
1432 | if (error) { |
1433 | goto drop; |
1434 | } |
1435 | } |
1436 | if (so->so_options & SO_LINGER) { |
1437 | if ((so->so_state & SS_ISDISCONNECTING) && |
1438 | (so->so_state & SS_NBIO)) { |
1439 | goto drop; |
1440 | } |
1441 | while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) { |
1442 | lck_mtx_t *mutex_held; |
1443 | |
1444 | if (so->so_proto->pr_getlock != NULL) { |
1445 | mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK); |
1446 | } else { |
1447 | mutex_held = so->so_proto->pr_domain->dom_mtx; |
1448 | } |
1449 | ts.tv_sec = (so->so_linger / 100); |
1450 | ts.tv_nsec = (so->so_linger % 100) * |
1451 | NSEC_PER_USEC * 1000 * 10; |
                error = msleep((caddr_t)&so->so_timeo,
                    mutex_held, PSOCK | PCATCH, "soclose", &ts);
1454 | if (error) { |
1455 | /* |
                     * It's OK if the timer fires;
                     * don't report an error.
1458 | */ |
1459 | if (error == EWOULDBLOCK) { |
1460 | error = 0; |
1461 | } |
1462 | break; |
1463 | } |
1464 | } |
1465 | } |
1466 | } |
1467 | drop: |
1468 | if (so->so_usecount == 0) { |
1469 | panic("soclose: usecount is zero so=%p" , so); |
1470 | /* NOTREACHED */ |
1471 | } |
1472 | if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) { |
1473 | int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); |
1474 | if (error == 0) { |
1475 | error = error2; |
1476 | } |
1477 | } |
1478 | if (so->so_usecount <= 0) { |
1479 | panic("soclose: usecount is zero so=%p" , so); |
1480 | /* NOTREACHED */ |
1481 | } |
1482 | discard: |
1483 | if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) && |
1484 | (so->so_state & SS_NOFDREF)) { |
1485 | panic("soclose: NOFDREF" ); |
1486 | /* NOTREACHED */ |
1487 | } |
1488 | so->so_state |= SS_NOFDREF; |
1489 | |
1490 | if ((so->so_flags & SOF_KNOTE) != 0) { |
1491 | KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED); |
1492 | } |
1493 | |
1494 | os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed); |
1495 | |
1496 | VERIFY(so->so_usecount > 0); |
1497 | so->so_usecount--; |
1498 | sofree(so); |
1499 | return error; |
1500 | } |
1501 | |
1502 | int |
1503 | soclose(struct socket *so) |
1504 | { |
1505 | int error = 0; |
    socket_lock(so, 1);
1507 | |
1508 | if (so->so_retaincnt == 0) { |
1509 | error = soclose_locked(so); |
1510 | } else { |
1511 | /* |
         * If the FD is going away but the socket is retained
         * in the kernel, just remove its reference.
1514 | */ |
1515 | so->so_usecount--; |
1516 | if (so->so_usecount < 2) { |
1517 | panic("soclose: retaincnt non null and so=%p " |
1518 | "usecount=%d\n" , so, so->so_usecount); |
1519 | } |
1520 | } |
    socket_unlock(so, 1);
1522 | return error; |
1523 | } |
1524 | |
1525 | /* |
1526 | * Must be called at splnet... |
1527 | */ |
1528 | /* Should already be locked */ |
1529 | int |
1530 | soabort(struct socket *so) |
1531 | { |
1532 | int error; |
1533 | |
1534 | #ifdef MORE_LOCKING_DEBUG |
1535 | lck_mtx_t *mutex_held; |
1536 | |
1537 | if (so->so_proto->pr_getlock != NULL) { |
1538 | mutex_held = (*so->so_proto->pr_getlock)(so, 0); |
1539 | } else { |
1540 | mutex_held = so->so_proto->pr_domain->dom_mtx; |
1541 | } |
1542 | LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); |
1543 | #endif |
1544 | |
1545 | if ((so->so_flags & SOF_ABORTED) == 0) { |
1546 | so->so_flags |= SOF_ABORTED; |
1547 | error = (*so->so_proto->pr_usrreqs->pru_abort)(so); |
1548 | if (error) { |
1549 | sofree(so); |
1550 | return error; |
1551 | } |
1552 | } |
1553 | return 0; |
1554 | } |
1555 | |
1556 | int |
1557 | soacceptlock(struct socket *so, struct sockaddr **nam, int dolock) |
1558 | { |
1559 | int error; |
1560 | |
1561 | if (dolock) { |
        socket_lock(so, 1);
1563 | } |
1564 | |
1565 | so_update_last_owner_locked(so, PROC_NULL); |
1566 | so_update_policy(so); |
1567 | #if NECP |
1568 | so_update_necp_policy(so, NULL, NULL); |
1569 | #endif /* NECP */ |
1570 | |
1571 | if ((so->so_state & SS_NOFDREF) == 0) { |
1572 | panic("soaccept: !NOFDREF" ); |
1573 | } |
1574 | so->so_state &= ~SS_NOFDREF; |
1575 | error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); |
1576 | |
1577 | if (dolock) { |
        socket_unlock(so, 1);
1579 | } |
1580 | return error; |
1581 | } |
1582 | |
1583 | int |
1584 | soaccept(struct socket *so, struct sockaddr **nam) |
1585 | { |
    return soacceptlock(so, nam, 1);
1587 | } |
1588 | |
1589 | int |
1590 | soacceptfilter(struct socket *so, struct socket *head) |
1591 | { |
1592 | struct sockaddr *local = NULL, *remote = NULL; |
1593 | int error = 0; |
1594 | |
1595 | /* |
1596 | * Hold the lock even if this socket has not been made visible |
1597 | * to the filter(s). For sockets with global locks, this protects |
1598 | * against the head or peer going away |
1599 | */ |
    socket_lock(so, 1);
    if (sogetaddr_locked(so, &remote, 1) != 0 ||
        sogetaddr_locked(so, &local, 0) != 0) {
        so->so_state &= ~SS_NOFDREF;
        socket_unlock(so, 1);
1605 | soclose(so); |
1606 | /* Out of resources; try it again next time */ |
1607 | error = ECONNABORTED; |
1608 | goto done; |
1609 | } |
1610 | |
1611 | error = sflt_accept(head, so, local, remote); |
1612 | |
1613 | /* |
1614 | * If we get EJUSTRETURN from one of the filters, mark this socket |
1615 | * as inactive and return it anyway. This newly accepted socket |
1616 | * will be disconnected later before we hand it off to the caller. |
1617 | */ |
1618 | if (error == EJUSTRETURN) { |
1619 | error = 0; |
1620 | (void) sosetdefunct(current_proc(), so, |
1621 | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE); |
1622 | } |
1623 | |
1624 | if (error != 0) { |
1625 | /* |
		 * This may seem like a duplication of the error handling
		 * above where we return ECONNABORTED, except that the
		 * following is done while holding the lock, since the
		 * socket has been exposed to the filter(s) earlier.
1630 | */ |
1631 | so->so_state &= ~SS_NOFDREF; |
1632 | socket_unlock(so, refcount: 1); |
1633 | soclose(so); |
1634 | /* Propagate socket filter's error code to the caller */ |
1635 | } else { |
1636 | socket_unlock(so, refcount: 1); |
1637 | } |
1638 | done: |
1639 | /* Callee checks for NULL pointer */ |
1640 | sock_freeaddr(sockname: remote); |
1641 | sock_freeaddr(sockname: local); |
1642 | return error; |
1643 | } |
1644 | |
1645 | /* |
1646 | * Returns: 0 Success |
1647 | * EOPNOTSUPP Operation not supported on socket |
1648 | * EISCONN Socket is connected |
1649 | * <pru_connect>:EADDRNOTAVAIL Address not available. |
1650 | * <pru_connect>:EINVAL Invalid argument |
1651 | * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef] |
1652 | * <pru_connect>:EACCES Permission denied |
1653 | * <pru_connect>:EADDRINUSE Address in use |
1654 | * <pru_connect>:EAGAIN Resource unavailable, try again |
1655 | * <pru_connect>:EPERM Operation not permitted |
1656 | * <sf_connect_out>:??? [anything a filter writer might set] |
1657 | */ |
1658 | int |
1659 | soconnectlock(struct socket *so, struct sockaddr *nam, int dolock) |
1660 | { |
1661 | int error; |
1662 | struct proc *p = current_proc(); |
1663 | tracker_metadata_t metadata = { }; |
1664 | |
1665 | if (dolock) { |
1666 | socket_lock(so, refcount: 1); |
1667 | } |
1668 | |
1669 | so_update_last_owner_locked(so, self: p); |
1670 | so_update_policy(so); |
1671 | |
1672 | /* |
1673 | * If this is a listening socket or if this is a previously-accepted |
1674 | * socket that has been marked as inactive, reject the connect request. |
1675 | */ |
1676 | if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) { |
1677 | error = EOPNOTSUPP; |
1678 | if (so->so_flags & SOF_DEFUNCT) { |
1679 | SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] " |
1680 | "(%d)\n" , __func__, proc_pid(p), |
1681 | proc_best_name(p), |
1682 | so->so_gencnt, |
1683 | SOCK_DOM(so), SOCK_TYPE(so), error); |
1684 | } |
1685 | if (dolock) { |
1686 | socket_unlock(so, refcount: 1); |
1687 | } |
1688 | return error; |
1689 | } |
1690 | |
1691 | if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) { |
1692 | if (dolock) { |
1693 | socket_unlock(so, refcount: 1); |
1694 | } |
1695 | return EPERM; |
1696 | } |
1697 | |
1698 | /* |
1699 | * If protocol is connection-based, can only connect once. |
1700 | * Otherwise, if connected, try to disconnect first. |
1701 | * This allows user to disconnect by connecting to, e.g., |
1702 | * a null address. |
1703 | */ |
1704 | if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) && |
1705 | ((so->so_proto->pr_flags & PR_CONNREQUIRED) || |
1706 | (error = sodisconnectlocked(so)))) { |
1707 | error = EISCONN; |
1708 | } else { |
1709 | /* |
		 * For v4/v6 sockets being connected, check whether the
		 * destination address is associated with a domain name and
		 * whether that domain is a known tracker, and mark the socket
		 * accordingly.  Skip the lookup if the socket has already been
		 * marked as a tracker.
1712 | */ |
1713 | if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) { |
1714 | if (tracker_lookup(app_uuid: so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, metadata: &metadata) == 0) { |
1715 | if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) { |
1716 | so->so_flags1 |= SOF1_KNOWN_TRACKER; |
1717 | } |
1718 | if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) { |
1719 | so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN; |
1720 | } |
1721 | if (necp_set_socket_domain_attributes(so, domain: metadata.domain, domain_owner: metadata.domain_owner)) { |
1722 | printf("connect() - failed necp_set_socket_domain_attributes" ); |
1723 | } |
1724 | } |
1725 | } |
1726 | |
1727 | #if NECP |
1728 | /* Update NECP evaluation after setting any domain via the tracker checks */ |
1729 | so_update_necp_policy(so, NULL, override_remote_addr: nam); |
1730 | #endif /* NECP */ |
1731 | |
1732 | /* |
1733 | * Run connect filter before calling protocol: |
1734 | * - non-blocking connect returns before completion; |
1735 | */ |
1736 | error = sflt_connectout(so, nam); |
1737 | if (error != 0) { |
1738 | if (error == EJUSTRETURN) { |
1739 | error = 0; |
1740 | } |
1741 | } else { |
1742 | error = (*so->so_proto->pr_usrreqs->pru_connect) |
1743 | (so, nam, p); |
1744 | if (error != 0) { |
1745 | so->so_state &= ~SS_ISCONNECTING; |
1746 | } |
1747 | } |
1748 | } |
1749 | if (dolock) { |
1750 | socket_unlock(so, refcount: 1); |
1751 | } |
1752 | return error; |
1753 | } |
1754 | |
1755 | int |
1756 | soconnect(struct socket *so, struct sockaddr *nam) |
1757 | { |
1758 | return soconnectlock(so, nam, dolock: 1); |
1759 | } |
1760 | |
1761 | /* |
1762 | * Returns: 0 Success |
1763 | * <pru_connect2>:EINVAL[AF_UNIX] |
1764 | * <pru_connect2>:EPROTOTYPE[AF_UNIX] |
1765 | * <pru_connect2>:??? [other protocol families] |
1766 | * |
1767 | * Notes: <pru_connect2> is not supported by [TCP]. |
1768 | */ |
1769 | int |
1770 | soconnect2(struct socket *so1, struct socket *so2) |
1771 | { |
1772 | int error; |
1773 | |
1774 | socket_lock(so: so1, refcount: 1); |
1775 | if (so2->so_proto->pr_lock) { |
1776 | socket_lock(so: so2, refcount: 1); |
1777 | } |
1778 | |
1779 | error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); |
1780 | |
1781 | socket_unlock(so: so1, refcount: 1); |
1782 | if (so2->so_proto->pr_lock) { |
1783 | socket_unlock(so: so2, refcount: 1); |
1784 | } |
1785 | return error; |
1786 | } |
1787 | |
1788 | int |
1789 | soconnectxlocked(struct socket *so, struct sockaddr *src, |
1790 | struct sockaddr *dst, struct proc *p, uint32_t ifscope, |
1791 | sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg, |
1792 | uint32_t arglen, uio_t auio, user_ssize_t *bytes_written) |
1793 | { |
1794 | int error; |
1795 | tracker_metadata_t metadata = { }; |
1796 | |
1797 | so_update_last_owner_locked(so, self: p); |
1798 | so_update_policy(so); |
1799 | |
1800 | /* |
1801 | * If this is a listening socket or if this is a previously-accepted |
1802 | * socket that has been marked as inactive, reject the connect request. |
1803 | */ |
1804 | if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) { |
1805 | error = EOPNOTSUPP; |
1806 | if (so->so_flags & SOF_DEFUNCT) { |
1807 | SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] " |
1808 | "(%d)\n" , __func__, proc_pid(p), |
1809 | proc_best_name(p), |
1810 | so->so_gencnt, |
1811 | SOCK_DOM(so), SOCK_TYPE(so), error); |
1812 | } |
1813 | return error; |
1814 | } |
1815 | |
1816 | if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) { |
1817 | return EPERM; |
1818 | } |
1819 | |
1820 | /* |
1821 | * If protocol is connection-based, can only connect once |
1822 | * unless PR_MULTICONN is set. Otherwise, if connected, |
1823 | * try to disconnect first. This allows user to disconnect |
1824 | * by connecting to, e.g., a null address. |
1825 | */ |
1826 | if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) && |
1827 | !(so->so_proto->pr_flags & PR_MULTICONN) && |
1828 | ((so->so_proto->pr_flags & PR_CONNREQUIRED) || |
1829 | (error = sodisconnectlocked(so)) != 0)) { |
1830 | error = EISCONN; |
1831 | } else { |
1832 | /* |
1833 | * For TCP, check if destination address is a tracker and mark the socket accordingly |
1834 | * (only if it hasn't been marked yet). |
1835 | */ |
1836 | if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) && |
1837 | !(so->so_flags1 & SOF1_KNOWN_TRACKER)) { |
1838 | if (tracker_lookup(app_uuid: so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, metadata: &metadata) == 0) { |
1839 | if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) { |
1840 | so->so_flags1 |= SOF1_KNOWN_TRACKER; |
1841 | } |
1842 | if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) { |
1843 | so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN; |
1844 | } |
1845 | if (necp_set_socket_domain_attributes(so, domain: metadata.domain, domain_owner: metadata.domain_owner)) { |
1846 | printf("connectx() - failed necp_set_socket_domain_attributes" ); |
1847 | } |
1848 | } |
1849 | } |
1850 | |
1851 | if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) && |
1852 | (flags & CONNECT_DATA_IDEMPOTENT)) { |
1853 | so->so_flags1 |= SOF1_DATA_IDEMPOTENT; |
1854 | |
1855 | if (flags & CONNECT_DATA_AUTHENTICATED) { |
1856 | so->so_flags1 |= SOF1_DATA_AUTHENTICATED; |
1857 | } |
1858 | } |
1859 | |
1860 | /* |
1861 | * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data. |
1862 | * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error) |
1863 | * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data |
1864 | * Case 3 allows user to combine write with connect even if they have |
1865 | * no use for TFO (such as regular TCP, and UDP). |
1866 | * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case) |
1867 | */ |
1868 | if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) && |
1869 | ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) { |
1870 | so->so_flags1 |= SOF1_PRECONNECT_DATA; |
1871 | } |
1872 | |
1873 | /* |
		 * If a user sets data idempotent and does not pass an uio, or
		 * sets CONNECT_RESUME_ON_READ_WRITE, this is a user error;
		 * reset SOF1_DATA_IDEMPOTENT.
1877 | */ |
1878 | if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) && |
1879 | (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) { |
1880 | /* We should return EINVAL instead perhaps. */ |
1881 | so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT; |
1882 | } |
1883 | |
1884 | /* |
1885 | * Run connect filter before calling protocol: |
1886 | * - non-blocking connect returns before completion; |
1887 | */ |
1888 | error = sflt_connectout(so, nam: dst); |
1889 | if (error != 0) { |
1890 | /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */ |
1891 | so->so_flags1 &= ~SOF1_PRECONNECT_DATA; |
1892 | if (error == EJUSTRETURN) { |
1893 | error = 0; |
1894 | } |
1895 | } else { |
1896 | error = (*so->so_proto->pr_usrreqs->pru_connectx) |
1897 | (so, src, dst, p, ifscope, aid, pcid, |
1898 | flags, arg, arglen, auio, bytes_written); |
1899 | if (error != 0) { |
1900 | so->so_state &= ~SS_ISCONNECTING; |
1901 | if (error != EINPROGRESS) { |
1902 | so->so_flags1 &= ~SOF1_PRECONNECT_DATA; |
1903 | } |
1904 | } |
1905 | } |
1906 | } |
1907 | |
1908 | return error; |
1909 | } |
1910 | |
1911 | int |
1912 | sodisconnectlocked(struct socket *so) |
1913 | { |
1914 | int error; |
1915 | |
1916 | if ((so->so_state & SS_ISCONNECTED) == 0) { |
1917 | error = ENOTCONN; |
1918 | goto bad; |
1919 | } |
1920 | if (so->so_state & SS_ISDISCONNECTING) { |
1921 | error = EALREADY; |
1922 | goto bad; |
1923 | } |
1924 | |
1925 | error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); |
1926 | if (error == 0) { |
1927 | sflt_notify(so, event: sock_evt_disconnected, NULL); |
1928 | } |
1929 | |
1930 | bad: |
1931 | return error; |
1932 | } |
1933 | |
1934 | /* Locking version */ |
1935 | int |
1936 | sodisconnect(struct socket *so) |
1937 | { |
1938 | int error; |
1939 | |
1940 | socket_lock(so, refcount: 1); |
1941 | error = sodisconnectlocked(so); |
1942 | socket_unlock(so, refcount: 1); |
1943 | return error; |
1944 | } |
1945 | |
1946 | int |
1947 | sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid) |
1948 | { |
1949 | int error; |
1950 | |
1951 | /* |
1952 | * Call the protocol disconnectx handler; let it handle all |
1953 | * matters related to the connection state of this session. |
1954 | */ |
1955 | error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid); |
1956 | if (error == 0) { |
1957 | /* |
1958 | * The event applies only for the session, not for |
1959 | * the disconnection of individual subflows. |
1960 | */ |
1961 | if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) { |
1962 | sflt_notify(so, event: sock_evt_disconnected, NULL); |
1963 | } |
1964 | } |
1965 | return error; |
1966 | } |
1967 | |
1968 | int |
1969 | sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid) |
1970 | { |
1971 | int error; |
1972 | |
1973 | socket_lock(so, refcount: 1); |
1974 | error = sodisconnectxlocked(so, aid, cid); |
1975 | socket_unlock(so, refcount: 1); |
1976 | return error; |
1977 | } |
1978 | |
1979 | #define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT) |
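/*
 * SBLOCKWAIT() maps the caller's MSG_DONTWAIT into the sblock() flags:
 * with MSG_DONTWAIT the send-buffer lock is only tried (0), otherwise
 * sblock() is allowed to sleep for it (SBL_WAIT).
 */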
1980 | |
1981 | /* |
1982 | * sosendcheck will lock the socket buffer if it isn't locked and |
1983 | * verify that there is space for the data being inserted. |
1984 | * |
1985 | * Returns: 0 Success |
1986 | * EPIPE |
1987 | * sblock:EWOULDBLOCK |
1988 | * sblock:EINTR |
1989 | * sbwait:EBADF |
1990 | * sbwait:EINTR |
1991 | * [so_error]:??? |
1992 | */ |
1993 | int |
1994 | sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid, |
1995 | int32_t clen, int32_t atomic, int flags, int *sblocked) |
1996 | { |
1997 | int error = 0; |
1998 | int32_t space; |
1999 | int assumelock = 0; |
2000 | |
2001 | restart: |
2002 | if (*sblocked == 0) { |
2003 | if ((so->so_snd.sb_flags & SB_LOCK) != 0 && |
2004 | so->so_send_filt_thread != 0 && |
2005 | so->so_send_filt_thread == current_thread()) { |
2006 | /* |
2007 | * We're being called recursively from a filter, |
2008 | * allow this to continue. Radar 4150520. |
2009 | * Don't set sblocked because we don't want |
2010 | * to perform an unlock later. |
2011 | */ |
2012 | assumelock = 1; |
2013 | } else { |
2014 | error = sblock(sb: &so->so_snd, SBLOCKWAIT(flags)); |
2015 | if (error) { |
2016 | if (so->so_flags & SOF_DEFUNCT) { |
2017 | goto defunct; |
2018 | } |
2019 | return error; |
2020 | } |
2021 | *sblocked = 1; |
2022 | } |
2023 | } |
2024 | |
2025 | /* |
2026 | * If a send attempt is made on a socket that has been marked |
2027 | * as inactive (disconnected), reject the request. |
2028 | */ |
2029 | if (so->so_flags & SOF_DEFUNCT) { |
2030 | defunct: |
2031 | error = EPIPE; |
2032 | SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n" , |
2033 | __func__, proc_selfpid(), proc_best_name(current_proc()), |
2034 | so->so_gencnt, |
2035 | SOCK_DOM(so), SOCK_TYPE(so), error); |
2036 | return error; |
2037 | } |
2038 | |
2039 | if (so->so_state & SS_CANTSENDMORE) { |
2040 | #if CONTENT_FILTER |
2041 | /* |
2042 | * Can re-inject data of half closed connections |
2043 | */ |
2044 | if ((so->so_state & SS_ISDISCONNECTED) == 0 && |
2045 | so->so_snd.sb_cfil_thread == current_thread() && |
2046 | cfil_sock_data_pending(sb: &so->so_snd) != 0) { |
2047 | CFIL_LOG(LOG_INFO, |
2048 | "so %llx ignore SS_CANTSENDMORE" , |
2049 | (uint64_t)DEBUG_KERNEL_ADDRPERM(so)); |
2050 | } else |
2051 | #endif /* CONTENT_FILTER */ |
2052 | return EPIPE; |
2053 | } |
2054 | if (so->so_error) { |
2055 | error = so->so_error; |
2056 | so->so_error = 0; |
2057 | return error; |
2058 | } |
2059 | |
2060 | if ((so->so_state & SS_ISCONNECTED) == 0) { |
2061 | if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) { |
2062 | if (((so->so_state & SS_ISCONFIRMING) == 0) && |
2063 | (resid != 0 || clen == 0) && |
2064 | !(so->so_flags1 & SOF1_PRECONNECT_DATA)) { |
2065 | return ENOTCONN; |
2066 | } |
2067 | } else if (addr == 0) { |
2068 | return (so->so_proto->pr_flags & PR_CONNREQUIRED) ? |
2069 | ENOTCONN : EDESTADDRREQ; |
2070 | } |
2071 | } |
2072 | |
2073 | space = sbspace(sb: &so->so_snd); |
2074 | |
2075 | if (flags & MSG_OOB) { |
2076 | space += 1024; |
2077 | } |
2078 | if ((atomic && resid > so->so_snd.sb_hiwat) || |
2079 | clen > so->so_snd.sb_hiwat) { |
2080 | return EMSGSIZE; |
2081 | } |
2082 | |
2083 | if ((space < resid + clen && |
2084 | (atomic || (space < (int32_t)so->so_snd.sb_lowat) || |
2085 | space < clen)) || |
2086 | (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) { |
2087 | /* |
2088 | * don't block the connectx call when there's more data |
2089 | * than can be copied. |
2090 | */ |
2091 | if (so->so_flags1 & SOF1_PRECONNECT_DATA) { |
2092 | if (space == 0) { |
2093 | return EWOULDBLOCK; |
2094 | } |
2095 | if (space < (int32_t)so->so_snd.sb_lowat) { |
2096 | return 0; |
2097 | } |
2098 | } |
2099 | if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) || |
2100 | assumelock) { |
2101 | return EWOULDBLOCK; |
2102 | } |
2103 | sbunlock(sb: &so->so_snd, TRUE); /* keep socket locked */ |
2104 | *sblocked = 0; |
2105 | error = sbwait(sb: &so->so_snd); |
2106 | if (error) { |
2107 | if (so->so_flags & SOF_DEFUNCT) { |
2108 | goto defunct; |
2109 | } |
2110 | return error; |
2111 | } |
2112 | goto restart; |
2113 | } |
2114 | return 0; |
2115 | } |
2116 | |
2117 | /* |
2118 | * Send on a socket. |
2119 | * If send must go all at once and message is larger than |
2120 | * send buffering, then hard error. |
2121 | * Lock against other senders. |
2122 | * If must go all at once and not enough room now, then |
2123 | * inform user that this would block and do nothing. |
2124 | * Otherwise, if nonblocking, send as much as possible. |
2125 | * The data to be sent is described by "uio" if nonzero, |
2126 | * otherwise by the mbuf chain "top" (which must be null |
2127 | * if uio is not). Data provided in mbuf chain must be small |
2128 | * enough to send all at once. |
2129 | * |
2130 | * Returns nonzero on error, timeout or signal; callers |
2131 | * must check for short counts if EINTR/ERESTART are returned. |
2132 | * Data and control buffers are freed on return. |
2133 | * |
2134 | * Returns: 0 Success |
2135 | * EOPNOTSUPP |
2136 | * EINVAL |
2137 | * ENOBUFS |
2138 | * uiomove:EFAULT |
2139 | * sosendcheck:EPIPE |
2140 | * sosendcheck:EWOULDBLOCK |
2141 | * sosendcheck:EINTR |
2142 | * sosendcheck:EBADF |
2143 | * sosendcheck:EINTR |
2144 | * sosendcheck:??? [value from so_error] |
2145 | * <pru_send>:ECONNRESET[TCP] |
2146 | * <pru_send>:EINVAL[TCP] |
2147 | * <pru_send>:ENOBUFS[TCP] |
2148 | * <pru_send>:EADDRINUSE[TCP] |
2149 | * <pru_send>:EADDRNOTAVAIL[TCP] |
2150 | * <pru_send>:EAFNOSUPPORT[TCP] |
2151 | * <pru_send>:EACCES[TCP] |
2152 | * <pru_send>:EAGAIN[TCP] |
2153 | * <pru_send>:EPERM[TCP] |
2154 | * <pru_send>:EMSGSIZE[TCP] |
2155 | * <pru_send>:EHOSTUNREACH[TCP] |
2156 | * <pru_send>:ENETUNREACH[TCP] |
2157 | * <pru_send>:ENETDOWN[TCP] |
2158 | * <pru_send>:ENOMEM[TCP] |
2159 | * <pru_send>:ENOBUFS[TCP] |
2160 | * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL] |
2161 | * <pru_send>:EINVAL[AF_UNIX] |
2162 | * <pru_send>:EOPNOTSUPP[AF_UNIX] |
2163 | * <pru_send>:EPIPE[AF_UNIX] |
2164 | * <pru_send>:ENOTCONN[AF_UNIX] |
2165 | * <pru_send>:EISCONN[AF_UNIX] |
2166 | * <pru_send>:???[AF_UNIX] [whatever a filter author chooses] |
2167 | * <sf_data_out>:??? [whatever a filter author chooses] |
2168 | * |
2169 | * Notes: Other <pru_send> returns depend on the protocol family; all |
2170 | * <sf_data_out> returns depend on what the filter author causes |
2171 | * their filter to return. |
2172 | */ |
2173 | int |
2174 | sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, |
2175 | struct mbuf *top, struct mbuf *control, int flags) |
2176 | { |
2177 | struct mbuf **mp; |
2178 | struct mbuf *m, *freelist = NULL; |
2179 | struct soflow_hash_entry *dgram_flow_entry = NULL; |
2180 | user_ssize_t space, len, resid, orig_resid; |
2181 | int clen = 0, error, dontroute, sendflags; |
2182 | int atomic = sosendallatonce(so) || top; |
2183 | int sblocked = 0; |
2184 | struct proc *p = current_proc(); |
2185 | uint16_t headroom = 0; |
2186 | ssize_t mlen; |
2187 | boolean_t en_tracing = FALSE; |
2188 | |
2189 | if (uio != NULL) { |
2190 | resid = uio_resid(a_uio: uio); |
2191 | } else { |
2192 | resid = top->m_pkthdr.len; |
2193 | } |
2194 | orig_resid = resid; |
2195 | |
2196 | KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid, |
2197 | so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat); |
2198 | |
2199 | socket_lock(so, refcount: 1); |
2200 | |
2201 | if (NEED_DGRAM_FLOW_TRACKING(so)) { |
2202 | dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0); |
2203 | } |
2204 | |
2205 | /* |
	 * Trace only if tracing is enabled, for network (vs. unix)
	 * sockets, and only over non-loopback interfaces.
2208 | */ |
2209 | if (ENTR_SHOULDTRACE && |
2210 | (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) { |
2211 | struct inpcb *inp = sotoinpcb(so); |
2212 | if (inp->inp_last_outifp != NULL && |
2213 | !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) { |
2214 | en_tracing = TRUE; |
2215 | KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START, |
2216 | VM_KERNEL_ADDRPERM(so), |
2217 | ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0), |
2218 | (int64_t)resid); |
2219 | } |
2220 | } |
2221 | |
2222 | /* |
2223 | * Re-injection should not affect process accounting |
2224 | */ |
2225 | if ((flags & MSG_SKIPCFIL) == 0) { |
2226 | so_update_last_owner_locked(so, self: p); |
2227 | so_update_policy(so); |
2228 | |
2229 | #if NECP |
2230 | so_update_necp_policy(so, NULL, override_remote_addr: addr); |
2231 | #endif /* NECP */ |
2232 | } |
2233 | |
2234 | if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) { |
2235 | error = EOPNOTSUPP; |
2236 | goto out_locked; |
2237 | } |
2238 | |
2239 | /* |
2240 | * In theory resid should be unsigned. |
2241 | * However, space must be signed, as it might be less than 0 |
2242 | * if we over-committed, and we must use a signed comparison |
2243 | * of space and resid. On the other hand, a negative resid |
2244 | * causes us to loop sending 0-length segments to the protocol. |
2245 | * |
2246 | * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets. |
2247 | * |
2248 | * Note: We limit resid to be a positive int value as we use |
2249 | * imin() to set bytes_to_copy -- radr://14558484 |
2250 | */ |
2251 | if (resid < 0 || resid > INT_MAX || |
2252 | (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) { |
2253 | error = EINVAL; |
2254 | goto out_locked; |
2255 | } |
2256 | |
2257 | dontroute = (flags & MSG_DONTROUTE) && |
2258 | (so->so_options & SO_DONTROUTE) == 0 && |
2259 | (so->so_proto->pr_flags & PR_ATOMIC); |
2260 | OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_msgsnd); |
2261 | |
2262 | if (control != NULL) { |
2263 | clen = control->m_len; |
2264 | } |
2265 | |
2266 | if (soreserveheadroom != 0) { |
2267 | headroom = so->so_pktheadroom; |
2268 | } |
2269 | |
2270 | do { |
2271 | error = sosendcheck(so, addr, resid, clen, atomic, flags, |
2272 | sblocked: &sblocked); |
2273 | if (error) { |
2274 | goto out_locked; |
2275 | } |
2276 | |
2277 | mp = ⊤ |
2278 | space = sbspace(sb: &so->so_snd) - clen; |
2279 | space += ((flags & MSG_OOB) ? 1024 : 0); |
2280 | |
2281 | do { |
2282 | if (uio == NULL) { |
2283 | /* |
2284 | * Data is prepackaged in "top". |
2285 | */ |
2286 | resid = 0; |
2287 | if (flags & MSG_EOR) { |
2288 | top->m_flags |= M_EOR; |
2289 | } |
2290 | } else { |
2291 | int chainlength; |
2292 | int bytes_to_copy; |
2293 | boolean_t jumbocl; |
2294 | boolean_t bigcl; |
2295 | int bytes_to_alloc; |
2296 | |
2297 | bytes_to_copy = imin(a: (int)resid, b: (int)space); |
2298 | |
2299 | bytes_to_alloc = bytes_to_copy; |
2300 | if (top == NULL) { |
2301 | bytes_to_alloc += headroom; |
2302 | } |
2303 | |
2304 | if (sosendminchain > 0) { |
2305 | chainlength = 0; |
2306 | } else { |
2307 | chainlength = sosendmaxchain; |
2308 | } |
2309 | |
2310 | /* |
2311 | * Use big 4 KB cluster when the outgoing interface |
2312 | * does not prefer 2 KB clusters |
2313 | */ |
2314 | bigcl = !(so->so_flags1 & SOF1_IF_2KCL) || |
2315 | sosendbigcl_ignore_capab; |
2316 | |
2317 | /* |
2318 | * Attempt to use larger than system page-size |
2319 | * clusters for large writes only if there is |
2320 | * a jumbo cluster pool and if the socket is |
2321 | * marked accordingly. |
2322 | */ |
2323 | jumbocl = sosendjcl && njcl > 0 && |
2324 | ((so->so_flags & SOF_MULTIPAGES) || |
2325 | sosendjcl_ignore_capab) && |
2326 | bigcl; |
2327 | |
2328 | socket_unlock(so, refcount: 0); |
2329 | |
2330 | do { |
2331 | int num_needed; |
2332 | int hdrs_needed = (top == NULL) ? 1 : 0; |
2333 | |
2334 | /* |
					 * Try to maintain a local cache of mbuf
					 * clusters needed to complete this write.
					 * The list is further limited to the
					 * number that are currently needed to
					 * fill the socket.  This mechanism allows
					 * a large number of mbufs/clusters to be
					 * grabbed under a single mbuf lock... if
					 * we can't get any clusters, then fall
					 * back to trying for mbufs.  If we fail
					 * early (or miscalculate the number
					 * needed), make sure to release any
					 * clusters we haven't yet consumed.
2348 | */ |
2349 | if (freelist == NULL && |
2350 | bytes_to_alloc > MBIGCLBYTES && |
2351 | jumbocl) { |
2352 | num_needed = |
2353 | bytes_to_alloc / M16KCLBYTES; |
2354 | |
2355 | if ((bytes_to_alloc - |
2356 | (num_needed * M16KCLBYTES)) |
2357 | >= MINCLSIZE) { |
2358 | num_needed++; |
2359 | } |
2360 | |
2361 | freelist = |
2362 | m_getpackets_internal( |
2363 | (unsigned int *)&num_needed, |
2364 | hdrs_needed, M_WAIT, 0, |
2365 | M16KCLBYTES); |
2366 | /* |
2367 | * Fall back to 4K cluster size |
2368 | * if allocation failed |
2369 | */ |
2370 | } |
2371 | |
2372 | if (freelist == NULL && |
2373 | bytes_to_alloc > MCLBYTES && |
2374 | bigcl) { |
2375 | num_needed = |
2376 | bytes_to_alloc / MBIGCLBYTES; |
2377 | |
2378 | if ((bytes_to_alloc - |
2379 | (num_needed * MBIGCLBYTES)) >= |
2380 | MINCLSIZE) { |
2381 | num_needed++; |
2382 | } |
2383 | |
2384 | freelist = |
2385 | m_getpackets_internal( |
2386 | (unsigned int *)&num_needed, |
2387 | hdrs_needed, M_WAIT, 0, |
2388 | MBIGCLBYTES); |
2389 | /* |
2390 | * Fall back to cluster size |
2391 | * if allocation failed |
2392 | */ |
2393 | } |
2394 | |
2395 | /* |
					 * Allocate a cluster as we want to
					 * avoid splitting the data into more
					 * than one segment; using MINCLSIZE
					 * would lead us to allocate two mbufs
2400 | */ |
2401 | if (soreserveheadroom != 0 && |
2402 | freelist == NULL && |
2403 | ((top == NULL && |
2404 | bytes_to_alloc > _MHLEN) || |
2405 | bytes_to_alloc > _MLEN)) { |
2406 | num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) / |
2407 | MCLBYTES; |
2408 | freelist = |
2409 | m_getpackets_internal( |
2410 | (unsigned int *)&num_needed, |
2411 | hdrs_needed, M_WAIT, 0, |
2412 | MCLBYTES); |
2413 | /* |
2414 | * Fall back to a single mbuf |
2415 | * if allocation failed |
2416 | */ |
2417 | } else if (freelist == NULL && |
2418 | bytes_to_alloc > MINCLSIZE) { |
2419 | num_needed = |
2420 | bytes_to_alloc / MCLBYTES; |
2421 | |
2422 | if ((bytes_to_alloc - |
2423 | (num_needed * MCLBYTES)) >= |
2424 | MINCLSIZE) { |
2425 | num_needed++; |
2426 | } |
2427 | |
2428 | freelist = |
2429 | m_getpackets_internal( |
2430 | (unsigned int *)&num_needed, |
2431 | hdrs_needed, M_WAIT, 0, |
2432 | MCLBYTES); |
2433 | /* |
2434 | * Fall back to a single mbuf |
2435 | * if allocation failed |
2436 | */ |
2437 | } |
2438 | /* |
2439 | * For datagram protocols, leave |
2440 | * headroom for protocol headers |
2441 | * in the first cluster of the chain |
2442 | */ |
2443 | if (freelist != NULL && atomic && |
2444 | top == NULL && headroom > 0) { |
2445 | freelist->m_data += headroom; |
2446 | } |
2447 | |
2448 | /* |
2449 | * Fall back to regular mbufs without |
2450 | * reserving the socket headroom |
2451 | */ |
2452 | if (freelist == NULL) { |
2453 | if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) { |
2454 | if (top == NULL) { |
2455 | MGETHDR(freelist, |
2456 | M_WAIT, MT_DATA); |
2457 | } else { |
2458 | MGET(freelist, |
2459 | M_WAIT, MT_DATA); |
2460 | } |
2461 | } |
2462 | |
2463 | if (freelist == NULL) { |
2464 | error = ENOBUFS; |
2465 | socket_lock(so, refcount: 0); |
2466 | goto out_locked; |
2467 | } |
2468 | /* |
2469 | * For datagram protocols, |
2470 | * leave room for protocol |
2471 | * headers in first mbuf. |
2472 | */ |
2473 | if (atomic && top == NULL && |
2474 | bytes_to_copy > 0 && |
2475 | bytes_to_copy < MHLEN) { |
2476 | MH_ALIGN(freelist, |
2477 | bytes_to_copy); |
2478 | } |
2479 | } |
2480 | m = freelist; |
2481 | freelist = m->m_next; |
2482 | m->m_next = NULL; |
2483 | |
2484 | if ((m->m_flags & M_EXT)) { |
2485 | mlen = m->m_ext.ext_size - |
2486 | M_LEADINGSPACE(m); |
2487 | } else if ((m->m_flags & M_PKTHDR)) { |
2488 | mlen = MHLEN - M_LEADINGSPACE(m); |
2489 | m_add_crumb(m, PKT_CRUMB_SOSEND); |
2490 | } else { |
2491 | mlen = MLEN - M_LEADINGSPACE(m); |
2492 | } |
2493 | len = imin(a: (int)mlen, b: bytes_to_copy); |
2494 | |
2495 | chainlength += len; |
2496 | |
2497 | space -= len; |
2498 | |
2499 | error = uiomove(mtod(m, caddr_t), |
2500 | n: (int)len, uio); |
2501 | |
2502 | resid = uio_resid(a_uio: uio); |
2503 | |
2504 | m->m_len = (int32_t)len; |
2505 | *mp = m; |
2506 | top->m_pkthdr.len += len; |
2507 | if (error) { |
2508 | break; |
2509 | } |
2510 | mp = &m->m_next; |
2511 | if (resid <= 0) { |
2512 | if (flags & MSG_EOR) { |
2513 | top->m_flags |= M_EOR; |
2514 | } |
2515 | break; |
2516 | } |
2517 | bytes_to_copy = imin(a: (int)resid, b: (int)space); |
2518 | } while (space > 0 && |
2519 | (chainlength < sosendmaxchain || atomic || |
2520 | resid < MINCLSIZE)); |
2521 | |
2522 | socket_lock(so, refcount: 0); |
2523 | |
2524 | if (error) { |
2525 | goto out_locked; |
2526 | } |
2527 | } |
2528 | |
2529 | if (dontroute) { |
2530 | so->so_options |= SO_DONTROUTE; |
2531 | } |
2532 | |
2533 | /* |
2534 | * Compute flags here, for pru_send and NKEs |
2535 | * |
			 * If the user set MSG_EOF, the protocol
			 * understands this flag, and there is nothing
			 * left to send, then use PRU_SEND_EOF instead
			 * of PRU_SEND.
2539 | */ |
2540 | sendflags = (flags & MSG_OOB) ? PRUS_OOB : |
2541 | ((flags & MSG_EOF) && |
2542 | (so->so_proto->pr_flags & PR_IMPLOPCL) && |
2543 | (resid <= 0)) ? PRUS_EOF : |
2544 | /* If there is more to send set PRUS_MORETOCOME */ |
2545 | (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0; |
2546 | |
2547 | if ((flags & MSG_SKIPCFIL) == 0) { |
2548 | /* |
2549 | * Socket filter processing |
2550 | */ |
2551 | error = sflt_data_out(so, to: addr, data: &top, |
2552 | control: &control, flags: (sendflags & MSG_OOB) ? |
2553 | sock_data_filt_flag_oob : 0); |
2554 | if (error) { |
2555 | if (error == EJUSTRETURN) { |
2556 | error = 0; |
2557 | goto packet_consumed; |
2558 | } |
2559 | goto out_locked; |
2560 | } |
2561 | #if CONTENT_FILTER |
2562 | /* |
2563 | * Content filter processing |
2564 | */ |
2565 | error = cfil_sock_data_out(so, to: addr, data: top, |
2566 | control, flags: sendflags, dgram_flow_entry); |
2567 | if (error) { |
2568 | if (error == EJUSTRETURN) { |
2569 | error = 0; |
2570 | goto packet_consumed; |
2571 | } |
2572 | goto out_locked; |
2573 | } |
2574 | #endif /* CONTENT_FILTER */ |
2575 | } |
2576 | error = (*so->so_proto->pr_usrreqs->pru_send) |
2577 | (so, sendflags, top, addr, control, p); |
2578 | |
2579 | packet_consumed: |
2580 | if (dontroute) { |
2581 | so->so_options &= ~SO_DONTROUTE; |
2582 | } |
2583 | |
2584 | clen = 0; |
2585 | control = NULL; |
2586 | top = NULL; |
2587 | mp = ⊤ |
2588 | if (error) { |
2589 | goto out_locked; |
2590 | } |
2591 | } while (resid && space > 0); |
2592 | } while (resid); |
2593 | |
2594 | |
2595 | out_locked: |
2596 | if (resid > orig_resid) { |
2597 | char pname[MAXCOMLEN] = {}; |
2598 | pid_t current_pid = proc_pid(current_proc()); |
2599 | proc_name(pid: current_pid, buf: pname, size: sizeof(pname)); |
2600 | |
2601 | if (sosend_assert_panic != 0) { |
2602 | panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d" , |
2603 | so, resid, orig_resid, pname, current_pid); |
2604 | } else { |
			os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
2606 | so->so_gencnt, resid, orig_resid, pname, current_pid); |
2607 | } |
2608 | } |
2609 | |
2610 | if (sblocked) { |
2611 | sbunlock(sb: &so->so_snd, FALSE); /* will unlock socket */ |
2612 | } else { |
2613 | socket_unlock(so, refcount: 1); |
2614 | } |
2615 | if (top != NULL) { |
2616 | m_freem(top); |
2617 | } |
2618 | if (control != NULL) { |
2619 | m_freem(control); |
2620 | } |
2621 | if (freelist != NULL) { |
2622 | m_freem_list(freelist); |
2623 | } |
2624 | |
2625 | if (dgram_flow_entry != NULL) { |
2626 | soflow_free_flow(dgram_flow_entry); |
2627 | } |
2628 | |
2629 | soclearfastopen(so); |
2630 | |
2631 | if (en_tracing) { |
2632 | /* resid passed here is the bytes left in uio */ |
2633 | KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END, |
2634 | VM_KERNEL_ADDRPERM(so), |
2635 | ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0), |
2636 | (int64_t)(orig_resid - resid)); |
2637 | } |
2638 | KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid, |
2639 | so->so_snd.sb_cc, space, error); |
2640 | |
2641 | return error; |
2642 | } |
2643 | |
2644 | int |
2645 | sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags) |
2646 | { |
2647 | struct mbuf *m0 = NULL, *control_end = NULL; |
2648 | |
2649 | socket_lock_assert_owned(so); |
2650 | |
2651 | /* |
	 * top must point to the mbuf chain to be sent.
	 * If control is not NULL, top must be a packet header (M_PKTHDR).
2654 | */ |
2655 | VERIFY(top != NULL && |
2656 | (control == NULL || top->m_flags & M_PKTHDR)); |
2657 | |
2658 | /* |
2659 | * If control is not passed in, see if we can get it |
2660 | * from top. |
2661 | */ |
2662 | if (control == NULL && (top->m_flags & M_PKTHDR) == 0) { |
2663 | // Locate start of control if present and start of data |
2664 | for (m0 = top; m0 != NULL; m0 = m0->m_next) { |
2665 | if (m0->m_flags & M_PKTHDR) { |
2666 | top = m0; |
2667 | break; |
2668 | } else if (m0->m_type == MT_CONTROL) { |
2669 | if (control == NULL) { |
2670 | // Found start of control |
2671 | control = m0; |
2672 | } |
2673 | if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) { |
2674 | // Found end of control |
2675 | control_end = m0; |
2676 | } |
2677 | } |
2678 | } |
2679 | if (control_end != NULL) { |
2680 | control_end->m_next = NULL; |
2681 | } |
2682 | } |
2683 | |
2684 | int error = (*so->so_proto->pr_usrreqs->pru_send) |
2685 | (so, sendflags, top, addr, control, current_proc()); |
2686 | |
2687 | return error; |
2688 | } |
2689 | |
2690 | static struct mbuf * |
2691 | mbuf_detach_control_from_list(struct mbuf **mp) |
2692 | { |
2693 | struct mbuf *control = NULL; |
2694 | struct mbuf *m = *mp; |
2695 | |
2696 | if (m->m_type == MT_CONTROL) { |
2697 | struct mbuf *control_end; |
2698 | struct mbuf *n; |
2699 | |
2700 | n = control_end = control = m; |
2701 | |
2702 | /* |
2703 | * Break the chain per mbuf type |
2704 | */ |
2705 | while (n != NULL && n->m_type == MT_CONTROL) { |
2706 | control_end = n; |
2707 | n = n->m_next; |
2708 | } |
2709 | control_end->m_next = NULL; |
2710 | *mp = n; |
2711 | } |
2712 | VERIFY(*mp != NULL); |
2713 | |
2714 | return control; |
2715 | } |
2716 | |
2717 | /* |
 * Supported only for connected sockets (no address) without ancillary
 * data (control mbuf), for atomic protocols
2720 | */ |
2721 | int |
2722 | sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags) |
2723 | { |
2724 | struct mbuf *m; |
2725 | struct soflow_hash_entry *dgram_flow_entry = NULL; |
2726 | int error, dontroute; |
2727 | int atomic = sosendallatonce(so); |
2728 | int sblocked = 0; |
2729 | struct proc *p = current_proc(); |
2730 | struct mbuf *top = pktlist; |
2731 | bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL); |
2732 | |
2733 | KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, uiocnt, |
2734 | so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat); |
2735 | |
2736 | if (so->so_type != SOCK_DGRAM) { |
2737 | error = EINVAL; |
2738 | os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d" , |
2739 | error); |
2740 | goto out; |
2741 | } |
2742 | if (atomic == 0) { |
2743 | error = EINVAL; |
2744 | os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d" , |
2745 | error); |
2746 | goto out; |
2747 | } |
2748 | if ((so->so_state & SS_ISCONNECTED) == 0) { |
2749 | error = ENOTCONN; |
2750 | os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d" , |
2751 | error); |
2752 | goto out; |
2753 | } |
2754 | if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) { |
2755 | error = EINVAL; |
2756 | os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d" , |
2757 | flags, error); |
2758 | goto out; |
2759 | } |
2760 | |
2761 | socket_lock(so, refcount: 1); |
2762 | so_update_last_owner_locked(so, self: p); |
2763 | so_update_policy(so); |
2764 | |
2765 | if (NEED_DGRAM_FLOW_TRACKING(so)) { |
2766 | dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, true, 0); |
2767 | } |
2768 | |
2769 | #if NECP |
2770 | so_update_necp_policy(so, NULL, NULL); |
2771 | #endif /* NECP */ |
2772 | |
2773 | dontroute = (flags & MSG_DONTROUTE) && |
2774 | (so->so_options & SO_DONTROUTE) == 0 && |
2775 | (so->so_proto->pr_flags & PR_ATOMIC); |
2776 | if (dontroute) { |
2777 | so->so_options |= SO_DONTROUTE; |
2778 | } |
2779 | |
2780 | OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_msgsnd); |
2781 | |
2782 | error = sosendcheck(so, NULL, resid: 0, clen: 0, atomic, flags, sblocked: &sblocked); |
2783 | if (error) { |
2784 | os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d" , |
2785 | error); |
2786 | goto release; |
2787 | } |
2788 | |
2789 | if (!skip_filt) { |
2790 | struct mbuf **prevnextp = NULL; |
2791 | |
2792 | for (m = top; m != NULL; m = m->m_nextpkt) { |
2793 | struct mbuf *control = NULL; |
2794 | struct mbuf *last_control = NULL; |
2795 | struct mbuf *nextpkt; |
2796 | |
2797 | /* |
2798 | * Remove packet from the list of packets |
2799 | */ |
2800 | nextpkt = m->m_nextpkt; |
2801 | if (prevnextp != NULL) { |
2802 | *prevnextp = nextpkt; |
2803 | } else { |
2804 | top = nextpkt; |
2805 | } |
2806 | m->m_nextpkt = NULL; |
2807 | |
2808 | /* |
2809 | * Break the chain per mbuf type |
2810 | */ |
2811 | if (m->m_type == MT_CONTROL) { |
2812 | control = mbuf_detach_control_from_list(mp: &m); |
2813 | } |
2814 | /* |
2815 | * Socket filter processing |
2816 | */ |
2817 | error = sflt_data_out(so, NULL, data: &m, |
2818 | control: &control, flags: 0); |
2819 | if (error != 0 && error != EJUSTRETURN) { |
2820 | os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d" , |
2821 | error); |
2822 | goto release; |
2823 | } |
2824 | |
2825 | #if CONTENT_FILTER |
2826 | if (error == 0) { |
2827 | /* |
2828 | * Content filter processing |
2829 | */ |
2830 | error = cfil_sock_data_out(so, NULL, data: m, |
2831 | control, flags: 0, dgram_flow_entry); |
2832 | if (error != 0 && error != EJUSTRETURN) { |
2833 | os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d" , |
2834 | error); |
2835 | goto release; |
2836 | } |
2837 | } |
2838 | #endif /* CONTENT_FILTER */ |
2839 | if (error == EJUSTRETURN) { |
2840 | /* |
2841 | * When swallowed by a filter, the packet is not |
2842 | * in the list anymore |
2843 | */ |
2844 | error = 0; |
2845 | } else { |
2846 | /* |
2847 | * Rebuild the mbuf chain of the packet |
2848 | */ |
2849 | if (control != NULL) { |
2850 | last_control->m_next = m; |
2851 | m = control; |
2852 | } |
2853 | /* |
2854 | * Reinsert the packet in the list of packets |
2855 | */ |
2856 | m->m_nextpkt = nextpkt; |
2857 | if (prevnextp != NULL) { |
2858 | *prevnextp = m; |
2859 | } else { |
2860 | top = m; |
2861 | } |
2862 | prevnextp = &m->m_nextpkt; |
2863 | } |
2864 | } |
2865 | } |
2866 | |
2867 | if (top != NULL) { |
2868 | if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) { |
2869 | error = (*so->so_proto->pr_usrreqs->pru_send_list) |
2870 | (so, top, pktcnt, flags); |
2871 | if (error != 0) { |
2872 | os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d" , |
2873 | error); |
2874 | } |
2875 | top = NULL; |
2876 | } else { |
2877 | *pktcnt = 0; |
2878 | for (m = top; m != NULL; m = top) { |
2879 | struct mbuf *control = NULL; |
2880 | |
2881 | top = m->m_nextpkt; |
2882 | m->m_nextpkt = NULL; |
2883 | |
2884 | /* |
2885 | * Break the chain per mbuf type |
2886 | */ |
2887 | if (m->m_type == MT_CONTROL) { |
2888 | control = mbuf_detach_control_from_list(mp: &m); |
2889 | } |
2890 | |
2891 | error = (*so->so_proto->pr_usrreqs->pru_send) |
2892 | (so, 0, m, NULL, control, current_proc()); |
2893 | if (error != 0) { |
2894 | os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d" , |
2895 | error); |
2896 | goto release; |
2897 | } |
2898 | *pktcnt += 1; |
2899 | } |
2900 | } |
2901 | } |
2902 | |
2903 | release: |
2904 | if (dontroute) { |
2905 | so->so_options &= ~SO_DONTROUTE; |
2906 | } |
2907 | if (sblocked) { |
2908 | sbunlock(sb: &so->so_snd, FALSE); /* will unlock socket */ |
2909 | } else { |
2910 | socket_unlock(so, refcount: 1); |
2911 | } |
2912 | out: |
2913 | if (top != NULL) { |
2914 | os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d" , |
2915 | error); |
2916 | m_freem_list(top); |
2917 | } |
2918 | |
2919 | if (dgram_flow_entry != NULL) { |
2920 | soflow_free_flow(dgram_flow_entry); |
2921 | } |
2922 | |
2923 | KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, resid, |
2924 | so->so_snd.sb_cc, 0, error); |
2925 | |
2926 | return error; |
2927 | } |
2928 | |
2929 | /* |
2930 | * May return ERESTART when packet is dropped by MAC policy check |
2931 | */ |
2932 | static int |
2933 | soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa, |
2934 | struct mbuf **maddrp, |
2935 | int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait) |
2936 | { |
2937 | int error = 0; |
2938 | struct mbuf *m = *mp; |
2939 | struct mbuf *nextrecord = *nextrecordp; |
2940 | |
	KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2942 | #if CONFIG_MACF_SOCKET_SUBSET |
2943 | /* |
2944 | * Call the MAC framework for policy checking if we're in |
2945 | * the user process context and the socket isn't connected. |
2946 | */ |
2947 | if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) { |
2948 | struct mbuf *m0 = m; |
2949 | /* |
2950 | * Dequeue this record (temporarily) from the receive |
2951 | * list since we're about to drop the socket's lock |
2952 | * where a new record may arrive and be appended to |
2953 | * the list. Upon MAC policy failure, the record |
2954 | * will be freed. Otherwise, we'll add it back to |
2955 | * the head of the list. We cannot rely on SB_LOCK |
2956 | * because append operation uses the socket's lock. |
2957 | */ |
2958 | do { |
2959 | m->m_nextpkt = NULL; |
2960 | sbfree(sb: &so->so_rcv, m); |
2961 | m = m->m_next; |
2962 | } while (m != NULL); |
2963 | m = m0; |
2964 | so->so_rcv.sb_mb = nextrecord; |
2965 | SB_EMPTY_FIXUP(&so->so_rcv); |
2966 | SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a" ); |
2967 | SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a" ); |
2968 | socket_unlock(so, refcount: 0); |
2969 | |
2970 | error = mac_socket_check_received(cred: kauth_cred_get(), so, |
2971 | mtod(m, struct sockaddr *)); |
2972 | |
2973 | if (error != 0) { |
2974 | /* |
2975 | * MAC policy failure; free this record and |
2976 | * process the next record (or block until |
2977 | * one is available). We have adjusted sb_cc |
2978 | * and sb_mbcnt above so there is no need to |
2979 | * call sbfree() again. |
2980 | */ |
2981 | m_freem(m); |
2982 | /* |
2983 | * Clear SB_LOCK but don't unlock the socket. |
2984 | * Process the next record or wait for one. |
2985 | */ |
2986 | socket_lock(so, refcount: 0); |
2987 | sbunlock(sb: &so->so_rcv, TRUE); /* stay locked */ |
2988 | error = ERESTART; |
2989 | goto done; |
2990 | } |
2991 | socket_lock(so, refcount: 0); |
2992 | /* |
2993 | * If the socket has been defunct'd, drop it. |
2994 | */ |
2995 | if (so->so_flags & SOF_DEFUNCT) { |
2996 | m_freem(m); |
2997 | error = ENOTCONN; |
2998 | goto done; |
2999 | } |
3000 | /* |
3001 | * Re-adjust the socket receive list and re-enqueue |
3002 | * the record in front of any packets which may have |
3003 | * been appended while we dropped the lock. |
3004 | */ |
3005 | for (m = m0; m->m_next != NULL; m = m->m_next) { |
3006 | sballoc(sb: &so->so_rcv, m); |
3007 | } |
3008 | sballoc(sb: &so->so_rcv, m); |
3009 | if (so->so_rcv.sb_mb == NULL) { |
3010 | so->so_rcv.sb_lastrecord = m0; |
3011 | so->so_rcv.sb_mbtail = m; |
3012 | } |
3013 | m = m0; |
3014 | nextrecord = m->m_nextpkt = so->so_rcv.sb_mb; |
3015 | so->so_rcv.sb_mb = m; |
3016 | SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b" ); |
3017 | SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b" ); |
3018 | } |
3019 | #endif /* CONFIG_MACF_SOCKET_SUBSET */ |
3020 | if (psa != NULL) { |
3021 | *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait); |
3022 | if ((*psa == NULL) && (flags & MSG_NEEDSA)) { |
3023 | error = EWOULDBLOCK; |
3024 | goto done; |
3025 | } |
3026 | } else if (maddrp != NULL) { |
3027 | *maddrp = m; |
3028 | } |
3029 | if (flags & MSG_PEEK) { |
3030 | m = m->m_next; |
3031 | } else { |
3032 | sbfree(sb: &so->so_rcv, m); |
3033 | if (m->m_next == NULL && so->so_rcv.sb_cc != 0) { |
3034 | panic("%s: about to create invalid socketbuf" , |
3035 | __func__); |
3036 | /* NOTREACHED */ |
3037 | } |
3038 | if (maddrp == NULL) { |
3039 | MFREE(m, so->so_rcv.sb_mb); |
3040 | } else { |
3041 | so->so_rcv.sb_mb = m->m_next; |
3042 | m->m_next = NULL; |
3043 | } |
3044 | m = so->so_rcv.sb_mb; |
3045 | if (m != NULL) { |
3046 | m->m_nextpkt = nextrecord; |
3047 | } else { |
3048 | so->so_rcv.sb_mb = nextrecord; |
3049 | SB_EMPTY_FIXUP(&so->so_rcv); |
3050 | } |
3051 | } |
3052 | done: |
3053 | *mp = m; |
3054 | *nextrecordp = nextrecord; |
3055 | |
3056 | return error; |
3057 | } |
3058 | |
3059 | /* |
 * When peeking SCM_RIGHTS, the actual file descriptors have not yet been
 * created, so clear the data portion in order not to leak the file pointers
3062 | */ |
3063 | static void |
3064 | sopeek_scm_rights(struct mbuf *rights) |
3065 | { |
3066 | struct cmsghdr *cm = mtod(rights, struct cmsghdr *); |
3067 | |
3068 | if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) { |
3069 | VERIFY(cm->cmsg_len <= rights->m_len); |
3070 | memset(s: cm + 1, c: 0, n: cm->cmsg_len - sizeof(*cm)); |
3071 | } |
3072 | } |
3073 | |
3074 | /* |
3075 | * Process one or more MT_CONTROL mbufs present before any data mbufs |
3076 | * in the first mbuf chain on the socket buffer. If MSG_PEEK, we |
3077 | * just copy the data; if !MSG_PEEK, we call into the protocol to |
3078 | * perform externalization. |
3079 | */ |
3080 | static int |
3081 | soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags, |
3082 | struct mbuf **mp, struct mbuf **nextrecordp) |
3083 | { |
3084 | int error = 0; |
3085 | struct mbuf *cm = NULL, *cmn; |
3086 | struct mbuf **cme = &cm; |
3087 | struct sockbuf *sb_rcv = &so->so_rcv; |
3088 | struct mbuf **msgpcm = NULL; |
3089 | struct mbuf *m = *mp; |
3090 | struct mbuf *nextrecord = *nextrecordp; |
3091 | struct protosw *pr = so->so_proto; |
3092 | |
3093 | /* |
3094 | * Externalizing the control messages would require us to |
3095 | * drop the socket's lock below. Once we re-acquire the |
3096 | * lock, the mbuf chain might change. In order to preserve |
3097 | * consistency, we unlink all control messages from the |
3098 | * first mbuf chain in one shot and link them separately |
3099 | * onto a different chain. |
3100 | */ |
3101 | do { |
3102 | if (flags & MSG_PEEK) { |
3103 | if (controlp != NULL) { |
3104 | if (*controlp == NULL) { |
3105 | msgpcm = controlp; |
3106 | } |
3107 | *controlp = m_copy(m, 0, m->m_len); |
3108 | |
3109 | /* |
3110 | * If we failed to allocate an mbuf, |
3111 | * release any previously allocated |
3112 | * mbufs for control data. Return |
3113 | * an error. Keep the mbufs in the |
3114 | * socket as this is using |
3115 | * MSG_PEEK flag. |
3116 | */ |
3117 | if (*controlp == NULL) { |
3118 | m_freem(*msgpcm); |
3119 | error = ENOBUFS; |
3120 | goto done; |
3121 | } |
3122 | |
3123 | if (pr->pr_domain->dom_externalize != NULL) { |
3124 | sopeek_scm_rights(rights: *controlp); |
3125 | } |
3126 | |
3127 | controlp = &(*controlp)->m_next; |
3128 | } |
3129 | m = m->m_next; |
3130 | } else { |
3131 | m->m_nextpkt = NULL; |
3132 | sbfree(sb: sb_rcv, m); |
3133 | sb_rcv->sb_mb = m->m_next; |
3134 | m->m_next = NULL; |
3135 | *cme = m; |
3136 | cme = &(*cme)->m_next; |
3137 | m = sb_rcv->sb_mb; |
3138 | } |
3139 | } while (m != NULL && m->m_type == MT_CONTROL); |
3140 | |
3141 | if (!(flags & MSG_PEEK)) { |
3142 | if (sb_rcv->sb_mb != NULL) { |
3143 | sb_rcv->sb_mb->m_nextpkt = nextrecord; |
3144 | } else { |
3145 | sb_rcv->sb_mb = nextrecord; |
3146 | SB_EMPTY_FIXUP(sb_rcv); |
3147 | } |
3148 | if (nextrecord == NULL) { |
3149 | sb_rcv->sb_lastrecord = m; |
3150 | } |
3151 | } |
3152 | |
3153 | SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl" ); |
3154 | SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl" ); |
3155 | |
3156 | while (cm != NULL) { |
3157 | int cmsg_level; |
3158 | int cmsg_type; |
3159 | |
3160 | cmn = cm->m_next; |
3161 | cm->m_next = NULL; |
3162 | cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level; |
3163 | cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type; |
3164 | |
3165 | /* |
3166 | * Call the protocol to externalize SCM_RIGHTS message |
3167 | * and return the modified message to the caller upon |
3168 | * success. Otherwise, all other control messages are |
3169 | * returned unmodified to the caller. Note that we |
3170 | * only get into this loop if MSG_PEEK is not set. |
3171 | */ |
3172 | if (pr->pr_domain->dom_externalize != NULL && |
3173 | cmsg_level == SOL_SOCKET && |
3174 | cmsg_type == SCM_RIGHTS) { |
3175 | /* |
3176 | * Release socket lock: see 3903171. This |
3177 | * would also allow more records to be appended |
3178 | * to the socket buffer. We still have SB_LOCK |
3179 | * set on it, so we can be sure that the head |
3180 | * of the mbuf chain won't change. |
3181 | */ |
3182 | socket_unlock(so, refcount: 0); |
3183 | error = (*pr->pr_domain->dom_externalize)(cm); |
3184 | socket_lock(so, refcount: 0); |
3185 | } else { |
3186 | error = 0; |
3187 | } |
3188 | |
3189 | if (controlp != NULL && error == 0) { |
3190 | *controlp = cm; |
3191 | controlp = &(*controlp)->m_next; |
3192 | } else { |
3193 | (void) m_free(cm); |
3194 | } |
3195 | cm = cmn; |
3196 | } |
3197 | /* |
3198 | * Update the value of nextrecord in case we received new |
3199 | * records when the socket was unlocked above for |
3200 | * externalizing SCM_RIGHTS. |
3201 | */ |
3202 | if (m != NULL) { |
3203 | nextrecord = sb_rcv->sb_mb->m_nextpkt; |
3204 | } else { |
3205 | nextrecord = sb_rcv->sb_mb; |
3206 | } |
3207 | |
3208 | done: |
3209 | *mp = m; |
3210 | *nextrecordp = nextrecord; |
3211 | |
3212 | return error; |
3213 | } |
3214 | |
3215 | /* |
3216 | * If we have less data than requested, block awaiting more |
3217 | * (subject to any timeout) if: |
3218 | * 1. the current count is less than the low water mark, or |
3219 | * 2. MSG_WAITALL is set, and it is possible to do the entire |
3220 | * receive operation at once if we block (resid <= hiwat). |
3221 | * 3. MSG_DONTWAIT is not set |
3222 | * If MSG_WAITALL is set but resid is larger than the receive buffer, |
3223 | * we have to do the receive in sections, and thus risk returning |
3224 | * a short count if a timeout or signal occurs after we start. |
3225 | */ |
3226 | static boolean_t |
3227 | so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags) |
3228 | { |
3229 | struct protosw *pr = so->so_proto; |
3230 | |
3231 | /* No mbufs in the receive-queue? Wait! */ |
3232 | if (m == NULL) { |
3233 | return true; |
3234 | } |
3235 | |
3236 | /* Not enough data in the receive socket-buffer - we may have to wait */ |
3237 | if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(a_uio: uio) && |
3238 | m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) { |
3239 | /* |
		 * The application set the receive low-water mark, so we
		 * should wait until that much data is present.
3242 | */ |
3243 | if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) { |
3244 | return true; |
3245 | } |
3246 | |
3247 | /* |
3248 | * Application wants all the data - so let's try to do the |
3249 | * receive-operation at once by waiting for everything to |
3250 | * be there. |
3251 | */ |
3252 | if ((flags & MSG_WAITALL) && uio_resid(a_uio: uio) <= so->so_rcv.sb_hiwat) { |
3253 | return true; |
3254 | } |
3255 | } |
3256 | |
3257 | return false; |
3258 | } |
3259 | |
3260 | /* |
3261 | * Implement receive operations on a socket. |
3262 | * We depend on the way that records are added to the sockbuf |
3263 | * by sbappend*. In particular, each record (mbufs linked through m_next) |
3264 | * must begin with an address if the protocol so specifies, |
3265 | * followed by an optional mbuf or mbufs containing ancillary data, |
3266 | * and then zero or more mbufs of data. |
3267 | * In order to avoid blocking network interrupts for the entire time here, |
3268 | * we splx() while doing the actual copy to user space. |
3269 | * Although the sockbuf is locked, new data may still be appended, |
3270 | * and thus we must maintain consistency of the sockbuf during that time. |
3271 | * |
3272 | * The caller may receive the data as a single mbuf chain by supplying |
3273 | * an mbuf **mp0 for use in returning the chain. The uio is then used |
3274 | * only for the count in uio_resid. |
3275 | * |
3276 | * Returns: 0 Success |
3277 | * ENOBUFS |
3278 | * ENOTCONN |
3279 | * EWOULDBLOCK |
3280 | * uiomove:EFAULT |
3281 | * sblock:EWOULDBLOCK |
3282 | * sblock:EINTR |
3283 | * sbwait:EBADF |
3284 | * sbwait:EINTR |
3285 | * sodelayed_copy:EFAULT |
3286 | * <pru_rcvoob>:EINVAL[TCP] |
3287 | * <pru_rcvoob>:EWOULDBLOCK[TCP] |
3288 | * <pru_rcvoob>:??? |
3289 | * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX] |
3290 | * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX] |
3291 | * <pr_domain->dom_externalize>:??? |
3292 | * |
3293 | * Notes: Additional return values from calls through <pru_rcvoob> and |
3294 | * <pr_domain->dom_externalize> depend on protocols other than |
3295 | * TCP or AF_UNIX, which are documented above. |
3296 | */ |
3297 | int |
3298 | soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, |
3299 | struct mbuf **mp0, struct mbuf **controlp, int *flagsp) |
3300 | { |
3301 | struct mbuf *m, **mp, *ml = NULL; |
3302 | struct mbuf *nextrecord, *free_list; |
3303 | int flags, error, offset; |
3304 | user_ssize_t len; |
3305 | struct protosw *pr = so->so_proto; |
3306 | int moff, type = 0; |
3307 | user_ssize_t orig_resid = uio_resid(a_uio: uio); |
3308 | user_ssize_t delayed_copy_len; |
3309 | int can_delay; |
3310 | struct proc *p = current_proc(); |
3311 | boolean_t en_tracing = FALSE; |
3312 | |
3313 | /* |
3314 | * Sanity check on the length passed by caller as we are making 'int' |
3315 | * comparisons |
3316 | */ |
3317 | if (orig_resid < 0 || orig_resid > INT_MAX) { |
3318 | return EINVAL; |
3319 | } |
3320 | |
3321 | KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so, |
3322 | uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat, |
3323 | so->so_rcv.sb_hiwat); |
3324 | |
3325 | socket_lock(so, refcount: 1); |
3326 | so_update_last_owner_locked(so, self: p); |
3327 | so_update_policy(so); |
3328 | |
3329 | #ifdef MORE_LOCKING_DEBUG |
3330 | if (so->so_usecount == 1) { |
3331 | panic("%s: so=%x no other reference on socket" , __func__, so); |
3332 | /* NOTREACHED */ |
3333 | } |
3334 | #endif |
3335 | mp = mp0; |
3336 | if (psa != NULL) { |
3337 | *psa = NULL; |
3338 | } |
3339 | if (controlp != NULL) { |
3340 | *controlp = NULL; |
3341 | } |
3342 | if (flagsp != NULL) { |
3343 | flags = *flagsp & ~MSG_EOR; |
3344 | } else { |
3345 | flags = 0; |
3346 | } |
3347 | |
3348 | /* |
3349 | * If a recv attempt is made on a previously-accepted socket |
3350 | * that has been marked as inactive (disconnected), reject |
3351 | * the request. |
3352 | */ |
3353 | if (so->so_flags & SOF_DEFUNCT) { |
3354 | struct sockbuf *sb = &so->so_rcv; |
3355 | |
3356 | error = ENOTCONN; |
3357 | SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n" , |
3358 | __func__, proc_pid(p), proc_best_name(p), |
3359 | so->so_gencnt, |
3360 | SOCK_DOM(so), SOCK_TYPE(so), error); |
3361 | /* |
3362 | * This socket should have been disconnected and flushed |
3363 | * prior to being returned from sodefunct(); there should |
3364 | * be no data on its receive list, so panic otherwise. |
3365 | */ |
3366 | if (so->so_state & SS_DEFUNCT) { |
3367 | sb_empty_assert(sb, __func__); |
3368 | } |
		socket_unlock(so, 1);
3370 | return error; |
3371 | } |
3372 | |
3373 | if ((so->so_flags1 & SOF1_PRECONNECT_DATA) && |
3374 | pr->pr_usrreqs->pru_preconnect) { |
3375 | /* |
3376 | * A user may set the CONNECT_RESUME_ON_READ_WRITE-flag but not |
3377 | * calling write() right after this. *If* the app calls a read |
3378 | * we do not want to block this read indefinetely. Thus, |
3379 | * we trigger a connect so that the session gets initiated. |
3380 | */ |
3381 | error = (*pr->pr_usrreqs->pru_preconnect)(so); |
3382 | |
3383 | if (error) { |
			socket_unlock(so, 1);
3385 | return error; |
3386 | } |
3387 | } |
3388 | |
3389 | if (ENTR_SHOULDTRACE && |
3390 | (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) { |
3391 | /* |
3392 | * enable energy tracing for inet sockets that go over |
3393 | * non-loopback interfaces only. |
3394 | */ |
3395 | struct inpcb *inp = sotoinpcb(so); |
3396 | if (inp->inp_last_outifp != NULL && |
3397 | !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) { |
3398 | en_tracing = TRUE; |
3399 | KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START, |
3400 | VM_KERNEL_ADDRPERM(so), |
3401 | ((so->so_state & SS_NBIO) ? |
3402 | kEnTrFlagNonBlocking : 0), |
3403 | (int64_t)orig_resid); |
3404 | } |
3405 | } |
3406 | |
3407 | /* |
3408 | * When SO_WANTOOBFLAG is set we try to get out-of-band data |
	 * regardless of the flags argument. Here is the case where
3410 | * out-of-band data is not inline. |
3411 | */ |
3412 | if ((flags & MSG_OOB) || |
3413 | ((so->so_options & SO_WANTOOBFLAG) != 0 && |
3414 | (so->so_options & SO_OOBINLINE) == 0 && |
3415 | (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) { |
3416 | m = m_get(M_WAIT, MT_DATA); |
3417 | if (m == NULL) { |
			socket_unlock(so, 1);
3419 | KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, |
3420 | ENOBUFS, 0, 0, 0, 0); |
3421 | return ENOBUFS; |
3422 | } |
3423 | error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); |
3424 | if (error) { |
3425 | goto bad; |
3426 | } |
		socket_unlock(so, 0);
		do {
			error = uiomove(mtod(m, caddr_t),
			    imin((int)uio_resid(uio), m->m_len), uio);
			m = m_free(m);
		} while (uio_resid(uio) && error == 0 && m != NULL);
		socket_lock(so, 0);
3434 | bad: |
3435 | if (m != NULL) { |
3436 | m_freem(m); |
3437 | } |
3438 | |
3439 | if ((so->so_options & SO_WANTOOBFLAG) != 0) { |
3440 | if (error == EWOULDBLOCK || error == EINVAL) { |
3441 | /* |
3442 | * Let's try to get normal data: |
				 * EWOULDBLOCK: out-of-band data not
				 * received yet.  EINVAL: out-of-band data
				 * already read.
3446 | */ |
3447 | error = 0; |
3448 | goto nooob; |
3449 | } else if (error == 0 && flagsp != NULL) { |
3450 | *flagsp |= MSG_OOB; |
3451 | } |
3452 | } |
		socket_unlock(so, 1);
3454 | if (en_tracing) { |
3455 | KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END, |
3456 | VM_KERNEL_ADDRPERM(so), 0, |
3457 | (int64_t)(orig_resid - uio_resid(uio))); |
3458 | } |
3459 | KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error, |
3460 | 0, 0, 0, 0); |
3461 | |
3462 | return error; |
3463 | } |
3464 | nooob: |
3465 | if (mp != NULL) { |
3466 | *mp = NULL; |
3467 | } |
3468 | |
	if (so->so_state & SS_ISCONFIRMING && uio_resid(uio)) {
3470 | (*pr->pr_usrreqs->pru_rcvd)(so, 0); |
3471 | } |
3472 | |
3473 | free_list = NULL; |
3474 | delayed_copy_len = 0; |
3475 | restart: |
3476 | #ifdef MORE_LOCKING_DEBUG |
3477 | if (so->so_usecount <= 1) { |
		printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3479 | (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount); |
3480 | } |
3481 | #endif |
3482 | /* |
3483 | * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE) |
3484 | * and if so just return to the caller. This could happen when |
3485 | * soreceive() is called by a socket upcall function during the |
3486 | * time the socket is freed. The socket buffer would have been |
3487 | * locked across the upcall, therefore we cannot put this thread |
3488 | * to sleep (else we will deadlock) or return EWOULDBLOCK (else |
3489 | * we may livelock), because the lock on the socket buffer will |
3490 | * only be released when the upcall routine returns to its caller. |
3491 | * Because the socket has been officially closed, there can be |
3492 | * no further read on it. |
3493 | * |
3494 | * A multipath subflow socket would have its SS_NOFDREF set by |
3495 | * default, so check for SOF_MP_SUBFLOW socket flag; when the |
3496 | * socket is closed for real, SOF_MP_SUBFLOW would be cleared. |
3497 | */ |
3498 | if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) == |
3499 | (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) { |
		socket_unlock(so, 1);
3501 | return 0; |
3502 | } |
3503 | |
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
	if (error) {
		socket_unlock(so, 1);
3507 | KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error, |
3508 | 0, 0, 0, 0); |
3509 | if (en_tracing) { |
3510 | KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END, |
3511 | VM_KERNEL_ADDRPERM(so), 0, |
3512 | (int64_t)(orig_resid - uio_resid(uio))); |
3513 | } |
3514 | return error; |
3515 | } |
3516 | |
3517 | m = so->so_rcv.sb_mb; |
3518 | if (so_should_wait(so, uio, m, flags)) { |
3519 | /* |
3520 | * Panic if we notice inconsistencies in the socket's |
3521 | * receive list; both sb_mb and sb_cc should correctly |
3522 | * reflect the contents of the list, otherwise we may |
3523 | * end up with false positives during select() or poll() |
3524 | * which could put the application in a bad state. |
3525 | */ |
3526 | SB_MB_CHECK(&so->so_rcv); |
3527 | |
3528 | if (so->so_error) { |
3529 | if (m != NULL) { |
3530 | goto dontblock; |
3531 | } |
3532 | error = so->so_error; |
3533 | if ((flags & MSG_PEEK) == 0) { |
3534 | so->so_error = 0; |
3535 | } |
3536 | goto release; |
3537 | } |
3538 | if (so->so_state & SS_CANTRCVMORE) { |
3539 | #if CONTENT_FILTER |
3540 | /* |
3541 | * Deal with half closed connections |
3542 | */ |
3543 | if ((so->so_state & SS_ISDISCONNECTED) == 0 && |
			    cfil_sock_data_pending(&so->so_rcv) != 0) {
				CFIL_LOG(LOG_INFO,
				    "so %llx ignore SS_CANTRCVMORE",
				    (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3548 | } else |
3549 | #endif /* CONTENT_FILTER */ |
3550 | if (m != NULL) { |
3551 | goto dontblock; |
3552 | } else { |
3553 | goto release; |
3554 | } |
3555 | } |
3556 | for (; m != NULL; m = m->m_next) { |
3557 | if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { |
3558 | m = so->so_rcv.sb_mb; |
3559 | goto dontblock; |
3560 | } |
3561 | } |
3562 | if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 && |
3563 | (so->so_proto->pr_flags & PR_CONNREQUIRED)) { |
3564 | error = ENOTCONN; |
3565 | goto release; |
3566 | } |
		if (uio_resid(uio) == 0) {
3568 | goto release; |
3569 | } |
3570 | |
3571 | if ((so->so_state & SS_NBIO) || |
3572 | (flags & (MSG_DONTWAIT | MSG_NBIO))) { |
3573 | error = EWOULDBLOCK; |
3574 | goto release; |
3575 | } |
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
3579 | #if EVEN_MORE_LOCKING_DEBUG |
3580 | if (socket_debug) { |
			printf("Waiting for socket data\n");
3582 | } |
3583 | #endif |
3584 | |
3585 | /* |
3586 | * Depending on the protocol (e.g. TCP), the following |
3587 | * might cause the socket lock to be dropped and later |
3588 | * be reacquired, and more data could have arrived and |
3589 | * have been appended to the receive socket buffer by |
		 * the time it returns.  Therefore, we sleep in sbwait()
		 * below only if the wait condition is still true.
3593 | */ |
3594 | if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) { |
3595 | (*pr->pr_usrreqs->pru_rcvd)(so, flags); |
3596 | } |
3597 | |
3598 | error = 0; |
		if (so_should_wait(so, uio, so->so_rcv.sb_mb, flags)) {
			error = sbwait(&so->so_rcv);
3601 | } |
3602 | |
3603 | #if EVEN_MORE_LOCKING_DEBUG |
3604 | if (socket_debug) { |
			printf("SORECEIVE - sbwait returned %d\n", error);
3606 | } |
3607 | #endif |
3608 | if (so->so_usecount < 1) { |
			panic("%s: after 2nd sblock so=%p ref=%d on socket",
3610 | __func__, so, so->so_usecount); |
3611 | /* NOTREACHED */ |
3612 | } |
3613 | if (error) { |
			socket_unlock(so, 1);
3615 | KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error, |
3616 | 0, 0, 0, 0); |
3617 | if (en_tracing) { |
3618 | KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END, |
3619 | VM_KERNEL_ADDRPERM(so), 0, |
3620 | (int64_t)(orig_resid - uio_resid(uio))); |
3621 | } |
3622 | return error; |
3623 | } |
3624 | goto restart; |
3625 | } |
3626 | dontblock: |
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3630 | nextrecord = m->m_nextpkt; |
3631 | |
3632 | if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) { |
		error = soreceive_addr(p, so, psa, NULL, flags, &m, &nextrecord,
		    mp0 == NULL);
3635 | if (error == ERESTART) { |
3636 | goto restart; |
3637 | } else if (error != 0) { |
3638 | goto release; |
3639 | } |
3640 | orig_resid = 0; |
3641 | } |
3642 | |
3643 | /* |
3644 | * Process one or more MT_CONTROL mbufs present before any data mbufs |
3645 | * in the first mbuf chain on the socket buffer. If MSG_PEEK, we |
3646 | * just copy the data; if !MSG_PEEK, we call into the protocol to |
3647 | * perform externalization. |
3648 | */ |
3649 | if (m != NULL && m->m_type == MT_CONTROL) { |
		error = soreceive_ctl(so, controlp, flags, &m, &nextrecord);
3651 | if (error != 0) { |
3652 | goto release; |
3653 | } |
3654 | orig_resid = 0; |
3655 | } |
3656 | |
3657 | if (m != NULL) { |
3658 | if (!(flags & MSG_PEEK)) { |
3659 | /* |
3660 | * We get here because m points to an mbuf following |
3661 | * any MT_SONAME or MT_CONTROL mbufs which have been |
3662 | * processed above. In any case, m should be pointing |
3663 | * to the head of the mbuf chain, and the nextrecord |
3664 | * should be either NULL or equal to m->m_nextpkt. |
3665 | * See comments above about SB_LOCK. |
3666 | */ |
3667 | if (m != so->so_rcv.sb_mb || |
3668 | m->m_nextpkt != nextrecord) { |
				panic("%s: post-control !sync so=%p m=%p "
				    "nextrecord=%p\n", __func__, so, m,
				    nextrecord);
3672 | /* NOTREACHED */ |
3673 | } |
3674 | if (nextrecord == NULL) { |
3675 | so->so_rcv.sb_lastrecord = m; |
3676 | } |
3677 | } |
3678 | type = m->m_type; |
3679 | if (type == MT_OOBDATA) { |
3680 | flags |= MSG_OOB; |
3681 | } |
3682 | } else { |
3683 | if (!(flags & MSG_PEEK)) { |
3684 | SB_EMPTY_FIXUP(&so->so_rcv); |
3685 | } |
3686 | } |
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3689 | |
3690 | moff = 0; |
3691 | offset = 0; |
3692 | |
	if (!(flags & MSG_PEEK) && uio_resid(uio) > sorecvmincopy) {
3694 | can_delay = 1; |
3695 | } else { |
3696 | can_delay = 0; |
3697 | } |
3698 | |
3699 | while (m != NULL && |
	    (uio_resid(uio) - delayed_copy_len) > 0 && error == 0) {
3701 | if (m->m_type == MT_OOBDATA) { |
3702 | if (type != MT_OOBDATA) { |
3703 | break; |
3704 | } |
3705 | } else if (type == MT_OOBDATA) { |
3706 | break; |
3707 | } |
3708 | |
		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
3710 | break; |
3711 | } |
3712 | /* |
3713 | * Make sure to allways set MSG_OOB event when getting |
3714 | * out of band data inline. |
3715 | */ |
3716 | if ((so->so_options & SO_WANTOOBFLAG) != 0 && |
3717 | (so->so_options & SO_OOBINLINE) != 0 && |
3718 | (so->so_state & SS_RCVATMARK) != 0) { |
3719 | flags |= MSG_OOB; |
3720 | } |
3721 | so->so_state &= ~SS_RCVATMARK; |
		len = uio_resid(uio) - delayed_copy_len;
3723 | if (so->so_oobmark && len > so->so_oobmark - offset) { |
3724 | len = so->so_oobmark - offset; |
3725 | } |
3726 | if (len > m->m_len - moff) { |
3727 | len = m->m_len - moff; |
3728 | } |
3729 | /* |
3730 | * If mp is set, just pass back the mbufs. |
3731 | * Otherwise copy them out via the uio, then free. |
3732 | * Sockbuf must be consistent here (points to current mbuf, |
3733 | * it points to next record) when we drop priority; |
3734 | * we must note any additions to the sockbuf when we |
3735 | * block interrupts again. |
3736 | */ |
3737 | if (mp == NULL) { |
			SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3740 | if (can_delay && len == m->m_len) { |
3741 | /* |
3742 | * only delay the copy if we're consuming the |
3743 | * mbuf and we're NOT in MSG_PEEK mode |
3744 | * and we have enough data to make it worthwile |
3745 | * to drop and retake the lock... can_delay |
3746 | * reflects the state of the 2 latter |
3747 | * constraints moff should always be zero |
3748 | * in these cases |
3749 | */ |
3750 | delayed_copy_len += len; |
3751 | } else { |
3752 | if (delayed_copy_len) { |
3753 | error = sodelayed_copy(so, uio, |
3754 | &free_list, &delayed_copy_len); |
3755 | |
3756 | if (error) { |
3757 | goto release; |
3758 | } |
3759 | /* |
3760 | * can only get here if MSG_PEEK is not |
3761 | * set therefore, m should point at the |
3762 | * head of the rcv queue; if it doesn't, |
3763 | * it means something drastically |
3764 | * changed while we were out from behind |
3765 | * the lock in sodelayed_copy. perhaps |
3766 | * a RST on the stream. in any event, |
3767 | * the stream has been interrupted. it's |
3768 | * probably best just to return whatever |
3769 | * data we've moved and let the caller |
3770 | * sort it out... |
3771 | */ |
3772 | if (m != so->so_rcv.sb_mb) { |
3773 | break; |
3774 | } |
3775 | } |
				socket_unlock(so, 0);
				error = uiomove(mtod(m, caddr_t) + moff,
				    (int)len, uio);
				socket_lock(so, 0);
3780 | |
3781 | if (error) { |
3782 | goto release; |
3783 | } |
3784 | } |
3785 | } else { |
			uio_setresid(uio, (uio_resid(uio) - len));
3787 | } |
3788 | if (len == m->m_len - moff) { |
3789 | if (m->m_flags & M_EOR) { |
3790 | flags |= MSG_EOR; |
3791 | } |
3792 | if (flags & MSG_PEEK) { |
3793 | m = m->m_next; |
3794 | moff = 0; |
3795 | } else { |
3796 | nextrecord = m->m_nextpkt; |
				sbfree(&so->so_rcv, m);
3798 | m->m_nextpkt = NULL; |
3799 | |
3800 | if (mp != NULL) { |
3801 | *mp = m; |
3802 | mp = &m->m_next; |
3803 | so->so_rcv.sb_mb = m = m->m_next; |
3804 | *mp = NULL; |
3805 | } else { |
3806 | if (free_list == NULL) { |
3807 | free_list = m; |
3808 | } else { |
3809 | ml->m_next = m; |
3810 | } |
3811 | ml = m; |
3812 | so->so_rcv.sb_mb = m = m->m_next; |
3813 | ml->m_next = NULL; |
3814 | } |
3815 | if (m != NULL) { |
3816 | m->m_nextpkt = nextrecord; |
3817 | if (nextrecord == NULL) { |
3818 | so->so_rcv.sb_lastrecord = m; |
3819 | } |
3820 | } else { |
3821 | so->so_rcv.sb_mb = nextrecord; |
3822 | SB_EMPTY_FIXUP(&so->so_rcv); |
3823 | } |
				SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
				SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3826 | } |
3827 | } else { |
3828 | if (flags & MSG_PEEK) { |
3829 | moff += len; |
3830 | } else { |
3831 | if (mp != NULL) { |
3832 | int copy_flag; |
3833 | |
3834 | if (flags & MSG_DONTWAIT) { |
3835 | copy_flag = M_DONTWAIT; |
3836 | } else { |
3837 | copy_flag = M_WAIT; |
3838 | } |
3839 | *mp = m_copym(m, 0, (int)len, copy_flag); |
3840 | /* |
3841 | * Failed to allocate an mbuf? |
3842 | * Adjust uio_resid back, it was |
3843 | * adjusted down by len bytes which |
3844 | * we didn't copy over. |
3845 | */ |
3846 | if (*mp == NULL) { |
						uio_setresid(uio,
						    (uio_resid(uio) + len));
3849 | break; |
3850 | } |
3851 | } |
3852 | m->m_data += len; |
3853 | m->m_len -= len; |
3854 | so->so_rcv.sb_cc -= len; |
3855 | } |
3856 | } |
3857 | if (so->so_oobmark) { |
3858 | if ((flags & MSG_PEEK) == 0) { |
3859 | so->so_oobmark -= len; |
3860 | if (so->so_oobmark == 0) { |
3861 | so->so_state |= SS_RCVATMARK; |
3862 | break; |
3863 | } |
3864 | } else { |
3865 | offset += len; |
3866 | if (offset == so->so_oobmark) { |
3867 | break; |
3868 | } |
3869 | } |
3870 | } |
3871 | if (flags & MSG_EOR) { |
3872 | break; |
3873 | } |
3874 | /* |
3875 | * If the MSG_WAITALL or MSG_WAITSTREAM flag is set |
3876 | * (for non-atomic socket), we must not quit until |
3877 | * "uio->uio_resid == 0" or an error termination. |
3878 | * If a signal/timeout occurs, return with a short |
3879 | * count but without error. Keep sockbuf locked |
3880 | * against other readers. |
3881 | */ |
3882 | while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL && |
		    (uio_resid(uio) - delayed_copy_len) > 0 &&
		    !sosendallatonce(so) && !nextrecord) {
			if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
#if CONTENT_FILTER
			    && cfil_sock_data_pending(&so->so_rcv) == 0
3888 | #endif /* CONTENT_FILTER */ |
3889 | )) { |
3890 | goto release; |
3891 | } |
3892 | |
3893 | /* |
3894 | * Depending on the protocol (e.g. TCP), the following |
3895 | * might cause the socket lock to be dropped and later |
3896 | * be reacquired, and more data could have arrived and |
3897 | * have been appended to the receive socket buffer by |
			 * the time it returns.  Therefore, we sleep in
			 * sbwait() below only if the socket buffer is
			 * empty, in order to avoid a false sleep.
3901 | */ |
3902 | if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) { |
3903 | (*pr->pr_usrreqs->pru_rcvd)(so, flags); |
3904 | } |
3905 | |
			SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
			SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3908 | |
			if (so->so_rcv.sb_mb == NULL && sbwait(&so->so_rcv)) {
3910 | error = 0; |
3911 | goto release; |
3912 | } |
3913 | /* |
3914 | * have to wait until after we get back from the sbwait |
3915 | * to do the copy because we will drop the lock if we |
3916 | * have enough data that has been delayed... by dropping |
3917 | * the lock we open up a window allowing the netisr |
3918 | * thread to process the incoming packets and to change |
3919 | * the state of this socket... we're issuing the sbwait |
3920 | * because the socket is empty and we're expecting the |
3921 | * netisr thread to wake us up when more packets arrive; |
3922 | * if we allow that processing to happen and then sbwait |
3923 | * we could stall forever with packets sitting in the |
3924 | * socket if no further packets arrive from the remote |
3925 | * side. |
3926 | * |
3927 | * we want to copy before we've collected all the data |
3928 | * to satisfy this request to allow the copy to overlap |
3929 | * the incoming packet processing on an MP system |
3930 | */ |
3931 | if (delayed_copy_len > sorecvmincopy && |
3932 | (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) { |
3933 | error = sodelayed_copy(so, uio, |
3934 | &free_list, &delayed_copy_len); |
3935 | |
3936 | if (error) { |
3937 | goto release; |
3938 | } |
3939 | } |
3940 | m = so->so_rcv.sb_mb; |
3941 | if (m != NULL) { |
3942 | nextrecord = m->m_nextpkt; |
3943 | } |
3944 | SB_MB_CHECK(&so->so_rcv); |
3945 | } |
3946 | } |
3947 | #ifdef MORE_LOCKING_DEBUG |
3948 | if (so->so_usecount <= 1) { |
		panic("%s: after big while so=%p ref=%d on socket",
3950 | __func__, so, so->so_usecount); |
3951 | /* NOTREACHED */ |
3952 | } |
3953 | #endif |
3954 | |
3955 | if (m != NULL && pr->pr_flags & PR_ATOMIC) { |
3956 | if (so->so_options & SO_DONTTRUNC) { |
3957 | flags |= MSG_RCVMORE; |
3958 | } else { |
3959 | flags |= MSG_TRUNC; |
3960 | if ((flags & MSG_PEEK) == 0) { |
				(void) sbdroprecord(&so->so_rcv);
3962 | } |
3963 | } |
3964 | } |
3965 | |
3966 | /* |
3967 | * pru_rcvd below (for TCP) may cause more data to be received |
3968 | * if the socket lock is dropped prior to sending the ACK; some |
3969 | * legacy OpenTransport applications don't handle this well |
	 * (if they receive less data than requested while MSG_HAVEMORE
	 * is set), and so we set the flag now based on what we know
3972 | * prior to calling pru_rcvd. |
3973 | */ |
3974 | if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) { |
3975 | flags |= MSG_HAVEMORE; |
3976 | } |
3977 | |
3978 | if ((flags & MSG_PEEK) == 0) { |
3979 | if (m == NULL) { |
3980 | so->so_rcv.sb_mb = nextrecord; |
3981 | /* |
3982 | * First part is an inline SB_EMPTY_FIXUP(). Second |
3983 | * part makes sure sb_lastrecord is up-to-date if |
3984 | * there is still data in the socket buffer. |
3985 | */ |
3986 | if (so->so_rcv.sb_mb == NULL) { |
3987 | so->so_rcv.sb_mbtail = NULL; |
3988 | so->so_rcv.sb_lastrecord = NULL; |
3989 | } else if (nextrecord->m_nextpkt == NULL) { |
3990 | so->so_rcv.sb_lastrecord = nextrecord; |
3991 | } |
3992 | SB_MB_CHECK(&so->so_rcv); |
3993 | } |
		SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3996 | if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) { |
3997 | (*pr->pr_usrreqs->pru_rcvd)(so, flags); |
3998 | } |
3999 | } |
4000 | |
4001 | if (delayed_copy_len) { |
4002 | error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); |
4003 | if (error) { |
4004 | goto release; |
4005 | } |
4006 | } |
4007 | if (free_list != NULL) { |
4008 | m_freem_list(free_list); |
4009 | free_list = NULL; |
4010 | } |
4011 | |
	if (orig_resid == uio_resid(uio) && orig_resid &&
	    (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
4015 | goto restart; |
4016 | } |
4017 | |
4018 | if (flagsp != NULL) { |
4019 | *flagsp |= flags; |
4020 | } |
4021 | release: |
4022 | #ifdef MORE_LOCKING_DEBUG |
4023 | if (so->so_usecount <= 1) { |
		panic("%s: release so=%p ref=%d on socket", __func__,
4025 | so, so->so_usecount); |
4026 | /* NOTREACHED */ |
4027 | } |
4028 | #endif |
4029 | if (delayed_copy_len) { |
4030 | error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len); |
4031 | } |
4032 | |
4033 | if (free_list != NULL) { |
4034 | m_freem_list(free_list); |
4035 | } |
4036 | |
	sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
4038 | |
4039 | if (en_tracing) { |
4040 | KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END, |
4041 | VM_KERNEL_ADDRPERM(so), |
4042 | ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0), |
4043 | (int64_t)(orig_resid - uio_resid(uio))); |
4044 | } |
4045 | KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio), |
4046 | so->so_rcv.sb_cc, 0, error); |
4047 | |
4048 | return error; |
4049 | } |
4050 | |
4051 | /* |
4052 | * Returns: 0 Success |
4053 | * uiomove:EFAULT |
4054 | */ |
4055 | static int |
4056 | sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list, |
4057 | user_ssize_t *resid) |
4058 | { |
4059 | int error = 0; |
4060 | struct mbuf *m; |
4061 | |
4062 | m = *free_list; |
4063 | |
	socket_unlock(so, 0);
4065 | |
4066 | while (m != NULL && error == 0) { |
		error = uiomove(mtod(m, caddr_t), (int)m->m_len, uio);
4068 | m = m->m_next; |
4069 | } |
4070 | m_freem_list(*free_list); |
4071 | |
4072 | *free_list = NULL; |
4073 | *resid = 0; |
4074 | |
	socket_lock(so, 0);
4076 | |
4077 | return error; |
4078 | } |
4079 | |
4080 | int |
4081 | soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp, |
4082 | struct mbuf **mp0, struct mbuf **controlp, int *flagsp) |
4083 | { |
4084 | struct mbuf *m, **mp; |
4085 | struct mbuf *nextrecord; |
4086 | int flags, error; |
4087 | struct protosw *pr = so->so_proto; |
4088 | struct proc *p = current_proc(); |
4089 | u_int npkts = 0; |
4090 | struct mbuf *free_list = NULL; |
4091 | int sblocked = 0; |
4092 | |
4093 | /* |
4094 | * Sanity check on the parameters passed by caller |
4095 | */ |
4096 | if (mp0 == NULL || pktcntp == NULL) { |
4097 | return EINVAL; |
4098 | } |
4099 | if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) { |
4100 | return EINVAL; |
4101 | } |
4102 | |
4103 | mp = mp0; |
4104 | *mp0 = NULL; |
4105 | if (controlp != NULL) { |
4106 | *controlp = NULL; |
4107 | } |
4108 | if (maddrp != NULL) { |
4109 | *maddrp = NULL; |
4110 | } |
4111 | if (flagsp != NULL) { |
4112 | flags = *flagsp; |
4113 | } else { |
4114 | flags = 0; |
4115 | } |
4116 | |
4117 | KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so, |
4118 | *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat, |
4119 | so->so_rcv.sb_hiwat); |
4120 | |
	socket_lock(so, 1);
	so_update_last_owner_locked(so, p);
4123 | so_update_policy(so); |
4124 | |
4125 | #if NECP |
4126 | so_update_necp_policy(so, NULL, NULL); |
4127 | #endif /* NECP */ |
4128 | |
4129 | /* |
4130 | * If a recv attempt is made on a previously-accepted socket |
4131 | * that has been marked as inactive (disconnected), reject |
4132 | * the request. |
4133 | */ |
4134 | if (so->so_flags & SOF_DEFUNCT) { |
4135 | struct sockbuf *sb = &so->so_rcv; |
4136 | |
4137 | error = ENOTCONN; |
		SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] (%d)\n",
4139 | __func__, proc_pid(p), proc_best_name(p), |
4140 | so->so_gencnt, |
4141 | SOCK_DOM(so), SOCK_TYPE(so), error); |
4142 | /* |
4143 | * This socket should have been disconnected and flushed |
4144 | * prior to being returned from sodefunct(); there should |
4145 | * be no data on its receive list, so panic otherwise. |
4146 | */ |
4147 | if (so->so_state & SS_DEFUNCT) { |
4148 | sb_empty_assert(sb, __func__); |
4149 | } |
4150 | goto release; |
4151 | } |
4152 | |
4153 | *mp = NULL; |
4154 | |
4155 | restart: |
4156 | /* |
4157 | * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE) |
4158 | * and if so just return to the caller. This could happen when |
4159 | * soreceive() is called by a socket upcall function during the |
4160 | * time the socket is freed. The socket buffer would have been |
4161 | * locked across the upcall, therefore we cannot put this thread |
4162 | * to sleep (else we will deadlock) or return EWOULDBLOCK (else |
4163 | * we may livelock), because the lock on the socket buffer will |
4164 | * only be released when the upcall routine returns to its caller. |
4165 | * Because the socket has been officially closed, there can be |
4166 | * no further read on it. |
4167 | */ |
4168 | if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) == |
4169 | (SS_NOFDREF | SS_CANTRCVMORE)) { |
4170 | error = 0; |
4171 | goto out; |
4172 | } |
4173 | |
	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
4175 | if (error) { |
4176 | goto out; |
4177 | } |
4178 | sblocked = 1; |
4179 | |
4180 | m = so->so_rcv.sb_mb; |
4181 | /* |
	 * Block awaiting more datagrams if needed
4183 | */ |
4184 | if (m == NULL || ((flags & MSG_DONTWAIT) == 0 && |
4185 | so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) { |
4186 | /* |
4187 | * Panic if we notice inconsistencies in the socket's |
4188 | * receive list; both sb_mb and sb_cc should correctly |
4189 | * reflect the contents of the list, otherwise we may |
4190 | * end up with false positives during select() or poll() |
4191 | * which could put the application in a bad state. |
4192 | */ |
4193 | SB_MB_CHECK(&so->so_rcv); |
4194 | |
4195 | if (so->so_error) { |
4196 | if (m != NULL) { |
4197 | goto dontblock; |
4198 | } |
4199 | error = so->so_error; |
4200 | if ((flags & MSG_PEEK) == 0) { |
4201 | so->so_error = 0; |
4202 | } |
4203 | goto release; |
4204 | } |
4205 | if (so->so_state & SS_CANTRCVMORE) { |
4206 | if (m != NULL) { |
4207 | goto dontblock; |
4208 | } else { |
4209 | goto release; |
4210 | } |
4211 | } |
4212 | for (; m != NULL; m = m->m_next) { |
4213 | if (m->m_flags & M_EOR) { |
4214 | m = so->so_rcv.sb_mb; |
4215 | goto dontblock; |
4216 | } |
4217 | } |
4218 | if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 && |
4219 | (so->so_proto->pr_flags & PR_CONNREQUIRED)) { |
4220 | error = ENOTCONN; |
4221 | goto release; |
4222 | } |
4223 | if ((so->so_state & SS_NBIO) || |
4224 | (flags & (MSG_DONTWAIT | MSG_NBIO))) { |
4225 | error = EWOULDBLOCK; |
4226 | goto release; |
4227 | } |
		SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
		SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");

		sbunlock(&so->so_rcv, TRUE);	/* keep socket locked */
		sblocked = 0;

		error = sbwait(&so->so_rcv);
4235 | if (error != 0) { |
4236 | goto release; |
4237 | } |
4238 | goto restart; |
4239 | } |
4240 | dontblock: |
4241 | m = so->so_rcv.sb_mb; |
4242 | if (m == NULL) { |
4243 | goto release; |
4244 | } |
4245 | |
	OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4249 | nextrecord = m->m_nextpkt; |
4250 | |
4251 | if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) { |
4252 | struct mbuf *maddr = NULL; |
4253 | |
		error = soreceive_addr(p, so, NULL, &maddr, flags, &m,
		    &nextrecord, 1);
4256 | if (error == ERESTART) { |
4257 | goto restart; |
4258 | } else if (error != 0) { |
4259 | goto release; |
4260 | } |
4261 | |
4262 | if (maddr != NULL) { |
4263 | maddr->m_nextpkt = NULL; |
4264 | maddr->m_next = NULL; |
4265 | if (maddrp != NULL) { |
4266 | *maddrp = maddr; |
4267 | maddrp = &maddr->m_nextpkt; |
4268 | } else { |
4269 | maddr->m_next = free_list; |
4270 | free_list = maddr; |
4271 | } |
4272 | } |
4273 | } |
4274 | |
4275 | /* |
4276 | * Process one or more MT_CONTROL mbufs present before any data mbufs |
4277 | * in the first mbuf chain on the socket buffer. |
4278 | * We call into the protocol to perform externalization. |
4279 | */ |
4280 | if (m != NULL && m->m_type == MT_CONTROL) { |
4281 | struct mbuf *control = NULL; |
4282 | |
		error = soreceive_ctl(so, &control, flags, &m, &nextrecord);
4284 | if (error != 0) { |
4285 | goto release; |
4286 | } |
4287 | if (control != NULL) { |
4288 | control->m_nextpkt = NULL; |
4289 | control->m_next = NULL; |
4290 | if (controlp != NULL) { |
4291 | *controlp = control; |
4292 | controlp = &control->m_nextpkt; |
4293 | } else { |
4294 | control->m_next = free_list; |
4295 | free_list = control; |
4296 | } |
4297 | } |
4298 | } |
4299 | |
4300 | /* |
4301 | * Link the packet to the list |
4302 | */ |
4303 | if (m != NULL) { |
		if (!m_has_mtype(m, MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
			panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
4306 | } |
4307 | m->m_nextpkt = NULL; |
4308 | *mp = m; |
4309 | mp = &m->m_nextpkt; |
4310 | } |
4311 | while (m != NULL) { |
		sbfree(&so->so_rcv, m);
4313 | |
4314 | m = m->m_next; |
4315 | } |
4316 | |
4317 | so->so_rcv.sb_mb = nextrecord; |
4318 | /* |
4319 | * First part is an inline SB_EMPTY_FIXUP(). Second |
4320 | * part makes sure sb_lastrecord is up-to-date if |
4321 | * there is still data in the socket buffer. |
4322 | */ |
4323 | if (so->so_rcv.sb_mb == NULL) { |
4324 | so->so_rcv.sb_mbtail = NULL; |
4325 | so->so_rcv.sb_lastrecord = NULL; |
4326 | } else if (nextrecord->m_nextpkt == NULL) { |
4327 | so->so_rcv.sb_lastrecord = nextrecord; |
4328 | } |
4329 | SB_MB_CHECK(&so->so_rcv); |
4330 | |
	SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
	SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4333 | |
4334 | npkts += 1; |
4335 | |
4336 | /* |
4337 | * We continue as long as all those conditions as we have less packets |
4338 | * than requested and the socket buffer is not empty |
4339 | */ |
4340 | if (npkts < *pktcntp) { |
4341 | if (so->so_rcv.sb_mb != NULL) { |
4342 | goto dontblock; |
4343 | } |
4344 | if ((flags & MSG_WAITALL) != 0) { |
4345 | goto restart; |
4346 | } |
4347 | } |
4348 | |
4349 | if (flagsp != NULL) { |
4350 | *flagsp |= flags; |
4351 | } |
4352 | |
4353 | release: |
4354 | /* |
4355 | * pru_rcvd may cause more data to be received if the socket lock |
4356 | * is dropped so we set MSG_HAVEMORE now based on what we know. |
4357 | * That way the caller won't be surprised if it receives less data |
4358 | * than requested. |
4359 | */ |
4360 | if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) { |
4361 | flags |= MSG_HAVEMORE; |
4362 | } |
4363 | |
4364 | if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) { |
4365 | (*pr->pr_usrreqs->pru_rcvd)(so, flags); |
4366 | } |
4367 | |
4368 | if (sblocked) { |
		sbunlock(&so->so_rcv, FALSE);	/* will unlock socket */
	} else {
		socket_unlock(so, 1);
4372 | } |
4373 | |
4374 | out: |
4375 | *pktcntp = npkts; |
4376 | /* |
4377 | * Amortize the cost of freeing the mbufs |
4378 | */ |
4379 | if (free_list != NULL) { |
4380 | m_freem_list(free_list); |
4381 | } |
4382 | |
4383 | KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error, |
4384 | 0, 0, 0, 0); |
4385 | return error; |
4386 | } |
4387 | |
4388 | static int |
4389 | so_statistics_event_to_nstat_event(int64_t *input_options, |
4390 | uint64_t *nstat_event) |
4391 | { |
4392 | int error = 0; |
4393 | switch (*input_options) { |
4394 | case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK: |
4395 | *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK; |
4396 | break; |
4397 | case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK: |
4398 | *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK; |
4399 | break; |
4400 | case SO_STATISTICS_EVENT_ATTRIBUTION_CHANGE: |
4401 | *nstat_event = NSTAT_EVENT_SRC_ATTRIBUTION_CHANGE; |
4402 | break; |
4403 | #if (DEBUG || DEVELOPMENT) |
4404 | case SO_STATISTICS_EVENT_RESERVED_2: |
4405 | *nstat_event = NSTAT_EVENT_SRC_RESERVED_2; |
4406 | break; |
4407 | #endif /* (DEBUG || DEVELOPMENT) */ |
4408 | default: |
4409 | error = EINVAL; |
4410 | break; |
4411 | } |
4412 | return error; |
4413 | } |
4414 | |
4415 | /* |
4416 | * Returns: 0 Success |
4417 | * EINVAL |
4418 | * ENOTCONN |
4419 | * <pru_shutdown>:EINVAL |
4420 | * <pru_shutdown>:EADDRNOTAVAIL[TCP] |
4421 | * <pru_shutdown>:ENOBUFS[TCP] |
4422 | * <pru_shutdown>:EMSGSIZE[TCP] |
4423 | * <pru_shutdown>:EHOSTUNREACH[TCP] |
4424 | * <pru_shutdown>:ENETUNREACH[TCP] |
4425 | * <pru_shutdown>:ENETDOWN[TCP] |
4426 | * <pru_shutdown>:ENOMEM[TCP] |
4427 | * <pru_shutdown>:EACCES[TCP] |
4428 | * <pru_shutdown>:EMSGSIZE[TCP] |
4429 | * <pru_shutdown>:ENOBUFS[TCP] |
4430 | * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL] |
4431 | * <pru_shutdown>:??? [other protocol families] |
4432 | */ |
4433 | int |
4434 | soshutdown(struct socket *so, int how) |
4435 | { |
4436 | int error; |
4437 | |
4438 | KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0); |
4439 | |
4440 | switch (how) { |
4441 | case SHUT_RD: |
4442 | case SHUT_WR: |
4443 | case SHUT_RDWR: |
		socket_lock(so, 1);
4445 | if ((so->so_state & |
4446 | (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) { |
4447 | error = ENOTCONN; |
4448 | } else { |
4449 | error = soshutdownlock(so, how); |
4450 | } |
		socket_unlock(so, 1);
4452 | break; |
4453 | default: |
4454 | error = EINVAL; |
4455 | break; |
4456 | } |
4457 | |
4458 | KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0); |
4459 | |
4460 | return error; |
4461 | } |
4462 | |
4463 | int |
4464 | soshutdownlock_final(struct socket *so, int how) |
4465 | { |
4466 | struct protosw *pr = so->so_proto; |
4467 | int error = 0; |
4468 | |
	sflt_notify(so, sock_evt_shutdown, &how);
4470 | |
4471 | if (how != SHUT_WR) { |
4472 | if ((so->so_state & SS_CANTRCVMORE) != 0) { |
4473 | /* read already shut down */ |
4474 | error = ENOTCONN; |
4475 | goto done; |
4476 | } |
4477 | sorflush(so); |
4478 | } |
4479 | if (how != SHUT_RD) { |
4480 | if ((so->so_state & SS_CANTSENDMORE) != 0) { |
4481 | /* write already shut down */ |
4482 | error = ENOTCONN; |
4483 | goto done; |
4484 | } |
4485 | error = (*pr->pr_usrreqs->pru_shutdown)(so); |
4486 | } |
4487 | done: |
4488 | KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0); |
4489 | return error; |
4490 | } |
4491 | |
4492 | int |
4493 | soshutdownlock(struct socket *so, int how) |
4494 | { |
4495 | int error = 0; |
4496 | |
4497 | #if CONTENT_FILTER |
4498 | /* |
4499 | * A content filter may delay the actual shutdown until it |
4500 | * has processed the pending data |
4501 | */ |
4502 | if (so->so_flags & SOF_CONTENT_FILTER) { |
		error = cfil_sock_shutdown(so, &how);
4504 | if (error == EJUSTRETURN) { |
4505 | error = 0; |
4506 | goto done; |
4507 | } else if (error != 0) { |
4508 | goto done; |
4509 | } |
4510 | } |
4511 | #endif /* CONTENT_FILTER */ |
4512 | |
4513 | error = soshutdownlock_final(so, how); |
4514 | |
4515 | done: |
4516 | return error; |
4517 | } |
4518 | |
4519 | void |
4520 | sowflush(struct socket *so) |
4521 | { |
4522 | struct sockbuf *sb = &so->so_snd; |
4523 | |
4524 | /* |
4525 | * Obtain lock on the socket buffer (SB_LOCK). This is required |
4526 | * to prevent the socket buffer from being unexpectedly altered |
4527 | * while it is used by another thread in socket send/receive. |
4528 | * |
4529 | * sblock() must not fail here, hence the assertion. |
4530 | */ |
4531 | (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT); |
4532 | VERIFY(sb->sb_flags & SB_LOCK); |
4533 | |
4534 | sb->sb_flags &= ~(SB_SEL | SB_UPCALL); |
4535 | sb->sb_flags |= SB_DROP; |
4536 | sb->sb_upcall = NULL; |
4537 | sb->sb_upcallarg = NULL; |
4538 | |
4539 | sbunlock(sb, TRUE); /* keep socket locked */ |
4540 | |
4541 | selthreadclear(&sb->sb_sel); |
4542 | sbrelease(sb); |
4543 | } |
4544 | |
4545 | void |
4546 | sorflush(struct socket *so) |
4547 | { |
4548 | struct sockbuf *sb = &so->so_rcv; |
4549 | struct protosw *pr = so->so_proto; |
4550 | struct sockbuf asb; |
4551 | #ifdef notyet |
4552 | lck_mtx_t *mutex_held; |
4553 | /* |
4554 | * XXX: This code is currently commented out, because we may get here |
4555 | * as part of sofreelastref(), and at that time, pr_getlock() may no |
4556 | * longer be able to return us the lock; this will be fixed in future. |
4557 | */ |
4558 | if (so->so_proto->pr_getlock != NULL) { |
4559 | mutex_held = (*so->so_proto->pr_getlock)(so, 0); |
4560 | } else { |
4561 | mutex_held = so->so_proto->pr_domain->dom_mtx; |
4562 | } |
4563 | |
4564 | LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); |
4565 | #endif /* notyet */ |
4566 | |
	sflt_notify(so, sock_evt_flush_read, NULL);
4568 | |
4569 | socantrcvmore(so); |
4570 | |
4571 | /* |
4572 | * Obtain lock on the socket buffer (SB_LOCK). This is required |
4573 | * to prevent the socket buffer from being unexpectedly altered |
4574 | * while it is used by another thread in socket send/receive. |
4575 | * |
4576 | * sblock() must not fail here, hence the assertion. |
4577 | */ |
4578 | (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT); |
4579 | VERIFY(sb->sb_flags & SB_LOCK); |
4580 | |
4581 | /* |
4582 | * Copy only the relevant fields from "sb" to "asb" which we |
4583 | * need for sbrelease() to function. In particular, skip |
4584 | * sb_sel as it contains the wait queue linkage, which would |
4585 | * wreak havoc if we were to issue selthreadclear() on "asb". |
4586 | * Make sure to not carry over SB_LOCK in "asb", as we need |
4587 | * to acquire it later as part of sbrelease(). |
4588 | */ |
	bzero(&asb, sizeof(asb));
4590 | asb.sb_cc = sb->sb_cc; |
4591 | asb.sb_hiwat = sb->sb_hiwat; |
4592 | asb.sb_mbcnt = sb->sb_mbcnt; |
4593 | asb.sb_mbmax = sb->sb_mbmax; |
4594 | asb.sb_ctl = sb->sb_ctl; |
4595 | asb.sb_lowat = sb->sb_lowat; |
4596 | asb.sb_mb = sb->sb_mb; |
4597 | asb.sb_mbtail = sb->sb_mbtail; |
4598 | asb.sb_lastrecord = sb->sb_lastrecord; |
4599 | asb.sb_so = sb->sb_so; |
4600 | asb.sb_flags = sb->sb_flags; |
4601 | asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL); |
4602 | asb.sb_flags |= SB_DROP; |
4603 | |
4604 | /* |
4605 | * Ideally we'd bzero() these and preserve the ones we need; |
4606 | * but to do that we'd need to shuffle things around in the |
4607 | * sockbuf, and we can't do it now because there are KEXTS |
4608 | * that are directly referring to the socket structure. |
4609 | * |
4610 | * Setting SB_DROP acts as a barrier to prevent further appends. |
4611 | * Clearing SB_SEL is done for selthreadclear() below. |
4612 | */ |
4613 | sb->sb_cc = 0; |
4614 | sb->sb_hiwat = 0; |
4615 | sb->sb_mbcnt = 0; |
4616 | sb->sb_mbmax = 0; |
4617 | sb->sb_ctl = 0; |
4618 | sb->sb_lowat = 0; |
4619 | sb->sb_mb = NULL; |
4620 | sb->sb_mbtail = NULL; |
4621 | sb->sb_lastrecord = NULL; |
4622 | sb->sb_timeo.tv_sec = 0; |
4623 | sb->sb_timeo.tv_usec = 0; |
4624 | sb->sb_upcall = NULL; |
4625 | sb->sb_upcallarg = NULL; |
4626 | sb->sb_flags &= ~(SB_SEL | SB_UPCALL); |
4627 | sb->sb_flags |= SB_DROP; |
4628 | |
4629 | sbunlock(sb, TRUE); /* keep socket locked */ |
4630 | |
4631 | /* |
4632 | * Note that selthreadclear() is called on the original "sb" and |
4633 | * not the local "asb" because of the way wait queue linkage is |
4634 | * implemented. Given that selwakeup() may be triggered, SB_SEL |
4635 | * should no longer be set (cleared above.) |
4636 | */ |
4637 | selthreadclear(&sb->sb_sel); |
4638 | |
4639 | if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) { |
4640 | (*pr->pr_domain->dom_dispose)(asb.sb_mb); |
4641 | } |
4642 | |
	sbrelease(&asb);
4644 | } |
4645 | |
4646 | /* |
4647 | * Perhaps this routine, and sooptcopyout(), below, ought to come in |
4648 | * an additional variant to handle the case where the option value needs |
4649 | * to be some kind of integer, but not a specific size. |
4650 | * In addition to their use here, these functions are also called by the |
4651 | * protocol-level pr_ctloutput() routines. |
4652 | * |
4653 | * Returns: 0 Success |
4654 | * EINVAL |
4655 | * copyin:EFAULT |
4656 | */ |
4657 | int |
4658 | sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen) |
4659 | { |
4660 | size_t valsize; |
4661 | |
4662 | /* |
4663 | * If the user gives us more than we wanted, we ignore it, |
4664 | * but if we don't get the minimum length the caller |
4665 | * wants, we return EINVAL. On success, sopt->sopt_valsize |
4666 | * is set to however much we actually retrieved. |
4667 | */ |
4668 | if ((valsize = sopt->sopt_valsize) < minlen) { |
4669 | return EINVAL; |
4670 | } |
4671 | if (valsize > len) { |
4672 | sopt->sopt_valsize = valsize = len; |
4673 | } |
4674 | |
4675 | if (sopt->sopt_p != kernproc) { |
4676 | return copyin(sopt->sopt_val, buf, valsize); |
4677 | } |
4678 | |
	bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), buf, valsize);
4680 | return 0; |
4681 | } |
4682 | |
4683 | /* |
4684 | * sooptcopyin_timeval |
 * Copy in a timeval value into tv_p, and take into account whether
 * the calling process is 64-bit or 32-bit.  Moved the sanity checking
4687 | * code here so that we can verify the 64-bit tv_sec value before we lose |
4688 | * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec. |
4689 | */ |
4690 | static int |
4691 | sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p) |
4692 | { |
4693 | int error; |
4694 | |
4695 | if (proc_is64bit(sopt->sopt_p)) { |
4696 | struct user64_timeval tv64; |
4697 | |
4698 | if (sopt->sopt_valsize < sizeof(tv64)) { |
4699 | return EINVAL; |
4700 | } |
4701 | |
4702 | sopt->sopt_valsize = sizeof(tv64); |
4703 | if (sopt->sopt_p != kernproc) { |
4704 | error = copyin(sopt->sopt_val, &tv64, sizeof(tv64)); |
4705 | if (error != 0) { |
4706 | return error; |
4707 | } |
4708 | } else { |
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv64,
			    sizeof(tv64));
4711 | } |
4712 | if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX || |
4713 | tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) { |
4714 | return EDOM; |
4715 | } |
4716 | |
4717 | tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec; |
4718 | tv_p->tv_usec = tv64.tv_usec; |
4719 | } else { |
4720 | struct user32_timeval tv32; |
4721 | |
4722 | if (sopt->sopt_valsize < sizeof(tv32)) { |
4723 | return EINVAL; |
4724 | } |
4725 | |
4726 | sopt->sopt_valsize = sizeof(tv32); |
4727 | if (sopt->sopt_p != kernproc) { |
4728 | error = copyin(sopt->sopt_val, &tv32, sizeof(tv32)); |
4729 | if (error != 0) { |
4730 | return error; |
4731 | } |
4732 | } else { |
			bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), &tv32,
			    sizeof(tv32));
4735 | } |
4736 | #ifndef __LP64__ |
4737 | /* |
4738 | * K64todo "comparison is always false due to |
4739 | * limited range of data type" |
4740 | */ |
4741 | if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX || |
4742 | tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) { |
4743 | return EDOM; |
4744 | } |
4745 | #endif |
4746 | tv_p->tv_sec = tv32.tv_sec; |
4747 | tv_p->tv_usec = tv32.tv_usec; |
4748 | } |
4749 | return 0; |
4750 | } |
4751 | |
4752 | int |
4753 | soopt_cred_check(struct socket *so, int priv, boolean_t allow_root, |
4754 | boolean_t ignore_delegate) |
4755 | { |
4756 | kauth_cred_t cred = NULL; |
4757 | proc_t ep = PROC_NULL; |
4758 | uid_t uid; |
4759 | int error = 0; |
4760 | |
4761 | if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) { |
		ep = proc_find(so->e_pid);
		if (ep) {
			cred = kauth_cred_proc_ref(ep);
4765 | } |
4766 | } |
4767 | |
	uid = kauth_cred_getuid(cred ? cred : so->so_cred);
4769 | |
4770 | /* uid is 0 for root */ |
4771 | if (uid != 0 || !allow_root) { |
		error = priv_check_cred(cred ? cred : so->so_cred, priv, 0);
4773 | } |
4774 | if (cred) { |
4775 | kauth_cred_unref(&cred); |
4776 | } |
4777 | if (ep != PROC_NULL) { |
		proc_rele(ep);
4779 | } |
4780 | |
4781 | return error; |
4782 | } |
4783 | |
4784 | /* |
4785 | * Returns: 0 Success |
4786 | * EINVAL |
4787 | * ENOPROTOOPT |
4788 | * ENOBUFS |
4789 | * EDOM |
4790 | * sooptcopyin:EINVAL |
4791 | * sooptcopyin:EFAULT |
4792 | * sooptcopyin_timeval:EINVAL |
4793 | * sooptcopyin_timeval:EFAULT |
4794 | * sooptcopyin_timeval:EDOM |
4795 | * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX] |
 *		<pr_ctloutput>:???
4797 | * sflt_attach_private:??? [whatever a filter author chooses] |
4798 | * <sf_setoption>:??? [whatever a filter author chooses] |
4799 | * |
 * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
 *		<sf_setoption> returns depend on what the filter author causes
 *		their filter to return.
4803 | */ |
4804 | int |
4805 | sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock) |
4806 | { |
4807 | int error, optval; |
4808 | int64_t long_optval; |
4809 | struct linger l; |
4810 | struct timeval tv; |
4811 | |
4812 | if (sopt->sopt_dir != SOPT_SET) { |
4813 | sopt->sopt_dir = SOPT_SET; |
4814 | } |
4815 | |
4816 | if (dolock) { |
		socket_lock(so, 1);
4818 | } |
4819 | |
4820 | if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) == |
4821 | (SS_CANTRCVMORE | SS_CANTSENDMORE) && |
4822 | (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) { |
		/* the socket has been shut down, no more sockopts */
4824 | error = EINVAL; |
4825 | goto out; |
4826 | } |
4827 | |
4828 | error = sflt_setsockopt(so, sopt); |
4829 | if (error != 0) { |
4830 | if (error == EJUSTRETURN) { |
4831 | error = 0; |
4832 | } |
4833 | goto out; |
4834 | } |
4835 | |
4836 | if (sopt->sopt_level != SOL_SOCKET) { |
4837 | if (so->so_proto != NULL && |
4838 | so->so_proto->pr_ctloutput != NULL) { |
4839 | error = (*so->so_proto->pr_ctloutput)(so, sopt); |
4840 | goto out; |
4841 | } |
4842 | error = ENOPROTOOPT; |
4843 | } else { |
4844 | /* |
4845 | * Allow socket-level (SOL_SOCKET) options to be filtered by |
4846 | * the protocol layer, if needed. A zero value returned from |
4847 | * the handler means use default socket-level processing as |
4848 | * done by the rest of this routine. Otherwise, any other |
4849 | * return value indicates that the option is unsupported. |
4850 | */ |
4851 | if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs-> |
4852 | pru_socheckopt(so, sopt)) != 0) { |
4853 | goto out; |
4854 | } |
4855 | |
4856 | error = 0; |
4857 | switch (sopt->sopt_name) { |
4858 | case SO_LINGER: |
4859 | case SO_LINGER_SEC: { |
			error = sooptcopyin(sopt, &l, sizeof(l), sizeof(l));
4861 | if (error != 0) { |
4862 | goto out; |
4863 | } |
4864 | /* Make sure to use sane values */ |
4865 | if (sopt->sopt_name == SO_LINGER) { |
4866 | so->so_linger = (short)l.l_linger; |
4867 | } else { |
4868 | so->so_linger = (short)((long)l.l_linger * hz); |
4869 | } |
4870 | if (l.l_onoff != 0) { |
4871 | so->so_options |= SO_LINGER; |
4872 | } else { |
4873 | so->so_options &= ~SO_LINGER; |
4874 | } |
4875 | break; |
4876 | } |
4877 | case SO_DEBUG: |
4878 | case SO_KEEPALIVE: |
4879 | case SO_DONTROUTE: |
4880 | case SO_USELOOPBACK: |
4881 | case SO_BROADCAST: |
4882 | case SO_REUSEADDR: |
4883 | case SO_REUSEPORT: |
4884 | case SO_OOBINLINE: |
4885 | case SO_TIMESTAMP: |
4886 | case SO_TIMESTAMP_MONOTONIC: |
4887 | case SO_TIMESTAMP_CONTINUOUS: |
4888 | case SO_DONTTRUNC: |
4889 | case SO_WANTMORE: |
4890 | case SO_WANTOOBFLAG: |
4891 | case SO_NOWAKEFROMSLEEP: |
4892 | case SO_NOAPNFALLBK: |
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
4895 | if (error != 0) { |
4896 | goto out; |
4897 | } |
4898 | if (optval) { |
4899 | so->so_options |= sopt->sopt_name; |
4900 | } else { |
4901 | so->so_options &= ~sopt->sopt_name; |
4902 | } |
4903 | #if SKYWALK |
4904 | inp_update_netns_flags(so); |
4905 | #endif /* SKYWALK */ |
4906 | break; |
4907 | |
4908 | case SO_SNDBUF: |
4909 | case SO_RCVBUF: |
4910 | case SO_SNDLOWAT: |
4911 | case SO_RCVLOWAT: |
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
4914 | if (error != 0) { |
4915 | goto out; |
4916 | } |
4917 | |
4918 | /* |
4919 | * Values < 1 make no sense for any of these |
4920 | * options, so disallow them. |
4921 | */ |
4922 | if (optval < 1) { |
4923 | error = EINVAL; |
4924 | goto out; |
4925 | } |
4926 | |
4927 | switch (sopt->sopt_name) { |
4928 | case SO_SNDBUF: |
4929 | case SO_RCVBUF: { |
4930 | struct sockbuf *sb = |
4931 | (sopt->sopt_name == SO_SNDBUF) ? |
4932 | &so->so_snd : &so->so_rcv; |
				if (sbreserve(sb, (u_int32_t)optval) == 0) {
4934 | error = ENOBUFS; |
4935 | goto out; |
4936 | } |
4937 | sb->sb_flags |= SB_USRSIZE; |
4938 | sb->sb_flags &= ~SB_AUTOSIZE; |
4939 | sb->sb_idealsize = (u_int32_t)optval; |
4940 | break; |
4941 | } |
4942 | /* |
4943 | * Make sure the low-water is never greater than |
4944 | * the high-water. |
4945 | */ |
4946 | case SO_SNDLOWAT: { |
			int space = sbspace(&so->so_snd);
4948 | uint32_t hiwat = so->so_snd.sb_hiwat; |
4949 | |
4950 | if (so->so_snd.sb_flags & SB_UNIX) { |
4951 | struct unpcb *unp = |
4952 | (struct unpcb *)(so->so_pcb); |
4953 | if (unp != NULL && |
4954 | unp->unp_conn != NULL) { |
4955 | struct socket *so2 = unp->unp_conn->unp_socket; |
4956 | hiwat += unp->unp_conn->unp_cc; |
					space = sbspace(&so2->so_rcv);
4958 | } |
4959 | } |
4960 | |
4961 | so->so_snd.sb_lowat = |
4962 | (optval > hiwat) ? |
4963 | hiwat : optval; |
4964 | |
4965 | if (space >= so->so_snd.sb_lowat) { |
4966 | sowwakeup(so); |
4967 | } |
4968 | break; |
4969 | } |
4970 | case SO_RCVLOWAT: { |
4971 | int64_t data_len; |
4972 | so->so_rcv.sb_lowat = |
4973 | (optval > so->so_rcv.sb_hiwat) ? |
4974 | so->so_rcv.sb_hiwat : optval; |
4975 | if (so->so_rcv.sb_flags & SB_UNIX) { |
4976 | struct unpcb *unp = |
4977 | (struct unpcb *)(so->so_pcb); |
4978 | if (unp != NULL && |
4979 | unp->unp_conn != NULL) { |
4980 | struct socket *so2 = unp->unp_conn->unp_socket; |
4981 | data_len = so2->so_snd.sb_cc |
4982 | - so2->so_snd.sb_ctl; |
4983 | } else { |
4984 | data_len = so->so_rcv.sb_cc |
4985 | - so->so_rcv.sb_ctl; |
4986 | } |
4987 | } else { |
4988 | data_len = so->so_rcv.sb_cc |
4989 | - so->so_rcv.sb_ctl; |
4990 | } |
4991 | |
4992 | if (data_len >= so->so_rcv.sb_lowat) { |
4993 | sorwakeup(so); |
4994 | } |
4995 | break; |
4996 | } |
4997 | } |
4998 | break; |
4999 | |
5000 | case SO_SNDTIMEO: |
5001 | case SO_RCVTIMEO: |
			error = sooptcopyin_timeval(sopt, &tv);
5003 | if (error != 0) { |
5004 | goto out; |
5005 | } |
5006 | |
5007 | switch (sopt->sopt_name) { |
5008 | case SO_SNDTIMEO: |
5009 | so->so_snd.sb_timeo = tv; |
5010 | break; |
5011 | case SO_RCVTIMEO: |
5012 | so->so_rcv.sb_timeo = tv; |
5013 | break; |
5014 | } |
5015 | break; |
5016 | |
5017 | case SO_NKE: { |
5018 | struct so_nke nke; |
5019 | |
			error = sooptcopyin(sopt, &nke, sizeof(nke),
			    sizeof(nke));
5022 | if (error != 0) { |
5023 | goto out; |
5024 | } |
5025 | |
			error = sflt_attach_internal(so, nke.nke_handle);
5027 | break; |
5028 | } |
5029 | |
5030 | case SO_NOSIGPIPE: |
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
5033 | if (error != 0) { |
5034 | goto out; |
5035 | } |
5036 | if (optval != 0) { |
5037 | so->so_flags |= SOF_NOSIGPIPE; |
5038 | } else { |
5039 | so->so_flags &= ~SOF_NOSIGPIPE; |
5040 | } |
5041 | break; |
5042 | |
5043 | case SO_NOADDRERR: |
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
5046 | if (error != 0) { |
5047 | goto out; |
5048 | } |
5049 | if (optval != 0) { |
5050 | so->so_flags |= SOF_NOADDRAVAIL; |
5051 | } else { |
5052 | so->so_flags &= ~SOF_NOADDRAVAIL; |
5053 | } |
5054 | break; |
5055 | |
5056 | case SO_REUSESHAREUID: |
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
5059 | if (error != 0) { |
5060 | goto out; |
5061 | } |
5062 | if (optval != 0) { |
5063 | so->so_flags |= SOF_REUSESHAREUID; |
5064 | } else { |
5065 | so->so_flags &= ~SOF_REUSESHAREUID; |
5066 | } |
5067 | break; |
5068 | |
5069 | case SO_NOTIFYCONFLICT: |
			if (kauth_cred_issuser(kauth_cred_get()) == 0) {
5071 | error = EPERM; |
5072 | goto out; |
5073 | } |
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
5076 | if (error != 0) { |
5077 | goto out; |
5078 | } |
5079 | if (optval != 0) { |
5080 | so->so_flags |= SOF_NOTIFYCONFLICT; |
5081 | } else { |
5082 | so->so_flags &= ~SOF_NOTIFYCONFLICT; |
5083 | } |
5084 | break; |
5085 | |
5086 | case SO_RESTRICTIONS: |
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
5089 | if (error != 0) { |
5090 | goto out; |
5091 | } |
5092 | |
5093 | error = so_set_restrictions(so, optval); |
5094 | break; |
5095 | |
5096 | case SO_AWDL_UNRESTRICTED: |
5097 | if (SOCK_DOM(so) != PF_INET && |
5098 | SOCK_DOM(so) != PF_INET6) { |
5099 | error = EOPNOTSUPP; |
5100 | goto out; |
5101 | } |
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
5104 | if (error != 0) { |
5105 | goto out; |
5106 | } |
5107 | if (optval != 0) { |
5108 | error = soopt_cred_check(so, |
5109 | PRIV_NET_RESTRICTED_AWDL, false, false); |
5110 | if (error == 0) { |
5111 | inp_set_awdl_unrestricted( |
5112 | sotoinpcb(so)); |
5113 | } |
5114 | } else { |
5115 | inp_clear_awdl_unrestricted(sotoinpcb(so)); |
5116 | } |
5117 | break; |
5118 | case SO_INTCOPROC_ALLOW: |
5119 | if (SOCK_DOM(so) != PF_INET6) { |
5120 | error = EOPNOTSUPP; |
5121 | goto out; |
5122 | } |
			error = sooptcopyin(sopt, &optval, sizeof(optval),
			    sizeof(optval));
5125 | if (error != 0) { |
5126 | goto out; |
5127 | } |
5128 | if (optval != 0 && |
5129 | inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) { |
5130 | error = soopt_cred_check(so, |
5131 | PRIV_NET_RESTRICTED_INTCOPROC, false, false); |
5132 | if (error == 0) { |
5133 | inp_set_intcoproc_allowed( |
5134 | sotoinpcb(so)); |
5135 | } |
5136 | } else if (optval == 0) { |
5137 | inp_clear_intcoproc_allowed(sotoinpcb(so)); |
5138 | } |
5139 | break; |
5140 | |
5141 | case SO_LABEL: |
5142 | error = EOPNOTSUPP; |
5143 | break; |
5144 | |
5145 | case SO_UPCALLCLOSEWAIT: |
5146 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5147 | minlen: sizeof(optval)); |
5148 | if (error != 0) { |
5149 | goto out; |
5150 | } |
5151 | if (optval != 0) { |
5152 | so->so_flags |= SOF_UPCALLCLOSEWAIT; |
5153 | } else { |
5154 | so->so_flags &= ~SOF_UPCALLCLOSEWAIT; |
5155 | } |
5156 | break; |
5157 | |
5158 | case SO_RANDOMPORT: |
5159 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5160 | minlen: sizeof(optval)); |
5161 | if (error != 0) { |
5162 | goto out; |
5163 | } |
5164 | if (optval != 0) { |
5165 | so->so_flags |= SOF_BINDRANDOMPORT; |
5166 | } else { |
5167 | so->so_flags &= ~SOF_BINDRANDOMPORT; |
5168 | } |
5169 | break; |
5170 | |
5171 | case SO_NP_EXTENSIONS: { |
5172 | struct so_np_extensions sonpx; |
5173 | |
5174 | error = sooptcopyin(sopt, buf: &sonpx, len: sizeof(sonpx), |
5175 | minlen: sizeof(sonpx)); |
5176 | if (error != 0) { |
5177 | goto out; |
5178 | } |
5179 | if (sonpx.npx_mask & ~SONPX_MASK_VALID) { |
5180 | error = EINVAL; |
5181 | goto out; |
5182 | } |
5183 | /* |
5184 | * Only one bit defined for now |
5185 | */ |
5186 | if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) { |
5187 | if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) { |
5188 | so->so_flags |= SOF_NPX_SETOPTSHUT; |
5189 | } else { |
5190 | so->so_flags &= ~SOF_NPX_SETOPTSHUT; |
5191 | } |
5192 | } |
5193 | break; |
5194 | } |
5195 | |
5196 | case SO_TRAFFIC_CLASS: { |
5197 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5198 | minlen: sizeof(optval)); |
5199 | if (error != 0) { |
5200 | goto out; |
5201 | } |
5202 | if (optval >= SO_TC_NET_SERVICE_OFFSET) { |
5203 | int netsvc = optval - SO_TC_NET_SERVICE_OFFSET; |
5204 | error = so_set_net_service_type(so, netsvc); |
5205 | goto out; |
5206 | } |
5207 | error = so_set_traffic_class(so, optval); |
5208 | if (error != 0) { |
5209 | goto out; |
5210 | } |
5211 | so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE; |
5212 | so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC; |
5213 | break; |
5214 | } |
5215 | |
5216 | case SO_RECV_TRAFFIC_CLASS: { |
5217 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5218 | minlen: sizeof(optval)); |
5219 | if (error != 0) { |
5220 | goto out; |
5221 | } |
5222 | if (optval == 0) { |
5223 | so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS; |
5224 | } else { |
5225 | so->so_flags |= SOF_RECV_TRAFFIC_CLASS; |
5226 | } |
5227 | break; |
5228 | } |
5229 | |
5230 | #if (DEVELOPMENT || DEBUG) |
5231 | case SO_TRAFFIC_CLASS_DBG: { |
5232 | struct so_tcdbg so_tcdbg; |
5233 | |
5234 | error = sooptcopyin(sopt, &so_tcdbg, |
5235 | sizeof(struct so_tcdbg), sizeof(struct so_tcdbg)); |
5236 | if (error != 0) { |
5237 | goto out; |
5238 | } |
5239 | error = so_set_tcdbg(so, &so_tcdbg); |
5240 | if (error != 0) { |
5241 | goto out; |
5242 | } |
5243 | break; |
5244 | } |
5245 | #endif /* (DEVELOPMENT || DEBUG) */ |
5246 | |
5247 | case SO_PRIVILEGED_TRAFFIC_CLASS: |
5248 | error = priv_check_cred(cred: kauth_cred_get(), |
5249 | PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, flags: 0); |
5250 | if (error != 0) { |
5251 | goto out; |
5252 | } |
5253 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5254 | minlen: sizeof(optval)); |
5255 | if (error != 0) { |
5256 | goto out; |
5257 | } |
5258 | if (optval == 0) { |
5259 | so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS; |
5260 | } else { |
5261 | so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS; |
5262 | } |
5263 | break; |
5264 | |
5265 | #if (DEVELOPMENT || DEBUG) |
5266 | case SO_DEFUNCTIT: |
5267 | error = sosetdefunct(current_proc(), so, 0, FALSE); |
5268 | if (error == 0) { |
5269 | error = sodefunct(current_proc(), so, 0); |
5270 | } |
5271 | |
5272 | break; |
5273 | #endif /* (DEVELOPMENT || DEBUG) */ |
5274 | |
5275 | case SO_DEFUNCTOK: |
5276 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5277 | minlen: sizeof(optval)); |
5278 | if (error != 0 || (so->so_flags & SOF_DEFUNCT)) { |
5279 | if (error == 0) { |
5280 | error = EBADF; |
5281 | } |
5282 | goto out; |
5283 | } |
5284 | /* |
5285 | * Any process can set SO_DEFUNCTOK (clear |
5286 | * SOF_NODEFUNCT), but only root can clear |
5287 | * SO_DEFUNCTOK (set SOF_NODEFUNCT). |
5288 | */ |
5289 | if (optval == 0 && |
5290 | kauth_cred_issuser(cred: kauth_cred_get()) == 0) { |
5291 | error = EPERM; |
5292 | goto out; |
5293 | } |
5294 | if (optval) { |
5295 | so->so_flags &= ~SOF_NODEFUNCT; |
5296 | } else { |
5297 | so->so_flags |= SOF_NODEFUNCT; |
5298 | } |
5299 | |
5300 | if (SOCK_DOM(so) == PF_INET || |
5301 | SOCK_DOM(so) == PF_INET6) { |
5302 | char s[MAX_IPv6_STR_LEN]; |
5303 | char d[MAX_IPv6_STR_LEN]; |
5304 | struct inpcb *inp = sotoinpcb(so); |
5305 | |
5306 | SODEFUNCTLOG("%s[%d, %s]: so 0x%llu " |
5307 | "[%s %s:%d -> %s:%d] is now marked " |
5308 | "as %seligible for " |
5309 | "defunct\n" , __func__, proc_selfpid(), |
5310 | proc_best_name(current_proc()), |
5311 | so->so_gencnt, |
5312 | (SOCK_TYPE(so) == SOCK_STREAM) ? |
5313 | "TCP" : "UDP" , inet_ntop(SOCK_DOM(so), |
5314 | ((SOCK_DOM(so) == PF_INET) ? |
5315 | (void *)&inp->inp_laddr.s_addr : |
5316 | (void *)&inp->in6p_laddr), s, sizeof(s)), |
5317 | ntohs(inp->in6p_lport), |
5318 | inet_ntop(SOCK_DOM(so), |
5319 | (SOCK_DOM(so) == PF_INET) ? |
5320 | (void *)&inp->inp_faddr.s_addr : |
5321 | (void *)&inp->in6p_faddr, d, sizeof(d)), |
5322 | ntohs(inp->in6p_fport), |
5323 | (so->so_flags & SOF_NODEFUNCT) ? |
5324 | "not " : "" ); |
5325 | } else { |
5326 | SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] " |
5327 | "is now marked as %seligible for " |
5328 | "defunct\n" , |
5329 | __func__, proc_selfpid(), |
5330 | proc_best_name(current_proc()), |
5331 | so->so_gencnt, |
5332 | SOCK_DOM(so), SOCK_TYPE(so), |
5333 | (so->so_flags & SOF_NODEFUNCT) ? |
5334 | "not " : "" ); |
5335 | } |
5336 | break; |
5337 | |
5338 | case SO_ISDEFUNCT: |
5339 | /* This option is not settable */ |
5340 | error = EINVAL; |
5341 | break; |
5342 | |
5343 | case SO_OPPORTUNISTIC: |
5344 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5345 | minlen: sizeof(optval)); |
5346 | if (error == 0) { |
5347 | error = so_set_opportunistic(so, optval); |
5348 | } |
5349 | break; |
5350 | |
5351 | case SO_FLUSH: |
5352 | /* This option is handled by lower layer(s) */ |
5353 | error = 0; |
5354 | break; |
5355 | |
5356 | case SO_RECV_ANYIF: |
5357 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5358 | minlen: sizeof(optval)); |
5359 | if (error == 0) { |
5360 | error = so_set_recv_anyif(so, optval); |
5361 | } |
5362 | break; |
5363 | |
5364 | case SO_TRAFFIC_MGT_BACKGROUND: { |
5365 | /* This option is handled by lower layer(s) */ |
5366 | error = 0; |
5367 | break; |
5368 | } |
5369 | |
5370 | #if FLOW_DIVERT |
5371 | case SO_FLOW_DIVERT_TOKEN: |
5372 | error = flow_divert_token_set(so, sopt); |
5373 | break; |
5374 | #endif /* FLOW_DIVERT */ |
5375 | |
5376 | |
5377 | case SO_DELEGATED: |
5378 | if ((error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5379 | minlen: sizeof(optval))) != 0) { |
5380 | break; |
5381 | } |
5382 | |
5383 | error = so_set_effective_pid(so, epid: optval, p: sopt->sopt_p, true); |
5384 | break; |
5385 | |
5386 | case SO_DELEGATED_UUID: { |
5387 | uuid_t euuid; |
5388 | |
5389 | if ((error = sooptcopyin(sopt, buf: &euuid, len: sizeof(euuid), |
5390 | minlen: sizeof(euuid))) != 0) { |
5391 | break; |
5392 | } |
5393 | |
5394 | error = so_set_effective_uuid(so, euuid, p: sopt->sopt_p, true); |
5395 | break; |
5396 | } |
5397 | |
5398 | #if NECP |
5399 | case SO_NECP_ATTRIBUTES: |
5400 | if (SOCK_DOM(so) == PF_MULTIPATH) { |
5401 | /* Handled by MPTCP itself */ |
5402 | break; |
5403 | } |
5404 | |
5405 | if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { |
5406 | error = EINVAL; |
5407 | goto out; |
5408 | } |
5409 | |
5410 | error = necp_set_socket_attributes(attributes: &sotoinpcb(so)->inp_necp_attributes, sopt); |
5411 | break; |
5412 | |
5413 | case SO_NECP_CLIENTUUID: { |
5414 | if (SOCK_DOM(so) == PF_MULTIPATH) { |
5415 | /* Handled by MPTCP itself */ |
5416 | break; |
5417 | } |
5418 | |
5419 | if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { |
5420 | error = EINVAL; |
5421 | goto out; |
5422 | } |
5423 | |
5424 | struct inpcb *inp = sotoinpcb(so); |
5425 | if (!uuid_is_null(uu: inp->necp_client_uuid)) { |
5426 | // Clear out the old client UUID if present |
5427 | necp_inpcb_remove_cb(inp); |
5428 | } |
5429 | |
5430 | error = sooptcopyin(sopt, buf: &inp->necp_client_uuid, |
5431 | len: sizeof(uuid_t), minlen: sizeof(uuid_t)); |
5432 | if (error != 0) { |
5433 | goto out; |
5434 | } |
5435 | |
5436 | if (uuid_is_null(uu: inp->necp_client_uuid)) { |
5437 | error = EINVAL; |
5438 | goto out; |
5439 | } |
5440 | |
5441 | pid_t current_pid = proc_pid(current_proc()); |
5442 | error = necp_client_register_socket_flow(pid: current_pid, |
5443 | client_id: inp->necp_client_uuid, inp); |
5444 | if (error != 0) { |
5445 | uuid_clear(uu: inp->necp_client_uuid); |
5446 | goto out; |
5447 | } |
5448 | |
5449 | if (inp->inp_lport != 0) { |
5450 | // There is a bound local port, so this is not |
5451 | // a fresh socket. Assign to the client. |
5452 | necp_client_assign_from_socket(pid: current_pid, client_id: inp->necp_client_uuid, inp); |
5453 | } |
5454 | |
5455 | break; |
5456 | } |
5457 | case SO_NECP_LISTENUUID: { |
5458 | if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { |
5459 | error = EINVAL; |
5460 | goto out; |
5461 | } |
5462 | |
5463 | struct inpcb *inp = sotoinpcb(so); |
5464 | if (!uuid_is_null(uu: inp->necp_client_uuid)) { |
5465 | error = EINVAL; |
5466 | goto out; |
5467 | } |
5468 | |
5469 | error = sooptcopyin(sopt, buf: &inp->necp_client_uuid, |
5470 | len: sizeof(uuid_t), minlen: sizeof(uuid_t)); |
5471 | if (error != 0) { |
5472 | goto out; |
5473 | } |
5474 | |
5475 | if (uuid_is_null(uu: inp->necp_client_uuid)) { |
5476 | error = EINVAL; |
5477 | goto out; |
5478 | } |
5479 | |
5480 | error = necp_client_register_socket_listener(pid: proc_pid(current_proc()), |
5481 | client_id: inp->necp_client_uuid, inp); |
5482 | if (error != 0) { |
5483 | uuid_clear(uu: inp->necp_client_uuid); |
5484 | goto out; |
5485 | } |
5486 | |
5487 | // Mark that the port registration is held by NECP |
5488 | inp->inp_flags2 |= INP2_EXTERNAL_PORT; |
5489 | |
5490 | break; |
5491 | } |
5492 | |
5493 | case SO_RESOLVER_SIGNATURE: { |
5494 | if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { |
5495 | error = EINVAL; |
5496 | goto out; |
5497 | } |
5498 | error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt); |
5499 | break; |
5500 | } |
5501 | #endif /* NECP */ |
5502 | |
5503 | case SO_EXTENDED_BK_IDLE: |
5504 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5505 | minlen: sizeof(optval)); |
5506 | if (error == 0) { |
5507 | error = so_set_extended_bk_idle(so, optval); |
5508 | } |
5509 | break; |
5510 | |
5511 | case SO_MARK_CELLFALLBACK: |
5512 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5513 | minlen: sizeof(optval)); |
5514 | if (error != 0) { |
5515 | goto out; |
5516 | } |
5517 | if (optval < 0) { |
5518 | error = EINVAL; |
5519 | goto out; |
5520 | } |
5521 | if (optval == 0) { |
5522 | so->so_flags1 &= ~SOF1_CELLFALLBACK; |
5523 | } else { |
5524 | so->so_flags1 |= SOF1_CELLFALLBACK; |
5525 | } |
5526 | break; |
5527 | |
5528 | case SO_MARK_CELLFALLBACK_UUID: |
5529 | { |
5530 | struct so_mark_cellfallback_uuid_args args; |
5531 | |
5532 | error = sooptcopyin(sopt, buf: &args, len: sizeof(args), |
5533 | minlen: sizeof(args)); |
5534 | if (error != 0) { |
5535 | goto out; |
5536 | } |
5537 | error = nstat_userland_mark_rnf_override(fuuid: args.flow_uuid, |
5538 | rnf_override: args.flow_cellfallback); |
5539 | break; |
5540 | } |
5541 | |
5542 | case SO_FALLBACK_MODE: |
5543 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5544 | minlen: sizeof(optval)); |
5545 | if (error != 0) { |
5546 | goto out; |
5547 | } |
5548 | if (optval < SO_FALLBACK_MODE_NONE || |
5549 | optval > SO_FALLBACK_MODE_PREFER) { |
5550 | error = EINVAL; |
5551 | goto out; |
5552 | } |
5553 | so->so_fallback_mode = (u_int8_t)optval; |
5554 | break; |
5555 | |
5556 | case SO_MARK_KNOWN_TRACKER: { |
5557 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5558 | minlen: sizeof(optval)); |
5559 | if (error != 0) { |
5560 | goto out; |
5561 | } |
5562 | if (optval < 0) { |
5563 | error = EINVAL; |
5564 | goto out; |
5565 | } |
5566 | if (optval == 0) { |
5567 | so->so_flags1 &= ~SOF1_KNOWN_TRACKER; |
5568 | } else { |
5569 | so->so_flags1 |= SOF1_KNOWN_TRACKER; |
5570 | } |
5571 | break; |
5572 | } |
5573 | |
5574 | case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: { |
5575 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5576 | minlen: sizeof(optval)); |
5577 | if (error != 0) { |
5578 | goto out; |
5579 | } |
5580 | if (optval < 0) { |
5581 | error = EINVAL; |
5582 | goto out; |
5583 | } |
5584 | if (optval == 0) { |
5585 | so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED; |
5586 | } else { |
5587 | so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED; |
5588 | } |
5589 | break; |
5590 | } |
5591 | |
5592 | case SO_MARK_APPROVED_APP_DOMAIN: { |
5593 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5594 | minlen: sizeof(optval)); |
5595 | if (error != 0) { |
5596 | goto out; |
5597 | } |
5598 | if (optval < 0) { |
5599 | error = EINVAL; |
5600 | goto out; |
5601 | } |
5602 | if (optval == 0) { |
5603 | so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN; |
5604 | } else { |
5605 | so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN; |
5606 | } |
5607 | break; |
5608 | } |
5609 | |
5610 | case SO_STATISTICS_EVENT: |
5611 | error = sooptcopyin(sopt, buf: &long_optval, |
5612 | len: sizeof(long_optval), minlen: sizeof(long_optval)); |
5613 | if (error != 0) { |
5614 | goto out; |
5615 | } |
5616 | u_int64_t nstat_event = 0; |
5617 | error = so_statistics_event_to_nstat_event( |
5618 | input_options: &long_optval, nstat_event: &nstat_event); |
5619 | if (error != 0) { |
5620 | goto out; |
5621 | } |
5622 | nstat_pcb_event(sotoinpcb(so), event: nstat_event); |
5623 | break; |
5624 | |
5625 | case SO_NET_SERVICE_TYPE: { |
5626 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5627 | minlen: sizeof(optval)); |
5628 | if (error != 0) { |
5629 | goto out; |
5630 | } |
5631 | error = so_set_net_service_type(so, optval); |
5632 | break; |
5633 | } |
5634 | |
5635 | case SO_QOSMARKING_POLICY_OVERRIDE: |
5636 | error = priv_check_cred(cred: kauth_cred_get(), |
5637 | PRIV_NET_QOSMARKING_POLICY_OVERRIDE, flags: 0); |
5638 | if (error != 0) { |
5639 | goto out; |
5640 | } |
5641 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5642 | minlen: sizeof(optval)); |
5643 | if (error != 0) { |
5644 | goto out; |
5645 | } |
5646 | if (optval == 0) { |
5647 | so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE; |
5648 | } else { |
5649 | so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE; |
5650 | } |
5651 | break; |
5652 | |
5653 | case SO_MPKL_SEND_INFO: { |
5654 | struct so_mpkl_send_info so_mpkl_send_info; |
5655 | |
5656 | error = sooptcopyin(sopt, buf: &so_mpkl_send_info, |
5657 | len: sizeof(struct so_mpkl_send_info), minlen: sizeof(struct so_mpkl_send_info)); |
5658 | if (error != 0) { |
5659 | goto out; |
5660 | } |
5661 | uuid_copy(dst: so->so_mpkl_send_uuid, src: so_mpkl_send_info.mpkl_uuid); |
5662 | so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto; |
5663 | |
5664 | if (uuid_is_null(uu: so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) { |
5665 | so->so_flags1 &= ~SOF1_MPKL_SEND_INFO; |
5666 | } else { |
5667 | so->so_flags1 |= SOF1_MPKL_SEND_INFO; |
5668 | } |
5669 | break; |
5670 | } |
5671 | case SO_WANT_KEV_SOCKET_CLOSED: { |
5672 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5673 | minlen: sizeof(optval)); |
5674 | if (error != 0) { |
5675 | goto out; |
5676 | } |
5677 | if (optval == 0) { |
5678 | so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED; |
5679 | } else { |
5680 | so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED; |
5681 | } |
5682 | break; |
5683 | } |
5684 | case SO_MARK_WAKE_PKT: { |
5685 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5686 | minlen: sizeof(optval)); |
5687 | if (error != 0) { |
5688 | goto out; |
5689 | } |
5690 | if (optval == 0) { |
5691 | so->so_flags &= ~SOF_MARK_WAKE_PKT; |
5692 | } else { |
5693 | so->so_flags |= SOF_MARK_WAKE_PKT; |
5694 | } |
5695 | break; |
5696 | } |
5697 | case SO_RECV_WAKE_PKT: { |
5698 | error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval), |
5699 | minlen: sizeof(optval)); |
5700 | if (error != 0) { |
5701 | goto out; |
5702 | } |
5703 | if (optval == 0) { |
5704 | so->so_flags &= ~SOF_RECV_WAKE_PKT; |
5705 | } else { |
5706 | so->so_flags |= SOF_RECV_WAKE_PKT; |
5707 | } |
5708 | break; |
5709 | } |
5710 | case SO_APPLICATION_ID: { |
5711 | so_application_id_t application_id = { 0 }; |
5712 | |
5713 | if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { |
5714 | error = EINVAL; |
5715 | goto out; |
5716 | } |
5717 | error = sooptcopyin(sopt, buf: &application_id, len: sizeof(application_id), |
5718 | minlen: sizeof(application_id)); |
5719 | if (error != 0) { |
5720 | goto out; |
5721 | } |
5722 | |
5723 | // The user needs to match |
5724 | if (kauth_cred_getuid(cred: so->so_cred) != application_id.uid) { |
5725 | error = EINVAL; |
5726 | printf("setsockopt: SO_APPLICATION_ID - wrong uid" ); |
5727 | goto out; |
5728 | } |
5729 | error = so_set_effective_uuid(so, euuid: application_id.effective_uuid, p: sopt->sopt_p, true); |
5730 | if (error != 0) { |
5731 | printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid" ); |
5732 | goto out; |
5733 | } |
5734 | if (application_id.persona_id != PERSONA_ID_NONE) { |
5735 | so->so_persona_id = application_id.persona_id; |
5736 | } |
5737 | break; |
5738 | } |
5739 | default: |
5740 | error = ENOPROTOOPT; |
5741 | break; |
5742 | } |
5743 | if (error == 0 && so->so_proto != NULL && |
5744 | so->so_proto->pr_ctloutput != NULL) { |
5745 | (void) so->so_proto->pr_ctloutput(so, sopt); |
5746 | } |
5747 | } |
5748 | out: |
5749 | if (dolock) { |
5750 | socket_unlock(so, refcount: 1); |
5751 | } |
5752 | return error; |
5753 | } |
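/*
 * Illustrative userland sketch (not part of this file; fd is assumed to be
 * an existing socket descriptor): the SOL_SOCKET options above are driven
 * from setsockopt(2).  For example, a process that prefers EPIPE over
 * SIGPIPE on a dead connection can do:
 *
 *	int on = 1;
 *	if (setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on)) == -1)
 *		perror("setsockopt(SO_NOSIGPIPE)");
 *
 * The value is copied in by sooptcopyin() and handled by the SO_NOSIGPIPE
 * case in sosetoptlock(), which sets SOF_NOSIGPIPE on the socket.
 */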
5754 | |
5755 | /* Helper routines for getsockopt */ |
5756 | int |
5757 | sooptcopyout(struct sockopt *sopt, void *buf, size_t len) |
5758 | { |
5759 | int error; |
5760 | size_t valsize; |
5761 | |
5762 | error = 0; |
5763 | |
5764 | /* |
5765 | * Documented get behavior is that we always return a value, |
5766 | * possibly truncated to fit in the user's buffer. |
5767 | * Traditional behavior is that we always tell the user |
5768 | * precisely how much we copied, rather than something useful |
5769 | * like the total amount we had available for her. |
* Note that this interface is not idempotent; the entire answer must
* be generated ahead of time.
5772 | */ |
5773 | valsize = MIN(len, sopt->sopt_valsize); |
5774 | sopt->sopt_valsize = valsize; |
5775 | if (sopt->sopt_val != USER_ADDR_NULL) { |
5776 | if (sopt->sopt_p != kernproc) { |
5777 | error = copyout(buf, sopt->sopt_val, valsize); |
5778 | } else { |
5779 | bcopy(src: buf, CAST_DOWN(caddr_t, sopt->sopt_val), n: valsize); |
5780 | } |
5781 | } |
5782 | return error; |
5783 | } |
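/*
 * Illustrative userland sketch (not part of this file): as the comment
 * above notes, a getsockopt(2) result is truncated to fit the caller's
 * buffer, and the returned length reflects how much was actually copied:
 *
 *	int value = 0;
 *	socklen_t len = sizeof(value);
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &value, &len) == 0)
 *		printf("SO_RCVBUF = %d (%u bytes copied out)\n",
 *		    value, (unsigned int)len);
 */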
5784 | |
5785 | static int |
5786 | sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p) |
5787 | { |
5788 | int error; |
5789 | size_t len; |
5790 | struct user64_timeval tv64 = {}; |
5791 | struct user32_timeval tv32 = {}; |
5792 | const void * val; |
5793 | size_t valsize; |
5794 | |
5795 | error = 0; |
5796 | if (proc_is64bit(sopt->sopt_p)) { |
5797 | len = sizeof(tv64); |
5798 | tv64.tv_sec = tv_p->tv_sec; |
5799 | tv64.tv_usec = tv_p->tv_usec; |
5800 | val = &tv64; |
5801 | } else { |
5802 | len = sizeof(tv32); |
5803 | tv32.tv_sec = (user32_time_t)tv_p->tv_sec; |
5804 | tv32.tv_usec = tv_p->tv_usec; |
5805 | val = &tv32; |
5806 | } |
5807 | valsize = MIN(len, sopt->sopt_valsize); |
5808 | sopt->sopt_valsize = valsize; |
5809 | if (sopt->sopt_val != USER_ADDR_NULL) { |
5810 | if (sopt->sopt_p != kernproc) { |
5811 | error = copyout(val, sopt->sopt_val, valsize); |
5812 | } else { |
5813 | bcopy(src: val, CAST_DOWN(caddr_t, sopt->sopt_val), n: valsize); |
5814 | } |
5815 | } |
5816 | return error; |
5817 | } |
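/*
 * Illustrative userland sketch (not part of this file): SO_SNDTIMEO and
 * SO_RCVTIMEO results flow through sooptcopyout_timeval() above, which
 * picks the user32 or user64 timeval layout to match the calling process:
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *	if (getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len) == 0)
 *		printf("receive timeout: %ld.%06d s\n",
 *		    (long)tv.tv_sec, (int)tv.tv_usec);
 */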
5818 | |
5819 | /* |
5820 | * Return: 0 Success |
5821 | * ENOPROTOOPT |
5822 | * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX] |
5823 | * <pr_ctloutput>:??? |
5824 | * <sf_getoption>:??? |
5825 | */ |
5826 | int |
5827 | sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock) |
5828 | { |
5829 | int error, optval; |
5830 | struct linger l; |
5831 | struct timeval tv; |
5832 | |
5833 | if (sopt->sopt_dir != SOPT_GET) { |
5834 | sopt->sopt_dir = SOPT_GET; |
5835 | } |
5836 | |
5837 | if (dolock) { |
5838 | socket_lock(so, refcount: 1); |
5839 | } |
5840 | |
5841 | error = sflt_getsockopt(so, sopt); |
5842 | if (error != 0) { |
5843 | if (error == EJUSTRETURN) { |
5844 | error = 0; |
5845 | } |
5846 | goto out; |
5847 | } |
5848 | |
5849 | if (sopt->sopt_level != SOL_SOCKET) { |
5850 | if (so->so_proto != NULL && |
5851 | so->so_proto->pr_ctloutput != NULL) { |
5852 | error = (*so->so_proto->pr_ctloutput)(so, sopt); |
5853 | goto out; |
5854 | } |
5855 | error = ENOPROTOOPT; |
5856 | } else { |
5857 | /* |
5858 | * Allow socket-level (SOL_SOCKET) options to be filtered by |
5859 | * the protocol layer, if needed. A zero value returned from |
5860 | * the handler means use default socket-level processing as |
5861 | * done by the rest of this routine. Otherwise, any other |
5862 | * return value indicates that the option is unsupported. |
5863 | */ |
5864 | if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs-> |
5865 | pru_socheckopt(so, sopt)) != 0) { |
5866 | goto out; |
5867 | } |
5868 | |
5869 | error = 0; |
5870 | switch (sopt->sopt_name) { |
5871 | case SO_LINGER: |
5872 | case SO_LINGER_SEC: |
5873 | l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0); |
5874 | l.l_linger = (sopt->sopt_name == SO_LINGER) ? |
5875 | so->so_linger : so->so_linger / hz; |
5876 | error = sooptcopyout(sopt, buf: &l, len: sizeof(l)); |
5877 | break; |
5878 | |
5879 | case SO_USELOOPBACK: |
5880 | case SO_DONTROUTE: |
5881 | case SO_DEBUG: |
5882 | case SO_KEEPALIVE: |
5883 | case SO_REUSEADDR: |
5884 | case SO_REUSEPORT: |
5885 | case SO_BROADCAST: |
5886 | case SO_OOBINLINE: |
5887 | case SO_TIMESTAMP: |
5888 | case SO_TIMESTAMP_MONOTONIC: |
5889 | case SO_TIMESTAMP_CONTINUOUS: |
5890 | case SO_DONTTRUNC: |
5891 | case SO_WANTMORE: |
5892 | case SO_WANTOOBFLAG: |
5893 | case SO_NOWAKEFROMSLEEP: |
5894 | case SO_NOAPNFALLBK: |
5895 | optval = so->so_options & sopt->sopt_name; |
5896 | integer: |
5897 | error = sooptcopyout(sopt, buf: &optval, len: sizeof(optval)); |
5898 | break; |
5899 | |
5900 | case SO_TYPE: |
5901 | optval = so->so_type; |
5902 | goto integer; |
5903 | |
5904 | case SO_NREAD: |
5905 | if (so->so_proto->pr_flags & PR_ATOMIC) { |
5906 | int pkt_total; |
5907 | struct mbuf *m1; |
5908 | |
5909 | pkt_total = 0; |
5910 | m1 = so->so_rcv.sb_mb; |
5911 | while (m1 != NULL) { |
5912 | if (m_has_mtype(m: m1, mtype_flags: MTF_DATA | MTF_HEADER | MTF_OOBDATA)) { |
5913 | pkt_total += m1->m_len; |
5914 | } |
5915 | m1 = m1->m_next; |
5916 | } |
5917 | optval = pkt_total; |
5918 | } else { |
5919 | optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; |
5920 | } |
5921 | goto integer; |
5922 | |
5923 | case SO_NUMRCVPKT: |
5924 | if (so->so_proto->pr_flags & PR_ATOMIC) { |
5925 | int cnt = 0; |
5926 | struct mbuf *m1; |
5927 | |
5928 | m1 = so->so_rcv.sb_mb; |
5929 | while (m1 != NULL) { |
5930 | cnt += 1; |
5931 | m1 = m1->m_nextpkt; |
5932 | } |
5933 | optval = cnt; |
5934 | goto integer; |
5935 | } else { |
5936 | error = ENOPROTOOPT; |
5937 | break; |
5938 | } |
5939 | |
5940 | case SO_NWRITE: |
5941 | optval = so->so_snd.sb_cc; |
5942 | goto integer; |
5943 | |
5944 | case SO_ERROR: |
5945 | optval = so->so_error; |
5946 | so->so_error = 0; |
5947 | goto integer; |
5948 | |
5949 | case SO_SNDBUF: { |
5950 | u_int32_t hiwat = so->so_snd.sb_hiwat; |
5951 | |
5952 | if (so->so_snd.sb_flags & SB_UNIX) { |
5953 | struct unpcb *unp = |
5954 | (struct unpcb *)(so->so_pcb); |
5955 | if (unp != NULL && unp->unp_conn != NULL) { |
5956 | hiwat += unp->unp_conn->unp_cc; |
5957 | } |
5958 | } |
5959 | |
5960 | optval = hiwat; |
5961 | goto integer; |
5962 | } |
5963 | case SO_RCVBUF: |
5964 | optval = so->so_rcv.sb_hiwat; |
5965 | goto integer; |
5966 | |
5967 | case SO_SNDLOWAT: |
5968 | optval = so->so_snd.sb_lowat; |
5969 | goto integer; |
5970 | |
5971 | case SO_RCVLOWAT: |
5972 | optval = so->so_rcv.sb_lowat; |
5973 | goto integer; |
5974 | |
5975 | case SO_SNDTIMEO: |
5976 | case SO_RCVTIMEO: |
5977 | tv = (sopt->sopt_name == SO_SNDTIMEO ? |
5978 | so->so_snd.sb_timeo : so->so_rcv.sb_timeo); |
5979 | |
5980 | error = sooptcopyout_timeval(sopt, tv_p: &tv); |
5981 | break; |
5982 | |
5983 | case SO_NOSIGPIPE: |
5984 | optval = (so->so_flags & SOF_NOSIGPIPE); |
5985 | goto integer; |
5986 | |
5987 | case SO_NOADDRERR: |
5988 | optval = (so->so_flags & SOF_NOADDRAVAIL); |
5989 | goto integer; |
5990 | |
5991 | case SO_REUSESHAREUID: |
5992 | optval = (so->so_flags & SOF_REUSESHAREUID); |
5993 | goto integer; |
5994 | |
5995 | |
5996 | case SO_NOTIFYCONFLICT: |
5997 | optval = (so->so_flags & SOF_NOTIFYCONFLICT); |
5998 | goto integer; |
5999 | |
6000 | case SO_RESTRICTIONS: |
6001 | optval = so_get_restrictions(so); |
6002 | goto integer; |
6003 | |
6004 | case SO_AWDL_UNRESTRICTED: |
6005 | if (SOCK_DOM(so) == PF_INET || |
6006 | SOCK_DOM(so) == PF_INET6) { |
6007 | optval = inp_get_awdl_unrestricted( |
6008 | sotoinpcb(so)); |
6009 | goto integer; |
6010 | } else { |
6011 | error = EOPNOTSUPP; |
6012 | } |
6013 | break; |
6014 | |
6015 | case SO_INTCOPROC_ALLOW: |
6016 | if (SOCK_DOM(so) == PF_INET6) { |
6017 | optval = inp_get_intcoproc_allowed( |
6018 | sotoinpcb(so)); |
6019 | goto integer; |
6020 | } else { |
6021 | error = EOPNOTSUPP; |
6022 | } |
6023 | break; |
6024 | |
6025 | case SO_LABEL: |
6026 | error = EOPNOTSUPP; |
6027 | break; |
6028 | |
6029 | case SO_PEERLABEL: |
6030 | error = EOPNOTSUPP; |
6031 | break; |
6032 | |
6033 | #ifdef __APPLE_API_PRIVATE |
6034 | case SO_UPCALLCLOSEWAIT: |
6035 | optval = (so->so_flags & SOF_UPCALLCLOSEWAIT); |
6036 | goto integer; |
6037 | #endif |
6038 | case SO_RANDOMPORT: |
6039 | optval = (so->so_flags & SOF_BINDRANDOMPORT); |
6040 | goto integer; |
6041 | |
6042 | case SO_NP_EXTENSIONS: { |
6043 | struct so_np_extensions sonpx = {}; |
6044 | |
6045 | sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ? |
6046 | SONPX_SETOPTSHUT : 0; |
6047 | sonpx.npx_mask = SONPX_MASK_VALID; |
6048 | |
6049 | error = sooptcopyout(sopt, buf: &sonpx, |
6050 | len: sizeof(struct so_np_extensions)); |
6051 | break; |
6052 | } |
6053 | |
6054 | case SO_TRAFFIC_CLASS: |
6055 | optval = so->so_traffic_class; |
6056 | goto integer; |
6057 | |
6058 | case SO_RECV_TRAFFIC_CLASS: |
6059 | optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS); |
6060 | goto integer; |
6061 | |
6062 | #if (DEVELOPMENT || DEBUG) |
6063 | case SO_TRAFFIC_CLASS_DBG: |
6064 | error = sogetopt_tcdbg(so, sopt); |
6065 | break; |
6066 | #endif /* (DEVELOPMENT || DEBUG) */ |
6067 | |
6068 | case SO_PRIVILEGED_TRAFFIC_CLASS: |
6069 | optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS); |
6070 | goto integer; |
6071 | |
6072 | case SO_DEFUNCTOK: |
6073 | optval = !(so->so_flags & SOF_NODEFUNCT); |
6074 | goto integer; |
6075 | |
6076 | case SO_ISDEFUNCT: |
6077 | optval = (so->so_flags & SOF_DEFUNCT); |
6078 | goto integer; |
6079 | |
6080 | case SO_OPPORTUNISTIC: |
6081 | optval = so_get_opportunistic(so); |
6082 | goto integer; |
6083 | |
6084 | case SO_FLUSH: |
6085 | /* This option is not gettable */ |
6086 | error = EINVAL; |
6087 | break; |
6088 | |
6089 | case SO_RECV_ANYIF: |
6090 | optval = so_get_recv_anyif(so); |
6091 | goto integer; |
6092 | |
6093 | case SO_TRAFFIC_MGT_BACKGROUND: |
6094 | /* This option is handled by lower layer(s) */ |
6095 | if (so->so_proto != NULL && |
6096 | so->so_proto->pr_ctloutput != NULL) { |
6097 | (void) so->so_proto->pr_ctloutput(so, sopt); |
6098 | } |
6099 | break; |
6100 | |
6101 | #if FLOW_DIVERT |
6102 | case SO_FLOW_DIVERT_TOKEN: |
6103 | error = flow_divert_token_get(so, sopt); |
6104 | break; |
6105 | #endif /* FLOW_DIVERT */ |
6106 | |
6107 | #if NECP |
6108 | case SO_NECP_ATTRIBUTES: |
6109 | if (SOCK_DOM(so) == PF_MULTIPATH) { |
6110 | /* Handled by MPTCP itself */ |
6111 | break; |
6112 | } |
6113 | |
6114 | if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { |
6115 | error = EINVAL; |
6116 | goto out; |
6117 | } |
6118 | |
6119 | error = necp_get_socket_attributes(attributes: &sotoinpcb(so)->inp_necp_attributes, sopt); |
6120 | break; |
6121 | |
6122 | case SO_NECP_CLIENTUUID: { |
6123 | uuid_t *ncu; |
6124 | |
6125 | if (SOCK_DOM(so) == PF_MULTIPATH) { |
6126 | ncu = &mpsotomppcb(mp_so: so)->necp_client_uuid; |
6127 | } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { |
6128 | ncu = &sotoinpcb(so)->necp_client_uuid; |
6129 | } else { |
6130 | error = EINVAL; |
6131 | goto out; |
6132 | } |
6133 | |
6134 | error = sooptcopyout(sopt, buf: ncu, len: sizeof(uuid_t)); |
6135 | break; |
6136 | } |
6137 | |
6138 | case SO_NECP_LISTENUUID: { |
6139 | uuid_t *nlu; |
6140 | |
6141 | if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { |
6142 | if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) { |
6143 | nlu = &sotoinpcb(so)->necp_client_uuid; |
6144 | } else { |
6145 | error = ENOENT; |
6146 | goto out; |
6147 | } |
6148 | } else { |
6149 | error = EINVAL; |
6150 | goto out; |
6151 | } |
6152 | |
6153 | error = sooptcopyout(sopt, buf: nlu, len: sizeof(uuid_t)); |
6154 | break; |
6155 | } |
6156 | |
6157 | case SO_RESOLVER_SIGNATURE: { |
6158 | if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { |
6159 | error = EINVAL; |
6160 | goto out; |
6161 | } |
6162 | error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt); |
6163 | break; |
6164 | } |
6165 | |
6166 | #endif /* NECP */ |
6167 | |
6168 | #if CONTENT_FILTER |
6169 | case SO_CFIL_SOCK_ID: { |
6170 | cfil_sock_id_t sock_id; |
6171 | |
6172 | sock_id = cfil_sock_id_from_socket(so); |
6173 | |
6174 | error = sooptcopyout(sopt, buf: &sock_id, |
6175 | len: sizeof(cfil_sock_id_t)); |
6176 | break; |
6177 | } |
6178 | #endif /* CONTENT_FILTER */ |
6179 | |
6180 | case SO_EXTENDED_BK_IDLE: |
6181 | optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED); |
6182 | goto integer; |
6183 | case SO_MARK_CELLFALLBACK: |
6184 | optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0) |
6185 | ? 1 : 0; |
6186 | goto integer; |
6187 | case SO_FALLBACK_MODE: |
6188 | optval = so->so_fallback_mode; |
6189 | goto integer; |
6190 | case SO_MARK_KNOWN_TRACKER: { |
6191 | optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0) |
6192 | ? 1 : 0; |
6193 | goto integer; |
6194 | } |
6195 | case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: { |
6196 | optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0) |
6197 | ? 1 : 0; |
6198 | goto integer; |
6199 | } |
6200 | case SO_MARK_APPROVED_APP_DOMAIN: { |
6201 | optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0) |
6202 | ? 1 : 0; |
6203 | goto integer; |
6204 | } |
6205 | case SO_NET_SERVICE_TYPE: { |
6206 | if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) { |
6207 | optval = so->so_netsvctype; |
6208 | } else { |
6209 | optval = NET_SERVICE_TYPE_BE; |
6210 | } |
6211 | goto integer; |
6212 | } |
6213 | case SO_NETSVC_MARKING_LEVEL: |
6214 | optval = so_get_netsvc_marking_level(so); |
6215 | goto integer; |
6216 | |
6217 | case SO_MPKL_SEND_INFO: { |
6218 | struct so_mpkl_send_info so_mpkl_send_info; |
6219 | |
6220 | uuid_copy(dst: so_mpkl_send_info.mpkl_uuid, src: so->so_mpkl_send_uuid); |
6221 | so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto; |
6222 | error = sooptcopyout(sopt, buf: &so_mpkl_send_info, |
6223 | len: sizeof(struct so_mpkl_send_info)); |
6224 | break; |
6225 | } |
6226 | case SO_MARK_WAKE_PKT: |
6227 | optval = (so->so_flags & SOF_MARK_WAKE_PKT); |
6228 | goto integer; |
6229 | case SO_RECV_WAKE_PKT: |
6230 | optval = (so->so_flags & SOF_RECV_WAKE_PKT); |
6231 | goto integer; |
6232 | case SO_APPLICATION_ID: { |
6233 | if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) { |
6234 | error = EINVAL; |
6235 | goto out; |
6236 | } |
6237 | so_application_id_t application_id = { 0 }; |
6238 | application_id.uid = kauth_cred_getuid(cred: so->so_cred); |
6239 | uuid_copy(dst: application_id.effective_uuid, src: !uuid_is_null(uu: so->e_uuid) ? so->e_uuid : so->last_uuid); |
6240 | application_id.persona_id = so->so_persona_id; |
6241 | error = sooptcopyout(sopt, buf: &application_id, len: sizeof(so_application_id_t)); |
6242 | break; |
6243 | } |
6244 | default: |
6245 | error = ENOPROTOOPT; |
6246 | break; |
6247 | } |
6248 | } |
6249 | out: |
6250 | if (dolock) { |
6251 | socket_unlock(so, refcount: 1); |
6252 | } |
6253 | return error; |
6254 | } |
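/*
 * Illustrative userland sketch (not part of this file): SO_NREAD and
 * SO_NWRITE are read-only, Darwin-specific options answered entirely by
 * sogetoptlock() above:
 *
 *	int nread = 0, nwrite = 0;
 *	socklen_t len = sizeof(int);
 *	(void) getsockopt(fd, SOL_SOCKET, SO_NREAD, &nread, &len);
 *	len = sizeof(int);
 *	(void) getsockopt(fd, SOL_SOCKET, SO_NWRITE, &nwrite, &len);
 *	printf("%d bytes readable, %d bytes queued for send\n",
 *	    nread, nwrite);
 */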
6255 | |
6256 | /* |
6257 | * The size limits on our soopt_getm is different from that on FreeBSD. |
6258 | * We limit the size of options to MCLBYTES. This will have to change |
6259 | * if we need to define options that need more space than MCLBYTES. |
6260 | */ |
6261 | int |
6262 | soopt_getm(struct sockopt *sopt, struct mbuf **mp) |
6263 | { |
6264 | struct mbuf *m, *m_prev; |
6265 | int sopt_size = (int)sopt->sopt_valsize; |
6266 | int how; |
6267 | |
6268 | if (sopt_size <= 0 || sopt_size > MCLBYTES) { |
6269 | return EMSGSIZE; |
6270 | } |
6271 | |
6272 | how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT; |
6273 | MGET(m, how, MT_DATA); |
6274 | if (m == NULL) { |
6275 | return ENOBUFS; |
6276 | } |
6277 | if (sopt_size > MLEN) { |
6278 | MCLGET(m, how); |
6279 | if ((m->m_flags & M_EXT) == 0) { |
6280 | m_free(m); |
6281 | return ENOBUFS; |
6282 | } |
6283 | m->m_len = min(MCLBYTES, b: sopt_size); |
6284 | } else { |
6285 | m->m_len = min(MLEN, b: sopt_size); |
6286 | } |
6287 | sopt_size -= m->m_len; |
6288 | *mp = m; |
6289 | m_prev = m; |
6290 | |
6291 | while (sopt_size > 0) { |
6292 | MGET(m, how, MT_DATA); |
6293 | if (m == NULL) { |
6294 | m_freem(*mp); |
6295 | return ENOBUFS; |
6296 | } |
6297 | if (sopt_size > MLEN) { |
6298 | MCLGET(m, how); |
6299 | if ((m->m_flags & M_EXT) == 0) { |
6300 | m_freem(*mp); |
6301 | m_freem(m); |
6302 | return ENOBUFS; |
6303 | } |
6304 | m->m_len = min(MCLBYTES, b: sopt_size); |
6305 | } else { |
6306 | m->m_len = min(MLEN, b: sopt_size); |
6307 | } |
6308 | sopt_size -= m->m_len; |
6309 | m_prev->m_next = m; |
6310 | m_prev = m; |
6311 | } |
6312 | return 0; |
6313 | } |
6314 | |
6315 | /* copyin sopt data into mbuf chain */ |
6316 | int |
6317 | soopt_mcopyin(struct sockopt *sopt, struct mbuf *m) |
6318 | { |
6319 | struct mbuf *m0 = m; |
6320 | |
6321 | if (sopt->sopt_val == USER_ADDR_NULL) { |
6322 | return 0; |
6323 | } |
6324 | while (m != NULL && sopt->sopt_valsize >= m->m_len) { |
6325 | if (sopt->sopt_p != kernproc) { |
6326 | int error; |
6327 | |
6328 | error = copyin(sopt->sopt_val, mtod(m, char *), |
6329 | m->m_len); |
6330 | if (error != 0) { |
6331 | m_freem(m0); |
6332 | return error; |
6333 | } |
6334 | } else { |
6335 | bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), |
6336 | mtod(m, char *), n: m->m_len); |
6337 | } |
6338 | sopt->sopt_valsize -= m->m_len; |
6339 | sopt->sopt_val += m->m_len; |
6340 | m = m->m_next; |
6341 | } |
/* enough space should have been allocated at ip6_sooptmcopyin() */
6343 | if (m != NULL) { |
6344 | panic("soopt_mcopyin" ); |
6345 | /* NOTREACHED */ |
6346 | } |
6347 | return 0; |
6348 | } |
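/*
 * Sketch of the typical in-kernel caller pattern for the two helpers
 * above (a hedged illustration, modeled on how the IPv6 option code uses
 * them): first size an mbuf chain for the option, then fill it from the
 * user buffer.
 *
 *	struct mbuf *m = NULL;
 *	int error = soopt_getm(sopt, &m);	// chain limited to MCLBYTES
 *	if (error == 0)
 *		error = soopt_mcopyin(sopt, m);	// frees the chain on error
 *	if (error == 0) {
 *		// ... consume the chain, m_freem() it when done ...
 *	}
 */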
6349 | |
6350 | /* copyout mbuf chain data into soopt */ |
6351 | int |
6352 | soopt_mcopyout(struct sockopt *sopt, struct mbuf *m) |
6353 | { |
6354 | struct mbuf *m0 = m; |
6355 | size_t valsize = 0; |
6356 | |
6357 | if (sopt->sopt_val == USER_ADDR_NULL) { |
6358 | return 0; |
6359 | } |
6360 | while (m != NULL && sopt->sopt_valsize >= m->m_len) { |
6361 | if (sopt->sopt_p != kernproc) { |
6362 | int error; |
6363 | |
6364 | error = copyout(mtod(m, char *), sopt->sopt_val, |
6365 | m->m_len); |
6366 | if (error != 0) { |
6367 | m_freem(m0); |
6368 | return error; |
6369 | } |
6370 | } else { |
6371 | bcopy(mtod(m, char *), |
6372 | CAST_DOWN(caddr_t, sopt->sopt_val), n: m->m_len); |
6373 | } |
6374 | sopt->sopt_valsize -= m->m_len; |
6375 | sopt->sopt_val += m->m_len; |
6376 | valsize += m->m_len; |
6377 | m = m->m_next; |
6378 | } |
6379 | if (m != NULL) { |
6380 | /* enough soopt buffer should be given from user-land */ |
6381 | m_freem(m0); |
6382 | return EINVAL; |
6383 | } |
6384 | sopt->sopt_valsize = valsize; |
6385 | return 0; |
6386 | } |
6387 | |
6388 | void |
6389 | sohasoutofband(struct socket *so) |
6390 | { |
6391 | if (so->so_pgid < 0) { |
6392 | gsignal(pgid: -so->so_pgid, SIGURG); |
6393 | } else if (so->so_pgid > 0) { |
6394 | proc_signal(pid: so->so_pgid, SIGURG); |
6395 | } |
6396 | selwakeup(&so->so_rcv.sb_sel); |
6397 | if (so->so_rcv.sb_flags & SB_KNOTE) { |
6398 | KNOTE(&so->so_rcv.sb_sel.si_note, |
6399 | (NOTE_OOB | SO_FILT_HINT_LOCKED)); |
6400 | } |
6401 | } |
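/*
 * Illustrative userland sketch (not part of this file): sohasoutofband()
 * signals SIGURG to the socket's owner, so a process that wants the
 * notification must claim ownership first (handle_urg is a hypothetical
 * user-supplied handler):
 *
 *	signal(SIGURG, handle_urg);
 *	fcntl(fd, F_SETOWN, getpid());
 *	// ... in or after the handler, recv(fd, &c, 1, MSG_OOB) fetches
 *	// the out-of-band byte.
 */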
6402 | |
6403 | int |
6404 | sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql) |
6405 | { |
6406 | #pragma unused(cred) |
6407 | struct proc *p = current_proc(); |
6408 | int revents = 0; |
6409 | |
6410 | socket_lock(so, refcount: 1); |
6411 | so_update_last_owner_locked(so, PROC_NULL); |
6412 | so_update_policy(so); |
6413 | |
6414 | if (events & (POLLIN | POLLRDNORM)) { |
6415 | if (soreadable(so)) { |
6416 | revents |= events & (POLLIN | POLLRDNORM); |
6417 | } |
6418 | } |
6419 | |
6420 | if (events & (POLLOUT | POLLWRNORM)) { |
6421 | if (sowriteable(so)) { |
6422 | revents |= events & (POLLOUT | POLLWRNORM); |
6423 | } |
6424 | } |
6425 | |
6426 | if (events & (POLLPRI | POLLRDBAND)) { |
6427 | if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) { |
6428 | revents |= events & (POLLPRI | POLLRDBAND); |
6429 | } |
6430 | } |
6431 | |
6432 | if (revents == 0) { |
6433 | if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { |
6434 | /* |
6435 | * Darwin sets the flag first, |
6436 | * BSD calls selrecord first |
6437 | */ |
6438 | so->so_rcv.sb_flags |= SB_SEL; |
6439 | selrecord(selector: p, &so->so_rcv.sb_sel, wql); |
6440 | } |
6441 | |
6442 | if (events & (POLLOUT | POLLWRNORM)) { |
6443 | /* |
6444 | * Darwin sets the flag first, |
6445 | * BSD calls selrecord first |
6446 | */ |
6447 | so->so_snd.sb_flags |= SB_SEL; |
6448 | selrecord(selector: p, &so->so_snd.sb_sel, wql); |
6449 | } |
6450 | } |
6451 | |
6452 | socket_unlock(so, refcount: 1); |
6453 | return revents; |
6454 | } |
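/*
 * Illustrative userland sketch (not part of this file): the
 * POLLPRI / POLLRDBAND branch above corresponds to out-of-band data
 * being pending (so_oobmark set or SS_RCVATMARK):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLPRI };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLPRI)) {
 *		char oob;
 *		(void) recv(fd, &oob, 1, MSG_OOB);
 *	}
 */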
6455 | |
6456 | int |
6457 | soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev) |
6458 | { |
6459 | struct socket *so = (struct socket *)fp_get_data(fp); |
6460 | int result; |
6461 | |
6462 | socket_lock(so, refcount: 1); |
6463 | so_update_last_owner_locked(so, PROC_NULL); |
6464 | so_update_policy(so); |
6465 | |
6466 | switch (kn->kn_filter) { |
6467 | case EVFILT_READ: |
6468 | kn->kn_filtid = EVFILTID_SOREAD; |
6469 | break; |
6470 | case EVFILT_WRITE: |
6471 | kn->kn_filtid = EVFILTID_SOWRITE; |
6472 | break; |
6473 | case EVFILT_SOCK: |
6474 | kn->kn_filtid = EVFILTID_SCK; |
6475 | break; |
6476 | case EVFILT_EXCEPT: |
6477 | kn->kn_filtid = EVFILTID_SOEXCEPT; |
6478 | break; |
6479 | default: |
6480 | socket_unlock(so, refcount: 1); |
6481 | knote_set_error(kn, EINVAL); |
6482 | return 0; |
6483 | } |
6484 | |
6485 | /* |
6486 | * call the appropriate sub-filter attach |
6487 | * with the socket still locked |
6488 | */ |
6489 | result = knote_fops(kn)->f_attach(kn, kev); |
6490 | |
6491 | socket_unlock(so, refcount: 1); |
6492 | |
6493 | return result; |
6494 | } |
6495 | |
6496 | static int |
6497 | filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so) |
6498 | { |
6499 | int retval = 0; |
6500 | int64_t data = 0; |
6501 | |
6502 | if (so->so_options & SO_ACCEPTCONN) { |
6503 | /* |
* Radar 6615193: handle the listen case dynamically for the
* kqueue read filter. This allows listen() to be called after
* the kqueue EVFILT_READ filter has been registered.
6507 | */ |
6508 | |
6509 | retval = !TAILQ_EMPTY(&so->so_comp); |
6510 | data = so->so_qlen; |
6511 | goto out; |
6512 | } |
6513 | |
6514 | /* socket isn't a listener */ |
6515 | /* |
6516 | * NOTE_LOWAT specifies new low water mark in data, i.e. |
6517 | * the bytes of protocol data. We therefore exclude any |
6518 | * control bytes. |
6519 | */ |
6520 | data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl; |
6521 | |
6522 | if (kn->kn_sfflags & NOTE_OOB) { |
6523 | if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) { |
6524 | kn->kn_fflags |= NOTE_OOB; |
6525 | data -= so->so_oobmark; |
6526 | retval = 1; |
6527 | goto out; |
6528 | } |
6529 | } |
6530 | |
6531 | if ((so->so_state & SS_CANTRCVMORE) |
6532 | #if CONTENT_FILTER |
6533 | && cfil_sock_data_pending(sb: &so->so_rcv) == 0 |
6534 | #endif /* CONTENT_FILTER */ |
6535 | ) { |
6536 | kn->kn_flags |= EV_EOF; |
6537 | kn->kn_fflags = so->so_error; |
6538 | retval = 1; |
6539 | goto out; |
6540 | } |
6541 | |
6542 | if (so->so_error) { /* temporary udp error */ |
6543 | retval = 1; |
6544 | goto out; |
6545 | } |
6546 | |
6547 | int64_t lowwat = so->so_rcv.sb_lowat; |
6548 | /* |
6549 | * Ensure that when NOTE_LOWAT is used, the derived |
6550 | * low water mark is bounded by socket's rcv buf's |
6551 | * high and low water mark values. |
6552 | */ |
6553 | if (kn->kn_sfflags & NOTE_LOWAT) { |
6554 | if (kn->kn_sdata > so->so_rcv.sb_hiwat) { |
6555 | lowwat = so->so_rcv.sb_hiwat; |
6556 | } else if (kn->kn_sdata > lowwat) { |
6557 | lowwat = kn->kn_sdata; |
6558 | } |
6559 | } |
6560 | |
6561 | /* |
6562 | * While the `data` field is the amount of data to read, |
6563 | * 0-sized packets need to wake up the kqueue, see 58140856, |
6564 | * so we need to take control bytes into account too. |
6565 | */ |
6566 | retval = (so->so_rcv.sb_cc >= lowwat); |
6567 | |
6568 | out: |
6569 | if (retval && kev) { |
6570 | knote_fill_kevent(kn, kev, data); |
6571 | } |
6572 | return retval; |
6573 | } |
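/*
 * Illustrative userland sketch (not part of this file; kq is assumed to be
 * an existing kqueue descriptor): the NOTE_LOWAT clamping above means an
 * EVFILT_READ knote fires once at least min(requested lowat, sb_hiwat)
 * bytes are queued:
 *
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 *	// a later kevent() call reports this knote once >= 4096 bytes
 *	// (or sb_hiwat, whichever is smaller) are readable.
 */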
6574 | |
6575 | static int |
6576 | filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev) |
6577 | { |
6578 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6579 | |
6580 | /* socket locked */ |
6581 | |
6582 | /* |
6583 | * If the caller explicitly asked for OOB results (e.g. poll()) |
* from EVFILT_READ, then save that off in the kn_hook32 field
6585 | * and reserve the kn_flags EV_OOBAND bit for output only. |
6586 | */ |
6587 | if (kn->kn_filter == EVFILT_READ && |
6588 | kn->kn_flags & EV_OOBAND) { |
6589 | kn->kn_flags &= ~EV_OOBAND; |
6590 | kn->kn_hook32 = EV_OOBAND; |
6591 | } else { |
6592 | kn->kn_hook32 = 0; |
6593 | } |
6594 | if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) { |
6595 | so->so_rcv.sb_flags |= SB_KNOTE; |
6596 | } |
6597 | |
6598 | /* indicate if event is already fired */ |
6599 | return filt_soread_common(kn, NULL, so); |
6600 | } |
6601 | |
6602 | static void |
6603 | filt_sordetach(struct knote *kn) |
6604 | { |
6605 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6606 | |
6607 | socket_lock(so, refcount: 1); |
6608 | if (so->so_rcv.sb_flags & SB_KNOTE) { |
6609 | if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) { |
6610 | so->so_rcv.sb_flags &= ~SB_KNOTE; |
6611 | } |
6612 | } |
6613 | socket_unlock(so, refcount: 1); |
6614 | } |
6615 | |
6616 | /*ARGSUSED*/ |
6617 | static int |
6618 | filt_soread(struct knote *kn, long hint) |
6619 | { |
6620 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6621 | int retval; |
6622 | |
6623 | if ((hint & SO_FILT_HINT_LOCKED) == 0) { |
6624 | socket_lock(so, refcount: 1); |
6625 | } |
6626 | |
6627 | retval = filt_soread_common(kn, NULL, so); |
6628 | |
6629 | if ((hint & SO_FILT_HINT_LOCKED) == 0) { |
6630 | socket_unlock(so, refcount: 1); |
6631 | } |
6632 | |
6633 | return retval; |
6634 | } |
6635 | |
6636 | static int |
6637 | filt_sortouch(struct knote *kn, struct kevent_qos_s *kev) |
6638 | { |
6639 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6640 | int retval; |
6641 | |
6642 | socket_lock(so, refcount: 1); |
6643 | |
6644 | /* save off the new input fflags and data */ |
6645 | kn->kn_sfflags = kev->fflags; |
6646 | kn->kn_sdata = kev->data; |
6647 | |
6648 | /* determine if changes result in fired events */ |
6649 | retval = filt_soread_common(kn, NULL, so); |
6650 | |
6651 | socket_unlock(so, refcount: 1); |
6652 | |
6653 | return retval; |
6654 | } |
6655 | |
6656 | static int |
6657 | filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev) |
6658 | { |
6659 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6660 | int retval; |
6661 | |
6662 | socket_lock(so, refcount: 1); |
6663 | retval = filt_soread_common(kn, kev, so); |
6664 | socket_unlock(so, refcount: 1); |
6665 | |
6666 | return retval; |
6667 | } |
6668 | |
6669 | int |
6670 | so_wait_for_if_feedback(struct socket *so) |
6671 | { |
6672 | if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) && |
6673 | (so->so_state & SS_ISCONNECTED)) { |
6674 | struct inpcb *inp = sotoinpcb(so); |
6675 | if (INP_WAIT_FOR_IF_FEEDBACK(inp)) { |
6676 | return 1; |
6677 | } |
6678 | } |
6679 | return 0; |
6680 | } |
6681 | |
6682 | static int |
6683 | filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so) |
6684 | { |
6685 | int ret = 0; |
6686 | int64_t data = sbspace(sb: &so->so_snd); |
6687 | |
6688 | if (so->so_state & SS_CANTSENDMORE) { |
6689 | kn->kn_flags |= EV_EOF; |
6690 | kn->kn_fflags = so->so_error; |
6691 | ret = 1; |
6692 | goto out; |
6693 | } |
6694 | |
6695 | if (so->so_error) { /* temporary udp error */ |
6696 | ret = 1; |
6697 | goto out; |
6698 | } |
6699 | |
6700 | if (!socanwrite(so)) { |
6701 | ret = 0; |
6702 | goto out; |
6703 | } |
6704 | |
6705 | if (so->so_flags1 & SOF1_PRECONNECT_DATA) { |
6706 | ret = 1; |
6707 | goto out; |
6708 | } |
6709 | |
6710 | int64_t lowwat = so->so_snd.sb_lowat; |
6711 | const int64_t hiwat = so->so_snd.sb_hiwat; |
6712 | /* |
6713 | * Deal with connected UNIX domain sockets which |
6714 | * rely on the fact that the sender's socket buffer is |
6715 | * actually the receiver's socket buffer. |
6716 | */ |
6717 | if (SOCK_DOM(so) == PF_LOCAL) { |
6718 | struct unpcb *unp = sotounpcb(so); |
6719 | if (unp != NULL && unp->unp_conn != NULL && |
6720 | unp->unp_conn->unp_socket != NULL) { |
6721 | struct socket *so2 = unp->unp_conn->unp_socket; |
6722 | /* |
6723 | * At this point we know that `so' is locked |
6724 | * and that `unp_conn` isn't going to change. |
6725 | * However, we don't lock `so2` because doing so |
6726 | * may require unlocking `so' |
6727 | * (see unp_get_locks_in_order()). |
6728 | * |
6729 | * Two cases can happen: |
6730 | * |
6731 | * 1) we return 1 and tell the application that |
6732 | * it can write. Meanwhile, another thread |
6733 | * fills up the socket buffer. This will either |
6734 | * lead to a blocking send or EWOULDBLOCK |
6735 | * which the application should deal with. |
6736 | * 2) we return 0 and tell the application that |
6737 | * the socket is not writable. Meanwhile, |
6738 | * another thread depletes the receive socket |
6739 | * buffer. In this case the application will |
6740 | * be woken up by sb_notify(). |
6741 | * |
6742 | * MIN() is required because otherwise sosendcheck() |
6743 | * may return EWOULDBLOCK since it only considers |
6744 | * so->so_snd. |
6745 | */ |
6746 | data = MIN(data, sbspace(&so2->so_rcv)); |
6747 | } |
6748 | } |
6749 | |
6750 | if (kn->kn_sfflags & NOTE_LOWAT) { |
6751 | if (kn->kn_sdata > hiwat) { |
6752 | lowwat = hiwat; |
6753 | } else if (kn->kn_sdata > lowwat) { |
6754 | lowwat = kn->kn_sdata; |
6755 | } |
6756 | } |
6757 | |
6758 | if (data > 0 && data >= lowwat) { |
6759 | if ((so->so_flags & SOF_NOTSENT_LOWAT) |
6760 | #if (DEBUG || DEVELOPMENT) |
6761 | && so_notsent_lowat_check == 1 |
6762 | #endif /* DEBUG || DEVELOPMENT */ |
6763 | ) { |
6764 | if ((SOCK_DOM(so) == PF_INET || |
6765 | SOCK_DOM(so) == PF_INET6) && |
6766 | so->so_type == SOCK_STREAM) { |
6767 | ret = tcp_notsent_lowat_check(so); |
6768 | } |
6769 | #if MPTCP |
6770 | else if ((SOCK_DOM(so) == PF_MULTIPATH) && |
6771 | (SOCK_PROTO(so) == IPPROTO_TCP)) { |
6772 | ret = mptcp_notsent_lowat_check(so); |
6773 | } |
6774 | #endif |
6775 | else { |
6776 | ret = 1; |
6777 | goto out; |
6778 | } |
6779 | } else { |
6780 | ret = 1; |
6781 | } |
6782 | } |
6783 | if (so_wait_for_if_feedback(so)) { |
6784 | ret = 0; |
6785 | } |
6786 | |
6787 | out: |
6788 | if (ret && kev) { |
6789 | knote_fill_kevent(kn, kev, data); |
6790 | } |
6791 | return ret; |
6792 | } |
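/*
 * Illustrative userland sketch (not part of this file): with
 * SOF_NOTSENT_LOWAT set on a TCP socket (via the TCP_NOTSENT_LOWAT
 * option), the EVFILT_WRITE path above defers firing until the amount of
 * unsent data drops below the configured threshold:
 *
 *	int lowat = 16 * 1024;
 *	setsockopt(fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT, &lowat, sizeof(lowat));
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_WRITE, EV_ADD, 0, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */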
6793 | |
6794 | static int |
6795 | filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev) |
6796 | { |
6797 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6798 | |
6799 | /* socket locked */ |
6800 | if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) { |
6801 | so->so_snd.sb_flags |= SB_KNOTE; |
6802 | } |
6803 | |
/* determine if it's already fired */
6805 | return filt_sowrite_common(kn, NULL, so); |
6806 | } |
6807 | |
6808 | static void |
6809 | filt_sowdetach(struct knote *kn) |
6810 | { |
6811 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6812 | socket_lock(so, refcount: 1); |
6813 | |
6814 | if (so->so_snd.sb_flags & SB_KNOTE) { |
6815 | if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) { |
6816 | so->so_snd.sb_flags &= ~SB_KNOTE; |
6817 | } |
6818 | } |
6819 | socket_unlock(so, refcount: 1); |
6820 | } |
6821 | |
6822 | /*ARGSUSED*/ |
6823 | static int |
6824 | filt_sowrite(struct knote *kn, long hint) |
6825 | { |
6826 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6827 | int ret; |
6828 | |
6829 | if ((hint & SO_FILT_HINT_LOCKED) == 0) { |
6830 | socket_lock(so, refcount: 1); |
6831 | } |
6832 | |
6833 | ret = filt_sowrite_common(kn, NULL, so); |
6834 | |
6835 | if ((hint & SO_FILT_HINT_LOCKED) == 0) { |
6836 | socket_unlock(so, refcount: 1); |
6837 | } |
6838 | |
6839 | return ret; |
6840 | } |
6841 | |
6842 | static int |
6843 | filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev) |
6844 | { |
6845 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6846 | int ret; |
6847 | |
6848 | socket_lock(so, refcount: 1); |
6849 | |
/* save off the new input fflags and data */
6851 | kn->kn_sfflags = kev->fflags; |
6852 | kn->kn_sdata = kev->data; |
6853 | |
6854 | /* determine if these changes result in a triggered event */ |
6855 | ret = filt_sowrite_common(kn, NULL, so); |
6856 | |
6857 | socket_unlock(so, refcount: 1); |
6858 | |
6859 | return ret; |
6860 | } |
6861 | |
6862 | static int |
6863 | filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev) |
6864 | { |
6865 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
6866 | int ret; |
6867 | |
6868 | socket_lock(so, refcount: 1); |
6869 | ret = filt_sowrite_common(kn, kev, so); |
6870 | socket_unlock(so, refcount: 1); |
6871 | |
6872 | return ret; |
6873 | } |
6874 | |
6875 | static int |
6876 | filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev, |
6877 | struct socket *so, long ev_hint) |
6878 | { |
6879 | int ret = 0; |
6880 | int64_t data = 0; |
6881 | uint32_t level_trigger = 0; |
6882 | |
6883 | if (ev_hint & SO_FILT_HINT_CONNRESET) { |
6884 | kn->kn_fflags |= NOTE_CONNRESET; |
6885 | } |
6886 | if (ev_hint & SO_FILT_HINT_TIMEOUT) { |
6887 | kn->kn_fflags |= NOTE_TIMEOUT; |
6888 | } |
6889 | if (ev_hint & SO_FILT_HINT_NOSRCADDR) { |
6890 | kn->kn_fflags |= NOTE_NOSRCADDR; |
6891 | } |
6892 | if (ev_hint & SO_FILT_HINT_IFDENIED) { |
6893 | kn->kn_fflags |= NOTE_IFDENIED; |
6894 | } |
6895 | if (ev_hint & SO_FILT_HINT_KEEPALIVE) { |
6896 | kn->kn_fflags |= NOTE_KEEPALIVE; |
6897 | } |
6898 | if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) { |
6899 | kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO; |
6900 | } |
6901 | if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) { |
6902 | kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO; |
6903 | } |
6904 | if ((ev_hint & SO_FILT_HINT_CONNECTED) || |
6905 | (so->so_state & SS_ISCONNECTED)) { |
6906 | kn->kn_fflags |= NOTE_CONNECTED; |
6907 | level_trigger |= NOTE_CONNECTED; |
6908 | } |
6909 | if ((ev_hint & SO_FILT_HINT_DISCONNECTED) || |
6910 | (so->so_state & SS_ISDISCONNECTED)) { |
6911 | kn->kn_fflags |= NOTE_DISCONNECTED; |
6912 | level_trigger |= NOTE_DISCONNECTED; |
6913 | } |
6914 | if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) { |
6915 | if (so->so_proto != NULL && |
6916 | (so->so_proto->pr_flags & PR_EVCONNINFO)) { |
6917 | kn->kn_fflags |= NOTE_CONNINFO_UPDATED; |
6918 | } |
6919 | } |
6920 | if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) || |
6921 | tcp_notify_ack_active(so)) { |
6922 | kn->kn_fflags |= NOTE_NOTIFY_ACK; |
6923 | } |
6924 | if (ev_hint & SO_FILT_HINT_WAKE_PKT) { |
6925 | kn->kn_fflags |= NOTE_WAKE_PKT; |
6926 | } |
6927 | |
6928 | if ((so->so_state & SS_CANTRCVMORE) |
6929 | #if CONTENT_FILTER |
6930 | && cfil_sock_data_pending(sb: &so->so_rcv) == 0 |
6931 | #endif /* CONTENT_FILTER */ |
6932 | ) { |
6933 | kn->kn_fflags |= NOTE_READCLOSED; |
6934 | level_trigger |= NOTE_READCLOSED; |
6935 | } |
6936 | |
6937 | if (so->so_state & SS_CANTSENDMORE) { |
6938 | kn->kn_fflags |= NOTE_WRITECLOSED; |
6939 | level_trigger |= NOTE_WRITECLOSED; |
6940 | } |
6941 | |
6942 | if ((ev_hint & SO_FILT_HINT_SUSPEND) || |
6943 | (so->so_flags & SOF_SUSPENDED)) { |
6944 | kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME); |
6945 | |
6946 | /* If resume event was delivered before, reset it */ |
6947 | kn->kn_hook32 &= ~NOTE_RESUME; |
6948 | |
6949 | kn->kn_fflags |= NOTE_SUSPEND; |
6950 | level_trigger |= NOTE_SUSPEND; |
6951 | } |
6952 | |
6953 | if ((ev_hint & SO_FILT_HINT_RESUME) || |
6954 | (so->so_flags & SOF_SUSPENDED) == 0) { |
6955 | kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME); |
6956 | |
6957 | /* If suspend event was delivered before, reset it */ |
6958 | kn->kn_hook32 &= ~NOTE_SUSPEND; |
6959 | |
6960 | kn->kn_fflags |= NOTE_RESUME; |
6961 | level_trigger |= NOTE_RESUME; |
6962 | } |
6963 | |
6964 | if (so->so_error != 0) { |
6965 | ret = 1; |
6966 | data = so->so_error; |
6967 | kn->kn_flags |= EV_EOF; |
6968 | } else { |
6969 | u_int32_t data32 = 0; |
6970 | get_sockev_state(so, &data32); |
6971 | data = data32; |
6972 | } |
6973 | |
6974 | /* Reset any events that are not requested on this knote */ |
6975 | kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK); |
6976 | level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK); |
6977 | |
/* Find the level-triggered events that are already delivered */
6979 | level_trigger &= kn->kn_hook32; |
6980 | level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK; |
6981 | |
/* Do not deliver level-triggered events more than once */
6983 | if ((kn->kn_fflags & ~level_trigger) != 0) { |
6984 | ret = 1; |
6985 | } |
6986 | |
6987 | if (ret && kev) { |
6988 | /* |
6989 | * Store the state of the events being delivered. This |
* state can be used to deliver level-triggered events
* at least once and still avoid waking up the application
6992 | * multiple times as long as the event is active. |
6993 | */ |
6994 | if (kn->kn_fflags != 0) { |
6995 | kn->kn_hook32 |= (kn->kn_fflags & |
6996 | EVFILT_SOCK_LEVEL_TRIGGER_MASK); |
6997 | } |
6998 | |
6999 | /* |
* NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
* only one of them and remember which one was delivered
* most recently.
7003 | */ |
7004 | if (kn->kn_fflags & NOTE_SUSPEND) { |
7005 | kn->kn_hook32 &= ~NOTE_RESUME; |
7006 | } |
7007 | if (kn->kn_fflags & NOTE_RESUME) { |
7008 | kn->kn_hook32 &= ~NOTE_SUSPEND; |
7009 | } |
7010 | |
7011 | knote_fill_kevent(kn, kev, data); |
7012 | } |
7013 | return ret; |
7014 | } |
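/*
 * Illustrative userland sketch (not part of this file): EVFILT_SOCK is a
 * Darwin-private filter; a caller with the private sys/event.h
 * definitions can register for connection-state notes like this:
 *
 *	struct kevent kev;
 *	EV_SET(&kev, fd, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_CONNRESET, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */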
7015 | |
7016 | static int |
7017 | filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev) |
7018 | { |
7019 | struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp); |
7020 | |
7021 | /* socket locked */ |
7022 | kn->kn_hook32 = 0; |
7023 | if (KNOTE_ATTACH(&so->so_klist, kn)) { |
7024 | so->so_flags |= SOF_KNOTE; |
7025 | } |
7026 | |
7027 | /* determine if event already fired */ |
7028 | return filt_sockev_common(kn, NULL, so, ev_hint: 0); |
7029 | } |
7030 | |
7031 | static void |
7032 | filt_sockdetach(struct knote *kn) |
7033 | { |
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
	socket_lock(so, 1);
7036 | |
7037 | if ((so->so_flags & SOF_KNOTE) != 0) { |
7038 | if (KNOTE_DETACH(&so->so_klist, kn)) { |
7039 | so->so_flags &= ~SOF_KNOTE; |
7040 | } |
7041 | } |
	socket_unlock(so, 1);
7043 | } |
7044 | |
7045 | static int |
7046 | filt_sockev(struct knote *kn, long hint) |
7047 | { |
7048 | int ret = 0, locked = 0; |
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7050 | long ev_hint = (hint & SO_FILT_HINT_EV); |
7051 | |
7052 | if ((hint & SO_FILT_HINT_LOCKED) == 0) { |
		socket_lock(so, 1);
7054 | locked = 1; |
7055 | } |
7056 | |
7057 | ret = filt_sockev_common(kn, NULL, so, ev_hint); |
7058 | |
7059 | if (locked) { |
		socket_unlock(so, 1);
7061 | } |
7062 | |
7063 | return ret; |
7064 | } |
7065 | |
7066 | |
7067 | |
7068 | /* |
7069 | * filt_socktouch - update event state |
7070 | */ |
7071 | static int |
7072 | filt_socktouch( |
7073 | struct knote *kn, |
7074 | struct kevent_qos_s *kev) |
7075 | { |
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7077 | uint32_t changed_flags; |
7078 | int ret; |
7079 | |
	socket_lock(so, 1);
7081 | |
7082 | /* save off the [result] data and fflags */ |
7083 | changed_flags = (kn->kn_sfflags ^ kn->kn_hook32); |
7084 | |
7085 | /* save off the new input fflags and data */ |
7086 | kn->kn_sfflags = kev->fflags; |
7087 | kn->kn_sdata = kev->data; |
7088 | |
7089 | /* restrict the current results to the (smaller?) set of new interest */ |
7090 | /* |
7091 | * For compatibility with previous implementations, we leave kn_fflags |
7092 | * as they were before. |
7093 | */ |
7094 | //kn->kn_fflags &= kev->fflags; |
7095 | |
7096 | /* |
7097 | * Since we keep track of events that are already |
7098 | * delivered, if any of those events are not requested |
7099 | * anymore the state related to them can be reset |
7100 | */ |
7101 | kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK); |
7102 | |
7103 | /* determine if we have events to deliver */ |
	ret = filt_sockev_common(kn, NULL, so, 0);

	socket_unlock(so, 1);
7107 | |
7108 | return ret; |
7109 | } |
7110 | |
7111 | /* |
7112 | * filt_sockprocess - query event fired state and return data |
7113 | */ |
7114 | static int |
7115 | filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev) |
7116 | { |
	struct socket *so = (struct socket *)fp_get_data(kn->kn_fp);
7118 | int ret = 0; |
7119 | |
	socket_lock(so, 1);

	ret = filt_sockev_common(kn, kev, so, 0);

	socket_unlock(so, 1);
7125 | |
7126 | return ret; |
7127 | } |
7128 | |
7129 | void |
7130 | get_sockev_state(struct socket *so, u_int32_t *statep) |
7131 | { |
7132 | u_int32_t state = *(statep); |
7133 | |
7134 | /* |
7135 | * If the state variable is already used by a previous event, |
7136 | * reset it. |
7137 | */ |
7138 | if (state != 0) { |
7139 | return; |
7140 | } |
7141 | |
7142 | if (so->so_state & SS_ISCONNECTED) { |
7143 | state |= SOCKEV_CONNECTED; |
7144 | } else { |
7145 | state &= ~(SOCKEV_CONNECTED); |
7146 | } |
7147 | state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0); |
7148 | *(statep) = state; |
7149 | } |
7150 | |
7151 | #define SO_LOCK_HISTORY_STR_LEN \ |
7152 | (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1) |
7153 | |
7154 | __private_extern__ const char * |
7155 | solockhistory_nr(struct socket *so) |
7156 | { |
7157 | size_t n = 0; |
7158 | int i; |
7159 | static char lock_history_str[SO_LOCK_HISTORY_STR_LEN]; |
7160 | |
	bzero(lock_history_str, sizeof(lock_history_str));
	for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
		n += scnprintf(lock_history_str + n,
		    SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7165 | so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX], |
7166 | so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]); |
7167 | } |
7168 | return lock_history_str; |
7169 | } |
7170 | |
7171 | lck_mtx_t * |
7172 | socket_getlock(struct socket *so, int flags) |
7173 | { |
7174 | if (so->so_proto->pr_getlock != NULL) { |
7175 | return (*so->so_proto->pr_getlock)(so, flags); |
7176 | } else { |
7177 | return so->so_proto->pr_domain->dom_mtx; |
7178 | } |
7179 | } |
7180 | |
7181 | void |
7182 | socket_lock(struct socket *so, int refcount) |
7183 | { |
7184 | void *lr_saved; |
7185 | |
7186 | lr_saved = __builtin_return_address(0); |
7187 | |
7188 | if (so->so_proto->pr_lock) { |
7189 | (*so->so_proto->pr_lock)(so, refcount, lr_saved); |
7190 | } else { |
7191 | #ifdef MORE_LOCKING_DEBUG |
7192 | LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx, |
7193 | LCK_MTX_ASSERT_NOTOWNED); |
7194 | #endif |
		lck_mtx_lock(so->so_proto->pr_domain->dom_mtx);
7196 | if (refcount) { |
7197 | so->so_usecount++; |
7198 | } |
7199 | so->lock_lr[so->next_lock_lr] = lr_saved; |
7200 | so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX; |
7201 | } |
7202 | } |
7203 | |
7204 | void |
7205 | socket_lock_assert_owned(struct socket *so) |
7206 | { |
7207 | lck_mtx_t *mutex_held; |
7208 | |
7209 | if (so->so_proto->pr_getlock != NULL) { |
7210 | mutex_held = (*so->so_proto->pr_getlock)(so, 0); |
7211 | } else { |
7212 | mutex_held = so->so_proto->pr_domain->dom_mtx; |
7213 | } |
7214 | |
7215 | LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); |
7216 | } |
7217 | |
7218 | int |
7219 | socket_try_lock(struct socket *so) |
7220 | { |
7221 | lck_mtx_t *mtx; |
7222 | |
7223 | if (so->so_proto->pr_getlock != NULL) { |
7224 | mtx = (*so->so_proto->pr_getlock)(so, 0); |
7225 | } else { |
7226 | mtx = so->so_proto->pr_domain->dom_mtx; |
7227 | } |
7228 | |
	return lck_mtx_try_lock(mtx);
7230 | } |
7231 | |
7232 | void |
7233 | socket_unlock(struct socket *so, int refcount) |
7234 | { |
7235 | void *lr_saved; |
7236 | lck_mtx_t *mutex_held; |
7237 | |
7238 | lr_saved = __builtin_return_address(0); |
7239 | |
7240 | if (so == NULL || so->so_proto == NULL) { |
		panic("%s: null so_proto so=%p", __func__, so);
7242 | /* NOTREACHED */ |
7243 | } |
7244 | |
7245 | if (so->so_proto->pr_unlock) { |
7246 | (*so->so_proto->pr_unlock)(so, refcount, lr_saved); |
7247 | } else { |
7248 | mutex_held = so->so_proto->pr_domain->dom_mtx; |
7249 | #ifdef MORE_LOCKING_DEBUG |
7250 | LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); |
7251 | #endif |
7252 | so->unlock_lr[so->next_unlock_lr] = lr_saved; |
7253 | so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX; |
7254 | |
7255 | if (refcount) { |
7256 | if (so->so_usecount <= 0) { |
				panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
				    "lrh=%s", __func__, so->so_usecount, so,
7259 | SOCK_DOM(so), so->so_type, |
7260 | SOCK_PROTO(so), solockhistory_nr(so)); |
7261 | /* NOTREACHED */ |
7262 | } |
7263 | |
7264 | so->so_usecount--; |
7265 | if (so->so_usecount == 0) { |
				sofreelastref(so, 1);
7267 | } |
7268 | } |
		lck_mtx_unlock(mutex_held);
7270 | } |
7271 | } |
7272 | |
7273 | /* Called with socket locked, will unlock socket */ |
7274 | void |
7275 | sofree(struct socket *so) |
7276 | { |
7277 | lck_mtx_t *mutex_held; |
7278 | |
7279 | if (so->so_proto->pr_getlock != NULL) { |
7280 | mutex_held = (*so->so_proto->pr_getlock)(so, 0); |
7281 | } else { |
7282 | mutex_held = so->so_proto->pr_domain->dom_mtx; |
7283 | } |
7284 | LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED); |
7285 | |
	sofreelastref(so, 0);
7287 | } |
7288 | |
7289 | void |
7290 | soreference(struct socket *so) |
7291 | { |
	socket_lock(so, 1);	/* locks & take one reference on socket */
	socket_unlock(so, 0);	/* unlock only */
7294 | } |
7295 | |
7296 | void |
7297 | sodereference(struct socket *so) |
7298 | { |
	socket_lock(so, 0);
	socket_unlock(so, 1);
7301 | } |
7302 | |
7303 | /* |
7304 | * Set or clear SOF_MULTIPAGES on the socket to enable or disable the |
 * possibility of using jumbo clusters. Caller must hold the
 * socket lock.
7307 | */ |
7308 | void |
7309 | somultipages(struct socket *so, boolean_t set) |
7310 | { |
7311 | if (set) { |
7312 | so->so_flags |= SOF_MULTIPAGES; |
7313 | } else { |
7314 | so->so_flags &= ~SOF_MULTIPAGES; |
7315 | } |
7316 | } |
7317 | |
7318 | void |
7319 | soif2kcl(struct socket *so, boolean_t set) |
7320 | { |
7321 | if (set) { |
7322 | so->so_flags1 |= SOF1_IF_2KCL; |
7323 | } else { |
7324 | so->so_flags1 &= ~SOF1_IF_2KCL; |
7325 | } |
7326 | } |
7327 | |
7328 | int |
7329 | so_isdstlocal(struct socket *so) |
7330 | { |
7331 | struct inpcb *inp = (struct inpcb *)so->so_pcb; |
7332 | |
7333 | if (SOCK_DOM(so) == PF_INET) { |
7334 | return inaddr_local(inp->inp_faddr); |
7335 | } else if (SOCK_DOM(so) == PF_INET6) { |
7336 | return in6addr_local(&inp->in6p_faddr); |
7337 | } |
7338 | |
7339 | return 0; |
7340 | } |
7341 | |
7342 | int |
7343 | sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce) |
7344 | { |
7345 | struct sockbuf *rcv, *snd; |
7346 | int err = 0, defunct; |
7347 | |
7348 | rcv = &so->so_rcv; |
7349 | snd = &so->so_snd; |
7350 | |
7351 | defunct = (so->so_flags & SOF_DEFUNCT); |
7352 | if (defunct) { |
7353 | if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) { |
			panic("%s: SB_DROP not set", __func__);
7355 | /* NOTREACHED */ |
7356 | } |
7357 | goto done; |
7358 | } |
7359 | |
7360 | if (so->so_flags & SOF_NODEFUNCT) { |
7361 | if (noforce) { |
7362 | err = EOPNOTSUPP; |
7363 | if (p != PROC_NULL) { |
				SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
				    "name %s level %d) so 0x%llu [%d,%d] "
				    "is not eligible for defunct "
				    "(%d)\n", __func__, proc_selfpid(),
7368 | proc_best_name(current_proc()), proc_pid(p), |
7369 | proc_best_name(p), level, |
7370 | so->so_gencnt, |
7371 | SOCK_DOM(so), SOCK_TYPE(so), err); |
7372 | } |
7373 | return err; |
7374 | } |
7375 | so->so_flags &= ~SOF_NODEFUNCT; |
7376 | if (p != PROC_NULL) { |
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "defunct by force "
			    "(%d)\n", __func__, proc_selfpid(),
7381 | proc_best_name(current_proc()), proc_pid(p), |
7382 | proc_best_name(p), level, |
7383 | so->so_gencnt, |
7384 | SOCK_DOM(so), SOCK_TYPE(so), err); |
7385 | } |
7386 | } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) { |
7387 | struct inpcb *inp = (struct inpcb *)so->so_pcb; |
7388 | struct ifnet *ifp = inp->inp_last_outifp; |
7389 | |
7390 | if (ifp && IFNET_IS_CELLULAR(ifp)) { |
7391 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell); |
7392 | } else if (so->so_flags & SOF_DELEGATED) { |
7393 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd); |
7394 | } else if (soextbkidlestat.so_xbkidle_time == 0) { |
7395 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime); |
7396 | } else if (noforce && p != PROC_NULL) { |
7397 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active); |
7398 | |
7399 | so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG; |
7400 | so->so_extended_bk_start = net_uptime(); |
7401 | OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag); |
7402 | |
			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7404 | |
7405 | err = EOPNOTSUPP; |
			SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
			    "name %s level %d) so 0x%llu [%d,%d] "
			    "extend bk idle "
			    "(%d)\n", __func__, proc_selfpid(),
7410 | proc_best_name(current_proc()), proc_pid(p), |
7411 | proc_best_name(p), level, |
7412 | so->so_gencnt, |
7413 | SOCK_DOM(so), SOCK_TYPE(so), err); |
7414 | return err; |
7415 | } else { |
7416 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced); |
7417 | } |
7418 | } |
7419 | |
7420 | so->so_flags |= SOF_DEFUNCT; |
7421 | |
7422 | /* Prevent further data from being appended to the socket buffers */ |
7423 | snd->sb_flags |= SB_DROP; |
7424 | rcv->sb_flags |= SB_DROP; |
7425 | |
7426 | /* Flush any existing data in the socket buffers */ |
7427 | if (rcv->sb_cc != 0) { |
7428 | rcv->sb_flags &= ~SB_SEL; |
7429 | selthreadclear(&rcv->sb_sel); |
		sbrelease(rcv);
7431 | } |
7432 | if (snd->sb_cc != 0) { |
7433 | snd->sb_flags &= ~SB_SEL; |
7434 | selthreadclear(&snd->sb_sel); |
		sbrelease(snd);
7436 | } |
7437 | |
7438 | done: |
7439 | if (p != PROC_NULL) { |
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
7442 | proc_selfpid(), proc_best_name(current_proc()), |
7443 | proc_pid(p), proc_best_name(p), level, |
7444 | so->so_gencnt, SOCK_DOM(so), |
		    SOCK_TYPE(so), defunct ? "is already" : "marked as",
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    " extbkidle" : "");
7448 | } |
7449 | return err; |
7450 | } |
7451 | |
7452 | int |
7453 | sodefunct(struct proc *p, struct socket *so, int level) |
7454 | { |
7455 | struct sockbuf *rcv, *snd; |
7456 | |
7457 | if (!(so->so_flags & SOF_DEFUNCT)) { |
		panic("%s improperly called", __func__);
7459 | /* NOTREACHED */ |
7460 | } |
7461 | if (so->so_state & SS_DEFUNCT) { |
7462 | goto done; |
7463 | } |
7464 | |
7465 | rcv = &so->so_rcv; |
7466 | snd = &so->so_snd; |
7467 | |
7468 | if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { |
7469 | char s[MAX_IPv6_STR_LEN]; |
7470 | char d[MAX_IPv6_STR_LEN]; |
7471 | struct inpcb *inp = sotoinpcb(so); |
7472 | |
7473 | if (p != PROC_NULL) { |
			SODEFUNCTLOG(
			    "%s[%d, %s]: (target pid %d name %s level %d) "
			    "so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
			    "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
			    " snd_fl 0x%x]\n", __func__,
7479 | proc_selfpid(), proc_best_name(current_proc()), |
7480 | proc_pid(p), proc_best_name(p), level, |
7481 | so->so_gencnt, |
			    (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7483 | inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ? |
7484 | (void *)&inp->inp_laddr.s_addr : |
7485 | (void *)&inp->in6p_laddr), |
7486 | s, sizeof(s)), ntohs(inp->in6p_lport), |
7487 | inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ? |
7488 | (void *)&inp->inp_faddr.s_addr : |
7489 | (void *)&inp->in6p_faddr, |
7490 | d, sizeof(d)), ntohs(inp->in6p_fport), |
7491 | (uint32_t)rcv->sb_sel.si_flags, |
7492 | (uint32_t)snd->sb_sel.si_flags, |
7493 | rcv->sb_flags, snd->sb_flags); |
7494 | } |
7495 | } else if (p != PROC_NULL) { |
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
		    "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
		    "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7499 | proc_selfpid(), proc_best_name(current_proc()), |
7500 | proc_pid(p), proc_best_name(p), level, |
7501 | so->so_gencnt, |
7502 | SOCK_DOM(so), SOCK_TYPE(so), |
7503 | (uint32_t)rcv->sb_sel.si_flags, |
7504 | (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags, |
7505 | snd->sb_flags); |
7506 | } |
7507 | |
7508 | /* |
7509 | * First tell the protocol the flow is defunct |
7510 | */ |
7511 | (void) (*so->so_proto->pr_usrreqs->pru_defunct)(so); |
7512 | |
7513 | /* |
7514 | * Unwedge threads blocked on sbwait() and sb_lock(). |
7515 | */ |
	sbwakeup(rcv);
	sbwakeup(snd);
7518 | |
7519 | so->so_flags1 |= SOF1_DEFUNCTINPROG; |
7520 | if (rcv->sb_flags & SB_LOCK) { |
		sbunlock(rcv, TRUE);	/* keep socket locked */
7522 | } |
7523 | if (snd->sb_flags & SB_LOCK) { |
		sbunlock(snd, TRUE);	/* keep socket locked */
7525 | } |
7526 | /* |
7527 | * Flush the buffers and disconnect. We explicitly call shutdown |
7528 | * on both data directions to ensure that SS_CANT{RCV,SEND}MORE |
7529 | * states are set for the socket. This would also flush out data |
7530 | * hanging off the receive list of this socket. |
7531 | */ |
7532 | (void) soshutdownlock_final(so, SHUT_RD); |
7533 | (void) soshutdownlock_final(so, SHUT_WR); |
7534 | (void) sodisconnectlocked(so); |
7535 | |
7536 | /* |
7537 | * Explicitly handle connectionless-protocol disconnection |
7538 | * and release any remaining data in the socket buffers. |
7539 | */ |
7540 | if (!(so->so_state & SS_ISDISCONNECTED)) { |
7541 | (void) soisdisconnected(so); |
7542 | } |
7543 | |
7544 | if (so->so_error == 0) { |
7545 | so->so_error = EBADF; |
7546 | } |
7547 | |
7548 | if (rcv->sb_cc != 0) { |
7549 | rcv->sb_flags &= ~SB_SEL; |
7550 | selthreadclear(&rcv->sb_sel); |
		sbrelease(rcv);
7552 | } |
7553 | if (snd->sb_cc != 0) { |
7554 | snd->sb_flags &= ~SB_SEL; |
7555 | selthreadclear(&snd->sb_sel); |
		sbrelease(snd);
7557 | } |
7558 | so->so_state |= SS_DEFUNCT; |
	OSIncrementAtomicLong((volatile long *)&sodefunct_calls);
7560 | |
7561 | done: |
7562 | return 0; |
7563 | } |
7564 | |
7565 | int |
7566 | soresume(struct proc *p, struct socket *so, int locked) |
7567 | { |
7568 | if (locked == 0) { |
		socket_lock(so, 1);
7570 | } |
7571 | |
7572 | if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) { |
		SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
		    "[%d,%d] resumed from bk idle\n",
7575 | __func__, proc_selfpid(), proc_best_name(current_proc()), |
7576 | proc_pid(p), proc_best_name(p), |
7577 | so->so_gencnt, |
7578 | SOCK_DOM(so), SOCK_TYPE(so)); |
7579 | |
7580 | so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG; |
7581 | so->so_extended_bk_start = 0; |
7582 | OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag); |
7583 | |
7584 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed); |
7585 | OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active); |
7586 | VERIFY(soextbkidlestat.so_xbkidle_active >= 0); |
7587 | } |
7588 | if (locked == 0) { |
		socket_unlock(so, 1);
7590 | } |
7591 | |
7592 | return 0; |
7593 | } |
7594 | |
7595 | /* |
7596 | * Does not attempt to account for sockets that are delegated from |
7597 | * the current process |
7598 | */ |
7599 | int |
7600 | so_set_extended_bk_idle(struct socket *so, int optval) |
7601 | { |
7602 | int error = 0; |
7603 | |
7604 | if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || |
7605 | SOCK_PROTO(so) != IPPROTO_TCP) { |
7606 | OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp); |
7607 | error = EOPNOTSUPP; |
7608 | } else if (optval == 0) { |
7609 | so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED; |
7610 | |
		soresume(current_proc(), so, 1);
7612 | } else { |
7613 | struct proc *p = current_proc(); |
7614 | struct fileproc *fp; |
7615 | int count = 0; |
7616 | |
7617 | /* |
7618 | * Unlock socket to avoid lock ordering issue with |
7619 | * the proc fd table lock |
7620 | */ |
		socket_unlock(so, 0);
7622 | |
7623 | proc_fdlock(p); |
7624 | fdt_foreach(fp, p) { |
7625 | struct socket *so2; |
7626 | |
7627 | if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) { |
7628 | continue; |
7629 | } |
7630 | |
7631 | so2 = (struct socket *)fp_get_data(fp); |
7632 | if (so != so2 && |
7633 | so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) { |
7634 | count++; |
7635 | } |
7636 | if (count >= soextbkidlestat.so_xbkidle_maxperproc) { |
7637 | break; |
7638 | } |
7639 | } |
7640 | proc_fdunlock(p); |
7641 | |
		socket_lock(so, 0);
7643 | |
7644 | if (count >= soextbkidlestat.so_xbkidle_maxperproc) { |
7645 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany); |
7646 | error = EBUSY; |
7647 | } else if (so->so_flags & SOF_DELEGATED) { |
7648 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd); |
7649 | error = EBUSY; |
7650 | } else { |
7651 | so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED; |
7652 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok); |
7653 | } |
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
		    "%s marked for extended bk idle\n",
		    __func__, proc_selfpid(), proc_best_name(current_proc()),
		    so->so_gencnt,
		    SOCK_DOM(so), SOCK_TYPE(so),
		    (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
		    "is" : "not");
7661 | } |
7662 | |
7663 | return error; |
7664 | } |
7665 | |
7666 | static void |
7667 | so_stop_extended_bk_idle(struct socket *so) |
7668 | { |
7669 | so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG; |
7670 | so->so_extended_bk_start = 0; |
7671 | |
7672 | OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active); |
7673 | VERIFY(soextbkidlestat.so_xbkidle_active >= 0); |
7674 | /* |
7675 | * Force defunct |
7676 | */ |
	sosetdefunct(current_proc(), so,
7678 | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE); |
7679 | if (so->so_flags & SOF_DEFUNCT) { |
		sodefunct(current_proc(), so,
7681 | SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL); |
7682 | } |
7683 | } |
7684 | |
7685 | void |
7686 | so_drain_extended_bk_idle(struct socket *so) |
7687 | { |
7688 | if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) { |
7689 | /* |
7690 | * Only penalize sockets that have outstanding data |
7691 | */ |
7692 | if (so->so_rcv.sb_cc || so->so_snd.sb_cc) { |
7693 | so_stop_extended_bk_idle(so); |
7694 | |
7695 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained); |
7696 | } |
7697 | } |
7698 | } |
7699 | |
7700 | /* |
 * Return value tells whether the socket is still in extended background idle
7702 | */ |
7703 | int |
7704 | so_check_extended_bk_idle_time(struct socket *so) |
7705 | { |
7706 | int ret = 1; |
7707 | |
7708 | if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) { |
		SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7710 | __func__, proc_selfpid(), proc_best_name(current_proc()), |
7711 | so->so_gencnt, |
7712 | SOCK_DOM(so), SOCK_TYPE(so)); |
7713 | if (net_uptime() - so->so_extended_bk_start > |
7714 | soextbkidlestat.so_xbkidle_time) { |
7715 | so_stop_extended_bk_idle(so); |
7716 | |
7717 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired); |
7718 | |
7719 | ret = 0; |
7720 | } else { |
7721 | struct inpcb *inp = (struct inpcb *)so->so_pcb; |
7722 | |
			inpcb_timer_sched(inp->inp_pcbinfo, INPCB_TIMER_LAZY);
7724 | OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched); |
7725 | } |
7726 | } |
7727 | |
7728 | return ret; |
7729 | } |
7730 | |
7731 | void |
7732 | resume_proc_sockets(proc_t p) |
7733 | { |
7734 | if (p->p_ladvflag & P_LXBKIDLEINPROG) { |
7735 | struct fileproc *fp; |
7736 | struct socket *so; |
7737 | |
7738 | proc_fdlock(p); |
7739 | fdt_foreach(fp, p) { |
7740 | if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) { |
7741 | continue; |
7742 | } |
7743 | |
7744 | so = (struct socket *)fp_get_data(fp); |
			(void) soresume(p, so, 0);
7746 | } |
7747 | proc_fdunlock(p); |
7748 | |
7749 | OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag); |
7750 | } |
7751 | } |
7752 | |
7753 | __private_extern__ int |
7754 | so_set_recv_anyif(struct socket *so, int optval) |
7755 | { |
7756 | int ret = 0; |
7757 | |
7758 | if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { |
7759 | if (optval) { |
7760 | sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF; |
7761 | } else { |
7762 | sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF; |
7763 | } |
7764 | #if SKYWALK |
7765 | inp_update_netns_flags(so); |
7766 | #endif /* SKYWALK */ |
7767 | } |
7768 | |
7769 | |
7770 | return ret; |
7771 | } |
7772 | |
7773 | __private_extern__ int |
7774 | so_get_recv_anyif(struct socket *so) |
7775 | { |
7776 | int ret = 0; |
7777 | |
7778 | if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { |
7779 | ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0; |
7780 | } |
7781 | |
7782 | return ret; |
7783 | } |
7784 | |
7785 | int |
7786 | so_set_restrictions(struct socket *so, uint32_t vals) |
7787 | { |
7788 | int nocell_old, nocell_new; |
7789 | int noexpensive_old, noexpensive_new; |
7790 | int noconstrained_old, noconstrained_new; |
7791 | |
7792 | /* |
7793 | * Deny-type restrictions are trapdoors; once set they cannot be |
7794 | * unset for the lifetime of the socket. This allows them to be |
7795 | * issued by a framework on behalf of the application without |
7796 | * having to worry that they can be undone. |
7797 | * |
	 * Note here that socket-level restrictions override any protocol
	 * level restrictions. For instance, SO_RESTRICT_DENY_CELLULAR
	 * socket restriction issued on the socket has a higher precedence
7801 | * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID |
7802 | * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only, |
7803 | * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued. |
7804 | */ |
7805 | nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR); |
7806 | noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE); |
7807 | noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED); |
7808 | so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN | |
7809 | SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR | |
7810 | SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED)); |
7811 | nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR); |
7812 | noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE); |
7813 | noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED); |
7814 | |
7815 | /* we can only set, not clear restrictions */ |
7816 | if ((nocell_new - nocell_old) == 0 && |
7817 | (noexpensive_new - noexpensive_old) == 0 && |
7818 | (noconstrained_new - noconstrained_old) == 0) { |
7819 | return 0; |
7820 | } |
7821 | if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) { |
7822 | if (nocell_new - nocell_old != 0) { |
7823 | /* |
7824 | * if deny cellular is now set, do what's needed |
7825 | * for INPCB |
7826 | */ |
7827 | inp_set_nocellular(sotoinpcb(so)); |
7828 | } |
7829 | if (noexpensive_new - noexpensive_old != 0) { |
7830 | inp_set_noexpensive(sotoinpcb(so)); |
7831 | } |
7832 | if (noconstrained_new - noconstrained_old != 0) { |
7833 | inp_set_noconstrained(sotoinpcb(so)); |
7834 | } |
7835 | } |
7836 | |
7837 | if (SOCK_DOM(so) == PF_MULTIPATH) { |
		mptcp_set_restrictions(so);
7839 | } |
7840 | |
7841 | return 0; |
7842 | } |
7843 | |
7844 | uint32_t |
7845 | so_get_restrictions(struct socket *so) |
7846 | { |
7847 | return so->so_restrictions & (SO_RESTRICT_DENY_IN | |
7848 | SO_RESTRICT_DENY_OUT | |
7849 | SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE); |
7850 | } |
7851 | |
7852 | int |
7853 | so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred) |
7854 | { |
7855 | struct proc *ep = PROC_NULL; |
7856 | int error = 0; |
7857 | |
7858 | /* pid 0 is reserved for kernel */ |
7859 | if (epid == 0) { |
7860 | error = EINVAL; |
7861 | goto done; |
7862 | } |
7863 | |
7864 | /* |
7865 | * If this is an in-kernel socket, prevent its delegate |
7866 | * association from changing unless the socket option is |
7867 | * coming from within the kernel itself. |
7868 | */ |
7869 | if (so->last_pid == 0 && p != kernproc) { |
7870 | error = EACCES; |
7871 | goto done; |
7872 | } |
7873 | |
7874 | /* |
7875 | * If this is issued by a process that's recorded as the |
7876 | * real owner of the socket, or if the pid is the same as |
7877 | * the process's own pid, then proceed. Otherwise ensure |
7878 | * that the issuing process has the necessary privileges. |
7879 | */ |
7880 | if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) { |
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
7883 | error = EACCES; |
7884 | goto done; |
7885 | } |
7886 | } |
7887 | |
7888 | /* Find the process that corresponds to the effective pid */ |
	if ((ep = proc_find(epid)) == PROC_NULL) {
7890 | error = ESRCH; |
7891 | goto done; |
7892 | } |
7893 | |
7894 | /* |
7895 | * If a process tries to delegate the socket to itself, then |
7896 | * there's really nothing to do; treat it as a way for the |
7897 | * delegate association to be cleared. Note that we check |
7898 | * the passed-in proc rather than calling proc_selfpid(), |
7899 | * as we need to check the process issuing the socket option |
7900 | * which could be kernproc. Given that we don't allow 0 for |
7901 | * effective pid, it means that a delegated in-kernel socket |
7902 | * stays delegated during its lifetime (which is probably OK.) |
7903 | */ |
7904 | if (epid == proc_pid(p)) { |
7905 | so->so_flags &= ~SOF_DELEGATED; |
7906 | so->e_upid = 0; |
7907 | so->e_pid = 0; |
		uuid_clear(so->e_uuid);
7909 | } else { |
7910 | so->so_flags |= SOF_DELEGATED; |
7911 | so->e_upid = proc_uniqueid(ep); |
7912 | so->e_pid = proc_pid(ep); |
7913 | proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid)); |
7914 | |
7915 | #if defined(XNU_TARGET_OS_OSX) |
7916 | if (ep->p_responsible_pid != so->e_pid) { |
			proc_t rp = proc_find(ep->p_responsible_pid);
			if (rp != PROC_NULL) {
				proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
				so->so_rpid = ep->p_responsible_pid;
				proc_rele(rp);
			} else {
				uuid_clear(so->so_ruuid);
7924 | so->so_rpid = -1; |
7925 | } |
7926 | } |
7927 | #endif |
7928 | } |
7929 | if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { |
7930 | (*so->so_proto->pr_update_last_owner)(so, NULL, ep); |
7931 | } |
7932 | done: |
7933 | if (error == 0 && net_io_policy_log) { |
7934 | uuid_string_t buf; |
7935 | |
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "euuid %s%s\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    so->e_pid, proc_name_address(ep), buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7943 | } else if (error != 0 && net_io_policy_log) { |
		log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
		    "ERROR (%d)\n", __func__, proc_name_address(p),
		    proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
		    SOCK_DOM(so), SOCK_TYPE(so),
		    epid, (ep == PROC_NULL) ? "PROC_NULL" :
		    proc_name_address(ep), error);
7950 | } |
7951 | |
7952 | /* Update this socket's policy upon success */ |
7953 | if (error == 0) { |
7954 | so->so_policy_gencnt *= -1; |
7955 | so_update_policy(so); |
7956 | #if NECP |
7957 | so_update_necp_policy(so, NULL, NULL); |
7958 | #endif /* NECP */ |
7959 | } |
7960 | |
7961 | if (ep != PROC_NULL) { |
		proc_rele(ep);
7963 | } |
7964 | |
7965 | return error; |
7966 | } |
7967 | |
7968 | int |
7969 | so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred) |
7970 | { |
7971 | uuid_string_t buf; |
7972 | uuid_t uuid; |
7973 | int error = 0; |
7974 | |
7975 | /* UUID must not be all-zeroes (reserved for kernel) */ |
	if (uuid_is_null(euuid)) {
7977 | error = EINVAL; |
7978 | goto done; |
7979 | } |
7980 | |
7981 | /* |
7982 | * If this is an in-kernel socket, prevent its delegate |
7983 | * association from changing unless the socket option is |
7984 | * coming from within the kernel itself. |
7985 | */ |
7986 | if (so->last_pid == 0 && p != kernproc) { |
7987 | error = EACCES; |
7988 | goto done; |
7989 | } |
7990 | |
7991 | /* Get the UUID of the issuing process */ |
7992 | proc_getexecutableuuid(p, uuid, sizeof(uuid)); |
7993 | |
7994 | /* |
7995 | * If this is issued by a process that's recorded as the |
7996 | * real owner of the socket, or if the uuid is the same as |
7997 | * the process's own uuid, then proceed. Otherwise ensure |
7998 | * that the issuing process has the necessary privileges. |
7999 | */ |
8000 | if (check_cred && |
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
8005 | error = EACCES; |
8006 | goto done; |
8007 | } |
8008 | } |
8009 | |
8010 | /* |
8011 | * If a process tries to delegate the socket to itself, then |
8012 | * there's really nothing to do; treat it as a way for the |
8013 | * delegate association to be cleared. Note that we check |
8014 | * the uuid of the passed-in proc rather than that of the |
8015 | * current process, as we need to check the process issuing |
8016 | * the socket option which could be kernproc itself. Given |
8017 | * that we don't allow 0 for effective uuid, it means that |
8018 | * a delegated in-kernel socket stays delegated during its |
8019 | * lifetime (which is okay.) |
8020 | */ |
	if (uuid_compare(euuid, uuid) == 0) {
8022 | so->so_flags &= ~SOF_DELEGATED; |
8023 | so->e_upid = 0; |
8024 | so->e_pid = 0; |
		uuid_clear(so->e_uuid);
8026 | } else { |
8027 | so->so_flags |= SOF_DELEGATED; |
8028 | /* |
8029 | * Unlike so_set_effective_pid(), we only have the UUID |
8030 | * here and the process ID is not known. Inherit the |
8031 | * real {pid,upid} of the socket. |
8032 | */ |
8033 | so->e_upid = so->last_upid; |
8034 | so->e_pid = so->last_pid; |
		uuid_copy(so->e_uuid, euuid);
8036 | } |
8037 | /* |
8038 | * The following will clear the effective process name as it's the same |
8039 | * as the real process |
8040 | */ |
8041 | if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) { |
8042 | (*so->so_proto->pr_update_last_owner)(so, NULL, NULL); |
8043 | } |
8044 | done: |
8045 | if (error == 0 && net_io_policy_log) { |
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
8052 | } else if (error != 0 && net_io_policy_log) { |
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
8056 | (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so), |
8057 | SOCK_TYPE(so), buf, error); |
8058 | } |
8059 | |
8060 | /* Update this socket's policy upon success */ |
8061 | if (error == 0) { |
8062 | so->so_policy_gencnt *= -1; |
8063 | so_update_policy(so); |
8064 | #if NECP |
8065 | so_update_necp_policy(so, NULL, NULL); |
8066 | #endif /* NECP */ |
8067 | } |
8068 | |
8069 | return error; |
8070 | } |
8071 | |
8072 | void |
8073 | netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data, |
8074 | uint32_t ev_datalen) |
8075 | { |
8076 | struct kev_msg ev_msg; |
8077 | |
8078 | /* |
8079 | * A netpolicy event always starts with a netpolicy_event_data |
8080 | * structure, but the caller can provide for a longer event |
8081 | * structure to post, depending on the event code. |
8082 | */ |
8083 | VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data)); |
8084 | |
	bzero(&ev_msg, sizeof(ev_msg));
8086 | ev_msg.vendor_code = KEV_VENDOR_APPLE; |
8087 | ev_msg.kev_class = KEV_NETWORK_CLASS; |
8088 | ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS; |
8089 | ev_msg.event_code = ev_code; |
8090 | |
8091 | ev_msg.dv[0].data_ptr = ev_data; |
8092 | ev_msg.dv[0].data_length = ev_datalen; |
8093 | |
	kev_post_msg(&ev_msg);
8095 | } |
8096 | |
8097 | void |
8098 | socket_post_kev_msg(uint32_t ev_code, |
8099 | struct kev_socket_event_data *ev_data, |
8100 | uint32_t ev_datalen) |
8101 | { |
8102 | struct kev_msg ev_msg; |
8103 | |
	bzero(&ev_msg, sizeof(ev_msg));
8105 | ev_msg.vendor_code = KEV_VENDOR_APPLE; |
8106 | ev_msg.kev_class = KEV_NETWORK_CLASS; |
8107 | ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS; |
8108 | ev_msg.event_code = ev_code; |
8109 | |
8110 | ev_msg.dv[0].data_ptr = ev_data; |
8111 | ev_msg.dv[0].data_length = ev_datalen; |
8112 | |
	kev_post_msg(&ev_msg);
8114 | } |
8115 | |
8116 | void |
8117 | socket_post_kev_msg_closed(struct socket *so) |
8118 | { |
8119 | struct kev_socket_closed ev = {}; |
8120 | struct sockaddr *socksa = NULL, *peersa = NULL; |
8121 | int err; |
8122 | |
8123 | if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) { |
8124 | return; |
8125 | } |
8126 | err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa); |
8127 | if (err == 0) { |
8128 | err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, |
8129 | &peersa); |
8130 | if (err == 0) { |
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
8139 | } |
8140 | } |
8141 | free_sockaddr(socksa); |
8142 | free_sockaddr(peersa); |
8143 | } |
8144 | |
8145 | __attribute__((noinline, cold, not_tail_called, noreturn)) |
8146 | __private_extern__ int |
8147 | assfail(const char *a, const char *f, int l) |
8148 | { |
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
8150 | /* NOTREACHED */ |
8151 | __builtin_unreachable(); |
8152 | } |
8153 | |