1/*
2 * Copyright (c) 1998-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/filedesc.h>
73#include <sys/proc.h>
74#include <sys/proc_internal.h>
75#include <sys/kauth.h>
76#include <sys/file_internal.h>
77#include <sys/fcntl.h>
78#include <sys/malloc.h>
79#include <sys/mbuf.h>
80#include <sys/domain.h>
81#include <sys/kernel.h>
82#include <sys/event.h>
83#include <sys/poll.h>
84#include <sys/protosw.h>
85#include <sys/socket.h>
86#include <sys/socketvar.h>
87#include <sys/resourcevar.h>
88#include <sys/signalvar.h>
89#include <sys/sysctl.h>
90#include <sys/syslog.h>
91#include <sys/uio.h>
92#include <sys/uio_internal.h>
93#include <sys/ev.h>
94#include <sys/kdebug.h>
95#include <sys/un.h>
96#include <sys/user.h>
97#include <sys/priv.h>
98#include <sys/kern_event.h>
99#include <sys/persona.h>
100#include <net/route.h>
101#include <net/init.h>
102#include <net/net_api_stats.h>
103#include <net/ntstat.h>
104#include <net/content_filter.h>
105#include <netinet/in.h>
106#include <netinet/in_pcb.h>
107#include <netinet/in_tclass.h>
108#include <netinet/in_var.h>
109#include <netinet/tcp_var.h>
110#include <netinet/ip6.h>
111#include <netinet6/ip6_var.h>
112#include <netinet/flow_divert.h>
113#include <kern/zalloc.h>
114#include <kern/locks.h>
115#include <machine/limits.h>
116#include <libkern/OSAtomic.h>
117#include <pexpert/pexpert.h>
118#include <kern/assert.h>
119#include <kern/task.h>
120#include <kern/policy_internal.h>
121
122#include <sys/kpi_mbuf.h>
123#include <sys/mcache.h>
124#include <sys/unpcb.h>
125#include <libkern/section_keywords.h>
126
127#include <os/log.h>
128
129#if CONFIG_MACF
130#include <security/mac_framework.h>
131#endif /* MAC */
132
133#if MULTIPATH
134#include <netinet/mp_pcb.h>
135#include <netinet/mptcp_var.h>
136#endif /* MULTIPATH */
137
138#define ROUNDUP(a, b) (((a) + ((b) - 1)) & (~((b) - 1)))
139
140#if DEBUG || DEVELOPMENT
141#define DEBUG_KERNEL_ADDRPERM(_v) (_v)
142#else
143#define DEBUG_KERNEL_ADDRPERM(_v) VM_KERNEL_ADDRPERM(_v)
144#endif
145
146/* TODO: this should be in a header file somewhere */
147extern char *proc_name_address(void *p);
148
149static u_int32_t so_cache_hw; /* High water mark for socache */
150static u_int32_t so_cache_timeouts; /* number of timeouts */
151static u_int32_t so_cache_max_freed; /* max freed per timeout */
152static u_int32_t cached_sock_count = 0;
153STAILQ_HEAD(, socket) so_cache_head;
154int max_cached_sock_count = MAX_CACHED_SOCKETS;
155static uint64_t so_cache_time;
156static int socketinit_done;
157static struct zone *so_cache_zone;
158
159static LCK_GRP_DECLARE(so_cache_mtx_grp, "so_cache");
160static LCK_MTX_DECLARE(so_cache_mtx, &so_cache_mtx_grp);
161
162#include <machine/limits.h>
163
164static int filt_sorattach(struct knote *kn, struct kevent_qos_s *kev);
165static void filt_sordetach(struct knote *kn);
166static int filt_soread(struct knote *kn, long hint);
167static int filt_sortouch(struct knote *kn, struct kevent_qos_s *kev);
168static int filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev);
169
170static int filt_sowattach(struct knote *kn, struct kevent_qos_s *kev);
171static void filt_sowdetach(struct knote *kn);
172static int filt_sowrite(struct knote *kn, long hint);
173static int filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev);
174static int filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev);
175
176static int filt_sockattach(struct knote *kn, struct kevent_qos_s *kev);
177static void filt_sockdetach(struct knote *kn);
178static int filt_sockev(struct knote *kn, long hint);
179static int filt_socktouch(struct knote *kn, struct kevent_qos_s *kev);
180static int filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev);
181
182static int sooptcopyin_timeval(struct sockopt *, struct timeval *);
183static int sooptcopyout_timeval(struct sockopt *, const struct timeval *);
184
185SECURITY_READ_ONLY_EARLY(struct filterops) soread_filtops = {
186 .f_isfd = 1,
187 .f_attach = filt_sorattach,
188 .f_detach = filt_sordetach,
189 .f_event = filt_soread,
190 .f_touch = filt_sortouch,
191 .f_process = filt_sorprocess,
192};
193
194SECURITY_READ_ONLY_EARLY(struct filterops) sowrite_filtops = {
195 .f_isfd = 1,
196 .f_attach = filt_sowattach,
197 .f_detach = filt_sowdetach,
198 .f_event = filt_sowrite,
199 .f_touch = filt_sowtouch,
200 .f_process = filt_sowprocess,
201};
202
203SECURITY_READ_ONLY_EARLY(struct filterops) sock_filtops = {
204 .f_isfd = 1,
205 .f_attach = filt_sockattach,
206 .f_detach = filt_sockdetach,
207 .f_event = filt_sockev,
208 .f_touch = filt_socktouch,
209 .f_process = filt_sockprocess,
210};
211
212SECURITY_READ_ONLY_EARLY(struct filterops) soexcept_filtops = {
213 .f_isfd = 1,
214 .f_attach = filt_sorattach,
215 .f_detach = filt_sordetach,
216 .f_event = filt_soread,
217 .f_touch = filt_sortouch,
218 .f_process = filt_sorprocess,
219};
220
221SYSCTL_DECL(_kern_ipc);
222
223#define EVEN_MORE_LOCKING_DEBUG 0
224
225int socket_debug = 0;
226SYSCTL_INT(_kern_ipc, OID_AUTO, socket_debug,
227 CTLFLAG_RW | CTLFLAG_LOCKED, &socket_debug, 0, "");
228
229#if (DEBUG || DEVELOPMENT)
230#define DEFAULT_SOSEND_ASSERT_PANIC 1
231#else
232#define DEFAULT_SOSEND_ASSERT_PANIC 0
233#endif /* (DEBUG || DEVELOPMENT) */
234
235int sosend_assert_panic = 0;
236SYSCTL_INT(_kern_ipc, OID_AUTO, sosend_assert_panic,
237 CTLFLAG_RW | CTLFLAG_LOCKED, &sosend_assert_panic, DEFAULT_SOSEND_ASSERT_PANIC, "");
238
239static unsigned long sodefunct_calls = 0;
240SYSCTL_LONG(_kern_ipc, OID_AUTO, sodefunct_calls, CTLFLAG_LOCKED,
241 &sodefunct_calls, "");
242
243ZONE_DEFINE_TYPE(socket_zone, "socket", struct socket, ZC_ZFREE_CLEARMEM);
244so_gen_t so_gencnt; /* generation count for sockets */
245
246MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
247
248#define DBG_LAYER_IN_BEG NETDBG_CODE(DBG_NETSOCK, 0)
249#define DBG_LAYER_IN_END NETDBG_CODE(DBG_NETSOCK, 2)
250#define DBG_LAYER_OUT_BEG NETDBG_CODE(DBG_NETSOCK, 1)
251#define DBG_LAYER_OUT_END NETDBG_CODE(DBG_NETSOCK, 3)
252#define DBG_FNC_SOSEND NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 1)
253#define DBG_FNC_SOSEND_LIST NETDBG_CODE(DBG_NETSOCK, (4 << 8) | 3)
254#define DBG_FNC_SORECEIVE NETDBG_CODE(DBG_NETSOCK, (8 << 8))
255#define DBG_FNC_SORECEIVE_LIST NETDBG_CODE(DBG_NETSOCK, (8 << 8) | 3)
256#define DBG_FNC_SOSHUTDOWN NETDBG_CODE(DBG_NETSOCK, (9 << 8))
257
258#define MAX_SOOPTGETM_SIZE (128 * MCLBYTES)
259
260int somaxconn = SOMAXCONN;
261SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn,
262 CTLFLAG_RW | CTLFLAG_LOCKED, &somaxconn, 0, "");
263
264/* Should we get a maximum also ??? */
265static int sosendmaxchain = 65536;
266static int sosendminchain = 16384;
267static int sorecvmincopy = 16384;
268SYSCTL_INT(_kern_ipc, OID_AUTO, sosendminchain,
269 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendminchain, 0, "");
270SYSCTL_INT(_kern_ipc, OID_AUTO, sorecvmincopy,
271 CTLFLAG_RW | CTLFLAG_LOCKED, &sorecvmincopy, 0, "");
272
273/*
274 * Set to enable jumbo clusters (if available) for large writes when
275 * the socket is marked with SOF_MULTIPAGES; see below.
276 */
277int sosendjcl = 1;
278SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl,
279 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl, 0, "");
280
281/*
282 * Set this to ignore SOF_MULTIPAGES and use jumbo clusters for large
283 * writes on the socket for all protocols on any network interfaces,
284 * depending upon sosendjcl above. Be extra careful when setting this
285 * to 1, because sending down packets that cross physical pages down to
286 * broken drivers (those that falsely assume that the physical pages
287 * are contiguous) might lead to system panics or silent data corruption.
288 * When set to 0, the system will respect SOF_MULTIPAGES, which is set
289 * only for TCP sockets whose outgoing interface is IFNET_MULTIPAGES
290 * capable. Set this to 1 only for testing/debugging purposes.
291 */
292int sosendjcl_ignore_capab = 0;
293SYSCTL_INT(_kern_ipc, OID_AUTO, sosendjcl_ignore_capab,
294 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendjcl_ignore_capab, 0, "");
295
296/*
297 * Set this to ignore SOF1_IF_2KCL and use big clusters for large
298 * writes on the socket for all protocols on any network interfaces.
299 * Be extra careful when setting this to 1, because sending down packets with
300 * clusters larger that 2 KB might lead to system panics or data corruption.
301 * When set to 0, the system will respect SOF1_IF_2KCL, which is set
302 * on the outgoing interface
303 * Set this to 1 for testing/debugging purposes only.
304 */
305int sosendbigcl_ignore_capab = 0;
306SYSCTL_INT(_kern_ipc, OID_AUTO, sosendbigcl_ignore_capab,
307 CTLFLAG_RW | CTLFLAG_LOCKED, &sosendbigcl_ignore_capab, 0, "");
308
309int sodefunctlog = 0;
310SYSCTL_INT(_kern_ipc, OID_AUTO, sodefunctlog, CTLFLAG_RW | CTLFLAG_LOCKED,
311 &sodefunctlog, 0, "");
312
313int sothrottlelog = 0;
314SYSCTL_INT(_kern_ipc, OID_AUTO, sothrottlelog, CTLFLAG_RW | CTLFLAG_LOCKED,
315 &sothrottlelog, 0, "");
316
317int sorestrictrecv = 1;
318SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictrecv, CTLFLAG_RW | CTLFLAG_LOCKED,
319 &sorestrictrecv, 0, "Enable inbound interface restrictions");
320
321int sorestrictsend = 1;
322SYSCTL_INT(_kern_ipc, OID_AUTO, sorestrictsend, CTLFLAG_RW | CTLFLAG_LOCKED,
323 &sorestrictsend, 0, "Enable outbound interface restrictions");
324
325int soreserveheadroom = 1;
326SYSCTL_INT(_kern_ipc, OID_AUTO, soreserveheadroom, CTLFLAG_RW | CTLFLAG_LOCKED,
327 &soreserveheadroom, 0, "To allocate contiguous datagram buffers");
328
329#if (DEBUG || DEVELOPMENT)
330int so_notsent_lowat_check = 1;
331SYSCTL_INT(_kern_ipc, OID_AUTO, notsent_lowat, CTLFLAG_RW | CTLFLAG_LOCKED,
332 &so_notsent_lowat_check, 0, "enable/disable notsnet lowat check");
333#endif /* DEBUG || DEVELOPMENT */
334
335int so_accept_list_waits = 0;
336#if (DEBUG || DEVELOPMENT)
337SYSCTL_INT(_kern_ipc, OID_AUTO, accept_list_waits, CTLFLAG_RW | CTLFLAG_LOCKED,
338 &so_accept_list_waits, 0, "number of waits for listener incomp list");
339#endif /* DEBUG || DEVELOPMENT */
340
341extern struct inpcbinfo tcbinfo;
342
343/* TODO: these should be in header file */
344extern int get_inpcb_str_size(void);
345extern int get_tcp_str_size(void);
346
347vm_size_t so_cache_zone_element_size;
348
349static int sodelayed_copy(struct socket *, struct uio *, struct mbuf **,
350 user_ssize_t *);
351static void cached_sock_alloc(struct socket **, zalloc_flags_t);
352static void cached_sock_free(struct socket *);
353
354/*
355 * Maximum of extended background idle sockets per process
356 * Set to zero to disable further setting of the option
357 */
358
359#define SO_IDLE_BK_IDLE_MAX_PER_PROC 1
360#define SO_IDLE_BK_IDLE_TIME 600
361#define SO_IDLE_BK_IDLE_RCV_HIWAT 131072
362
363struct soextbkidlestat soextbkidlestat;
364
365SYSCTL_UINT(_kern_ipc, OID_AUTO, maxextbkidleperproc,
366 CTLFLAG_RW | CTLFLAG_LOCKED, &soextbkidlestat.so_xbkidle_maxperproc, 0,
367 "Maximum of extended background idle sockets per process");
368
369SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidletime, CTLFLAG_RW | CTLFLAG_LOCKED,
370 &soextbkidlestat.so_xbkidle_time, 0,
371 "Time in seconds to keep extended background idle sockets");
372
373SYSCTL_UINT(_kern_ipc, OID_AUTO, extbkidlercvhiwat, CTLFLAG_RW | CTLFLAG_LOCKED,
374 &soextbkidlestat.so_xbkidle_rcvhiwat, 0,
375 "High water mark for extended background idle sockets");
376
377SYSCTL_STRUCT(_kern_ipc, OID_AUTO, extbkidlestat, CTLFLAG_RD | CTLFLAG_LOCKED,
378 &soextbkidlestat, soextbkidlestat, "");
379
380int so_set_extended_bk_idle(struct socket *, int);
381
382#define SO_MAX_MSG_X 1024
383
384/*
385 * SOTCDB_NO_DSCP is set by default, to prevent the networking stack from
386 * setting the DSCP code on the packet based on the service class; see
387 * <rdar://problem/11277343> for details.
388 */
389__private_extern__ u_int32_t sotcdb = 0;
390SYSCTL_INT(_kern_ipc, OID_AUTO, sotcdb, CTLFLAG_RW | CTLFLAG_LOCKED,
391 &sotcdb, 0, "");
392
393void
394socketinit(void)
395{
396 _CASSERT(sizeof(so_gencnt) == sizeof(uint64_t));
397 VERIFY(IS_P2ALIGNED(&so_gencnt, sizeof(uint32_t)));
398
399#ifdef __LP64__
400 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user64_sa_endpoints));
401 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user64_sa_endpoints, sae_srcif));
402 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user64_sa_endpoints, sae_srcaddr));
403 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user64_sa_endpoints, sae_srcaddrlen));
404 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user64_sa_endpoints, sae_dstaddr));
405 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user64_sa_endpoints, sae_dstaddrlen));
406#else
407 _CASSERT(sizeof(struct sa_endpoints) == sizeof(struct user32_sa_endpoints));
408 _CASSERT(offsetof(struct sa_endpoints, sae_srcif) == offsetof(struct user32_sa_endpoints, sae_srcif));
409 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddr) == offsetof(struct user32_sa_endpoints, sae_srcaddr));
410 _CASSERT(offsetof(struct sa_endpoints, sae_srcaddrlen) == offsetof(struct user32_sa_endpoints, sae_srcaddrlen));
411 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddr) == offsetof(struct user32_sa_endpoints, sae_dstaddr));
412 _CASSERT(offsetof(struct sa_endpoints, sae_dstaddrlen) == offsetof(struct user32_sa_endpoints, sae_dstaddrlen));
413#endif
414
415 if (socketinit_done) {
416 printf("socketinit: already called...\n");
417 return;
418 }
419 socketinit_done = 1;
420
421 PE_parse_boot_argn(arg_string: "socket_debug", arg_ptr: &socket_debug,
422 max_arg: sizeof(socket_debug));
423
424 PE_parse_boot_argn(arg_string: "sosend_assert_panic", arg_ptr: &sosend_assert_panic,
425 max_arg: sizeof(sosend_assert_panic));
426
427 STAILQ_INIT(&so_cache_head);
428
429 so_cache_zone_element_size = (vm_size_t)(sizeof(struct socket) + 4
430 + get_inpcb_str_size() + 4 + get_tcp_str_size());
431
432 so_cache_zone = zone_create(name: "socache zone", size: so_cache_zone_element_size,
433 flags: ZC_PGZ_USE_GUARDS | ZC_ZFREE_CLEARMEM);
434
435 bzero(s: &soextbkidlestat, n: sizeof(struct soextbkidlestat));
436 soextbkidlestat.so_xbkidle_maxperproc = SO_IDLE_BK_IDLE_MAX_PER_PROC;
437 soextbkidlestat.so_xbkidle_time = SO_IDLE_BK_IDLE_TIME;
438 soextbkidlestat.so_xbkidle_rcvhiwat = SO_IDLE_BK_IDLE_RCV_HIWAT;
439
440 in_pcbinit();
441}
442
443static void
444cached_sock_alloc(struct socket **so, zalloc_flags_t how)
445{
446 caddr_t temp;
447 uintptr_t offset;
448
449 lck_mtx_lock(lck: &so_cache_mtx);
450
451 if (!STAILQ_EMPTY(&so_cache_head)) {
452 VERIFY(cached_sock_count > 0);
453
454 *so = STAILQ_FIRST(&so_cache_head);
455 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
456 STAILQ_NEXT((*so), so_cache_ent) = NULL;
457
458 cached_sock_count--;
459 lck_mtx_unlock(lck: &so_cache_mtx);
460
461 temp = (*so)->so_saved_pcb;
462 bzero(s: (caddr_t)*so, n: sizeof(struct socket));
463
464 (*so)->so_saved_pcb = temp;
465 } else {
466 lck_mtx_unlock(lck: &so_cache_mtx);
467
468 *so = zalloc_flags(so_cache_zone, how | Z_ZERO);
469
470 /*
471 * Define offsets for extra structures into our
472 * single block of memory. Align extra structures
473 * on longword boundaries.
474 */
475
476 offset = (uintptr_t)*so;
477 offset += sizeof(struct socket);
478
479 offset = ALIGN(offset);
480
481 (*so)->so_saved_pcb = (caddr_t)offset;
482 offset += get_inpcb_str_size();
483
484 offset = ALIGN(offset);
485
486 ((struct inpcb *)(void *)(*so)->so_saved_pcb)->inp_saved_ppcb =
487 (caddr_t)offset;
488 }
489
490 OSBitOrAtomic(SOF1_CACHED_IN_SOCK_LAYER, &(*so)->so_flags1);
491}
492
493static void
494cached_sock_free(struct socket *so)
495{
496 lck_mtx_lock(lck: &so_cache_mtx);
497
498 so_cache_time = net_uptime();
499 if (++cached_sock_count > max_cached_sock_count) {
500 --cached_sock_count;
501 lck_mtx_unlock(lck: &so_cache_mtx);
502 zfree(so_cache_zone, so);
503 } else {
504 if (so_cache_hw < cached_sock_count) {
505 so_cache_hw = cached_sock_count;
506 }
507
508 STAILQ_INSERT_TAIL(&so_cache_head, so, so_cache_ent);
509
510 so->cache_timestamp = so_cache_time;
511 lck_mtx_unlock(lck: &so_cache_mtx);
512 }
513}
514
515void
516so_update_last_owner_locked(struct socket *so, proc_t self)
517{
518 if (so->last_pid != 0) {
519 /*
520 * last_pid and last_upid should remain zero for sockets
521 * created using sock_socket. The check above achieves that
522 */
523 if (self == PROC_NULL) {
524 self = current_proc();
525 }
526
527 if (so->last_upid != proc_uniqueid(self) ||
528 so->last_pid != proc_pid(self)) {
529 so->last_upid = proc_uniqueid(self);
530 so->last_pid = proc_pid(self);
531 proc_getexecutableuuid(self, so->last_uuid,
532 sizeof(so->last_uuid));
533 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
534 (*so->so_proto->pr_update_last_owner)(so, self, NULL);
535 }
536 }
537 proc_pidoriginatoruuid(uuid_buf: so->so_vuuid, buffersize: sizeof(so->so_vuuid));
538 }
539}
540
541void
542so_update_policy(struct socket *so)
543{
544 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
545 (void) inp_update_policy(sotoinpcb(so));
546 }
547}
548
549#if NECP
550static void
551so_update_necp_policy(struct socket *so, struct sockaddr *override_local_addr,
552 struct sockaddr *override_remote_addr)
553{
554 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
555 inp_update_necp_policy(sotoinpcb(so), override_local_addr,
556 override_remote_addr, 0);
557 }
558}
559#endif /* NECP */
560
561boolean_t
562so_cache_timer(void)
563{
564 struct socket *p;
565 int n_freed = 0;
566 boolean_t rc = FALSE;
567
568 lck_mtx_lock(lck: &so_cache_mtx);
569 so_cache_timeouts++;
570 so_cache_time = net_uptime();
571
572 while (!STAILQ_EMPTY(&so_cache_head)) {
573 VERIFY(cached_sock_count > 0);
574 p = STAILQ_FIRST(&so_cache_head);
575 if ((so_cache_time - p->cache_timestamp) <
576 SO_CACHE_TIME_LIMIT) {
577 break;
578 }
579
580 STAILQ_REMOVE_HEAD(&so_cache_head, so_cache_ent);
581 --cached_sock_count;
582
583 zfree(so_cache_zone, p);
584
585 if (++n_freed >= SO_CACHE_MAX_FREE_BATCH) {
586 so_cache_max_freed++;
587 break;
588 }
589 }
590
591 /* Schedule again if there is more to cleanup */
592 if (!STAILQ_EMPTY(&so_cache_head)) {
593 rc = TRUE;
594 }
595
596 lck_mtx_unlock(lck: &so_cache_mtx);
597 return rc;
598}
599
600/*
601 * Get a socket structure from our zone, and initialize it.
602 * We don't implement `waitok' yet (see comments in uipc_domain.c).
603 * Note that it would probably be better to allocate socket
604 * and PCB at the same time, but I'm not convinced that all
605 * the protocols can be easily modified to do this.
606 */
607struct socket *
608soalloc(int waitok, int dom, int type)
609{
610 zalloc_flags_t how = waitok ? Z_WAITOK : Z_NOWAIT;
611 struct socket *so;
612
613 if ((dom == PF_INET) && (type == SOCK_STREAM)) {
614 cached_sock_alloc(so: &so, how);
615 } else {
616 so = zalloc_flags(socket_zone, how | Z_ZERO);
617 }
618 if (so != NULL) {
619 so->so_gencnt = OSIncrementAtomic64(address: (SInt64 *)&so_gencnt);
620
621 /*
622 * Increment the socket allocation statistics
623 */
624 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_alloc_total);
625 }
626
627 return so;
628}
629
630int
631socreate_internal(int dom, struct socket **aso, int type, int proto,
632 struct proc *p, uint32_t flags, struct proc *ep)
633{
634 struct protosw *prp;
635 struct socket *so;
636 int error = 0;
637 pid_t rpid = -1;
638
639#if TCPDEBUG
640 extern int tcpconsdebug;
641#endif
642
643 VERIFY(aso != NULL);
644 *aso = NULL;
645
646 if (proto != 0) {
647 prp = pffindproto(family: dom, protocol: proto, type);
648 } else {
649 prp = pffindtype(dom, type);
650 }
651
652 if (prp == NULL || prp->pr_usrreqs->pru_attach == NULL) {
653 if (pffinddomain(dom) == NULL) {
654 return EAFNOSUPPORT;
655 }
656 if (proto != 0) {
657 if (pffindprotonotype(dom, proto) != NULL) {
658 return EPROTOTYPE;
659 }
660 }
661 return EPROTONOSUPPORT;
662 }
663 if (prp->pr_type != type) {
664 return EPROTOTYPE;
665 }
666 so = soalloc(waitok: 1, dom, type);
667 if (so == NULL) {
668 return ENOBUFS;
669 }
670
671 switch (dom) {
672 case PF_LOCAL:
673 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_local_total);
674 break;
675 case PF_INET:
676 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet_total);
677 if (type == SOCK_STREAM) {
678 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_stream_total);
679 } else {
680 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet_dgram_total);
681 }
682 break;
683 case PF_ROUTE:
684 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_route_total);
685 break;
686 case PF_NDRV:
687 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_ndrv_total);
688 break;
689 case PF_KEY:
690 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_key_total);
691 break;
692 case PF_INET6:
693 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_inet6_total);
694 if (type == SOCK_STREAM) {
695 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_stream_total);
696 } else {
697 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_inet6_dgram_total);
698 }
699 break;
700 case PF_SYSTEM:
701 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_system_total);
702 break;
703 case PF_MULTIPATH:
704 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_multipath_total);
705 break;
706 default:
707 INC_ATOMIC_INT64_LIM(net_api_stats.nas_socket_domain_other_total);
708 break;
709 }
710
711 if (flags & SOCF_MPTCP) {
712 so->so_state |= SS_NBIO;
713 }
714
715 TAILQ_INIT(&so->so_incomp);
716 TAILQ_INIT(&so->so_comp);
717 so->so_type = (short)type;
718 so->so_family = prp->pr_domain->dom_family;
719 so->so_protocol = prp->pr_protocol;
720 so->last_upid = proc_uniqueid(p);
721 so->last_pid = proc_pid(p);
722 proc_getexecutableuuid(p, so->last_uuid, sizeof(so->last_uuid));
723 proc_pidoriginatoruuid(uuid_buf: so->so_vuuid, buffersize: sizeof(so->so_vuuid));
724
725 so->so_rpid = -1;
726 uuid_clear(uu: so->so_ruuid);
727
728 if (ep != PROC_NULL && ep != p) {
729 so->e_upid = proc_uniqueid(ep);
730 so->e_pid = proc_pid(ep);
731 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
732 so->so_flags |= SOF_DELEGATED;
733 if (ep->p_responsible_pid != so->e_pid) {
734 rpid = ep->p_responsible_pid;
735 so->so_rpid = rpid;
736 proc_getresponsibleuuid(ep, so->so_ruuid, sizeof(so->so_ruuid));
737 }
738 }
739
740 if (rpid < 0 && p->p_responsible_pid != so->last_pid) {
741 rpid = p->p_responsible_pid;
742 so->so_rpid = rpid;
743 proc_getresponsibleuuid(p, so->so_ruuid, sizeof(so->so_ruuid));
744 }
745
746 so->so_cred = kauth_cred_proc_ref(procp: p);
747 if (!suser(cred: kauth_cred_get(), NULL)) {
748 so->so_state |= SS_PRIV;
749 }
750
751 so->so_persona_id = current_persona_get_id();
752 so->so_proto = prp;
753 so->so_rcv.sb_flags |= SB_RECV;
754 so->so_rcv.sb_so = so->so_snd.sb_so = so;
755 so->next_lock_lr = 0;
756 so->next_unlock_lr = 0;
757
758 /*
759 * Attachment will create the per pcb lock if necessary and
760 * increase refcount for creation, make sure it's done before
761 * socket is inserted in lists.
762 */
763 so->so_usecount++;
764
765 error = (*prp->pr_usrreqs->pru_attach)(so, proto, p);
766 if (error != 0) {
767 /*
768 * Warning:
769 * If so_pcb is not zero, the socket will be leaked,
770 * so protocol attachment handler must be coded carefuly
771 */
772 if (so->so_pcb != NULL) {
773 os_log_error(OS_LOG_DEFAULT,
774 "so_pcb not NULL after pru_attach error %d for dom %d, proto %d, type %d",
775 error, dom, proto, type);
776 }
777 /*
778 * Both SS_NOFDREF and SOF_PCBCLEARING should be set to free the socket
779 */
780 so->so_state |= SS_NOFDREF;
781 so->so_flags |= SOF_PCBCLEARING;
782 VERIFY(so->so_usecount > 0);
783 so->so_usecount--;
784 sofreelastref(so, 1); /* will deallocate the socket */
785 return error;
786 }
787
788 /*
789 * Note: needs so_pcb to be set after pru_attach
790 */
791 if (prp->pr_update_last_owner != NULL) {
792 (*prp->pr_update_last_owner)(so, p, ep);
793 }
794
795 os_atomic_inc(&prp->pr_domain->dom_refs, relaxed);
796
797 /* Attach socket filters for this protocol */
798 sflt_initsock(so);
799#if TCPDEBUG
800 if (tcpconsdebug == 2) {
801 so->so_options |= SO_DEBUG;
802 }
803#endif
804 so_set_default_traffic_class(so);
805
806 /*
807 * If this thread or task is marked to create backgrounded sockets,
808 * mark the socket as background.
809 */
810 if (!(flags & SOCF_MPTCP) &&
811 proc_get_effective_thread_policy(thread: current_thread(), TASK_POLICY_NEW_SOCKETS_BG)) {
812 socket_set_traffic_mgt_flags(so, TRAFFIC_MGT_SO_BACKGROUND);
813 so->so_background_thread = current_thread();
814 }
815
816 switch (dom) {
817 /*
818 * Don't mark Unix domain or system
819 * eligible for defunct by default.
820 */
821 case PF_LOCAL:
822 case PF_SYSTEM:
823 so->so_flags |= SOF_NODEFUNCT;
824 break;
825 default:
826 break;
827 }
828
829 /*
830 * Entitlements can't be checked at socket creation time except if the
831 * application requested a feature guarded by a privilege (c.f., socket
832 * delegation).
833 * The priv(9) and the Sandboxing APIs are designed with the idea that
834 * a privilege check should only be triggered by a userland request.
835 * A privilege check at socket creation time is time consuming and
836 * could trigger many authorisation error messages from the security
837 * APIs.
838 */
839
840 *aso = so;
841
842 return 0;
843}
844
845/*
846 * Returns: 0 Success
847 * EAFNOSUPPORT
848 * EPROTOTYPE
849 * EPROTONOSUPPORT
850 * ENOBUFS
851 * <pru_attach>:ENOBUFS[AF_UNIX]
852 * <pru_attach>:ENOBUFS[TCP]
853 * <pru_attach>:ENOMEM[TCP]
854 * <pru_attach>:??? [other protocol families, IPSEC]
855 */
856int
857socreate(int dom, struct socket **aso, int type, int proto)
858{
859 return socreate_internal(dom, aso, type, proto, p: current_proc(), flags: 0,
860 PROC_NULL);
861}
862
863int
864socreate_delegate(int dom, struct socket **aso, int type, int proto, pid_t epid)
865{
866 int error = 0;
867 struct proc *ep = PROC_NULL;
868
869 if ((proc_selfpid() != epid) && ((ep = proc_find(pid: epid)) == PROC_NULL)) {
870 error = ESRCH;
871 goto done;
872 }
873
874 error = socreate_internal(dom, aso, type, proto, p: current_proc(), flags: 0, ep);
875
876 /*
877 * It might not be wise to hold the proc reference when calling
878 * socreate_internal since it calls soalloc with M_WAITOK
879 */
880done:
881 if (ep != PROC_NULL) {
882 proc_rele(p: ep);
883 }
884
885 return error;
886}
887
888/*
889 * Returns: 0 Success
890 * <pru_bind>:EINVAL Invalid argument [COMMON_START]
891 * <pru_bind>:EAFNOSUPPORT Address family not supported
892 * <pru_bind>:EADDRNOTAVAIL Address not available.
893 * <pru_bind>:EINVAL Invalid argument
894 * <pru_bind>:EAFNOSUPPORT Address family not supported [notdef]
895 * <pru_bind>:EACCES Permission denied
896 * <pru_bind>:EADDRINUSE Address in use
897 * <pru_bind>:EAGAIN Resource unavailable, try again
898 * <pru_bind>:EPERM Operation not permitted
899 * <pru_bind>:???
900 * <sf_bind>:???
901 *
902 * Notes: It's not possible to fully enumerate the return codes above,
903 * since socket filter authors and protocol family authors may
904 * not choose to limit their error returns to those listed, even
905 * though this may result in some software operating incorrectly.
906 *
907 * The error codes which are enumerated above are those known to
908 * be returned by the tcp_usr_bind function supplied.
909 */
910int
911sobindlock(struct socket *so, struct sockaddr *nam, int dolock)
912{
913 struct proc *p = current_proc();
914 int error = 0;
915
916 if (dolock) {
917 socket_lock(so, refcount: 1);
918 }
919
920 so_update_last_owner_locked(so, self: p);
921 so_update_policy(so);
922
923#if NECP
924 so_update_necp_policy(so, override_local_addr: nam, NULL);
925#endif /* NECP */
926
927 /*
928 * If this is a bind request on a socket that has been marked
929 * as inactive, reject it now before we go any further.
930 */
931 if (so->so_flags & SOF_DEFUNCT) {
932 error = EINVAL;
933 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
934 __func__, proc_pid(p), proc_best_name(p),
935 so->so_gencnt,
936 SOCK_DOM(so), SOCK_TYPE(so), error);
937 goto out;
938 }
939
940 /* Socket filter */
941 error = sflt_bind(so, nam);
942
943 if (error == 0) {
944 error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p);
945 }
946out:
947 if (dolock) {
948 socket_unlock(so, refcount: 1);
949 }
950
951 if (error == EJUSTRETURN) {
952 error = 0;
953 }
954
955 return error;
956}
957
958void
959sodealloc(struct socket *so)
960{
961 kauth_cred_unref(&so->so_cred);
962
963 /* Remove any filters */
964 sflt_termsock(so);
965
966 so->so_gencnt = OSIncrementAtomic64(address: (SInt64 *)&so_gencnt);
967
968 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
969 cached_sock_free(so);
970 } else {
971 zfree(socket_zone, so);
972 }
973}
974
975/*
976 * Returns: 0 Success
977 * EINVAL
978 * EOPNOTSUPP
979 * <pru_listen>:EINVAL[AF_UNIX]
980 * <pru_listen>:EINVAL[TCP]
981 * <pru_listen>:EADDRNOTAVAIL[TCP] Address not available.
982 * <pru_listen>:EINVAL[TCP] Invalid argument
983 * <pru_listen>:EAFNOSUPPORT[TCP] Address family not supported [notdef]
984 * <pru_listen>:EACCES[TCP] Permission denied
985 * <pru_listen>:EADDRINUSE[TCP] Address in use
986 * <pru_listen>:EAGAIN[TCP] Resource unavailable, try again
987 * <pru_listen>:EPERM[TCP] Operation not permitted
988 * <sf_listen>:???
989 *
990 * Notes: Other <pru_listen> returns depend on the protocol family; all
991 * <sf_listen> returns depend on what the filter author causes
992 * their filter to return.
993 */
994int
995solisten(struct socket *so, int backlog)
996{
997 struct proc *p = current_proc();
998 int error = 0;
999
1000 socket_lock(so, refcount: 1);
1001
1002 so_update_last_owner_locked(so, self: p);
1003 so_update_policy(so);
1004
1005 if (TAILQ_EMPTY(&so->so_comp)) {
1006 so->so_options |= SO_ACCEPTCONN;
1007 }
1008
1009#if NECP
1010 so_update_necp_policy(so, NULL, NULL);
1011#endif /* NECP */
1012
1013 if (so->so_proto == NULL) {
1014 error = EINVAL;
1015 so->so_options &= ~SO_ACCEPTCONN;
1016 goto out;
1017 }
1018 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) == 0) {
1019 error = EOPNOTSUPP;
1020 so->so_options &= ~SO_ACCEPTCONN;
1021 goto out;
1022 }
1023
1024 /*
1025 * If the listen request is made on a socket that is not fully
1026 * disconnected, or on a socket that has been marked as inactive,
1027 * reject the request now.
1028 */
1029 if ((so->so_state &
1030 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) ||
1031 (so->so_flags & SOF_DEFUNCT)) {
1032 error = EINVAL;
1033 if (so->so_flags & SOF_DEFUNCT) {
1034 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
1035 "(%d)\n", __func__, proc_pid(p),
1036 proc_best_name(p),
1037 so->so_gencnt,
1038 SOCK_DOM(so), SOCK_TYPE(so), error);
1039 }
1040 so->so_options &= ~SO_ACCEPTCONN;
1041 goto out;
1042 }
1043
1044 if ((so->so_restrictions & SO_RESTRICT_DENY_IN) != 0) {
1045 error = EPERM;
1046 so->so_options &= ~SO_ACCEPTCONN;
1047 goto out;
1048 }
1049
1050 error = sflt_listen(so);
1051 if (error == 0) {
1052 error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p);
1053 }
1054
1055 if (error) {
1056 if (error == EJUSTRETURN) {
1057 error = 0;
1058 }
1059 so->so_options &= ~SO_ACCEPTCONN;
1060 goto out;
1061 }
1062
1063 /*
1064 * POSIX: The implementation may have an upper limit on the length of
1065 * the listen queue-either global or per accepting socket. If backlog
1066 * exceeds this limit, the length of the listen queue is set to the
1067 * limit.
1068 *
1069 * If listen() is called with a backlog argument value that is less
1070 * than 0, the function behaves as if it had been called with a backlog
1071 * argument value of 0.
1072 *
1073 * A backlog argument of 0 may allow the socket to accept connections,
1074 * in which case the length of the listen queue may be set to an
1075 * implementation-defined minimum value.
1076 */
1077 if (backlog <= 0 || backlog > somaxconn) {
1078 backlog = somaxconn;
1079 }
1080
1081 so->so_qlimit = (short)backlog;
1082out:
1083 socket_unlock(so, refcount: 1);
1084 return error;
1085}
1086
1087/*
1088 * The "accept list lock" protects the fields related to the listener queues
1089 * because we can unlock a socket to respect the lock ordering between
1090 * the listener socket and its clients sockets. The lock ordering is first to
1091 * acquire the client socket before the listener socket.
1092 *
1093 * The accept list lock serializes access to the following fields:
1094 * - of the listener socket:
1095 * - so_comp
1096 * - so_incomp
1097 * - so_qlen
1098 * - so_inqlen
1099 * - of client sockets that are in so_comp or so_incomp:
1100 * - so_head
1101 * - so_list
1102 *
1103 * As one can see the accept list lock protects the consistent of the
1104 * linkage of the client sockets.
1105 *
1106 * Note that those fields may be read without holding the accept list lock
1107 * for a preflight provided the accept list lock is taken when committing
1108 * to take an action based on the result of the preflight. The preflight
1109 * saves the cost of doing the unlock/lock dance.
1110 */
1111void
1112so_acquire_accept_list(struct socket *head, struct socket *so)
1113{
1114 lck_mtx_t *mutex_held;
1115
1116 if (head->so_proto->pr_getlock == NULL) {
1117 return;
1118 }
1119 mutex_held = (*head->so_proto->pr_getlock)(head, PR_F_WILLUNLOCK);
1120 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1121
1122 if (!(head->so_flags1 & SOF1_ACCEPT_LIST_HELD)) {
1123 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1124 return;
1125 }
1126 if (so != NULL) {
1127 socket_unlock(so, refcount: 0);
1128 }
1129 while (head->so_flags1 & SOF1_ACCEPT_LIST_HELD) {
1130 so_accept_list_waits += 1;
1131 msleep(chan: (caddr_t)&head->so_incomp, mtx: mutex_held,
1132 PSOCK | PCATCH, wmesg: __func__, NULL);
1133 }
1134 head->so_flags1 |= SOF1_ACCEPT_LIST_HELD;
1135 if (so != NULL) {
1136 socket_unlock(so: head, refcount: 0);
1137 socket_lock(so, refcount: 0);
1138 socket_lock(so: head, refcount: 0);
1139 }
1140}
1141
1142void
1143so_release_accept_list(struct socket *head)
1144{
1145 if (head->so_proto->pr_getlock != NULL) {
1146 lck_mtx_t *mutex_held;
1147
1148 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
1149 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1150
1151 head->so_flags1 &= ~SOF1_ACCEPT_LIST_HELD;
1152 wakeup(chan: (caddr_t)&head->so_incomp);
1153 }
1154}
1155
1156void
1157sofreelastref(struct socket *so, int dealloc)
1158{
1159 struct socket *head = so->so_head;
1160
1161 /* Assume socket is locked */
1162
1163#if FLOW_DIVERT
1164 if (so->so_flags & SOF_FLOW_DIVERT) {
1165 flow_divert_detach(so);
1166 }
1167#endif /* FLOW_DIVERT */
1168
1169#if CONTENT_FILTER
1170 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1171 cfil_sock_detach(so);
1172 }
1173#endif /* CONTENT_FILTER */
1174
1175 if (NEED_DGRAM_FLOW_TRACKING(so)) {
1176 soflow_detach(so);
1177 }
1178
1179 if (!(so->so_flags & SOF_PCBCLEARING) || !(so->so_state & SS_NOFDREF)) {
1180 selthreadclear(&so->so_snd.sb_sel);
1181 selthreadclear(&so->so_rcv.sb_sel);
1182 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1183 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1184 so->so_event = sonullevent;
1185 return;
1186 }
1187 if (head != NULL) {
1188 /*
1189 * Need to lock the listener when the protocol has
1190 * per socket locks
1191 */
1192 if (head->so_proto->pr_getlock != NULL) {
1193 socket_lock(so: head, refcount: 1);
1194 so_acquire_accept_list(head, so);
1195 }
1196 if (so->so_state & SS_INCOMP) {
1197 so->so_state &= ~SS_INCOMP;
1198 TAILQ_REMOVE(&head->so_incomp, so, so_list);
1199 head->so_incqlen--;
1200 head->so_qlen--;
1201 so->so_head = NULL;
1202
1203 if (head->so_proto->pr_getlock != NULL) {
1204 so_release_accept_list(head);
1205 socket_unlock(so: head, refcount: 1);
1206 }
1207 } else if (so->so_state & SS_COMP) {
1208 if (head->so_proto->pr_getlock != NULL) {
1209 so_release_accept_list(head);
1210 socket_unlock(so: head, refcount: 1);
1211 }
1212 /*
1213 * We must not decommission a socket that's
1214 * on the accept(2) queue. If we do, then
1215 * accept(2) may hang after select(2) indicated
1216 * that the listening socket was ready.
1217 */
1218 selthreadclear(&so->so_snd.sb_sel);
1219 selthreadclear(&so->so_rcv.sb_sel);
1220 so->so_rcv.sb_flags &= ~(SB_SEL | SB_UPCALL);
1221 so->so_snd.sb_flags &= ~(SB_SEL | SB_UPCALL);
1222 so->so_event = sonullevent;
1223 return;
1224 } else {
1225 if (head->so_proto->pr_getlock != NULL) {
1226 so_release_accept_list(head);
1227 socket_unlock(so: head, refcount: 1);
1228 }
1229 printf("sofree: not queued\n");
1230 }
1231 }
1232 sowflush(so);
1233 sorflush(so);
1234
1235 /* 3932268: disable upcall */
1236 so->so_rcv.sb_flags &= ~SB_UPCALL;
1237 so->so_snd.sb_flags &= ~(SB_UPCALL | SB_SNDBYTE_CNT);
1238 so->so_event = sonullevent;
1239
1240 if (dealloc) {
1241 sodealloc(so);
1242 }
1243}
1244
1245void
1246soclose_wait_locked(struct socket *so)
1247{
1248 lck_mtx_t *mutex_held;
1249
1250 if (so->so_proto->pr_getlock != NULL) {
1251 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1252 } else {
1253 mutex_held = so->so_proto->pr_domain->dom_mtx;
1254 }
1255 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1256
1257 /*
1258 * Double check here and return if there's no outstanding upcall;
1259 * otherwise proceed further only if SOF_UPCALLCLOSEWAIT is set.
1260 */
1261 if (!so->so_upcallusecount || !(so->so_flags & SOF_UPCALLCLOSEWAIT)) {
1262 return;
1263 }
1264 so->so_rcv.sb_flags &= ~SB_UPCALL;
1265 so->so_snd.sb_flags &= ~SB_UPCALL;
1266 so->so_flags |= SOF_CLOSEWAIT;
1267
1268 (void) msleep(chan: (caddr_t)&so->so_upcallusecount, mtx: mutex_held, pri: (PZERO - 1),
1269 wmesg: "soclose_wait_locked", NULL);
1270 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1271 so->so_flags &= ~SOF_CLOSEWAIT;
1272}
1273
1274/*
1275 * Close a socket on last file table reference removal.
1276 * Initiate disconnect if connected.
1277 * Free socket when disconnect complete.
1278 */
1279int
1280soclose_locked(struct socket *so)
1281{
1282 int error = 0;
1283 struct timespec ts;
1284
1285 if (so->so_usecount == 0) {
1286 panic("soclose: so=%p refcount=0", so);
1287 /* NOTREACHED */
1288 }
1289
1290 sflt_notify(so, event: sock_evt_closing, NULL);
1291
1292 if (so->so_upcallusecount) {
1293 soclose_wait_locked(so);
1294 }
1295
1296#if CONTENT_FILTER
1297 /*
1298 * We have to wait until the content filters are done
1299 */
1300 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
1301 cfil_sock_close_wait(so);
1302 cfil_sock_is_closed(so);
1303 cfil_sock_detach(so);
1304 }
1305#endif /* CONTENT_FILTER */
1306
1307 if (NEED_DGRAM_FLOW_TRACKING(so)) {
1308 soflow_detach(so);
1309 }
1310
1311 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
1312 soresume(current_proc(), so, 1);
1313 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
1314 }
1315
1316 if ((so->so_options & SO_ACCEPTCONN)) {
1317 struct socket *sp, *sonext;
1318 int persocklock = 0;
1319 int incomp_overflow_only;
1320
1321 /*
1322 * We do not want new connection to be added
1323 * to the connection queues
1324 */
1325 so->so_options &= ~SO_ACCEPTCONN;
1326
1327 /*
1328 * We can drop the lock on the listener once
1329 * we've acquired the incoming list
1330 */
1331 if (so->so_proto->pr_getlock != NULL) {
1332 persocklock = 1;
1333 so_acquire_accept_list(head: so, NULL);
1334 socket_unlock(so, refcount: 0);
1335 }
1336again:
1337 incomp_overflow_only = 1;
1338
1339 TAILQ_FOREACH_SAFE(sp, &so->so_incomp, so_list, sonext) {
1340 /*
1341 * Radar 5350314
1342 * skip sockets thrown away by tcpdropdropblreq
1343 * they will get cleanup by the garbage collection.
1344 * otherwise, remove the incomp socket from the queue
1345 * and let soabort trigger the appropriate cleanup.
1346 */
1347 if (sp->so_flags & SOF_OVERFLOW) {
1348 continue;
1349 }
1350
1351 if (persocklock != 0) {
1352 socket_lock(so: sp, refcount: 1);
1353 }
1354
1355 /*
1356 * Radar 27945981
1357 * The extra reference for the list insure the
1358 * validity of the socket pointer when we perform the
1359 * unlock of the head above
1360 */
1361 if (sp->so_state & SS_INCOMP) {
1362 sp->so_state &= ~SS_INCOMP;
1363 sp->so_head = NULL;
1364 TAILQ_REMOVE(&so->so_incomp, sp, so_list);
1365 so->so_incqlen--;
1366 so->so_qlen--;
1367
1368 (void) soabort(so: sp);
1369 } else {
1370 panic("%s sp %p in so_incomp but !SS_INCOMP",
1371 __func__, sp);
1372 }
1373
1374 if (persocklock != 0) {
1375 socket_unlock(so: sp, refcount: 1);
1376 }
1377 }
1378
1379 TAILQ_FOREACH_SAFE(sp, &so->so_comp, so_list, sonext) {
1380 /* Dequeue from so_comp since sofree() won't do it */
1381 if (persocklock != 0) {
1382 socket_lock(so: sp, refcount: 1);
1383 }
1384
1385 if (sp->so_state & SS_COMP) {
1386 sp->so_state &= ~SS_COMP;
1387 sp->so_head = NULL;
1388 TAILQ_REMOVE(&so->so_comp, sp, so_list);
1389 so->so_qlen--;
1390
1391 (void) soabort(so: sp);
1392 } else {
1393 panic("%s sp %p in so_comp but !SS_COMP",
1394 __func__, sp);
1395 }
1396
1397 if (persocklock) {
1398 socket_unlock(so: sp, refcount: 1);
1399 }
1400 }
1401
1402 if (incomp_overflow_only == 0 && !TAILQ_EMPTY(&so->so_incomp)) {
1403#if (DEBUG | DEVELOPMENT)
1404 panic("%s head %p so_comp not empty", __func__, so);
1405#endif /* (DEVELOPMENT || DEBUG) */
1406
1407 goto again;
1408 }
1409
1410 if (!TAILQ_EMPTY(&so->so_comp)) {
1411#if (DEBUG | DEVELOPMENT)
1412 panic("%s head %p so_comp not empty", __func__, so);
1413#endif /* (DEVELOPMENT || DEBUG) */
1414
1415 goto again;
1416 }
1417
1418 if (persocklock) {
1419 socket_lock(so, refcount: 0);
1420 so_release_accept_list(head: so);
1421 }
1422 }
1423 if (so->so_pcb == NULL) {
1424 /* 3915887: mark the socket as ready for dealloc */
1425 so->so_flags |= SOF_PCBCLEARING;
1426 goto discard;
1427 }
1428
1429 if (so->so_state & SS_ISCONNECTED) {
1430 if ((so->so_state & SS_ISDISCONNECTING) == 0) {
1431 error = sodisconnectlocked(so);
1432 if (error) {
1433 goto drop;
1434 }
1435 }
1436 if (so->so_options & SO_LINGER) {
1437 if ((so->so_state & SS_ISDISCONNECTING) &&
1438 (so->so_state & SS_NBIO)) {
1439 goto drop;
1440 }
1441 while ((so->so_state & SS_ISCONNECTED) && so->so_linger > 0) {
1442 lck_mtx_t *mutex_held;
1443
1444 if (so->so_proto->pr_getlock != NULL) {
1445 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
1446 } else {
1447 mutex_held = so->so_proto->pr_domain->dom_mtx;
1448 }
1449 ts.tv_sec = (so->so_linger / 100);
1450 ts.tv_nsec = (so->so_linger % 100) *
1451 NSEC_PER_USEC * 1000 * 10;
1452 error = msleep(chan: (caddr_t)&so->so_timeo,
1453 mtx: mutex_held, PSOCK | PCATCH, wmesg: "soclose", ts: &ts);
1454 if (error) {
1455 /*
1456 * It's OK when the time fires,
1457 * don't report an error
1458 */
1459 if (error == EWOULDBLOCK) {
1460 error = 0;
1461 }
1462 break;
1463 }
1464 }
1465 }
1466 }
1467drop:
1468 if (so->so_usecount == 0) {
1469 panic("soclose: usecount is zero so=%p", so);
1470 /* NOTREACHED */
1471 }
1472 if (so->so_pcb != NULL && !(so->so_flags & SOF_PCBCLEARING)) {
1473 int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
1474 if (error == 0) {
1475 error = error2;
1476 }
1477 }
1478 if (so->so_usecount <= 0) {
1479 panic("soclose: usecount is zero so=%p", so);
1480 /* NOTREACHED */
1481 }
1482discard:
1483 if (so->so_pcb != NULL && !(so->so_flags & SOF_MP_SUBFLOW) &&
1484 (so->so_state & SS_NOFDREF)) {
1485 panic("soclose: NOFDREF");
1486 /* NOTREACHED */
1487 }
1488 so->so_state |= SS_NOFDREF;
1489
1490 if ((so->so_flags & SOF_KNOTE) != 0) {
1491 KNOTE(&so->so_klist, SO_FILT_HINT_LOCKED);
1492 }
1493
1494 os_atomic_dec(&so->so_proto->pr_domain->dom_refs, relaxed);
1495
1496 VERIFY(so->so_usecount > 0);
1497 so->so_usecount--;
1498 sofree(so);
1499 return error;
1500}
1501
1502int
1503soclose(struct socket *so)
1504{
1505 int error = 0;
1506 socket_lock(so, refcount: 1);
1507
1508 if (so->so_retaincnt == 0) {
1509 error = soclose_locked(so);
1510 } else {
1511 /*
1512 * if the FD is going away, but socket is
1513 * retained in kernel remove its reference
1514 */
1515 so->so_usecount--;
1516 if (so->so_usecount < 2) {
1517 panic("soclose: retaincnt non null and so=%p "
1518 "usecount=%d\n", so, so->so_usecount);
1519 }
1520 }
1521 socket_unlock(so, refcount: 1);
1522 return error;
1523}
1524
1525/*
1526 * Must be called at splnet...
1527 */
1528/* Should already be locked */
1529int
1530soabort(struct socket *so)
1531{
1532 int error;
1533
1534#ifdef MORE_LOCKING_DEBUG
1535 lck_mtx_t *mutex_held;
1536
1537 if (so->so_proto->pr_getlock != NULL) {
1538 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
1539 } else {
1540 mutex_held = so->so_proto->pr_domain->dom_mtx;
1541 }
1542 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
1543#endif
1544
1545 if ((so->so_flags & SOF_ABORTED) == 0) {
1546 so->so_flags |= SOF_ABORTED;
1547 error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
1548 if (error) {
1549 sofree(so);
1550 return error;
1551 }
1552 }
1553 return 0;
1554}
1555
1556int
1557soacceptlock(struct socket *so, struct sockaddr **nam, int dolock)
1558{
1559 int error;
1560
1561 if (dolock) {
1562 socket_lock(so, refcount: 1);
1563 }
1564
1565 so_update_last_owner_locked(so, PROC_NULL);
1566 so_update_policy(so);
1567#if NECP
1568 so_update_necp_policy(so, NULL, NULL);
1569#endif /* NECP */
1570
1571 if ((so->so_state & SS_NOFDREF) == 0) {
1572 panic("soaccept: !NOFDREF");
1573 }
1574 so->so_state &= ~SS_NOFDREF;
1575 error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
1576
1577 if (dolock) {
1578 socket_unlock(so, refcount: 1);
1579 }
1580 return error;
1581}
1582
1583int
1584soaccept(struct socket *so, struct sockaddr **nam)
1585{
1586 return soacceptlock(so, nam, dolock: 1);
1587}
1588
1589int
1590soacceptfilter(struct socket *so, struct socket *head)
1591{
1592 struct sockaddr *local = NULL, *remote = NULL;
1593 int error = 0;
1594
1595 /*
1596 * Hold the lock even if this socket has not been made visible
1597 * to the filter(s). For sockets with global locks, this protects
1598 * against the head or peer going away
1599 */
1600 socket_lock(so, refcount: 1);
1601 if (sogetaddr_locked(so, &remote, 1) != 0 ||
1602 sogetaddr_locked(so, &local, 0) != 0) {
1603 so->so_state &= ~SS_NOFDREF;
1604 socket_unlock(so, refcount: 1);
1605 soclose(so);
1606 /* Out of resources; try it again next time */
1607 error = ECONNABORTED;
1608 goto done;
1609 }
1610
1611 error = sflt_accept(head, so, local, remote);
1612
1613 /*
1614 * If we get EJUSTRETURN from one of the filters, mark this socket
1615 * as inactive and return it anyway. This newly accepted socket
1616 * will be disconnected later before we hand it off to the caller.
1617 */
1618 if (error == EJUSTRETURN) {
1619 error = 0;
1620 (void) sosetdefunct(current_proc(), so,
1621 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
1622 }
1623
1624 if (error != 0) {
1625 /*
1626 * This may seem like a duplication to the above error
1627 * handling part when we return ECONNABORTED, except
1628 * the following is done while holding the lock since
1629 * the socket has been exposed to the filter(s) earlier.
1630 */
1631 so->so_state &= ~SS_NOFDREF;
1632 socket_unlock(so, refcount: 1);
1633 soclose(so);
1634 /* Propagate socket filter's error code to the caller */
1635 } else {
1636 socket_unlock(so, refcount: 1);
1637 }
1638done:
1639 /* Callee checks for NULL pointer */
1640 sock_freeaddr(sockname: remote);
1641 sock_freeaddr(sockname: local);
1642 return error;
1643}
1644
1645/*
1646 * Returns: 0 Success
1647 * EOPNOTSUPP Operation not supported on socket
1648 * EISCONN Socket is connected
1649 * <pru_connect>:EADDRNOTAVAIL Address not available.
1650 * <pru_connect>:EINVAL Invalid argument
1651 * <pru_connect>:EAFNOSUPPORT Address family not supported [notdef]
1652 * <pru_connect>:EACCES Permission denied
1653 * <pru_connect>:EADDRINUSE Address in use
1654 * <pru_connect>:EAGAIN Resource unavailable, try again
1655 * <pru_connect>:EPERM Operation not permitted
1656 * <sf_connect_out>:??? [anything a filter writer might set]
1657 */
1658int
1659soconnectlock(struct socket *so, struct sockaddr *nam, int dolock)
1660{
1661 int error;
1662 struct proc *p = current_proc();
1663 tracker_metadata_t metadata = { };
1664
1665 if (dolock) {
1666 socket_lock(so, refcount: 1);
1667 }
1668
1669 so_update_last_owner_locked(so, self: p);
1670 so_update_policy(so);
1671
1672 /*
1673 * If this is a listening socket or if this is a previously-accepted
1674 * socket that has been marked as inactive, reject the connect request.
1675 */
1676 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1677 error = EOPNOTSUPP;
1678 if (so->so_flags & SOF_DEFUNCT) {
1679 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
1680 "(%d)\n", __func__, proc_pid(p),
1681 proc_best_name(p),
1682 so->so_gencnt,
1683 SOCK_DOM(so), SOCK_TYPE(so), error);
1684 }
1685 if (dolock) {
1686 socket_unlock(so, refcount: 1);
1687 }
1688 return error;
1689 }
1690
1691 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1692 if (dolock) {
1693 socket_unlock(so, refcount: 1);
1694 }
1695 return EPERM;
1696 }
1697
1698 /*
1699 * If protocol is connection-based, can only connect once.
1700 * Otherwise, if connected, try to disconnect first.
1701 * This allows user to disconnect by connecting to, e.g.,
1702 * a null address.
1703 */
1704 if (so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING) &&
1705 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1706 (error = sodisconnectlocked(so)))) {
1707 error = EISCONN;
1708 } else {
1709 /*
1710 * For connected v4/v6 sockets, check if destination address associates with a domain name and if it is
1711 * a tracker domain. Mark socket accordingly. Skip lookup if socket has already been marked a tracker.
1712 */
1713 if (!(so->so_flags1 & SOF1_KNOWN_TRACKER) && IS_INET(so)) {
1714 if (tracker_lookup(app_uuid: so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, nam, metadata: &metadata) == 0) {
1715 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1716 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1717 }
1718 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1719 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1720 }
1721 if (necp_set_socket_domain_attributes(so, domain: metadata.domain, domain_owner: metadata.domain_owner)) {
1722 printf("connect() - failed necp_set_socket_domain_attributes");
1723 }
1724 }
1725 }
1726
1727#if NECP
1728 /* Update NECP evaluation after setting any domain via the tracker checks */
1729 so_update_necp_policy(so, NULL, override_remote_addr: nam);
1730#endif /* NECP */
1731
1732 /*
1733 * Run connect filter before calling protocol:
1734 * - non-blocking connect returns before completion;
1735 */
1736 error = sflt_connectout(so, nam);
1737 if (error != 0) {
1738 if (error == EJUSTRETURN) {
1739 error = 0;
1740 }
1741 } else {
1742 error = (*so->so_proto->pr_usrreqs->pru_connect)
1743 (so, nam, p);
1744 if (error != 0) {
1745 so->so_state &= ~SS_ISCONNECTING;
1746 }
1747 }
1748 }
1749 if (dolock) {
1750 socket_unlock(so, refcount: 1);
1751 }
1752 return error;
1753}
1754
1755int
1756soconnect(struct socket *so, struct sockaddr *nam)
1757{
1758 return soconnectlock(so, nam, dolock: 1);
1759}
1760
1761/*
1762 * Returns: 0 Success
1763 * <pru_connect2>:EINVAL[AF_UNIX]
1764 * <pru_connect2>:EPROTOTYPE[AF_UNIX]
1765 * <pru_connect2>:??? [other protocol families]
1766 *
1767 * Notes: <pru_connect2> is not supported by [TCP].
1768 */
1769int
1770soconnect2(struct socket *so1, struct socket *so2)
1771{
1772 int error;
1773
1774 socket_lock(so: so1, refcount: 1);
1775 if (so2->so_proto->pr_lock) {
1776 socket_lock(so: so2, refcount: 1);
1777 }
1778
1779 error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
1780
1781 socket_unlock(so: so1, refcount: 1);
1782 if (so2->so_proto->pr_lock) {
1783 socket_unlock(so: so2, refcount: 1);
1784 }
1785 return error;
1786}
1787
1788int
1789soconnectxlocked(struct socket *so, struct sockaddr *src,
1790 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
1791 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
1792 uint32_t arglen, uio_t auio, user_ssize_t *bytes_written)
1793{
1794 int error;
1795 tracker_metadata_t metadata = { };
1796
1797 so_update_last_owner_locked(so, self: p);
1798 so_update_policy(so);
1799
1800 /*
1801 * If this is a listening socket or if this is a previously-accepted
1802 * socket that has been marked as inactive, reject the connect request.
1803 */
1804 if ((so->so_options & SO_ACCEPTCONN) || (so->so_flags & SOF_DEFUNCT)) {
1805 error = EOPNOTSUPP;
1806 if (so->so_flags & SOF_DEFUNCT) {
1807 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] "
1808 "(%d)\n", __func__, proc_pid(p),
1809 proc_best_name(p),
1810 so->so_gencnt,
1811 SOCK_DOM(so), SOCK_TYPE(so), error);
1812 }
1813 return error;
1814 }
1815
1816 if ((so->so_restrictions & SO_RESTRICT_DENY_OUT) != 0) {
1817 return EPERM;
1818 }
1819
1820 /*
1821 * If protocol is connection-based, can only connect once
1822 * unless PR_MULTICONN is set. Otherwise, if connected,
1823 * try to disconnect first. This allows user to disconnect
1824 * by connecting to, e.g., a null address.
1825 */
1826 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) &&
1827 !(so->so_proto->pr_flags & PR_MULTICONN) &&
1828 ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
1829 (error = sodisconnectlocked(so)) != 0)) {
1830 error = EISCONN;
1831 } else {
1832 /*
1833 * For TCP, check if destination address is a tracker and mark the socket accordingly
1834 * (only if it hasn't been marked yet).
1835 */
1836 if (SOCK_CHECK_TYPE(so, SOCK_STREAM) && SOCK_CHECK_PROTO(so, IPPROTO_TCP) &&
1837 !(so->so_flags1 & SOF1_KNOWN_TRACKER)) {
1838 if (tracker_lookup(app_uuid: so->so_flags & SOF_DELEGATED ? so->e_uuid : so->last_uuid, dst, metadata: &metadata) == 0) {
1839 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_TRACKER) {
1840 so->so_flags1 |= SOF1_KNOWN_TRACKER;
1841 }
1842 if (metadata.flags & SO_TRACKER_ATTRIBUTE_FLAGS_APP_APPROVED) {
1843 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
1844 }
1845 if (necp_set_socket_domain_attributes(so, domain: metadata.domain, domain_owner: metadata.domain_owner)) {
1846 printf("connectx() - failed necp_set_socket_domain_attributes");
1847 }
1848 }
1849 }
1850
1851 if ((so->so_proto->pr_flags & PR_DATA_IDEMPOTENT) &&
1852 (flags & CONNECT_DATA_IDEMPOTENT)) {
1853 so->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1854
1855 if (flags & CONNECT_DATA_AUTHENTICATED) {
1856 so->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1857 }
1858 }
1859
1860 /*
1861 * Case 1: CONNECT_RESUME_ON_READ_WRITE set, no data.
1862 * Case 2: CONNECT_RESUME_ON_READ_WRITE set, with data (user error)
1863 * Case 3: CONNECT_RESUME_ON_READ_WRITE not set, with data
1864 * Case 3 allows user to combine write with connect even if they have
1865 * no use for TFO (such as regular TCP or UDP).
1866 * Case 4: CONNECT_RESUME_ON_READ_WRITE not set, no data (regular case)
1867 */
1868 if ((so->so_proto->pr_flags & PR_PRECONN_WRITE) &&
1869 ((flags & CONNECT_RESUME_ON_READ_WRITE) || auio)) {
1870 so->so_flags1 |= SOF1_PRECONNECT_DATA;
1871 }
1872
1873 /*
1874 * If a user sets data idempotent but supplies no preconnect data
1875 * (neither an uio nor CONNECT_RESUME_ON_READ_WRITE, so SOF1_PRECONNECT_DATA
1876 * was not set above), clear SOF1_DATA_IDEMPOTENT.
1877 */
1878 if (!(so->so_flags1 & SOF1_PRECONNECT_DATA) &&
1879 (so->so_flags1 & SOF1_DATA_IDEMPOTENT)) {
1880 /* We should return EINVAL instead perhaps. */
1881 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
1882 }
1883
1884 /*
1885 * Run connect filter before calling protocol:
1886 * - non-blocking connect returns before completion;
1887 */
1888 error = sflt_connectout(so, nam: dst);
1889 if (error != 0) {
1890 /* Disable PRECONNECT_DATA, as we don't need to send a SYN anymore. */
1891 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1892 if (error == EJUSTRETURN) {
1893 error = 0;
1894 }
1895 } else {
1896 error = (*so->so_proto->pr_usrreqs->pru_connectx)
1897 (so, src, dst, p, ifscope, aid, pcid,
1898 flags, arg, arglen, auio, bytes_written);
1899 if (error != 0) {
1900 so->so_state &= ~SS_ISCONNECTING;
1901 if (error != EINPROGRESS) {
1902 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
1903 }
1904 }
1905 }
1906 }
1907
1908 return error;
1909}
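/*
 * Note (descriptive, hedged): soconnectxlocked() is the worker behind the
 * connectx(2)-style interface; the CONNECT_DATA_IDEMPOTENT /
 * CONNECT_RESUME_ON_READ_WRITE handling above is what lets protocols that
 * advertise PR_PRECONN_WRITE (e.g. TCP Fast Open) accept data before the
 * handshake completes.
 */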
1910
1911int
1912sodisconnectlocked(struct socket *so)
1913{
1914 int error;
1915
1916 if ((so->so_state & SS_ISCONNECTED) == 0) {
1917 error = ENOTCONN;
1918 goto bad;
1919 }
1920 if (so->so_state & SS_ISDISCONNECTING) {
1921 error = EALREADY;
1922 goto bad;
1923 }
1924
1925 error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
1926 if (error == 0) {
1927 sflt_notify(so, event: sock_evt_disconnected, NULL);
1928 }
1929
1930bad:
1931 return error;
1932}
1933
1934/* Locking version */
1935int
1936sodisconnect(struct socket *so)
1937{
1938 int error;
1939
1940 socket_lock(so, refcount: 1);
1941 error = sodisconnectlocked(so);
1942 socket_unlock(so, refcount: 1);
1943 return error;
1944}
1945
1946int
1947sodisconnectxlocked(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1948{
1949 int error;
1950
1951 /*
1952 * Call the protocol disconnectx handler; let it handle all
1953 * matters related to the connection state of this session.
1954 */
1955 error = (*so->so_proto->pr_usrreqs->pru_disconnectx)(so, aid, cid);
1956 if (error == 0) {
1957 /*
1958 * The event applies only for the session, not for
1959 * the disconnection of individual subflows.
1960 */
1961 if (so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) {
1962 sflt_notify(so, event: sock_evt_disconnected, NULL);
1963 }
1964 }
1965 return error;
1966}
1967
1968int
1969sodisconnectx(struct socket *so, sae_associd_t aid, sae_connid_t cid)
1970{
1971 int error;
1972
1973 socket_lock(so, refcount: 1);
1974 error = sodisconnectxlocked(so, aid, cid);
1975 socket_unlock(so, refcount: 1);
1976 return error;
1977}
1978
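/*
 * SBLOCKWAIT() maps the caller's MSG_DONTWAIT flag onto the sblock() wait
 * flags: non-blocking requests pass 0 (so sblock() fails with EWOULDBLOCK
 * if the sockbuf is already locked), everything else passes SBL_WAIT to
 * sleep for the lock.
 */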
1979#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
1980
1981/*
1982 * sosendcheck will lock the socket buffer if it isn't locked and
1983 * verify that there is space for the data being inserted.
1984 *
1985 * Returns: 0 Success
1986 * EPIPE
1987 * sblock:EWOULDBLOCK
1988 * sblock:EINTR
1989 * sbwait:EBADF
1990 * sbwait:EINTR
1991 * [so_error]:???
1992 */
1993int
1994sosendcheck(struct socket *so, struct sockaddr *addr, user_ssize_t resid,
1995 int32_t clen, int32_t atomic, int flags, int *sblocked)
1996{
1997 int error = 0;
1998 int32_t space;
1999 int assumelock = 0;
2000
2001restart:
2002 if (*sblocked == 0) {
2003 if ((so->so_snd.sb_flags & SB_LOCK) != 0 &&
2004 so->so_send_filt_thread != 0 &&
2005 so->so_send_filt_thread == current_thread()) {
2006 /*
2007 * We're being called recursively from a filter,
2008 * allow this to continue. Radar 4150520.
2009 * Don't set sblocked because we don't want
2010 * to perform an unlock later.
2011 */
2012 assumelock = 1;
2013 } else {
2014 error = sblock(sb: &so->so_snd, SBLOCKWAIT(flags));
2015 if (error) {
2016 if (so->so_flags & SOF_DEFUNCT) {
2017 goto defunct;
2018 }
2019 return error;
2020 }
2021 *sblocked = 1;
2022 }
2023 }
2024
2025 /*
2026 * If a send attempt is made on a socket that has been marked
2027 * as inactive (disconnected), reject the request.
2028 */
2029 if (so->so_flags & SOF_DEFUNCT) {
2030defunct:
2031 error = EPIPE;
2032 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
2033 __func__, proc_selfpid(), proc_best_name(current_proc()),
2034 so->so_gencnt,
2035 SOCK_DOM(so), SOCK_TYPE(so), error);
2036 return error;
2037 }
2038
2039 if (so->so_state & SS_CANTSENDMORE) {
2040#if CONTENT_FILTER
2041 /*
2042 * Can re-inject data on half-closed connections
2043 */
2044 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
2045 so->so_snd.sb_cfil_thread == current_thread() &&
2046 cfil_sock_data_pending(sb: &so->so_snd) != 0) {
2047 CFIL_LOG(LOG_INFO,
2048 "so %llx ignore SS_CANTSENDMORE",
2049 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
2050 } else
2051#endif /* CONTENT_FILTER */
2052 return EPIPE;
2053 }
2054 if (so->so_error) {
2055 error = so->so_error;
2056 so->so_error = 0;
2057 return error;
2058 }
2059
2060 if ((so->so_state & SS_ISCONNECTED) == 0) {
2061 if ((so->so_proto->pr_flags & PR_CONNREQUIRED) != 0) {
2062 if (((so->so_state & SS_ISCONFIRMING) == 0) &&
2063 (resid != 0 || clen == 0) &&
2064 !(so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2065 return ENOTCONN;
2066 }
2067 } else if (addr == 0) {
2068 return (so->so_proto->pr_flags & PR_CONNREQUIRED) ?
2069 ENOTCONN : EDESTADDRREQ;
2070 }
2071 }
2072
2073 space = sbspace(sb: &so->so_snd);
2074
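	/*
	 * MSG_OOB sends are allowed to overcommit the send buffer slightly,
	 * so give them an extra 1 KB of slack beyond sbspace().
	 */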
2075 if (flags & MSG_OOB) {
2076 space += 1024;
2077 }
2078 if ((atomic && resid > so->so_snd.sb_hiwat) ||
2079 clen > so->so_snd.sb_hiwat) {
2080 return EMSGSIZE;
2081 }
2082
2083 if ((space < resid + clen &&
2084 (atomic || (space < (int32_t)so->so_snd.sb_lowat) ||
2085 space < clen)) ||
2086 (so->so_type == SOCK_STREAM && so_wait_for_if_feedback(so))) {
2087 /*
2088 * don't block the connectx call when there's more data
2089 * than can be copied.
2090 */
2091 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2092 if (space == 0) {
2093 return EWOULDBLOCK;
2094 }
2095 if (space < (int32_t)so->so_snd.sb_lowat) {
2096 return 0;
2097 }
2098 }
2099 if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO) ||
2100 assumelock) {
2101 return EWOULDBLOCK;
2102 }
2103 sbunlock(sb: &so->so_snd, TRUE); /* keep socket locked */
2104 *sblocked = 0;
2105 error = sbwait(sb: &so->so_snd);
2106 if (error) {
2107 if (so->so_flags & SOF_DEFUNCT) {
2108 goto defunct;
2109 }
2110 return error;
2111 }
2112 goto restart;
2113 }
2114 return 0;
2115}
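/*
 * Usage sketch for sosendcheck() (illustrative; the real loop is in
 * sosend() below):
 *
 *	int sblocked = 0;
 *	do {
 *		error = sosendcheck(so, addr, resid, clen, atomic, flags,
 *		    &sblocked);
 *		if (error)
 *			goto out_locked;
 *		... build an mbuf chain and hand it to pru_send ...
 *	} while (resid);
 *
 *	if (sblocked)
 *		sbunlock(&so->so_snd, FALSE);	-- also unlocks the socket
 *	else
 *		socket_unlock(so, 1);
 *
 * On success the send buffer is left locked (*sblocked set), except for
 * the recursive call from a socket-filter thread noted above.
 */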
2116
2117/*
2118 * Send on a socket.
2119 * If send must go all at once and message is larger than
2120 * send buffering, then hard error.
2121 * Lock against other senders.
2122 * If must go all at once and not enough room now, then
2123 * inform user that this would block and do nothing.
2124 * Otherwise, if nonblocking, send as much as possible.
2125 * The data to be sent is described by "uio" if nonzero,
2126 * otherwise by the mbuf chain "top" (which must be null
2127 * if uio is not). Data provided in mbuf chain must be small
2128 * enough to send all at once.
2129 *
2130 * Returns nonzero on error, timeout or signal; callers
2131 * must check for short counts if EINTR/ERESTART are returned.
2132 * Data and control buffers are freed on return.
2133 *
2134 * Returns: 0 Success
2135 * EOPNOTSUPP
2136 * EINVAL
2137 * ENOBUFS
2138 * uiomove:EFAULT
2139 * sosendcheck:EPIPE
2140 * sosendcheck:EWOULDBLOCK
2141 * sosendcheck:EINTR
2142 * sosendcheck:EBADF
2143 * sosendcheck:EINTR
2144 * sosendcheck:??? [value from so_error]
2145 * <pru_send>:ECONNRESET[TCP]
2146 * <pru_send>:EINVAL[TCP]
2147 * <pru_send>:ENOBUFS[TCP]
2148 * <pru_send>:EADDRINUSE[TCP]
2149 * <pru_send>:EADDRNOTAVAIL[TCP]
2150 * <pru_send>:EAFNOSUPPORT[TCP]
2151 * <pru_send>:EACCES[TCP]
2152 * <pru_send>:EAGAIN[TCP]
2153 * <pru_send>:EPERM[TCP]
2154 * <pru_send>:EMSGSIZE[TCP]
2155 * <pru_send>:EHOSTUNREACH[TCP]
2156 * <pru_send>:ENETUNREACH[TCP]
2157 * <pru_send>:ENETDOWN[TCP]
2158 * <pru_send>:ENOMEM[TCP]
2159 * <pru_send>:ENOBUFS[TCP]
2160 * <pru_send>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
2161 * <pru_send>:EINVAL[AF_UNIX]
2162 * <pru_send>:EOPNOTSUPP[AF_UNIX]
2163 * <pru_send>:EPIPE[AF_UNIX]
2164 * <pru_send>:ENOTCONN[AF_UNIX]
2165 * <pru_send>:EISCONN[AF_UNIX]
2166 * <pru_send>:???[AF_UNIX] [whatever a filter author chooses]
2167 * <sf_data_out>:??? [whatever a filter author chooses]
2168 *
2169 * Notes: Other <pru_send> returns depend on the protocol family; all
2170 * <sf_data_out> returns depend on what the filter author causes
2171 * their filter to return.
2172 */
2173int
2174sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2175 struct mbuf *top, struct mbuf *control, int flags)
2176{
2177 struct mbuf **mp;
2178 struct mbuf *m, *freelist = NULL;
2179 struct soflow_hash_entry *dgram_flow_entry = NULL;
2180 user_ssize_t space, len, resid, orig_resid;
2181 int clen = 0, error, dontroute, sendflags;
2182 int atomic = sosendallatonce(so) || top;
2183 int sblocked = 0;
2184 struct proc *p = current_proc();
2185 uint16_t headroom = 0;
2186 ssize_t mlen;
2187 boolean_t en_tracing = FALSE;
2188
2189 if (uio != NULL) {
2190 resid = uio_resid(a_uio: uio);
2191 } else {
2192 resid = top->m_pkthdr.len;
2193 }
2194 orig_resid = resid;
2195
2196 KERNEL_DEBUG((DBG_FNC_SOSEND | DBG_FUNC_START), so, resid,
2197 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2198
2199 socket_lock(so, refcount: 1);
2200
2201 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2202 dgram_flow_entry = soflow_get_flow(so, NULL, addr, control, resid, true, 0);
2203 }
2204
2205 /*
2206 * Trace only if tracing is enabled, the socket is a network
2207 * (vs. unix) socket, and it is not loopback
2208 */
2209 if (ENTR_SHOULDTRACE &&
2210 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2211 struct inpcb *inp = sotoinpcb(so);
2212 if (inp->inp_last_outifp != NULL &&
2213 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2214 en_tracing = TRUE;
2215 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2216 VM_KERNEL_ADDRPERM(so),
2217 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2218 (int64_t)resid);
2219 }
2220 }
2221
2222 /*
2223 * Re-injection should not affect process accounting
2224 */
2225 if ((flags & MSG_SKIPCFIL) == 0) {
2226 so_update_last_owner_locked(so, self: p);
2227 so_update_policy(so);
2228
2229#if NECP
2230 so_update_necp_policy(so, NULL, override_remote_addr: addr);
2231#endif /* NECP */
2232 }
2233
2234 if (so->so_type != SOCK_STREAM && (flags & MSG_OOB) != 0) {
2235 error = EOPNOTSUPP;
2236 goto out_locked;
2237 }
2238
2239 /*
2240 * In theory resid should be unsigned.
2241 * However, space must be signed, as it might be less than 0
2242 * if we over-committed, and we must use a signed comparison
2243 * of space and resid. On the other hand, a negative resid
2244 * causes us to loop sending 0-length segments to the protocol.
2245 *
2246 * Usually, MSG_EOR isn't used on SOCK_STREAM type sockets.
2247 *
2248 * Note: We limit resid to be a positive int value as we use
2249 * imin() to set bytes_to_copy -- radr://14558484
2250 */
2251 if (resid < 0 || resid > INT_MAX ||
2252 (so->so_type == SOCK_STREAM && (flags & MSG_EOR))) {
2253 error = EINVAL;
2254 goto out_locked;
2255 }
2256
2257 dontroute = (flags & MSG_DONTROUTE) &&
2258 (so->so_options & SO_DONTROUTE) == 0 &&
2259 (so->so_proto->pr_flags & PR_ATOMIC);
2260 OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_msgsnd);
2261
2262 if (control != NULL) {
2263 clen = control->m_len;
2264 }
2265
2266 if (soreserveheadroom != 0) {
2267 headroom = so->so_pktheadroom;
2268 }
2269
2270 do {
2271 error = sosendcheck(so, addr, resid, clen, atomic, flags,
2272 sblocked: &sblocked);
2273 if (error) {
2274 goto out_locked;
2275 }
2276
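		/*
		 * Writable space is whatever is free in the send buffer minus
		 * the bytes the control mbuf will consume, plus the MSG_OOB
		 * slack.
		 */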
2277 mp = &top;
2278 space = sbspace(sb: &so->so_snd) - clen;
2279 space += ((flags & MSG_OOB) ? 1024 : 0);
2280
2281 do {
2282 if (uio == NULL) {
2283 /*
2284 * Data is prepackaged in "top".
2285 */
2286 resid = 0;
2287 if (flags & MSG_EOR) {
2288 top->m_flags |= M_EOR;
2289 }
2290 } else {
2291 int chainlength;
2292 int bytes_to_copy;
2293 boolean_t jumbocl;
2294 boolean_t bigcl;
2295 int bytes_to_alloc;
2296
2297 bytes_to_copy = imin(a: (int)resid, b: (int)space);
2298
2299 bytes_to_alloc = bytes_to_copy;
2300 if (top == NULL) {
2301 bytes_to_alloc += headroom;
2302 }
2303
2304 if (sosendminchain > 0) {
2305 chainlength = 0;
2306 } else {
2307 chainlength = sosendmaxchain;
2308 }
2309
2310 /*
2311 * Use big 4 KB clusters when the outgoing interface
2312 * does not prefer 2 KB clusters
2313 */
2314 bigcl = !(so->so_flags1 & SOF1_IF_2KCL) ||
2315 sosendbigcl_ignore_capab;
2316
2317 /*
2318 * Attempt to use larger than system page-size
2319 * clusters for large writes only if there is
2320 * a jumbo cluster pool and if the socket is
2321 * marked accordingly.
2322 */
2323 jumbocl = sosendjcl && njcl > 0 &&
2324 ((so->so_flags & SOF_MULTIPAGES) ||
2325 sosendjcl_ignore_capab) &&
2326 bigcl;
2327
2328 socket_unlock(so, refcount: 0);
2329
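			/*
			 * The allocation ladder below tries, in order: 16 KB
			 * jumbo clusters (when jumbocl), 4 KB big clusters
			 * (when bigcl), 2 KB clusters, and finally plain
			 * mbufs, falling through to the next smaller size
			 * whenever the bulk allocation fails.
			 */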
2330 do {
2331 int num_needed;
2332 int hdrs_needed = (top == NULL) ? 1 : 0;
2333
2334 /*
2335 * Try to maintain a local cache of the
2336 * mbuf clusters needed to complete this
2337 * write.  The list is further limited
2338 * to the number currently needed to
2339 * fill the socket; this mechanism lets
2340 * a large number of mbufs/clusters be
2341 * grabbed under a single mbuf lock.
2342 * If we can't get any clusters, then
2343 * fall back to trying for plain mbufs.
2344 * If we fail early (or miscalculate
2345 * the number needed), make sure to
2346 * release any clusters we haven't yet
2347 * consumed.
2348 */
2349 if (freelist == NULL &&
2350 bytes_to_alloc > MBIGCLBYTES &&
2351 jumbocl) {
2352 num_needed =
2353 bytes_to_alloc / M16KCLBYTES;
2354
2355 if ((bytes_to_alloc -
2356 (num_needed * M16KCLBYTES))
2357 >= MINCLSIZE) {
2358 num_needed++;
2359 }
2360
2361 freelist =
2362 m_getpackets_internal(
2363 (unsigned int *)&num_needed,
2364 hdrs_needed, M_WAIT, 0,
2365 M16KCLBYTES);
2366 /*
2367 * Fall back to 4K cluster size
2368 * if allocation failed
2369 */
2370 }
2371
2372 if (freelist == NULL &&
2373 bytes_to_alloc > MCLBYTES &&
2374 bigcl) {
2375 num_needed =
2376 bytes_to_alloc / MBIGCLBYTES;
2377
2378 if ((bytes_to_alloc -
2379 (num_needed * MBIGCLBYTES)) >=
2380 MINCLSIZE) {
2381 num_needed++;
2382 }
2383
2384 freelist =
2385 m_getpackets_internal(
2386 (unsigned int *)&num_needed,
2387 hdrs_needed, M_WAIT, 0,
2388 MBIGCLBYTES);
2389 /*
2390 * Fall back to cluster size
2391 * if allocation failed
2392 */
2393 }
2394
2395 /*
2396 * Allocate a cluster as we want to
2397 * avoid splitting the data into more
2398 * than one segment; using MINCLSIZE
2399 * would lead us to allocate two mbufs
2400 */
2401 if (soreserveheadroom != 0 &&
2402 freelist == NULL &&
2403 ((top == NULL &&
2404 bytes_to_alloc > _MHLEN) ||
2405 bytes_to_alloc > _MLEN)) {
2406 num_needed = ROUNDUP(bytes_to_alloc, MCLBYTES) /
2407 MCLBYTES;
2408 freelist =
2409 m_getpackets_internal(
2410 (unsigned int *)&num_needed,
2411 hdrs_needed, M_WAIT, 0,
2412 MCLBYTES);
2413 /*
2414 * Fall back to a single mbuf
2415 * if allocation failed
2416 */
2417 } else if (freelist == NULL &&
2418 bytes_to_alloc > MINCLSIZE) {
2419 num_needed =
2420 bytes_to_alloc / MCLBYTES;
2421
2422 if ((bytes_to_alloc -
2423 (num_needed * MCLBYTES)) >=
2424 MINCLSIZE) {
2425 num_needed++;
2426 }
2427
2428 freelist =
2429 m_getpackets_internal(
2430 (unsigned int *)&num_needed,
2431 hdrs_needed, M_WAIT, 0,
2432 MCLBYTES);
2433 /*
2434 * Fall back to a single mbuf
2435 * if allocation failed
2436 */
2437 }
2438 /*
2439 * For datagram protocols, leave
2440 * headroom for protocol headers
2441 * in the first cluster of the chain
2442 */
2443 if (freelist != NULL && atomic &&
2444 top == NULL && headroom > 0) {
2445 freelist->m_data += headroom;
2446 }
2447
2448 /*
2449 * Fall back to regular mbufs without
2450 * reserving the socket headroom
2451 */
2452 if (freelist == NULL) {
2453 if (SOCK_TYPE(so) != SOCK_STREAM || bytes_to_alloc <= MINCLSIZE) {
2454 if (top == NULL) {
2455 MGETHDR(freelist,
2456 M_WAIT, MT_DATA);
2457 } else {
2458 MGET(freelist,
2459 M_WAIT, MT_DATA);
2460 }
2461 }
2462
2463 if (freelist == NULL) {
2464 error = ENOBUFS;
2465 socket_lock(so, refcount: 0);
2466 goto out_locked;
2467 }
2468 /*
2469 * For datagram protocols,
2470 * leave room for protocol
2471 * headers in first mbuf.
2472 */
2473 if (atomic && top == NULL &&
2474 bytes_to_copy > 0 &&
2475 bytes_to_copy < MHLEN) {
2476 MH_ALIGN(freelist,
2477 bytes_to_copy);
2478 }
2479 }
2480 m = freelist;
2481 freelist = m->m_next;
2482 m->m_next = NULL;
2483
2484 if ((m->m_flags & M_EXT)) {
2485 mlen = m->m_ext.ext_size -
2486 M_LEADINGSPACE(m);
2487 } else if ((m->m_flags & M_PKTHDR)) {
2488 mlen = MHLEN - M_LEADINGSPACE(m);
2489 m_add_crumb(m, PKT_CRUMB_SOSEND);
2490 } else {
2491 mlen = MLEN - M_LEADINGSPACE(m);
2492 }
2493 len = imin(a: (int)mlen, b: bytes_to_copy);
2494
2495 chainlength += len;
2496
2497 space -= len;
2498
2499 error = uiomove(mtod(m, caddr_t),
2500 n: (int)len, uio);
2501
2502 resid = uio_resid(a_uio: uio);
2503
2504 m->m_len = (int32_t)len;
2505 *mp = m;
2506 top->m_pkthdr.len += len;
2507 if (error) {
2508 break;
2509 }
2510 mp = &m->m_next;
2511 if (resid <= 0) {
2512 if (flags & MSG_EOR) {
2513 top->m_flags |= M_EOR;
2514 }
2515 break;
2516 }
2517 bytes_to_copy = imin(a: (int)resid, b: (int)space);
2518 } while (space > 0 &&
2519 (chainlength < sosendmaxchain || atomic ||
2520 resid < MINCLSIZE));
2521
2522 socket_lock(so, refcount: 0);
2523
2524 if (error) {
2525 goto out_locked;
2526 }
2527 }
2528
2529 if (dontroute) {
2530 so->so_options |= SO_DONTROUTE;
2531 }
2532
2533 /*
2534 * Compute flags here, for pru_send and NKEs.
2535 *
2536 * If the user set MSG_EOF, the protocol
2537 * understands this flag, and there is nothing
2538 * left to send, then use PRU_SEND_EOF instead of PRU_SEND.
2539 */
2540 sendflags = (flags & MSG_OOB) ? PRUS_OOB :
2541 ((flags & MSG_EOF) &&
2542 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
2543 (resid <= 0)) ? PRUS_EOF :
2544 /* If there is more to send set PRUS_MORETOCOME */
2545 (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0;
2546
2547 if ((flags & MSG_SKIPCFIL) == 0) {
2548 /*
2549 * Socket filter processing
2550 */
2551 error = sflt_data_out(so, to: addr, data: &top,
2552 control: &control, flags: (sendflags & MSG_OOB) ?
2553 sock_data_filt_flag_oob : 0);
2554 if (error) {
2555 if (error == EJUSTRETURN) {
2556 error = 0;
2557 goto packet_consumed;
2558 }
2559 goto out_locked;
2560 }
2561#if CONTENT_FILTER
2562 /*
2563 * Content filter processing
2564 */
2565 error = cfil_sock_data_out(so, to: addr, data: top,
2566 control, flags: sendflags, dgram_flow_entry);
2567 if (error) {
2568 if (error == EJUSTRETURN) {
2569 error = 0;
2570 goto packet_consumed;
2571 }
2572 goto out_locked;
2573 }
2574#endif /* CONTENT_FILTER */
2575 }
2576 error = (*so->so_proto->pr_usrreqs->pru_send)
2577 (so, sendflags, top, addr, control, p);
2578
2579packet_consumed:
2580 if (dontroute) {
2581 so->so_options &= ~SO_DONTROUTE;
2582 }
2583
2584 clen = 0;
2585 control = NULL;
2586 top = NULL;
2587 mp = &top;
2588 if (error) {
2589 goto out_locked;
2590 }
2591 } while (resid && space > 0);
2592 } while (resid);
2593
2594
2595out_locked:
2596 if (resid > orig_resid) {
2597 char pname[MAXCOMLEN] = {};
2598 pid_t current_pid = proc_pid(current_proc());
2599 proc_name(pid: current_pid, buf: pname, size: sizeof(pname));
2600
2601 if (sosend_assert_panic != 0) {
2602 panic("sosend so %p resid %lld > orig_resid %lld proc %s:%d",
2603 so, resid, orig_resid, pname, current_pid);
2604 } else {
2605 os_log_error(OS_LOG_DEFAULT, "sosend: so_gencnt %llu resid %lld > orig_resid %lld proc %s:%d",
2606 so->so_gencnt, resid, orig_resid, pname, current_pid);
2607 }
2608 }
2609
2610 if (sblocked) {
2611 sbunlock(sb: &so->so_snd, FALSE); /* will unlock socket */
2612 } else {
2613 socket_unlock(so, refcount: 1);
2614 }
2615 if (top != NULL) {
2616 m_freem(top);
2617 }
2618 if (control != NULL) {
2619 m_freem(control);
2620 }
2621 if (freelist != NULL) {
2622 m_freem_list(freelist);
2623 }
2624
2625 if (dgram_flow_entry != NULL) {
2626 soflow_free_flow(dgram_flow_entry);
2627 }
2628
2629 soclearfastopen(so);
2630
2631 if (en_tracing) {
2632 /* resid passed here is the bytes left in uio */
2633 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2634 VM_KERNEL_ADDRPERM(so),
2635 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2636 (int64_t)(orig_resid - resid));
2637 }
2638 KERNEL_DEBUG(DBG_FNC_SOSEND | DBG_FUNC_END, so, resid,
2639 so->so_snd.sb_cc, space, error);
2640
2641 return error;
2642}
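/*
 * Usage sketch (illustrative, not a verbatim copy of the syscall path):
 * a write on a socket typically reaches this file roughly as
 *
 *	error = sosend(so, to, uio, NULL, control, flags);
 *
 * with the user's data described by "uio", while kernel callers that have
 * already built an mbuf chain pass it as "top" with uio == NULL (see the
 * rules in the block comment above sosend()).
 */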
2643
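/*
 * sosend_reinject() hands an already-built packet straight to the
 * protocol's pru_send, bypassing the buffering and filtering done in
 * sosend(); it is intended for re-injection paths (e.g. content filters)
 * that previously swallowed the packet.  (Descriptive note; the callers
 * live outside this file.)
 */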
2644int
2645sosend_reinject(struct socket *so, struct sockaddr *addr, struct mbuf *top, struct mbuf *control, uint32_t sendflags)
2646{
2647 struct mbuf *m0 = NULL, *control_end = NULL;
2648
2649 socket_lock_assert_owned(so);
2650
2651 /*
2652 * top must point to the mbuf chain to be sent.
2653 * If control is not NULL, top must be a packet header (M_PKTHDR)
2654 */
2655 VERIFY(top != NULL &&
2656 (control == NULL || top->m_flags & M_PKTHDR));
2657
2658 /*
2659 * If control is not passed in, see if we can get it
2660 * from top.
2661 */
2662 if (control == NULL && (top->m_flags & M_PKTHDR) == 0) {
2663 // Locate start of control if present and start of data
2664 for (m0 = top; m0 != NULL; m0 = m0->m_next) {
2665 if (m0->m_flags & M_PKTHDR) {
2666 top = m0;
2667 break;
2668 } else if (m0->m_type == MT_CONTROL) {
2669 if (control == NULL) {
2670 // Found start of control
2671 control = m0;
2672 }
2673 if (control != NULL && m0->m_next != NULL && m0->m_next->m_type != MT_CONTROL) {
2674 // Found end of control
2675 control_end = m0;
2676 }
2677 }
2678 }
2679 if (control_end != NULL) {
2680 control_end->m_next = NULL;
2681 }
2682 }
2683
2684 int error = (*so->so_proto->pr_usrreqs->pru_send)
2685 (so, sendflags, top, addr, control, current_proc());
2686
2687 return error;
2688}
2689
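/*
 * Detach the leading run of MT_CONTROL mbufs from the chain at *mp and
 * return it, advancing *mp to the first non-control mbuf.  For example
 * (illustrative), the chain CTL1->CTL2->DATA1->DATA2 yields a returned
 * control chain CTL1->CTL2 with *mp left pointing at DATA1->DATA2; the
 * remaining chain must not be empty (see the VERIFY below).
 */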
2690static struct mbuf *
2691mbuf_detach_control_from_list(struct mbuf **mp)
2692{
2693 struct mbuf *control = NULL;
2694 struct mbuf *m = *mp;
2695
2696 if (m->m_type == MT_CONTROL) {
2697 struct mbuf *control_end;
2698 struct mbuf *n;
2699
2700 n = control_end = control = m;
2701
2702 /*
2703 * Break the chain per mbuf type
2704 */
2705 while (n != NULL && n->m_type == MT_CONTROL) {
2706 control_end = n;
2707 n = n->m_next;
2708 }
2709 control_end->m_next = NULL;
2710 *mp = n;
2711 }
2712 VERIFY(*mp != NULL);
2713
2714 return control;
2715}
2716
2717/*
2718 * Supported only for connected sockets (no destination address) without
2719 * ancillary data (control mbuf), for atomic (datagram) protocols
2720 */
2721int
2722sosend_list(struct socket *so, struct mbuf *pktlist, size_t total_len, u_int *pktcnt, int flags)
2723{
2724 struct mbuf *m;
2725 struct soflow_hash_entry *dgram_flow_entry = NULL;
2726 int error, dontroute;
2727 int atomic = sosendallatonce(so);
2728 int sblocked = 0;
2729 struct proc *p = current_proc();
2730 struct mbuf *top = pktlist;
2731 bool skip_filt = (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) || (flags & MSG_SKIPCFIL);
2732
2733 KERNEL_DEBUG((DBG_FNC_SOSEND_LIST | DBG_FUNC_START), so, total_len,
2734 so->so_snd.sb_cc, so->so_snd.sb_lowat, so->so_snd.sb_hiwat);
2735
2736 if (so->so_type != SOCK_DGRAM) {
2737 error = EINVAL;
2738 os_log(OS_LOG_DEFAULT, "sosend_list: so->so_type != SOCK_DGRAM error %d",
2739 error);
2740 goto out;
2741 }
2742 if (atomic == 0) {
2743 error = EINVAL;
2744 os_log(OS_LOG_DEFAULT, "sosend_list: atomic == 0 error %d",
2745 error);
2746 goto out;
2747 }
2748 if ((so->so_state & SS_ISCONNECTED) == 0) {
2749 error = ENOTCONN;
2750 os_log(OS_LOG_DEFAULT, "sosend_list: SS_ISCONNECTED not set error: %d",
2751 error);
2752 goto out;
2753 }
2754 if (flags & ~(MSG_DONTWAIT | MSG_NBIO | MSG_SKIPCFIL)) {
2755 error = EINVAL;
2756 os_log(OS_LOG_DEFAULT, "sosend_list: flags 0x%x error %d",
2757 flags, error);
2758 goto out;
2759 }
2760
2761 socket_lock(so, refcount: 1);
2762 so_update_last_owner_locked(so, self: p);
2763 so_update_policy(so);
2764
2765 if (NEED_DGRAM_FLOW_TRACKING(so)) {
2766 dgram_flow_entry = soflow_get_flow(so, NULL, NULL, NULL, total_len, true, 0);
2767 }
2768
2769#if NECP
2770 so_update_necp_policy(so, NULL, NULL);
2771#endif /* NECP */
2772
2773 dontroute = (flags & MSG_DONTROUTE) &&
2774 (so->so_options & SO_DONTROUTE) == 0 &&
2775 (so->so_proto->pr_flags & PR_ATOMIC);
2776 if (dontroute) {
2777 so->so_options |= SO_DONTROUTE;
2778 }
2779
2780 OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_msgsnd);
2781
2782 error = sosendcheck(so, NULL, resid: 0, clen: 0, atomic, flags, sblocked: &sblocked);
2783 if (error) {
2784 os_log(OS_LOG_DEFAULT, "sosend_list: sosendcheck error %d",
2785 error);
2786 goto release;
2787 }
2788
2789 if (!skip_filt) {
2790 struct mbuf **prevnextp = NULL;
2791
2792 for (m = top; m != NULL; m = m->m_nextpkt) {
2793 struct mbuf *control = NULL;
2794 struct mbuf *last_control = NULL;
2795 struct mbuf *nextpkt;
2796
2797 /*
2798 * Remove packet from the list of packets
2799 */
2800 nextpkt = m->m_nextpkt;
2801 if (prevnextp != NULL) {
2802 *prevnextp = nextpkt;
2803 } else {
2804 top = nextpkt;
2805 }
2806 m->m_nextpkt = NULL;
2807
2808 /*
2809 * Break the chain per mbuf type
2810 */
2811 if (m->m_type == MT_CONTROL) {
2812 control = mbuf_detach_control_from_list(mp: &m);
2813 }
2814 /*
2815 * Socket filter processing
2816 */
2817 error = sflt_data_out(so, NULL, data: &m,
2818 control: &control, flags: 0);
2819 if (error != 0 && error != EJUSTRETURN) {
2820 os_log(OS_LOG_DEFAULT, "sosend_list: sflt_data_out error %d",
2821 error);
2822 goto release;
2823 }
2824
2825#if CONTENT_FILTER
2826 if (error == 0) {
2827 /*
2828 * Content filter processing
2829 */
2830 error = cfil_sock_data_out(so, NULL, data: m,
2831 control, flags: 0, dgram_flow_entry);
2832 if (error != 0 && error != EJUSTRETURN) {
2833 os_log(OS_LOG_DEFAULT, "sosend_list: cfil_sock_data_out error %d",
2834 error);
2835 goto release;
2836 }
2837 }
2838#endif /* CONTENT_FILTER */
2839 if (error == EJUSTRETURN) {
2840 /*
2841 * When swallowed by a filter, the packet is not
2842 * in the list anymore
2843 */
2844 error = 0;
2845 } else {
2846 /*
2847 * Rebuild the mbuf chain of the packet
2848 */
2849 if (control != NULL) {
2850 last_control->m_next = m;
2851 m = control;
2852 }
2853 /*
2854 * Reinsert the packet in the list of packets
2855 */
2856 m->m_nextpkt = nextpkt;
2857 if (prevnextp != NULL) {
2858 *prevnextp = m;
2859 } else {
2860 top = m;
2861 }
2862 prevnextp = &m->m_nextpkt;
2863 }
2864 }
2865 }
2866
2867 if (top != NULL) {
2868 if (so->so_proto->pr_usrreqs->pru_send_list != pru_send_list_notsupp) {
2869 error = (*so->so_proto->pr_usrreqs->pru_send_list)
2870 (so, top, pktcnt, flags);
2871 if (error != 0) {
2872 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send_list error %d",
2873 error);
2874 }
2875 top = NULL;
2876 } else {
2877 *pktcnt = 0;
2878 for (m = top; m != NULL; m = top) {
2879 struct mbuf *control = NULL;
2880
2881 top = m->m_nextpkt;
2882 m->m_nextpkt = NULL;
2883
2884 /*
2885 * Break the chain per mbuf type
2886 */
2887 if (m->m_type == MT_CONTROL) {
2888 control = mbuf_detach_control_from_list(mp: &m);
2889 }
2890
2891 error = (*so->so_proto->pr_usrreqs->pru_send)
2892 (so, 0, m, NULL, control, current_proc());
2893 if (error != 0) {
2894 os_log(OS_LOG_DEFAULT, "sosend_list: pru_send error %d",
2895 error);
2896 goto release;
2897 }
2898 *pktcnt += 1;
2899 }
2900 }
2901 }
2902
2903release:
2904 if (dontroute) {
2905 so->so_options &= ~SO_DONTROUTE;
2906 }
2907 if (sblocked) {
2908 sbunlock(sb: &so->so_snd, FALSE); /* will unlock socket */
2909 } else {
2910 socket_unlock(so, refcount: 1);
2911 }
2912out:
2913 if (top != NULL) {
2914 os_log(OS_LOG_DEFAULT, "sosend_list: m_freem_list(top) with error %d",
2915 error);
2916 m_freem_list(top);
2917 }
2918
2919 if (dgram_flow_entry != NULL) {
2920 soflow_free_flow(dgram_flow_entry);
2921 }
2922
2923 KERNEL_DEBUG(DBG_FNC_SOSEND_LIST | DBG_FUNC_END, so, total_len,
2924 so->so_snd.sb_cc, 0, error);
2925
2926 return error;
2927}
2928
2929/*
2930 * May return ERESTART when packet is dropped by MAC policy check
2931 */
2932static int
2933soreceive_addr(struct proc *p, struct socket *so, struct sockaddr **psa,
2934 struct mbuf **maddrp,
2935 int flags, struct mbuf **mp, struct mbuf **nextrecordp, int canwait)
2936{
2937 int error = 0;
2938 struct mbuf *m = *mp;
2939 struct mbuf *nextrecord = *nextrecordp;
2940
2941 KASSERT(m->m_type == MT_SONAME, ("receive 1a"));
2942#if CONFIG_MACF_SOCKET_SUBSET
2943 /*
2944 * Call the MAC framework for policy checking if we're in
2945 * the user process context and the socket isn't connected.
2946 */
2947 if (p != kernproc && !(so->so_state & SS_ISCONNECTED)) {
2948 struct mbuf *m0 = m;
2949 /*
2950 * Dequeue this record (temporarily) from the receive
2951 * list since we're about to drop the socket's lock
2952 * where a new record may arrive and be appended to
2953 * the list. Upon MAC policy failure, the record
2954 * will be freed. Otherwise, we'll add it back to
2955 * the head of the list. We cannot rely on SB_LOCK
2956 * because append operation uses the socket's lock.
2957 */
2958 do {
2959 m->m_nextpkt = NULL;
2960 sbfree(sb: &so->so_rcv, m);
2961 m = m->m_next;
2962 } while (m != NULL);
2963 m = m0;
2964 so->so_rcv.sb_mb = nextrecord;
2965 SB_EMPTY_FIXUP(&so->so_rcv);
2966 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1a");
2967 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1a");
2968 socket_unlock(so, refcount: 0);
2969
2970 error = mac_socket_check_received(cred: kauth_cred_get(), so,
2971 mtod(m, struct sockaddr *));
2972
2973 if (error != 0) {
2974 /*
2975 * MAC policy failure; free this record and
2976 * process the next record (or block until
2977 * one is available). We have adjusted sb_cc
2978 * and sb_mbcnt above so there is no need to
2979 * call sbfree() again.
2980 */
2981 m_freem(m);
2982 /*
2983 * Clear SB_LOCK but don't unlock the socket.
2984 * Process the next record or wait for one.
2985 */
2986 socket_lock(so, refcount: 0);
2987 sbunlock(sb: &so->so_rcv, TRUE); /* stay locked */
2988 error = ERESTART;
2989 goto done;
2990 }
2991 socket_lock(so, refcount: 0);
2992 /*
2993 * If the socket has been defunct'd, drop it.
2994 */
2995 if (so->so_flags & SOF_DEFUNCT) {
2996 m_freem(m);
2997 error = ENOTCONN;
2998 goto done;
2999 }
3000 /*
3001 * Re-adjust the socket receive list and re-enqueue
3002 * the record in front of any packets which may have
3003 * been appended while we dropped the lock.
3004 */
3005 for (m = m0; m->m_next != NULL; m = m->m_next) {
3006 sballoc(sb: &so->so_rcv, m);
3007 }
3008 sballoc(sb: &so->so_rcv, m);
3009 if (so->so_rcv.sb_mb == NULL) {
3010 so->so_rcv.sb_lastrecord = m0;
3011 so->so_rcv.sb_mbtail = m;
3012 }
3013 m = m0;
3014 nextrecord = m->m_nextpkt = so->so_rcv.sb_mb;
3015 so->so_rcv.sb_mb = m;
3016 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1b");
3017 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1b");
3018 }
3019#endif /* CONFIG_MACF_SOCKET_SUBSET */
3020 if (psa != NULL) {
3021 *psa = dup_sockaddr(mtod(m, struct sockaddr *), canwait);
3022 if ((*psa == NULL) && (flags & MSG_NEEDSA)) {
3023 error = EWOULDBLOCK;
3024 goto done;
3025 }
3026 } else if (maddrp != NULL) {
3027 *maddrp = m;
3028 }
3029 if (flags & MSG_PEEK) {
3030 m = m->m_next;
3031 } else {
3032 sbfree(sb: &so->so_rcv, m);
3033 if (m->m_next == NULL && so->so_rcv.sb_cc != 0) {
3034 panic("%s: about to create invalid socketbuf",
3035 __func__);
3036 /* NOTREACHED */
3037 }
3038 if (maddrp == NULL) {
3039 MFREE(m, so->so_rcv.sb_mb);
3040 } else {
3041 so->so_rcv.sb_mb = m->m_next;
3042 m->m_next = NULL;
3043 }
3044 m = so->so_rcv.sb_mb;
3045 if (m != NULL) {
3046 m->m_nextpkt = nextrecord;
3047 } else {
3048 so->so_rcv.sb_mb = nextrecord;
3049 SB_EMPTY_FIXUP(&so->so_rcv);
3050 }
3051 }
3052done:
3053 *mp = m;
3054 *nextrecordp = nextrecord;
3055
3056 return error;
3057}
3058
3059/*
3060 * When peeking SCM_RIGHTS, the actual file descriptors are not yet created,
3061 * so clear the data portion in order not to leak the file pointers
3062 */
3063static void
3064sopeek_scm_rights(struct mbuf *rights)
3065{
3066 struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
3067
3068 if (cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS) {
3069 VERIFY(cm->cmsg_len <= rights->m_len);
3070 memset(s: cm + 1, c: 0, n: cm->cmsg_len - sizeof(*cm));
3071 }
3072}
3073
3074/*
3075 * Process one or more MT_CONTROL mbufs present before any data mbufs
3076 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3077 * just copy the data; if !MSG_PEEK, we call into the protocol to
3078 * perform externalization.
3079 */
3080static int
3081soreceive_ctl(struct socket *so, struct mbuf **controlp, int flags,
3082 struct mbuf **mp, struct mbuf **nextrecordp)
3083{
3084 int error = 0;
3085 struct mbuf *cm = NULL, *cmn;
3086 struct mbuf **cme = &cm;
3087 struct sockbuf *sb_rcv = &so->so_rcv;
3088 struct mbuf **msgpcm = NULL;
3089 struct mbuf *m = *mp;
3090 struct mbuf *nextrecord = *nextrecordp;
3091 struct protosw *pr = so->so_proto;
3092
3093 /*
3094 * Externalizing the control messages would require us to
3095 * drop the socket's lock below. Once we re-acquire the
3096 * lock, the mbuf chain might change. In order to preserve
3097 * consistency, we unlink all control messages from the
3098 * first mbuf chain in one shot and link them separately
3099 * onto a different chain.
3100 */
3101 do {
3102 if (flags & MSG_PEEK) {
3103 if (controlp != NULL) {
3104 if (*controlp == NULL) {
3105 msgpcm = controlp;
3106 }
3107 *controlp = m_copy(m, 0, m->m_len);
3108
3109 /*
3110 * If we failed to allocate an mbuf,
3111 * release any previously allocated
3112 * mbufs for control data. Return
3113 * an error. Keep the mbufs in the
3114 * socket as this is using
3115 * MSG_PEEK flag.
3116 */
3117 if (*controlp == NULL) {
3118 m_freem(*msgpcm);
3119 error = ENOBUFS;
3120 goto done;
3121 }
3122
3123 if (pr->pr_domain->dom_externalize != NULL) {
3124 sopeek_scm_rights(rights: *controlp);
3125 }
3126
3127 controlp = &(*controlp)->m_next;
3128 }
3129 m = m->m_next;
3130 } else {
3131 m->m_nextpkt = NULL;
3132 sbfree(sb: sb_rcv, m);
3133 sb_rcv->sb_mb = m->m_next;
3134 m->m_next = NULL;
3135 *cme = m;
3136 cme = &(*cme)->m_next;
3137 m = sb_rcv->sb_mb;
3138 }
3139 } while (m != NULL && m->m_type == MT_CONTROL);
3140
3141 if (!(flags & MSG_PEEK)) {
3142 if (sb_rcv->sb_mb != NULL) {
3143 sb_rcv->sb_mb->m_nextpkt = nextrecord;
3144 } else {
3145 sb_rcv->sb_mb = nextrecord;
3146 SB_EMPTY_FIXUP(sb_rcv);
3147 }
3148 if (nextrecord == NULL) {
3149 sb_rcv->sb_lastrecord = m;
3150 }
3151 }
3152
3153 SBLASTRECORDCHK(&so->so_rcv, "soreceive ctl");
3154 SBLASTMBUFCHK(&so->so_rcv, "soreceive ctl");
3155
3156 while (cm != NULL) {
3157 int cmsg_level;
3158 int cmsg_type;
3159
3160 cmn = cm->m_next;
3161 cm->m_next = NULL;
3162 cmsg_level = mtod(cm, struct cmsghdr *)->cmsg_level;
3163 cmsg_type = mtod(cm, struct cmsghdr *)->cmsg_type;
3164
3165 /*
3166 * Call the protocol to externalize SCM_RIGHTS message
3167 * and return the modified message to the caller upon
3168 * success. Otherwise, all other control messages are
3169 * returned unmodified to the caller. Note that we
3170 * only get into this loop if MSG_PEEK is not set.
3171 */
3172 if (pr->pr_domain->dom_externalize != NULL &&
3173 cmsg_level == SOL_SOCKET &&
3174 cmsg_type == SCM_RIGHTS) {
3175 /*
3176 * Release socket lock: see 3903171. This
3177 * would also allow more records to be appended
3178 * to the socket buffer. We still have SB_LOCK
3179 * set on it, so we can be sure that the head
3180 * of the mbuf chain won't change.
3181 */
3182 socket_unlock(so, refcount: 0);
3183 error = (*pr->pr_domain->dom_externalize)(cm);
3184 socket_lock(so, refcount: 0);
3185 } else {
3186 error = 0;
3187 }
3188
3189 if (controlp != NULL && error == 0) {
3190 *controlp = cm;
3191 controlp = &(*controlp)->m_next;
3192 } else {
3193 (void) m_free(cm);
3194 }
3195 cm = cmn;
3196 }
3197 /*
3198 * Update the value of nextrecord in case we received new
3199 * records when the socket was unlocked above for
3200 * externalizing SCM_RIGHTS.
3201 */
3202 if (m != NULL) {
3203 nextrecord = sb_rcv->sb_mb->m_nextpkt;
3204 } else {
3205 nextrecord = sb_rcv->sb_mb;
3206 }
3207
3208done:
3209 *mp = m;
3210 *nextrecordp = nextrecord;
3211
3212 return error;
3213}
3214
3215/*
3216 * If we have less data than requested, block awaiting more
3217 * (subject to any timeout) if:
3218 * 1. the current count is less than the low water mark, or
3219 * 2. MSG_WAITALL is set, and it is possible to do the entire
3220 * receive operation at once if we block (resid <= hiwat).
3221 * 3. MSG_DONTWAIT is not set
3222 * If MSG_WAITALL is set but resid is larger than the receive buffer,
3223 * we have to do the receive in sections, and thus risk returning
3224 * a short count if a timeout or signal occurs after we start.
3225 */
3226static boolean_t
3227so_should_wait(struct socket *so, struct uio *uio, struct mbuf *m, int flags)
3228{
3229 struct protosw *pr = so->so_proto;
3230
3231 /* No mbufs in the receive-queue? Wait! */
3232 if (m == NULL) {
3233 return true;
3234 }
3235
3236 /* Not enough data in the receive socket-buffer - we may have to wait */
3237 if ((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio_resid(a_uio: uio) &&
3238 m->m_nextpkt == NULL && (pr->pr_flags & PR_ATOMIC) == 0) {
3239 /*
3240 * Application did set the low-water mark, so we should wait for
3241 * this data to be present.
3242 */
3243 if (so->so_rcv.sb_cc < so->so_rcv.sb_lowat) {
3244 return true;
3245 }
3246
3247 /*
3248 * Application wants all the data - so let's try to do the
3249 * receive-operation at once by waiting for everything to
3250 * be there.
3251 */
3252 if ((flags & MSG_WAITALL) && uio_resid(a_uio: uio) <= so->so_rcv.sb_hiwat) {
3253 return true;
3254 }
3255 }
3256
3257 return false;
3258}
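/*
 * Concrete example of the policy above (illustrative): a blocking recv()
 * of 1000 bytes on a stream socket with sb_lowat == 1 returns as soon as
 * any data is queued, while the same call with MSG_WAITALL keeps waiting
 * until the full 1000 bytes are available (provided 1000 fits within
 * sb_hiwat) or an error/EOF cuts the wait short.
 */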
3259
3260/*
3261 * Implement receive operations on a socket.
3262 * We depend on the way that records are added to the sockbuf
3263 * by sbappend*. In particular, each record (mbufs linked through m_next)
3264 * must begin with an address if the protocol so specifies,
3265 * followed by an optional mbuf or mbufs containing ancillary data,
3266 * and then zero or more mbufs of data.
3267 * In order to avoid blocking network interrupts for the entire time here,
3268 * we splx() while doing the actual copy to user space.
3269 * Although the sockbuf is locked, new data may still be appended,
3270 * and thus we must maintain consistency of the sockbuf during that time.
3271 *
3272 * The caller may receive the data as a single mbuf chain by supplying
3273 * an mbuf **mp0 for use in returning the chain. The uio is then used
3274 * only for the count in uio_resid.
3275 *
3276 * Returns: 0 Success
3277 * ENOBUFS
3278 * ENOTCONN
3279 * EWOULDBLOCK
3280 * uiomove:EFAULT
3281 * sblock:EWOULDBLOCK
3282 * sblock:EINTR
3283 * sbwait:EBADF
3284 * sbwait:EINTR
3285 * sodelayed_copy:EFAULT
3286 * <pru_rcvoob>:EINVAL[TCP]
3287 * <pru_rcvoob>:EWOULDBLOCK[TCP]
3288 * <pru_rcvoob>:???
3289 * <pr_domain->dom_externalize>:EMSGSIZE[AF_UNIX]
3290 * <pr_domain->dom_externalize>:ENOBUFS[AF_UNIX]
3291 * <pr_domain->dom_externalize>:???
3292 *
3293 * Notes: Additional return values from calls through <pru_rcvoob> and
3294 * <pr_domain->dom_externalize> depend on protocols other than
3295 * TCP or AF_UNIX, which are documented above.
3296 */
3297int
3298soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
3299 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
3300{
3301 struct mbuf *m, **mp, *ml = NULL;
3302 struct mbuf *nextrecord, *free_list;
3303 int flags, error, offset;
3304 user_ssize_t len;
3305 struct protosw *pr = so->so_proto;
3306 int moff, type = 0;
3307 user_ssize_t orig_resid = uio_resid(a_uio: uio);
3308 user_ssize_t delayed_copy_len;
3309 int can_delay;
3310 struct proc *p = current_proc();
3311 boolean_t en_tracing = FALSE;
3312
3313 /*
3314 * Sanity check on the length passed by caller as we are making 'int'
3315 * comparisons
3316 */
3317 if (orig_resid < 0 || orig_resid > INT_MAX) {
3318 return EINVAL;
3319 }
3320
3321 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_START, so,
3322 uio_resid(uio), so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
3323 so->so_rcv.sb_hiwat);
3324
3325 socket_lock(so, refcount: 1);
3326 so_update_last_owner_locked(so, self: p);
3327 so_update_policy(so);
3328
3329#ifdef MORE_LOCKING_DEBUG
3330 if (so->so_usecount == 1) {
3331 panic("%s: so=%x no other reference on socket", __func__, so);
3332 /* NOTREACHED */
3333 }
3334#endif
3335 mp = mp0;
3336 if (psa != NULL) {
3337 *psa = NULL;
3338 }
3339 if (controlp != NULL) {
3340 *controlp = NULL;
3341 }
3342 if (flagsp != NULL) {
3343 flags = *flagsp & ~MSG_EOR;
3344 } else {
3345 flags = 0;
3346 }
3347
3348 /*
3349 * If a recv attempt is made on a previously-accepted socket
3350 * that has been marked as inactive (disconnected), reject
3351 * the request.
3352 */
3353 if (so->so_flags & SOF_DEFUNCT) {
3354 struct sockbuf *sb = &so->so_rcv;
3355
3356 error = ENOTCONN;
3357 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
3358 __func__, proc_pid(p), proc_best_name(p),
3359 so->so_gencnt,
3360 SOCK_DOM(so), SOCK_TYPE(so), error);
3361 /*
3362 * This socket should have been disconnected and flushed
3363 * prior to being returned from sodefunct(); there should
3364 * be no data on its receive list, so panic otherwise.
3365 */
3366 if (so->so_state & SS_DEFUNCT) {
3367 sb_empty_assert(sb, __func__);
3368 }
3369 socket_unlock(so, refcount: 1);
3370 return error;
3371 }
3372
3373 if ((so->so_flags1 & SOF1_PRECONNECT_DATA) &&
3374 pr->pr_usrreqs->pru_preconnect) {
3375 /*
3376 * A user may set the CONNECT_RESUME_ON_READ_WRITE flag but not
3377 * call write() right after this. *If* the app then calls read,
3378 * we do not want to block that read indefinitely. Thus,
3379 * we trigger a connect so that the session gets initiated.
3380 */
3381 error = (*pr->pr_usrreqs->pru_preconnect)(so);
3382
3383 if (error) {
3384 socket_unlock(so, refcount: 1);
3385 return error;
3386 }
3387 }
3388
3389 if (ENTR_SHOULDTRACE &&
3390 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
3391 /*
3392 * enable energy tracing for inet sockets that go over
3393 * non-loopback interfaces only.
3394 */
3395 struct inpcb *inp = sotoinpcb(so);
3396 if (inp->inp_last_outifp != NULL &&
3397 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
3398 en_tracing = TRUE;
3399 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_START,
3400 VM_KERNEL_ADDRPERM(so),
3401 ((so->so_state & SS_NBIO) ?
3402 kEnTrFlagNonBlocking : 0),
3403 (int64_t)orig_resid);
3404 }
3405 }
3406
3407 /*
3408 * When SO_WANTOOBFLAG is set we try to get out-of-band data
3409 * regardless of the flags argument. Here is the case where
3410 * out-of-band data is not inline.
3411 */
3412 if ((flags & MSG_OOB) ||
3413 ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3414 (so->so_options & SO_OOBINLINE) == 0 &&
3415 (so->so_oobmark || (so->so_state & SS_RCVATMARK)))) {
3416 m = m_get(M_WAIT, MT_DATA);
3417 if (m == NULL) {
3418 socket_unlock(so, refcount: 1);
3419 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END,
3420 ENOBUFS, 0, 0, 0, 0);
3421 return ENOBUFS;
3422 }
3423 error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
3424 if (error) {
3425 goto bad;
3426 }
3427 socket_unlock(so, refcount: 0);
3428 do {
3429 error = uiomove(mtod(m, caddr_t),
3430 n: imin(a: (int)uio_resid(a_uio: uio), b: m->m_len), uio);
3431 m = m_free(m);
3432 } while (uio_resid(a_uio: uio) && error == 0 && m != NULL);
3433 socket_lock(so, refcount: 0);
3434bad:
3435 if (m != NULL) {
3436 m_freem(m);
3437 }
3438
3439 if ((so->so_options & SO_WANTOOBFLAG) != 0) {
3440 if (error == EWOULDBLOCK || error == EINVAL) {
3441 /*
3442 * Let's try to get normal data:
3443 * EWOULDBLOCK: out-of-band data not
3444 * received yet. EINVAL: out-of-band data
3445 * already read.
3446 */
3447 error = 0;
3448 goto nooob;
3449 } else if (error == 0 && flagsp != NULL) {
3450 *flagsp |= MSG_OOB;
3451 }
3452 }
3453 socket_unlock(so, refcount: 1);
3454 if (en_tracing) {
3455 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3456 VM_KERNEL_ADDRPERM(so), 0,
3457 (int64_t)(orig_resid - uio_resid(uio)));
3458 }
3459 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3460 0, 0, 0, 0);
3461
3462 return error;
3463 }
3464nooob:
3465 if (mp != NULL) {
3466 *mp = NULL;
3467 }
3468
3469 if (so->so_state & SS_ISCONFIRMING && uio_resid(a_uio: uio)) {
3470 (*pr->pr_usrreqs->pru_rcvd)(so, 0);
3471 }
3472
3473 free_list = NULL;
3474 delayed_copy_len = 0;
3475restart:
3476#ifdef MORE_LOCKING_DEBUG
3477 if (so->so_usecount <= 1) {
3478 printf("soreceive: sblock so=0x%llx ref=%d on socket\n",
3479 (uint64_t)DEBUG_KERNEL_ADDRPERM(so), so->so_usecount);
3480 }
3481#endif
3482 /*
3483 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
3484 * and if so just return to the caller. This could happen when
3485 * soreceive() is called by a socket upcall function during the
3486 * time the socket is freed. The socket buffer would have been
3487 * locked across the upcall, therefore we cannot put this thread
3488 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
3489 * we may livelock), because the lock on the socket buffer will
3490 * only be released when the upcall routine returns to its caller.
3491 * Because the socket has been officially closed, there can be
3492 * no further read on it.
3493 *
3494 * A multipath subflow socket would have its SS_NOFDREF set by
3495 * default, so check for SOF_MP_SUBFLOW socket flag; when the
3496 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
3497 */
3498 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
3499 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
3500 socket_unlock(so, refcount: 1);
3501 return 0;
3502 }
3503
3504 error = sblock(sb: &so->so_rcv, SBLOCKWAIT(flags));
3505 if (error) {
3506 socket_unlock(so, refcount: 1);
3507 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3508 0, 0, 0, 0);
3509 if (en_tracing) {
3510 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3511 VM_KERNEL_ADDRPERM(so), 0,
3512 (int64_t)(orig_resid - uio_resid(uio)));
3513 }
3514 return error;
3515 }
3516
3517 m = so->so_rcv.sb_mb;
3518 if (so_should_wait(so, uio, m, flags)) {
3519 /*
3520 * Panic if we notice inconsistencies in the socket's
3521 * receive list; both sb_mb and sb_cc should correctly
3522 * reflect the contents of the list, otherwise we may
3523 * end up with false positives during select() or poll()
3524 * which could put the application in a bad state.
3525 */
3526 SB_MB_CHECK(&so->so_rcv);
3527
3528 if (so->so_error) {
3529 if (m != NULL) {
3530 goto dontblock;
3531 }
3532 error = so->so_error;
3533 if ((flags & MSG_PEEK) == 0) {
3534 so->so_error = 0;
3535 }
3536 goto release;
3537 }
3538 if (so->so_state & SS_CANTRCVMORE) {
3539#if CONTENT_FILTER
3540 /*
3541 * Deal with half-closed connections
3542 */
3543 if ((so->so_state & SS_ISDISCONNECTED) == 0 &&
3544 cfil_sock_data_pending(sb: &so->so_rcv) != 0) {
3545 CFIL_LOG(LOG_INFO,
3546 "so %llx ignore SS_CANTRCVMORE",
3547 (uint64_t)DEBUG_KERNEL_ADDRPERM(so));
3548 } else
3549#endif /* CONTENT_FILTER */
3550 if (m != NULL) {
3551 goto dontblock;
3552 } else {
3553 goto release;
3554 }
3555 }
3556 for (; m != NULL; m = m->m_next) {
3557 if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) {
3558 m = so->so_rcv.sb_mb;
3559 goto dontblock;
3560 }
3561 }
3562 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
3563 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
3564 error = ENOTCONN;
3565 goto release;
3566 }
3567 if (uio_resid(a_uio: uio) == 0) {
3568 goto release;
3569 }
3570
3571 if ((so->so_state & SS_NBIO) ||
3572 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
3573 error = EWOULDBLOCK;
3574 goto release;
3575 }
3576 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
3577 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
3578 sbunlock(sb: &so->so_rcv, TRUE); /* keep socket locked */
3579#if EVEN_MORE_LOCKING_DEBUG
3580 if (socket_debug) {
3581 printf("Waiting for socket data\n");
3582 }
3583#endif
3584
3585 /*
3586 * Depending on the protocol (e.g. TCP), the following
3587 * might cause the socket lock to be dropped and later
3588 * be reacquired, and more data could have arrived and
3589 * have been appended to the receive socket buffer by
3590 * the time it returns. Therefore, we sleep in
3591 * sbwait() below only if the wait-condition is still
3592 * true.
3593 */
3594 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3595 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3596 }
3597
3598 error = 0;
3599 if (so_should_wait(so, uio, m: so->so_rcv.sb_mb, flags)) {
3600 error = sbwait(sb: &so->so_rcv);
3601 }
3602
3603#if EVEN_MORE_LOCKING_DEBUG
3604 if (socket_debug) {
3605 printf("SORECEIVE - sbwait returned %d\n", error);
3606 }
3607#endif
3608 if (so->so_usecount < 1) {
3609 panic("%s: after 2nd sblock so=%p ref=%d on socket",
3610 __func__, so, so->so_usecount);
3611 /* NOTREACHED */
3612 }
3613 if (error) {
3614 socket_unlock(so, refcount: 1);
3615 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, error,
3616 0, 0, 0, 0);
3617 if (en_tracing) {
3618 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
3619 VM_KERNEL_ADDRPERM(so), 0,
3620 (int64_t)(orig_resid - uio_resid(uio)));
3621 }
3622 return error;
3623 }
3624 goto restart;
3625 }
3626dontblock:
3627 OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_msgrcv);
3628 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
3629 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
3630 nextrecord = m->m_nextpkt;
3631
3632 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
3633 error = soreceive_addr(p, so, psa, NULL, flags, mp: &m, nextrecordp: &nextrecord,
3634 canwait: mp0 == NULL);
3635 if (error == ERESTART) {
3636 goto restart;
3637 } else if (error != 0) {
3638 goto release;
3639 }
3640 orig_resid = 0;
3641 }
3642
3643 /*
3644 * Process one or more MT_CONTROL mbufs present before any data mbufs
3645 * in the first mbuf chain on the socket buffer. If MSG_PEEK, we
3646 * just copy the data; if !MSG_PEEK, we call into the protocol to
3647 * perform externalization.
3648 */
3649 if (m != NULL && m->m_type == MT_CONTROL) {
3650 error = soreceive_ctl(so, controlp, flags, mp: &m, nextrecordp: &nextrecord);
3651 if (error != 0) {
3652 goto release;
3653 }
3654 orig_resid = 0;
3655 }
3656
3657 if (m != NULL) {
3658 if (!(flags & MSG_PEEK)) {
3659 /*
3660 * We get here because m points to an mbuf following
3661 * any MT_SONAME or MT_CONTROL mbufs which have been
3662 * processed above. In any case, m should be pointing
3663 * to the head of the mbuf chain, and the nextrecord
3664 * should be either NULL or equal to m->m_nextpkt.
3665 * See comments above about SB_LOCK.
3666 */
3667 if (m != so->so_rcv.sb_mb ||
3668 m->m_nextpkt != nextrecord) {
3669 panic("%s: post-control !sync so=%p m=%p "
3670 "nextrecord=%p\n", __func__, so, m,
3671 nextrecord);
3672 /* NOTREACHED */
3673 }
3674 if (nextrecord == NULL) {
3675 so->so_rcv.sb_lastrecord = m;
3676 }
3677 }
3678 type = m->m_type;
3679 if (type == MT_OOBDATA) {
3680 flags |= MSG_OOB;
3681 }
3682 } else {
3683 if (!(flags & MSG_PEEK)) {
3684 SB_EMPTY_FIXUP(&so->so_rcv);
3685 }
3686 }
3687 SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
3688 SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
3689
3690 moff = 0;
3691 offset = 0;
3692
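	/*
	 * Decide up front whether the copy to user space may be batched:
	 * when we are consuming data (not MSG_PEEK) and the request is
	 * large enough (> sorecvmincopy), fully-consumed mbufs are parked
	 * on free_list and copied out in one sodelayed_copy() pass instead
	 * of calling uiomove() per mbuf.
	 */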
3693 if (!(flags & MSG_PEEK) && uio_resid(a_uio: uio) > sorecvmincopy) {
3694 can_delay = 1;
3695 } else {
3696 can_delay = 0;
3697 }
3698
3699 while (m != NULL &&
3700 (uio_resid(a_uio: uio) - delayed_copy_len) > 0 && error == 0) {
3701 if (m->m_type == MT_OOBDATA) {
3702 if (type != MT_OOBDATA) {
3703 break;
3704 }
3705 } else if (type == MT_OOBDATA) {
3706 break;
3707 }
3708
3709 if (!m_has_mtype(m, mtype_flags: MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
3710 break;
3711 }
3712 /*
3713 * Make sure to always set the MSG_OOB flag when getting
3714 * out-of-band data inline.
3715 */
3716 if ((so->so_options & SO_WANTOOBFLAG) != 0 &&
3717 (so->so_options & SO_OOBINLINE) != 0 &&
3718 (so->so_state & SS_RCVATMARK) != 0) {
3719 flags |= MSG_OOB;
3720 }
3721 so->so_state &= ~SS_RCVATMARK;
3722 len = uio_resid(a_uio: uio) - delayed_copy_len;
3723 if (so->so_oobmark && len > so->so_oobmark - offset) {
3724 len = so->so_oobmark - offset;
3725 }
3726 if (len > m->m_len - moff) {
3727 len = m->m_len - moff;
3728 }
3729 /*
3730 * If mp is set, just pass back the mbufs.
3731 * Otherwise copy them out via the uio, then free.
3732 * Sockbuf must be consistent here (sb_mb points to the current mbuf,
3733 * nextrecord points to the next record) when we drop priority;
3734 * we must note any additions to the sockbuf when we
3735 * block interrupts again.
3736 */
3737 if (mp == NULL) {
3738 SBLASTRECORDCHK(&so->so_rcv, "soreceive uiomove");
3739 SBLASTMBUFCHK(&so->so_rcv, "soreceive uiomove");
3740 if (can_delay && len == m->m_len) {
3741 /*
3742 * Only delay the copy if we're consuming the
3743 * mbuf, we're NOT in MSG_PEEK mode, and we
3744 * have enough data to make it worthwhile to
3745 * drop and retake the lock.  can_delay
3746 * reflects the state of the two latter
3747 * constraints; moff should always be zero
3748 * in these cases.
3749 */
3750 delayed_copy_len += len;
3751 } else {
3752 if (delayed_copy_len) {
3753 error = sodelayed_copy(so, uio,
3754 &free_list, &delayed_copy_len);
3755
3756 if (error) {
3757 goto release;
3758 }
3759 /*
3760 * We can only get here if MSG_PEEK is not
3761 * set; therefore, m should point at the
3762 * head of the rcv queue.  If it doesn't,
3763 * something drastic changed while we were
3764 * out from behind the lock in
3765 * sodelayed_copy, perhaps a RST on the
3766 * stream.  In any event, the stream has
3767 * been interrupted; it's probably best
3768 * just to return whatever data we've
3769 * already moved and let the caller
3770 * sort it out...
3771 */
3772 if (m != so->so_rcv.sb_mb) {
3773 break;
3774 }
3775 }
3776 socket_unlock(so, refcount: 0);
3777 error = uiomove(mtod(m, caddr_t) + moff,
3778 n: (int)len, uio);
3779 socket_lock(so, refcount: 0);
3780
3781 if (error) {
3782 goto release;
3783 }
3784 }
3785 } else {
3786 uio_setresid(a_uio: uio, a_value: (uio_resid(a_uio: uio) - len));
3787 }
3788 if (len == m->m_len - moff) {
3789 if (m->m_flags & M_EOR) {
3790 flags |= MSG_EOR;
3791 }
3792 if (flags & MSG_PEEK) {
3793 m = m->m_next;
3794 moff = 0;
3795 } else {
3796 nextrecord = m->m_nextpkt;
3797 sbfree(sb: &so->so_rcv, m);
3798 m->m_nextpkt = NULL;
3799
3800 if (mp != NULL) {
3801 *mp = m;
3802 mp = &m->m_next;
3803 so->so_rcv.sb_mb = m = m->m_next;
3804 *mp = NULL;
3805 } else {
3806 if (free_list == NULL) {
3807 free_list = m;
3808 } else {
3809 ml->m_next = m;
3810 }
3811 ml = m;
3812 so->so_rcv.sb_mb = m = m->m_next;
3813 ml->m_next = NULL;
3814 }
3815 if (m != NULL) {
3816 m->m_nextpkt = nextrecord;
3817 if (nextrecord == NULL) {
3818 so->so_rcv.sb_lastrecord = m;
3819 }
3820 } else {
3821 so->so_rcv.sb_mb = nextrecord;
3822 SB_EMPTY_FIXUP(&so->so_rcv);
3823 }
3824 SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
3825 SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
3826 }
3827 } else {
3828 if (flags & MSG_PEEK) {
3829 moff += len;
3830 } else {
3831 if (mp != NULL) {
3832 int copy_flag;
3833
3834 if (flags & MSG_DONTWAIT) {
3835 copy_flag = M_DONTWAIT;
3836 } else {
3837 copy_flag = M_WAIT;
3838 }
3839 *mp = m_copym(m, 0, (int)len, copy_flag);
3840 /*
3841 * Failed to allocate an mbuf?
3842 * Adjust uio_resid back, it was
3843 * adjusted down by len bytes which
3844 * we didn't copy over.
3845 */
3846 if (*mp == NULL) {
3847 uio_setresid(a_uio: uio,
3848 a_value: (uio_resid(a_uio: uio) + len));
3849 break;
3850 }
3851 }
3852 m->m_data += len;
3853 m->m_len -= len;
3854 so->so_rcv.sb_cc -= len;
3855 }
3856 }
3857 if (so->so_oobmark) {
3858 if ((flags & MSG_PEEK) == 0) {
3859 so->so_oobmark -= len;
3860 if (so->so_oobmark == 0) {
3861 so->so_state |= SS_RCVATMARK;
3862 break;
3863 }
3864 } else {
3865 offset += len;
3866 if (offset == so->so_oobmark) {
3867 break;
3868 }
3869 }
3870 }
3871 if (flags & MSG_EOR) {
3872 break;
3873 }
3874 /*
3875 * If the MSG_WAITALL or MSG_WAITSTREAM flag is set
3876 * (for non-atomic socket), we must not quit until
3877 * "uio->uio_resid == 0" or an error termination.
3878 * If a signal/timeout occurs, return with a short
3879 * count but without error. Keep sockbuf locked
3880 * against other readers.
3881 */
3882 while (flags & (MSG_WAITALL | MSG_WAITSTREAM) && m == NULL &&
3883 (uio_resid(a_uio: uio) - delayed_copy_len) > 0 &&
3884 !sosendallatonce(so) && !nextrecord) {
3885 if (so->so_error || ((so->so_state & SS_CANTRCVMORE)
3886#if CONTENT_FILTER
3887 && cfil_sock_data_pending(sb: &so->so_rcv) == 0
3888#endif /* CONTENT_FILTER */
3889 )) {
3890 goto release;
3891 }
3892
3893 /*
3894 * Depending on the protocol (e.g. TCP), the following
3895 * might cause the socket lock to be dropped and later
3896 * be reacquired, and more data could have arrived and
3897 * have been appended to the receive socket buffer by
3898 			 * the time it returns. Therefore, we sleep in
3899 			 * sbwait() below only if the socket buffer is
3900 			 * empty, in order to avoid a false sleep.
3901 */
3902 if ((pr->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL) {
3903 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3904 }
3905
3906 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 2");
3907 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 2");
3908
3909 if (so->so_rcv.sb_mb == NULL && sbwait(sb: &so->so_rcv)) {
3910 error = 0;
3911 goto release;
3912 }
3913 /*
3914 			 * We have to wait until after we get back from sbwait()
3915 			 * to do the copy, because we will drop the lock if we
3916 			 * have enough data that has been delayed. By dropping
3917 			 * the lock we open up a window allowing the netisr
3918 			 * thread to process the incoming packets and to change
3919 			 * the state of this socket. We issue the sbwait()
3920 			 * because the socket is empty and we're expecting the
3921 			 * netisr thread to wake us up when more packets arrive;
3922 			 * if we allowed that processing to happen first and then
3923 			 * called sbwait(), we could stall forever with packets
3924 			 * sitting in the socket if no further packets arrive
3925 			 * from the remote side.
3926 			 *
3927 			 * We want to copy before we've collected all the data
3928 			 * to satisfy this request, to allow the copy to overlap
3929 			 * the incoming packet processing on an MP system.
3930 */
3931 if (delayed_copy_len > sorecvmincopy &&
3932 (delayed_copy_len > (so->so_rcv.sb_hiwat / 2))) {
3933 error = sodelayed_copy(so, uio,
3934 &free_list, &delayed_copy_len);
3935
3936 if (error) {
3937 goto release;
3938 }
3939 }
3940 m = so->so_rcv.sb_mb;
3941 if (m != NULL) {
3942 nextrecord = m->m_nextpkt;
3943 }
3944 SB_MB_CHECK(&so->so_rcv);
3945 }
3946 }
3947#ifdef MORE_LOCKING_DEBUG
3948 if (so->so_usecount <= 1) {
3949 panic("%s: after big while so=%p ref=%d on socket",
3950 __func__, so, so->so_usecount);
3951 /* NOTREACHED */
3952 }
3953#endif
3954
3955 if (m != NULL && pr->pr_flags & PR_ATOMIC) {
3956 if (so->so_options & SO_DONTTRUNC) {
3957 flags |= MSG_RCVMORE;
3958 } else {
3959 flags |= MSG_TRUNC;
3960 if ((flags & MSG_PEEK) == 0) {
3961 (void) sbdroprecord(sb: &so->so_rcv);
3962 }
3963 }
3964 }
3965
3966 /*
3967 * pru_rcvd below (for TCP) may cause more data to be received
3968 * if the socket lock is dropped prior to sending the ACK; some
3969 * legacy OpenTransport applications don't handle this well
3970 * (if it receives less data than requested while MSG_HAVEMORE
3971 * is set), and so we set the flag now based on what we know
3972 * prior to calling pru_rcvd.
3973 */
3974 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
3975 flags |= MSG_HAVEMORE;
3976 }
3977
3978 if ((flags & MSG_PEEK) == 0) {
3979 if (m == NULL) {
3980 so->so_rcv.sb_mb = nextrecord;
3981 /*
3982 * First part is an inline SB_EMPTY_FIXUP(). Second
3983 * part makes sure sb_lastrecord is up-to-date if
3984 * there is still data in the socket buffer.
3985 */
3986 if (so->so_rcv.sb_mb == NULL) {
3987 so->so_rcv.sb_mbtail = NULL;
3988 so->so_rcv.sb_lastrecord = NULL;
3989 } else if (nextrecord->m_nextpkt == NULL) {
3990 so->so_rcv.sb_lastrecord = nextrecord;
3991 }
3992 SB_MB_CHECK(&so->so_rcv);
3993 }
3994 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
3995 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
3996 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) {
3997 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
3998 }
3999 }
4000
4001 if (delayed_copy_len) {
4002 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4003 if (error) {
4004 goto release;
4005 }
4006 }
4007 if (free_list != NULL) {
4008 m_freem_list(free_list);
4009 free_list = NULL;
4010 }
4011
4012 if (orig_resid == uio_resid(a_uio: uio) && orig_resid &&
4013 (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
4014 sbunlock(sb: &so->so_rcv, TRUE); /* keep socket locked */
4015 goto restart;
4016 }
4017
4018 if (flagsp != NULL) {
4019 *flagsp |= flags;
4020 }
4021release:
4022#ifdef MORE_LOCKING_DEBUG
4023 if (so->so_usecount <= 1) {
4024 panic("%s: release so=%p ref=%d on socket", __func__,
4025 so, so->so_usecount);
4026 /* NOTREACHED */
4027 }
4028#endif
4029 if (delayed_copy_len) {
4030 error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
4031 }
4032
4033 if (free_list != NULL) {
4034 m_freem_list(free_list);
4035 }
4036
4037 sbunlock(sb: &so->so_rcv, FALSE); /* will unlock socket */
4038
4039 if (en_tracing) {
4040 KERNEL_ENERGYTRACE(kEnTrActKernSockRead, DBG_FUNC_END,
4041 VM_KERNEL_ADDRPERM(so),
4042 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
4043 (int64_t)(orig_resid - uio_resid(uio)));
4044 }
4045 KERNEL_DEBUG(DBG_FNC_SORECEIVE | DBG_FUNC_END, so, uio_resid(uio),
4046 so->so_rcv.sb_cc, 0, error);
4047
4048 return error;
4049}
4050
4051/*
4052 * Returns: 0 Success
4053 * uiomove:EFAULT
4054 */
4055static int
4056sodelayed_copy(struct socket *so, struct uio *uio, struct mbuf **free_list,
4057 user_ssize_t *resid)
4058{
4059 int error = 0;
4060 struct mbuf *m;
4061
4062 m = *free_list;
4063
4064 socket_unlock(so, refcount: 0);
4065
4066 while (m != NULL && error == 0) {
4067 error = uiomove(mtod(m, caddr_t), n: (int)m->m_len, uio);
4068 m = m->m_next;
4069 }
4070 m_freem_list(*free_list);
4071
4072 *free_list = NULL;
4073 *resid = 0;
4074
4075 socket_lock(so, refcount: 0);
4076
4077 return error;
4078}
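/*
 * Illustrative sketch (not compiled): soreceive() above batches mbufs it has
 * already unlinked from the receive buffer onto free_list and defers the
 * uiomove() until enough data has accumulated, so that the copy to user space
 * can run here without the socket lock held:
 *
 *	if (can_delay && len == m->m_len)
 *		delayed_copy_len += len;
 *	else if (delayed_copy_len)
 *		error = sodelayed_copy(so, uio, &free_list, &delayed_copy_len);
 *
 * The names mirror the locals used in soreceive(); this only restates the
 * pattern already implemented there.
 */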
4079
4080int
4081soreceive_m_list(struct socket *so, u_int *pktcntp, struct mbuf **maddrp,
4082 struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
4083{
4084 struct mbuf *m, **mp;
4085 struct mbuf *nextrecord;
4086 int flags, error;
4087 struct protosw *pr = so->so_proto;
4088 struct proc *p = current_proc();
4089 u_int npkts = 0;
4090 struct mbuf *free_list = NULL;
4091 int sblocked = 0;
4092
4093 /*
4094 * Sanity check on the parameters passed by caller
4095 */
4096 if (mp0 == NULL || pktcntp == NULL) {
4097 return EINVAL;
4098 }
4099 if (*pktcntp > SO_MAX_MSG_X || *pktcntp == 0) {
4100 return EINVAL;
4101 }
4102
4103 mp = mp0;
4104 *mp0 = NULL;
4105 if (controlp != NULL) {
4106 *controlp = NULL;
4107 }
4108 if (maddrp != NULL) {
4109 *maddrp = NULL;
4110 }
4111 if (flagsp != NULL) {
4112 flags = *flagsp;
4113 } else {
4114 flags = 0;
4115 }
4116
4117 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_START, so,
4118 *pktcntp, so->so_rcv.sb_cc, so->so_rcv.sb_lowat,
4119 so->so_rcv.sb_hiwat);
4120
4121 socket_lock(so, refcount: 1);
4122 so_update_last_owner_locked(so, self: p);
4123 so_update_policy(so);
4124
4125#if NECP
4126 so_update_necp_policy(so, NULL, NULL);
4127#endif /* NECP */
4128
4129 /*
4130 * If a recv attempt is made on a previously-accepted socket
4131 * that has been marked as inactive (disconnected), reject
4132 * the request.
4133 */
4134 if (so->so_flags & SOF_DEFUNCT) {
4135 struct sockbuf *sb = &so->so_rcv;
4136
4137 error = ENOTCONN;
4138 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llu [%d,%d] (%d)\n",
4139 __func__, proc_pid(p), proc_best_name(p),
4140 so->so_gencnt,
4141 SOCK_DOM(so), SOCK_TYPE(so), error);
4142 /*
4143 * This socket should have been disconnected and flushed
4144 * prior to being returned from sodefunct(); there should
4145 * be no data on its receive list, so panic otherwise.
4146 */
4147 if (so->so_state & SS_DEFUNCT) {
4148 sb_empty_assert(sb, __func__);
4149 }
4150 goto release;
4151 }
4152
4153 *mp = NULL;
4154
4155restart:
4156 /*
4157 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
4158 * and if so just return to the caller. This could happen when
4159 * soreceive() is called by a socket upcall function during the
4160 * time the socket is freed. The socket buffer would have been
4161 * locked across the upcall, therefore we cannot put this thread
4162 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
4163 * we may livelock), because the lock on the socket buffer will
4164 * only be released when the upcall routine returns to its caller.
4165 * Because the socket has been officially closed, there can be
4166 * no further read on it.
4167 */
4168 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
4169 (SS_NOFDREF | SS_CANTRCVMORE)) {
4170 error = 0;
4171 goto out;
4172 }
4173
4174 error = sblock(sb: &so->so_rcv, SBLOCKWAIT(flags));
4175 if (error) {
4176 goto out;
4177 }
4178 sblocked = 1;
4179
4180 m = so->so_rcv.sb_mb;
4181 /*
4182 	 * Block awaiting more datagrams if needed
4183 */
4184 if (m == NULL || ((flags & MSG_DONTWAIT) == 0 &&
4185 so->so_rcv.sb_cc < so->so_rcv.sb_lowat)) {
4186 /*
4187 * Panic if we notice inconsistencies in the socket's
4188 * receive list; both sb_mb and sb_cc should correctly
4189 * reflect the contents of the list, otherwise we may
4190 * end up with false positives during select() or poll()
4191 * which could put the application in a bad state.
4192 */
4193 SB_MB_CHECK(&so->so_rcv);
4194
4195 if (so->so_error) {
4196 if (m != NULL) {
4197 goto dontblock;
4198 }
4199 error = so->so_error;
4200 if ((flags & MSG_PEEK) == 0) {
4201 so->so_error = 0;
4202 }
4203 goto release;
4204 }
4205 if (so->so_state & SS_CANTRCVMORE) {
4206 if (m != NULL) {
4207 goto dontblock;
4208 } else {
4209 goto release;
4210 }
4211 }
4212 for (; m != NULL; m = m->m_next) {
4213 if (m->m_flags & M_EOR) {
4214 m = so->so_rcv.sb_mb;
4215 goto dontblock;
4216 }
4217 }
4218 if ((so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING)) == 0 &&
4219 (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
4220 error = ENOTCONN;
4221 goto release;
4222 }
4223 if ((so->so_state & SS_NBIO) ||
4224 (flags & (MSG_DONTWAIT | MSG_NBIO))) {
4225 error = EWOULDBLOCK;
4226 goto release;
4227 }
4228 SBLASTRECORDCHK(&so->so_rcv, "soreceive sbwait 1");
4229 SBLASTMBUFCHK(&so->so_rcv, "soreceive sbwait 1");
4230
4231 sbunlock(sb: &so->so_rcv, TRUE); /* keep socket locked */
4232 sblocked = 0;
4233
4234 error = sbwait(sb: &so->so_rcv);
4235 if (error != 0) {
4236 goto release;
4237 }
4238 goto restart;
4239 }
4240dontblock:
4241 m = so->so_rcv.sb_mb;
4242 if (m == NULL) {
4243 goto release;
4244 }
4245
4246 OSIncrementAtomicLong(address: &p->p_stats->p_ru.ru_msgrcv);
4247 SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
4248 SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
4249 nextrecord = m->m_nextpkt;
4250
4251 if ((pr->pr_flags & PR_ADDR) && m->m_type == MT_SONAME) {
4252 struct mbuf *maddr = NULL;
4253
4254 error = soreceive_addr(p, so, NULL, maddrp: &maddr, flags, mp: &m,
4255 nextrecordp: &nextrecord, canwait: 1);
4256 if (error == ERESTART) {
4257 goto restart;
4258 } else if (error != 0) {
4259 goto release;
4260 }
4261
4262 if (maddr != NULL) {
4263 maddr->m_nextpkt = NULL;
4264 maddr->m_next = NULL;
4265 if (maddrp != NULL) {
4266 *maddrp = maddr;
4267 maddrp = &maddr->m_nextpkt;
4268 } else {
4269 maddr->m_next = free_list;
4270 free_list = maddr;
4271 }
4272 }
4273 }
4274
4275 /*
4276 * Process one or more MT_CONTROL mbufs present before any data mbufs
4277 * in the first mbuf chain on the socket buffer.
4278 * We call into the protocol to perform externalization.
4279 */
4280 if (m != NULL && m->m_type == MT_CONTROL) {
4281 struct mbuf *control = NULL;
4282
4283 error = soreceive_ctl(so, controlp: &control, flags, mp: &m, nextrecordp: &nextrecord);
4284 if (error != 0) {
4285 goto release;
4286 }
4287 if (control != NULL) {
4288 control->m_nextpkt = NULL;
4289 control->m_next = NULL;
4290 if (controlp != NULL) {
4291 *controlp = control;
4292 controlp = &control->m_nextpkt;
4293 } else {
4294 control->m_next = free_list;
4295 free_list = control;
4296 }
4297 }
4298 }
4299
4300 /*
4301 * Link the packet to the list
4302 */
4303 if (m != NULL) {
4304 if (!m_has_mtype(m, mtype_flags: MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
4305 panic("%s: m %p m_type %d != MT_DATA", __func__, m, m->m_type);
4306 }
4307 m->m_nextpkt = NULL;
4308 *mp = m;
4309 mp = &m->m_nextpkt;
4310 }
4311 while (m != NULL) {
4312 sbfree(sb: &so->so_rcv, m);
4313
4314 m = m->m_next;
4315 }
4316
4317 so->so_rcv.sb_mb = nextrecord;
4318 /*
4319 * First part is an inline SB_EMPTY_FIXUP(). Second
4320 * part makes sure sb_lastrecord is up-to-date if
4321 * there is still data in the socket buffer.
4322 */
4323 if (so->so_rcv.sb_mb == NULL) {
4324 so->so_rcv.sb_mbtail = NULL;
4325 so->so_rcv.sb_lastrecord = NULL;
4326 } else if (nextrecord->m_nextpkt == NULL) {
4327 so->so_rcv.sb_lastrecord = nextrecord;
4328 }
4329 SB_MB_CHECK(&so->so_rcv);
4330
4331 SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
4332 SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
4333
4334 npkts += 1;
4335
4336 /*
4337 	 * We continue as long as we have received fewer packets
4338 	 * than requested and the socket buffer is not empty.
4339 */
4340 if (npkts < *pktcntp) {
4341 if (so->so_rcv.sb_mb != NULL) {
4342 goto dontblock;
4343 }
4344 if ((flags & MSG_WAITALL) != 0) {
4345 goto restart;
4346 }
4347 }
4348
4349 if (flagsp != NULL) {
4350 *flagsp |= flags;
4351 }
4352
4353release:
4354 /*
4355 * pru_rcvd may cause more data to be received if the socket lock
4356 * is dropped so we set MSG_HAVEMORE now based on what we know.
4357 * That way the caller won't be surprised if it receives less data
4358 * than requested.
4359 */
4360 if ((so->so_options & SO_WANTMORE) && so->so_rcv.sb_cc > 0) {
4361 flags |= MSG_HAVEMORE;
4362 }
4363
4364 if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
4365 (*pr->pr_usrreqs->pru_rcvd)(so, flags);
4366 }
4367
4368 if (sblocked) {
4369 sbunlock(sb: &so->so_rcv, FALSE); /* will unlock socket */
4370 } else {
4371 socket_unlock(so, refcount: 1);
4372 }
4373
4374out:
4375 *pktcntp = npkts;
4376 /*
4377 * Amortize the cost of freeing the mbufs
4378 */
4379 if (free_list != NULL) {
4380 m_freem_list(free_list);
4381 }
4382
4383 KERNEL_DEBUG(DBG_FNC_SORECEIVE_LIST | DBG_FUNC_END, error,
4384 0, 0, 0, 0);
4385 return error;
4386}
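/*
 * Hedged usage sketch (illustrative only; the variable names are made up): a
 * datagram receive path wanting up to a fixed number of packets per pass,
 * as the recvmsg_x() support code is presumed to do, might call:
 *
 *	u_int pktcnt = 8;
 *	struct mbuf *pkts = NULL, *addrs = NULL, *ctls = NULL;
 *	int flags = MSG_DONTWAIT;
 *
 *	error = soreceive_m_list(so, &pktcnt, &addrs, &pkts, &ctls, &flags);
 *
 * On return, pktcnt holds the number of packets actually received; the
 * packets (and any address/control mbufs) are linked through m_nextpkt.
 */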
4387
4388static int
4389so_statistics_event_to_nstat_event(int64_t *input_options,
4390 uint64_t *nstat_event)
4391{
4392 int error = 0;
4393 switch (*input_options) {
4394 case SO_STATISTICS_EVENT_ENTER_CELLFALLBACK:
4395 *nstat_event = NSTAT_EVENT_SRC_ENTER_CELLFALLBACK;
4396 break;
4397 case SO_STATISTICS_EVENT_EXIT_CELLFALLBACK:
4398 *nstat_event = NSTAT_EVENT_SRC_EXIT_CELLFALLBACK;
4399 break;
4400 case SO_STATISTICS_EVENT_ATTRIBUTION_CHANGE:
4401 *nstat_event = NSTAT_EVENT_SRC_ATTRIBUTION_CHANGE;
4402 break;
4403#if (DEBUG || DEVELOPMENT)
4404 case SO_STATISTICS_EVENT_RESERVED_2:
4405 *nstat_event = NSTAT_EVENT_SRC_RESERVED_2;
4406 break;
4407#endif /* (DEBUG || DEVELOPMENT) */
4408 default:
4409 error = EINVAL;
4410 break;
4411 }
4412 return error;
4413}
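/*
 * Context sketch (not authoritative): SO_STATISTICS_EVENT carries a 64-bit
 * option value, so a user of this private option would pass something like
 *
 *	int64_t ev = SO_STATISTICS_EVENT_ENTER_CELLFALLBACK;
 *	(void) setsockopt(fd, SOL_SOCKET, SO_STATISTICS_EVENT, &ev, sizeof(ev));
 *
 * which sosetoptlock() below copies in with sooptcopyin(), translates via
 * this routine, and then hands to nstat_pcb_event().
 */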
4414
4415/*
4416 * Returns: 0 Success
4417 * EINVAL
4418 * ENOTCONN
4419 * <pru_shutdown>:EINVAL
4420 * <pru_shutdown>:EADDRNOTAVAIL[TCP]
4421 * <pru_shutdown>:ENOBUFS[TCP]
4422 * <pru_shutdown>:EMSGSIZE[TCP]
4423 * <pru_shutdown>:EHOSTUNREACH[TCP]
4424 * <pru_shutdown>:ENETUNREACH[TCP]
4425 * <pru_shutdown>:ENETDOWN[TCP]
4426 * <pru_shutdown>:ENOMEM[TCP]
4427 * <pru_shutdown>:EACCES[TCP]
4428 * <pru_shutdown>:EMSGSIZE[TCP]
4429 * <pru_shutdown>:ENOBUFS[TCP]
4430 * <pru_shutdown>:???[TCP] [ignorable: mostly IPSEC/firewall/DLIL]
4431 * <pru_shutdown>:??? [other protocol families]
4432 */
4433int
4434soshutdown(struct socket *so, int how)
4435{
4436 int error;
4437
4438 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_START, how, 0, 0, 0, 0);
4439
4440 switch (how) {
4441 case SHUT_RD:
4442 case SHUT_WR:
4443 case SHUT_RDWR:
4444 socket_lock(so, refcount: 1);
4445 if ((so->so_state &
4446 (SS_ISCONNECTED | SS_ISCONNECTING | SS_ISDISCONNECTING)) == 0) {
4447 error = ENOTCONN;
4448 } else {
4449 error = soshutdownlock(so, how);
4450 }
4451 socket_unlock(so, refcount: 1);
4452 break;
4453 default:
4454 error = EINVAL;
4455 break;
4456 }
4457
4458 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN | DBG_FUNC_END, how, error, 0, 0, 0);
4459
4460 return error;
4461}
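/*
 * For reference, this is the kernel side of shutdown(2). A typical
 * user-space half-close sequence (sketch) is:
 *
 *	send(fd, buf, len, 0);
 *	shutdown(fd, SHUT_WR);
 *	while (read(fd, buf, sizeof(buf)) > 0)
 *		;
 *
 * i.e. stop writing but keep draining what the peer still sends. ENOTCONN is
 * returned above when the socket is neither connected, connecting, nor
 * disconnecting.
 */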
4462
4463int
4464soshutdownlock_final(struct socket *so, int how)
4465{
4466 struct protosw *pr = so->so_proto;
4467 int error = 0;
4468
4469 sflt_notify(so, event: sock_evt_shutdown, param: &how);
4470
4471 if (how != SHUT_WR) {
4472 if ((so->so_state & SS_CANTRCVMORE) != 0) {
4473 /* read already shut down */
4474 error = ENOTCONN;
4475 goto done;
4476 }
4477 sorflush(so);
4478 }
4479 if (how != SHUT_RD) {
4480 if ((so->so_state & SS_CANTSENDMORE) != 0) {
4481 /* write already shut down */
4482 error = ENOTCONN;
4483 goto done;
4484 }
4485 error = (*pr->pr_usrreqs->pru_shutdown)(so);
4486 }
4487done:
4488 KERNEL_DEBUG(DBG_FNC_SOSHUTDOWN, how, 1, 0, 0, 0);
4489 return error;
4490}
4491
4492int
4493soshutdownlock(struct socket *so, int how)
4494{
4495 int error = 0;
4496
4497#if CONTENT_FILTER
4498 /*
4499 * A content filter may delay the actual shutdown until it
4500 * has processed the pending data
4501 */
4502 if (so->so_flags & SOF_CONTENT_FILTER) {
4503 error = cfil_sock_shutdown(so, how: &how);
4504 if (error == EJUSTRETURN) {
4505 error = 0;
4506 goto done;
4507 } else if (error != 0) {
4508 goto done;
4509 }
4510 }
4511#endif /* CONTENT_FILTER */
4512
4513 error = soshutdownlock_final(so, how);
4514
4515done:
4516 return error;
4517}
4518
4519void
4520sowflush(struct socket *so)
4521{
4522 struct sockbuf *sb = &so->so_snd;
4523
4524 /*
4525 * Obtain lock on the socket buffer (SB_LOCK). This is required
4526 * to prevent the socket buffer from being unexpectedly altered
4527 * while it is used by another thread in socket send/receive.
4528 *
4529 * sblock() must not fail here, hence the assertion.
4530 */
4531 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4532 VERIFY(sb->sb_flags & SB_LOCK);
4533
4534 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4535 sb->sb_flags |= SB_DROP;
4536 sb->sb_upcall = NULL;
4537 sb->sb_upcallarg = NULL;
4538
4539 sbunlock(sb, TRUE); /* keep socket locked */
4540
4541 selthreadclear(&sb->sb_sel);
4542 sbrelease(sb);
4543}
4544
4545void
4546sorflush(struct socket *so)
4547{
4548 struct sockbuf *sb = &so->so_rcv;
4549 struct protosw *pr = so->so_proto;
4550 struct sockbuf asb;
4551#ifdef notyet
4552 lck_mtx_t *mutex_held;
4553 /*
4554 * XXX: This code is currently commented out, because we may get here
4555 * as part of sofreelastref(), and at that time, pr_getlock() may no
4556 * longer be able to return us the lock; this will be fixed in future.
4557 */
4558 if (so->so_proto->pr_getlock != NULL) {
4559 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
4560 } else {
4561 mutex_held = so->so_proto->pr_domain->dom_mtx;
4562 }
4563
4564 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
4565#endif /* notyet */
4566
4567 sflt_notify(so, event: sock_evt_flush_read, NULL);
4568
4569 socantrcvmore(so);
4570
4571 /*
4572 * Obtain lock on the socket buffer (SB_LOCK). This is required
4573 * to prevent the socket buffer from being unexpectedly altered
4574 * while it is used by another thread in socket send/receive.
4575 *
4576 * sblock() must not fail here, hence the assertion.
4577 */
4578 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
4579 VERIFY(sb->sb_flags & SB_LOCK);
4580
4581 /*
4582 * Copy only the relevant fields from "sb" to "asb" which we
4583 * need for sbrelease() to function. In particular, skip
4584 * sb_sel as it contains the wait queue linkage, which would
4585 * wreak havoc if we were to issue selthreadclear() on "asb".
4586 * Make sure to not carry over SB_LOCK in "asb", as we need
4587 * to acquire it later as part of sbrelease().
4588 */
4589 bzero(s: &asb, n: sizeof(asb));
4590 asb.sb_cc = sb->sb_cc;
4591 asb.sb_hiwat = sb->sb_hiwat;
4592 asb.sb_mbcnt = sb->sb_mbcnt;
4593 asb.sb_mbmax = sb->sb_mbmax;
4594 asb.sb_ctl = sb->sb_ctl;
4595 asb.sb_lowat = sb->sb_lowat;
4596 asb.sb_mb = sb->sb_mb;
4597 asb.sb_mbtail = sb->sb_mbtail;
4598 asb.sb_lastrecord = sb->sb_lastrecord;
4599 asb.sb_so = sb->sb_so;
4600 asb.sb_flags = sb->sb_flags;
4601 asb.sb_flags &= ~(SB_LOCK | SB_SEL | SB_KNOTE | SB_UPCALL);
4602 asb.sb_flags |= SB_DROP;
4603
4604 /*
4605 * Ideally we'd bzero() these and preserve the ones we need;
4606 * but to do that we'd need to shuffle things around in the
4607 * sockbuf, and we can't do it now because there are KEXTS
4608 * that are directly referring to the socket structure.
4609 *
4610 * Setting SB_DROP acts as a barrier to prevent further appends.
4611 * Clearing SB_SEL is done for selthreadclear() below.
4612 */
4613 sb->sb_cc = 0;
4614 sb->sb_hiwat = 0;
4615 sb->sb_mbcnt = 0;
4616 sb->sb_mbmax = 0;
4617 sb->sb_ctl = 0;
4618 sb->sb_lowat = 0;
4619 sb->sb_mb = NULL;
4620 sb->sb_mbtail = NULL;
4621 sb->sb_lastrecord = NULL;
4622 sb->sb_timeo.tv_sec = 0;
4623 sb->sb_timeo.tv_usec = 0;
4624 sb->sb_upcall = NULL;
4625 sb->sb_upcallarg = NULL;
4626 sb->sb_flags &= ~(SB_SEL | SB_UPCALL);
4627 sb->sb_flags |= SB_DROP;
4628
4629 sbunlock(sb, TRUE); /* keep socket locked */
4630
4631 /*
4632 * Note that selthreadclear() is called on the original "sb" and
4633 * not the local "asb" because of the way wait queue linkage is
4634 * implemented. Given that selwakeup() may be triggered, SB_SEL
4635 * should no longer be set (cleared above.)
4636 */
4637 selthreadclear(&sb->sb_sel);
4638
4639 if ((pr->pr_flags & PR_RIGHTS) && pr->pr_domain->dom_dispose) {
4640 (*pr->pr_domain->dom_dispose)(asb.sb_mb);
4641 }
4642
4643 sbrelease(sb: &asb);
4644}
4645
4646/*
4647 * Perhaps this routine, and sooptcopyout(), below, ought to come in
4648 * an additional variant to handle the case where the option value needs
4649 * to be some kind of integer, but not a specific size.
4650 * In addition to their use here, these functions are also called by the
4651 * protocol-level pr_ctloutput() routines.
4652 *
4653 * Returns: 0 Success
4654 * EINVAL
4655 * copyin:EFAULT
4656 */
4657int
4658sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
4659{
4660 size_t valsize;
4661
4662 /*
4663 * If the user gives us more than we wanted, we ignore it,
4664 * but if we don't get the minimum length the caller
4665 * wants, we return EINVAL. On success, sopt->sopt_valsize
4666 * is set to however much we actually retrieved.
4667 */
4668 if ((valsize = sopt->sopt_valsize) < minlen) {
4669 return EINVAL;
4670 }
4671 if (valsize > len) {
4672 sopt->sopt_valsize = valsize = len;
4673 }
4674
4675 if (sopt->sopt_p != kernproc) {
4676 return copyin(sopt->sopt_val, buf, valsize);
4677 }
4678
4679 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), dst: buf, n: valsize);
4680 return 0;
4681}
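/*
 * Typical usage (sketch): socket- and protocol-level setsockopt handlers read
 * a fixed-size option value exactly as the SOL_SOCKET cases in sosetoptlock()
 * below do:
 *
 *	int optval;
 *
 *	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
 *	if (error != 0)
 *		return error;
 *
 * A larger user buffer is silently truncated to len; a shorter one fails
 * with EINVAL, per the comment above.
 */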
4682
4683/*
4684 * sooptcopyin_timeval
4685 * Copy a timeval value into tv_p, taking into account whether the
4686 * calling process is 64-bit or 32-bit.  Moved the sanity checking
4687 * code here so that we can verify the 64-bit tv_sec value before we lose
4688 * the top 32-bits assigning tv64.tv_sec to tv_p->tv_sec.
4689 */
4690static int
4691sooptcopyin_timeval(struct sockopt *sopt, struct timeval *tv_p)
4692{
4693 int error;
4694
4695 if (proc_is64bit(sopt->sopt_p)) {
4696 struct user64_timeval tv64;
4697
4698 if (sopt->sopt_valsize < sizeof(tv64)) {
4699 return EINVAL;
4700 }
4701
4702 sopt->sopt_valsize = sizeof(tv64);
4703 if (sopt->sopt_p != kernproc) {
4704 error = copyin(sopt->sopt_val, &tv64, sizeof(tv64));
4705 if (error != 0) {
4706 return error;
4707 }
4708 } else {
4709 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), dst: &tv64,
4710 n: sizeof(tv64));
4711 }
4712 if (tv64.tv_sec < 0 || tv64.tv_sec > LONG_MAX ||
4713 tv64.tv_usec < 0 || tv64.tv_usec >= 1000000) {
4714 return EDOM;
4715 }
4716
4717 tv_p->tv_sec = (__darwin_time_t)tv64.tv_sec;
4718 tv_p->tv_usec = tv64.tv_usec;
4719 } else {
4720 struct user32_timeval tv32;
4721
4722 if (sopt->sopt_valsize < sizeof(tv32)) {
4723 return EINVAL;
4724 }
4725
4726 sopt->sopt_valsize = sizeof(tv32);
4727 if (sopt->sopt_p != kernproc) {
4728 error = copyin(sopt->sopt_val, &tv32, sizeof(tv32));
4729 if (error != 0) {
4730 return error;
4731 }
4732 } else {
4733 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val), dst: &tv32,
4734 n: sizeof(tv32));
4735 }
4736#ifndef __LP64__
4737 /*
4738 * K64todo "comparison is always false due to
4739 * limited range of data type"
4740 */
4741 if (tv32.tv_sec < 0 || tv32.tv_sec > LONG_MAX ||
4742 tv32.tv_usec < 0 || tv32.tv_usec >= 1000000) {
4743 return EDOM;
4744 }
4745#endif
4746 tv_p->tv_sec = tv32.tv_sec;
4747 tv_p->tv_usec = tv32.tv_usec;
4748 }
4749 return 0;
4750}
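/*
 * The matching user-space call (sketch) passes a native struct timeval; the
 * 32-/64-bit handling above hides the layout difference from the caller:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	(void) setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * Negative tv_sec or a tv_usec outside [0, 1000000) is rejected with EDOM,
 * as enforced above.
 */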
4751
4752int
4753soopt_cred_check(struct socket *so, int priv, boolean_t allow_root,
4754 boolean_t ignore_delegate)
4755{
4756 kauth_cred_t cred = NULL;
4757 proc_t ep = PROC_NULL;
4758 uid_t uid;
4759 int error = 0;
4760
4761 if (ignore_delegate == false && so->so_flags & SOF_DELEGATED) {
4762 ep = proc_find(pid: so->e_pid);
4763 if (ep) {
4764 cred = kauth_cred_proc_ref(procp: ep);
4765 }
4766 }
4767
4768 uid = kauth_cred_getuid(cred: cred ? cred : so->so_cred);
4769
4770 /* uid is 0 for root */
4771 if (uid != 0 || !allow_root) {
4772 error = priv_check_cred(cred: cred ? cred : so->so_cred, priv, flags: 0);
4773 }
4774 if (cred) {
4775 kauth_cred_unref(&cred);
4776 }
4777 if (ep != PROC_NULL) {
4778 proc_rele(p: ep);
4779 }
4780
4781 return error;
4782}
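/*
 * Usage sketch: option handlers that gate a feature on a network privilege
 * call this with the relevant PRIV_NET_* value; for example, the
 * SO_AWDL_UNRESTRICTED case in sosetoptlock() below does
 *
 *	error = soopt_cred_check(so, PRIV_NET_RESTRICTED_AWDL, false, false);
 *
 * With allow_root set, uid 0 skips the priv_check_cred() call entirely; with
 * ignore_delegate clear, a delegated socket is checked against the credential
 * of the effective process (so->e_pid) rather than the owner's.
 */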
4783
4784/*
4785 * Returns: 0 Success
4786 * EINVAL
4787 * ENOPROTOOPT
4788 * ENOBUFS
4789 * EDOM
4790 * sooptcopyin:EINVAL
4791 * sooptcopyin:EFAULT
4792 * sooptcopyin_timeval:EINVAL
4793 * sooptcopyin_timeval:EFAULT
4794 * sooptcopyin_timeval:EDOM
4795 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
4796 * <pr_ctloutput>:???
4797 * sflt_attach_private:??? [whatever a filter author chooses]
4798 * <sf_setoption>:??? [whatever a filter author chooses]
4799 *
4800 * Notes:	Other <pr_ctloutput> returns depend on the protocol family; all
4801 *		<sf_setoption> returns depend on what the filter author causes
4802 *		their filter to return.
4803 */
4804int
4805sosetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
4806{
4807 int error, optval;
4808 int64_t long_optval;
4809 struct linger l;
4810 struct timeval tv;
4811
4812 if (sopt->sopt_dir != SOPT_SET) {
4813 sopt->sopt_dir = SOPT_SET;
4814 }
4815
4816 if (dolock) {
4817 socket_lock(so, refcount: 1);
4818 }
4819
4820 if ((so->so_state & (SS_CANTRCVMORE | SS_CANTSENDMORE)) ==
4821 (SS_CANTRCVMORE | SS_CANTSENDMORE) &&
4822 (so->so_flags & SOF_NPX_SETOPTSHUT) == 0) {
4823 /* the socket has been shutdown, no more sockopt's */
4824 error = EINVAL;
4825 goto out;
4826 }
4827
4828 error = sflt_setsockopt(so, sopt);
4829 if (error != 0) {
4830 if (error == EJUSTRETURN) {
4831 error = 0;
4832 }
4833 goto out;
4834 }
4835
4836 if (sopt->sopt_level != SOL_SOCKET) {
4837 if (so->so_proto != NULL &&
4838 so->so_proto->pr_ctloutput != NULL) {
4839 error = (*so->so_proto->pr_ctloutput)(so, sopt);
4840 goto out;
4841 }
4842 error = ENOPROTOOPT;
4843 } else {
4844 /*
4845 * Allow socket-level (SOL_SOCKET) options to be filtered by
4846 * the protocol layer, if needed. A zero value returned from
4847 * the handler means use default socket-level processing as
4848 * done by the rest of this routine. Otherwise, any other
4849 * return value indicates that the option is unsupported.
4850 */
4851 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
4852 pru_socheckopt(so, sopt)) != 0) {
4853 goto out;
4854 }
4855
4856 error = 0;
4857 switch (sopt->sopt_name) {
4858 case SO_LINGER:
4859 case SO_LINGER_SEC: {
4860 error = sooptcopyin(sopt, buf: &l, len: sizeof(l), minlen: sizeof(l));
4861 if (error != 0) {
4862 goto out;
4863 }
4864 /* Make sure to use sane values */
4865 if (sopt->sopt_name == SO_LINGER) {
4866 so->so_linger = (short)l.l_linger;
4867 } else {
4868 so->so_linger = (short)((long)l.l_linger * hz);
4869 }
4870 if (l.l_onoff != 0) {
4871 so->so_options |= SO_LINGER;
4872 } else {
4873 so->so_options &= ~SO_LINGER;
4874 }
4875 break;
4876 }
4877 case SO_DEBUG:
4878 case SO_KEEPALIVE:
4879 case SO_DONTROUTE:
4880 case SO_USELOOPBACK:
4881 case SO_BROADCAST:
4882 case SO_REUSEADDR:
4883 case SO_REUSEPORT:
4884 case SO_OOBINLINE:
4885 case SO_TIMESTAMP:
4886 case SO_TIMESTAMP_MONOTONIC:
4887 case SO_TIMESTAMP_CONTINUOUS:
4888 case SO_DONTTRUNC:
4889 case SO_WANTMORE:
4890 case SO_WANTOOBFLAG:
4891 case SO_NOWAKEFROMSLEEP:
4892 case SO_NOAPNFALLBK:
4893 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
4894 minlen: sizeof(optval));
4895 if (error != 0) {
4896 goto out;
4897 }
4898 if (optval) {
4899 so->so_options |= sopt->sopt_name;
4900 } else {
4901 so->so_options &= ~sopt->sopt_name;
4902 }
4903#if SKYWALK
4904 inp_update_netns_flags(so);
4905#endif /* SKYWALK */
4906 break;
4907
4908 case SO_SNDBUF:
4909 case SO_RCVBUF:
4910 case SO_SNDLOWAT:
4911 case SO_RCVLOWAT:
4912 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
4913 minlen: sizeof(optval));
4914 if (error != 0) {
4915 goto out;
4916 }
4917
4918 /*
4919 * Values < 1 make no sense for any of these
4920 * options, so disallow them.
4921 */
4922 if (optval < 1) {
4923 error = EINVAL;
4924 goto out;
4925 }
4926
4927 switch (sopt->sopt_name) {
4928 case SO_SNDBUF:
4929 case SO_RCVBUF: {
4930 struct sockbuf *sb =
4931 (sopt->sopt_name == SO_SNDBUF) ?
4932 &so->so_snd : &so->so_rcv;
4933 if (sbreserve(sb, cc: (u_int32_t)optval) == 0) {
4934 error = ENOBUFS;
4935 goto out;
4936 }
4937 sb->sb_flags |= SB_USRSIZE;
4938 sb->sb_flags &= ~SB_AUTOSIZE;
4939 sb->sb_idealsize = (u_int32_t)optval;
4940 break;
4941 }
4942 /*
4943 * Make sure the low-water is never greater than
4944 * the high-water.
4945 */
4946 case SO_SNDLOWAT: {
4947 int space = sbspace(sb: &so->so_snd);
4948 uint32_t hiwat = so->so_snd.sb_hiwat;
4949
4950 if (so->so_snd.sb_flags & SB_UNIX) {
4951 struct unpcb *unp =
4952 (struct unpcb *)(so->so_pcb);
4953 if (unp != NULL &&
4954 unp->unp_conn != NULL) {
4955 struct socket *so2 = unp->unp_conn->unp_socket;
4956 hiwat += unp->unp_conn->unp_cc;
4957 space = sbspace(sb: &so2->so_rcv);
4958 }
4959 }
4960
4961 so->so_snd.sb_lowat =
4962 (optval > hiwat) ?
4963 hiwat : optval;
4964
4965 if (space >= so->so_snd.sb_lowat) {
4966 sowwakeup(so);
4967 }
4968 break;
4969 }
4970 case SO_RCVLOWAT: {
4971 int64_t data_len;
4972 so->so_rcv.sb_lowat =
4973 (optval > so->so_rcv.sb_hiwat) ?
4974 so->so_rcv.sb_hiwat : optval;
4975 if (so->so_rcv.sb_flags & SB_UNIX) {
4976 struct unpcb *unp =
4977 (struct unpcb *)(so->so_pcb);
4978 if (unp != NULL &&
4979 unp->unp_conn != NULL) {
4980 struct socket *so2 = unp->unp_conn->unp_socket;
4981 data_len = so2->so_snd.sb_cc
4982 - so2->so_snd.sb_ctl;
4983 } else {
4984 data_len = so->so_rcv.sb_cc
4985 - so->so_rcv.sb_ctl;
4986 }
4987 } else {
4988 data_len = so->so_rcv.sb_cc
4989 - so->so_rcv.sb_ctl;
4990 }
4991
4992 if (data_len >= so->so_rcv.sb_lowat) {
4993 sorwakeup(so);
4994 }
4995 break;
4996 }
4997 }
4998 break;
4999
5000 case SO_SNDTIMEO:
5001 case SO_RCVTIMEO:
5002 error = sooptcopyin_timeval(sopt, tv_p: &tv);
5003 if (error != 0) {
5004 goto out;
5005 }
5006
5007 switch (sopt->sopt_name) {
5008 case SO_SNDTIMEO:
5009 so->so_snd.sb_timeo = tv;
5010 break;
5011 case SO_RCVTIMEO:
5012 so->so_rcv.sb_timeo = tv;
5013 break;
5014 }
5015 break;
5016
5017 case SO_NKE: {
5018 struct so_nke nke;
5019
5020 error = sooptcopyin(sopt, buf: &nke, len: sizeof(nke),
5021 minlen: sizeof(nke));
5022 if (error != 0) {
5023 goto out;
5024 }
5025
5026 error = sflt_attach_internal(so, handle: nke.nke_handle);
5027 break;
5028 }
5029
5030 case SO_NOSIGPIPE:
5031 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5032 minlen: sizeof(optval));
5033 if (error != 0) {
5034 goto out;
5035 }
5036 if (optval != 0) {
5037 so->so_flags |= SOF_NOSIGPIPE;
5038 } else {
5039 so->so_flags &= ~SOF_NOSIGPIPE;
5040 }
5041 break;
5042
5043 case SO_NOADDRERR:
5044 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5045 minlen: sizeof(optval));
5046 if (error != 0) {
5047 goto out;
5048 }
5049 if (optval != 0) {
5050 so->so_flags |= SOF_NOADDRAVAIL;
5051 } else {
5052 so->so_flags &= ~SOF_NOADDRAVAIL;
5053 }
5054 break;
5055
5056 case SO_REUSESHAREUID:
5057 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5058 minlen: sizeof(optval));
5059 if (error != 0) {
5060 goto out;
5061 }
5062 if (optval != 0) {
5063 so->so_flags |= SOF_REUSESHAREUID;
5064 } else {
5065 so->so_flags &= ~SOF_REUSESHAREUID;
5066 }
5067 break;
5068
5069 case SO_NOTIFYCONFLICT:
5070 if (kauth_cred_issuser(cred: kauth_cred_get()) == 0) {
5071 error = EPERM;
5072 goto out;
5073 }
5074 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5075 minlen: sizeof(optval));
5076 if (error != 0) {
5077 goto out;
5078 }
5079 if (optval != 0) {
5080 so->so_flags |= SOF_NOTIFYCONFLICT;
5081 } else {
5082 so->so_flags &= ~SOF_NOTIFYCONFLICT;
5083 }
5084 break;
5085
5086 case SO_RESTRICTIONS:
5087 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5088 minlen: sizeof(optval));
5089 if (error != 0) {
5090 goto out;
5091 }
5092
5093 error = so_set_restrictions(so, optval);
5094 break;
5095
5096 case SO_AWDL_UNRESTRICTED:
5097 if (SOCK_DOM(so) != PF_INET &&
5098 SOCK_DOM(so) != PF_INET6) {
5099 error = EOPNOTSUPP;
5100 goto out;
5101 }
5102 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5103 minlen: sizeof(optval));
5104 if (error != 0) {
5105 goto out;
5106 }
5107 if (optval != 0) {
5108 error = soopt_cred_check(so,
5109 PRIV_NET_RESTRICTED_AWDL, false, false);
5110 if (error == 0) {
5111 inp_set_awdl_unrestricted(
5112 sotoinpcb(so));
5113 }
5114 } else {
5115 inp_clear_awdl_unrestricted(sotoinpcb(so));
5116 }
5117 break;
5118 case SO_INTCOPROC_ALLOW:
5119 if (SOCK_DOM(so) != PF_INET6) {
5120 error = EOPNOTSUPP;
5121 goto out;
5122 }
5123 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5124 minlen: sizeof(optval));
5125 if (error != 0) {
5126 goto out;
5127 }
5128 if (optval != 0 &&
5129 inp_get_intcoproc_allowed(sotoinpcb(so)) == FALSE) {
5130 error = soopt_cred_check(so,
5131 PRIV_NET_RESTRICTED_INTCOPROC, false, false);
5132 if (error == 0) {
5133 inp_set_intcoproc_allowed(
5134 sotoinpcb(so));
5135 }
5136 } else if (optval == 0) {
5137 inp_clear_intcoproc_allowed(sotoinpcb(so));
5138 }
5139 break;
5140
5141 case SO_LABEL:
5142 error = EOPNOTSUPP;
5143 break;
5144
5145 case SO_UPCALLCLOSEWAIT:
5146 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5147 minlen: sizeof(optval));
5148 if (error != 0) {
5149 goto out;
5150 }
5151 if (optval != 0) {
5152 so->so_flags |= SOF_UPCALLCLOSEWAIT;
5153 } else {
5154 so->so_flags &= ~SOF_UPCALLCLOSEWAIT;
5155 }
5156 break;
5157
5158 case SO_RANDOMPORT:
5159 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5160 minlen: sizeof(optval));
5161 if (error != 0) {
5162 goto out;
5163 }
5164 if (optval != 0) {
5165 so->so_flags |= SOF_BINDRANDOMPORT;
5166 } else {
5167 so->so_flags &= ~SOF_BINDRANDOMPORT;
5168 }
5169 break;
5170
5171 case SO_NP_EXTENSIONS: {
5172 struct so_np_extensions sonpx;
5173
5174 error = sooptcopyin(sopt, buf: &sonpx, len: sizeof(sonpx),
5175 minlen: sizeof(sonpx));
5176 if (error != 0) {
5177 goto out;
5178 }
5179 if (sonpx.npx_mask & ~SONPX_MASK_VALID) {
5180 error = EINVAL;
5181 goto out;
5182 }
5183 /*
5184 * Only one bit defined for now
5185 */
5186 if ((sonpx.npx_mask & SONPX_SETOPTSHUT)) {
5187 if ((sonpx.npx_flags & SONPX_SETOPTSHUT)) {
5188 so->so_flags |= SOF_NPX_SETOPTSHUT;
5189 } else {
5190 so->so_flags &= ~SOF_NPX_SETOPTSHUT;
5191 }
5192 }
5193 break;
5194 }
5195
5196 case SO_TRAFFIC_CLASS: {
5197 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5198 minlen: sizeof(optval));
5199 if (error != 0) {
5200 goto out;
5201 }
5202 if (optval >= SO_TC_NET_SERVICE_OFFSET) {
5203 int netsvc = optval - SO_TC_NET_SERVICE_OFFSET;
5204 error = so_set_net_service_type(so, netsvc);
5205 goto out;
5206 }
5207 error = so_set_traffic_class(so, optval);
5208 if (error != 0) {
5209 goto out;
5210 }
5211 so->so_flags1 &= ~SOF1_TC_NET_SERV_TYPE;
5212 so->so_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
5213 break;
5214 }
5215
5216 case SO_RECV_TRAFFIC_CLASS: {
5217 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5218 minlen: sizeof(optval));
5219 if (error != 0) {
5220 goto out;
5221 }
5222 if (optval == 0) {
5223 so->so_flags &= ~SOF_RECV_TRAFFIC_CLASS;
5224 } else {
5225 so->so_flags |= SOF_RECV_TRAFFIC_CLASS;
5226 }
5227 break;
5228 }
5229
5230#if (DEVELOPMENT || DEBUG)
5231 case SO_TRAFFIC_CLASS_DBG: {
5232 struct so_tcdbg so_tcdbg;
5233
5234 error = sooptcopyin(sopt, &so_tcdbg,
5235 sizeof(struct so_tcdbg), sizeof(struct so_tcdbg));
5236 if (error != 0) {
5237 goto out;
5238 }
5239 error = so_set_tcdbg(so, &so_tcdbg);
5240 if (error != 0) {
5241 goto out;
5242 }
5243 break;
5244 }
5245#endif /* (DEVELOPMENT || DEBUG) */
5246
5247 case SO_PRIVILEGED_TRAFFIC_CLASS:
5248 error = priv_check_cred(cred: kauth_cred_get(),
5249 PRIV_NET_PRIVILEGED_TRAFFIC_CLASS, flags: 0);
5250 if (error != 0) {
5251 goto out;
5252 }
5253 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5254 minlen: sizeof(optval));
5255 if (error != 0) {
5256 goto out;
5257 }
5258 if (optval == 0) {
5259 so->so_flags &= ~SOF_PRIVILEGED_TRAFFIC_CLASS;
5260 } else {
5261 so->so_flags |= SOF_PRIVILEGED_TRAFFIC_CLASS;
5262 }
5263 break;
5264
5265#if (DEVELOPMENT || DEBUG)
5266 case SO_DEFUNCTIT:
5267 error = sosetdefunct(current_proc(), so, 0, FALSE);
5268 if (error == 0) {
5269 error = sodefunct(current_proc(), so, 0);
5270 }
5271
5272 break;
5273#endif /* (DEVELOPMENT || DEBUG) */
5274
5275 case SO_DEFUNCTOK:
5276 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5277 minlen: sizeof(optval));
5278 if (error != 0 || (so->so_flags & SOF_DEFUNCT)) {
5279 if (error == 0) {
5280 error = EBADF;
5281 }
5282 goto out;
5283 }
5284 /*
5285 * Any process can set SO_DEFUNCTOK (clear
5286 * SOF_NODEFUNCT), but only root can clear
5287 * SO_DEFUNCTOK (set SOF_NODEFUNCT).
5288 */
5289 if (optval == 0 &&
5290 kauth_cred_issuser(cred: kauth_cred_get()) == 0) {
5291 error = EPERM;
5292 goto out;
5293 }
5294 if (optval) {
5295 so->so_flags &= ~SOF_NODEFUNCT;
5296 } else {
5297 so->so_flags |= SOF_NODEFUNCT;
5298 }
5299
5300 if (SOCK_DOM(so) == PF_INET ||
5301 SOCK_DOM(so) == PF_INET6) {
5302 char s[MAX_IPv6_STR_LEN];
5303 char d[MAX_IPv6_STR_LEN];
5304 struct inpcb *inp = sotoinpcb(so);
5305
5306 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu "
5307 "[%s %s:%d -> %s:%d] is now marked "
5308 "as %seligible for "
5309 "defunct\n", __func__, proc_selfpid(),
5310 proc_best_name(current_proc()),
5311 so->so_gencnt,
5312 (SOCK_TYPE(so) == SOCK_STREAM) ?
5313 "TCP" : "UDP", inet_ntop(SOCK_DOM(so),
5314 ((SOCK_DOM(so) == PF_INET) ?
5315 (void *)&inp->inp_laddr.s_addr :
5316 (void *)&inp->in6p_laddr), s, sizeof(s)),
5317 ntohs(inp->in6p_lport),
5318 inet_ntop(SOCK_DOM(so),
5319 (SOCK_DOM(so) == PF_INET) ?
5320 (void *)&inp->inp_faddr.s_addr :
5321 (void *)&inp->in6p_faddr, d, sizeof(d)),
5322 ntohs(inp->in6p_fport),
5323 (so->so_flags & SOF_NODEFUNCT) ?
5324 "not " : "");
5325 } else {
5326 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
5327 "is now marked as %seligible for "
5328 "defunct\n",
5329 __func__, proc_selfpid(),
5330 proc_best_name(current_proc()),
5331 so->so_gencnt,
5332 SOCK_DOM(so), SOCK_TYPE(so),
5333 (so->so_flags & SOF_NODEFUNCT) ?
5334 "not " : "");
5335 }
5336 break;
5337
5338 case SO_ISDEFUNCT:
5339 /* This option is not settable */
5340 error = EINVAL;
5341 break;
5342
5343 case SO_OPPORTUNISTIC:
5344 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5345 minlen: sizeof(optval));
5346 if (error == 0) {
5347 error = so_set_opportunistic(so, optval);
5348 }
5349 break;
5350
5351 case SO_FLUSH:
5352 /* This option is handled by lower layer(s) */
5353 error = 0;
5354 break;
5355
5356 case SO_RECV_ANYIF:
5357 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5358 minlen: sizeof(optval));
5359 if (error == 0) {
5360 error = so_set_recv_anyif(so, optval);
5361 }
5362 break;
5363
5364 case SO_TRAFFIC_MGT_BACKGROUND: {
5365 /* This option is handled by lower layer(s) */
5366 error = 0;
5367 break;
5368 }
5369
5370#if FLOW_DIVERT
5371 case SO_FLOW_DIVERT_TOKEN:
5372 error = flow_divert_token_set(so, sopt);
5373 break;
5374#endif /* FLOW_DIVERT */
5375
5376
5377 case SO_DELEGATED:
5378 if ((error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5379 minlen: sizeof(optval))) != 0) {
5380 break;
5381 }
5382
5383 error = so_set_effective_pid(so, epid: optval, p: sopt->sopt_p, true);
5384 break;
5385
5386 case SO_DELEGATED_UUID: {
5387 uuid_t euuid;
5388
5389 if ((error = sooptcopyin(sopt, buf: &euuid, len: sizeof(euuid),
5390 minlen: sizeof(euuid))) != 0) {
5391 break;
5392 }
5393
5394 error = so_set_effective_uuid(so, euuid, p: sopt->sopt_p, true);
5395 break;
5396 }
5397
5398#if NECP
5399 case SO_NECP_ATTRIBUTES:
5400 if (SOCK_DOM(so) == PF_MULTIPATH) {
5401 /* Handled by MPTCP itself */
5402 break;
5403 }
5404
5405 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5406 error = EINVAL;
5407 goto out;
5408 }
5409
5410 error = necp_set_socket_attributes(attributes: &sotoinpcb(so)->inp_necp_attributes, sopt);
5411 break;
5412
5413 case SO_NECP_CLIENTUUID: {
5414 if (SOCK_DOM(so) == PF_MULTIPATH) {
5415 /* Handled by MPTCP itself */
5416 break;
5417 }
5418
5419 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5420 error = EINVAL;
5421 goto out;
5422 }
5423
5424 struct inpcb *inp = sotoinpcb(so);
5425 if (!uuid_is_null(uu: inp->necp_client_uuid)) {
5426 // Clear out the old client UUID if present
5427 necp_inpcb_remove_cb(inp);
5428 }
5429
5430 error = sooptcopyin(sopt, buf: &inp->necp_client_uuid,
5431 len: sizeof(uuid_t), minlen: sizeof(uuid_t));
5432 if (error != 0) {
5433 goto out;
5434 }
5435
5436 if (uuid_is_null(uu: inp->necp_client_uuid)) {
5437 error = EINVAL;
5438 goto out;
5439 }
5440
5441 pid_t current_pid = proc_pid(current_proc());
5442 error = necp_client_register_socket_flow(pid: current_pid,
5443 client_id: inp->necp_client_uuid, inp);
5444 if (error != 0) {
5445 uuid_clear(uu: inp->necp_client_uuid);
5446 goto out;
5447 }
5448
5449 if (inp->inp_lport != 0) {
5450 // There is a bound local port, so this is not
5451 // a fresh socket. Assign to the client.
5452 necp_client_assign_from_socket(pid: current_pid, client_id: inp->necp_client_uuid, inp);
5453 }
5454
5455 break;
5456 }
5457 case SO_NECP_LISTENUUID: {
5458 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5459 error = EINVAL;
5460 goto out;
5461 }
5462
5463 struct inpcb *inp = sotoinpcb(so);
5464 if (!uuid_is_null(uu: inp->necp_client_uuid)) {
5465 error = EINVAL;
5466 goto out;
5467 }
5468
5469 error = sooptcopyin(sopt, buf: &inp->necp_client_uuid,
5470 len: sizeof(uuid_t), minlen: sizeof(uuid_t));
5471 if (error != 0) {
5472 goto out;
5473 }
5474
5475 if (uuid_is_null(uu: inp->necp_client_uuid)) {
5476 error = EINVAL;
5477 goto out;
5478 }
5479
5480 error = necp_client_register_socket_listener(pid: proc_pid(current_proc()),
5481 client_id: inp->necp_client_uuid, inp);
5482 if (error != 0) {
5483 uuid_clear(uu: inp->necp_client_uuid);
5484 goto out;
5485 }
5486
5487 // Mark that the port registration is held by NECP
5488 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
5489
5490 break;
5491 }
5492
5493 case SO_RESOLVER_SIGNATURE: {
5494 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5495 error = EINVAL;
5496 goto out;
5497 }
5498 error = necp_set_socket_resolver_signature(sotoinpcb(so), sopt);
5499 break;
5500 }
5501#endif /* NECP */
5502
5503 case SO_EXTENDED_BK_IDLE:
5504 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5505 minlen: sizeof(optval));
5506 if (error == 0) {
5507 error = so_set_extended_bk_idle(so, optval);
5508 }
5509 break;
5510
5511 case SO_MARK_CELLFALLBACK:
5512 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5513 minlen: sizeof(optval));
5514 if (error != 0) {
5515 goto out;
5516 }
5517 if (optval < 0) {
5518 error = EINVAL;
5519 goto out;
5520 }
5521 if (optval == 0) {
5522 so->so_flags1 &= ~SOF1_CELLFALLBACK;
5523 } else {
5524 so->so_flags1 |= SOF1_CELLFALLBACK;
5525 }
5526 break;
5527
5528 case SO_MARK_CELLFALLBACK_UUID:
5529 {
5530 struct so_mark_cellfallback_uuid_args args;
5531
5532 error = sooptcopyin(sopt, buf: &args, len: sizeof(args),
5533 minlen: sizeof(args));
5534 if (error != 0) {
5535 goto out;
5536 }
5537 error = nstat_userland_mark_rnf_override(fuuid: args.flow_uuid,
5538 rnf_override: args.flow_cellfallback);
5539 break;
5540 }
5541
5542 case SO_FALLBACK_MODE:
5543 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5544 minlen: sizeof(optval));
5545 if (error != 0) {
5546 goto out;
5547 }
5548 if (optval < SO_FALLBACK_MODE_NONE ||
5549 optval > SO_FALLBACK_MODE_PREFER) {
5550 error = EINVAL;
5551 goto out;
5552 }
5553 so->so_fallback_mode = (u_int8_t)optval;
5554 break;
5555
5556 case SO_MARK_KNOWN_TRACKER: {
5557 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5558 minlen: sizeof(optval));
5559 if (error != 0) {
5560 goto out;
5561 }
5562 if (optval < 0) {
5563 error = EINVAL;
5564 goto out;
5565 }
5566 if (optval == 0) {
5567 so->so_flags1 &= ~SOF1_KNOWN_TRACKER;
5568 } else {
5569 so->so_flags1 |= SOF1_KNOWN_TRACKER;
5570 }
5571 break;
5572 }
5573
5574 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
5575 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5576 minlen: sizeof(optval));
5577 if (error != 0) {
5578 goto out;
5579 }
5580 if (optval < 0) {
5581 error = EINVAL;
5582 goto out;
5583 }
5584 if (optval == 0) {
5585 so->so_flags1 &= ~SOF1_TRACKER_NON_APP_INITIATED;
5586 } else {
5587 so->so_flags1 |= SOF1_TRACKER_NON_APP_INITIATED;
5588 }
5589 break;
5590 }
5591
5592 case SO_MARK_APPROVED_APP_DOMAIN: {
5593 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5594 minlen: sizeof(optval));
5595 if (error != 0) {
5596 goto out;
5597 }
5598 if (optval < 0) {
5599 error = EINVAL;
5600 goto out;
5601 }
5602 if (optval == 0) {
5603 so->so_flags1 &= ~SOF1_APPROVED_APP_DOMAIN;
5604 } else {
5605 so->so_flags1 |= SOF1_APPROVED_APP_DOMAIN;
5606 }
5607 break;
5608 }
5609
5610 case SO_STATISTICS_EVENT:
5611 error = sooptcopyin(sopt, buf: &long_optval,
5612 len: sizeof(long_optval), minlen: sizeof(long_optval));
5613 if (error != 0) {
5614 goto out;
5615 }
5616 u_int64_t nstat_event = 0;
5617 error = so_statistics_event_to_nstat_event(
5618 input_options: &long_optval, nstat_event: &nstat_event);
5619 if (error != 0) {
5620 goto out;
5621 }
5622 nstat_pcb_event(sotoinpcb(so), event: nstat_event);
5623 break;
5624
5625 case SO_NET_SERVICE_TYPE: {
5626 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5627 minlen: sizeof(optval));
5628 if (error != 0) {
5629 goto out;
5630 }
5631 error = so_set_net_service_type(so, optval);
5632 break;
5633 }
5634
5635 case SO_QOSMARKING_POLICY_OVERRIDE:
5636 error = priv_check_cred(cred: kauth_cred_get(),
5637 PRIV_NET_QOSMARKING_POLICY_OVERRIDE, flags: 0);
5638 if (error != 0) {
5639 goto out;
5640 }
5641 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5642 minlen: sizeof(optval));
5643 if (error != 0) {
5644 goto out;
5645 }
5646 if (optval == 0) {
5647 so->so_flags1 &= ~SOF1_QOSMARKING_POLICY_OVERRIDE;
5648 } else {
5649 so->so_flags1 |= SOF1_QOSMARKING_POLICY_OVERRIDE;
5650 }
5651 break;
5652
5653 case SO_MPKL_SEND_INFO: {
5654 struct so_mpkl_send_info so_mpkl_send_info;
5655
5656 error = sooptcopyin(sopt, buf: &so_mpkl_send_info,
5657 len: sizeof(struct so_mpkl_send_info), minlen: sizeof(struct so_mpkl_send_info));
5658 if (error != 0) {
5659 goto out;
5660 }
5661 uuid_copy(dst: so->so_mpkl_send_uuid, src: so_mpkl_send_info.mpkl_uuid);
5662 so->so_mpkl_send_proto = so_mpkl_send_info.mpkl_proto;
5663
5664 if (uuid_is_null(uu: so->so_mpkl_send_uuid) && so->so_mpkl_send_proto == 0) {
5665 so->so_flags1 &= ~SOF1_MPKL_SEND_INFO;
5666 } else {
5667 so->so_flags1 |= SOF1_MPKL_SEND_INFO;
5668 }
5669 break;
5670 }
5671 case SO_WANT_KEV_SOCKET_CLOSED: {
5672 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5673 minlen: sizeof(optval));
5674 if (error != 0) {
5675 goto out;
5676 }
5677 if (optval == 0) {
5678 so->so_flags1 &= ~SOF1_WANT_KEV_SOCK_CLOSED;
5679 } else {
5680 so->so_flags1 |= SOF1_WANT_KEV_SOCK_CLOSED;
5681 }
5682 break;
5683 }
5684 case SO_MARK_WAKE_PKT: {
5685 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5686 minlen: sizeof(optval));
5687 if (error != 0) {
5688 goto out;
5689 }
5690 if (optval == 0) {
5691 so->so_flags &= ~SOF_MARK_WAKE_PKT;
5692 } else {
5693 so->so_flags |= SOF_MARK_WAKE_PKT;
5694 }
5695 break;
5696 }
5697 case SO_RECV_WAKE_PKT: {
5698 error = sooptcopyin(sopt, buf: &optval, len: sizeof(optval),
5699 minlen: sizeof(optval));
5700 if (error != 0) {
5701 goto out;
5702 }
5703 if (optval == 0) {
5704 so->so_flags &= ~SOF_RECV_WAKE_PKT;
5705 } else {
5706 so->so_flags |= SOF_RECV_WAKE_PKT;
5707 }
5708 break;
5709 }
5710 case SO_APPLICATION_ID: {
5711 so_application_id_t application_id = { 0 };
5712
5713 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
5714 error = EINVAL;
5715 goto out;
5716 }
5717 error = sooptcopyin(sopt, buf: &application_id, len: sizeof(application_id),
5718 minlen: sizeof(application_id));
5719 if (error != 0) {
5720 goto out;
5721 }
5722
5723 // The user needs to match
5724 if (kauth_cred_getuid(cred: so->so_cred) != application_id.uid) {
5725 error = EINVAL;
5726 printf("setsockopt: SO_APPLICATION_ID - wrong uid");
5727 goto out;
5728 }
5729 error = so_set_effective_uuid(so, euuid: application_id.effective_uuid, p: sopt->sopt_p, true);
5730 if (error != 0) {
5731 printf("setsockopt: SO_APPLICATION_ID - failed to set e_uuid");
5732 goto out;
5733 }
5734 if (application_id.persona_id != PERSONA_ID_NONE) {
5735 so->so_persona_id = application_id.persona_id;
5736 }
5737 break;
5738 }
5739 default:
5740 error = ENOPROTOOPT;
5741 break;
5742 }
5743 if (error == 0 && so->so_proto != NULL &&
5744 so->so_proto->pr_ctloutput != NULL) {
5745 (void) so->so_proto->pr_ctloutput(so, sopt);
5746 }
5747 }
5748out:
5749 if (dolock) {
5750 socket_unlock(so, refcount: 1);
5751 }
5752 return error;
5753}
5754
5755/* Helper routines for getsockopt */
5756int
5757sooptcopyout(struct sockopt *sopt, void *buf, size_t len)
5758{
5759 int error;
5760 size_t valsize;
5761
5762 error = 0;
5763
5764 /*
5765 * Documented get behavior is that we always return a value,
5766 * possibly truncated to fit in the user's buffer.
5767 * Traditional behavior is that we always tell the user
5768 * precisely how much we copied, rather than something useful
5769 * like the total amount we had available for her.
5770 * Note that this interface is not idempotent; the entire answer must
5771 * be generated ahead of time.
5772 */
5773 valsize = MIN(len, sopt->sopt_valsize);
5774 sopt->sopt_valsize = valsize;
5775 if (sopt->sopt_val != USER_ADDR_NULL) {
5776 if (sopt->sopt_p != kernproc) {
5777 error = copyout(buf, sopt->sopt_val, valsize);
5778 } else {
5779 bcopy(src: buf, CAST_DOWN(caddr_t, sopt->sopt_val), n: valsize);
5780 }
5781 }
5782 return error;
5783}
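/*
 * Typical usage (sketch): getsockopt handlers return a fixed-size value the
 * same way the SOL_SOCKET cases in sogetoptlock() below do:
 *
 *	int optval = so->so_snd.sb_lowat;
 *
 *	return sooptcopyout(sopt, &optval, sizeof(optval));
 *
 * The result is silently truncated to the user's buffer size, per the
 * documented behavior above.
 */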
5784
5785static int
5786sooptcopyout_timeval(struct sockopt *sopt, const struct timeval *tv_p)
5787{
5788 int error;
5789 size_t len;
5790 struct user64_timeval tv64 = {};
5791 struct user32_timeval tv32 = {};
5792 const void * val;
5793 size_t valsize;
5794
5795 error = 0;
5796 if (proc_is64bit(sopt->sopt_p)) {
5797 len = sizeof(tv64);
5798 tv64.tv_sec = tv_p->tv_sec;
5799 tv64.tv_usec = tv_p->tv_usec;
5800 val = &tv64;
5801 } else {
5802 len = sizeof(tv32);
5803 tv32.tv_sec = (user32_time_t)tv_p->tv_sec;
5804 tv32.tv_usec = tv_p->tv_usec;
5805 val = &tv32;
5806 }
5807 valsize = MIN(len, sopt->sopt_valsize);
5808 sopt->sopt_valsize = valsize;
5809 if (sopt->sopt_val != USER_ADDR_NULL) {
5810 if (sopt->sopt_p != kernproc) {
5811 error = copyout(val, sopt->sopt_val, valsize);
5812 } else {
5813 bcopy(src: val, CAST_DOWN(caddr_t, sopt->sopt_val), n: valsize);
5814 }
5815 }
5816 return error;
5817}
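/*
 * The user-space counterpart (sketch) reads the timeout back as a native
 * struct timeval, independent of the 32-/64-bit split handled above:
 *
 *	struct timeval tv;
 *	socklen_t len = sizeof(tv);
 *	(void) getsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, &len);
 */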
5818
5819/*
5820 * Return: 0 Success
5821 * ENOPROTOOPT
5822 * <pr_ctloutput>:EOPNOTSUPP[AF_UNIX]
5823 * <pr_ctloutput>:???
5824 * <sf_getoption>:???
5825 */
5826int
5827sogetoptlock(struct socket *so, struct sockopt *sopt, int dolock)
5828{
5829 int error, optval;
5830 struct linger l;
5831 struct timeval tv;
5832
5833 if (sopt->sopt_dir != SOPT_GET) {
5834 sopt->sopt_dir = SOPT_GET;
5835 }
5836
5837 if (dolock) {
5838 socket_lock(so, refcount: 1);
5839 }
5840
5841 error = sflt_getsockopt(so, sopt);
5842 if (error != 0) {
5843 if (error == EJUSTRETURN) {
5844 error = 0;
5845 }
5846 goto out;
5847 }
5848
5849 if (sopt->sopt_level != SOL_SOCKET) {
5850 if (so->so_proto != NULL &&
5851 so->so_proto->pr_ctloutput != NULL) {
5852 error = (*so->so_proto->pr_ctloutput)(so, sopt);
5853 goto out;
5854 }
5855 error = ENOPROTOOPT;
5856 } else {
5857 /*
5858 * Allow socket-level (SOL_SOCKET) options to be filtered by
5859 * the protocol layer, if needed. A zero value returned from
5860 * the handler means use default socket-level processing as
5861 * done by the rest of this routine. Otherwise, any other
5862 * return value indicates that the option is unsupported.
5863 */
5864 if (so->so_proto != NULL && (error = so->so_proto->pr_usrreqs->
5865 pru_socheckopt(so, sopt)) != 0) {
5866 goto out;
5867 }
5868
5869 error = 0;
5870 switch (sopt->sopt_name) {
5871 case SO_LINGER:
5872 case SO_LINGER_SEC:
5873 l.l_onoff = ((so->so_options & SO_LINGER) ? 1 : 0);
5874 l.l_linger = (sopt->sopt_name == SO_LINGER) ?
5875 so->so_linger : so->so_linger / hz;
5876 error = sooptcopyout(sopt, buf: &l, len: sizeof(l));
5877 break;
5878
5879 case SO_USELOOPBACK:
5880 case SO_DONTROUTE:
5881 case SO_DEBUG:
5882 case SO_KEEPALIVE:
5883 case SO_REUSEADDR:
5884 case SO_REUSEPORT:
5885 case SO_BROADCAST:
5886 case SO_OOBINLINE:
5887 case SO_TIMESTAMP:
5888 case SO_TIMESTAMP_MONOTONIC:
5889 case SO_TIMESTAMP_CONTINUOUS:
5890 case SO_DONTTRUNC:
5891 case SO_WANTMORE:
5892 case SO_WANTOOBFLAG:
5893 case SO_NOWAKEFROMSLEEP:
5894 case SO_NOAPNFALLBK:
5895 optval = so->so_options & sopt->sopt_name;
5896integer:
5897 error = sooptcopyout(sopt, buf: &optval, len: sizeof(optval));
5898 break;
5899
5900 case SO_TYPE:
5901 optval = so->so_type;
5902 goto integer;
5903
5904 case SO_NREAD:
5905 if (so->so_proto->pr_flags & PR_ATOMIC) {
5906 int pkt_total;
5907 struct mbuf *m1;
5908
5909 pkt_total = 0;
5910 m1 = so->so_rcv.sb_mb;
5911 while (m1 != NULL) {
5912 if (m_has_mtype(m: m1, mtype_flags: MTF_DATA | MTF_HEADER | MTF_OOBDATA)) {
5913 pkt_total += m1->m_len;
5914 }
5915 m1 = m1->m_next;
5916 }
5917 optval = pkt_total;
5918 } else {
5919 optval = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
5920 }
5921 goto integer;
5922
5923 case SO_NUMRCVPKT:
5924 if (so->so_proto->pr_flags & PR_ATOMIC) {
5925 int cnt = 0;
5926 struct mbuf *m1;
5927
5928 m1 = so->so_rcv.sb_mb;
5929 while (m1 != NULL) {
5930 cnt += 1;
5931 m1 = m1->m_nextpkt;
5932 }
5933 optval = cnt;
5934 goto integer;
5935 } else {
5936 error = ENOPROTOOPT;
5937 break;
5938 }
5939
5940 case SO_NWRITE:
5941 optval = so->so_snd.sb_cc;
5942 goto integer;
5943
5944 case SO_ERROR:
5945 optval = so->so_error;
5946 so->so_error = 0;
5947 goto integer;
5948
5949 case SO_SNDBUF: {
5950 u_int32_t hiwat = so->so_snd.sb_hiwat;
5951
5952 if (so->so_snd.sb_flags & SB_UNIX) {
5953 struct unpcb *unp =
5954 (struct unpcb *)(so->so_pcb);
5955 if (unp != NULL && unp->unp_conn != NULL) {
5956 hiwat += unp->unp_conn->unp_cc;
5957 }
5958 }
5959
5960 optval = hiwat;
5961 goto integer;
5962 }
5963 case SO_RCVBUF:
5964 optval = so->so_rcv.sb_hiwat;
5965 goto integer;
5966
5967 case SO_SNDLOWAT:
5968 optval = so->so_snd.sb_lowat;
5969 goto integer;
5970
5971 case SO_RCVLOWAT:
5972 optval = so->so_rcv.sb_lowat;
5973 goto integer;
5974
5975 case SO_SNDTIMEO:
5976 case SO_RCVTIMEO:
5977 tv = (sopt->sopt_name == SO_SNDTIMEO ?
5978 so->so_snd.sb_timeo : so->so_rcv.sb_timeo);
5979
5980 error = sooptcopyout_timeval(sopt, tv_p: &tv);
5981 break;
5982
5983 case SO_NOSIGPIPE:
5984 optval = (so->so_flags & SOF_NOSIGPIPE);
5985 goto integer;
5986
5987 case SO_NOADDRERR:
5988 optval = (so->so_flags & SOF_NOADDRAVAIL);
5989 goto integer;
5990
5991 case SO_REUSESHAREUID:
5992 optval = (so->so_flags & SOF_REUSESHAREUID);
5993 goto integer;
5994
5996 case SO_NOTIFYCONFLICT:
5997 optval = (so->so_flags & SOF_NOTIFYCONFLICT);
5998 goto integer;
5999
6000 case SO_RESTRICTIONS:
6001 optval = so_get_restrictions(so);
6002 goto integer;
6003
6004 case SO_AWDL_UNRESTRICTED:
6005 if (SOCK_DOM(so) == PF_INET ||
6006 SOCK_DOM(so) == PF_INET6) {
6007 optval = inp_get_awdl_unrestricted(
6008 sotoinpcb(so));
6009 goto integer;
6010 } else {
6011 error = EOPNOTSUPP;
6012 }
6013 break;
6014
6015 case SO_INTCOPROC_ALLOW:
6016 if (SOCK_DOM(so) == PF_INET6) {
6017 optval = inp_get_intcoproc_allowed(
6018 sotoinpcb(so));
6019 goto integer;
6020 } else {
6021 error = EOPNOTSUPP;
6022 }
6023 break;
6024
6025 case SO_LABEL:
6026 error = EOPNOTSUPP;
6027 break;
6028
6029 case SO_PEERLABEL:
6030 error = EOPNOTSUPP;
6031 break;
6032
6033#ifdef __APPLE_API_PRIVATE
6034 case SO_UPCALLCLOSEWAIT:
6035 optval = (so->so_flags & SOF_UPCALLCLOSEWAIT);
6036 goto integer;
6037#endif
6038 case SO_RANDOMPORT:
6039 optval = (so->so_flags & SOF_BINDRANDOMPORT);
6040 goto integer;
6041
6042 case SO_NP_EXTENSIONS: {
6043 struct so_np_extensions sonpx = {};
6044
6045 sonpx.npx_flags = (so->so_flags & SOF_NPX_SETOPTSHUT) ?
6046 SONPX_SETOPTSHUT : 0;
6047 sonpx.npx_mask = SONPX_MASK_VALID;
6048
6049 error = sooptcopyout(sopt, buf: &sonpx,
6050 len: sizeof(struct so_np_extensions));
6051 break;
6052 }
6053
6054 case SO_TRAFFIC_CLASS:
6055 optval = so->so_traffic_class;
6056 goto integer;
6057
6058 case SO_RECV_TRAFFIC_CLASS:
6059 optval = (so->so_flags & SOF_RECV_TRAFFIC_CLASS);
6060 goto integer;
6061
6062#if (DEVELOPMENT || DEBUG)
6063 case SO_TRAFFIC_CLASS_DBG:
6064 error = sogetopt_tcdbg(so, sopt);
6065 break;
6066#endif /* (DEVELOPMENT || DEBUG) */
6067
6068 case SO_PRIVILEGED_TRAFFIC_CLASS:
6069 optval = (so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS);
6070 goto integer;
6071
6072 case SO_DEFUNCTOK:
6073 optval = !(so->so_flags & SOF_NODEFUNCT);
6074 goto integer;
6075
6076 case SO_ISDEFUNCT:
6077 optval = (so->so_flags & SOF_DEFUNCT);
6078 goto integer;
6079
6080 case SO_OPPORTUNISTIC:
6081 optval = so_get_opportunistic(so);
6082 goto integer;
6083
6084 case SO_FLUSH:
6085 /* This option is not gettable */
6086 error = EINVAL;
6087 break;
6088
6089 case SO_RECV_ANYIF:
6090 optval = so_get_recv_anyif(so);
6091 goto integer;
6092
6093 case SO_TRAFFIC_MGT_BACKGROUND:
6094 /* This option is handled by lower layer(s) */
6095 if (so->so_proto != NULL &&
6096 so->so_proto->pr_ctloutput != NULL) {
6097 (void) so->so_proto->pr_ctloutput(so, sopt);
6098 }
6099 break;
6100
6101#if FLOW_DIVERT
6102 case SO_FLOW_DIVERT_TOKEN:
6103 error = flow_divert_token_get(so, sopt);
6104 break;
6105#endif /* FLOW_DIVERT */
6106
6107#if NECP
6108 case SO_NECP_ATTRIBUTES:
6109 if (SOCK_DOM(so) == PF_MULTIPATH) {
6110 /* Handled by MPTCP itself */
6111 break;
6112 }
6113
6114 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6115 error = EINVAL;
6116 goto out;
6117 }
6118
6119 error = necp_get_socket_attributes(attributes: &sotoinpcb(so)->inp_necp_attributes, sopt);
6120 break;
6121
6122 case SO_NECP_CLIENTUUID: {
6123 uuid_t *ncu;
6124
6125 if (SOCK_DOM(so) == PF_MULTIPATH) {
6126 ncu = &mpsotomppcb(mp_so: so)->necp_client_uuid;
6127 } else if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6128 ncu = &sotoinpcb(so)->necp_client_uuid;
6129 } else {
6130 error = EINVAL;
6131 goto out;
6132 }
6133
6134 error = sooptcopyout(sopt, buf: ncu, len: sizeof(uuid_t));
6135 break;
6136 }
6137
6138 case SO_NECP_LISTENUUID: {
6139 uuid_t *nlu;
6140
6141 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
6142 if (sotoinpcb(so)->inp_flags2 & INP2_EXTERNAL_PORT) {
6143 nlu = &sotoinpcb(so)->necp_client_uuid;
6144 } else {
6145 error = ENOENT;
6146 goto out;
6147 }
6148 } else {
6149 error = EINVAL;
6150 goto out;
6151 }
6152
6153 error = sooptcopyout(sopt, buf: nlu, len: sizeof(uuid_t));
6154 break;
6155 }
6156
6157 case SO_RESOLVER_SIGNATURE: {
6158 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6159 error = EINVAL;
6160 goto out;
6161 }
6162 error = necp_get_socket_resolver_signature(sotoinpcb(so), sopt);
6163 break;
6164 }
6165
6166#endif /* NECP */
6167
6168#if CONTENT_FILTER
6169 case SO_CFIL_SOCK_ID: {
6170 cfil_sock_id_t sock_id;
6171
6172 sock_id = cfil_sock_id_from_socket(so);
6173
6174 error = sooptcopyout(sopt, buf: &sock_id,
6175 len: sizeof(cfil_sock_id_t));
6176 break;
6177 }
6178#endif /* CONTENT_FILTER */
6179
6180 case SO_EXTENDED_BK_IDLE:
6181 optval = (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED);
6182 goto integer;
6183 case SO_MARK_CELLFALLBACK:
6184 optval = ((so->so_flags1 & SOF1_CELLFALLBACK) > 0)
6185 ? 1 : 0;
6186 goto integer;
6187 case SO_FALLBACK_MODE:
6188 optval = so->so_fallback_mode;
6189 goto integer;
6190 case SO_MARK_KNOWN_TRACKER: {
6191 optval = ((so->so_flags1 & SOF1_KNOWN_TRACKER) > 0)
6192 ? 1 : 0;
6193 goto integer;
6194 }
6195 case SO_MARK_KNOWN_TRACKER_NON_APP_INITIATED: {
6196 optval = ((so->so_flags1 & SOF1_TRACKER_NON_APP_INITIATED) > 0)
6197 ? 1 : 0;
6198 goto integer;
6199 }
6200 case SO_MARK_APPROVED_APP_DOMAIN: {
6201 optval = ((so->so_flags1 & SOF1_APPROVED_APP_DOMAIN) > 0)
6202 ? 1 : 0;
6203 goto integer;
6204 }
6205 case SO_NET_SERVICE_TYPE: {
6206 if ((so->so_flags1 & SOF1_TC_NET_SERV_TYPE)) {
6207 optval = so->so_netsvctype;
6208 } else {
6209 optval = NET_SERVICE_TYPE_BE;
6210 }
6211 goto integer;
6212 }
6213 case SO_NETSVC_MARKING_LEVEL:
6214 optval = so_get_netsvc_marking_level(so);
6215 goto integer;
6216
6217 case SO_MPKL_SEND_INFO: {
6218 struct so_mpkl_send_info so_mpkl_send_info;
6219
6220 uuid_copy(dst: so_mpkl_send_info.mpkl_uuid, src: so->so_mpkl_send_uuid);
6221 so_mpkl_send_info.mpkl_proto = so->so_mpkl_send_proto;
6222 error = sooptcopyout(sopt, buf: &so_mpkl_send_info,
6223 len: sizeof(struct so_mpkl_send_info));
6224 break;
6225 }
6226 case SO_MARK_WAKE_PKT:
6227 optval = (so->so_flags & SOF_MARK_WAKE_PKT);
6228 goto integer;
6229 case SO_RECV_WAKE_PKT:
6230 optval = (so->so_flags & SOF_RECV_WAKE_PKT);
6231 goto integer;
6232 case SO_APPLICATION_ID: {
6233 if (SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) {
6234 error = EINVAL;
6235 goto out;
6236 }
6237 so_application_id_t application_id = { 0 };
6238 application_id.uid = kauth_cred_getuid(cred: so->so_cred);
6239 uuid_copy(dst: application_id.effective_uuid, src: !uuid_is_null(uu: so->e_uuid) ? so->e_uuid : so->last_uuid);
6240 application_id.persona_id = so->so_persona_id;
6241 error = sooptcopyout(sopt, buf: &application_id, len: sizeof(so_application_id_t));
6242 break;
6243 }
6244 default:
6245 error = ENOPROTOOPT;
6246 break;
6247 }
6248 }
6249out:
6250 if (dolock) {
6251 socket_unlock(so, refcount: 1);
6252 }
6253 return error;
6254}
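
/*
 * Illustrative user-space sketch, not part of the kernel: the socket-level
 * options handled above are reached through getsockopt(2). For example, a
 * hypothetical caller could query the amount of readable data with SO_NREAD:
 *
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	static void
 *	print_nread(int sock)
 *	{
 *		int nread = 0;
 *		socklen_t len = sizeof(nread);
 *
 *		// SOL_SOCKET/SO_NREAD is handled by sogetoptlock() above.
 *		if (getsockopt(sock, SOL_SOCKET, SO_NREAD, &nread, &len) == 0)
 *			printf("%d bytes available\n", nread);
 *	}
 */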
6255
6256/*
6257 * The size limit on our soopt_getm differs from FreeBSD's. We limit
6258 * the size of options to MCLBYTES. This will have to change if we ever
6259 * need to define options that require more space than MCLBYTES.
6260 */
6261int
6262soopt_getm(struct sockopt *sopt, struct mbuf **mp)
6263{
6264 struct mbuf *m, *m_prev;
6265 int sopt_size = (int)sopt->sopt_valsize;
6266 int how;
6267
6268 if (sopt_size <= 0 || sopt_size > MCLBYTES) {
6269 return EMSGSIZE;
6270 }
6271
6272 how = sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT;
6273 MGET(m, how, MT_DATA);
6274 if (m == NULL) {
6275 return ENOBUFS;
6276 }
6277 if (sopt_size > MLEN) {
6278 MCLGET(m, how);
6279 if ((m->m_flags & M_EXT) == 0) {
6280 m_free(m);
6281 return ENOBUFS;
6282 }
6283 m->m_len = min(MCLBYTES, b: sopt_size);
6284 } else {
6285 m->m_len = min(MLEN, b: sopt_size);
6286 }
6287 sopt_size -= m->m_len;
6288 *mp = m;
6289 m_prev = m;
6290
6291 while (sopt_size > 0) {
6292 MGET(m, how, MT_DATA);
6293 if (m == NULL) {
6294 m_freem(*mp);
6295 return ENOBUFS;
6296 }
6297 if (sopt_size > MLEN) {
6298 MCLGET(m, how);
6299 if ((m->m_flags & M_EXT) == 0) {
6300 m_freem(*mp);
6301 m_freem(m);
6302 return ENOBUFS;
6303 }
6304 m->m_len = min(MCLBYTES, b: sopt_size);
6305 } else {
6306 m->m_len = min(MLEN, b: sopt_size);
6307 }
6308 sopt_size -= m->m_len;
6309 m_prev->m_next = m;
6310 m_prev = m;
6311 }
6312 return 0;
6313}
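
/*
 * Worked example (illustrative, assuming MCLBYTES is 2048): a 1500-byte
 * option is larger than MLEN, so the first MGET()/MCLGET() pair above yields
 * a single cluster mbuf with m_len = min(2048, 1500) = 1500 and nothing is
 * left over for the loop; anything larger than MCLBYTES was already rejected
 * with EMSGSIZE before any allocation happened.
 */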
6314
6315/* copyin sopt data into mbuf chain */
6316int
6317soopt_mcopyin(struct sockopt *sopt, struct mbuf *m)
6318{
6319 struct mbuf *m0 = m;
6320
6321 if (sopt->sopt_val == USER_ADDR_NULL) {
6322 return 0;
6323 }
6324 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6325 if (sopt->sopt_p != kernproc) {
6326 int error;
6327
6328 error = copyin(sopt->sopt_val, mtod(m, char *),
6329 m->m_len);
6330 if (error != 0) {
6331 m_freem(m0);
6332 return error;
6333 }
6334 } else {
6335 bcopy(CAST_DOWN(caddr_t, sopt->sopt_val),
6336 mtod(m, char *), n: m->m_len);
6337 }
6338 sopt->sopt_valsize -= m->m_len;
6339 sopt->sopt_val += m->m_len;
6340 m = m->m_next;
6341 }
6342	/* enough space should have been allocated by ip6_sooptmcopyin() */
6343 if (m != NULL) {
6344 panic("soopt_mcopyin");
6345 /* NOTREACHED */
6346 }
6347 return 0;
6348}
6349
6350/* copyout mbuf chain data into soopt */
6351int
6352soopt_mcopyout(struct sockopt *sopt, struct mbuf *m)
6353{
6354 struct mbuf *m0 = m;
6355 size_t valsize = 0;
6356
6357 if (sopt->sopt_val == USER_ADDR_NULL) {
6358 return 0;
6359 }
6360 while (m != NULL && sopt->sopt_valsize >= m->m_len) {
6361 if (sopt->sopt_p != kernproc) {
6362 int error;
6363
6364 error = copyout(mtod(m, char *), sopt->sopt_val,
6365 m->m_len);
6366 if (error != 0) {
6367 m_freem(m0);
6368 return error;
6369 }
6370 } else {
6371 bcopy(mtod(m, char *),
6372 CAST_DOWN(caddr_t, sopt->sopt_val), n: m->m_len);
6373 }
6374 sopt->sopt_valsize -= m->m_len;
6375 sopt->sopt_val += m->m_len;
6376 valsize += m->m_len;
6377 m = m->m_next;
6378 }
6379 if (m != NULL) {
6380		/* user land should have supplied a large enough sockopt buffer */
6381 m_freem(m0);
6382 return EINVAL;
6383 }
6384 sopt->sopt_valsize = valsize;
6385 return 0;
6386}
6387
6388void
6389sohasoutofband(struct socket *so)
6390{
6391 if (so->so_pgid < 0) {
6392 gsignal(pgid: -so->so_pgid, SIGURG);
6393 } else if (so->so_pgid > 0) {
6394 proc_signal(pid: so->so_pgid, SIGURG);
6395 }
6396 selwakeup(&so->so_rcv.sb_sel);
6397 if (so->so_rcv.sb_flags & SB_KNOTE) {
6398 KNOTE(&so->so_rcv.sb_sel.si_note,
6399 (NOTE_OOB | SO_FILT_HINT_LOCKED));
6400 }
6401}
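
/*
 * Illustrative user-space sketch, not part of the kernel: so_pgid is what a
 * process establishes with fcntl(F_SETOWN); once set, out-of-band data
 * arriving on the socket is reported via SIGURG by the code above
 * (handle_urgent is a hypothetical handler).
 *
 *	#include <fcntl.h>
 *	#include <signal.h>
 *	#include <unistd.h>
 *
 *	signal(SIGURG, handle_urgent);           // install a SIGURG handler
 *	(void) fcntl(sock, F_SETOWN, getpid());  // deliver SIGURG to this pid
 */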
6402
6403int
6404sopoll(struct socket *so, int events, kauth_cred_t cred, void * wql)
6405{
6406#pragma unused(cred)
6407 struct proc *p = current_proc();
6408 int revents = 0;
6409
6410 socket_lock(so, refcount: 1);
6411 so_update_last_owner_locked(so, PROC_NULL);
6412 so_update_policy(so);
6413
6414 if (events & (POLLIN | POLLRDNORM)) {
6415 if (soreadable(so)) {
6416 revents |= events & (POLLIN | POLLRDNORM);
6417 }
6418 }
6419
6420 if (events & (POLLOUT | POLLWRNORM)) {
6421 if (sowriteable(so)) {
6422 revents |= events & (POLLOUT | POLLWRNORM);
6423 }
6424 }
6425
6426 if (events & (POLLPRI | POLLRDBAND)) {
6427 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6428 revents |= events & (POLLPRI | POLLRDBAND);
6429 }
6430 }
6431
6432 if (revents == 0) {
6433 if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) {
6434 /*
6435 * Darwin sets the flag first,
6436 * BSD calls selrecord first
6437 */
6438 so->so_rcv.sb_flags |= SB_SEL;
6439 selrecord(selector: p, &so->so_rcv.sb_sel, wql);
6440 }
6441
6442 if (events & (POLLOUT | POLLWRNORM)) {
6443 /*
6444 * Darwin sets the flag first,
6445 * BSD calls selrecord first
6446 */
6447 so->so_snd.sb_flags |= SB_SEL;
6448 selrecord(selector: p, &so->so_snd.sb_sel, wql);
6449 }
6450 }
6451
6452 socket_unlock(so, refcount: 1);
6453 return revents;
6454}
6455
6456int
6457soo_kqfilter(struct fileproc *fp, struct knote *kn, struct kevent_qos_s *kev)
6458{
6459 struct socket *so = (struct socket *)fp_get_data(fp);
6460 int result;
6461
6462 socket_lock(so, refcount: 1);
6463 so_update_last_owner_locked(so, PROC_NULL);
6464 so_update_policy(so);
6465
6466 switch (kn->kn_filter) {
6467 case EVFILT_READ:
6468 kn->kn_filtid = EVFILTID_SOREAD;
6469 break;
6470 case EVFILT_WRITE:
6471 kn->kn_filtid = EVFILTID_SOWRITE;
6472 break;
6473 case EVFILT_SOCK:
6474 kn->kn_filtid = EVFILTID_SCK;
6475 break;
6476 case EVFILT_EXCEPT:
6477 kn->kn_filtid = EVFILTID_SOEXCEPT;
6478 break;
6479 default:
6480 socket_unlock(so, refcount: 1);
6481 knote_set_error(kn, EINVAL);
6482 return 0;
6483 }
6484
6485 /*
6486 * call the appropriate sub-filter attach
6487 * with the socket still locked
6488 */
6489 result = knote_fops(kn)->f_attach(kn, kev);
6490
6491 socket_unlock(so, refcount: 1);
6492
6493 return result;
6494}
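
/*
 * Illustrative user-space sketch, not part of the kernel: registering a
 * kevent on a socket descriptor is what eventually reaches soo_kqfilter()
 * above, which picks the read/write/sock/except sub-filter.
 *
 *	#include <sys/event.h>
 *
 *	static int
 *	watch_readable(int kq, int sock)
 *	{
 *		struct kevent kev;
 *
 *		// EVFILT_READ maps to EVFILTID_SOREAD in soo_kqfilter().
 *		EV_SET(&kev, sock, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
 *		return kevent(kq, &kev, 1, NULL, 0, NULL);
 *	}
 */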
6495
6496static int
6497filt_soread_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6498{
6499 int retval = 0;
6500 int64_t data = 0;
6501
6502 if (so->so_options & SO_ACCEPTCONN) {
6503 /*
6504		 * Radar 6615193: handle the listen case dynamically for the
6505		 * kqueue read filter. This allows listen() to be called after
6506		 * the kqueue EVFILT_READ knote has been registered.
6507 */
6508
6509 retval = !TAILQ_EMPTY(&so->so_comp);
6510 data = so->so_qlen;
6511 goto out;
6512 }
6513
6514 /* socket isn't a listener */
6515 /*
6516	 * NOTE_LOWAT specifies a new low water mark in data, i.e.
6517 * the bytes of protocol data. We therefore exclude any
6518 * control bytes.
6519 */
6520 data = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
6521
6522 if (kn->kn_sfflags & NOTE_OOB) {
6523 if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) {
6524 kn->kn_fflags |= NOTE_OOB;
6525 data -= so->so_oobmark;
6526 retval = 1;
6527 goto out;
6528 }
6529 }
6530
6531 if ((so->so_state & SS_CANTRCVMORE)
6532#if CONTENT_FILTER
6533 && cfil_sock_data_pending(sb: &so->so_rcv) == 0
6534#endif /* CONTENT_FILTER */
6535 ) {
6536 kn->kn_flags |= EV_EOF;
6537 kn->kn_fflags = so->so_error;
6538 retval = 1;
6539 goto out;
6540 }
6541
6542 if (so->so_error) { /* temporary udp error */
6543 retval = 1;
6544 goto out;
6545 }
6546
6547 int64_t lowwat = so->so_rcv.sb_lowat;
6548 /*
6549 * Ensure that when NOTE_LOWAT is used, the derived
6550	 * low water mark is bounded by the socket's receive buffer's
6551	 * high and low water mark values.
6552 */
6553 if (kn->kn_sfflags & NOTE_LOWAT) {
6554 if (kn->kn_sdata > so->so_rcv.sb_hiwat) {
6555 lowwat = so->so_rcv.sb_hiwat;
6556 } else if (kn->kn_sdata > lowwat) {
6557 lowwat = kn->kn_sdata;
6558 }
6559 }
6560
6561 /*
6562	 * While the `data` field is the amount of data available to read,
6563	 * 0-sized packets still need to wake up the kqueue (see 58140856),
6564	 * so we take control bytes into account here too.
6565 */
6566 retval = (so->so_rcv.sb_cc >= lowwat);
6567
6568out:
6569 if (retval && kev) {
6570 knote_fill_kevent(kn, kev, data);
6571 }
6572 return retval;
6573}
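
/*
 * Illustrative sketch, not part of the kernel (sock and kq are assumed to be
 * an already-created socket and kqueue): a caller can raise the read filter's
 * effective low water mark with NOTE_LOWAT; the clamping against the receive
 * buffer's high and low water marks happens in filt_soread_common() above.
 *
 *	struct kevent kev;
 *
 *	// Fire only once at least 4096 bytes are available, subject to
 *	// clamping against so_rcv.sb_hiwat.
 *	EV_SET(&kev, sock, EVFILT_READ, EV_ADD, NOTE_LOWAT, 4096, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */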
6574
6575static int
6576filt_sorattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6577{
6578 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6579
6580 /* socket locked */
6581
6582 /*
6583	 * If the caller explicitly asked for OOB results (e.g. poll())
6584	 * from EVFILT_READ, then save that off in the kn_hook32 field
6585	 * and reserve the kn_flags EV_OOBAND bit for output only.
6586 */
6587 if (kn->kn_filter == EVFILT_READ &&
6588 kn->kn_flags & EV_OOBAND) {
6589 kn->kn_flags &= ~EV_OOBAND;
6590 kn->kn_hook32 = EV_OOBAND;
6591 } else {
6592 kn->kn_hook32 = 0;
6593 }
6594 if (KNOTE_ATTACH(&so->so_rcv.sb_sel.si_note, kn)) {
6595 so->so_rcv.sb_flags |= SB_KNOTE;
6596 }
6597
6598	/* indicate whether the event has already fired */
6599 return filt_soread_common(kn, NULL, so);
6600}
6601
6602static void
6603filt_sordetach(struct knote *kn)
6604{
6605 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6606
6607 socket_lock(so, refcount: 1);
6608 if (so->so_rcv.sb_flags & SB_KNOTE) {
6609 if (KNOTE_DETACH(&so->so_rcv.sb_sel.si_note, kn)) {
6610 so->so_rcv.sb_flags &= ~SB_KNOTE;
6611 }
6612 }
6613 socket_unlock(so, refcount: 1);
6614}
6615
6616/*ARGSUSED*/
6617static int
6618filt_soread(struct knote *kn, long hint)
6619{
6620 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6621 int retval;
6622
6623 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6624 socket_lock(so, refcount: 1);
6625 }
6626
6627 retval = filt_soread_common(kn, NULL, so);
6628
6629 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6630 socket_unlock(so, refcount: 1);
6631 }
6632
6633 return retval;
6634}
6635
6636static int
6637filt_sortouch(struct knote *kn, struct kevent_qos_s *kev)
6638{
6639 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6640 int retval;
6641
6642 socket_lock(so, refcount: 1);
6643
6644 /* save off the new input fflags and data */
6645 kn->kn_sfflags = kev->fflags;
6646 kn->kn_sdata = kev->data;
6647
6648 /* determine if changes result in fired events */
6649 retval = filt_soread_common(kn, NULL, so);
6650
6651 socket_unlock(so, refcount: 1);
6652
6653 return retval;
6654}
6655
6656static int
6657filt_sorprocess(struct knote *kn, struct kevent_qos_s *kev)
6658{
6659 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6660 int retval;
6661
6662 socket_lock(so, refcount: 1);
6663 retval = filt_soread_common(kn, kev, so);
6664 socket_unlock(so, refcount: 1);
6665
6666 return retval;
6667}
6668
6669int
6670so_wait_for_if_feedback(struct socket *so)
6671{
6672 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
6673 (so->so_state & SS_ISCONNECTED)) {
6674 struct inpcb *inp = sotoinpcb(so);
6675 if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
6676 return 1;
6677 }
6678 }
6679 return 0;
6680}
6681
6682static int
6683filt_sowrite_common(struct knote *kn, struct kevent_qos_s *kev, struct socket *so)
6684{
6685 int ret = 0;
6686 int64_t data = sbspace(sb: &so->so_snd);
6687
6688 if (so->so_state & SS_CANTSENDMORE) {
6689 kn->kn_flags |= EV_EOF;
6690 kn->kn_fflags = so->so_error;
6691 ret = 1;
6692 goto out;
6693 }
6694
6695 if (so->so_error) { /* temporary udp error */
6696 ret = 1;
6697 goto out;
6698 }
6699
6700 if (!socanwrite(so)) {
6701 ret = 0;
6702 goto out;
6703 }
6704
6705 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
6706 ret = 1;
6707 goto out;
6708 }
6709
6710 int64_t lowwat = so->so_snd.sb_lowat;
6711 const int64_t hiwat = so->so_snd.sb_hiwat;
6712 /*
6713 * Deal with connected UNIX domain sockets which
6714 * rely on the fact that the sender's socket buffer is
6715 * actually the receiver's socket buffer.
6716 */
6717 if (SOCK_DOM(so) == PF_LOCAL) {
6718 struct unpcb *unp = sotounpcb(so);
6719 if (unp != NULL && unp->unp_conn != NULL &&
6720 unp->unp_conn->unp_socket != NULL) {
6721 struct socket *so2 = unp->unp_conn->unp_socket;
6722 /*
6723 * At this point we know that `so' is locked
6724 * and that `unp_conn` isn't going to change.
6725 * However, we don't lock `so2` because doing so
6726 * may require unlocking `so'
6727 * (see unp_get_locks_in_order()).
6728 *
6729 * Two cases can happen:
6730 *
6731 * 1) we return 1 and tell the application that
6732 * it can write. Meanwhile, another thread
6733 * fills up the socket buffer. This will either
6734 * lead to a blocking send or EWOULDBLOCK
6735 * which the application should deal with.
6736 * 2) we return 0 and tell the application that
6737 * the socket is not writable. Meanwhile,
6738 * another thread depletes the receive socket
6739 * buffer. In this case the application will
6740 * be woken up by sb_notify().
6741 *
6742 * MIN() is required because otherwise sosendcheck()
6743 * may return EWOULDBLOCK since it only considers
6744 * so->so_snd.
6745 */
6746 data = MIN(data, sbspace(&so2->so_rcv));
6747 }
6748 }
6749
6750 if (kn->kn_sfflags & NOTE_LOWAT) {
6751 if (kn->kn_sdata > hiwat) {
6752 lowwat = hiwat;
6753 } else if (kn->kn_sdata > lowwat) {
6754 lowwat = kn->kn_sdata;
6755 }
6756 }
6757
6758 if (data > 0 && data >= lowwat) {
6759 if ((so->so_flags & SOF_NOTSENT_LOWAT)
6760#if (DEBUG || DEVELOPMENT)
6761 && so_notsent_lowat_check == 1
6762#endif /* DEBUG || DEVELOPMENT */
6763 ) {
6764 if ((SOCK_DOM(so) == PF_INET ||
6765 SOCK_DOM(so) == PF_INET6) &&
6766 so->so_type == SOCK_STREAM) {
6767 ret = tcp_notsent_lowat_check(so);
6768 }
6769#if MPTCP
6770 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
6771 (SOCK_PROTO(so) == IPPROTO_TCP)) {
6772 ret = mptcp_notsent_lowat_check(so);
6773 }
6774#endif
6775 else {
6776 ret = 1;
6777 goto out;
6778 }
6779 } else {
6780 ret = 1;
6781 }
6782 }
6783 if (so_wait_for_if_feedback(so)) {
6784 ret = 0;
6785 }
6786
6787out:
6788 if (ret && kev) {
6789 knote_fill_kevent(kn, kev, data);
6790 }
6791 return ret;
6792}
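
/*
 * Illustrative user-space sketch, not part of the kernel (assumes
 * TCP_NOTSENT_LOWAT is available and sock is a connected TCP socket): the
 * SOF_NOTSENT_LOWAT branch above is taken for TCP sockets that enabled a
 * not-sent low water mark, in which case EVFILT_WRITE fires only once the
 * amount of unsent data drops low enough, as decided by
 * tcp_notsent_lowat_check().
 *
 *	#include <sys/socket.h>
 *	#include <netinet/tcp.h>
 *
 *	int lowat = 16 * 1024;
 *
 *	(void) setsockopt(sock, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
 *	    &lowat, sizeof(lowat));
 */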
6793
6794static int
6795filt_sowattach(struct knote *kn, __unused struct kevent_qos_s *kev)
6796{
6797 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6798
6799 /* socket locked */
6800 if (KNOTE_ATTACH(&so->so_snd.sb_sel.si_note, kn)) {
6801 so->so_snd.sb_flags |= SB_KNOTE;
6802 }
6803
6804	/* determine if it's already fired */
6805 return filt_sowrite_common(kn, NULL, so);
6806}
6807
6808static void
6809filt_sowdetach(struct knote *kn)
6810{
6811 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6812 socket_lock(so, refcount: 1);
6813
6814 if (so->so_snd.sb_flags & SB_KNOTE) {
6815 if (KNOTE_DETACH(&so->so_snd.sb_sel.si_note, kn)) {
6816 so->so_snd.sb_flags &= ~SB_KNOTE;
6817 }
6818 }
6819 socket_unlock(so, refcount: 1);
6820}
6821
6822/*ARGSUSED*/
6823static int
6824filt_sowrite(struct knote *kn, long hint)
6825{
6826 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6827 int ret;
6828
6829 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6830 socket_lock(so, refcount: 1);
6831 }
6832
6833 ret = filt_sowrite_common(kn, NULL, so);
6834
6835 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
6836 socket_unlock(so, refcount: 1);
6837 }
6838
6839 return ret;
6840}
6841
6842static int
6843filt_sowtouch(struct knote *kn, struct kevent_qos_s *kev)
6844{
6845 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6846 int ret;
6847
6848 socket_lock(so, refcount: 1);
6849
6850	/* save off the new input fflags and data */
6851 kn->kn_sfflags = kev->fflags;
6852 kn->kn_sdata = kev->data;
6853
6854 /* determine if these changes result in a triggered event */
6855 ret = filt_sowrite_common(kn, NULL, so);
6856
6857 socket_unlock(so, refcount: 1);
6858
6859 return ret;
6860}
6861
6862static int
6863filt_sowprocess(struct knote *kn, struct kevent_qos_s *kev)
6864{
6865 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
6866 int ret;
6867
6868 socket_lock(so, refcount: 1);
6869 ret = filt_sowrite_common(kn, kev, so);
6870 socket_unlock(so, refcount: 1);
6871
6872 return ret;
6873}
6874
6875static int
6876filt_sockev_common(struct knote *kn, struct kevent_qos_s *kev,
6877 struct socket *so, long ev_hint)
6878{
6879 int ret = 0;
6880 int64_t data = 0;
6881 uint32_t level_trigger = 0;
6882
6883 if (ev_hint & SO_FILT_HINT_CONNRESET) {
6884 kn->kn_fflags |= NOTE_CONNRESET;
6885 }
6886 if (ev_hint & SO_FILT_HINT_TIMEOUT) {
6887 kn->kn_fflags |= NOTE_TIMEOUT;
6888 }
6889 if (ev_hint & SO_FILT_HINT_NOSRCADDR) {
6890 kn->kn_fflags |= NOTE_NOSRCADDR;
6891 }
6892 if (ev_hint & SO_FILT_HINT_IFDENIED) {
6893 kn->kn_fflags |= NOTE_IFDENIED;
6894 }
6895 if (ev_hint & SO_FILT_HINT_KEEPALIVE) {
6896 kn->kn_fflags |= NOTE_KEEPALIVE;
6897 }
6898 if (ev_hint & SO_FILT_HINT_ADAPTIVE_WTIMO) {
6899 kn->kn_fflags |= NOTE_ADAPTIVE_WTIMO;
6900 }
6901 if (ev_hint & SO_FILT_HINT_ADAPTIVE_RTIMO) {
6902 kn->kn_fflags |= NOTE_ADAPTIVE_RTIMO;
6903 }
6904 if ((ev_hint & SO_FILT_HINT_CONNECTED) ||
6905 (so->so_state & SS_ISCONNECTED)) {
6906 kn->kn_fflags |= NOTE_CONNECTED;
6907 level_trigger |= NOTE_CONNECTED;
6908 }
6909 if ((ev_hint & SO_FILT_HINT_DISCONNECTED) ||
6910 (so->so_state & SS_ISDISCONNECTED)) {
6911 kn->kn_fflags |= NOTE_DISCONNECTED;
6912 level_trigger |= NOTE_DISCONNECTED;
6913 }
6914 if (ev_hint & SO_FILT_HINT_CONNINFO_UPDATED) {
6915 if (so->so_proto != NULL &&
6916 (so->so_proto->pr_flags & PR_EVCONNINFO)) {
6917 kn->kn_fflags |= NOTE_CONNINFO_UPDATED;
6918 }
6919 }
6920 if ((ev_hint & SO_FILT_HINT_NOTIFY_ACK) ||
6921 tcp_notify_ack_active(so)) {
6922 kn->kn_fflags |= NOTE_NOTIFY_ACK;
6923 }
6924 if (ev_hint & SO_FILT_HINT_WAKE_PKT) {
6925 kn->kn_fflags |= NOTE_WAKE_PKT;
6926 }
6927
6928 if ((so->so_state & SS_CANTRCVMORE)
6929#if CONTENT_FILTER
6930 && cfil_sock_data_pending(sb: &so->so_rcv) == 0
6931#endif /* CONTENT_FILTER */
6932 ) {
6933 kn->kn_fflags |= NOTE_READCLOSED;
6934 level_trigger |= NOTE_READCLOSED;
6935 }
6936
6937 if (so->so_state & SS_CANTSENDMORE) {
6938 kn->kn_fflags |= NOTE_WRITECLOSED;
6939 level_trigger |= NOTE_WRITECLOSED;
6940 }
6941
6942 if ((ev_hint & SO_FILT_HINT_SUSPEND) ||
6943 (so->so_flags & SOF_SUSPENDED)) {
6944 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6945
6946 /* If resume event was delivered before, reset it */
6947 kn->kn_hook32 &= ~NOTE_RESUME;
6948
6949 kn->kn_fflags |= NOTE_SUSPEND;
6950 level_trigger |= NOTE_SUSPEND;
6951 }
6952
6953 if ((ev_hint & SO_FILT_HINT_RESUME) ||
6954 (so->so_flags & SOF_SUSPENDED) == 0) {
6955 kn->kn_fflags &= ~(NOTE_SUSPEND | NOTE_RESUME);
6956
6957 /* If suspend event was delivered before, reset it */
6958 kn->kn_hook32 &= ~NOTE_SUSPEND;
6959
6960 kn->kn_fflags |= NOTE_RESUME;
6961 level_trigger |= NOTE_RESUME;
6962 }
6963
6964 if (so->so_error != 0) {
6965 ret = 1;
6966 data = so->so_error;
6967 kn->kn_flags |= EV_EOF;
6968 } else {
6969 u_int32_t data32 = 0;
6970 get_sockev_state(so, &data32);
6971 data = data32;
6972 }
6973
6974 /* Reset any events that are not requested on this knote */
6975 kn->kn_fflags &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6976 level_trigger &= (kn->kn_sfflags & EVFILT_SOCK_ALL_MASK);
6977
6978	/* Find the level-triggered events that have already been delivered */
6979 level_trigger &= kn->kn_hook32;
6980 level_trigger &= EVFILT_SOCK_LEVEL_TRIGGER_MASK;
6981
6982	/* Do not deliver level-triggered events more than once */
6983 if ((kn->kn_fflags & ~level_trigger) != 0) {
6984 ret = 1;
6985 }
6986
6987 if (ret && kev) {
6988 /*
6989		 * Store the state of the events being delivered. This
6990		 * state can be used to deliver level-triggered events
6991		 * at least once while still avoiding waking up the
6992		 * application multiple times as long as the event is active.
6993 */
6994 if (kn->kn_fflags != 0) {
6995 kn->kn_hook32 |= (kn->kn_fflags &
6996 EVFILT_SOCK_LEVEL_TRIGGER_MASK);
6997 }
6998
6999 /*
7000		 * NOTE_RESUME and NOTE_SUSPEND are an exception: deliver
7001		 * only one of them, and remember which one was delivered
7002		 * most recently.
7003 */
7004 if (kn->kn_fflags & NOTE_SUSPEND) {
7005 kn->kn_hook32 &= ~NOTE_RESUME;
7006 }
7007 if (kn->kn_fflags & NOTE_RESUME) {
7008 kn->kn_hook32 &= ~NOTE_SUSPEND;
7009 }
7010
7011 knote_fill_kevent(kn, kev, data);
7012 }
7013 return ret;
7014}
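
/*
 * Illustrative user-space sketch, not part of the kernel (assumes a build
 * where EVFILT_SOCK and the socket NOTE_* constants are visible; they are
 * private on shipping SDKs): a caller interested in the connection-state
 * transitions accumulated above could register:
 *
 *	struct kevent kev;
 *
 *	EV_SET(&kev, sock, EVFILT_SOCK, EV_ADD | EV_CLEAR,
 *	    NOTE_CONNECTED | NOTE_DISCONNECTED | NOTE_READCLOSED, 0, NULL);
 *	(void) kevent(kq, &kev, 1, NULL, 0, NULL);
 */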
7015
7016static int
7017filt_sockattach(struct knote *kn, __unused struct kevent_qos_s *kev)
7018{
7019 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
7020
7021 /* socket locked */
7022 kn->kn_hook32 = 0;
7023 if (KNOTE_ATTACH(&so->so_klist, kn)) {
7024 so->so_flags |= SOF_KNOTE;
7025 }
7026
7027 /* determine if event already fired */
7028 return filt_sockev_common(kn, NULL, so, ev_hint: 0);
7029}
7030
7031static void
7032filt_sockdetach(struct knote *kn)
7033{
7034 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
7035 socket_lock(so, refcount: 1);
7036
7037 if ((so->so_flags & SOF_KNOTE) != 0) {
7038 if (KNOTE_DETACH(&so->so_klist, kn)) {
7039 so->so_flags &= ~SOF_KNOTE;
7040 }
7041 }
7042 socket_unlock(so, refcount: 1);
7043}
7044
7045static int
7046filt_sockev(struct knote *kn, long hint)
7047{
7048 int ret = 0, locked = 0;
7049 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
7050 long ev_hint = (hint & SO_FILT_HINT_EV);
7051
7052 if ((hint & SO_FILT_HINT_LOCKED) == 0) {
7053 socket_lock(so, refcount: 1);
7054 locked = 1;
7055 }
7056
7057 ret = filt_sockev_common(kn, NULL, so, ev_hint);
7058
7059 if (locked) {
7060 socket_unlock(so, refcount: 1);
7061 }
7062
7063 return ret;
7064}
7065
7066
7067
7068/*
7069 * filt_socktouch - update event state
7070 */
7071static int
7072filt_socktouch(
7073 struct knote *kn,
7074 struct kevent_qos_s *kev)
7075{
7076 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
7077 uint32_t changed_flags;
7078 int ret;
7079
7080 socket_lock(so, refcount: 1);
7081
7082	/* capture which flags differ between the old interest set and the delivered state */
7083 changed_flags = (kn->kn_sfflags ^ kn->kn_hook32);
7084
7085 /* save off the new input fflags and data */
7086 kn->kn_sfflags = kev->fflags;
7087 kn->kn_sdata = kev->data;
7088
7089 /* restrict the current results to the (smaller?) set of new interest */
7090 /*
7091 * For compatibility with previous implementations, we leave kn_fflags
7092 * as they were before.
7093 */
7094 //kn->kn_fflags &= kev->fflags;
7095
7096 /*
7097 * Since we keep track of events that are already
7098	 * delivered, if any of those events are no longer requested,
7099	 * the state related to them can be reset.
7100 */
7101 kn->kn_hook32 &= ~(changed_flags & EVFILT_SOCK_LEVEL_TRIGGER_MASK);
7102
7103 /* determine if we have events to deliver */
7104 ret = filt_sockev_common(kn, NULL, so, ev_hint: 0);
7105
7106 socket_unlock(so, refcount: 1);
7107
7108 return ret;
7109}
7110
7111/*
7112 * filt_sockprocess - query event fired state and return data
7113 */
7114static int
7115filt_sockprocess(struct knote *kn, struct kevent_qos_s *kev)
7116{
7117 struct socket *so = (struct socket *)fp_get_data(fp: kn->kn_fp);
7118 int ret = 0;
7119
7120 socket_lock(so, refcount: 1);
7121
7122 ret = filt_sockev_common(kn, kev, so, ev_hint: 0);
7123
7124 socket_unlock(so, refcount: 1);
7125
7126 return ret;
7127}
7128
7129void
7130get_sockev_state(struct socket *so, u_int32_t *statep)
7131{
7132 u_int32_t state = *(statep);
7133
7134 /*
7135	 * If the state variable was already set by a previous event,
7136	 * leave it as is.
7137 */
7138 if (state != 0) {
7139 return;
7140 }
7141
7142 if (so->so_state & SS_ISCONNECTED) {
7143 state |= SOCKEV_CONNECTED;
7144 } else {
7145 state &= ~(SOCKEV_CONNECTED);
7146 }
7147 state |= ((so->so_state & SS_ISDISCONNECTED) ? SOCKEV_DISCONNECTED : 0);
7148 *(statep) = state;
7149}
7150
7151#define SO_LOCK_HISTORY_STR_LEN \
7152 (2 * SO_LCKDBG_MAX * (2 + (2 * sizeof (void *)) + 1) + 1)
7153
7154__private_extern__ const char *
7155solockhistory_nr(struct socket *so)
7156{
7157 size_t n = 0;
7158 int i;
7159 static char lock_history_str[SO_LOCK_HISTORY_STR_LEN];
7160
7161 bzero(s: lock_history_str, n: sizeof(lock_history_str));
7162 for (i = SO_LCKDBG_MAX - 1; i >= 0; i--) {
7163 n += scnprintf(lock_history_str + n,
7164 SO_LOCK_HISTORY_STR_LEN - n, "%p:%p ",
7165 so->lock_lr[(so->next_lock_lr + i) % SO_LCKDBG_MAX],
7166 so->unlock_lr[(so->next_unlock_lr + i) % SO_LCKDBG_MAX]);
7167 }
7168 return lock_history_str;
7169}
7170
7171lck_mtx_t *
7172socket_getlock(struct socket *so, int flags)
7173{
7174 if (so->so_proto->pr_getlock != NULL) {
7175 return (*so->so_proto->pr_getlock)(so, flags);
7176 } else {
7177 return so->so_proto->pr_domain->dom_mtx;
7178 }
7179}
7180
7181void
7182socket_lock(struct socket *so, int refcount)
7183{
7184 void *lr_saved;
7185
7186 lr_saved = __builtin_return_address(0);
7187
7188 if (so->so_proto->pr_lock) {
7189 (*so->so_proto->pr_lock)(so, refcount, lr_saved);
7190 } else {
7191#ifdef MORE_LOCKING_DEBUG
7192 LCK_MTX_ASSERT(so->so_proto->pr_domain->dom_mtx,
7193 LCK_MTX_ASSERT_NOTOWNED);
7194#endif
7195 lck_mtx_lock(lck: so->so_proto->pr_domain->dom_mtx);
7196 if (refcount) {
7197 so->so_usecount++;
7198 }
7199 so->lock_lr[so->next_lock_lr] = lr_saved;
7200 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
7201 }
7202}
7203
7204void
7205socket_lock_assert_owned(struct socket *so)
7206{
7207 lck_mtx_t *mutex_held;
7208
7209 if (so->so_proto->pr_getlock != NULL) {
7210 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7211 } else {
7212 mutex_held = so->so_proto->pr_domain->dom_mtx;
7213 }
7214
7215 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7216}
7217
7218int
7219socket_try_lock(struct socket *so)
7220{
7221 lck_mtx_t *mtx;
7222
7223 if (so->so_proto->pr_getlock != NULL) {
7224 mtx = (*so->so_proto->pr_getlock)(so, 0);
7225 } else {
7226 mtx = so->so_proto->pr_domain->dom_mtx;
7227 }
7228
7229 return lck_mtx_try_lock(lck: mtx);
7230}
7231
7232void
7233socket_unlock(struct socket *so, int refcount)
7234{
7235 void *lr_saved;
7236 lck_mtx_t *mutex_held;
7237
7238 lr_saved = __builtin_return_address(0);
7239
7240 if (so == NULL || so->so_proto == NULL) {
7241 panic("%s: null so_proto so=%p", __func__, so);
7242 /* NOTREACHED */
7243 }
7244
7245 if (so->so_proto->pr_unlock) {
7246 (*so->so_proto->pr_unlock)(so, refcount, lr_saved);
7247 } else {
7248 mutex_held = so->so_proto->pr_domain->dom_mtx;
7249#ifdef MORE_LOCKING_DEBUG
7250 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7251#endif
7252 so->unlock_lr[so->next_unlock_lr] = lr_saved;
7253 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
7254
7255 if (refcount) {
7256 if (so->so_usecount <= 0) {
7257 panic("%s: bad refcount=%d so=%p (%d, %d, %d) "
7258 "lrh=%s", __func__, so->so_usecount, so,
7259 SOCK_DOM(so), so->so_type,
7260 SOCK_PROTO(so), solockhistory_nr(so));
7261 /* NOTREACHED */
7262 }
7263
7264 so->so_usecount--;
7265 if (so->so_usecount == 0) {
7266 sofreelastref(so, dealloc: 1);
7267 }
7268 }
7269 lck_mtx_unlock(lck: mutex_held);
7270 }
7271}
7272
7273/* Called with socket locked, will unlock socket */
7274void
7275sofree(struct socket *so)
7276{
7277 lck_mtx_t *mutex_held;
7278
7279 if (so->so_proto->pr_getlock != NULL) {
7280 mutex_held = (*so->so_proto->pr_getlock)(so, 0);
7281 } else {
7282 mutex_held = so->so_proto->pr_domain->dom_mtx;
7283 }
7284 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
7285
7286 sofreelastref(so, dealloc: 0);
7287}
7288
7289void
7290soreference(struct socket *so)
7291{
7292 socket_lock(so, refcount: 1); /* locks & take one reference on socket */
7293 socket_unlock(so, refcount: 0); /* unlock only */
7294}
7295
7296void
7297sodereference(struct socket *so)
7298{
7299 socket_lock(so, refcount: 0);
7300 socket_unlock(so, refcount: 1);
7301}
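
/*
 * Typical usage pattern within this file (illustrative): take the socket
 * lock and a use-count reference together, and drop them together, so that
 * so_usecount stays balanced:
 *
 *	socket_lock(so, 1);
 *	// ... operate on the socket ...
 *	socket_unlock(so, 1);
 */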
7302
7303/*
7304 * Set or clear SOF_MULTIPAGES on the socket to enable or disable the
7305 * possibility of using jumbo clusters. Caller must ensure to hold
7306 * possibility of using jumbo clusters. The caller must hold
7307 */
7308void
7309somultipages(struct socket *so, boolean_t set)
7310{
7311 if (set) {
7312 so->so_flags |= SOF_MULTIPAGES;
7313 } else {
7314 so->so_flags &= ~SOF_MULTIPAGES;
7315 }
7316}
7317
7318void
7319soif2kcl(struct socket *so, boolean_t set)
7320{
7321 if (set) {
7322 so->so_flags1 |= SOF1_IF_2KCL;
7323 } else {
7324 so->so_flags1 &= ~SOF1_IF_2KCL;
7325 }
7326}
7327
7328int
7329so_isdstlocal(struct socket *so)
7330{
7331 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7332
7333 if (SOCK_DOM(so) == PF_INET) {
7334 return inaddr_local(inp->inp_faddr);
7335 } else if (SOCK_DOM(so) == PF_INET6) {
7336 return in6addr_local(&inp->in6p_faddr);
7337 }
7338
7339 return 0;
7340}
7341
7342int
7343sosetdefunct(struct proc *p, struct socket *so, int level, boolean_t noforce)
7344{
7345 struct sockbuf *rcv, *snd;
7346 int err = 0, defunct;
7347
7348 rcv = &so->so_rcv;
7349 snd = &so->so_snd;
7350
7351 defunct = (so->so_flags & SOF_DEFUNCT);
7352 if (defunct) {
7353 if (!(snd->sb_flags & rcv->sb_flags & SB_DROP)) {
7354 panic("%s: SB_DROP not set", __func__);
7355 /* NOTREACHED */
7356 }
7357 goto done;
7358 }
7359
7360 if (so->so_flags & SOF_NODEFUNCT) {
7361 if (noforce) {
7362 err = EOPNOTSUPP;
7363 if (p != PROC_NULL) {
7364 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7365 "name %s level %d) so 0x%llu [%d,%d] "
7366 "is not eligible for defunct "
7367 "(%d)\n", __func__, proc_selfpid(),
7368 proc_best_name(current_proc()), proc_pid(p),
7369 proc_best_name(p), level,
7370 so->so_gencnt,
7371 SOCK_DOM(so), SOCK_TYPE(so), err);
7372 }
7373 return err;
7374 }
7375 so->so_flags &= ~SOF_NODEFUNCT;
7376 if (p != PROC_NULL) {
7377 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7378 "name %s level %d) so 0x%llu [%d,%d] "
7379 "defunct by force "
7380 "(%d)\n", __func__, proc_selfpid(),
7381 proc_best_name(current_proc()), proc_pid(p),
7382 proc_best_name(p), level,
7383 so->so_gencnt,
7384 SOCK_DOM(so), SOCK_TYPE(so), err);
7385 }
7386 } else if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7387 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7388 struct ifnet *ifp = inp->inp_last_outifp;
7389
7390 if (ifp && IFNET_IS_CELLULAR(ifp)) {
7391 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nocell);
7392 } else if (so->so_flags & SOF_DELEGATED) {
7393 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7394 } else if (soextbkidlestat.so_xbkidle_time == 0) {
7395 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_notime);
7396 } else if (noforce && p != PROC_NULL) {
7397 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_active);
7398
7399 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_INPROG;
7400 so->so_extended_bk_start = net_uptime();
7401 OSBitOrAtomic(P_LXBKIDLEINPROG, &p->p_ladvflag);
7402
7403 inpcb_timer_sched(inp->inp_pcbinfo, type: INPCB_TIMER_LAZY);
7404
7405 err = EOPNOTSUPP;
7406 SODEFUNCTLOG("%s[%d, %s]: (target pid %d "
7407 "name %s level %d) so 0x%llu [%d,%d] "
7408 "extend bk idle "
7409 "(%d)\n", __func__, proc_selfpid(),
7410 proc_best_name(current_proc()), proc_pid(p),
7411 proc_best_name(p), level,
7412 so->so_gencnt,
7413 SOCK_DOM(so), SOCK_TYPE(so), err);
7414 return err;
7415 } else {
7416 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_forced);
7417 }
7418 }
7419
7420 so->so_flags |= SOF_DEFUNCT;
7421
7422 /* Prevent further data from being appended to the socket buffers */
7423 snd->sb_flags |= SB_DROP;
7424 rcv->sb_flags |= SB_DROP;
7425
7426 /* Flush any existing data in the socket buffers */
7427 if (rcv->sb_cc != 0) {
7428 rcv->sb_flags &= ~SB_SEL;
7429 selthreadclear(&rcv->sb_sel);
7430 sbrelease(sb: rcv);
7431 }
7432 if (snd->sb_cc != 0) {
7433 snd->sb_flags &= ~SB_SEL;
7434 selthreadclear(&snd->sb_sel);
7435 sbrelease(sb: snd);
7436 }
7437
7438done:
7439 if (p != PROC_NULL) {
7440 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7441 "so 0x%llu [%d,%d] %s defunct%s\n", __func__,
7442 proc_selfpid(), proc_best_name(current_proc()),
7443 proc_pid(p), proc_best_name(p), level,
7444 so->so_gencnt, SOCK_DOM(so),
7445 SOCK_TYPE(so), defunct ? "is already" : "marked as",
7446 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7447 " extbkidle" : "");
7448 }
7449 return err;
7450}
7451
7452int
7453sodefunct(struct proc *p, struct socket *so, int level)
7454{
7455 struct sockbuf *rcv, *snd;
7456
7457 if (!(so->so_flags & SOF_DEFUNCT)) {
7458 panic("%s improperly called", __func__);
7459 /* NOTREACHED */
7460 }
7461 if (so->so_state & SS_DEFUNCT) {
7462 goto done;
7463 }
7464
7465 rcv = &so->so_rcv;
7466 snd = &so->so_snd;
7467
7468 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7469 char s[MAX_IPv6_STR_LEN];
7470 char d[MAX_IPv6_STR_LEN];
7471 struct inpcb *inp = sotoinpcb(so);
7472
7473 if (p != PROC_NULL) {
7474 SODEFUNCTLOG(
7475 "%s[%d, %s]: (target pid %d name %s level %d) "
7476 "so 0x%llu [%s %s:%d -> %s:%d] is now defunct "
7477 "[rcv_si 0x%x, snd_si 0x%x, rcv_fl 0x%x, "
7478 " snd_fl 0x%x]\n", __func__,
7479 proc_selfpid(), proc_best_name(current_proc()),
7480 proc_pid(p), proc_best_name(p), level,
7481 so->so_gencnt,
7482 (SOCK_TYPE(so) == SOCK_STREAM) ? "TCP" : "UDP",
7483 inet_ntop(SOCK_DOM(so), ((SOCK_DOM(so) == PF_INET) ?
7484 (void *)&inp->inp_laddr.s_addr :
7485 (void *)&inp->in6p_laddr),
7486 s, sizeof(s)), ntohs(inp->in6p_lport),
7487 inet_ntop(SOCK_DOM(so), (SOCK_DOM(so) == PF_INET) ?
7488 (void *)&inp->inp_faddr.s_addr :
7489 (void *)&inp->in6p_faddr,
7490 d, sizeof(d)), ntohs(inp->in6p_fport),
7491 (uint32_t)rcv->sb_sel.si_flags,
7492 (uint32_t)snd->sb_sel.si_flags,
7493 rcv->sb_flags, snd->sb_flags);
7494 }
7495 } else if (p != PROC_NULL) {
7496 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s level %d) "
7497 "so 0x%llu [%d,%d] is now defunct [rcv_si 0x%x, "
7498 "snd_si 0x%x, rcv_fl 0x%x, snd_fl 0x%x]\n", __func__,
7499 proc_selfpid(), proc_best_name(current_proc()),
7500 proc_pid(p), proc_best_name(p), level,
7501 so->so_gencnt,
7502 SOCK_DOM(so), SOCK_TYPE(so),
7503 (uint32_t)rcv->sb_sel.si_flags,
7504 (uint32_t)snd->sb_sel.si_flags, rcv->sb_flags,
7505 snd->sb_flags);
7506 }
7507
7508 /*
7509 * First tell the protocol the flow is defunct
7510 */
7511 (void) (*so->so_proto->pr_usrreqs->pru_defunct)(so);
7512
7513 /*
7514 * Unwedge threads blocked on sbwait() and sb_lock().
7515 */
7516 sbwakeup(sb: rcv);
7517 sbwakeup(sb: snd);
7518
7519 so->so_flags1 |= SOF1_DEFUNCTINPROG;
7520 if (rcv->sb_flags & SB_LOCK) {
7521 sbunlock(sb: rcv, TRUE); /* keep socket locked */
7522 }
7523 if (snd->sb_flags & SB_LOCK) {
7524 sbunlock(sb: snd, TRUE); /* keep socket locked */
7525 }
7526 /*
7527 * Flush the buffers and disconnect. We explicitly call shutdown
7528 * on both data directions to ensure that SS_CANT{RCV,SEND}MORE
7529 * states are set for the socket. This would also flush out data
7530 * hanging off the receive list of this socket.
7531 */
7532 (void) soshutdownlock_final(so, SHUT_RD);
7533 (void) soshutdownlock_final(so, SHUT_WR);
7534 (void) sodisconnectlocked(so);
7535
7536 /*
7537 * Explicitly handle connectionless-protocol disconnection
7538 * and release any remaining data in the socket buffers.
7539 */
7540 if (!(so->so_state & SS_ISDISCONNECTED)) {
7541 (void) soisdisconnected(so);
7542 }
7543
7544 if (so->so_error == 0) {
7545 so->so_error = EBADF;
7546 }
7547
7548 if (rcv->sb_cc != 0) {
7549 rcv->sb_flags &= ~SB_SEL;
7550 selthreadclear(&rcv->sb_sel);
7551 sbrelease(sb: rcv);
7552 }
7553 if (snd->sb_cc != 0) {
7554 snd->sb_flags &= ~SB_SEL;
7555 selthreadclear(&snd->sb_sel);
7556 sbrelease(sb: snd);
7557 }
7558 so->so_state |= SS_DEFUNCT;
7559 OSIncrementAtomicLong(address: (volatile long *)&sodefunct_calls);
7560
7561done:
7562 return 0;
7563}
7564
7565int
7566soresume(struct proc *p, struct socket *so, int locked)
7567{
7568 if (locked == 0) {
7569 socket_lock(so, refcount: 1);
7570 }
7571
7572 if (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG) {
7573 SODEFUNCTLOG("%s[%d, %s]: (target pid %d name %s) so 0x%llu "
7574 "[%d,%d] resumed from bk idle\n",
7575 __func__, proc_selfpid(), proc_best_name(current_proc()),
7576 proc_pid(p), proc_best_name(p),
7577 so->so_gencnt,
7578 SOCK_DOM(so), SOCK_TYPE(so));
7579
7580 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7581 so->so_extended_bk_start = 0;
7582 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7583
7584 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resumed);
7585 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7586 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7587 }
7588 if (locked == 0) {
7589 socket_unlock(so, refcount: 1);
7590 }
7591
7592 return 0;
7593}
7594
7595/*
7596 * This does not attempt to account for sockets that are delegated from
7597 * the current process.
7598 */
7599int
7600so_set_extended_bk_idle(struct socket *so, int optval)
7601{
7602 int error = 0;
7603
7604 if ((SOCK_DOM(so) != PF_INET && SOCK_DOM(so) != PF_INET6) ||
7605 SOCK_PROTO(so) != IPPROTO_TCP) {
7606 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_notsupp);
7607 error = EOPNOTSUPP;
7608 } else if (optval == 0) {
7609 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_WANTED;
7610
7611 soresume(p: current_proc(), so, locked: 1);
7612 } else {
7613 struct proc *p = current_proc();
7614 struct fileproc *fp;
7615 int count = 0;
7616
7617 /*
7618		 * Unlock the socket to avoid a lock ordering issue with
7619		 * the proc fd table lock.
7620 */
7621 socket_unlock(so, refcount: 0);
7622
7623 proc_fdlock(p);
7624 fdt_foreach(fp, p) {
7625 struct socket *so2;
7626
7627 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7628 continue;
7629 }
7630
7631 so2 = (struct socket *)fp_get_data(fp);
7632 if (so != so2 &&
7633 so2->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) {
7634 count++;
7635 }
7636 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7637 break;
7638 }
7639 }
7640 proc_fdunlock(p);
7641
7642 socket_lock(so, refcount: 0);
7643
7644 if (count >= soextbkidlestat.so_xbkidle_maxperproc) {
7645 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_toomany);
7646 error = EBUSY;
7647 } else if (so->so_flags & SOF_DELEGATED) {
7648 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_nodlgtd);
7649 error = EBUSY;
7650 } else {
7651 so->so_flags1 |= SOF1_EXTEND_BK_IDLE_WANTED;
7652 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_wantok);
7653 }
7654 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d] "
7655 "%s marked for extended bk idle\n",
7656 __func__, proc_selfpid(), proc_best_name(current_proc()),
7657 so->so_gencnt,
7658 SOCK_DOM(so), SOCK_TYPE(so),
7659 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ?
7660 "is" : "not");
7661 }
7662
7663 return error;
7664}
7665
7666static void
7667so_stop_extended_bk_idle(struct socket *so)
7668{
7669 so->so_flags1 &= ~SOF1_EXTEND_BK_IDLE_INPROG;
7670 so->so_extended_bk_start = 0;
7671
7672 OSDecrementAtomic(&soextbkidlestat.so_xbkidle_active);
7673 VERIFY(soextbkidlestat.so_xbkidle_active >= 0);
7674 /*
7675 * Force defunct
7676 */
7677 sosetdefunct(p: current_proc(), so,
7678 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL, FALSE);
7679 if (so->so_flags & SOF_DEFUNCT) {
7680 sodefunct(p: current_proc(), so,
7681 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_INTERNAL);
7682 }
7683}
7684
7685void
7686so_drain_extended_bk_idle(struct socket *so)
7687{
7688 if (so && (so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7689 /*
7690 * Only penalize sockets that have outstanding data
7691 */
7692 if (so->so_rcv.sb_cc || so->so_snd.sb_cc) {
7693 so_stop_extended_bk_idle(so);
7694
7695 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_drained);
7696 }
7697 }
7698}
7699
7700/*
7701 * The return value tells whether the socket is still in extended background idle
7702 */
7703int
7704so_check_extended_bk_idle_time(struct socket *so)
7705{
7706 int ret = 1;
7707
7708 if ((so->so_flags1 & SOF1_EXTEND_BK_IDLE_INPROG)) {
7709 SODEFUNCTLOG("%s[%d, %s]: so 0x%llu [%d,%d]\n",
7710 __func__, proc_selfpid(), proc_best_name(current_proc()),
7711 so->so_gencnt,
7712 SOCK_DOM(so), SOCK_TYPE(so));
7713 if (net_uptime() - so->so_extended_bk_start >
7714 soextbkidlestat.so_xbkidle_time) {
7715 so_stop_extended_bk_idle(so);
7716
7717 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_expired);
7718
7719 ret = 0;
7720 } else {
7721 struct inpcb *inp = (struct inpcb *)so->so_pcb;
7722
7723 inpcb_timer_sched(inp->inp_pcbinfo, type: INPCB_TIMER_LAZY);
7724 OSIncrementAtomic(&soextbkidlestat.so_xbkidle_resched);
7725 }
7726 }
7727
7728 return ret;
7729}
7730
7731void
7732resume_proc_sockets(proc_t p)
7733{
7734 if (p->p_ladvflag & P_LXBKIDLEINPROG) {
7735 struct fileproc *fp;
7736 struct socket *so;
7737
7738 proc_fdlock(p);
7739 fdt_foreach(fp, p) {
7740 if (FILEGLOB_DTYPE(fp->fp_glob) != DTYPE_SOCKET) {
7741 continue;
7742 }
7743
7744 so = (struct socket *)fp_get_data(fp);
7745 (void) soresume(p, so, locked: 0);
7746 }
7747 proc_fdunlock(p);
7748
7749 OSBitAndAtomic(~P_LXBKIDLEINPROG, &p->p_ladvflag);
7750 }
7751}
7752
7753__private_extern__ int
7754so_set_recv_anyif(struct socket *so, int optval)
7755{
7756 int ret = 0;
7757
7758 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7759 if (optval) {
7760 sotoinpcb(so)->inp_flags |= INP_RECV_ANYIF;
7761 } else {
7762 sotoinpcb(so)->inp_flags &= ~INP_RECV_ANYIF;
7763 }
7764#if SKYWALK
7765 inp_update_netns_flags(so);
7766#endif /* SKYWALK */
7767 }
7768
7769
7770 return ret;
7771}
7772
7773__private_extern__ int
7774so_get_recv_anyif(struct socket *so)
7775{
7776 int ret = 0;
7777
7778 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7779 ret = (sotoinpcb(so)->inp_flags & INP_RECV_ANYIF) ? 1 : 0;
7780 }
7781
7782 return ret;
7783}
7784
7785int
7786so_set_restrictions(struct socket *so, uint32_t vals)
7787{
7788 int nocell_old, nocell_new;
7789 int noexpensive_old, noexpensive_new;
7790 int noconstrained_old, noconstrained_new;
7791
7792 /*
7793 * Deny-type restrictions are trapdoors; once set they cannot be
7794 * unset for the lifetime of the socket. This allows them to be
7795 * issued by a framework on behalf of the application without
7796 * having to worry that they can be undone.
7797 *
7798	 * Note here that socket-level restrictions override any
7799	 * protocol-level restrictions. For instance, an SO_RESTRICT_DENY_CELLULAR
7800	 * restriction issued on the socket has a higher precedence
7801	 * than INP_NO_IFT_CELLULAR. The latter is affected by the UUID
7802 * policy PROC_UUID_NO_CELLULAR for unrestricted sockets only,
7803 * i.e. when SO_RESTRICT_DENY_CELLULAR has not been issued.
7804 */
7805 nocell_old = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7806 noexpensive_old = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7807 noconstrained_old = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7808 so->so_restrictions |= (vals & (SO_RESTRICT_DENY_IN |
7809 SO_RESTRICT_DENY_OUT | SO_RESTRICT_DENY_CELLULAR |
7810 SO_RESTRICT_DENY_EXPENSIVE | SO_RESTRICT_DENY_CONSTRAINED));
7811 nocell_new = (so->so_restrictions & SO_RESTRICT_DENY_CELLULAR);
7812 noexpensive_new = (so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE);
7813 noconstrained_new = (so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED);
7814
7815 /* we can only set, not clear restrictions */
7816 if ((nocell_new - nocell_old) == 0 &&
7817 (noexpensive_new - noexpensive_old) == 0 &&
7818 (noconstrained_new - noconstrained_old) == 0) {
7819 return 0;
7820 }
7821 if (SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) {
7822 if (nocell_new - nocell_old != 0) {
7823 /*
7824 * if deny cellular is now set, do what's needed
7825 * for INPCB
7826 */
7827 inp_set_nocellular(sotoinpcb(so));
7828 }
7829 if (noexpensive_new - noexpensive_old != 0) {
7830 inp_set_noexpensive(sotoinpcb(so));
7831 }
7832 if (noconstrained_new - noconstrained_old != 0) {
7833 inp_set_noconstrained(sotoinpcb(so));
7834 }
7835 }
7836
7837 if (SOCK_DOM(so) == PF_MULTIPATH) {
7838 mptcp_set_restrictions(mp_so: so);
7839 }
7840
7841 return 0;
7842}
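
/*
 * Illustrative sketch, not part of the kernel (SO_RESTRICTIONS and the
 * SO_RESTRICT_DENY_* constants are private options, so their availability
 * to the caller is an assumption): a framework acting on behalf of an
 * application could deny cellular usage for the socket's lifetime:
 *
 *	uint32_t restrictions = SO_RESTRICT_DENY_CELLULAR;
 *
 *	// Trapdoor: once set, this cannot be cleared for this socket.
 *	(void) setsockopt(sock, SOL_SOCKET, SO_RESTRICTIONS,
 *	    &restrictions, sizeof(restrictions));
 */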
7843
7844uint32_t
7845so_get_restrictions(struct socket *so)
7846{
7847 return so->so_restrictions & (SO_RESTRICT_DENY_IN |
7848 SO_RESTRICT_DENY_OUT |
7849 SO_RESTRICT_DENY_CELLULAR | SO_RESTRICT_DENY_EXPENSIVE);
7850}
7851
7852int
7853so_set_effective_pid(struct socket *so, int epid, struct proc *p, boolean_t check_cred)
7854{
7855 struct proc *ep = PROC_NULL;
7856 int error = 0;
7857
7858 /* pid 0 is reserved for kernel */
7859 if (epid == 0) {
7860 error = EINVAL;
7861 goto done;
7862 }
7863
7864 /*
7865 * If this is an in-kernel socket, prevent its delegate
7866 * association from changing unless the socket option is
7867 * coming from within the kernel itself.
7868 */
7869 if (so->last_pid == 0 && p != kernproc) {
7870 error = EACCES;
7871 goto done;
7872 }
7873
7874 /*
7875 * If this is issued by a process that's recorded as the
7876 * real owner of the socket, or if the pid is the same as
7877 * the process's own pid, then proceed. Otherwise ensure
7878 * that the issuing process has the necessary privileges.
7879 */
7880 if (check_cred && (epid != so->last_pid || epid != proc_pid(p))) {
7881 if ((error = priv_check_cred(cred: kauth_cred_get(),
7882 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, flags: 0))) {
7883 error = EACCES;
7884 goto done;
7885 }
7886 }
7887
7888 /* Find the process that corresponds to the effective pid */
7889 if ((ep = proc_find(pid: epid)) == PROC_NULL) {
7890 error = ESRCH;
7891 goto done;
7892 }
7893
7894 /*
7895 * If a process tries to delegate the socket to itself, then
7896 * there's really nothing to do; treat it as a way for the
7897 * delegate association to be cleared. Note that we check
7898 * the passed-in proc rather than calling proc_selfpid(),
7899 * as we need to check the process issuing the socket option
7900 * which could be kernproc. Given that we don't allow 0 for
7901 * effective pid, it means that a delegated in-kernel socket
7902 * stays delegated during its lifetime (which is probably OK.)
7903 */
7904 if (epid == proc_pid(p)) {
7905 so->so_flags &= ~SOF_DELEGATED;
7906 so->e_upid = 0;
7907 so->e_pid = 0;
7908 uuid_clear(uu: so->e_uuid);
7909 } else {
7910 so->so_flags |= SOF_DELEGATED;
7911 so->e_upid = proc_uniqueid(ep);
7912 so->e_pid = proc_pid(ep);
7913 proc_getexecutableuuid(ep, so->e_uuid, sizeof(so->e_uuid));
7914
7915#if defined(XNU_TARGET_OS_OSX)
7916 if (ep->p_responsible_pid != so->e_pid) {
7917 proc_t rp = proc_find(pid: ep->p_responsible_pid);
7918 if (rp != PROC_NULL) {
7919 proc_getexecutableuuid(rp, so->so_ruuid, sizeof(so->so_ruuid));
7920 so->so_rpid = ep->p_responsible_pid;
7921 proc_rele(p: rp);
7922 } else {
7923 uuid_clear(uu: so->so_ruuid);
7924 so->so_rpid = -1;
7925 }
7926 }
7927#endif
7928 }
7929 if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
7930 (*so->so_proto->pr_update_last_owner)(so, NULL, ep);
7931 }
7932done:
7933 if (error == 0 && net_io_policy_log) {
7934 uuid_string_t buf;
7935
7936 uuid_unparse(uu: so->e_uuid, out: buf);
7937 log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7938 "euuid %s%s\n", __func__, proc_name_address(p),
7939 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7940 SOCK_DOM(so), SOCK_TYPE(so),
7941 so->e_pid, proc_name_address(p: ep), buf,
7942 ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
7943 } else if (error != 0 && net_io_policy_log) {
7944 log(LOG_ERR, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d (%s) "
7945 "ERROR (%d)\n", __func__, proc_name_address(p),
7946 proc_pid(p), (uint64_t)DEBUG_KERNEL_ADDRPERM(so),
7947 SOCK_DOM(so), SOCK_TYPE(so),
7948 epid, (ep == PROC_NULL) ? "PROC_NULL" :
7949 proc_name_address(p: ep), error);
7950 }
7951
7952 /* Update this socket's policy upon success */
7953 if (error == 0) {
7954 so->so_policy_gencnt *= -1;
7955 so_update_policy(so);
7956#if NECP
7957 so_update_necp_policy(so, NULL, NULL);
7958#endif /* NECP */
7959 }
7960
7961 if (ep != PROC_NULL) {
7962 proc_rele(p: ep);
7963 }
7964
7965 return error;
7966}
7967
int
so_set_effective_uuid(struct socket *so, uuid_t euuid, struct proc *p, boolean_t check_cred)
{
	uuid_string_t buf;
	uuid_t uuid;
	int error = 0;

	/* UUID must not be all-zeroes (reserved for kernel) */
	if (uuid_is_null(euuid)) {
		error = EINVAL;
		goto done;
	}

	/*
	 * If this is an in-kernel socket, prevent its delegate
	 * association from changing unless the socket option is
	 * coming from within the kernel itself.
	 */
	if (so->last_pid == 0 && p != kernproc) {
		error = EACCES;
		goto done;
	}

	/* Get the UUID of the issuing process */
	proc_getexecutableuuid(p, uuid, sizeof(uuid));

	/*
	 * If this is issued by a process that's recorded as the
	 * real owner of the socket, or if the uuid is the same as
	 * the process's own uuid, then proceed.  Otherwise ensure
	 * that the issuing process has the necessary privileges.
	 */
	if (check_cred &&
	    (uuid_compare(euuid, so->last_uuid) != 0 ||
	    uuid_compare(euuid, uuid) != 0)) {
		if ((error = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0))) {
			error = EACCES;
			goto done;
		}
	}

	/*
	 * If a process tries to delegate the socket to itself, then
	 * there's really nothing to do; treat it as a way for the
	 * delegate association to be cleared.  Note that we check
	 * the uuid of the passed-in proc rather than that of the
	 * current process, as we need to check the process issuing
	 * the socket option which could be kernproc itself.  Given
	 * that we don't allow 0 for effective uuid, it means that
	 * a delegated in-kernel socket stays delegated during its
	 * lifetime (which is okay.)
	 */
	if (uuid_compare(euuid, uuid) == 0) {
		so->so_flags &= ~SOF_DELEGATED;
		so->e_upid = 0;
		so->e_pid = 0;
		uuid_clear(so->e_uuid);
	} else {
		so->so_flags |= SOF_DELEGATED;
		/*
		 * Unlike so_set_effective_pid(), we only have the UUID
		 * here and the process ID is not known.  Inherit the
		 * real {pid,upid} of the socket.
		 */
		so->e_upid = so->last_upid;
		so->e_pid = so->last_pid;
		uuid_copy(so->e_uuid, euuid);
	}
	/*
	 * The following will clear the effective process name as it's the same
	 * as the real process
	 */
	if (so->so_proto != NULL && so->so_proto->pr_update_last_owner != NULL) {
		(*so->so_proto->pr_update_last_owner)(so, NULL, NULL);
	}
done:
	if (error == 0 && net_io_policy_log) {
		uuid_unparse(so->e_uuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] epid %d "
		    "euuid %s%s\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), so->e_pid, buf,
		    ((so->so_flags & SOF_DELEGATED) ? " [delegated]" : ""));
	} else if (error != 0 && net_io_policy_log) {
		uuid_unparse(euuid, buf);
		log(LOG_DEBUG, "%s[%s,%d]: so 0x%llx [%d,%d] euuid %s "
		    "ERROR (%d)\n", __func__, proc_name_address(p), proc_pid(p),
		    (uint64_t)DEBUG_KERNEL_ADDRPERM(so), SOCK_DOM(so),
		    SOCK_TYPE(so), buf, error);
	}

	/* Update this socket's policy upon success */
	if (error == 0) {
		so->so_policy_gencnt *= -1;
		so_update_policy(so);
#if NECP
		so_update_necp_policy(so, NULL, NULL);
#endif /* NECP */
	}

	return error;
}

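/*
 * Post a network policy kernel event (KEV_NETPOLICY_SUBCLASS) carrying
 * the caller-supplied event data.
 */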
void
netpolicy_post_msg(uint32_t ev_code, struct netpolicy_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	/*
	 * A netpolicy event always starts with a netpolicy_event_data
	 * structure, but the caller can provide for a longer event
	 * structure to post, depending on the event code.
	 */
	VERIFY(ev_data != NULL && ev_datalen >= sizeof(*ev_data));

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_NETPOLICY_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

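/*
 * Post a socket-layer kernel event (KEV_SOCKET_SUBCLASS) carrying the
 * caller-supplied event data.
 */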
void
socket_post_kev_msg(uint32_t ev_code,
    struct kev_socket_event_data *ev_data,
    uint32_t ev_datalen)
{
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_SOCKET_SUBCLASS;
	ev_msg.event_code = ev_code;

	ev_msg.dv[0].data_ptr = ev_data;
	ev_msg.dv[0].data_length = ev_datalen;

	kev_post_msg(&ev_msg);
}

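/*
 * If the socket opted in via SOF1_WANT_KEV_SOCK_CLOSED, post a
 * KEV_SOCKET_CLOSED event containing the local and peer addresses
 * (each truncated to the space available in the event structure).
 */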
void
socket_post_kev_msg_closed(struct socket *so)
{
	struct kev_socket_closed ev = {};
	struct sockaddr *socksa = NULL, *peersa = NULL;
	int err;

	if ((so->so_flags1 & SOF1_WANT_KEV_SOCK_CLOSED) == 0) {
		return;
	}
	err = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &socksa);
	if (err == 0) {
		err = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so,
		    &peersa);
		if (err == 0) {
			memcpy(&ev.ev_data.kev_sockname, socksa,
			    min(socksa->sa_len,
			    sizeof(ev.ev_data.kev_sockname)));
			memcpy(&ev.ev_data.kev_peername, peersa,
			    min(peersa->sa_len,
			    sizeof(ev.ev_data.kev_peername)));
			socket_post_kev_msg(KEV_SOCKET_CLOSED,
			    &ev.ev_data, sizeof(ev));
		}
	}
	free_sockaddr(socksa);
	free_sockaddr(peersa);
}

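/*
 * Assertion-failure handler used by the VERIFY()/ASSERT() macros;
 * panics with the failing expression, file name, and line number.
 */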
__attribute__((noinline, cold, not_tail_called, noreturn))
__private_extern__ int
assfail(const char *a, const char *f, int l)
{
	panic("assertion failed: %s, file: %s, line: %d", a, f, l);
	/* NOTREACHED */
	__builtin_unreachable();
}