1/*
2 * Copyright (c) 1998-2015 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29/*
30 * Copyright (c) 1982, 1986, 1988, 1990, 1993
31 * The Regents of the University of California. All rights reserved.
32 *
33 * Redistribution and use in source and binary forms, with or without
34 * modification, are permitted provided that the following conditions
35 * are met:
36 * 1. Redistributions of source code must retain the above copyright
37 * notice, this list of conditions and the following disclaimer.
38 * 2. Redistributions in binary form must reproduce the above copyright
39 * notice, this list of conditions and the following disclaimer in the
40 * documentation and/or other materials provided with the distribution.
41 * 3. All advertising materials mentioning features or use of this software
42 * must display the following acknowledgement:
43 * This product includes software developed by the University of
44 * California, Berkeley and its contributors.
45 * 4. Neither the name of the University nor the names of its contributors
46 * may be used to endorse or promote products derived from this software
47 * without specific prior written permission.
48 *
49 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
50 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
51 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
52 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
53 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
54 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
55 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
56 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
57 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
58 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
59 * SUCH DAMAGE.
60 *
61 * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include <sys/param.h>
71#include <sys/systm.h>
72#include <sys/domain.h>
73#include <sys/kernel.h>
74#include <sys/proc_internal.h>
75#include <sys/kauth.h>
76#include <sys/malloc.h>
77#include <sys/mbuf.h>
78#include <sys/mcache.h>
79#include <sys/protosw.h>
80#include <sys/stat.h>
81#include <sys/socket.h>
82#include <sys/socketvar.h>
83#include <sys/signalvar.h>
84#include <sys/sysctl.h>
85#include <sys/syslog.h>
86#include <sys/ev.h>
87#include <kern/locks.h>
88#include <net/route.h>
89#include <net/content_filter.h>
90#include <netinet/in.h>
91#include <netinet/in_pcb.h>
92#include <netinet/tcp_var.h>
93#include <sys/kdebug.h>
94#include <libkern/OSAtomic.h>
95
96#if CONFIG_MACF
97#include <security/mac_framework.h>
98#endif
99
100#include <mach/vm_param.h>
101
102#if MPTCP
103#include <netinet/mptcp_var.h>
104#endif
105
106#define DBG_FNC_SBDROP NETDBG_CODE(DBG_NETSOCK, 4)
107#define DBG_FNC_SBAPPEND NETDBG_CODE(DBG_NETSOCK, 5)
108
109SYSCTL_DECL(_kern_ipc);
110
111__private_extern__ u_int32_t net_io_policy_throttle_best_effort = 0;
112SYSCTL_INT(_kern_ipc, OID_AUTO, throttle_best_effort,
113 CTLFLAG_RW | CTLFLAG_LOCKED, &net_io_policy_throttle_best_effort, 0, "");
114
115static inline void sbcompress(struct sockbuf *, struct mbuf *, struct mbuf *);
116static struct socket *sonewconn_internal(struct socket *, int);
117static int sbappendcontrol_internal(struct sockbuf *, struct mbuf *,
118 struct mbuf *);
119static void soevent_ifdenied(struct socket *);
120
121/*
122 * Primitive routines for operating on sockets and socket buffers
123 */
124static int soqlimitcompat = 1;
125static int soqlencomp = 0;
126
127/*
128 * Based on the number of mbuf clusters configured, high_sb_max and sb_max can
129 * get scaled up or down to suit that memory configuration. high_sb_max is a
130 * higher limit on sb_max that is checked when sb_max gets set through sysctl.
131 */
132
133u_int32_t sb_max = SB_MAX; /* XXX should be static */
134u_int32_t high_sb_max = SB_MAX;
135
136static u_int32_t sb_efficiency = 8; /* parameter for sbreserve() */
137int32_t total_sbmb_cnt __attribute__((aligned(8))) = 0;
138int32_t total_sbmb_cnt_floor __attribute__((aligned(8))) = 0;
139int32_t total_sbmb_cnt_peak __attribute__((aligned(8))) = 0;
140int64_t sbmb_limreached __attribute__((aligned(8))) = 0;
141
142u_int32_t net_io_policy_log = 0; /* log socket policy changes */
143#if CONFIG_PROC_UUID_POLICY
144u_int32_t net_io_policy_uuid = 1; /* enable UUID socket policy */
145#endif /* CONFIG_PROC_UUID_POLICY */
146
147/*
148 * Procedures to manipulate state flags of socket
149 * and do appropriate wakeups. Normal sequence from the
150 * active (originating) side is that soisconnecting() is
151 * called during processing of connect() call,
152 * resulting in an eventual call to soisconnected() if/when the
153 * connection is established. When the connection is torn down
154 * soisdisconnecting() is called during processing of disconnect() call,
155 * and soisdisconnected() is called when the connection to the peer
156 * is totally severed. The semantics of these routines are such that
157 * connectionless protocols can call soisconnected() and soisdisconnected()
158 * only, bypassing the in-progress calls when setting up a ``connection''
159 * takes no time.
160 *
161 * From the passive side, a socket is created with
162 * two queues of sockets: so_incomp for connections in progress
163 * and so_comp for connections already made and awaiting user acceptance.
164 * As a protocol is preparing incoming connections, it creates a socket
165 * structure queued on so_incomp by calling sonewconn(). When the connection
166 * is established, soisconnected() is called, and transfers the
167 * socket structure to so_comp, making it available to accept().
168 *
169 * If a socket is closed with sockets on either
170 * so_incomp or so_comp, these sockets are dropped.
171 *
172 * If higher level protocols are implemented in
173 * the kernel, the wakeups done here will sometimes
174 * cause software-interrupt process scheduling.
175 */
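/*
 * Illustrative sketch (not part of this file) of the calling convention
 * described above, as a connection-oriented protocol might follow it;
 * the example_* names are placeholders, not real XNU functions:
 *
 *	example_pru_connect(struct socket *so, ...)
 *	{
 *		soisconnecting(so);		// active open has started
 *		...queue the connection request to the peer...
 *	}
 *
 *	example_input_handshake_done(struct socket *so)
 *	{
 *		soisconnected(so);		// wakes the connect()er, or, for an
 *						// incoming connection, moves so from
 *						// so_incomp to so_comp on its listener
 *	}
 *
 *	example_pru_disconnect(struct socket *so)
 *	{
 *		soisdisconnecting(so);		// local close in progress
 *		...later, once the connection to the peer is fully severed...
 *		soisdisconnected(so);
 *	}
 */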
176void
177soisconnecting(struct socket *so)
178{
179 so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
180 so->so_state |= SS_ISCONNECTING;
181
182 sflt_notify(so, sock_evt_connecting, NULL);
183}
184
185void
186soisconnected(struct socket *so)
187{
188 so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
189 so->so_state |= SS_ISCONNECTED;
190
191 soreserve_preconnect(so, 0);
192
193 sflt_notify(so, sock_evt_connected, NULL);
194
195 if (so->so_head != NULL && (so->so_state & SS_INCOMP)) {
196 struct socket *head = so->so_head;
197 int locked = 0;
198
199 /*
200 * Enforce lock order when the protocol has per socket locks
201 */
202 if (head->so_proto->pr_getlock != NULL) {
203 socket_lock(head, 1);
204 so_acquire_accept_list(head, so);
205 locked = 1;
206 }
207 if (so->so_head == head && (so->so_state & SS_INCOMP)) {
208 so->so_state &= ~SS_INCOMP;
209 so->so_state |= SS_COMP;
210 TAILQ_REMOVE(&head->so_incomp, so, so_list);
211 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
212 head->so_incqlen--;
213
214 /*
215 * We have to release the accept list in
216 * case a socket callback calls sock_accept()
217 */
218 if (locked != 0) {
219 so_release_accept_list(head);
220 socket_unlock(so, 0);
221 }
222 postevent(head, 0, EV_RCONN);
223 sorwakeup(head);
224 wakeup_one((caddr_t)&head->so_timeo);
225
226 if (locked != 0) {
227 socket_unlock(head, 1);
228 socket_lock(so, 0);
229 }
230 } else if (locked != 0) {
231 so_release_accept_list(head);
232 socket_unlock(head, 1);
233 }
234 } else {
235 postevent(so, 0, EV_WCONN);
236 wakeup((caddr_t)&so->so_timeo);
237 sorwakeup(so);
238 sowwakeup(so);
239 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CONNECTED |
240 SO_FILT_HINT_CONNINFO_UPDATED);
241 }
242}
243
244boolean_t
245socanwrite(struct socket *so)
246{
247 return ((so->so_state & SS_ISCONNECTED) ||
248 !(so->so_proto->pr_flags & PR_CONNREQUIRED) ||
249 (so->so_flags1 & SOF1_PRECONNECT_DATA));
250}
251
252void
253soisdisconnecting(struct socket *so)
254{
255 so->so_state &= ~SS_ISCONNECTING;
256 so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
257 soevent(so, SO_FILT_HINT_LOCKED);
258 sflt_notify(so, sock_evt_disconnecting, NULL);
259 wakeup((caddr_t)&so->so_timeo);
260 sowwakeup(so);
261 sorwakeup(so);
262}
263
264void
265soisdisconnected(struct socket *so)
266{
267 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
268 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
269 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED |
270 SO_FILT_HINT_CONNINFO_UPDATED);
271 sflt_notify(so, sock_evt_disconnected, NULL);
272 wakeup((caddr_t)&so->so_timeo);
273 sowwakeup(so);
274 sorwakeup(so);
275
276#if CONTENT_FILTER
277 /* Notify content filters as soon as we cannot send/receive data */
278 cfil_sock_notify_shutdown(so, SHUT_RDWR);
279#endif /* CONTENT_FILTER */
280}
281
282/*
283 * This function will issue a wakeup like soisdisconnected but it will not
284 * notify the socket filters. This will avoid unlocking the socket
285 * in the midst of closing it.
286 */
287void
288sodisconnectwakeup(struct socket *so)
289{
290 so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
291 so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_ISDISCONNECTED);
292 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED |
293 SO_FILT_HINT_CONNINFO_UPDATED);
294 wakeup((caddr_t)&so->so_timeo);
295 sowwakeup(so);
296 sorwakeup(so);
297
298#if CONTENT_FILTER
299 /* Notify content filters as soon as we cannot send/receive data */
300 cfil_sock_notify_shutdown(so, SHUT_RDWR);
301#endif /* CONTENT_FILTER */
302}
303
/*
 * When an attempt at a new connection is noted on a socket
 * which accepts connections, sonewconn is called.  If the
 * connection is possible (subject to space constraints, etc.),
 * then we allocate a new structure, properly linked into the
 * data structure of the original socket, and return this.
 * Connstatus may be 0, SS_ISCONFIRMING, or SS_ISCONNECTED.
 */
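/*
 * Illustrative sketch (not part of this file): a stream protocol handling
 * an incoming connection request on a listening socket would typically do
 * something along these lines (placeholder logic, not real XNU code):
 *
 *	so = sonewconn(head, 0, (struct sockaddr *)&from);
 *	if (so == NULL)
 *		...drop the request; the listen queue is full or a
 *		   socket filter rejected the connection...
 *	...attach per-connection protocol state to so and continue the
 *	   handshake; when it completes, soisconnected(so) moves the
 *	   socket from head's so_incomp queue to so_comp for accept()...
 */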
312static struct socket *
313sonewconn_internal(struct socket *head, int connstatus)
314{
315 int so_qlen, error = 0;
316 struct socket *so;
317 lck_mtx_t *mutex_held;
318
319 if (head->so_proto->pr_getlock != NULL)
320 mutex_held = (*head->so_proto->pr_getlock)(head, 0);
321 else
322 mutex_held = head->so_proto->pr_domain->dom_mtx;
323 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
324
325 if (!soqlencomp) {
326 /*
327 * This is the default case; so_qlen represents the
328 * sum of both incomplete and completed queues.
329 */
330 so_qlen = head->so_qlen;
331 } else {
		/*
		 * When kern.ipc.soqlencomp is set to 1, so_qlen
		 * represents only the completed queue.  Since we
		 * cannot let the incomplete queue go unbounded
		 * (in case of SYN flood), we cap the incomplete
		 * queue length to at most somaxconn, and use that
		 * as so_qlen so that we fail immediately below.
		 */
340 so_qlen = head->so_qlen - head->so_incqlen;
341 if (head->so_incqlen > somaxconn)
342 so_qlen = somaxconn;
343 }
344
345 if (so_qlen >=
346 (soqlimitcompat ? head->so_qlimit : (3 * head->so_qlimit / 2)))
347 return ((struct socket *)0);
348 so = soalloc(1, SOCK_DOM(head), head->so_type);
349 if (so == NULL)
350 return ((struct socket *)0);
351 /* check if head was closed during the soalloc */
352 if (head->so_proto == NULL) {
353 sodealloc(so);
354 return ((struct socket *)0);
355 }
356
357 so->so_type = head->so_type;
358 so->so_options = head->so_options &~ SO_ACCEPTCONN;
359 so->so_linger = head->so_linger;
360 so->so_state = head->so_state | SS_NOFDREF;
361 so->so_proto = head->so_proto;
362 so->so_timeo = head->so_timeo;
363 so->so_pgid = head->so_pgid;
364 kauth_cred_ref(head->so_cred);
365 so->so_cred = head->so_cred;
366 so->last_pid = head->last_pid;
367 so->last_upid = head->last_upid;
368 memcpy(so->last_uuid, head->last_uuid, sizeof (so->last_uuid));
369 if (head->so_flags & SOF_DELEGATED) {
370 so->e_pid = head->e_pid;
371 so->e_upid = head->e_upid;
372 memcpy(so->e_uuid, head->e_uuid, sizeof (so->e_uuid));
373 }
374 /* inherit socket options stored in so_flags */
375 so->so_flags = head->so_flags &
376 (SOF_NOSIGPIPE | SOF_NOADDRAVAIL | SOF_REUSESHAREUID |
377 SOF_NOTIFYCONFLICT | SOF_BINDRANDOMPORT | SOF_NPX_SETOPTSHUT |
378 SOF_NODEFUNCT | SOF_PRIVILEGED_TRAFFIC_CLASS| SOF_NOTSENT_LOWAT |
379 SOF_USELRO | SOF_DELEGATED);
380 so->so_usecount = 1;
381 so->next_lock_lr = 0;
382 so->next_unlock_lr = 0;
383
384 so->so_rcv.sb_flags |= SB_RECV; /* XXX */
385 so->so_rcv.sb_so = so->so_snd.sb_so = so;
386 TAILQ_INIT(&so->so_evlist);
387
388#if CONFIG_MACF_SOCKET
389 mac_socket_label_associate_accept(head, so);
390#endif
391
392 /* inherit traffic management properties of listener */
393 so->so_flags1 |=
394 head->so_flags1 & (SOF1_TRAFFIC_MGT_SO_BACKGROUND);
395 so->so_background_thread = head->so_background_thread;
396 so->so_traffic_class = head->so_traffic_class;
397
398 if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat)) {
399 sodealloc(so);
400 return ((struct socket *)0);
401 }
402 so->so_rcv.sb_flags |= (head->so_rcv.sb_flags & SB_USRSIZE);
403 so->so_snd.sb_flags |= (head->so_snd.sb_flags & SB_USRSIZE);
404
	/*
	 * Must be done with head unlocked to avoid deadlock
	 * for protocols with per-socket mutexes.
	 */
409 if (head->so_proto->pr_unlock)
410 socket_unlock(head, 0);
411 if (((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL) != 0) ||
412 error) {
413 sodealloc(so);
414 if (head->so_proto->pr_unlock)
415 socket_lock(head, 0);
416 return ((struct socket *)0);
417 }
418 if (head->so_proto->pr_unlock) {
419 socket_lock(head, 0);
		/*
		 * Radar 7385998: Recheck that the head is still accepting
		 * to avoid a race condition when the head is getting closed.
		 */
424 if ((head->so_options & SO_ACCEPTCONN) == 0) {
425 so->so_state &= ~SS_NOFDREF;
426 soclose(so);
427 return ((struct socket *)0);
428 }
429 }
430
431 atomic_add_32(&so->so_proto->pr_domain->dom_refs, 1);
432
433 /* Insert in head appropriate lists */
434 so_acquire_accept_list(head, NULL);
435
436 so->so_head = head;
437
	/*
	 * Since this socket is going to be inserted into the incomp
	 * queue, it can be picked up by another thread in
	 * tcp_dropdropablreq to get dropped before it is set up.
	 * To prevent this race, set the in-progress flag, which can be
	 * cleared later.
	 */
445 so->so_flags |= SOF_INCOMP_INPROGRESS;
446
447 if (connstatus) {
448 TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
449 so->so_state |= SS_COMP;
450 } else {
451 TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
452 so->so_state |= SS_INCOMP;
453 head->so_incqlen++;
454 }
455 head->so_qlen++;
456
457 so_release_accept_list(head);
458
459 /* Attach socket filters for this protocol */
460 sflt_initsock(so);
461
462 if (connstatus) {
463 so->so_state |= connstatus;
464 sorwakeup(head);
465 wakeup((caddr_t)&head->so_timeo);
466 }
467 return (so);
468}
469
470
471struct socket *
472sonewconn(struct socket *head, int connstatus, const struct sockaddr *from)
473{
474 int error = sflt_connectin(head, from);
475 if (error) {
476 return (NULL);
477 }
478
479 return (sonewconn_internal(head, connstatus));
480}
481
/*
 * Socantsendmore indicates that no more data will be sent on the
 * socket; it is normally applied to a socket by the protocol code
 * when the user informs the system that no more data is to be sent
 * (in the case of PRU_SHUTDOWN).  Socantrcvmore indicates that no more
 * data will be received, and will normally be applied to the socket
 * by a protocol when it detects that the peer will send no more data.
 * Data queued for reading in the socket may yet be read.
 */
491
492void
493socantsendmore(struct socket *so)
494{
495 so->so_state |= SS_CANTSENDMORE;
496 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTSENDMORE);
497 sflt_notify(so, sock_evt_cantsendmore, NULL);
498 sowwakeup(so);
499}
500
501void
502socantrcvmore(struct socket *so)
503{
504 so->so_state |= SS_CANTRCVMORE;
505 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_CANTRCVMORE);
506 sflt_notify(so, sock_evt_cantrecvmore, NULL);
507 sorwakeup(so);
508}
509
510/*
511 * Wait for data to arrive at/drain from a socket buffer.
512 */
513int
514sbwait(struct sockbuf *sb)
515{
516 boolean_t nointr = (sb->sb_flags & SB_NOINTR);
517 void *lr_saved = __builtin_return_address(0);
518 struct socket *so = sb->sb_so;
519 lck_mtx_t *mutex_held;
520 struct timespec ts;
521 int error = 0;
522
523 if (so == NULL) {
524 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
525 __func__, sb, sb->sb_flags, lr_saved);
526 /* NOTREACHED */
527 } else if (so->so_usecount < 1) {
528 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
529 "lrh= %s\n", __func__, sb, sb->sb_flags, so,
530 so->so_usecount, lr_saved, solockhistory_nr(so));
531 /* NOTREACHED */
532 }
533
534 if ((so->so_state & SS_DRAINING) || (so->so_flags & SOF_DEFUNCT)) {
535 error = EBADF;
536 if (so->so_flags & SOF_DEFUNCT) {
537 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
538 "(%d)\n", __func__, proc_selfpid(),
539 proc_best_name(current_proc()),
540 (uint64_t)VM_KERNEL_ADDRPERM(so),
541 SOCK_DOM(so), SOCK_TYPE(so), error);
542 }
543 return (error);
544 }
545
546 if (so->so_proto->pr_getlock != NULL)
547 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
548 else
549 mutex_held = so->so_proto->pr_domain->dom_mtx;
550
551 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
552
553 ts.tv_sec = sb->sb_timeo.tv_sec;
554 ts.tv_nsec = sb->sb_timeo.tv_usec * 1000;
555
556 sb->sb_waiters++;
557 VERIFY(sb->sb_waiters != 0);
558
559 error = msleep((caddr_t)&sb->sb_cc, mutex_held,
560 nointr ? PSOCK : PSOCK | PCATCH,
561 nointr ? "sbwait_nointr" : "sbwait", &ts);
562
563 VERIFY(sb->sb_waiters != 0);
564 sb->sb_waiters--;
565
566 if (so->so_usecount < 1) {
567 panic("%s: 2 sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
568 "lrh= %s\n", __func__, sb, sb->sb_flags, so,
569 so->so_usecount, lr_saved, solockhistory_nr(so));
570 /* NOTREACHED */
571 }
572
573 if ((so->so_state & SS_DRAINING) || (so->so_flags & SOF_DEFUNCT)) {
574 error = EBADF;
575 if (so->so_flags & SOF_DEFUNCT) {
576 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
577 "(%d)\n", __func__, proc_selfpid(),
578 proc_best_name(current_proc()),
579 (uint64_t)VM_KERNEL_ADDRPERM(so),
580 SOCK_DOM(so), SOCK_TYPE(so), error);
581 }
582 }
583
584 return (error);
585}
586
587void
588sbwakeup(struct sockbuf *sb)
589{
590 if (sb->sb_waiters > 0)
591 wakeup((caddr_t)&sb->sb_cc);
592}
593
594/*
595 * Wakeup processes waiting on a socket buffer.
596 * Do asynchronous notification via SIGIO
597 * if the socket has the SS_ASYNC flag set.
598 */
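/*
 * For reference, the SS_ASYNC state checked below is normally set up from
 * userspace with fcntl(2); an illustrative sketch (not part of this file):
 *
 *	int s = socket(AF_INET, SOCK_STREAM, 0);
 *	fcntl(s, F_SETOWN, getpid());			// SIGIO recipient
 *	fcntl(s, F_SETFL, O_ASYNC | O_NONBLOCK);	// marks the socket async
 *
 * With that in place, sowakeup() delivers SIGIO to the owning process
 * (so_pgid > 0) or process group (so_pgid < 0) whenever the buffer state
 * changes.
 */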
599void
600sowakeup(struct socket *so, struct sockbuf *sb)
601{
602 if (so->so_flags & SOF_DEFUNCT) {
603 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] si 0x%x, "
604 "fl 0x%x [%s]\n", __func__, proc_selfpid(),
605 proc_best_name(current_proc()),
606 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
607 SOCK_TYPE(so), (uint32_t)sb->sb_sel.si_flags, sb->sb_flags,
608 (sb->sb_flags & SB_RECV) ? "rcv" : "snd");
609 }
610
611 sb->sb_flags &= ~SB_SEL;
612 selwakeup(&sb->sb_sel);
613 sbwakeup(sb);
614 if (so->so_state & SS_ASYNC) {
615 if (so->so_pgid < 0)
616 gsignal(-so->so_pgid, SIGIO);
617 else if (so->so_pgid > 0)
618 proc_signal(so->so_pgid, SIGIO);
619 }
620 if (sb->sb_flags & SB_KNOTE) {
621 KNOTE(&sb->sb_sel.si_note, SO_FILT_HINT_LOCKED);
622 }
623 if (sb->sb_flags & SB_UPCALL) {
624 void (*sb_upcall)(struct socket *, void *, int);
625 caddr_t sb_upcallarg;
626 int lock = !(sb->sb_flags & SB_UPCALL_LOCK);
627
628 sb_upcall = sb->sb_upcall;
629 sb_upcallarg = sb->sb_upcallarg;
630 /* Let close know that we're about to do an upcall */
631 so->so_upcallusecount++;
632
633 if (lock)
634 socket_unlock(so, 0);
635 (*sb_upcall)(so, sb_upcallarg, M_DONTWAIT);
636 if (lock)
637 socket_lock(so, 0);
638
639 so->so_upcallusecount--;
640 /* Tell close that it's safe to proceed */
641 if ((so->so_flags & SOF_CLOSEWAIT) &&
642 so->so_upcallusecount == 0)
643 wakeup((caddr_t)&so->so_upcallusecount);
644 }
645#if CONTENT_FILTER
646 /*
647 * Trap disconnection events for content filters
648 */
649 if ((so->so_flags & SOF_CONTENT_FILTER) != 0) {
650 if ((sb->sb_flags & SB_RECV)) {
651 if (so->so_state & (SS_CANTRCVMORE))
652 cfil_sock_notify_shutdown(so, SHUT_RD);
653 } else {
654 if (so->so_state & (SS_CANTSENDMORE))
655 cfil_sock_notify_shutdown(so, SHUT_WR);
656 }
657 }
658#endif /* CONTENT_FILTER */
659}
660
661/*
662 * Socket buffer (struct sockbuf) utility routines.
663 *
664 * Each socket contains two socket buffers: one for sending data and
665 * one for receiving data. Each buffer contains a queue of mbufs,
666 * information about the number of mbufs and amount of data in the
667 * queue, and other fields allowing select() statements and notification
668 * on data availability to be implemented.
669 *
670 * Data stored in a socket buffer is maintained as a list of records.
671 * Each record is a list of mbufs chained together with the m_next
672 * field. Records are chained together with the m_nextpkt field. The upper
673 * level routine soreceive() expects the following conventions to be
674 * observed when placing information in the receive buffer:
675 *
 * 1. If the protocol requires each message be preceded by the sender's
 *    name, then a record containing that name must be present before
 *    any associated data (mbufs must be of type MT_SONAME).
 * 2. If the protocol supports the exchange of ``access rights'' (really
 *    just additional data associated with the message), and there are
 *    ``rights'' to be received, then a record containing this data
 *    should be present (mbufs must be of type MT_RIGHTS).
683 * 3. If a name or rights record exists, then it must be followed by
684 * a data record, perhaps of zero length.
685 *
686 * Before using a new socket structure it is first necessary to reserve
687 * buffer space to the socket, by calling sbreserve(). This should commit
688 * some of the available buffer space in the system buffer pool for the
689 * socket (currently, it does nothing but enforce limits). The space
690 * should be released by calling sbrelease() when the socket is destroyed.
691 */
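/*
 * Illustrative layout (not part of this file): a datagram socket's receive
 * buffer after two sbappendaddr() calls, following the record conventions
 * above.  Records are linked through m_nextpkt, mbufs within a record
 * through m_next:
 *
 *	sb_mb -> [MT_SONAME] -> [MT_CONTROL] -> [MT_DATA] -> ...    record 1
 *	             |
 *	         m_nextpkt
 *	             |
 *	             v
 *	         [MT_SONAME] -> [MT_DATA] -> ...                     record 2
 *
 * soreceive() consumes such a buffer one record at a time.
 */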
692
693/*
694 * Returns: 0 Success
695 * ENOBUFS
696 */
697int
698soreserve(struct socket *so, u_int32_t sndcc, u_int32_t rcvcc)
699{
700 if (sbreserve(&so->so_snd, sndcc) == 0)
701 goto bad;
702 else
703 so->so_snd.sb_idealsize = sndcc;
704
705 if (sbreserve(&so->so_rcv, rcvcc) == 0)
706 goto bad2;
707 else
708 so->so_rcv.sb_idealsize = rcvcc;
709
710 if (so->so_rcv.sb_lowat == 0)
711 so->so_rcv.sb_lowat = 1;
712 if (so->so_snd.sb_lowat == 0)
713 so->so_snd.sb_lowat = MCLBYTES;
714 if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
715 so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
716 return (0);
717bad2:
718 so->so_snd.sb_flags &= ~SB_SEL;
719 selthreadclear(&so->so_snd.sb_sel);
720 sbrelease(&so->so_snd);
721bad:
722 return (ENOBUFS);
723}
724
725void
726soreserve_preconnect(struct socket *so, unsigned int pre_cc)
727{
728 /* As of now, same bytes for both preconnect read and write */
729 so->so_snd.sb_preconn_hiwat = pre_cc;
730 so->so_rcv.sb_preconn_hiwat = pre_cc;
731}
732
733/*
734 * Allot mbufs to a sockbuf.
735 * Attempt to scale mbmax so that mbcnt doesn't become limiting
736 * if buffering efficiency is near the normal case.
737 */
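/*
 * Worked example (illustrative numbers only): with the default
 * sb_efficiency of 8, and assuming purely for illustration sb_max = 256 KB,
 * a request of cc = 64 KB gives
 *
 *	sb_hiwat = 65536
 *	sb_mbmax = min(65536 * 8, 262144) = 262144
 *
 * i.e. the mbuf-storage limit is clamped by sb_max rather than by the
 * 8x scaling factor.  sb_max itself is tunable via kern.ipc.maxsockbuf.
 */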
738int
739sbreserve(struct sockbuf *sb, u_int32_t cc)
740{
741 if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
742 return (0);
743 sb->sb_hiwat = cc;
744 sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
745 if (sb->sb_lowat > sb->sb_hiwat)
746 sb->sb_lowat = sb->sb_hiwat;
747 return (1);
748}
749
750/*
751 * Free mbufs held by a socket, and reserved mbuf space.
752 */
753/* WARNING needs to do selthreadclear() before calling this */
754void
755sbrelease(struct sockbuf *sb)
756{
757 sbflush(sb);
758 sb->sb_hiwat = 0;
759 sb->sb_mbmax = 0;
760}
761
762/*
763 * Routines to add and remove
764 * data from an mbuf queue.
765 *
766 * The routines sbappend() or sbappendrecord() are normally called to
767 * append new mbufs to a socket buffer, after checking that adequate
768 * space is available, comparing the function sbspace() with the amount
769 * of data to be added. sbappendrecord() differs from sbappend() in
770 * that data supplied is treated as the beginning of a new record.
771 * To place a sender's address, optional access rights, and data in a
772 * socket receive buffer, sbappendaddr() should be used. To place
773 * access rights and data in a socket receive buffer, sbappendrights()
774 * should be used. In either case, the new data begins a new record.
775 * Note that unlike sbappend() and sbappendrecord(), these routines check
776 * for the caller that there will be enough space to store the data.
777 * Each fails if there is not enough space, or if it cannot find mbufs
778 * to store additional information in.
779 *
 * Reliable protocols may use the socket send buffer to hold data
 * awaiting acknowledgement.  Data is normally copied from a socket
 * send buffer in a protocol with m_copy for output to a peer, and
 * then removed from the socket buffer with sbdrop() or
 * sbdroprecord() when the data is acknowledged by the peer.
 */
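/*
 * Illustrative sketch (not part of this file) of the send-buffer usage
 * described above for a reliable protocol; the surrounding logic is
 * placeholder pseudo-code:
 *
 *	if (sbspace(&so->so_snd) < (long)len)
 *		...block the sender or return EWOULDBLOCK...
 *	sbappend(&so->so_snd, m);		// hold data awaiting ack
 *	n = m_copy(so->so_snd.sb_mb, off, len);	// copy for (re)transmission
 *	...hand n to the output path...
 *	...when the peer acknowledges acked_bytes...
 *	sbdrop(&so->so_snd, acked_bytes);
 */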
786
/*
 * Append mbuf chain m to the last record in the
 * socket buffer sb.  The additional space associated with
 * the mbuf chain is recorded in sb.  Empty mbufs are
 * discarded and mbufs are compacted where possible.
 */
793int
794sbappend(struct sockbuf *sb, struct mbuf *m)
795{
796 struct socket *so = sb->sb_so;
797
798 if (m == NULL || (sb->sb_flags & SB_DROP)) {
799 if (m != NULL)
800 m_freem(m);
801 return (0);
802 }
803
804 SBLASTRECORDCHK(sb, "sbappend 1");
805
806 if (sb->sb_lastrecord != NULL && (sb->sb_mbtail->m_flags & M_EOR))
807 return (sbappendrecord(sb, m));
808
809 if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
810 int error = sflt_data_in(so, NULL, &m, NULL, 0);
811 SBLASTRECORDCHK(sb, "sbappend 2");
812
813#if CONTENT_FILTER
814 if (error == 0)
815 error = cfil_sock_data_in(so, NULL, m, NULL, 0);
816#endif /* CONTENT_FILTER */
817
818 if (error != 0) {
819 if (error != EJUSTRETURN)
820 m_freem(m);
821 return (0);
822 }
823 } else if (m) {
824 m->m_flags &= ~M_SKIPCFIL;
825 }
826
827 /* If this is the first record, it's also the last record */
828 if (sb->sb_lastrecord == NULL)
829 sb->sb_lastrecord = m;
830
831 sbcompress(sb, m, sb->sb_mbtail);
832 SBLASTRECORDCHK(sb, "sbappend 3");
833 return (1);
834}
835
836/*
837 * Similar to sbappend, except that this is optimized for stream sockets.
838 */
839int
840sbappendstream(struct sockbuf *sb, struct mbuf *m)
841{
842 struct socket *so = sb->sb_so;
843
844 if (m == NULL || (sb->sb_flags & SB_DROP)) {
845 if (m != NULL)
846 m_freem(m);
847 return (0);
848 }
849
850 if (m->m_nextpkt != NULL || (sb->sb_mb != sb->sb_lastrecord)) {
		panic("sbappendstream: nextpkt %p || mb %p != lastrecord %p\n",
852 m->m_nextpkt, sb->sb_mb, sb->sb_lastrecord);
853 /* NOTREACHED */
854 }
855
856 SBLASTMBUFCHK(sb, __func__);
857
858 if (sb->sb_flags & SB_RECV && !(m && m->m_flags & M_SKIPCFIL)) {
859 int error = sflt_data_in(so, NULL, &m, NULL, 0);
860 SBLASTRECORDCHK(sb, "sbappendstream 1");
861
862#if CONTENT_FILTER
863 if (error == 0)
864 error = cfil_sock_data_in(so, NULL, m, NULL, 0);
865#endif /* CONTENT_FILTER */
866
867 if (error != 0) {
868 if (error != EJUSTRETURN)
869 m_freem(m);
870 return (0);
871 }
872 } else if (m) {
873 m->m_flags &= ~M_SKIPCFIL;
874 }
875
876 sbcompress(sb, m, sb->sb_mbtail);
877 sb->sb_lastrecord = sb->sb_mb;
878 SBLASTRECORDCHK(sb, "sbappendstream 2");
879 return (1);
880}
881
882#ifdef SOCKBUF_DEBUG
883void
884sbcheck(struct sockbuf *sb)
885{
886 struct mbuf *m;
887 struct mbuf *n = 0;
888 u_int32_t len = 0, mbcnt = 0;
889 lck_mtx_t *mutex_held;
890
891 if (sb->sb_so->so_proto->pr_getlock != NULL)
892 mutex_held = (*sb->sb_so->so_proto->pr_getlock)(sb->sb_so, 0);
893 else
894 mutex_held = sb->sb_so->so_proto->pr_domain->dom_mtx;
895
896 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
897
898 if (sbchecking == 0)
899 return;
900
901 for (m = sb->sb_mb; m; m = n) {
902 n = m->m_nextpkt;
903 for (; m; m = m->m_next) {
904 len += m->m_len;
905 mbcnt += MSIZE;
906 /* XXX pretty sure this is bogus */
907 if (m->m_flags & M_EXT)
908 mbcnt += m->m_ext.ext_size;
909 }
910 }
911 if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
912 panic("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc,
913 mbcnt, sb->sb_mbcnt);
914 }
915}
916#endif
917
918void
919sblastrecordchk(struct sockbuf *sb, const char *where)
920{
921 struct mbuf *m = sb->sb_mb;
922
923 while (m && m->m_nextpkt)
924 m = m->m_nextpkt;
925
926 if (m != sb->sb_lastrecord) {
927 printf("sblastrecordchk: mb 0x%llx lastrecord 0x%llx "
928 "last 0x%llx\n",
929 (uint64_t)VM_KERNEL_ADDRPERM(sb->sb_mb),
930 (uint64_t)VM_KERNEL_ADDRPERM(sb->sb_lastrecord),
931 (uint64_t)VM_KERNEL_ADDRPERM(m));
932 printf("packet chain:\n");
933 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
934 printf("\t0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(m));
935 panic("sblastrecordchk from %s", where);
936 }
937}
938
939void
940sblastmbufchk(struct sockbuf *sb, const char *where)
941{
942 struct mbuf *m = sb->sb_mb;
943 struct mbuf *n;
944
945 while (m && m->m_nextpkt)
946 m = m->m_nextpkt;
947
948 while (m && m->m_next)
949 m = m->m_next;
950
951 if (m != sb->sb_mbtail) {
952 printf("sblastmbufchk: mb 0x%llx mbtail 0x%llx last 0x%llx\n",
953 (uint64_t)VM_KERNEL_ADDRPERM(sb->sb_mb),
954 (uint64_t)VM_KERNEL_ADDRPERM(sb->sb_mbtail),
955 (uint64_t)VM_KERNEL_ADDRPERM(m));
956 printf("packet tree:\n");
957 for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
958 printf("\t");
959 for (n = m; n != NULL; n = n->m_next)
960 printf("0x%llx ",
961 (uint64_t)VM_KERNEL_ADDRPERM(n));
962 printf("\n");
963 }
964 panic("sblastmbufchk from %s", where);
965 }
966}
967
968/*
969 * Similar to sbappend, except the mbuf chain begins a new record.
970 */
971int
972sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
973{
974 struct mbuf *m;
975 int space = 0;
976
977 if (m0 == NULL || (sb->sb_flags & SB_DROP)) {
978 if (m0 != NULL)
979 m_freem(m0);
980 return (0);
981 }
982
983 for (m = m0; m != NULL; m = m->m_next)
984 space += m->m_len;
985
986 if (space > sbspace(sb) && !(sb->sb_flags & SB_UNIX)) {
987 m_freem(m0);
988 return (0);
989 }
990
991 if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
992 int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
993 sock_data_filt_flag_record);
994
995#if CONTENT_FILTER
996 if (error == 0)
997 error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0);
998#endif /* CONTENT_FILTER */
999
1000 if (error != 0) {
1001 SBLASTRECORDCHK(sb, "sbappendrecord 1");
1002 if (error != EJUSTRETURN)
1003 m_freem(m0);
1004 return (0);
1005 }
1006 } else if (m0) {
1007 m0->m_flags &= ~M_SKIPCFIL;
1008 }
1009
1010 /*
1011 * Note this permits zero length records.
1012 */
1013 sballoc(sb, m0);
1014 SBLASTRECORDCHK(sb, "sbappendrecord 2");
1015 if (sb->sb_lastrecord != NULL) {
1016 sb->sb_lastrecord->m_nextpkt = m0;
1017 } else {
1018 sb->sb_mb = m0;
1019 }
1020 sb->sb_lastrecord = m0;
1021 sb->sb_mbtail = m0;
1022
1023 m = m0->m_next;
1024 m0->m_next = 0;
1025 if (m && (m0->m_flags & M_EOR)) {
1026 m0->m_flags &= ~M_EOR;
1027 m->m_flags |= M_EOR;
1028 }
1029 sbcompress(sb, m, m0);
1030 SBLASTRECORDCHK(sb, "sbappendrecord 3");
1031 return (1);
1032}
1033
1034/*
1035 * As above except that OOB data
1036 * is inserted at the beginning of the sockbuf,
1037 * but after any other OOB data.
1038 */
1039int
1040sbinsertoob(struct sockbuf *sb, struct mbuf *m0)
1041{
1042 struct mbuf *m;
1043 struct mbuf **mp;
1044
1045 if (m0 == 0)
1046 return (0);
1047
1048 SBLASTRECORDCHK(sb, "sbinsertoob 1");
1049
1050 if ((sb->sb_flags & SB_RECV && !(m0->m_flags & M_SKIPCFIL)) != 0) {
1051 int error = sflt_data_in(sb->sb_so, NULL, &m0, NULL,
1052 sock_data_filt_flag_oob);
1053
1054 SBLASTRECORDCHK(sb, "sbinsertoob 2");
1055
1056#if CONTENT_FILTER
1057 if (error == 0)
1058 error = cfil_sock_data_in(sb->sb_so, NULL, m0, NULL, 0);
1059#endif /* CONTENT_FILTER */
1060
1061 if (error) {
1062 if (error != EJUSTRETURN) {
1063 m_freem(m0);
1064 }
1065 return (0);
1066 }
1067 } else if (m0) {
1068 m0->m_flags &= ~M_SKIPCFIL;
1069 }
1070
1071 for (mp = &sb->sb_mb; *mp; mp = &((*mp)->m_nextpkt)) {
1072 m = *mp;
1073again:
1074 switch (m->m_type) {
1075
1076 case MT_OOBDATA:
1077 continue; /* WANT next train */
1078
1079 case MT_CONTROL:
1080 m = m->m_next;
1081 if (m)
1082 goto again; /* inspect THIS train further */
1083 }
1084 break;
1085 }
1086 /*
1087 * Put the first mbuf on the queue.
1088 * Note this permits zero length records.
1089 */
1090 sballoc(sb, m0);
1091 m0->m_nextpkt = *mp;
1092 if (*mp == NULL) {
1093 /* m0 is actually the new tail */
1094 sb->sb_lastrecord = m0;
1095 }
1096 *mp = m0;
1097 m = m0->m_next;
1098 m0->m_next = 0;
1099 if (m && (m0->m_flags & M_EOR)) {
1100 m0->m_flags &= ~M_EOR;
1101 m->m_flags |= M_EOR;
1102 }
1103 sbcompress(sb, m, m0);
1104 SBLASTRECORDCHK(sb, "sbinsertoob 3");
1105 return (1);
1106}
1107
/*
 * Concatenate address (optional), control (optional) and data into one
 * single mbuf chain.  If sockbuf *sb is passed in, a space check will
 * be performed.
 *
 * Returns:	mbuf chain pointer on success, NULL on failure
 */
1115struct mbuf *
1116sbconcat_mbufs(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0, struct mbuf *control)
1117{
1118 struct mbuf *m = NULL, *n = NULL;
1119 int space = 0;
1120
1121 if (m0 && (m0->m_flags & M_PKTHDR) == 0)
1122 panic("sbconcat_mbufs");
1123
1124 if (m0)
1125 space += m0->m_pkthdr.len;
1126 for (n = control; n; n = n->m_next) {
1127 space += n->m_len;
1128 if (n->m_next == 0) /* keep pointer to last control buf */
1129 break;
1130 }
1131
1132 if (asa != NULL) {
1133 if (asa->sa_len > MLEN)
1134 return (NULL);
1135 space += asa->sa_len;
1136 }
1137
1138 if (sb != NULL && space > sbspace(sb))
1139 return (NULL);
1140
1141 if (n)
1142 n->m_next = m0; /* concatenate data to control */
1143 else
1144 control = m0;
1145
1146 if (asa != NULL) {
1147 MGET(m, M_DONTWAIT, MT_SONAME);
1148 if (m == 0) {
1149 if (n) {
1150 /* unchain control and data if necessary */
1151 n->m_next = NULL;
1152 }
1153 return (NULL);
1154 }
1155 m->m_len = asa->sa_len;
1156 bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
1157
1158 m->m_next = control;
1159 } else {
1160 m = control;
1161 }
1162
1163 return (m);
1164}
1165
/*
 * Queue mbuf chain to the receive queue of a socket.
 * Parameter space is the total length of the mbuf chain.
 * If it is non-zero, the available sockbuf space is checked against it.
 *
 * Returns:	0 Invalid mbuf chain or no space
 *		1 Success
 */
1174int
1175sbappendchain(struct sockbuf *sb, struct mbuf *m, int space)
1176{
1177 struct mbuf *n, *nlast;
1178
1179 if (m == NULL)
1180 return (0);
1181
1182 if (space != 0 && space > sbspace(sb))
1183 return (0);
1184
1185 for (n = m; n->m_next != NULL; n = n->m_next)
1186 sballoc(sb, n);
1187 sballoc(sb, n);
1188 nlast = n;
1189
1190 if (sb->sb_lastrecord != NULL) {
1191 sb->sb_lastrecord->m_nextpkt = m;
1192 } else {
1193 sb->sb_mb = m;
1194 }
1195 sb->sb_lastrecord = m;
1196 sb->sb_mbtail = nlast;
1197
1198 SBLASTMBUFCHK(sb, __func__);
1199 SBLASTRECORDCHK(sb, "sbappendadddr 2");
1200
1201 postevent(0, sb, EV_RWBYTES);
1202 return (1);
1203}
1204
1205/*
1206 * Returns: 0 Error: No space/out of mbufs/etc.
1207 * 1 Success
1208 *
1209 * Imputed: (*error_out) errno for error
1210 * ENOBUFS
1211 * sflt_data_in:??? [whatever a filter author chooses]
1212 */
1213int
1214sbappendaddr(struct sockbuf *sb, struct sockaddr *asa, struct mbuf *m0,
1215 struct mbuf *control, int *error_out)
1216{
1217 int result = 0;
1218 boolean_t sb_unix = (sb->sb_flags & SB_UNIX);
1219 struct mbuf *mbuf_chain = NULL;
1220
1221 if (error_out)
1222 *error_out = 0;
1223
1224 if (m0 && (m0->m_flags & M_PKTHDR) == 0)
1225 panic("sbappendaddrorfree");
1226
1227 if (sb->sb_flags & SB_DROP) {
1228 if (m0 != NULL)
1229 m_freem(m0);
1230 if (control != NULL && !sb_unix)
1231 m_freem(control);
1232 if (error_out != NULL)
1233 *error_out = EINVAL;
1234 return (0);
1235 }
1236
1237 /* Call socket data in filters */
1238 if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
1239 int error;
1240 error = sflt_data_in(sb->sb_so, asa, &m0, &control, 0);
1241 SBLASTRECORDCHK(sb, __func__);
1242
1243#if CONTENT_FILTER
1244 if (error == 0)
1245 error = cfil_sock_data_in(sb->sb_so, asa, m0, control,
1246 0);
1247#endif /* CONTENT_FILTER */
1248
1249 if (error) {
1250 if (error != EJUSTRETURN) {
1251 if (m0)
1252 m_freem(m0);
1253 if (control != NULL && !sb_unix)
1254 m_freem(control);
1255 if (error_out)
1256 *error_out = error;
1257 }
1258 return (0);
1259 }
1260 } else if (m0) {
1261 m0->m_flags &= ~M_SKIPCFIL;
1262 }
1263
1264 mbuf_chain = sbconcat_mbufs(sb, asa, m0, control);
1265 SBLASTRECORDCHK(sb, "sbappendadddr 1");
1266 result = sbappendchain(sb, mbuf_chain, 0);
1267 if (result == 0) {
1268 if (m0)
1269 m_freem(m0);
1270 if (control != NULL && !sb_unix)
1271 m_freem(control);
1272 if (error_out)
1273 *error_out = ENOBUFS;
1274 }
1275
1276 return (result);
1277}
1278
1279static int
1280sbappendcontrol_internal(struct sockbuf *sb, struct mbuf *m0,
1281 struct mbuf *control)
1282{
1283 struct mbuf *m, *mlast, *n;
1284 int space = 0;
1285
1286 if (control == 0)
1287 panic("sbappendcontrol");
1288
1289 for (m = control; ; m = m->m_next) {
1290 space += m->m_len;
1291 if (m->m_next == 0)
1292 break;
1293 }
1294 n = m; /* save pointer to last control buffer */
1295 for (m = m0; m; m = m->m_next)
1296 space += m->m_len;
1297 if (space > sbspace(sb) && !(sb->sb_flags & SB_UNIX))
1298 return (0);
1299 n->m_next = m0; /* concatenate data to control */
1300 SBLASTRECORDCHK(sb, "sbappendcontrol 1");
1301
1302 for (m = control; m->m_next != NULL; m = m->m_next)
1303 sballoc(sb, m);
1304 sballoc(sb, m);
1305 mlast = m;
1306
1307 if (sb->sb_lastrecord != NULL) {
1308 sb->sb_lastrecord->m_nextpkt = control;
1309 } else {
1310 sb->sb_mb = control;
1311 }
1312 sb->sb_lastrecord = control;
1313 sb->sb_mbtail = mlast;
1314
1315 SBLASTMBUFCHK(sb, __func__);
1316 SBLASTRECORDCHK(sb, "sbappendcontrol 2");
1317
1318 postevent(0, sb, EV_RWBYTES);
1319 return (1);
1320}
1321
1322int
1323sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control,
1324 int *error_out)
1325{
1326 int result = 0;
1327 boolean_t sb_unix = (sb->sb_flags & SB_UNIX);
1328
1329 if (error_out)
1330 *error_out = 0;
1331
1332 if (sb->sb_flags & SB_DROP) {
1333 if (m0 != NULL)
1334 m_freem(m0);
1335 if (control != NULL && !sb_unix)
1336 m_freem(control);
1337 if (error_out != NULL)
1338 *error_out = EINVAL;
1339 return (0);
1340 }
1341
1342 if (sb->sb_flags & SB_RECV && !(m0 && m0->m_flags & M_SKIPCFIL)) {
1343 int error;
1344
1345 error = sflt_data_in(sb->sb_so, NULL, &m0, &control, 0);
1346 SBLASTRECORDCHK(sb, __func__);
1347
1348#if CONTENT_FILTER
1349 if (error == 0)
1350 error = cfil_sock_data_in(sb->sb_so, NULL, m0, control,
1351 0);
1352#endif /* CONTENT_FILTER */
1353
1354 if (error) {
1355 if (error != EJUSTRETURN) {
1356 if (m0)
1357 m_freem(m0);
1358 if (control != NULL && !sb_unix)
1359 m_freem(control);
1360 if (error_out)
1361 *error_out = error;
1362 }
1363 return (0);
1364 }
1365 } else if (m0) {
1366 m0->m_flags &= ~M_SKIPCFIL;
1367 }
1368
1369 result = sbappendcontrol_internal(sb, m0, control);
1370 if (result == 0) {
1371 if (m0)
1372 m_freem(m0);
1373 if (control != NULL && !sb_unix)
1374 m_freem(control);
1375 if (error_out)
1376 *error_out = ENOBUFS;
1377 }
1378
1379 return (result);
1380}
1381
1382/*
1383 * Append a contiguous TCP data blob with TCP sequence number as control data
1384 * as a new msg to the receive socket buffer.
1385 */
1386int
1387sbappendmsgstream_rcv(struct sockbuf *sb, struct mbuf *m, uint32_t seqnum,
1388 int unordered)
1389{
1390 struct mbuf *m_eor = NULL;
1391 u_int32_t data_len = 0;
1392 int ret = 0;
1393 struct socket *so = sb->sb_so;
1394
1395 if (m == NULL)
1396 return (0);
1397
1398 VERIFY((m->m_flags & M_PKTHDR) && m_pktlen(m) > 0);
1399 VERIFY(so->so_msg_state != NULL);
1400 VERIFY(sb->sb_flags & SB_RECV);
1401
1402 /* Keep the TCP sequence number in the mbuf pkthdr */
1403 m->m_pkthdr.msg_seq = seqnum;
1404
1405 /* find last mbuf and set M_EOR */
1406 for (m_eor = m; ; m_eor = m_eor->m_next) {
1407 /*
1408 * If the msg is unordered, we need to account for
1409 * these bytes in receive socket buffer size. Otherwise,
1410 * the receive window advertised will shrink because
1411 * of the additional unordered bytes added to the
1412 * receive buffer.
1413 */
1414 if (unordered) {
1415 m_eor->m_flags |= M_UNORDERED_DATA;
1416 data_len += m_eor->m_len;
1417 so->so_msg_state->msg_uno_bytes += m_eor->m_len;
1418 } else {
1419 m_eor->m_flags &= ~M_UNORDERED_DATA;
1420 }
1421 if (m_eor->m_next == NULL)
1422 break;
1423 }
1424
1425 /* set EOR flag at end of byte blob */
1426 m_eor->m_flags |= M_EOR;
1427
1428 /* expand the receive socket buffer to allow unordered data */
1429 if (unordered && !sbreserve(sb, sb->sb_hiwat + data_len)) {
		/*
		 * Could not allocate memory for unordered data; this
		 * packet will have to be delivered in order.
		 */
1434 printf("%s: could not reserve space for unordered data\n",
1435 __func__);
1436 }
1437
1438 if (!unordered && (sb->sb_mbtail != NULL) &&
1439 !(sb->sb_mbtail->m_flags & M_UNORDERED_DATA)) {
1440 sb->sb_mbtail->m_flags &= ~M_EOR;
1441 sbcompress(sb, m, sb->sb_mbtail);
1442 ret = 1;
1443 } else {
1444 ret = sbappendrecord(sb, m);
1445 }
1446 VERIFY(sb->sb_mbtail->m_flags & M_EOR);
1447 return (ret);
1448}
1449
/*
 * Append to the receive socket buffer of a TCP stream, demultiplexing among
 * streams with message-based out-of-order delivery support, Multipath TCP
 * subflows, and regular TCP sockets.
 */
1454int
1455sbappendstream_rcvdemux(struct socket *so, struct mbuf *m, uint32_t seqnum,
1456 int unordered)
1457{
1458 int ret = 0;
1459
1460 if ((m != NULL) &&
1461 m_pktlen(m) <= 0 &&
1462 !((so->so_flags & SOF_MP_SUBFLOW) &&
1463 (m->m_flags & M_PKTHDR) &&
1464 (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
1465 m_freem(m);
1466 return (ret);
1467 }
1468
1469 if (so->so_flags & SOF_ENABLE_MSGS) {
1470 ret = sbappendmsgstream_rcv(&so->so_rcv, m, seqnum, unordered);
1471 }
1472#if MPTCP
1473 else if (so->so_flags & SOF_MP_SUBFLOW) {
1474 ret = sbappendmptcpstream_rcv(&so->so_rcv, m);
1475 }
1476#endif /* MPTCP */
1477 else {
1478 ret = sbappendstream(&so->so_rcv, m);
1479 }
1480 return (ret);
1481}
1482
1483#if MPTCP
1484int
1485sbappendmptcpstream_rcv(struct sockbuf *sb, struct mbuf *m)
1486{
1487 struct socket *so = sb->sb_so;
1488
1489 VERIFY(m == NULL || (m->m_flags & M_PKTHDR));
	/* SB_NOCOMPRESS must be set to prevent loss of M_PKTHDR data */
1491 VERIFY((sb->sb_flags & (SB_RECV|SB_NOCOMPRESS)) ==
1492 (SB_RECV|SB_NOCOMPRESS));
1493
1494 if (m == NULL || m_pktlen(m) == 0 || (sb->sb_flags & SB_DROP) ||
1495 (so->so_state & SS_CANTRCVMORE)) {
1496 if (m && (m->m_flags & M_PKTHDR) &&
1497 m_pktlen(m) == 0 &&
1498 (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
1499 mptcp_input(tptomptp(sototcpcb(so))->mpt_mpte, m);
1500 return (1);
1501 } else if (m != NULL) {
1502 m_freem(m);
1503 }
1504 return (0);
1505 }
1506 /* the socket is not closed, so SOF_MP_SUBFLOW must be set */
1507 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1508
1509 if (m->m_nextpkt != NULL || (sb->sb_mb != sb->sb_lastrecord)) {
		panic("%s: nextpkt %p || mb %p != lastrecord %p\n", __func__,
1511 m->m_nextpkt, sb->sb_mb, sb->sb_lastrecord);
1512 /* NOTREACHED */
1513 }
1514
1515 SBLASTMBUFCHK(sb, __func__);
1516
1517 /* No filter support (SB_RECV) on mptcp subflow sockets */
1518
1519 sbcompress(sb, m, sb->sb_mbtail);
1520 sb->sb_lastrecord = sb->sb_mb;
1521 SBLASTRECORDCHK(sb, __func__);
1522 return (1);
1523}
1524#endif /* MPTCP */
1525
1526/*
1527 * Append message to send socket buffer based on priority.
1528 */
1529int
1530sbappendmsg_snd(struct sockbuf *sb, struct mbuf *m)
1531{
1532 struct socket *so = sb->sb_so;
1533 struct msg_priq *priq;
1534 int set_eor = 0;
1535
1536 VERIFY(so->so_msg_state != NULL);
1537
1538 if (m->m_nextpkt != NULL || (sb->sb_mb != sb->sb_lastrecord))
		panic("sbappendmsg_snd: nextpkt %p || mb %p != lastrecord %p\n",
1540 m->m_nextpkt, sb->sb_mb, sb->sb_lastrecord);
1541
1542 SBLASTMBUFCHK(sb, __func__);
1543
1544 if (m == NULL || (sb->sb_flags & SB_DROP) || so->so_msg_state == NULL) {
1545 if (m != NULL)
1546 m_freem(m);
1547 return (0);
1548 }
1549
1550 priq = &so->so_msg_state->msg_priq[m->m_pkthdr.msg_pri];
1551
	/* note if we need to propagate M_EOR to the last mbuf */
1553 if (m->m_flags & M_EOR) {
1554 set_eor = 1;
1555
1556 /* Reset M_EOR from the first mbuf */
1557 m->m_flags &= ~(M_EOR);
1558 }
1559
1560 if (priq->msgq_head == NULL) {
1561 VERIFY(priq->msgq_tail == NULL && priq->msgq_lastmsg == NULL);
1562 priq->msgq_head = priq->msgq_lastmsg = m;
1563 } else {
1564 VERIFY(priq->msgq_tail->m_next == NULL);
1565
1566 /* Check if the last message has M_EOR flag set */
1567 if (priq->msgq_tail->m_flags & M_EOR) {
1568 /* Insert as a new message */
1569 priq->msgq_lastmsg->m_nextpkt = m;
1570
1571 /* move the lastmsg pointer */
1572 priq->msgq_lastmsg = m;
1573 } else {
1574 /* Append to the existing message */
1575 priq->msgq_tail->m_next = m;
1576 }
1577 }
1578
1579 /* Update accounting and the queue tail pointer */
1580
1581 while (m->m_next != NULL) {
1582 sballoc(sb, m);
1583 priq->msgq_bytes += m->m_len;
1584 m = m->m_next;
1585 }
1586 sballoc(sb, m);
1587 priq->msgq_bytes += m->m_len;
1588
1589 if (set_eor) {
1590 m->m_flags |= M_EOR;
1591
		/*
		 * Since userspace cannot write a new msg
		 * without completing the previous one, we can
		 * reset this flag to start sending again.
		 */
1597 priq->msgq_flags &= ~(MSGQ_MSG_NOTDONE);
1598 }
1599
1600 priq->msgq_tail = m;
1601
1602 SBLASTRECORDCHK(sb, "sbappendstream 2");
1603 postevent(0, sb, EV_RWBYTES);
1604 return (1);
1605}
1606
1607/*
1608 * Pull data from priority queues to the serial snd queue
1609 * right before sending.
1610 */
1611void
1612sbpull_unordered_data(struct socket *so, int32_t off, int32_t len)
1613{
1614 int32_t topull, i;
1615 struct msg_priq *priq = NULL;
1616
1617 VERIFY(so->so_msg_state != NULL);
1618
1619 topull = (off + len) - so->so_msg_state->msg_serial_bytes;
1620
1621 i = MSG_PRI_MAX;
1622 while (i >= MSG_PRI_MIN && topull > 0) {
1623 struct mbuf *m = NULL, *mqhead = NULL, *mend = NULL;
1624 priq = &so->so_msg_state->msg_priq[i];
1625 if ((priq->msgq_flags & MSGQ_MSG_NOTDONE) &&
1626 priq->msgq_head == NULL) {
1627 /*
1628 * We were in the middle of sending
1629 * a message and we have not seen the
1630 * end of it.
1631 */
1632 VERIFY(priq->msgq_lastmsg == NULL &&
1633 priq->msgq_tail == NULL);
1634 return;
1635 }
1636 if (priq->msgq_head != NULL) {
1637 int32_t bytes = 0, topull_tmp = topull;
1638 /*
1639 * We found a msg while scanning the priority
1640 * queue from high to low priority.
1641 */
1642 m = priq->msgq_head;
1643 mqhead = m;
1644 mend = m;
1645
1646 /*
1647 * Move bytes from the priority queue to the
1648 * serial queue. Compute the number of bytes
1649 * being added.
1650 */
1651 while (mqhead->m_next != NULL && topull_tmp > 0) {
1652 bytes += mqhead->m_len;
1653 topull_tmp -= mqhead->m_len;
1654 mend = mqhead;
1655 mqhead = mqhead->m_next;
1656 }
1657
1658 if (mqhead->m_next == NULL) {
				/*
				 * If we have only one more mbuf left,
				 * move the last mbuf of this message to
				 * the serial queue and set the head of
				 * the queue to be the next message.
				 */
1665 bytes += mqhead->m_len;
1666 mend = mqhead;
1667 mqhead = m->m_nextpkt;
1668 if (!(mend->m_flags & M_EOR)) {
					/*
					 * We have not seen the end of
					 * this message, so we cannot
					 * pull any more.
					 */
1674 priq->msgq_flags |= MSGQ_MSG_NOTDONE;
1675 } else {
1676 /* Reset M_EOR */
1677 mend->m_flags &= ~(M_EOR);
1678 }
1679 } else {
				/* propagate the next msg pointer */
1681 mqhead->m_nextpkt = m->m_nextpkt;
1682 }
1683 priq->msgq_head = mqhead;
1684
1685 /*
1686 * if the lastmsg pointer points to
1687 * the mbuf that is being dequeued, update
1688 * it to point to the new head.
1689 */
1690 if (priq->msgq_lastmsg == m)
1691 priq->msgq_lastmsg = priq->msgq_head;
1692
1693 m->m_nextpkt = NULL;
1694 mend->m_next = NULL;
1695
1696 if (priq->msgq_head == NULL) {
1697 /* Moved all messages, update tail */
1698 priq->msgq_tail = NULL;
1699 VERIFY(priq->msgq_lastmsg == NULL);
1700 }
1701
1702 /* Move it to serial sb_mb queue */
1703 if (so->so_snd.sb_mb == NULL) {
1704 so->so_snd.sb_mb = m;
1705 } else {
1706 so->so_snd.sb_mbtail->m_next = m;
1707 }
1708
1709 priq->msgq_bytes -= bytes;
1710 VERIFY(priq->msgq_bytes >= 0);
1711 sbwakeup(&so->so_snd);
1712
1713 so->so_msg_state->msg_serial_bytes += bytes;
1714 so->so_snd.sb_mbtail = mend;
1715 so->so_snd.sb_lastrecord = so->so_snd.sb_mb;
1716
1717 topull =
1718 (off + len) - so->so_msg_state->msg_serial_bytes;
1719
1720 if (priq->msgq_flags & MSGQ_MSG_NOTDONE)
1721 break;
1722 } else {
1723 --i;
1724 }
1725 }
1726 sblastrecordchk(&so->so_snd, "sbpull_unordered_data");
1727 sblastmbufchk(&so->so_snd, "sbpull_unordered_data");
1728}
1729
1730/*
1731 * Compress mbuf chain m into the socket
1732 * buffer sb following mbuf n. If n
1733 * is null, the buffer is presumed empty.
1734 */
1735static inline void
1736sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
1737{
1738 int eor = 0, compress = (!(sb->sb_flags & SB_NOCOMPRESS));
1739 struct mbuf *o;
1740
1741 if (m == NULL) {
1742 /* There is nothing to compress; just update the tail */
1743 for (; n->m_next != NULL; n = n->m_next)
1744 ;
1745 sb->sb_mbtail = n;
1746 goto done;
1747 }
1748
1749 while (m != NULL) {
1750 eor |= m->m_flags & M_EOR;
1751 if (compress && m->m_len == 0 && (eor == 0 ||
1752 (((o = m->m_next) || (o = n)) && o->m_type == m->m_type))) {
1753 if (sb->sb_lastrecord == m)
1754 sb->sb_lastrecord = m->m_next;
1755 m = m_free(m);
1756 continue;
1757 }
1758 if (compress && n != NULL && (n->m_flags & M_EOR) == 0 &&
1759#ifndef __APPLE__
1760 M_WRITABLE(n) &&
1761#endif
1762 m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
1763 m->m_len <= M_TRAILINGSPACE(n) &&
1764 n->m_type == m->m_type) {
1765 bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
1766 (unsigned)m->m_len);
1767 n->m_len += m->m_len;
1768 sb->sb_cc += m->m_len;
1769 if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
1770 m->m_type != MT_OOBDATA) {
1771 /* XXX: Probably don't need */
1772 sb->sb_ctl += m->m_len;
1773 }
1774
1775 /* update send byte count */
1776 if (sb->sb_flags & SB_SNDBYTE_CNT) {
1777 inp_incr_sndbytes_total(sb->sb_so,
1778 m->m_len);
1779 inp_incr_sndbytes_unsent(sb->sb_so,
1780 m->m_len);
1781 }
1782 m = m_free(m);
1783 continue;
1784 }
1785 if (n != NULL)
1786 n->m_next = m;
1787 else
1788 sb->sb_mb = m;
1789 sb->sb_mbtail = m;
1790 sballoc(sb, m);
1791 n = m;
1792 m->m_flags &= ~M_EOR;
1793 m = m->m_next;
1794 n->m_next = NULL;
1795 }
1796 if (eor != 0) {
1797 if (n != NULL)
1798 n->m_flags |= eor;
1799 else
1800 printf("semi-panic: sbcompress\n");
1801 }
1802done:
1803 SBLASTMBUFCHK(sb, __func__);
1804 postevent(0, sb, EV_RWBYTES);
1805}
1806
1807void
1808sb_empty_assert(struct sockbuf *sb, const char *where)
1809{
1810 if (!(sb->sb_cc == 0 && sb->sb_mb == NULL && sb->sb_mbcnt == 0 &&
1811 sb->sb_mbtail == NULL && sb->sb_lastrecord == NULL)) {
1812 panic("%s: sb %p so %p cc %d mbcnt %d mb %p mbtail %p "
1813 "lastrecord %p\n", where, sb, sb->sb_so, sb->sb_cc,
1814 sb->sb_mbcnt, sb->sb_mb, sb->sb_mbtail,
1815 sb->sb_lastrecord);
1816 /* NOTREACHED */
1817 }
1818}
1819
1820static void
1821sbflush_priq(struct msg_priq *priq)
1822{
1823 struct mbuf *m;
1824 m = priq->msgq_head;
1825 if (m != NULL)
1826 m_freem_list(m);
1827 priq->msgq_head = priq->msgq_tail = priq->msgq_lastmsg = NULL;
1828 priq->msgq_bytes = priq->msgq_flags = 0;
1829}
1830
1831/*
1832 * Free all mbufs in a sockbuf.
1833 * Check that all resources are reclaimed.
1834 */
1835void
1836sbflush(struct sockbuf *sb)
1837{
1838 void *lr_saved = __builtin_return_address(0);
1839 struct socket *so = sb->sb_so;
1840 u_int32_t i;
1841
1842 /* so_usecount may be 0 if we get here from sofreelastref() */
1843 if (so == NULL) {
1844 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
1845 __func__, sb, sb->sb_flags, lr_saved);
1846 /* NOTREACHED */
1847 } else if (so->so_usecount < 0) {
1848 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
1849 "lrh= %s\n", __func__, sb, sb->sb_flags, so,
1850 so->so_usecount, lr_saved, solockhistory_nr(so));
1851 /* NOTREACHED */
1852 }
1853
1854 /*
1855 * Obtain lock on the socket buffer (SB_LOCK). This is required
1856 * to prevent the socket buffer from being unexpectedly altered
1857 * while it is used by another thread in socket send/receive.
1858 *
1859 * sblock() must not fail here, hence the assertion.
1860 */
1861 (void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
1862 VERIFY(sb->sb_flags & SB_LOCK);
1863
1864 while (sb->sb_mbcnt > 0) {
		/*
		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
		 * we would loop forever.  Bail out of the loop instead.
		 */
1869 if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
1870 break;
1871 sbdrop(sb, (int)sb->sb_cc);
1872 }
1873
1874 if (!(sb->sb_flags & SB_RECV) && (so->so_flags & SOF_ENABLE_MSGS)) {
1875 VERIFY(so->so_msg_state != NULL);
1876 for (i = MSG_PRI_MIN; i <= MSG_PRI_MAX; ++i) {
1877 sbflush_priq(&so->so_msg_state->msg_priq[i]);
1878 }
1879 so->so_msg_state->msg_serial_bytes = 0;
1880 so->so_msg_state->msg_uno_bytes = 0;
1881 }
1882
1883 sb_empty_assert(sb, __func__);
1884 postevent(0, sb, EV_RWBYTES);
1885
1886 sbunlock(sb, TRUE); /* keep socket locked */
1887}
1888
/*
 * Drop data from (the front of) a sockbuf.
 * Use m_freem_list to free the mbuf structures under a single lock.
 * This is done by pruning the top of the tree from the body: we keep
 * track of how far we get into the tree and then zero the two
 * pertinent pointers, m_nextpkt and m_next.  The socket buffer is
 * then updated to point at the new top of the tree and the pruned
 * area is released via m_freem_list.
 */
1900void
1901sbdrop(struct sockbuf *sb, int len)
1902{
1903 struct mbuf *m, *free_list, *ml;
1904 struct mbuf *next, *last;
1905
1906 next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
1907#if MPTCP
1908 if (m != NULL && len > 0 && !(sb->sb_flags & SB_RECV) &&
1909 ((sb->sb_so->so_flags & SOF_MP_SUBFLOW) ||
1910 (SOCK_CHECK_DOM(sb->sb_so, PF_MULTIPATH) &&
1911 SOCK_CHECK_PROTO(sb->sb_so, IPPROTO_TCP))) &&
1912 !(sb->sb_so->so_flags1 & SOF1_POST_FALLBACK_SYNC)) {
1913 mptcp_preproc_sbdrop(sb->sb_so, m, (unsigned int)len);
1914 }
1915 if (m != NULL && len > 0 && !(sb->sb_flags & SB_RECV) &&
1916 (sb->sb_so->so_flags & SOF_MP_SUBFLOW) &&
1917 (sb->sb_so->so_flags1 & SOF1_POST_FALLBACK_SYNC)) {
1918 mptcp_fallback_sbdrop(sb->sb_so, m, len);
1919 }
1920#endif /* MPTCP */
1921 KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_START), sb, len, 0, 0, 0);
1922
1923 free_list = last = m;
1924 ml = (struct mbuf *)0;
1925
1926 while (len > 0) {
1927 if (m == NULL) {
1928 if (next == NULL) {
				/*
				 * Temporarily replacing this panic with a
				 * printf: it occurs occasionally when closing
				 * a socket, and in that case there is no harm
				 * in ignoring it.  This problem will be
				 * investigated further.
				 */
1936 /* panic("sbdrop"); */
1937 printf("sbdrop - count not zero\n");
1938 len = 0;
1939 /*
1940 * zero the counts. if we have no mbufs,
1941 * we have no data (PR-2986815)
1942 */
1943 sb->sb_cc = 0;
1944 sb->sb_mbcnt = 0;
1945 if (!(sb->sb_flags & SB_RECV) &&
1946 (sb->sb_so->so_flags & SOF_ENABLE_MSGS)) {
1947 sb->sb_so->so_msg_state->
1948 msg_serial_bytes = 0;
1949 }
1950 break;
1951 }
1952 m = last = next;
1953 next = m->m_nextpkt;
1954 continue;
1955 }
1956 if (m->m_len > len) {
1957 m->m_len -= len;
1958 m->m_data += len;
1959 sb->sb_cc -= len;
1960 /* update the send byte count */
1961 if (sb->sb_flags & SB_SNDBYTE_CNT)
1962 inp_decr_sndbytes_total(sb->sb_so, len);
1963 if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
1964 m->m_type != MT_OOBDATA)
1965 sb->sb_ctl -= len;
1966 break;
1967 }
1968 len -= m->m_len;
1969 sbfree(sb, m);
1970
1971 ml = m;
1972 m = m->m_next;
1973 }
1974 while (m && m->m_len == 0) {
1975 sbfree(sb, m);
1976
1977 ml = m;
1978 m = m->m_next;
1979 }
1980 if (ml) {
1981 ml->m_next = (struct mbuf *)0;
1982 last->m_nextpkt = (struct mbuf *)0;
1983 m_freem_list(free_list);
1984 }
1985 if (m) {
1986 sb->sb_mb = m;
1987 m->m_nextpkt = next;
1988 } else {
1989 sb->sb_mb = next;
1990 }
1991
1992 /*
1993 * First part is an inline SB_EMPTY_FIXUP(). Second part
1994 * makes sure sb_lastrecord is up-to-date if we dropped
1995 * part of the last record.
1996 */
1997 m = sb->sb_mb;
1998 if (m == NULL) {
1999 sb->sb_mbtail = NULL;
2000 sb->sb_lastrecord = NULL;
2001 } else if (m->m_nextpkt == NULL) {
2002 sb->sb_lastrecord = m;
2003 }
2004
2005#if CONTENT_FILTER
2006 cfil_sock_buf_update(sb);
2007#endif /* CONTENT_FILTER */
2008
2009 postevent(0, sb, EV_RWBYTES);
2010
2011 KERNEL_DEBUG((DBG_FNC_SBDROP | DBG_FUNC_END), sb, 0, 0, 0, 0);
2012}
2013
2014/*
2015 * Drop a record off the front of a sockbuf
2016 * and move the next record to the front.
2017 */
2018void
2019sbdroprecord(struct sockbuf *sb)
2020{
2021 struct mbuf *m, *mn;
2022
2023 m = sb->sb_mb;
2024 if (m) {
2025 sb->sb_mb = m->m_nextpkt;
2026 do {
2027 sbfree(sb, m);
2028 MFREE(m, mn);
2029 m = mn;
2030 } while (m);
2031 }
2032 SB_EMPTY_FIXUP(sb);
2033 postevent(0, sb, EV_RWBYTES);
2034}
2035
2036/*
2037 * Create a "control" mbuf containing the specified data
2038 * with the specified type for presentation on a socket buffer.
2039 */
2040struct mbuf *
2041sbcreatecontrol(caddr_t p, int size, int type, int level)
2042{
2043 struct cmsghdr *cp;
2044 struct mbuf *m;
2045
2046 if (CMSG_SPACE((u_int)size) > MLEN)
2047 return ((struct mbuf *)NULL);
2048 if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
2049 return ((struct mbuf *)NULL);
2050 cp = mtod(m, struct cmsghdr *);
2051 VERIFY(IS_P2ALIGNED(cp, sizeof (u_int32_t)));
2052 /* XXX check size? */
2053 (void) memcpy(CMSG_DATA(cp), p, size);
2054 m->m_len = CMSG_SPACE(size);
2055 cp->cmsg_len = CMSG_LEN(size);
2056 cp->cmsg_level = level;
2057 cp->cmsg_type = type;
2058 return (m);
2059}
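
/*
 * Illustrative use only (the option names below are just an example):
 * a protocol input path could attach a one-byte ancillary datum for
 * delivery with recvmsg(2), e.g.
 *
 *	u_char tos = 0;
 *	struct mbuf *cm = sbcreatecontrol((caddr_t)&tos, sizeof (tos),
 *	    IP_RECVTOS, IPPROTO_IP);
 *
 * A NULL return means the payload would not fit in a single mbuf, or
 * that no mbuf could be allocated.
 */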
2060
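/*
 * Add a control message to the chain rooted at *mp: if the data fits
 * in the mbuf already at *mp it is appended there (CMSG_SPACE keeps
 * the new cmsghdr 32-bit aligned), otherwise a new control mbuf is
 * allocated and linked in.  Returns a pointer to the slot that holds
 * the mbuf used; on allocation failure that slot contains NULL.
 */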
2061struct mbuf **
2062sbcreatecontrol_mbuf(caddr_t p, int size, int type, int level, struct mbuf **mp)
2063{
2064 struct mbuf *m;
2065 struct cmsghdr *cp;
2066
2067 if (*mp == NULL) {
2068 *mp = sbcreatecontrol(p, size, type, level);
2069 return (mp);
2070 }
2071
2072 if (CMSG_SPACE((u_int)size) + (*mp)->m_len > MLEN) {
2073 mp = &(*mp)->m_next;
2074 *mp = sbcreatecontrol(p, size, type, level);
2075 return (mp);
2076 }
2077
2078 m = *mp;
2079
2080 cp = (struct cmsghdr *)(void *)(mtod(m, char *) + m->m_len);
2081 /* CMSG_SPACE ensures 32-bit alignment */
2082 VERIFY(IS_P2ALIGNED(cp, sizeof (u_int32_t)));
2083 m->m_len += CMSG_SPACE(size);
2084
2085 /* XXX check size? */
2086 (void) memcpy(CMSG_DATA(cp), p, size);
2087 cp->cmsg_len = CMSG_LEN(size);
2088 cp->cmsg_level = level;
2089 cp->cmsg_type = type;
2090
2091 return (mp);
2092}
2093
2094
2095/*
2096 * Some routines that return EOPNOTSUPP for entry points that are not
2097 * supported by a protocol. Fill in as needed.
2098 */
2099int
2100pru_abort_notsupp(struct socket *so)
2101{
2102#pragma unused(so)
2103 return (EOPNOTSUPP);
2104}
2105
2106int
2107pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
2108{
2109#pragma unused(so, nam)
2110 return (EOPNOTSUPP);
2111}
2112
2113int
2114pru_attach_notsupp(struct socket *so, int proto, struct proc *p)
2115{
2116#pragma unused(so, proto, p)
2117 return (EOPNOTSUPP);
2118}
2119
2120int
2121pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
2122{
2123#pragma unused(so, nam, p)
2124 return (EOPNOTSUPP);
2125}
2126
2127int
2128pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p)
2129{
2130#pragma unused(so, nam, p)
2131 return (EOPNOTSUPP);
2132}
2133
2134int
2135pru_connect2_notsupp(struct socket *so1, struct socket *so2)
2136{
2137#pragma unused(so1, so2)
2138 return (EOPNOTSUPP);
2139}
2140
2141int
2142pru_connectx_notsupp(struct socket *so, struct sockaddr *src,
2143 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
2144 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
2145 uint32_t arglen, struct uio *uio, user_ssize_t *bytes_written)
2146{
2147#pragma unused(so, src, dst, p, ifscope, aid, pcid, flags, arg, arglen, uio, bytes_written)
2148 return (EOPNOTSUPP);
2149}
2150
2151int
2152pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
2153 struct ifnet *ifp, struct proc *p)
2154{
2155#pragma unused(so, cmd, data, ifp, p)
2156 return (EOPNOTSUPP);
2157}
2158
2159int
2160pru_detach_notsupp(struct socket *so)
2161{
2162#pragma unused(so)
2163 return (EOPNOTSUPP);
2164}
2165
2166int
2167pru_disconnect_notsupp(struct socket *so)
2168{
2169#pragma unused(so)
2170 return (EOPNOTSUPP);
2171}
2172
2173int
2174pru_disconnectx_notsupp(struct socket *so, sae_associd_t aid, sae_connid_t cid)
2175{
2176#pragma unused(so, aid, cid)
2177 return (EOPNOTSUPP);
2178}
2179
2180int
2181pru_listen_notsupp(struct socket *so, struct proc *p)
2182{
2183#pragma unused(so, p)
2184 return (EOPNOTSUPP);
2185}
2186
2187int
2188pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
2189{
2190#pragma unused(so, nam)
2191 return (EOPNOTSUPP);
2192}
2193
2194int
2195pru_rcvd_notsupp(struct socket *so, int flags)
2196{
2197#pragma unused(so, flags)
2198 return (EOPNOTSUPP);
2199}
2200
2201int
2202pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
2203{
2204#pragma unused(so, m, flags)
2205 return (EOPNOTSUPP);
2206}
2207
2208int
2209pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
2210 struct sockaddr *addr, struct mbuf *control, struct proc *p)
2211{
2212#pragma unused(so, flags, m, addr, control, p)
2213 return (EOPNOTSUPP);
2214}
2215
2216int
2217pru_send_list_notsupp(struct socket *so, int flags, struct mbuf *m,
2218 struct sockaddr *addr, struct mbuf *control, struct proc *p)
2219{
2220#pragma unused(so, flags, m, addr, control, p)
2221 return (EOPNOTSUPP);
2222}
2223
2224/*
2225 * This isn't really a ``null'' operation, but it's the default one
2226 * and doesn't do anything destructive.
2227 */
2228int
2229pru_sense_null(struct socket *so, void *ub, int isstat64)
2230{
2231 if (isstat64 != 0) {
2232 struct stat64 *sb64;
2233
2234 sb64 = (struct stat64 *)ub;
2235 sb64->st_blksize = so->so_snd.sb_hiwat;
2236 } else {
2237 struct stat *sb;
2238
2239 sb = (struct stat *)ub;
2240 sb->st_blksize = so->so_snd.sb_hiwat;
2241 }
2242
2243 return (0);
2244}
2245
2246
2247int
2248pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
2249 struct mbuf *top, struct mbuf *control, int flags)
2250{
2251#pragma unused(so, addr, uio, top, control, flags)
2252 return (EOPNOTSUPP);
2253}
2254
2255int
2256pru_sosend_list_notsupp(struct socket *so, struct uio **uio,
2257 u_int uiocnt, int flags)
2258{
2259#pragma unused(so, uio, uiocnt, flags)
2260 return (EOPNOTSUPP);
2261}
2262
2263int
2264pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
2265 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2266{
2267#pragma unused(so, paddr, uio, mp0, controlp, flagsp)
2268 return (EOPNOTSUPP);
2269}
2270
2271int
2272pru_soreceive_list_notsupp(struct socket *so,
2273 struct recv_msg_elem *recv_msg_array, u_int uiocnt, int *flagsp)
2274{
2275#pragma unused(so, recv_msg_array, uiocnt, flagsp)
2276 return (EOPNOTSUPP);
2277}
2278
2279int
2280pru_shutdown_notsupp(struct socket *so)
2281{
2282#pragma unused(so)
2283 return (EOPNOTSUPP);
2284}
2285
2286int
2287pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
2288{
2289#pragma unused(so, nam)
2290 return (EOPNOTSUPP);
2291}
2292
2293int
2294pru_sopoll_notsupp(struct socket *so, int events, kauth_cred_t cred, void *wql)
2295{
2296#pragma unused(so, events, cred, wql)
2297 return (EOPNOTSUPP);
2298}
2299
2300int
2301pru_socheckopt_null(struct socket *so, struct sockopt *sopt)
2302{
2303#pragma unused(so, sopt)
2304 /*
2305 * Allow all options for set/get by default.
2306 */
2307 return (0);
2308}
2309
2310static int
2311pru_preconnect_null(struct socket *so)
2312{
2313#pragma unused(so)
2314 return (0);
2315}
2316
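/*
 * Replace any NULL entries in a protocol's pr_usrreqs with the
 * corresponding default stub above, so that callers never need to
 * check for a missing entry point.
 */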
2317void
2318pru_sanitize(struct pr_usrreqs *pru)
2319{
2320#define DEFAULT(foo, bar) if ((foo) == NULL) (foo) = (bar)
2321 DEFAULT(pru->pru_abort, pru_abort_notsupp);
2322 DEFAULT(pru->pru_accept, pru_accept_notsupp);
2323 DEFAULT(pru->pru_attach, pru_attach_notsupp);
2324 DEFAULT(pru->pru_bind, pru_bind_notsupp);
2325 DEFAULT(pru->pru_connect, pru_connect_notsupp);
2326 DEFAULT(pru->pru_connect2, pru_connect2_notsupp);
2327 DEFAULT(pru->pru_connectx, pru_connectx_notsupp);
2328 DEFAULT(pru->pru_control, pru_control_notsupp);
2329 DEFAULT(pru->pru_detach, pru_detach_notsupp);
2330 DEFAULT(pru->pru_disconnect, pru_disconnect_notsupp);
2331 DEFAULT(pru->pru_disconnectx, pru_disconnectx_notsupp);
2332 DEFAULT(pru->pru_listen, pru_listen_notsupp);
2333 DEFAULT(pru->pru_peeraddr, pru_peeraddr_notsupp);
2334 DEFAULT(pru->pru_rcvd, pru_rcvd_notsupp);
2335 DEFAULT(pru->pru_rcvoob, pru_rcvoob_notsupp);
2336 DEFAULT(pru->pru_send, pru_send_notsupp);
2337 DEFAULT(pru->pru_send_list, pru_send_list_notsupp);
2338 DEFAULT(pru->pru_sense, pru_sense_null);
2339 DEFAULT(pru->pru_shutdown, pru_shutdown_notsupp);
2340 DEFAULT(pru->pru_sockaddr, pru_sockaddr_notsupp);
2341 DEFAULT(pru->pru_sopoll, pru_sopoll_notsupp);
2342 DEFAULT(pru->pru_soreceive, pru_soreceive_notsupp);
2343 DEFAULT(pru->pru_soreceive_list, pru_soreceive_list_notsupp);
2344 DEFAULT(pru->pru_sosend, pru_sosend_notsupp);
2345 DEFAULT(pru->pru_sosend_list, pru_sosend_list_notsupp);
2346 DEFAULT(pru->pru_socheckopt, pru_socheckopt_null);
2347 DEFAULT(pru->pru_preconnect, pru_preconnect_null);
2348#undef DEFAULT
2349}
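
/*
 * Sketch of typical use (hypothetical protocol; names invented for
 * illustration): a protocol that implements only a few entry points
 * can leave the rest NULL and rely on pru_sanitize() to install the
 * stubs, e.g.
 *
 *	static struct pr_usrreqs foo_usrreqs = {
 *		.pru_attach =	foo_attach,
 *		.pru_detach =	foo_detach,
 *		.pru_send =	foo_send,
 *	};
 *	...
 *	pru_sanitize(&foo_usrreqs);
 */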
2350
2351/*
2352 * The following are macros on BSD and functions on Darwin
2353 */
2354
2355/*
2356 * Do we need to notify the other side when I/O is possible?
2357 */
2358
2359int
2360sb_notify(struct sockbuf *sb)
2361{
2362 return (sb->sb_waiters > 0 ||
2363 (sb->sb_flags & (SB_SEL|SB_ASYNC|SB_UPCALL|SB_KNOTE)));
2364}
2365
/*
 * How much space is there in a socket buffer (so->so_snd or so->so_rcv)?
 * This is problematic if the fields are unsigned, as the space might
 * still be negative (cc > hiwat or mbcnt > mbmax); a negative result
 * is clamped to 0 below.
 */
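/*
 * Worked example (illustrative numbers only): with sb_hiwat = 8192,
 * sb_cc = 2048, sb_mbmax = 65536 and sb_mbcnt = 4096, the data limit
 * leaves 8192 - 2048 = 6144 bytes and the mbuf limit leaves
 * 65536 - 4096 = 61440, so sbspace() reports min(6144, 61440) = 6144,
 * less any bytes still pending in a content filter.
 */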
2372int
2373sbspace(struct sockbuf *sb)
2374{
2375 int pending = 0;
2376 int space = imin((int)(sb->sb_hiwat - sb->sb_cc),
2377 (int)(sb->sb_mbmax - sb->sb_mbcnt));
2378
2379 if (sb->sb_preconn_hiwat != 0)
2380 space = imin((int)(sb->sb_preconn_hiwat - sb->sb_cc), space);
2381
2382 if (space < 0)
2383 space = 0;
2384
2385 /* Compensate for data being processed by content filters */
2386#if CONTENT_FILTER
2387 pending = cfil_sock_data_space(sb);
2388#endif /* CONTENT_FILTER */
2389 if (pending > space)
2390 space = 0;
2391 else
2392 space -= pending;
2393
2394 return (space);
2395}
2396
2397/*
2398 * If this socket has priority queues, check if there is enough
2399 * space in the priority queue for this msg.
2400 */
2401int
2402msgq_sbspace(struct socket *so, struct mbuf *control)
2403{
2404 int space = 0, error;
2405 u_int32_t msgpri = 0;
2406 VERIFY(so->so_type == SOCK_STREAM &&
2407 SOCK_PROTO(so) == IPPROTO_TCP);
2408 if (control != NULL) {
2409 error = tcp_get_msg_priority(control, &msgpri);
2410 if (error)
2411 return (0);
2412 } else {
2413 msgpri = MSG_PRI_0;
2414 }
2415 space = (so->so_snd.sb_idealsize / MSG_PRI_COUNT) -
2416 so->so_msg_state->msg_priq[msgpri].msgq_bytes;
2417 if (space < 0)
2418 space = 0;
2419 return (space);
2420}
2421
2422/* do we have to send all at once on a socket? */
2423int
2424sosendallatonce(struct socket *so)
2425{
2426 return (so->so_proto->pr_flags & PR_ATOMIC);
2427}
2428
2429/* can we read something from so? */
2430int
2431soreadable(struct socket *so)
2432{
2433 return (so->so_rcv.sb_cc >= so->so_rcv.sb_lowat ||
2434 ((so->so_state & SS_CANTRCVMORE)
2435#if CONTENT_FILTER
2436 && cfil_sock_data_pending(&so->so_rcv) == 0
2437#endif /* CONTENT_FILTER */
2438 ) ||
2439 so->so_comp.tqh_first || so->so_error);
2440}
2441
2442/* can we write something to so? */
2443
2444int
2445sowriteable(struct socket *so)
2446{
2447 if ((so->so_state & SS_CANTSENDMORE) ||
2448 so->so_error > 0)
2449 return (1);
2450 if (so_wait_for_if_feedback(so) || !socanwrite(so))
2451 return (0);
2452 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
		return (1);
2454
2455 if (sbspace(&(so)->so_snd) >= (so)->so_snd.sb_lowat) {
2456 if (so->so_flags & SOF_NOTSENT_LOWAT) {
2457 if ((SOCK_DOM(so) == PF_INET6 ||
2458 SOCK_DOM(so) == PF_INET) &&
2459 so->so_type == SOCK_STREAM) {
2460 return (tcp_notsent_lowat_check(so));
2461 }
2462#if MPTCP
2463 else if ((SOCK_DOM(so) == PF_MULTIPATH) &&
2464 (SOCK_PROTO(so) == IPPROTO_TCP)) {
2465 return (mptcp_notsent_lowat_check(so));
2466 }
2467#endif
2468 else {
2469 return (1);
2470 }
2471 } else {
2472 return (1);
2473 }
2474 }
2475 return (0);
2476}
2477
2478/* adjust counters in sb reflecting allocation of m */
2479
2480void
2481sballoc(struct sockbuf *sb, struct mbuf *m)
2482{
2483 u_int32_t cnt = 1;
2484 sb->sb_cc += m->m_len;
2485 if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
2486 m->m_type != MT_OOBDATA)
2487 sb->sb_ctl += m->m_len;
2488 sb->sb_mbcnt += MSIZE;
2489
2490 if (m->m_flags & M_EXT) {
2491 sb->sb_mbcnt += m->m_ext.ext_size;
2492 cnt += (m->m_ext.ext_size >> MSIZESHIFT);
2493 }
2494 OSAddAtomic(cnt, &total_sbmb_cnt);
2495 VERIFY(total_sbmb_cnt > 0);
2496 if (total_sbmb_cnt > total_sbmb_cnt_peak)
2497 total_sbmb_cnt_peak = total_sbmb_cnt;
2498
2499 /*
2500 * If data is being added to the send socket buffer,
2501 * update the send byte count
2502 */
2503 if (sb->sb_flags & SB_SNDBYTE_CNT) {
2504 inp_incr_sndbytes_total(sb->sb_so, m->m_len);
2505 inp_incr_sndbytes_unsent(sb->sb_so, m->m_len);
2506 }
2507}
2508
2509/* adjust counters in sb reflecting freeing of m */
2510void
2511sbfree(struct sockbuf *sb, struct mbuf *m)
2512{
2513 int cnt = -1;
2514
2515 sb->sb_cc -= m->m_len;
2516 if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
2517 m->m_type != MT_OOBDATA)
2518 sb->sb_ctl -= m->m_len;
2519 sb->sb_mbcnt -= MSIZE;
2520 if (m->m_flags & M_EXT) {
2521 sb->sb_mbcnt -= m->m_ext.ext_size;
2522 cnt -= (m->m_ext.ext_size >> MSIZESHIFT);
2523 }
2524 OSAddAtomic(cnt, &total_sbmb_cnt);
2525 VERIFY(total_sbmb_cnt >= 0);
2526 if (total_sbmb_cnt < total_sbmb_cnt_floor)
2527 total_sbmb_cnt_floor = total_sbmb_cnt;
2528
2529 /*
2530 * If data is being removed from the send socket buffer,
2531 * update the send byte count
2532 */
2533 if (sb->sb_flags & SB_SNDBYTE_CNT)
2534 inp_decr_sndbytes_total(sb->sb_so, m->m_len);
2535}
2536
2537/*
2538 * Set lock on sockbuf sb; sleep if lock is already held.
2539 * Unless SB_NOINTR is set on sockbuf, sleep is interruptible.
2540 * Returns error without lock if sleep is interrupted.
2541 */
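/*
 * Typical usage (see sbflush() above): a caller that must not fail
 * takes the lock with
 *
 *	(void) sblock(sb, SBL_WAIT | SBL_NOINTR | SBL_IGNDEFUNCT);
 *
 * and later releases it with sbunlock(sb, TRUE) to keep the socket
 * itself locked, or sbunlock(sb, FALSE) to drop the socket lock too.
 */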
2542int
2543sblock(struct sockbuf *sb, uint32_t flags)
2544{
2545 boolean_t nointr = ((sb->sb_flags & SB_NOINTR) || (flags & SBL_NOINTR));
2546 void *lr_saved = __builtin_return_address(0);
2547 struct socket *so = sb->sb_so;
2548 void * wchan;
2549 int error = 0;
2550 thread_t tp = current_thread();
2551
2552 VERIFY((flags & SBL_VALID) == flags);
2553
2554 /* so_usecount may be 0 if we get here from sofreelastref() */
2555 if (so == NULL) {
2556 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
2557 __func__, sb, sb->sb_flags, lr_saved);
2558 /* NOTREACHED */
2559 } else if (so->so_usecount < 0) {
2560 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
2561 "lrh= %s\n", __func__, sb, sb->sb_flags, so,
2562 so->so_usecount, lr_saved, solockhistory_nr(so));
2563 /* NOTREACHED */
2564 }
2565
2566 /*
2567 * The content filter thread must hold the sockbuf lock
2568 */
2569 if ((so->so_flags & SOF_CONTENT_FILTER) && sb->sb_cfil_thread == tp) {
2570 /*
2571 * Don't panic if we are defunct because SB_LOCK has
2572 * been cleared by sodefunct()
2573 */
2574 if (!(so->so_flags & SOF_DEFUNCT) && !(sb->sb_flags & SB_LOCK))
2575 panic("%s: SB_LOCK not held for %p\n",
2576 __func__, sb);
2577
2578 /* Keep the sockbuf locked */
2579 return (0);
2580 }
2581
2582 if ((sb->sb_flags & SB_LOCK) && !(flags & SBL_WAIT))
2583 return (EWOULDBLOCK);
2584 /*
2585 * We may get here from sorflush(), in which case "sb" may not
2586 * point to the real socket buffer. Use the actual socket buffer
2587 * address from the socket instead.
2588 */
2589 wchan = (sb->sb_flags & SB_RECV) ?
2590 &so->so_rcv.sb_flags : &so->so_snd.sb_flags;
2591
	/*
	 * A content filter thread has exclusive access to the sockbuf
	 * until it clears sb_cfil_thread.
	 */
2596 while ((sb->sb_flags & SB_LOCK) ||
2597 ((so->so_flags & SOF_CONTENT_FILTER) &&
2598 sb->sb_cfil_thread != NULL)) {
2599 lck_mtx_t *mutex_held;
2600
		/*
		 * XXX: This code should be moved up above, outside of this
		 * loop; however, we may get here as part of sofreelastref(),
		 * and at that time pr_getlock() may no longer be able to
		 * return us the lock.  This will be fixed in the future.
		 */
2607 if (so->so_proto->pr_getlock != NULL)
2608 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
2609 else
2610 mutex_held = so->so_proto->pr_domain->dom_mtx;
2611
2612 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
2613
2614 sb->sb_wantlock++;
2615 VERIFY(sb->sb_wantlock != 0);
2616
2617 error = msleep(wchan, mutex_held,
2618 nointr ? PSOCK : PSOCK | PCATCH,
2619 nointr ? "sb_lock_nointr" : "sb_lock", NULL);
2620
2621 VERIFY(sb->sb_wantlock != 0);
2622 sb->sb_wantlock--;
2623
2624 if (error == 0 && (so->so_flags & SOF_DEFUNCT) &&
2625 !(flags & SBL_IGNDEFUNCT)) {
2626 error = EBADF;
2627 SODEFUNCTLOG("%s[%d, %s]: defunct so 0x%llx [%d,%d] "
2628 "(%d)\n", __func__, proc_selfpid(),
2629 proc_best_name(current_proc()),
2630 (uint64_t)VM_KERNEL_ADDRPERM(so),
2631 SOCK_DOM(so), SOCK_TYPE(so), error);
2632 }
2633
2634 if (error != 0)
2635 return (error);
2636 }
2637 sb->sb_flags |= SB_LOCK;
2638 return (0);
2639}
2640
2641/*
2642 * Release lock on sockbuf sb
2643 */
2644void
2645sbunlock(struct sockbuf *sb, boolean_t keeplocked)
2646{
2647 void *lr_saved = __builtin_return_address(0);
2648 struct socket *so = sb->sb_so;
2649 thread_t tp = current_thread();
2650
2651 /* so_usecount may be 0 if we get here from sofreelastref() */
2652 if (so == NULL) {
2653 panic("%s: null so, sb=%p sb_flags=0x%x lr=%p\n",
2654 __func__, sb, sb->sb_flags, lr_saved);
2655 /* NOTREACHED */
2656 } else if (so->so_usecount < 0) {
2657 panic("%s: sb=%p sb_flags=0x%x sb_so=%p usecount=%d lr=%p "
2658 "lrh= %s\n", __func__, sb, sb->sb_flags, so,
2659 so->so_usecount, lr_saved, solockhistory_nr(so));
2660 /* NOTREACHED */
2661 }
2662
2663 /*
2664 * The content filter thread must hold the sockbuf lock
2665 */
2666 if ((so->so_flags & SOF_CONTENT_FILTER) && sb->sb_cfil_thread == tp) {
2667 /*
2668 * Don't panic if we are defunct because SB_LOCK has
2669 * been cleared by sodefunct()
2670 */
2671 if (!(so->so_flags & SOF_DEFUNCT) &&
2672 !(sb->sb_flags & SB_LOCK) &&
2673 !(so->so_state & SS_DEFUNCT) &&
2674 !(so->so_flags1 & SOF1_DEFUNCTINPROG)) {
2675 panic("%s: SB_LOCK not held for %p\n",
2676 __func__, sb);
2677 }
2678 /* Keep the sockbuf locked and proceed */
2679 } else {
2680 VERIFY((sb->sb_flags & SB_LOCK) ||
2681 (so->so_state & SS_DEFUNCT) ||
2682 (so->so_flags1 & SOF1_DEFUNCTINPROG));
2683
2684 sb->sb_flags &= ~SB_LOCK;
2685
2686 if (sb->sb_wantlock > 0) {
2687 /*
2688 * We may get here from sorflush(), in which case "sb"
2689 * may not point to the real socket buffer. Use the
2690 * actual socket buffer address from the socket instead.
2691 */
2692 wakeup((sb->sb_flags & SB_RECV) ? &so->so_rcv.sb_flags :
2693 &so->so_snd.sb_flags);
2694 }
2695 }
2696
2697 if (!keeplocked) { /* unlock on exit */
2698 lck_mtx_t *mutex_held;
2699
2700 if (so->so_proto->pr_getlock != NULL)
2701 mutex_held = (*so->so_proto->pr_getlock)(so, PR_F_WILLUNLOCK);
2702 else
2703 mutex_held = so->so_proto->pr_domain->dom_mtx;
2704
2705 LCK_MTX_ASSERT(mutex_held, LCK_MTX_ASSERT_OWNED);
2706
2707 VERIFY(so->so_usecount > 0);
2708 so->so_usecount--;
2709 so->unlock_lr[so->next_unlock_lr] = lr_saved;
2710 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
2711 lck_mtx_unlock(mutex_held);
2712 }
2713}
2714
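/*
 * Wake up anyone waiting on, selecting on, or registered for events on
 * the receive or send buffer, but only if there is someone to notify
 * (see sb_notify() above).
 */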
2715void
2716sorwakeup(struct socket *so)
2717{
2718 if (sb_notify(&so->so_rcv))
2719 sowakeup(so, &so->so_rcv);
2720}
2721
2722void
2723sowwakeup(struct socket *so)
2724{
2725 if (sb_notify(&so->so_snd))
2726 sowakeup(so, &so->so_snd);
2727}
2728
2729void
2730soevent(struct socket *so, long hint)
2731{
2732 if (so->so_flags & SOF_KNOTE)
2733 KNOTE(&so->so_klist, hint);
2734
2735 soevupcall(so, hint);
2736
	/*
	 * Don't post an event if this is a subflow socket or if
	 * the app has opted out of using the cellular interface.
	 */
2741 if ((hint & SO_FILT_HINT_IFDENIED) &&
2742 !(so->so_flags & SOF_MP_SUBFLOW) &&
2743 !(so->so_restrictions & SO_RESTRICT_DENY_CELLULAR) &&
2744 !(so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE))
2745 soevent_ifdenied(so);
2746}
2747
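/*
 * Deliver the hint to the socket's event upcall, if one is registered,
 * after masking it with the events the owner asked for (so_eventmask).
 */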
2748void
2749soevupcall(struct socket *so, u_int32_t hint)
2750{
2751 if (so->so_event != NULL) {
2752 caddr_t so_eventarg = so->so_eventarg;
2753
2754 hint &= so->so_eventmask;
2755 if (hint != 0)
2756 so->so_event(so, so_eventarg, hint);
2757 }
2758}
2759
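/*
 * Post a KEV_NETPOLICY_IFDENIED kernel event identifying the effective
 * process for this socket.  At most one event is posted per socket
 * until the UUID policy changes (see the comment below).
 */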
2760static void
2761soevent_ifdenied(struct socket *so)
2762{
2763 struct kev_netpolicy_ifdenied ev_ifdenied;
2764
2765 bzero(&ev_ifdenied, sizeof (ev_ifdenied));
	/*
	 * The event consumer is interested in the effective {upid,pid,uuid}
	 * info, which can differ from that of the process that most recently
	 * performed a system call on the socket, i.e. when the socket is
	 * delegated.
	 */
2772 if (so->so_flags & SOF_DELEGATED) {
2773 ev_ifdenied.ev_data.eupid = so->e_upid;
2774 ev_ifdenied.ev_data.epid = so->e_pid;
2775 uuid_copy(ev_ifdenied.ev_data.euuid, so->e_uuid);
2776 } else {
2777 ev_ifdenied.ev_data.eupid = so->last_upid;
2778 ev_ifdenied.ev_data.epid = so->last_pid;
2779 uuid_copy(ev_ifdenied.ev_data.euuid, so->last_uuid);
2780 }
2781
2782 if (++so->so_ifdenied_notifies > 1) {
2783 /*
2784 * Allow for at most one kernel event to be generated per
2785 * socket; so_ifdenied_notifies is reset upon changes in
2786 * the UUID policy. See comments in inp_update_policy.
2787 */
2788 if (net_io_policy_log) {
2789 uuid_string_t buf;
2790
2791 uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
2792 log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %d "
			    "euuid %s%s has %d redundant events suppressed\n",
2794 __func__, so->last_pid,
2795 (uint64_t)VM_KERNEL_ADDRPERM(so), SOCK_DOM(so),
2796 SOCK_TYPE(so), ev_ifdenied.ev_data.epid, buf,
2797 ((so->so_flags & SOF_DELEGATED) ?
2798 " [delegated]" : ""), so->so_ifdenied_notifies);
2799 }
2800 } else {
2801 if (net_io_policy_log) {
2802 uuid_string_t buf;
2803
2804 uuid_unparse(ev_ifdenied.ev_data.euuid, buf);
2805 log(LOG_DEBUG, "%s[%d]: so 0x%llx [%d,%d] epid %d "
2806 "euuid %s%s event posted\n", __func__,
2807 so->last_pid, (uint64_t)VM_KERNEL_ADDRPERM(so),
2808 SOCK_DOM(so), SOCK_TYPE(so),
2809 ev_ifdenied.ev_data.epid, buf,
2810 ((so->so_flags & SOF_DELEGATED) ?
2811 " [delegated]" : ""));
2812 }
2813 netpolicy_post_msg(KEV_NETPOLICY_IFDENIED, &ev_ifdenied.ev_data,
2814 sizeof (ev_ifdenied));
2815 }
2816}
2817
2818/*
2819 * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
2820 */
2821struct sockaddr *
2822dup_sockaddr(struct sockaddr *sa, int canwait)
2823{
2824 struct sockaddr *sa2;
2825
2826 MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME,
2827 canwait ? M_WAITOK : M_NOWAIT);
2828 if (sa2)
2829 bcopy(sa, sa2, sa->sa_len);
2830 return (sa2);
2831}
2832
2833/*
2834 * Create an external-format (``xsocket'') structure using the information
2835 * in the kernel-format socket structure pointed to by so. This is done
2836 * to reduce the spew of irrelevant information over this interface,
2837 * to isolate user code from changes in the kernel structure, and
2838 * potentially to provide information-hiding if we decide that
2839 * some of this information should be hidden from users.
2840 */
2841void
2842sotoxsocket(struct socket *so, struct xsocket *xso)
2843{
2844 xso->xso_len = sizeof (*xso);
2845 xso->xso_so = (_XSOCKET_PTR(struct socket *))VM_KERNEL_ADDRPERM(so);
2846 xso->so_type = so->so_type;
2847 xso->so_options = (short)(so->so_options & 0xffff);
2848 xso->so_linger = so->so_linger;
2849 xso->so_state = so->so_state;
2850 xso->so_pcb = (_XSOCKET_PTR(caddr_t))VM_KERNEL_ADDRPERM(so->so_pcb);
2851 if (so->so_proto) {
2852 xso->xso_protocol = SOCK_PROTO(so);
2853 xso->xso_family = SOCK_DOM(so);
2854 } else {
2855 xso->xso_protocol = xso->xso_family = 0;
2856 }
2857 xso->so_qlen = so->so_qlen;
2858 xso->so_incqlen = so->so_incqlen;
2859 xso->so_qlimit = so->so_qlimit;
2860 xso->so_timeo = so->so_timeo;
2861 xso->so_error = so->so_error;
2862 xso->so_pgid = so->so_pgid;
2863 xso->so_oobmark = so->so_oobmark;
2864 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
2865 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
2866 xso->so_uid = kauth_cred_getuid(so->so_cred);
2867}
2868
2869
2870#if !CONFIG_EMBEDDED
2871
2872void
2873sotoxsocket64(struct socket *so, struct xsocket64 *xso)
2874{
2875 xso->xso_len = sizeof (*xso);
2876 xso->xso_so = (u_int64_t)VM_KERNEL_ADDRPERM(so);
2877 xso->so_type = so->so_type;
2878 xso->so_options = (short)(so->so_options & 0xffff);
2879 xso->so_linger = so->so_linger;
2880 xso->so_state = so->so_state;
2881 xso->so_pcb = (u_int64_t)VM_KERNEL_ADDRPERM(so->so_pcb);
2882 if (so->so_proto) {
2883 xso->xso_protocol = SOCK_PROTO(so);
2884 xso->xso_family = SOCK_DOM(so);
2885 } else {
2886 xso->xso_protocol = xso->xso_family = 0;
2887 }
2888 xso->so_qlen = so->so_qlen;
2889 xso->so_incqlen = so->so_incqlen;
2890 xso->so_qlimit = so->so_qlimit;
2891 xso->so_timeo = so->so_timeo;
2892 xso->so_error = so->so_error;
2893 xso->so_pgid = so->so_pgid;
2894 xso->so_oobmark = so->so_oobmark;
2895 sbtoxsockbuf(&so->so_snd, &xso->so_snd);
2896 sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
2897 xso->so_uid = kauth_cred_getuid(so->so_cred);
2898}
2899
2900#endif /* !CONFIG_EMBEDDED */
2901
2902/*
2903 * This does the same for sockbufs. Note that the xsockbuf structure,
2904 * since it is always embedded in a socket, does not include a self
2905 * pointer nor a length. We make this entry point public in case
2906 * some other mechanism needs it.
2907 */
2908void
2909sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
2910{
2911 xsb->sb_cc = sb->sb_cc;
2912 xsb->sb_hiwat = sb->sb_hiwat;
2913 xsb->sb_mbcnt = sb->sb_mbcnt;
2914 xsb->sb_mbmax = sb->sb_mbmax;
2915 xsb->sb_lowat = sb->sb_lowat;
2916 xsb->sb_flags = sb->sb_flags;
2917 xsb->sb_timeo = (short)
2918 (sb->sb_timeo.tv_sec * hz) + sb->sb_timeo.tv_usec / tick;
2919 if (xsb->sb_timeo == 0 && sb->sb_timeo.tv_usec != 0)
2920 xsb->sb_timeo = 1;
2921}
2922
/*
 * Based on the policy set by an all-knowing decision maker, throttle
 * sockets that have been marked as belonging to a "background" process.
 */
2926 */
2927inline int
2928soisthrottled(struct socket *so)
2929{
2930 return (so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND);
2931}
2932
2933inline int
2934soisprivilegedtraffic(struct socket *so)
2935{
2936 return ((so->so_flags & SOF_PRIVILEGED_TRAFFIC_CLASS) ? 1 : 0);
2937}
2938
2939inline int
2940soissrcbackground(struct socket *so)
2941{
2942 return ((so->so_flags1 & SOF1_TRAFFIC_MGT_SO_BACKGROUND) ||
2943 IS_SO_TC_BACKGROUND(so->so_traffic_class));
2944}
2945
2946inline int
2947soissrcrealtime(struct socket *so)
2948{
2949 return (so->so_traffic_class >= SO_TC_AV &&
2950 so->so_traffic_class <= SO_TC_VO);
2951}
2952
2953inline int
2954soissrcbesteffort(struct socket *so)
2955{
2956 return (so->so_traffic_class == SO_TC_BE ||
2957 so->so_traffic_class == SO_TC_RD ||
2958 so->so_traffic_class == SO_TC_OAM);
2959}
2960
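/*
 * Clear the TCP Fast Open related flags (preconnect data and
 * idempotent data) on the socket.
 */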
2961void
2962soclearfastopen(struct socket *so)
2963{
2964 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2965 so->so_flags1 &= ~SOF1_PRECONNECT_DATA;
2966
2967 if (so->so_flags1 & SOF1_DATA_IDEMPOTENT)
2968 so->so_flags1 &= ~SOF1_DATA_IDEMPOTENT;
2969}
2970
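/* No-op socket event callback. */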
2971void
2972sonullevent(struct socket *so, void *arg, uint32_t hint)
2973{
2974#pragma unused(so, arg, hint)
2975}
2976
2977/*
2978 * Here is the definition of some of the basic objects in the kern.ipc
2979 * branch of the MIB.
2980 */
2981SYSCTL_NODE(_kern, KERN_IPC, ipc,
2982 CTLFLAG_RW|CTLFLAG_LOCKED|CTLFLAG_ANYBODY, 0, "IPC");
2983
2984/* Check that the maximum socket buffer size is within a range */
2985
2986static int
2987sysctl_sb_max SYSCTL_HANDLER_ARGS
2988{
2989#pragma unused(oidp, arg1, arg2)
2990 u_int32_t new_value;
2991 int changed = 0;
2992 int error = sysctl_io_number(req, sb_max, sizeof (u_int32_t),
2993 &new_value, &changed);
2994 if (!error && changed) {
2995 if (new_value > LOW_SB_MAX && new_value <= high_sb_max) {
2996 sb_max = new_value;
2997 } else {
2998 error = ERANGE;
2999 }
3000 }
3001 return (error);
3002}
3003
3004SYSCTL_PROC(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf,
3005 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3006 &sb_max, 0, &sysctl_sb_max, "IU", "Maximum socket buffer size");
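
/*
 * From user space this is kern.ipc.maxsockbuf; for example (illustrative
 * value only):
 *
 *	sysctl -w kern.ipc.maxsockbuf=4194304
 *
 * Values outside the (LOW_SB_MAX, high_sb_max] range are rejected with
 * ERANGE by the handler above.
 */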
3007
3008SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor,
3009 CTLFLAG_RW | CTLFLAG_LOCKED, &sb_efficiency, 0, "");
3010
3011SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters,
3012 CTLFLAG_RD | CTLFLAG_LOCKED, &nmbclusters, 0, "");
3013
3014SYSCTL_INT(_kern_ipc, OID_AUTO, njcl,
3015 CTLFLAG_RD | CTLFLAG_LOCKED, &njcl, 0, "");
3016
3017SYSCTL_INT(_kern_ipc, OID_AUTO, njclbytes,
3018 CTLFLAG_RD | CTLFLAG_LOCKED, &njclbytes, 0, "");
3019
3020SYSCTL_INT(_kern_ipc, KIPC_SOQLIMITCOMPAT, soqlimitcompat,
3021 CTLFLAG_RW | CTLFLAG_LOCKED, &soqlimitcompat, 1,
3022 "Enable socket queue limit compatibility");
3023
/*
 * Hack alert -- rdar://33572856
 * A loopback test we cannot change was failing because it sets
 * SO_SNDTIMEO to 5 seconds, which is also the value of the minimum
 * persist timer.  Because of the persist timer, the connection was
 * never idle for 5 seconds and SO_SNDTIMEO was not triggering at
 * 5 seconds, causing the test failure.
 * As a workaround we use the sysctl soqlencomp, which the test is
 * already setting, to also disable auto-tuning of the receive buffer
 * (and bump the minimum persist timer, see below).
 */
3034
3035extern u_int32_t tcp_do_autorcvbuf;
3036
3037static int
3038sysctl_soqlencomp SYSCTL_HANDLER_ARGS
3039{
3040#pragma unused(oidp, arg1, arg2)
3041 u_int32_t new_value;
3042 int changed = 0;
3043 int error = sysctl_io_number(req, soqlencomp, sizeof (u_int32_t),
3044 &new_value, &changed);
3045 if (!error && changed) {
3046 soqlencomp = new_value;
3047 if (new_value != 0) {
3048 tcp_do_autorcvbuf = 0;
3049 tcptv_persmin_val = 6 * TCP_RETRANSHZ;
3050 }
3051 }
3052 return (error);
3053}
3054SYSCTL_PROC(_kern_ipc, OID_AUTO, soqlencomp,
3055 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
3056 &soqlencomp, 0, &sysctl_soqlencomp, "IU", "");
3057
3058SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt, CTLFLAG_RD | CTLFLAG_LOCKED,
3059 &total_sbmb_cnt, 0, "");
3060SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_peak, CTLFLAG_RD | CTLFLAG_LOCKED,
3061 &total_sbmb_cnt_peak, 0, "");
3062SYSCTL_INT(_kern_ipc, OID_AUTO, sbmb_cnt_floor, CTLFLAG_RD | CTLFLAG_LOCKED,
3063 &total_sbmb_cnt_floor, 0, "");
3064SYSCTL_QUAD(_kern_ipc, OID_AUTO, sbmb_limreached, CTLFLAG_RD | CTLFLAG_LOCKED,
3065 &sbmb_limreached, "");
3066
3067
3068SYSCTL_NODE(_kern_ipc, OID_AUTO, io_policy, CTLFLAG_RW, 0, "network IO policy");
3069
3070SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, log, CTLFLAG_RW | CTLFLAG_LOCKED,
3071 &net_io_policy_log, 0, "");
3072
3073#if CONFIG_PROC_UUID_POLICY
3074SYSCTL_INT(_kern_ipc_io_policy, OID_AUTO, uuid, CTLFLAG_RW | CTLFLAG_LOCKED,
3075 &net_io_policy_uuid, 0, "");
3076#endif /* CONFIG_PROC_UUID_POLICY */
3077