/*
 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * A note on the MPTCP/NECP-interactions:
 *
 * MPTCP uses NECP-callbacks to get notified of interface/policy events.
 * MPTCP registers for interface-events at the MPTCP-layer through a call to
 * necp_client_register_multipath_cb.
 * To get per-flow events (aka per TCP-subflow), we register with
 * necp_client_register_socket_flow. Both registrations happen by using the
 * necp-client-uuid that comes from the app.
 *
 * The locking is rather tricky. In general, we expect the lock-ordering to
 * happen from necp-fd -> necp-client -> mpp_lock.
 *
 * There are however some subtleties.
 *
 * 1. When registering the multipath_cb, we are holding the mpp_lock. This is
 * safe, because it is the very first time this MPTCP-connection goes into NECP.
 * As we go into NECP we take the NECP-locks and thus are guaranteed that no
 * NECP-event can deadlock us, because those events also take the NECP-locks
 * first. Either they win the race and thus won't find our MPTCP-connection,
 * or MPTCP wins the race and thus it will safely install the callbacks while
 * holding the NECP lock.
 *
 * 2. When registering the subflow-callbacks we must unlock the mpp_lock. This
 * is because we have already registered callbacks and we might race against an
 * NECP-event that will match on our socket. So, we have to unlock to be safe.
 *
 * 3. When removing the multipath_cb, we do it in mp_pcbdispose(). At that
 * point the so_usecount has reached 0. We must be careful to not remove the
 * mpp_socket pointers before we unregistered the callback, because, again, we
 * might be racing against an NECP-event. Unregistering must happen with an
 * unlocked mpp_lock, because of the lock-ordering constraint. It could be
 * that an NECP-event triggers before we had a chance to unregister. That's
 * why we need to check for the so_usecount in mptcp_session_necp_cb. If we
 * get there while the socket is being garbage-collected, the use-count will
 * have gone down to 0 and we exit. Removal of the multipath_cb again happens
 * by taking the NECP-locks so any running NECP-events will finish first and
 * exit cleanly.
 *
 * 4. When removing the subflow-callback, we do it in in_pcbdispose(). Again,
 * the socket-lock must be unlocked for lock-ordering constraints. This gets a
 * bit tricky here, as in tcp_garbage_collect we hold the mp_so and so lock.
 * So, we drop the mp_so-lock as soon as the subflow is unlinked with
 * mptcp_subflow_del. Then, in in_pcbdispose we drop the subflow-lock.
 * If an NECP-event was waiting on the lock in mptcp_subflow_necp_cb, when it
 * gets it, it will realize that the subflow became non-MPTCP and retry (see
 * tcp_lock). Then it waits again on the subflow-lock. When we drop this lock
 * in in_pcbdispose, and enter necp_inpcb_dispose, this one will have to wait
 * for the NECP-lock (held by the other thread that is taking care of the NECP-
 * event). So, the event now finally gets the subflow-lock and then hits an
 * so_usecount that is 0 and exits. Eventually, we can remove the subflow from
 * the NECP callback.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/syslog.h>
#include <sys/protosw.h>

#include <kern/zalloc.h>
#include <kern/locks.h>

#include <mach/sdt.h>

#include <net/if.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_timer.h>

int mptcp_enable = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, enable, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_enable, 0, "Enable Multipath TCP Support");

/*
 * Number of times to try negotiating MPTCP on SYN retransmissions.
 * We haven't seen any reports of a middlebox that is dropping all SYN-segments
 * that have an MPTCP-option. Thus, let's be generous and retransmit it 4 times.
 */
int mptcp_mpcap_retries = 4;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, mptcp_cap_retr,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_mpcap_retries, 0, "Number of MP Capable SYN Retries");

/*
 * By default, DSS checksum is turned off, revisit if we ever do
 * MPTCP for non SSL Traffic.
 */
int mptcp_dss_csum = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dss_csum, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dss_csum, 0, "Enable DSS checksum");

/*
 * When mptcp_fail_thresh number of retransmissions are sent, subflow failover
 * is attempted on a different path.
 */
int mptcp_fail_thresh = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, fail, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_fail_thresh, 0, "Failover threshold");

/*
 * MPTCP subflows have TCP keepalives set to ON. Set a conservative keeptime
 * as carrier networks mostly have a 30 minute to 60 minute NAT Timeout.
 * Some carrier networks have a timeout of 10 or 15 minutes.
 */
int mptcp_subflow_keeptime = 60 * 14;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, keepalive, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_subflow_keeptime, 0, "Keepalive in seconds");

int mptcp_rtthist_rtthresh = 600;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rtthist_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_rtthist_rtthresh, 0, "Rtt threshold");

int mptcp_rtothresh = 1500;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, rto_thresh, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_rtothresh, 0, "RTO threshold");

/*
 * Probe the preferred path, when it is not in use
 */
uint32_t mptcp_probeto = 1000;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probeto, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_probeto, 0, "Disable probing by setting to 0");

uint32_t mptcp_probecnt = 5;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, probecnt, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_probecnt, 0, "Number of probe writes");

uint32_t mptcp_enable_v1 = 1;
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, enable_v1, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_enable_v1, 0, "Enable or disable v1");

static int
sysctl_mptcp_version_check SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
    int error;
    int new_value = *(int *)oidp->oid_arg1;
    int old_value = *(int *)oidp->oid_arg1;

    error = sysctl_handle_int(oidp, &new_value, 0, req);
    if (!error) {
        if (new_value != MPTCP_VERSION_0 && new_value != MPTCP_VERSION_1) {
            return EINVAL;
        }
        *(int *)oidp->oid_arg1 = new_value;
    }

    os_log(OS_LOG_DEFAULT,
        "%s:%u sysctl net.inet.tcp.mptcp_preferred_version: %d -> %d)",
        proc_best_name(current_proc()), proc_selfpid(),
        old_value, *(int *)oidp->oid_arg1);

    return error;
}

int mptcp_preferred_version = MPTCP_VERSION_1;
SYSCTL_PROC(_net_inet_tcp, OID_AUTO, mptcp_preferred_version,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_preferred_version, 0, &sysctl_mptcp_version_check, "I", "");

int mptcp_reass_total_qlen = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, reass_qlen,
    CTLFLAG_RD | CTLFLAG_LOCKED, &mptcp_reass_total_qlen, 0,
    "Total number of MPTCP segments in reassembly queues");

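/*
 * Deliver in-order data from the MPTCP reassembly queue to the MPTCP socket,
 * advancing mpt_rcvnxt across contiguous segments. Returns non-zero when the
 * delivered data carried a DATA_FIN.
 */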
static int
mptcp_reass_present(struct socket *mp_so)
{
    struct mptses *mpte = mpsotompte(mp_so);
    struct mptcb *mp_tp = mpte->mpte_mptcb;
    struct tseg_qent *q;
    int dowakeup = 0;
    int flags = 0;
    int count = 0;

    /*
     * Present data to user, advancing rcv_nxt through
     * completed sequence space.
     */
    if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
        return flags;
    }
    q = LIST_FIRST(&mp_tp->mpt_segq);
    if (!q || q->tqe_m->m_pkthdr.mp_dsn != mp_tp->mpt_rcvnxt) {
        return flags;
    }

    /*
     * If there is already another thread doing reassembly for this
     * connection, it is better to let it finish the job --
     * (radar 16316196)
     */
    if (mp_tp->mpt_flags & MPTCPF_REASS_INPROG) {
        return flags;
    }

    mp_tp->mpt_flags |= MPTCPF_REASS_INPROG;

    do {
        mp_tp->mpt_rcvnxt += q->tqe_len;
        LIST_REMOVE(q, tqe_q);
        if (mp_so->so_state & SS_CANTRCVMORE) {
            m_freem(q->tqe_m);
        } else {
            flags = !!(q->tqe_m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);
            if (sbappendstream_rcvdemux(mp_so, q->tqe_m)) {
                dowakeup = 1;
            }
        }
        zfree(tcp_reass_zone, q);
        mp_tp->mpt_reassqlen--;
        count++;
        q = LIST_FIRST(&mp_tp->mpt_segq);
    } while (q && q->tqe_m->m_pkthdr.mp_dsn == mp_tp->mpt_rcvnxt);
    mp_tp->mpt_flags &= ~MPTCPF_REASS_INPROG;

    if (count > 0) {
        OSAddAtomic(-count, &mptcp_reass_total_qlen);
    }
    if (dowakeup) {
        sorwakeup(mp_so); /* done with socket lock held */
    }
    return flags;
}

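/*
 * Insert an out-of-order segment (keyed by its Data Sequence Number) into the
 * MPTCP-level reassembly queue, trimming overlap with already-queued segments,
 * and then attempt to deliver whatever has become contiguous.
 */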
static int
mptcp_reass(struct socket *mp_so, struct pkthdr *phdr, int *tlenp, struct mbuf *m)
{
    struct mptcb *mp_tp = mpsotomppcb(mp_so)->mpp_pcbe->mpte_mptcb;
    u_int64_t mb_dsn = phdr->mp_dsn;
    struct tseg_qent *q;
    struct tseg_qent *p = NULL;
    struct tseg_qent *nq;
    struct tseg_qent *te = NULL;
    uint32_t qlimit;

    /*
     * Limit the number of segments in the reassembly queue to prevent
     * holding on to too many segments (and thus running out of mbufs).
     * Make sure to let the missing segment through which caused this
     * queue. Always keep one global queue entry spare to be able to
     * process the missing segment.
     */
    qlimit = MIN(MAX(100, mp_so->so_rcv.sb_hiwat >> 10),
        (tcp_autorcvbuf_max >> 10));
    if (mb_dsn != mp_tp->mpt_rcvnxt &&
        (mp_tp->mpt_reassqlen + 1) >= qlimit) {
        tcpstat.tcps_mptcp_rcvmemdrop++;
        m_freem(m);
        *tlenp = 0;
        return 0;
    }

    /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
    te = zalloc_flags(tcp_reass_zone, Z_WAITOK | Z_NOFAIL);

    mp_tp->mpt_reassqlen++;
    OSIncrementAtomic(&mptcp_reass_total_qlen);

    /*
     * Find a segment which begins after this one does.
     */
    LIST_FOREACH(q, &mp_tp->mpt_segq, tqe_q) {
        if (MPTCP_SEQ_GT(q->tqe_m->m_pkthdr.mp_dsn, mb_dsn)) {
            break;
        }
        p = q;
    }

    /*
     * If there is a preceding segment, it may provide some of
     * our data already. If so, drop the data from the incoming
     * segment. If it provides all of our data, drop us.
     */
    if (p != NULL) {
        int64_t i;
        /* conversion to int (in i) handles seq wraparound */
        i = p->tqe_m->m_pkthdr.mp_dsn + p->tqe_len - mb_dsn;
        if (i > 0) {
            if (i >= *tlenp) {
                tcpstat.tcps_mptcp_rcvduppack++;
                m_freem(m);
                zfree(tcp_reass_zone, te);
                te = NULL;
                mp_tp->mpt_reassqlen--;
                OSDecrementAtomic(&mptcp_reass_total_qlen);
                /*
                 * Try to present any queued data
                 * at the left window edge to the user.
                 * This is needed after the 3-WHS
                 * completes.
                 */
                goto out;
            }
            VERIFY(i <= INT_MAX);
            m_adj(m, (int)i);
            *tlenp -= i;
            phdr->mp_dsn += i;
        }
    }

    tcpstat.tcps_mp_oodata++;

    /*
     * While we overlap succeeding segments trim them or,
     * if they are completely covered, dequeue them.
     */
    while (q) {
        int64_t i = (mb_dsn + *tlenp) - q->tqe_m->m_pkthdr.mp_dsn;
        if (i <= 0) {
            break;
        }

        if (i < q->tqe_len) {
            q->tqe_m->m_pkthdr.mp_dsn += i;
            q->tqe_len -= i;

            VERIFY(i <= INT_MAX);
            m_adj(q->tqe_m, (int)i);
            break;
        }

        nq = LIST_NEXT(q, tqe_q);
        LIST_REMOVE(q, tqe_q);
        m_freem(q->tqe_m);
        zfree(tcp_reass_zone, q);
        mp_tp->mpt_reassqlen--;
        OSDecrementAtomic(&mptcp_reass_total_qlen);
        q = nq;
    }

    /* Insert the new segment queue entry into place. */
    te->tqe_m = m;
    te->tqe_th = NULL;
    te->tqe_len = *tlenp;

    if (p == NULL) {
        LIST_INSERT_HEAD(&mp_tp->mpt_segq, te, tqe_q);
    } else {
        LIST_INSERT_AFTER(p, te, tqe_q);
    }

out:
    return mptcp_reass_present(mp_so);
}

/*
 * MPTCP input, called when data has been read from a subflow socket.
 */
void
mptcp_input(struct mptses *mpte, struct mbuf *m)
{
    struct socket *mp_so;
    struct mptcb *mp_tp = NULL;
    int count = 0, wakeup = 0;
    struct mbuf *save = NULL, *prev = NULL;
    struct mbuf *freelist = NULL, *tail = NULL;

    if (__improbable((m->m_flags & M_PKTHDR) == 0)) {
        panic("mbuf invalid: %p", m);
    }

    mp_so = mptetoso(mpte);
    mp_tp = mpte->mpte_mptcb;

    socket_lock_assert_owned(mp_so);

    DTRACE_MPTCP(input);

    mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);

    /*
     * Each mbuf carries an MPTCP Data Sequence Map.
     * Process the data for reassembly, delivery to the MPTCP socket
     * client, etc.
     */
    count = mp_so->so_rcv.sb_cc;

    /*
     * In the degraded fallback case, data is accepted without DSS map
     */
    if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
        struct mbuf *iter;
        int mb_dfin;
fallback:
        mb_dfin = 0;
        mptcp_sbrcv_grow(mp_tp);

        iter = m;
        while (iter) {
            if ((iter->m_flags & M_PKTHDR) &&
                (iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)) {
                mb_dfin = 1;
            }

            if ((iter->m_flags & M_PKTHDR) && m_pktlen(iter) == 0) {
                /* Don't add zero-length packets, so skip it! */
                if (prev == NULL) {
                    m = iter->m_next;
                    m_free(iter);
                    iter = m;
                } else {
                    prev->m_next = iter->m_next;
                    m_free(iter);
                    iter = prev->m_next;
                }

                /* It was a zero-length packet so next one must be a pkthdr */
                VERIFY(iter == NULL || iter->m_flags & M_PKTHDR);
            } else {
                prev = iter;
                iter = iter->m_next;
            }
        }

        /*
         * assume degraded flow as this may be the first packet
         * without DSS, and the subflow state is not updated yet.
         */
        if (sbappendstream_rcvdemux(mp_so, m)) {
            sorwakeup(mp_so);
        }

        DTRACE_MPTCP5(receive__degraded, struct mbuf *, m,
            struct socket *, mp_so,
            struct sockbuf *, &mp_so->so_rcv,
            struct sockbuf *, &mp_so->so_snd,
            struct mptses *, mpte);
        count = mp_so->so_rcv.sb_cc - count;

        mp_tp->mpt_rcvnxt += count;

        if (mb_dfin) {
            mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
            socantrcvmore(mp_so);
        }
        return;
    }

    do {
        u_int64_t mb_dsn;
        int32_t mb_datalen;
        int64_t todrop;
        int mb_dfin = 0;

        VERIFY(m->m_flags & M_PKTHDR);

        /* If fallback occurs, mbufs will not have PKTF_MPTCP set */
        if (!(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
            goto fallback;
        }

        save = m->m_next;
        /*
         * A single TCP packet formed of multiple mbufs
         * holds the DSS mapping in the first mbuf of the chain.
         * Other mbufs in the chain may have M_PKTHDR set
         * even though they belong to the same TCP packet
         * and therefore use the DSS mapping stored in the
         * first mbuf of the mbuf chain. mptcp_input() can
         * get an mbuf chain with multiple TCP packets.
         */
        while (save && (!(save->m_flags & M_PKTHDR) ||
            !(save->m_pkthdr.pkt_flags & PKTF_MPTCP))) {
            prev = save;
            save = save->m_next;
        }
        if (prev) {
            prev->m_next = NULL;
        } else {
            m->m_next = NULL;
        }

        mb_dsn = m->m_pkthdr.mp_dsn;
        mb_datalen = m->m_pkthdr.mp_rlen;

        todrop = (mb_dsn + mb_datalen) - (mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd);
        if (todrop > 0) {
            tcpstat.tcps_mptcp_rcvpackafterwin++;

            os_log_info(mptcp_log_handle, "%s - %lx: dropping dsn %u dlen %u rcvnxt %u rcvwnd %u todrop %lld\n",
                __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
                (uint32_t)mb_dsn, mb_datalen, (uint32_t)mp_tp->mpt_rcvnxt,
                mp_tp->mpt_rcvwnd, todrop);

            if (todrop >= mb_datalen) {
                if (freelist == NULL) {
                    freelist = m;
                } else {
                    tail->m_next = m;
                }

                if (prev != NULL) {
                    tail = prev;
                } else {
                    tail = m;
                }

                m = save;
                prev = save = NULL;
                continue;
            } else {
                VERIFY(todrop <= INT_MAX);
                m_adj(m, (int)-todrop);
                mb_datalen -= todrop;
                m->m_pkthdr.mp_rlen -= todrop;
            }

            /*
             * We drop from the right edge of the mbuf, thus the
             * DATA_FIN is dropped as well
             */
            m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
        }

        if (MPTCP_SEQ_LT(mb_dsn, mp_tp->mpt_rcvnxt)) {
            if (MPTCP_SEQ_LEQ((mb_dsn + mb_datalen),
                mp_tp->mpt_rcvnxt)) {
                if (freelist == NULL) {
                    freelist = m;
                } else {
                    tail->m_next = m;
                }

                if (prev != NULL) {
                    tail = prev;
                } else {
                    tail = m;
                }

                m = save;
                prev = save = NULL;
                continue;
            } else {
                VERIFY((mp_tp->mpt_rcvnxt - mb_dsn) <= INT_MAX);
                m_adj(m, (int)(mp_tp->mpt_rcvnxt - mb_dsn));
                mb_datalen -= (mp_tp->mpt_rcvnxt - mb_dsn);
                mb_dsn = mp_tp->mpt_rcvnxt;
                VERIFY(mb_datalen >= 0 && mb_datalen <= USHRT_MAX);
                m->m_pkthdr.mp_rlen = (uint16_t)mb_datalen;
                m->m_pkthdr.mp_dsn = mb_dsn;
            }
        }

        if (MPTCP_SEQ_GT(mb_dsn, mp_tp->mpt_rcvnxt) ||
            !LIST_EMPTY(&mp_tp->mpt_segq)) {
            mb_dfin = mptcp_reass(mp_so, &m->m_pkthdr, &mb_datalen, m);

            goto next;
        }
        mb_dfin = !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN);

        mptcp_sbrcv_grow(mp_tp);

        if (sbappendstream_rcvdemux(mp_so, m)) {
            wakeup = 1;
        }

        DTRACE_MPTCP6(receive, struct mbuf *, m, struct socket *, mp_so,
            struct sockbuf *, &mp_so->so_rcv,
            struct sockbuf *, &mp_so->so_snd,
            struct mptses *, mpte,
            struct mptcb *, mp_tp);
        count = mp_so->so_rcv.sb_cc - count;
        tcpstat.tcps_mp_rcvtotal++;
        tcpstat.tcps_mp_rcvbytes += count;

        mp_tp->mpt_rcvnxt += count;

next:
        if (mb_dfin) {
            mptcp_close_fsm(mp_tp, MPCE_RECV_DATA_FIN);
            socantrcvmore(mp_so);
        }
        m = save;
        prev = save = NULL;
        count = mp_so->so_rcv.sb_cc;
    } while (m);

    if (freelist) {
        m_freem(freelist);
    }

    if (wakeup) {
        sorwakeup(mp_so);
    }
}

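/*
 * Check whether the MPTCP connection-level send state allows more data
 * (or a DATA_FIN) to be scheduled right now.
 */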
boolean_t
mptcp_can_send_more(struct mptcb *mp_tp, boolean_t ignore_reinject)
{
    struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

    /*
     * Always send if there is data in the reinject-queue.
     */
    if (!ignore_reinject && mp_tp->mpt_mpte->mpte_reinjectq) {
        return TRUE;
    }

    /*
     * Don't send, if:
     *
     * 1. snd_nxt >= snd_max: basically everything has been sent,
     *    except when using TFO, where we might be doing a 0-byte write.
     * 2. snd_una + snd_wnd <= snd_nxt: no space in the receiver's window.
     * 3. snd_nxt + 1 == snd_max and we are closing: a DATA_FIN is scheduled.
     */

    if (!(mp_so->so_flags1 & SOF1_PRECONNECT_DATA) && MPTCP_SEQ_GEQ(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
        return FALSE;
    }

    if (MPTCP_SEQ_LEQ(mp_tp->mpt_snduna + mp_tp->mpt_sndwnd, mp_tp->mpt_sndnxt)) {
        return FALSE;
    }

    if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax && mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
        return FALSE;
    }

    if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
        return FALSE;
    }

    return TRUE;
}

/*
 * MPTCP output.
 */
int
mptcp_output(struct mptses *mpte)
{
    struct mptcb *mp_tp;
    struct mptsub *mpts;
    struct mptsub *mpts_tried = NULL;
    struct socket *mp_so;
    struct mptsub *preferred_mpts = NULL;
    uint64_t old_snd_nxt;
    int error = 0;

    mp_so = mptetoso(mpte);
    mp_tp = mpte->mpte_mptcb;

    socket_lock_assert_owned(mp_so);

    if (mp_so->so_flags & SOF_DEFUNCT) {
        return 0;
    }

    VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL));
    mpte->mpte_mppcb->mpp_flags |= MPP_WUPCALL;

    old_snd_nxt = mp_tp->mpt_sndnxt;
    while (mptcp_can_send_more(mp_tp, FALSE)) {
        /* get the "best" subflow to be used for transmission */
        mpts = mptcp_get_subflow(mpte, &preferred_mpts);
        if (mpts == NULL) {
            break;
        }

        /* In case there's just one flow, we reattempt later */
        if (mpts_tried != NULL &&
            (mpts == mpts_tried || (mpts->mpts_flags & MPTSF_FAILINGOVER))) {
            mpts_tried->mpts_flags &= ~MPTSF_FAILINGOVER;
            mpts_tried->mpts_flags |= MPTSF_ACTIVE;
            mptcp_start_timer(mpte, MPTT_REXMT);
            break;
        }

        /*
         * Automatic sizing of send socket buffer. Increase the send
         * socket buffer size if all of the following criteria are met
         * 1. the receiver has enough buffer space for this data
         * 2. send buffer is filled to 7/8th with data (so we actually
         *    have data to make use of it);
         */
        if ((mp_so->so_snd.sb_flags & (SB_AUTOSIZE | SB_TRIM)) == SB_AUTOSIZE) {
            if ((mp_tp->mpt_sndwnd / 4 * 5) >= mp_so->so_snd.sb_hiwat &&
                mp_so->so_snd.sb_cc >= (mp_so->so_snd.sb_hiwat / 8 * 7)) {
                if (sbreserve(&mp_so->so_snd,
                    min(mp_so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
                    tcp_autosndbuf_max)) == 1) {
                    mp_so->so_snd.sb_idealsize = mp_so->so_snd.sb_hiwat;
                }
            }
        }

        DTRACE_MPTCP3(output, struct mptses *, mpte, struct mptsub *, mpts,
            struct socket *, mp_so);
        error = mptcp_subflow_output(mpte, mpts, 0);
        if (error) {
            /* can be a temporary loss of source address or other error */
            mpts->mpts_flags |= MPTSF_FAILINGOVER;
            mpts->mpts_flags &= ~MPTSF_ACTIVE;
            mpts_tried = mpts;
            if (error != ECANCELED) {
                os_log_error(mptcp_log_handle, "%s - %lx: Error = %d mpts_flags %#x\n",
                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
                    error, mpts->mpts_flags);
            }
            break;
        }
        /* The model is to have only one active flow at a time */
        mpts->mpts_flags |= MPTSF_ACTIVE;
        mpts->mpts_probesoon = mpts->mpts_probecnt = 0;

        /* Allows us to update the smoothed rtt */
        if (mptcp_probeto && mpts != preferred_mpts && preferred_mpts != NULL) {
            if (preferred_mpts->mpts_probesoon) {
                if ((tcp_now - preferred_mpts->mpts_probesoon) > mptcp_probeto) {
                    mptcp_subflow_output(mpte, preferred_mpts, MPTCP_SUBOUT_PROBING);
                    if (preferred_mpts->mpts_probecnt >= mptcp_probecnt) {
                        preferred_mpts->mpts_probesoon = 0;
                        preferred_mpts->mpts_probecnt = 0;
                    }
                }
            } else {
                preferred_mpts->mpts_probesoon = tcp_now;
                preferred_mpts->mpts_probecnt = 0;
            }
        }

        if (mpte->mpte_active_sub == NULL) {
            mpte->mpte_active_sub = mpts;
        } else if (mpte->mpte_active_sub != mpts) {
            mpte->mpte_active_sub->mpts_flags &= ~MPTSF_ACTIVE;
            mpte->mpte_active_sub = mpts;

            mptcpstats_inc_switch(mpte, mpts);
        }
    }

    if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
        if (mp_tp->mpt_sndnxt + 1 == mp_tp->mpt_sndmax &&
            mp_tp->mpt_snduna == mp_tp->mpt_sndnxt) {
            mptcp_finish_usrclosed(mpte);
        }
    }

    mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_WUPCALL);

    /* subflow errors should not be percolated back up */
    return 0;
}

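/*
 * Compare a candidate subflow against the current best of its class and
 * return the more attractive one, preferring lower smoothed RTT and subflows
 * that are not currently retransmitting. *currtt holds the smoothed RTT of
 * the best choice so far.
 */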
static struct mptsub *
mptcp_choose_subflow(struct mptsub *mpts, struct mptsub *curbest, int *currtt)
{
    struct tcpcb *tp = sototcpcb(mpts->mpts_socket);

    /*
     * Lower RTT? Take it, if it's our first one, or
     * it doesn't have any loss, or the current one has
     * loss as well.
     */
    if (tp->t_srtt && *currtt > tp->t_srtt &&
        (curbest == NULL || tp->t_rxtshift == 0 ||
        sototcpcb(curbest->mpts_socket)->t_rxtshift)) {
        *currtt = tp->t_srtt;
        return mpts;
    }

    /*
     * If we find a subflow without loss, take it always!
     */
    if (curbest &&
        sototcpcb(curbest->mpts_socket)->t_rxtshift &&
        tp->t_rxtshift == 0) {
        *currtt = tp->t_srtt;
        return mpts;
    }

    return curbest != NULL ? curbest : mpts;
}

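/*
 * Return the chosen subflow, or NULL if it has no congestion-window space
 * left for sending.
 */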
static struct mptsub *
mptcp_return_subflow(struct mptsub *mpts)
{
    if (mpts && mptcp_subflow_cwnd_space(mpts->mpts_socket) <= 0) {
        return NULL;
    }

    return mpts;
}

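/*
 * A subflow is considered "slow" when it has reached the retransmission
 * threshold while data is still waiting to be sent (send buffer or reinject
 * queue). For the handover service-types the threshold is doubled.
 */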
static boolean_t
mptcp_subflow_is_slow(struct mptses *mpte, struct mptsub *mpts)
{
    struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
    int fail_thresh = mptcp_fail_thresh;

    if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
        fail_thresh *= 2;
    }

    return tp->t_rxtshift >= fail_thresh &&
           (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq);
}

/*
 * Return the most eligible subflow to be used for sending data.
 */
struct mptsub *
mptcp_get_subflow(struct mptses *mpte, struct mptsub **preferred)
{
    struct tcpcb *besttp, *secondtp;
    struct inpcb *bestinp, *secondinp;
    struct mptsub *mpts;
    struct mptsub *best = NULL;
    struct mptsub *second_best = NULL;
    int exp_rtt = INT_MAX, cheap_rtt = INT_MAX;

    /*
     * First Step:
     * Choose the best subflow for cellular and non-cellular interfaces.
     */

    TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
        struct socket *so = mpts->mpts_socket;
        struct tcpcb *tp = sototcpcb(so);
        struct inpcb *inp = sotoinpcb(so);

        /*
         * First, the hard conditions to reject subflows
         * (e.g., not connected, ...)
         */
        if (inp->inp_last_outifp == NULL) {
            continue;
        }

        if (INP_WAIT_FOR_IF_FEEDBACK(inp)) {
            continue;
        }

        /* There can only be one subflow in degraded state */
        if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
            best = mpts;
            break;
        }

        /*
         * If this subflow is waiting to finally send, do it!
         */
        if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
            return mptcp_return_subflow(mpts);
        }

        /*
         * Only send if the subflow is MP_CAPABLE. The exceptions to
         * this rule (degraded or TFO) have been taken care of above.
         */
        if (!(mpts->mpts_flags & MPTSF_MP_CAPABLE)) {
            continue;
        }

        if ((so->so_state & SS_ISDISCONNECTED) ||
            !(so->so_state & SS_ISCONNECTED) ||
            !TCPS_HAVEESTABLISHED(tp->t_state) ||
            tp->t_state > TCPS_CLOSE_WAIT) {
            continue;
        }

        /*
         * Second, the soft conditions to find the subflow with best
         * conditions for each set (aka cellular vs non-cellular)
         */
        if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
            second_best = mptcp_choose_subflow(mpts, second_best,
                &exp_rtt);
        } else {
            best = mptcp_choose_subflow(mpts, best, &cheap_rtt);
        }
    }

    /*
     * If there is no preferred or backup subflow, and there is no active
     * subflow, use the last usable subflow.
     */
    if (best == NULL) {
        return mptcp_return_subflow(second_best);
    }

    if (second_best == NULL) {
        return mptcp_return_subflow(best);
    }

    besttp = sototcpcb(best->mpts_socket);
    bestinp = sotoinpcb(best->mpts_socket);
    secondtp = sototcpcb(second_best->mpts_socket);
    secondinp = sotoinpcb(second_best->mpts_socket);

    if (preferred != NULL) {
        *preferred = mptcp_return_subflow(best);
    }

    /*
     * Second Step: Among best and second_best, choose the one that is
     * most appropriate for this particular service-type.
     */
    if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
        return mptcp_return_subflow(best);
    } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
        /*
         * Only handover if Symptoms tells us to do so.
         */
        if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
            mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD &&
            mptcp_subflow_is_slow(mpte, best)) {
            return mptcp_return_subflow(second_best);
        }

        return mptcp_return_subflow(best);
    } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_INTERACTIVE) {
        int rtt_thresh = mptcp_rtthist_rtthresh << TCP_RTT_SHIFT;
        int rto_thresh = mptcp_rtothresh;

        /* Adjust with symptoms information */
        if (!IFNET_IS_CELLULAR(bestinp->inp_last_outifp) &&
            mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
            rtt_thresh /= 2;
            rto_thresh /= 2;
        }

        if (besttp->t_srtt && secondtp->t_srtt &&
            besttp->t_srtt >= rtt_thresh &&
            secondtp->t_srtt < rtt_thresh) {
            tcpstat.tcps_mp_sel_rtt++;
            return mptcp_return_subflow(second_best);
        }

        if (mptcp_subflow_is_slow(mpte, best) &&
            secondtp->t_rxtshift == 0) {
            return mptcp_return_subflow(second_best);
        }

        /* Compare RTOs, select second_best if best's rto exceeds rtothresh */
        if (besttp->t_rxtcur && secondtp->t_rxtcur &&
            besttp->t_rxtcur >= rto_thresh &&
            secondtp->t_rxtcur < rto_thresh) {
            tcpstat.tcps_mp_sel_rto++;

            return mptcp_return_subflow(second_best);
        }

        /*
         * None of the above conditions for sending on the secondary
         * were true. So, let's schedule on the best one, if it still
         * has some space in the congestion-window.
         */
        return mptcp_return_subflow(best);
    } else if (mpte->mpte_svctype >= MPTCP_SVCTYPE_AGGREGATE) {
        struct mptsub *tmp;

        /*
         * We only care about RTT when aggregating
         */
        if (besttp->t_srtt > secondtp->t_srtt) {
            tmp = best;
            best = second_best;
            besttp = secondtp;
            bestinp = secondinp;

            second_best = tmp;
            secondtp = sototcpcb(second_best->mpts_socket);
            secondinp = sotoinpcb(second_best->mpts_socket);
        }

        /* Is there still space in the congestion window? */
        if (mptcp_subflow_cwnd_space(bestinp->inp_socket) <= 0) {
            return mptcp_return_subflow(second_best);
        }

        return mptcp_return_subflow(best);
    } else {
        panic("Unknown service-type configured for MPTCP");
    }

    return NULL;
}

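/*
 * Advance the MPTCP connection-level state machine in response to a close
 * event (MPCE_CLOSE, MPCE_RECV_DATA_FIN or MPCE_RECV_DATA_ACK), accounting
 * for the DATA_FIN in the data-sequence space where required.
 */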
void
mptcp_close_fsm(struct mptcb *mp_tp, uint32_t event)
{
    struct socket *mp_so = mptetoso(mp_tp->mpt_mpte);

    socket_lock_assert_owned(mp_so);

    DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
        uint32_t, event);

    switch (mp_tp->mpt_state) {
    case MPTCPS_CLOSED:
    case MPTCPS_LISTEN:
        mp_tp->mpt_state = MPTCPS_TERMINATE;
        break;

    case MPTCPS_ESTABLISHED:
        if (event == MPCE_CLOSE) {
            mp_tp->mpt_state = MPTCPS_FIN_WAIT_1;
            mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
        } else if (event == MPCE_RECV_DATA_FIN) {
            mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
            mp_tp->mpt_state = MPTCPS_CLOSE_WAIT;
        }
        break;

    case MPTCPS_CLOSE_WAIT:
        if (event == MPCE_CLOSE) {
            mp_tp->mpt_state = MPTCPS_LAST_ACK;
            mp_tp->mpt_sndmax += 1; /* adjust for Data FIN */
        }
        break;

    case MPTCPS_FIN_WAIT_1:
        if (event == MPCE_RECV_DATA_ACK) {
            mp_tp->mpt_state = MPTCPS_FIN_WAIT_2;
        } else if (event == MPCE_RECV_DATA_FIN) {
            mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
            mp_tp->mpt_state = MPTCPS_CLOSING;
        }
        break;

    case MPTCPS_CLOSING:
        if (event == MPCE_RECV_DATA_ACK) {
            mp_tp->mpt_state = MPTCPS_TIME_WAIT;
        }
        break;

    case MPTCPS_LAST_ACK:
        if (event == MPCE_RECV_DATA_ACK) {
            mptcp_close(mp_tp->mpt_mpte, mp_tp);
        }
        break;

    case MPTCPS_FIN_WAIT_2:
        if (event == MPCE_RECV_DATA_FIN) {
            mp_tp->mpt_rcvnxt += 1; /* adj remote data FIN */
            mp_tp->mpt_state = MPTCPS_TIME_WAIT;
        }
        break;

    case MPTCPS_TIME_WAIT:
    case MPTCPS_TERMINATE:
        break;

    default:
        VERIFY(0);
        /* NOTREACHED */
    }
    DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
        uint32_t, event);
}

/* If you change this function, match up mptcp_update_rcv_state_f */
void
mptcp_update_dss_rcv_state(struct mptcp_dsn_opt *dss_info, struct tcpcb *tp,
    uint16_t csum)
{
    struct mptcb *mp_tp = tptomptp(tp);
    u_int64_t full_dsn = 0;

    NTOHL(dss_info->mdss_dsn);
    NTOHL(dss_info->mdss_subflow_seqn);
    NTOHS(dss_info->mdss_data_len);

    /* XXX for autosndbuf grow sb here */
    MPTCP_EXTEND_DSN(mp_tp->mpt_rcvnxt, dss_info->mdss_dsn, full_dsn);
    mptcp_update_rcv_state_meat(mp_tp, tp,
        full_dsn, dss_info->mdss_subflow_seqn, dss_info->mdss_data_len,
        csum);
}

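/*
 * Record the received DSS mapping (DSN, subflow sequence number, length and
 * checksum) on the subflow so it can be attached to the incoming data. A
 * zero-length mapping signals an infinite mapping and triggers MP_FAIL
 * notification.
 */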
void
mptcp_update_rcv_state_meat(struct mptcb *mp_tp, struct tcpcb *tp,
    u_int64_t full_dsn, u_int32_t seqn, u_int16_t mdss_data_len,
    uint16_t csum)
{
    if (mdss_data_len == 0) {
        os_log_error(mptcp_log_handle, "%s - %lx: Infinite Mapping.\n",
            __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte));

        if ((mp_tp->mpt_flags & MPTCPF_CHECKSUM) && (csum != 0)) {
            os_log_error(mptcp_log_handle, "%s - %lx: Bad checksum %x \n",
                __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte), csum);
        }
        mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
        return;
    }

    mptcp_notify_mpready(tp->t_inpcb->inp_socket);

    tp->t_rcv_map.mpt_dsn = full_dsn;
    tp->t_rcv_map.mpt_sseq = seqn;
    tp->t_rcv_map.mpt_len = mdss_data_len;
    tp->t_rcv_map.mpt_csum = csum;
    tp->t_mpflags |= TMPF_EMBED_DSN;
}


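/*
 * Verify the DSS checksum of received data against the MPTCP pseudo-header
 * (DSN, subflow sequence number and data-level length). Returns 0 when the
 * checksum is valid or not in use, non-zero otherwise.
 */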
static uint16_t
mptcp_input_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn, uint32_t sseq,
    uint16_t dlen, uint16_t csum, int dfin)
{
    struct mptcb *mp_tp = tptomptp(tp);
    int real_len = dlen - dfin;
    uint32_t sum = 0;

    VERIFY(real_len >= 0);

    if (mp_tp == NULL) {
        return 0;
    }

    if (!(mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
        return 0;
    }

    if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
        return 0;
    }

    /*
     * The remote side may send a packet with fewer bytes than the
     * claimed DSS checksum length.
     */
    if ((int)m_length2(m, NULL) < real_len) {
        return 0xffff;
    }

    if (real_len != 0) {
        sum = m_sum16(m, 0, real_len);
    }

    sum += in_pseudo64(htonll(dsn), htonl(sseq), htons(dlen) + csum);
    ADDCARRY(sum);

    DTRACE_MPTCP3(checksum__result, struct tcpcb *, tp, struct mbuf *, m,
        uint32_t, sum);

    return ~sum & 0xffff;
}

/*
 * MPTCP Checksum support
 * The checksum is calculated whenever the MPTCP DSS option is included
 * in the TCP packet. The checksum includes the sum of the MPTCP pseudo
 * header and the actual data indicated by the length specified in the
 * DSS option.
 */

int
mptcp_validate_csum(struct tcpcb *tp, struct mbuf *m, uint64_t dsn,
    uint32_t sseq, uint16_t dlen, uint16_t csum, int dfin)
{
    uint16_t mptcp_csum;

    mptcp_csum = mptcp_input_csum(tp, m, dsn, sseq, dlen, csum, dfin);
    if (mptcp_csum) {
        tp->t_mpflags |= TMPF_SND_MPFAIL;
        mptcp_notify_mpfail(tp->t_inpcb->inp_socket);
        m_freem(m);
        tcpstat.tcps_mp_badcsum++;
        return -1;
    }
    return 0;
}

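/*
 * Compute the DSS checksum to place in an outgoing DSS option, covering the
 * data in the mbuf chain plus the MPTCP pseudo-header.
 */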
uint16_t
mptcp_output_csum(struct mbuf *m, uint64_t dss_val, uint32_t sseq, uint16_t dlen)
{
    uint32_t sum = 0;

    if (dlen) {
        sum = m_sum16(m, 0, dlen);
    }

    dss_val = mptcp_hton64(dss_val);
    sseq = htonl(sseq);
    dlen = htons(dlen);
    sum += in_pseudo64(dss_val, sseq, dlen);

    ADDCARRY(sum);
    sum = ~sum & 0xffff;
    DTRACE_MPTCP2(checksum__result, struct mbuf *, m, uint32_t, sum);

    return (uint16_t)sum;
}

/*
 * When WiFi signal starts fading, there's more loss and RTT spikes.
 * Check if there has been a large spike by comparing against
 * a tolerable RTT spike threshold.
 */
boolean_t
mptcp_no_rto_spike(struct socket *so)
{
    struct tcpcb *tp = intotcpcb(sotoinpcb(so));
    int32_t spike = 0;

    if (tp->t_rxtcur > mptcp_rtothresh) {
        spike = tp->t_rxtcur - mptcp_rtothresh;
    }

    if (spike > 0) {
        return FALSE;
    } else {
        return TRUE;
    }
}

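/*
 * Clear the given deferral flag and, once no more upcalls are being deferred,
 * run the workloop and read/write wakeups that were postponed while it was set.
 */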
void
mptcp_handle_deferred_upcalls(struct mppcb *mpp, uint32_t flag)
{
    VERIFY(mpp->mpp_flags & flag);
    mpp->mpp_flags &= ~flag;

    if (mptcp_should_defer_upcall(mpp)) {
        return;
    }

    if (mpp->mpp_flags & MPP_SHOULD_WORKLOOP) {
        mpp->mpp_flags &= ~MPP_SHOULD_WORKLOOP;

        mptcp_subflow_workloop(mpp->mpp_pcbe);
    }

    if (mpp->mpp_flags & MPP_SHOULD_RWAKEUP) {
        mpp->mpp_flags &= ~MPP_SHOULD_RWAKEUP;

        sorwakeup(mpp->mpp_socket);
    }

    if (mpp->mpp_flags & MPP_SHOULD_WWAKEUP) {
        mpp->mpp_flags &= ~MPP_SHOULD_WWAKEUP;

        sowwakeup(mpp->mpp_socket);
    }
}

static void
mptcp_reset_itfinfo(struct mpt_itf_info *info)
{
    memset(info, 0, sizeof(*info));
}

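/*
 * NECP callback for the MPTCP session. Updates the per-interface info array
 * (viability, IPv4/IPv6/NAT64 availability) when interfaces become viable or
 * non-viable, and schedules subflow creation accordingly.
 */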
void
mptcp_session_necp_cb(void *handle, int action, uint32_t interface_index,
    uint32_t necp_flags, __unused bool *viable)
{
    boolean_t has_v4 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV4);
    boolean_t has_v6 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_IPV6);
    boolean_t has_nat64 = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_HAS_NAT64);
    boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
    struct mppcb *mp = (struct mppcb *)handle;
    struct mptses *mpte = mptompte(mp);
    struct socket *mp_so;
    struct mptcb *mp_tp;
    uint32_t i, ifindex;
    struct ifnet *ifp;
    int locked = 0;

    ifindex = interface_index;
    VERIFY(ifindex != IFSCOPE_NONE);

    /* About to be garbage-collected (see note about MPTCP/NECP interactions) */
    if (mp->mpp_socket->so_usecount == 0) {
        return;
    }

    mp_so = mptetoso(mpte);

    if (action != NECP_CLIENT_CBACTION_INITIAL) {
        socket_lock(mp_so, 1);
        locked = 1;

        /* Check again, because it might have changed while waiting */
        if (mp->mpp_socket->so_usecount == 0) {
            goto out;
        }
    }

    socket_lock_assert_owned(mp_so);

    mp_tp = mpte->mpte_mptcb;

    ifnet_head_lock_shared();
    ifp = ifindex2ifnet[ifindex];
    ifnet_head_done();

    os_log(mptcp_log_handle, "%s - %lx: action: %u ifindex %u delegated to %u usecount %u mpt_flags %#x state %u v4 %u v6 %u nat64 %u power %u\n",
        __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), action, ifindex,
        ifp && ifp->if_delegated.ifp ? ifp->if_delegated.ifp->if_index : IFSCOPE_NONE,
        mp->mpp_socket->so_usecount, mp_tp->mpt_flags, mp_tp->mpt_state,
        has_v4, has_v6, has_nat64, low_power);

    /* No need on fallen back sockets */
    if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
        goto out;
    }

    /*
     * When the interface goes in low-power mode we don't want to establish
     * new subflows on it. Thus, mark it internally as non-viable.
     */
    if (low_power) {
        action = NECP_CLIENT_CBACTION_NONVIABLE;
    }

    if (action == NECP_CLIENT_CBACTION_INITIAL) {
        mpte->mpte_flags |= MPTE_ITFINFO_INIT;
    }

    if (action == NECP_CLIENT_CBACTION_NONVIABLE) {
        for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
            if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
                continue;
            }

            if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
                mptcp_reset_itfinfo(&mpte->mpte_itfinfo[i]);
            }
        }

        mptcp_sched_create_subflows(mpte);
    } else if (action == NECP_CLIENT_CBACTION_VIABLE ||
        action == NECP_CLIENT_CBACTION_INITIAL) {
        int found_slot = 0, slot_index = -1;
        struct sockaddr *dst;

        if (ifp == NULL) {
            goto out;
        }

        if (IFNET_IS_COMPANION_LINK(ifp)) {
            goto out;
        }

        if (IFNET_IS_EXPENSIVE(ifp) &&
            (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
            goto out;
        }

        if (IFNET_IS_CONSTRAINED(ifp) &&
            (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
            goto out;
        }

        if (IFNET_IS_CELLULAR(ifp) &&
            (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
            goto out;
        }

        if (IS_INTF_CLAT46(ifp)) {
            has_v4 = FALSE;
        }

        /* Look for the slot on where to store/update the interface-info. */
        for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
            /* Found a potential empty slot where we can put it */
            if (mpte->mpte_itfinfo[i].ifindex == 0) {
                found_slot = 1;
                slot_index = i;
            }

            /*
             * The interface is already in our array. Check if we
             * need to update it.
             */
            if (mpte->mpte_itfinfo[i].ifindex == ifindex &&
                (mpte->mpte_itfinfo[i].has_v4_conn != has_v4 ||
                mpte->mpte_itfinfo[i].has_v6_conn != has_v6 ||
                mpte->mpte_itfinfo[i].has_nat64_conn != has_nat64)) {
                found_slot = 1;
                slot_index = i;
                break;
            }

            if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
                /*
                 * Ok, it's already there and we don't need
                 * to update it
                 */
                goto out;
            }
        }

        dst = mptcp_get_session_dst(mpte, has_v6, has_v4);
        if (dst && dst->sa_family == AF_INET &&
            has_v6 && !has_nat64 && !has_v4) {
            if (found_slot) {
                mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
                mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
                mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
                mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;
            }
            goto out;
        }

        if (found_slot == 0) {
            int new_size = mpte->mpte_itfinfo_size * 2;
            struct mpt_itf_info *info = kalloc_data(sizeof(*info) * new_size, Z_ZERO);

            if (info == NULL) {
                os_log_error(mptcp_log_handle, "%s - %lx: malloc failed for %u\n",
                    __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), new_size);
                goto out;
            }

            memcpy(info, mpte->mpte_itfinfo, mpte->mpte_itfinfo_size * sizeof(*info));

            if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
                kfree_data(mpte->mpte_itfinfo,
                    sizeof(*info) * mpte->mpte_itfinfo_size);
            }

            /* We allocated a new one, thus the first must be empty */
            slot_index = mpte->mpte_itfinfo_size;

            mpte->mpte_itfinfo = info;
            mpte->mpte_itfinfo_size = new_size;
        }

        VERIFY(slot_index >= 0 && slot_index < (int)mpte->mpte_itfinfo_size);
        mpte->mpte_itfinfo[slot_index].ifindex = ifindex;
        mpte->mpte_itfinfo[slot_index].has_v4_conn = has_v4;
        mpte->mpte_itfinfo[slot_index].has_v6_conn = has_v6;
        mpte->mpte_itfinfo[slot_index].has_nat64_conn = has_nat64;

        mptcp_sched_create_subflows(mpte);
    }

out:
    if (locked) {
        socket_unlock(mp_so, 1);
    }
}

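/*
 * Drop interfaces from the session's interface-info array that the socket's
 * restrictions (expensive, constrained, cellular) no longer allow.
 */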
void
mptcp_set_restrictions(struct socket *mp_so)
{
    struct mptses *mpte = mpsotompte(mp_so);
    uint32_t i;

    socket_lock_assert_owned(mp_so);

    ifnet_head_lock_shared();

    for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
        struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
        uint32_t ifindex = info->ifindex;
        struct ifnet *ifp;

        if (ifindex == IFSCOPE_NONE) {
            continue;
        }

        ifp = ifindex2ifnet[ifindex];
        if (ifp == NULL) {
            continue;
        }

        if (IFNET_IS_EXPENSIVE(ifp) &&
            (mp_so->so_restrictions & SO_RESTRICT_DENY_EXPENSIVE)) {
            info->ifindex = IFSCOPE_NONE;
        }

        if (IFNET_IS_CONSTRAINED(ifp) &&
            (mp_so->so_restrictions & SO_RESTRICT_DENY_CONSTRAINED)) {
            info->ifindex = IFSCOPE_NONE;
        }

        if (IFNET_IS_CELLULAR(ifp) &&
            (mp_so->so_restrictions & SO_RESTRICT_DENY_CELLULAR)) {
            info->ifindex = IFSCOPE_NONE;
        }
    }

    ifnet_head_done();
}

#define DUMP_BUF_CHK() { \
    clen -= k; \
    if (clen < 1) \
        goto done; \
    c += k; \
}

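/*
 * Append the global MPTCP reassembly-queue length to a debug string buffer;
 * returns the number of characters written.
 */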
int
dump_mptcp_reass_qlen(char *str, int str_len)
{
    char *c = str;
    int k, clen = str_len;

    if (mptcp_reass_total_qlen != 0) {
        k = scnprintf(c, clen, "\nmptcp reass qlen %d\n", mptcp_reass_total_qlen);
        DUMP_BUF_CHK();
    }

done:
    return str_len - clen;
}