1/*
2 * Copyright (c) 2012-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <kern/locks.h>
30#include <kern/policy_internal.h>
31#include <kern/zalloc.h>
32
33#include <mach/sdt.h>
34
35#include <sys/domain.h>
36#include <sys/kdebug.h>
37#include <sys/kern_control.h>
38#include <sys/kernel.h>
39#include <sys/mbuf.h>
40#include <sys/mcache.h>
41#include <sys/param.h>
42#include <sys/proc.h>
43#include <sys/protosw.h>
44#include <sys/resourcevar.h>
45#include <sys/socket.h>
46#include <sys/socketvar.h>
47#include <sys/sysctl.h>
48#include <sys/syslog.h>
49#include <sys/systm.h>
50
51#include <net/content_filter.h>
52#include <net/if.h>
53#include <net/if_var.h>
54#include <netinet/in.h>
55#include <netinet/in_pcb.h>
56#include <netinet/in_var.h>
57#include <netinet/tcp.h>
58#include <netinet/tcp_cache.h>
59#include <netinet/tcp_fsm.h>
60#include <netinet/tcp_seq.h>
61#include <netinet/tcp_var.h>
62#include <netinet/mptcp_var.h>
63#include <netinet/mptcp.h>
64#include <netinet/mptcp_opt.h>
65#include <netinet/mptcp_seq.h>
66#include <netinet/mptcp_timer.h>
67#include <libkern/crypto/sha1.h>
68#include <libkern/crypto/sha2.h>
69#include <netinet6/in6_pcb.h>
70#include <netinet6/ip6protosw.h>
71#include <dev/random/randomdev.h>
72#include <net/sockaddr_utils.h>
73
74/*
75 * Notes on MPTCP implementation.
76 *
77 * MPTCP is implemented as <SOCK_STREAM,IPPROTO_TCP> protocol in PF_MULTIPATH
78 * communication domain. The structure mtcbinfo describes the MPTCP instance
79 * of a Multipath protocol in that domain. It is used to keep track of all
80 * MPTCP PCB instances in the system, and is protected by the global lock
81 * mppi_lock.
82 *
83 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
84 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
85 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
86 * allocated from the same memory block, and each structure has a pointer
87 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
88 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
89 * PCB (mppcb) as well as the MPTCP Session (mptses).
90 *
91 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB;
92 *
93 * A functioning MPTCP Session consists of one or more subflow sockets. Each
94 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
95 * represented by the mptsub structure. Because each subflow requires access
96 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
97 * subflow. This gets decremented prior to the subflow's destruction.
98 *
99 * To handle events (read, write, control) from the subflows, we do direct
100 * upcalls into the specific function.
101 *
102 * The whole MPTCP connection is protected by a single lock, the MPTCP socket's
103 * lock. Incoming data on a subflow also ends up taking this single lock. To
104 * achieve the latter, tcp_lock/unlock has been changed to rather use the lock
105 * of the MPTCP-socket.
106 *
107 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
108 * work is done by the MPTCP garbage collector which is invoked on demand by
109 * the PF_MULTIPATH garbage collector. This process will take place once all
110 * of the subflows have been destroyed.
111 */
112
113static void mptcp_subflow_abort(struct mptsub *, int);
114
115static void mptcp_send_dfin(struct socket *so);
116static void mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts);
117static int mptcp_freeq(struct mptcb *mp_tp);
118
119/*
120 * Possible return values for subflow event handlers. Note that success
121 * values must be greater or equal than MPTS_EVRET_OK. Values less than that
122 * indicate errors or actions which require immediate attention; they will
123 * prevent the rest of the handlers from processing their respective events
124 * until the next round of events processing.
125 */
126typedef enum {
127 MPTS_EVRET_DELETE = 1, /* delete this subflow */
128 MPTS_EVRET_OK = 2, /* OK */
129 MPTS_EVRET_CONNECT_PENDING = 3, /* resume pended connects */
130 MPTS_EVRET_DISCONNECT_FALLBACK = 4, /* abort all but preferred */
131} ev_ret_t;
132
133static void mptcp_do_sha1(mptcp_key_t *, char *);
134static void mptcp_do_sha256(mptcp_key_t *, char *);
135
136static void mptcp_init_local_parms(struct mptses *, struct sockaddr *);
137
138static KALLOC_TYPE_DEFINE(mptsub_zone, struct mptsub, NET_KT_DEFAULT);
139static KALLOC_TYPE_DEFINE(mptopt_zone, struct mptopt, NET_KT_DEFAULT);
140static KALLOC_TYPE_DEFINE(mpt_subauth_zone, struct mptcp_subf_auth_entry,
141 NET_KT_DEFAULT);
142
143struct mppcbinfo mtcbinfo;
144
145SYSCTL_DECL(_net_inet);
146
147SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "MPTCP");
148
149SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
150 &mtcbinfo.mppi_count, 0, "Number of active PCBs");
151
152
153static int mptcp_alternate_port = 0;
154SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
155 &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");
156
157static struct protosw mptcp_subflow_protosw;
158static struct pr_usrreqs mptcp_subflow_usrreqs;
159static struct ip6protosw mptcp_subflow_protosw6;
160static struct pr_usrreqs mptcp_subflow_usrreqs6;
161
162static uint8_t mptcp_create_subflows_scheduled;
163
164/* Using Symptoms Advisory to detect poor WiFi or poor Cell */
165static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
166static uint32_t mptcp_kern_skt_inuse = 0;
167static uint32_t mptcp_kern_skt_unit;
168static symptoms_advisory_t mptcp_advisory;
169
170uint32_t mptcp_cellicon_refcount = 0;
171
172os_log_t mptcp_log_handle;
173
174int
175mptcpstats_get_index_by_ifindex(struct mptcp_itf_stats *stats, u_short ifindex, boolean_t create)
176{
177 int i, index = -1;
178
179 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
180 if (create && stats[i].ifindex == IFSCOPE_NONE) {
181 if (index < 0) {
182 index = i;
183 }
184 continue;
185 }
186
187 if (stats[i].ifindex == ifindex) {
188 index = i;
189 return index;
190 }
191 }
192
193 if (index != -1) {
194 stats[index].ifindex = ifindex;
195 }
196
197 return index;
198}
199
200static int
201mptcpstats_get_index(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
202{
203 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
204 int index;
205
206 if (ifp == NULL) {
207 os_log_error(mptcp_log_handle, "%s - %lx: no ifp on subflow, state %u flags %#x\n",
208 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
209 sototcpcb(mpts->mpts_socket)->t_state, mpts->mpts_flags);
210 return -1;
211 }
212
213 index = mptcpstats_get_index_by_ifindex(stats, ifindex: ifp->if_index, true);
214
215 if (index != -1) {
216 if (stats[index].is_expensive == 0) {
217 stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
218 }
219 }
220
221 return index;
222}
223
224void
225mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
226{
227 int index;
228
229 tcpstat.tcps_mp_switches++;
230 mpte->mpte_subflow_switches++;
231
232 index = mptcpstats_get_index(stats: mpte->mpte_itfstats, mpts);
233
234 if (index != -1) {
235 mpte->mpte_itfstats[index].switches++;
236 }
237}
238
239/*
240 * Flushes all recorded socket options from an MP socket.
241 */
242static void
243mptcp_flush_sopts(struct mptses *mpte)
244{
245 struct mptopt *mpo, *tmpo;
246
247 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
248 mptcp_sopt_remove(mpte, mpo);
249 mptcp_sopt_free(mpo);
250 }
251 VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
252}
253
254/*
255 * Create an MPTCP session, called as a result of opening a MPTCP socket.
256 */
257int
258mptcp_session_create(struct mppcb *mpp)
259{
260 struct mpp_mtp *mtp;
261 struct mppcbinfo *mppi;
262 struct mptses *mpte;
263 struct mptcb *mp_tp;
264
265 VERIFY(mpp != NULL);
266 mppi = mpp->mpp_pcbinfo;
267 VERIFY(mppi != NULL);
268
269 mtp = __container_of(mpp, struct mpp_mtp, mpp);
270 mpte = &mtp->mpp_ses;
271 mp_tp = &mtp->mtcb;
272
273 /* MPTCP Multipath PCB Extension */
274 bzero(s: mpte, n: sizeof(*mpte));
275 VERIFY(mpp->mpp_pcbe == NULL);
276 mpp->mpp_pcbe = mpte;
277 mpte->mpte_mppcb = mpp;
278 mpte->mpte_mptcb = mp_tp;
279
280 TAILQ_INIT(&mpte->mpte_sopts);
281 TAILQ_INIT(&mpte->mpte_subflows);
282 mpte->mpte_associd = SAE_ASSOCID_ANY;
283 mpte->mpte_connid_last = SAE_CONNID_ANY;
284
285 mptcp_init_urgency_timer(mpte);
286
287 mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
288 mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;
289
290 if (mptcp_alternate_port > 0 && mptcp_alternate_port < UINT16_MAX) {
291 mpte->mpte_alternate_port = htons((uint16_t)mptcp_alternate_port);
292 }
293
294 mpte->mpte_last_cellicon_set = tcp_now;
295
296 /* MPTCP Protocol Control Block */
297 bzero(s: mp_tp, n: sizeof(*mp_tp));
298 mp_tp->mpt_mpte = mpte;
299 mp_tp->mpt_state = MPTCPS_CLOSED;
300
301 DTRACE_MPTCP1(session__create, struct mppcb *, mpp);
302
303 return 0;
304}
305
306struct sockaddr *
307mptcp_get_session_dst(struct mptses *mpte, boolean_t ipv6, boolean_t ipv4)
308{
309 if (ipv6 && mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
310 return SA(&mpte->mpte_sub_dst_v6);
311 }
312
313 if (ipv4 && mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
314 return SA(&mpte->mpte_sub_dst_v4);
315 }
316
317 /* The interface has neither IPv4 nor IPv6 routes. Give our best guess,
318 * meaning we prefer IPv6 over IPv4.
319 */
320 if (mpte->mpte_sub_dst_v6.sin6_family == AF_INET6) {
321 return SA(&mpte->mpte_sub_dst_v6);
322 }
323
324 if (mpte->mpte_sub_dst_v4.sin_family == AF_INET) {
325 return SA(&mpte->mpte_sub_dst_v4);
326 }
327
328 /* We don't yet have a unicast IP */
329 return NULL;
330}
331
332static void
333mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
334 uint64_t *cellbytes, uint64_t *allbytes)
335{
336 int64_t mycellbytes = 0;
337 uint64_t myallbytes = 0;
338 int i;
339
340 for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
341 if (mpte->mpte_itfstats[i].is_expensive) {
342 mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
343 mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
344 }
345
346 myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
347 myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
348 }
349
350 if (initial_cell) {
351 mycellbytes -= mpte->mpte_init_txbytes;
352 mycellbytes -= mpte->mpte_init_rxbytes;
353 }
354
355 if (mycellbytes < 0) {
356 os_log_error(mptcp_log_handle, "%s - %lx: cellbytes is %lld\n",
357 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mycellbytes);
358 *cellbytes = 0;
359 *allbytes = 0;
360 } else {
361 *cellbytes = mycellbytes;
362 *allbytes = myallbytes;
363 }
364}
365
366static void
367mptcpstats_session_wrapup(struct mptses *mpte)
368{
369 boolean_t cell = mpte->mpte_initial_cell;
370
371 switch (mpte->mpte_svctype) {
372 case MPTCP_SVCTYPE_HANDOVER:
373 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
374 tcpstat.tcps_mptcp_fp_handover_attempt++;
375
376 if (cell && mpte->mpte_handshake_success) {
377 tcpstat.tcps_mptcp_fp_handover_success_cell++;
378
379 if (mpte->mpte_used_wifi) {
380 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
381 }
382 } else if (mpte->mpte_handshake_success) {
383 tcpstat.tcps_mptcp_fp_handover_success_wifi++;
384
385 if (mpte->mpte_used_cell) {
386 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
387 }
388 }
389 } else {
390 tcpstat.tcps_mptcp_handover_attempt++;
391
392 if (cell && mpte->mpte_handshake_success) {
393 tcpstat.tcps_mptcp_handover_success_cell++;
394
395 if (mpte->mpte_used_wifi) {
396 tcpstat.tcps_mptcp_handover_wifi_from_cell++;
397 }
398 } else if (mpte->mpte_handshake_success) {
399 tcpstat.tcps_mptcp_handover_success_wifi++;
400
401 if (mpte->mpte_used_cell) {
402 tcpstat.tcps_mptcp_handover_cell_from_wifi++;
403 }
404 }
405 }
406
407 if (mpte->mpte_handshake_success) {
408 uint64_t cellbytes;
409 uint64_t allbytes;
410
411 mptcpstats_get_bytes(mpte, initial_cell: cell, cellbytes: &cellbytes, allbytes: &allbytes);
412
413 tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
414 tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
415 }
416 break;
417 case MPTCP_SVCTYPE_INTERACTIVE:
418 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
419 tcpstat.tcps_mptcp_fp_interactive_attempt++;
420
421 if (mpte->mpte_handshake_success) {
422 tcpstat.tcps_mptcp_fp_interactive_success++;
423
424 if (!cell && mpte->mpte_used_cell) {
425 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
426 }
427 }
428 } else {
429 tcpstat.tcps_mptcp_interactive_attempt++;
430
431 if (mpte->mpte_handshake_success) {
432 tcpstat.tcps_mptcp_interactive_success++;
433
434 if (!cell && mpte->mpte_used_cell) {
435 tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
436 }
437 }
438 }
439
440 if (mpte->mpte_handshake_success) {
441 uint64_t cellbytes;
442 uint64_t allbytes;
443
444 mptcpstats_get_bytes(mpte, initial_cell: cell, cellbytes: &cellbytes, allbytes: &allbytes);
445
446 tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
447 tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
448 }
449 break;
450 case MPTCP_SVCTYPE_AGGREGATE:
451 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
452 tcpstat.tcps_mptcp_fp_aggregate_attempt++;
453
454 if (mpte->mpte_handshake_success) {
455 tcpstat.tcps_mptcp_fp_aggregate_success++;
456 }
457 } else {
458 tcpstat.tcps_mptcp_aggregate_attempt++;
459
460 if (mpte->mpte_handshake_success) {
461 tcpstat.tcps_mptcp_aggregate_success++;
462 }
463 }
464
465 if (mpte->mpte_handshake_success) {
466 uint64_t cellbytes;
467 uint64_t allbytes;
468
469 mptcpstats_get_bytes(mpte, initial_cell: cell, cellbytes: &cellbytes, allbytes: &allbytes);
470
471 tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
472 tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
473 }
474 break;
475 }
476
477 if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi) {
478 tcpstat.tcps_mptcp_back_to_wifi++;
479 }
480
481 if (mpte->mpte_triggered_cell) {
482 tcpstat.tcps_mptcp_triggered_cell++;
483 }
484}
485
486/*
487 * Destroy an MPTCP session.
488 */
489static void
490mptcp_session_destroy(struct mptses *mpte)
491{
492 struct mptcb *mp_tp = mpte->mpte_mptcb;
493
494 VERIFY(mp_tp != NULL);
495 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);
496
497 mptcpstats_session_wrapup(mpte);
498 mptcp_unset_cellicon(mpte, NULL, val: mpte->mpte_cellicon_increments);
499 mptcp_flush_sopts(mpte);
500
501 if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE) {
502 kfree_data(mpte->mpte_itfinfo,
503 sizeof(*mpte->mpte_itfinfo) * mpte->mpte_itfinfo_size);
504 }
505 mpte->mpte_itfinfo = NULL;
506
507 mptcp_freeq(mp_tp);
508 m_freem_list(mpte->mpte_reinjectq);
509
510 os_log(mptcp_log_handle, "%s - %lx: Destroying session\n",
511 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
512}
513
514boolean_t
515mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
516{
517 return mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
518 mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
519 !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP);
520}
521
522static int
523mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len,
524 const struct in_addr *addrv4)
525{
526 static const struct in6_addr well_known_prefix = {
527 .__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
528 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
529 0x00, 0x00, 0x00, 0x00},
530 };
531 const char *ptrv4 = (const char *)addrv4;
532 char *ptr = (char *)addr;
533
534 if (IN_ZERONET(ntohl(addrv4->s_addr)) || // 0.0.0.0/8 Source hosts on local network
535 IN_LOOPBACK(ntohl(addrv4->s_addr)) || // 127.0.0.0/8 Loopback
536 IN_LINKLOCAL(ntohl(addrv4->s_addr)) || // 169.254.0.0/16 Link Local
537 IN_DS_LITE(ntohl(addrv4->s_addr)) || // 192.0.0.0/29 DS-Lite
538 IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
539 IN_MULTICAST(ntohl(addrv4->s_addr)) || // 224.0.0.0/4 Multicast
540 INADDR_BROADCAST == addrv4->s_addr) { // 255.255.255.255/32 Limited Broadcast
541 return -1;
542 }
543
544 /* Check for the well-known prefix */
545 if (len == NAT64_PREFIX_LEN_96 &&
546 IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
547 if (IN_PRIVATE(ntohl(addrv4->s_addr)) || // 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
548 IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr))) { // 100.64.0.0/10 Shared Address Space
549 return -1;
550 }
551 }
552
553 switch (len) {
554 case NAT64_PREFIX_LEN_96:
555 memcpy(dst: ptr + 12, src: ptrv4, n: 4);
556 break;
557 case NAT64_PREFIX_LEN_64:
558 memcpy(dst: ptr + 9, src: ptrv4, n: 4);
559 break;
560 case NAT64_PREFIX_LEN_56:
561 memcpy(dst: ptr + 7, src: ptrv4, n: 1);
562 memcpy(dst: ptr + 9, src: ptrv4 + 1, n: 3);
563 break;
564 case NAT64_PREFIX_LEN_48:
565 memcpy(dst: ptr + 6, src: ptrv4, n: 2);
566 memcpy(dst: ptr + 9, src: ptrv4 + 2, n: 2);
567 break;
568 case NAT64_PREFIX_LEN_40:
569 memcpy(dst: ptr + 5, src: ptrv4, n: 3);
570 memcpy(dst: ptr + 9, src: ptrv4 + 3, n: 1);
571 break;
572 case NAT64_PREFIX_LEN_32:
573 memcpy(dst: ptr + 4, src: ptrv4, n: 4);
574 break;
575 default:
576 panic("NAT64-prefix len is wrong: %u", len);
577 }
578
579 return 0;
580}
581
582static void
583mptcp_trigger_cell_bringup(struct mptses *mpte)
584{
585 struct socket *mp_so = mptetoso(mpte);
586
587 if (!uuid_is_null(uu: mpsotomppcb(mp_so)->necp_client_uuid)) {
588 uuid_string_t uuidstr;
589 int err;
590
591 socket_unlock(so: mp_so, refcount: 0);
592 err = necp_client_assert_bb_radio_manager(client_id: mpsotomppcb(mp_so)->necp_client_uuid,
593 TRUE);
594 socket_lock(so: mp_so, refcount: 0);
595
596 if (err == 0) {
597 mpte->mpte_triggered_cell = 1;
598 }
599
600 uuid_unparse_upper(uu: mpsotomppcb(mp_so)->necp_client_uuid, out: uuidstr);
601 os_log_info(mptcp_log_handle, "%s - %lx: asked irat to bringup cell for uuid %s, err %d\n",
602 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuidstr, err);
603 } else {
604 os_log_info(mptcp_log_handle, "%s - %lx: UUID is already null\n",
605 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
606 }
607}
608
609static boolean_t
610mptcp_subflow_disconnecting(struct mptsub *mpts)
611{
612 if (mpts->mpts_socket->so_state & SS_ISDISCONNECTED) {
613 return true;
614 }
615
616 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) {
617 return true;
618 }
619
620 if (sototcpcb(mpts->mpts_socket)->t_state == TCPS_CLOSED) {
621 return true;
622 }
623
624 return false;
625}
626
627/*
628 * In Handover mode, only create cell subflow if
629 * - Symptoms marked WiFi as weak:
630 * Here, if we are sending data, then we can check the RTO-state. That is a
631 * stronger signal of WiFi quality than the Symptoms indicator.
632 * If however we are not sending any data, the only thing we can do is guess
633 * and thus bring up Cell.
634 *
635 * - Symptoms marked WiFi as unknown:
636 * In this state we don't know what the situation is and thus remain
637 * conservative, only bringing up cell if there are retransmissions going on.
638 */
639static boolean_t
640mptcp_handover_use_cellular(struct mptses *mpte, struct tcpcb *tp)
641{
642 mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
643
644 if (wifi_quality == MPTCP_WIFI_QUALITY_GOOD) {
645 /* WiFi is good - don't use cell */
646 return false;
647 }
648
649 if (wifi_quality == MPTCP_WIFI_QUALITY_UNSURE) {
650 /*
651 * We are in unknown state, only use Cell if we have confirmed
652 * that WiFi is bad.
653 */
654 if (mptetoso(mpte)->so_snd.sb_cc != 0 && tp->t_rxtshift >= mptcp_fail_thresh * 2) {
655 return true;
656 } else {
657 return false;
658 }
659 }
660
661 if (wifi_quality == MPTCP_WIFI_QUALITY_BAD) {
662 /*
663 * WiFi is confirmed to be bad from Symptoms-Framework.
664 * If we are sending data, check the RTOs.
665 * Otherwise, be pessimistic and use Cell.
666 */
667 if (mptetoso(mpte)->so_snd.sb_cc != 0) {
668 if (tp->t_rxtshift >= mptcp_fail_thresh * 2) {
669 return true;
670 } else {
671 return false;
672 }
673 } else {
674 return true;
675 }
676 }
677
678 return false;
679}
680
681void
682mptcp_check_subflows_and_add(struct mptses *mpte)
683{
684 struct mptcb *mp_tp = mpte->mpte_mptcb;
685 boolean_t cellular_viable = FALSE;
686 boolean_t want_cellular = TRUE;
687 uint32_t i;
688
689 if (!mptcp_ok_to_create_subflows(mp_tp)) {
690 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
691 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
692 return;
693 }
694
695 /* Just to see if we have an IP-address available */
696 if (mptcp_get_session_dst(mpte, false, false) == NULL) {
697 return;
698 }
699
700 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
701 boolean_t need_to_ask_symptoms = FALSE, found = FALSE;
702 struct mpt_itf_info *info;
703 struct sockaddr_in6 nat64pre;
704 struct sockaddr *dst;
705 struct mptsub *mpts;
706 struct ifnet *ifp;
707 uint32_t ifindex;
708
709 info = &mpte->mpte_itfinfo[i];
710
711 ifindex = info->ifindex;
712 if (ifindex == IFSCOPE_NONE) {
713 continue;
714 }
715
716 os_log(mptcp_log_handle, "%s - %lx: itf %u no support %u hasv4 %u has v6 %u hasnat64 %u\n",
717 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), info->ifindex, info->no_mptcp_support,
718 info->has_v4_conn, info->has_v6_conn, info->has_nat64_conn);
719
720 if (info->no_mptcp_support) {
721 continue;
722 }
723
724 ifnet_head_lock_shared();
725 ifp = ifindex2ifnet[ifindex];
726 ifnet_head_done();
727
728 if (ifp == NULL) {
729 continue;
730 }
731
732 if (IFNET_IS_CELLULAR(ifp)) {
733 cellular_viable = TRUE;
734
735 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
736 mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
737 if (mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD) {
738 continue;
739 }
740 }
741 }
742
743 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
744 const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
745 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
746
747 if (subifp == NULL) {
748 continue;
749 }
750
751 /*
752 * If there is at least one functioning subflow on WiFi
753 * and we are checking for the cell interface, then
754 * we always need to ask symptoms for permission as
755 * cell is triggered even if WiFi is available.
756 */
757 if (!IFNET_IS_CELLULAR(subifp) &&
758 !mptcp_subflow_disconnecting(mpts) &&
759 IFNET_IS_CELLULAR(ifp)) {
760 need_to_ask_symptoms = TRUE;
761 }
762
763 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER || mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
764 os_log(mptcp_log_handle,
765 "%s - %lx: %s: cell %u wifi-state %d flags %#x rxt %u first-party %u sb_cc %u ifindex %u this %u rtt %u rttvar %u rto %u\n",
766 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
767 mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ? "handover" : "pure-handover",
768 IFNET_IS_CELLULAR(subifp),
769 mptcp_wifi_quality_for_session(mpte),
770 mpts->mpts_flags,
771 tp->t_rxtshift,
772 !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
773 mptetoso(mpte)->so_snd.sb_cc,
774 ifindex, subifp->if_index,
775 tp->t_srtt >> TCP_RTT_SHIFT,
776 tp->t_rttvar >> TCP_RTTVAR_SHIFT,
777 tp->t_rxtcur);
778
779 if (!IFNET_IS_CELLULAR(subifp) &&
780 !mptcp_subflow_disconnecting(mpts) &&
781 (mpts->mpts_flags & MPTSF_CONNECTED) &&
782 !mptcp_handover_use_cellular(mpte, tp)) {
783 found = TRUE;
784
785 /* We found a proper subflow on WiFi - no need for cell */
786 want_cellular = FALSE;
787 break;
788 }
789 } else if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
790 uint64_t time_now = mach_continuous_time();
791
792 os_log(mptcp_log_handle,
793 "%s - %lx: target-based: %llu now %llu wifi quality %d cell %u sostat %#x mpts_flags %#x tcp-state %u\n",
794 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_time_target,
795 time_now, mptcp_wifi_quality_for_session(mpte),
796 IFNET_IS_CELLULAR(subifp), mpts->mpts_socket->so_state,
797 mpts->mpts_flags, sototcpcb(mpts->mpts_socket)->t_state);
798
799 if (!IFNET_IS_CELLULAR(subifp) &&
800 !mptcp_subflow_disconnecting(mpts) &&
801 (mpte->mpte_time_target == 0 ||
802 (int64_t)(mpte->mpte_time_target - time_now) > 0 ||
803 mptcp_wifi_quality_for_session(mpte) == MPTCP_WIFI_QUALITY_GOOD)) {
804 found = TRUE;
805
806 want_cellular = FALSE;
807 break;
808 }
809 }
810
811 if (subifp->if_index == ifindex &&
812 !mptcp_subflow_disconnecting(mpts)) {
813 /*
814 * We found a subflow on this interface.
815 * No need to create a new one.
816 */
817 found = TRUE;
818 break;
819 }
820 }
821
822 if (found) {
823 continue;
824 }
825
826 if (need_to_ask_symptoms &&
827 !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
828 !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
829 mptcp_developer_mode == 0) {
830 mptcp_ask_symptoms(mpte);
831 return;
832 }
833
834 dst = mptcp_get_session_dst(mpte, ipv6: info->has_v6_conn, ipv4: info->has_v4_conn);
835
836 if (dst->sa_family == AF_INET &&
837 !info->has_v4_conn && info->has_nat64_conn) {
838 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
839 int error, j;
840
841 SOCKADDR_ZERO(&nat64pre, sizeof(struct sockaddr_in6));
842
843 error = ifnet_get_nat64prefix(ifp, nat64prefixes);
844 if (error) {
845 os_log_error(mptcp_log_handle, "%s - %lx: no NAT64-prefix on itf %s, error %d\n",
846 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ifp->if_name, error);
847 continue;
848 }
849
850 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
851 if (nat64prefixes[j].prefix_len != 0) {
852 break;
853 }
854 }
855
856 VERIFY(j < NAT64_MAX_NUM_PREFIXES);
857
858 error = mptcp_synthesize_nat64(addr: &nat64prefixes[j].ipv6_prefix,
859 len: nat64prefixes[j].prefix_len,
860 addrv4: &SIN(dst)->sin_addr);
861 if (error != 0) {
862 os_log_error(mptcp_log_handle, "%s - %lx: cannot synthesize this addr\n",
863 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
864 continue;
865 }
866
867 memcpy(dst: &nat64pre.sin6_addr,
868 src: &nat64prefixes[j].ipv6_prefix,
869 n: sizeof(nat64pre.sin6_addr));
870 nat64pre.sin6_len = sizeof(struct sockaddr_in6);
871 nat64pre.sin6_family = AF_INET6;
872 nat64pre.sin6_port = SIN(dst)->sin_port;
873 nat64pre.sin6_flowinfo = 0;
874 nat64pre.sin6_scope_id = 0;
875
876 dst = SA(&nat64pre);
877 }
878
879 if (dst->sa_family == AF_INET && !info->has_v4_conn) {
880 continue;
881 }
882 if (dst->sa_family == AF_INET6 && !info->has_v6_conn) {
883 continue;
884 }
885
886 mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
887 }
888
889 if (!cellular_viable && want_cellular) {
890 /* Trigger Cell Bringup */
891 mptcp_trigger_cell_bringup(mpte);
892 }
893}
894
895static void
896mptcp_remove_cell_subflows(struct mptses *mpte)
897{
898 struct mptsub *mpts, *tmpts;
899
900 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
901 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
902
903 if (ifp == NULL || !IFNET_IS_CELLULAR(ifp)) {
904 continue;
905 }
906
907 os_log(mptcp_log_handle, "%s - %lx: removing cell subflow\n",
908 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
909
910 soevent(so: mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
911 }
912
913 return;
914}
915
916static void
917mptcp_remove_wifi_subflows(struct mptses *mpte)
918{
919 struct mptsub *mpts, *tmpts;
920
921 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
922 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
923
924 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
925 continue;
926 }
927
928 os_log(mptcp_log_handle, "%s - %lx: removing wifi subflow\n",
929 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
930
931 soevent(so: mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
932 }
933
934 return;
935}
936
937static void
938mptcp_pure_handover_subflows_remove(struct mptses *mpte)
939{
940 mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
941 boolean_t found_working_wifi_subflow = false;
942 boolean_t found_working_cell_subflow = false;
943
944 struct mptsub *mpts;
945
946 /*
947 * Look for a subflow that is on a non-cellular interface in connected
948 * state.
949 *
950 * In that case, remove all cellular subflows.
951 *
952 * If however there is no connected subflow
953 */
954 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
955 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
956 struct socket *so;
957 struct tcpcb *tp;
958
959 if (ifp == NULL) {
960 continue;
961 }
962
963 so = mpts->mpts_socket;
964 tp = sototcpcb(so);
965
966 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
967 tp->t_state != TCPS_ESTABLISHED ||
968 mptcp_subflow_disconnecting(mpts)) {
969 continue;
970 }
971
972 if (IFNET_IS_CELLULAR(ifp)) {
973 found_working_cell_subflow = true;
974 } else {
975 os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
976 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
977 if (!mptcp_handover_use_cellular(mpte, tp)) {
978 found_working_wifi_subflow = true;
979 }
980 }
981 }
982
983 /*
984 * Couldn't find a working subflow, let's not remove those on a cellular
985 * interface.
986 */
987 os_log_debug(mptcp_log_handle, "%s - %lx: Found Wi-Fi: %u Found Cellular %u",
988 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
989 found_working_wifi_subflow, found_working_cell_subflow);
990 if (!found_working_wifi_subflow && wifi_quality != MPTCP_WIFI_QUALITY_GOOD) {
991 if (found_working_cell_subflow) {
992 mptcp_remove_wifi_subflows(mpte);
993 }
994 return;
995 }
996
997 mptcp_remove_cell_subflows(mpte);
998}
999
1000static void
1001mptcp_handover_subflows_remove(struct mptses *mpte)
1002{
1003 mptcp_wifi_quality_t wifi_quality = mptcp_wifi_quality_for_session(mpte);
1004 boolean_t found_working_subflow = false;
1005 struct mptsub *mpts;
1006
1007 /*
1008 * Look for a subflow that is on a non-cellular interface
1009 * and actually works (aka, no retransmission timeout).
1010 */
1011 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1012 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1013 struct socket *so;
1014 struct tcpcb *tp;
1015
1016 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1017 continue;
1018 }
1019
1020 so = mpts->mpts_socket;
1021 tp = sototcpcb(so);
1022
1023 if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
1024 tp->t_state != TCPS_ESTABLISHED) {
1025 continue;
1026 }
1027
1028 os_log_debug(mptcp_log_handle, "%s - %lx: rxt %u sb_cc %u wifi quality %d\n",
1029 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), tp->t_rxtshift, mptetoso(mpte)->so_snd.sb_cc, wifi_quality);
1030
1031 if (!mptcp_handover_use_cellular(mpte, tp)) {
1032 found_working_subflow = true;
1033 break;
1034 }
1035 }
1036
1037 /*
1038 * Couldn't find a working subflow, let's not remove those on a cellular
1039 * interface.
1040 */
1041 if (!found_working_subflow) {
1042 return;
1043 }
1044
1045 mptcp_remove_cell_subflows(mpte);
1046}
1047
1048static void
1049mptcp_targetbased_subflows_remove(struct mptses *mpte)
1050{
1051 uint64_t time_now = mach_continuous_time();
1052 struct mptsub *mpts;
1053
1054 if (mpte->mpte_time_target != 0 &&
1055 (int64_t)(mpte->mpte_time_target - time_now) <= 0 &&
1056 mptcp_wifi_quality_for_session(mpte) != MPTCP_WIFI_QUALITY_GOOD) {
1057 /* WiFi is bad and we are below the target - don't remove any subflows */
1058 return;
1059 }
1060
1061 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1062 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1063
1064 if (ifp == NULL || IFNET_IS_CELLULAR(ifp)) {
1065 continue;
1066 }
1067
1068 /* We have a functioning subflow on WiFi. No need for cell! */
1069 if (mpts->mpts_flags & MPTSF_CONNECTED &&
1070 !mptcp_subflow_disconnecting(mpts)) {
1071 mptcp_remove_cell_subflows(mpte);
1072 break;
1073 }
1074 }
1075}
1076
1077/*
1078 * Based on the MPTCP Service-type and the state of the subflows, we
1079 * will destroy subflows here.
1080 */
1081void
1082mptcp_check_subflows_and_remove(struct mptses *mpte)
1083{
1084 if (!mptcp_ok_to_create_subflows(mp_tp: mpte->mpte_mptcb)) {
1085 return;
1086 }
1087
1088 socket_lock_assert_owned(so: mptetoso(mpte));
1089
1090 if (mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER) {
1091 mptcp_pure_handover_subflows_remove(mpte);
1092 }
1093
1094 if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER) {
1095 mptcp_handover_subflows_remove(mpte);
1096 }
1097
1098 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
1099 mptcp_targetbased_subflows_remove(mpte);
1100 }
1101}
1102
1103static void
1104mptcp_remove_subflows(struct mptses *mpte)
1105{
1106 struct mptsub *mpts, *tmpts;
1107
1108 if (!mptcp_ok_to_create_subflows(mp_tp: mpte->mpte_mptcb)) {
1109 return;
1110 }
1111
1112 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
1113 const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
1114 boolean_t found = false;
1115 uint32_t ifindex;
1116 uint32_t i;
1117
1118 if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
1119 mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;
1120
1121 os_log(mptcp_log_handle, "%s - %lx: itf %u close_reqd last itf %d\n",
1122 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope,
1123 ifp ? ifp->if_index : -1);
1124 soevent(so: mpts->mpts_socket,
1125 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1126
1127 continue;
1128 }
1129
1130 if (ifp == NULL && mpts->mpts_ifscope == IFSCOPE_NONE) {
1131 continue;
1132 }
1133
1134 if (ifp) {
1135 ifindex = ifp->if_index;
1136 } else {
1137 ifindex = mpts->mpts_ifscope;
1138 }
1139
1140 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
1141 if (mpte->mpte_itfinfo[i].ifindex == IFSCOPE_NONE) {
1142 continue;
1143 }
1144
1145 if (mpte->mpte_itfinfo[i].ifindex == ifindex) {
1146 if (mpts->mpts_dst.sa_family == AF_INET6 &&
1147 (mpte->mpte_itfinfo[i].has_v6_conn || mpte->mpte_itfinfo[i].has_nat64_conn)) {
1148 found = true;
1149 break;
1150 }
1151
1152 if (mpts->mpts_dst.sa_family == AF_INET &&
1153 mpte->mpte_itfinfo[i].has_v4_conn) {
1154 found = true;
1155 break;
1156 }
1157 }
1158 }
1159
1160 if (!found) {
1161 os_log(mptcp_log_handle, "%s - %lx: itf %u killing %#x\n",
1162 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1163 ifindex, mpts->mpts_flags);
1164
1165 soevent(so: mpts->mpts_socket,
1166 SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
1167 }
1168 }
1169}
1170
1171static void
1172mptcp_create_subflows(__unused void *arg)
1173{
1174 struct mppcb *mpp;
1175
1176 /*
1177 * Start with clearing, because we might be processing connections
1178 * while a new event comes in.
1179 */
1180 if (OSTestAndClear(bit: 0x01, startAddress: &mptcp_create_subflows_scheduled)) {
1181 os_log_error(mptcp_log_handle, "%s: bit was already cleared!\n", __func__);
1182 }
1183
1184 /* Iterate over all MPTCP connections */
1185
1186 lck_mtx_lock(lck: &mtcbinfo.mppi_lock);
1187
1188 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
1189 struct socket *mp_so = mpp->mpp_socket;
1190 struct mptses *mpte = mpp->mpp_pcbe;
1191
1192 socket_lock(so: mp_so, refcount: 1);
1193 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS) ||
1194 !(mpte->mpte_flags & MPTE_ITFINFO_INIT)) {
1195 socket_unlock(so: mp_so, refcount: 1);
1196 continue;
1197 }
1198
1199 VERIFY(mp_so->so_usecount > 0);
1200
1201 mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;
1202
1203 mptcp_check_subflows_and_add(mpte);
1204 mptcp_remove_subflows(mpte);
1205
1206 mp_so->so_usecount--; /* See mptcp_sched_create_subflows */
1207 socket_unlock(so: mp_so, refcount: 1);
1208 }
1209
1210 lck_mtx_unlock(lck: &mtcbinfo.mppi_lock);
1211}
1212
1213/*
1214 * We need this because we are coming from an NECP-event. This event gets posted
1215 * while holding NECP-locks. The creation of the subflow however leads us back
1216 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
1217 * So, we would deadlock there as we already hold the NECP-lock.
1218 *
1219 * So, let's schedule this separately. It also gives NECP the chance to make
1220 * progress, without having to wait for MPTCP to finish its subflow creation.
1221 */
1222void
1223mptcp_sched_create_subflows(struct mptses *mpte)
1224{
1225 struct mppcb *mpp = mpte->mpte_mppcb;
1226 struct mptcb *mp_tp = mpte->mpte_mptcb;
1227 struct socket *mp_so = mpp->mpp_socket;
1228
1229 if (!mptcp_ok_to_create_subflows(mp_tp)) {
1230 os_log_debug(mptcp_log_handle, "%s - %lx: not a good time for subflows, state %u flags %#x",
1231 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state, mp_tp->mpt_flags);
1232 return;
1233 }
1234
1235 if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
1236 mp_so->so_usecount++; /* To prevent it from being free'd in-between */
1237 mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
1238 }
1239
1240 if (OSTestAndSet(bit: 0x01, startAddress: &mptcp_create_subflows_scheduled)) {
1241 return;
1242 }
1243
1244 /* Do the call in 100ms to allow NECP to schedule it on all sockets */
1245 timeout(mptcp_create_subflows, NULL, ticks: hz / 10);
1246}
1247
1248/*
1249 * Allocate an MPTCP socket option structure.
1250 */
1251struct mptopt *
1252mptcp_sopt_alloc(zalloc_flags_t how)
1253{
1254 return zalloc_flags(mptopt_zone, how | Z_ZERO);
1255}
1256
1257/*
1258 * Free an MPTCP socket option structure.
1259 */
1260void
1261mptcp_sopt_free(struct mptopt *mpo)
1262{
1263 VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));
1264
1265 zfree(mptopt_zone, mpo);
1266}
1267
1268/*
1269 * Add a socket option to the MPTCP socket option list.
1270 */
1271void
1272mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
1273{
1274 socket_lock_assert_owned(so: mptetoso(mpte));
1275 mpo->mpo_flags |= MPOF_ATTACHED;
1276 TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
1277}
1278
1279/*
1280 * Remove a socket option from the MPTCP socket option list.
1281 */
1282void
1283mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
1284{
1285 socket_lock_assert_owned(so: mptetoso(mpte));
1286 VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
1287 mpo->mpo_flags &= ~MPOF_ATTACHED;
1288 TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
1289}
1290
1291/*
1292 * Search for an existing <sopt_level,sopt_name> socket option.
1293 */
1294struct mptopt *
1295mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
1296{
1297 struct mptopt *mpo;
1298
1299 socket_lock_assert_owned(so: mptetoso(mpte));
1300
1301 TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
1302 if (mpo->mpo_level == sopt->sopt_level &&
1303 mpo->mpo_name == sopt->sopt_name) {
1304 break;
1305 }
1306 }
1307 return mpo;
1308}
1309
1310/*
1311 * Allocate a MPTCP subflow structure.
1312 */
1313static struct mptsub *
1314mptcp_subflow_alloc(void)
1315{
1316 return zalloc_flags(mptsub_zone, Z_WAITOK | Z_ZERO);
1317}
1318
1319/*
1320 * Deallocate a subflow structure, called when all of the references held
1321 * on it have been released. This implies that the subflow has been deleted.
1322 */
1323static void
1324mptcp_subflow_free(struct mptsub *mpts)
1325{
1326 VERIFY(mpts->mpts_refcnt == 0);
1327 VERIFY(mpts->mpts_mpte == NULL);
1328 VERIFY(mpts->mpts_socket == NULL);
1329
1330 free_sockaddr(mpts->mpts_src);
1331
1332 zfree(mptsub_zone, mpts);
1333}
1334
1335static void
1336mptcp_subflow_addref(struct mptsub *mpts)
1337{
1338 if (++mpts->mpts_refcnt == 0) {
1339 panic("%s: mpts %p wraparound refcnt", __func__, mpts);
1340 }
1341 /* NOTREACHED */
1342}
1343
1344static void
1345mptcp_subflow_remref(struct mptsub *mpts)
1346{
1347 if (mpts->mpts_refcnt == 0) {
1348 panic("%s: mpts %p negative refcnt", __func__, mpts);
1349 /* NOTREACHED */
1350 }
1351 if (--mpts->mpts_refcnt > 0) {
1352 return;
1353 }
1354
1355 /* callee will unlock and destroy lock */
1356 mptcp_subflow_free(mpts);
1357}
1358
1359static void
1360mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
1361{
1362 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
1363 struct tcpcb *tp = sototcpcb(so);
1364
1365 /*
1366 * From this moment on, the subflow is linked to the MPTCP-connection.
1367 * Locking,... happens now at the MPTCP-layer
1368 */
1369 tp->t_mptcb = mpte->mpte_mptcb;
1370 so->so_flags |= SOF_MP_SUBFLOW;
1371 mp_so->so_usecount++;
1372
1373 /*
1374 * Insert the subflow into the list, and associate the MPTCP PCB
1375 * as well as the the subflow socket. From this point on, removing
1376 * the subflow needs to be done via mptcp_subflow_del().
1377 */
1378 TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
1379 mpte->mpte_numflows++;
1380
1381 mpts->mpts_mpte = mpte;
1382 mpts->mpts_socket = so;
1383 tp->t_mpsub = mpts;
1384 mptcp_subflow_addref(mpts); /* for being in MPTCP subflow list */
1385 mptcp_subflow_addref(mpts); /* for subflow socket */
1386}
1387
1388static void
1389mptcp_subflow_necp_cb(void *handle, __unused int action,
1390 __unused uint32_t interface_index,
1391 uint32_t necp_flags, bool *viable)
1392{
1393 boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
1394 struct inpcb *inp = (struct inpcb *)handle;
1395 struct socket *so = inp->inp_socket;
1396 struct mptsub *mpts;
1397 struct mptses *mpte;
1398
1399 if (low_power) {
1400 action = NECP_CLIENT_CBACTION_NONVIABLE;
1401 }
1402
1403 if (action != NECP_CLIENT_CBACTION_NONVIABLE) {
1404 return;
1405 }
1406
1407 /*
1408 * The socket is being garbage-collected. There is nothing to be done
1409 * here.
1410 */
1411 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
1412 return;
1413 }
1414
1415 socket_lock(so, refcount: 1);
1416
1417 /* Check again after we acquired the lock. */
1418 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1419 goto out;
1420 }
1421
1422 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
1423 mpts = sototcpcb(so)->t_mpsub;
1424
1425 os_log_debug(mptcp_log_handle, "%s - %lx: Subflow on itf %u became non-viable, power %u",
1426 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_ifscope, low_power);
1427
1428 mpts->mpts_flags |= MPTSF_CLOSE_REQD;
1429
1430 mptcp_sched_create_subflows(mpte);
1431
1432 if ((mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER ||
1433 mpte->mpte_svctype == MPTCP_SVCTYPE_PURE_HANDOVER ||
1434 mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) &&
1435 viable != NULL) {
1436 *viable = 1;
1437 }
1438
1439out:
1440 socket_unlock(so, refcount: 1);
1441}
1442
1443/*
1444 * Create an MPTCP subflow socket.
1445 */
1446static int
1447mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
1448 struct socket **so)
1449{
1450 lck_mtx_t *subflow_mtx;
1451 struct mptopt smpo, *mpo, *tmpo;
1452 struct proc *p;
1453 struct socket *mp_so;
1454 struct mppcb *mpp;
1455 int error;
1456
1457 *so = NULL;
1458
1459 mp_so = mptetoso(mpte);
1460 mpp = mpsotomppcb(mp_so);
1461
1462 p = proc_find(pid: mp_so->last_pid);
1463 if (p == PROC_NULL) {
1464 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1465 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1466
1467 mptcp_subflow_free(mpts);
1468 return ESRCH;
1469 }
1470
1471 /*
1472 * Create the subflow socket (multipath subflow, non-blocking.)
1473 *
1474 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
1475 * socket; it will be cleared when the socket is peeled off or closed.
1476 * It also indicates to the underlying TCP to handle MPTCP options.
1477 * A multipath subflow socket implies SS_NOFDREF state.
1478 */
1479
1480 /*
1481 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
1482 * the ipi-lock. We cannot hold the socket-lock at that point.
1483 */
1484 socket_unlock(so: mp_so, refcount: 0);
1485 error = socreate_internal(dom, aso: so, SOCK_STREAM, IPPROTO_TCP, p,
1486 SOCF_MPTCP, PROC_NULL);
1487 socket_lock(so: mp_so, refcount: 0);
1488 if (error) {
1489 os_log_error(mptcp_log_handle, "%s - %lx: unable to create subflow socket error %d\n",
1490 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1491
1492 proc_rele(p);
1493
1494 mptcp_subflow_free(mpts);
1495 return error;
1496 }
1497
1498 /*
1499 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
1500 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
1501 * Which is why we also need to get the lock with pr_getlock, as after
1502 * setting the flag, socket_unlock will work on the MPTCP-level lock.
1503 */
1504 subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
1505 lck_mtx_lock(lck: subflow_mtx);
1506
1507 /*
1508 * Must be the first thing we do, to make sure all pointers for this
1509 * subflow are set.
1510 */
1511 mptcp_subflow_attach(mpte, mpts, so: *so);
1512
1513 /*
1514 * A multipath subflow socket is used internally in the kernel,
1515 * therefore it does not have a file desciptor associated by
1516 * default.
1517 */
1518 (*so)->so_state |= SS_NOFDREF;
1519
1520 lck_mtx_unlock(lck: subflow_mtx);
1521
1522 /* prevent the socket buffers from being compressed */
1523 (*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
1524 (*so)->so_snd.sb_flags |= SB_NOCOMPRESS;
1525
1526 /* Inherit preconnect and TFO data flags */
1527 if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA) {
1528 (*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
1529 }
1530 if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT) {
1531 (*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;
1532 }
1533 if (mp_so->so_flags1 & SOF1_DATA_AUTHENTICATED) {
1534 (*so)->so_flags1 |= SOF1_DATA_AUTHENTICATED;
1535 }
1536
1537 /* Inherit uuid and create the related flow. */
1538 if (!uuid_is_null(uu: mpp->necp_client_uuid)) {
1539 struct mptcb *mp_tp = mpte->mpte_mptcb;
1540
1541 sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;
1542
1543 /*
1544 * A note on the unlock: With MPTCP, we do multiple times a
1545 * necp_client_register_socket_flow. This is problematic,
1546 * because now the lock-ordering guarantee (first necp-locks,
1547 * then socket-locks) is no more respected. So, we need to
1548 * unlock here.
1549 */
1550 socket_unlock(so: mp_so, refcount: 0);
1551 error = necp_client_register_socket_flow(pid: mp_so->last_pid,
1552 client_id: mpp->necp_client_uuid, sotoinpcb(*so));
1553 socket_lock(so: mp_so, refcount: 0);
1554
1555 if (error) {
1556 os_log_error(mptcp_log_handle, "%s - %lx: necp_client_register_socket_flow failed with error %d\n",
1557 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1558
1559 goto out_err;
1560 }
1561
1562 /* Possible state-change during the unlock above */
1563 if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
1564 (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)) {
1565 os_log_error(mptcp_log_handle, "%s - %lx: state changed during unlock: %u flags %#x\n",
1566 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1567 mp_tp->mpt_state, mp_tp->mpt_flags);
1568
1569 error = EINVAL;
1570 goto out_err;
1571 }
1572
1573 uuid_copy(sotoinpcb(*so)->necp_client_uuid, src: mpp->necp_client_uuid);
1574 }
1575
1576 if (mpp->inp_necp_attributes.inp_domain != NULL) {
1577 size_t string_size = strlen(s: mpp->inp_necp_attributes.inp_domain);
1578 sotoinpcb(*so)->inp_necp_attributes.inp_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1579
1580 if (sotoinpcb(*so)->inp_necp_attributes.inp_domain) {
1581 memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain, src: mpp->inp_necp_attributes.inp_domain, n: string_size + 1);
1582 }
1583 }
1584 if (mpp->inp_necp_attributes.inp_account != NULL) {
1585 size_t string_size = strlen(s: mpp->inp_necp_attributes.inp_account);
1586 sotoinpcb(*so)->inp_necp_attributes.inp_account = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1587
1588 if (sotoinpcb(*so)->inp_necp_attributes.inp_account) {
1589 memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_account, src: mpp->inp_necp_attributes.inp_account, n: string_size + 1);
1590 }
1591 }
1592
1593 if (mpp->inp_necp_attributes.inp_domain_owner != NULL) {
1594 size_t string_size = strlen(s: mpp->inp_necp_attributes.inp_domain_owner);
1595 sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1596
1597 if (sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner) {
1598 memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_domain_owner, src: mpp->inp_necp_attributes.inp_domain_owner, n: string_size + 1);
1599 }
1600 }
1601
1602 if (mpp->inp_necp_attributes.inp_tracker_domain != NULL) {
1603 size_t string_size = strlen(s: mpp->inp_necp_attributes.inp_tracker_domain);
1604 sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain = kalloc_data(string_size + 1, Z_WAITOK | Z_ZERO);
1605
1606 if (sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain) {
1607 memcpy(sotoinpcb(*so)->inp_necp_attributes.inp_tracker_domain, src: mpp->inp_necp_attributes.inp_tracker_domain, n: string_size + 1);
1608 }
1609 }
1610
1611 /* Needs to happen prior to the delegation! */
1612 (*so)->last_pid = mp_so->last_pid;
1613
1614 if (mp_so->so_flags & SOF_DELEGATED) {
1615 if (mpte->mpte_epid) {
1616 error = so_set_effective_pid(so: *so, epid: mpte->mpte_epid, p, false);
1617 if (error) {
1618 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_pid failed with error %d\n",
1619 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1620 goto out_err;
1621 }
1622 }
1623 if (!uuid_is_null(uu: mpte->mpte_euuid)) {
1624 error = so_set_effective_uuid(so: *so, euuid: mpte->mpte_euuid, p, false);
1625 if (error) {
1626 os_log_error(mptcp_log_handle, "%s - %lx: so_set_effective_uuid failed with error %d\n",
1627 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
1628 goto out_err;
1629 }
1630 }
1631 }
1632
1633 /* inherit the other socket options */
1634 bzero(s: &smpo, n: sizeof(smpo));
1635 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1636 smpo.mpo_level = SOL_SOCKET;
1637 smpo.mpo_intval = 1;
1638
1639 /* disable SIGPIPE */
1640 smpo.mpo_name = SO_NOSIGPIPE;
1641 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1642 goto out_err;
1643 }
1644
1645 /* find out if the subflow's source address goes away */
1646 smpo.mpo_name = SO_NOADDRERR;
1647 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1648 goto out_err;
1649 }
1650
1651 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
1652 /*
1653 * On secondary subflows we might need to set the cell-fallback
1654 * flag (see conditions in mptcp_subflow_sosetopt).
1655 */
1656 smpo.mpo_level = SOL_SOCKET;
1657 smpo.mpo_name = SO_MARK_CELLFALLBACK;
1658 smpo.mpo_intval = 1;
1659 if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0) {
1660 goto out_err;
1661 }
1662 }
1663
1664 /* replay setsockopt(2) on the subflow sockets for eligible options */
1665 TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
1666 int interim;
1667
1668 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1669 continue;
1670 }
1671
1672 /*
1673 * Skip those that are handled internally; these options
1674 * should not have been recorded and marked with the
1675 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1676 */
1677 if (mpo->mpo_level == SOL_SOCKET &&
1678 (mpo->mpo_name == SO_NOSIGPIPE ||
1679 mpo->mpo_name == SO_NOADDRERR ||
1680 mpo->mpo_name == SO_KEEPALIVE)) {
1681 continue;
1682 }
1683
1684 interim = (mpo->mpo_flags & MPOF_INTERIM);
1685 if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
1686 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s val %d interim record removed\n",
1687 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1688 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
1689 mpo->mpo_intval);
1690 mptcp_sopt_remove(mpte, mpo);
1691 mptcp_sopt_free(mpo);
1692 continue;
1693 }
1694 }
1695
1696 /*
1697 * We need to receive everything that the subflow socket has,
1698 * so use a customized socket receive function. We will undo
1699 * this when the socket is peeled off or closed.
1700 */
1701 switch (dom) {
1702 case PF_INET:
1703 (*so)->so_proto = &mptcp_subflow_protosw;
1704 break;
1705 case PF_INET6:
1706 (*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
1707 break;
1708 default:
1709 VERIFY(0);
1710 /* NOTREACHED */
1711 }
1712
1713 proc_rele(p);
1714
1715 DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
1716 int, dom, int, error);
1717
1718 return 0;
1719
1720out_err:
1721 mptcp_subflow_abort(mpts, error);
1722
1723 proc_rele(p);
1724
1725 return error;
1726}
1727
1728/*
1729 * Close an MPTCP subflow socket.
1730 *
1731 * Note that this may be called on an embryonic subflow, and the only
1732 * thing that is guaranteed valid is the protocol-user request.
1733 */
1734static void
1735mptcp_subflow_soclose(struct mptsub *mpts)
1736{
1737 struct socket *so = mpts->mpts_socket;
1738
1739 if (mpts->mpts_flags & MPTSF_CLOSED) {
1740 return;
1741 }
1742
1743 VERIFY(so != NULL);
1744 VERIFY(so->so_flags & SOF_MP_SUBFLOW);
1745 VERIFY((so->so_state & (SS_NBIO | SS_NOFDREF)) == (SS_NBIO | SS_NOFDREF));
1746
1747 DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
1748 struct socket *, so,
1749 struct sockbuf *, &so->so_rcv,
1750 struct sockbuf *, &so->so_snd,
1751 struct mptses *, mpts->mpts_mpte);
1752
1753 mpts->mpts_flags |= MPTSF_CLOSED;
1754
1755 if (so->so_retaincnt == 0) {
1756 soclose_locked(so);
1757
1758 return;
1759 } else {
1760 VERIFY(so->so_usecount > 0);
1761 so->so_usecount--;
1762 }
1763
1764 return;
1765}
1766
1767static void
1768mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp, uint8_t addr_id)
1769{
1770 struct tcpcb *tp = sototcpcb(so);
1771 struct mptcp_subf_auth_entry *sauth_entry;
1772
1773 /*
1774 * The address ID of the first flow is implicitly 0.
1775 */
1776 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
1777 tp->t_local_aid = 0;
1778 } else {
1779 tp->t_local_aid = addr_id;
1780 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
1781 so->so_flags |= SOF_MP_SEC_SUBFLOW;
1782 }
1783 sauth_entry = zalloc(kt_view: mpt_subauth_zone);
1784 sauth_entry->msae_laddr_id = tp->t_local_aid;
1785 sauth_entry->msae_raddr_id = 0;
1786 sauth_entry->msae_raddr_rand = 0;
1787try_again:
1788 sauth_entry->msae_laddr_rand = RandomULong();
1789 if (sauth_entry->msae_laddr_rand == 0) {
1790 goto try_again;
1791 }
1792 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
1793}
1794
1795static void
1796mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
1797{
1798 struct mptcp_subf_auth_entry *sauth_entry;
1799 struct tcpcb *tp = NULL;
1800 int found = 0;
1801
1802 tp = sototcpcb(so);
1803 if (tp == NULL) {
1804 return;
1805 }
1806
1807 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
1808 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
1809 found = 1;
1810 break;
1811 }
1812 }
1813 if (found) {
1814 LIST_REMOVE(sauth_entry, msae_next);
1815 }
1816
1817 if (found) {
1818 zfree(mpt_subauth_zone, sauth_entry);
1819 }
1820}
1821
1822/*
1823 * Connect an MPTCP subflow socket.
1824 *
1825 * Note that in the pending connect case, the subflow socket may have been
1826 * bound to an interface and/or a source IP address which may no longer be
1827 * around by the time this routine is called; in that case the connect attempt
1828 * will most likely fail.
1829 */
1830static int
1831mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
1832{
1833 char dbuf[MAX_IPv6_STR_LEN];
1834 struct socket *mp_so, *so;
1835 struct mptcb *mp_tp;
1836 struct sockaddr *dst;
1837 struct proc *p;
1838 int af, error, dport;
1839
1840 mp_so = mptetoso(mpte);
1841 mp_tp = mpte->mpte_mptcb;
1842 so = mpts->mpts_socket;
1843 af = mpts->mpts_dst.sa_family;
1844 dst = &mpts->mpts_dst;
1845
1846 VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING | MPTSF_CONNECTED)) == MPTSF_CONNECTING);
1847 VERIFY(mpts->mpts_socket != NULL);
1848 VERIFY(af == AF_INET || af == AF_INET6);
1849
1850 if (af == AF_INET) {
1851 inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof(dbuf));
1852 dport = ntohs(SIN(dst)->sin_port);
1853 } else {
1854 inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof(dbuf));
1855 dport = ntohs(SIN6(dst)->sin6_port);
1856 }
1857
1858 os_log(mptcp_log_handle,
1859 "%s - %lx: ifindex %u dst %s:%d pended %u\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
1860 mpts->mpts_ifscope, dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));
1861
1862 p = proc_find(pid: mp_so->last_pid);
1863 if (p == PROC_NULL) {
1864 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
1865 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_so->last_pid);
1866
1867 return ESRCH;
1868 }
1869
1870 mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;
1871
1872 mptcp_attach_to_subf(so, mp_tp: mpte->mpte_mptcb, addr_id: mpte->mpte_addrid_last);
1873
1874 /* connect the subflow socket */
1875 error = soconnectxlocked(so, src: mpts->mpts_src, dst: &mpts->mpts_dst,
1876 p, mpts->mpts_ifscope,
1877 mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);
1878
1879 mpts->mpts_iss = sototcpcb(so)->iss;
1880
1881 /* See tcp_connect_complete */
1882 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
1883 (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
1884 mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
1885 }
1886
1887 /* Allocate a unique address id per subflow */
1888 mpte->mpte_addrid_last++;
1889 if (mpte->mpte_addrid_last == 0) {
1890 mpte->mpte_addrid_last++;
1891 }
1892
1893 proc_rele(p);
1894
1895 DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
1896 struct mptsub *, mpts, int, error);
1897 if (error) {
1898 os_log_error(mptcp_log_handle, "%s - %lx: connectx failed with error %d ifscope %u\n",
1899 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error, mpts->mpts_ifscope);
1900 }
1901
1902 return error;
1903}
1904
1905static int
1906mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
1907 uint32_t rseq, uint16_t dlen, uint8_t dfin)
1908{
1909 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
1910
1911 if (m_pktlen(m) == 0) {
1912 return 0;
1913 }
1914
1915 if (!(m->m_flags & M_PKTHDR)) {
1916 return 0;
1917 }
1918
1919 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP) {
1920 if (off && (dsn != m->m_pkthdr.mp_dsn ||
1921 rseq != m->m_pkthdr.mp_rseq ||
1922 dlen != m->m_pkthdr.mp_rlen ||
1923 dfin != !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN))) {
1924 os_log_error(mptcp_log_handle, "%s - %lx: Received incorrect second mapping: DSN: %u - %u , SSN: %u - %u, DLEN: %u - %u, DFIN: %u - %u\n",
1925 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
1926 (uint32_t)dsn, (uint32_t)m->m_pkthdr.mp_dsn,
1927 rseq, m->m_pkthdr.mp_rseq,
1928 dlen, m->m_pkthdr.mp_rlen,
1929 dfin, !!(m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN));
1930
1931 soevent(so: mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1932 return -1;
1933 }
1934 }
1935
1936 /* If mbuf is beyond right edge of the mapping, we need to split */
1937 if (m_pktlen(m) > dlen - dfin - off) {
1938 struct mbuf *new = m_split(m, dlen - dfin - off, M_DONTWAIT);
1939 if (new == NULL) {
1940 os_log_error(mptcp_log_handle, "%s - %lx: m_split failed dlen %u dfin %u off %d pktlen %d, killing subflow %d",
1941 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpts->mpts_mpte),
1942 dlen, dfin, off, m_pktlen(m),
1943 mpts->mpts_connid);
1944
1945 soevent(so: mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1946 return -1;
1947 }
1948
1949 m->m_next = new;
1950 sballoc(sb: &so->so_rcv, m: new);
1951 /* Undo, as sballoc will add to it as well */
1952 so->so_rcv.sb_cc -= new->m_len;
1953
1954 if (so->so_rcv.sb_mbtail == m) {
1955 so->so_rcv.sb_mbtail = new;
1956 }
1957 }
1958
1959 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
1960 m->m_pkthdr.mp_dsn = dsn + off;
1961 m->m_pkthdr.mp_rseq = rseq + off;
1962 VERIFY(m_pktlen(m) < UINT16_MAX);
1963 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
1964
1965 /* Only put the DATA_FIN-flag on the last mbuf of this mapping */
1966 if (dfin) {
1967 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen < dsn + dlen - dfin) {
1968 m->m_pkthdr.pkt_flags &= ~PKTF_MPTCP_DFIN;
1969 } else {
1970 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
1971 }
1972 }
1973
1974
1975 mpts->mpts_flags |= MPTSF_FULLY_ESTABLISHED;
1976
1977 return 0;
1978}
1979
1980/*
1981 * Update the pid, upid, uuid of the subflow so, based on parent so
1982 */
1983static void
1984mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
1985{
1986 if (so->last_pid != mp_so->last_pid ||
1987 so->last_upid != mp_so->last_upid) {
1988 so->last_upid = mp_so->last_upid;
1989 so->last_pid = mp_so->last_pid;
1990 uuid_copy(dst: so->last_uuid, src: mp_so->last_uuid);
1991 }
1992 so_update_policy(so);
1993}
1994
1995/*
1996 * MPTCP subflow socket receive routine, derived from soreceive().
1997 */
1998static int
1999mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
2000 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
2001{
2002#pragma unused(uio)
2003 struct socket *mp_so;
2004 struct mptses *mpte;
2005 struct mptcb *mp_tp;
2006 int flags, error = 0;
2007 struct mbuf *m, **mp = mp0;
2008 struct tcpcb *tp = sototcpcb(so);
2009
2010 mpte = tptomptp(sototcpcb(so))->mpt_mpte;
2011 mp_so = mptetoso(mpte);
2012 mp_tp = mpte->mpte_mptcb;
2013
2014 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
2015
2016#ifdef MORE_LOCKING_DEBUG
2017 if (so->so_usecount == 1) {
2018 panic("%s: so=%x no other reference on socket", __func__, so);
2019 /* NOTREACHED */
2020 }
2021#endif
2022 /*
2023 * We return all that is there in the subflow's socket receive buffer
2024 * to the MPTCP layer, so we require that the caller passes in the
2025 * expected parameters.
2026 */
2027 if (mp == NULL || controlp != NULL) {
2028 return EINVAL;
2029 }
2030
2031 *mp = NULL;
2032 if (psa != NULL) {
2033 *psa = NULL;
2034 }
2035 if (flagsp != NULL) {
2036 flags = *flagsp & ~MSG_EOR;
2037 } else {
2038 flags = 0;
2039 }
2040
2041 if (flags & (MSG_PEEK | MSG_OOB | MSG_NEEDSA | MSG_WAITALL | MSG_WAITSTREAM)) {
2042 return EOPNOTSUPP;
2043 }
2044
2045 flags |= (MSG_DONTWAIT | MSG_NBIO);
2046
2047 /*
2048 * If a recv attempt is made on a previously-accepted socket
2049 * that has been marked as inactive (disconnected), reject
2050 * the request.
2051 */
2052 if (so->so_flags & SOF_DEFUNCT) {
2053 struct sockbuf *sb = &so->so_rcv;
2054
2055 error = ENOTCONN;
2056 /*
2057 * This socket should have been disconnected and flushed
2058 * prior to being returned from sodefunct(); there should
2059 * be no data on its receive list, so panic otherwise.
2060 */
2061 if (so->so_state & SS_DEFUNCT) {
2062 sb_empty_assert(sb, __func__);
2063 }
2064 return error;
2065 }
2066
2067 /*
2068 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
2069 * and if so just return to the caller. This could happen when
2070 * soreceive() is called by a socket upcall function during the
2071 * time the socket is freed. The socket buffer would have been
2072 * locked across the upcall, therefore we cannot put this thread
2073 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
2074 * we may livelock), because the lock on the socket buffer will
2075 * only be released when the upcall routine returns to its caller.
2076 * Because the socket has been officially closed, there can be
2077 * no further read on it.
2078 *
2079 * A multipath subflow socket would have its SS_NOFDREF set by
2080 * default, so check for SOF_MP_SUBFLOW socket flag; when the
2081 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
2082 */
2083 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
2084 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW)) {
2085 return 0;
2086 }
2087
2088 /*
2089 * For consistency with soreceive() semantics, we need to obey
2090 * SB_LOCK in case some other code path has locked the buffer.
2091 */
2092 error = sblock(sb: &so->so_rcv, flags: 0);
2093 if (error != 0) {
2094 return error;
2095 }
2096
2097 m = so->so_rcv.sb_mb;
2098 if (m == NULL) {
2099 /*
2100 * Panic if we notice inconsistencies in the socket's
2101 * receive list; both sb_mb and sb_cc should correctly
2102 * reflect the contents of the list, otherwise we may
2103 * end up with false positives during select() or poll()
2104 * which could put the application in a bad state.
2105 */
2106 SB_MB_CHECK(&so->so_rcv);
2107
2108 if (so->so_error != 0) {
2109 error = so->so_error;
2110 so->so_error = 0;
2111 goto release;
2112 }
2113
2114 if (so->so_state & SS_CANTRCVMORE) {
2115 goto release;
2116 }
2117
2118 if (!(so->so_state & (SS_ISCONNECTED | SS_ISCONNECTING))) {
2119 error = ENOTCONN;
2120 goto release;
2121 }
2122
2123 /*
2124 * MSG_DONTWAIT is implicitly defined and this routine will
2125 * never block, so return EWOULDBLOCK when there is nothing.
2126 */
2127 error = EWOULDBLOCK;
2128 goto release;
2129 }
2130
2131 mptcp_update_last_owner(so, mp_so);
2132
2133 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2134 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
2135
2136 while (m != NULL) {
2137 int dlen = 0, error_out = 0, off = 0;
2138 uint8_t dfin = 0;
2139 struct mbuf *start = m;
2140 uint64_t dsn;
2141 uint32_t sseq;
2142 uint16_t orig_dlen;
2143 uint16_t csum;
2144
2145 VERIFY(m->m_nextpkt == NULL);
2146
2147 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
2148fallback:
2149 /* Just move mbuf to MPTCP-level */
2150
2151 sbfree(sb: &so->so_rcv, m);
2152
2153 if (mp != NULL) {
2154 *mp = m;
2155 mp = &m->m_next;
2156 so->so_rcv.sb_mb = m = m->m_next;
2157 *mp = NULL;
2158 }
2159
2160 if (m != NULL) {
2161 so->so_rcv.sb_lastrecord = m;
2162 } else {
2163 SB_EMPTY_FIXUP(&so->so_rcv);
2164 }
2165
2166 continue;
2167 } else if (!(m->m_flags & M_PKTHDR) || !(m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2168 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
2169 boolean_t found_mapping = false;
2170 int parsed_length = 0;
2171 struct mbuf *m_iter;
2172
2173 /*
2174 * No MPTCP-option in the header. Either fallback or
2175 * wait for additional mappings.
2176 */
2177 if (!(mpts->mpts_flags & MPTSF_FULLY_ESTABLISHED)) {
2178 /* data arrived without a DSS option mapping */
2179
2180 /* initial subflow can fallback right after SYN handshake */
2181 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
2182 mptcp_notify_mpfail(so);
2183
2184 goto fallback;
2185 } else {
2186 os_log_error(mptcp_log_handle, "%s - %lx: No DSS on secondary subflow. Killing %d\n",
2187 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2188 mpts->mpts_connid);
2189 soevent(so: mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2190
2191 error = EIO;
2192 *mp0 = NULL;
2193 goto release;
2194 }
2195 }
2196
2197 /* Thus, let's look for an mbuf with the mapping */
2198 m_iter = m->m_next;
2199 parsed_length = m->m_len;
2200 while (m_iter != NULL && parsed_length < UINT16_MAX) {
2201 if (!(m_iter->m_flags & M_PKTHDR) || !(m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
2202 parsed_length += m_iter->m_len;
2203 m_iter = m_iter->m_next;
2204 continue;
2205 }
2206
2207 found_mapping = true;
2208
2209 /* Found an mbuf with a DSS-mapping */
2210 orig_dlen = dlen = m_iter->m_pkthdr.mp_rlen;
2211 dsn = m_iter->m_pkthdr.mp_dsn;
2212 sseq = m_iter->m_pkthdr.mp_rseq;
2213 csum = m_iter->m_pkthdr.mp_csum;
2214
2215 if (m_iter->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2216 dfin = 1;
2217 dlen--;
2218 }
2219
2220 break;
2221 }
2222
2223 if (!found_mapping && parsed_length < UINT16_MAX) {
2224 /* Mapping not yet present, we can wait! */
2225 if (*mp0 == NULL) {
2226 error = EWOULDBLOCK;
2227 }
2228 goto release;
2229 } else if (!found_mapping && parsed_length >= UINT16_MAX) {
2230 os_log_error(mptcp_log_handle, "%s - %lx: Received more than 64KB without DSS mapping. Killing %d\n",
2231 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
2232 mpts->mpts_connid);
2233 /* Received 64KB without DSS-mapping. We should kill the subflow */
2234 soevent(so: mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2235
2236 error = EIO;
2237 *mp0 = NULL;
2238 goto release;
2239 }
2240 } else {
2241 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
2242 dsn = m->m_pkthdr.mp_dsn;
2243 sseq = m->m_pkthdr.mp_rseq;
2244 csum = m->m_pkthdr.mp_csum;
2245
2246 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) {
2247 dfin = 1;
2248 dlen--;
2249 }
2250 }
2251
2252 /* Now, see if we need to remove previous packets */
2253 if (SEQ_GT(sseq + tp->irs, tp->rcv_nxt - so->so_rcv.sb_cc)) {
2254 /* Ok, there is data in there that we don't need - let's throw it away! */
2255 int totrim = (int)sseq + tp->irs - (tp->rcv_nxt - so->so_rcv.sb_cc);
2256
2257 sbdrop(sb: &so->so_rcv, len: totrim);
2258
2259 m = so->so_rcv.sb_mb;
2260 }
2261
2262 /*
2263 * Check if the full mapping is now present
2264 */
2265 if ((int)so->so_rcv.sb_cc < dlen) {
2266 if (*mp0 == NULL) {
2267 error = EWOULDBLOCK;
2268 }
2269 goto release;
2270 }
2271
2272 /* Now, get the full mapping */
2273 off = 0;
2274 while (dlen > 0) {
2275 if (mptcp_adj_rmap(so, m, off, dsn, rseq: sseq, dlen: orig_dlen, dfin)) {
2276 error_out = 1;
2277 error = EIO;
2278 dlen = 0;
2279 *mp0 = NULL;
2280 break;
2281 }
2282
2283 dlen -= m->m_len;
2284 off += m->m_len;
2285 sbfree(sb: &so->so_rcv, m);
2286
2287 if (mp != NULL) {
2288 *mp = m;
2289 mp = &m->m_next;
2290 so->so_rcv.sb_mb = m = m->m_next;
2291 *mp = NULL;
2292 }
2293
2294 ASSERT(dlen == 0 || m);
2295 if (dlen != 0 && m == NULL) {
2296 /* "try" to gracefully recover on customer builds */
2297 error_out = 1;
2298 error = EIO;
2299 dlen = 0;
2300
2301 *mp0 = NULL;
2302
2303 SB_EMPTY_FIXUP(&so->so_rcv);
2304 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
2305
2306 break;
2307 }
2308 }
2309
2310 VERIFY(dlen == 0);
2311
2312 if (m != NULL) {
2313 so->so_rcv.sb_lastrecord = m;
2314 } else {
2315 SB_EMPTY_FIXUP(&so->so_rcv);
2316 }
2317
2318 if (error_out) {
2319 goto release;
2320 }
2321
2322 if (mptcp_validate_csum(sototcpcb(so), m: start, dsn, sseq, dlen: orig_dlen, csum, dfin)) {
2323 error = EIO;
2324 *mp0 = NULL;
2325 goto release;
2326 }
2327
2328 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2329 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
2330 }
2331
2332 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
2333 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
2334
2335 if (flagsp != NULL) {
2336 *flagsp |= flags;
2337 }
2338
2339release:
2340 sbunlock(sb: &so->so_rcv, TRUE);
2341
2342 return error;
2343}
2344
2345/*
2346 * MPTCP subflow socket send routine, derived from sosend().
2347 */
2348static int
2349mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
2350 struct mbuf *top, struct mbuf *control, int flags)
2351{
2352 struct socket *mp_so = mptetoso(mpte: tptomptp(sototcpcb(so))->mpt_mpte);
2353 boolean_t en_tracing = FALSE, proc_held = FALSE;
2354 struct proc *p = current_proc();
2355 int en_tracing_val;
2356 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
2357 int error;
2358
2359 VERIFY(control == NULL);
2360 VERIFY(addr == NULL);
2361 VERIFY(uio == NULL);
2362 VERIFY(flags == 0);
2363 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
2364
2365 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
2366 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
2367
2368 /*
2369 * trace if tracing & network (vs. unix) sockets & and
2370 * non-loopback
2371 */
2372 if (ENTR_SHOULDTRACE &&
2373 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
2374 struct inpcb *inp = sotoinpcb(so);
2375 if (inp->inp_last_outifp != NULL &&
2376 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
2377 en_tracing = TRUE;
2378 en_tracing_val = top->m_pkthdr.len;
2379 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
2380 (unsigned long)VM_KERNEL_ADDRPERM(so),
2381 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
2382 (int64_t)en_tracing_val);
2383 }
2384 }
2385
2386 mptcp_update_last_owner(so, mp_so);
2387
2388 if (mp_so->last_pid != proc_pid(p)) {
2389 p = proc_find(pid: mp_so->last_pid);
2390 if (p == PROC_NULL) {
2391 p = current_proc();
2392 } else {
2393 proc_held = TRUE;
2394 }
2395 }
2396
2397#if NECP
2398 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
2399#endif /* NECP */
2400
2401 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked);
2402 if (error) {
2403 goto out;
2404 }
2405
2406 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
2407 top = NULL;
2408
2409out:
2410 if (top != NULL) {
2411 m_freem(top);
2412 }
2413
2414 if (proc_held) {
2415 proc_rele(p);
2416 }
2417
2418 soclearfastopen(so);
2419
2420 if (en_tracing) {
2421 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2422 (unsigned long)VM_KERNEL_ADDRPERM(so),
2423 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2424 (int64_t)en_tracing_val);
2425 }
2426
2427 return error;
2428}
2429
2430/*
2431 * Subflow socket write upcall.
2432 *
2433 * Called when the associated subflow socket posted a read event.
2434 */
2435static void
2436mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2437{
2438#pragma unused(so, waitf)
2439 struct mptsub *mpts = arg;
2440 struct mptses *mpte = mpts->mpts_mpte;
2441
2442 VERIFY(mpte != NULL);
2443
2444 if (mptcp_should_defer_upcall(mpp: mpte->mpte_mppcb)) {
2445 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL)) {
2446 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2447 }
2448 return;
2449 }
2450
2451 mptcp_output(mpte);
2452}
2453
2454/*
2455 * Subflow socket control event upcall.
2456 */
2457static void
2458mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
2459{
2460#pragma unused(so)
2461 struct mptsub *mpts = arg;
2462 struct mptses *mpte = mpts->mpts_mpte;
2463
2464 socket_lock_assert_owned(so: mptetoso(mpte));
2465
2466 if ((mpts->mpts_evctl & events) == events) {
2467 return;
2468 }
2469
2470 mpts->mpts_evctl |= events;
2471
2472 if (mptcp_should_defer_upcall(mpp: mpte->mpte_mppcb)) {
2473 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
2474 return;
2475 }
2476
2477 mptcp_subflow_workloop(mpte);
2478}
2479
2480/*
2481 * Establish an initial MPTCP connection (if first subflow and not yet
2482 * connected), or add a subflow to an existing MPTCP connection.
2483 */
2484int
2485mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2486 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2487{
2488 struct socket *mp_so, *so = NULL;
2489 struct mptcb *mp_tp;
2490 struct mptsub *mpts = NULL;
2491 int af, error = 0;
2492
2493 mp_so = mptetoso(mpte);
2494 mp_tp = mpte->mpte_mptcb;
2495
2496 socket_lock_assert_owned(so: mp_so);
2497
2498 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2499 /* If the remote end sends Data FIN, refuse subflow adds */
2500 os_log_error(mptcp_log_handle, "%s - %lx: state %u\n",
2501 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mp_tp->mpt_state);
2502 error = ENOTCONN;
2503 goto out_err;
2504 }
2505
2506 if (mpte->mpte_numflows > MPTCP_MAX_NUM_SUBFLOWS) {
2507 error = EOVERFLOW;
2508 goto out_err;
2509 }
2510
2511 mpts = mptcp_subflow_alloc();
2512 if (mpts == NULL) {
2513 os_log_error(mptcp_log_handle, "%s - %lx: malloc subflow failed\n",
2514 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
2515 error = ENOMEM;
2516 goto out_err;
2517 }
2518
2519 if (src) {
2520 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
2521 error = EAFNOSUPPORT;
2522 goto out_err;
2523 }
2524
2525 if (src->sa_family == AF_INET &&
2526 src->sa_len != sizeof(struct sockaddr_in)) {
2527 error = EINVAL;
2528 goto out_err;
2529 }
2530
2531 if (src->sa_family == AF_INET6 &&
2532 src->sa_len != sizeof(struct sockaddr_in6)) {
2533 error = EINVAL;
2534 goto out_err;
2535 }
2536
2537 mpts->mpts_src = SA(alloc_sockaddr(src->sa_len, Z_WAITOK | Z_NOFAIL));
2538
2539 SOCKADDR_COPY(src, mpts->mpts_src, src->sa_len);
2540 }
2541
2542 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
2543 error = EAFNOSUPPORT;
2544 goto out_err;
2545 }
2546
2547 if (dst->sa_family == AF_INET &&
2548 dst->sa_len != sizeof(mpts->__mpts_dst_v4)) {
2549 error = EINVAL;
2550 goto out_err;
2551 }
2552
2553 if (dst->sa_family == AF_INET6 &&
2554 dst->sa_len != sizeof(mpts->__mpts_dst_v6)) {
2555 error = EINVAL;
2556 goto out_err;
2557 }
2558
2559 SOCKADDR_COPY(dst, &mpts->mpts_dst, dst->sa_len);
2560
2561 af = mpts->mpts_dst.sa_family;
2562
2563 ifnet_head_lock_shared();
2564 if ((ifscope > (unsigned)if_index)) {
2565 ifnet_head_done();
2566 error = ENXIO;
2567 goto out_err;
2568 }
2569 ifnet_head_done();
2570
2571 mpts->mpts_ifscope = ifscope;
2572
2573 /* create the subflow socket */
2574 if ((error = mptcp_subflow_socreate(mpte, mpts, dom: af, so: &so)) != 0) {
2575 /*
2576 * Returning (error) and not cleaning up, because up to here
2577 * all we did is creating mpts.
2578 *
2579 * And the contract is that the call to mptcp_subflow_socreate,
2580 * moves ownership of mpts to mptcp_subflow_socreate.
2581 */
2582 return error;
2583 }
2584
2585 /*
2586 * We may be called from within the kernel. Still need to account this
2587 * one to the real app.
2588 */
2589 mptcp_update_last_owner(so: mpts->mpts_socket, mp_so);
2590
2591 /*
2592 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2593 * -1 (SAE_CONNID_ALL).
2594 */
2595 mpte->mpte_connid_last++;
2596 if (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2597 mpte->mpte_connid_last == SAE_CONNID_ANY) {
2598 mpte->mpte_connid_last++;
2599 }
2600
2601 mpts->mpts_connid = mpte->mpte_connid_last;
2602
2603 mpts->mpts_rel_seq = 1;
2604
2605 /* Allocate a unique address id per subflow */
2606 mpte->mpte_addrid_last++;
2607 if (mpte->mpte_addrid_last == 0) {
2608 mpte->mpte_addrid_last++;
2609 }
2610
2611 /* register for subflow socket read/write events */
2612 sock_setupcalls_locked(sock: so, NULL, NULL, wcallback: mptcp_subflow_wupcall, wcontext: mpts, locked: 1);
2613
2614 /* Register for subflow socket control events */
2615 sock_catchevents_locked(sock: so, ecallback: mptcp_subflow_eupcall1, econtext: mpts,
2616 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2617 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2618 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2619 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2620 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2621 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2622 SO_FILT_HINT_ADAPTIVE_WTIMO | SO_FILT_HINT_MP_SUB_ERROR);
2623
2624 /* sanity check */
2625 VERIFY(!(mpts->mpts_flags &
2626 (MPTSF_CONNECTING | MPTSF_CONNECTED | MPTSF_CONNECT_PENDING)));
2627
2628 /*
2629 * Indicate to the TCP subflow whether or not it should establish
2630 * the initial MPTCP connection, or join an existing one. Fill
2631 * in the connection request structure with additional info needed
2632 * by the underlying TCP (to be used in the TCP options, etc.)
2633 */
2634 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2635 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2636
2637 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2638 mptcp_init_local_parms(mpte, dst);
2639 }
2640 soisconnecting(so: mp_so);
2641
2642 /* If fastopen is requested, set state in mpts */
2643 if (so->so_flags1 & SOF1_PRECONNECT_DATA) {
2644 mpts->mpts_flags |= MPTSF_TFO_REQD;
2645 }
2646 } else {
2647 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY)) {
2648 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2649 }
2650 }
2651
2652 mpts->mpts_flags |= MPTSF_CONNECTING;
2653
2654 /* connect right away if first attempt, or if join can be done now */
2655 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING)) {
2656 error = mptcp_subflow_soconnectx(mpte, mpts);
2657 }
2658
2659 if (error) {
2660 goto out_err_close;
2661 }
2662
2663 if (pcid) {
2664 *pcid = mpts->mpts_connid;
2665 }
2666
2667 return 0;
2668
2669out_err_close:
2670 mptcp_subflow_abort(mpts, error);
2671
2672 return error;
2673
2674out_err:
2675 if (mpts) {
2676 mptcp_subflow_free(mpts);
2677 }
2678
2679 return error;
2680}
2681
2682void
2683mptcpstats_update(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
2684{
2685 int index = mptcpstats_get_index(stats, mpts);
2686
2687 if (index != -1) {
2688 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2689
2690 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2691 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2692
2693 stats[index].mpis_wifi_txbytes += inp->inp_wstat->txbytes;
2694 stats[index].mpis_wifi_rxbytes += inp->inp_wstat->rxbytes;
2695
2696 stats[index].mpis_wired_txbytes += inp->inp_Wstat->txbytes;
2697 stats[index].mpis_wired_rxbytes += inp->inp_Wstat->rxbytes;
2698
2699 stats[index].mpis_cell_txbytes += inp->inp_cstat->txbytes;
2700 stats[index].mpis_cell_rxbytes += inp->inp_cstat->rxbytes;
2701 }
2702}
2703
2704/*
2705 * Delete/remove a subflow from an MPTCP. The underlying subflow socket
2706 * will no longer be accessible after a subflow is deleted, thus this
2707 * should occur only after the subflow socket has been disconnected.
2708 */
2709void
2710mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2711{
2712 struct socket *mp_so = mptetoso(mpte);
2713 struct socket *so = mpts->mpts_socket;
2714 struct tcpcb *tp = sototcpcb(so);
2715
2716 socket_lock_assert_owned(so: mp_so);
2717 VERIFY(mpts->mpts_mpte == mpte);
2718 VERIFY(mpte->mpte_numflows != 0);
2719 VERIFY(mp_so->so_usecount > 0);
2720
2721 mptcpstats_update(stats: mpte->mpte_itfstats, mpts);
2722
2723 mptcp_unset_cellicon(mpte, mpts, val: 1);
2724
2725 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2726 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2727
2728 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2729 mpte->mpte_numflows--;
2730 if (mpte->mpte_active_sub == mpts) {
2731 mpte->mpte_active_sub = NULL;
2732 }
2733
2734 /*
2735 * Drop references held by this subflow socket; there
2736 * will be no further upcalls made from this point.
2737 */
2738 sock_setupcalls_locked(sock: so, NULL, NULL, NULL, NULL, locked: 0);
2739 sock_catchevents_locked(sock: so, NULL, NULL, emask: 0);
2740
2741 mptcp_detach_mptcb_from_subf(mp_tp: mpte->mpte_mptcb, so);
2742
2743 mp_so->so_usecount--; /* for subflow socket */
2744 mpts->mpts_mpte = NULL;
2745 mpts->mpts_socket = NULL;
2746
2747 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2748 mptcp_subflow_remref(mpts); /* for subflow socket */
2749
2750 so->so_flags &= ~SOF_MP_SUBFLOW;
2751 tp->t_mptcb = NULL;
2752 tp->t_mpsub = NULL;
2753}
2754
2755void
2756mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2757{
2758 struct socket *so = mpts->mpts_socket;
2759 struct mptcb *mp_tp = mpte->mpte_mptcb;
2760 int send_dfin = 0;
2761
2762 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2763 send_dfin = 1;
2764 }
2765
2766 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2767 (so->so_state & SS_ISCONNECTED)) {
2768 if (send_dfin) {
2769 mptcp_send_dfin(so);
2770 }
2771 soshutdownlock(so, SHUT_WR);
2772 }
2773}
2774
2775static void
2776mptcp_subflow_abort(struct mptsub *mpts, int error)
2777{
2778 struct socket *so = mpts->mpts_socket;
2779 struct tcpcb *tp = sototcpcb(so);
2780
2781 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
2782 return;
2783 }
2784
2785 if (tp->t_state != TCPS_CLOSED) {
2786 tcp_drop(tp, error);
2787 }
2788
2789 mptcp_subflow_eupcall1(so, arg: mpts, SO_FILT_HINT_DISCONNECTED);
2790}
2791
2792/*
2793 * Disconnect a subflow socket.
2794 */
2795void
2796mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2797{
2798 struct socket *so, *mp_so;
2799 struct mptcb *mp_tp;
2800 int send_dfin = 0;
2801
2802 so = mpts->mpts_socket;
2803 mp_tp = mpte->mpte_mptcb;
2804 mp_so = mptetoso(mpte);
2805
2806 socket_lock_assert_owned(so: mp_so);
2807
2808 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
2809 return;
2810 }
2811
2812 mptcp_unset_cellicon(mpte, mpts, val: 1);
2813
2814 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2815
2816 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT) {
2817 send_dfin = 1;
2818 }
2819
2820 if (mp_so->so_flags & SOF_DEFUNCT) {
2821 errno_t ret;
2822
2823 ret = sosetdefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL, TRUE);
2824 if (ret == 0) {
2825 ret = sodefunct(NULL, so, SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2826
2827 if (ret != 0) {
2828 os_log_error(mptcp_log_handle, "%s - %lx: sodefunct failed with %d\n",
2829 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2830 }
2831 } else {
2832 os_log_error(mptcp_log_handle, "%s - %lx: sosetdefunct failed with %d\n",
2833 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), ret);
2834 }
2835 }
2836
2837 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2838 (so->so_state & SS_ISCONNECTED)) {
2839 if (send_dfin) {
2840 mptcp_send_dfin(so);
2841 }
2842
2843 (void) soshutdownlock(so, SHUT_RD);
2844 (void) soshutdownlock(so, SHUT_WR);
2845 (void) sodisconnectlocked(so);
2846 }
2847
2848 /*
2849 * Generate a disconnect event for this subflow socket, in case
2850 * the lower layer doesn't do it; this is needed because the
2851 * subflow socket deletion relies on it.
2852 */
2853 mptcp_subflow_eupcall1(so, arg: mpts, SO_FILT_HINT_DISCONNECTED);
2854}
2855
2856/*
2857 * Subflow socket input.
2858 */
2859static void
2860mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2861{
2862 struct socket *mp_so = mptetoso(mpte);
2863 struct mbuf *m = NULL;
2864 struct socket *so;
2865 int error, wakeup = 0;
2866
2867 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2868 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2869
2870 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2871 struct mptsub *, mpts);
2872
2873 if (!(mpts->mpts_flags & MPTSF_CONNECTED)) {
2874 goto out;
2875 }
2876
2877 so = mpts->mpts_socket;
2878
2879 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2880 if (error != 0 && error != EWOULDBLOCK) {
2881 os_log_error(mptcp_log_handle, "%s - %lx: cid %d error %d\n",
2882 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error);
2883 if (error == ENODATA) {
2884 /*
2885 * Don't ignore ENODATA so as to discover
2886 * nasty middleboxes.
2887 */
2888 mp_so->so_error = ENODATA;
2889
2890 wakeup = 1;
2891 goto out;
2892 }
2893 }
2894
2895 /* In fallback, make sure to accept data on all but one subflow */
2896 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2897 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2898 m_freem(m);
2899 goto out;
2900 }
2901
2902 if (m != NULL) {
2903 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2904 mptcp_set_cellicon(mpte, mpts);
2905
2906 mpte->mpte_used_cell = 1;
2907 } else {
2908 /*
2909 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
2910 * explicitly set the cellicon, then we unset it again.
2911 */
2912 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
2913 mptcp_unset_cellicon(mpte, NULL, val: 1);
2914 }
2915
2916 mpte->mpte_used_wifi = 1;
2917 }
2918
2919 mptcp_input(mpte, m);
2920 }
2921
2922out:
2923 if (wakeup) {
2924 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2925 }
2926
2927 mptcp_handle_deferred_upcalls(mpp: mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2928}
2929
2930void
2931mptcp_handle_input(struct socket *so)
2932{
2933 struct mptsub *mpts, *tmpts;
2934 struct mptses *mpte;
2935
2936 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
2937 return;
2938 }
2939
2940 mpts = sototcpcb(so)->t_mpsub;
2941 mpte = mpts->mpts_mpte;
2942
2943 socket_lock_assert_owned(so: mptetoso(mpte));
2944
2945 if (mptcp_should_defer_upcall(mpp: mpte->mpte_mppcb)) {
2946 if (!(mpte->mpte_mppcb->mpp_flags & MPP_INPUT_HANDLE)) {
2947 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2948 }
2949 return;
2950 }
2951
2952 mpte->mpte_mppcb->mpp_flags |= MPP_INPUT_HANDLE;
2953 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2954 if (mpts->mpts_socket->so_usecount == 0) {
2955 /* Will be removed soon by tcp_garbage_collect */
2956 continue;
2957 }
2958
2959 mptcp_subflow_addref(mpts);
2960 mpts->mpts_socket->so_usecount++;
2961
2962 mptcp_subflow_input(mpte, mpts);
2963
2964 mptcp_subflow_remref(mpts); /* ours */
2965
2966 VERIFY(mpts->mpts_socket->so_usecount != 0);
2967 mpts->mpts_socket->so_usecount--;
2968 }
2969
2970 mptcp_handle_deferred_upcalls(mpp: mpte->mpte_mppcb, MPP_INPUT_HANDLE);
2971}
2972
2973static boolean_t
2974mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2975{
2976 struct mbuf *so_m = so->so_snd.sb_mb;
2977 uint64_t dsn = m->m_pkthdr.mp_dsn;
2978
2979 while (so_m) {
2980 VERIFY(so_m->m_flags & M_PKTHDR);
2981 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2982
2983 /* Part of the segment is covered, don't reinject here */
2984 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2985 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn) {
2986 return TRUE;
2987 }
2988
2989 so_m = so_m->m_next;
2990 }
2991
2992 return FALSE;
2993}
2994
2995/*
2996 * Subflow socket output.
2997 *
2998 * Called for sending data from MPTCP to the underlying subflow socket.
2999 */
3000int
3001mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
3002{
3003 struct mptcb *mp_tp = mpte->mpte_mptcb;
3004 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head = NULL, *tail = NULL;
3005 struct socket *mp_so, *so;
3006 struct tcpcb *tp;
3007 uint64_t mpt_dsn = 0, off = 0;
3008 int sb_cc = 0, error = 0, wakeup = 0;
3009 uint16_t dss_csum;
3010 uint16_t tot_sent = 0;
3011 boolean_t reinjected = FALSE;
3012
3013 mp_so = mptetoso(mpte);
3014 so = mpts->mpts_socket;
3015 tp = sototcpcb(so);
3016
3017 socket_lock_assert_owned(so: mp_so);
3018
3019 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
3020 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
3021
3022 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
3023 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
3024 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
3025 (mpts->mpts_flags & MPTSF_TFO_REQD));
3026 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
3027
3028 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
3029 struct mptsub *, mpts);
3030
3031 /* Remove Addr Option is not sent reliably as per I-D */
3032 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
3033 tp->t_rem_aid = mpte->mpte_lost_aid;
3034 tp->t_mpflags |= TMPF_SND_REM_ADDR;
3035 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
3036 }
3037
3038 /*
3039 * The mbuf chains containing the metadata (as well as pointing to
3040 * the user data sitting at the MPTCP output queue) would then be
3041 * sent down to the subflow socket.
3042 *
3043 * Some notes on data sequencing:
3044 *
3045 * a. Each mbuf must be a M_PKTHDR.
3046 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
3047 * in the mbuf pkthdr structure.
3048 * c. Each mbuf containing the MPTCP metadata must have its
3049 * pkt_flags marked with the PKTF_MPTCP flag.
3050 */
3051
3052 if (mpte->mpte_reinjectq) {
3053 sb_mb = mpte->mpte_reinjectq;
3054 } else {
3055 sb_mb = mp_so->so_snd.sb_mb;
3056 }
3057
3058 if (sb_mb == NULL) {
3059 os_log_error(mptcp_log_handle, "%s - %lx: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
3060 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3061 (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
3062 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1);
3063
3064 /* Fix it to prevent looping */
3065 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3066 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3067 }
3068 goto out;
3069 }
3070
3071 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
3072
3073 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
3074 !(so->so_state & SS_ISCONNECTED) &&
3075 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
3076 tp->t_mpflags |= TMPF_TFO_REQUEST;
3077
3078 /* Opting to call pru_send as no mbuf at subflow level */
3079 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
3080 NULL, current_proc());
3081
3082 goto done_sending;
3083 }
3084
3085 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3086
3087 /* First, drop acknowledged data */
3088 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3089 os_log_error(mptcp_log_handle, "%s - %lx: dropping data, should have been done earlier "
3090 "dsn %u suna %u reinject? %u\n",
3091 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mpt_dsn,
3092 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq);
3093 if (mpte->mpte_reinjectq) {
3094 mptcp_clean_reinjectq(mpte);
3095 } else {
3096 uint64_t len = 0;
3097 len = mp_tp->mpt_snduna - mpt_dsn;
3098 sbdrop(sb: &mp_so->so_snd, len: (int)len);
3099 wakeup = 1;
3100 }
3101 }
3102
3103 /* Check again because of above sbdrop */
3104 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
3105 os_log_error(mptcp_log_handle, "%s - $%lx: send-buffer is empty\n",
3106 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3107 goto out;
3108 }
3109
3110 /*
3111 * In degraded mode, we don't receive data acks, so force free
3112 * mbufs less than snd_nxt
3113 */
3114 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3115 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
3116 mp_so->so_snd.sb_mb) {
3117 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
3118 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
3119 uint64_t len = 0;
3120 len = mp_tp->mpt_snduna - mpt_dsn;
3121 sbdrop(sb: &mp_so->so_snd, len: (int)len);
3122 wakeup = 1;
3123
3124 os_log_error(mptcp_log_handle, "%s - %lx: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
3125 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3126 (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna);
3127 }
3128 }
3129
3130 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
3131 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
3132 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
3133 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
3134 }
3135
3136 /*
3137 * Adjust the top level notion of next byte used for retransmissions
3138 * and sending FINs.
3139 */
3140 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna)) {
3141 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
3142 }
3143
3144 /* Now determine the offset from which to start transmitting data */
3145 if (mpte->mpte_reinjectq) {
3146 sb_mb = mpte->mpte_reinjectq;
3147 } else {
3148dont_reinject:
3149 sb_mb = mp_so->so_snd.sb_mb;
3150 }
3151 if (sb_mb == NULL) {
3152 os_log_error(mptcp_log_handle, "%s - %lx: send-buffer is still empty\n", __func__,
3153 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3154 goto out;
3155 }
3156
3157 if (sb_mb == mpte->mpte_reinjectq) {
3158 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3159 off = 0;
3160
3161 if (mptcp_search_seq_in_sub(m: sb_mb, so)) {
3162 if (mptcp_can_send_more(mp_tp, TRUE)) {
3163 goto dont_reinject;
3164 }
3165
3166 error = ECANCELED;
3167 goto out;
3168 }
3169
3170 reinjected = TRUE;
3171 } else if (flags & MPTCP_SUBOUT_PROBING) {
3172 sb_cc = sb_mb->m_pkthdr.mp_rlen;
3173 off = 0;
3174 } else {
3175 sb_cc = min(a: mp_so->so_snd.sb_cc, b: mp_tp->mpt_sndwnd);
3176
3177 /*
3178 * With TFO, there might be no data at all, thus still go into this
3179 * code-path here.
3180 */
3181 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
3182 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
3183 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3184 sb_cc -= off;
3185 } else {
3186 os_log_error(mptcp_log_handle, "%s - %lx: this should not happen: sndnxt %u sndmax %u\n",
3187 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), (uint32_t)mp_tp->mpt_sndnxt,
3188 (uint32_t)mp_tp->mpt_sndmax);
3189
3190 goto out;
3191 }
3192 }
3193
3194 sb_cc = min(a: sb_cc, b: mptcp_subflow_cwnd_space(so));
3195 if (sb_cc <= 0) {
3196 os_log_error(mptcp_log_handle, "%s - %lx: sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
3197 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
3198 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
3199 mptcp_subflow_cwnd_space(so));
3200 }
3201
3202 sb_cc = min(a: sb_cc, UINT16_MAX);
3203
3204 /*
3205 * Create a DSN mapping for the data we are about to send. It all
3206 * has the same mapping.
3207 */
3208 if (reinjected) {
3209 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
3210 } else {
3211 mpt_dsn = mp_tp->mpt_snduna + off;
3212 }
3213
3214 mpt_mbuf = sb_mb;
3215 while (mpt_mbuf && reinjected == FALSE &&
3216 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
3217 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
3218 off -= mpt_mbuf->m_pkthdr.mp_rlen;
3219 mpt_mbuf = mpt_mbuf->m_next;
3220 }
3221 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
3222
3223 head = tail = NULL;
3224
3225 while (tot_sent < sb_cc) {
3226 int32_t mlen;
3227
3228 mlen = mpt_mbuf->m_len;
3229 mlen -= off;
3230 mlen = MIN(mlen, sb_cc - tot_sent);
3231
3232 if (mlen < 0) {
3233 os_log_error(mptcp_log_handle, "%s - %lx: mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
3234 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mlen, mpt_mbuf->m_pkthdr.mp_rlen,
3235 (uint32_t)off, sb_cc, tot_sent);
3236 goto out;
3237 }
3238
3239 if (mlen == 0) {
3240 goto next;
3241 }
3242
3243 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT, NULL, NULL,
3244 M_COPYM_MUST_COPY_HDR);
3245 if (m == NULL) {
3246 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode failed\n", __func__,
3247 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3248 error = ENOBUFS;
3249 break;
3250 }
3251
3252 /* Create a DSN mapping for the data (m_copym does it) */
3253 VERIFY(m->m_flags & M_PKTHDR);
3254 VERIFY(m->m_next == NULL);
3255
3256 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
3257 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
3258 m->m_pkthdr.mp_dsn = mpt_dsn;
3259 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
3260 m->m_pkthdr.len = mlen;
3261
3262 if (head == NULL) {
3263 head = tail = m;
3264 } else {
3265 tail->m_next = m;
3266 tail = m;
3267 }
3268
3269 tot_sent += mlen;
3270 off = 0;
3271next:
3272 mpt_mbuf = mpt_mbuf->m_next;
3273 }
3274
3275 if (reinjected) {
3276 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
3277 struct mbuf *n = sb_mb;
3278
3279 while (n) {
3280 n->m_pkthdr.mp_dsn += sb_cc;
3281 n->m_pkthdr.mp_rlen -= sb_cc;
3282 n = n->m_next;
3283 }
3284 m_adj(sb_mb, sb_cc);
3285 } else {
3286 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
3287 m_freem(sb_mb);
3288 }
3289 }
3290
3291 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
3292 dss_csum = mptcp_output_csum(m: head, dss_val: mpt_dsn, sseq: mpts->mpts_rel_seq,
3293 dlen: tot_sent);
3294 }
3295
3296 /* Now, let's update rel-seq and the data-level length */
3297 mpts->mpts_rel_seq += tot_sent;
3298 m = head;
3299 while (m) {
3300 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) {
3301 m->m_pkthdr.mp_csum = dss_csum;
3302 }
3303 m->m_pkthdr.mp_rlen = tot_sent;
3304 m = m->m_next;
3305 }
3306
3307 if (head != NULL) {
3308 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
3309 (tp->t_tfo_stats == 0)) {
3310 tp->t_mpflags |= TMPF_TFO_REQUEST;
3311 }
3312
3313 error = so->so_proto->pr_usrreqs->pru_sosend(so, NULL, NULL, head, NULL, 0);
3314 head = NULL;
3315 }
3316
3317done_sending:
3318 if (error == 0 ||
3319 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
3320 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
3321
3322 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
3323 tcpstat.tcps_mp_num_probes++;
3324 if ((uint32_t)tot_sent < mpts->mpts_maxseg) {
3325 mpts->mpts_probecnt += 1;
3326 } else {
3327 mpts->mpts_probecnt +=
3328 tot_sent / mpts->mpts_maxseg;
3329 }
3330 }
3331
3332 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
3333 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
3334 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt)) {
3335 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
3336 }
3337 mp_tp->mpt_sndnxt = new_sndnxt;
3338 }
3339
3340 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
3341
3342 /* Must be here as mptcp_can_send_more() checks for this */
3343 soclearfastopen(so: mp_so);
3344
3345 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
3346 mptcp_set_cellicon(mpte, mpts);
3347
3348 mpte->mpte_used_cell = 1;
3349 } else {
3350 /*
3351 * If during the past MPTCP_CELLICON_TOGGLE_RATE seconds we didn't
3352 * explicitly set the cellicon, then we unset it again.
3353 */
3354 if (TSTMP_LT(mpte->mpte_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE, tcp_now)) {
3355 mptcp_unset_cellicon(mpte, NULL, val: 1);
3356 }
3357
3358 mpte->mpte_used_wifi = 1;
3359 }
3360
3361 /*
3362 * Don't propagate EWOULDBLOCK - it's already taken care of
3363 * in mptcp_usr_send for TFO.
3364 */
3365 error = 0;
3366 } else {
3367 /* We need to revert our change to mpts_rel_seq */
3368 mpts->mpts_rel_seq -= tot_sent;
3369
3370 os_log_error(mptcp_log_handle, "%s - %lx: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
3371 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat);
3372 }
3373out:
3374
3375 if (head != NULL) {
3376 m_freem(head);
3377 }
3378
3379 if (wakeup) {
3380 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
3381 }
3382
3383 mptcp_handle_deferred_upcalls(mpp: mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
3384 return error;
3385}
3386
3387static void
3388mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
3389{
3390 struct mbuf *n, *prev = NULL;
3391
3392 n = mpte->mpte_reinjectq;
3393
3394 /* First, look for an mbuf n, whose data-sequence-number is bigger or
3395 * equal than m's sequence number.
3396 */
3397 while (n) {
3398 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn)) {
3399 break;
3400 }
3401
3402 prev = n;
3403
3404 n = n->m_nextpkt;
3405 }
3406
3407 if (n) {
3408 /* m is already fully covered by the next mbuf in the queue */
3409 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
3410 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
3411 os_log(mptcp_log_handle, "%s - %lx: dsn %u dlen %u rseq %u fully covered with len %u\n",
3412 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3413 (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3414 m->m_pkthdr.mp_rseq, n->m_pkthdr.mp_rlen);
3415 goto dont_queue;
3416 }
3417
3418 /* m is covering the next mbuf entirely, thus we remove this guy */
3419 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
3420 struct mbuf *tmp = n->m_nextpkt;
3421
3422 os_log(mptcp_log_handle, "%s - %lx: m (dsn %u len %u) is covering existing mbuf (dsn %u len %u)\n",
3423 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3424 (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
3425 (uint32_t)n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen);
3426
3427 m->m_nextpkt = NULL;
3428 if (prev == NULL) {
3429 mpte->mpte_reinjectq = tmp;
3430 } else {
3431 prev->m_nextpkt = tmp;
3432 }
3433
3434 m_freem(n);
3435 n = tmp;
3436 }
3437 }
3438
3439 if (prev) {
3440 /* m is already fully covered by the previous mbuf in the queue */
3441 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
3442 os_log(mptcp_log_handle, "%s - %lx: prev (dsn %u len %u) covers us (dsn %u len %u)\n",
3443 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3444 (uint32_t)prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen,
3445 (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen);
3446 goto dont_queue;
3447 }
3448 }
3449
3450 if (prev == NULL) {
3451 mpte->mpte_reinjectq = m;
3452 } else {
3453 prev->m_nextpkt = m;
3454 }
3455
3456 m->m_nextpkt = n;
3457
3458 return;
3459
3460dont_queue:
3461 m_freem(m);
3462 return;
3463}
3464
3465static struct mbuf *
3466mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
3467{
3468 struct socket *mp_so = mptetoso(mpte);
3469 struct mbuf *m;
3470
3471 m = mp_so->so_snd.sb_mb;
3472
3473 while (m) {
3474 /* If this segment covers what we are looking for, return it. */
3475 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
3476 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn)) {
3477 break;
3478 }
3479
3480
3481 /* Segment is no more in the queue */
3482 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn)) {
3483 return NULL;
3484 }
3485
3486 m = m->m_next;
3487 }
3488
3489 return m;
3490}
3491
3492static struct mbuf *
3493mptcp_copy_mbuf_list(struct mptses *mpte, struct mbuf *m, int len)
3494{
3495 struct mbuf *top = NULL, *tail = NULL;
3496 uint64_t dsn;
3497 uint32_t dlen, rseq;
3498
3499 dsn = m->m_pkthdr.mp_dsn;
3500 dlen = m->m_pkthdr.mp_rlen;
3501 rseq = m->m_pkthdr.mp_rseq;
3502
3503 while (len > 0) {
3504 struct mbuf *n;
3505
3506 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3507
3508 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, NULL, NULL, M_COPYM_MUST_COPY_HDR);
3509 if (n == NULL) {
3510 os_log_error(mptcp_log_handle, "%s - %lx: m_copym_mode returned NULL\n",
3511 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3512 goto err;
3513 }
3514
3515 VERIFY(n->m_flags & M_PKTHDR);
3516 VERIFY(n->m_next == NULL);
3517 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3518 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3519 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3520 VERIFY(n->m_len == m->m_len);
3521
3522 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3523
3524 if (top == NULL) {
3525 top = n;
3526 }
3527
3528 if (tail != NULL) {
3529 tail->m_next = n;
3530 }
3531
3532 tail = n;
3533
3534 len -= m->m_len;
3535 m = m->m_next;
3536 }
3537
3538 return top;
3539
3540err:
3541 if (top) {
3542 m_freem(top);
3543 }
3544
3545 return NULL;
3546}
3547
3548static void
3549mptcp_reinject_mbufs(struct socket *so)
3550{
3551 struct tcpcb *tp = sototcpcb(so);
3552 struct mptsub *mpts = tp->t_mpsub;
3553 struct mptcb *mp_tp = tptomptp(tp);
3554 struct mptses *mpte = mp_tp->mpt_mpte;
3555 struct sockbuf *sb = &so->so_snd;
3556 struct mbuf *m;
3557
3558 m = sb->sb_mb;
3559 while (m) {
3560 struct mbuf *n = m->m_next, *orig = m;
3561 bool set_reinject_flag = false;
3562
3563 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3564
3565 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ) {
3566 goto next;
3567 }
3568
3569 /* Has it all already been acknowledged at the data-level? */
3570 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen)) {
3571 goto next;
3572 }
3573
3574 /* Part of this has already been acknowledged - lookup in the
3575 * MPTCP-socket for the segment.
3576 */
3577 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3578 m = mptcp_lookup_dsn(mpte, dsn: m->m_pkthdr.mp_dsn);
3579 if (m == NULL) {
3580 goto next;
3581 }
3582 }
3583
3584 /* Copy the mbuf with headers (aka, DSN-numbers) */
3585 m = mptcp_copy_mbuf_list(mpte, m, len: m->m_pkthdr.mp_rlen);
3586 if (m == NULL) {
3587 break;
3588 }
3589
3590 VERIFY(m->m_nextpkt == NULL);
3591
3592 /* Now, add to the reinject-queue, eliminating overlapping
3593 * segments
3594 */
3595 mptcp_add_reinjectq(mpte, m);
3596
3597 set_reinject_flag = true;
3598 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3599
3600next:
3601 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3602 while (n) {
3603 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3604
3605 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn) {
3606 break;
3607 }
3608
3609 if (set_reinject_flag) {
3610 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3611 }
3612 n = n->m_next;
3613 }
3614
3615 m = n;
3616 }
3617}
3618
3619void
3620mptcp_clean_reinjectq(struct mptses *mpte)
3621{
3622 struct mptcb *mp_tp = mpte->mpte_mptcb;
3623
3624 socket_lock_assert_owned(so: mptetoso(mpte));
3625
3626 while (mpte->mpte_reinjectq) {
3627 struct mbuf *m = mpte->mpte_reinjectq;
3628
3629 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3630 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna)) {
3631 break;
3632 }
3633
3634 mpte->mpte_reinjectq = m->m_nextpkt;
3635 m->m_nextpkt = NULL;
3636 m_freem(m);
3637 }
3638}
3639
3640static ev_ret_t
3641mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3642 uint32_t *p_mpsofilt_hint, uint32_t event)
3643{
3644 struct socket *mp_so, *so;
3645 struct mptcb *mp_tp;
3646
3647 mp_so = mptetoso(mpte);
3648 mp_tp = mpte->mpte_mptcb;
3649 so = mpts->mpts_socket;
3650
3651 /*
3652 * We got an event for this subflow that might need to be propagated,
3653 * based on the state of the MPTCP connection.
3654 */
3655 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3656 (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY) && !(mpts->mpts_flags & MPTSF_MP_READY)) ||
3657 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3658 mp_so->so_error = so->so_error;
3659 *p_mpsofilt_hint |= event;
3660 }
3661
3662 return MPTS_EVRET_OK;
3663}
3664
3665/*
3666 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3667 */
3668static ev_ret_t
3669mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3670 uint32_t *p_mpsofilt_hint, uint32_t event)
3671{
3672 struct socket *mp_so;
3673 struct tcpcb *tp;
3674
3675 mp_so = mptetoso(mpte);
3676 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3677
3678 /*
3679 * This overwrites any previous mpte_lost_aid to avoid storing
3680 * too much state when the typical case has only two subflows.
3681 */
3682 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3683 mpte->mpte_lost_aid = tp->t_local_aid;
3684
3685 /*
3686 * The subflow connection has lost its source address.
3687 */
3688 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3689
3690 if (mp_so->so_flags & SOF_NOADDRAVAIL) {
3691 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3692 }
3693
3694 return MPTS_EVRET_DELETE;
3695}
3696
3697static ev_ret_t
3698mptcp_subflow_mpsuberror_ev(struct mptses *mpte, struct mptsub *mpts,
3699 uint32_t *p_mpsofilt_hint, uint32_t event)
3700{
3701#pragma unused(event, p_mpsofilt_hint)
3702 struct socket *so, *mp_so;
3703
3704 so = mpts->mpts_socket;
3705
3706 if (so->so_error != ENODATA) {
3707 return MPTS_EVRET_OK;
3708 }
3709
3710
3711 mp_so = mptetoso(mpte);
3712
3713 mp_so->so_error = ENODATA;
3714
3715 sorwakeup(so: mp_so);
3716 sowwakeup(so: mp_so);
3717
3718 return MPTS_EVRET_OK;
3719}
3720
3721
3722/*
3723 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3724 * indicates that the remote side sent a Data FIN
3725 */
3726static ev_ret_t
3727mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3728 uint32_t *p_mpsofilt_hint, uint32_t event)
3729{
3730#pragma unused(event, mpts)
3731 struct mptcb *mp_tp = mpte->mpte_mptcb;
3732
3733 /*
3734 * We got a Data FIN for the MPTCP connection.
3735 * The FIN may arrive with data. The data is handed up to the
3736 * mptcp socket and the user is notified so that it may close
3737 * the socket if needed.
3738 */
3739 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT) {
3740 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3741 }
3742
3743 return MPTS_EVRET_OK; /* keep the subflow socket around */
3744}
3745
3746/*
3747 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3748 */
3749static ev_ret_t
3750mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3751 uint32_t *p_mpsofilt_hint, uint32_t event)
3752{
3753#pragma unused(event, p_mpsofilt_hint)
3754 struct mptsub *mpts_alt = NULL;
3755 struct socket *alt_so = NULL;
3756 struct socket *mp_so;
3757 int altpath_exists = 0;
3758
3759 mp_so = mptetoso(mpte);
3760 os_log_info(mptcp_log_handle, "%s - %lx\n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3761
3762 mptcp_reinject_mbufs(so: mpts->mpts_socket);
3763
3764 mpts_alt = mptcp_get_subflow(mpte, NULL);
3765
3766 /* If there is no alternate eligible subflow, ignore the failover hint. */
3767 if (mpts_alt == NULL || mpts_alt == mpts) {
3768 os_log(mptcp_log_handle, "%s - %lx no alternate path\n", __func__,
3769 (unsigned long)VM_KERNEL_ADDRPERM(mpte));
3770
3771 goto done;
3772 }
3773
3774 altpath_exists = 1;
3775 alt_so = mpts_alt->mpts_socket;
3776 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3777 /* All data acknowledged and no RTT spike */
3778 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3779 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3780 } else {
3781 /* no alternate path available */
3782 altpath_exists = 0;
3783 }
3784 }
3785
3786 if (altpath_exists) {
3787 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3788
3789 mpte->mpte_active_sub = mpts_alt;
3790 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3791 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3792
3793 os_log_info(mptcp_log_handle, "%s - %lx: switched from %d to %d\n",
3794 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpts->mpts_connid, mpts_alt->mpts_connid);
3795
3796 mptcpstats_inc_switch(mpte, mpts);
3797
3798 sowwakeup(so: alt_so);
3799 } else {
3800done:
3801 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3802 }
3803
3804 return MPTS_EVRET_OK;
3805}
3806
3807/*
3808 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3809 */
3810static ev_ret_t
3811mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3812 uint32_t *p_mpsofilt_hint, uint32_t event)
3813{
3814 /*
3815 * The subflow connection cannot use the outgoing interface, let's
3816 * close this subflow.
3817 */
3818 mptcp_subflow_abort(mpts, EPERM);
3819
3820 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3821
3822 return MPTS_EVRET_DELETE;
3823}
3824
3825/*
3826 * https://tools.ietf.org/html/rfc6052#section-2
3827 * https://tools.ietf.org/html/rfc6147#section-5.2
3828 */
3829static boolean_t
3830mptcp_desynthesize_ipv6_addr(struct mptses *mpte, const struct in6_addr *addr,
3831 const struct ipv6_prefix *prefix,
3832 struct in_addr *addrv4)
3833{
3834 char buf[MAX_IPv4_STR_LEN];
3835 char *ptrv4 = (char *)addrv4;
3836 const char *ptr = (const char *)addr;
3837
3838 if (memcmp(s1: addr, s2: &prefix->ipv6_prefix, n: prefix->prefix_len) != 0) {
3839 return false;
3840 }
3841
3842 switch (prefix->prefix_len) {
3843 case NAT64_PREFIX_LEN_96:
3844 memcpy(dst: ptrv4, src: ptr + 12, n: 4);
3845 break;
3846 case NAT64_PREFIX_LEN_64:
3847 memcpy(dst: ptrv4, src: ptr + 9, n: 4);
3848 break;
3849 case NAT64_PREFIX_LEN_56:
3850 memcpy(dst: ptrv4, src: ptr + 7, n: 1);
3851 memcpy(dst: ptrv4 + 1, src: ptr + 9, n: 3);
3852 break;
3853 case NAT64_PREFIX_LEN_48:
3854 memcpy(dst: ptrv4, src: ptr + 6, n: 2);
3855 memcpy(dst: ptrv4 + 2, src: ptr + 9, n: 2);
3856 break;
3857 case NAT64_PREFIX_LEN_40:
3858 memcpy(dst: ptrv4, src: ptr + 5, n: 3);
3859 memcpy(dst: ptrv4 + 3, src: ptr + 9, n: 1);
3860 break;
3861 case NAT64_PREFIX_LEN_32:
3862 memcpy(dst: ptrv4, src: ptr + 4, n: 4);
3863 break;
3864 default:
3865 panic("NAT64-prefix len is wrong: %u",
3866 prefix->prefix_len);
3867 }
3868
3869 os_log_info(mptcp_log_handle, "%s - %lx: desynthesized to %s\n", __func__,
3870 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
3871 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3872
3873 return true;
3874}
3875
3876static void
3877mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3878{
3879 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3880 struct socket *so = mpts->mpts_socket;
3881 struct ifnet *ifp;
3882 int j;
3883
3884 /* Subflow IPs will be steered directly by the server - no need to
3885 * desynthesize.
3886 */
3887 if (mpte->mpte_flags & MPTE_UNICAST_IP) {
3888 return;
3889 }
3890
3891 ifp = sotoinpcb(so)->inp_last_outifp;
3892
3893 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3894 return;
3895 }
3896
3897 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3898 int success;
3899
3900 if (nat64prefixes[j].prefix_len == 0) {
3901 continue;
3902 }
3903
3904 success = mptcp_desynthesize_ipv6_addr(mpte,
3905 addr: &mpte->__mpte_dst_v6.sin6_addr,
3906 prefix: &nat64prefixes[j],
3907 addrv4: &mpte->mpte_sub_dst_v4.sin_addr);
3908 if (success) {
3909 mpte->mpte_sub_dst_v4.sin_len = sizeof(mpte->mpte_sub_dst_v4);
3910 mpte->mpte_sub_dst_v4.sin_family = AF_INET;
3911 mpte->mpte_sub_dst_v4.sin_port = mpte->__mpte_dst_v6.sin6_port;
3912
3913 /*
3914 * We connected to a NAT64'ed address. Let's remove it
3915 * from the potential IPs to use. Whenever we are back on
3916 * that network and need to connect, we can synthesize again.
3917 *
3918 * Otherwise, on different IPv6 networks we will attempt
3919 * to connect to that NAT64 address...
3920 */
3921 memset(s: &mpte->mpte_sub_dst_v6, c: 0, n: sizeof(mpte->mpte_sub_dst_v6));
3922 break;
3923 }
3924 }
3925}
3926
3927static void
3928mptcp_try_alternate_port(struct mptses *mpte, struct mptsub *mpts)
3929{
3930 struct inpcb *inp;
3931
3932 if (!mptcp_ok_to_create_subflows(mp_tp: mpte->mpte_mptcb)) {
3933 return;
3934 }
3935
3936 inp = sotoinpcb(mpts->mpts_socket);
3937 if (inp == NULL) {
3938 return;
3939 }
3940
3941 /* Should we try the alternate port? */
3942 if (mpte->mpte_alternate_port &&
3943 inp->inp_fport != mpte->mpte_alternate_port) {
3944 union sockaddr_in_4_6 dst;
3945 struct sockaddr_in *dst_in = SIN(&dst);
3946
3947 SOCKADDR_COPY(&mpts->mpts_dst, &dst, mpts->mpts_dst.sa_len);
3948
3949 dst_in->sin_port = mpte->mpte_alternate_port;
3950
3951 mptcp_subflow_add(mpte, NULL, SA(&dst), ifscope: mpts->mpts_ifscope, NULL);
3952 } else { /* Else, we tried all we could, mark this interface as non-MPTCP */
3953 unsigned int i;
3954
3955 if (inp->inp_last_outifp == NULL) {
3956 return;
3957 }
3958
3959 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3960 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3961
3962 if (inp->inp_last_outifp->if_index == info->ifindex) {
3963 info->no_mptcp_support = 1;
3964 break;
3965 }
3966 }
3967 }
3968}
3969
3970/* If TFO data is succesfully acked, it must be dropped from the mptcp so */
3971static void
3972mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
3973{
3974 struct socket *mp_so = mptetoso(mpte);
3975 struct socket *so = mpts->mpts_socket;
3976 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
3977 struct mptcb *mp_tp = mpte->mpte_mptcb;
3978
3979 /* If data was sent with SYN, rewind state */
3980 if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
3981 u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
3982 unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;
3983
3984 VERIFY(mp_droplen <= (UINT_MAX));
3985 VERIFY(mp_droplen >= tcp_droplen);
3986
3987 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
3988 mpts->mpts_iss += tcp_droplen;
3989 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
3990
3991 if (mp_droplen > tcp_droplen) {
3992 /* handle partial TCP ack */
3993 mp_so->so_flags1 |= SOF1_TFO_REWIND;
3994 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
3995 mp_droplen = tcp_droplen;
3996 } else {
3997 /* all data on SYN was acked */
3998 mpts->mpts_rel_seq = 1;
3999 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4000 }
4001 mp_tp->mpt_sndmax -= tcp_droplen;
4002
4003 if (mp_droplen != 0) {
4004 VERIFY(mp_so->so_snd.sb_mb != NULL);
4005 sbdrop(sb: &mp_so->so_snd, len: (int)mp_droplen);
4006 }
4007 }
4008}
4009
4010/*
4011 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
4012 */
4013static ev_ret_t
4014mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
4015 uint32_t *p_mpsofilt_hint, uint32_t event)
4016{
4017#pragma unused(event, p_mpsofilt_hint)
4018 struct socket *mp_so, *so;
4019 struct inpcb *inp;
4020 struct tcpcb *tp;
4021 struct mptcb *mp_tp;
4022 int af;
4023 boolean_t mpok = FALSE;
4024
4025 mp_so = mptetoso(mpte);
4026 mp_tp = mpte->mpte_mptcb;
4027 so = mpts->mpts_socket;
4028 tp = sototcpcb(so);
4029 af = mpts->mpts_dst.sa_family;
4030
4031 if (mpts->mpts_flags & MPTSF_CONNECTED) {
4032 return MPTS_EVRET_OK;
4033 }
4034
4035 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
4036 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
4037 return MPTS_EVRET_OK;
4038 }
4039
4040 /*
4041 * The subflow connection has been connected. Find out whether it
4042 * is connected as a regular TCP or as a MPTCP subflow. The idea is:
4043 *
4044 * a. If MPTCP connection is not yet established, then this must be
4045 * the first subflow connection. If MPTCP failed to negotiate,
4046 * fallback to regular TCP by degrading this subflow.
4047 *
4048 * b. If MPTCP connection has been established, then this must be
4049 * one of the subsequent subflow connections. If MPTCP failed
4050 * to negotiate, disconnect the connection.
4051 *
4052 * Right now, we simply unblock any waiters at the MPTCP socket layer
4053 * if the MPTCP connection has not been established.
4054 */
4055
4056 if (so->so_state & SS_ISDISCONNECTED) {
4057 /*
4058 * With MPTCP joins, a connection is connected at the subflow
4059 * level, but the 4th ACK from the server elevates the MPTCP
4060 * subflow to connected state. So there is a small window
4061 * where the subflow could get disconnected before the
4062 * connected event is processed.
4063 */
4064 return MPTS_EVRET_OK;
4065 }
4066
4067 if (mpts->mpts_flags & MPTSF_TFO_REQD) {
4068 mptcp_drop_tfo_data(mpte, mpts);
4069 }
4070
4071 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
4072 mpts->mpts_flags |= MPTSF_CONNECTED;
4073
4074 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
4075 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4076 }
4077
4078 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
4079
4080 /* get/verify the outbound interface */
4081 inp = sotoinpcb(so);
4082
4083 mpts->mpts_maxseg = tp->t_maxseg;
4084
4085 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
4086
4087 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
4088 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
4089 mpte->mpte_associd = mpts->mpts_connid;
4090 DTRACE_MPTCP2(state__change,
4091 struct mptcb *, mp_tp,
4092 uint32_t, 0 /* event */);
4093
4094 if (SOCK_DOM(so) == AF_INET) {
4095 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
4096 } else {
4097 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
4098 }
4099
4100 mpts->mpts_flags |= MPTSF_ACTIVE;
4101
4102 /* case (a) above */
4103 if (!mpok) {
4104 tcpstat.tcps_mpcap_fallback++;
4105
4106 tp->t_mpflags |= TMPF_INFIN_SENT;
4107 mptcp_notify_mpfail(so);
4108 } else {
4109 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4110 mptcp_subflows_need_backup_flag(mpte)) {
4111 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4112 } else {
4113 mpts->mpts_flags |= MPTSF_PREFERRED;
4114 }
4115 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4116 mpte->mpte_nummpcapflows++;
4117
4118 if (SOCK_DOM(so) == AF_INET6) {
4119 mptcp_handle_ipv6_connection(mpte, mpts);
4120 }
4121
4122 mptcp_check_subflows_and_add(mpte);
4123
4124 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4125 mpte->mpte_initial_cell = 1;
4126 }
4127
4128 mpte->mpte_handshake_success = 1;
4129 }
4130
4131 mp_tp->mpt_sndwnd = tp->snd_wnd;
4132 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
4133 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
4134 soisconnected(so: mp_so);
4135 } else if (mpok) {
4136 /*
4137 * case (b) above
4138 * In case of additional flows, the MPTCP socket is not
4139 * MPTSF_MP_CAPABLE until an ACK is received from server
4140 * for 3-way handshake. TCP would have guaranteed that this
4141 * is an MPTCP subflow.
4142 */
4143 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
4144 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
4145 mptcp_subflows_need_backup_flag(mpte)) {
4146 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
4147 mpts->mpts_flags &= ~MPTSF_PREFERRED;
4148 } else {
4149 mpts->mpts_flags |= MPTSF_PREFERRED;
4150 }
4151
4152 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
4153 mpte->mpte_nummpcapflows++;
4154
4155 mpts->mpts_rel_seq = 1;
4156
4157 mptcp_check_subflows_and_remove(mpte);
4158 } else {
4159 mptcp_try_alternate_port(mpte, mpts);
4160
4161 tcpstat.tcps_join_fallback++;
4162 if (IFNET_IS_CELLULAR(inp->inp_last_outifp)) {
4163 tcpstat.tcps_mptcp_cell_proxy++;
4164 } else {
4165 tcpstat.tcps_mptcp_wifi_proxy++;
4166 }
4167
4168 soevent(so: mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
4169
4170 return MPTS_EVRET_OK;
4171 }
4172
4173 /* This call, just to "book" an entry in the stats-table for this ifindex */
4174 mptcpstats_get_index(stats: mpte->mpte_itfstats, mpts);
4175
4176 mptcp_output(mpte);
4177
4178 return MPTS_EVRET_OK; /* keep the subflow socket around */
4179}
4180
4181/*
4182 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
4183 */
4184static ev_ret_t
4185mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
4186 uint32_t *p_mpsofilt_hint, uint32_t event)
4187{
4188#pragma unused(event, p_mpsofilt_hint)
4189 struct socket *mp_so, *so;
4190 struct mptcb *mp_tp;
4191
4192 mp_so = mptetoso(mpte);
4193 mp_tp = mpte->mpte_mptcb;
4194 so = mpts->mpts_socket;
4195
4196 if (mpts->mpts_flags & MPTSF_DISCONNECTED) {
4197 return MPTS_EVRET_DELETE;
4198 }
4199
4200 mpts->mpts_flags |= MPTSF_DISCONNECTED;
4201
4202 /* The subflow connection has been disconnected. */
4203
4204 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
4205 mpte->mpte_nummpcapflows--;
4206 if (mpte->mpte_active_sub == mpts) {
4207 mpte->mpte_active_sub = NULL;
4208 }
4209 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
4210 } else {
4211 if (so->so_flags & SOF_MP_SEC_SUBFLOW &&
4212 !(mpts->mpts_flags & MPTSF_CONNECTED)) {
4213 mptcp_try_alternate_port(mpte, mpts);
4214 }
4215 }
4216
4217 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
4218 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
4219 mptcp_drop(mpte, mp_tp, errno: so->so_error);
4220 }
4221
4222 /*
4223 * Clear flags that are used by getconninfo to return state.
4224 * Retain like MPTSF_DELETEOK for internal purposes.
4225 */
4226 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_CONNECT_PENDING |
4227 MPTSF_CONNECTED | MPTSF_DISCONNECTING | MPTSF_PREFERRED |
4228 MPTSF_MP_CAPABLE | MPTSF_MP_READY | MPTSF_MP_DEGRADED | MPTSF_ACTIVE);
4229
4230 return MPTS_EVRET_DELETE;
4231}
4232
4233/*
4234 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
4235 */
4236static ev_ret_t
4237mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
4238 uint32_t *p_mpsofilt_hint, uint32_t event)
4239{
4240#pragma unused(event, p_mpsofilt_hint)
4241 ev_ret_t ret = MPTS_EVRET_OK;
4242 struct socket *mp_so, *so;
4243 struct mptcb *mp_tp;
4244
4245 mp_so = mptetoso(mpte);
4246 mp_tp = mpte->mpte_mptcb;
4247 so = mpts->mpts_socket;
4248 struct inpcb *inp = sotoinpcb(so);
4249 struct tcpcb *tp = intotcpcb(inp);
4250
4251 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE) {
4252 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
4253 } else {
4254 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
4255 }
4256
4257 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
4258 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4259 goto done;
4260 }
4261 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4262 } else {
4263 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
4264 }
4265
4266 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY) {
4267 mpts->mpts_flags |= MPTSF_MP_READY;
4268 } else {
4269 mpts->mpts_flags &= ~MPTSF_MP_READY;
4270 }
4271
4272 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4273 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
4274 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
4275 tcp_cache_update_mptcp_version(tp, FALSE);
4276 }
4277
4278 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
4279 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
4280
4281 m_freem_list(mpte->mpte_reinjectq);
4282 mpte->mpte_reinjectq = NULL;
4283 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
4284 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
4285 ret = MPTS_EVRET_CONNECT_PENDING;
4286 }
4287
4288done:
4289 return ret;
4290}
4291
4292/*
4293 * Handle SO_FILT_HINT_MUSTRST subflow socket event
4294 */
4295static ev_ret_t
4296mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
4297 uint32_t *p_mpsofilt_hint, uint32_t event)
4298{
4299#pragma unused(event)
4300 struct socket *mp_so, *so;
4301 struct mptcb *mp_tp;
4302 boolean_t is_fastclose;
4303
4304 mp_so = mptetoso(mpte);
4305 mp_tp = mpte->mpte_mptcb;
4306 so = mpts->mpts_socket;
4307
4308 /* We got an invalid option or a fast close */
4309 struct inpcb *inp = sotoinpcb(so);
4310 struct tcpcb *tp = NULL;
4311
4312 tp = intotcpcb(inp);
4313 so->so_error = ECONNABORTED;
4314
4315 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
4316
4317 tp->t_mpflags |= TMPF_RESET;
4318
4319 if (tp->t_state != TCPS_CLOSED) {
4320 struct mbuf *m;
4321 struct tcptemp *t_template = tcp_maketemplate(tp, &m);
4322
4323 if (t_template) {
4324 struct tcp_respond_args tra;
4325
4326 bzero(s: &tra, n: sizeof(tra));
4327 if (inp->inp_flags & INP_BOUND_IF) {
4328 tra.ifscope = inp->inp_boundifp->if_index;
4329 } else {
4330 tra.ifscope = IFSCOPE_NONE;
4331 }
4332 tra.awdl_unrestricted = 1;
4333
4334 tcp_respond(tp, t_template->tt_ipgen,
4335 &t_template->tt_t, (struct mbuf *)NULL,
4336 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
4337 (void) m_free(m);
4338 }
4339 }
4340
4341 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
4342 struct mptsub *iter, *tmp;
4343
4344 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
4345
4346 mp_so->so_error = ECONNRESET;
4347
4348 TAILQ_FOREACH_SAFE(iter, &mpte->mpte_subflows, mpts_entry, tmp) {
4349 if (iter == mpts) {
4350 continue;
4351 }
4352 mptcp_subflow_abort(mpts: iter, ECONNABORTED);
4353 }
4354
4355 /*
4356 * mptcp_drop is being called after processing the events, to fully
4357 * close the MPTCP connection
4358 */
4359 mptcp_drop(mpte, mp_tp, errno: mp_so->so_error);
4360 }
4361
4362 mptcp_subflow_abort(mpts, ECONNABORTED);
4363
4364 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS) {
4365 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
4366 }
4367
4368 return MPTS_EVRET_DELETE;
4369}
4370
4371static ev_ret_t
4372mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4373 uint32_t *p_mpsofilt_hint, uint32_t event)
4374{
4375#pragma unused(event)
4376 bool found_active = false;
4377
4378 mpts->mpts_flags |= MPTSF_READ_STALL;
4379
4380 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4381 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4382
4383 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4384 TCPS_HAVERCVDFIN2(tp->t_state)) {
4385 continue;
4386 }
4387
4388 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
4389 found_active = true;
4390 break;
4391 }
4392 }
4393
4394 if (!found_active) {
4395 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
4396 }
4397
4398 return MPTS_EVRET_OK;
4399}
4400
4401static ev_ret_t
4402mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
4403 uint32_t *p_mpsofilt_hint, uint32_t event)
4404{
4405#pragma unused(event)
4406 bool found_active = false;
4407
4408 mpts->mpts_flags |= MPTSF_WRITE_STALL;
4409
4410 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
4411 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
4412
4413 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
4414 tp->t_state > TCPS_CLOSE_WAIT) {
4415 continue;
4416 }
4417
4418 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
4419 found_active = true;
4420 break;
4421 }
4422 }
4423
4424 if (!found_active) {
4425 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
4426 }
4427
4428 return MPTS_EVRET_OK;
4429}
4430
4431/*
4432 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
4433 * caller must ensure that the option can be issued on subflow sockets, via
4434 * MPOF_SUBFLOW_OK flag.
4435 */
4436int
4437mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4438{
4439 struct socket *mp_so, *so;
4440 struct sockopt sopt;
4441 int error;
4442
4443 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4444
4445 mp_so = mptetoso(mpte);
4446 so = mpts->mpts_socket;
4447
4448 socket_lock_assert_owned(so: mp_so);
4449
4450 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4451 mpo->mpo_level == SOL_SOCKET &&
4452 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4453 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4454
4455 /*
4456 * When we open a new subflow, mark it as cell fallback, if
4457 * this subflow goes over cell.
4458 *
4459 * (except for first-party apps)
4460 */
4461
4462 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
4463 return 0;
4464 }
4465
4466 if (sotoinpcb(so)->inp_last_outifp &&
4467 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
4468 return 0;
4469 }
4470
4471 /*
4472 * This here is an OR, because if the app is not binding to the
4473 * interface, then it definitely is not a cell-fallback
4474 * connection.
4475 */
4476 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4477 !IFNET_IS_CELLULAR(ifp)) {
4478 return 0;
4479 }
4480 }
4481
4482 mpo->mpo_flags &= ~MPOF_INTERIM;
4483
4484 bzero(s: &sopt, n: sizeof(sopt));
4485 sopt.sopt_dir = SOPT_SET;
4486 sopt.sopt_level = mpo->mpo_level;
4487 sopt.sopt_name = mpo->mpo_name;
4488 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4489 sopt.sopt_valsize = sizeof(int);
4490 sopt.sopt_p = kernproc;
4491
4492 error = sosetoptlock(so, sopt: &sopt, 0);
4493 if (error) {
4494 os_log_error(mptcp_log_handle, "%s - %lx: sopt %s "
4495 "val %d set error %d\n", __func__,
4496 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4497 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4498 mpo->mpo_intval, error);
4499 }
4500 return error;
4501}
4502
4503/*
4504 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4505 * caller must ensure that the option can be issued on subflow sockets, via
4506 * MPOF_SUBFLOW_OK flag.
4507 */
4508int
4509mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4510 struct mptopt *mpo)
4511{
4512 struct socket *mp_so;
4513 struct sockopt sopt;
4514 int error;
4515
4516 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4517 mp_so = mptetoso(mpte);
4518
4519 socket_lock_assert_owned(so: mp_so);
4520
4521 bzero(s: &sopt, n: sizeof(sopt));
4522 sopt.sopt_dir = SOPT_GET;
4523 sopt.sopt_level = mpo->mpo_level;
4524 sopt.sopt_name = mpo->mpo_name;
4525 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4526 sopt.sopt_valsize = sizeof(int);
4527 sopt.sopt_p = kernproc;
4528
4529 error = sogetoptlock(so, sopt: &sopt, 0); /* already locked */
4530 if (error) {
4531 os_log_error(mptcp_log_handle,
4532 "%s - %lx: sopt %s get error %d\n",
4533 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4534 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error);
4535 }
4536 return error;
4537}
4538
4539
4540/*
4541 * MPTCP garbage collector.
4542 *
4543 * This routine is called by the MP domain on-demand, periodic callout,
4544 * which is triggered when a MPTCP socket is closed. The callout will
4545 * repeat as long as this routine returns a non-zero value.
4546 */
4547static uint32_t
4548mptcp_gc(struct mppcbinfo *mppi)
4549{
4550 struct mppcb *mpp, *tmpp;
4551 uint32_t active = 0;
4552
4553 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4554
4555 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4556 struct socket *mp_so;
4557 struct mptses *mpte;
4558 struct mptcb *mp_tp;
4559
4560 mp_so = mpp->mpp_socket;
4561 mpte = mptompte(mp: mpp);
4562 mp_tp = mpte->mpte_mptcb;
4563
4564 if (!mpp_try_lock(mp: mpp)) {
4565 active++;
4566 continue;
4567 }
4568
4569 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4570
4571 /* check again under the lock */
4572 if (mp_so->so_usecount > 0) {
4573 boolean_t wakeup = FALSE;
4574 struct mptsub *mpts, *tmpts;
4575
4576 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4577 if (mp_tp->mpt_gc_ticks > 0) {
4578 mp_tp->mpt_gc_ticks--;
4579 }
4580 if (mp_tp->mpt_gc_ticks == 0) {
4581 wakeup = TRUE;
4582 }
4583 }
4584 if (wakeup) {
4585 TAILQ_FOREACH_SAFE(mpts,
4586 &mpte->mpte_subflows, mpts_entry, tmpts) {
4587 mptcp_subflow_eupcall1(so: mpts->mpts_socket,
4588 arg: mpts, SO_FILT_HINT_DISCONNECTED);
4589 }
4590 }
4591 socket_unlock(so: mp_so, refcount: 0);
4592 active++;
4593 continue;
4594 }
4595
4596 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4597 panic("%s - %lx: skipped state "
4598 "[u=%d,r=%d,s=%d]\n", __func__,
4599 (unsigned long)VM_KERNEL_ADDRPERM(mpte),
4600 mp_so->so_usecount, mp_so->so_retaincnt,
4601 mpp->mpp_state);
4602 }
4603
4604 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT) {
4605 mptcp_close(mpte, mp_tp);
4606 }
4607
4608 mptcp_session_destroy(mpte);
4609
4610 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4611 struct sockbuf *, &mp_so->so_rcv,
4612 struct sockbuf *, &mp_so->so_snd,
4613 struct mppcb *, mpp);
4614
4615 mptcp_pcbdispose(mpp);
4616 sodealloc(so: mp_so);
4617 }
4618
4619 return active;
4620}
4621
4622/*
4623 * Drop a MPTCP connection, reporting the specified error.
4624 */
4625struct mptses *
4626mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, u_short errno)
4627{
4628 struct socket *mp_so = mptetoso(mpte);
4629
4630 VERIFY(mpte->mpte_mptcb == mp_tp);
4631
4632 socket_lock_assert_owned(so: mp_so);
4633
4634 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4635 uint32_t, 0 /* event */);
4636
4637 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0) {
4638 errno = mp_tp->mpt_softerror;
4639 }
4640 mp_so->so_error = errno;
4641
4642 return mptcp_close(mpte, mp_tp);
4643}
4644
4645/*
4646 * Close a MPTCP control block.
4647 */
4648struct mptses *
4649mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4650{
4651 struct mptsub *mpts = NULL, *tmpts = NULL;
4652 struct socket *mp_so = mptetoso(mpte);
4653
4654 socket_lock_assert_owned(so: mp_so);
4655 VERIFY(mpte->mpte_mptcb == mp_tp);
4656
4657 mp_tp->mpt_state = MPTCPS_TERMINATE;
4658
4659 mptcp_freeq(mp_tp);
4660
4661 soisdisconnected(so: mp_so);
4662
4663 /* Clean up all subflows */
4664 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4665 mptcp_subflow_disconnect(mpte, mpts);
4666 }
4667
4668 return NULL;
4669}
4670
4671void
4672mptcp_notify_close(struct socket *so)
4673{
4674 soevent(so, hint: (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4675}
4676
4677typedef struct mptcp_subflow_event_entry {
4678 uint32_t sofilt_hint_mask;
4679 ev_ret_t (*sofilt_hint_ev_hdlr)(
4680 struct mptses *mpte,
4681 struct mptsub *mpts,
4682 uint32_t *p_mpsofilt_hint,
4683 uint32_t event);
4684} mptsub_ev_entry_t;
4685
4686/*
4687 * XXX The order of the event handlers below is really
4688 * really important. Think twice before changing it.
4689 */
4690static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
4691 {
4692 .sofilt_hint_mask = SO_FILT_HINT_MP_SUB_ERROR,
4693 .sofilt_hint_ev_hdlr = mptcp_subflow_mpsuberror_ev,
4694 },
4695 {
4696 .sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
4697 .sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
4698 },
4699 {
4700 .sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
4701 .sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
4702 },
4703 {
4704 .sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
4705 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
4706 },
4707 {
4708 .sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
4709 .sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
4710 },
4711 {
4712 .sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
4713 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
4714 },
4715 {
4716 .sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
4717 .sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
4718 },
4719 {
4720 .sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
4721 .sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
4722 },
4723 {
4724 .sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
4725 .sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
4726 },
4727 {
4728 .sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
4729 .sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
4730 },
4731 {
4732 .sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
4733 .sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
4734 },
4735 {
4736 .sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
4737 .sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
4738 },
4739 {
4740 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
4741 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
4742 },
4743 {
4744 .sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
4745 .sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
4746 },
4747};
4748
4749/*
4750 * Subflow socket control events.
4751 *
4752 * Called for handling events related to the underlying subflow socket.
4753 */
4754static ev_ret_t
4755mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
4756 uint32_t *p_mpsofilt_hint)
4757{
4758 ev_ret_t ret = MPTS_EVRET_OK;
4759 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
4760 sizeof(mpsub_ev_entry_tbl[0]);
4761
4762 /* bail if there's nothing to process */
4763 if (!mpts->mpts_evctl) {
4764 return ret;
4765 }
4766
4767 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET | SO_FILT_HINT_MUSTRST |
4768 SO_FILT_HINT_CANTSENDMORE | SO_FILT_HINT_TIMEOUT |
4769 SO_FILT_HINT_NOSRCADDR | SO_FILT_HINT_IFDENIED |
4770 SO_FILT_HINT_DISCONNECTED)) {
4771 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
4772 }
4773
4774 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
4775 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
4776
4777 /*
4778 * Process all the socket filter hints and reset the hint
4779 * once it is handled
4780 */
4781 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
4782 /*
4783 * Always execute the DISCONNECTED event, because it will wakeup
4784 * the app.
4785 */
4786 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
4787 (ret >= MPTS_EVRET_OK ||
4788 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
4789 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
4790 ev_ret_t error =
4791 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
4792 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
4793 }
4794 }
4795
4796 return ret;
4797}
4798
4799/*
4800 * MPTCP workloop.
4801 */
4802void
4803mptcp_subflow_workloop(struct mptses *mpte)
4804{
4805 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4806 uint32_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4807 struct mptsub *mpts, *tmpts;
4808 struct socket *mp_so;
4809
4810 mp_so = mptetoso(mpte);
4811
4812 socket_lock_assert_owned(so: mp_so);
4813
4814 if (mpte->mpte_flags & MPTE_IN_WORKLOOP) {
4815 mpte->mpte_flags |= MPTE_WORKLOOP_RELAUNCH;
4816 return;
4817 }
4818 mpte->mpte_flags |= MPTE_IN_WORKLOOP;
4819
4820relaunch:
4821 mpte->mpte_flags &= ~MPTE_WORKLOOP_RELAUNCH;
4822
4823 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4824 ev_ret_t ret;
4825
4826 if (mpts->mpts_socket->so_usecount == 0) {
4827 /* Will be removed soon by tcp_garbage_collect */
4828 continue;
4829 }
4830
4831 mptcp_subflow_addref(mpts);
4832 mpts->mpts_socket->so_usecount++;
4833
4834 ret = mptcp_subflow_events(mpte, mpts, p_mpsofilt_hint: &mpsofilt_hint_mask);
4835
4836 /*
4837 * If MPTCP socket is closed, disconnect all subflows.
4838 * This will generate a disconnect event which will
4839 * be handled during the next iteration, causing a
4840 * non-zero error to be returned above.
4841 */
4842 if (mp_so->so_flags & SOF_PCBCLEARING) {
4843 mptcp_subflow_disconnect(mpte, mpts);
4844 }
4845
4846 switch (ret) {
4847 case MPTS_EVRET_OK:
4848 /* nothing to do */
4849 break;
4850 case MPTS_EVRET_DELETE:
4851 mptcp_subflow_soclose(mpts);
4852 break;
4853 case MPTS_EVRET_CONNECT_PENDING:
4854 connect_pending = TRUE;
4855 break;
4856 case MPTS_EVRET_DISCONNECT_FALLBACK:
4857 disconnect_fallback = TRUE;
4858 break;
4859 default:
4860 break;
4861 }
4862 mptcp_subflow_remref(mpts); /* ours */
4863
4864 VERIFY(mpts->mpts_socket->so_usecount != 0);
4865 mpts->mpts_socket->so_usecount--;
4866 }
4867
4868 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4869 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4870
4871 if (mpsofilt_hint_mask & SO_FILT_HINT_CANTRCVMORE) {
4872 mp_so->so_state |= SS_CANTRCVMORE;
4873 sorwakeup(so: mp_so);
4874 }
4875
4876 soevent(so: mp_so, hint: mpsofilt_hint_mask);
4877 }
4878
4879 if (!connect_pending && !disconnect_fallback) {
4880 goto exit;
4881 }
4882
4883 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4884 if (disconnect_fallback) {
4885 struct socket *so = NULL;
4886 struct inpcb *inp = NULL;
4887 struct tcpcb *tp = NULL;
4888
4889 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
4890 continue;
4891 }
4892
4893 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4894
4895 if (mpts->mpts_flags & (MPTSF_DISCONNECTING |
4896 MPTSF_DISCONNECTED)) {
4897 continue;
4898 }
4899
4900 so = mpts->mpts_socket;
4901
4902 /*
4903 * The MPTCP connection has degraded to a fallback
4904 * mode, so there is no point in keeping this subflow
4905 * regardless of its MPTCP-readiness state, unless it
4906 * is the primary one which we use for fallback. This
4907 * assumes that the subflow used for fallback is the
4908 * ACTIVE one.
4909 */
4910
4911 inp = sotoinpcb(so);
4912 tp = intotcpcb(inp);
4913 tp->t_mpflags &=
4914 ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
4915 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4916
4917 soevent(so, SO_FILT_HINT_MUSTRST);
4918 } else if (connect_pending) {
4919 /*
4920 * The MPTCP connection has progressed to a state
4921 * where it supports full multipath semantics; allow
4922 * additional joins to be attempted for all subflows
4923 * that are in the PENDING state.
4924 */
4925 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4926 int error = mptcp_subflow_soconnectx(mpte, mpts);
4927
4928 if (error) {
4929 mptcp_subflow_abort(mpts, error);
4930 }
4931 }
4932 }
4933 }
4934
4935exit:
4936 if (mpte->mpte_flags & MPTE_WORKLOOP_RELAUNCH) {
4937 goto relaunch;
4938 }
4939
4940 mpte->mpte_flags &= ~MPTE_IN_WORKLOOP;
4941}
4942
4943/*
4944 * Protocol pr_lock callback.
4945 */
4946int
4947mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4948{
4949 struct mppcb *mpp = mpsotomppcb(mp_so);
4950 void *lr_saved;
4951
4952 if (lr == NULL) {
4953 lr_saved = __builtin_return_address(0);
4954 } else {
4955 lr_saved = lr;
4956 }
4957
4958 if (mpp == NULL) {
4959 panic("%s: so=%p NO PCB! lr=%p lrh= %s", __func__,
4960 mp_so, lr_saved, solockhistory_nr(mp_so));
4961 /* NOTREACHED */
4962 }
4963 mpp_lock(mp: mpp);
4964
4965 if (mp_so->so_usecount < 0) {
4966 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s", __func__,
4967 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4968 solockhistory_nr(mp_so));
4969 /* NOTREACHED */
4970 }
4971 if (refcount != 0) {
4972 mp_so->so_usecount++;
4973 mpp->mpp_inside++;
4974 }
4975 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4976 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4977
4978 return 0;
4979}
4980
4981/*
4982 * Protocol pr_unlock callback.
4983 */
4984int
4985mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4986{
4987 struct mppcb *mpp = mpsotomppcb(mp_so);
4988 void *lr_saved;
4989
4990 if (lr == NULL) {
4991 lr_saved = __builtin_return_address(0);
4992 } else {
4993 lr_saved = lr;
4994 }
4995
4996 if (mpp == NULL) {
4997 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s", __func__,
4998 mp_so, mp_so->so_usecount, lr_saved,
4999 solockhistory_nr(mp_so));
5000 /* NOTREACHED */
5001 }
5002 socket_lock_assert_owned(so: mp_so);
5003
5004 if (refcount != 0) {
5005 mp_so->so_usecount--;
5006 mpp->mpp_inside--;
5007 }
5008
5009 if (mp_so->so_usecount < 0) {
5010 panic("%s: so=%p usecount=%x lrh= %s", __func__,
5011 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5012 /* NOTREACHED */
5013 }
5014 if (mpp->mpp_inside < 0) {
5015 panic("%s: mpp=%p inside=%x lrh= %s", __func__,
5016 mpp, mpp->mpp_inside, solockhistory_nr(mp_so));
5017 /* NOTREACHED */
5018 }
5019 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
5020 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
5021 mpp_unlock(mp: mpp);
5022
5023 return 0;
5024}
5025
5026/*
5027 * Protocol pr_getlock callback.
5028 */
5029lck_mtx_t *
5030mptcp_getlock(struct socket *mp_so, int flags)
5031{
5032 struct mppcb *mpp = mpsotomppcb(mp_so);
5033
5034 if (mpp == NULL) {
5035 panic("%s: so=%p NULL so_pcb %s", __func__, mp_so,
5036 solockhistory_nr(mp_so));
5037 /* NOTREACHED */
5038 }
5039 if (mp_so->so_usecount < 0) {
5040 panic("%s: so=%p usecount=%x lrh= %s", __func__,
5041 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
5042 /* NOTREACHED */
5043 }
5044 return mpp_getlock(mp: mpp, flags);
5045}
5046
5047void
5048mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
5049 u_int32_t *rrand)
5050{
5051 struct mptcp_subf_auth_entry *sauth_entry;
5052
5053 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5054 if (sauth_entry->msae_laddr_id == addr_id) {
5055 if (lrand) {
5056 *lrand = sauth_entry->msae_laddr_rand;
5057 }
5058 if (rrand) {
5059 *rrand = sauth_entry->msae_raddr_rand;
5060 }
5061 break;
5062 }
5063 }
5064}
5065
5066void
5067mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
5068 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
5069{
5070 struct mptcp_subf_auth_entry *sauth_entry;
5071
5072 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
5073 if (sauth_entry->msae_laddr_id == laddr_id) {
5074 if ((sauth_entry->msae_raddr_id != 0) &&
5075 (sauth_entry->msae_raddr_id != raddr_id)) {
5076 os_log_error(mptcp_log_handle, "%s - %lx: mismatched"
5077 " address ids %d %d \n", __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5078 raddr_id, sauth_entry->msae_raddr_id);
5079 return;
5080 }
5081 sauth_entry->msae_raddr_id = raddr_id;
5082 if ((sauth_entry->msae_raddr_rand != 0) &&
5083 (sauth_entry->msae_raddr_rand != raddr_rand)) {
5084 os_log_error(mptcp_log_handle, "%s - %lx: "
5085 "dup SYN_ACK %d %d \n",
5086 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mp_tp->mpt_mpte),
5087 raddr_rand, sauth_entry->msae_raddr_rand);
5088 return;
5089 }
5090 sauth_entry->msae_raddr_rand = raddr_rand;
5091 return;
5092 }
5093 }
5094}
5095
5096/*
5097 * SHA-256 support for MPTCP
5098 */
5099
5100static void
5101mptcp_do_sha256(mptcp_key_t *key, char *sha_digest)
5102{
5103 const unsigned char *sha2_base;
5104 int sha2_size;
5105
5106 sha2_base = (const unsigned char *) key;
5107 sha2_size = sizeof(mptcp_key_t);
5108
5109 SHA256_CTX sha_ctx;
5110 SHA256_Init(ctx: &sha_ctx);
5111 SHA256_Update(ctx: &sha_ctx, data: sha2_base, len: sha2_size);
5112 SHA256_Final(digest: sha_digest, ctx: &sha_ctx);
5113}
5114
5115void
5116mptcp_hmac_sha256(mptcp_key_t key1, mptcp_key_t key2,
5117 u_char *msg, uint16_t msg_len, u_char *digest)
5118{
5119 SHA256_CTX sha_ctx;
5120 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5121 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5122 int i;
5123
5124 bzero(s: digest, SHA256_DIGEST_LENGTH);
5125
5126 /* Set up the Key for HMAC */
5127 key_ipad[0] = key1;
5128 key_ipad[1] = key2;
5129
5130 key_opad[0] = key1;
5131 key_opad[1] = key2;
5132
5133 /* Key is 512 block length, so no need to compute hash */
5134
5135 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5136
5137 for (i = 0; i < 8; i++) {
5138 key_ipad[i] ^= 0x3636363636363636;
5139 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5140 }
5141
5142 /* Perform inner SHA256 */
5143 SHA256_Init(ctx: &sha_ctx);
5144 SHA256_Update(ctx: &sha_ctx, data: (unsigned char *)key_ipad, len: sizeof(key_ipad));
5145 SHA256_Update(ctx: &sha_ctx, data: msg, len: msg_len);
5146 SHA256_Final(digest, ctx: &sha_ctx);
5147
5148 /* Perform outer SHA256 */
5149 SHA256_Init(ctx: &sha_ctx);
5150 SHA256_Update(ctx: &sha_ctx, data: (unsigned char *)key_opad, len: sizeof(key_opad));
5151 SHA256_Update(ctx: &sha_ctx, data: (unsigned char *)digest, SHA256_DIGEST_LENGTH);
5152 SHA256_Final(digest, ctx: &sha_ctx);
5153}
5154
5155/*
5156 * SHA1 support for MPTCP
5157 */
5158
5159static void
5160mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
5161{
5162 SHA1_CTX sha1ctxt;
5163 const unsigned char *sha1_base;
5164 int sha1_size;
5165
5166 sha1_base = (const unsigned char *) key;
5167 sha1_size = sizeof(mptcp_key_t);
5168 SHA1Init(&sha1ctxt);
5169 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
5170 SHA1Final(sha_digest, &sha1ctxt);
5171}
5172
5173void
5174mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
5175 u_int32_t rand1, u_int32_t rand2, u_char *digest)
5176{
5177 SHA1_CTX sha1ctxt;
5178 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
5179 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
5180 u_int32_t data[2];
5181 int i;
5182
5183 bzero(s: digest, SHA1_RESULTLEN);
5184
5185 /* Set up the Key for HMAC */
5186 key_ipad[0] = key1;
5187 key_ipad[1] = key2;
5188
5189 key_opad[0] = key1;
5190 key_opad[1] = key2;
5191
5192 /* Set up the message for HMAC */
5193 data[0] = rand1;
5194 data[1] = rand2;
5195
5196 /* Key is 512 block length, so no need to compute hash */
5197
5198 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
5199
5200 for (i = 0; i < 8; i++) {
5201 key_ipad[i] ^= 0x3636363636363636;
5202 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
5203 }
5204
5205 /* Perform inner SHA1 */
5206 SHA1Init(&sha1ctxt);
5207 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof(key_ipad));
5208 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof(data));
5209 SHA1Final(digest, &sha1ctxt);
5210
5211 /* Perform outer SHA1 */
5212 SHA1Init(&sha1ctxt);
5213 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof(key_opad));
5214 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
5215 SHA1Final(digest, &sha1ctxt);
5216}
5217
5218/*
5219 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
5220 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
5221 */
5222void
5223mptcp_get_mpjoin_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest, uint8_t digest_len)
5224{
5225 uint32_t lrand, rrand;
5226
5227 lrand = rrand = 0;
5228 mptcp_get_rands(addr_id: aid, mp_tp, lrand: &lrand, rrand: &rrand);
5229
5230 u_char full_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)] = {0};
5231 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5232 mptcp_hmac_sha1(key1: mp_tp->mpt_localkey, key2: mp_tp->mpt_remotekey, rand1: lrand, rand2: rrand, digest: full_digest);
5233 } else {
5234 uint32_t data[2];
5235 data[0] = lrand;
5236 data[1] = rrand;
5237 mptcp_hmac_sha256(key1: mp_tp->mpt_localkey, key2: mp_tp->mpt_remotekey, msg: (u_char*)data, msg_len: 8, digest: full_digest);
5238 }
5239 bcopy(src: full_digest, dst: digest, n: digest_len);
5240}
5241
5242/*
5243 * Authentication data generation
5244 */
5245static void
5246mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
5247 int token_len)
5248{
5249 VERIFY(token_len == sizeof(u_int32_t));
5250 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5251 sha_digest_len == SHA256_DIGEST_LENGTH);
5252
5253 /* Most significant 32 bits of the SHA1/SHA256 hash */
5254 bcopy(src: sha_digest, dst: token, n: sizeof(u_int32_t));
5255 return;
5256}
5257
5258static void
5259mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
5260 int idsn_len, uint8_t mp_version)
5261{
5262 VERIFY(idsn_len == sizeof(u_int64_t));
5263 VERIFY(sha_digest_len == SHA1_RESULTLEN ||
5264 sha_digest_len == SHA256_DIGEST_LENGTH);
5265 VERIFY(mp_version == MPTCP_VERSION_0 || mp_version == MPTCP_VERSION_1);
5266
5267 /*
5268 * Least significant 64 bits of the hash
5269 */
5270
5271 if (mp_version == MPTCP_VERSION_0) {
5272 idsn[7] = sha_digest[12];
5273 idsn[6] = sha_digest[13];
5274 idsn[5] = sha_digest[14];
5275 idsn[4] = sha_digest[15];
5276 idsn[3] = sha_digest[16];
5277 idsn[2] = sha_digest[17];
5278 idsn[1] = sha_digest[18];
5279 idsn[0] = sha_digest[19];
5280 } else {
5281 idsn[7] = sha_digest[24];
5282 idsn[6] = sha_digest[25];
5283 idsn[5] = sha_digest[26];
5284 idsn[4] = sha_digest[27];
5285 idsn[3] = sha_digest[28];
5286 idsn[2] = sha_digest[29];
5287 idsn[1] = sha_digest[30];
5288 idsn[0] = sha_digest[31];
5289 }
5290 return;
5291}
5292
5293static void
5294mptcp_conn_properties(struct mptcb *mp_tp)
5295{
5296 /* Set DSS checksum flag */
5297 if (mptcp_dss_csum) {
5298 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
5299 }
5300
5301 /* Set up receive window */
5302 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
5303
5304 /* Set up gc ticks */
5305 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
5306}
5307
5308static void
5309mptcp_init_local_parms(struct mptses *mpte, struct sockaddr* dst)
5310{
5311 struct mptcb *mp_tp = mpte->mpte_mptcb;
5312 char key_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5313 uint16_t digest_len;
5314
5315 if (mpte->mpte_flags & MPTE_FORCE_V0 || !mptcp_enable_v1) {
5316 mp_tp->mpt_version = MPTCP_VERSION_0;
5317 } else if (mpte->mpte_flags & MPTE_FORCE_V1 && mptcp_enable_v1) {
5318 mp_tp->mpt_version = MPTCP_VERSION_1;
5319 } else {
5320 mp_tp->mpt_version = tcp_cache_get_mptcp_version(dst);
5321 }
5322 VERIFY(mp_tp->mpt_version == MPTCP_VERSION_0 ||
5323 mp_tp->mpt_version == MPTCP_VERSION_1);
5324
5325 read_frandom(buffer: &mp_tp->mpt_localkey, numBytes: sizeof(mp_tp->mpt_localkey));
5326 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5327 digest_len = SHA1_RESULTLEN;
5328 mptcp_do_sha1(key: &mp_tp->mpt_localkey, sha_digest: key_digest);
5329 } else {
5330 digest_len = SHA256_DIGEST_LENGTH;
5331 mptcp_do_sha256(key: &mp_tp->mpt_localkey, sha_digest: key_digest);
5332 }
5333
5334 mptcp_generate_token(sha_digest: key_digest, sha_digest_len: digest_len,
5335 token: (caddr_t)&mp_tp->mpt_localtoken, token_len: sizeof(mp_tp->mpt_localtoken));
5336 mptcp_generate_idsn(sha_digest: key_digest, sha_digest_len: digest_len,
5337 idsn: (caddr_t)&mp_tp->mpt_local_idsn, idsn_len: sizeof(u_int64_t), mp_version: mp_tp->mpt_version);
5338 /* The subflow SYN is also first MPTCP byte */
5339 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
5340 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
5341
5342 mptcp_conn_properties(mp_tp);
5343}
5344
5345int
5346mptcp_init_remote_parms(struct mptcb *mp_tp)
5347{
5348 /* Setup local and remote tokens and Initial DSNs */
5349 char remote_digest[MAX(SHA1_RESULTLEN, SHA256_DIGEST_LENGTH)];
5350 uint16_t digest_len;
5351
5352 if (mp_tp->mpt_version == MPTCP_VERSION_0) {
5353 digest_len = SHA1_RESULTLEN;
5354 mptcp_do_sha1(key: &mp_tp->mpt_remotekey, sha_digest: remote_digest);
5355 } else if (mp_tp->mpt_version == MPTCP_VERSION_1) {
5356 digest_len = SHA256_DIGEST_LENGTH;
5357 mptcp_do_sha256(key: &mp_tp->mpt_remotekey, sha_digest: remote_digest);
5358 } else {
5359 return -1;
5360 }
5361
5362 mptcp_generate_token(sha_digest: remote_digest, sha_digest_len: digest_len,
5363 token: (caddr_t)&mp_tp->mpt_remotetoken, token_len: sizeof(mp_tp->mpt_remotetoken));
5364 mptcp_generate_idsn(sha_digest: remote_digest, sha_digest_len: digest_len,
5365 idsn: (caddr_t)&mp_tp->mpt_remote_idsn, idsn_len: sizeof(u_int64_t), mp_version: mp_tp->mpt_version);
5366 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
5367 mp_tp->mpt_rcvadv = mp_tp->mpt_rcvnxt + mp_tp->mpt_rcvwnd;
5368 return 0;
5369}
5370
5371static void
5372mptcp_send_dfin(struct socket *so)
5373{
5374 struct tcpcb *tp = NULL;
5375 struct inpcb *inp = NULL;
5376
5377 inp = sotoinpcb(so);
5378 if (!inp) {
5379 return;
5380 }
5381
5382 tp = intotcpcb(inp);
5383 if (!tp) {
5384 return;
5385 }
5386
5387 if (!(tp->t_mpflags & TMPF_RESET)) {
5388 tp->t_mpflags |= TMPF_SEND_DFIN;
5389 }
5390}
5391
5392/*
5393 * Data Sequence Mapping routines
5394 */
5395void
5396mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
5397{
5398 struct mptcb *mp_tp;
5399
5400 if (m == NULL) {
5401 return;
5402 }
5403
5404 mp_tp = &__container_of(mpp, struct mpp_mtp, mpp)->mtcb;
5405
5406 while (m) {
5407 VERIFY(m->m_flags & M_PKTHDR);
5408 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
5409 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
5410 VERIFY(m_pktlen(m) >= 0 && m_pktlen(m) < UINT16_MAX);
5411 m->m_pkthdr.mp_rlen = (uint16_t)m_pktlen(m);
5412 mp_tp->mpt_sndmax += m_pktlen(m);
5413 m = m->m_next;
5414 }
5415}
5416
5417void
5418mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
5419{
5420 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
5421 uint64_t data_ack;
5422 uint64_t dsn;
5423
5424 VERIFY(len >= 0);
5425
5426 if (!m || len == 0) {
5427 return;
5428 }
5429
5430 while (m && len > 0) {
5431 VERIFY(m->m_flags & M_PKTHDR);
5432 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5433
5434 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
5435 dsn = m->m_pkthdr.mp_dsn;
5436
5437 len -= m->m_len;
5438 m = m->m_next;
5439 }
5440
5441 if (m && len == 0) {
5442 /*
5443 * If there is one more mbuf in the chain, it automatically means
5444 * that up to m->mp_dsn has been ack'ed.
5445 *
5446 * This means, we actually correct data_ack back down (compared
5447 * to what we set inside the loop - dsn + data_len). Because in
5448 * the loop we are "optimistic" and assume that the full mapping
5449 * will be acked. If that's not the case and we get out of the
5450 * loop with m != NULL, it means only up to m->mp_dsn has been
5451 * really acked.
5452 */
5453 data_ack = m->m_pkthdr.mp_dsn;
5454 }
5455
5456 if (len < 0) {
5457 /*
5458 * If len is negative, meaning we acked in the middle of an mbuf,
5459 * only up to this mbuf's data-sequence number has been acked
5460 * at the MPTCP-level.
5461 */
5462 data_ack = dsn;
5463 }
5464
5465 /* We can have data in the subflow's send-queue that is being acked,
5466 * while the DATA_ACK has already advanced. Thus, we should check whether
5467 * or not the DATA_ACK is actually new here.
5468 */
5469 if (MPTCP_SEQ_LEQ(data_ack, mp_tp->mpt_sndmax) &&
5470 MPTCP_SEQ_GEQ(data_ack, mp_tp->mpt_snduna)) {
5471 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), full_dack: data_ack);
5472 }
5473}
5474
5475void
5476mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
5477{
5478 int rewinding = 0;
5479
5480 /* TFO makes things complicated. */
5481 if (so->so_flags1 & SOF1_TFO_REWIND) {
5482 rewinding = 1;
5483 so->so_flags1 &= ~SOF1_TFO_REWIND;
5484 }
5485
5486 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
5487 u_int32_t sub_len;
5488 VERIFY(m->m_flags & M_PKTHDR);
5489 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5490
5491 sub_len = m->m_pkthdr.mp_rlen;
5492
5493 if (sub_len < len) {
5494 m->m_pkthdr.mp_dsn += sub_len;
5495 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5496 m->m_pkthdr.mp_rseq += sub_len;
5497 }
5498 m->m_pkthdr.mp_rlen = 0;
5499 len -= sub_len;
5500 } else {
5501 /* sub_len >= len */
5502 if (rewinding == 0) {
5503 m->m_pkthdr.mp_dsn += len;
5504 }
5505 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
5506 if (rewinding == 0) {
5507 m->m_pkthdr.mp_rseq += len;
5508 }
5509 }
5510 m->m_pkthdr.mp_rlen -= len;
5511 break;
5512 }
5513 m = m->m_next;
5514 }
5515
5516 if (so->so_flags & SOF_MP_SUBFLOW &&
5517 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
5518 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
5519 /*
5520 * Received an ack without receiving a DATA_ACK.
5521 * Need to fallback to regular TCP (or destroy this subflow).
5522 */
5523 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
5524 mptcp_notify_mpfail(so);
5525 }
5526}
5527
5528/* Obtain the DSN mapping stored in the mbuf */
5529void
5530mptcp_output_getm_dsnmap32(struct socket *so, int off,
5531 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
5532{
5533 u_int64_t dsn64;
5534
5535 mptcp_output_getm_dsnmap64(so, off, dsn: &dsn64, relseq, data_len, dss_csum);
5536 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
5537}
5538
5539void
5540mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
5541 uint32_t *relseq, uint16_t *data_len,
5542 uint16_t *dss_csum)
5543{
5544 struct mbuf *m = so->so_snd.sb_mb;
5545
5546 VERIFY(off >= 0);
5547
5548 if (m == NULL && (so->so_flags & SOF_DEFUNCT)) {
5549 *dsn = 0;
5550 *relseq = 0;
5551 *data_len = 0;
5552 *dss_csum = 0;
5553 return;
5554 }
5555
5556 /*
5557 * In the subflow socket, the DSN sequencing can be discontiguous,
5558 * but the subflow sequence mapping is contiguous. Use the subflow
5559 * sequence property to find the right mbuf and corresponding dsn
5560 * mapping.
5561 */
5562
5563 while (m) {
5564 VERIFY(m->m_flags & M_PKTHDR);
5565 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5566
5567 if (off >= m->m_len) {
5568 off -= m->m_len;
5569 m = m->m_next;
5570 } else {
5571 break;
5572 }
5573 }
5574
5575 VERIFY(off >= 0);
5576 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
5577
5578 *dsn = m->m_pkthdr.mp_dsn;
5579 *relseq = m->m_pkthdr.mp_rseq;
5580 *data_len = m->m_pkthdr.mp_rlen;
5581 *dss_csum = m->m_pkthdr.mp_csum;
5582}
5583
5584void
5585mptcp_output_getm_data_level_details(struct socket *so, int off, uint16_t *data_len, uint16_t *dss_csum)
5586{
5587 uint64_t dsn;
5588 uint32_t relseq;
5589
5590 mptcp_output_getm_dsnmap64(so, off, dsn: &dsn, relseq: &relseq, data_len, dss_csum);
5591}
5592
5593/*
5594 * Note that this is called only from tcp_input() via mptcp_input_preproc()
5595 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
5596 * When it trims data tcp_input calls m_adj() which does not remove the
5597 * m_pkthdr even if the m_len becomes 0 as a result of trimming the mbuf.
5598 * The dsn map insertion cannot be delayed after trim, because data can be in
5599 * the reassembly queue for a while and the DSN option info in tp will be
5600 * overwritten for every new packet received.
5601 * The dsn map will be adjusted just prior to appending to subflow sockbuf
5602 * with mptcp_adj_rmap()
5603 */
5604void
5605mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
5606{
5607 VERIFY(m->m_flags & M_PKTHDR);
5608 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5609
5610 if (tp->t_mpflags & TMPF_EMBED_DSN) {
5611 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5612 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5613 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5614 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5615 if (tp->t_rcv_map.mpt_dfin) {
5616 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5617 }
5618
5619 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5620
5621 tp->t_mpflags &= ~TMPF_EMBED_DSN;
5622 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5623 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5624 if (th->th_flags & TH_FIN) {
5625 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5626 }
5627 }
5628}
5629
5630/*
5631 * Following routines help with failure detection and failover of data
5632 * transfer from one subflow to another.
5633 */
5634void
5635mptcp_act_on_txfail(struct socket *so)
5636{
5637 struct tcpcb *tp = NULL;
5638 struct inpcb *inp = sotoinpcb(so);
5639
5640 if (inp == NULL) {
5641 return;
5642 }
5643
5644 tp = intotcpcb(inp);
5645 if (tp == NULL) {
5646 return;
5647 }
5648
5649 if (so->so_flags & SOF_MP_TRYFAILOVER) {
5650 return;
5651 }
5652
5653 so->so_flags |= SOF_MP_TRYFAILOVER;
5654 soevent(so, hint: (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5655}
5656
5657/*
5658 * Support for MP_FAIL option
5659 */
5660int
5661mptcp_get_map_for_dsn(struct socket *so, uint64_t dsn_fail, uint32_t *tcp_seq)
5662{
5663 struct mbuf *m = so->so_snd.sb_mb;
5664 uint16_t datalen;
5665 uint64_t dsn;
5666 int off = 0;
5667
5668 if (m == NULL) {
5669 return -1;
5670 }
5671
5672 while (m != NULL) {
5673 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5674 VERIFY(m->m_flags & M_PKTHDR);
5675 dsn = m->m_pkthdr.mp_dsn;
5676 datalen = m->m_pkthdr.mp_rlen;
5677 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5678 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5679 off = (int)(dsn_fail - dsn);
5680 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5681 return 0;
5682 }
5683
5684 m = m->m_next;
5685 }
5686
5687 /*
5688 * If there was no mbuf data and a fallback to TCP occurred, there's
5689 * not much else to do.
5690 */
5691
5692 os_log_error(mptcp_log_handle, "%s: %llu not found \n", __func__, dsn_fail);
5693 return -1;
5694}
5695
5696/*
5697 * Support for sending contiguous MPTCP bytes in subflow
5698 * Also for preventing sending data with ACK in 3-way handshake
5699 */
5700int32_t
5701mptcp_adj_sendlen(struct socket *so, int32_t off)
5702{
5703 struct tcpcb *tp = sototcpcb(so);
5704 struct mptsub *mpts = tp->t_mpsub;
5705 uint64_t mdss_dsn;
5706 uint32_t mdss_subflow_seq;
5707 int mdss_subflow_off;
5708 uint16_t mdss_data_len;
5709 uint16_t dss_csum;
5710
5711 if (so->so_snd.sb_mb == NULL && (so->so_flags & SOF_DEFUNCT)) {
5712 return 0;
5713 }
5714
5715 mptcp_output_getm_dsnmap64(so, off, dsn: &mdss_dsn, relseq: &mdss_subflow_seq,
5716 data_len: &mdss_data_len, dss_csum: &dss_csum);
5717
5718 /*
5719 * We need to compute how much of the mapping still remains.
5720 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5721 */
5722 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5723
5724 /*
5725 * When TFO is used, we are sending the mpts->mpts_iss although the relative
5726 * seq has been set to 1 (while it should be 0).
5727 */
5728 if (tp->t_mpflags & TMPF_TFO_REQUEST) {
5729 mdss_subflow_off--;
5730 }
5731
5732 VERIFY(off >= mdss_subflow_off);
5733
5734 return mdss_data_len - (off - mdss_subflow_off);
5735}
5736
5737static uint32_t
5738mptcp_get_maxseg(struct mptses *mpte)
5739{
5740 struct mptsub *mpts;
5741 uint32_t maxseg = 0;
5742
5743 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5744 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5745
5746 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5747 TCPS_HAVERCVDFIN2(tp->t_state)) {
5748 continue;
5749 }
5750
5751 if (tp->t_maxseg > maxseg) {
5752 maxseg = tp->t_maxseg;
5753 }
5754 }
5755
5756 return maxseg;
5757}
5758
5759static uint8_t
5760mptcp_get_rcvscale(struct mptses *mpte)
5761{
5762 struct mptsub *mpts;
5763 uint8_t rcvscale = UINT8_MAX;
5764
5765 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5766 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5767
5768 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5769 TCPS_HAVERCVDFIN2(tp->t_state)) {
5770 continue;
5771 }
5772
5773 if (tp->rcv_scale < rcvscale) {
5774 rcvscale = tp->rcv_scale;
5775 }
5776 }
5777
5778 return rcvscale;
5779}
5780
5781/* Similar to tcp_sbrcv_reserve */
5782static void
5783mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5784 u_int32_t newsize, u_int32_t idealsize)
5785{
5786 uint8_t rcvscale = mptcp_get_rcvscale(mpte: mp_tp->mpt_mpte);
5787
5788 if (rcvscale == UINT8_MAX) {
5789 return;
5790 }
5791
5792 /* newsize should not exceed max */
5793 newsize = min(a: newsize, b: tcp_autorcvbuf_max);
5794
5795 /* The receive window scale negotiated at the
5796 * beginning of the connection will also set a
5797 * limit on the socket buffer size
5798 */
5799 newsize = min(a: newsize, TCP_MAXWIN << rcvscale);
5800
5801 /* Set new socket buffer size */
5802 if (newsize > sbrcv->sb_hiwat &&
5803 (sbreserve(sb: sbrcv, cc: newsize) == 1)) {
5804 sbrcv->sb_idealsize = min(a: max(a: sbrcv->sb_idealsize,
5805 b: (idealsize != 0) ? idealsize : newsize), b: tcp_autorcvbuf_max);
5806
5807 /* Again check the limit set by the advertised
5808 * window scale
5809 */
5810 sbrcv->sb_idealsize = min(a: sbrcv->sb_idealsize,
5811 TCP_MAXWIN << rcvscale);
5812 }
5813}
5814
5815void
5816mptcp_sbrcv_grow(struct mptcb *mp_tp)
5817{
5818 struct mptses *mpte = mp_tp->mpt_mpte;
5819 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5820 struct sockbuf *sbrcv = &mp_so->so_rcv;
5821 uint32_t hiwat_sum = 0;
5822 uint32_t ideal_sum = 0;
5823 struct mptsub *mpts;
5824
5825 /*
5826 * Do not grow the receive socket buffer if
5827 * - auto resizing is disabled, globally or on this socket
5828 * - the high water mark already reached the maximum
5829 * - the stream is in background and receive side is being
5830 * throttled
5831 * - if there are segments in reassembly queue indicating loss,
5832 * do not need to increase recv window during recovery as more
5833 * data is not going to be sent. A duplicate ack sent during
5834 * recovery should not change the receive window
5835 */
5836 if (tcp_do_autorcvbuf == 0 ||
5837 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5838 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5839 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5840 !LIST_EMPTY(&mp_tp->mpt_segq)) {
5841 /* Can not resize the socket buffer, just return */
5842 return;
5843 }
5844
5845 /*
5846 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5847 *
5848 * But, for this we first need accurate receiver-RTT estimations, which
5849 * we currently don't have.
5850 *
5851 * Let's use a dummy algorithm for now, just taking the sum of all
5852 * subflow's receive-buffers. It's too low, but that's all we can get
5853 * for now.
5854 */
5855
5856 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5857 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5858 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5859 }
5860
5861 mptcp_sbrcv_reserve(mp_tp, sbrcv, newsize: hiwat_sum, idealsize: ideal_sum);
5862}
5863
5864/*
5865 * Determine if we can grow the recieve socket buffer to avoid sending
5866 * a zero window update to the peer. We allow even socket buffers that
5867 * have fixed size (set by the application) to grow if the resource
5868 * constraints are met. They will also be trimmed after the application
5869 * reads data.
5870 *
5871 * Similar to tcp_sbrcv_grow_rwin
5872 */
5873static void
5874mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5875{
5876 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5877 u_int32_t rcvbufinc = mptcp_get_maxseg(mpte: mp_tp->mpt_mpte) << 4;
5878 u_int32_t rcvbuf = sb->sb_hiwat;
5879
5880 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so)) {
5881 return;
5882 }
5883
5884 if (tcp_do_autorcvbuf == 1 &&
5885 /* Diff to tcp_sbrcv_grow_rwin */
5886 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5887 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5888 rcvbuf < tcp_autorcvbuf_max &&
5889 (sb->sb_idealsize > 0 &&
5890 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5891 sbreserve(sb, cc: min(a: (sb->sb_hiwat + rcvbufinc), b: tcp_autorcvbuf_max));
5892 }
5893}
5894
5895/* Similar to tcp_sbspace */
5896int32_t
5897mptcp_sbspace(struct mptcb *mp_tp)
5898{
5899 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5900 uint32_t rcvbuf;
5901 int32_t space;
5902 int32_t pending = 0;
5903
5904 socket_lock_assert_owned(so: mptetoso(mpte: mp_tp->mpt_mpte));
5905
5906 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5907
5908 /* hiwat might have changed */
5909 rcvbuf = sb->sb_hiwat;
5910
5911 space = ((int32_t) imin(a: (rcvbuf - sb->sb_cc),
5912 b: (sb->sb_mbmax - sb->sb_mbcnt)));
5913 if (space < 0) {
5914 space = 0;
5915 }
5916
5917#if CONTENT_FILTER
5918 /* Compensate for data being processed by content filters */
5919 pending = cfil_sock_data_space(sb);
5920#endif /* CONTENT_FILTER */
5921 if (pending > space) {
5922 space = 0;
5923 } else {
5924 space -= pending;
5925 }
5926
5927 return space;
5928}
5929
5930/*
5931 * Support Fallback to Regular TCP
5932 */
5933void
5934mptcp_notify_mpready(struct socket *so)
5935{
5936 struct tcpcb *tp = NULL;
5937
5938 if (so == NULL) {
5939 return;
5940 }
5941
5942 tp = intotcpcb(sotoinpcb(so));
5943
5944 if (tp == NULL) {
5945 return;
5946 }
5947
5948 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5949 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5950 struct tcpcb *, tp);
5951
5952 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE)) {
5953 return;
5954 }
5955
5956 if (tp->t_mpflags & TMPF_MPTCP_READY) {
5957 return;
5958 }
5959
5960 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5961 tp->t_mpflags |= TMPF_MPTCP_READY;
5962
5963 soevent(so, hint: (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5964}
5965
5966void
5967mptcp_notify_mpfail(struct socket *so)
5968{
5969 struct tcpcb *tp = NULL;
5970
5971 if (so == NULL) {
5972 return;
5973 }
5974
5975 tp = intotcpcb(sotoinpcb(so));
5976
5977 if (tp == NULL) {
5978 return;
5979 }
5980
5981 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5982 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5983 struct tcpcb *, tp);
5984
5985 if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5986 return;
5987 }
5988
5989 tp->t_mpflags &= ~(TMPF_MPTCP_READY | TMPF_MPTCP_TRUE);
5990 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5991
5992 soevent(so, hint: (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5993}
5994
5995/*
5996 * Keepalive helper function
5997 */
5998boolean_t
5999mptcp_ok_to_keepalive(struct mptcb *mp_tp)
6000{
6001 boolean_t ret = 1;
6002
6003 socket_lock_assert_owned(so: mptetoso(mpte: mp_tp->mpt_mpte));
6004
6005 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
6006 ret = 0;
6007 }
6008 return ret;
6009}
6010
6011/*
6012 * MPTCP t_maxseg adjustment function
6013 */
6014int
6015mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
6016{
6017 int mss_lower = 0;
6018 struct mptcb *mp_tp = tptomptp(tp);
6019
6020#define MPTCP_COMPUTE_LEN { \
6021 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
6022 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
6023 mss_lower += 2; \
6024 else \
6025 /* adjust to 32-bit boundary + EOL */ \
6026 mss_lower += 2; \
6027}
6028 if (mp_tp == NULL) {
6029 return 0;
6030 }
6031
6032 socket_lock_assert_owned(so: mptetoso(mpte: mp_tp->mpt_mpte));
6033
6034 /*
6035 * For the first subflow and subsequent subflows, adjust mss for
6036 * most common MPTCP option size, for case where tcp_mss is called
6037 * during option processing and MTU discovery.
6038 */
6039 if (!mtudisc) {
6040 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
6041 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
6042 MPTCP_COMPUTE_LEN;
6043 }
6044
6045 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
6046 tp->t_mpflags & TMPF_SENT_JOIN) {
6047 MPTCP_COMPUTE_LEN;
6048 }
6049 } else {
6050 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
6051 MPTCP_COMPUTE_LEN;
6052 }
6053 }
6054
6055 return mss_lower;
6056}
6057
6058static void
6059fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
6060{
6061 struct inpcb *inp;
6062
6063 tcp_getconninfo(so, &flow->flow_ci);
6064 inp = sotoinpcb(so);
6065 if ((inp->inp_vflag & INP_IPV6) != 0) {
6066 flow->flow_src.ss_family = AF_INET6;
6067 flow->flow_dst.ss_family = AF_INET6;
6068 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
6069 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
6070 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
6071 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
6072 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
6073 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
6074 } else if ((inp->inp_vflag & INP_IPV4) != 0) {
6075 flow->flow_src.ss_family = AF_INET;
6076 flow->flow_dst.ss_family = AF_INET;
6077 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
6078 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
6079 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
6080 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
6081 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
6082 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
6083 }
6084 flow->flow_len = sizeof(*flow);
6085 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
6086 flow->flow_flags = mpts->mpts_flags;
6087 flow->flow_cid = mpts->mpts_connid;
6088 flow->flow_relseq = mpts->mpts_rel_seq;
6089 flow->flow_soerror = mpts->mpts_socket->so_error;
6090 flow->flow_probecnt = mpts->mpts_probecnt;
6091}
6092
6093static int
6094mptcp_pcblist SYSCTL_HANDLER_ARGS
6095{
6096#pragma unused(oidp, arg1, arg2)
6097 int error = 0, f;
6098 size_t len;
6099 struct mppcb *mpp;
6100 struct mptses *mpte;
6101 struct mptcb *mp_tp;
6102 struct mptsub *mpts;
6103 struct socket *so;
6104 conninfo_mptcp_t mptcpci;
6105 mptcp_flow_t *flows = NULL;
6106
6107 if (req->newptr != USER_ADDR_NULL) {
6108 return EPERM;
6109 }
6110
6111 lck_mtx_lock(lck: &mtcbinfo.mppi_lock);
6112 if (req->oldptr == USER_ADDR_NULL) {
6113 size_t n = mtcbinfo.mppi_count;
6114 lck_mtx_unlock(lck: &mtcbinfo.mppi_lock);
6115 req->oldidx = (n + n / 8) * sizeof(conninfo_mptcp_t) +
6116 4 * (n + n / 8) * sizeof(mptcp_flow_t);
6117 return 0;
6118 }
6119 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6120 flows = NULL;
6121 socket_lock(so: mpp->mpp_socket, refcount: 1);
6122 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
6123 mpte = mptompte(mp: mpp);
6124
6125 socket_lock_assert_owned(so: mptetoso(mpte));
6126 mp_tp = mpte->mpte_mptcb;
6127
6128 bzero(s: &mptcpci, n: sizeof(mptcpci));
6129 mptcpci.mptcpci_state = mp_tp->mpt_state;
6130 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
6131 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
6132 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
6133 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
6134 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
6135 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
6136 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
6137 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
6138 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
6139 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
6140 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
6141 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
6142 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
6143
6144 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
6145 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
6146 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
6147 mptcpci.mptcpci_flow_offset =
6148 offsetof(conninfo_mptcp_t, mptcpci_flows);
6149
6150 len = sizeof(*flows) * mpte->mpte_numflows;
6151 if (mpte->mpte_numflows != 0) {
6152 flows = kalloc_data(len, Z_WAITOK | Z_ZERO);
6153 if (flows == NULL) {
6154 socket_unlock(so: mpp->mpp_socket, refcount: 1);
6155 break;
6156 }
6157 mptcpci.mptcpci_len = sizeof(mptcpci) +
6158 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
6159 error = SYSCTL_OUT(req, &mptcpci,
6160 sizeof(mptcpci) - sizeof(mptcp_flow_t));
6161 } else {
6162 mptcpci.mptcpci_len = sizeof(mptcpci);
6163 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
6164 }
6165 if (error) {
6166 socket_unlock(so: mpp->mpp_socket, refcount: 1);
6167 kfree_data(flows, len);
6168 break;
6169 }
6170 f = 0;
6171 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6172 so = mpts->mpts_socket;
6173 fill_mptcp_subflow(so, flow: &flows[f], mpts);
6174 f++;
6175 }
6176 socket_unlock(so: mpp->mpp_socket, refcount: 1);
6177 if (flows) {
6178 error = SYSCTL_OUT(req, flows, len);
6179 kfree_data(flows, len);
6180 if (error) {
6181 break;
6182 }
6183 }
6184 }
6185 lck_mtx_unlock(lck: &mtcbinfo.mppi_lock);
6186
6187 return error;
6188}
6189
6190SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
6191 0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
6192 "List of active MPTCP connections");
6193
6194/*
6195 * Set notsent lowat mark on the MPTCB
6196 */
6197int
6198mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
6199{
6200 struct mptcb *mp_tp = NULL;
6201 int error = 0;
6202
6203 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6204 mp_tp = mpte->mpte_mptcb;
6205 }
6206
6207 if (mp_tp) {
6208 mp_tp->mpt_notsent_lowat = optval;
6209 } else {
6210 error = EINVAL;
6211 }
6212
6213 return error;
6214}
6215
6216u_int32_t
6217mptcp_get_notsent_lowat(struct mptses *mpte)
6218{
6219 struct mptcb *mp_tp = NULL;
6220
6221 if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED) {
6222 mp_tp = mpte->mpte_mptcb;
6223 }
6224
6225 if (mp_tp) {
6226 return mp_tp->mpt_notsent_lowat;
6227 } else {
6228 return 0;
6229 }
6230}
6231
6232int
6233mptcp_notsent_lowat_check(struct socket *so)
6234{
6235 struct mptses *mpte;
6236 struct mppcb *mpp;
6237 struct mptcb *mp_tp;
6238 struct mptsub *mpts;
6239
6240 int notsent = 0;
6241
6242 mpp = mpsotomppcb(mp_so: so);
6243 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
6244 return 0;
6245 }
6246
6247 mpte = mptompte(mp: mpp);
6248 socket_lock_assert_owned(so: mptetoso(mpte));
6249 mp_tp = mpte->mpte_mptcb;
6250
6251 notsent = so->so_snd.sb_cc;
6252
6253 if ((notsent == 0) ||
6254 ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
6255 mp_tp->mpt_notsent_lowat)) {
6256 return 1;
6257 }
6258
6259 /* When Nagle's algorithm is not disabled, it is better
6260 * to wakeup the client even before there is atleast one
6261 * maxseg of data to write.
6262 */
6263 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
6264 int retval = 0;
6265 if (mpts->mpts_flags & MPTSF_ACTIVE) {
6266 struct socket *subf_so = mpts->mpts_socket;
6267 struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));
6268
6269 notsent = so->so_snd.sb_cc -
6270 (tp->snd_nxt - tp->snd_una);
6271
6272 if ((tp->t_flags & TF_NODELAY) == 0 &&
6273 notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
6274 retval = 1;
6275 }
6276 return retval;
6277 }
6278 }
6279 return 0;
6280}
6281
6282static errno_t
6283mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
6284 void **unitinfo)
6285{
6286#pragma unused(kctlref, sac, unitinfo)
6287
6288 if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0) {
6289 os_log_error(mptcp_log_handle, "%s: MPTCP kernel-control socket for Symptoms already open!", __func__);
6290 }
6291
6292 mptcp_kern_skt_unit = sac->sc_unit;
6293
6294 return 0;
6295}
6296
6297static void
6298mptcp_allow_uuid(uuid_t uuid, int32_t rssi)
6299{
6300 struct mppcb *mpp;
6301
6302 /* Iterate over all MPTCP connections */
6303
6304 lck_mtx_lock(lck: &mtcbinfo.mppi_lock);
6305
6306 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6307 struct socket *mp_so = mpp->mpp_socket;
6308 struct mptses *mpte = mpp->mpp_pcbe;
6309
6310 socket_lock(so: mp_so, refcount: 1);
6311
6312 if (mp_so->so_flags & SOF_DELEGATED &&
6313 uuid_compare(uu1: uuid, uu2: mp_so->e_uuid)) {
6314 goto next;
6315 } else if (!(mp_so->so_flags & SOF_DELEGATED) &&
6316 uuid_compare(uu1: uuid, uu2: mp_so->last_uuid)) {
6317 goto next;
6318 }
6319
6320 os_log(mptcp_log_handle, "%s - %lx: Got allowance for useApp with rssi %d\n",
6321 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), rssi);
6322
6323 mpte->mpte_flags |= MPTE_ACCESS_GRANTED;
6324
6325 if (rssi > MPTCP_TARGET_BASED_RSSI_THRESHOLD) {
6326 mpte->mpte_flags |= MPTE_CELL_PROHIBITED;
6327 }
6328
6329 mptcp_check_subflows_and_add(mpte);
6330 mptcp_remove_subflows(mpte);
6331
6332 mpte->mpte_flags &= ~(MPTE_ACCESS_GRANTED | MPTE_CELL_PROHIBITED);
6333
6334next:
6335 socket_unlock(so: mp_so, refcount: 1);
6336 }
6337
6338 lck_mtx_unlock(lck: &mtcbinfo.mppi_lock);
6339}
6340
6341static void
6342mptcp_wifi_status_changed(void)
6343{
6344 struct mppcb *mpp;
6345
6346 /* Iterate over all MPTCP connections */
6347
6348 lck_mtx_lock(lck: &mtcbinfo.mppi_lock);
6349
6350 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
6351 struct socket *mp_so = mpp->mpp_socket;
6352 struct mptses *mpte = mpp->mpp_pcbe;
6353
6354 socket_lock(so: mp_so, refcount: 1);
6355
6356 /* Only handover- and urgency-mode are purely driven by Symptom's Wi-Fi status */
6357 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6358 mpte->mpte_svctype != MPTCP_SVCTYPE_PURE_HANDOVER &&
6359 mpte->mpte_svctype != MPTCP_SVCTYPE_TARGET_BASED) {
6360 goto next;
6361 }
6362
6363 mptcp_check_subflows_and_add(mpte);
6364 mptcp_check_subflows_and_remove(mpte);
6365
6366next:
6367 socket_unlock(so: mp_so, refcount: 1);
6368 }
6369
6370 lck_mtx_unlock(lck: &mtcbinfo.mppi_lock);
6371}
6372
6373struct mptcp_uuid_search_info {
6374 uuid_t target_uuid;
6375 proc_t found_proc;
6376 boolean_t is_proc_found;
6377};
6378
6379static int
6380mptcp_find_proc_filter(proc_t p, void *arg)
6381{
6382 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6383 int found;
6384
6385 if (info->is_proc_found) {
6386 return 0;
6387 }
6388
6389 /*
6390 * uuid_compare returns 0 if the uuids are matching, but the proc-filter
6391 * expects != 0 for a matching filter.
6392 */
6393 found = uuid_compare(uu1: proc_executableuuid_addr(p), uu2: info->target_uuid) == 0;
6394 if (found) {
6395 info->is_proc_found = true;
6396 }
6397
6398 return found;
6399}
6400
6401static int
6402mptcp_find_proc_callout(proc_t p, void * arg)
6403{
6404 struct mptcp_uuid_search_info *info = (struct mptcp_uuid_search_info *)arg;
6405
6406 if (uuid_compare(uu1: proc_executableuuid_addr(p), uu2: info->target_uuid) == 0) {
6407 info->found_proc = p;
6408 return PROC_CLAIMED_DONE;
6409 }
6410
6411 return PROC_RETURNED;
6412}
6413
6414static proc_t
6415mptcp_find_proc(const uuid_t uuid)
6416{
6417 struct mptcp_uuid_search_info info;
6418
6419 uuid_copy(dst: info.target_uuid, src: uuid);
6420 info.found_proc = PROC_NULL;
6421 info.is_proc_found = false;
6422
6423 proc_iterate(PROC_ALLPROCLIST, callout: mptcp_find_proc_callout, arg: &info,
6424 filterfn: mptcp_find_proc_filter, filterarg: &info);
6425
6426 return info.found_proc;
6427}
6428
6429void
6430mptcp_ask_symptoms(struct mptses *mpte)
6431{
6432 struct mptcp_symptoms_ask_uuid ask;
6433 struct socket *mp_so;
6434 struct proc *p = PROC_NULL;
6435 int pid, prio, err;
6436
6437 if (mptcp_kern_skt_unit == 0) {
6438 os_log_error(mptcp_log_handle, "%s - %lx: skt_unit is still 0\n",
6439 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6440 return;
6441 }
6442
6443 mp_so = mptetoso(mpte);
6444
6445 if (mp_so->so_flags & SOF_DELEGATED) {
6446 if (mpte->mpte_epid != 0) {
6447 p = proc_find(pid: mpte->mpte_epid);
6448 if (p != PROC_NULL) {
6449 /* We found a pid, check its UUID */
6450 if (uuid_compare(uu1: mp_so->e_uuid, uu2: proc_executableuuid_addr(p))) {
6451 /* It's not the same - we need to look for the real proc */
6452 proc_rele(p);
6453 p = PROC_NULL;
6454 }
6455 }
6456 }
6457
6458 if (p == PROC_NULL) {
6459 p = mptcp_find_proc(uuid: mp_so->e_uuid);
6460 if (p == PROC_NULL) {
6461 uuid_string_t uuid_string;
6462 uuid_unparse(uu: mp_so->e_uuid, out: uuid_string);
6463
6464 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for uuid %s\n",
6465 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), uuid_string);
6466
6467 return;
6468 }
6469 mpte->mpte_epid = proc_pid(p);
6470 }
6471
6472 pid = mpte->mpte_epid;
6473 uuid_copy(dst: ask.uuid, src: mp_so->e_uuid);
6474 } else {
6475 pid = mp_so->last_pid;
6476
6477 p = proc_find(pid);
6478 if (p == PROC_NULL) {
6479 os_log_error(mptcp_log_handle, "%s - %lx: Couldn't find proc for pid %u\n",
6480 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid);
6481 return;
6482 }
6483
6484 uuid_copy(dst: ask.uuid, src: mp_so->last_uuid);
6485 }
6486
6487
6488 ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;
6489
6490 prio = proc_get_effective_task_policy(task: proc_task(p), TASK_POLICY_ROLE);
6491
6492 if (prio == TASK_BACKGROUND_APPLICATION || prio == TASK_NONUI_APPLICATION ||
6493 prio == TASK_DARWINBG_APPLICATION) {
6494 ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
6495 } else if (prio == TASK_FOREGROUND_APPLICATION) {
6496 ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
6497 } else {
6498 ask.priority = MPTCP_SYMPTOMS_UNKNOWN;
6499 }
6500
6501 err = ctl_enqueuedata(kctlref: mptcp_kern_ctrl_ref, unit: mptcp_kern_skt_unit,
6502 data: &ask, len: sizeof(ask), CTL_DATA_EOR);
6503
6504 os_log(mptcp_log_handle, "%s - %lx: asked symptoms about pid %u, taskprio %u, prio %u, err %d\n",
6505 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), pid, prio, ask.priority, err);
6506
6507
6508 proc_rele(p);
6509}
6510
6511static errno_t
6512mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
6513 void *unitinfo)
6514{
6515#pragma unused(kctlref, kcunit, unitinfo)
6516
6517 OSDecrementAtomic(&mptcp_kern_skt_inuse);
6518
6519 return 0;
6520}
6521
6522static errno_t
6523mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
6524 mbuf_t m, int flags)
6525{
6526#pragma unused(kctlref, unitinfo, flags)
6527 symptoms_advisory_t *sa = NULL;
6528
6529 if (kcunit != mptcp_kern_skt_unit) {
6530 os_log_error(mptcp_log_handle, "%s: kcunit %u is different from expected one %u\n",
6531 __func__, kcunit, mptcp_kern_skt_unit);
6532 }
6533
6534 if (mbuf_pkthdr_len(mbuf: m) < sizeof(*sa)) {
6535 mbuf_freem(mbuf: m);
6536 return EINVAL;
6537 }
6538
6539 if (mbuf_len(mbuf: m) < sizeof(*sa)) {
6540 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu\n",
6541 __func__, mbuf_len(m), sizeof(*sa));
6542 mbuf_freem(mbuf: m);
6543 return EINVAL;
6544 }
6545
6546 sa = mbuf_data(mbuf: m);
6547
6548 if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
6549 os_log(mptcp_log_handle, "%s: wifi new,old: %d,%d, cell new, old: %d,%d\n", __func__,
6550 sa->sa_wifi_status, mptcp_advisory.sa_wifi_status,
6551 sa->sa_cell_status, mptcp_advisory.sa_cell_status);
6552
6553 if (sa->sa_wifi_status != mptcp_advisory.sa_wifi_status) {
6554 mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;
6555 mptcp_wifi_status_changed();
6556 }
6557 } else {
6558 struct mptcp_symptoms_answer answer;
6559 errno_t err;
6560
6561 /* We temporarily allow different sizes for ease of submission */
6562 if (mbuf_len(mbuf: m) != sizeof(uuid_t) + sizeof(*sa) &&
6563 mbuf_len(mbuf: m) != sizeof(answer)) {
6564 os_log_error(mptcp_log_handle, "%s: mbuf is %lu but need %lu or %lu\n",
6565 __func__, mbuf_len(m), sizeof(uuid_t) + sizeof(*sa),
6566 sizeof(answer));
6567 mbuf_free(mbuf: m);
6568 return EINVAL;
6569 }
6570
6571 memset(s: &answer, c: 0, n: sizeof(answer));
6572
6573 err = mbuf_copydata(mbuf: m, offset: 0, length: mbuf_len(mbuf: m), out_data: &answer);
6574 if (err) {
6575 os_log_error(mptcp_log_handle, "%s: mbuf_copydata returned %d\n", __func__, err);
6576 mbuf_free(mbuf: m);
6577 return err;
6578 }
6579
6580 mptcp_allow_uuid(uuid: answer.uuid, rssi: answer.rssi);
6581 }
6582
6583 mbuf_freem(mbuf: m);
6584 return 0;
6585}
6586
6587void
6588mptcp_control_register(void)
6589{
6590 /* Set up the advisory control socket */
6591 struct kern_ctl_reg mptcp_kern_ctl;
6592
6593 bzero(s: &mptcp_kern_ctl, n: sizeof(mptcp_kern_ctl));
6594 strlcpy(dst: mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
6595 n: sizeof(mptcp_kern_ctl.ctl_name));
6596 mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
6597 mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
6598 mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
6599 mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;
6600
6601 (void)ctl_register(userkctl: &mptcp_kern_ctl, kctlref: &mptcp_kern_ctrl_ref);
6602}
6603
6604mptcp_wifi_quality_t
6605mptcp_wifi_quality_for_session(struct mptses *mpte)
6606{
6607 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6608 if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER &&
6609 mptcp_advisory.sa_wifi_status) {
6610 return symptoms_is_wifi_lossy() ? MPTCP_WIFI_QUALITY_BAD : MPTCP_WIFI_QUALITY_GOOD;
6611 }
6612
6613 /*
6614 * If it's a first-party app and we don't have any info
6615 * about the Wi-Fi state, let's be pessimistic.
6616 */
6617 return MPTCP_WIFI_QUALITY_UNSURE;
6618 } else {
6619 if (symptoms_is_wifi_lossy()) {
6620 return MPTCP_WIFI_QUALITY_BAD;
6621 }
6622
6623 /*
6624 * If we are target-based (meaning, we allow to be more lax on
6625 * the when wifi is considered bad), we only *know* about the state once
6626 * we got the allowance from Symptoms (MPTE_ACCESS_GRANTED).
6627 *
6628 * If RSSI is not bad enough, MPTE_CELL_PROHIBITED will then
6629 * be set.
6630 *
6631 * In any other case (while in target-mode), consider WiFi bad
6632 * and we are going to ask for allowance from Symptoms anyway.
6633 */
6634 if (mpte->mpte_svctype == MPTCP_SVCTYPE_TARGET_BASED) {
6635 if (mpte->mpte_flags & MPTE_ACCESS_GRANTED &&
6636 mpte->mpte_flags & MPTE_CELL_PROHIBITED) {
6637 return MPTCP_WIFI_QUALITY_GOOD;
6638 }
6639
6640 return MPTCP_WIFI_QUALITY_BAD;
6641 }
6642
6643 return MPTCP_WIFI_QUALITY_GOOD;
6644 }
6645}
6646
6647boolean_t
6648symptoms_is_wifi_lossy(void)
6649{
6650 return (mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_OK) ? false : true;
6651}
6652
6653int
6654mptcp_freeq(struct mptcb *mp_tp)
6655{
6656 struct tseg_qent *q;
6657 int rv = 0;
6658 int count = 0;
6659
6660 while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
6661 LIST_REMOVE(q, tqe_q);
6662 m_freem(q->tqe_m);
6663 zfree(tcp_reass_zone, q);
6664 count++;
6665 rv = 1;
6666 }
6667 mp_tp->mpt_reassqlen = 0;
6668
6669 if (count > 0) {
6670 OSAddAtomic(-count, &mptcp_reass_total_qlen);
6671 }
6672
6673 return rv;
6674}
6675
6676static int
6677mptcp_post_event(u_int32_t event_code, int value)
6678{
6679 struct kev_mptcp_data event_data;
6680 struct kev_msg ev_msg;
6681
6682 memset(s: &ev_msg, c: 0, n: sizeof(ev_msg));
6683
6684 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6685 ev_msg.kev_class = KEV_NETWORK_CLASS;
6686 ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
6687 ev_msg.event_code = event_code;
6688
6689 event_data.value = value;
6690
6691 ev_msg.dv[0].data_ptr = &event_data;
6692 ev_msg.dv[0].data_length = sizeof(event_data);
6693
6694 return kev_post_msg(event: &ev_msg);
6695}
6696
6697static void
6698mptcp_set_cellicon(struct mptses *mpte, struct mptsub *mpts)
6699{
6700 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
6701 int error;
6702
6703 /* First-party apps (Siri) don't flip the cellicon */
6704 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6705 return;
6706 }
6707
6708 /* Subflow is disappearing - don't set it on this one */
6709 if (mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED)) {
6710 return;
6711 }
6712
6713 /* Fallen back connections are not triggering the cellicon */
6714 if (mpte->mpte_mptcb->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
6715 return;
6716 }
6717
6718 /* Remember the last time we set the cellicon. Needed for debouncing */
6719 mpte->mpte_last_cellicon_set = tcp_now;
6720
6721 tp->t_timer[TCPT_CELLICON] = OFFSET_FROM_START(tp, MPTCP_CELLICON_TOGGLE_RATE);
6722 tcp_sched_timers(tp);
6723
6724 if (mpts->mpts_flags & MPTSF_CELLICON_SET &&
6725 mpte->mpte_cellicon_increments != 0) {
6726 if (mptcp_cellicon_refcount == 0) {
6727 os_log_error(mptcp_log_handle, "%s - %lx: Cell should be set (count is %u), but it's zero!\n",
6728 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6729
6730 /* Continue, so that the icon gets set... */
6731 } else {
6732 /*
6733 * In this case, the cellicon is already set. No need to bump it
6734 * even higher
6735 */
6736
6737 return;
6738 }
6739 }
6740
6741 /* When tearing down this subflow, we need to decrement the
6742 * reference counter
6743 */
6744 mpts->mpts_flags |= MPTSF_CELLICON_SET;
6745
6746 /* This counter, so that when a session gets destroyed we decrement
6747 * the reference counter by whatever is left
6748 */
6749 mpte->mpte_cellicon_increments++;
6750
6751 if (OSIncrementAtomic(&mptcp_cellicon_refcount)) {
6752 /* If cellicon is already set, get out of here! */
6753 return;
6754 }
6755
6756 error = mptcp_post_event(KEV_MPTCP_CELLUSE, value: 1);
6757
6758 if (error) {
6759 os_log_error(mptcp_log_handle, "%s - %lx: Setting cellicon failed with %d\n",
6760 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), error);
6761 } else {
6762 os_log(mptcp_log_handle, "%s - %lx: successfully set the cellicon\n",
6763 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte));
6764 }
6765}
6766
6767void
6768mptcp_clear_cellicon(void)
6769{
6770 int error = mptcp_post_event(KEV_MPTCP_CELLUSE, value: 0);
6771
6772 if (error) {
6773 os_log_error(mptcp_log_handle, "%s: Unsetting cellicon failed with %d\n",
6774 __func__, error);
6775 } else {
6776 os_log(mptcp_log_handle, "%s: successfully unset the cellicon\n",
6777 __func__);
6778 }
6779}
6780
6781/*
6782 * Returns true if the icon has been flipped to WiFi.
6783 */
6784static boolean_t
6785__mptcp_unset_cellicon(uint32_t val)
6786{
6787 VERIFY(val < INT32_MAX);
6788 if (OSAddAtomic((int32_t)-val, &mptcp_cellicon_refcount) != 1) {
6789 return false;
6790 }
6791
6792 mptcp_clear_cellicon();
6793
6794 return true;
6795}
6796
6797void
6798mptcp_unset_cellicon(struct mptses *mpte, struct mptsub *mpts, uint32_t val)
6799{
6800 /* First-party apps (Siri) don't flip the cellicon */
6801 if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
6802 return;
6803 }
6804
6805 if (mpte->mpte_cellicon_increments == 0) {
6806 /* This flow never used cell - get out of here! */
6807 return;
6808 }
6809
6810 if (mptcp_cellicon_refcount == 0) {
6811 os_log_error(mptcp_log_handle, "%s - %lx: Cell is off, but should be at least %u\n",
6812 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6813
6814 return;
6815 }
6816
6817 if (mpts) {
6818 if (!(mpts->mpts_flags & MPTSF_CELLICON_SET)) {
6819 return;
6820 }
6821
6822 mpts->mpts_flags &= ~MPTSF_CELLICON_SET;
6823 }
6824
6825 if (mpte->mpte_cellicon_increments < val) {
6826 os_log_error(mptcp_log_handle, "%s - %lx: Increments is %u but want to dec by %u.\n",
6827 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments, val);
6828 val = mpte->mpte_cellicon_increments;
6829 }
6830
6831 mpte->mpte_cellicon_increments -= val;
6832
6833 if (__mptcp_unset_cellicon(val) == false) {
6834 return;
6835 }
6836
6837 /* All flows are gone - our counter should be at zero too! */
6838 if (mpte->mpte_cellicon_increments != 0) {
6839 os_log_error(mptcp_log_handle, "%s - %lx: Inconsistent state! Cell refcount is zero but increments are at %u\n",
6840 __func__, (unsigned long)VM_KERNEL_ADDRPERM(mpte), mpte->mpte_cellicon_increments);
6841 }
6842}
6843
6844void
6845mptcp_reset_rexmit_state(struct tcpcb *tp)
6846{
6847 struct mptsub *mpts;
6848 struct inpcb *inp;
6849 struct socket *so;
6850
6851 inp = tp->t_inpcb;
6852 if (inp == NULL) {
6853 return;
6854 }
6855
6856 so = inp->inp_socket;
6857 if (so == NULL) {
6858 return;
6859 }
6860
6861 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
6862 return;
6863 }
6864
6865 mpts = tp->t_mpsub;
6866
6867 mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
6868 so->so_flags &= ~SOF_MP_TRYFAILOVER;
6869}
6870
6871void
6872mptcp_reset_keepalive(struct tcpcb *tp)
6873{
6874 struct mptsub *mpts = tp->t_mpsub;
6875
6876 mpts->mpts_flags &= ~MPTSF_READ_STALL;
6877}
6878
6879static struct mppcb *
6880mtcp_alloc(void)
6881{
6882 return &kalloc_type(struct mpp_mtp, Z_WAITOK | Z_ZERO | Z_NOFAIL)->mpp;
6883}
6884
6885static void
6886mtcp_free(struct mppcb *mpp)
6887{
6888 struct mpp_mtp *mtp = __container_of(mpp, struct mpp_mtp, mpp);
6889
6890 kfree_type(struct mpp_mtp, mtp);
6891}
6892
6893/*
6894 * Protocol pr_init callback.
6895 */
6896void
6897mptcp_init(struct protosw *pp, struct domain *dp)
6898{
6899#pragma unused(dp)
6900 static int mptcp_initialized = 0;
6901 struct protosw *prp;
6902 struct ip6protosw *prp6;
6903
6904 VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
6905
6906 /* do this only once */
6907 if (mptcp_initialized) {
6908 return;
6909 }
6910 mptcp_initialized = 1;
6911
6912 mptcp_advisory.sa_wifi_status = SYMPTOMS_ADVISORY_WIFI_OK;
6913
6914 /*
6915 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
6916 * we must be able to find IPPROTO_TCP entries for both.
6917 */
6918 prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
6919 VERIFY(prp != NULL);
6920 bcopy(src: prp, dst: &mptcp_subflow_protosw, n: sizeof(*prp));
6921 bcopy(src: prp->pr_usrreqs, dst: &mptcp_subflow_usrreqs,
6922 n: sizeof(mptcp_subflow_usrreqs));
6923 mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
6924 mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
6925 mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
6926 mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
6927 mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
6928 mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
6929 /*
6930 * Socket filters shouldn't attach/detach to/from this protosw
6931 * since pr_protosw is to be used instead, which points to the
6932 * real protocol; if they do, it is a bug and we should panic.
6933 */
6934 mptcp_subflow_protosw.pr_filter_head.tqh_first =
6935 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
6936 mptcp_subflow_protosw.pr_filter_head.tqh_last =
6937 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
6938
6939 prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
6940 IPPROTO_TCP, SOCK_STREAM);
6941 VERIFY(prp6 != NULL);
6942 bcopy(src: prp6, dst: &mptcp_subflow_protosw6, n: sizeof(*prp6));
6943 bcopy(src: prp6->pr_usrreqs, dst: &mptcp_subflow_usrreqs6,
6944 n: sizeof(mptcp_subflow_usrreqs6));
6945 mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
6946 mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
6947 mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
6948 mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
6949 mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
6950 mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
6951 /*
6952 * Socket filters shouldn't attach/detach to/from this protosw
6953 * since pr_protosw is to be used instead, which points to the
6954 * real protocol; if they do, it is a bug and we should panic.
6955 */
6956 mptcp_subflow_protosw6.pr_filter_head.tqh_first =
6957 (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
6958 mptcp_subflow_protosw6.pr_filter_head.tqh_last =
6959 (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
6960
6961 bzero(s: &mtcbinfo, n: sizeof(mtcbinfo));
6962 TAILQ_INIT(&mtcbinfo.mppi_pcbs);
6963 mtcbinfo.mppi_alloc = mtcp_alloc;
6964 mtcbinfo.mppi_free = mtcp_free;
6965
6966 mtcbinfo.mppi_lock_grp = lck_grp_alloc_init(grp_name: "mppcb", LCK_GRP_ATTR_NULL);
6967 lck_attr_setdefault(attr: &mtcbinfo.mppi_lock_attr);
6968 lck_mtx_init(lck: &mtcbinfo.mppi_lock, grp: mtcbinfo.mppi_lock_grp,
6969 attr: &mtcbinfo.mppi_lock_attr);
6970
6971 mtcbinfo.mppi_gc = mptcp_gc;
6972 mtcbinfo.mppi_timer = mptcp_timer;
6973
6974 /* attach to MP domain for garbage collection to take place */
6975 mp_pcbinfo_attach(&mtcbinfo);
6976
6977 mptcp_log_handle = os_log_create(subsystem: "com.apple.xnu.net.mptcp", category: "mptcp");
6978}
6979