/*
 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/locks.h>
#include <kern/policy_internal.h>
#include <kern/zalloc.h>

#include <mach/sdt.h>

#include <sys/domain.h>
#include <sys/kdebug.h>
#include <sys/kern_control.h>
#include <sys/kernel.h>
#include <sys/mbuf.h>
#include <sys/mcache.h>
#include <sys/param.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/resourcevar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>

#include <net/content_filter.h>
#include <net/if.h>
#include <net/if_var.h>
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_var.h>
#include <netinet/mptcp_var.h>
#include <netinet/mptcp.h>
#include <netinet/mptcp_opt.h>
#include <netinet/mptcp_seq.h>
#include <netinet/mptcp_timer.h>
#include <libkern/crypto/sha1.h>
#if INET6
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6protosw.h>
#endif /* INET6 */
#include <dev/random/randomdev.h>

/*
 * Notes on MPTCP implementation.
 *
 * MPTCP is implemented as a <SOCK_STREAM,IPPROTO_TCP> protocol in the
 * PF_MULTIPATH communication domain. The structure mtcbinfo describes the
 * MPTCP instance of a Multipath protocol in that domain. It is used to keep
 * track of all MPTCP PCB instances in the system, and is protected by the
 * global lock mppi_lock.
 *
 * An MPTCP socket is opened by calling socket(PF_MULTIPATH, SOCK_STREAM,
 * IPPROTO_TCP). Upon success, a Multipath PCB gets allocated and along with
 * it comes an MPTCP Session and an MPTCP PCB. All three structures are
 * allocated from the same memory block, and each structure has a pointer
 * to the adjacent ones. The layout is defined by the mpp_mtp structure.
 * The socket lock (mpp_lock) is used to protect accesses to the Multipath
 * PCB (mppcb) as well as the MPTCP Session (mptses).
 *
 * The MPTCP Session is an MPTCP-specific extension to the Multipath PCB.
 *
 * A functioning MPTCP Session consists of one or more subflow sockets. Each
 * subflow socket is essentially a regular PF_INET/PF_INET6 TCP socket, and is
 * represented by the mptsub structure. Because each subflow requires access
 * to the MPTCP Session, the MPTCP socket's so_usecount is bumped up for each
 * subflow. This gets decremented prior to the subflow's destruction.
 *
 * To handle events (read, write, control) from the subflows, we do direct
 * upcalls into the specific function.
 *
 * The whole MPTCP connection is protected by a single lock, the MPTCP
 * socket's lock. Incoming data on a subflow also ends up taking this single
 * lock. To achieve the latter, tcp_lock/unlock has been changed to use the
 * MPTCP socket's lock instead.
 *
 * An MPTCP socket will be destroyed when its so_usecount drops to zero; this
 * work is done by the MPTCP garbage collector which is invoked on demand by
 * the PF_MULTIPATH garbage collector. This process will take place once all
 * of the subflows have been destroyed.
 */
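
/*
 * Illustrative user-space sketch (not compiled into the kernel): opening an
 * MPTCP socket as described above uses nothing but the standard socket(2)
 * call with the PF_MULTIPATH domain; the connection itself is then driven
 * via connectx(2), which reaches this file through the PF_MULTIPATH protosw.
 *
 *	int fd = socket(PF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	if (fd == -1)
 *		err(1, "socket(PF_MULTIPATH)");
 */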

static void mptcp_attach_to_subf(struct socket *, struct mptcb *, uint8_t);
static void mptcp_detach_mptcb_from_subf(struct mptcb *, struct socket *);

static uint32_t mptcp_gc(struct mppcbinfo *);
static int mptcp_subflow_soreceive(struct socket *, struct sockaddr **,
    struct uio *, struct mbuf **, struct mbuf **, int *);
static int mptcp_subflow_sosend(struct socket *, struct sockaddr *,
    struct uio *, struct mbuf *, struct mbuf *, int);
static void mptcp_subflow_rupcall(struct socket *, void *, int);
static void mptcp_subflow_input(struct mptses *, struct mptsub *);
static void mptcp_subflow_wupcall(struct socket *, void *, int);
static void mptcp_subflow_eupcall1(struct socket *, void *, uint32_t);
static void mptcp_update_last_owner(struct socket *so, struct socket *mp_so);
static void mptcp_drop_tfo_data(struct mptses *, struct mptsub *);

static void mptcp_subflow_abort(struct mptsub *, int);

static void mptcp_send_dfin(struct socket *so);
/*
 * Possible return values for subflow event handlers. Note that success
 * values must be greater than or equal to MPTS_EVRET_OK. Values less than
 * that indicate errors or actions which require immediate attention; they
 * will prevent the rest of the handlers from processing their respective
 * events until the next round of event processing.
 */
typedef enum {
	MPTS_EVRET_DELETE = 1,			/* delete this subflow */
	MPTS_EVRET_OK = 2,			/* OK */
	MPTS_EVRET_CONNECT_PENDING = 3,		/* resume pending connects */
	MPTS_EVRET_DISCONNECT_FALLBACK = 4,	/* abort all but preferred */
} ev_ret_t;

static ev_ret_t mptcp_subflow_events(struct mptses *, struct mptsub *, uint64_t *);
static ev_ret_t mptcp_subflow_propagate_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_nosrcaddr_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_failover_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_ifdenied_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_connected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_disconnected_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mpstatus_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mustrst_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_mpcantrcvmore_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_adaptive_rtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);
static ev_ret_t mptcp_subflow_adaptive_wtimo_ev(struct mptses *, struct mptsub *, uint64_t *, uint64_t);

static const char *mptcp_evret2str(ev_ret_t);

static void mptcp_do_sha1(mptcp_key_t *, char *);
static void mptcp_init_local_parms(struct mptses *);

static unsigned int mptsub_zone_size;		/* size of mptsub */
static struct zone *mptsub_zone;		/* zone for mptsub */

static unsigned int mptopt_zone_size;		/* size of mptopt */
static struct zone *mptopt_zone;		/* zone for mptopt */

static unsigned int mpt_subauth_entry_size;	/* size of subf auth entry */
static struct zone *mpt_subauth_zone;		/* zone of subf auth entry */

struct mppcbinfo mtcbinfo;

#define	MPTCP_SUBFLOW_WRITELEN	(8 * 1024)	/* bytes to write each time */
#define	MPTCP_SUBFLOW_READLEN	(8 * 1024)	/* bytes to read each time */

SYSCTL_DECL(_net_inet);

SYSCTL_NODE(_net_inet, OID_AUTO, mptcp, CTLFLAG_RW|CTLFLAG_LOCKED, 0, "MPTCP");

uint32_t mptcp_dbg_area = 31;		/* more noise if greater than 1 */
SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, dbg_area, CTLFLAG_RW|CTLFLAG_LOCKED,
    &mptcp_dbg_area, 0, "MPTCP debug area");

uint32_t mptcp_dbg_level = 1;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, dbg_level, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_dbg_level, 0, "MPTCP debug level");

SYSCTL_UINT(_net_inet_mptcp, OID_AUTO, pcbcount, CTLFLAG_RD|CTLFLAG_LOCKED,
    &mtcbinfo.mppi_count, 0, "Number of active PCBs");


static int mptcp_alternate_port = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, alternate_port, CTLFLAG_RW | CTLFLAG_LOCKED,
    &mptcp_alternate_port, 0, "Set alternate port for MPTCP connections");

static struct protosw mptcp_subflow_protosw;
static struct pr_usrreqs mptcp_subflow_usrreqs;
#if INET6
static struct ip6protosw mptcp_subflow_protosw6;
static struct pr_usrreqs mptcp_subflow_usrreqs6;
#endif /* INET6 */

static uint8_t mptcp_create_subflows_scheduled;

typedef struct mptcp_subflow_event_entry {
	uint64_t	sofilt_hint_mask;
	ev_ret_t	(*sofilt_hint_ev_hdlr)(
			    struct mptses *mpte,
			    struct mptsub *mpts,
			    uint64_t *p_mpsofilt_hint,
			    uint64_t event);
} mptsub_ev_entry_t;

static uint8_t mptcp_cellicon_is_set;
static uint32_t mptcp_last_cellicon_set;
#define	MPTCP_CELLICON_TOGGLE_RATE	(5 * TCP_RETRANSHZ) /* Only toggle every 5 seconds */
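
/*
 * A minimal sketch, assuming the TSTMP_GT() timestamp-comparison macro from
 * tcp_seq.h, of the rate-limit check the constant above implies (the actual
 * toggling lives in mptcp_set/unset_cellicon(), outside this excerpt):
 *
 *	if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
 *	    tcp_now))
 *		return;		// toggled less than five seconds ago
 *	mptcp_last_cellicon_set = tcp_now;
 */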

/*
 * XXX The order of the event handlers below is really
 * really important. Think twice before changing it.
 */
static mptsub_ev_entry_t mpsub_ev_entry_tbl[] = {
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPCANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpcantrcvmore_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPFAILOVER,
		.sofilt_hint_ev_hdlr = mptcp_subflow_failover_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNRESET,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MUSTRST,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mustrst_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CANTRCVMORE,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_TIMEOUT,
		.sofilt_hint_ev_hdlr = mptcp_subflow_propagate_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_NOSRCADDR,
		.sofilt_hint_ev_hdlr = mptcp_subflow_nosrcaddr_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_IFDENIED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_ifdenied_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_CONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_connected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_MPSTATUS,
		.sofilt_hint_ev_hdlr = mptcp_subflow_mpstatus_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_DISCONNECTED,
		.sofilt_hint_ev_hdlr = mptcp_subflow_disconnected_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_RTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_rtimo_ev,
	},
	{
		.sofilt_hint_mask = SO_FILT_HINT_ADAPTIVE_WTIMO,
		.sofilt_hint_ev_hdlr = mptcp_subflow_adaptive_wtimo_ev,
	},
};
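
/*
 * A simplified sketch of how the ordered table above is consumed; the
 * authoritative loop is in mptcp_subflow_events() (the field name
 * mpts_evctl for the subflow's pending-event bits is the only assumption
 * here). Any return value below MPTS_EVRET_OK aborts this round of
 * processing:
 *
 *	uint64_t events = mpts->mpts_evctl;
 *	ev_ret_t ret = MPTS_EVRET_OK;
 *	for (i = 0; i < (int)(sizeof (mpsub_ev_entry_tbl) /
 *	    sizeof (mpsub_ev_entry_tbl[0])); i++) {
 *		if (!(events & mpsub_ev_entry_tbl[i].sofilt_hint_mask))
 *			continue;
 *		mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
 *		ret = mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts,
 *		    p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
 *		if (ret < MPTS_EVRET_OK)
 *			break;
 *	}
 */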

os_log_t mptcp_log_handle;

/*
 * Protocol pr_init callback.
 */
void
mptcp_init(struct protosw *pp, struct domain *dp)
{
#pragma unused(dp)
	static int mptcp_initialized = 0;
	struct protosw *prp;
#if INET6
	struct ip6protosw *prp6;
#endif /* INET6 */

	VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);

	/* do this only once */
	if (mptcp_initialized)
		return;
	mptcp_initialized = 1;

	/*
	 * Since PF_MULTIPATH gets initialized after PF_INET/INET6,
	 * we must be able to find IPPROTO_TCP entries for both.
	 */
	prp = pffindproto_locked(PF_INET, IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp != NULL);
	bcopy(prp, &mptcp_subflow_protosw, sizeof (*prp));
	bcopy(prp->pr_usrreqs, &mptcp_subflow_usrreqs,
	    sizeof (mptcp_subflow_usrreqs));
	mptcp_subflow_protosw.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw.pr_usrreqs = &mptcp_subflow_usrreqs;
	mptcp_subflow_usrreqs.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;

#if INET6
	prp6 = (struct ip6protosw *)pffindproto_locked(PF_INET6,
	    IPPROTO_TCP, SOCK_STREAM);
	VERIFY(prp6 != NULL);
	bcopy(prp6, &mptcp_subflow_protosw6, sizeof (*prp6));
	bcopy(prp6->pr_usrreqs, &mptcp_subflow_usrreqs6,
	    sizeof (mptcp_subflow_usrreqs6));
	mptcp_subflow_protosw6.pr_entry.tqe_next = NULL;
	mptcp_subflow_protosw6.pr_entry.tqe_prev = NULL;
	mptcp_subflow_protosw6.pr_usrreqs = &mptcp_subflow_usrreqs6;
	mptcp_subflow_usrreqs6.pru_soreceive = mptcp_subflow_soreceive;
	mptcp_subflow_usrreqs6.pru_sosend = mptcp_subflow_sosend;
	mptcp_subflow_usrreqs6.pru_rcvoob = pru_rcvoob_notsupp;
	/*
	 * Socket filters shouldn't attach/detach to/from this protosw
	 * since pr_protosw is to be used instead, which points to the
	 * real protocol; if they do, it is a bug and we should panic.
	 */
	mptcp_subflow_protosw6.pr_filter_head.tqh_first =
	    (struct socket_filter *)(uintptr_t)0xdeadbeefdeadbeef;
	mptcp_subflow_protosw6.pr_filter_head.tqh_last =
	    (struct socket_filter **)(uintptr_t)0xdeadbeefdeadbeef;
#endif /* INET6 */

	bzero(&mtcbinfo, sizeof (mtcbinfo));
	TAILQ_INIT(&mtcbinfo.mppi_pcbs);
	mtcbinfo.mppi_size = sizeof (struct mpp_mtp);
	if ((mtcbinfo.mppi_zone = zinit(mtcbinfo.mppi_size,
	    1024 * mtcbinfo.mppi_size, 8192, "mptcb")) == NULL) {
		panic("%s: unable to allocate MPTCP PCB zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mtcbinfo.mppi_zone, Z_CALLERACCT, FALSE);
	zone_change(mtcbinfo.mppi_zone, Z_EXPAND, TRUE);

	mtcbinfo.mppi_lock_grp_attr = lck_grp_attr_alloc_init();
	mtcbinfo.mppi_lock_grp = lck_grp_alloc_init("mppcb",
	    mtcbinfo.mppi_lock_grp_attr);
	mtcbinfo.mppi_lock_attr = lck_attr_alloc_init();
	lck_mtx_init(&mtcbinfo.mppi_lock, mtcbinfo.mppi_lock_grp,
	    mtcbinfo.mppi_lock_attr);

	mtcbinfo.mppi_gc = mptcp_gc;
	mtcbinfo.mppi_timer = mptcp_timer;

	/* attach to MP domain for garbage collection to take place */
	mp_pcbinfo_attach(&mtcbinfo);

	mptsub_zone_size = sizeof (struct mptsub);
	if ((mptsub_zone = zinit(mptsub_zone_size, 1024 * mptsub_zone_size,
	    8192, "mptsub")) == NULL) {
		panic("%s: unable to allocate MPTCP subflow zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptsub_zone, Z_CALLERACCT, FALSE);
	zone_change(mptsub_zone, Z_EXPAND, TRUE);

	mptopt_zone_size = sizeof (struct mptopt);
	if ((mptopt_zone = zinit(mptopt_zone_size, 128 * mptopt_zone_size,
	    1024, "mptopt")) == NULL) {
		panic("%s: unable to allocate MPTCP option zone\n", __func__);
		/* NOTREACHED */
	}
	zone_change(mptopt_zone, Z_CALLERACCT, FALSE);
	zone_change(mptopt_zone, Z_EXPAND, TRUE);

	mpt_subauth_entry_size = sizeof (struct mptcp_subf_auth_entry);
	if ((mpt_subauth_zone = zinit(mpt_subauth_entry_size,
	    1024 * mpt_subauth_entry_size, 8192, "mptauth")) == NULL) {
		panic("%s: unable to allocate MPTCP address auth zone\n",
		    __func__);
		/* NOTREACHED */
	}
	zone_change(mpt_subauth_zone, Z_CALLERACCT, FALSE);
	zone_change(mpt_subauth_zone, Z_EXPAND, TRUE);

	mptcp_last_cellicon_set = tcp_now;

	mptcp_log_handle = os_log_create("com.apple.xnu.net.mptcp", "mptcp");
}

int
mptcp_get_statsindex(struct mptcp_itf_stats *stats, const struct mptsub *mpts)
{
	const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

	int i, index = -1;

	if (ifp == NULL) {
		mptcplog((LOG_ERR, "%s: no ifp on subflow\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		return (-1);
	}

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (stats[i].ifindex == IFSCOPE_NONE) {
			if (index < 0)
				index = i;
			continue;
		}

		if (stats[i].ifindex == ifp->if_index) {
			index = i;
			return (index);
		}
	}

	if (index != -1) {
		stats[index].ifindex = ifp->if_index;
		if (stats[index].is_expensive == 0)
			stats[index].is_expensive = IFNET_IS_CELLULAR(ifp);
	}

	return (index);
}

void
mptcpstats_inc_switch(struct mptses *mpte, const struct mptsub *mpts)
{
	int index;

	tcpstat.tcps_mp_switches++;
	mpte->mpte_subflow_switches++;

	index = mptcp_get_statsindex(mpte->mpte_itfstats, mpts);

	if (index != -1)
		mpte->mpte_itfstats[index].switches++;
}

/*
 * Flushes all recorded socket options from an MP socket.
 */
static void
mptcp_flush_sopts(struct mptses *mpte)
{
	struct mptopt *mpo, *tmpo;

	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		mptcp_sopt_remove(mpte, mpo);
		mptcp_sopt_free(mpo);
	}
	VERIFY(TAILQ_EMPTY(&mpte->mpte_sopts));
}

/*
 * Create an MPTCP session, called as a result of opening an MPTCP socket.
 */
int
mptcp_sescreate(struct mppcb *mpp)
{
	struct mppcbinfo *mppi;
	struct mptses *mpte;
	struct mptcb *mp_tp;

	VERIFY(mpp != NULL);
	mppi = mpp->mpp_pcbinfo;
	VERIFY(mppi != NULL);

	__IGNORE_WCASTALIGN(mpte = &((struct mpp_mtp *)mpp)->mpp_ses);
	__IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);

	/* MPTCP Multipath PCB Extension */
	bzero(mpte, sizeof (*mpte));
	VERIFY(mpp->mpp_pcbe == NULL);
	mpp->mpp_pcbe = mpte;
	mpte->mpte_mppcb = mpp;
	mpte->mpte_mptcb = mp_tp;

	TAILQ_INIT(&mpte->mpte_sopts);
	TAILQ_INIT(&mpte->mpte_subflows);
	mpte->mpte_associd = SAE_ASSOCID_ANY;
	mpte->mpte_connid_last = SAE_CONNID_ANY;

	mpte->mpte_itfinfo = &mpte->_mpte_itfinfo[0];
	mpte->mpte_itfinfo_size = MPTE_ITFINFO_SIZE;

	if (mptcp_alternate_port)
		mpte->mpte_alternate_port = htons(mptcp_alternate_port);

	/* MPTCP Protocol Control Block */
	bzero(mp_tp, sizeof (*mp_tp));
	mp_tp->mpt_mpte = mpte;
	mp_tp->mpt_state = MPTCPS_CLOSED;

	DTRACE_MPTCP1(session__create, struct mppcb *, mpp);

	return (0);
}
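
/*
 * For reference, a sketch of the co-allocated layout the casts above rely
 * on; the authoritative definition of struct mpp_mtp lives in mptcp_var.h,
 * so the exact member names here are an assumption:
 *
 *	struct mpp_mtp {
 *		struct mppcb	mpp;		// Multipath PCB
 *		struct mptses	mpp_ses;	// MPTCP Session
 *		struct mptcb	mtcb;		// MPTCP PCB
 *	};
 */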

static void
mptcpstats_get_bytes(struct mptses *mpte, boolean_t initial_cell,
    uint64_t *cellbytes, uint64_t *allbytes)
{
	int64_t mycellbytes = 0;
	uint64_t myallbytes = 0;
	int i;

	for (i = 0; i < MPTCP_ITFSTATS_SIZE; i++) {
		if (mpte->mpte_itfstats[i].is_expensive) {
			mycellbytes += mpte->mpte_itfstats[i].mpis_txbytes;
			mycellbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
		}

		myallbytes += mpte->mpte_itfstats[i].mpis_txbytes;
		myallbytes += mpte->mpte_itfstats[i].mpis_rxbytes;
	}

	if (initial_cell) {
		/* discount both tx and rx bytes of the initial subflow */
		mycellbytes -= mpte->mpte_init_txbytes;
		mycellbytes -= mpte->mpte_init_rxbytes;
	}

	if (mycellbytes < 0) {
		mptcplog((LOG_ERR, "%s cellbytes is %lld\n", __func__,
		    (long long)mycellbytes),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
		*cellbytes = 0;
		*allbytes = 0;
	} else {
		*cellbytes = mycellbytes;
		*allbytes = myallbytes;
	}
}

static void
mptcpstats_session_wrapup(struct mptses *mpte)
{
	boolean_t cell = mpte->mpte_initial_cell;

	switch (mpte->mpte_svctype) {
	case MPTCP_SVCTYPE_HANDOVER:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_cell++;

				if (mpte->mpte_used_wifi)
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_handover_success_wifi++;

				if (mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
			}
		} else {
			tcpstat.tcps_mptcp_handover_attempt++;

			if (cell && mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_cell++;

				if (mpte->mpte_used_wifi)
					tcpstat.tcps_mptcp_handover_wifi_from_cell++;
			} else if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_handover_success_wifi++;

				if (mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_handover_cell_from_wifi++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_handover_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_handover_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_INTERACTIVE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_fp_interactive_success++;

				if (!cell && mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
			}
		} else {
			tcpstat.tcps_mptcp_interactive_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_interactive_success++;

				if (!cell && mpte->mpte_used_cell)
					tcpstat.tcps_mptcp_interactive_cell_from_wifi++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_interactive_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_interactive_all_bytes += allbytes;
		}
		break;
	case MPTCP_SVCTYPE_AGGREGATE:
		if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
			tcpstat.tcps_mptcp_fp_aggregate_attempt++;

			if (mpte->mpte_handshake_success)
				tcpstat.tcps_mptcp_fp_aggregate_success++;
		} else {
			tcpstat.tcps_mptcp_aggregate_attempt++;

			if (mpte->mpte_handshake_success) {
				tcpstat.tcps_mptcp_aggregate_success++;
			}
		}

		if (mpte->mpte_handshake_success) {
			uint64_t cellbytes;
			uint64_t allbytes;

			mptcpstats_get_bytes(mpte, cell, &cellbytes, &allbytes);

			tcpstat.tcps_mptcp_aggregate_cell_bytes += cellbytes;
			tcpstat.tcps_mptcp_aggregate_all_bytes += allbytes;
		}
		break;
	}

	if (cell && mpte->mpte_handshake_success && mpte->mpte_used_wifi)
		tcpstat.tcps_mptcp_back_to_wifi++;

	if (mpte->mpte_triggered_cell)
		tcpstat.tcps_mptcp_triggered_cell++;
}

/*
 * Destroy an MPTCP session.
 */
static void
mptcp_session_destroy(struct mptses *mpte)
{
	struct mptcb *mp_tp;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	mp_tp = mpte->mpte_mptcb;
	VERIFY(mp_tp != NULL);

	mptcpstats_session_wrapup(mpte);

	mptcp_unset_cellicon();

	/*
	 * MPTCP Multipath PCB Extension section
	 */
	mptcp_flush_sopts(mpte);
	VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows) && mpte->mpte_numflows == 0);

	if (mpte->mpte_itfinfo_size > MPTE_ITFINFO_SIZE)
		_FREE(mpte->mpte_itfinfo, M_TEMP);

	mpte->mpte_itfinfo = NULL;

	m_freem_list(mpte->mpte_reinjectq);

	/*
	 * MPTCP Protocol Control Block section
	 */
	DTRACE_MPTCP2(session__destroy, struct mptses *, mpte,
	    struct mptcb *, mp_tp);
}

static boolean_t
mptcp_ok_to_create_subflows(struct mptcb *mp_tp)
{
	return (mp_tp->mpt_state >= MPTCPS_ESTABLISHED &&
	    mp_tp->mpt_state < MPTCPS_FIN_WAIT_1 &&
	    !(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP));
}

static int
mptcp_synthesize_nat64(struct in6_addr *addr, uint32_t len, struct in_addr *addrv4)
{
	static const struct in6_addr well_known_prefix = {
		.__u6_addr.__u6_addr8 = {0x00, 0x64, 0xff, 0x9b, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
					 0x00, 0x00, 0x00, 0x00},
	};
	char buf[MAX_IPv6_STR_LEN];
	char *ptrv4 = (char *)addrv4;
	char *ptr = (char *)addr;

	if (IN_ZERONET(ntohl(addrv4->s_addr)) ||	// 0.0.0.0/8 Source hosts on local network
	    IN_LOOPBACK(ntohl(addrv4->s_addr)) ||	// 127.0.0.0/8 Loopback
	    IN_LINKLOCAL(ntohl(addrv4->s_addr)) ||	// 169.254.0.0/16 Link Local
	    IN_DS_LITE(ntohl(addrv4->s_addr)) ||	// 192.0.0.0/29 DS-Lite
	    IN_6TO4_RELAY_ANYCAST(ntohl(addrv4->s_addr)) || // 192.88.99.0/24 6to4 Relay Anycast
	    IN_MULTICAST(ntohl(addrv4->s_addr)) ||	// 224.0.0.0/4 Multicast
	    INADDR_BROADCAST == addrv4->s_addr) {	// 255.255.255.255/32 Limited Broadcast
		return (-1);
	}

	/* Check for the well-known prefix */
	if (len == NAT64_PREFIX_LEN_96 &&
	    IN6_ARE_ADDR_EQUAL(addr, &well_known_prefix)) {
		if (IN_PRIVATE(ntohl(addrv4->s_addr)) ||	// 10.0.0.0/8, 172.16.0.0/12, 192.168.0.0/16 Private-Use
		    IN_SHARED_ADDRESS_SPACE(ntohl(addrv4->s_addr)))	// 100.64.0.0/10 Shared Address Space
			return (-1);
	}

	switch (len) {
	case NAT64_PREFIX_LEN_96:
		memcpy(ptr + 12, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_64:
		memcpy(ptr + 9, ptrv4, 4);
		break;
	case NAT64_PREFIX_LEN_56:
		memcpy(ptr + 7, ptrv4, 1);
		memcpy(ptr + 9, ptrv4 + 1, 3);
		break;
	case NAT64_PREFIX_LEN_48:
		memcpy(ptr + 6, ptrv4, 2);
		memcpy(ptr + 9, ptrv4 + 2, 2);
		break;
	case NAT64_PREFIX_LEN_40:
		memcpy(ptr + 5, ptrv4, 3);
		memcpy(ptr + 9, ptrv4 + 3, 1);
		break;
	case NAT64_PREFIX_LEN_32:
		memcpy(ptr + 4, ptrv4, 4);
		break;
	default:
		panic("NAT64-prefix len is wrong: %u\n", len);
	}

	os_log_info(mptcp_log_handle, "%s: nat64prefix-len %u synthesized %s\n",
	    __func__, len,
	    inet_ntop(AF_INET6, (void *)addr, buf, sizeof(buf)));

	return (0);
}
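
/*
 * Worked example for the well-known /96 prefix handled above (RFC 6052):
 * with addrv4 = 192.0.2.33 (bytes c0 00 02 21) and the prefix 64:ff9b::/96,
 * the memcpy(ptr + 12, ptrv4, 4) places the four IPv4 bytes at offsets
 * 12-15 of the IPv6 address, yielding 64:ff9b::c000:221.
 */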

static void
mptcp_trigger_cell_bringup(struct mptses *mpte)
{
	struct socket *mp_so = mptetoso(mpte);

	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		uuid_string_t uuidstr;
		int err;

		err = necp_client_assert_bb_radio_manager(mpsotomppcb(mp_so)->necp_client_uuid,
		    TRUE);

		if (err == 0)
			mpte->mpte_triggered_cell = 1;

		uuid_unparse_upper(mpsotomppcb(mp_so)->necp_client_uuid, uuidstr);
		os_log_info(mptcp_log_handle, "%s asked irat to bringup cell for uuid %s, err %d\n",
		    __func__, uuidstr, err);
	} else {
		os_log_info(mptcp_log_handle, "%s UUID is already null\n", __func__);
	}
}


void
mptcp_check_subflows_and_add(struct mptses *mpte)
{
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	boolean_t cellular_viable = FALSE;
	boolean_t want_cellular = TRUE;
	uint32_t i;

	if (!mptcp_ok_to_create_subflows(mp_tp))
		return;

	for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
		struct mpt_itf_info *info;
		struct mptsub *mpts;
		struct ifnet *ifp;
		uint32_t ifindex;
		int found = 0;

		info = &mpte->mpte_itfinfo[i];

		if (info->no_mptcp_support)
			continue;

		ifindex = info->ifindex;
		if (ifindex == IFSCOPE_NONE)
			continue;

		ifnet_head_lock_shared();
		ifp = ifindex2ifnet[ifindex];
		ifnet_head_done();

		if (ifp == NULL)
			continue;

		if (IFNET_IS_CELLULAR(ifp))
			cellular_viable = TRUE;

		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			const struct ifnet *subifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

			if (subifp == NULL)
				continue;

			/*
			 * In Handover mode, only create a cell subflow if
			 * 1. Wi-Fi Assist is active
			 * 2. Symptoms marked WiFi as weak
			 * 3. We are experiencing RTOs or we are not sending data.
			 *
			 * This covers the scenario, where:
			 * 1. We send and get retransmission timeouts (thus,
			 *    we confirmed that WiFi is indeed bad).
			 * 2. We are not sending and the server tries to send.
			 *    Establishing a cell-subflow gives the server a
			 *    chance to send us some data over cell if WiFi
			 *    is dead. We establish the subflow with the
			 *    backup-bit set, so the server is not allowed to
			 *    send on this subflow as long as WiFi is providing
			 *    good performance.
			 */
			if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER &&
			    !IFNET_IS_CELLULAR(subifp) &&
			    !(mpts->mpts_flags & (MPTSF_DISCONNECTING | MPTSF_DISCONNECTED | MPTSF_CLOSE_REQD)) &&
			    (mptcp_is_wifi_unusable(mpte) == 0 ||
			    (sototcpcb(mpts->mpts_socket)->t_rxtshift < mptcp_fail_thresh * 2 &&
			    ((mpte->mpte_flags & MPTE_FIRSTPARTY) || mptetoso(mpte)->so_snd.sb_cc)))) {
				os_log_debug(mptcp_log_handle, "%s handover, wifi state %d rxt %u first-party %u sb_cc %u ifindex %u this %u\n",
				    __func__, mptcp_is_wifi_unusable(mpte),
				    sototcpcb(mpts->mpts_socket)->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY),
				    mptetoso(mpte)->so_snd.sb_cc,
				    ifindex, subifp->if_index);
				found = 1;

				/* We found a proper subflow on WiFi - no need for cell */
				want_cellular = FALSE;
				break;
			} else {
				os_log_debug(mptcp_log_handle, "%s svc %u cell %u flags %#x unusable %d rtx %u first %u sbcc %u\n",
				    __func__, mpte->mpte_svctype, IFNET_IS_CELLULAR(subifp), mpts->mpts_flags,
				    mptcp_is_wifi_unusable(mpte), sototcpcb(mpts->mpts_socket)->t_rxtshift,
				    !!(mpte->mpte_flags & MPTE_FIRSTPARTY), mptetoso(mpte)->so_snd.sb_cc);
			}

			if (subifp->if_index == ifindex &&
			    !(mpts->mpts_socket->so_state & SS_ISDISCONNECTED) &&
			    sototcpcb(mpts->mpts_socket)->t_state != TCPS_CLOSED) {
				/*
				 * We found a subflow on this interface.
				 * No need to create a new one.
				 */
				found = 1;
				break;
			}
		}

		if (!found && !(mpte->mpte_flags & MPTE_FIRSTPARTY) &&
		    !(mpte->mpte_flags & MPTE_ACCESS_GRANTED) &&
		    mptcp_developer_mode == 0) {
			mptcp_ask_symptoms(mpte);
			return;
		}

		if (!found) {
			struct sockaddr *dst = &mpte->mpte_dst;
			struct sockaddr_in6 nat64pre;

			if (mpte->mpte_dst.sa_family == AF_INET &&
			    !info->has_v4_conn && info->has_nat64_conn) {
				struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
				int error, j;

				bzero(&nat64pre, sizeof(struct sockaddr_in6));

				error = ifnet_get_nat64prefix(ifp, nat64prefixes);
				if (error) {
					os_log_error(mptcp_log_handle, "%s: no NAT64-prefix on itf %s, error %d\n",
					    __func__, ifp->if_name, error);
					continue;
				}

				for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
					if (nat64prefixes[j].prefix_len != 0)
						break;
				}

				VERIFY(j < NAT64_MAX_NUM_PREFIXES);

				error = mptcp_synthesize_nat64(&nat64prefixes[j].ipv6_prefix,
				    nat64prefixes[j].prefix_len,
				    &mpte->__mpte_dst_v4.sin_addr);
				if (error != 0) {
					os_log_info(mptcp_log_handle, "%s: cannot synthesize this addr\n",
					    __func__);
					continue;
				}

				memcpy(&nat64pre.sin6_addr,
				    &nat64prefixes[j].ipv6_prefix,
				    sizeof(nat64pre.sin6_addr));
				nat64pre.sin6_len = sizeof(struct sockaddr_in6);
				nat64pre.sin6_family = AF_INET6;
				nat64pre.sin6_port = mpte->__mpte_dst_v6.sin6_port;
				nat64pre.sin6_flowinfo = 0;
				nat64pre.sin6_scope_id = 0;

				dst = (struct sockaddr *)&nat64pre;
			}

			/* Initial subflow started on a NAT64'd address? */
			if (mpte->mpte_dst.sa_family == AF_INET6 &&
			    mpte->mpte_dst_v4_nat64.sin_family == AF_INET) {
				dst = (struct sockaddr *)&mpte->mpte_dst_v4_nat64;
			}

			if (dst->sa_family == AF_INET && !info->has_v4_conn)
				continue;
			if (dst->sa_family == AF_INET6 && !info->has_v6_conn)
				continue;

			mptcp_subflow_add(mpte, NULL, dst, ifindex, NULL);
		}
	}

	if (!cellular_viable && want_cellular) {
		/* Trigger Cell Bringup */
		mptcp_trigger_cell_bringup(mpte);
	}
}

/*
 * Based on the MPTCP Service-type and the state of the subflows, we
 * will destroy subflows here.
 */
static void
mptcp_check_subflows_and_remove(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;
	int found_working_subflow = 0, removed_some = 0;
	int wifi_unusable = mptcp_is_wifi_unusable(mpte);

	if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
		return;

	/*
	 * Look for a subflow that is on a non-cellular interface
	 * and actually works (aka, no retransmission timeout).
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;
		struct socket *so;
		struct tcpcb *tp;

		if (ifp == NULL || IFNET_IS_CELLULAR(ifp))
			continue;

		so = mpts->mpts_socket;
		tp = sototcpcb(so);

		if (!(mpts->mpts_flags & MPTSF_CONNECTED) ||
		    tp->t_state != TCPS_ESTABLISHED)
			continue;

		/* Is this subflow in good condition? */
		if (tp->t_rxtshift == 0)
			found_working_subflow = 1;

		/* Or WiFi is fine */
		if (!wifi_unusable)
			found_working_subflow = 1;
	}

	/*
	 * Couldn't find a working subflow, let's not remove those on a cellular
	 * interface.
	 */
	if (!found_working_subflow)
		return;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		const struct ifnet *ifp = sotoinpcb(mpts->mpts_socket)->inp_last_outifp;

		/* Only remove cellular subflows */
		if (ifp == NULL || !IFNET_IS_CELLULAR(ifp))
			continue;

		soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
		removed_some = 1;
	}

	if (removed_some)
		mptcp_unset_cellicon();
}

static void
mptcp_remove_subflows(struct mptses *mpte)
{
	struct mptsub *mpts, *tmpts;

	TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
		if (mpts->mpts_flags & MPTSF_CLOSE_REQD) {
			mpts->mpts_flags &= ~MPTSF_CLOSE_REQD;

			soevent(mpts->mpts_socket,
			    SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOSRCADDR);
		}
	}
}

static void
mptcp_create_subflows(__unused void *arg)
{
	struct mppcb *mpp;

	/*
	 * Start with clearing, because we might be processing connections
	 * while a new event comes in.
	 */
	if (OSTestAndClear(0x01, &mptcp_create_subflows_scheduled))
		mptcplog((LOG_ERR, "%s: bit was already cleared!\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS))
			continue;

		mpp_lock(mpp);

		mpp->mpp_flags &= ~MPP_CREATE_SUBFLOWS;

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		VERIFY(mp_so->so_usecount > 0);

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mp_so->so_usecount--;	/* See mptcp_sched_create_subflows */
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}

/*
 * We need this because we are coming from an NECP-event. This event gets posted
 * while holding NECP-locks. The creation of the subflow however leads us back
 * into NECP (e.g., to add the necp_cb and also from tcp_connect).
 * So, we would deadlock there as we already hold the NECP-lock.
 *
 * So, let's schedule this separately. It also gives NECP the chance to make
 * progress, without having to wait for MPTCP to finish its subflow creation.
 */
void
mptcp_sched_create_subflows(struct mptses *mpte)
{
	struct mppcb *mpp = mpte->mpte_mppcb;
	struct mptcb *mp_tp = mpte->mpte_mptcb;
	struct socket *mp_so = mpp->mpp_socket;

	if (!mptcp_ok_to_create_subflows(mp_tp)) {
		mptcplog((LOG_DEBUG, "%s: not a good time for subflows, state %u flags %#x",
		    __func__, mp_tp->mpt_state, mp_tp->mpt_flags),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
		return;
	}

	if (!(mpp->mpp_flags & MPP_CREATE_SUBFLOWS)) {
		mp_so->so_usecount++;	/* To prevent it from being free'd in-between */
		mpp->mpp_flags |= MPP_CREATE_SUBFLOWS;
	}

	if (OSTestAndSet(0x01, &mptcp_create_subflows_scheduled))
		return;

	/* Do the call in 100ms to allow NECP to schedule it on all sockets */
	timeout(mptcp_create_subflows, NULL, hz/10);
}

/*
 * Allocate an MPTCP socket option structure.
 */
struct mptopt *
mptcp_sopt_alloc(int how)
{
	struct mptopt *mpo;

	mpo = (how == M_WAITOK) ? zalloc(mptopt_zone) :
	    zalloc_noblock(mptopt_zone);
	if (mpo != NULL) {
		bzero(mpo, mptopt_zone_size);
	}

	return (mpo);
}

/*
 * Free an MPTCP socket option structure.
 */
void
mptcp_sopt_free(struct mptopt *mpo)
{
	VERIFY(!(mpo->mpo_flags & MPOF_ATTACHED));

	zfree(mptopt_zone, mpo);
}

/*
 * Add a socket option to the MPTCP socket option list.
 */
void
mptcp_sopt_insert(struct mptses *mpte, struct mptopt *mpo)
{
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	mpo->mpo_flags |= MPOF_ATTACHED;
	TAILQ_INSERT_TAIL(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Remove a socket option from the MPTCP socket option list.
 */
void
mptcp_sopt_remove(struct mptses *mpte, struct mptopt *mpo)
{
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	VERIFY(mpo->mpo_flags & MPOF_ATTACHED);
	mpo->mpo_flags &= ~MPOF_ATTACHED;
	TAILQ_REMOVE(&mpte->mpte_sopts, mpo, mpo_entry);
}

/*
 * Search for an existing <sopt_level,sopt_name> socket option.
 */
struct mptopt *
mptcp_sopt_find(struct mptses *mpte, struct sockopt *sopt)
{
	struct mptopt *mpo;

	mpte_lock_assert_held(mpte);	/* same as MP socket lock */

	TAILQ_FOREACH(mpo, &mpte->mpte_sopts, mpo_entry) {
		if (mpo->mpo_level == sopt->sopt_level &&
		    mpo->mpo_name == sopt->sopt_name)
			break;
	}
	return (mpo);
}
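
/*
 * A hedged sketch of how the helpers above fit together; the real caller
 * (mptcp_setopt(), not part of this excerpt) records an eligible option
 * roughly like this, with intval standing in for the caller's value:
 *
 *	struct mptopt *mpo = mptcp_sopt_find(mpte, sopt);
 *	if (mpo == NULL) {
 *		mpo = mptcp_sopt_alloc(M_WAITOK);
 *		mpo->mpo_level = sopt->sopt_level;
 *		mpo->mpo_name = sopt->sopt_name;
 *		mptcp_sopt_insert(mpte, mpo);
 *	}
 *	mpo->mpo_intval = intval;
 */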

/*
 * Allocate an MPTCP subflow structure.
 */
static struct mptsub *
mptcp_subflow_alloc(void)
{
	struct mptsub *mpts = zalloc(mptsub_zone);

	if (mpts == NULL)
		return (NULL);

	bzero(mpts, mptsub_zone_size);
	return (mpts);
}

/*
 * Deallocate a subflow structure, called when all of the references held
 * on it have been released. This implies that the subflow has been deleted.
 */
static void
mptcp_subflow_free(struct mptsub *mpts)
{
	VERIFY(mpts->mpts_refcnt == 0);
	VERIFY(!(mpts->mpts_flags & MPTSF_ATTACHED));
	VERIFY(mpts->mpts_mpte == NULL);
	VERIFY(mpts->mpts_socket == NULL);

	if (mpts->mpts_src != NULL) {
		FREE(mpts->mpts_src, M_SONAME);
		mpts->mpts_src = NULL;
	}

	zfree(mptsub_zone, mpts);
}

static void
mptcp_subflow_addref(struct mptsub *mpts)
{
	if (++mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p wraparound refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
}

static void
mptcp_subflow_remref(struct mptsub *mpts)
{
	if (mpts->mpts_refcnt == 0) {
		panic("%s: mpts %p negative refcnt\n", __func__, mpts);
		/* NOTREACHED */
	}
	if (--mpts->mpts_refcnt > 0)
		return;

	/* callee will unlock and destroy lock */
	mptcp_subflow_free(mpts);
}

static void
mptcp_subflow_attach(struct mptses *mpte, struct mptsub *mpts, struct socket *so)
{
	struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
	struct tcpcb *tp = sototcpcb(so);

	/*
	 * From this moment on, the subflow is linked to the MPTCP connection.
	 * Locking, etc. now happens at the MPTCP layer.
	 */
	tp->t_mptcb = mpte->mpte_mptcb;
	so->so_flags |= SOF_MP_SUBFLOW;
	mp_so->so_usecount++;

	/*
	 * Insert the subflow into the list, and associate the MPTCP PCB
	 * as well as the subflow socket. From this point on, removing
	 * the subflow needs to be done via mptcp_subflow_del().
	 */
	TAILQ_INSERT_TAIL(&mpte->mpte_subflows, mpts, mpts_entry);
	mpte->mpte_numflows++;

	atomic_bitset_32(&mpts->mpts_flags, MPTSF_ATTACHED);
	mpts->mpts_mpte = mpte;
	mpts->mpts_socket = so;
	tp->t_mpsub = mpts;
	mptcp_subflow_addref(mpts);	/* for being in MPTCP subflow list */
	mptcp_subflow_addref(mpts);	/* for subflow socket */
}

static void
mptcp_subflow_necp_cb(void *handle, __unused int action,
    __unused uint32_t interface_index,
    uint32_t necp_flags, bool *viable)
{
	boolean_t low_power = !!(necp_flags & NECP_CLIENT_RESULT_FLAG_INTERFACE_LOW_POWER);
	struct inpcb *inp = (struct inpcb *)handle;
	struct socket *so = inp->inp_socket;
	struct mptsub *mpts;
	struct mptses *mpte;

	if (low_power)
		action = NECP_CLIENT_CBACTION_NONVIABLE;

	if (action != NECP_CLIENT_CBACTION_NONVIABLE)
		return;

	/*
	 * The socket is being garbage-collected. There is nothing to be done
	 * here.
	 */
	if (so->so_usecount == 0)
		return;

	socket_lock(so, 1);

	/* Check again after we acquired the lock. */
	if (so->so_usecount == 0)
		goto out;

	mpte = tptomptp(sototcpcb(so))->mpt_mpte;
	mpts = sototcpcb(so)->t_mpsub;

	os_log_debug(mptcp_log_handle, "%s Subflow on itf %u became non-viable, power %u",
	    __func__, mpts->mpts_ifscope, low_power);

	mpts->mpts_flags |= MPTSF_CLOSE_REQD;

	mptcp_sched_create_subflows(mpte);

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER && viable != NULL)
		*viable = 1;

out:
	socket_unlock(so, 1);
}

/*
 * Create an MPTCP subflow socket.
 */
static int
mptcp_subflow_socreate(struct mptses *mpte, struct mptsub *mpts, int dom,
    struct socket **so)
{
	lck_mtx_t *subflow_mtx;
	struct mptopt smpo, *mpo, *tmpo;
	struct proc *p;
	struct socket *mp_so;
	int error;

	*so = NULL;
	mpte_lock_assert_held(mpte);	/* same as MP socket lock */
	mp_so = mptetoso(mpte);

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

		return (ESRCH);
	}

	/*
	 * Create the subflow socket (multipath subflow, non-blocking).
	 *
	 * This will cause SOF_MP_SUBFLOW socket flag to be set on the subflow
	 * socket; it will be cleared when the socket is peeled off or closed.
	 * It also indicates to the underlying TCP to handle MPTCP options.
	 * A multipath subflow socket implies SS_NOFDREF state.
	 */

	/*
	 * Unlock, because tcp_usr_attach ends up in in_pcballoc, which takes
	 * the ipi-lock. We cannot hold the socket-lock at that point.
	 */
	mpte_unlock(mpte);
	error = socreate_internal(dom, so, SOCK_STREAM, IPPROTO_TCP, p,
	    SOCF_ASYNC, PROC_NULL);
	mpte_lock(mpte);
	if (error) {
		mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx unable to create subflow socket error %d\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), error),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

		proc_rele(p);

		mptcp_subflow_free(mpts);
		return (error);
	}

	/*
	 * We need to protect the setting of SOF_MP_SUBFLOW with a lock, because
	 * this marks the moment of lock-switch from the TCP-lock to the MPTCP-lock.
	 * Which is why we also need to get the lock with pr_getlock, as after
	 * setting the flag, socket_unlock will work on the MPTCP-level lock.
	 */
	subflow_mtx = ((*so)->so_proto->pr_getlock)(*so, 0);
	lck_mtx_lock(subflow_mtx);

	/*
	 * Must be the first thing we do, to make sure all pointers for this
	 * subflow are set.
	 */
	mptcp_subflow_attach(mpte, mpts, *so);

	/*
	 * A multipath subflow socket is used internally in the kernel,
	 * therefore it does not have a file descriptor associated with
	 * it by default.
	 */
	(*so)->so_state |= SS_NOFDREF;

	lck_mtx_unlock(subflow_mtx);

	/* prevent the socket buffers from being compressed */
	(*so)->so_rcv.sb_flags |= SB_NOCOMPRESS;
	(*so)->so_snd.sb_flags |= SB_NOCOMPRESS;

	/* Inherit preconnect and TFO data flags */
	if (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)
		(*so)->so_flags1 |= SOF1_PRECONNECT_DATA;
	if (mp_so->so_flags1 & SOF1_DATA_IDEMPOTENT)
		(*so)->so_flags1 |= SOF1_DATA_IDEMPOTENT;

	/* Inherit uuid and create the related flow. */
	if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
		struct mptcb *mp_tp = mpte->mpte_mptcb;

		sotoinpcb(*so)->necp_cb = mptcp_subflow_necp_cb;

		/*
		 * A note on the unlock: with MPTCP, we call
		 * necp_client_register_socket_flow multiple times. This is
		 * problematic, because now the lock-ordering guarantee (first
		 * necp-locks, then socket-locks) is no longer respected. So,
		 * we need to unlock here.
		 */
		mpte_unlock(mpte);
		error = necp_client_register_socket_flow(mp_so->last_pid,
		    mpsotomppcb(mp_so)->necp_client_uuid, sotoinpcb(*so));
		mpte_lock(mpte);

		if (error)
			goto out_err;

		/* Possible state-change during the unlock above */
		if (mp_tp->mpt_state >= MPTCPS_TIME_WAIT ||
		    (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
			goto out_err;

		uuid_copy(sotoinpcb(*so)->necp_client_uuid, mpsotomppcb(mp_so)->necp_client_uuid);
	} else {
		mptcplog((LOG_NOTICE, "%s: uuid is not set!\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
	}

	/* inherit the other socket options */
	bzero(&smpo, sizeof (smpo));
	smpo.mpo_flags |= MPOF_SUBFLOW_OK;
	smpo.mpo_level = SOL_SOCKET;
	smpo.mpo_intval = 1;

	/* disable SIGPIPE */
	smpo.mpo_name = SO_NOSIGPIPE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	/* find out if the subflow's source address goes away */
	smpo.mpo_name = SO_NOADDRERR;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	/* enable keepalive */
	smpo.mpo_name = SO_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	smpo.mpo_level = IPPROTO_TCP;
	smpo.mpo_intval = mptcp_subflow_keeptime;
	smpo.mpo_name = TCP_KEEPALIVE;
	if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
		goto out_err;

	if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED) {
		/*
		 * On secondary subflows we might need to set the cell-fallback
		 * flag (see conditions in mptcp_subflow_sosetopt).
		 */
		smpo.mpo_level = SOL_SOCKET;
		smpo.mpo_name = SO_MARK_CELLFALLBACK;
		smpo.mpo_intval = 1;
		if ((error = mptcp_subflow_sosetopt(mpte, mpts, &smpo)) != 0)
			goto out_err;
	}

	/* replay setsockopt(2) on the subflow sockets for eligible options */
	TAILQ_FOREACH_SAFE(mpo, &mpte->mpte_sopts, mpo_entry, tmpo) {
		int interim;

		if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK))
			continue;

		/*
		 * Skip those that are handled internally; these options
		 * should not have been recorded and marked with the
		 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
		 */
		if (mpo->mpo_level == SOL_SOCKET &&
		    (mpo->mpo_name == SO_NOSIGPIPE ||
		    mpo->mpo_name == SO_NOADDRERR ||
		    mpo->mpo_name == SO_KEEPALIVE))
			continue;

		interim = (mpo->mpo_flags & MPOF_INTERIM);
		if (mptcp_subflow_sosetopt(mpte, mpts, mpo) != 0 && interim) {
			mptcplog((LOG_ERR, "%s: subflow socreate mp_so 0x%llx"
			    " sopt %s val %d interim record removed\n", __func__,
			    (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
			    mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
			    mpo->mpo_intval),
			    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
			mptcp_sopt_remove(mpte, mpo);
			mptcp_sopt_free(mpo);
			continue;
		}
	}

	/*
	 * We need to receive everything that the subflow socket has,
	 * so use a customized socket receive function. We will undo
	 * this when the socket is peeled off or closed.
	 */
	switch (dom) {
	case PF_INET:
		(*so)->so_proto = &mptcp_subflow_protosw;
		break;
#if INET6
	case PF_INET6:
		(*so)->so_proto = (struct protosw *)&mptcp_subflow_protosw6;
		break;
#endif /* INET6 */
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	proc_rele(p);

	DTRACE_MPTCP3(subflow__create, struct mptses *, mpte,
	    int, dom, int, error);

	return (0);

out_err:
	mptcp_subflow_abort(mpts, error);

	proc_rele(p);

	mptcplog((LOG_ERR, "%s: subflow socreate failed with error %d\n",
	    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	return (error);
}

/*
 * Close an MPTCP subflow socket.
 *
 * Note that this may be called on an embryonic subflow, and the only
 * thing that is guaranteed valid is the protocol-user request.
 */
static void
mptcp_subflow_soclose(struct mptsub *mpts)
{
	struct socket *so = mpts->mpts_socket;

	if (mpts->mpts_flags & MPTSF_CLOSED)
		return;

	VERIFY(so != NULL);
	VERIFY(so->so_flags & SOF_MP_SUBFLOW);
	VERIFY((so->so_state & (SS_NBIO|SS_NOFDREF)) == (SS_NBIO|SS_NOFDREF));

	DTRACE_MPTCP5(subflow__close, struct mptsub *, mpts,
	    struct socket *, so,
	    struct sockbuf *, &so->so_rcv,
	    struct sockbuf *, &so->so_snd,
	    struct mptses *, mpts->mpts_mpte);

	mpts->mpts_flags |= MPTSF_CLOSED;

	if (so->so_retaincnt == 0) {
		soclose_locked(so);

		return;
	} else {
		VERIFY(so->so_usecount > 0);
		so->so_usecount--;
	}

	return;
}

/*
 * Connect an MPTCP subflow socket.
 *
 * Note that in the pending connect case, the subflow socket may have been
 * bound to an interface and/or a source IP address which may no longer be
 * around by the time this routine is called; in that case the connect attempt
 * will most likely fail.
 */
static int
mptcp_subflow_soconnectx(struct mptses *mpte, struct mptsub *mpts)
{
	char dbuf[MAX_IPv6_STR_LEN];
	struct socket *mp_so, *so;
	struct mptcb *mp_tp;
	struct sockaddr *dst;
	struct proc *p;
	int af, error, dport;

	mp_so = mptetoso(mpte);
	mp_tp = mpte->mpte_mptcb;
	so = mpts->mpts_socket;
	af = mpts->mpts_dst.sa_family;
	dst = &mpts->mpts_dst;

	VERIFY((mpts->mpts_flags & (MPTSF_CONNECTING|MPTSF_CONNECTED)) == MPTSF_CONNECTING);
	VERIFY(mpts->mpts_socket != NULL);
	VERIFY(af == AF_INET || af == AF_INET6);

	if (af == AF_INET) {
		inet_ntop(af, &SIN(dst)->sin_addr.s_addr, dbuf, sizeof (dbuf));
		dport = ntohs(SIN(dst)->sin_port);
	} else {
		inet_ntop(af, &SIN6(dst)->sin6_addr, dbuf, sizeof (dbuf));
		dport = ntohs(SIN6(dst)->sin6_port);
	}

	os_log_info(mptcp_log_handle,
	    "%s: ifindex %u dst %s:%d pended %u\n", __func__, mpts->mpts_ifscope,
	    dbuf, dport, !!(mpts->mpts_flags & MPTSF_CONNECT_PENDING));

	p = proc_find(mp_so->last_pid);
	if (p == PROC_NULL) {
		mptcplog((LOG_ERR, "%s: Couldn't find proc for pid %u\n", __func__, mp_so->last_pid),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

		return (ESRCH);
	}

	mpts->mpts_flags &= ~MPTSF_CONNECT_PENDING;

	mptcp_attach_to_subf(so, mpte->mpte_mptcb, mpte->mpte_addrid_last);

	/* connect the subflow socket */
	error = soconnectxlocked(so, mpts->mpts_src, &mpts->mpts_dst,
	    p, mpts->mpts_ifscope,
	    mpte->mpte_associd, NULL, 0, NULL, 0, NULL, NULL);

	mpts->mpts_iss = sototcpcb(so)->iss;

	/* See tcp_connect_complete */
	if (mp_tp->mpt_state < MPTCPS_ESTABLISHED &&
	    (mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
		mp_tp->mpt_sndwnd = sototcpcb(so)->snd_wnd;
	}

	/* Allocate a unique address id per subflow */
	mpte->mpte_addrid_last++;
	if (mpte->mpte_addrid_last == 0)
		mpte->mpte_addrid_last++;

	proc_rele(p);

	DTRACE_MPTCP3(subflow__connect, struct mptses *, mpte,
	    struct mptsub *, mpts, int, error);
	if (error)
		mptcplog((LOG_ERR, "%s: connectx failed with error %d ifscope %u\n",
		    __func__, error, mpts->mpts_ifscope),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);

	return (error);
}
1669
1670/*
1671 * MPTCP subflow socket receive routine, derived from soreceive().
1672 */
1673static int
1674mptcp_subflow_soreceive(struct socket *so, struct sockaddr **psa,
1675 struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
1676{
1677#pragma unused(uio)
1678 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1679 int flags, error = 0;
1680 struct proc *p = current_proc();
1681 struct mbuf *m, **mp = mp0;
1682 boolean_t proc_held = FALSE;
1683
1684 mpte_lock_assert_held(tptomptp(sototcpcb(so))->mpt_mpte);
1685 VERIFY(so->so_proto->pr_flags & PR_CONNREQUIRED);
1686
1687#ifdef MORE_LOCKING_DEBUG
1688 if (so->so_usecount == 1) {
1689 panic("%s: so=%x no other reference on socket\n", __func__, so);
1690 /* NOTREACHED */
1691 }
1692#endif
1693 /*
1694 * We return all that is there in the subflow's socket receive buffer
1695 * to the MPTCP layer, so we require that the caller passes in the
1696 * expected parameters.
1697 */
1698 if (mp == NULL || controlp != NULL)
1699 return (EINVAL);
1700
1701 *mp = NULL;
1702 if (psa != NULL)
1703 *psa = NULL;
1704 if (flagsp != NULL)
1705 flags = *flagsp &~ MSG_EOR;
1706 else
1707 flags = 0;
1708
1709 if (flags & (MSG_PEEK|MSG_OOB|MSG_NEEDSA|MSG_WAITALL|MSG_WAITSTREAM))
1710 return (EOPNOTSUPP);
1711
1712 flags |= (MSG_DONTWAIT|MSG_NBIO);
1713
1714 /*
1715 * If a recv attempt is made on a previously-accepted socket
1716 * that has been marked as inactive (disconnected), reject
1717 * the request.
1718 */
1719 if (so->so_flags & SOF_DEFUNCT) {
1720 struct sockbuf *sb = &so->so_rcv;
1721
1722 error = ENOTCONN;
1723 /*
1724 * This socket should have been disconnected and flushed
1725 * prior to being returned from sodefunct(); there should
1726 * be no data on its receive list, so panic otherwise.
1727 */
1728 if (so->so_state & SS_DEFUNCT)
1729 sb_empty_assert(sb, __func__);
1730 return (error);
1731 }
1732
1733 /*
1734 * See if the socket has been closed (SS_NOFDREF|SS_CANTRCVMORE)
1735 * and if so just return to the caller. This could happen when
1736 * soreceive() is called by a socket upcall function during the
1737 * time the socket is freed. The socket buffer would have been
1738 * locked across the upcall, therefore we cannot put this thread
1739 * to sleep (else we will deadlock) or return EWOULDBLOCK (else
1740 * we may livelock), because the lock on the socket buffer will
1741 * only be released when the upcall routine returns to its caller.
1742 * Because the socket has been officially closed, there can be
1743 * no further read on it.
1744 *
1745 * A multipath subflow socket would have its SS_NOFDREF set by
1746 * default, so check for SOF_MP_SUBFLOW socket flag; when the
1747 * socket is closed for real, SOF_MP_SUBFLOW would be cleared.
1748 */
1749 if ((so->so_state & (SS_NOFDREF | SS_CANTRCVMORE)) ==
1750 (SS_NOFDREF | SS_CANTRCVMORE) && !(so->so_flags & SOF_MP_SUBFLOW))
1751 return (0);
1752
1753 /*
1754 * For consistency with soreceive() semantics, we need to obey
1755 * SB_LOCK in case some other code path has locked the buffer.
1756 */
1757 error = sblock(&so->so_rcv, 0);
1758 if (error != 0)
1759 return (error);
1760
1761 m = so->so_rcv.sb_mb;
1762 if (m == NULL) {
1763 /*
1764 * Panic if we notice inconsistencies in the socket's
1765 * receive list; both sb_mb and sb_cc should correctly
1766 * reflect the contents of the list, otherwise we may
1767 * end up with false positives during select() or poll()
1768 * which could put the application in a bad state.
1769 */
1770 SB_MB_CHECK(&so->so_rcv);
1771
1772 if (so->so_error != 0) {
1773 error = so->so_error;
1774 so->so_error = 0;
1775 goto release;
1776 }
1777
1778 if (so->so_state & SS_CANTRCVMORE) {
1779 goto release;
1780 }
1781
1782 if (!(so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING))) {
1783 error = ENOTCONN;
1784 goto release;
1785 }
1786
1787 /*
1788		 * MSG_DONTWAIT is implicitly set and this routine will
1789 * never block, so return EWOULDBLOCK when there is nothing.
1790 */
1791 error = EWOULDBLOCK;
1792 goto release;
1793 }
1794
1795 mptcp_update_last_owner(so, mp_so);
1796
1797 if (mp_so->last_pid != proc_pid(p)) {
1798 p = proc_find(mp_so->last_pid);
1799 if (p == PROC_NULL) {
1800 p = current_proc();
1801 } else {
1802 proc_held = TRUE;
1803 }
1804 }
1805
1806 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgrcv);
1807 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1808 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 1");
1809
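	/*
	 * Each iteration of the loop below consumes one complete DSS
	 * mapping (or, after a fallback to regular TCP, one plain record)
	 * from the receive buffer and appends it to *mp.
	 */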
1810 while (m != NULL) {
1811 int dlen = 0, dfin = 0, error_out = 0;
1812 struct mbuf *start = m;
1813 uint64_t dsn;
1814 uint32_t sseq;
1815 uint16_t orig_dlen;
1816 uint16_t csum;
1817
1818 VERIFY(m->m_nextpkt == NULL);
1819
1820 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
1821 orig_dlen = dlen = m->m_pkthdr.mp_rlen;
1822 dsn = m->m_pkthdr.mp_dsn;
1823 sseq = m->m_pkthdr.mp_rseq;
1824 csum = m->m_pkthdr.mp_csum;
1825 } else {
1826			/* We fell back to regular TCP */
1827 mptcp_adj_rmap(so, m, 0, 0, 0, 0);
1828
1829 sbfree(&so->so_rcv, m);
1830
1831 if (mp != NULL) {
1832 *mp = m;
1833 mp = &m->m_next;
1834 so->so_rcv.sb_mb = m = m->m_next;
1835 *mp = NULL;
1836
1837 }
1838
1839 if (m != NULL) {
1840 so->so_rcv.sb_lastrecord = m;
1841 } else {
1842 SB_EMPTY_FIXUP(&so->so_rcv);
1843 }
1844
1845 continue;
1846 }
1847
1848 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN)
1849 dfin = 1;
1850
1851 /*
1852 * Check if the full mapping is now present
1853 */
1854 if ((int)so->so_rcv.sb_cc < dlen - dfin) {
1855 mptcplog((LOG_INFO, "%s not enough data (%u) need %u for dsn %u\n",
1856 __func__, so->so_rcv.sb_cc, dlen, (uint32_t)dsn),
1857 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_LOG);
1858
1859 if (*mp0 == NULL)
1860 error = EWOULDBLOCK;
1861 goto release;
1862 }
1863
1864 /* Now, get the full mapping */
1865 while (dlen > 0) {
1866 if (mptcp_adj_rmap(so, m, orig_dlen - dlen, dsn, sseq, orig_dlen)) {
1867 error_out = 1;
1868 error = EIO;
1869 dlen = 0;
1870 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
1871 break;
1872 }
1873
1874 dlen -= m->m_len;
1875 sbfree(&so->so_rcv, m);
1876
1877 if (mp != NULL) {
1878 *mp = m;
1879 mp = &m->m_next;
1880 so->so_rcv.sb_mb = m = m->m_next;
1881 *mp = NULL;
1882 }
1883
1884 if (dlen - dfin == 0)
1885 dlen = 0;
1886
1887 VERIFY(dlen <= 0 || m);
1888 }
1889
1890 VERIFY(dlen == 0);
1891
1892 if (m != NULL) {
1893 so->so_rcv.sb_lastrecord = m;
1894 } else {
1895 SB_EMPTY_FIXUP(&so->so_rcv);
1896 }
1897
1898 if (error_out)
1899 goto release;
1900
1902 if (mptcp_validate_csum(sototcpcb(so), start, dsn, sseq, orig_dlen, csum, dfin)) {
1903 error = EIO;
1904 *mp0 = NULL;
1905 goto release;
1906 }
1907
1908 SBLASTRECORDCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1909 SBLASTMBUFCHK(&so->so_rcv, "mptcp_subflow_soreceive 2");
1910 }
1911
1912 DTRACE_MPTCP3(subflow__receive, struct socket *, so,
1913 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd);
1914
1915 if (flagsp != NULL)
1916 *flagsp |= flags;
1917
1918release:
1919 sbunlock(&so->so_rcv, TRUE);
1920
1921 if (proc_held)
1922 proc_rele(p);
1923
1924 return (error);
1925
1926}
1927
1928/*
1929 * MPTCP subflow socket send routine, derived from sosend().
1930 */
1931static int
1932mptcp_subflow_sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
1933 struct mbuf *top, struct mbuf *control, int flags)
1934{
1935 struct socket *mp_so = mptetoso(tptomptp(sototcpcb(so))->mpt_mpte);
1936 struct proc *p = current_proc();
1937 boolean_t en_tracing = FALSE, proc_held = FALSE;
1938 int en_tracing_val;
1939 int sblocked = 1; /* Pretend as if it is already locked, so we won't relock it */
1940 int error;
1941
1942 VERIFY(control == NULL);
1943 VERIFY(addr == NULL);
1944 VERIFY(uio == NULL);
1945 VERIFY(flags == 0);
1946 VERIFY((so->so_flags & SOF_CONTENT_FILTER) == 0);
1947
1948 VERIFY(top->m_pkthdr.len > 0 && top->m_pkthdr.len <= UINT16_MAX);
1949 VERIFY(top->m_pkthdr.pkt_flags & PKTF_MPTCP);
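	/*
	 * (The VERIFYs above spell out the caller's contract: we are
	 * reached via sock_sendmbuf() from mptcp_subflow_output(), with a
	 * single PKTF_MPTCP-tagged mbuf chain of at most UINT16_MAX bytes
	 * and no address, control data, uio or flags.)
	 */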
1950
1951 /*
1952	 * Trace only if tracing is enabled, the socket is a network
1953	 * (vs. unix) socket, and the route is non-loopback.
1954 */
1955 if (ENTR_SHOULDTRACE &&
1956 (SOCK_CHECK_DOM(so, AF_INET) || SOCK_CHECK_DOM(so, AF_INET6))) {
1957 struct inpcb *inp = sotoinpcb(so);
1958 if (inp->inp_last_outifp != NULL &&
1959 !(inp->inp_last_outifp->if_flags & IFF_LOOPBACK)) {
1960 en_tracing = TRUE;
1961 en_tracing_val = top->m_pkthdr.len;
1962 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_START,
1963 VM_KERNEL_ADDRPERM(so),
1964 ((so->so_state & SS_NBIO) ? kEnTrFlagNonBlocking : 0),
1965 (int64_t)en_tracing_val);
1966 }
1967 }
1968
1969 mptcp_update_last_owner(so, mp_so);
1970
1971 if (mp_so->last_pid != proc_pid(p)) {
1972 p = proc_find(mp_so->last_pid);
1973 if (p == PROC_NULL) {
1974 p = current_proc();
1975 } else {
1976 proc_held = TRUE;
1977 }
1978 }
1979
1980#if NECP
1981 inp_update_necp_policy(sotoinpcb(so), NULL, NULL, 0);
1982#endif /* NECP */
1983
1984 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);
1985
1986 error = sosendcheck(so, NULL, top->m_pkthdr.len, 0, 1, 0, &sblocked, NULL);
1987 if (error)
1988 goto out;
1989
1990 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, top, NULL, NULL, p);
1991 top = NULL;
1992
1993out:
1994 if (top != NULL)
1995 m_freem(top);
1996
1997 if (proc_held)
1998 proc_rele(p);
1999
2000 soclearfastopen(so);
2001
2002 if (en_tracing) {
2003 KERNEL_ENERGYTRACE(kEnTrActKernSockWrite, DBG_FUNC_END,
2004 VM_KERNEL_ADDRPERM(so),
2005 ((error == EWOULDBLOCK) ? kEnTrFlagNoWork : 0),
2006 (int64_t)en_tracing_val);
2007 }
2008
2009 return (error);
2010
2011}
2012
2013/*
2014 * Establish an initial MPTCP connection (if first subflow and not yet
2015 * connected), or add a subflow to an existing MPTCP connection.
2016 */
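/*
 * For illustration, user space typically drives this path with
 * connectx(2) on an AF_MULTIPATH socket.  A minimal sketch (error
 * handling omitted; the destination values are hypothetical):
 *
 *	struct sockaddr_in dst = { .sin_len = sizeof (dst),
 *	    .sin_family = AF_INET, .sin_port = htons(80) };
 *	sa_endpoints_t sae = { .sae_dstaddr = (struct sockaddr *)&dst,
 *	    .sae_dstaddrlen = sizeof (dst) };
 *	sae_connid_t cid;
 *	int fd = socket(AF_MULTIPATH, SOCK_STREAM, IPPROTO_TCP);
 *	connectx(fd, &sae, SAE_ASSOCID_ANY, 0, NULL, 0, NULL, &cid);
 *
 * pcid, when non-NULL, receives the same connection ID that connectx(2)
 * reports back to the caller.
 */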
2017int
2018mptcp_subflow_add(struct mptses *mpte, struct sockaddr *src,
2019 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
2020{
2021 struct socket *mp_so, *so = NULL;
2022 struct mptcb *mp_tp;
2023 struct mptsub *mpts = NULL;
2024 int af, error = 0;
2025
2026 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2027 mp_so = mptetoso(mpte);
2028 mp_tp = mpte->mpte_mptcb;
2029
2030 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
2031 /* If the remote end sends Data FIN, refuse subflow adds */
2032 mptcplog((LOG_ERR, "%s state %u\n", __func__, mp_tp->mpt_state),
2033 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
2034 error = ENOTCONN;
2035 goto out_err;
2036 }
2037
2038 mpts = mptcp_subflow_alloc();
2039 if (mpts == NULL) {
2040 mptcplog((LOG_ERR, "%s malloc subflow failed\n", __func__),
2041 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
2042 error = ENOMEM;
2043 goto out_err;
2044 }
2045
2046 if (src != NULL) {
2047 int len = src->sa_len;
2048
2049 MALLOC(mpts->mpts_src, struct sockaddr *, len, M_SONAME,
2050 M_WAITOK | M_ZERO);
2051 if (mpts->mpts_src == NULL) {
2052 mptcplog((LOG_ERR, "%s malloc mpts_src failed", __func__),
2053 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
2054 error = ENOMEM;
2055 goto out_err;
2056 }
2057 bcopy(src, mpts->mpts_src, len);
2058 }
2059
2060 memcpy(&mpts->mpts_dst, dst, dst->sa_len);
2061
2062 af = mpts->mpts_dst.sa_family;
2063
2064 mpts->mpts_ifscope = ifscope;
2065
2066 /* create the subflow socket */
2067 if ((error = mptcp_subflow_socreate(mpte, mpts, af, &so)) != 0)
2068 /*
2069		 * Return (error) without cleaning up, because up to here
2070		 * all we did was create mpts.
2071		 *
2072		 * The contract is that mptcp_subflow_socreate() takes
2073		 * ownership of mpts, including its release on failure.
2074 */
2075 return (error);
2076
2077 /*
2078 * We may be called from within the kernel. Still need to account this
2079 * one to the real app.
2080 */
2081 mptcp_update_last_owner(mpts->mpts_socket, mp_so);
2082
2083 /*
2084 * Increment the counter, while avoiding 0 (SAE_CONNID_ANY) and
2085 * -1 (SAE_CONNID_ALL).
2086 */
2087	do {
2088		mpte->mpte_connid_last++;
2089	} while (mpte->mpte_connid_last == SAE_CONNID_ALL ||
2090	    mpte->mpte_connid_last == SAE_CONNID_ANY);
2091
2092 mpts->mpts_connid = mpte->mpte_connid_last;
2093
2094 mpts->mpts_rel_seq = 1;
2095
2096 /* Allocate a unique address id per subflow */
2097 mpte->mpte_addrid_last++;
2098 if (mpte->mpte_addrid_last == 0)
2099 mpte->mpte_addrid_last++;
2100
2101 /* register for subflow socket read/write events */
2102 sock_setupcalls_locked(so, mptcp_subflow_rupcall, mpts, mptcp_subflow_wupcall, mpts, 1);
2103
2104 /* Register for subflow socket control events */
2105 sock_catchevents_locked(so, mptcp_subflow_eupcall1, mpts,
2106 SO_FILT_HINT_CONNRESET | SO_FILT_HINT_CANTRCVMORE |
2107 SO_FILT_HINT_TIMEOUT | SO_FILT_HINT_NOSRCADDR |
2108 SO_FILT_HINT_IFDENIED | SO_FILT_HINT_CONNECTED |
2109 SO_FILT_HINT_DISCONNECTED | SO_FILT_HINT_MPFAILOVER |
2110 SO_FILT_HINT_MPSTATUS | SO_FILT_HINT_MUSTRST |
2111 SO_FILT_HINT_MPCANTRCVMORE | SO_FILT_HINT_ADAPTIVE_RTIMO |
2112 SO_FILT_HINT_ADAPTIVE_WTIMO);
2113
2114 /* sanity check */
2115 VERIFY(!(mpts->mpts_flags &
2116 (MPTSF_CONNECTING|MPTSF_CONNECTED|MPTSF_CONNECT_PENDING)));
2117
2118 /*
2119 * Indicate to the TCP subflow whether or not it should establish
2120 * the initial MPTCP connection, or join an existing one. Fill
2121 * in the connection request structure with additional info needed
2122 * by the underlying TCP (to be used in the TCP options, etc.)
2123 */
2124 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED && mpte->mpte_numflows == 1) {
2125 mpts->mpts_flags |= MPTSF_INITIAL_SUB;
2126
2127 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
2128 mptcp_init_local_parms(mpte);
2129 }
2130 soisconnecting(mp_so);
2131
2132 /* If fastopen is requested, set state in mpts */
2133 if (so->so_flags1 & SOF1_PRECONNECT_DATA)
2134 mpts->mpts_flags |= MPTSF_TFO_REQD;
2135 } else {
2136 if (!(mp_tp->mpt_flags & MPTCPF_JOIN_READY))
2137 mpts->mpts_flags |= MPTSF_CONNECT_PENDING;
2138 }
2139
2140 mpts->mpts_flags |= MPTSF_CONNECTING;
2141
2142 if (af == AF_INET || af == AF_INET6) {
2143 char dbuf[MAX_IPv6_STR_LEN];
2144
2145 mptcplog((LOG_DEBUG, "MPTCP Socket: %s "
2146 "mp_so 0x%llx dst %s[%d] cid %d "
2147 "[pending %s]\n", __func__,
2148 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2149 inet_ntop(af, ((af == AF_INET) ?
2150 (void *)&SIN(&mpts->mpts_dst)->sin_addr.s_addr :
2151 (void *)&SIN6(&mpts->mpts_dst)->sin6_addr),
2152 dbuf, sizeof (dbuf)), ((af == AF_INET) ?
2153 ntohs(SIN(&mpts->mpts_dst)->sin_port) :
2154 ntohs(SIN6(&mpts->mpts_dst)->sin6_port)),
2155 mpts->mpts_connid,
2156 ((mpts->mpts_flags & MPTSF_CONNECT_PENDING) ?
2157 "YES" : "NO")),
2158 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2159 }
2160
2161 /* connect right away if first attempt, or if join can be done now */
2162 if (!(mpts->mpts_flags & MPTSF_CONNECT_PENDING))
2163 error = mptcp_subflow_soconnectx(mpte, mpts);
2164
2165 if (error)
2166 goto out_err_close;
2167
2168 if (pcid)
2169 *pcid = mpts->mpts_connid;
2170
2171 return (0);
2172
2173out_err_close:
2174 mptcp_subflow_abort(mpts, error);
2175
2176 return (error);
2177
2178out_err:
2179 if (mpts)
2180 mptcp_subflow_free(mpts);
2181
2182 return (error);
2183}
2184
2185void
2186mptcpstats_update(struct mptcp_itf_stats *stats, struct mptsub *mpts)
2187{
2188 int index = mptcp_get_statsindex(stats, mpts);
2189
2190 if (index != -1) {
2191 struct inpcb *inp = sotoinpcb(mpts->mpts_socket);
2192
2193 stats[index].mpis_txbytes += inp->inp_stat->txbytes;
2194 stats[index].mpis_rxbytes += inp->inp_stat->rxbytes;
2195 }
2196}
2197
2198/*
2199 * Delete/remove a subflow from an MPTCP session. The underlying subflow socket
2200 * will no longer be accessible after a subflow is deleted, thus this
2201 * should occur only after the subflow socket has been disconnected.
2202 */
2203void
2204mptcp_subflow_del(struct mptses *mpte, struct mptsub *mpts)
2205{
2206 struct socket *mp_so = mptetoso(mpte);
2207 struct socket *so = mpts->mpts_socket;
2208 struct tcpcb *tp = sototcpcb(so);
2209
2210 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2211 VERIFY(mpts->mpts_mpte == mpte);
2212 VERIFY(mpts->mpts_flags & MPTSF_ATTACHED);
2213 VERIFY(mpte->mpte_numflows != 0);
2214 VERIFY(mp_so->so_usecount > 0);
2215
2216 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx [u=%d,r=%d] cid %d %x error %d\n",
2217 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
2218 mp_so->so_usecount, mp_so->so_retaincnt, mpts->mpts_connid,
2219 mpts->mpts_flags, mp_so->so_error),
2220 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2221
2222 mptcpstats_update(mpte->mpte_itfstats, mpts);
2223 mpte->mpte_init_rxbytes = sotoinpcb(so)->inp_stat->rxbytes;
2224 mpte->mpte_init_txbytes = sotoinpcb(so)->inp_stat->txbytes;
2225
2226 atomic_bitclear_32(&mpts->mpts_flags, MPTSF_ATTACHED);
2227 TAILQ_REMOVE(&mpte->mpte_subflows, mpts, mpts_entry);
2228 mpte->mpte_numflows--;
2229 if (mpte->mpte_active_sub == mpts)
2230 mpte->mpte_active_sub = NULL;
2231
2232 /*
2233 * Drop references held by this subflow socket; there
2234 * will be no further upcalls made from this point.
2235 */
2236 sock_setupcalls_locked(so, NULL, NULL, NULL, NULL, 0);
2237 sock_catchevents_locked(so, NULL, NULL, 0);
2238
2239 mptcp_detach_mptcb_from_subf(mpte->mpte_mptcb, so);
2240
2241 mp_so->so_usecount--; /* for subflow socket */
2242 mpts->mpts_mpte = NULL;
2243 mpts->mpts_socket = NULL;
2244
2245 mptcp_subflow_remref(mpts); /* for MPTCP subflow list */
2246 mptcp_subflow_remref(mpts); /* for subflow socket */
2247
2248 so->so_flags &= ~SOF_MP_SUBFLOW;
2249 tp->t_mptcb = NULL;
2250 tp->t_mpsub = NULL;
2251}
2252
2253void
2254mptcp_subflow_shutdown(struct mptses *mpte, struct mptsub *mpts)
2255{
2256 struct socket *so = mpts->mpts_socket;
2257 struct mptcb *mp_tp = mpte->mpte_mptcb;
2258 int send_dfin = 0;
2259
2260 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
2261 send_dfin = 1;
2262
2263 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2264 (so->so_state & SS_ISCONNECTED)) {
2265 mptcplog((LOG_DEBUG, "MPTCP subflow shutdown %s: cid %d fin %d\n",
2266 __func__, mpts->mpts_connid, send_dfin),
2267 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2268
2269 if (send_dfin)
2270 mptcp_send_dfin(so);
2271 soshutdownlock(so, SHUT_WR);
2272 }
2273
2274}
2275
2276static void
2277mptcp_subflow_abort(struct mptsub *mpts, int error)
2278{
2279 struct socket *so = mpts->mpts_socket;
2280 struct tcpcb *tp = sototcpcb(so);
2281
2282 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
2283 return;
2284
2285 mptcplog((LOG_DEBUG, "%s aborting connection state %u\n", __func__, tp->t_state),
2286 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2287
2288 if (tp->t_state != TCPS_CLOSED)
2289 tcp_drop(tp, error);
2290
2291 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2292}
2293
2294/*
2295 * Disconnect a subflow socket.
2296 */
2297void
2298mptcp_subflow_disconnect(struct mptses *mpte, struct mptsub *mpts)
2299{
2300 struct socket *so;
2301 struct mptcb *mp_tp;
2302 int send_dfin = 0;
2303
2304 mpte_lock_assert_held(mpte); /* same as MP socket lock */
2305
2306 VERIFY(mpts->mpts_mpte == mpte);
2307 VERIFY(mpts->mpts_socket != NULL);
2308
2309 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|MPTSF_DISCONNECTED))
2310 return;
2311
2312 mpts->mpts_flags |= MPTSF_DISCONNECTING;
2313
2314 so = mpts->mpts_socket;
2315 mp_tp = mpte->mpte_mptcb;
2316 if (mp_tp->mpt_state > MPTCPS_CLOSE_WAIT)
2317 send_dfin = 1;
2318
2319 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
2320 (so->so_state & SS_ISCONNECTED)) {
2321 mptcplog((LOG_DEBUG, "%s: cid %d fin %d\n",
2322 __func__, mpts->mpts_connid, send_dfin),
2323 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2324
2325 if (send_dfin)
2326 mptcp_send_dfin(so);
2327 (void) soshutdownlock(so, SHUT_RD);
2328 (void) soshutdownlock(so, SHUT_WR);
2329 (void) sodisconnectlocked(so);
2330 }
2331 /*
2332 * Generate a disconnect event for this subflow socket, in case
2333 * the lower layer doesn't do it; this is needed because the
2334 * subflow socket deletion relies on it.
2335 */
2336 mptcp_subflow_eupcall1(so, mpts, SO_FILT_HINT_DISCONNECTED);
2337}
2338
2339/*
2340 * Called when the associated subflow socket posted a read event.
2341 */
2342static void
2343mptcp_subflow_rupcall(struct socket *so, void *arg, int waitf)
2344{
2345#pragma unused(so, waitf)
2346 struct mptsub *mpts = arg, *tmpts;
2347 struct mptses *mpte = mpts->mpts_mpte;
2348
2349 VERIFY(mpte != NULL);
2350
2351 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2352 if (!(mpte->mpte_mppcb->mpp_flags & MPP_RUPCALL))
2353 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2354 return;
2355 }
2356
2357 mpte->mpte_mppcb->mpp_flags |= MPP_RUPCALL;
2358 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
2359 if (mpts->mpts_socket->so_usecount == 0) {
2360 /* Will be removed soon by tcp_garbage_collect */
2361 continue;
2362 }
2363
2364 mptcp_subflow_addref(mpts);
2365 mpts->mpts_socket->so_usecount++;
2366
2367 mptcp_subflow_input(mpte, mpts);
2368
2369 mptcp_subflow_remref(mpts); /* ours */
2370
2371 VERIFY(mpts->mpts_socket->so_usecount != 0);
2372 mpts->mpts_socket->so_usecount--;
2373 }
2374
2375 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_RUPCALL);
2376}
2377
2378/*
2379 * Subflow socket input.
2380 */
2381static void
2382mptcp_subflow_input(struct mptses *mpte, struct mptsub *mpts)
2383{
2384 struct socket *mp_so = mptetoso(mpte);
2385 struct mbuf *m = NULL;
2386 struct socket *so;
2387 int error, wakeup = 0;
2388
2389 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_INPUT));
2390 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_INPUT;
2391
2392 DTRACE_MPTCP2(subflow__input, struct mptses *, mpte,
2393 struct mptsub *, mpts);
2394
2395 if (!(mpts->mpts_flags & MPTSF_CONNECTED))
2396 goto out;
2397
2398 so = mpts->mpts_socket;
2399
2400 error = sock_receive_internal(so, NULL, &m, 0, NULL);
2401 if (error != 0 && error != EWOULDBLOCK) {
2402 mptcplog((LOG_ERR, "%s: cid %d error %d\n",
2403 __func__, mpts->mpts_connid, error),
2404 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
2405 if (error == ENODATA) {
2406 /*
2407			 * Don't swallow ENODATA; surfacing it helps to
2408			 * discover nasty middleboxes that mangle the stream.
2409 */
2410 mp_so->so_error = ENODATA;
2411
2412 wakeup = 1;
2413 goto out;
2414 }
2415 } else if (error == 0) {
2416 mptcplog((LOG_DEBUG, "%s: cid %d \n", __func__, mpts->mpts_connid),
2417 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2418 }
2419
2420	/* In fallback, drain data from all subflows, but deliver only from the active one */
2421 if (m && (mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2422 !(mpts->mpts_flags & MPTSF_ACTIVE)) {
2423 mptcplog((LOG_DEBUG, "%s: degraded and got data on non-active flow\n",
2424 __func__), MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_VERBOSE);
2425 m_freem(m);
2426 goto out;
2427 }
2428
2429 if (m != NULL) {
2430 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2431 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
2432
2433 mpte->mpte_used_cell = 1;
2434 } else {
2435 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
2436
2437 mpte->mpte_used_wifi = 1;
2438 }
2439
2440 mptcp_input(mpte, m);
2441 }
2442
2443 /* notify protocol that we drained all the data */
2444 if (error == 0 && m != NULL &&
2445 (so->so_proto->pr_flags & PR_WANTRCVD) && so->so_pcb != NULL)
2446 (*so->so_proto->pr_usrreqs->pru_rcvd)(so, 0);
2447
2448out:
2449 if (wakeup)
2450 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_RWAKEUP;
2451
2452 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_INPUT);
2453}
2454
2455/*
2456 * Subflow socket write upcall.
2457 *
2458 * Called when the associated subflow socket posted a write event.
2459 */
2460static void
2461mptcp_subflow_wupcall(struct socket *so, void *arg, int waitf)
2462{
2463#pragma unused(so, waitf)
2464 struct mptsub *mpts = arg;
2465 struct mptses *mpte = mpts->mpts_mpte;
2466
2467 VERIFY(mpte != NULL);
2468
2469 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
2470 if (!(mpte->mpte_mppcb->mpp_flags & MPP_WUPCALL))
2471 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2472 return;
2473 }
2474
2475 mptcp_output(mpte);
2476}
2477
2478static boolean_t
2479mptcp_search_seq_in_sub(struct mbuf *m, struct socket *so)
2480{
2481 struct mbuf *so_m = so->so_snd.sb_mb;
2482 uint64_t dsn = m->m_pkthdr.mp_dsn;
2483
2484 while (so_m) {
2485 VERIFY(so_m->m_flags & M_PKTHDR);
2486 VERIFY(so_m->m_pkthdr.pkt_flags & PKTF_MPTCP);
2487
2488		/* Part of the segment is already in flight on this subflow; don't reinject it here */
2489 if (so_m->m_pkthdr.mp_dsn <= dsn &&
2490 so_m->m_pkthdr.mp_dsn + so_m->m_pkthdr.mp_rlen > dsn)
2491 return TRUE;
2492
2493 so_m = so_m->m_next;
2494 }
2495
2496 return FALSE;
2497}
2498
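/*
 * Example for the check above (hypothetical values): if the subflow's
 * send queue holds a mapping with mp_dsn 1000 and mp_rlen 500, then a
 * candidate segment with mp_dsn 1200 is already partially in flight on
 * this subflow (1000 <= 1200 < 1500), so TRUE is returned and the caller
 * skips the reinject.
 */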
2499/*
2500 * Subflow socket output.
2501 *
2502 * Called for sending data from MPTCP to the underlying subflow socket.
2503 */
2504int
2505mptcp_subflow_output(struct mptses *mpte, struct mptsub *mpts, int flags)
2506{
2507 struct mptcb *mp_tp = mpte->mpte_mptcb;
2508 struct mbuf *sb_mb, *m, *mpt_mbuf = NULL, *head, *tail;
2509 struct socket *mp_so, *so;
2510 struct tcpcb *tp;
2511 uint64_t mpt_dsn = 0, off = 0;
2512 int sb_cc = 0, error = 0, wakeup = 0;
2513 uint32_t dss_csum;
2514 uint16_t tot_sent = 0;
2515 boolean_t reinjected = FALSE;
2516
2517 mpte_lock_assert_held(mpte);
2518
2519 mp_so = mptetoso(mpte);
2520 so = mpts->mpts_socket;
2521 tp = sototcpcb(so);
2522
2523 VERIFY(!(mpte->mpte_mppcb->mpp_flags & MPP_INSIDE_OUTPUT));
2524 mpte->mpte_mppcb->mpp_flags |= MPP_INSIDE_OUTPUT;
2525
2526 VERIFY(!INP_WAIT_FOR_IF_FEEDBACK(sotoinpcb(so)));
2527 VERIFY((mpts->mpts_flags & MPTSF_MP_CAPABLE) ||
2528 (mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2529 (mpts->mpts_flags & MPTSF_TFO_REQD));
2530 VERIFY(mptcp_subflow_cwnd_space(mpts->mpts_socket) > 0);
2531
2532 mptcplog((LOG_DEBUG, "%s mpts_flags %#x, mpte_flags %#x cwnd_space %u\n",
2533 __func__, mpts->mpts_flags, mpte->mpte_flags,
2534 mptcp_subflow_cwnd_space(so)),
2535 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2536 DTRACE_MPTCP2(subflow__output, struct mptses *, mpte,
2537 struct mptsub *, mpts);
2538
2539	/* The Remove Address option is not sent reliably, as per the I-D */
2540 if (mpte->mpte_flags & MPTE_SND_REM_ADDR) {
2541 tp->t_rem_aid = mpte->mpte_lost_aid;
2542 tp->t_mpflags |= TMPF_SND_REM_ADDR;
2543 mpte->mpte_flags &= ~MPTE_SND_REM_ADDR;
2544 }
2545
2546 /*
2547 * The mbuf chains containing the metadata (as well as pointing to
2548 * the user data sitting at the MPTCP output queue) would then be
2549 * sent down to the subflow socket.
2550 *
2551 * Some notes on data sequencing:
2552 *
2553 * a. Each mbuf must be a M_PKTHDR.
2554 * b. MPTCP metadata is stored in the mptcp_pktinfo structure
2555 * in the mbuf pkthdr structure.
2556 * c. Each mbuf containing the MPTCP metadata must have its
2557 * pkt_flags marked with the PKTF_MPTCP flag.
2558 */
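	/*
	 * For illustration (values hypothetical), a 500-byte mapping that
	 * starts at data-level sequence number 1000 and subflow-relative
	 * sequence number 1 travels down as:
	 *
	 *	m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
	 *	m->m_pkthdr.mp_dsn  = 1000;
	 *	m->m_pkthdr.mp_rseq = 1;
	 *	m->m_pkthdr.mp_rlen = 500;
	 *
	 * which is exactly what the copy loop further below sets up.
	 */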
2559
2560 if (mpte->mpte_reinjectq)
2561 sb_mb = mpte->mpte_reinjectq;
2562 else
2563 sb_mb = mp_so->so_snd.sb_mb;
2564
2565 if (sb_mb == NULL) {
2566 mptcplog((LOG_ERR, "%s: No data in MPTCP-sendbuffer! smax %u snxt %u suna %u state %u flags %#x\n",
2567 __func__, (uint32_t)mp_tp->mpt_sndmax, (uint32_t)mp_tp->mpt_sndnxt,
2568 (uint32_t)mp_tp->mpt_snduna, mp_tp->mpt_state, mp_so->so_flags1),
2569 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2570
2571 /* Fix it to prevent looping */
2572 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
2573 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
2574 goto out;
2575 }
2576
2577 VERIFY(sb_mb->m_pkthdr.pkt_flags & PKTF_MPTCP);
2578
2579 if (sb_mb->m_pkthdr.mp_rlen == 0 &&
2580 !(so->so_state & SS_ISCONNECTED) &&
2581 (so->so_flags1 & SOF1_PRECONNECT_DATA)) {
2582 tp->t_mpflags |= TMPF_TFO_REQUEST;
2583 goto zero_len_write;
2584 }
2585
2586 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2587
2588 /* First, drop acknowledged data */
2589 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
2590 mptcplog((LOG_ERR, "%s: dropping data, should have been done earlier "
2591 "dsn %u suna %u reinject? %u\n",
2592 __func__, (uint32_t)mpt_dsn,
2593 (uint32_t)mp_tp->mpt_snduna, !!mpte->mpte_reinjectq),
2594 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2595 if (mpte->mpte_reinjectq) {
2596 mptcp_clean_reinjectq(mpte);
2597 } else {
2598 uint64_t len = 0;
2599 len = mp_tp->mpt_snduna - mpt_dsn;
2600 sbdrop(&mp_so->so_snd, (int)len);
2601 wakeup = 1;
2602 }
2603 }
2604
2605 /* Check again because of above sbdrop */
2606 if (mp_so->so_snd.sb_mb == NULL && mpte->mpte_reinjectq == NULL) {
2607 mptcplog((LOG_ERR, "%s send-buffer is empty\n", __func__),
2608 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2609 goto out;
2610 }
2611
2612 /*
2613	 * In degraded mode we don't receive data acks, so force-free
2614	 * mbufs below mpt_snduna.
2615 */
2616 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2617 (mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC) &&
2618 mp_so->so_snd.sb_mb) {
2619 mpt_dsn = mp_so->so_snd.sb_mb->m_pkthdr.mp_dsn;
2620 if (MPTCP_SEQ_LT(mpt_dsn, mp_tp->mpt_snduna)) {
2621 uint64_t len = 0;
2622 len = mp_tp->mpt_snduna - mpt_dsn;
2623 sbdrop(&mp_so->so_snd, (int)len);
2624 wakeup = 1;
2625
2626 mptcplog((LOG_ERR, "%s: dropping data in degraded mode, should have been done earlier dsn %u sndnxt %u suna %u\n",
2627 __func__, (uint32_t)mpt_dsn, (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_snduna),
2628 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2629 }
2630 }
2631
2632 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) &&
2633 !(mp_tp->mpt_flags & MPTCPF_POST_FALLBACK_SYNC)) {
2634 mp_tp->mpt_flags |= MPTCPF_POST_FALLBACK_SYNC;
2635 so->so_flags1 |= SOF1_POST_FALLBACK_SYNC;
2636 }
2637
2638 /*
2639 * Adjust the top level notion of next byte used for retransmissions
2640 * and sending FINs.
2641 */
2642 if (MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_snduna))
2643 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
2644
2645 /* Now determine the offset from which to start transmitting data */
2646 if (mpte->mpte_reinjectq)
2647 sb_mb = mpte->mpte_reinjectq;
2648 else
2649dont_reinject:
2650 sb_mb = mp_so->so_snd.sb_mb;
2651 if (sb_mb == NULL) {
2652 mptcplog((LOG_ERR, "%s send-buffer is still empty\n", __func__),
2653 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2654 goto out;
2655 }
2656
2657 if (sb_mb == mpte->mpte_reinjectq) {
2658 sb_cc = sb_mb->m_pkthdr.mp_rlen;
2659 off = 0;
2660
2661 if (mptcp_search_seq_in_sub(sb_mb, so)) {
2662 if (mptcp_can_send_more(mp_tp, TRUE)) {
2663 goto dont_reinject;
2664 }
2665
2666 error = ECANCELED;
2667 goto out;
2668 }
2669
2670 reinjected = TRUE;
2671 } else if (flags & MPTCP_SUBOUT_PROBING) {
2672 sb_cc = sb_mb->m_pkthdr.mp_rlen;
2673 off = 0;
2674 } else {
2675 sb_cc = min(mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd);
2676
2677 /*
2678		 * With TFO there might be no data at all, yet we may still
2679		 * end up in this code-path here.
2680 */
2681 if ((mp_so->so_flags1 & SOF1_PRECONNECT_DATA) ||
2682 MPTCP_SEQ_LT(mp_tp->mpt_sndnxt, mp_tp->mpt_sndmax)) {
2683 off = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
2684 sb_cc -= off;
2685 } else {
2686 mptcplog((LOG_ERR, "%s this should not happen: sndnxt %u sndmax %u\n",
2687 __func__, (uint32_t)mp_tp->mpt_sndnxt,
2688 (uint32_t)mp_tp->mpt_sndmax),
2689 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2690
2691 goto out;
2692 }
2693 }
2694
2695 sb_cc = min(sb_cc, mptcp_subflow_cwnd_space(so));
2696 if (sb_cc <= 0) {
2697 mptcplog((LOG_ERR, "%s sb_cc is %d, mp_so->sb_cc %u, sndwnd %u,sndnxt %u sndmax %u cwnd %u\n",
2698 __func__, sb_cc, mp_so->so_snd.sb_cc, mp_tp->mpt_sndwnd,
2699 (uint32_t)mp_tp->mpt_sndnxt, (uint32_t)mp_tp->mpt_sndmax,
2700 mptcp_subflow_cwnd_space(so)),
2701 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2702 }
2703
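	/*
	 * A DSS mapping's data-level length field is only 16 bits wide
	 * (cf. RFC 6824), so a single mapping can never describe more
	 * than UINT16_MAX bytes.
	 */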
2704 sb_cc = min(sb_cc, UINT16_MAX);
2705
2706 /*
2707 * Create a DSN mapping for the data we are about to send. It all
2708 * has the same mapping.
2709 */
2710 if (reinjected)
2711 mpt_dsn = sb_mb->m_pkthdr.mp_dsn;
2712 else
2713 mpt_dsn = mp_tp->mpt_snduna + off;
2714
2715 mpt_mbuf = sb_mb;
2716 while (mpt_mbuf && reinjected == FALSE &&
2717 (mpt_mbuf->m_pkthdr.mp_rlen == 0 ||
2718 mpt_mbuf->m_pkthdr.mp_rlen <= (uint32_t)off)) {
2719 off -= mpt_mbuf->m_pkthdr.mp_rlen;
2720 mpt_mbuf = mpt_mbuf->m_next;
2721 }
2722 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
2723 mptcplog((LOG_DEBUG, "%s: %u snduna = %u sndnxt = %u probe %d\n",
2724 __func__, mpts->mpts_connid, (uint32_t)mp_tp->mpt_snduna, (uint32_t)mp_tp->mpt_sndnxt,
2725 mpts->mpts_probecnt),
2726 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2727
2728 VERIFY((mpt_mbuf == NULL) || (mpt_mbuf->m_pkthdr.pkt_flags & PKTF_MPTCP));
2729
2730 head = tail = NULL;
2731
2732 while (tot_sent < sb_cc) {
2733 ssize_t mlen;
2734
2735 mlen = mpt_mbuf->m_len;
2736 mlen -= off;
2737 mlen = min(mlen, sb_cc - tot_sent);
2738
2739 if (mlen < 0) {
2740 mptcplog((LOG_ERR, "%s mlen %d mp_rlen %u off %u sb_cc %u tot_sent %u\n",
2741 __func__, (int)mlen, mpt_mbuf->m_pkthdr.mp_rlen,
2742 (uint32_t)off, sb_cc, tot_sent),
2743 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2744 goto out;
2745 }
2746
2747 if (mlen == 0)
2748 goto next;
2749
2750 m = m_copym_mode(mpt_mbuf, (int)off, mlen, M_DONTWAIT,
2751 M_COPYM_MUST_COPY_HDR);
2752 if (m == NULL) {
2753 mptcplog((LOG_ERR, "%s m_copym_mode failed\n", __func__),
2754 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2755 error = ENOBUFS;
2756 break;
2757 }
2758
2759 /* Create a DSN mapping for the data (m_copym does it) */
2760 VERIFY(m->m_flags & M_PKTHDR);
2761 VERIFY(m->m_next == NULL);
2762
2763 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
2764 m->m_pkthdr.pkt_flags &= ~PKTF_MPSO;
2765 m->m_pkthdr.mp_dsn = mpt_dsn;
2766 m->m_pkthdr.mp_rseq = mpts->mpts_rel_seq;
2767 m->m_pkthdr.len = mlen;
2768
2769 if (head == NULL) {
2770 head = tail = m;
2771 } else {
2772 tail->m_next = m;
2773 tail = m;
2774 }
2775
2776 tot_sent += mlen;
2777 off = 0;
2778next:
2779 mpt_mbuf = mpt_mbuf->m_next;
2780 }
2781
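	/*
	 * If only sb_cc bytes of a reinjected mapping fit, trim the queued
	 * copy forward: e.g. (hypothetical) a mapping [dsn 1000, len 500)
	 * of which 300 bytes were sent becomes [dsn 1300, len 200).
	 */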
2782 if (reinjected) {
2783 if (sb_cc < sb_mb->m_pkthdr.mp_rlen) {
2784 struct mbuf *n = sb_mb;
2785
2786 while (n) {
2787 n->m_pkthdr.mp_dsn += sb_cc;
2788 n->m_pkthdr.mp_rlen -= sb_cc;
2789 n = n->m_next;
2790 }
2791 m_adj(sb_mb, sb_cc);
2792 } else {
2793 mpte->mpte_reinjectq = sb_mb->m_nextpkt;
2794 m_freem(sb_mb);
2795 }
2796 }
2797
2798 mptcplog((LOG_DEBUG, "%s: Queued dsn %u ssn %u len %u on sub %u\n",
2799 __func__, (uint32_t)mpt_dsn, mpts->mpts_rel_seq,
2800 tot_sent, mpts->mpts_connid), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2801
2802 if (head && (mp_tp->mpt_flags & MPTCPF_CHECKSUM)) {
2803 dss_csum = mptcp_output_csum(head, mpt_dsn, mpts->mpts_rel_seq,
2804 tot_sent);
2805 }
2806
2807 /* Now, let's update rel-seq and the data-level length */
2808 mpts->mpts_rel_seq += tot_sent;
2809 m = head;
2810 while (m) {
2811 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM)
2812 m->m_pkthdr.mp_csum = dss_csum;
2813 m->m_pkthdr.mp_rlen = tot_sent;
2814 m = m->m_next;
2815 }
2816
2817 if (head != NULL) {
2818 if ((mpts->mpts_flags & MPTSF_TFO_REQD) &&
2819 (tp->t_tfo_stats == 0))
2820 tp->t_mpflags |= TMPF_TFO_REQUEST;
2821
2822 error = sock_sendmbuf(so, NULL, head, 0, NULL);
2823
2824 DTRACE_MPTCP7(send, struct mbuf *, m, struct socket *, so,
2825 struct sockbuf *, &so->so_rcv,
2826 struct sockbuf *, &so->so_snd,
2827 struct mptses *, mpte, struct mptsub *, mpts,
2828 size_t, tot_sent);
2829 }
2830
2831done_sending:
2832 if (error == 0 ||
2833 (error == EWOULDBLOCK && (tp->t_mpflags & TMPF_TFO_REQUEST))) {
2834 uint64_t new_sndnxt = mp_tp->mpt_sndnxt + tot_sent;
2835
2836 if (mpts->mpts_probesoon && mpts->mpts_maxseg && tot_sent) {
2837 tcpstat.tcps_mp_num_probes++;
2838 if ((uint32_t)tot_sent < mpts->mpts_maxseg)
2839 mpts->mpts_probecnt += 1;
2840 else
2841 mpts->mpts_probecnt +=
2842 tot_sent/mpts->mpts_maxseg;
2843 }
2844
2845 if (!reinjected && !(flags & MPTCP_SUBOUT_PROBING)) {
2846 if (MPTCP_DATASEQ_HIGH32(new_sndnxt) >
2847 MPTCP_DATASEQ_HIGH32(mp_tp->mpt_sndnxt))
2848 mp_tp->mpt_flags |= MPTCPF_SND_64BITDSN;
2849 mp_tp->mpt_sndnxt = new_sndnxt;
2850 }
2851
2852 mptcp_cancel_timer(mp_tp, MPTT_REXMT);
2853
2854 /* Must be here as mptcp_can_send_more() checks for this */
2855 soclearfastopen(mp_so);
2856
2857 if ((mpts->mpts_flags & MPTSF_MP_DEGRADED) ||
2858 (mpts->mpts_probesoon != 0))
2859 mptcplog((LOG_DEBUG, "%s %u degraded %u wrote %d %d probe %d probedelta %d\n",
2860 __func__, mpts->mpts_connid,
2861 !!(mpts->mpts_flags & MPTSF_MP_DEGRADED),
2862 tot_sent, (int) sb_cc, mpts->mpts_probecnt,
2863 (tcp_now - mpts->mpts_probesoon)),
2864 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
2865
2866 if (IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp)) {
2867 mpte->mpte_mppcb->mpp_flags |= MPP_SET_CELLICON;
2868
2869 mpte->mpte_used_cell = 1;
2870 } else {
2871 mpte->mpte_mppcb->mpp_flags |= MPP_UNSET_CELLICON;
2872
2873 mpte->mpte_used_wifi = 1;
2874 }
2875
2876 /*
2877 * Don't propagate EWOULDBLOCK - it's already taken care of
2878 * in mptcp_usr_send for TFO.
2879 */
2880 error = 0;
2881 } else {
2882 mptcplog((LOG_ERR, "%s: %u error %d len %d subflags %#x sostate %#x soerror %u hiwat %u lowat %u\n",
2883 __func__, mpts->mpts_connid, error, tot_sent, so->so_flags, so->so_state, so->so_error, so->so_snd.sb_hiwat, so->so_snd.sb_lowat),
2884 MPTCP_SENDER_DBG, MPTCP_LOGLVL_ERR);
2885 }
2886out:
2887
2888 if (wakeup)
2889 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WWAKEUP;
2890
2891 mptcp_handle_deferred_upcalls(mpte->mpte_mppcb, MPP_INSIDE_OUTPUT);
2892 return (error);
2893
2894zero_len_write:
2895 /* Opting to call pru_send as no mbuf at subflow level */
2896 error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, NULL, NULL,
2897 NULL, current_proc());
2898
2899 goto done_sending;
2900}
2901
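/*
 * Sketch of the coverage cases handled by mptcp_add_reinjectq() below,
 * using hypothetical [dsn, dsn + rlen) ranges against a queue already
 * holding [1000, 1500) -> [2000, 2500):
 *
 *	insert [2000, 2400): fully covered by [2000, 2500), m is dropped
 *	insert [1900, 2600): covers [2000, 2500) entirely, so that one is freed
 *	insert [1200, 1300): fully covered by [1000, 1500), m is dropped
 */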
2902static void
2903mptcp_add_reinjectq(struct mptses *mpte, struct mbuf *m)
2904{
2905 struct mbuf *n, *prev = NULL;
2906
2907 mptcplog((LOG_DEBUG, "%s reinjecting dsn %u dlen %u rseq %u\n",
2908 __func__, (uint32_t)m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
2909 m->m_pkthdr.mp_rseq),
2910 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2911
2912 n = mpte->mpte_reinjectq;
2913
2914	/* First, look for an mbuf n whose data sequence number is greater
2915	 * than or equal to m's sequence number.
2916 */
2917 while (n) {
2918 if (MPTCP_SEQ_GEQ(n->m_pkthdr.mp_dsn, m->m_pkthdr.mp_dsn))
2919 break;
2920
2921 prev = n;
2922
2923 n = n->m_nextpkt;
2924 }
2925
2926 if (n) {
2927 /* m is already fully covered by the next mbuf in the queue */
2928 if (n->m_pkthdr.mp_dsn == m->m_pkthdr.mp_dsn &&
2929 n->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_rlen) {
2930 mptcplog((LOG_DEBUG, "%s fully covered with len %u\n",
2931 __func__, n->m_pkthdr.mp_rlen),
2932 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2933 goto dont_queue;
2934 }
2935
2936		/* m covers the next mbuf entirely, so remove that one */
2937 if (m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen >= n->m_pkthdr.mp_dsn + n->m_pkthdr.mp_rlen) {
2938 struct mbuf *tmp = n->m_nextpkt;
2939
2940 mptcplog((LOG_DEBUG, "%s m is covering that guy dsn %u len %u dsn %u len %u\n",
2941 __func__, m->m_pkthdr.mp_dsn, m->m_pkthdr.mp_rlen,
2942 n->m_pkthdr.mp_dsn, n->m_pkthdr.mp_rlen),
2943 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2944
2945 m->m_nextpkt = NULL;
2946 if (prev == NULL)
2947 mpte->mpte_reinjectq = tmp;
2948 else
2949 prev->m_nextpkt = tmp;
2950
2951 m_freem(n);
2952 n = tmp;
2953 }
2954
2955 }
2956
2957 if (prev) {
2958 /* m is already fully covered by the previous mbuf in the queue */
2959 if (prev->m_pkthdr.mp_dsn + prev->m_pkthdr.mp_rlen >= m->m_pkthdr.mp_dsn + m->m_pkthdr.len) {
2960 mptcplog((LOG_DEBUG, "%s prev covers us from %u with len %u\n",
2961 __func__, prev->m_pkthdr.mp_dsn, prev->m_pkthdr.mp_rlen),
2962 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
2963 goto dont_queue;
2964 }
2965 }
2966
2967 if (prev == NULL)
2968 mpte->mpte_reinjectq = m;
2969 else
2970 prev->m_nextpkt = m;
2971
2972 m->m_nextpkt = n;
2973
2974 return;
2975
2976dont_queue:
2977 m_freem(m);
2978 return;
2979}
2980
2981static struct mbuf *
2982mptcp_lookup_dsn(struct mptses *mpte, uint64_t dsn)
2983{
2984 struct socket *mp_so = mptetoso(mpte);
2985 struct mbuf *m;
2986
2987 m = mp_so->so_snd.sb_mb;
2988
2989 while (m) {
2990 /* If this segment covers what we are looking for, return it. */
2991 if (MPTCP_SEQ_LEQ(m->m_pkthdr.mp_dsn, dsn) &&
2992 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, dsn))
2993 break;
2994
2996		/* Segment is no longer in the queue */
2997 if (MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn, dsn))
2998 return NULL;
2999
3000 m = m->m_next;
3001 }
3002
3003 return m;
3004}
3005
3006static struct mbuf *
3007mptcp_copy_mbuf_list(struct mbuf *m, int len)
3008{
3009 struct mbuf *top = NULL, *tail = NULL;
3010 uint64_t dsn;
3011 uint32_t dlen, rseq;
3012
3013 dsn = m->m_pkthdr.mp_dsn;
3014 dlen = m->m_pkthdr.mp_rlen;
3015 rseq = m->m_pkthdr.mp_rseq;
3016
3017 while (len > 0) {
3018 struct mbuf *n;
3019
3020 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3021
3022 n = m_copym_mode(m, 0, m->m_len, M_DONTWAIT, M_COPYM_MUST_COPY_HDR);
3023 if (n == NULL) {
3024 mptcplog((LOG_ERR, "%s m_copym_mode returned NULL\n", __func__),
3025 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
3026 goto err;
3027 }
3028
3029 VERIFY(n->m_flags & M_PKTHDR);
3030 VERIFY(n->m_next == NULL);
3031 VERIFY(n->m_pkthdr.mp_dsn == dsn);
3032 VERIFY(n->m_pkthdr.mp_rlen == dlen);
3033 VERIFY(n->m_pkthdr.mp_rseq == rseq);
3034 VERIFY(n->m_len == m->m_len);
3035
3036 n->m_pkthdr.pkt_flags |= (PKTF_MPSO | PKTF_MPTCP);
3037
3038 if (top == NULL)
3039 top = n;
3040
3041 if (tail != NULL)
3042 tail->m_next = n;
3043
3044 tail = n;
3045
3046 len -= m->m_len;
3047 m = m->m_next;
3048 }
3049
3050 return top;
3051
3052err:
3053 if (top)
3054 m_freem(top);
3055
3056 return NULL;
3057}
3058
3059static void
3060mptcp_reinject_mbufs(struct socket *so)
3061{
3062 struct tcpcb *tp = sototcpcb(so);
3063 struct mptsub *mpts = tp->t_mpsub;
3064 struct mptcb *mp_tp = tptomptp(tp);
3065	struct mptses *mpte = mp_tp->mpt_mpte;
3066 struct sockbuf *sb = &so->so_snd;
3067 struct mbuf *m;
3068
3069 m = sb->sb_mb;
3070 while (m) {
3071 struct mbuf *n = m->m_next, *orig = m;
3072
3073 mptcplog((LOG_DEBUG, "%s working on suna %u relseq %u iss %u len %u pktflags %#x\n",
3074 __func__, tp->snd_una, m->m_pkthdr.mp_rseq, mpts->mpts_iss,
3075 m->m_pkthdr.mp_rlen, m->m_pkthdr.pkt_flags),
3076 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
3077
3078 VERIFY((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP));
3079
3080 if (m->m_pkthdr.pkt_flags & PKTF_MPTCP_REINJ)
3081 goto next;
3082
3083 /* Has it all already been acknowledged at the data-level? */
3084 if (MPTCP_SEQ_GEQ(mp_tp->mpt_snduna, m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen))
3085 goto next;
3086
3087		/* Part of this has already been acknowledged - look up the
3088		 * segment in the MPTCP socket's send buffer.
3089 */
3090 if (SEQ_GT(tp->snd_una - mpts->mpts_iss, m->m_pkthdr.mp_rseq)) {
3091 m = mptcp_lookup_dsn(mpte, m->m_pkthdr.mp_dsn);
3092 if (m == NULL)
3093 goto next;
3094 }
3095
3096 /* Copy the mbuf with headers (aka, DSN-numbers) */
3097 m = mptcp_copy_mbuf_list(m, m->m_pkthdr.mp_rlen);
3098 if (m == NULL)
3099 break;
3100
3101 VERIFY(m->m_nextpkt == NULL);
3102
3103 /* Now, add to the reinject-queue, eliminating overlapping
3104 * segments
3105 */
3106 mptcp_add_reinjectq(mpte, m);
3107
3108 orig->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3109
3110next:
3111 /* mp_rlen can cover multiple mbufs, so advance to the end of it. */
3112 while (n) {
3113 VERIFY((n->m_flags & M_PKTHDR) && (n->m_pkthdr.pkt_flags & PKTF_MPTCP));
3114
3115 if (n->m_pkthdr.mp_dsn != orig->m_pkthdr.mp_dsn)
3116 break;
3117
3118 n->m_pkthdr.pkt_flags |= PKTF_MPTCP_REINJ;
3119 n = n->m_next;
3120 }
3121
3122 m = n;
3123 }
3124}
3125
3126void
3127mptcp_clean_reinjectq(struct mptses *mpte)
3128{
3129 struct mptcb *mp_tp = mpte->mpte_mptcb;
3130
3131 mpte_lock_assert_held(mpte);
3132
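	/*
	 * Free only those queued segments whose [dsn, dsn + rlen) range
	 * lies entirely below mpt_snduna, i.e. data that the peer has
	 * fully acknowledged at the data level.
	 */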
3133 while (mpte->mpte_reinjectq) {
3134 struct mbuf *m = mpte->mpte_reinjectq;
3135
3136 if (MPTCP_SEQ_GEQ(m->m_pkthdr.mp_dsn, mp_tp->mpt_snduna) ||
3137 MPTCP_SEQ_GT(m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen, mp_tp->mpt_snduna))
3138 break;
3139
3140 mpte->mpte_reinjectq = m->m_nextpkt;
3141 m->m_nextpkt = NULL;
3142 m_freem(m);
3143 }
3144}
3145
3146/*
3147 * Subflow socket control event upcall.
3148 */
3149static void
3150mptcp_subflow_eupcall1(struct socket *so, void *arg, uint32_t events)
3151{
3152#pragma unused(so)
3153 struct mptsub *mpts = arg;
3154 struct mptses *mpte = mpts->mpts_mpte;
3155
3156 VERIFY(mpte != NULL);
3157 mpte_lock_assert_held(mpte);
3158
3159 if ((mpts->mpts_evctl & events) == events)
3160 return;
3161
3162 mpts->mpts_evctl |= events;
3163
3164 if (mptcp_should_defer_upcall(mpte->mpte_mppcb)) {
3165 mpte->mpte_mppcb->mpp_flags |= MPP_SHOULD_WORKLOOP;
3166 return;
3167 }
3168
3169 mptcp_subflow_workloop(mpte);
3170}
3171
3172/*
3173 * Subflow socket control events.
3174 *
3175 * Called for handling events related to the underlying subflow socket.
3176 */
3177static ev_ret_t
3178mptcp_subflow_events(struct mptses *mpte, struct mptsub *mpts,
3179 uint64_t *p_mpsofilt_hint)
3180{
3181 ev_ret_t ret = MPTS_EVRET_OK;
3182 int i, mpsub_ev_entry_count = sizeof(mpsub_ev_entry_tbl) /
3183 sizeof(mpsub_ev_entry_tbl[0]);
3184
3185 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3186
3187 /* bail if there's nothing to process */
3188 if (!mpts->mpts_evctl)
3189 return (ret);
3190
3191 if (mpts->mpts_evctl & (SO_FILT_HINT_CONNRESET|SO_FILT_HINT_MUSTRST|
3192 SO_FILT_HINT_CANTSENDMORE|SO_FILT_HINT_TIMEOUT|
3193 SO_FILT_HINT_NOSRCADDR|SO_FILT_HINT_IFDENIED|
3194 SO_FILT_HINT_DISCONNECTED)) {
3195 mpts->mpts_evctl |= SO_FILT_HINT_MPFAILOVER;
3196 }
3197
3198 DTRACE_MPTCP3(subflow__events, struct mptses *, mpte,
3199 struct mptsub *, mpts, uint32_t, mpts->mpts_evctl);
3200
3201 mptcplog((LOG_DEBUG, "%s cid %d events=%b\n", __func__,
3202 mpts->mpts_connid, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3203 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3204
3205 /*
3206 * Process all the socket filter hints and reset the hint
3207 * once it is handled
3208 */
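	/*
	 * (A handler verdict below MPTS_EVRET_OK replaces the aggregate
	 * result outright; verdicts at or above MPTS_EVRET_OK are merged
	 * with MAX(), so the strongest verdict wins.)
	 */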
3209 for (i = 0; i < mpsub_ev_entry_count && mpts->mpts_evctl; i++) {
3210 /*
3211 * Always execute the DISCONNECTED event, because it will wakeup
3212 * the app.
3213 */
3214 if ((mpts->mpts_evctl & mpsub_ev_entry_tbl[i].sofilt_hint_mask) &&
3215 (ret >= MPTS_EVRET_OK ||
3216 mpsub_ev_entry_tbl[i].sofilt_hint_mask == SO_FILT_HINT_DISCONNECTED)) {
3217 mpts->mpts_evctl &= ~mpsub_ev_entry_tbl[i].sofilt_hint_mask;
3218 ev_ret_t error =
3219 mpsub_ev_entry_tbl[i].sofilt_hint_ev_hdlr(mpte, mpts, p_mpsofilt_hint, mpsub_ev_entry_tbl[i].sofilt_hint_mask);
3220 ret = ((error >= MPTS_EVRET_OK) ? MAX(error, ret) : error);
3221 }
3222 }
3223
3224 /*
3225 * We should be getting only events specified via sock_catchevents(),
3226 * so loudly complain if we have any unprocessed one(s).
3227 */
3228 if (mpts->mpts_evctl || ret < MPTS_EVRET_OK)
3229 mptcplog((LOG_WARNING, "%s%s: cid %d evret %s (%d) unhandled events=%b\n", __func__,
3230 (mpts->mpts_evctl && ret == MPTS_EVRET_OK) ? "MPTCP_ERROR " : "",
3231 mpts->mpts_connid,
3232 mptcp_evret2str(ret), ret, mpts->mpts_evctl, SO_FILT_HINT_BITS),
3233 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3234 else
3235 mptcplog((LOG_DEBUG, "%s: Done, events %b\n", __func__,
3236 mpts->mpts_evctl, SO_FILT_HINT_BITS),
3237 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
3238
3239 return (ret);
3240}
3241
3242static ev_ret_t
3243mptcp_subflow_propagate_ev(struct mptses *mpte, struct mptsub *mpts,
3244 uint64_t *p_mpsofilt_hint, uint64_t event)
3245{
3246 struct socket *mp_so, *so;
3247 struct mptcb *mp_tp;
3248
3249 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3250 VERIFY(mpte->mpte_mppcb != NULL);
3251 mp_so = mptetoso(mpte);
3252 mp_tp = mpte->mpte_mptcb;
3253 so = mpts->mpts_socket;
3254
3255 mptcplog((LOG_DEBUG, "%s: cid %d event %d\n", __func__,
3256 mpts->mpts_connid, event),
3257 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3258
3259 /*
3260 * We got an event for this subflow that might need to be propagated,
3261 * based on the state of the MPTCP connection.
3262 */
3263 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3264 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE))) {
3265 mp_so->so_error = so->so_error;
3266 *p_mpsofilt_hint |= event;
3267 }
3268
3269 return (MPTS_EVRET_OK);
3270}
3271
3272/*
3273 * Handle SO_FILT_HINT_NOSRCADDR subflow socket event.
3274 */
3275static ev_ret_t
3276mptcp_subflow_nosrcaddr_ev(struct mptses *mpte, struct mptsub *mpts,
3277 uint64_t *p_mpsofilt_hint, uint64_t event)
3278{
3279#pragma unused(p_mpsofilt_hint, event)
3280 struct socket *mp_so;
3281 struct tcpcb *tp;
3282
3283 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3284
3285 VERIFY(mpte->mpte_mppcb != NULL);
3286 mp_so = mptetoso(mpte);
3287 tp = intotcpcb(sotoinpcb(mpts->mpts_socket));
3288
3289 /*
3290	 * This overwrites any previous mpte_lost_aid; we deliberately avoid
3291	 * storing more state, since the typical case has only two subflows.
3292 */
3293 mpte->mpte_flags |= MPTE_SND_REM_ADDR;
3294 mpte->mpte_lost_aid = tp->t_local_aid;
3295
3296 mptcplog((LOG_DEBUG, "%s cid %d\n", __func__, mpts->mpts_connid),
3297 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3298
3299 /*
3300 * The subflow connection has lost its source address.
3301 */
3302 mptcp_subflow_abort(mpts, EADDRNOTAVAIL);
3303
3304 if (mp_so->so_flags & SOF_NOADDRAVAIL)
3305 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3306
3307 return (MPTS_EVRET_DELETE);
3308}
3309
3310/*
3311 * Handle SO_FILT_HINT_MPCANTRCVMORE subflow socket event that
3312 * indicates that the remote side sent a Data FIN
3313 */
3314static ev_ret_t
3315mptcp_subflow_mpcantrcvmore_ev(struct mptses *mpte, struct mptsub *mpts,
3316 uint64_t *p_mpsofilt_hint, uint64_t event)
3317{
3318#pragma unused(event)
3319 struct mptcb *mp_tp;
3320
3321 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3322 mp_tp = mpte->mpte_mptcb;
3323
3324 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__, mpts->mpts_connid),
3325 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3326
3327 /*
3328 * We got a Data FIN for the MPTCP connection.
3329 * The FIN may arrive with data. The data is handed up to the
3330 * mptcp socket and the user is notified so that it may close
3331 * the socket if needed.
3332 */
3333 if (mp_tp->mpt_state == MPTCPS_CLOSE_WAIT)
3334 *p_mpsofilt_hint |= SO_FILT_HINT_CANTRCVMORE;
3335
3336 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3337}
3338
3339/*
3340 * Handle SO_FILT_HINT_MPFAILOVER subflow socket event
3341 */
3342static ev_ret_t
3343mptcp_subflow_failover_ev(struct mptses *mpte, struct mptsub *mpts,
3344 uint64_t *p_mpsofilt_hint, uint64_t event)
3345{
3346#pragma unused(event, p_mpsofilt_hint)
3347 struct mptsub *mpts_alt = NULL;
3348 struct socket *alt_so = NULL;
3349 struct socket *mp_so;
3350 int altpath_exists = 0;
3351
3352 mpte_lock_assert_held(mpte);
3353 mp_so = mptetoso(mpte);
3354 mptcplog((LOG_NOTICE, "%s: mp_so 0x%llx\n", __func__,
3355 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
3356 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3357
3358 mptcp_reinject_mbufs(mpts->mpts_socket);
3359
3360 mpts_alt = mptcp_get_subflow(mpte, mpts, NULL);
3361 /*
3362 * If there is no alternate eligible subflow, ignore the
3363 * failover hint.
3364 */
3365 if (mpts_alt == NULL) {
3366 mptcplog((LOG_WARNING, "%s: no alternate path\n", __func__),
3367 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3368
3369 goto done;
3370 }
3371
3372 altpath_exists = 1;
3373 alt_so = mpts_alt->mpts_socket;
3374 if (mpts_alt->mpts_flags & MPTSF_FAILINGOVER) {
3375 /* All data acknowledged and no RTT spike */
3376 if (alt_so->so_snd.sb_cc == 0 && mptcp_no_rto_spike(alt_so)) {
3377 mpts_alt->mpts_flags &= ~MPTSF_FAILINGOVER;
3378 } else {
3379 /* no alternate path available */
3380 altpath_exists = 0;
3381 }
3382 }
3383
3384 if (altpath_exists) {
3385 mpts_alt->mpts_flags |= MPTSF_ACTIVE;
3386
3387 mpte->mpte_active_sub = mpts_alt;
3388 mpts->mpts_flags |= MPTSF_FAILINGOVER;
3389 mpts->mpts_flags &= ~MPTSF_ACTIVE;
3390
3391 mptcplog((LOG_NOTICE, "%s: switched from %d to %d\n",
3392 __func__, mpts->mpts_connid, mpts_alt->mpts_connid),
3393 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3394
3395 mptcpstats_inc_switch(mpte, mpts);
3396
3397 sowwakeup(alt_so);
3398 } else {
3399 mptcplog((LOG_DEBUG, "%s: no alt cid = %d\n", __func__,
3400 mpts->mpts_connid),
3401 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3402done:
3403 mpts->mpts_socket->so_flags &= ~SOF_MP_TRYFAILOVER;
3404 }
3405
3406 return (MPTS_EVRET_OK);
3407}
3408
3409/*
3410 * Handle SO_FILT_HINT_IFDENIED subflow socket event.
3411 */
3412static ev_ret_t
3413mptcp_subflow_ifdenied_ev(struct mptses *mpte, struct mptsub *mpts,
3414 uint64_t *p_mpsofilt_hint, uint64_t event)
3415{
3416 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3417 VERIFY(mpte->mpte_mppcb != NULL);
3418
3419 mptcplog((LOG_DEBUG, "%s: cid %d\n", __func__,
3420 mpts->mpts_connid), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3421
3422 /*
3423	 * The subflow connection cannot use the outgoing interface, so
3424	 * let's close this subflow.
3425 */
3426 mptcp_subflow_abort(mpts, EPERM);
3427
3428 mptcp_subflow_propagate_ev(mpte, mpts, p_mpsofilt_hint, event);
3429
3430 return (MPTS_EVRET_DELETE);
3431}
3432
3433/*
3434 * https://tools.ietf.org/html/rfc6052#section-2
3435 * https://tools.ietf.org/html/rfc6147#section-5.2
3436 */
3437static boolean_t
3438mptcp_desynthesize_ipv6_addr(const struct in6_addr *addr,
3439 const struct ipv6_prefix *prefix,
3440 struct in_addr *addrv4)
3441{
3442 char buf[MAX_IPv4_STR_LEN];
3443 char *ptrv4 = (char *)addrv4;
3444 const char *ptr = (const char *)addr;
3445
3446 if (memcmp(addr, &prefix->ipv6_prefix, prefix->prefix_len) != 0)
3447 return false;
3448
3449 switch (prefix->prefix_len) {
3450 case NAT64_PREFIX_LEN_96:
3451 memcpy(ptrv4, ptr + 12, 4);
3452 break;
3453 case NAT64_PREFIX_LEN_64:
3454 memcpy(ptrv4, ptr + 9, 4);
3455 break;
3456 case NAT64_PREFIX_LEN_56:
3457 memcpy(ptrv4, ptr + 7, 1);
3458 memcpy(ptrv4 + 1, ptr + 9, 3);
3459 break;
3460 case NAT64_PREFIX_LEN_48:
3461 memcpy(ptrv4, ptr + 6, 2);
3462 memcpy(ptrv4 + 2, ptr + 9, 2);
3463 break;
3464 case NAT64_PREFIX_LEN_40:
3465 memcpy(ptrv4, ptr + 5, 3);
3466 memcpy(ptrv4 + 3, ptr + 9, 1);
3467 break;
3468 case NAT64_PREFIX_LEN_32:
3469 memcpy(ptrv4, ptr + 4, 4);
3470 break;
3471 default:
3472 panic("NAT64-prefix len is wrong: %u\n",
3473 prefix->prefix_len);
3474 }
3475
3476 os_log_info(mptcp_log_handle, "%s desynthesized to %s\n", __func__,
3477 inet_ntop(AF_INET, (void *)addrv4, buf, sizeof(buf)));
3478
3479 return true;
3480}
3481
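/*
 * Example (RFC 6052 well-known prefix; the address itself is
 * hypothetical): with the 96-bit prefix 64:ff9b::/96, the synthesized
 * address 64:ff9b::c000:201 carries the IPv4 address in its last four
 * bytes, so mptcp_desynthesize_ipv6_addr() recovers 192.0.2.1 by
 * copying bytes 12..15.
 */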
3482static void
3483mptcp_handle_ipv6_connection(struct mptses *mpte, const struct mptsub *mpts)
3484{
3485 struct ipv6_prefix nat64prefixes[NAT64_MAX_NUM_PREFIXES];
3486 struct socket *so = mpts->mpts_socket;
3487 struct ifnet *ifp;
3488 int j;
3489
3490 ifp = sotoinpcb(so)->inp_last_outifp;
3491
3492 if (ifnet_get_nat64prefix(ifp, nat64prefixes) == ENOENT) {
3493 mptcp_ask_for_nat64(ifp);
3494 return;
3495 }
3496
3498 for (j = 0; j < NAT64_MAX_NUM_PREFIXES; j++) {
3499 int success;
3500
3501 if (nat64prefixes[j].prefix_len == 0)
3502 continue;
3503
3504 success = mptcp_desynthesize_ipv6_addr(&mpte->__mpte_dst_v6.sin6_addr,
3505 &nat64prefixes[j],
3506 &mpte->mpte_dst_v4_nat64.sin_addr);
3507 if (success) {
3508 mpte->mpte_dst_v4_nat64.sin_len = sizeof(mpte->mpte_dst_v4_nat64);
3509 mpte->mpte_dst_v4_nat64.sin_family = AF_INET;
3510 mpte->mpte_dst_v4_nat64.sin_port = mpte->__mpte_dst_v6.sin6_port;
3511 break;
3512 }
3513 }
3514}
3515
3516/*
3517 * Handle SO_FILT_HINT_CONNECTED subflow socket event.
3518 */
3519static ev_ret_t
3520mptcp_subflow_connected_ev(struct mptses *mpte, struct mptsub *mpts,
3521 uint64_t *p_mpsofilt_hint, uint64_t event)
3522{
3523#pragma unused(event, p_mpsofilt_hint)
3524 struct socket *mp_so, *so;
3525 struct inpcb *inp;
3526 struct tcpcb *tp;
3527 struct mptcb *mp_tp;
3528 int af;
3529 boolean_t mpok = FALSE;
3530
3531 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3532 VERIFY(mpte->mpte_mppcb != NULL);
3533
3534 mp_so = mptetoso(mpte);
3535 mp_tp = mpte->mpte_mptcb;
3536 so = mpts->mpts_socket;
3537 tp = sototcpcb(so);
3538 af = mpts->mpts_dst.sa_family;
3539
3540 if (mpts->mpts_flags & MPTSF_CONNECTED)
3541 return (MPTS_EVRET_OK);
3542
3543 if ((mpts->mpts_flags & MPTSF_DISCONNECTED) ||
3544 (mpts->mpts_flags & MPTSF_DISCONNECTING)) {
3545 if (!(so->so_state & (SS_ISDISCONNECTING | SS_ISDISCONNECTED)) &&
3546 (so->so_state & SS_ISCONNECTED)) {
3547 mptcplog((LOG_DEBUG, "%s: cid %d disconnect before tcp connect\n",
3548 __func__, mpts->mpts_connid),
3549 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3550 (void) soshutdownlock(so, SHUT_RD);
3551 (void) soshutdownlock(so, SHUT_WR);
3552 (void) sodisconnectlocked(so);
3553 }
3554 return (MPTS_EVRET_OK);
3555 }
3556
3557 /*
3558 * The subflow connection has been connected. Find out whether it
3559 * is connected as a regular TCP or as an MPTCP subflow. The idea is:
3560 *
3561 * a. If MPTCP connection is not yet established, then this must be
3562 * the first subflow connection. If MPTCP failed to negotiate,
3563 * fallback to regular TCP by degrading this subflow.
3564 *
3565 * b. If MPTCP connection has been established, then this must be
3566 * one of the subsequent subflow connections. If MPTCP failed
3567 * to negotiate, disconnect the connection.
3568 *
3569 * Right now, we simply unblock any waiters at the MPTCP socket layer
3570 * if the MPTCP connection has not been established.
3571 */
3572
3573 if (so->so_state & SS_ISDISCONNECTED) {
3574 /*
3575 * With MPTCP joins, a connection is connected at the subflow
3576 * level, but the 4th ACK from the server elevates the MPTCP
3577 * subflow to connected state. So there is a small window
3578 * where the subflow could get disconnected before the
3579 * connected event is processed.
3580 */
3581 return (MPTS_EVRET_OK);
3582 }
3583
3584 if (mpts->mpts_flags & MPTSF_TFO_REQD)
3585 mptcp_drop_tfo_data(mpte, mpts);
3586
3587 mpts->mpts_flags &= ~(MPTSF_CONNECTING | MPTSF_TFO_REQD);
3588 mpts->mpts_flags |= MPTSF_CONNECTED;
3589
3590 if (tp->t_mpflags & TMPF_MPTCP_TRUE)
3591 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3592
3593 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
3594
3595 /* get/verify the outbound interface */
3596 inp = sotoinpcb(so);
3597
3598 mpts->mpts_maxseg = tp->t_maxseg;
3599
3600 mptcplog((LOG_DEBUG, "%s: cid %d outif %s is %s\n", __func__, mpts->mpts_connid,
3601 ((inp->inp_last_outifp != NULL) ? inp->inp_last_outifp->if_xname : "NULL"),
3602 ((mpts->mpts_flags & MPTSF_MP_CAPABLE) ? "MPTCP capable" : "a regular TCP")),
3603 (MPTCP_SOCKET_DBG | MPTCP_EVENTS_DBG), MPTCP_LOGLVL_LOG);
3604
3605 mpok = (mpts->mpts_flags & MPTSF_MP_CAPABLE);
3606
3607 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
3608 mp_tp->mpt_state = MPTCPS_ESTABLISHED;
3609 mpte->mpte_associd = mpts->mpts_connid;
3610 DTRACE_MPTCP2(state__change,
3611 struct mptcb *, mp_tp,
3612 uint32_t, 0 /* event */);
3613
3614 if (SOCK_DOM(so) == AF_INET) {
3615 in_getsockaddr_s(so, &mpte->__mpte_src_v4);
3616 } else {
3617 in6_getsockaddr_s(so, &mpte->__mpte_src_v6);
3618 }
3619
3620 mpts->mpts_flags |= MPTSF_ACTIVE;
3621
3622 /* case (a) above */
3623 if (!mpok) {
3624 tcpstat.tcps_mpcap_fallback++;
3625
3626 tp->t_mpflags |= TMPF_INFIN_SENT;
3627 mptcp_notify_mpfail(so);
3628 } else {
3629 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3630 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3631 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
3632 } else {
3633 mpts->mpts_flags |= MPTSF_PREFERRED;
3634 }
3635 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3636 mpte->mpte_nummpcapflows++;
3637
3638 if (SOCK_DOM(so) == AF_INET6)
3639 mptcp_handle_ipv6_connection(mpte, mpts);
3640
3641 mptcp_check_subflows_and_add(mpte);
3642
3643 if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3644 mpte->mpte_initial_cell = 1;
3645
3646 mpte->mpte_handshake_success = 1;
3647 }
3648
3649 mp_tp->mpt_sndwnd = tp->snd_wnd;
3650 mp_tp->mpt_sndwl1 = mp_tp->mpt_rcvnxt;
3651 mp_tp->mpt_sndwl2 = mp_tp->mpt_snduna;
3652 soisconnected(mp_so);
3653
3654 mptcplog((LOG_DEBUG, "%s: MPTCPS_ESTABLISHED for mp_so 0x%llx mpok %u\n",
3655 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mpok),
3656 MPTCP_STATE_DBG, MPTCP_LOGLVL_LOG);
3657 } else if (mpok) {
3658 /*
3659 * case (b) above
3660		 * For additional flows, the subflow is not marked
3661		 * MPTSF_MP_CAPABLE until an ACK is received from the server
3662		 * for the 3-way handshake.  TCP has already guaranteed that
3663		 * this is an MPTCP subflow.
3664 */
3665 if (IFNET_IS_CELLULAR(inp->inp_last_outifp) &&
3666 !(tp->t_mpflags & TMPF_BACKUP_PATH) &&
3667 mpte->mpte_svctype != MPTCP_SVCTYPE_AGGREGATE) {
3668 tp->t_mpflags |= (TMPF_BACKUP_PATH | TMPF_SND_MPPRIO);
3669 mpts->mpts_flags &= ~MPTSF_PREFERRED;
3670 } else {
3671 mpts->mpts_flags |= MPTSF_PREFERRED;
3672 }
3673
3674 mpts->mpts_flags |= MPTSF_MPCAP_CTRSET;
3675 mpte->mpte_nummpcapflows++;
3676
3677 mpts->mpts_rel_seq = 1;
3678
3679 mptcp_check_subflows_and_remove(mpte);
3680 } else {
3681 unsigned int i;
3682
3683 /* Should we try the alternate port? */
3684 if (mpte->mpte_alternate_port &&
3685 inp->inp_fport != mpte->mpte_alternate_port) {
3686 union sockaddr_in_4_6 dst;
3687 struct sockaddr_in *dst_in = (struct sockaddr_in *)&dst;
3688
3689 memcpy(&dst, &mpts->mpts_dst, mpts->mpts_dst.sa_len);
3690
3691 dst_in->sin_port = mpte->mpte_alternate_port;
3692
3693 mptcp_subflow_add(mpte, NULL, (struct sockaddr *)&dst,
			    mpts->mpts_ifscope, NULL);
		} else { /* We have tried all we could; mark this interface as non-MPTCP */
3696 for (i = 0; i < mpte->mpte_itfinfo_size; i++) {
3697 struct mpt_itf_info *info = &mpte->mpte_itfinfo[i];
3698
3699 if (inp->inp_last_outifp->if_index == info->ifindex) {
3700 info->no_mptcp_support = 1;
3701 break;
3702 }
3703 }
3704 }
3705
3706 tcpstat.tcps_join_fallback++;
3707 if (IFNET_IS_CELLULAR(inp->inp_last_outifp))
3708 tcpstat.tcps_mptcp_cell_proxy++;
3709 else
3710 tcpstat.tcps_mptcp_wifi_proxy++;
3711
3712 soevent(mpts->mpts_socket, SO_FILT_HINT_LOCKED | SO_FILT_HINT_MUSTRST);
3713
3714 return (MPTS_EVRET_OK);
3715 }
3716
	/* This call merely "books" an entry in the stats table for this ifindex */
3718 mptcp_get_statsindex(mpte->mpte_itfstats, mpts);
3719
3720 mptcp_output(mpte);
3721
3722 return (MPTS_EVRET_OK); /* keep the subflow socket around */
3723}
3724
3725/*
3726 * Handle SO_FILT_HINT_DISCONNECTED subflow socket event.
3727 */
3728static ev_ret_t
3729mptcp_subflow_disconnected_ev(struct mptses *mpte, struct mptsub *mpts,
3730 uint64_t *p_mpsofilt_hint, uint64_t event)
3731{
3732#pragma unused(event, p_mpsofilt_hint)
3733 struct socket *mp_so, *so;
3734 struct mptcb *mp_tp;
3735
3736 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3737 VERIFY(mpte->mpte_mppcb != NULL);
3738 mp_so = mptetoso(mpte);
3739 mp_tp = mpte->mpte_mptcb;
3740 so = mpts->mpts_socket;
3741
3742 mptcplog((LOG_DEBUG, "%s: cid %d, so_err %d, mpt_state %u fallback %u active %u flags %#x\n",
3743 __func__, mpts->mpts_connid, so->so_error, mp_tp->mpt_state,
3744 !!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP),
3745 !!(mpts->mpts_flags & MPTSF_ACTIVE), sototcpcb(so)->t_mpflags),
3746 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3747
3748 if (mpts->mpts_flags & MPTSF_DISCONNECTED)
3749 return (MPTS_EVRET_DELETE);
3750
3751 mpts->mpts_flags |= MPTSF_DISCONNECTED;
3752
3753 /* The subflow connection has been disconnected. */
3754
3755 if (mpts->mpts_flags & MPTSF_MPCAP_CTRSET) {
3756 mpte->mpte_nummpcapflows--;
3757 if (mpte->mpte_active_sub == mpts) {
3758 mpte->mpte_active_sub = NULL;
3759 mptcplog((LOG_DEBUG, "%s: resetting active subflow \n",
3760 __func__), MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3761 }
3762 mpts->mpts_flags &= ~MPTSF_MPCAP_CTRSET;
3763 }
3764
3765 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED ||
3766 ((mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && (mpts->mpts_flags & MPTSF_ACTIVE)) ||
3767 (sototcpcb(so)->t_mpflags & TMPF_FASTCLOSERCV)) {
3768 mptcp_drop(mpte, mp_tp, so->so_error);
3769 }
3770
3771 /*
3772 * Clear flags that are used by getconninfo to return state.
	 * Flags such as MPTSF_DELETEOK are retained for internal purposes.
3774 */
3775 mpts->mpts_flags &= ~(MPTSF_CONNECTING|MPTSF_CONNECT_PENDING|
3776 MPTSF_CONNECTED|MPTSF_DISCONNECTING|MPTSF_PREFERRED|
3777 MPTSF_MP_CAPABLE|MPTSF_MP_READY|MPTSF_MP_DEGRADED|MPTSF_ACTIVE);
3778
3779 return (MPTS_EVRET_DELETE);
3780}
3781
3782/*
3783 * Handle SO_FILT_HINT_MPSTATUS subflow socket event
3784 */
3785static ev_ret_t
3786mptcp_subflow_mpstatus_ev(struct mptses *mpte, struct mptsub *mpts,
3787 uint64_t *p_mpsofilt_hint, uint64_t event)
3788{
3789#pragma unused(event, p_mpsofilt_hint)
3790 struct socket *mp_so, *so;
3791 struct mptcb *mp_tp;
3792 ev_ret_t ret = MPTS_EVRET_OK;
3793
3794 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3795 VERIFY(mpte->mpte_mppcb != NULL);
3796 mp_so = mptetoso(mpte);
3797 mp_tp = mpte->mpte_mptcb;
3798 so = mpts->mpts_socket;
3799
3800 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_TRUE)
3801 mpts->mpts_flags |= MPTSF_MP_CAPABLE;
3802 else
3803 mpts->mpts_flags &= ~MPTSF_MP_CAPABLE;
3804
3805 if (sototcpcb(so)->t_mpflags & TMPF_TCP_FALLBACK) {
3806 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
3807 goto done;
3808 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
3809 } else {
3810 mpts->mpts_flags &= ~MPTSF_MP_DEGRADED;
3811 }
3812
3813 if (sototcpcb(so)->t_mpflags & TMPF_MPTCP_READY)
3814 mpts->mpts_flags |= MPTSF_MP_READY;
3815 else
3816 mpts->mpts_flags &= ~MPTSF_MP_READY;
3817
3818 if (mpts->mpts_flags & MPTSF_MP_DEGRADED) {
3819 mp_tp->mpt_flags |= MPTCPF_FALLBACK_TO_TCP;
3820 mp_tp->mpt_flags &= ~MPTCPF_JOIN_READY;
3821 }
3822
3823 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
3824 VERIFY(!(mp_tp->mpt_flags & MPTCPF_JOIN_READY));
3825 ret = MPTS_EVRET_DISCONNECT_FALLBACK;
3826
3827 m_freem_list(mpte->mpte_reinjectq);
3828 mpte->mpte_reinjectq = NULL;
3829 } else if (mpts->mpts_flags & MPTSF_MP_READY) {
3830 mp_tp->mpt_flags |= MPTCPF_JOIN_READY;
3831 ret = MPTS_EVRET_CONNECT_PENDING;
3832 }
3833
3834 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx mpt_flags=%b cid %d mptsf=%b\n",
3835 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
3836 mp_tp->mpt_flags, MPTCPF_BITS, mpts->mpts_connid,
3837 mpts->mpts_flags, MPTSF_BITS),
3838 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3839
3840done:
3841 return (ret);
3842}
3843
3844/*
3845 * Handle SO_FILT_HINT_MUSTRST subflow socket event
3846 */
3847static ev_ret_t
3848mptcp_subflow_mustrst_ev(struct mptses *mpte, struct mptsub *mpts,
3849 uint64_t *p_mpsofilt_hint, uint64_t event)
3850{
3851#pragma unused(event)
3852 struct socket *mp_so, *so;
3853 struct mptcb *mp_tp;
3854 boolean_t is_fastclose;
3855
3856 mpte_lock_assert_held(mpte); /* same as MP socket lock */
3857 VERIFY(mpte->mpte_mppcb != NULL);
3858 mp_so = mptetoso(mpte);
3859 mp_tp = mpte->mpte_mptcb;
3860 so = mpts->mpts_socket;
3861
3862 /* We got an invalid option or a fast close */
3863 struct tcptemp *t_template;
3864 struct inpcb *inp = sotoinpcb(so);
3865 struct tcpcb *tp = NULL;
3866
3867 tp = intotcpcb(inp);
3868 so->so_error = ECONNABORTED;
3869
3870 is_fastclose = !!(tp->t_mpflags & TMPF_FASTCLOSERCV);
3871
3872 t_template = tcp_maketemplate(tp);
3873 if (t_template) {
3874 struct tcp_respond_args tra;
3875
3876 bzero(&tra, sizeof(tra));
3877 if (inp->inp_flags & INP_BOUND_IF)
3878 tra.ifscope = inp->inp_boundifp->if_index;
3879 else
3880 tra.ifscope = IFSCOPE_NONE;
3881 tra.awdl_unrestricted = 1;
3882
3883 tcp_respond(tp, t_template->tt_ipgen,
3884 &t_template->tt_t, (struct mbuf *)NULL,
3885 tp->rcv_nxt, tp->snd_una, TH_RST, &tra);
3886 (void) m_free(dtom(t_template));
3887 mptcplog((LOG_DEBUG, "MPTCP Events: "
3888 "%s: mp_so 0x%llx cid %d \n",
3889 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_connid),
3891 MPTCP_EVENTS_DBG, MPTCP_LOGLVL_LOG);
3892 }
3893 mptcp_subflow_abort(mpts, ECONNABORTED);
3894
3895 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) && is_fastclose) {
3896 *p_mpsofilt_hint |= SO_FILT_HINT_CONNRESET;
3897
3898 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED)
3899 mp_so->so_error = ECONNABORTED;
3900 else
3901 mp_so->so_error = ECONNRESET;
3902
3903 /*
		 * mptcp_drop is called after processing the events, to fully
		 * close the MPTCP connection.
3906 */
3907 }
3908
3909 if (mp_tp->mpt_gc_ticks == MPT_GC_TICKS)
3910 mp_tp->mpt_gc_ticks = MPT_GC_TICKS_FAST;
3911
3912 return (MPTS_EVRET_DELETE);
3913}
3914
3915static ev_ret_t
3916mptcp_subflow_adaptive_rtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3917 uint64_t *p_mpsofilt_hint, uint64_t event)
3918{
3919#pragma unused(event)
3920 bool found_active = false;
3921
3922 mpts->mpts_flags |= MPTSF_READ_STALL;
3923
3924 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3925 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3926
3927 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3928 TCPS_HAVERCVDFIN2(tp->t_state))
3929 continue;
3930
3931 if (!(mpts->mpts_flags & MPTSF_READ_STALL)) {
3932 found_active = true;
3933 break;
3934 }
3935 }
3936
3937 if (!found_active)
3938 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_RTIMO;
3939
3940 return (MPTS_EVRET_OK);
3941}
3942
3943static ev_ret_t
3944mptcp_subflow_adaptive_wtimo_ev(struct mptses *mpte, struct mptsub *mpts,
3945 uint64_t *p_mpsofilt_hint, uint64_t event)
3946{
3947#pragma unused(event)
3948 bool found_active = false;
3949
3950 mpts->mpts_flags |= MPTSF_WRITE_STALL;
3951
3952 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
3953 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
3954
3955 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
3956 tp->t_state > TCPS_CLOSE_WAIT)
3957 continue;
3958
3959 if (!(mpts->mpts_flags & MPTSF_WRITE_STALL)) {
3960 found_active = true;
3961 break;
3962 }
3963 }
3964
3965 if (!found_active)
3966 *p_mpsofilt_hint |= SO_FILT_HINT_ADAPTIVE_WTIMO;
3967
3968 return (MPTS_EVRET_OK);
3969}
3970
3971static const char *
3972mptcp_evret2str(ev_ret_t ret)
3973{
3974 const char *c = "UNKNOWN";
3975
3976 switch (ret) {
3977 case MPTS_EVRET_DELETE:
3978 c = "MPTS_EVRET_DELETE";
3979 break;
3980 case MPTS_EVRET_CONNECT_PENDING:
3981 c = "MPTS_EVRET_CONNECT_PENDING";
3982 break;
3983 case MPTS_EVRET_DISCONNECT_FALLBACK:
3984 c = "MPTS_EVRET_DISCONNECT_FALLBACK";
3985 break;
3986 case MPTS_EVRET_OK:
3987 c = "MPTS_EVRET_OK";
3988 break;
3989 default:
3990 break;
3991 }
3992 return (c);
3993}
3994
3995/*
3996 * Issues SOPT_SET on an MPTCP subflow socket; socket must already be locked,
3997 * caller must ensure that the option can be issued on subflow sockets, via
3998 * MPOF_SUBFLOW_OK flag.
3999 */
4000int
4001mptcp_subflow_sosetopt(struct mptses *mpte, struct mptsub *mpts, struct mptopt *mpo)
4002{
4003 struct socket *mp_so, *so;
4004 struct sockopt sopt;
4005 int error;
4006
4007 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4008 mpte_lock_assert_held(mpte);
4009
4010 mp_so = mptetoso(mpte);
4011 so = mpts->mpts_socket;
4012
4013 if (mpte->mpte_mptcb->mpt_state >= MPTCPS_ESTABLISHED &&
4014 mpo->mpo_level == SOL_SOCKET &&
4015 mpo->mpo_name == SO_MARK_CELLFALLBACK) {
4016 struct ifnet *ifp = ifindex2ifnet[mpts->mpts_ifscope];
4017
4018 mptcplog((LOG_DEBUG, "%s Setting CELL_FALLBACK, mpte_flags %#x, svctype %u wifi unusable %d lastcell? %d boundcell? %d\n",
4019 __func__, mpte->mpte_flags, mpte->mpte_svctype, mptcp_is_wifi_unusable(mpte),
4020 sotoinpcb(so)->inp_last_outifp ? IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp) : -1,
4021 mpts->mpts_ifscope != IFSCOPE_NONE && ifp ? IFNET_IS_CELLULAR(ifp) : -1),
4022 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4023
4024 /*
		 * When we open a new subflow, mark it as cell fallback if
		 * it goes over cell (except for first-party apps).
4029 */
4030
4031 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
4032 return (0);
4033
4034 if (sotoinpcb(so)->inp_last_outifp &&
4035 !IFNET_IS_CELLULAR(sotoinpcb(so)->inp_last_outifp))
4036 return (0);
4037
4038 /*
		 * This check is an OR, because if the app is not binding to the
4040 * interface, then it definitely is not a cell-fallback
4041 * connection.
4042 */
4043 if (mpts->mpts_ifscope == IFSCOPE_NONE || ifp == NULL ||
4044 !IFNET_IS_CELLULAR(ifp))
4045 return (0);
4046 }
4047
4048 mpo->mpo_flags &= ~MPOF_INTERIM;
4049
4050 bzero(&sopt, sizeof (sopt));
4051 sopt.sopt_dir = SOPT_SET;
4052 sopt.sopt_level = mpo->mpo_level;
4053 sopt.sopt_name = mpo->mpo_name;
4054 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4055 sopt.sopt_valsize = sizeof (int);
4056 sopt.sopt_p = kernproc;
4057
4058 error = sosetoptlock(so, &sopt, 0);
4059 if (error == 0) {
4060 mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s "
4061 "val %d set successful\n", __func__,
4062 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4063 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4064 mpo->mpo_intval),
4065 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4066 } else {
4067 mptcplog((LOG_ERR, "%s:mp_so 0x%llx sopt %s "
4068 "val %d set error %d\n", __func__,
4069 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4070 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4071 mpo->mpo_intval, error),
4072 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
4073 }
4074 return (error);
4075}
4076
4077/*
4078 * Issues SOPT_GET on an MPTCP subflow socket; socket must already be locked,
4079 * caller must ensure that the option can be issued on subflow sockets, via
4080 * MPOF_SUBFLOW_OK flag.
4081 */
4082int
4083mptcp_subflow_sogetopt(struct mptses *mpte, struct socket *so,
4084 struct mptopt *mpo)
4085{
4086 struct socket *mp_so;
4087 struct sockopt sopt;
4088 int error;
4089
4090 VERIFY(mpo->mpo_flags & MPOF_SUBFLOW_OK);
4091 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4092 mp_so = mptetoso(mpte);
4093
4094 bzero(&sopt, sizeof (sopt));
4095 sopt.sopt_dir = SOPT_GET;
4096 sopt.sopt_level = mpo->mpo_level;
4097 sopt.sopt_name = mpo->mpo_name;
4098 sopt.sopt_val = CAST_USER_ADDR_T(&mpo->mpo_intval);
4099 sopt.sopt_valsize = sizeof (int);
4100 sopt.sopt_p = kernproc;
4101
4102 error = sogetoptlock(so, &sopt, 0); /* already locked */
4103 if (error == 0) {
4104 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4105 "%s: mp_so 0x%llx sopt %s "
4106 "val %d get successful\n", __func__,
4107 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4108 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name),
4109 mpo->mpo_intval),
4110 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4111 } else {
4112 mptcplog((LOG_ERR, "MPTCP Socket: "
4113 "%s: mp_so 0x%llx sopt %s get error %d\n",
4114 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4115 mptcp_sopt2str(mpo->mpo_level, mpo->mpo_name), error),
4116 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
4117 }
4118 return (error);
4119}
4120
4121
4122/*
4123 * MPTCP garbage collector.
4124 *
 * This routine is called by the MP domain's on-demand periodic callout,
 * which is triggered when an MPTCP socket is closed.  The callout will
4127 * repeat as long as this routine returns a non-zero value.
4128 */
4129static uint32_t
4130mptcp_gc(struct mppcbinfo *mppi)
4131{
4132 struct mppcb *mpp, *tmpp;
4133 uint32_t active = 0;
4134
4135 LCK_MTX_ASSERT(&mppi->mppi_lock, LCK_MTX_ASSERT_OWNED);
4136
4137 TAILQ_FOREACH_SAFE(mpp, &mppi->mppi_pcbs, mpp_entry, tmpp) {
4138 struct socket *mp_so;
4139 struct mptses *mpte;
4140 struct mptcb *mp_tp;
4141
4142 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
4143 mp_so = mpp->mpp_socket;
4144 VERIFY(mp_so != NULL);
4145 mpte = mptompte(mpp);
4146 VERIFY(mpte != NULL);
4147 mp_tp = mpte->mpte_mptcb;
4148 VERIFY(mp_tp != NULL);
4149
4150 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4151 "%s: mp_so 0x%llx found "
4152 "(u=%d,r=%d,s=%d)\n", __func__,
4153 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_usecount,
4154 mp_so->so_retaincnt, mpp->mpp_state),
4155 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4156
4157 if (!mpte_try_lock(mpte)) {
4158 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4159 "%s: mp_so 0x%llx skipped lock "
4160 "(u=%d,r=%d)\n", __func__,
4161 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4162 mp_so->so_usecount, mp_so->so_retaincnt),
4163 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4164 active++;
4165 continue;
4166 }
4167
4168 /* check again under the lock */
4169 if (mp_so->so_usecount > 0) {
4170 boolean_t wakeup = FALSE;
4171 struct mptsub *mpts, *tmpts;
4172
4173 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4174 "%s: mp_so 0x%llx skipped usecount "
4175 "[u=%d,r=%d] %d %d\n", __func__,
4176 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4177 mp_so->so_usecount, mp_so->so_retaincnt,
4178 mp_tp->mpt_gc_ticks,
4179 mp_tp->mpt_state),
4180 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4181
4182 if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_1) {
4183 if (mp_tp->mpt_gc_ticks > 0)
4184 mp_tp->mpt_gc_ticks--;
4185 if (mp_tp->mpt_gc_ticks == 0) {
4186 wakeup = TRUE;
4187 }
4188 }
4189 if (wakeup) {
4190 TAILQ_FOREACH_SAFE(mpts,
4191 &mpte->mpte_subflows, mpts_entry, tmpts) {
4192 mptcp_subflow_eupcall1(mpts->mpts_socket,
4193 mpts, SO_FILT_HINT_DISCONNECTED);
4194 }
4195 }
4196 mpte_unlock(mpte);
4197 active++;
4198 continue;
4199 }
4200
4201 if (mpp->mpp_state != MPPCB_STATE_DEAD) {
4202 panic("MPTCP Socket: %s: mp_so 0x%llx skipped state "
4203 "[u=%d,r=%d,s=%d]\n", __func__,
4204 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4205 mp_so->so_usecount, mp_so->so_retaincnt,
4206 mpp->mpp_state);
4207 }
4208
4209 if (mp_tp->mpt_state == MPTCPS_TIME_WAIT)
4210 mptcp_close(mpte, mp_tp);
4211
4212 mptcp_session_destroy(mpte);
4213
4214 mptcplog((LOG_DEBUG, "MPTCP Socket: "
4215 "%s: mp_so 0x%llx destroyed [u=%d,r=%d]\n",
4216 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
4217 mp_so->so_usecount, mp_so->so_retaincnt),
4218 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4219
4220 DTRACE_MPTCP4(dispose, struct socket *, mp_so,
4221 struct sockbuf *, &mp_so->so_rcv,
4222 struct sockbuf *, &mp_so->so_snd,
4223 struct mppcb *, mpp);
4224
4225 mp_pcbdispose(mpp);
4226 sodealloc(mp_so);
4227 }
4228
4229 return (active);
4230}
4231
4232/*
 * Drop an MPTCP connection, reporting the specified error.
4234 */
4235struct mptses *
4236mptcp_drop(struct mptses *mpte, struct mptcb *mp_tp, int errno)
4237{
4238 struct socket *mp_so;
4239
4240 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4241 VERIFY(mpte->mpte_mptcb == mp_tp);
4242 mp_so = mptetoso(mpte);
4243
4244 DTRACE_MPTCP2(state__change, struct mptcb *, mp_tp,
4245 uint32_t, 0 /* event */);
4246
4247 if (errno == ETIMEDOUT && mp_tp->mpt_softerror != 0)
4248 errno = mp_tp->mpt_softerror;
4249 mp_so->so_error = errno;
4250
4251 return (mptcp_close(mpte, mp_tp));
4252}
4253
4254/*
 * Close an MPTCP control block.
4256 */
4257struct mptses *
4258mptcp_close(struct mptses *mpte, struct mptcb *mp_tp)
4259{
4260 struct socket *mp_so = NULL;
4261 struct mptsub *mpts = NULL, *tmpts = NULL;
4262
4263 mpte_lock_assert_held(mpte); /* same as MP socket lock */
4264 VERIFY(mpte->mpte_mptcb == mp_tp);
4265 mp_so = mptetoso(mpte);
4266
4267 mp_tp->mpt_state = MPTCPS_TERMINATE;
4268
4269 mptcp_freeq(mp_tp);
4270
4271 soisdisconnected(mp_so);
4272
4273 /* Clean up all subflows */
4274 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4275 mptcp_subflow_disconnect(mpte, mpts);
4276 }
4277
4278 return (NULL);
4279}
4280
4281void
4282mptcp_notify_close(struct socket *so)
4283{
4284 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_DISCONNECTED));
4285}
4286
4287/*
4288 * MPTCP workloop.
4289 */
4290void
4291mptcp_subflow_workloop(struct mptses *mpte)
4292{
4293 struct socket *mp_so;
4294 struct mptsub *mpts, *tmpts;
4295 boolean_t connect_pending = FALSE, disconnect_fallback = FALSE;
4296 uint64_t mpsofilt_hint_mask = SO_FILT_HINT_LOCKED;
4297
4298 mpte_lock_assert_held(mpte);
4299 VERIFY(mpte->mpte_mppcb != NULL);
4300 mp_so = mptetoso(mpte);
4301 VERIFY(mp_so != NULL);
4302
4303 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4304 ev_ret_t ret;
4305
4306 if (mpts->mpts_socket->so_usecount == 0) {
4307 /* Will be removed soon by tcp_garbage_collect */
4308 continue;
4309 }
4310
4311 mptcp_subflow_addref(mpts);
4312 mpts->mpts_socket->so_usecount++;
4313
4314 ret = mptcp_subflow_events(mpte, mpts, &mpsofilt_hint_mask);
4315
4316 /*
4317 * If MPTCP socket is closed, disconnect all subflows.
4318 * This will generate a disconnect event which will
4319 * be handled during the next iteration, causing a
4320 * non-zero error to be returned above.
4321 */
4322 if (mp_so->so_flags & SOF_PCBCLEARING)
4323 mptcp_subflow_disconnect(mpte, mpts);
4324
4325 switch (ret) {
4326 case MPTS_EVRET_OK:
4327 /* nothing to do */
4328 break;
4329 case MPTS_EVRET_DELETE:
4330 mptcp_subflow_soclose(mpts);
4331 break;
4332 case MPTS_EVRET_CONNECT_PENDING:
4333 connect_pending = TRUE;
4334 break;
4335 case MPTS_EVRET_DISCONNECT_FALLBACK:
4336 disconnect_fallback = TRUE;
4337 break;
4338 default:
4339 mptcplog((LOG_DEBUG,
4340 "MPTCP Socket: %s: mptcp_subflow_events "
4341 "returned invalid value: %d\n", __func__,
4342 ret),
4343 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4344 break;
4345 }
4346 mptcp_subflow_remref(mpts); /* ours */
4347
4348 VERIFY(mpts->mpts_socket->so_usecount != 0);
4349 mpts->mpts_socket->so_usecount--;
4350 }
4351
4352 if (mpsofilt_hint_mask != SO_FILT_HINT_LOCKED) {
4353 VERIFY(mpsofilt_hint_mask & SO_FILT_HINT_LOCKED);
4354
4355 soevent(mp_so, mpsofilt_hint_mask);
4356 }
4357
4358 if (!connect_pending && !disconnect_fallback)
4359 return;
4360
4361 TAILQ_FOREACH_SAFE(mpts, &mpte->mpte_subflows, mpts_entry, tmpts) {
4362 if (disconnect_fallback) {
4363 struct socket *so = NULL;
4364 struct inpcb *inp = NULL;
4365 struct tcpcb *tp = NULL;
4366
4367 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
4368 continue;
4369
4370 mpts->mpts_flags |= MPTSF_MP_DEGRADED;
4371
4372 if (mpts->mpts_flags & (MPTSF_DISCONNECTING|
4373 MPTSF_DISCONNECTED|MPTSF_CONNECT_PENDING))
4374 continue;
4375
4376 so = mpts->mpts_socket;
4377
4378 /*
4379 * The MPTCP connection has degraded to a fallback
4380 * mode, so there is no point in keeping this subflow
4381 * regardless of its MPTCP-readiness state, unless it
4382 * is the primary one which we use for fallback. This
4383 * assumes that the subflow used for fallback is the
4384 * ACTIVE one.
4385 */
4386
4387 inp = sotoinpcb(so);
4388 tp = intotcpcb(inp);
4389 tp->t_mpflags &=
4390 ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
4391 tp->t_mpflags |= TMPF_TCP_FALLBACK;
4392
4393 if (mpts->mpts_flags & MPTSF_ACTIVE) {
4394 continue;
4395 }
4396 tp->t_mpflags |= TMPF_RESET;
4397 soevent(so, SO_FILT_HINT_MUSTRST);
4398 } else if (connect_pending) {
4399 /*
4400 * The MPTCP connection has progressed to a state
4401 * where it supports full multipath semantics; allow
4402 * additional joins to be attempted for all subflows
4403 * that are in the PENDING state.
4404 */
4405 if (mpts->mpts_flags & MPTSF_CONNECT_PENDING) {
4406 int error = mptcp_subflow_soconnectx(mpte, mpts);
4407
4408 if (error)
4409 mptcp_subflow_abort(mpts, error);
4410 }
4411 }
4412 }
4413}
4414
4415/*
4416 * Protocol pr_lock callback.
4417 */
4418int
4419mptcp_lock(struct socket *mp_so, int refcount, void *lr)
4420{
4421 struct mppcb *mpp = mpsotomppcb(mp_so);
4422 void *lr_saved;
4423
4424 if (lr == NULL)
4425 lr_saved = __builtin_return_address(0);
4426 else
4427 lr_saved = lr;
4428
4429 if (mpp == NULL) {
4430 panic("%s: so=%p NO PCB! lr=%p lrh= %s\n", __func__,
4431 mp_so, lr_saved, solockhistory_nr(mp_so));
4432 /* NOTREACHED */
4433 }
4434 mpp_lock(mpp);
4435
4436 if (mp_so->so_usecount < 0) {
4437 panic("%s: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n", __func__,
4438 mp_so, mp_so->so_pcb, lr_saved, mp_so->so_usecount,
4439 solockhistory_nr(mp_so));
4440 /* NOTREACHED */
4441 }
4442 if (refcount != 0)
4443 mp_so->so_usecount++;
4444 mp_so->lock_lr[mp_so->next_lock_lr] = lr_saved;
4445 mp_so->next_lock_lr = (mp_so->next_lock_lr + 1) % SO_LCKDBG_MAX;
4446
4447 return (0);
4448}
4449
4450/*
4451 * Protocol pr_unlock callback.
4452 */
4453int
4454mptcp_unlock(struct socket *mp_so, int refcount, void *lr)
4455{
4456 struct mppcb *mpp = mpsotomppcb(mp_so);
4457 void *lr_saved;
4458
4459 if (lr == NULL)
4460 lr_saved = __builtin_return_address(0);
4461 else
4462 lr_saved = lr;
4463
4464 if (mpp == NULL) {
4465 panic("%s: so=%p NO PCB usecount=%x lr=%p lrh= %s\n", __func__,
4466 mp_so, mp_so->so_usecount, lr_saved,
4467 solockhistory_nr(mp_so));
4468 /* NOTREACHED */
4469 }
4470 mpp_lock_assert_held(mpp);
4471
4472 if (refcount != 0)
4473 mp_so->so_usecount--;
4474
4475 if (mp_so->so_usecount < 0) {
4476 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4477 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4478 /* NOTREACHED */
4479 }
4480 mp_so->unlock_lr[mp_so->next_unlock_lr] = lr_saved;
4481 mp_so->next_unlock_lr = (mp_so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
4482 mpp_unlock(mpp);
4483
4484 return (0);
4485}
4486
4487/*
4488 * Protocol pr_getlock callback.
4489 */
4490lck_mtx_t *
4491mptcp_getlock(struct socket *mp_so, int flags)
4492{
4493 struct mppcb *mpp = mpsotomppcb(mp_so);
4494
4495 if (mpp == NULL) {
4496 panic("%s: so=%p NULL so_pcb %s\n", __func__, mp_so,
4497 solockhistory_nr(mp_so));
4498 /* NOTREACHED */
4499 }
4500 if (mp_so->so_usecount < 0) {
4501 panic("%s: so=%p usecount=%x lrh= %s\n", __func__,
4502 mp_so, mp_so->so_usecount, solockhistory_nr(mp_so));
4503 /* NOTREACHED */
4504 }
4505 return (mpp_getlock(mpp, flags));
4506}
4507
4508/*
4509 * MPTCP Join support
4510 */
4511
4512static void
4513mptcp_attach_to_subf(struct socket *so, struct mptcb *mp_tp,
4514 uint8_t addr_id)
4515{
4516 struct tcpcb *tp = sototcpcb(so);
4517 struct mptcp_subf_auth_entry *sauth_entry;
4518 mpte_lock_assert_held(mp_tp->mpt_mpte);
4519
4520 /*
4521 * The address ID of the first flow is implicitly 0.
4522 */
4523 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
4524 tp->t_local_aid = 0;
4525 } else {
4526 tp->t_local_aid = addr_id;
4527 tp->t_mpflags |= (TMPF_PREESTABLISHED | TMPF_JOINED_FLOW);
4528 so->so_flags |= SOF_MP_SEC_SUBFLOW;
4529 }
4530 sauth_entry = zalloc(mpt_subauth_zone);
4531 sauth_entry->msae_laddr_id = tp->t_local_aid;
4532 sauth_entry->msae_raddr_id = 0;
4533 sauth_entry->msae_raddr_rand = 0;
4534try_again:
4535 sauth_entry->msae_laddr_rand = RandomULong();
4536 if (sauth_entry->msae_laddr_rand == 0)
4537 goto try_again;
4538 LIST_INSERT_HEAD(&mp_tp->mpt_subauth_list, sauth_entry, msae_next);
4539}
4540
4541static void
4542mptcp_detach_mptcb_from_subf(struct mptcb *mp_tp, struct socket *so)
4543{
4544 struct mptcp_subf_auth_entry *sauth_entry;
4545 struct tcpcb *tp = NULL;
4546 int found = 0;
4547
4548 tp = sototcpcb(so);
4549 if (tp == NULL)
4550 return;
4551
4552 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4553 if (sauth_entry->msae_laddr_id == tp->t_local_aid) {
4554 found = 1;
4555 break;
4556 }
4557 }
	if (found) {
		LIST_REMOVE(sauth_entry, msae_next);
		zfree(mpt_subauth_zone, sauth_entry);
	}
4564}
4565
4566void
4567mptcp_get_rands(mptcp_addr_id addr_id, struct mptcb *mp_tp, u_int32_t *lrand,
4568 u_int32_t *rrand)
4569{
4570 struct mptcp_subf_auth_entry *sauth_entry;
4571 mpte_lock_assert_held(mp_tp->mpt_mpte);
4572
4573 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4574 if (sauth_entry->msae_laddr_id == addr_id) {
4575 if (lrand)
4576 *lrand = sauth_entry->msae_laddr_rand;
4577 if (rrand)
4578 *rrand = sauth_entry->msae_raddr_rand;
4579 break;
4580 }
4581 }
4582}
4583
4584void
4585mptcp_set_raddr_rand(mptcp_addr_id laddr_id, struct mptcb *mp_tp,
4586 mptcp_addr_id raddr_id, u_int32_t raddr_rand)
4587{
4588 struct mptcp_subf_auth_entry *sauth_entry;
4589 mpte_lock_assert_held(mp_tp->mpt_mpte);
4590
4591 LIST_FOREACH(sauth_entry, &mp_tp->mpt_subauth_list, msae_next) {
4592 if (sauth_entry->msae_laddr_id == laddr_id) {
4593 if ((sauth_entry->msae_raddr_id != 0) &&
4594 (sauth_entry->msae_raddr_id != raddr_id)) {
4595 mptcplog((LOG_ERR, "MPTCP Socket: %s mismatched"
4596 " address ids %d %d \n", __func__, raddr_id,
4597 sauth_entry->msae_raddr_id),
4598 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4599 return;
4600 }
4601 sauth_entry->msae_raddr_id = raddr_id;
4602 if ((sauth_entry->msae_raddr_rand != 0) &&
4603 (sauth_entry->msae_raddr_rand != raddr_rand)) {
4604 mptcplog((LOG_ERR, "MPTCP Socket: "
4605 "%s: dup SYN_ACK %d %d \n",
4606 __func__, raddr_rand,
4607 sauth_entry->msae_raddr_rand),
4608 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
4609 return;
4610 }
4611 sauth_entry->msae_raddr_rand = raddr_rand;
4612 return;
4613 }
4614 }
4615}
4616
4617/*
4618 * SHA1 support for MPTCP
4619 */
4620static void
4621mptcp_do_sha1(mptcp_key_t *key, char *sha_digest)
4622{
4623 SHA1_CTX sha1ctxt;
4624 const unsigned char *sha1_base;
4625 int sha1_size;
4626
4627 sha1_base = (const unsigned char *) key;
4628 sha1_size = sizeof (mptcp_key_t);
4629 SHA1Init(&sha1ctxt);
4630 SHA1Update(&sha1ctxt, sha1_base, sha1_size);
4631 SHA1Final(sha_digest, &sha1ctxt);
4632}
4633
4634void
4635mptcp_hmac_sha1(mptcp_key_t key1, mptcp_key_t key2,
4636 u_int32_t rand1, u_int32_t rand2, u_char *digest)
4637{
4638 SHA1_CTX sha1ctxt;
4639 mptcp_key_t key_ipad[8] = {0}; /* key XOR'd with inner pad */
4640 mptcp_key_t key_opad[8] = {0}; /* key XOR'd with outer pad */
4641 u_int32_t data[2];
4642 int i;
4643
4644 bzero(digest, SHA1_RESULTLEN);
4645
4646 /* Set up the Key for HMAC */
4647 key_ipad[0] = key1;
4648 key_ipad[1] = key2;
4649
4650 key_opad[0] = key1;
4651 key_opad[1] = key2;
4652
4653 /* Set up the message for HMAC */
4654 data[0] = rand1;
4655 data[1] = rand2;
4656
	/* The key fits within one 512-bit block, so no need to hash it first */
4658
4659 /* Compute SHA1(Key XOR opad, SHA1(Key XOR ipad, data)) */
4660
4661 for (i = 0; i < 8; i++) {
4662 key_ipad[i] ^= 0x3636363636363636;
4663 key_opad[i] ^= 0x5c5c5c5c5c5c5c5c;
4664 }
4665
4666 /* Perform inner SHA1 */
4667 SHA1Init(&sha1ctxt);
4668 SHA1Update(&sha1ctxt, (unsigned char *)key_ipad, sizeof (key_ipad));
4669 SHA1Update(&sha1ctxt, (unsigned char *)data, sizeof (data));
4670 SHA1Final(digest, &sha1ctxt);
4671
4672 /* Perform outer SHA1 */
4673 SHA1Init(&sha1ctxt);
4674 SHA1Update(&sha1ctxt, (unsigned char *)key_opad, sizeof (key_opad));
4675 SHA1Update(&sha1ctxt, (unsigned char *)digest, SHA1_RESULTLEN);
4676 SHA1Final(digest, &sha1ctxt);
4677}
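
/*
 * For reference, the above is the standard HMAC construction (RFC 2104)
 * instantiated with SHA1, with K = key1 || key2 zero-padded to the
 * 64-byte SHA1 block size and m = rand1 || rand2:
 *
 *	HMAC(K, m) = SHA1((K ^ opad) || SHA1((K ^ ipad) || m))
 *
 * where ipad is the block-sized pad of 0x36 bytes and opad the pad of
 * 0x5c bytes.  Because K (16 bytes) is already shorter than one block,
 * the usual "hash the key first if longer than a block" step is skipped.
 */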
4678
4679/*
4680 * corresponds to MAC-B = MAC (Key=(Key-B+Key-A), Msg=(R-B+R-A))
4681 * corresponds to MAC-A = MAC (Key=(Key-A+Key-B), Msg=(R-A+R-B))
4682 */
4683void
4684mptcp_get_hmac(mptcp_addr_id aid, struct mptcb *mp_tp, u_char *digest)
4685{
4686 uint32_t lrand, rrand;
4687
4688 mpte_lock_assert_held(mp_tp->mpt_mpte);
4689
4690 lrand = rrand = 0;
4691 mptcp_get_rands(aid, mp_tp, &lrand, &rrand);
4692 mptcp_hmac_sha1(mp_tp->mpt_localkey, mp_tp->mpt_remotekey, lrand, rrand,
4693 digest);
4694}
4695
4696/*
4697 * Authentication data generation
4698 */
4699static void
4700mptcp_generate_token(char *sha_digest, int sha_digest_len, caddr_t token,
4701 int token_len)
4702{
4703 VERIFY(token_len == sizeof (u_int32_t));
4704 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4705
4706 /* Most significant 32 bits of the SHA1 hash */
4707 bcopy(sha_digest, token, sizeof (u_int32_t));
4708 return;
4709}
4710
4711static void
4712mptcp_generate_idsn(char *sha_digest, int sha_digest_len, caddr_t idsn,
4713 int idsn_len)
4714{
4715 VERIFY(idsn_len == sizeof (u_int64_t));
4716 VERIFY(sha_digest_len == SHA1_RESULTLEN);
4717
4718 /*
4719 * Least significant 64 bits of the SHA1 hash
4720 */
4721
4722 idsn[7] = sha_digest[12];
4723 idsn[6] = sha_digest[13];
4724 idsn[5] = sha_digest[14];
4725 idsn[4] = sha_digest[15];
4726 idsn[3] = sha_digest[16];
4727 idsn[2] = sha_digest[17];
4728 idsn[1] = sha_digest[18];
4729 idsn[0] = sha_digest[19];
4730 return;
4731}
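
/*
 * Illustration (hypothetical digest, for exposition only): given the
 * 20-byte SHA1 digest d[0..19] of a key, the two routines above extract
 *
 *	token      = d[0..3]          (most significant 32 bits)
 *	idsn[0..7] = d[19] .. d[12]   (least significant 64 bits)
 *
 * The byte reversal means that on a little-endian machine the resulting
 * 64-bit IDSN equals the last 8 bytes of the digest read in big-endian
 * order, matching the token/IDSN derivation of RFC 6824.
 */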
4732
4733static void
4734mptcp_conn_properties(struct mptcb *mp_tp)
4735{
4736 /* There is only Version 0 at this time */
4737 mp_tp->mpt_version = MPTCP_STD_VERSION_0;
4738
4739 /* Set DSS checksum flag */
4740 if (mptcp_dss_csum)
4741 mp_tp->mpt_flags |= MPTCPF_CHECKSUM;
4742
4743 /* Set up receive window */
4744 mp_tp->mpt_rcvwnd = mptcp_sbspace(mp_tp);
4745
4746 /* Set up gc ticks */
4747 mp_tp->mpt_gc_ticks = MPT_GC_TICKS;
4748}
4749
4750static void
4751mptcp_init_local_parms(struct mptses *mpte)
4752{
4753 struct mptcb *mp_tp = mpte->mpte_mptcb;
4754 char key_digest[SHA1_RESULTLEN];
4755
4756 read_frandom(&mp_tp->mpt_localkey, sizeof(mp_tp->mpt_localkey));
4757 mptcp_do_sha1(&mp_tp->mpt_localkey, key_digest);
4758
4759 mptcp_generate_token(key_digest, SHA1_RESULTLEN,
4760 (caddr_t)&mp_tp->mpt_localtoken, sizeof (mp_tp->mpt_localtoken));
4761 mptcp_generate_idsn(key_digest, SHA1_RESULTLEN,
4762 (caddr_t)&mp_tp->mpt_local_idsn, sizeof (u_int64_t));
4763
4764 /* The subflow SYN is also first MPTCP byte */
4765 mp_tp->mpt_snduna = mp_tp->mpt_sndmax = mp_tp->mpt_local_idsn + 1;
4766 mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
4767
4768 mptcp_conn_properties(mp_tp);
4769}
4770
4771int
4772mptcp_init_remote_parms(struct mptcb *mp_tp)
4773{
4774 char remote_digest[SHA1_RESULTLEN];
4775 mpte_lock_assert_held(mp_tp->mpt_mpte);
4776
4777 /* Only Version 0 is supported for auth purposes */
4778 if (mp_tp->mpt_version != MPTCP_STD_VERSION_0)
4779 return (-1);
4780
4781 /* Setup local and remote tokens and Initial DSNs */
4782 mptcp_do_sha1(&mp_tp->mpt_remotekey, remote_digest);
4783 mptcp_generate_token(remote_digest, SHA1_RESULTLEN,
4784 (caddr_t)&mp_tp->mpt_remotetoken, sizeof (mp_tp->mpt_remotetoken));
4785 mptcp_generate_idsn(remote_digest, SHA1_RESULTLEN,
4786 (caddr_t)&mp_tp->mpt_remote_idsn, sizeof (u_int64_t));
4787 mp_tp->mpt_rcvnxt = mp_tp->mpt_remote_idsn + 1;
4788
4789 return (0);
4790}
4791
4792static void
4793mptcp_send_dfin(struct socket *so)
4794{
4795 struct tcpcb *tp = NULL;
4796 struct inpcb *inp = NULL;
4797
4798 inp = sotoinpcb(so);
4799 if (!inp)
4800 return;
4801
4802 tp = intotcpcb(inp);
4803 if (!tp)
4804 return;
4805
4806 if (!(tp->t_mpflags & TMPF_RESET))
4807 tp->t_mpflags |= TMPF_SEND_DFIN;
4808}
4809
4810/*
4811 * Data Sequence Mapping routines
4812 */
4813void
4814mptcp_insert_dsn(struct mppcb *mpp, struct mbuf *m)
4815{
4816 struct mptcb *mp_tp;
4817
4818 if (m == NULL)
4819 return;
4820
4821 __IGNORE_WCASTALIGN(mp_tp = &((struct mpp_mtp *)mpp)->mtcb);
4822 mpte_lock_assert_held(mp_tp->mpt_mpte);
4823
4824 while (m) {
4825 VERIFY(m->m_flags & M_PKTHDR);
4826 m->m_pkthdr.pkt_flags |= (PKTF_MPTCP | PKTF_MPSO);
4827 m->m_pkthdr.mp_dsn = mp_tp->mpt_sndmax;
4828 m->m_pkthdr.mp_rlen = m_pktlen(m);
4829 mp_tp->mpt_sndmax += m_pktlen(m);
4830 m = m->m_next;
4831 }
4832}
4833
4834void
4835mptcp_fallback_sbdrop(struct socket *so, struct mbuf *m, int len)
4836{
4837 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
4838 uint64_t data_ack;
4839 uint64_t dsn;
4840
4841 if (!m || len == 0)
4842 return;
4843
4844 while (m && len > 0) {
4845 VERIFY(m->m_flags & M_PKTHDR);
4846 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4847
4848 data_ack = m->m_pkthdr.mp_dsn + m->m_pkthdr.mp_rlen;
4849 dsn = m->m_pkthdr.mp_dsn;
4850
4851 len -= m->m_len;
4852 m = m->m_next;
4853 }
4854
4855 if (m && len == 0) {
4856 /*
4857 * If there is one more mbuf in the chain, it automatically means
4858 * that up to m->mp_dsn has been ack'ed.
4859 *
		 * This means we actually correct data_ack back down (compared
4861 * to what we set inside the loop - dsn + data_len). Because in
4862 * the loop we are "optimistic" and assume that the full mapping
4863 * will be acked. If that's not the case and we get out of the
4864 * loop with m != NULL, it means only up to m->mp_dsn has been
4865 * really acked.
4866 */
4867 data_ack = m->m_pkthdr.mp_dsn;
4868 }
4869
4870 if (len < 0) {
4871 /*
4872 * If len is negative, meaning we acked in the middle of an mbuf,
4873 * only up to this mbuf's data-sequence number has been acked
4874 * at the MPTCP-level.
4875 */
4876 data_ack = dsn;
4877 }
4878
4879 mptcplog((LOG_DEBUG, "%s inferred ack up to %u\n", __func__, (uint32_t)data_ack),
4880 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
4881 mptcp_data_ack_rcvd(mp_tp, sototcpcb(so), data_ack);
4882}
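
/*
 * Worked example for the inference above (hypothetical numbers): assume
 * two mappings {dsn 100, rlen 50} and {dsn 150, rlen 50}, one per mbuf,
 * with m_len == mp_rlen.  For len = 50, the loop exits with m != NULL and
 * len == 0, so data_ack is corrected down to m->mp_dsn = 150, which is
 * exactly the end of the first mapping.  For len = 70, the loop exits
 * inside the second mapping with len < 0, so data_ack falls back to that
 * mapping's dsn = 150: a partially-acked mapping is never counted.
 */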
4883
4884void
4885mptcp_preproc_sbdrop(struct socket *so, struct mbuf *m, unsigned int len)
4886{
4887 int rewinding = 0;
4888
4889 /* TFO makes things complicated. */
4890 if (so->so_flags1 & SOF1_TFO_REWIND) {
4891 rewinding = 1;
4892 so->so_flags1 &= ~SOF1_TFO_REWIND;
4893 }
4894
4895 while (m && (!(so->so_flags & SOF_MP_SUBFLOW) || rewinding)) {
4896 u_int32_t sub_len;
4897 VERIFY(m->m_flags & M_PKTHDR);
4898 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4899
4900 sub_len = m->m_pkthdr.mp_rlen;
4901
4902 if (sub_len < len) {
4903 m->m_pkthdr.mp_dsn += sub_len;
4904 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4905 m->m_pkthdr.mp_rseq += sub_len;
4906 }
4907 m->m_pkthdr.mp_rlen = 0;
4908 len -= sub_len;
4909 } else {
4910 /* sub_len >= len */
4911 if (rewinding == 0)
4912 m->m_pkthdr.mp_dsn += len;
4913 if (!(m->m_pkthdr.pkt_flags & PKTF_MPSO)) {
4914 if (rewinding == 0)
4915 m->m_pkthdr.mp_rseq += len;
4916 }
4917 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u len %d %d\n",
4918 __func__, (u_int32_t)m->m_pkthdr.mp_dsn,
4919 m->m_pkthdr.mp_rseq, m->m_pkthdr.mp_rlen, len),
4920 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4921 m->m_pkthdr.mp_rlen -= len;
4922 break;
4923 }
4924 m = m->m_next;
4925 }
4926
4927 if (so->so_flags & SOF_MP_SUBFLOW &&
4928 !(sototcpcb(so)->t_mpflags & TMPF_TFO_REQUEST) &&
4929 !(sototcpcb(so)->t_mpflags & TMPF_RCVD_DACK)) {
4930 /*
4931 * Received an ack without receiving a DATA_ACK.
4932 * Need to fallback to regular TCP (or destroy this subflow).
4933 */
4934 sototcpcb(so)->t_mpflags |= TMPF_INFIN_SENT;
4935 mptcp_notify_mpfail(so);
4936 }
4937}
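
/*
 * Worked example (hypothetical numbers): dropping len = 120 from a chain
 * whose first mapping has mp_rlen = 100 first exhausts that mapping
 * (mp_dsn += 100, mp_rseq += 100, mp_rlen = 0, len becomes 20), then
 * trims the following mapping in place (mp_dsn += 20, mp_rseq += 20,
 * mp_rlen -= 20), so the first remaining mapping always starts at the
 * first byte still held in the send buffer.
 */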
4938
4939/* Obtain the DSN mapping stored in the mbuf */
4940void
4941mptcp_output_getm_dsnmap32(struct socket *so, int off,
4942 uint32_t *dsn, uint32_t *relseq, uint16_t *data_len, uint16_t *dss_csum)
4943{
4944 u_int64_t dsn64;
4945
4946 mptcp_output_getm_dsnmap64(so, off, &dsn64, relseq, data_len, dss_csum);
4947 *dsn = (u_int32_t)MPTCP_DATASEQ_LOW32(dsn64);
4948}
4949
4950void
4951mptcp_output_getm_dsnmap64(struct socket *so, int off, uint64_t *dsn,
4952 uint32_t *relseq, uint16_t *data_len,
4953 uint16_t *dss_csum)
4954{
4955 struct mbuf *m = so->so_snd.sb_mb;
4956 int off_orig = off;
4957
4958 VERIFY(off >= 0);
4959
4960 /*
4961 * In the subflow socket, the DSN sequencing can be discontiguous,
4962 * but the subflow sequence mapping is contiguous. Use the subflow
4963 * sequence property to find the right mbuf and corresponding dsn
4964 * mapping.
4965 */
4966
4967 while (m) {
4968 VERIFY(m->m_flags & M_PKTHDR);
4969 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
4970
4971 if (off >= m->m_len) {
4972 off -= m->m_len;
4973 m = m->m_next;
4974 } else {
4975 break;
4976 }
4977 }
4978
4979 VERIFY(m);
4980 VERIFY(off >= 0);
4981 VERIFY(m->m_pkthdr.mp_rlen <= UINT16_MAX);
4982
4983 *dsn = m->m_pkthdr.mp_dsn;
4984 *relseq = m->m_pkthdr.mp_rseq;
4985 *data_len = m->m_pkthdr.mp_rlen;
4986 *dss_csum = m->m_pkthdr.mp_csum;
4987
4988 mptcplog((LOG_DEBUG, "%s: dsn %u ssn %u data_len %d off %d off_orig %d\n",
4989 __func__, (u_int32_t)(*dsn), *relseq, *data_len, off, off_orig),
4990 MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
4991}
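
/*
 * Example (hypothetical sizes): with sb_mb holding mbufs of 1448 bytes
 * each, a caller passing off = 3000 walks past the first two mbufs
 * (off becomes 1552, then 104) and returns the mapping stored in the
 * third mbuf's pkthdr.  Consecutive mappings need not carry contiguous
 * DSNs; only the subflow-relative sequence space is contiguous, which
 * is why the walk is driven by m_len rather than by the DSN values.
 */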
4992
4993/*
 * Note that this is called only from tcp_input() via mptcp_input_preproc().
 * tcp_input() may trim data after the dsn mapping is inserted into the mbuf.
 * When it trims data, tcp_input() calls m_adj(), which does not remove the
 * m_pkthdr even if m_len becomes 0 as a result of the trimming.
 * The dsn map insertion cannot be delayed until after the trim, because data
 * can sit in the reassembly queue for a while and the DSN option info in tp
 * is overwritten for every new packet received.
 * The dsn map will be adjusted just prior to appending to the subflow
 * sockbuf, via mptcp_adj_rmap().
5003 */
5004void
5005mptcp_insert_rmap(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th)
5006{
5007 VERIFY(m->m_flags & M_PKTHDR);
5008 VERIFY(!(m->m_pkthdr.pkt_flags & PKTF_MPTCP));
5009
5010 if (tp->t_mpflags & TMPF_EMBED_DSN) {
5011 m->m_pkthdr.mp_dsn = tp->t_rcv_map.mpt_dsn;
5012 m->m_pkthdr.mp_rseq = tp->t_rcv_map.mpt_sseq;
5013 m->m_pkthdr.mp_rlen = tp->t_rcv_map.mpt_len;
5014 m->m_pkthdr.mp_csum = tp->t_rcv_map.mpt_csum;
5015 if (tp->t_rcv_map.mpt_dfin)
5016 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5017
5018 m->m_pkthdr.pkt_flags |= PKTF_MPTCP;
5019
5020 tp->t_mpflags &= ~TMPF_EMBED_DSN;
5021 tp->t_mpflags |= TMPF_MPTCP_ACKNOW;
5022 } else if (tp->t_mpflags & TMPF_TCP_FALLBACK) {
5023 if (th->th_flags & TH_FIN)
5024 m->m_pkthdr.pkt_flags |= PKTF_MPTCP_DFIN;
5025 }
5026}
5027
5028int
5029mptcp_adj_rmap(struct socket *so, struct mbuf *m, int off, uint64_t dsn,
5030 uint32_t rseq, uint16_t dlen)
5031{
5032 struct mptsub *mpts = sototcpcb(so)->t_mpsub;
5033
5034 if (m_pktlen(m) == 0)
5035 return (0);
5036
5037 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
5038 if (off && (dsn != m->m_pkthdr.mp_dsn ||
5039 rseq != m->m_pkthdr.mp_rseq ||
5040 dlen != m->m_pkthdr.mp_rlen)) {
5041 mptcplog((LOG_ERR, "%s: Received incorrect second mapping: %llu - %llu , %u - %u, %u - %u\n",
5042 __func__, dsn, m->m_pkthdr.mp_dsn,
5043 rseq, m->m_pkthdr.mp_rseq,
5044 dlen, m->m_pkthdr.mp_rlen),
5045 MPTCP_RECEIVER_DBG, MPTCP_LOGLVL_ERR);
5046 return (-1);
5047 }
5048 m->m_pkthdr.mp_dsn += off;
5049 m->m_pkthdr.mp_rseq += off;
5050 m->m_pkthdr.mp_rlen = m->m_pkthdr.len;
5051 } else {
5052 if (!(mpts->mpts_flags & MPTSF_CONFIRMED)) {
			/* data arrived without a DSS option mapping */
5054
5055 /* initial subflow can fallback right after SYN handshake */
5056 mptcp_notify_mpfail(so);
5057 }
5058 }
5059
5060 mpts->mpts_flags |= MPTSF_CONFIRMED;
5061
5062 return (0);
5063}
5064
5065/*
 * The following routines help with failure detection and failover of data
5067 * transfer from one subflow to another.
5068 */
5069void
5070mptcp_act_on_txfail(struct socket *so)
5071{
5072 struct tcpcb *tp = NULL;
5073 struct inpcb *inp = sotoinpcb(so);
5074
5075 if (inp == NULL)
5076 return;
5077
5078 tp = intotcpcb(inp);
5079 if (tp == NULL)
5080 return;
5081
5082 if (so->so_flags & SOF_MP_TRYFAILOVER)
5083 return;
5084
5085 so->so_flags |= SOF_MP_TRYFAILOVER;
5086 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPFAILOVER));
5087}
5088
5089/*
5090 * Support for MP_FAIL option
5091 */
5092int
5093mptcp_get_map_for_dsn(struct socket *so, u_int64_t dsn_fail, u_int32_t *tcp_seq)
5094{
5095 struct mbuf *m = so->so_snd.sb_mb;
5096 u_int64_t dsn;
5097 int off = 0;
5098 u_int32_t datalen;
5099
5100 if (m == NULL)
5101 return (-1);
5102
5103 while (m != NULL) {
5104 VERIFY(m->m_pkthdr.pkt_flags & PKTF_MPTCP);
5105 VERIFY(m->m_flags & M_PKTHDR);
5106 dsn = m->m_pkthdr.mp_dsn;
5107 datalen = m->m_pkthdr.mp_rlen;
5108 if (MPTCP_SEQ_LEQ(dsn, dsn_fail) &&
5109 (MPTCP_SEQ_GEQ(dsn + datalen, dsn_fail))) {
5110 off = dsn_fail - dsn;
5111 *tcp_seq = m->m_pkthdr.mp_rseq + off;
5112 mptcplog((LOG_DEBUG, "%s: %llu %llu \n", __func__, dsn,
5113 dsn_fail), MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5114 return (0);
5115 }
5116
5117 m = m->m_next;
5118 }
5119
5120 /*
5121 * If there was no mbuf data and a fallback to TCP occurred, there's
5122 * not much else to do.
5123 */
5124
5125 mptcplog((LOG_ERR, "MPTCP Sender: "
5126 "%s: %llu not found \n", __func__, dsn_fail),
5127 MPTCP_SENDER_DBG, MPTCP_LOGLVL_LOG);
5128 return (-1);
5129}
5130
5131/*
5132 * Support for sending contiguous MPTCP bytes in subflow
5133 * Also for preventing sending data with ACK in 3-way handshake
5134 */
5135int32_t
5136mptcp_adj_sendlen(struct socket *so, int32_t off)
5137{
5138 struct tcpcb *tp = sototcpcb(so);
5139 struct mptsub *mpts = tp->t_mpsub;
5140 uint64_t mdss_dsn;
5141 uint32_t mdss_subflow_seq;
5142 int mdss_subflow_off;
5143 uint16_t mdss_data_len;
5144 uint16_t dss_csum;
5145
5146 mptcp_output_getm_dsnmap64(so, off, &mdss_dsn, &mdss_subflow_seq,
5147 &mdss_data_len, &dss_csum);
5148
5149 /*
5150 * We need to compute how much of the mapping still remains.
5151 * So, we compute the offset in the send-buffer of the dss-sub-seq.
5152 */
5153 mdss_subflow_off = (mdss_subflow_seq + mpts->mpts_iss) - tp->snd_una;
5154
5155 /*
	 * When TFO is used, we send at mpts->mpts_iss although the relative
	 * sequence number has been set to 1 (while it should be 0).
5158 */
5159 if (tp->t_mpflags & TMPF_TFO_REQUEST)
5160 mdss_subflow_off--;
5161
5162 if (off < mdss_subflow_off)
5163 printf("%s off %d mdss_subflow_off %d mdss_subflow_seq %u iss %u suna %u\n", __func__,
5164 off, mdss_subflow_off, mdss_subflow_seq, mpts->mpts_iss, tp->snd_una);
5165 VERIFY(off >= mdss_subflow_off);
5166
5167 mptcplog((LOG_DEBUG, "%s dlen %u off %d sub_off %d sub_seq %u iss %u suna %u\n",
5168 __func__, mdss_data_len, off, mdss_subflow_off, mdss_subflow_seq,
5169 mpts->mpts_iss, tp->snd_una), MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
5170 return (mdss_data_len - (off - mdss_subflow_off));
5171}
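
/*
 * Example (hypothetical numbers): with mpts_iss = 1000, snd_una = 1040
 * and a mapping of mdss_subflow_seq = 40, mdss_data_len = 100, the
 * mapping starts at send-buffer offset (40 + 1000) - 1040 = 0.  For
 * off = 30, the routine returns 100 - (30 - 0) = 70: at most the 70
 * bytes remaining in this mapping may be sent at once, which keeps every
 * transmitted segment within a single DSS mapping.
 */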
5172
5173static uint32_t
5174mptcp_get_maxseg(struct mptses *mpte)
5175{
5176 struct mptsub *mpts;
5177 uint32_t maxseg = 0;
5178
5179 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5180 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5181
5182 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5183 TCPS_HAVERCVDFIN2(tp->t_state))
5184 continue;
5185
5186 if (tp->t_maxseg > maxseg)
5187 maxseg = tp->t_maxseg;
5188 }
5189
5190 return (maxseg);
5191}
5192
5193static uint8_t
5194mptcp_get_rcvscale(struct mptses *mpte)
5195{
5196 struct mptsub *mpts;
5197 uint8_t rcvscale = UINT8_MAX;
5198
5199 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5200 struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
5201
5202 if (!TCPS_HAVEESTABLISHED(tp->t_state) ||
5203 TCPS_HAVERCVDFIN2(tp->t_state))
5204 continue;
5205
5206 if (tp->rcv_scale < rcvscale)
5207 rcvscale = tp->rcv_scale;
5208 }
5209
5210 return (rcvscale);
5211}
5212
5213/* Similar to tcp_sbrcv_reserve */
5214static void
5215mptcp_sbrcv_reserve(struct mptcb *mp_tp, struct sockbuf *sbrcv,
5216 u_int32_t newsize, u_int32_t idealsize)
5217{
5218 uint8_t rcvscale = mptcp_get_rcvscale(mp_tp->mpt_mpte);
5219
5220 /* newsize should not exceed max */
5221 newsize = min(newsize, tcp_autorcvbuf_max);
5222
5223 /* The receive window scale negotiated at the
5224 * beginning of the connection will also set a
5225 * limit on the socket buffer size
5226 */
5227 newsize = min(newsize, TCP_MAXWIN << rcvscale);
5228
5229 /* Set new socket buffer size */
5230 if (newsize > sbrcv->sb_hiwat &&
5231 (sbreserve(sbrcv, newsize) == 1)) {
5232 sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
5233 (idealsize != 0) ? idealsize : newsize), tcp_autorcvbuf_max);
5234
5235 /* Again check the limit set by the advertised
5236 * window scale
5237 */
5238 sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
5239 TCP_MAXWIN << rcvscale);
5240 }
5241}
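
/*
 * Example: a receive-window scale of 5 caps the advertised window at
 * TCP_MAXWIN << 5 = 65535 * 32 (about 2 MB), so newsize is clamped to
 * that even when tcp_autorcvbuf_max is larger.  mptcp_get_rcvscale()
 * returns the minimum scale across established subflows, so the shared
 * MPTCP receive window remains advertisable on any of them.
 */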
5242
5243void
5244mptcp_sbrcv_grow(struct mptcb *mp_tp)
5245{
5246 struct mptses *mpte = mp_tp->mpt_mpte;
5247 struct socket *mp_so = mpte->mpte_mppcb->mpp_socket;
5248 struct sockbuf *sbrcv = &mp_so->so_rcv;
5249 uint32_t hiwat_sum = 0;
5250 uint32_t ideal_sum = 0;
5251 struct mptsub *mpts;
5252
5253 /*
5254 * Do not grow the receive socket buffer if
5255 * - auto resizing is disabled, globally or on this socket
5256 * - the high water mark already reached the maximum
5257 * - the stream is in background and receive side is being
5258 * throttled
5259 * - if there are segments in reassembly queue indicating loss,
5260 * do not need to increase recv window during recovery as more
5261 * data is not going to be sent. A duplicate ack sent during
5262 * recovery should not change the receive window
5263 */
5264 if (tcp_do_autorcvbuf == 0 ||
5265 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
5266 tcp_cansbgrow(sbrcv) == 0 ||
5267 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
5268 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
5269 !LIST_EMPTY(&mp_tp->mpt_segq)) {
		/* Cannot resize the socket buffer, just return */
5271 return;
5272 }
5273
5274 /*
5275 * Ideally, we want the rbuf to be (sum_i {bw_i} * rtt_max * 2)
5276 *
5277 * But, for this we first need accurate receiver-RTT estimations, which
5278 * we currently don't have.
5279 *
5280 * Let's use a dummy algorithm for now, just taking the sum of all
5281 * subflow's receive-buffers. It's too low, but that's all we can get
5282 * for now.
5283 */
5284
5285 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
5286 hiwat_sum += mpts->mpts_socket->so_rcv.sb_hiwat;
5287 ideal_sum += mpts->mpts_socket->so_rcv.sb_idealsize;
5288 }
5289
5290 mptcp_sbrcv_reserve(mp_tp, sbrcv, hiwat_sum, ideal_sum);
5291}
5292
5293/*
 * Determine if we can grow the receive socket buffer to avoid sending
5295 * a zero window update to the peer. We allow even socket buffers that
5296 * have fixed size (set by the application) to grow if the resource
5297 * constraints are met. They will also be trimmed after the application
5298 * reads data.
5299 *
5300 * Similar to tcp_sbrcv_grow_rwin
5301 */
5302static void
5303mptcp_sbrcv_grow_rwin(struct mptcb *mp_tp, struct sockbuf *sb)
5304{
5305 struct socket *mp_so = mp_tp->mpt_mpte->mpte_mppcb->mpp_socket;
5306 u_int32_t rcvbufinc = mptcp_get_maxseg(mp_tp->mpt_mpte) << 4;
5307 u_int32_t rcvbuf = sb->sb_hiwat;
5308
5309 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(mp_so))
5310 return;
5311
5312 if (tcp_do_autorcvbuf == 1 &&
5313 tcp_cansbgrow(sb) &&
5314 /* Diff to tcp_sbrcv_grow_rwin */
5315 (mp_so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
5316 (rcvbuf - sb->sb_cc) < rcvbufinc &&
5317 rcvbuf < tcp_autorcvbuf_max &&
5318 (sb->sb_idealsize > 0 &&
5319 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
5320 sbreserve(sb, min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
5321 }
5322}
5323
5324/* Similar to tcp_sbspace */
5325int32_t
5326mptcp_sbspace(struct mptcb *mp_tp)
5327{
5328 struct sockbuf *sb = &mp_tp->mpt_mpte->mpte_mppcb->mpp_socket->so_rcv;
5329 uint32_t rcvbuf;
5330 int32_t space;
5331 int32_t pending = 0;
5332
5333 mpte_lock_assert_held(mp_tp->mpt_mpte);
5334
5335 mptcp_sbrcv_grow_rwin(mp_tp, sb);
5336
5337 /* hiwat might have changed */
5338 rcvbuf = sb->sb_hiwat;
5339
5340 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
5341 (sb->sb_mbmax - sb->sb_mbcnt)));
5342 if (space < 0)
5343 space = 0;
5344
5345#if CONTENT_FILTER
5346 /* Compensate for data being processed by content filters */
5347 pending = cfil_sock_data_space(sb);
5348#endif /* CONTENT_FILTER */
5349 if (pending > space)
5350 space = 0;
5351 else
5352 space -= pending;
5353
5354 return (space);
5355}
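
/*
 * Example (hypothetical numbers): with sb_hiwat = 128 KB, sb_cc = 96 KB,
 * sb_mbmax - sb_mbcnt = 64 KB and 8 KB pending in a content filter, the
 * space reported is min(32 KB, 64 KB) - 8 KB = 24 KB.  The clamping at
 * zero in both steps ensures that at worst a zero window, never a
 * negative one, is advertised to the peer.
 */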
5356
5357/*
5358 * Support Fallback to Regular TCP
5359 */
5360void
5361mptcp_notify_mpready(struct socket *so)
5362{
5363 struct tcpcb *tp = NULL;
5364
5365 if (so == NULL)
5366 return;
5367
5368 tp = intotcpcb(sotoinpcb(so));
5369
5370 if (tp == NULL)
5371 return;
5372
5373 DTRACE_MPTCP4(multipath__ready, struct socket *, so,
5374 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5375 struct tcpcb *, tp);
5376
5377 if (!(tp->t_mpflags & TMPF_MPTCP_TRUE))
5378 return;
5379
5380 if (tp->t_mpflags & TMPF_MPTCP_READY)
5381 return;
5382
5383 tp->t_mpflags &= ~TMPF_TCP_FALLBACK;
5384 tp->t_mpflags |= TMPF_MPTCP_READY;
5385
5386 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5387}
5388
5389void
5390mptcp_notify_mpfail(struct socket *so)
5391{
5392 struct tcpcb *tp = NULL;
5393
5394 if (so == NULL)
5395 return;
5396
5397 tp = intotcpcb(sotoinpcb(so));
5398
5399 if (tp == NULL)
5400 return;
5401
5402 DTRACE_MPTCP4(multipath__failed, struct socket *, so,
5403 struct sockbuf *, &so->so_rcv, struct sockbuf *, &so->so_snd,
5404 struct tcpcb *, tp);
5405
5406 if (tp->t_mpflags & TMPF_TCP_FALLBACK)
5407 return;
5408
5409 tp->t_mpflags &= ~(TMPF_MPTCP_READY|TMPF_MPTCP_TRUE);
5410 tp->t_mpflags |= TMPF_TCP_FALLBACK;
5411
5412 soevent(so, (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MPSTATUS));
5413}
5414
5415/*
5416 * Keepalive helper function
5417 */
5418boolean_t
5419mptcp_ok_to_keepalive(struct mptcb *mp_tp)
5420{
5421 boolean_t ret = 1;
5422 mpte_lock_assert_held(mp_tp->mpt_mpte);
5423
5424 if (mp_tp->mpt_state >= MPTCPS_CLOSE_WAIT) {
5425 ret = 0;
5426 }
5427 return (ret);
5428}
5429
5430/*
5431 * MPTCP t_maxseg adjustment function
5432 */
5433int
5434mptcp_adj_mss(struct tcpcb *tp, boolean_t mtudisc)
5435{
5436 int mss_lower = 0;
5437 struct mptcb *mp_tp = tptomptp(tp);
5438
5439#define MPTCP_COMPUTE_LEN { \
5440 mss_lower = sizeof (struct mptcp_dss_ack_opt); \
5441 if (mp_tp->mpt_flags & MPTCPF_CHECKSUM) \
5442 mss_lower += 2; \
5443 else \
5444 /* adjust to 32-bit boundary + EOL */ \
5445 mss_lower += 2; \
5446}
5447 if (mp_tp == NULL)
5448 return (0);
5449
5450 mpte_lock_assert_held(mp_tp->mpt_mpte);
5451
5452 /*
5453 * For the first subflow and subsequent subflows, adjust mss for
5454 * most common MPTCP option size, for case where tcp_mss is called
5455 * during option processing and MTU discovery.
5456 */
5457 if (!mtudisc) {
5458 if (tp->t_mpflags & TMPF_MPTCP_TRUE &&
5459 !(tp->t_mpflags & TMPF_JOINED_FLOW)) {
5460 MPTCP_COMPUTE_LEN;
5461 }
5462
5463 if (tp->t_mpflags & TMPF_PREESTABLISHED &&
5464 tp->t_mpflags & TMPF_SENT_JOIN) {
5465 MPTCP_COMPUTE_LEN;
5466 }
5467 } else {
5468 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
5469 MPTCP_COMPUTE_LEN;
5470 }
5471 }
5472
5473 return (mss_lower);
5474}
5475
5476/*
5477 * Update the pid, upid, uuid of the subflow so, based on parent so
5478 */
5479void
5480mptcp_update_last_owner(struct socket *so, struct socket *mp_so)
5481{
5482 if (so->last_pid != mp_so->last_pid ||
5483 so->last_upid != mp_so->last_upid) {
5484 so->last_upid = mp_so->last_upid;
5485 so->last_pid = mp_so->last_pid;
5486 uuid_copy(so->last_uuid, mp_so->last_uuid);
5487 }
5488 so_update_policy(so);
5489}
5490
5491static void
5492fill_mptcp_subflow(struct socket *so, mptcp_flow_t *flow, struct mptsub *mpts)
5493{
5494 struct inpcb *inp;
5495
5496 tcp_getconninfo(so, &flow->flow_ci);
5497 inp = sotoinpcb(so);
5498#if INET6
5499 if ((inp->inp_vflag & INP_IPV6) != 0) {
5500 flow->flow_src.ss_family = AF_INET6;
5501 flow->flow_dst.ss_family = AF_INET6;
5502 flow->flow_src.ss_len = sizeof(struct sockaddr_in6);
5503 flow->flow_dst.ss_len = sizeof(struct sockaddr_in6);
5504 SIN6(&flow->flow_src)->sin6_port = inp->in6p_lport;
5505 SIN6(&flow->flow_dst)->sin6_port = inp->in6p_fport;
5506 SIN6(&flow->flow_src)->sin6_addr = inp->in6p_laddr;
5507 SIN6(&flow->flow_dst)->sin6_addr = inp->in6p_faddr;
5508 } else
5509#endif
5510 if ((inp->inp_vflag & INP_IPV4) != 0) {
5511 flow->flow_src.ss_family = AF_INET;
5512 flow->flow_dst.ss_family = AF_INET;
5513 flow->flow_src.ss_len = sizeof(struct sockaddr_in);
5514 flow->flow_dst.ss_len = sizeof(struct sockaddr_in);
5515 SIN(&flow->flow_src)->sin_port = inp->inp_lport;
5516 SIN(&flow->flow_dst)->sin_port = inp->inp_fport;
5517 SIN(&flow->flow_src)->sin_addr = inp->inp_laddr;
5518 SIN(&flow->flow_dst)->sin_addr = inp->inp_faddr;
5519 }
5520 flow->flow_len = sizeof(*flow);
5521 flow->flow_tcpci_offset = offsetof(mptcp_flow_t, flow_ci);
5522 flow->flow_flags = mpts->mpts_flags;
5523 flow->flow_cid = mpts->mpts_connid;
5524 flow->flow_relseq = mpts->mpts_rel_seq;
5525 flow->flow_soerror = mpts->mpts_socket->so_error;
5526 flow->flow_probecnt = mpts->mpts_probecnt;
5527}
5528
5529static int
5530mptcp_pcblist SYSCTL_HANDLER_ARGS
5531{
5532#pragma unused(oidp, arg1, arg2)
5533 int error = 0, f;
5534 size_t len;
5535 struct mppcb *mpp;
5536 struct mptses *mpte;
5537 struct mptcb *mp_tp;
5538 struct mptsub *mpts;
5539 struct socket *so;
5540 conninfo_mptcp_t mptcpci;
5541 mptcp_flow_t *flows = NULL;
5542
5543 if (req->newptr != USER_ADDR_NULL)
5544 return (EPERM);
5545
5546 lck_mtx_lock(&mtcbinfo.mppi_lock);
5547 if (req->oldptr == USER_ADDR_NULL) {
5548 size_t n = mtcbinfo.mppi_count;
5549 lck_mtx_unlock(&mtcbinfo.mppi_lock);
5550 req->oldidx = (n + n/8) * sizeof(conninfo_mptcp_t) +
5551 4 * (n + n/8) * sizeof(mptcp_flow_t);
5552 return (0);
5553 }
5554 TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
5555 flows = NULL;
5556 mpp_lock(mpp);
5557 VERIFY(mpp->mpp_flags & MPP_ATTACHED);
5558 mpte = mptompte(mpp);
5559 VERIFY(mpte != NULL);
5560 mpte_lock_assert_held(mpte);
5561 mp_tp = mpte->mpte_mptcb;
5562 VERIFY(mp_tp != NULL);
5563
5564 bzero(&mptcpci, sizeof(mptcpci));
5565 mptcpci.mptcpci_state = mp_tp->mpt_state;
5566 mptcpci.mptcpci_flags = mp_tp->mpt_flags;
5567 mptcpci.mptcpci_ltoken = mp_tp->mpt_localtoken;
5568 mptcpci.mptcpci_rtoken = mp_tp->mpt_remotetoken;
5569 mptcpci.mptcpci_notsent_lowat = mp_tp->mpt_notsent_lowat;
5570 mptcpci.mptcpci_snduna = mp_tp->mpt_snduna;
5571 mptcpci.mptcpci_sndnxt = mp_tp->mpt_sndnxt;
5572 mptcpci.mptcpci_sndmax = mp_tp->mpt_sndmax;
5573 mptcpci.mptcpci_lidsn = mp_tp->mpt_local_idsn;
5574 mptcpci.mptcpci_sndwnd = mp_tp->mpt_sndwnd;
5575 mptcpci.mptcpci_rcvnxt = mp_tp->mpt_rcvnxt;
5576 mptcpci.mptcpci_rcvatmark = mp_tp->mpt_rcvnxt;
5577 mptcpci.mptcpci_ridsn = mp_tp->mpt_remote_idsn;
5578 mptcpci.mptcpci_rcvwnd = mp_tp->mpt_rcvwnd;
5579
5580 mptcpci.mptcpci_nflows = mpte->mpte_numflows;
5581 mptcpci.mptcpci_mpte_flags = mpte->mpte_flags;
5582 mptcpci.mptcpci_mpte_addrid = mpte->mpte_addrid_last;
5583 mptcpci.mptcpci_flow_offset =
5584 offsetof(conninfo_mptcp_t, mptcpci_flows);
5585
5586 len = sizeof(*flows) * mpte->mpte_numflows;
5587 if (mpte->mpte_numflows != 0) {
5588 flows = _MALLOC(len, M_TEMP, M_WAITOK | M_ZERO);
5589 if (flows == NULL) {
5590 mpp_unlock(mpp);
5591 break;
5592 }
5593 mptcpci.mptcpci_len = sizeof(mptcpci) +
5594 sizeof(*flows) * (mptcpci.mptcpci_nflows - 1);
5595 error = SYSCTL_OUT(req, &mptcpci,
5596 sizeof(mptcpci) - sizeof(mptcp_flow_t));
5597 } else {
5598 mptcpci.mptcpci_len = sizeof(mptcpci);
5599 error = SYSCTL_OUT(req, &mptcpci, sizeof(mptcpci));
5600 }
5601 if (error) {
5602 mpp_unlock(mpp);
5603 FREE(flows, M_TEMP);
5604 break;
5605 }
		f = 0;
		TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
			so = mpts->mpts_socket;
			fill_mptcp_subflow(so, &flows[f], mpts);
			f++;
		}
		mpp_unlock(mpp);
		if (flows) {
			error = SYSCTL_OUT(req, flows, len);
			FREE(flows, M_TEMP);
			if (error)
				break;
		}
	}
	lck_mtx_unlock(&mtcbinfo.mppi_lock);

	return (error);
}

SYSCTL_PROC(_net_inet_mptcp, OID_AUTO, pcblist, CTLFLAG_RD | CTLFLAG_LOCKED,
    0, 0, mptcp_pcblist, "S,conninfo_mptcp_t",
    "List of active MPTCP connections");
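
/*
 * Illustrative sketch (not part of this file): a userland reader would
 * typically size the buffer first and then fetch the records, walking
 * them via mptcpci_len since each record carries a variable number of
 * flows:
 *
 *	size_t len = 0;
 *	if (sysctlbyname("net.inet.mptcp.pcblist", NULL, &len, NULL, 0) == 0 &&
 *	    len != 0) {
 *		void *buf = malloc(len);
 *		if (buf != NULL &&
 *		    sysctlbyname("net.inet.mptcp.pcblist", buf, &len, NULL, 0) == 0) {
 *			// advance record by record using mptcpci_len
 *		}
 *		free(buf);
 *	}
 */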

/*
 * Set notsent lowat mark on the MPTCB
 */
int
mptcp_set_notsent_lowat(struct mptses *mpte, int optval)
{
	struct mptcb *mp_tp = NULL;
	int error = 0;

	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
		mp_tp = mpte->mpte_mptcb;

	if (mp_tp)
		mp_tp->mpt_notsent_lowat = optval;
	else
		error = EINVAL;

	return (error);
}
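
/*
 * Illustrative sketch (not part of this file): an application would
 * normally reach this path through the standard socket option, e.g.
 *
 *	int lowat = 16 * 1024;
 *	setsockopt(mp_fd, IPPROTO_TCP, TCP_NOTSENT_LOWAT,
 *	    &lowat, sizeof(lowat));
 *
 * assuming mp_fd is a PF_MULTIPATH/SOCK_STREAM socket descriptor.
 */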

u_int32_t
mptcp_get_notsent_lowat(struct mptses *mpte)
{
	struct mptcb *mp_tp = NULL;

	if (mpte->mpte_mppcb->mpp_flags & MPP_ATTACHED)
		mp_tp = mpte->mpte_mptcb;

	if (mp_tp)
		return (mp_tp->mpt_notsent_lowat);
	else
		return (0);
}

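/*
 * Returns 1 if the MPTCP socket should be reported writable under the
 * not-sent low-water mark: either the not-yet-sent portion of the send
 * buffer has drained to mpt_notsent_lowat, or an active subflow with
 * Nagle enabled holds less than one maxseg of unsent data.  Returns 0
 * otherwise.
 */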
int
mptcp_notsent_lowat_check(struct socket *so)
{
	struct mptses *mpte;
	struct mppcb *mpp;
	struct mptcb *mp_tp;
	struct mptsub *mpts;

	int notsent = 0;

	mpp = mpsotomppcb(so);
	if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
		return (0);
	}

	mpte = mptompte(mpp);
	mpte_lock_assert_held(mpte);
	mp_tp = mpte->mpte_mptcb;

	notsent = so->so_snd.sb_cc;

	if ((notsent == 0) ||
	    ((notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)) <=
	    mp_tp->mpt_notsent_lowat)) {
		mptcplog((LOG_DEBUG, "MPTCP Sender: "
		    "lowat %d notsent %d actual %d \n",
		    mp_tp->mpt_notsent_lowat, notsent,
		    notsent - (mp_tp->mpt_sndnxt - mp_tp->mpt_snduna)),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
		return (1);
	}

	/*
	 * When Nagle's algorithm is not disabled, it is better to wake up
	 * the client even before there is at least one maxseg of data to
	 * write, so that more data can be coalesced into a full-sized
	 * segment.
	 */
	TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
		int retval = 0;
		if (mpts->mpts_flags & MPTSF_ACTIVE) {
			struct socket *subf_so = mpts->mpts_socket;
			struct tcpcb *tp = intotcpcb(sotoinpcb(subf_so));

			notsent = so->so_snd.sb_cc -
			    (tp->snd_nxt - tp->snd_una);

			if ((tp->t_flags & TF_NODELAY) == 0 &&
			    notsent > 0 && (notsent <= (int)tp->t_maxseg)) {
				retval = 1;
			}
			mptcplog((LOG_DEBUG, "MPTCP Sender: lowat %d notsent %d"
			    " nodelay false \n",
			    mp_tp->mpt_notsent_lowat, notsent),
			    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
			return (retval);
		}
	}
	return (0);
}

/* Using Symptoms Advisory to detect poor Wi-Fi or poor Cell */
static kern_ctl_ref mptcp_kern_ctrl_ref = NULL;
static uint32_t mptcp_kern_skt_inuse = 0;
static uint32_t mptcp_kern_skt_unit;
symptoms_advisory_t mptcp_advisory;

static errno_t
mptcp_symptoms_ctl_connect(kern_ctl_ref kctlref, struct sockaddr_ctl *sac,
    void **unitinfo)
{
#pragma unused(kctlref, sac, unitinfo)

	if (OSIncrementAtomic(&mptcp_kern_skt_inuse) > 0)
		os_log_error(mptcp_log_handle, "%s MPTCP kernel-control socket for Symptoms already open!", __func__);

	mptcp_kern_skt_unit = sac->sc_unit;

	return (0);
}

static void
mptcp_allow_uuid(uuid_t uuid)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		mpp_lock(mpp);

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

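		/*
		 * Match the UUID against the effective UUID when the socket
		 * is delegated, otherwise against the UUID of the last
		 * process to use it; connections owned by other apps are
		 * skipped (uuid_compare() returns non-zero on mismatch).
		 */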
		if (mp_so->so_flags & SOF_DELEGATED &&
		    uuid_compare(uuid, mp_so->e_uuid))
			goto next;
		else if (!(mp_so->so_flags & SOF_DELEGATED) &&
		    uuid_compare(uuid, mp_so->last_uuid))
			goto next;

		mpte->mpte_flags |= MPTE_ACCESS_GRANTED;

		mptcp_check_subflows_and_add(mpte);
		mptcp_remove_subflows(mpte);

		mpte->mpte_flags &= ~MPTE_ACCESS_GRANTED;

next:
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}

static void
mptcp_wifi_status_changed(void)
{
	struct mppcb *mpp;

	/* Iterate over all MPTCP connections */

	lck_mtx_lock(&mtcbinfo.mppi_lock);

	TAILQ_FOREACH(mpp, &mtcbinfo.mppi_pcbs, mpp_entry) {
		struct mptses *mpte;
		struct socket *mp_so;

		mpp_lock(mpp);

		mpte = mpp->mpp_pcbe;
		mp_so = mpp->mpp_socket;

		/* Only handover-mode is purely driven by Symptoms' Wi-Fi status */
		if (mpte->mpte_svctype != MPTCP_SVCTYPE_HANDOVER)
			goto next;

		mptcp_check_subflows_and_add(mpte);
		mptcp_check_subflows_and_remove(mpte);

next:
		mpp_unlock(mpp);
	}

	lck_mtx_unlock(&mtcbinfo.mppi_lock);
}

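/*
 * Ask the Symptoms daemon, over the kernel-control socket, whether the
 * application owning this connection may use its preferred interfaces.
 * The answer comes back asynchronously as a SYMPTOMS_ADVISORY_USEAPP
 * message handled in mptcp_symptoms_ctl_send().
 */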
void
mptcp_ask_symptoms(struct mptses *mpte)
{
	struct mptcp_symptoms_ask_uuid ask;
	struct socket *mp_so;
	struct proc *p;
	int pid, prio, err;

	if (mptcp_kern_skt_unit == 0) {
		os_log_error(mptcp_log_handle, "%s skt_unit is still 0\n", __func__);
		return;
	}

	mp_so = mptetoso(mpte);

	if (mp_so->so_flags & SOF_DELEGATED)
		pid = mp_so->e_pid;
	else
		pid = mp_so->last_pid;

	p = proc_find(pid);
	if (p == PROC_NULL) {
		os_log_error(mptcp_log_handle, "%s Couldn't find proc for pid %u\n", __func__, pid);
		return;
	}

	ask.cmd = MPTCP_SYMPTOMS_ASK_UUID;

	if (mp_so->so_flags & SOF_DELEGATED)
		uuid_copy(ask.uuid, mp_so->e_uuid);
	else
		uuid_copy(ask.uuid, mp_so->last_uuid);

	prio = proc_get_effective_task_policy(proc_task(p), TASK_POLICY_ROLE);

	if (prio == TASK_BACKGROUND_APPLICATION)
		ask.priority = MPTCP_SYMPTOMS_BACKGROUND;
	else if (prio == TASK_FOREGROUND_APPLICATION)
		ask.priority = MPTCP_SYMPTOMS_FOREGROUND;
	else
		ask.priority = MPTCP_SYMPTOMS_UNKNOWN;

	err = ctl_enqueuedata(mptcp_kern_ctrl_ref, mptcp_kern_skt_unit,
	    &ask, sizeof(ask), CTL_DATA_EOR);

	os_log_debug(mptcp_log_handle, "%s asked symptoms about pid %u, prio %u, err %d\n",
	    __func__, pid, ask.priority, err);

	proc_rele(p);
}

static errno_t
mptcp_symptoms_ctl_disconnect(kern_ctl_ref kctlref, u_int32_t kcunit,
    void *unitinfo)
{
#pragma unused(kctlref, kcunit, unitinfo)

	OSDecrementAtomic(&mptcp_kern_skt_inuse);

	return (0);
}

static errno_t
mptcp_symptoms_ctl_send(kern_ctl_ref kctlref, u_int32_t kcunit, void *unitinfo,
    mbuf_t m, int flags)
{
#pragma unused(kctlref, unitinfo, flags)
	symptoms_advisory_t *sa = NULL;

	if (kcunit != mptcp_kern_skt_unit)
		os_log_error(mptcp_log_handle, "%s kcunit %u is different from expected one %u\n",
		    __func__, kcunit, mptcp_kern_skt_unit);

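	/*
	 * The advisory must be fully contained in the packet and in the
	 * first mbuf, since it is accessed directly through mbuf_data()
	 * below without any pullup.
	 */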
	if (mbuf_pkthdr_len(m) < sizeof(*sa)) {
		mbuf_freem(m);
		return (EINVAL);
	}

	if (mbuf_len(m) < sizeof(*sa)) {
		mbuf_freem(m);
		return (EINVAL);
	}

	sa = mbuf_data(m);

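	/*
	 * Reports that assert both WIFI_BAD and WIFI_OK at once are
	 * inconsistent and are ignored; only the last coherent Wi-Fi
	 * status is remembered, and a change triggers re-evaluation of
	 * all handover-mode connections.
	 */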
	if (sa->sa_nwk_status != SYMPTOMS_ADVISORY_NOCOMMENT &&
	    sa->sa_nwk_status != SYMPTOMS_ADVISORY_USEAPP) {
		uint8_t old_wifi_status = mptcp_advisory.sa_wifi_status;

		mptcplog((LOG_DEBUG, "%s: wifi %d,%d\n",
		    __func__, sa->sa_wifi_status, mptcp_advisory.sa_wifi_status),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);

		if ((sa->sa_wifi_status &
		    (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK)) !=
		    (SYMPTOMS_ADVISORY_WIFI_BAD | SYMPTOMS_ADVISORY_WIFI_OK))
			mptcp_advisory.sa_wifi_status = sa->sa_wifi_status;

		if (old_wifi_status != mptcp_advisory.sa_wifi_status)
			mptcp_wifi_status_changed();
	} else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_NOCOMMENT) {
		mptcplog((LOG_DEBUG, "%s: NOCOMMENT wifi %d\n", __func__,
		    mptcp_advisory.sa_wifi_status),
		    MPTCP_EVENTS_DBG, MPTCP_LOGLVL_VERBOSE);
	} else if (sa->sa_nwk_status == SYMPTOMS_ADVISORY_USEAPP) {
		uuid_t uuid;

		mptcplog((LOG_DEBUG, "%s Got response about useApp\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);

		/* The UUID follows the advisory; make sure it is all there */
		if (mbuf_len(m) < sizeof(*sa) + sizeof(uuid_t)) {
			mbuf_freem(m);
			return (EINVAL);
		}

		uuid_copy(uuid, (unsigned char *)(sa + 1));

		mptcp_allow_uuid(uuid);
	}

	mbuf_freem(m);
	return (0);
}

void
mptcp_control_register(void)
{
	/* Set up the advisory control socket */
	struct kern_ctl_reg mptcp_kern_ctl;

	bzero(&mptcp_kern_ctl, sizeof(mptcp_kern_ctl));
	strlcpy(mptcp_kern_ctl.ctl_name, MPTCP_KERN_CTL_NAME,
	    sizeof(mptcp_kern_ctl.ctl_name));
	mptcp_kern_ctl.ctl_connect = mptcp_symptoms_ctl_connect;
	mptcp_kern_ctl.ctl_disconnect = mptcp_symptoms_ctl_disconnect;
	mptcp_kern_ctl.ctl_send = mptcp_symptoms_ctl_send;
	mptcp_kern_ctl.ctl_flags = CTL_FLAG_PRIVILEGED;

	(void)ctl_register(&mptcp_kern_ctl, &mptcp_kern_ctrl_ref);
}

/*
 * Three return values:
 *  1 : Wi-Fi is bad
 *  0 : Wi-Fi is good
 * -1 : Wi-Fi state is unknown, use subflow-only heuristics
 */
int
mptcp_is_wifi_unusable(struct mptses *mpte)
{
	if (mpte->mpte_flags & MPTE_FIRSTPARTY) {
		if (mptcp_advisory.sa_wifi_status)
			return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);

		/*
		 * If it's a first-party app and we don't have any info
		 * about the Wi-Fi state, let's be pessimistic and report
		 * it as unknown, so that subflow-only heuristics apply.
		 */
		return (-1);
	}

	return ((mptcp_advisory.sa_wifi_status & SYMPTOMS_ADVISORY_WIFI_BAD) ? 1 : 0);
}

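/*
 * A subflow is considered bad when its TCP retransmission shift has
 * reached the failure threshold (doubled in handover mode) while data
 * is still pending in the send buffer or on the reinject queue.
 */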
boolean_t
mptcp_subflow_is_bad(struct mptses *mpte, struct mptsub *mpts)
{
	struct tcpcb *tp = sototcpcb(mpts->mpts_socket);
	int fail_thresh = mptcp_fail_thresh;

	if (mpte->mpte_svctype == MPTCP_SVCTYPE_HANDOVER)
		fail_thresh *= 2;

	return (tp->t_rxtshift >= fail_thresh &&
	    (mptetoso(mpte)->so_snd.sb_cc || mpte->mpte_reinjectq));
}

/* If TFO data is successfully acked, it must be dropped from the mptcp so */
static void
mptcp_drop_tfo_data(struct mptses *mpte, struct mptsub *mpts)
{
	struct socket *mp_so = mptetoso(mpte);
	struct socket *so = mpts->mpts_socket;
	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
	struct mptcb *mp_tp = mpte->mpte_mptcb;

	/* If data was sent with SYN, rewind state */
	if (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) {
		u_int64_t mp_droplen = mp_tp->mpt_sndnxt - mp_tp->mpt_snduna;
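		/* The SYN itself consumes one sequence number, hence the -1 */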
		unsigned int tcp_droplen = tp->snd_una - tp->iss - 1;

		VERIFY(mp_droplen <= UINT_MAX);
		VERIFY(mp_droplen >= tcp_droplen);

		mpts->mpts_flags &= ~MPTSF_TFO_REQD;
		mpts->mpts_iss += tcp_droplen;
		tp->t_mpflags &= ~TMPF_TFO_REQUEST;

		if (mp_droplen > tcp_droplen) {
			/* handle partial TCP ack */
			mp_so->so_flags1 |= SOF1_TFO_REWIND;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna + (mp_droplen - tcp_droplen);
			mp_droplen = tcp_droplen;
		} else {
			/* all data on SYN was acked */
			mpts->mpts_rel_seq = 1;
			mp_tp->mpt_sndnxt = mp_tp->mpt_snduna;
		}
		mp_tp->mpt_sndmax -= tcp_droplen;

		if (mp_droplen != 0) {
			VERIFY(mp_so->so_snd.sb_mb != NULL);
			sbdrop(&mp_so->so_snd, (int)mp_droplen);
		}
		mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx cid %d TFO tcp len %u mptcp len %llu\n",
		    __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
		    mpts->mpts_connid, tcp_droplen, mp_droplen),
		    MPTCP_SENDER_DBG, MPTCP_LOGLVL_VERBOSE);
	}
}

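/*
 * Flush the MPTCP reassembly queue.  Returns 1 if at least one segment
 * was freed, 0 if the queue was already empty.
 */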
int
mptcp_freeq(struct mptcb *mp_tp)
{
	struct tseg_qent *q;
	int rv = 0;

	while ((q = LIST_FIRST(&mp_tp->mpt_segq)) != NULL) {
		LIST_REMOVE(q, tqe_q);
		m_freem(q->tqe_m);
		zfree(tcp_reass_zone, q);
		rv = 1;
	}
	mp_tp->mpt_reassqlen = 0;
	return (rv);
}

static int
mptcp_post_event(u_int32_t event_code, int value)
{
	struct kev_mptcp_data event_data;
	struct kev_msg ev_msg;

	memset(&ev_msg, 0, sizeof(ev_msg));

	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = KEV_MPTCP_SUBCLASS;
	ev_msg.event_code = event_code;

	event_data.value = value;

	ev_msg.dv[0].data_ptr = &event_data;
	ev_msg.dv[0].data_length = sizeof(event_data);

	return kev_post_msg(&ev_msg);
}

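/*
 * The cell icon state is tracked with an atomic bit latch.  Under the
 * byte-wise bit numbering of OSTestAndSet()/OSTestAndClear(), bit 7
 * maps to the least-significant bit of the first byte of
 * mptcp_cellicon_is_set.
 */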
void
mptcp_set_cellicon(struct mptses *mpte)
{
	int error;

	/* First-party apps (Siri) don't flip the cellicon */
	if (mpte->mpte_flags & MPTE_FIRSTPARTY)
		return;

	/* Remember the last time we set the cellicon (see mptcp_unset_cellicon) */
	mptcp_last_cellicon_set = tcp_now;

	/* If cellicon is already set, get out of here! */
	if (OSTestAndSet(7, &mptcp_cellicon_is_set))
		return;

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 1);

	if (error)
		mptcplog((LOG_ERR, "%s: Setting cellicon failed with %d\n",
		    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	else
		mptcplog((LOG_DEBUG, "%s successfully set the cellicon\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
}

void
mptcp_unset_cellicon(void)
{
	int error;

	/* If cellicon is already unset, get out of here! */
	if (OSTestAndClear(7, &mptcp_cellicon_is_set))
		return;

	/*
	 * If the cellicon was set within the past
	 * MPTCP_CELLICON_TOGGLE_RATE seconds (see mptcp_set_cellicon()),
	 * leave it alone: re-latch the bit and bail, so that the icon
	 * is not toggled too rapidly.
	 */
	if (TSTMP_GT(mptcp_last_cellicon_set + MPTCP_CELLICON_TOGGLE_RATE,
	    tcp_now)) {
		OSTestAndSet(7, &mptcp_cellicon_is_set);
		return;
	}

	error = mptcp_post_event(KEV_MPTCP_CELLUSE, 0);

	if (error)
		mptcplog((LOG_ERR, "%s: Unsetting cellicon failed with %d\n",
		    __func__, error), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
	else
		mptcplog((LOG_DEBUG, "%s successfully unset the cellicon\n", __func__),
		    MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
}

void
mptcp_reset_rexmit_state(struct tcpcb *tp)
{
	struct mptsub *mpts;
	struct inpcb *inp;
	struct socket *so;

	inp = tp->t_inpcb;
	if (inp == NULL)
		return;

	so = inp->inp_socket;
	if (so == NULL)
		return;

	if (!(so->so_flags & SOF_MP_SUBFLOW))
		return;

	mpts = tp->t_mpsub;

	mpts->mpts_flags &= ~MPTSF_WRITE_STALL;
	so->so_flags &= ~SOF_MP_TRYFAILOVER;
}

void
mptcp_reset_keepalive(struct tcpcb *tp)
{
	struct mptsub *mpts = tp->t_mpsub;

	mpts->mpts_flags &= ~MPTSF_READ_STALL;
}