1/*
2 * Copyright (c) 2012-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/param.h>
30#include <sys/systm.h>
31#include <sys/kernel.h>
32#include <sys/socket.h>
33#include <sys/socketvar.h>
34#include <sys/protosw.h>
35#include <sys/mcache.h>
36#include <sys/syslog.h>
37#include <sys/proc.h>
38#include <sys/proc_internal.h>
39#include <sys/resourcevar.h>
40#include <sys/kauth.h>
41#include <sys/priv.h>
42
43#include <net/if.h>
44#include <netinet/in.h>
45#include <netinet/in_var.h>
46#include <netinet/tcp.h>
47#include <netinet/tcp_fsm.h>
48#include <netinet/tcp_seq.h>
49#include <netinet/tcp_var.h>
50#include <netinet/tcp_timer.h>
51#include <netinet/mptcp_var.h>
52#include <netinet/mptcp_timer.h>
53
54#include <mach/sdt.h>
55
/*
 * Forward declarations of the file-local handlers and helpers.  Most of
 * the mptcp_usr_* routines are installed in mptcp_usrreqs below.
 */
static int mptcp_usr_attach(struct socket *, int, struct proc *);
static int mptcp_usr_detach(struct socket *);
static int mptcp_attach(struct socket *, struct proc *);
static int mptcp_usr_connectx(struct socket *, struct sockaddr *,
 struct sockaddr *, struct proc *, uint32_t, sae_associd_t,
 sae_connid_t *, uint32_t, void *, uint32_t, struct uio *, user_ssize_t *);
static int mptcp_getassocids(struct mptses *, uint32_t *, user_addr_t);
static int mptcp_getconnids(struct mptses *, sae_associd_t, uint32_t *,
 user_addr_t);
static int mptcp_getconninfo(struct mptses *, sae_connid_t *, uint32_t *,
 uint32_t *, int32_t *, user_addr_t, socklen_t *, user_addr_t, socklen_t *,
 uint32_t *, user_addr_t, uint32_t *);
static int mptcp_usr_control(struct socket *, u_long, caddr_t, struct ifnet *,
 struct proc *);
static int mptcp_disconnect(struct mptses *);
static int mptcp_usr_disconnect(struct socket *);
static int mptcp_usr_disconnectx(struct socket *, sae_associd_t, sae_connid_t);
static struct mptses *mptcp_usrclosed(struct mptses *);
static int mptcp_usr_rcvd(struct socket *, int);
static int mptcp_usr_send(struct socket *, int, struct mbuf *,
 struct sockaddr *, struct mbuf *, struct proc *);
static int mptcp_usr_shutdown(struct socket *);
static int mptcp_usr_sosend(struct socket *, struct sockaddr *, struct uio *,
 struct mbuf *, struct mbuf *, int);
static int mptcp_usr_socheckopt(struct socket *, struct sockopt *);
static int mptcp_default_tcp_optval(struct mptses *, struct sockopt *, int *);
static int mptcp_usr_preconnect(struct socket *so);
83
84struct pr_usrreqs mptcp_usrreqs = {
85 .pru_attach = mptcp_usr_attach,
86 .pru_connectx = mptcp_usr_connectx,
87 .pru_control = mptcp_usr_control,
88 .pru_detach = mptcp_usr_detach,
89 .pru_disconnect = mptcp_usr_disconnect,
90 .pru_disconnectx = mptcp_usr_disconnectx,
91 .pru_peeraddr = mp_getpeeraddr,
92 .pru_rcvd = mptcp_usr_rcvd,
93 .pru_send = mptcp_usr_send,
94 .pru_shutdown = mptcp_usr_shutdown,
95 .pru_sockaddr = mp_getsockaddr,
96 .pru_sosend = mptcp_usr_sosend,
97 .pru_soreceive = soreceive,
98 .pru_socheckopt = mptcp_usr_socheckopt,
99 .pru_preconnect = mptcp_usr_preconnect,
100};
101
102
/*
 * disable_entitlements (DEVELOPMENT/DEBUG builds only): when non-zero,
 * mptcp_entitlement_check() grants access without consulting
 * PRIV_NET_PRIVILEGED_MULTIPATH or the allow_aggregate knob.
 */
#if (DEVELOPMENT || DEBUG)
static int mptcp_disable_entitlements = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, disable_entitlements, CTLFLAG_RW | CTLFLAG_LOCKED,
 &mptcp_disable_entitlements, 0, "Disable Multipath TCP Entitlement Checking");
#endif

/*
 * allow_aggregate: exported view of mptcp_developer_mode.  While it is
 * zero, mptcp_entitlement_check() refuses service types above
 * MPTCP_SVCTYPE_INTERACTIVE for sockets without the extended entitlement.
 */
int mptcp_developer_mode = 0;
SYSCTL_INT(_net_inet_mptcp, OID_AUTO, allow_aggregate, CTLFLAG_RW | CTLFLAG_LOCKED,
 &mptcp_developer_mode, 0, "Allow the Multipath aggregation mode");
112
113
114/*
115 * Attaches an MPTCP control block to a socket.
116 */
117static int
118mptcp_usr_attach(struct socket *mp_so, int proto, struct proc *p)
119{
120#pragma unused(proto)
121 int error;
122
123 VERIFY(mpsotomppcb(mp_so) == NULL);
124
125 error = mptcp_attach(mp_so, p);
126 if (error != 0)
127 goto out;
128 /*
129 * XXX: adi@apple.com
130 *
131 * Might want to use a different SO_LINGER timeout than TCP's?
132 */
133 if ((mp_so->so_options & SO_LINGER) && mp_so->so_linger == 0)
134 mp_so->so_linger = TCP_LINGERTIME * hz;
135out:
136 return (error);
137}
138
139/*
140 * Detaches an MPTCP control block from a socket.
141 */
142static int
143mptcp_usr_detach(struct socket *mp_so)
144{
145 struct mptses *mpte = mpsotompte(mp_so);
146 struct mppcb *mpp = mpsotomppcb(mp_so);
147
148 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
149 mptcplog((LOG_ERR, "%s state: %d\n", __func__,
150 mpp ? mpp->mpp_state : -1),
151 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
152 return (EINVAL);
153 }
154
155 /*
156 * We are done with this MPTCP socket (it has been closed);
157 * trigger all subflows to be disconnected, if not already,
158 * by initiating the PCB detach sequence (SOF_PCBCLEARING
159 * will be set.)
160 */
161 mp_pcbdetach(mp_so);
162
163 mptcp_disconnect(mpte);
164
165 return (0);
166}
167
168/*
169 * Attach MPTCP protocol to socket, allocating MP control block,
170 * MPTCP session, control block, buffer space, etc.
171 */
172static int
173mptcp_attach(struct socket *mp_so, struct proc *p)
174{
175#pragma unused(p)
176 struct mptses *mpte = NULL;
177 struct mptcb *mp_tp = NULL;
178 struct mppcb *mpp = NULL;
179 int error = 0;
180
181 if (mp_so->so_snd.sb_hiwat == 0 || mp_so->so_rcv.sb_hiwat == 0) {
182 error = soreserve(mp_so, tcp_sendspace, tcp_recvspace);
183 if (error != 0)
184 goto out;
185 }
186
187 if (mp_so->so_snd.sb_preconn_hiwat == 0) {
188 soreserve_preconnect(mp_so, 2048);
189 }
190
191 if ((mp_so->so_rcv.sb_flags & SB_USRSIZE) == 0)
192 mp_so->so_rcv.sb_flags |= SB_AUTOSIZE;
193 if ((mp_so->so_snd.sb_flags & SB_USRSIZE) == 0)
194 mp_so->so_snd.sb_flags |= SB_AUTOSIZE;
195
196 /*
197 * MPTCP socket buffers cannot be compressed, due to the
198 * fact that each mbuf chained via m_next is a M_PKTHDR
199 * which carries some MPTCP metadata.
200 */
201 mp_so->so_snd.sb_flags |= SB_NOCOMPRESS;
202 mp_so->so_rcv.sb_flags |= SB_NOCOMPRESS;
203
204 if ((error = mp_pcballoc(mp_so, &mtcbinfo)) != 0) {
205 goto out;
206 }
207
208 mpp = mpsotomppcb(mp_so);
209 VERIFY(mpp != NULL);
210 mpte = (struct mptses *)mpp->mpp_pcbe;
211 VERIFY(mpte != NULL);
212 mp_tp = mpte->mpte_mptcb;
213 VERIFY(mp_tp != NULL);
214out:
215 return (error);
216}
217
218static int
219mptcp_entitlement_check(struct socket *mp_so)
220{
221 struct mptses *mpte = mpsotompte(mp_so);
222
223 if (soopt_cred_check(mp_so, PRIV_NET_RESTRICTED_MULTIPATH_EXTENDED, TRUE) == 0) {
224 /*
225 * This means the app has the extended entitlement. Thus,
226 * it's a first party app and can run without restrictions.
227 */
228 mpte->mpte_flags |= MPTE_FIRSTPARTY;
229 goto grant;
230 }
231
232#if (DEVELOPMENT || DEBUG)
233 if (mptcp_disable_entitlements)
234 goto grant;
235#endif
236
237 if (soopt_cred_check(mp_so, PRIV_NET_PRIVILEGED_MULTIPATH, TRUE)) {
238 mptcplog((LOG_NOTICE, "%s Multipath Capability needed\n", __func__),
239 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
240 return (-1);
241 }
242
243 if (mpte->mpte_svctype > MPTCP_SVCTYPE_INTERACTIVE &&
244 mptcp_developer_mode == 0) {
245 mptcplog((LOG_NOTICE, "%s need to set allow_aggregate sysctl\n",
246 __func__), MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
247 return (-1);
248 }
249
250grant:
251 mptcplog((LOG_NOTICE, "%s entitlement granted for %u\n", __func__, mpte->mpte_svctype),
252 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
253
254 return (0);
255}
256
257/*
258 * Common subroutine to open a MPTCP connection to one of the remote hosts
259 * specified by dst_sl. This includes allocating and establishing a
260 * subflow TCP connection, either initially to establish MPTCP connection,
261 * or to join an existing one. Returns a connection handle upon success.
262 */
263static int
264mptcp_connectx(struct mptses *mpte, struct sockaddr *src,
265 struct sockaddr *dst, uint32_t ifscope, sae_connid_t *pcid)
266{
267 struct socket *mp_so = mptetoso(mpte);
268 int error = 0;
269
270 VERIFY(dst != NULL);
271 VERIFY(pcid != NULL);
272
273 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx\n", __func__,
274 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
275 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
276 DTRACE_MPTCP2(connectx, struct mptses *, mpte, struct socket *, mp_so);
277
278 error = mptcp_subflow_add(mpte, src, dst, ifscope, pcid);
279
280 return (error);
281}
282
283/*
284 * User-protocol pru_connectx callback.
285 */
286static int
287mptcp_usr_connectx(struct socket *mp_so, struct sockaddr *src,
288 struct sockaddr *dst, struct proc *p, uint32_t ifscope,
289 sae_associd_t aid, sae_connid_t *pcid, uint32_t flags, void *arg,
290 uint32_t arglen, struct uio *auio, user_ssize_t *bytes_written)
291{
292#pragma unused(p, aid, flags, arg, arglen)
293 struct mppcb *mpp = mpsotomppcb(mp_so);
294 struct mptses *mpte = NULL;
295 struct mptcb *mp_tp = NULL;
296 user_ssize_t datalen;
297 int error = 0;
298
299 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
300 mptcplog((LOG_ERR, "%s state %d\n", __func__,
301 mpp ? mpp->mpp_state : -1),
302 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
303 error = EINVAL;
304 goto out;
305 }
306 mpte = mptompte(mpp);
307 VERIFY(mpte != NULL);
308 mpte_lock_assert_held(mpte);
309
310 mp_tp = mpte->mpte_mptcb;
311 VERIFY(mp_tp != NULL);
312
313 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP) {
314 mptcplog((LOG_ERR, "%s fell back to TCP\n", __func__),
315 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
316 error = EINVAL;
317 goto out;
318 }
319
320 if (dst->sa_family != AF_INET && dst->sa_family != AF_INET6) {
321 error = EAFNOSUPPORT;
322 goto out;
323 }
324
325 if (dst->sa_family == AF_INET &&
326 dst->sa_len != sizeof(mpte->__mpte_dst_v4)) {
327 mptcplog((LOG_ERR, "%s IPv4 dst len %u\n", __func__,
328 dst->sa_len),
329 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
330 error = EINVAL;
331 goto out;
332 }
333
334 if (dst->sa_family == AF_INET6 &&
335 dst->sa_len != sizeof(mpte->__mpte_dst_v6)) {
336 mptcplog((LOG_ERR, "%s IPv6 dst len %u\n", __func__,
337 dst->sa_len),
338 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
339 error = EINVAL;
340 goto out;
341 }
342
343 if (!(mpte->mpte_flags & MPTE_SVCTYPE_CHECKED)) {
344 if (mptcp_entitlement_check(mp_so) < 0) {
345 error = EPERM;
346 goto out;
347 }
348
349 mpte->mpte_flags |= MPTE_SVCTYPE_CHECKED;
350 }
351
352 if ((mp_so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
353 memcpy(&mpte->mpte_dst, dst, dst->sa_len);
354 }
355
356 if (src) {
357 if (src->sa_family != AF_INET && src->sa_family != AF_INET6) {
358 error = EAFNOSUPPORT;
359 goto out;
360 }
361
362 if (src->sa_family == AF_INET &&
363 src->sa_len != sizeof(mpte->__mpte_src_v4)) {
364 mptcplog((LOG_ERR, "%s IPv4 src len %u\n", __func__,
365 src->sa_len),
366 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
367 error = EINVAL;
368 goto out;
369 }
370
371 if (src->sa_family == AF_INET6 &&
372 src->sa_len != sizeof(mpte->__mpte_src_v6)) {
373 mptcplog((LOG_ERR, "%s IPv6 src len %u\n", __func__,
374 src->sa_len),
375 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
376 error = EINVAL;
377 goto out;
378 }
379
380 if ((mp_so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0) {
381 memcpy(&mpte->mpte_src, src, src->sa_len);
382 }
383 }
384
385 error = mptcp_connectx(mpte, src, dst, ifscope, pcid);
386
387 /* If there is data, copy it */
388 if (auio != NULL) {
389 datalen = uio_resid(auio);
390 socket_unlock(mp_so, 0);
391 error = mp_so->so_proto->pr_usrreqs->pru_sosend(mp_so, NULL,
392 (uio_t) auio, NULL, NULL, 0);
393
394 if (error == 0 || error == EWOULDBLOCK)
395 *bytes_written = datalen - uio_resid(auio);
396
397 if (error == EWOULDBLOCK)
398 error = EINPROGRESS;
399
400 socket_lock(mp_so, 0);
401 }
402
403out:
404 return (error);
405}
406
407/*
408 * Handle SIOCGASSOCIDS ioctl for PF_MULTIPATH domain.
409 */
410static int
411mptcp_getassocids(struct mptses *mpte, uint32_t *cnt, user_addr_t aidp)
412{
413 mpte_lock_assert_held(mpte); /* same as MP socket lock */
414
415 /* MPTCP has at most 1 association */
416 *cnt = (mpte->mpte_associd != SAE_ASSOCID_ANY) ? 1 : 0;
417
418 /* just asking how many there are? */
419 if (aidp == USER_ADDR_NULL)
420 return (0);
421
422 return (copyout(&mpte->mpte_associd, aidp,
423 sizeof (mpte->mpte_associd)));
424}
425
426/*
427 * Handle SIOCGCONNIDS ioctl for PF_MULTIPATH domain.
428 */
429static int
430mptcp_getconnids(struct mptses *mpte, sae_associd_t aid, uint32_t *cnt,
431 user_addr_t cidp)
432{
433 struct mptsub *mpts;
434 int error = 0;
435
436 mpte_lock_assert_held(mpte); /* same as MP socket lock */
437
438 if (aid != SAE_ASSOCID_ANY && aid != SAE_ASSOCID_ALL &&
439 aid != mpte->mpte_associd)
440 return (EINVAL);
441
442 *cnt = mpte->mpte_numflows;
443
444 /* just asking how many there are? */
445 if (cidp == USER_ADDR_NULL)
446 return (0);
447
448 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
449 if ((error = copyout(&mpts->mpts_connid, cidp,
450 sizeof (mpts->mpts_connid))) != 0)
451 break;
452
453 cidp += sizeof (mpts->mpts_connid);
454 }
455
456 return (error);
457}
458
459/*
460 * Handle SIOCGCONNINFO ioctl for PF_MULTIPATH domain.
461 */
462static int
463mptcp_getconninfo(struct mptses *mpte, sae_connid_t *cid, uint32_t *flags,
464 uint32_t *ifindex, int32_t *soerror, user_addr_t src, socklen_t *src_len,
465 user_addr_t dst, socklen_t *dst_len, uint32_t *aux_type,
466 user_addr_t aux_data, uint32_t *aux_len)
467{
468 struct socket *so;
469 struct inpcb *inp;
470 struct mptsub *mpts;
471 int error = 0;
472
473 *flags = 0;
474 *aux_type = 0;
475 *ifindex = 0;
476 *soerror = 0;
477
478 if (*cid == SAE_CONNID_ALL) {
479 struct socket *mp_so = mptetoso(mpte);
480 struct mptcb *mp_tp = mpte->mpte_mptcb;
481 struct conninfo_multipathtcp mptcp_ci;
482
483 if (*aux_len != 0 && *aux_len != sizeof(mptcp_ci))
484 return (EINVAL);
485
486 if (mp_so->so_state & SS_ISCONNECTING)
487 *flags |= CIF_CONNECTING;
488 if (mp_so->so_state & SS_ISCONNECTED)
489 *flags |= CIF_CONNECTED;
490 if (mp_so->so_state & SS_ISDISCONNECTING)
491 *flags |= CIF_DISCONNECTING;
492 if (mp_so->so_state & SS_ISDISCONNECTED)
493 *flags |= CIF_DISCONNECTED;
494 if (!(mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP))
495 *flags |= CIF_MP_CAPABLE;
496 if (mp_tp->mpt_flags & MPTCPF_FALLBACK_TO_TCP)
497 *flags |= CIF_MP_DEGRADED;
498
499 *src_len = 0;
500 *dst_len = 0;
501
502 *aux_type = CIAUX_MPTCP;
503 *aux_len = sizeof(mptcp_ci);
504
505 if (aux_data != USER_ADDR_NULL) {
506 unsigned long i = 0;
507 int initial_info_set = 0;
508
509 bzero(&mptcp_ci, sizeof (mptcp_ci));
510 mptcp_ci.mptcpci_subflow_count = mpte->mpte_numflows;
511 mptcp_ci.mptcpci_switch_count = mpte->mpte_subflow_switches;
512
513 VERIFY(sizeof(mptcp_ci.mptcpci_itfstats) == sizeof(mpte->mpte_itfstats));
514 memcpy(mptcp_ci.mptcpci_itfstats, mpte->mpte_itfstats, sizeof(mptcp_ci.mptcpci_itfstats));
515
516 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
517 if (i >= sizeof(mptcp_ci.mptcpci_subflow_connids) / sizeof(sae_connid_t))
518 break;
519 mptcp_ci.mptcpci_subflow_connids[i] = mpts->mpts_connid;
520
521 if (mpts->mpts_flags & MPTSF_INITIAL_SUB) {
522 inp = sotoinpcb(mpts->mpts_socket);
523
524 mptcp_ci.mptcpci_init_rxbytes = inp->inp_stat->rxbytes;
525 mptcp_ci.mptcpci_init_txbytes = inp->inp_stat->txbytes;
526 initial_info_set = 1;
527 }
528
529 mptcpstats_update(mptcp_ci.mptcpci_itfstats, mpts);
530
531 i++;
532 }
533
534 if (initial_info_set == 0) {
535 mptcp_ci.mptcpci_init_rxbytes = mpte->mpte_init_rxbytes;
536 mptcp_ci.mptcpci_init_txbytes = mpte->mpte_init_txbytes;
537 }
538
539 if (mpte->mpte_flags & MPTE_FIRSTPARTY)
540 mptcp_ci.mptcpci_flags |= MPTCPCI_FIRSTPARTY;
541
542 error = copyout(&mptcp_ci, aux_data, sizeof(mptcp_ci));
543 if (error != 0) {
544 mptcplog((LOG_ERR, "%s copyout failed: %d\n",
545 __func__, error),
546 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
547 return (error);
548 }
549 }
550
551 return (0);
552 }
553
554 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
555 if (mpts->mpts_connid == *cid || *cid == SAE_CONNID_ANY)
556 break;
557 }
558 if (mpts == NULL)
559 return ((*cid == SAE_CONNID_ANY) ? ENXIO : EINVAL);
560
561 so = mpts->mpts_socket;
562 inp = sotoinpcb(so);
563
564 if (inp->inp_vflag & INP_IPV4)
565 error = in_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
566 soerror, src, src_len, dst, dst_len,
567 aux_type, aux_data, aux_len);
568 else
569 error = in6_getconninfo(so, SAE_CONNID_ANY, flags, ifindex,
570 soerror, src, src_len, dst, dst_len,
571 aux_type, aux_data, aux_len);
572
573 if (error != 0) {
574 mptcplog((LOG_ERR, "%s error from in_getconninfo %d\n",
575 __func__, error),
576 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
577 return (error);
578 }
579
580 if (mpts->mpts_flags & MPTSF_MP_CAPABLE)
581 *flags |= CIF_MP_CAPABLE;
582 if (mpts->mpts_flags & MPTSF_MP_DEGRADED)
583 *flags |= CIF_MP_DEGRADED;
584 if (mpts->mpts_flags & MPTSF_MP_READY)
585 *flags |= CIF_MP_READY;
586 if (mpts->mpts_flags & MPTSF_ACTIVE)
587 *flags |= CIF_MP_ACTIVE;
588
589 mptcplog((LOG_DEBUG, "%s: cid %d flags %x \n", __func__,
590 mpts->mpts_connid, mpts->mpts_flags),
591 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_VERBOSE);
592
593 return (0);
594}
595
596/*
597 * User-protocol pru_control callback.
598 */
599static int
600mptcp_usr_control(struct socket *mp_so, u_long cmd, caddr_t data,
601 struct ifnet *ifp, struct proc *p)
602{
603#pragma unused(ifp, p)
604 struct mppcb *mpp = mpsotomppcb(mp_so);
605 struct mptses *mpte;
606 int error = 0;
607
608 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
609 error = EINVAL;
610 goto out;
611 }
612 mpte = mptompte(mpp);
613 VERIFY(mpte != NULL);
614
615 mpte_lock_assert_held(mpte); /* same as MP socket lock */
616
617 switch (cmd) {
618 case SIOCGASSOCIDS32: { /* struct so_aidreq32 */
619 struct so_aidreq32 aidr;
620 bcopy(data, &aidr, sizeof (aidr));
621 error = mptcp_getassocids(mpte, &aidr.sar_cnt,
622 aidr.sar_aidp);
623 if (error == 0)
624 bcopy(&aidr, data, sizeof (aidr));
625 break;
626 }
627
628 case SIOCGASSOCIDS64: { /* struct so_aidreq64 */
629 struct so_aidreq64 aidr;
630 bcopy(data, &aidr, sizeof (aidr));
631 error = mptcp_getassocids(mpte, &aidr.sar_cnt,
632 aidr.sar_aidp);
633 if (error == 0)
634 bcopy(&aidr, data, sizeof (aidr));
635 break;
636 }
637
638 case SIOCGCONNIDS32: { /* struct so_cidreq32 */
639 struct so_cidreq32 cidr;
640 bcopy(data, &cidr, sizeof (cidr));
641 error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt,
642 cidr.scr_cidp);
643 if (error == 0)
644 bcopy(&cidr, data, sizeof (cidr));
645 break;
646 }
647
648 case SIOCGCONNIDS64: { /* struct so_cidreq64 */
649 struct so_cidreq64 cidr;
650 bcopy(data, &cidr, sizeof (cidr));
651 error = mptcp_getconnids(mpte, cidr.scr_aid, &cidr.scr_cnt,
652 cidr.scr_cidp);
653 if (error == 0)
654 bcopy(&cidr, data, sizeof (cidr));
655 break;
656 }
657
658 case SIOCGCONNINFO32: { /* struct so_cinforeq32 */
659 struct so_cinforeq32 cifr;
660 bcopy(data, &cifr, sizeof (cifr));
661 error = mptcp_getconninfo(mpte, &cifr.scir_cid,
662 &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error,
663 cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst,
664 &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data,
665 &cifr.scir_aux_len);
666 if (error == 0)
667 bcopy(&cifr, data, sizeof (cifr));
668 break;
669 }
670
671 case SIOCGCONNINFO64: { /* struct so_cinforeq64 */
672 struct so_cinforeq64 cifr;
673 bcopy(data, &cifr, sizeof (cifr));
674 error = mptcp_getconninfo(mpte, &cifr.scir_cid,
675 &cifr.scir_flags, &cifr.scir_ifindex, &cifr.scir_error,
676 cifr.scir_src, &cifr.scir_src_len, cifr.scir_dst,
677 &cifr.scir_dst_len, &cifr.scir_aux_type, cifr.scir_aux_data,
678 &cifr.scir_aux_len);
679 if (error == 0)
680 bcopy(&cifr, data, sizeof (cifr));
681 break;
682 }
683
684 default:
685 error = EOPNOTSUPP;
686 break;
687 }
688out:
689 return (error);
690}
691
692static int
693mptcp_disconnect(struct mptses *mpte)
694{
695 struct socket *mp_so;
696 struct mptcb *mp_tp;
697 int error = 0;
698
699 mpte_lock_assert_held(mpte); /* same as MP socket lock */
700
701 mp_so = mptetoso(mpte);
702 mp_tp = mpte->mpte_mptcb;
703
704 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx %d\n", __func__,
705 (u_int64_t)VM_KERNEL_ADDRPERM(mp_so), mp_so->so_error),
706 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
707
708 DTRACE_MPTCP3(disconnectx, struct mptses *, mpte,
709 struct socket *, mp_so, struct mptcb *, mp_tp);
710
711 /* if we're not detached, go thru socket state checks */
712 if (!(mp_so->so_flags & SOF_PCBCLEARING)) {
713 if (!(mp_so->so_state & (SS_ISCONNECTED|
714 SS_ISCONNECTING))) {
715 error = ENOTCONN;
716 goto out;
717 }
718 if (mp_so->so_state & SS_ISDISCONNECTING) {
719 error = EALREADY;
720 goto out;
721 }
722 }
723
724 mptcp_cancel_all_timers(mp_tp);
725 if (mp_tp->mpt_state < MPTCPS_ESTABLISHED) {
726 mptcp_close(mpte, mp_tp);
727 } else if ((mp_so->so_options & SO_LINGER) &&
728 mp_so->so_linger == 0) {
729 mptcp_drop(mpte, mp_tp, 0);
730 } else {
731 soisdisconnecting(mp_so);
732 sbflush(&mp_so->so_rcv);
733 if (mptcp_usrclosed(mpte) != NULL)
734 mptcp_output(mpte);
735 }
736
737 if (error == 0)
738 mptcp_subflow_workloop(mpte);
739
740out:
741 return (error);
742}
743
/*
 * pru_disconnect handler: resolve the session behind the socket and hand
 * it to mptcp_disconnect(), returning that routine's result.
 */
static int
mptcp_usr_disconnect(struct socket *mp_so)
{
	struct mptses *mpte = mpsotompte(mp_so);

	return (mptcp_disconnect(mpte));
}
752
753/*
754 * User-protocol pru_disconnectx callback.
755 */
756static int
757mptcp_usr_disconnectx(struct socket *mp_so, sae_associd_t aid, sae_connid_t cid)
758{
759 if (aid != SAE_ASSOCID_ANY && aid != SAE_ASSOCID_ALL)
760 return (EINVAL);
761
762 if (cid != SAE_CONNID_ANY && cid != SAE_CONNID_ALL)
763 return (EINVAL);
764
765 return (mptcp_usr_disconnect(mp_so));
766}
767
768void
769mptcp_finish_usrclosed(struct mptses *mpte)
770{
771 struct mptcb *mp_tp = mpte->mpte_mptcb;
772 struct socket *mp_so = mptetoso(mpte);
773
774 if (mp_tp->mpt_state == MPTCPS_CLOSED) {
775 mpte = mptcp_close(mpte, mp_tp);
776 } else if (mp_tp->mpt_state >= MPTCPS_FIN_WAIT_2) {
777 soisdisconnected(mp_so);
778 } else {
779 struct mptsub *mpts;
780
781 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
782 if ((mp_so->so_state & (SS_CANTRCVMORE|SS_CANTSENDMORE)) ==
783 (SS_CANTRCVMORE | SS_CANTSENDMORE))
784 mptcp_subflow_disconnect(mpte, mpts);
785 else
786 mptcp_subflow_shutdown(mpte, mpts);
787 }
788 }
789}
790
791/*
792 * User issued close, and wish to trail thru shutdown states.
793 */
794static struct mptses *
795mptcp_usrclosed(struct mptses *mpte)
796{
797 struct mptcb *mp_tp = mpte->mpte_mptcb;
798
799 mptcp_close_fsm(mp_tp, MPCE_CLOSE);
800
801 /* Not everything has been acknowledged - don't close the subflows! */
802 if (mp_tp->mpt_sndnxt + 1 != mp_tp->mpt_sndmax)
803 return (mpte);
804
805 mptcp_finish_usrclosed(mpte);
806
807 return (mpte);
808}
809
810/*
811 * After a receive, possible send some update to peer.
812 */
813static int
814mptcp_usr_rcvd(struct socket *mp_so, int flags)
815{
816#pragma unused(flags)
817 struct mppcb *mpp = mpsotomppcb(mp_so);
818 struct mptses *mpte;
819 int error = 0;
820
821 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
822 error = EINVAL;
823 goto out;
824 }
825 mpte = mptompte(mpp);
826 VERIFY(mpte != NULL);
827
828 error = mptcp_output(mpte);
829out:
830 return (error);
831}
832
833/*
834 * Do a send by putting data in the output queue.
835 */
836static int
837mptcp_usr_send(struct socket *mp_so, int prus_flags, struct mbuf *m,
838 struct sockaddr *nam, struct mbuf *control, struct proc *p)
839{
840#pragma unused(nam, p)
841 struct mppcb *mpp = mpsotomppcb(mp_so);
842 struct mptses *mpte;
843 int error = 0;
844
845 if (prus_flags & (PRUS_OOB|PRUS_EOF)) {
846 error = EOPNOTSUPP;
847 goto out;
848 }
849
850 if (nam != NULL) {
851 error = EOPNOTSUPP;
852 goto out;
853 }
854
855 if (control != NULL && control->m_len != 0) {
856 error = EOPNOTSUPP;
857 goto out;
858 }
859
860 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
861 error = ECONNRESET;
862 goto out;
863 }
864 mpte = mptompte(mpp);
865 VERIFY(mpte != NULL);
866
867 if (!(mp_so->so_state & SS_ISCONNECTED) &&
868 !(mp_so->so_flags1 & SOF1_PRECONNECT_DATA)) {
869 error = ENOTCONN;
870 goto out;
871 }
872
873 mptcp_insert_dsn(mpp, m);
874 VERIFY(mp_so->so_snd.sb_flags & SB_NOCOMPRESS);
875 sbappendstream(&mp_so->so_snd, m);
876 m = NULL;
877
878 error = mptcp_output(mpte);
879 if (error != 0)
880 goto out;
881
882 if (mp_so->so_state & SS_ISCONNECTING) {
883 if (mp_so->so_state & SS_NBIO)
884 error = EWOULDBLOCK;
885 else
886 error = sbwait(&mp_so->so_snd);
887 }
888
889out:
890 if (error) {
891 if (m != NULL)
892 m_freem(m);
893 if (control != NULL)
894 m_freem(control);
895 }
896 return (error);
897}
898
899/*
900 * Mark the MPTCP connection as being incapable of further output.
901 */
902static int
903mptcp_usr_shutdown(struct socket *mp_so)
904{
905 struct mppcb *mpp = mpsotomppcb(mp_so);
906 struct mptses *mpte;
907 int error = 0;
908
909 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
910 error = EINVAL;
911 goto out;
912 }
913 mpte = mptompte(mpp);
914 VERIFY(mpte != NULL);
915
916 socantsendmore(mp_so);
917
918 mpte = mptcp_usrclosed(mpte);
919 if (mpte != NULL)
920 error = mptcp_output(mpte);
921out:
922 return (error);
923}
924
925/*
926 * Copy the contents of uio into a properly sized mbuf chain.
927 */
928static int
929mptcp_uiotombuf(struct uio *uio, int how, int space, uint32_t align,
930 struct mbuf **top)
931{
932 struct mbuf *m, *mb, *nm = NULL, *mtail = NULL;
933 user_ssize_t resid, tot, len, progress; /* must be user_ssize_t */
934 int error;
935
936 VERIFY(top != NULL && *top == NULL);
937
938 /*
939 * space can be zero or an arbitrary large value bound by
940 * the total data supplied by the uio.
941 */
942 resid = uio_resid(uio);
943 if (space > 0)
944 tot = imin(resid, space);
945 else
946 tot = resid;
947
948 /*
949 * The smallest unit is a single mbuf with pkthdr.
950 * We can't align past it.
951 */
952 if (align >= MHLEN)
953 return (EINVAL);
954
955 /*
956 * Give us the full allocation or nothing.
957 * If space is zero return the smallest empty mbuf.
958 */
959 if ((len = tot + align) == 0)
960 len = 1;
961
962 /* Loop and append maximum sized mbufs to the chain tail. */
963 while (len > 0) {
964 uint32_t m_needed = 1;
965
966 if (njcl > 0 && len > MBIGCLBYTES)
967 mb = m_getpackets_internal(&m_needed, 1,
968 how, 1, M16KCLBYTES);
969 else if (len > MCLBYTES)
970 mb = m_getpackets_internal(&m_needed, 1,
971 how, 1, MBIGCLBYTES);
972 else if (len >= (signed)MINCLSIZE)
973 mb = m_getpackets_internal(&m_needed, 1,
974 how, 1, MCLBYTES);
975 else
976 mb = m_gethdr(how, MT_DATA);
977
978 /* Fail the whole operation if one mbuf can't be allocated. */
979 if (mb == NULL) {
980 if (nm != NULL)
981 m_freem(nm);
982 return (ENOBUFS);
983 }
984
985 /* Book keeping. */
986 VERIFY(mb->m_flags & M_PKTHDR);
987 len -= ((mb->m_flags & M_EXT) ? mb->m_ext.ext_size : MHLEN);
988 if (mtail != NULL)
989 mtail->m_next = mb;
990 else
991 nm = mb;
992 mtail = mb;
993 }
994
995 m = nm;
996 m->m_data += align;
997
998 progress = 0;
999 /* Fill all mbufs with uio data and update header information. */
1000 for (mb = m; mb != NULL; mb = mb->m_next) {
1001 len = imin(M_TRAILINGSPACE(mb), tot - progress);
1002
1003 error = uiomove(mtod(mb, char *), len, uio);
1004 if (error != 0) {
1005 m_freem(m);
1006 return (error);
1007 }
1008
1009 /* each mbuf is M_PKTHDR chained via m_next */
1010 mb->m_len = len;
1011 mb->m_pkthdr.len = len;
1012
1013 progress += len;
1014 }
1015 VERIFY(progress == tot);
1016 *top = m;
1017 return (0);
1018}
1019
/*
 * MPTCP socket protocol-user socket send routine, derived from sosend().
 *
 * Data must come in through `uio'; a caller-supplied mbuf chain (`top'),
 * control data, MSG_EOR and a negative residual are rejected with EINVAL.
 * MSG_OOB, MSG_DONTROUTE, MSG_HOLD, MSG_SEND, MSG_FLUSH and sockets with
 * SOF_ENABLE_MSGS are rejected with EOPNOTSUPP.  `addr' is ignored.
 *
 * The routine takes the socket lock itself.  It then loops: wait for the
 * send buffer via sosendcheck(), copy as much of the uio as fits into an
 * mbuf chain (with the socket unlocked), run the socket filters, and hand
 * the chain to pru_send, until the uio is drained or an error occurs.
 *
 * Returns 0 or the first error encountered.
 */
static int
mptcp_usr_sosend(struct socket *mp_so, struct sockaddr *addr, struct uio *uio,
 struct mbuf *top, struct mbuf *control, int flags)
{
#pragma unused(addr)
 int32_t space;
 user_ssize_t resid;
 int error, sendflags;
 struct proc *p = current_proc();
 int sblocked = 0;

 /* UIO is required for now, due to per-mbuf M_PKTHDR constraints */
 /*
  * NOTE(review): this exit happens before socket_lock() below, yet the
  * out: path still calls soclearfastopen() -- confirm that is intended.
  */
 if (uio == NULL || top != NULL) {
 error = EINVAL;
 goto out;
 }
 resid = uio_resid(uio);

 socket_lock(mp_so, 1);
 so_update_last_owner_locked(mp_so, p);
 so_update_policy(mp_so);

 VERIFY(mp_so->so_type == SOCK_STREAM);
 VERIFY(!(mp_so->so_flags & SOF_MP_SUBFLOW));

 if ((flags & (MSG_OOB|MSG_DONTROUTE|MSG_HOLD|MSG_SEND|MSG_FLUSH)) ||
 (mp_so->so_flags & SOF_ENABLE_MSGS)) {
 error = EOPNOTSUPP;
 socket_unlock(mp_so, 1);
 goto out;
 }

 /*
 * In theory resid should be unsigned. However, space must be
 * signed, as it might be less than 0 if we over-committed, and we
 * must use a signed comparison of space and resid. On the other
 * hand, a negative resid causes us to loop sending 0-length
 * segments to the protocol.
 */
 if (resid < 0 || (flags & MSG_EOR) || control != NULL) {
 error = EINVAL;
 socket_unlock(mp_so, 1);
 goto out;
 }

 OSIncrementAtomicLong(&p->p_stats->p_ru.ru_msgsnd);

 /* Outer loop: one pass per sosendcheck() call, until the uio is drained. */
 do {
 error = sosendcheck(mp_so, NULL, resid, 0, 0, flags,
 &sblocked, NULL);
 if (error != 0)
 goto release;

 space = sbspace(&mp_so->so_snd);
 /* Inner loop: keep sending while data and buffer space both remain. */
 do {
 /* The copy from userland runs with the socket unlocked. */
 socket_unlock(mp_so, 0);
 /*
 * Copy the data from userland into an mbuf chain.
 */
 error = mptcp_uiotombuf(uio, M_WAITOK, space, 0, &top);
 if (error != 0) {
 socket_lock(mp_so, 0);
 goto release;
 }
 VERIFY(top != NULL);
 /* Charge the bytes just copied against the remaining space. */
 space -= resid - uio_resid(uio);
 resid = uio_resid(uio);
 socket_lock(mp_so, 0);

 /*
 * Compute flags here, for pru_send and NKEs.
 */
 sendflags = (resid > 0 && space > 0) ?
 PRUS_MORETOCOME : 0;

 /*
 * Socket filter processing
 */
 VERIFY(control == NULL);
 error = sflt_data_out(mp_so, NULL, &top, &control, 0);
 if (error != 0) {
 if (error == EJUSTRETURN) {
 /*
  * Reported as success; top is cleared so the
  * cleanup at out: does not free the chain.
  */
 error = 0;
 top = NULL;
 /* always free control if any */
 }
 goto release;
 }
 if (control != NULL) {
 m_freem(control);
 control = NULL;
 }

 /*
 * Pass data to protocol.
 */
 error = (*mp_so->so_proto->pr_usrreqs->pru_send)
 (mp_so, sendflags, top, NULL, NULL, p);

 /* The chain was handed to pru_send; never free it here. */
 top = NULL;
 if (error != 0)
 goto release;
 } while (resid != 0 && space > 0);
 } while (resid != 0);

release:
 /* Reached with the socket locked; both branches end up unlocking it. */
 if (sblocked)
 sbunlock(&mp_so->so_snd, FALSE); /* will unlock socket */
 else
 socket_unlock(mp_so, 1);
out:
 /* Free whatever was not handed on to the protocol. */
 if (top != NULL)
 m_freem(top);
 if (control != NULL)
 m_freem(control);

 soclearfastopen(mp_so);

 return (error);
}
1143
1144/*
1145 * Called to filter SOPT_{SET,GET} for SOL_SOCKET level socket options.
1146 * This routine simply indicates to the caller whether or not to proceed
1147 * further with the given socket option. This is invoked by sosetoptlock()
1148 * and sogetoptlock().
1149 */
1150static int
1151mptcp_usr_socheckopt(struct socket *mp_so, struct sockopt *sopt)
1152{
1153#pragma unused(mp_so)
1154 int error = 0;
1155
1156 VERIFY(sopt->sopt_level == SOL_SOCKET);
1157
1158 /*
1159 * We could check for sopt_dir (set/get) here, but we'll just
1160 * let the caller deal with it as appropriate; therefore the
1161 * following is a superset of the socket options which we
1162 * allow for set/get.
1163 *
1164 * XXX: adi@apple.com
1165 *
1166 * Need to consider the following cases:
1167 *
1168 * a. Certain socket options don't have a clear definition
1169 * on the expected behavior post connect(2). At the time
1170 * those options are issued on the MP socket, there may
1171 * be existing subflow sockets that are already connected.
1172 */
1173 switch (sopt->sopt_name) {
1174 case SO_LINGER: /* MP */
1175 case SO_LINGER_SEC: /* MP */
1176 case SO_TYPE: /* MP */
1177 case SO_NREAD: /* MP */
1178 case SO_NWRITE: /* MP */
1179 case SO_ERROR: /* MP */
1180 case SO_SNDBUF: /* MP */
1181 case SO_RCVBUF: /* MP */
1182 case SO_SNDLOWAT: /* MP */
1183 case SO_RCVLOWAT: /* MP */
1184 case SO_SNDTIMEO: /* MP */
1185 case SO_RCVTIMEO: /* MP */
1186 case SO_NKE: /* MP */
1187 case SO_NOSIGPIPE: /* MP */
1188 case SO_NOADDRERR: /* MP */
1189 case SO_LABEL: /* MP */
1190 case SO_PEERLABEL: /* MP */
1191 case SO_DEFUNCTOK: /* MP */
1192 case SO_ISDEFUNCT: /* MP */
1193 case SO_TRAFFIC_CLASS_DBG: /* MP */
1194 case SO_DELEGATED: /* MP */
1195 case SO_DELEGATED_UUID: /* MP */
1196#if NECP
1197 case SO_NECP_ATTRIBUTES:
1198 case SO_NECP_CLIENTUUID:
1199#endif /* NECP */
1200 /*
1201 * Tell the caller that these options are to be processed.
1202 */
1203 break;
1204
1205 case SO_DEBUG: /* MP + subflow */
1206 case SO_KEEPALIVE: /* MP + subflow */
1207 case SO_USELOOPBACK: /* MP + subflow */
1208 case SO_RANDOMPORT: /* MP + subflow */
1209 case SO_TRAFFIC_CLASS: /* MP + subflow */
1210 case SO_RECV_TRAFFIC_CLASS: /* MP + subflow */
1211 case SO_PRIVILEGED_TRAFFIC_CLASS: /* MP + subflow */
1212 case SO_RECV_ANYIF: /* MP + subflow */
1213 case SO_RESTRICTIONS: /* MP + subflow */
1214 case SO_FLUSH: /* MP + subflow */
1215 case SO_NOWAKEFROMSLEEP:
1216 case SO_NOAPNFALLBK:
1217 case SO_MARK_CELLFALLBACK:
1218 /*
1219 * Tell the caller that these options are to be processed;
1220 * these will also be recorded later by mptcp_setopt().
1221 *
1222 * NOTE: Only support integer option value for now.
1223 */
1224 if (sopt->sopt_valsize != sizeof (int))
1225 error = EINVAL;
1226 break;
1227
1228 default:
1229 /*
1230 * Tell the caller to stop immediately and return an error.
1231 */
1232 error = ENOPROTOOPT;
1233 break;
1234 }
1235
1236 return (error);
1237}
1238
1239/*
1240 * Issue SOPT_SET for all MPTCP subflows (for integer option values.)
1241 */
1242static int
1243mptcp_setopt_apply(struct mptses *mpte, struct mptopt *mpo)
1244{
1245 struct socket *mp_so;
1246 struct mptsub *mpts;
1247 struct mptopt smpo;
1248 int error = 0;
1249
1250 /* just bail now if this isn't applicable to subflow sockets */
1251 if (!(mpo->mpo_flags & MPOF_SUBFLOW_OK)) {
1252 error = ENOPROTOOPT;
1253 goto out;
1254 }
1255
1256 /*
1257 * Skip those that are handled internally; these options
1258 * should not have been recorded and marked with the
1259 * MPOF_SUBFLOW_OK by mptcp_setopt(), but just in case.
1260 */
1261 if (mpo->mpo_level == SOL_SOCKET &&
1262 (mpo->mpo_name == SO_NOSIGPIPE || mpo->mpo_name == SO_NOADDRERR)) {
1263 error = ENOPROTOOPT;
1264 goto out;
1265 }
1266
1267 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1268 mp_so = mptetoso(mpte);
1269
1270 /*
1271 * Don't bother going further if there's no subflow; mark the option
1272 * with MPOF_INTERIM so that we know whether or not to remove this
1273 * option upon encountering an error while issuing it during subflow
1274 * socket creation.
1275 */
1276 if (mpte->mpte_numflows == 0) {
1277 VERIFY(TAILQ_EMPTY(&mpte->mpte_subflows));
1278 mpo->mpo_flags |= MPOF_INTERIM;
1279 /* return success */
1280 goto out;
1281 }
1282
1283 bzero(&smpo, sizeof (smpo));
1284 smpo.mpo_flags |= MPOF_SUBFLOW_OK;
1285 smpo.mpo_level = mpo->mpo_level;
1286 smpo.mpo_name = mpo->mpo_name;
1287
1288 /* grab exisiting values in case we need to rollback */
1289 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1290 struct socket *so;
1291
1292 mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
1293 mpts->mpts_oldintval = 0;
1294 smpo.mpo_intval = 0;
1295 VERIFY(mpts->mpts_socket != NULL);
1296 so = mpts->mpts_socket;
1297 if (mptcp_subflow_sogetopt(mpte, so, &smpo) == 0) {
1298 mpts->mpts_flags |= MPTSF_SOPT_OLDVAL;
1299 mpts->mpts_oldintval = smpo.mpo_intval;
1300 }
1301 }
1302
1303 /* apply socket option */
1304 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1305 struct socket *so;
1306
1307 mpts->mpts_flags |= MPTSF_SOPT_INPROG;
1308 VERIFY(mpts->mpts_socket != NULL);
1309 so = mpts->mpts_socket;
1310 error = mptcp_subflow_sosetopt(mpte, mpts, mpo);
1311 if (error != 0)
1312 break;
1313 }
1314
1315 /* cleanup, and rollback if needed */
1316 TAILQ_FOREACH(mpts, &mpte->mpte_subflows, mpts_entry) {
1317 struct socket *so;
1318
1319 if (!(mpts->mpts_flags & MPTSF_SOPT_INPROG)) {
1320 /* clear in case it's set */
1321 mpts->mpts_flags &= ~MPTSF_SOPT_OLDVAL;
1322 mpts->mpts_oldintval = 0;
1323 continue;
1324 }
1325 if (!(mpts->mpts_flags & MPTSF_SOPT_OLDVAL)) {
1326 mpts->mpts_flags &= ~MPTSF_SOPT_INPROG;
1327 VERIFY(mpts->mpts_oldintval == 0);
1328 continue;
1329 }
1330 /* error during sosetopt, so roll it back */
1331 if (error != 0) {
1332 VERIFY(mpts->mpts_socket != NULL);
1333 so = mpts->mpts_socket;
1334 smpo.mpo_intval = mpts->mpts_oldintval;
1335 mptcp_subflow_sosetopt(mpte, mpts, &smpo);
1336 }
1337 mpts->mpts_oldintval = 0;
1338 mpts->mpts_flags &= ~(MPTSF_SOPT_OLDVAL|MPTSF_SOPT_INPROG);
1339 }
1340
1341out:
1342 return (error);
1343}
1344
1345/*
1346 * Handle SOPT_SET for socket options issued on MP socket.
1347 */
1348static int
1349mptcp_setopt(struct mptses *mpte, struct sockopt *sopt)
1350{
1351 int error = 0, optval = 0, level, optname, rec = 1;
1352 struct mptopt smpo, *mpo = NULL;
1353 struct socket *mp_so;
1354
1355 level = sopt->sopt_level;
1356 optname = sopt->sopt_name;
1357
1358 mp_so = mptetoso(mpte);
1359
1360 /*
1361 * Record socket options which are applicable to subflow sockets so
1362 * that we can replay them for new ones; see mptcp_usr_socheckopt()
1363 * for the list of eligible socket-level options.
1364 */
1365 if (level == SOL_SOCKET) {
1366 switch (optname) {
1367 case SO_DEBUG:
1368 case SO_KEEPALIVE:
1369 case SO_USELOOPBACK:
1370 case SO_RANDOMPORT:
1371 case SO_TRAFFIC_CLASS:
1372 case SO_RECV_TRAFFIC_CLASS:
1373 case SO_PRIVILEGED_TRAFFIC_CLASS:
1374 case SO_RECV_ANYIF:
1375 case SO_RESTRICTIONS:
1376 case SO_NOWAKEFROMSLEEP:
1377 case SO_NOAPNFALLBK:
1378 case SO_MARK_CELLFALLBACK:
1379 /* record it */
1380 break;
1381 case SO_FLUSH:
1382 /* don't record it */
1383 rec = 0;
1384 break;
1385
1386 /* Next ones, record at MPTCP-level */
1387#if NECP
1388 case SO_NECP_CLIENTUUID:
1389 if (!uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1390 error = EINVAL;
1391 goto out;
1392 }
1393
1394 error = sooptcopyin(sopt, &mpsotomppcb(mp_so)->necp_client_uuid,
1395 sizeof(uuid_t), sizeof(uuid_t));
1396 if (error != 0) {
1397 goto out;
1398 }
1399
1400 mpsotomppcb(mp_so)->necp_cb = mptcp_session_necp_cb;
1401 error = necp_client_register_multipath_cb(mp_so->last_pid,
1402 mpsotomppcb(mp_so)->necp_client_uuid,
1403 mpsotomppcb(mp_so));
1404 if (error)
1405 goto out;
1406
1407 if (uuid_is_null(mpsotomppcb(mp_so)->necp_client_uuid)) {
1408 error = EINVAL;
1409 goto out;
1410 }
1411
1412 goto out;
1413 case SO_NECP_ATTRIBUTES:
1414#endif /* NECP */
1415 default:
1416 /* nothing to do; just return */
1417 goto out;
1418 }
1419 } else {
1420 switch (optname) {
1421 case TCP_NODELAY:
1422 case TCP_RXT_FINDROP:
1423 case TCP_KEEPALIVE:
1424 case TCP_KEEPINTVL:
1425 case TCP_KEEPCNT:
1426 case TCP_CONNECTIONTIMEOUT:
1427 case TCP_RXT_CONNDROPTIME:
1428 case PERSIST_TIMEOUT:
1429 case TCP_ADAPTIVE_READ_TIMEOUT:
1430 case TCP_ADAPTIVE_WRITE_TIMEOUT:
1431 /* eligible; record it */
1432 break;
1433 case TCP_NOTSENT_LOWAT:
1434 /* record at MPTCP level */
1435 error = sooptcopyin(sopt, &optval, sizeof(optval),
1436 sizeof(optval));
1437 if (error)
1438 goto out;
1439 if (optval < 0) {
1440 error = EINVAL;
1441 goto out;
1442 } else {
1443 if (optval == 0) {
1444 mp_so->so_flags &= ~SOF_NOTSENT_LOWAT;
1445 error = mptcp_set_notsent_lowat(mpte,0);
1446 } else {
1447 mp_so->so_flags |= SOF_NOTSENT_LOWAT;
1448 error = mptcp_set_notsent_lowat(mpte,
1449 optval);
1450 }
1451 }
1452 goto out;
1453 case MPTCP_SERVICE_TYPE:
1454 /* record at MPTCP level */
1455 error = sooptcopyin(sopt, &optval, sizeof(optval),
1456 sizeof(optval));
1457 if (error)
1458 goto out;
1459 if (optval < 0 || optval >= MPTCP_SVCTYPE_MAX) {
1460 error = EINVAL;
1461 goto out;
1462 }
1463
1464 mpte->mpte_svctype = optval;
1465
1466 if (mptcp_entitlement_check(mp_so) < 0) {
1467 error = EACCES;
1468 goto out;
1469 }
1470
1471 mpte->mpte_flags |= MPTE_SVCTYPE_CHECKED;
1472
1473 goto out;
1474 case MPTCP_ALTERNATE_PORT:
1475 /* record at MPTCP level */
1476 error = sooptcopyin(sopt, &optval, sizeof(optval),
1477 sizeof(optval));
1478 if (error)
1479 goto out;
1480
1481 if (optval < 0 || optval > UINT16_MAX) {
1482 error = EINVAL;
1483 goto out;
1484 }
1485
1486 mpte->mpte_alternate_port = optval;
1487
1488 goto out;
1489 default:
1490 /* not eligible */
1491 error = ENOPROTOOPT;
1492 goto out;
1493 }
1494 }
1495
1496 if ((error = sooptcopyin(sopt, &optval, sizeof (optval),
1497 sizeof (optval))) != 0)
1498 goto out;
1499
1500 if (rec) {
1501 /* search for an existing one; if not found, allocate */
1502 if ((mpo = mptcp_sopt_find(mpte, sopt)) == NULL)
1503 mpo = mptcp_sopt_alloc(M_WAITOK);
1504
1505 if (mpo == NULL) {
1506 error = ENOBUFS;
1507 } else {
1508 mptcplog((LOG_DEBUG, "%s: mp_so 0x%llx sopt %s val %d %s\n",
1509 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1510 mptcp_sopt2str(level, optname), optval,
1511 (mpo->mpo_flags & MPOF_ATTACHED) ?
1512 "updated" : "recorded"),
1513 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1514
1515 /* initialize or update, as needed */
1516 mpo->mpo_intval = optval;
1517 if (!(mpo->mpo_flags & MPOF_ATTACHED)) {
1518 mpo->mpo_level = level;
1519 mpo->mpo_name = optname;
1520 mptcp_sopt_insert(mpte, mpo);
1521 }
1522 /* this can be issued on the subflow socket */
1523 mpo->mpo_flags |= MPOF_SUBFLOW_OK;
1524 }
1525 } else {
1526 bzero(&smpo, sizeof (smpo));
1527 mpo = &smpo;
1528 mpo->mpo_flags |= MPOF_SUBFLOW_OK;
1529 mpo->mpo_level = level;
1530 mpo->mpo_name = optname;
1531 mpo->mpo_intval = optval;
1532 }
1533
1534 /* issue this socket option on existing subflows */
1535 if (error == 0) {
1536 error = mptcp_setopt_apply(mpte, mpo);
1537 if (error != 0 && (mpo->mpo_flags & MPOF_ATTACHED)) {
1538 VERIFY(mpo != &smpo);
1539 mptcp_sopt_remove(mpte, mpo);
1540 mptcp_sopt_free(mpo);
1541 }
1542 if (mpo == &smpo)
1543 mpo->mpo_flags &= ~MPOF_INTERIM;
1544 }
1545out:
1546 if (error == 0 && mpo != NULL) {
1547 mptcplog((LOG_INFO, "%s: mp_so 0x%llx sopt %s val %d set %s\n",
1548 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1549 mptcp_sopt2str(level, optname), optval,
1550 (mpo->mpo_flags & MPOF_INTERIM) ?
1551 "pending" : "successful"),
1552 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1553 } else if (error != 0) {
1554 mptcplog((LOG_ERR, "%s: mp_so 0x%llx sopt %s (%d, %d) val %d can't be issued error %d\n",
1555 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1556 mptcp_sopt2str(level, optname), level, optname, optval, error),
1557 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1558 }
1559 return (error);
1560}
1561
1562/*
1563 * Handle SOPT_GET for socket options issued on MP socket.
1564 */
1565static int
1566mptcp_getopt(struct mptses *mpte, struct sockopt *sopt)
1567{
1568 int error = 0, optval = 0;
1569
1570 /*
1571 * We only handle SOPT_GET for TCP level socket options; we should
1572 * not get here for socket level options since they are already
1573 * handled at the socket layer.
1574 */
1575 if (sopt->sopt_level != IPPROTO_TCP) {
1576 error = ENOPROTOOPT;
1577 goto out;
1578 }
1579
1580 switch (sopt->sopt_name) {
1581 case TCP_NODELAY:
1582 case TCP_RXT_FINDROP:
1583 case TCP_KEEPALIVE:
1584 case TCP_KEEPINTVL:
1585 case TCP_KEEPCNT:
1586 case TCP_CONNECTIONTIMEOUT:
1587 case TCP_RXT_CONNDROPTIME:
1588 case PERSIST_TIMEOUT:
1589 case TCP_ADAPTIVE_READ_TIMEOUT:
1590 case TCP_ADAPTIVE_WRITE_TIMEOUT:
1591 case TCP_NOTSENT_LOWAT:
1592 case MPTCP_SERVICE_TYPE:
1593 case MPTCP_ALTERNATE_PORT:
1594 /* eligible; get the default value just in case */
1595 error = mptcp_default_tcp_optval(mpte, sopt, &optval);
1596 break;
1597 default:
1598 /* not eligible */
1599 error = ENOPROTOOPT;
1600 break;
1601 }
1602
1603 switch (sopt->sopt_name) {
1604 case TCP_NOTSENT_LOWAT:
1605 if (mptetoso(mpte)->so_flags & SOF_NOTSENT_LOWAT)
1606 optval = mptcp_get_notsent_lowat(mpte);
1607 else
1608 optval = 0;
1609 goto out;
1610 case MPTCP_SERVICE_TYPE:
1611 optval = mpte->mpte_svctype;
1612 goto out;
1613 case MPTCP_ALTERNATE_PORT:
1614 optval = mpte->mpte_alternate_port;
1615 goto out;
1616 }
1617
1618 /*
1619 * Search for a previously-issued TCP level socket option and
1620 * return the recorded option value. This assumes that the
1621 * value did not get modified by the lower layer after it was
1622 * issued at setsockopt(2) time. If not found, we'll return
1623 * the default value obtained ealier.
1624 */
1625 if (error == 0) {
1626 struct mptopt *mpo;
1627
1628 if ((mpo = mptcp_sopt_find(mpte, sopt)) != NULL)
1629 optval = mpo->mpo_intval;
1630
1631 error = sooptcopyout(sopt, &optval, sizeof (int));
1632 }
1633out:
1634 return (error);
1635}
1636
1637/*
1638 * Return default values for TCP socket options. Ideally we would query the
1639 * subflow TCP socket, but that requires creating a subflow socket before
1640 * connectx(2) time. To simplify things, just return the default values
1641 * that we know of.
1642 */
1643static int
1644mptcp_default_tcp_optval(struct mptses *mpte, struct sockopt *sopt, int *optval)
1645{
1646 int error = 0;
1647
1648 VERIFY(sopt->sopt_level == IPPROTO_TCP);
1649 VERIFY(sopt->sopt_dir == SOPT_GET);
1650 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1651
1652 /* try to do what tcp_newtcpcb() does */
1653 switch (sopt->sopt_name) {
1654 case TCP_NODELAY:
1655 case TCP_RXT_FINDROP:
1656 case TCP_KEEPINTVL:
1657 case TCP_KEEPCNT:
1658 case TCP_CONNECTIONTIMEOUT:
1659 case TCP_RXT_CONNDROPTIME:
1660 case TCP_NOTSENT_LOWAT:
1661 case TCP_ADAPTIVE_READ_TIMEOUT:
1662 case TCP_ADAPTIVE_WRITE_TIMEOUT:
1663 case MPTCP_SERVICE_TYPE:
1664 case MPTCP_ALTERNATE_PORT:
1665 *optval = 0;
1666 break;
1667
1668 case TCP_KEEPALIVE:
1669 *optval = mptcp_subflow_keeptime;
1670 break;
1671
1672 case PERSIST_TIMEOUT:
1673 *optval = tcp_max_persist_timeout;
1674 break;
1675
1676 default:
1677 error = ENOPROTOOPT;
1678 break;
1679 }
1680 return (error);
1681}
1682
1683/*
1684 * MPTCP SOPT_{SET,GET} socket option handler, for options issued on the MP
1685 * socket, at SOL_SOCKET and IPPROTO_TCP levels. The former is restricted
1686 * to those that are allowed by mptcp_usr_socheckopt().
1687 */
1688int
1689mptcp_ctloutput(struct socket *mp_so, struct sockopt *sopt)
1690{
1691 struct mppcb *mpp = mpsotomppcb(mp_so);
1692 struct mptses *mpte;
1693 int error = 0;
1694
1695 if (mpp == NULL || mpp->mpp_state == MPPCB_STATE_DEAD) {
1696 error = EINVAL;
1697 goto out;
1698 }
1699 mpte = mptompte(mpp);
1700 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1701
1702 /* we only handle socket and TCP-level socket options for MPTCP */
1703 if (sopt->sopt_level != SOL_SOCKET && sopt->sopt_level != IPPROTO_TCP) {
1704 mptcplog((LOG_DEBUG, "MPTCP Socket: "
1705 "%s: mp_so 0x%llx sopt %s level not "
1706 "handled\n", __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so),
1707 mptcp_sopt2str(sopt->sopt_level, sopt->sopt_name)),
1708 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_LOG);
1709 error = EINVAL;
1710 goto out;
1711 }
1712
1713 switch (sopt->sopt_dir) {
1714 case SOPT_SET:
1715 error = mptcp_setopt(mpte, sopt);
1716 break;
1717
1718 case SOPT_GET:
1719 error = mptcp_getopt(mpte, sopt);
1720 break;
1721 }
1722out:
1723 return (error);
1724}
1725
1726const char *
1727mptcp_sopt2str(int level, int optname)
1728{
1729 switch (level) {
1730 case SOL_SOCKET:
1731 switch (optname) {
1732 case SO_LINGER:
1733 return ("SO_LINGER");
1734 case SO_LINGER_SEC:
1735 return ("SO_LINGER_SEC");
1736 case SO_DEBUG:
1737 return ("SO_DEBUG");
1738 case SO_KEEPALIVE:
1739 return ("SO_KEEPALIVE");
1740 case SO_USELOOPBACK:
1741 return ("SO_USELOOPBACK");
1742 case SO_TYPE:
1743 return ("SO_TYPE");
1744 case SO_NREAD:
1745 return ("SO_NREAD");
1746 case SO_NWRITE:
1747 return ("SO_NWRITE");
1748 case SO_ERROR:
1749 return ("SO_ERROR");
1750 case SO_SNDBUF:
1751 return ("SO_SNDBUF");
1752 case SO_RCVBUF:
1753 return ("SO_RCVBUF");
1754 case SO_SNDLOWAT:
1755 return ("SO_SNDLOWAT");
1756 case SO_RCVLOWAT:
1757 return ("SO_RCVLOWAT");
1758 case SO_SNDTIMEO:
1759 return ("SO_SNDTIMEO");
1760 case SO_RCVTIMEO:
1761 return ("SO_RCVTIMEO");
1762 case SO_NKE:
1763 return ("SO_NKE");
1764 case SO_NOSIGPIPE:
1765 return ("SO_NOSIGPIPE");
1766 case SO_NOADDRERR:
1767 return ("SO_NOADDRERR");
1768 case SO_RESTRICTIONS:
1769 return ("SO_RESTRICTIONS");
1770 case SO_LABEL:
1771 return ("SO_LABEL");
1772 case SO_PEERLABEL:
1773 return ("SO_PEERLABEL");
1774 case SO_RANDOMPORT:
1775 return ("SO_RANDOMPORT");
1776 case SO_TRAFFIC_CLASS:
1777 return ("SO_TRAFFIC_CLASS");
1778 case SO_RECV_TRAFFIC_CLASS:
1779 return ("SO_RECV_TRAFFIC_CLASS");
1780 case SO_TRAFFIC_CLASS_DBG:
1781 return ("SO_TRAFFIC_CLASS_DBG");
1782 case SO_PRIVILEGED_TRAFFIC_CLASS:
1783 return ("SO_PRIVILEGED_TRAFFIC_CLASS");
1784 case SO_DEFUNCTOK:
1785 return ("SO_DEFUNCTOK");
1786 case SO_ISDEFUNCT:
1787 return ("SO_ISDEFUNCT");
1788 case SO_OPPORTUNISTIC:
1789 return ("SO_OPPORTUNISTIC");
1790 case SO_FLUSH:
1791 return ("SO_FLUSH");
1792 case SO_RECV_ANYIF:
1793 return ("SO_RECV_ANYIF");
1794 case SO_NOWAKEFROMSLEEP:
1795 return ("SO_NOWAKEFROMSLEEP");
1796 case SO_NOAPNFALLBK:
1797 return ("SO_NOAPNFALLBK");
1798 case SO_MARK_CELLFALLBACK:
1799 return ("SO_CELLFALLBACK");
1800 case SO_DELEGATED:
1801 return ("SO_DELEGATED");
1802 case SO_DELEGATED_UUID:
1803 return ("SO_DELEGATED_UUID");
1804#if NECP
1805 case SO_NECP_ATTRIBUTES:
1806 return ("SO_NECP_ATTRIBUTES");
1807 case SO_NECP_CLIENTUUID:
1808 return ("SO_NECP_CLIENTUUID");
1809#endif /* NECP */
1810 }
1811
1812 break;
1813 case IPPROTO_TCP:
1814 switch (optname) {
1815 case TCP_NODELAY:
1816 return ("TCP_NODELAY");
1817 case TCP_KEEPALIVE:
1818 return ("TCP_KEEPALIVE");
1819 case TCP_KEEPINTVL:
1820 return ("TCP_KEEPINTVL");
1821 case TCP_KEEPCNT:
1822 return ("TCP_KEEPCNT");
1823 case TCP_CONNECTIONTIMEOUT:
1824 return ("TCP_CONNECTIONTIMEOUT");
1825 case TCP_RXT_CONNDROPTIME:
1826 return ("TCP_RXT_CONNDROPTIME");
1827 case PERSIST_TIMEOUT:
1828 return ("PERSIST_TIMEOUT");
1829 case TCP_NOTSENT_LOWAT:
1830 return ("NOTSENT_LOWAT");
1831 case TCP_ADAPTIVE_READ_TIMEOUT:
1832 return ("ADAPTIVE_READ_TIMEOUT");
1833 case TCP_ADAPTIVE_WRITE_TIMEOUT:
1834 return ("ADAPTIVE_WRITE_TIMEOUT");
1835 case MPTCP_SERVICE_TYPE:
1836 return ("MPTCP_SERVICE_TYPE");
1837 case MPTCP_ALTERNATE_PORT:
1838 return ("MPTCP_ALTERNATE_PORT");
1839 }
1840
1841 break;
1842 }
1843
1844 return ("unknown");
1845}
1846
1847static int
1848mptcp_usr_preconnect(struct socket *mp_so)
1849{
1850 struct mptsub *mpts = NULL;
1851 struct mppcb *mpp = mpsotomppcb(mp_so);
1852 struct mptses *mpte;
1853 struct socket *so;
1854 struct tcpcb *tp = NULL;
1855 int error;
1856
1857 mpte = mptompte(mpp);
1858 VERIFY(mpte != NULL);
1859 mpte_lock_assert_held(mpte); /* same as MP socket lock */
1860
1861 mpts = mptcp_get_subflow(mpte, NULL, NULL);
1862 if (mpts == NULL) {
1863 mptcplog((LOG_ERR, "%s: mp_so 0x%llx invalid preconnect ",
1864 __func__, (u_int64_t)VM_KERNEL_ADDRPERM(mp_so)),
1865 MPTCP_SOCKET_DBG, MPTCP_LOGLVL_ERR);
1866 return (EINVAL);
1867 }
1868 mpts->mpts_flags &= ~MPTSF_TFO_REQD;
1869 so = mpts->mpts_socket;
1870 tp = intotcpcb(sotoinpcb(so));
1871 tp->t_mpflags &= ~TMPF_TFO_REQUEST;
1872 error = tcp_output(sototcpcb(so));
1873
1874 soclearfastopen(mp_so);
1875
1876 return (error);
1877}
1878