1/*
2 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
61 */
62/*
63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64 * support for mandatory and extensible security protections. This notice
65 * is included in support of clause 2.2 (b) of the Apple Public License,
66 * Version 2.0.
67 */
68
69#include "tcp_includes.h"
70
71#include <sys/param.h>
72#include <sys/systm.h>
73#include <sys/kernel.h>
74#include <sys/sysctl.h>
75#include <sys/malloc.h>
76#include <sys/mbuf.h>
77#include <sys/domain.h>
78#include <sys/proc.h>
79#include <sys/kauth.h>
80#include <sys/socket.h>
81#include <sys/socketvar.h>
82#include <sys/protosw.h>
83#include <sys/random.h>
84#include <sys/syslog.h>
85#include <sys/mcache.h>
86#include <kern/locks.h>
87#include <kern/zalloc.h>
88
89#include <dev/random/randomdev.h>
90
91#include <net/route.h>
92#include <net/if.h>
93#include <net/content_filter.h>
94#include <net/ntstat.h>
95#include <net/multi_layer_pkt_log.h>
96
97#define tcp_minmssoverload fring
98#define _IP_VHL
99#include <netinet/in.h>
100#include <netinet/in_systm.h>
101#include <netinet/ip.h>
102#include <netinet/ip_icmp.h>
103#include <netinet/ip6.h>
104#include <netinet/icmp6.h>
105#include <netinet/in_pcb.h>
106#include <netinet6/in6_pcb.h>
107#include <netinet/in_var.h>
108#include <netinet/ip_var.h>
109#include <netinet/icmp_var.h>
110#include <netinet6/ip6_var.h>
111#include <netinet/mptcp_var.h>
112#include <netinet/tcp.h>
113#include <netinet/tcp_fsm.h>
114#include <netinet/tcp_seq.h>
115#include <netinet/tcp_timer.h>
116#include <netinet/tcp_var.h>
117#include <netinet/tcp_cc.h>
118#include <netinet/tcp_cache.h>
119#include <kern/thread_call.h>
120
121#include <netinet6/tcp6_var.h>
122#include <netinet/tcpip.h>
123#if TCPDEBUG
124#include <netinet/tcp_debug.h>
125#endif
126#include <netinet/tcp_log.h>
127
128#include <netinet6/ip6protosw.h>
129
130#if IPSEC
131#include <netinet6/ipsec.h>
132#include <netinet6/ipsec6.h>
133#endif /* IPSEC */
134
135#if NECP
136#include <net/necp.h>
137#endif /* NECP */
138
139#undef tcp_minmssoverload
140
141#include <net/sockaddr_utils.h>
142
143#include <corecrypto/ccaes.h>
144#include <libkern/crypto/aes.h>
145#include <libkern/crypto/md5.h>
146#include <sys/kdebug.h>
147#include <mach/sdt.h>
148#include <pexpert/pexpert.h>
149
150#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
151
152static tcp_cc tcp_ccgen;
153
154extern struct tcptimerlist tcp_timer_list;
155extern struct tcptailq tcp_tw_tailq;
156
157extern int tcp_awdl_rtobase;
158
159SYSCTL_SKMEM_TCP_INT(TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED,
160 int, tcp_mssdflt, TCP_MSS, "Default TCP Maximum Segment Size");
161
162SYSCTL_SKMEM_TCP_INT(TCPCTL_V6MSSDFLT, v6mssdflt,
163 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_v6mssdflt, TCP6_MSS,
164 "Default TCP Maximum Segment Size for IPv6");
165
166int tcp_sysctl_fastopenkey(struct sysctl_oid *, void *, int,
167 struct sysctl_req *);
168SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key, CTLTYPE_STRING | CTLFLAG_WR,
169 0, 0, tcp_sysctl_fastopenkey, "S", "TCP Fastopen key");
170
171/* Current count of half-open TFO connections */
172int tcp_tfo_halfcnt = 0;
173
174/* Maximum of half-open TFO connection backlog */
175SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen_backlog,
176 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_tfo_backlog, 10,
177 "Backlog queue for half-open TFO connections");
178
179SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen, CTLFLAG_RW | CTLFLAG_LOCKED,
180 int, tcp_fastopen, TCP_FASTOPEN_CLIENT | TCP_FASTOPEN_SERVER,
181 "Enable TCP Fastopen (RFC 7413)");
182
183SYSCTL_SKMEM_TCP_INT(OID_AUTO, now_init, CTLFLAG_RD | CTLFLAG_LOCKED,
184 uint32_t, tcp_now_init, 0, "Initial tcp now value");
185
186SYSCTL_SKMEM_TCP_INT(OID_AUTO, microuptime_init, CTLFLAG_RD | CTLFLAG_LOCKED,
187 uint32_t, tcp_microuptime_init, 0, "Initial tcp uptime value in micro seconds");
188
189/*
190 * Minimum MSS we accept and use. This prevents DoS attacks where
191 * we are forced to a ridiculous low MSS like 20 and send hundreds
192 * of packets instead of one. The effect scales with the available
193 * bandwidth and quickly saturates the CPU and network interface
194 * with packet generation and sending. Set to zero to disable MINMSS
195 * checking. This setting prevents us from sending too small packets.
196 */
197SYSCTL_SKMEM_TCP_INT(OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED,
198 int, tcp_minmss, TCP_MINMSS, "Minmum TCP Maximum Segment Size");
199
200SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
201 &tcbinfo.ipi_count, 0, "Number of active PCBs");
202
203SYSCTL_INT(_net_inet_tcp, OID_AUTO, tw_pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
204 &tcbinfo.ipi_twcount, 0, "Number of pcbs in time-wait state");
205
206SYSCTL_SKMEM_TCP_INT(OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED,
207 static int, icmp_may_rst, 1,
208 "Certain ICMP unreachable messages may abort connections in SYN_SENT");
209
210static int tcp_strict_rfc1948 = 0;
211static int tcp_isn_reseed_interval = 0;
212int tcp_do_timestamps = 1;
213#if (DEVELOPMENT || DEBUG)
214SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly");
216
217SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval,
218 CTLFLAG_RW | CTLFLAG_LOCKED,
219 &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
220
221SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_timestamps,
222 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_timestamps, 0, "enable TCP timestamps");
223#endif /* (DEVELOPMENT || DEBUG) */
224
225SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED,
226 int, tcp_TCPTV_MIN, 100, "min rtt value allowed");
227
228SYSCTL_SKMEM_TCP_INT(OID_AUTO, rexmt_slop, CTLFLAG_RW,
229 int, tcp_rexmt_slop, TCPTV_REXMTSLOP, "Slop added to retransmit timeout");
230
231SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED,
232 __private_extern__ int, tcp_use_randomport, 0,
233 "Randomize TCP port numbers");
234
235SYSCTL_SKMEM_TCP_INT(OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED,
236 __private_extern__ int, tcp_win_scale, 3, "Window scaling factor");
237
238#if (DEVELOPMENT || DEBUG)
239SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
240 CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
241 "Initalize RTT from route cache");
242#else
243SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
244 CTLFLAG_RD | CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, 1,
245 "Initalize RTT from route cache");
246#endif /* (DEVELOPMENT || DEBUG) */
247
248static int tso_debug = 0;
249SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
250 &tso_debug, 0, "TSO verbosity");
251
252static int tcp_rxt_seg_max = 1024;
253SYSCTL_INT(_net_inet_tcp, OID_AUTO, rxt_seg_max, CTLFLAG_RW | CTLFLAG_LOCKED,
254 &tcp_rxt_seg_max, 0, "");
255
256static unsigned long tcp_rxt_seg_drop = 0;
257SYSCTL_ULONG(_net_inet_tcp, OID_AUTO, rxt_seg_drop, CTLFLAG_RD | CTLFLAG_LOCKED,
258 &tcp_rxt_seg_drop, "");
259
260static void tcp_notify(struct inpcb *, int);
261
262static KALLOC_TYPE_DEFINE(tcp_bwmeas_zone, struct bwmeas, NET_KT_DEFAULT);
263KALLOC_TYPE_DEFINE(tcp_reass_zone, struct tseg_qent, NET_KT_DEFAULT);
264KALLOC_TYPE_DEFINE(tcp_rxt_seg_zone, struct tcp_rxt_seg, NET_KT_DEFAULT);
265
266extern int slowlink_wsize; /* window correction for slow links */
267extern int path_mtu_discovery;
268
269uint32_t tcp_now_remainder_us = 0; /* remaining micro seconds for tcp_now */
270
271static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb);
272
273#define TCP_BWMEAS_BURST_MINSIZE 6
274#define TCP_BWMEAS_BURST_MAXSIZE 25
275
276/*
277 * Target size of TCP PCB hash tables. Must be a power of two.
278 *
279 * Note that this can be overridden by the kernel environment
280 * variable net.inet.tcp.tcbhashsize
281 */
282#ifndef TCBHASHSIZE
283#define TCBHASHSIZE CONFIG_TCBHASHSIZE
284#endif
285
286__private_extern__ int tcp_tcbhashsize = TCBHASHSIZE;
287SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED,
288 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
289
290/*
291 * This is the actual shape of what we allocate using the zone
292 * allocator. Doing it this way allows us to protect both structures
293 * using the same generation count, and also eliminates the overhead
294 * of allocating tcpcbs separately. By hiding the structure here,
295 * we avoid changing most of the rest of the code (although it needs
296 * to be changed, eventually, for greater efficiency).
297 */
298#define ALIGNMENT 32
299struct inp_tp {
300 struct inpcb inp;
301 struct tcpcb tcb __attribute__((aligned(ALIGNMENT)));
302};
303#undef ALIGNMENT
304
305static KALLOC_TYPE_DEFINE(tcpcbzone, struct inp_tp, NET_KT_DEFAULT);
306
307int get_inpcb_str_size(void);
308int get_tcp_str_size(void);
309
310os_log_t tcp_mpkl_log_object = NULL;
311
312static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *);
313
314int tcp_notsent_lowat_check(struct socket *so);
315static void tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
316 struct if_lim_perf_stat *stat);
317static void tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
318 struct if_tcp_ecn_perf_stat *stat);
319
320static aes_encrypt_ctx tfo_ctx; /* Crypto-context for TFO */
321
322void
323tcp_tfo_gen_cookie(struct inpcb *inp, u_char *out, size_t blk_size)
324{
325 u_char in[CCAES_BLOCK_SIZE];
326 int isipv6 = inp->inp_vflag & INP_IPV6;
327
328 VERIFY(blk_size == CCAES_BLOCK_SIZE);
329
330 bzero(s: &in[0], CCAES_BLOCK_SIZE);
331 bzero(s: &out[0], CCAES_BLOCK_SIZE);
332
333 if (isipv6) {
334 memcpy(dst: in, src: &inp->in6p_faddr, n: sizeof(struct in6_addr));
335 } else {
336 memcpy(dst: in, src: &inp->inp_faddr, n: sizeof(struct in_addr));
337 }
338
339 aes_encrypt_cbc(in_blk: in, NULL, num_blk: 1, out_blk: out, cx: &tfo_ctx);
340}
341
342__private_extern__ int
343tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1,
344 __unused int arg2, struct sysctl_req *req)
345{
346 int error = 0;
347 /*
348 * TFO-key is expressed as a string in hex format
349 * +1 to account for the \0 char
350 * +1 because sysctl_io_string() expects a string length but the sysctl command
351 * now includes the terminating \0 in newlen -- see rdar://77205344
352 */
353 char keystring[TCP_FASTOPEN_KEYLEN * 2 + 2];
354 u_int32_t key[TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)];
355 int i;
356
357 /*
358 * sysctl_io_string copies keystring into the oldptr of the sysctl_req.
359 * Make sure everything is zero, to avoid putting garbage in there or
360 * leaking the stack.
361 */
362 bzero(s: keystring, n: sizeof(keystring));
363
364 error = sysctl_io_string(req, pValue: keystring, valueSize: sizeof(keystring), trunc: 0, NULL);
365 if (error) {
366 os_log(OS_LOG_DEFAULT,
367 "%s: sysctl_io_string() error %d, req->newlen %lu, sizeof(keystring) %lu",
368 __func__, error, req->newlen, sizeof(keystring));
369 goto exit;
370 }
371 if (req->newptr == USER_ADDR_NULL) {
372 goto exit;
373 }
374
375 if (strlen(s: keystring) != TCP_FASTOPEN_KEYLEN * 2) {
376 os_log(OS_LOG_DEFAULT,
377 "%s: strlen(keystring) %lu != TCP_FASTOPEN_KEYLEN * 2 %u, newlen %lu",
378 __func__, strlen(keystring), TCP_FASTOPEN_KEYLEN * 2, req->newlen);
379 error = EINVAL;
380 goto exit;
381 }
382
383 for (i = 0; i < (TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)); i++) {
384 /*
385 * We jump over the keystring in 8-character (4 byte in hex)
386 * steps
387 */
388 if (sscanf(&keystring[i * 8], "%8x", &key[i]) != 1) {
389 error = EINVAL;
390 os_log(OS_LOG_DEFAULT,
391 "%s: sscanf() != 1, error EINVAL", __func__);
392 goto exit;
393 }
394 }
395
396 aes_encrypt_key128(key: (u_char *)key, cx: &tfo_ctx);
397
398exit:
399 return error;
400}
401
402int
403get_inpcb_str_size(void)
404{
405 return sizeof(struct inpcb);
406}
407
408int
409get_tcp_str_size(void)
410{
411 return sizeof(struct tcpcb);
412}
413
414static int scale_to_powerof2(int size);
415
416/*
417 * This helper routine returns one of the following scaled value of size:
418 * 1. Rounded down power of two value of size if the size value passed as
419 * argument is not a power of two and the rounded up value overflows.
420 * OR
421 * 2. Rounded up power of two value of size if the size value passed as
422 * argument is not a power of two and the rounded up value does not overflow
423 * OR
424 * 3. Same value as argument size if it is already a power of two.
425 */
426static int
427scale_to_powerof2(int size)
428{
429 /* Handle special case of size = 0 */
430 int ret = size ? size : 1;
431
432 if (!powerof2(ret)) {
433 while (!powerof2(size)) {
434 /*
435 * Clear out least significant
436 * set bit till size is left with
437 * its highest set bit at which point
438 * it is rounded down power of two.
439 */
440 size = size & (size - 1);
441 }
442
443 /* Check for overflow when rounding up */
444 if (0 == (size << 1)) {
445 ret = size;
446 } else {
447 ret = size << 1;
448 }
449 }
450
451 return ret;
452}
453
454/*
455 * Round the floating point to the next integer
456 * Eg. 1.3 will round up to 2.
457 */
458uint32_t
459tcp_ceil(double a)
460{
461 double res = (uint32_t) a;
462 return (uint32_t)(res + (res < a));
463}
464
465uint32_t
466tcp_round_to(uint32_t val, uint32_t round)
467{
468 /*
469 * Round up or down based on the middle. Meaning, if we round upon a
470 * multiple of 10, 16 will round to 20 and 14 will round to 10.
471 */
472 return ((val + (round / 2)) / round) * round;
473}
474
475/*
476 * Round up to the next multiple of base.
477 * Eg. for a base of 64, 65 will become 128,
478 * 2896 will become 2944.
479 */
480uint32_t
481tcp_round_up(uint32_t val, uint32_t base)
482{
483 if (base == 1 || val % base == 0) {
484 return val;
485 }
486
487 return ((val + base) / base) * base;
488}
489
490static void
491tcp_tfo_init(void)
492{
493 u_char key[TCP_FASTOPEN_KEYLEN];
494
495 read_frandom(buffer: key, numBytes: sizeof(key));
496 aes_encrypt_key128(key, cx: &tfo_ctx);
497}
498
499/*
500 * Tcp initialization
501 */
502void
503tcp_init(struct protosw *pp, struct domain *dp)
504{
505#pragma unused(dp)
506 static int tcp_initialized = 0;
507 struct inpcbinfo *pcbinfo;
508
509 VERIFY((pp->pr_flags & (PR_INITIALIZED | PR_ATTACHED)) == PR_ATTACHED);
510
511 if (tcp_initialized) {
512 return;
513 }
514 tcp_initialized = 1;
515
516#if DEBUG || DEVELOPMENT
517 (void) PE_parse_boot_argn("tcp_rxt_seg_max", &tcp_rxt_seg_max,
518 sizeof(tcp_rxt_seg_max));
519#endif /* DEBUG || DEVELOPMENT */
520
521 tcp_ccgen = 1;
522 tcp_keepinit = TCPTV_KEEP_INIT;
523 tcp_keepidle = TCPTV_KEEP_IDLE;
524 tcp_keepintvl = TCPTV_KEEPINTVL;
525 tcp_keepcnt = TCPTV_KEEPCNT;
526 tcp_maxpersistidle = TCPTV_KEEP_IDLE;
527 tcp_msl = TCPTV_MSL;
528
529 microuptime(tv: &tcp_uptime);
530 read_frandom(buffer: &tcp_now, numBytes: sizeof(tcp_now));
531
532 /* Starts tcp internal clock at a random value */
533 tcp_now = tcp_now & 0x3fffffff;
534
535 /* expose initial uptime/now via systcl for utcp to keep time sync */
536 tcp_now_init = tcp_now;
537 tcp_microuptime_init =
538 (uint32_t)(tcp_uptime.tv_usec + (tcp_uptime.tv_sec * USEC_PER_SEC));
539 SYSCTL_SKMEM_UPDATE_FIELD(tcp.microuptime_init, tcp_microuptime_init);
540 SYSCTL_SKMEM_UPDATE_FIELD(tcp.now_init, tcp_now_init);
541
542 tcp_tfo_init();
543
544 LIST_INIT(&tcb);
545 tcbinfo.ipi_listhead = &tcb;
546
547 pcbinfo = &tcbinfo;
548
549 /*
550 * allocate group, lock attributes and lock for tcp pcb mutexes
551 */
552 pcbinfo->ipi_lock_grp = lck_grp_alloc_init(grp_name: "tcppcb",
553 LCK_GRP_ATTR_NULL);
554 lck_attr_setdefault(attr: &pcbinfo->ipi_lock_attr);
555 lck_rw_init(lck: &pcbinfo->ipi_lock, grp: pcbinfo->ipi_lock_grp,
556 attr: &pcbinfo->ipi_lock_attr);
557
558 if (tcp_tcbhashsize == 0) {
559 /* Set to default */
560 tcp_tcbhashsize = 512;
561 }
562
563 if (!powerof2(tcp_tcbhashsize)) {
564 int old_hash_size = tcp_tcbhashsize;
565 tcp_tcbhashsize = scale_to_powerof2(size: tcp_tcbhashsize);
566 /* Lower limit of 16 */
567 if (tcp_tcbhashsize < 16) {
568 tcp_tcbhashsize = 16;
569 }
570 printf("WARNING: TCB hash size not a power of 2, "
571 "scaled from %d to %d.\n",
572 old_hash_size,
573 tcp_tcbhashsize);
574 }
575
576 tcbinfo.ipi_hashbase = hashinit(count: tcp_tcbhashsize, M_PCB,
577 hashmask: &tcbinfo.ipi_hashmask);
578 tcbinfo.ipi_porthashbase = hashinit(count: tcp_tcbhashsize, M_PCB,
579 hashmask: &tcbinfo.ipi_porthashmask);
580 tcbinfo.ipi_zone = tcpcbzone;
581
582 tcbinfo.ipi_gc = tcp_gc;
583 tcbinfo.ipi_timer = tcp_itimer;
584 in_pcbinfo_attach(&tcbinfo);
585
586#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
587 if (max_protohdr < TCP_MINPROTOHDR) {
588 max_protohdr = (int)P2ROUNDUP(TCP_MINPROTOHDR, sizeof(uint32_t));
589 }
590 if (max_linkhdr + max_protohdr > MCLBYTES) {
591 panic("tcp_init");
592 }
593#undef TCP_MINPROTOHDR
594
595 /* Initialize time wait and timer lists */
596 TAILQ_INIT(&tcp_tw_tailq);
597
598 bzero(s: &tcp_timer_list, n: sizeof(tcp_timer_list));
599 LIST_INIT(&tcp_timer_list.lhead);
600 /*
601 * allocate group and attribute for the tcp timer list
602 */
603 tcp_timer_list.mtx_grp = lck_grp_alloc_init(grp_name: "tcptimerlist",
604 LCK_GRP_ATTR_NULL);
605 lck_mtx_init(lck: &tcp_timer_list.mtx, grp: tcp_timer_list.mtx_grp,
606 LCK_ATTR_NULL);
607
608 tcp_timer_list.call = thread_call_allocate(func: tcp_run_timerlist, NULL);
609 if (tcp_timer_list.call == NULL) {
610 panic("failed to allocate call entry 1 in tcp_init");
611 }
612
613 /* Initialize TCP Cache */
614 tcp_cache_init();
615
616 tcp_mpkl_log_object = MPKL_CREATE_LOGOBJECT("com.apple.xnu.tcp");
617 if (tcp_mpkl_log_object == NULL) {
618 panic("MPKL_CREATE_LOGOBJECT failed");
619 }
620
621 if (PE_parse_boot_argn(arg_string: "tcp_log", arg_ptr: &tcp_log_enable_flags, max_arg: sizeof(tcp_log_enable_flags))) {
622 os_log(OS_LOG_DEFAULT, "tcp_init: set tcp_log_enable_flags to 0x%x", tcp_log_enable_flags);
623 }
624
625 /*
626 * If more than 4GB of actual memory is available, increase the
627 * maximum allowed receive and send socket buffer size.
628 */
629 if (mem_actual >= (1ULL << (GBSHIFT + 2))) {
630 tcp_autorcvbuf_max = 4 * 1024 * 1024;
631 tcp_autosndbuf_max = 4 * 1024 * 1024;
632
633 SYSCTL_SKMEM_UPDATE_FIELD(tcp.autorcvbufmax, tcp_autorcvbuf_max);
634 SYSCTL_SKMEM_UPDATE_FIELD(tcp.autosndbufmax, tcp_autosndbuf_max);
635 }
636
637 /* Initialize the TCP CCA array */
638 tcp_cc_init();
639}
640
641/*
642 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
643 * tcp_template used to store this data in mbufs, but we now recopy it out
644 * of the tcpcb each time to conserve mbufs.
645 */
646void
647tcp_fillheaders(struct mbuf *m, struct tcpcb *tp, void *ip_ptr, void *tcp_ptr)
648{
649 struct inpcb *inp = tp->t_inpcb;
650 struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
651
652 if ((inp->inp_vflag & INP_IPV6) != 0) {
653 struct ip6_hdr *ip6;
654
655 ip6 = (struct ip6_hdr *)ip_ptr;
656 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
657 (inp->inp_flow & IPV6_FLOWINFO_MASK);
658 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
659 (IPV6_VERSION & IPV6_VERSION_MASK);
660 ip6->ip6_plen = htons(sizeof(struct tcphdr));
661 ip6->ip6_nxt = IPPROTO_TCP;
662 ip6->ip6_hlim = 0;
663 ip6->ip6_src = inp->in6p_laddr;
664 ip6->ip6_dst = inp->in6p_faddr;
665 if (m->m_flags & M_PKTHDR) {
666 uint32_t lifscope = inp->inp_lifscope != 0 ? inp->inp_lifscope : inp->inp_fifscope;
667 uint32_t fifscope = inp->inp_fifscope != 0 ? inp->inp_fifscope : inp->inp_lifscope;
668 ip6_output_setsrcifscope(m, lifscope, NULL);
669 ip6_output_setdstifscope(m, fifscope, NULL);
670 }
671 tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr,
672 htonl(sizeof(struct tcphdr) + IPPROTO_TCP));
673 } else {
674 struct ip *ip = (struct ip *) ip_ptr;
675
676 ip->ip_vhl = IP_VHL_BORING;
677 ip->ip_tos = 0;
678 ip->ip_len = 0;
679 ip->ip_id = 0;
680 ip->ip_off = 0;
681 ip->ip_ttl = 0;
682 ip->ip_sum = 0;
683 ip->ip_p = IPPROTO_TCP;
684 ip->ip_src = inp->inp_laddr;
685 ip->ip_dst = inp->inp_faddr;
686 tcp_hdr->th_sum =
687 in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
688 htons(sizeof(struct tcphdr) + IPPROTO_TCP));
689 }
690
691 tcp_hdr->th_sport = inp->inp_lport;
692 tcp_hdr->th_dport = inp->inp_fport;
693 tcp_hdr->th_seq = 0;
694 tcp_hdr->th_ack = 0;
695 tcp_hdr->th_x2 = 0;
696 tcp_hdr->th_off = 5;
697 tcp_hdr->th_flags = 0;
698 tcp_hdr->th_win = 0;
699 tcp_hdr->th_urp = 0;
700}
701
702/*
703 * Create template to be used to send tcp packets on a connection.
704 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
705 * use for this function is in keepalives, which use tcp_respond.
706 */
707struct tcptemp *
708tcp_maketemplate(struct tcpcb *tp, struct mbuf **mp)
709{
710 struct mbuf *m;
711 struct tcptemp *n;
712
713 *mp = m = m_get(M_DONTWAIT, MT_HEADER);
714 if (m == NULL) {
715 return NULL;
716 }
717 m->m_len = sizeof(struct tcptemp);
718 n = mtod(m, struct tcptemp *);
719
720 tcp_fillheaders(m, tp, ip_ptr: (void *)&n->tt_ipgen, tcp_ptr: (void *)&n->tt_t);
721 return n;
722}
723
724/*
725 * Send a single message to the TCP at address specified by
726 * the given TCP/IP header. If m == 0, then we make a copy
727 * of the tcpiphdr at ti and send directly to the addressed host.
728 * This is used to force keep alive messages out using the TCP
729 * template for a connection. If flags are given then we send
730 * a message back to the TCP which originated the * segment ti,
731 * and discard the mbuf containing it and any other attached mbufs.
732 *
733 * In any case the ack and sequence number of the transmitted
734 * segment are as specified by the parameters.
735 *
736 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
737 */
738void
739tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
740 tcp_seq ack, tcp_seq seq, uint8_t flags, struct tcp_respond_args *tra)
741{
742 uint16_t tlen;
743 int win = 0;
744 struct route *ro = 0;
745 struct route sro;
746 struct ip *ip;
747 struct tcphdr *nth;
748 struct route_in6 *ro6 = 0;
749 struct route_in6 sro6;
750 struct ip6_hdr *ip6;
751 int isipv6;
752 struct ifnet *outif;
753 int sotc = SO_TC_UNSPEC;
754 bool check_qos_marking_again = FALSE;
755 uint32_t sifscope = IFSCOPE_NONE, fifscope = IFSCOPE_NONE;
756
757 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
758 ip6 = ipgen;
759 ip = ipgen;
760
761 if (tp) {
762 check_qos_marking_again = tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE ? FALSE : TRUE;
763 sifscope = tp->t_inpcb->inp_lifscope;
764 fifscope = tp->t_inpcb->inp_fifscope;
765 if (!(flags & TH_RST)) {
766 win = tcp_sbspace(tp);
767 if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale) {
768 win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
769 }
770 }
771 if (isipv6) {
772 ro6 = &tp->t_inpcb->in6p_route;
773 } else {
774 ro = &tp->t_inpcb->inp_route;
775 }
776 } else {
777 if (isipv6) {
778 ro6 = &sro6;
779 bzero(s: ro6, n: sizeof(*ro6));
780 } else {
781 ro = &sro;
782 bzero(s: ro, n: sizeof(*ro));
783 }
784 }
785 if (m == 0) {
786 m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */
787 if (m == NULL) {
788 return;
789 }
790 tlen = 0;
791 m->m_data += max_linkhdr;
792 if (isipv6) {
793 VERIFY((MHLEN - max_linkhdr) >=
794 (sizeof(*ip6) + sizeof(*nth)));
795 bcopy(src: (caddr_t)ip6, mtod(m, caddr_t),
796 n: sizeof(struct ip6_hdr));
797 ip6 = mtod(m, struct ip6_hdr *);
798 nth = (struct tcphdr *)(void *)(ip6 + 1);
799 } else {
800 VERIFY((MHLEN - max_linkhdr) >=
801 (sizeof(*ip) + sizeof(*nth)));
802 bcopy(src: (caddr_t)ip, mtod(m, caddr_t), n: sizeof(struct ip));
803 ip = mtod(m, struct ip *);
804 nth = (struct tcphdr *)(void *)(ip + 1);
805 }
806 bcopy(src: (caddr_t)th, dst: (caddr_t)nth, n: sizeof(struct tcphdr));
807#if MPTCP
808 if ((tp) && (tp->t_mpflags & TMPF_RESET)) {
809 flags = (TH_RST | TH_ACK);
810 } else
811#endif
812 flags = TH_ACK;
813 } else {
814 m_freem(m->m_next);
815 m->m_next = 0;
816 m->m_data = (uintptr_t)ipgen;
817 /* m_len is set later */
818 tlen = 0;
819#define xchg(a, b, type) { type t; t = a; a = b; b = t; }
820 if (isipv6) {
821 ip6_getsrcifaddr_info(m, &sifscope, NULL);
822 ip6_getdstifaddr_info(m, &fifscope, NULL);
823 if (!in6_embedded_scope) {
824 m->m_pkthdr.pkt_flags &= ~PKTF_IFAINFO;
825 }
826 /* Expect 32-bit aligned IP on strict-align platforms */
827 IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6);
828 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
829 nth = (struct tcphdr *)(void *)(ip6 + 1);
830 } else {
831 /* Expect 32-bit aligned IP on strict-align platforms */
832 IP_HDR_STRICT_ALIGNMENT_CHECK(ip);
833 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
834 nth = (struct tcphdr *)(void *)(ip + 1);
835 }
836 if (th != nth) {
837 /*
838 * this is usually a case when an extension header
839 * exists between the IPv6 header and the
840 * TCP header.
841 */
842 nth->th_sport = th->th_sport;
843 nth->th_dport = th->th_dport;
844 }
845 xchg(nth->th_dport, nth->th_sport, n_short);
846#undef xchg
847 }
848 if (isipv6) {
849 ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) +
850 tlen));
851 tlen += sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
852 ip6_output_setsrcifscope(m, sifscope, NULL);
853 ip6_output_setdstifscope(m, fifscope, NULL);
854 } else {
855 tlen += sizeof(struct tcpiphdr);
856 ip->ip_len = tlen;
857 ip->ip_ttl = (uint8_t)ip_defttl;
858 }
859 m->m_len = tlen;
860 m->m_pkthdr.len = tlen;
861 m->m_pkthdr.rcvif = 0;
862 if (tra->keep_alive) {
863 m->m_pkthdr.pkt_flags |= PKTF_KEEPALIVE;
864 }
865
866 nth->th_seq = htonl(seq);
867 nth->th_ack = htonl(ack);
868 nth->th_x2 = 0;
869 nth->th_off = sizeof(struct tcphdr) >> 2;
870 nth->th_flags = flags;
871 if (tp) {
872 nth->th_win = htons((u_short) (win >> tp->rcv_scale));
873 } else {
874 nth->th_win = htons((u_short)win);
875 }
876 nth->th_urp = 0;
877 if (isipv6) {
878 nth->th_sum = 0;
879 nth->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst,
880 htonl((tlen - sizeof(struct ip6_hdr)) + IPPROTO_TCP));
881 m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
882 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
883 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
884 ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
885 } else {
886 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
887 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
888 m->m_pkthdr.csum_flags = CSUM_TCP;
889 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
890 }
891#if TCPDEBUG
892 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
893 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
894 }
895#endif
896
897#if NECP
898 necp_mark_packet_from_socket(packet: m, inp: tp ? tp->t_inpcb : NULL, policy_id: 0, route_rule_id: 0, skip_policy_id: 0, pass_flags: 0);
899#endif /* NECP */
900
901#if IPSEC
902 if (tp != NULL && tp->t_inpcb->inp_sp != NULL &&
903 ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
904 m_freem(m);
905 return;
906 }
907#endif
908
909 if (tp != NULL) {
910 u_int32_t svc_flags = 0;
911 if (isipv6) {
912 svc_flags |= PKT_SCF_IPV6;
913 }
914 sotc = tp->t_inpcb->inp_socket->so_traffic_class;
915 if ((flags & TH_RST) == 0) {
916 set_packet_service_class(m, tp->t_inpcb->inp_socket,
917 sotc, svc_flags);
918 } else {
919 m_set_service_class(m, MBUF_SC_BK_SYS);
920 }
921
922 /* Embed flowhash and flow control flags */
923 m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
924 m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash;
925 m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV);
926 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
927 m->m_pkthdr.tx_tcp_pid = tp->t_inpcb->inp_socket->last_pid;
928 m->m_pkthdr.tx_tcp_e_pid = tp->t_inpcb->inp_socket->e_pid;
929
930 if (flags & TH_RST) {
931 m->m_pkthdr.comp_gencnt = tp->t_comp_gencnt;
932 }
933 } else {
934 if (flags & TH_RST) {
935 m->m_pkthdr.comp_gencnt = TCP_ACK_COMPRESSION_DUMMY;
936 m_set_service_class(m, MBUF_SC_BK_SYS);
937 }
938 }
939
940 if (isipv6) {
941 struct ip6_out_args ip6oa;
942 bzero(s: &ip6oa, n: sizeof(ip6oa));
943 ip6oa.ip6oa_boundif = tra->ifscope;
944 ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR;
945 ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
946 ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
947
948 if (tra->ifscope != IFSCOPE_NONE) {
949 ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
950 }
951 if (tra->nocell) {
952 ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
953 }
954 if (tra->noexpensive) {
955 ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
956 }
957 if (tra->noconstrained) {
958 ip6oa.ip6oa_flags |= IP6OAF_NO_CONSTRAINED;
959 }
960 if (tra->awdl_unrestricted) {
961 ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
962 }
963 if (tra->intcoproc_allowed) {
964 ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED;
965 }
966 if (tra->management_allowed) {
967 ip6oa.ip6oa_flags |= IP6OAF_MANAGEMENT_ALLOWED;
968 }
969 ip6oa.ip6oa_sotc = sotc;
970 if (tp != NULL) {
971 if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
972 ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED;
973 }
974 ip6oa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
975 if (check_qos_marking_again) {
976 ip6oa.ip6oa_flags |= IP6OAF_REDO_QOSMARKING_POLICY;
977 }
978 ip6oa.ip6oa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
979 }
980 (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
981 NULL, &ip6oa);
982
983 if (check_qos_marking_again) {
984 struct inpcb *inp = tp->t_inpcb;
985 inp->inp_policyresult.results.qos_marking_gencount = ip6oa.qos_marking_gencount;
986 if (ip6oa.ip6oa_flags & IP6OAF_QOSMARKING_ALLOWED) {
987 inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
988 } else {
989 inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
990 }
991 }
992
993 if (tp != NULL && ro6 != NULL && ro6->ro_rt != NULL &&
994 (outif = ro6->ro_rt->rt_ifp) !=
995 tp->t_inpcb->in6p_last_outifp) {
996 tp->t_inpcb->in6p_last_outifp = outif;
997#if SKYWALK
998 if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
999 netns_set_ifnet(token: &tp->t_inpcb->inp_netns_token,
1000 ifp: tp->t_inpcb->in6p_last_outifp);
1001 }
1002#endif /* SKYWALK */
1003 }
1004
1005 if (ro6 == &sro6) {
1006 ROUTE_RELEASE(ro6);
1007 }
1008 } else {
1009 struct ip_out_args ipoa;
1010 bzero(s: &ipoa, n: sizeof(ipoa));
1011 ipoa.ipoa_boundif = tra->ifscope;
1012 ipoa.ipoa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR;
1013 ipoa.ipoa_sotc = SO_TC_UNSPEC;
1014 ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1015
1016 if (tra->ifscope != IFSCOPE_NONE) {
1017 ipoa.ipoa_flags |= IPOAF_BOUND_IF;
1018 }
1019 if (tra->nocell) {
1020 ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
1021 }
1022 if (tra->noexpensive) {
1023 ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
1024 }
1025 if (tra->noconstrained) {
1026 ipoa.ipoa_flags |= IPOAF_NO_CONSTRAINED;
1027 }
1028 if (tra->awdl_unrestricted) {
1029 ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
1030 }
1031 if (tra->management_allowed) {
1032 ipoa.ipoa_flags |= IPOAF_MANAGEMENT_ALLOWED;
1033 }
1034 ipoa.ipoa_sotc = sotc;
1035 if (tp != NULL) {
1036 if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
1037 ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
1038 }
1039 if (!(tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE)) {
1040 ipoa.ipoa_flags |= IPOAF_REDO_QOSMARKING_POLICY;
1041 }
1042 ipoa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
1043 ipoa.ipoa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
1044 }
1045 if (ro != &sro) {
1046 /* Copy the cached route and take an extra reference */
1047 inp_route_copyout(tp->t_inpcb, &sro);
1048 }
1049 /*
1050 * For consistency, pass a local route copy.
1051 */
1052 (void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);
1053
1054 if (check_qos_marking_again) {
1055 struct inpcb *inp = tp->t_inpcb;
1056 inp->inp_policyresult.results.qos_marking_gencount = ipoa.qos_marking_gencount;
1057 if (ipoa.ipoa_flags & IPOAF_QOSMARKING_ALLOWED) {
1058 inp->inp_socket->so_flags1 |= SOF1_QOSMARKING_ALLOWED;
1059 } else {
1060 inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
1061 }
1062 }
1063 if (tp != NULL && sro.ro_rt != NULL &&
1064 (outif = sro.ro_rt->rt_ifp) !=
1065 tp->t_inpcb->inp_last_outifp) {
1066 tp->t_inpcb->inp_last_outifp = outif;
1067#if SKYWALK
1068 if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
1069 netns_set_ifnet(token: &tp->t_inpcb->inp_netns_token, ifp: outif);
1070 }
1071#endif /* SKYWALK */
1072 }
1073 if (ro != &sro) {
1074 /* Synchronize cached PCB route */
1075 inp_route_copyin(tp->t_inpcb, &sro);
1076 } else {
1077 ROUTE_RELEASE(&sro);
1078 }
1079 }
1080}
1081
1082/*
1083 * Create a new TCP control block, making an
1084 * empty reassembly queue and hooking it to the argument
1085 * protocol control block. The `inp' parameter must have
1086 * come from the zone allocator set up in tcp_init().
1087 */
1088struct tcpcb *
1089tcp_newtcpcb(struct inpcb *inp)
1090{
1091 struct inp_tp *it;
1092 struct tcpcb *tp;
1093 struct socket *so = inp->inp_socket;
1094 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1095 uint32_t random_32;
1096
1097 calculate_tcp_clock();
1098
1099 if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
1100 it = (struct inp_tp *)(void *)inp;
1101 tp = &it->tcb;
1102 } else {
1103 tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb;
1104 }
1105
1106 bzero(s: (char *) tp, n: sizeof(struct tcpcb));
1107 LIST_INIT(&tp->t_segq);
1108 tp->t_maxseg = tp->t_maxopd = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
1109
1110 tp->t_flags = TF_REQ_SCALE | (tcp_do_timestamps ? TF_REQ_TSTMP : 0);
1111 tp->t_flagsext |= TF_SACK_ENABLE;
1112
1113 TAILQ_INIT(&tp->snd_holes);
1114 SLIST_INIT(&tp->t_rxt_segments);
1115 SLIST_INIT(&tp->t_notify_ack);
1116 tp->t_inpcb = inp;
1117 /*
1118 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
1119 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
1120 * reasonable initial retransmit time.
1121 */
1122 tp->t_srtt = TCPTV_SRTTBASE;
1123 tp->t_rttvar =
1124 ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1125 tp->t_rttmin = tcp_TCPTV_MIN;
1126 tp->t_rxtcur = TCPTV_RTOBASE;
1127
1128 if (tcp_use_newreno) {
1129 /* use newreno by default */
1130 tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
1131#if (DEVELOPMENT || DEBUG)
1132 } else if (tcp_use_ledbat) {
1133 /* use ledbat for testing */
1134 tp->tcp_cc_index = TCP_CC_ALGO_BACKGROUND_INDEX;
1135#endif
1136 } else {
1137 tp->tcp_cc_index = TCP_CC_ALGO_CUBIC_INDEX;
1138 }
1139
1140 tcp_cc_allocate_state(tp);
1141
1142 if (CC_ALGO(tp)->init != NULL) {
1143 CC_ALGO(tp)->init(tp);
1144 }
1145
1146 /* Initialize rledbat if we are using recv_bg */
1147 if (tcp_rledbat == 1 && TCP_RECV_BG(inp->inp_socket) &&
1148 tcp_cc_rledbat.init != NULL) {
1149 tcp_cc_rledbat.init(tp);
1150 }
1151
1152 tp->snd_cwnd = tcp_initial_cwnd(tp);
1153 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1154 tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1155 tp->t_rcvtime = tcp_now;
1156 tp->tentry.timer_start = tcp_now;
1157 tp->rcv_unackwin = tcp_now;
1158 tp->t_persist_timeout = tcp_max_persist_timeout;
1159 tp->t_persist_stop = 0;
1160 tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1161 tp->t_rexmtthresh = (uint8_t)tcprexmtthresh;
1162 tp->rfbuf_ts = tcp_now;
1163 tp->rfbuf_space = tcp_initial_cwnd(tp);
1164 tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
1165
1166 /* Enable bandwidth measurement on this connection */
1167 tp->t_flagsext |= TF_MEASURESNDBW;
1168 if (tp->t_bwmeas == NULL) {
1169 tp->t_bwmeas = tcp_bwmeas_alloc(tp);
1170 if (tp->t_bwmeas == NULL) {
1171 tp->t_flagsext &= ~TF_MEASURESNDBW;
1172 }
1173 }
1174
1175 /* Clear time wait tailq entry */
1176 tp->t_twentry.tqe_next = NULL;
1177 tp->t_twentry.tqe_prev = NULL;
1178
1179 read_frandom(buffer: &random_32, numBytes: sizeof(random_32));
1180 tp->t_comp_gencnt = random_32;
1181 if (tp->t_comp_gencnt <= TCP_ACK_COMPRESSION_DUMMY) {
1182 tp->t_comp_gencnt = TCP_ACK_COMPRESSION_DUMMY + 1;
1183 }
1184 tp->t_comp_lastinc = tcp_now;
1185
1186 if (__probable(tcp_randomize_timestamps)) {
1187 tp->t_ts_offset = random_32;
1188 }
1189
1190 /* Initialize Accurate ECN state */
1191 tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_feature_disabled;
1192 tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_feature_disabled;
1193
1194 /*
1195 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
1196 * because the socket may be bound to an IPv6 wildcard address,
1197 * which may match an IPv4-mapped IPv6 address.
1198 */
1199 inp->inp_ip_ttl = (uint8_t)ip_defttl;
1200 inp->inp_ppcb = (caddr_t)tp;
1201 return tp; /* XXX */
1202}
1203
1204/*
1205 * Drop a TCP connection, reporting
1206 * the specified error. If connection is synchronized,
1207 * then send a RST to peer.
1208 */
1209struct tcpcb *
1210tcp_drop(struct tcpcb *tp, int errno)
1211{
1212 struct socket *so = tp->t_inpcb->inp_socket;
1213#if CONFIG_DTRACE
1214 struct inpcb *inp = tp->t_inpcb;
1215#endif
1216
1217 if (TCPS_HAVERCVDSYN(tp->t_state)) {
1218 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1219 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1220 TCP_LOG_STATE(tp, TCPS_CLOSED);
1221 tp->t_state = TCPS_CLOSED;
1222 (void) tcp_output(tp);
1223 tcpstat.tcps_drops++;
1224 } else {
1225 tcpstat.tcps_conndrops++;
1226 }
1227 if (errno == ETIMEDOUT && tp->t_softerror) {
1228 errno = tp->t_softerror;
1229 }
1230 so->so_error = (u_short)errno;
1231
1232 TCP_LOG_CONNECTION_SUMMARY(tp);
1233
1234 return tcp_close(tp);
1235}
1236
1237void
1238tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
1239{
1240 u_int32_t rtt = rt->rt_rmx.rmx_rtt;
1241 int isnetlocal = (tp->t_flags & TF_LOCAL);
1242
1243 TCP_LOG_RTM_RTT(tp, rt);
1244
1245 if (rtt != 0 && tcp_init_rtt_from_cache != 0) {
1246 /*
1247 * XXX the lock bit for RTT indicates that the value
1248 * is also a minimum value; this is subject to time.
1249 */
1250 if (rt->rt_rmx.rmx_locks & RTV_RTT) {
1251 tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
1252 } else {
1253 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN :
1254 TCPTV_REXMTMIN;
1255 }
1256
1257 tp->t_srtt =
1258 rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1259 tcpstat.tcps_usedrtt++;
1260
1261 if (rt->rt_rmx.rmx_rttvar) {
1262 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
1263 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1264 tcpstat.tcps_usedrttvar++;
1265 } else {
1266 /* default variation is +- 1 rtt */
1267 tp->t_rttvar =
1268 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
1269 }
1270
1271 /*
1272 * The RTO formula in the route metric case is based on:
1273 * srtt + 4 * rttvar
1274 * modulo the min, max and slop
1275 */
1276 TCPT_RANGESET(tp->t_rxtcur,
1277 TCP_REXMTVAL(tp),
1278 tp->t_rttmin, TCPTV_REXMTMAX,
1279 TCP_ADD_REXMTSLOP(tp));
1280 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_srtt == 0 &&
1281 tp->t_rxtshift == 0) {
1282 struct ifnet *ifp = rt->rt_ifp;
1283
1284 if (ifp != NULL && (ifp->if_eflags & IFEF_AWDL) != 0) {
1285 /*
1286 * AWDL needs a special value for the default initial retransmission timeout
1287 */
1288 if (tcp_awdl_rtobase > tcp_TCPTV_MIN) {
1289 tp->t_rttvar = ((tcp_awdl_rtobase - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1290 } else {
1291 tp->t_rttvar = ((tcp_TCPTV_MIN - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1292 }
1293 TCPT_RANGESET(tp->t_rxtcur,
1294 TCP_REXMTVAL(tp),
1295 tp->t_rttmin, TCPTV_REXMTMAX,
1296 TCP_ADD_REXMTSLOP(tp));
1297 }
1298 }
1299
1300 TCP_LOG_RTT_INFO(tp);
1301}
1302
1303static inline void
1304tcp_create_ifnet_stats_per_flow(struct tcpcb *tp,
1305 struct ifnet_stats_per_flow *ifs)
1306{
1307 struct inpcb *inp;
1308 struct socket *so;
1309 if (tp == NULL || ifs == NULL) {
1310 return;
1311 }
1312
1313 bzero(s: ifs, n: sizeof(*ifs));
1314 inp = tp->t_inpcb;
1315 so = inp->inp_socket;
1316
1317 ifs->ipv4 = (inp->inp_vflag & INP_IPV6) ? 0 : 1;
1318 ifs->local = (tp->t_flags & TF_LOCAL) ? 1 : 0;
1319 ifs->connreset = (so->so_error == ECONNRESET) ? 1 : 0;
1320 ifs->conntimeout = (so->so_error == ETIMEDOUT) ? 1 : 0;
1321 ifs->ecn_flags = tp->ecn_flags;
1322 ifs->txretransmitbytes = tp->t_stat.txretransmitbytes;
1323 ifs->rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
1324 ifs->rxmitpkts = tp->t_stat.rxmitpkts;
1325 ifs->rcvoopack = tp->t_rcvoopack;
1326 ifs->pawsdrop = tp->t_pawsdrop;
1327 ifs->sack_recovery_episodes = tp->t_sack_recovery_episode;
1328 ifs->reordered_pkts = tp->t_reordered_pkts;
1329 ifs->dsack_sent = tp->t_dsack_sent;
1330 ifs->dsack_recvd = tp->t_dsack_recvd;
1331 ifs->srtt = tp->t_srtt;
1332 ifs->rttupdated = tp->t_rttupdated;
1333 ifs->rttvar = tp->t_rttvar;
1334 ifs->rttmin = get_base_rtt(tp);
1335 if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_sndbw_max > 0) {
1336 ifs->bw_sndbw_max = tp->t_bwmeas->bw_sndbw_max;
1337 } else {
1338 ifs->bw_sndbw_max = 0;
1339 }
1340 if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_rcvbw_max > 0) {
1341 ifs->bw_rcvbw_max = tp->t_bwmeas->bw_rcvbw_max;
1342 } else {
1343 ifs->bw_rcvbw_max = 0;
1344 }
1345 ifs->bk_txpackets = so->so_tc_stats[MBUF_TC_BK].txpackets;
1346 ifs->txpackets = inp->inp_stat->txpackets;
1347 ifs->rxpackets = inp->inp_stat->rxpackets;
1348}
1349
1350static inline void
1351tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
1352 struct if_tcp_ecn_perf_stat *stat)
1353{
1354 u_int64_t curval, oldval;
1355 stat->total_txpkts += ifs->txpackets;
1356 stat->total_rxpkts += ifs->rxpackets;
1357 stat->total_rxmitpkts += ifs->rxmitpkts;
1358 stat->total_oopkts += ifs->rcvoopack;
1359 stat->total_reorderpkts += (ifs->reordered_pkts +
1360 ifs->pawsdrop + ifs->dsack_sent + ifs->dsack_recvd);
1361
1362 /* Average RTT */
1363 curval = ifs->srtt >> TCP_RTT_SHIFT;
1364 if (curval > 0 && ifs->rttupdated >= 16) {
1365 if (stat->rtt_avg == 0) {
1366 stat->rtt_avg = curval;
1367 } else {
1368 oldval = stat->rtt_avg;
1369 stat->rtt_avg = ((oldval << 4) - oldval + curval) >> 4;
1370 }
1371 }
1372
1373 /* RTT variance */
1374 curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1375 if (curval > 0 && ifs->rttupdated >= 16) {
1376 if (stat->rtt_var == 0) {
1377 stat->rtt_var = curval;
1378 } else {
1379 oldval = stat->rtt_var;
1380 stat->rtt_var =
1381 ((oldval << 4) - oldval + curval) >> 4;
1382 }
1383 }
1384
1385 /* SACK episodes */
1386 stat->sack_episodes += ifs->sack_recovery_episodes;
1387 if (ifs->connreset) {
1388 stat->rst_drop++;
1389 }
1390}
1391
1392static inline void
1393tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
1394 struct if_lim_perf_stat *stat)
1395{
1396 u_int64_t curval, oldval;
1397
1398 stat->lim_total_txpkts += ifs->txpackets;
1399 stat->lim_total_rxpkts += ifs->rxpackets;
1400 stat->lim_total_retxpkts += ifs->rxmitpkts;
1401 stat->lim_total_oopkts += ifs->rcvoopack;
1402
1403 if (ifs->bw_sndbw_max > 0) {
1404 /* convert from bytes per ms to bits per second */
1405 ifs->bw_sndbw_max *= 8000;
1406 stat->lim_ul_max_bandwidth = MAX(stat->lim_ul_max_bandwidth,
1407 ifs->bw_sndbw_max);
1408 }
1409
1410 if (ifs->bw_rcvbw_max > 0) {
1411 /* convert from bytes per ms to bits per second */
1412 ifs->bw_rcvbw_max *= 8000;
1413 stat->lim_dl_max_bandwidth = MAX(stat->lim_dl_max_bandwidth,
1414 ifs->bw_rcvbw_max);
1415 }
1416
1417 /* Average RTT */
1418 curval = ifs->srtt >> TCP_RTT_SHIFT;
1419 if (curval > 0 && ifs->rttupdated >= 16) {
1420 if (stat->lim_rtt_average == 0) {
1421 stat->lim_rtt_average = curval;
1422 } else {
1423 oldval = stat->lim_rtt_average;
1424 stat->lim_rtt_average =
1425 ((oldval << 4) - oldval + curval) >> 4;
1426 }
1427 }
1428
1429 /* RTT variance */
1430 curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1431 if (curval > 0 && ifs->rttupdated >= 16) {
1432 if (stat->lim_rtt_variance == 0) {
1433 stat->lim_rtt_variance = curval;
1434 } else {
1435 oldval = stat->lim_rtt_variance;
1436 stat->lim_rtt_variance =
1437 ((oldval << 4) - oldval + curval) >> 4;
1438 }
1439 }
1440
1441 if (stat->lim_rtt_min == 0) {
1442 stat->lim_rtt_min = ifs->rttmin;
1443 } else {
1444 stat->lim_rtt_min = MIN(stat->lim_rtt_min, ifs->rttmin);
1445 }
1446
1447 /* connection timeouts */
1448 stat->lim_conn_attempts++;
1449 if (ifs->conntimeout) {
1450 stat->lim_conn_timeouts++;
1451 }
1452
1453 /* bytes sent using background delay-based algorithms */
1454 stat->lim_bk_txpkts += ifs->bk_txpackets;
1455}
1456
1457/*
1458 * Close a TCP control block:
1459 * discard all space held by the tcp
1460 * discard internet protocol block
1461 * wake up any sleepers
1462 */
1463struct tcpcb *
1464tcp_close(struct tcpcb *tp)
1465{
1466 struct inpcb *inp = tp->t_inpcb;
1467 struct socket *so = inp->inp_socket;
1468 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1469 struct route *ro;
1470 struct rtentry *rt;
1471 int dosavessthresh;
1472 struct ifnet_stats_per_flow ifs;
1473
1474 /* tcp_close was called previously, bail */
1475 if (inp->inp_ppcb == NULL) {
1476 return NULL;
1477 }
1478
1479 tcp_del_fsw_flow(tp);
1480
1481 tcp_canceltimers(tp);
1482 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp, 0, 0, 0, 0);
1483
1484 /*
1485 * If another thread for this tcp is currently in ip (indicated by
1486 * the TF_SENDINPROG flag), defer the cleanup until after it returns
1487 * back to tcp. This is done to serialize the close until after all
1488 * pending output is finished, in order to avoid having the PCB be
1489 * detached and the cached route cleaned, only for ip to cache the
1490 * route back into the PCB again. Note that we've cleared all the
1491 * timers at this point. Set TF_CLOSING to indicate to tcp_output()
1492 * that is should call us again once it returns from ip; at that
1493 * point both flags should be cleared and we can proceed further
1494 * with the cleanup.
1495 */
1496 if ((tp->t_flags & TF_CLOSING) ||
1497 inp->inp_sndinprog_cnt > 0) {
1498 tp->t_flags |= TF_CLOSING;
1499 return NULL;
1500 }
1501
1502 TCP_LOG_CONNECTION_SUMMARY(tp);
1503
1504 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1505 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1506
1507 ro = (isipv6 ? (struct route *)&inp->in6p_route : &inp->inp_route);
1508 rt = ro->ro_rt;
1509 if (rt != NULL) {
1510 RT_LOCK_SPIN(rt);
1511 }
1512
1513 /*
1514 * If we got enough samples through the srtt filter,
1515 * save the rtt and rttvar in the routing entry.
1516 * 'Enough' is arbitrarily defined as the 16 samples.
1517 * 16 samples is enough for the srtt filter to converge
1518 * to within 5% of the correct value; fewer samples and
1519 * we could save a very bogus rtt.
1520 *
1521 * Don't update the default route's characteristics and don't
1522 * update anything that the user "locked".
1523 */
1524 if (tp->t_rttupdated >= 16) {
1525 u_int32_t i = 0;
1526 bool log_rtt = false;
1527
1528 if (isipv6) {
1529 struct sockaddr_in6 *sin6;
1530
1531 if (rt == NULL) {
1532 goto no_valid_rt;
1533 }
1534 sin6 = SIN6(rt_key(rt));
1535 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
1536 goto no_valid_rt;
1537 }
1538 } else if (ROUTE_UNUSABLE(ro) ||
1539 SIN(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) {
1540 DTRACE_TCP4(state__change, void, NULL,
1541 struct inpcb *, inp, struct tcpcb *, tp,
1542 int32_t, TCPS_CLOSED);
1543 TCP_LOG_STATE(tp, TCPS_CLOSED);
1544 tp->t_state = TCPS_CLOSED;
1545 goto no_valid_rt;
1546 }
1547
1548 RT_LOCK_ASSERT_HELD(rt);
1549 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
1550 i = tp->t_srtt *
1551 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1552 if (rt->rt_rmx.rmx_rtt && i) {
1553 /*
1554 * filter this update to half the old & half
1555 * the new values, converting scale.
1556 * See route.h and tcp_var.h for a
1557 * description of the scaling constants.
1558 */
1559 rt->rt_rmx.rmx_rtt =
1560 (rt->rt_rmx.rmx_rtt + i) / 2;
1561 } else {
1562 rt->rt_rmx.rmx_rtt = i;
1563 }
1564 tcpstat.tcps_cachedrtt++;
1565 log_rtt = true;
1566 }
1567 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
1568 i = tp->t_rttvar *
1569 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1570 if (rt->rt_rmx.rmx_rttvar && i) {
1571 rt->rt_rmx.rmx_rttvar =
1572 (rt->rt_rmx.rmx_rttvar + i) / 2;
1573 } else {
1574 rt->rt_rmx.rmx_rttvar = i;
1575 }
1576 tcpstat.tcps_cachedrttvar++;
1577 log_rtt = true;
1578 }
1579 if (log_rtt) {
1580 TCP_LOG_RTM_RTT(tp, rt);
1581 TCP_LOG_RTT_INFO(tp);
1582 }
1583 /*
1584 * The old comment here said:
1585 * update the pipelimit (ssthresh) if it has been updated
1586 * already or if a pipesize was specified & the threshhold
1587 * got below half the pipesize. I.e., wait for bad news
1588 * before we start updating, then update on both good
1589 * and bad news.
1590 *
1591 * But we want to save the ssthresh even if no pipesize is
1592 * specified explicitly in the route, because such
1593 * connections still have an implicit pipesize specified
1594 * by the global tcp_sendspace. In the absence of a reliable
1595 * way to calculate the pipesize, it will have to do.
1596 */
1597 i = tp->snd_ssthresh;
1598 if (rt->rt_rmx.rmx_sendpipe != 0) {
1599 dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
1600 } else {
1601 dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
1602 }
1603 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
1604 i != 0 && rt->rt_rmx.rmx_ssthresh != 0) ||
1605 dosavessthresh) {
1606 /*
1607 * convert the limit from user data bytes to
1608 * packets then to packet data bytes.
1609 */
1610 i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
1611 if (i < 2) {
1612 i = 2;
1613 }
1614 i *= (u_int32_t)(tp->t_maxseg +
1615 isipv6 ? sizeof(struct ip6_hdr) +
1616 sizeof(struct tcphdr) :
1617 sizeof(struct tcpiphdr));
1618 if (rt->rt_rmx.rmx_ssthresh) {
1619 rt->rt_rmx.rmx_ssthresh =
1620 (rt->rt_rmx.rmx_ssthresh + i) / 2;
1621 } else {
1622 rt->rt_rmx.rmx_ssthresh = i;
1623 }
1624 tcpstat.tcps_cachedssthresh++;
1625 }
1626 }
1627
1628 /*
1629 * Mark route for deletion if no information is cached.
1630 */
1631 if (rt != NULL && (so->so_flags & SOF_OVERFLOW)) {
1632 if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
1633 rt->rt_rmx.rmx_rtt == 0) {
1634 rt->rt_flags |= RTF_DELCLONE;
1635 }
1636 }
1637
1638no_valid_rt:
1639 if (rt != NULL) {
1640 RT_UNLOCK(rt);
1641 }
1642
1643 /* free the reassembly queue, if any */
1644 (void) tcp_freeq(tp);
1645
1646 /* performance stats per interface */
1647 tcp_create_ifnet_stats_per_flow(tp, ifs: &ifs);
1648 tcp_update_stats_per_flow(&ifs, inp->inp_last_outifp);
1649
1650 tcp_free_sackholes(tp);
1651 tcp_notify_ack_free(tp);
1652
1653 inp_decr_sndbytes_allunsent(so, tp->snd_una);
1654
1655 if (tp->t_bwmeas != NULL) {
1656 tcp_bwmeas_free(tp);
1657 }
1658 tcp_rxtseg_clean(tp);
1659 /* Free the packet list */
1660 if (tp->t_pktlist_head != NULL) {
1661 m_freem_list(tp->t_pktlist_head);
1662 }
1663 TCP_PKTLIST_CLEAR(tp);
1664
1665 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
1666 inp->inp_saved_ppcb = (caddr_t) tp;
1667 }
1668
1669 TCP_LOG_STATE(tp, TCPS_CLOSED);
1670 tp->t_state = TCPS_CLOSED;
1671
1672 /*
1673 * Issue a wakeup before detach so that we don't miss
1674 * a wakeup
1675 */
1676 sodisconnectwakeup(so);
1677
1678 /*
1679 * Make sure to clear the TCP Keep Alive Offload as it is
1680 * ref counted on the interface
1681 */
1682 tcp_clear_keep_alive_offload(so);
1683
1684 /*
1685 * If this is a socket that does not want to wakeup the device
1686 * for it's traffic, the application might need to know that the
1687 * socket is closed, send a notification.
1688 */
1689 if ((so->so_options & SO_NOWAKEFROMSLEEP) &&
1690 inp->inp_state != INPCB_STATE_DEAD &&
1691 !(inp->inp_flags2 & INP2_TIMEWAIT)) {
1692 socket_post_kev_msg_closed(so);
1693 }
1694
1695 if (CC_ALGO(tp)->cleanup != NULL) {
1696 CC_ALGO(tp)->cleanup(tp);
1697 }
1698
1699 tp->tcp_cc_index = TCP_CC_ALGO_NONE;
1700
1701 if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.cleanup != NULL) {
1702 tcp_cc_rledbat.cleanup(tp);
1703 }
1704
1705 /* Can happen if we close the socket before receiving the third ACK */
1706 if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
1707 OSDecrementAtomic(&tcp_tfo_halfcnt);
1708
1709 /* Panic if something has gone terribly wrong. */
1710 VERIFY(tcp_tfo_halfcnt >= 0);
1711
1712 tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
1713 }
1714
1715 if (SOCK_CHECK_DOM(so, PF_INET6)) {
1716 in6_pcbdetach(inp);
1717 } else {
1718 in_pcbdetach(inp);
1719 }
1720
1721 /*
1722 * Call soisdisconnected after detach because it might unlock the socket
1723 */
1724 soisdisconnected(so);
1725 tcpstat.tcps_closed++;
1726 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END,
1727 tcpstat.tcps_closed, 0, 0, 0, 0);
1728 return NULL;
1729}
1730
1731int
1732tcp_freeq(struct tcpcb *tp)
1733{
1734 struct tseg_qent *q;
1735 int rv = 0;
1736 int count = 0;
1737
1738 while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
1739 LIST_REMOVE(q, tqe_q);
1740 tp->t_reassq_mbcnt -= _MSIZE + (q->tqe_m->m_flags & M_EXT) ?
1741 q->tqe_m->m_ext.ext_size : 0;
1742 m_freem(q->tqe_m);
1743 zfree(tcp_reass_zone, q);
1744 rv = 1;
1745 count++;
1746 }
1747 tp->t_reassqlen = 0;
1748 if (count > 0) {
1749 OSAddAtomic(-count, &tcp_reass_total_qlen);
1750 }
1751 return rv;
1752}
1753
1754
1755void
1756tcp_drain(void)
1757{
1758 struct inpcb *inp;
1759 struct tcpcb *tp;
1760
1761 if (!lck_rw_try_lock_exclusive(lck: &tcbinfo.ipi_lock)) {
1762 return;
1763 }
1764
1765 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1766 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
1767 WNT_STOPUSING) {
1768 socket_lock(so: inp->inp_socket, refcount: 1);
1769 if (in_pcb_checkstate(inp, WNT_RELEASE, 1)
1770 == WNT_STOPUSING) {
1771 /* lost a race, try the next one */
1772 socket_unlock(so: inp->inp_socket, refcount: 1);
1773 continue;
1774 }
1775 tp = intotcpcb(inp);
1776
1777 so_drain_extended_bk_idle(inp->inp_socket);
1778
1779 socket_unlock(so: inp->inp_socket, refcount: 1);
1780 }
1781 }
1782 lck_rw_done(lck: &tcbinfo.ipi_lock);
1783}
1784
1785/*
1786 * Notify a tcp user of an asynchronous error;
1787 * store error as soft error, but wake up user
1788 * (for now, won't do anything until can select for soft error).
1789 *
1790 * Do not wake up user since there currently is no mechanism for
1791 * reporting soft errors (yet - a kqueue filter may be added).
1792 */
1793static void
1794tcp_notify(struct inpcb *inp, int error)
1795{
1796 struct tcpcb *tp;
1797
1798 if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD)) {
1799 return; /* pcb is gone already */
1800 }
1801 tp = (struct tcpcb *)inp->inp_ppcb;
1802
1803 VERIFY(tp != NULL);
1804 /*
1805 * Ignore some errors if we are hooked up.
1806 * If connection hasn't completed, has retransmitted several times,
1807 * and receives a second error, give up now. This is better
1808 * than waiting a long time to establish a connection that
1809 * can never complete.
1810 */
1811 if (tp->t_state == TCPS_ESTABLISHED &&
1812 (error == EHOSTUNREACH || error == ENETUNREACH ||
1813 error == EHOSTDOWN)) {
1814 if (inp->inp_route.ro_rt) {
1815 rtfree(inp->inp_route.ro_rt);
1816 inp->inp_route.ro_rt = (struct rtentry *)NULL;
1817 }
1818 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
1819 tp->t_softerror) {
1820 tcp_drop(tp, errno: error);
1821 } else {
1822 tp->t_softerror = error;
1823 }
1824}
1825
1826struct bwmeas *
1827tcp_bwmeas_alloc(struct tcpcb *tp)
1828{
1829 struct bwmeas *elm;
1830 elm = zalloc_flags(tcp_bwmeas_zone, Z_ZERO | Z_WAITOK);
1831 elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE;
1832 elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg;
1833 return elm;
1834}
1835
1836void
1837tcp_bwmeas_free(struct tcpcb *tp)
1838{
1839 zfree(tcp_bwmeas_zone, tp->t_bwmeas);
1840 tp->t_bwmeas = NULL;
1841 tp->t_flagsext &= ~(TF_MEASURESNDBW);
1842}
1843
1844int
1845get_tcp_inp_list(struct inpcb **inp_list, int n, inp_gen_t gencnt)
1846{
1847 struct tcpcb *tp;
1848 struct inpcb *inp;
1849 int i = 0;
1850
1851 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1852 if (inp->inp_gencnt <= gencnt &&
1853 inp->inp_state != INPCB_STATE_DEAD) {
1854 inp_list[i++] = inp;
1855 }
1856 if (i >= n) {
1857 break;
1858 }
1859 }
1860
1861 TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) {
1862 inp = tp->t_inpcb;
1863 if (inp->inp_gencnt <= gencnt &&
1864 inp->inp_state != INPCB_STATE_DEAD) {
1865 inp_list[i++] = inp;
1866 }
1867 if (i >= n) {
1868 break;
1869 }
1870 }
1871 return i;
1872}
1873
1874/*
1875 * tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format.
1876 * The otcpcb data structure is passed to user space and must not change.
1877 */
1878static void
1879tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp)
1880{
1881 otp->t_segq = (uint32_t)VM_KERNEL_ADDRHASH(tp->t_segq.lh_first);
1882 otp->t_dupacks = tp->t_dupacks;
1883 otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
1884 otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
1885 otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
1886 otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
1887 otp->t_inpcb =
1888 (_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRHASH(tp->t_inpcb);
1889 otp->t_state = tp->t_state;
1890 otp->t_flags = tp->t_flags;
1891 otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
1892 otp->snd_una = tp->snd_una;
1893 otp->snd_max = tp->snd_max;
1894 otp->snd_nxt = tp->snd_nxt;
1895 otp->snd_up = tp->snd_up;
1896 otp->snd_wl1 = tp->snd_wl1;
1897 otp->snd_wl2 = tp->snd_wl2;
1898 otp->iss = tp->iss;
1899 otp->irs = tp->irs;
1900 otp->rcv_nxt = tp->rcv_nxt;
1901 otp->rcv_adv = tp->rcv_adv;
1902 otp->rcv_wnd = tp->rcv_wnd;
1903 otp->rcv_up = tp->rcv_up;
1904 otp->snd_wnd = tp->snd_wnd;
1905 otp->snd_cwnd = tp->snd_cwnd;
1906 otp->snd_ssthresh = tp->snd_ssthresh;
1907 otp->t_maxopd = tp->t_maxopd;
1908 otp->t_rcvtime = tp->t_rcvtime;
1909 otp->t_starttime = tp->t_starttime;
1910 otp->t_rtttime = tp->t_rtttime;
1911 otp->t_rtseq = tp->t_rtseq;
1912 otp->t_rxtcur = tp->t_rxtcur;
1913 otp->t_maxseg = tp->t_maxseg;
1914 otp->t_srtt = tp->t_srtt;
1915 otp->t_rttvar = tp->t_rttvar;
1916 otp->t_rxtshift = tp->t_rxtshift;
1917 otp->t_rttmin = tp->t_rttmin;
1918 otp->t_rttupdated = tp->t_rttupdated;
1919 otp->max_sndwnd = tp->max_sndwnd;
1920 otp->t_softerror = tp->t_softerror;
1921 otp->t_oobflags = tp->t_oobflags;
1922 otp->t_iobc = tp->t_iobc;
1923 otp->snd_scale = tp->snd_scale;
1924 otp->rcv_scale = tp->rcv_scale;
1925 otp->request_r_scale = tp->request_r_scale;
1926 otp->requested_s_scale = tp->requested_s_scale;
1927 otp->ts_recent = tp->ts_recent;
1928 otp->ts_recent_age = tp->ts_recent_age;
1929 otp->last_ack_sent = tp->last_ack_sent;
1930 otp->cc_send = 0;
1931 otp->cc_recv = 0;
1932 otp->snd_recover = tp->snd_recover;
1933 otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1934 otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1935 otp->t_badrxtwin = 0;
1936}
1937
1938static int
1939tcp_pcblist SYSCTL_HANDLER_ARGS
1940{
1941#pragma unused(oidp, arg1, arg2)
1942 int error, i = 0, n, sz;
1943 struct inpcb **inp_list;
1944 inp_gen_t gencnt;
1945 struct xinpgen xig;
1946
1947 /*
1948 * The process of preparing the TCB list is too time-consuming and
1949 * resource-intensive to repeat twice on every request.
1950 */
1951 lck_rw_lock_shared(lck: &tcbinfo.ipi_lock);
1952 if (req->oldptr == USER_ADDR_NULL) {
1953 n = tcbinfo.ipi_count;
1954 req->oldidx = 2 * (sizeof(xig))
1955 + (n + n / 8) * sizeof(struct xtcpcb);
1956 lck_rw_done(lck: &tcbinfo.ipi_lock);
1957 return 0;
1958 }
1959
1960 if (req->newptr != USER_ADDR_NULL) {
1961 lck_rw_done(lck: &tcbinfo.ipi_lock);
1962 return EPERM;
1963 }
1964
1965 /*
1966 * OK, now we're committed to doing something.
1967 */
1968 gencnt = tcbinfo.ipi_gencnt;
1969 sz = n = tcbinfo.ipi_count;
1970
1971 bzero(s: &xig, n: sizeof(xig));
1972 xig.xig_len = sizeof(xig);
1973 xig.xig_count = n;
1974 xig.xig_gen = gencnt;
1975 xig.xig_sogen = so_gencnt;
1976 error = SYSCTL_OUT(req, &xig, sizeof(xig));
1977 if (error) {
1978 lck_rw_done(lck: &tcbinfo.ipi_lock);
1979 return error;
1980 }
1981 /*
1982 * We are done if there is no pcb
1983 */
1984 if (n == 0) {
1985 lck_rw_done(lck: &tcbinfo.ipi_lock);
1986 return 0;
1987 }
1988
1989 inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
1990 if (inp_list == NULL) {
1991 lck_rw_done(lck: &tcbinfo.ipi_lock);
1992 return ENOMEM;
1993 }
1994
1995 n = get_tcp_inp_list(inp_list, n, gencnt);
1996
1997 error = 0;
1998 for (i = 0; i < n; i++) {
1999 struct xtcpcb xt;
2000 caddr_t inp_ppcb;
2001 struct inpcb *inp;
2002
2003 inp = inp_list[i];
2004
2005 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
2006 continue;
2007 }
2008 socket_lock(so: inp->inp_socket, refcount: 1);
2009 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2010 socket_unlock(so: inp->inp_socket, refcount: 1);
2011 continue;
2012 }
2013 if (inp->inp_gencnt > gencnt) {
2014 socket_unlock(so: inp->inp_socket, refcount: 1);
2015 continue;
2016 }
2017
2018 bzero(s: &xt, n: sizeof(xt));
2019 xt.xt_len = sizeof(xt);
2020 /* XXX should avoid extra copy */
2021 inpcb_to_compat(inp, &xt.xt_inp);
2022 inp_ppcb = inp->inp_ppcb;
2023 if (inp_ppcb != NULL) {
2024 tcpcb_to_otcpcb(tp: (struct tcpcb *)(void *)inp_ppcb,
2025 otp: &xt.xt_tp);
2026 } else {
2027 bzero(s: (char *) &xt.xt_tp, n: sizeof(xt.xt_tp));
2028 }
2029 if (inp->inp_socket) {
2030 sotoxsocket(so: inp->inp_socket, xso: &xt.xt_socket);
2031 }
2032
2033 socket_unlock(so: inp->inp_socket, refcount: 1);
2034
2035 error = SYSCTL_OUT(req, &xt, sizeof(xt));
2036 }
2037 if (!error) {
2038 /*
2039 * Give the user an updated idea of our state.
2040 * If the generation differs from what we told
2041 * her before, she knows that something happened
2042 * while we were processing this request, and it
2043 * might be necessary to retry.
2044 */
2045 bzero(s: &xig, n: sizeof(xig));
2046 xig.xig_len = sizeof(xig);
2047 xig.xig_gen = tcbinfo.ipi_gencnt;
2048 xig.xig_sogen = so_gencnt;
2049 xig.xig_count = tcbinfo.ipi_count;
2050 error = SYSCTL_OUT(req, &xig, sizeof(xig));
2051 }
2052
2053 lck_rw_done(lck: &tcbinfo.ipi_lock);
2054 kfree_type(struct inpcb *, sz, inp_list);
2055 return error;
2056}
2057
2058SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
2059 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2060 tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
2061
2062#if XNU_TARGET_OS_OSX
2063
2064static void
2065tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp)
2066{
2067 otp->t_segq = (uint32_t)VM_KERNEL_ADDRHASH(tp->t_segq.lh_first);
2068 otp->t_dupacks = tp->t_dupacks;
2069 otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
2070 otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
2071 otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
2072 otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
2073 otp->t_state = tp->t_state;
2074 otp->t_flags = tp->t_flags;
2075 otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
2076 otp->snd_una = tp->snd_una;
2077 otp->snd_max = tp->snd_max;
2078 otp->snd_nxt = tp->snd_nxt;
2079 otp->snd_up = tp->snd_up;
2080 otp->snd_wl1 = tp->snd_wl1;
2081 otp->snd_wl2 = tp->snd_wl2;
2082 otp->iss = tp->iss;
2083 otp->irs = tp->irs;
2084 otp->rcv_nxt = tp->rcv_nxt;
2085 otp->rcv_adv = tp->rcv_adv;
2086 otp->rcv_wnd = tp->rcv_wnd;
2087 otp->rcv_up = tp->rcv_up;
2088 otp->snd_wnd = tp->snd_wnd;
2089 otp->snd_cwnd = tp->snd_cwnd;
2090 otp->snd_ssthresh = tp->snd_ssthresh;
2091 otp->t_maxopd = tp->t_maxopd;
2092 otp->t_rcvtime = tp->t_rcvtime;
2093 otp->t_starttime = tp->t_starttime;
2094 otp->t_rtttime = tp->t_rtttime;
2095 otp->t_rtseq = tp->t_rtseq;
2096 otp->t_rxtcur = tp->t_rxtcur;
2097 otp->t_maxseg = tp->t_maxseg;
2098 otp->t_srtt = tp->t_srtt;
2099 otp->t_rttvar = tp->t_rttvar;
2100 otp->t_rxtshift = tp->t_rxtshift;
2101 otp->t_rttmin = tp->t_rttmin;
2102 otp->t_rttupdated = tp->t_rttupdated;
2103 otp->max_sndwnd = tp->max_sndwnd;
2104 otp->t_softerror = tp->t_softerror;
2105 otp->t_oobflags = tp->t_oobflags;
2106 otp->t_iobc = tp->t_iobc;
2107 otp->snd_scale = tp->snd_scale;
2108 otp->rcv_scale = tp->rcv_scale;
2109 otp->request_r_scale = tp->request_r_scale;
2110 otp->requested_s_scale = tp->requested_s_scale;
2111 otp->ts_recent = tp->ts_recent;
2112 otp->ts_recent_age = tp->ts_recent_age;
2113 otp->last_ack_sent = tp->last_ack_sent;
2114 otp->cc_send = 0;
2115 otp->cc_recv = 0;
2116 otp->snd_recover = tp->snd_recover;
2117 otp->snd_cwnd_prev = tp->snd_cwnd_prev;
2118 otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
2119 otp->t_badrxtwin = 0;
2120}
2121
2122
2123static int
2124tcp_pcblist64 SYSCTL_HANDLER_ARGS
2125{
2126#pragma unused(oidp, arg1, arg2)
2127 int error, i = 0, n, sz;
2128 struct inpcb **inp_list;
2129 inp_gen_t gencnt;
2130 struct xinpgen xig;
2131
2132 /*
2133 * The process of preparing the TCB list is too time-consuming and
2134 * resource-intensive to repeat twice on every request.
2135 */
2136 lck_rw_lock_shared(lck: &tcbinfo.ipi_lock);
2137 if (req->oldptr == USER_ADDR_NULL) {
2138 n = tcbinfo.ipi_count;
2139 req->oldidx = 2 * (sizeof(xig))
2140 + (n + n / 8) * sizeof(struct xtcpcb64);
2141 lck_rw_done(lck: &tcbinfo.ipi_lock);
2142 return 0;
2143 }
2144
2145 if (req->newptr != USER_ADDR_NULL) {
2146 lck_rw_done(lck: &tcbinfo.ipi_lock);
2147 return EPERM;
2148 }
2149
2150 /*
2151 * OK, now we're committed to doing something.
2152 */
2153 gencnt = tcbinfo.ipi_gencnt;
2154 sz = n = tcbinfo.ipi_count;
2155
2156 bzero(s: &xig, n: sizeof(xig));
2157 xig.xig_len = sizeof(xig);
2158 xig.xig_count = n;
2159 xig.xig_gen = gencnt;
2160 xig.xig_sogen = so_gencnt;
2161 error = SYSCTL_OUT(req, &xig, sizeof(xig));
2162 if (error) {
2163 lck_rw_done(lck: &tcbinfo.ipi_lock);
2164 return error;
2165 }
2166 /*
2167 * We are done if there is no pcb
2168 */
2169 if (n == 0) {
2170 lck_rw_done(lck: &tcbinfo.ipi_lock);
2171 return 0;
2172 }
2173
2174 inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
2175 if (inp_list == NULL) {
2176 lck_rw_done(lck: &tcbinfo.ipi_lock);
2177 return ENOMEM;
2178 }
2179
2180 n = get_tcp_inp_list(inp_list, n, gencnt);
2181
2182 error = 0;
2183 for (i = 0; i < n; i++) {
2184 struct xtcpcb64 xt;
2185 struct inpcb *inp;
2186
2187 inp = inp_list[i];
2188
2189 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
2190 continue;
2191 }
2192 socket_lock(so: inp->inp_socket, refcount: 1);
2193 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2194 socket_unlock(so: inp->inp_socket, refcount: 1);
2195 continue;
2196 }
2197 if (inp->inp_gencnt > gencnt) {
2198 socket_unlock(so: inp->inp_socket, refcount: 1);
2199 continue;
2200 }
2201
2202 bzero(s: &xt, n: sizeof(xt));
2203 xt.xt_len = sizeof(xt);
2204 inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
2205 xt.xt_inpcb.inp_ppcb =
2206 (uint64_t)VM_KERNEL_ADDRHASH(inp->inp_ppcb);
2207 if (inp->inp_ppcb != NULL) {
2208 tcpcb_to_xtcpcb64(tp: (struct tcpcb *)inp->inp_ppcb,
2209 otp: &xt);
2210 }
2211 if (inp->inp_socket) {
2212 sotoxsocket64(so: inp->inp_socket,
2213 xso: &xt.xt_inpcb.xi_socket);
2214 }
2215
2216 socket_unlock(so: inp->inp_socket, refcount: 1);
2217
2218 error = SYSCTL_OUT(req, &xt, sizeof(xt));
2219 }
2220 if (!error) {
2221 /*
2222 * Give the user an updated idea of our state.
2223 * If the generation differs from what we told
2224 * her before, she knows that something happened
2225 * while we were processing this request, and it
2226 * might be necessary to retry.
2227 */
2228 bzero(s: &xig, n: sizeof(xig));
2229 xig.xig_len = sizeof(xig);
2230 xig.xig_gen = tcbinfo.ipi_gencnt;
2231 xig.xig_sogen = so_gencnt;
2232 xig.xig_count = tcbinfo.ipi_count;
2233 error = SYSCTL_OUT(req, &xig, sizeof(xig));
2234 }
2235
2236 lck_rw_done(lck: &tcbinfo.ipi_lock);
2237 kfree_type(struct inpcb *, sz, inp_list);
2238 return error;
2239}
2240
2241SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64,
2242 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2243 tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");
2244
2245#endif /* XNU_TARGET_OS_OSX */
2246
2247static int
2248tcp_pcblist_n SYSCTL_HANDLER_ARGS
2249{
2250#pragma unused(oidp, arg1, arg2)
2251 int error = 0;
2252
2253 error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);
2254
2255 return error;
2256}
2257
2258
2259SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n,
2260 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2261 tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
2262
2263static int
2264tcp_progress_indicators SYSCTL_HANDLER_ARGS
2265{
2266#pragma unused(oidp, arg1, arg2)
2267
2268 return ntstat_tcp_progress_indicators(req);
2269}
2270
2271SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress,
2272 CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0,
2273 tcp_progress_indicators, "S", "Various items that indicate the current state of progress on the link");
2274
2275
2276static int
2277tcp_progress_probe_enable SYSCTL_HANDLER_ARGS
2278{
2279#pragma unused(oidp, arg1, arg2)
2280
2281 return ntstat_tcp_progress_enable(req);
2282}
2283
2284SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress_enable,
2285 CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0,
2286 tcp_progress_probe_enable, "S", "Enable/disable TCP keepalive probing on the specified link(s)");
2287
2288
2289__private_extern__ void
2290tcp_get_ports_used(ifnet_t ifp, int protocol, uint32_t flags,
2291 bitstr_t *bitfield)
2292{
2293 inpcb_get_ports_used(ifp, protocol, flags, bitfield,
2294 &tcbinfo);
2295}
2296
2297__private_extern__ uint32_t
2298tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
2299{
2300 return inpcb_count_opportunistic(ifindex, &tcbinfo, flags);
2301}
2302
2303__private_extern__ uint32_t
2304tcp_find_anypcb_byaddr(struct ifaddr *ifa)
2305{
2306#if SKYWALK
2307 if (netns_is_enabled()) {
2308 return netns_find_anyres_byaddr(ifa, IPPROTO_TCP);
2309 } else
2310#endif /* SKYWALK */
2311 return inpcb_find_anypcb_byaddr(ifa, &tcbinfo);
2312}
2313
2314static void
2315tcp_handle_msgsize(struct ip *ip, struct inpcb *inp)
2316{
2317 struct rtentry *rt = NULL;
2318 u_short ifscope = IFSCOPE_NONE;
2319 int mtu;
2320 struct sockaddr_in icmpsrc = {
2321 .sin_len = sizeof(struct sockaddr_in),
2322 .sin_family = AF_INET, .sin_port = 0, .sin_addr = { .s_addr = 0 },
2323 .sin_zero = { 0, 0, 0, 0, 0, 0, 0, 0 }
2324 };
2325 struct icmp *icp = NULL;
2326
2327 icp = (struct icmp *)(void *)
2328 ((caddr_t)ip - offsetof(struct icmp, icmp_ip));
2329
2330 icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
2331
2332 /*
2333 * MTU discovery:
2334 * If we got a needfrag and there is a host route to the
2335 * original destination, and the MTU is not locked, then
2336 * set the MTU in the route to the suggested new value
2337 * (if given) and then notify as usual. The ULPs will
2338 * notice that the MTU has changed and adapt accordingly.
2339 * If no new MTU was suggested, then we guess a new one
2340 * less than the current value. If the new MTU is
2341 * unreasonably small (defined by sysctl tcp_minmss), then
2342 * we reset the MTU to the interface value and enable the
2343 * lock bit, indicating that we are no longer doing MTU
2344 * discovery.
2345 */
2346 if (ROUTE_UNUSABLE(&(inp->inp_route)) == false) {
2347 rt = inp->inp_route.ro_rt;
2348 }
2349
2350 /*
2351 * icmp6_mtudisc_update scopes the routing lookup
2352 * to the incoming interface (delivered from mbuf
2353 * packet header.
2354 * That is mostly ok but for asymmetric networks
2355 * that may be an issue.
2356 * Frag needed OR Packet too big really communicates
2357 * MTU for the out data path.
2358 * Take the interface scope from cached route or
2359 * the last outgoing interface from inp
2360 */
2361 if (rt != NULL) {
2362 ifscope = (rt->rt_ifp != NULL) ?
2363 rt->rt_ifp->if_index : IFSCOPE_NONE;
2364 } else {
2365 ifscope = (inp->inp_last_outifp != NULL) ?
2366 inp->inp_last_outifp->if_index : IFSCOPE_NONE;
2367 }
2368
2369 if ((rt == NULL) ||
2370 !(rt->rt_flags & RTF_HOST) ||
2371 (rt->rt_flags & (RTF_CLONING | RTF_PRCLONING))) {
2372 rt = rtalloc1_scoped(SA(&icmpsrc), 0, RTF_CLONING | RTF_PRCLONING, ifscope);
2373 } else if (rt) {
2374 RT_LOCK(rt);
2375 rtref(rt);
2376 RT_UNLOCK(rt);
2377 }
2378
2379 if (rt != NULL) {
2380 RT_LOCK(rt);
2381 if ((rt->rt_flags & RTF_HOST) &&
2382 !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
2383 mtu = ntohs(icp->icmp_nextmtu);
2384 /*
2385 * XXX Stock BSD has changed the following
2386 * to compare with icp->icmp_ip.ip_len
2387 * to converge faster when sent packet
2388 * < route's MTU. We may want to adopt
2389 * that change.
2390 */
2391 if (mtu == 0) {
2392 mtu = ip_next_mtu(rt->rt_rmx.
2393 rmx_mtu, 1);
2394 }
2395#if DEBUG_MTUDISC
2396 printf("MTU for %s reduced to %d\n",
2397 inet_ntop(AF_INET,
2398 &icmpsrc.sin_addr, ipv4str,
2399 sizeof(ipv4str)), mtu);
2400#endif
2401 if (mtu < max(a: 296, b: (tcp_minmss +
2402 sizeof(struct tcpiphdr)))) {
2403 rt->rt_rmx.rmx_locks |= RTV_MTU;
2404 } else if (rt->rt_rmx.rmx_mtu > mtu) {
2405 rt->rt_rmx.rmx_mtu = mtu;
2406 }
2407 }
2408 RT_UNLOCK(rt);
2409 rtfree(rt);
2410 }
2411}
2412
2413void
2414tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip, __unused struct ifnet *ifp)
2415{
2416 tcp_seq icmp_tcp_seq;
2417 struct ipctlparam *ctl_param = vip;
2418 struct ip *ip = NULL;
2419 struct mbuf *m = NULL;
2420 struct in_addr faddr;
2421 struct inpcb *inp;
2422 struct tcpcb *tp;
2423 struct tcphdr *th;
2424 struct icmp *icp;
2425 size_t off;
2426#if SKYWALK
2427 union sockaddr_in_4_6 sock_laddr;
2428 struct protoctl_ev_val prctl_ev_val;
2429#endif /* SKYWALK */
2430 void (*notify)(struct inpcb *, int) = tcp_notify;
2431
2432 if (ctl_param != NULL) {
2433 ip = ctl_param->ipc_icmp_ip;
2434 icp = ctl_param->ipc_icmp;
2435 m = ctl_param->ipc_m;
2436 off = ctl_param->ipc_off;
2437 } else {
2438 ip = NULL;
2439 icp = NULL;
2440 m = NULL;
2441 off = 0;
2442 }
2443
2444 faddr = SIN(sa)->sin_addr;
2445 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) {
2446 return;
2447 }
2448
2449 if ((unsigned)cmd >= PRC_NCMDS) {
2450 return;
2451 }
2452
2453 /* Source quench is deprecated */
2454 if (cmd == PRC_QUENCH) {
2455 return;
2456 }
2457
2458 if (cmd == PRC_MSGSIZE) {
2459 notify = tcp_mtudisc;
2460 } else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2461 cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
2462 cmd == PRC_TIMXCEED_INTRANS) && ip) {
2463 notify = tcp_drop_syn_sent;
2464 }
2465 /*
2466 * Hostdead is ugly because it goes linearly through all PCBs.
2467 * XXX: We never get this from ICMP, otherwise it makes an
2468 * excellent DoS attack on machines with many connections.
2469 */
2470 else if (cmd == PRC_HOSTDEAD) {
2471 ip = NULL;
2472 } else if (inetctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd)) {
2473 return;
2474 }
2475
2476#if SKYWALK
2477 bzero(s: &prctl_ev_val, n: sizeof(prctl_ev_val));
2478 bzero(s: &sock_laddr, n: sizeof(sock_laddr));
2479#endif /* SKYWALK */
2480
2481 if (ip == NULL) {
2482 in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
2483#if SKYWALK
2484 protoctl_event_enqueue_nwk_wq_entry(ifp, NULL,
2485 p_raddr: sa, lport: 0, rport: 0, IPPROTO_TCP, protoctl_event_code: cmd, NULL);
2486#endif /* SKYWALK */
2487 return;
2488 }
2489
2490 /* Check if we can safely get the sport, dport and the sequence number from the tcp header. */
2491 if (m == NULL ||
2492 (m->m_len < off + (sizeof(unsigned short) + sizeof(unsigned short) + sizeof(tcp_seq)))) {
2493 /* Insufficient length */
2494 return;
2495 }
2496
2497 th = (struct tcphdr*)(void*)(mtod(m, uint8_t*) + off);
2498 icmp_tcp_seq = ntohl(th->th_seq);
2499
2500 inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
2501 ip->ip_src, th->th_sport, 0, NULL);
2502
2503 if (inp == NULL ||
2504 inp->inp_socket == NULL) {
2505#if SKYWALK
2506 if (cmd == PRC_MSGSIZE) {
2507 prctl_ev_val.val = ntohs(icp->icmp_nextmtu);
2508 }
2509 prctl_ev_val.tcp_seq_number = icmp_tcp_seq;
2510
2511 sock_laddr.sin.sin_family = AF_INET;
2512 sock_laddr.sin.sin_len = sizeof(sock_laddr.sin);
2513 sock_laddr.sin.sin_addr = ip->ip_src;
2514
2515 protoctl_event_enqueue_nwk_wq_entry(ifp,
2516 SA(&sock_laddr), p_raddr: sa,
2517 lport: th->th_sport, rport: th->th_dport, IPPROTO_TCP,
2518 protoctl_event_code: cmd, p_protoctl_ev_val: &prctl_ev_val);
2519#endif /* SKYWALK */
2520 return;
2521 }
2522
2523 socket_lock(so: inp->inp_socket, refcount: 1);
2524 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2525 WNT_STOPUSING) {
2526 socket_unlock(so: inp->inp_socket, refcount: 1);
2527 return;
2528 }
2529
2530 if (PRC_IS_REDIRECT(cmd)) {
2531 /* signal EHOSTDOWN, as it flushes the cached route */
2532 (*notify)(inp, EHOSTDOWN);
2533 } else {
2534 tp = intotcpcb(inp);
2535 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2536 SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2537 if (cmd == PRC_MSGSIZE) {
2538 tcp_handle_msgsize(ip, inp);
2539 }
2540
2541 (*notify)(inp, inetctlerrmap[cmd]);
2542 }
2543 }
2544 socket_unlock(so: inp->inp_socket, refcount: 1);
2545}
2546
2547void
2548tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d, __unused struct ifnet *ifp)
2549{
2550 tcp_seq icmp_tcp_seq;
2551 struct in6_addr *dst;
2552 void (*notify)(struct inpcb *, int) = tcp_notify;
2553 struct ip6_hdr *ip6;
2554 struct mbuf *m;
2555 struct inpcb *inp;
2556 struct tcpcb *tp;
2557 struct icmp6_hdr *icmp6;
2558 struct ip6ctlparam *ip6cp = NULL;
2559 const struct sockaddr_in6 *sa6_src = NULL;
2560 unsigned int mtu;
2561 unsigned int off;
2562
2563 struct tcp_ports {
2564 uint16_t th_sport;
2565 uint16_t th_dport;
2566 } t_ports;
2567#if SKYWALK
2568 union sockaddr_in_4_6 sock_laddr;
2569 struct protoctl_ev_val prctl_ev_val;
2570#endif /* SKYWALK */
2571
2572 if (sa->sa_family != AF_INET6 ||
2573 sa->sa_len != sizeof(struct sockaddr_in6)) {
2574 return;
2575 }
2576
2577 /* Source quench is deprecated */
2578 if (cmd == PRC_QUENCH) {
2579 return;
2580 }
2581
2582 if ((unsigned)cmd >= PRC_NCMDS) {
2583 return;
2584 }
2585
2586 /* if the parameter is from icmp6, decode it. */
2587 if (d != NULL) {
2588 ip6cp = (struct ip6ctlparam *)d;
2589 icmp6 = ip6cp->ip6c_icmp6;
2590 m = ip6cp->ip6c_m;
2591 ip6 = ip6cp->ip6c_ip6;
2592 off = ip6cp->ip6c_off;
2593 sa6_src = ip6cp->ip6c_src;
2594 dst = ip6cp->ip6c_finaldst;
2595 } else {
2596 m = NULL;
2597 ip6 = NULL;
2598 off = 0; /* fool gcc */
2599 sa6_src = &sa6_any;
2600 dst = NULL;
2601 }
2602
2603 if (cmd == PRC_MSGSIZE) {
2604 notify = tcp_mtudisc;
2605 } else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2606 cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
2607 ip6 != NULL) {
2608 notify = tcp_drop_syn_sent;
2609 }
2610 /*
2611 * Hostdead is ugly because it goes linearly through all PCBs.
2612 * XXX: We never get this from ICMP, otherwise it makes an
2613 * excellent DoS attack on machines with many connections.
2614 */
2615 else if (cmd == PRC_HOSTDEAD) {
2616 ip6 = NULL;
2617 } else if (inet6ctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd)) {
2618 return;
2619 }
2620
2621#if SKYWALK
2622 bzero(s: &prctl_ev_val, n: sizeof(prctl_ev_val));
2623 bzero(s: &sock_laddr, n: sizeof(sock_laddr));
2624#endif /* SKYWALK */
2625
2626 if (ip6 == NULL) {
2627 in6_pcbnotify(&tcbinfo, sa, 0, SA(sa6_src), 0, cmd, NULL, notify);
2628#if SKYWALK
2629 protoctl_event_enqueue_nwk_wq_entry(ifp, NULL, p_raddr: sa,
2630 lport: 0, rport: 0, IPPROTO_TCP, protoctl_event_code: cmd, NULL);
2631#endif /* SKYWALK */
2632 return;
2633 }
2634
2635 /* Check if we can safely get the ports from the tcp hdr */
2636 if (m == NULL ||
2637 (m->m_pkthdr.len <
2638 (int32_t) (off + sizeof(struct tcp_ports)))) {
2639 return;
2640 }
2641 bzero(s: &t_ports, n: sizeof(struct tcp_ports));
2642 m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports);
2643
2644 off += sizeof(struct tcp_ports);
2645 if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) {
2646 return;
2647 }
2648 m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq);
2649 icmp_tcp_seq = ntohl(icmp_tcp_seq);
2650
2651 if (cmd == PRC_MSGSIZE) {
2652 mtu = ntohl(icmp6->icmp6_mtu);
2653 /*
2654 * If no alternative MTU was proposed, or the proposed
2655 * MTU was too small, set to the min.
2656 */
2657 if (mtu < IPV6_MMTU) {
2658 mtu = IPV6_MMTU - 8;
2659 }
2660 }
2661
2662 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_dst, t_ports.th_dport, ip6_input_getdstifscope(m),
2663 &ip6->ip6_src, t_ports.th_sport, ip6_input_getsrcifscope(m), 0, NULL);
2664
2665 if (inp == NULL ||
2666 inp->inp_socket == NULL) {
2667#if SKYWALK
2668 if (cmd == PRC_MSGSIZE) {
2669 prctl_ev_val.val = mtu;
2670 }
2671 prctl_ev_val.tcp_seq_number = icmp_tcp_seq;
2672
2673 sock_laddr.sin6.sin6_family = AF_INET6;
2674 sock_laddr.sin6.sin6_len = sizeof(sock_laddr.sin6);
2675 sock_laddr.sin6.sin6_addr = ip6->ip6_src;
2676
2677 protoctl_event_enqueue_nwk_wq_entry(ifp,
2678 SA(&sock_laddr), p_raddr: sa,
2679 lport: t_ports.th_sport, rport: t_ports.th_dport, IPPROTO_TCP,
2680 protoctl_event_code: cmd, p_protoctl_ev_val: &prctl_ev_val);
2681#endif /* SKYWALK */
2682 return;
2683 }
2684
2685 socket_lock(so: inp->inp_socket, refcount: 1);
2686 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2687 WNT_STOPUSING) {
2688 socket_unlock(so: inp->inp_socket, refcount: 1);
2689 return;
2690 }
2691
2692 if (PRC_IS_REDIRECT(cmd)) {
2693 /* signal EHOSTDOWN, as it flushes the cached route */
2694 (*notify)(inp, EHOSTDOWN);
2695 } else {
2696 tp = intotcpcb(inp);
2697 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2698 SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2699 if (cmd == PRC_MSGSIZE) {
2700 /*
2701 * Only process the offered MTU if it
2702 * is smaller than the current one.
2703 */
2704 if (mtu < tp->t_maxseg +
2705 (sizeof(struct tcphdr) + sizeof(struct ip6_hdr))) {
2706 (*notify)(inp, inetctlerrmap[cmd]);
2707 }
2708 } else {
2709 (*notify)(inp, inetctlerrmap[cmd]);
2710 }
2711 }
2712 }
2713 socket_unlock(so: inp->inp_socket, refcount: 1);
2714}
2715
2716
2717/*
2718 * Following is where TCP initial sequence number generation occurs.
2719 *
2720 * There are two places where we must use initial sequence numbers:
2721 * 1. In SYN-ACK packets.
2722 * 2. In SYN packets.
2723 *
2724 * The ISNs in SYN-ACK packets have no monotonicity requirement,
2725 * and should be as unpredictable as possible to avoid the possibility
2726 * of spoofing and/or connection hijacking. To satisfy this
2727 * requirement, SYN-ACK ISNs are generated via the arc4random()
2728 * function. If exact RFC 1948 compliance is requested via sysctl,
2729 * these ISNs will be generated just like those in SYN packets.
2730 *
2731 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
2732 * depends on this property. In addition, these ISNs should be
2733 * unguessable so as to prevent connection hijacking. To satisfy
2734 * the requirements of this situation, the algorithm outlined in
2735 * RFC 1948 is used to generate sequence numbers.
2736 *
2737 * For more information on the theory of operation, please see
2738 * RFC 1948.
2739 *
2740 * Implementation details:
2741 *
2742 * Time is based off the system timer, and is corrected so that it
2743 * increases by one megabyte per second. This allows for proper
2744 * recycling on high speed LANs while still leaving over an hour
2745 * before rollover.
2746 *
2747 * Two sysctls control the generation of ISNs:
2748 *
2749 * net.inet.tcp.isn_reseed_interval controls the number of seconds
2750 * between seeding of isn_secret. This is normally set to zero,
2751 * as reseeding should not be necessary.
2752 *
2753 * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
2754 * strictly. When strict compliance is requested, reseeding is
2755 * disabled and SYN-ACKs will be generated in the same manner as
2756 * SYNs. Strict mode is disabled by default.
2757 *
2758 */
2759
2760#define ISN_BYTES_PER_SECOND 1048576
2761
2762tcp_seq
2763tcp_new_isn(struct tcpcb *tp)
2764{
2765 u_int32_t md5_buffer[4];
2766 tcp_seq new_isn;
2767 struct timeval timenow;
2768 u_char isn_secret[32];
2769 long isn_last_reseed = 0;
2770 MD5_CTX isn_ctx;
2771
2772 /* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */
2773 if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT)) &&
2774 tcp_strict_rfc1948 == 0)
2775#ifdef __APPLE__
2776 { return RandomULong(); }
2777#else
2778 { return arc4random(); }
2779#endif
2780 getmicrotime(&timenow);
2781
2782 /* Seed if this is the first use, reseed if requested. */
2783 if ((isn_last_reseed == 0) ||
2784 ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) &&
2785 (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval * hz)
2786 < (u_int)timenow.tv_sec))) {
2787#ifdef __APPLE__
2788 read_frandom(buffer: &isn_secret, numBytes: sizeof(isn_secret));
2789#else
2790 read_random_unlimited(&isn_secret, sizeof(isn_secret));
2791#endif
2792 isn_last_reseed = timenow.tv_sec;
2793 }
2794
2795 /* Compute the md5 hash and return the ISN. */
2796 MD5Init(&isn_ctx);
2797 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport,
2798 sizeof(u_short));
2799 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport,
2800 sizeof(u_short));
2801 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
2802 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
2803 sizeof(struct in6_addr));
2804 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
2805 sizeof(struct in6_addr));
2806 } else {
2807 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
2808 sizeof(struct in_addr));
2809 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
2810 sizeof(struct in_addr));
2811 }
2812 MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
2813 MD5Final((u_char *) &md5_buffer, &isn_ctx);
2814 new_isn = (tcp_seq) md5_buffer[0];
2815 new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
2816 return new_isn;
2817}
2818
2819
2820/*
2821 * When a specific ICMP unreachable message is received and the
2822 * connection state is SYN-SENT, drop the connection. This behavior
2823 * is controlled by the icmp_may_rst sysctl.
2824 */
2825void
2826tcp_drop_syn_sent(struct inpcb *inp, int errno)
2827{
2828 struct tcpcb *tp = intotcpcb(inp);
2829
2830 if (tp && tp->t_state == TCPS_SYN_SENT) {
2831 tcp_drop(tp, errno);
2832 }
2833}
2834
2835/*
2836 * When `need fragmentation' ICMP is received, update our idea of the MSS
2837 * based on the new value in the route. Also nudge TCP to send something,
2838 * since we know the packet we just sent was dropped.
2839 * This duplicates some code in the tcp_mss() function in tcp_input.c.
2840 */
2841void
2842tcp_mtudisc(struct inpcb *inp, __unused int errno)
2843{
2844 struct tcpcb *tp = intotcpcb(inp);
2845 struct rtentry *rt;
2846 struct socket *so = inp->inp_socket;
2847 int mss;
2848 u_int32_t mtu;
2849 u_int32_t protoHdrOverhead = sizeof(struct tcpiphdr);
2850 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
2851
2852 /*
2853 * Nothing left to send after the socket is defunct or TCP is in the closed state
2854 */
2855 if ((so->so_state & SS_DEFUNCT) || (tp != NULL && tp->t_state == TCPS_CLOSED)) {
2856 return;
2857 }
2858
2859 if (isipv6) {
2860 protoHdrOverhead = sizeof(struct ip6_hdr) +
2861 sizeof(struct tcphdr);
2862 }
2863
2864 if (tp != NULL) {
2865 if (isipv6) {
2866 rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2867 } else {
2868 rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2869 }
2870 if (!rt || !rt->rt_rmx.rmx_mtu) {
2871 tp->t_maxopd = tp->t_maxseg =
2872 isipv6 ? tcp_v6mssdflt :
2873 tcp_mssdflt;
2874
2875 /* Route locked during lookup above */
2876 if (rt != NULL) {
2877 RT_UNLOCK(rt);
2878 }
2879 return;
2880 }
2881 mtu = rt->rt_rmx.rmx_mtu;
2882
2883 /* Route locked during lookup above */
2884 RT_UNLOCK(rt);
2885
2886#if NECP
2887 // Adjust MTU if necessary.
2888 mtu = necp_socket_get_effective_mtu(inp, current_mtu: mtu);
2889#endif /* NECP */
2890 mss = mtu - protoHdrOverhead;
2891
2892 if (tp->t_maxopd) {
2893 mss = min(a: mss, b: tp->t_maxopd);
2894 }
2895 /*
2896 * XXX - The above conditional probably violates the TCP
2897 * spec. The problem is that, since we don't know the
2898 * other end's MSS, we are supposed to use a conservative
2899 * default. But, if we do that, then MTU discovery will
2900 * never actually take place, because the conservative
2901 * default is much less than the MTUs typically seen
2902 * on the Internet today. For the moment, we'll sweep
2903 * this under the carpet.
2904 *
2905 * The conservative default might not actually be a problem
2906 * if the only case this occurs is when sending an initial
2907 * SYN with options and data to a host we've never talked
2908 * to before. Then, they will reply with an MSS value which
2909 * will get recorded and the new parameters should get
2910 * recomputed. For Further Study.
2911 */
2912 if (tp->t_maxopd <= mss) {
2913 return;
2914 }
2915 tp->t_maxopd = mss;
2916
2917 if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
2918 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) {
2919 mss -= TCPOLEN_TSTAMP_APPA;
2920 }
2921
2922#if MPTCP
2923 mss -= mptcp_adj_mss(tp, TRUE);
2924#endif
2925 if (so->so_snd.sb_hiwat < mss) {
2926 mss = so->so_snd.sb_hiwat;
2927 }
2928
2929 tp->t_maxseg = mss;
2930
2931 ASSERT(tp->t_maxseg);
2932
2933 /*
2934 * Reset the slow-start flight size as it may depends on the
2935 * new MSS
2936 */
2937 if (CC_ALGO(tp)->cwnd_init != NULL) {
2938 CC_ALGO(tp)->cwnd_init(tp);
2939 }
2940
2941 if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.rwnd_init != NULL) {
2942 tcp_cc_rledbat.rwnd_init(tp);
2943 }
2944
2945 tcpstat.tcps_mturesent++;
2946 tp->t_rtttime = 0;
2947 tp->snd_nxt = tp->snd_una;
2948 tcp_output(tp);
2949 }
2950}
2951
2952/*
2953 * Look-up the routing entry to the peer of this inpcb. If no route
2954 * is found and it cannot be allocated the return NULL. This routine
2955 * is called by TCP routines that access the rmx structure and by tcp_mss
2956 * to get the interface MTU. If a route is found, this routine will
2957 * hold the rtentry lock; the caller is responsible for unlocking.
2958 */
2959struct rtentry *
2960tcp_rtlookup(struct inpcb *inp, unsigned int input_ifscope)
2961{
2962 struct route *ro;
2963 struct rtentry *rt;
2964 struct tcpcb *tp;
2965
2966 LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2967
2968 ro = &inp->inp_route;
2969 if ((rt = ro->ro_rt) != NULL) {
2970 RT_LOCK(rt);
2971 }
2972
2973 if (ROUTE_UNUSABLE(ro)) {
2974 if (rt != NULL) {
2975 RT_UNLOCK(rt);
2976 rt = NULL;
2977 }
2978 ROUTE_RELEASE(ro);
2979 /* No route yet, so try to acquire one */
2980 if (inp->inp_faddr.s_addr != INADDR_ANY) {
2981 unsigned int ifscope;
2982
2983 ro->ro_dst.sa_family = AF_INET;
2984 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
2985 SIN(&ro->ro_dst)->sin_addr = inp->inp_faddr;
2986
2987 /*
2988 * If the socket was bound to an interface, then
2989 * the bound-to-interface takes precedence over
2990 * the inbound interface passed in by the caller
2991 * (if we get here as part of the output path then
2992 * input_ifscope is IFSCOPE_NONE).
2993 */
2994 ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2995 inp->inp_boundifp->if_index : input_ifscope;
2996
2997 rtalloc_scoped(ro, ifscope);
2998 if ((rt = ro->ro_rt) != NULL) {
2999 RT_LOCK(rt);
3000 }
3001 }
3002 }
3003 if (rt != NULL) {
3004 RT_LOCK_ASSERT_HELD(rt);
3005 }
3006
3007 /*
3008 * Update MTU discovery determination. Don't do it if:
3009 * 1) it is disabled via the sysctl
3010 * 2) the route isn't up
3011 * 3) the MTU is locked (if it is, then discovery has been
3012 * disabled)
3013 */
3014
3015 tp = intotcpcb(inp);
3016
3017 if (!path_mtu_discovery || ((rt != NULL) &&
3018 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
3019 tp->t_flags &= ~TF_PMTUD;
3020 } else {
3021 tp->t_flags |= TF_PMTUD;
3022 }
3023
3024 if (rt != NULL && rt->rt_ifp != NULL) {
3025 somultipages(inp->inp_socket,
3026 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
3027 tcp_set_tso(tp, ifp: rt->rt_ifp);
3028 soif2kcl(inp->inp_socket,
3029 (rt->rt_ifp->if_eflags & IFEF_2KCL));
3030 tcp_set_ecn(tp, ifp: rt->rt_ifp);
3031 if (inp->inp_last_outifp == NULL) {
3032 inp->inp_last_outifp = rt->rt_ifp;
3033#if SKYWALK
3034 if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3035 netns_set_ifnet(token: &inp->inp_netns_token,
3036 ifp: inp->inp_last_outifp);
3037 }
3038#endif /* SKYWALK */
3039 }
3040 }
3041
3042 /* Note if the peer is local */
3043 if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
3044 (rt->rt_gateway->sa_family == AF_LINK ||
3045 rt->rt_ifp->if_flags & IFF_LOOPBACK ||
3046 in_localaddr(inp->inp_faddr))) {
3047 tp->t_flags |= TF_LOCAL;
3048 }
3049
3050 /*
3051 * Caller needs to call RT_UNLOCK(rt).
3052 */
3053 return rt;
3054}
3055
3056struct rtentry *
3057tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope)
3058{
3059 struct route_in6 *ro6;
3060 struct rtentry *rt;
3061 struct tcpcb *tp;
3062
3063 LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
3064
3065 ro6 = &inp->in6p_route;
3066 if ((rt = ro6->ro_rt) != NULL) {
3067 RT_LOCK(rt);
3068 }
3069
3070 if (ROUTE_UNUSABLE(ro6)) {
3071 if (rt != NULL) {
3072 RT_UNLOCK(rt);
3073 rt = NULL;
3074 }
3075 ROUTE_RELEASE(ro6);
3076 /* No route yet, so try to acquire one */
3077 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
3078 struct sockaddr_in6 *dst6;
3079 unsigned int ifscope;
3080
3081 dst6 = SIN6(&ro6->ro_dst);
3082 dst6->sin6_family = AF_INET6;
3083 dst6->sin6_len = sizeof(*dst6);
3084 dst6->sin6_addr = inp->in6p_faddr;
3085
3086 /*
3087 * If the socket was bound to an interface, then
3088 * the bound-to-interface takes precedence over
3089 * the inbound interface passed in by the caller
3090 * (if we get here as part of the output path then
3091 * input_ifscope is IFSCOPE_NONE).
3092 */
3093 ifscope = (inp->inp_flags & INP_BOUND_IF) ?
3094 inp->inp_boundifp->if_index : input_ifscope;
3095
3096 rtalloc_scoped((struct route *)ro6, ifscope);
3097 if ((rt = ro6->ro_rt) != NULL) {
3098 RT_LOCK(rt);
3099 }
3100 }
3101 }
3102 if (rt != NULL) {
3103 RT_LOCK_ASSERT_HELD(rt);
3104 }
3105
3106 /*
3107 * Update path MTU Discovery determination
3108 * while looking up the route:
3109 * 1) we have a valid route to the destination
3110 * 2) the MTU is not locked (if it is, then discovery has been
3111 * disabled)
3112 */
3113
3114
3115 tp = intotcpcb(inp);
3116
3117 /*
3118 * Update MTU discovery determination. Don't do it if:
3119 * 1) it is disabled via the sysctl
3120 * 2) the route isn't up
3121 * 3) the MTU is locked (if it is, then discovery has been
3122 * disabled)
3123 */
3124
3125 if (!path_mtu_discovery || ((rt != NULL) &&
3126 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
3127 tp->t_flags &= ~TF_PMTUD;
3128 } else {
3129 tp->t_flags |= TF_PMTUD;
3130 }
3131
3132 if (rt != NULL && rt->rt_ifp != NULL) {
3133 somultipages(inp->inp_socket,
3134 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
3135 tcp_set_tso(tp, ifp: rt->rt_ifp);
3136 soif2kcl(inp->inp_socket,
3137 (rt->rt_ifp->if_eflags & IFEF_2KCL));
3138 tcp_set_ecn(tp, ifp: rt->rt_ifp);
3139 if (inp->inp_last_outifp == NULL) {
3140 inp->inp_last_outifp = rt->rt_ifp;
3141#if SKYWALK
3142 if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3143 netns_set_ifnet(token: &inp->inp_netns_token,
3144 ifp: inp->inp_last_outifp);
3145 }
3146#endif /* SKYWALK */
3147 }
3148
3149 /* Note if the peer is local */
3150 if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
3151 (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
3152 IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
3153 rt->rt_gateway->sa_family == AF_LINK ||
3154 in6_localaddr(&inp->in6p_faddr))) {
3155 tp->t_flags |= TF_LOCAL;
3156 }
3157 }
3158
3159 /*
3160 * Caller needs to call RT_UNLOCK(rt).
3161 */
3162 return rt;
3163}
3164
3165#if IPSEC
3166/* compute ESP/AH header size for TCP, including outer IP header. */
3167size_t
3168ipsec_hdrsiz_tcp(struct tcpcb *tp)
3169{
3170 struct inpcb *inp;
3171 struct mbuf *m;
3172 size_t hdrsiz;
3173 struct ip *ip;
3174 struct ip6_hdr *ip6 = NULL;
3175 struct tcphdr *th;
3176
3177 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL)) {
3178 return 0;
3179 }
3180 MGETHDR(m, M_DONTWAIT, MT_DATA); /* MAC-OK */
3181 if (!m) {
3182 return 0;
3183 }
3184
3185 if ((inp->inp_vflag & INP_IPV6) != 0) {
3186 ip6 = mtod(m, struct ip6_hdr *);
3187 th = (struct tcphdr *)(void *)(ip6 + 1);
3188 m->m_pkthdr.len = m->m_len =
3189 sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
3190 tcp_fillheaders(m, tp, ip_ptr: ip6, tcp_ptr: th);
3191 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
3192 } else {
3193 ip = mtod(m, struct ip *);
3194 th = (struct tcphdr *)(ip + 1);
3195 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
3196 tcp_fillheaders(m, tp, ip_ptr: ip, tcp_ptr: th);
3197 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
3198 }
3199 m_free(m);
3200 return hdrsiz;
3201}
3202#endif /* IPSEC */
3203
3204int
3205tcp_lock(struct socket *so, int refcount, void *lr)
3206{
3207 void *lr_saved;
3208
3209 if (lr == NULL) {
3210 lr_saved = __builtin_return_address(0);
3211 } else {
3212 lr_saved = lr;
3213 }
3214
3215retry:
3216 if (so->so_pcb != NULL) {
3217 if (so->so_flags & SOF_MP_SUBFLOW) {
3218 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3219 struct socket *mp_so = mptetoso(mpte: mp_tp->mpt_mpte);
3220
3221 socket_lock(so: mp_so, refcount);
3222
3223 /*
3224 * Check if we became non-MPTCP while waiting for the lock.
3225 * If yes, we have to retry to grab the right lock.
3226 */
3227 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
3228 socket_unlock(so: mp_so, refcount);
3229 goto retry;
3230 }
3231 } else {
3232 lck_mtx_lock(lck: &((struct inpcb *)so->so_pcb)->inpcb_mtx);
3233
3234 if (so->so_flags & SOF_MP_SUBFLOW) {
3235 /*
3236 * While waiting for the lock, we might have
3237 * become MPTCP-enabled (see mptcp_subflow_socreate).
3238 */
3239 lck_mtx_unlock(lck: &((struct inpcb *)so->so_pcb)->inpcb_mtx);
3240 goto retry;
3241 }
3242 }
3243 } else {
3244 panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s",
3245 so, lr_saved, solockhistory_nr(so));
3246 /* NOTREACHED */
3247 }
3248
3249 if (so->so_usecount < 0) {
3250 panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s",
3251 so, so->so_pcb, lr_saved, so->so_usecount,
3252 solockhistory_nr(so));
3253 /* NOTREACHED */
3254 }
3255 if (refcount) {
3256 so->so_usecount++;
3257 }
3258 so->lock_lr[so->next_lock_lr] = lr_saved;
3259 so->next_lock_lr = (so->next_lock_lr + 1) % SO_LCKDBG_MAX;
3260 return 0;
3261}
3262
3263int
3264tcp_unlock(struct socket *so, int refcount, void *lr)
3265{
3266 void *lr_saved;
3267
3268 if (lr == NULL) {
3269 lr_saved = __builtin_return_address(0);
3270 } else {
3271 lr_saved = lr;
3272 }
3273
3274#ifdef MORE_TCPLOCK_DEBUG
3275 printf("tcp_unlock: so=0x%llx sopcb=0x%llx lock=0x%llx ref=%x "
3276 "lr=0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so),
3277 (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb),
3278 (uint64_t)VM_KERNEL_ADDRPERM(&(sotoinpcb(so)->inpcb_mtx)),
3279 so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved));
3280#endif
3281 if (refcount) {
3282 so->so_usecount--;
3283 }
3284
3285 if (so->so_usecount < 0) {
3286 panic("tcp_unlock: so=%p usecount=%x lrh= %s",
3287 so, so->so_usecount, solockhistory_nr(so));
3288 /* NOTREACHED */
3289 }
3290 if (so->so_pcb == NULL) {
3291 panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s",
3292 so, so->so_usecount, lr_saved, solockhistory_nr(so));
3293 /* NOTREACHED */
3294 } else {
3295 so->unlock_lr[so->next_unlock_lr] = lr_saved;
3296 so->next_unlock_lr = (so->next_unlock_lr + 1) % SO_LCKDBG_MAX;
3297
3298 if (so->so_flags & SOF_MP_SUBFLOW) {
3299 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3300 struct socket *mp_so = mptetoso(mpte: mp_tp->mpt_mpte);
3301
3302 socket_lock_assert_owned(so: mp_so);
3303
3304 socket_unlock(so: mp_so, refcount);
3305 } else {
3306 LCK_MTX_ASSERT(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3307 LCK_MTX_ASSERT_OWNED);
3308 lck_mtx_unlock(lck: &((struct inpcb *)so->so_pcb)->inpcb_mtx);
3309 }
3310 }
3311 return 0;
3312}
3313
3314lck_mtx_t *
3315tcp_getlock(struct socket *so, int flags)
3316{
3317 struct inpcb *inp = sotoinpcb(so);
3318
3319 if (so->so_pcb) {
3320 if (so->so_usecount < 0) {
3321 panic("tcp_getlock: so=%p usecount=%x lrh= %s",
3322 so, so->so_usecount, solockhistory_nr(so));
3323 }
3324
3325 if (so->so_flags & SOF_MP_SUBFLOW) {
3326 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3327 struct socket *mp_so = mptetoso(mpte: mp_tp->mpt_mpte);
3328
3329 return mp_so->so_proto->pr_getlock(mp_so, flags);
3330 } else {
3331 return &inp->inpcb_mtx;
3332 }
3333 } else {
3334 panic("tcp_getlock: so=%p NULL so_pcb %s",
3335 so, solockhistory_nr(so));
3336 return so->so_proto->pr_domain->dom_mtx;
3337 }
3338}
3339
3340/*
3341 * Determine if we can grow the recieve socket buffer to avoid sending
3342 * a zero window update to the peer. We allow even socket buffers that
3343 * have fixed size (set by the application) to grow if the resource
3344 * constraints are met. They will also be trimmed after the application
3345 * reads data.
3346 */
3347static void
3348tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb)
3349{
3350 u_int32_t rcvbufinc = tp->t_maxseg << 4;
3351 u_int32_t rcvbuf = sb->sb_hiwat;
3352 struct socket *so = tp->t_inpcb->inp_socket;
3353
3354 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so)) {
3355 return;
3356 }
3357
3358 if (tcp_do_autorcvbuf == 1 &&
3359 (tp->t_flags & TF_SLOWLINK) == 0 &&
3360 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
3361 (rcvbuf - sb->sb_cc) < rcvbufinc &&
3362 rcvbuf < tcp_autorcvbuf_max &&
3363 (sb->sb_idealsize > 0 &&
3364 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
3365 sbreserve(sb,
3366 cc: min(a: (sb->sb_hiwat + rcvbufinc), b: tcp_autorcvbuf_max));
3367 }
3368}
3369
3370int32_t
3371tcp_sbspace(struct tcpcb *tp)
3372{
3373 struct socket *so = tp->t_inpcb->inp_socket;
3374 struct sockbuf *sb = &so->so_rcv;
3375 u_int32_t rcvbuf;
3376 int32_t space;
3377 int32_t pending = 0;
3378
3379 if (so->so_flags & SOF_MP_SUBFLOW) {
3380 /* We still need to grow TCP's buffer to have a BDP-estimate */
3381 tcp_sbrcv_grow_rwin(tp, sb);
3382
3383 return mptcp_sbspace(tptomptp(tp));
3384 }
3385
3386 tcp_sbrcv_grow_rwin(tp, sb);
3387
3388 /* hiwat might have changed */
3389 rcvbuf = sb->sb_hiwat;
3390
3391 space = ((int32_t) imin(a: (rcvbuf - sb->sb_cc),
3392 b: (sb->sb_mbmax - sb->sb_mbcnt)));
3393 if (space < 0) {
3394 space = 0;
3395 }
3396
3397#if CONTENT_FILTER
3398 /* Compensate for data being processed by content filters */
3399 pending = cfil_sock_data_space(sb);
3400#endif /* CONTENT_FILTER */
3401 if (pending > space) {
3402 space = 0;
3403 } else {
3404 space -= pending;
3405 }
3406
3407 /*
3408 * Avoid increasing window size if the current window
3409 * is already very low, we could be in "persist" mode and
3410 * we could break some apps (see rdar://5409343)
3411 */
3412
3413 if (space < tp->t_maxseg) {
3414 return space;
3415 }
3416
3417 /* Clip window size for slower link */
3418
3419 if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0) {
3420 return imin(a: space, b: slowlink_wsize);
3421 }
3422
3423 return space;
3424}
3425/*
3426 * Checks TCP Segment Offloading capability for a given connection
3427 * and interface pair.
3428 */
3429void
3430tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp)
3431{
3432 struct inpcb *inp;
3433 int isipv6;
3434 struct ifnet *tunnel_ifp = NULL;
3435#define IFNET_TSO_MASK (IFNET_TSO_IPV6 | IFNET_TSO_IPV4)
3436
3437 tp->t_flags &= ~TF_TSO;
3438
3439 /*
3440 * Bail if there's a non-TSO-capable filter on the interface.
3441 */
3442 if (ifp == NULL || ifp->if_flt_no_tso_count > 0) {
3443 return;
3444 }
3445
3446 inp = tp->t_inpcb;
3447 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
3448
3449#if MPTCP
3450 /*
3451 * We can't use TSO if this tcpcb belongs to an MPTCP session.
3452 */
3453 if (inp->inp_socket->so_flags & SOF_MP_SUBFLOW) {
3454 return;
3455 }
3456#endif
3457 /*
3458 * We can't use TSO if the TSO capability of the tunnel interface does
3459 * not match the capability of another interface known by TCP
3460 */
3461 if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL) {
3462 u_int tunnel_if_index = inp->inp_policyresult.results.result_parameter.tunnel_interface_index;
3463
3464 if (tunnel_if_index != 0) {
3465 ifnet_head_lock_shared();
3466 tunnel_ifp = ifindex2ifnet[tunnel_if_index];
3467 ifnet_head_done();
3468 }
3469
3470 if (tunnel_ifp == NULL) {
3471 return;
3472 }
3473
3474 if ((ifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3475 if (tso_debug > 0) {
3476 os_log(OS_LOG_DEFAULT,
3477 "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with ifp %s",
3478 __func__,
3479 ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3480 tunnel_ifp->if_xname, ifp->if_xname);
3481 }
3482 return;
3483 }
3484 if (inp->inp_last_outifp != NULL &&
3485 (inp->inp_last_outifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3486 if (tso_debug > 0) {
3487 os_log(OS_LOG_DEFAULT,
3488 "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_last_outifp %s",
3489 __func__,
3490 ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3491 tunnel_ifp->if_xname, inp->inp_last_outifp->if_xname);
3492 }
3493 return;
3494 }
3495 if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp != NULL &&
3496 (inp->inp_boundifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3497 if (tso_debug > 0) {
3498 os_log(OS_LOG_DEFAULT,
3499 "%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_boundifp %s",
3500 __func__,
3501 ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3502 tunnel_ifp->if_xname, inp->inp_boundifp->if_xname);
3503 }
3504 return;
3505 }
3506 }
3507
3508 if (isipv6) {
3509 if (ifp->if_hwassist & IFNET_TSO_IPV6) {
3510 tp->t_flags |= TF_TSO;
3511 if (ifp->if_tso_v6_mtu != 0) {
3512 tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
3513 } else {
3514 tp->tso_max_segment_size = TCP_MAXWIN;
3515 }
3516 }
3517 } else {
3518 if (ifp->if_hwassist & IFNET_TSO_IPV4) {
3519 tp->t_flags |= TF_TSO;
3520 if (ifp->if_tso_v4_mtu != 0) {
3521 tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
3522 } else {
3523 tp->tso_max_segment_size = TCP_MAXWIN;
3524 }
3525 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
3526 tp->tso_max_segment_size -=
3527 CLAT46_HDR_EXPANSION_OVERHD;
3528 }
3529 }
3530 }
3531
3532 if (tso_debug > 1) {
3533 os_log(OS_LOG_DEFAULT, "%s: %u > %u TSO %d ifp %s",
3534 __func__,
3535 ntohs(tp->t_inpcb->inp_lport),
3536 ntohs(tp->t_inpcb->inp_fport),
3537 (tp->t_flags & TF_TSO) != 0,
3538 ifp != NULL ? ifp->if_xname : "<NULL>");
3539 }
3540}
3541
3542#define TIMEVAL_TO_TCPHZ(_tv_) ((uint32_t)((_tv_).tv_sec * TCP_RETRANSHZ + \
3543 (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC))
3544
3545/*
3546 * Function to calculate the tcp clock. The tcp clock will get updated
3547 * at the boundaries of the tcp layer. This is done at 3 places:
3548 * 1. Right before processing an input tcp packet
3549 * 2. Whenever a connection wants to access the network using tcp_usrreqs
3550 * 3. When a tcp timer fires or before tcp slow timeout
3551 *
3552 */
3553
3554void
3555calculate_tcp_clock(void)
3556{
3557 struct timeval tv = tcp_uptime;
3558 struct timeval interval = {.tv_sec = 0, .tv_usec = TCP_RETRANSHZ_TO_USEC};
3559 struct timeval now, hold_now;
3560 uint32_t incr = 0;
3561
3562 microuptime(tv: &now);
3563
3564 /*
3565 * Update coarse-grained networking timestamp (in sec.); the idea
3566 * is to update the counter returnable via net_uptime() when
3567 * we read time.
3568 */
3569 net_update_uptime_with_time(&now);
3570
3571 timevaladd(t1: &tv, t2: &interval);
3572 if (timevalcmp(&now, &tv, >)) {
3573 /* time to update the clock */
3574 lck_spin_lock(lck: &tcp_uptime_lock);
3575 if (timevalcmp(&tcp_uptime, &now, >=)) {
3576 /* clock got updated while waiting for the lock */
3577 lck_spin_unlock(lck: &tcp_uptime_lock);
3578 return;
3579 }
3580
3581 microuptime(tv: &now);
3582 hold_now = now;
3583 tv = tcp_uptime;
3584 timevalsub(t1: &now, t2: &tv);
3585
3586 incr = TIMEVAL_TO_TCPHZ(now);
3587
3588 /* Account for the previous remainder */
3589 uint32_t remaining_us = (now.tv_usec % TCP_RETRANSHZ_TO_USEC) +
3590 tcp_now_remainder_us;
3591 if (remaining_us >= TCP_RETRANSHZ_TO_USEC) {
3592 incr += (remaining_us / TCP_RETRANSHZ_TO_USEC);
3593 }
3594
3595 if (incr > 0) {
3596 tcp_uptime = hold_now;
3597 tcp_now_remainder_us = remaining_us % TCP_RETRANSHZ_TO_USEC;
3598 tcp_now += incr;
3599 }
3600
3601 lck_spin_unlock(lck: &tcp_uptime_lock);
3602 }
3603}
3604
3605/*
3606 * Compute receive window scaling that we are going to request
3607 * for this connection based on sb_hiwat. Try to leave some
3608 * room to potentially increase the window size upto a maximum
3609 * defined by the constant tcp_autorcvbuf_max.
3610 */
3611void
3612tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so)
3613{
3614 uint32_t maxsockbufsize;
3615
3616 tp->request_r_scale = MAX((uint8_t)tcp_win_scale, tp->request_r_scale);
3617 maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ?
3618 so->so_rcv.sb_hiwat : tcp_autorcvbuf_max;
3619
3620 /*
3621 * Window scale should not exceed what is needed
3622 * to send the max receive window size; adding 1 to TCP_MAXWIN
3623 * ensures that.
3624 */
3625 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
3626 ((TCP_MAXWIN + 1) << tp->request_r_scale) < maxsockbufsize) {
3627 tp->request_r_scale++;
3628 }
3629 tp->request_r_scale = MIN(tp->request_r_scale, TCP_MAX_WINSHIFT);
3630}
3631
3632int
3633tcp_notsent_lowat_check(struct socket *so)
3634{
3635 struct inpcb *inp = sotoinpcb(so);
3636 struct tcpcb *tp = NULL;
3637 int notsent = 0;
3638
3639 if (inp != NULL) {
3640 tp = intotcpcb(inp);
3641 }
3642
3643 if (tp == NULL) {
3644 return 0;
3645 }
3646
3647 notsent = so->so_snd.sb_cc -
3648 (tp->snd_nxt - tp->snd_una);
3649
3650 /*
3651 * When we send a FIN or SYN, not_sent can be negative.
3652 * In that case also we need to send a write event to the
3653 * process if it is waiting. In the FIN case, it will
3654 * get an error from send because cantsendmore will be set.
3655 */
3656 if (notsent <= tp->t_notsent_lowat) {
3657 return 1;
3658 }
3659
3660 /*
3661 * When Nagle's algorithm is not disabled, it is better
3662 * to wakeup the client until there is atleast one
3663 * maxseg of data to write.
3664 */
3665 if ((tp->t_flags & TF_NODELAY) == 0 &&
3666 notsent > 0 && notsent < tp->t_maxseg) {
3667 return 1;
3668 }
3669 return 0;
3670}
3671
3672void
3673tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3674{
3675 struct tcp_rxt_seg *rxseg = NULL, *prev = NULL, *next = NULL;
3676 uint16_t rxcount = 0;
3677
3678 if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3679 tp->t_dsack_lastuna = tp->snd_una;
3680 }
3681 /*
3682 * First check if there is a segment already existing for this
3683 * sequence space.
3684 */
3685
3686 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3687 if (SEQ_GT(rxseg->rx_start, start)) {
3688 break;
3689 }
3690 prev = rxseg;
3691 }
3692 next = rxseg;
3693
3694 /* check if prev seg is for this sequence */
3695 if (prev != NULL && SEQ_LEQ(prev->rx_start, start) &&
3696 SEQ_GEQ(prev->rx_end, end)) {
3697 prev->rx_count++;
3698 return;
3699 }
3700
3701 /*
3702 * There are a couple of possibilities at this point.
3703 * 1. prev overlaps with the beginning of this sequence
3704 * 2. next overlaps with the end of this sequence
3705 * 3. there is no overlap.
3706 */
3707
3708 if (prev != NULL && SEQ_GT(prev->rx_end, start)) {
3709 if (prev->rx_start == start && SEQ_GT(end, prev->rx_end)) {
3710 start = prev->rx_end + 1;
3711 prev->rx_count++;
3712 } else {
3713 prev->rx_end = (start - 1);
3714 rxcount = prev->rx_count;
3715 }
3716 }
3717
3718 if (next != NULL && SEQ_LT(next->rx_start, end)) {
3719 if (SEQ_LEQ(next->rx_end, end)) {
3720 end = next->rx_start - 1;
3721 next->rx_count++;
3722 } else {
3723 next->rx_start = end + 1;
3724 rxcount = next->rx_count;
3725 }
3726 }
3727 if (!SEQ_LT(start, end)) {
3728 return;
3729 }
3730
3731 if (tcp_rxt_seg_max > 0 && tp->t_rxt_seg_count >= tcp_rxt_seg_max) {
3732 rxseg = SLIST_FIRST(&tp->t_rxt_segments);
3733 if (prev == rxseg) {
3734 prev = NULL;
3735 }
3736 SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3737 tcp_rxt_seg, rx_link);
3738
3739 tcp_rxt_seg_drop++;
3740 tp->t_rxt_seg_drop++;
3741 TCP_LOG(tp, "removed rxseg list overflow %u:%u ",
3742 rxseg->rx_start, rxseg->rx_end);
3743 zfree(tcp_rxt_seg_zone, rxseg);
3744
3745 tp->t_rxt_seg_count -= 1;
3746 }
3747
3748 rxseg = zalloc_flags(tcp_rxt_seg_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
3749 rxseg->rx_start = start;
3750 rxseg->rx_end = end;
3751 rxseg->rx_count = rxcount + 1;
3752
3753 if (prev != NULL) {
3754 SLIST_INSERT_AFTER(prev, rxseg, rx_link);
3755 } else {
3756 SLIST_INSERT_HEAD(&tp->t_rxt_segments, rxseg, rx_link);
3757 }
3758 tp->t_rxt_seg_count += 1;
3759}
3760
3761struct tcp_rxt_seg *
3762tcp_rxtseg_find(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3763{
3764 struct tcp_rxt_seg *rxseg;
3765
3766 if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3767 return NULL;
3768 }
3769
3770 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3771 if (SEQ_LEQ(rxseg->rx_start, start) &&
3772 SEQ_GEQ(rxseg->rx_end, end)) {
3773 return rxseg;
3774 }
3775 if (SEQ_GT(rxseg->rx_start, start)) {
3776 break;
3777 }
3778 }
3779 return NULL;
3780}
3781
3782void
3783tcp_rxtseg_set_spurious(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3784{
3785 struct tcp_rxt_seg *rxseg;
3786
3787 if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3788 return;
3789 }
3790
3791 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3792 if (SEQ_GEQ(rxseg->rx_start, start) &&
3793 SEQ_LEQ(rxseg->rx_end, end)) {
3794 /*
3795 * If the segment was retransmitted only once, mark it as
3796 * spurious.
3797 */
3798 if (rxseg->rx_count == 1) {
3799 rxseg->rx_flags |= TCP_RXT_SPURIOUS;
3800 }
3801 }
3802
3803 if (SEQ_GEQ(rxseg->rx_start, end)) {
3804 break;
3805 }
3806 }
3807 return;
3808}
3809
3810void
3811tcp_rxtseg_clean(struct tcpcb *tp)
3812{
3813 struct tcp_rxt_seg *rxseg, *next;
3814
3815 SLIST_FOREACH_SAFE(rxseg, &tp->t_rxt_segments, rx_link, next) {
3816 SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3817 tcp_rxt_seg, rx_link);
3818 zfree(tcp_rxt_seg_zone, rxseg);
3819 }
3820 tp->t_rxt_seg_count = 0;
3821 tp->t_dsack_lastuna = tp->snd_max;
3822}
3823
3824boolean_t
3825tcp_rxtseg_detect_bad_rexmt(struct tcpcb *tp, tcp_seq th_ack)
3826{
3827 boolean_t bad_rexmt;
3828 struct tcp_rxt_seg *rxseg;
3829
3830 if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3831 return FALSE;
3832 }
3833
3834 /*
3835 * If all of the segments in this window are not cumulatively
3836 * acknowledged, then there can still be undetected packet loss.
3837 * Do not restore congestion window in that case.
3838 */
3839 if (SEQ_LT(th_ack, tp->snd_recover)) {
3840 return FALSE;
3841 }
3842
3843 bad_rexmt = TRUE;
3844 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3845 if (!(rxseg->rx_flags & TCP_RXT_SPURIOUS)) {
3846 bad_rexmt = FALSE;
3847 break;
3848 }
3849 }
3850 return bad_rexmt;
3851}
3852
3853u_int32_t
3854tcp_rxtseg_total_size(struct tcpcb *tp)
3855{
3856 struct tcp_rxt_seg *rxseg;
3857 u_int32_t total_size = 0;
3858
3859 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3860 total_size += (rxseg->rx_end - rxseg->rx_start) + 1;
3861 }
3862 return total_size;
3863}
3864
3865void
3866tcp_get_connectivity_status(struct tcpcb *tp,
3867 struct tcp_conn_status *connstatus)
3868{
3869 if (tp == NULL || connstatus == NULL) {
3870 return;
3871 }
3872 bzero(s: connstatus, n: sizeof(*connstatus));
3873 if (tp->t_rxtshift >= TCP_CONNECTIVITY_PROBES_MAX) {
3874 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
3875 connstatus->write_probe_failed = 1;
3876 } else {
3877 connstatus->conn_probe_failed = 1;
3878 }
3879 }
3880 if (tp->t_rtimo_probes >= TCP_CONNECTIVITY_PROBES_MAX) {
3881 connstatus->read_probe_failed = 1;
3882 }
3883 if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL &&
3884 (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)) {
3885 connstatus->probe_activated = 1;
3886 }
3887}
3888
3889boolean_t
3890tfo_enabled(const struct tcpcb *tp)
3891{
3892 return (tp->t_flagsext & TF_FASTOPEN)? TRUE : FALSE;
3893}
3894
3895void
3896tcp_disable_tfo(struct tcpcb *tp)
3897{
3898 tp->t_flagsext &= ~TF_FASTOPEN;
3899}
3900
3901static struct mbuf *
3902tcp_make_keepalive_frame(struct tcpcb *tp, struct ifnet *ifp,
3903 boolean_t is_probe)
3904{
3905 struct inpcb *inp = tp->t_inpcb;
3906 struct tcphdr *th;
3907 u_int8_t *data;
3908 int win = 0;
3909 struct mbuf *m;
3910
3911 /*
3912 * The code assumes the IP + TCP headers fit in an mbuf packet header
3913 */
3914 _CASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN);
3915 _CASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN);
3916
3917 MGETHDR(m, M_WAIT, MT_HEADER);
3918 if (m == NULL) {
3919 return NULL;
3920 }
3921 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
3922
3923 data = mbuf_datastart(mbuf: m);
3924
3925 if (inp->inp_vflag & INP_IPV4) {
3926 bzero(s: data, n: sizeof(struct ip) + sizeof(struct tcphdr));
3927 th = (struct tcphdr *)(void *) (data + sizeof(struct ip));
3928 m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
3929 m->m_pkthdr.len = m->m_len;
3930 } else {
3931 VERIFY(inp->inp_vflag & INP_IPV6);
3932
3933 bzero(s: data, n: sizeof(struct ip6_hdr)
3934 + sizeof(struct tcphdr));
3935 th = (struct tcphdr *)(void *)(data + sizeof(struct ip6_hdr));
3936 m->m_len = sizeof(struct ip6_hdr) +
3937 sizeof(struct tcphdr);
3938 m->m_pkthdr.len = m->m_len;
3939 }
3940
3941 tcp_fillheaders(m, tp, ip_ptr: data, tcp_ptr: th);
3942
3943 if (inp->inp_vflag & INP_IPV4) {
3944 struct ip *ip;
3945
3946 ip = (__typeof__(ip))(void *)data;
3947
3948 ip->ip_id = rfc6864 ? 0 : ip_randomid((uint64_t)m);
3949 ip->ip_off = htons(IP_DF);
3950 ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
3951 ip->ip_ttl = inp->inp_ip_ttl;
3952 ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);
3953 ip->ip_sum = in_cksum_hdr(ip);
3954 } else {
3955 struct ip6_hdr *ip6;
3956
3957 ip6 = (__typeof__(ip6))(void *)data;
3958
3959 ip6->ip6_plen = htons(sizeof(struct tcphdr));
3960 ip6->ip6_hlim = in6_selecthlim(inp, ifp);
3961 ip6->ip6_flow = ip6->ip6_flow & ~IPV6_FLOW_ECN_MASK;
3962
3963 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
3964 ip6->ip6_src.s6_addr16[1] = 0;
3965 }
3966 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
3967 ip6->ip6_dst.s6_addr16[1] = 0;
3968 }
3969 }
3970 th->th_flags = TH_ACK;
3971
3972 win = tcp_sbspace(tp);
3973 if (win > ((int32_t)TCP_MAXWIN << tp->rcv_scale)) {
3974 win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
3975 }
3976 th->th_win = htons((u_short) (win >> tp->rcv_scale));
3977
3978 if (is_probe) {
3979 th->th_seq = htonl(tp->snd_una - 1);
3980 } else {
3981 th->th_seq = htonl(tp->snd_una);
3982 }
3983 th->th_ack = htonl(tp->rcv_nxt);
3984
3985 /* Force recompute TCP checksum to be the final value */
3986 th->th_sum = 0;
3987 if (inp->inp_vflag & INP_IPV4) {
3988 th->th_sum = inet_cksum(m, IPPROTO_TCP,
3989 sizeof(struct ip), sizeof(struct tcphdr));
3990 } else {
3991 th->th_sum = inet6_cksum(m, IPPROTO_TCP,
3992 sizeof(struct ip6_hdr), sizeof(struct tcphdr));
3993 }
3994
3995 return m;
3996}
3997
3998void
3999tcp_fill_keepalive_offload_frames(ifnet_t ifp,
4000 struct ifnet_keepalive_offload_frame *frames_array,
4001 u_int32_t frames_array_count, size_t frame_data_offset,
4002 u_int32_t *used_frames_count)
4003{
4004 struct inpcb *inp;
4005 inp_gen_t gencnt;
4006 u_int32_t frame_index = *used_frames_count;
4007
4008 if (ifp == NULL || frames_array == NULL ||
4009 frames_array_count == 0 ||
4010 frame_index >= frames_array_count ||
4011 frame_data_offset >= IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4012 return;
4013 }
4014
4015 /*
4016 * This function is called outside the regular TCP processing
4017 * so we need to update the TCP clock.
4018 */
4019 calculate_tcp_clock();
4020
4021 lck_rw_lock_shared(lck: &tcbinfo.ipi_lock);
4022 gencnt = tcbinfo.ipi_gencnt;
4023 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
4024 struct socket *so;
4025 struct ifnet_keepalive_offload_frame *frame;
4026 struct mbuf *m = NULL;
4027 struct tcpcb *tp = intotcpcb(inp);
4028
4029 if (frame_index >= frames_array_count) {
4030 break;
4031 }
4032
4033 if (inp->inp_gencnt > gencnt ||
4034 inp->inp_state == INPCB_STATE_DEAD) {
4035 continue;
4036 }
4037
4038 if ((so = inp->inp_socket) == NULL ||
4039 (so->so_state & SS_DEFUNCT)) {
4040 continue;
4041 }
4042 /*
4043 * check for keepalive offload flag without socket
4044 * lock to avoid a deadlock
4045 */
4046 if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
4047 continue;
4048 }
4049
4050 if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
4051 continue;
4052 }
4053 if (inp->inp_ppcb == NULL ||
4054 in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
4055 continue;
4056 }
4057 socket_lock(so, refcount: 1);
4058 /* Release the want count */
4059 if (inp->inp_ppcb == NULL ||
4060 (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
4061 socket_unlock(so, refcount: 1);
4062 continue;
4063 }
4064 if ((inp->inp_vflag & INP_IPV4) &&
4065 (inp->inp_laddr.s_addr == INADDR_ANY ||
4066 inp->inp_faddr.s_addr == INADDR_ANY)) {
4067 socket_unlock(so, refcount: 1);
4068 continue;
4069 }
4070 if ((inp->inp_vflag & INP_IPV6) &&
4071 (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
4072 IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))) {
4073 socket_unlock(so, refcount: 1);
4074 continue;
4075 }
4076 if (inp->inp_lport == 0 || inp->inp_fport == 0) {
4077 socket_unlock(so, refcount: 1);
4078 continue;
4079 }
4080 if (inp->inp_last_outifp == NULL ||
4081 inp->inp_last_outifp->if_index != ifp->if_index) {
4082 socket_unlock(so, refcount: 1);
4083 continue;
4084 }
4085 if ((inp->inp_vflag & INP_IPV4) && frame_data_offset +
4086 sizeof(struct ip) + sizeof(struct tcphdr) >
4087 IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4088 socket_unlock(so, refcount: 1);
4089 continue;
4090 } else if (!(inp->inp_vflag & INP_IPV4) && frame_data_offset +
4091 sizeof(struct ip6_hdr) + sizeof(struct tcphdr) >
4092 IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4093 socket_unlock(so, refcount: 1);
4094 continue;
4095 }
4096 /*
4097 * There is no point in waking up the device for connections
4098 * that are not established. Long lived connection are meant
4099 * for processes that will sent and receive data
4100 */
4101 if (tp->t_state != TCPS_ESTABLISHED) {
4102 socket_unlock(so, refcount: 1);
4103 continue;
4104 }
4105 /*
4106 * This inp has all the information that is needed to
4107 * generate an offload frame.
4108 */
4109 frame = &frames_array[frame_index];
4110 frame->type = IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP;
4111 frame->ether_type = (inp->inp_vflag & INP_IPV4) ?
4112 IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 :
4113 IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6;
4114 frame->interval = (uint16_t)(tp->t_keepidle > 0 ? tp->t_keepidle :
4115 tcp_keepidle);
4116 frame->keep_cnt = (uint8_t)TCP_CONN_KEEPCNT(tp);
4117 frame->keep_retry = (uint16_t)TCP_CONN_KEEPINTVL(tp);
4118 if (so->so_options & SO_NOWAKEFROMSLEEP) {
4119 frame->flags |=
4120 IFNET_KEEPALIVE_OFFLOAD_FLAG_NOWAKEFROMSLEEP;
4121 }
4122 frame->local_port = ntohs(inp->inp_lport);
4123 frame->remote_port = ntohs(inp->inp_fport);
4124 frame->local_seq = tp->snd_nxt;
4125 frame->remote_seq = tp->rcv_nxt;
4126 if (inp->inp_vflag & INP_IPV4) {
4127 ASSERT(frame_data_offset + sizeof(struct ip) + sizeof(struct tcphdr) <= UINT8_MAX);
4128 frame->length = (uint8_t)(frame_data_offset +
4129 sizeof(struct ip) + sizeof(struct tcphdr));
4130 frame->reply_length = frame->length;
4131
4132 frame->addr_length = sizeof(struct in_addr);
4133 bcopy(src: &inp->inp_laddr, dst: frame->local_addr,
4134 n: sizeof(struct in_addr));
4135 bcopy(src: &inp->inp_faddr, dst: frame->remote_addr,
4136 n: sizeof(struct in_addr));
4137 } else {
4138 struct in6_addr *ip6;
4139
4140 ASSERT(frame_data_offset + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= UINT8_MAX);
4141 frame->length = (uint8_t)(frame_data_offset +
4142 sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
4143 frame->reply_length = frame->length;
4144
4145 frame->addr_length = sizeof(struct in6_addr);
4146 ip6 = (struct in6_addr *)(void *)frame->local_addr;
4147 bcopy(src: &inp->in6p_laddr, dst: ip6, n: sizeof(struct in6_addr));
4148 if (IN6_IS_SCOPE_EMBED(ip6)) {
4149 ip6->s6_addr16[1] = 0;
4150 }
4151
4152 ip6 = (struct in6_addr *)(void *)frame->remote_addr;
4153 bcopy(src: &inp->in6p_faddr, dst: ip6, n: sizeof(struct in6_addr));
4154 if (IN6_IS_SCOPE_EMBED(ip6)) {
4155 ip6->s6_addr16[1] = 0;
4156 }
4157 }
4158
4159 /*
4160 * First the probe
4161 */
4162 m = tcp_make_keepalive_frame(tp, ifp, TRUE);
4163 if (m == NULL) {
4164 socket_unlock(so, refcount: 1);
4165 continue;
4166 }
4167 bcopy(src: m_mtod_current(m), dst: frame->data + frame_data_offset, n: m->m_len);
4168 m_freem(m);
4169
4170 /*
4171 * Now the response packet to incoming probes
4172 */
4173 m = tcp_make_keepalive_frame(tp, ifp, FALSE);
4174 if (m == NULL) {
4175 socket_unlock(so, refcount: 1);
4176 continue;
4177 }
4178 bcopy(src: m_mtod_current(m), dst: frame->reply_data + frame_data_offset,
4179 n: m->m_len);
4180 m_freem(m);
4181
4182 frame_index++;
4183 socket_unlock(so, refcount: 1);
4184 }
4185 lck_rw_done(lck: &tcbinfo.ipi_lock);
4186 *used_frames_count = frame_index;
4187}
4188
4189static bool
4190inp_matches_kao_frame(ifnet_t ifp, struct ifnet_keepalive_offload_frame *frame,
4191 struct inpcb *inp)
4192{
4193 if (inp->inp_ppcb == NULL) {
4194 return false;
4195 }
4196 /* Release the want count */
4197 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
4198 return false;
4199 }
4200 if (inp->inp_last_outifp == NULL ||
4201 inp->inp_last_outifp->if_index != ifp->if_index) {
4202 return false;
4203 }
4204 if (frame->local_port != ntohs(inp->inp_lport) ||
4205 frame->remote_port != ntohs(inp->inp_fport)) {
4206 return false;
4207 }
4208 if (inp->inp_vflag & INP_IPV4) {
4209 if (memcmp(s1: &inp->inp_laddr, s2: frame->local_addr,
4210 n: sizeof(struct in_addr)) != 0 ||
4211 memcmp(s1: &inp->inp_faddr, s2: frame->remote_addr,
4212 n: sizeof(struct in_addr)) != 0) {
4213 return false;
4214 }
4215 } else if (inp->inp_vflag & INP_IPV6) {
4216 if (memcmp(s1: &inp->inp_laddr, s2: frame->local_addr,
4217 n: sizeof(struct in6_addr)) != 0 ||
4218 memcmp(s1: &inp->inp_faddr, s2: frame->remote_addr,
4219 n: sizeof(struct in6_addr)) != 0) {
4220 return false;
4221 }
4222 } else {
4223 return false;
4224 }
4225 return true;
4226}
4227
4228int
4229tcp_notify_kao_timeout(ifnet_t ifp,
4230 struct ifnet_keepalive_offload_frame *frame)
4231{
4232 struct inpcb *inp = NULL;
4233 struct socket *so = NULL;
4234 bool found = false;
4235
4236 /*
4237 * Unlock the list before posting event on the matching socket
4238 */
4239 lck_rw_lock_shared(lck: &tcbinfo.ipi_lock);
4240
4241 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
4242 if ((so = inp->inp_socket) == NULL ||
4243 (so->so_state & SS_DEFUNCT)) {
4244 continue;
4245 }
4246 if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
4247 continue;
4248 }
4249 if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
4250 continue;
4251 }
4252 if (inp->inp_ppcb == NULL ||
4253 in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING) {
4254 continue;
4255 }
4256 socket_lock(so, refcount: 1);
4257 if (inp_matches_kao_frame(ifp, frame, inp)) {
4258 /*
4259 * Keep the matching socket locked
4260 */
4261 found = true;
4262 break;
4263 }
4264 socket_unlock(so, refcount: 1);
4265 }
4266 lck_rw_done(lck: &tcbinfo.ipi_lock);
4267
4268 if (found) {
4269 ASSERT(inp != NULL);
4270 ASSERT(so != NULL);
4271 ASSERT(so == inp->inp_socket);
4272 /*
4273 * Drop the TCP connection like tcptimers() does
4274 */
4275 struct tcpcb *tp = inp->inp_ppcb;
4276
4277 tcpstat.tcps_keepdrops++;
4278 soevent(so,
4279 hint: (SO_FILT_HINT_LOCKED | SO_FILT_HINT_TIMEOUT));
4280 tp = tcp_drop(tp, ETIMEDOUT);
4281
4282 tcpstat.tcps_ka_offload_drops++;
4283 os_log_info(OS_LOG_DEFAULT, "%s: dropped lport %u fport %u\n",
4284 __func__, frame->local_port, frame->remote_port);
4285
4286 socket_unlock(so, refcount: 1);
4287 }
4288
4289 return 0;
4290}
4291
4292errno_t
4293tcp_notify_ack_id_valid(struct tcpcb *tp, struct socket *so,
4294 u_int32_t notify_id)
4295{
4296 struct tcp_notify_ack_marker *elm;
4297
4298 if (so->so_snd.sb_cc == 0) {
4299 return ENOBUFS;
4300 }
4301
4302 SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
4303 /* Duplicate id is not allowed */
4304 if (elm->notify_id == notify_id) {
4305 return EINVAL;
4306 }
4307 /* Duplicate position is not allowed */
4308 if (elm->notify_snd_una == tp->snd_una + so->so_snd.sb_cc) {
4309 return EINVAL;
4310 }
4311 }
4312 return 0;
4313}
4314
4315errno_t
4316tcp_add_notify_ack_marker(struct tcpcb *tp, u_int32_t notify_id)
4317{
4318 struct tcp_notify_ack_marker *nm, *elm = NULL;
4319 struct socket *so = tp->t_inpcb->inp_socket;
4320
4321 nm = kalloc_type(struct tcp_notify_ack_marker, M_WAIT | Z_ZERO);
4322 if (nm == NULL) {
4323 return ENOMEM;
4324 }
4325 nm->notify_id = notify_id;
4326 nm->notify_snd_una = tp->snd_una + so->so_snd.sb_cc;
4327
4328 SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
4329 if (SEQ_GT(nm->notify_snd_una, elm->notify_snd_una)) {
4330 break;
4331 }
4332 }
4333
4334 if (elm == NULL) {
4335 VERIFY(SLIST_EMPTY(&tp->t_notify_ack));
4336 SLIST_INSERT_HEAD(&tp->t_notify_ack, nm, notify_next);
4337 } else {
4338 SLIST_INSERT_AFTER(elm, nm, notify_next);
4339 }
4340 tp->t_notify_ack_count++;
4341 return 0;
4342}
4343
4344void
4345tcp_notify_ack_free(struct tcpcb *tp)
4346{
4347 struct tcp_notify_ack_marker *elm, *next;
4348 if (SLIST_EMPTY(&tp->t_notify_ack)) {
4349 return;
4350 }
4351
4352 SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
4353 SLIST_REMOVE(&tp->t_notify_ack, elm, tcp_notify_ack_marker,
4354 notify_next);
4355 kfree_type(struct tcp_notify_ack_marker, elm);
4356 }
4357 SLIST_INIT(&tp->t_notify_ack);
4358 tp->t_notify_ack_count = 0;
4359}
4360
4361inline void
4362tcp_notify_acknowledgement(struct tcpcb *tp, struct socket *so)
4363{
4364 struct tcp_notify_ack_marker *elm;
4365
4366 elm = SLIST_FIRST(&tp->t_notify_ack);
4367 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4368 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOTIFY_ACK);
4369 }
4370}
4371
4372void
4373tcp_get_notify_ack_count(struct tcpcb *tp,
4374 struct tcp_notify_ack_complete *retid)
4375{
4376 struct tcp_notify_ack_marker *elm;
4377 uint32_t complete = 0;
4378
4379 SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
4380 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4381 ASSERT(complete < UINT32_MAX);
4382 complete++;
4383 } else {
4384 break;
4385 }
4386 }
4387 retid->notify_pending = tp->t_notify_ack_count - complete;
4388 retid->notify_complete_count = min(TCP_MAX_NOTIFY_ACK, b: complete);
4389}
4390
4391void
4392tcp_get_notify_ack_ids(struct tcpcb *tp,
4393 struct tcp_notify_ack_complete *retid)
4394{
4395 size_t i = 0;
4396 struct tcp_notify_ack_marker *elm, *next;
4397
4398 SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
4399 if (i >= retid->notify_complete_count) {
4400 break;
4401 }
4402 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4403 retid->notify_complete_id[i++] = elm->notify_id;
4404 SLIST_REMOVE(&tp->t_notify_ack, elm,
4405 tcp_notify_ack_marker, notify_next);
4406 kfree_type(struct tcp_notify_ack_marker, elm);
4407 tp->t_notify_ack_count--;
4408 } else {
4409 break;
4410 }
4411 }
4412}
4413
4414bool
4415tcp_notify_ack_active(struct socket *so)
4416{
4417 if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
4418 SOCK_TYPE(so) == SOCK_STREAM) {
4419 struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4420
4421 if (!SLIST_EMPTY(&tp->t_notify_ack)) {
4422 struct tcp_notify_ack_marker *elm;
4423 elm = SLIST_FIRST(&tp->t_notify_ack);
4424 if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4425 return true;
4426 }
4427 }
4428 }
4429 return false;
4430}
4431
4432inline int32_t
4433inp_get_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
4434{
4435 struct inpcb *inp = sotoinpcb(so);
4436 struct tcpcb *tp = intotcpcb(inp);
4437
4438 if ((so->so_snd.sb_flags & SB_SNDBYTE_CNT) &&
4439 so->so_snd.sb_cc > 0) {
4440 int32_t unsent, sent;
4441 sent = tp->snd_max - th_ack;
4442 if (tp->t_flags & TF_SENTFIN) {
4443 sent--;
4444 }
4445 unsent = so->so_snd.sb_cc - sent;
4446 return unsent;
4447 }
4448 return 0;
4449}
4450
4451uint8_t
4452tcp_get_ace(struct tcphdr *th)
4453{
4454 uint8_t ace = 0;
4455 if (th->th_flags & TH_ECE) {
4456 ace += 1;
4457 }
4458 if (th->th_flags & TH_CWR) {
4459 ace += 2;
4460 }
4461 if (th->th_x2 & (TH_AE >> 8)) {
4462 ace += 4;
4463 }
4464
4465 return ace;
4466}
4467
4468#define IFP_PER_FLOW_STAT(_ipv4_, _stat_) { \
4469 if (_ipv4_) { \
4470 ifp->if_ipv4_stat->_stat_++; \
4471 } else { \
4472 ifp->if_ipv6_stat->_stat_++; \
4473 } \
4474}
4475
4476#define FLOW_ECN_ENABLED(_flags_) \
4477 ((_flags_ & (TE_ECN_ON)) == (TE_ECN_ON))
4478
4479void
4480tcp_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
4481 struct ifnet *ifp)
4482{
4483 if (ifp == NULL || !IF_FULLY_ATTACHED(ifp)) {
4484 return;
4485 }
4486
4487 ifnet_lock_shared(ifp);
4488 if (ifs->ecn_flags & TE_SETUPSENT) {
4489 if (ifs->ecn_flags & TE_CLIENT_SETUP) {
4490 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_client_setup);
4491 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4492 IFP_PER_FLOW_STAT(ifs->ipv4,
4493 ecn_client_success);
4494 } else if (ifs->ecn_flags & TE_LOST_SYN) {
4495 IFP_PER_FLOW_STAT(ifs->ipv4,
4496 ecn_syn_lost);
4497 } else {
4498 IFP_PER_FLOW_STAT(ifs->ipv4,
4499 ecn_peer_nosupport);
4500 }
4501 } else {
4502 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_server_setup);
4503 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4504 IFP_PER_FLOW_STAT(ifs->ipv4,
4505 ecn_server_success);
4506 } else if (ifs->ecn_flags & TE_LOST_SYN) {
4507 IFP_PER_FLOW_STAT(ifs->ipv4,
4508 ecn_synack_lost);
4509 } else {
4510 IFP_PER_FLOW_STAT(ifs->ipv4,
4511 ecn_peer_nosupport);
4512 }
4513 }
4514 } else {
4515 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off_conn);
4516 }
4517 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4518 if (ifs->ecn_flags & TE_RECV_ECN_CE) {
4519 tcpstat.tcps_ecn_conn_recv_ce++;
4520 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ce);
4521 }
4522 if (ifs->ecn_flags & TE_RECV_ECN_ECE) {
4523 tcpstat.tcps_ecn_conn_recv_ece++;
4524 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ece);
4525 }
4526 if (ifs->ecn_flags & (TE_RECV_ECN_CE | TE_RECV_ECN_ECE)) {
4527 if (ifs->txretransmitbytes > 0 ||
4528 ifs->rxoutoforderbytes > 0) {
4529 tcpstat.tcps_ecn_conn_pl_ce++;
4530 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plce);
4531 } else {
4532 tcpstat.tcps_ecn_conn_nopl_ce++;
4533 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_noplce);
4534 }
4535 } else {
4536 if (ifs->txretransmitbytes > 0 ||
4537 ifs->rxoutoforderbytes > 0) {
4538 tcpstat.tcps_ecn_conn_plnoce++;
4539 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plnoce);
4540 }
4541 }
4542 }
4543
4544 /* Other stats are interesting for non-local connections only */
4545 if (ifs->local) {
4546 ifnet_lock_done(ifp);
4547 return;
4548 }
4549
4550 if (ifs->ipv4) {
4551 ifp->if_ipv4_stat->timestamp = net_uptime();
4552 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4553 tcp_flow_ecn_perf_stats(ifs, stat: &ifp->if_ipv4_stat->ecn_on);
4554 } else {
4555 tcp_flow_ecn_perf_stats(ifs, stat: &ifp->if_ipv4_stat->ecn_off);
4556 }
4557 } else {
4558 ifp->if_ipv6_stat->timestamp = net_uptime();
4559 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4560 tcp_flow_ecn_perf_stats(ifs, stat: &ifp->if_ipv6_stat->ecn_on);
4561 } else {
4562 tcp_flow_ecn_perf_stats(ifs, stat: &ifp->if_ipv6_stat->ecn_off);
4563 }
4564 }
4565
4566 if (ifs->rxmit_drop) {
4567 if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4568 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_on.rxmit_drop);
4569 } else {
4570 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off.rxmit_drop);
4571 }
4572 }
4573 if (ifs->ecn_fallback_synloss) {
4574 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_synloss);
4575 }
4576 if (ifs->ecn_fallback_droprst) {
4577 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprst);
4578 }
4579 if (ifs->ecn_fallback_droprxmt) {
4580 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprxmt);
4581 }
4582 if (ifs->ecn_fallback_ce) {
4583 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_ce);
4584 }
4585 if (ifs->ecn_fallback_reorder) {
4586 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_reorder);
4587 }
4588 if (ifs->ecn_recv_ce > 0) {
4589 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ce);
4590 }
4591 if (ifs->ecn_recv_ece > 0) {
4592 IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ece);
4593 }
4594
4595 tcp_flow_lim_stats(ifs, stat: &ifp->if_lim_stat);
4596 ifnet_lock_done(ifp);
4597}
4598
4599#if SKYWALK
4600
4601#include <skywalk/core/skywalk_var.h>
4602#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
4603
4604void
4605tcp_add_fsw_flow(struct tcpcb *tp, struct ifnet *ifp)
4606{
4607 struct inpcb *inp = tp->t_inpcb;
4608 struct socket *so = inp->inp_socket;
4609 uuid_t fsw_uuid;
4610 struct nx_flow_req nfr;
4611 int err;
4612
4613 if (!NX_FSW_TCP_RX_AGG_ENABLED()) {
4614 return;
4615 }
4616
4617 if (ifp == NULL || kern_nexus_get_flowswitch_instance(ifp, nx_uuid: fsw_uuid)) {
4618 TCP_LOG_FSW_FLOW(tp, "skip ifp no fsw");
4619 return;
4620 }
4621
4622 memset(s: &nfr, c: 0, n: sizeof(nfr));
4623
4624 if (inp->inp_vflag & INP_IPV4) {
4625 ASSERT(!(inp->inp_laddr.s_addr == INADDR_ANY ||
4626 inp->inp_faddr.s_addr == INADDR_ANY ||
4627 IN_MULTICAST(ntohl(inp->inp_laddr.s_addr)) ||
4628 IN_MULTICAST(ntohl(inp->inp_faddr.s_addr))));
4629 nfr.nfr_saddr.sin.sin_len = sizeof(struct sockaddr_in);
4630 nfr.nfr_saddr.sin.sin_family = AF_INET;
4631 nfr.nfr_saddr.sin.sin_port = inp->inp_lport;
4632 memcpy(dst: &nfr.nfr_saddr.sin.sin_addr, src: &inp->inp_laddr,
4633 n: sizeof(struct in_addr));
4634 nfr.nfr_daddr.sin.sin_len = sizeof(struct sockaddr_in);
4635 nfr.nfr_daddr.sin.sin_family = AF_INET;
4636 nfr.nfr_daddr.sin.sin_port = inp->inp_fport;
4637 memcpy(dst: &nfr.nfr_daddr.sin.sin_addr, src: &inp->inp_faddr,
4638 n: sizeof(struct in_addr));
4639 } else {
4640 ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
4641 IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) ||
4642 IN6_IS_ADDR_MULTICAST(&inp->in6p_laddr) ||
4643 IN6_IS_ADDR_MULTICAST(&inp->in6p_faddr)));
4644 nfr.nfr_saddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
4645 nfr.nfr_saddr.sin6.sin6_family = AF_INET6;
4646 nfr.nfr_saddr.sin6.sin6_port = inp->inp_lport;
4647 memcpy(dst: &nfr.nfr_saddr.sin6.sin6_addr, src: &inp->in6p_laddr,
4648 n: sizeof(struct in6_addr));
4649 nfr.nfr_daddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
4650 nfr.nfr_daddr.sin.sin_family = AF_INET6;
4651 nfr.nfr_daddr.sin6.sin6_port = inp->inp_fport;
4652 memcpy(dst: &nfr.nfr_daddr.sin6.sin6_addr, src: &inp->in6p_faddr,
4653 n: sizeof(struct in6_addr));
4654 /* clear embedded scope ID */
4655 if (IN6_IS_SCOPE_EMBED(&nfr.nfr_saddr.sin6.sin6_addr)) {
4656 nfr.nfr_saddr.sin6.sin6_addr.s6_addr16[1] = 0;
4657 }
4658 if (IN6_IS_SCOPE_EMBED(&nfr.nfr_daddr.sin6.sin6_addr)) {
4659 nfr.nfr_daddr.sin6.sin6_addr.s6_addr16[1] = 0;
4660 }
4661 }
4662
4663 nfr.nfr_nx_port = 1;
4664 nfr.nfr_ip_protocol = IPPROTO_TCP;
4665 nfr.nfr_transport_protocol = IPPROTO_TCP;
4666 nfr.nfr_flags = NXFLOWREQF_ASIS;
4667 nfr.nfr_epid = (so != NULL ? so->last_pid : 0);
4668 if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
4669 nfr.nfr_port_reservation = inp->inp_netns_token;
4670 nfr.nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
4671 }
4672 ASSERT(inp->inp_flowhash != 0);
4673 nfr.nfr_inp_flowhash = inp->inp_flowhash;
4674
4675 uuid_generate_random(out: nfr.nfr_flow_uuid);
4676 err = kern_nexus_flow_add(ncd: kern_nexus_shared_controller(), nx_uuid: fsw_uuid,
4677 data: &nfr, data_len: sizeof(nfr));
4678
4679 if (err == 0) {
4680 uuid_copy(dst: tp->t_fsw_uuid, src: fsw_uuid);
4681 uuid_copy(dst: tp->t_flow_uuid, src: nfr.nfr_flow_uuid);
4682 }
4683
4684 TCP_LOG_FSW_FLOW(tp, "add err %d\n", err);
4685}
4686
4687void
4688tcp_del_fsw_flow(struct tcpcb *tp)
4689{
4690 if (uuid_is_null(uu: tp->t_fsw_uuid) || uuid_is_null(uu: tp->t_flow_uuid)) {
4691 return;
4692 }
4693
4694 struct nx_flow_req nfr;
4695 uuid_copy(dst: nfr.nfr_flow_uuid, src: tp->t_flow_uuid);
4696
4697 /* It's possible for this call to fail if the nexus has detached */
4698 int err = kern_nexus_flow_del(ncd: kern_nexus_shared_controller(),
4699 nx_uuid: tp->t_fsw_uuid, data: &nfr, data_len: sizeof(nfr));
4700 VERIFY(err == 0 || err == ENOENT || err == ENXIO);
4701
4702 uuid_clear(uu: tp->t_fsw_uuid);
4703 uuid_clear(uu: tp->t_flow_uuid);
4704
4705 TCP_LOG_FSW_FLOW(tp, "del err %d\n", err);
4706}
4707
4708#endif /* SKYWALK */
4709