tcp_subr.c source code [xnu/bsd/netinet/tcp_subr.c]

1	/*
2	* Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30	* The Regents of the University of California. All rights reserved.
31	*
32	* Redistribution and use in source and binary forms, with or without
33	* modification, are permitted provided that the following conditions
34	* are met:
35	* 1. Redistributions of source code must retain the above copyright
36	* notice, this list of conditions and the following disclaimer.
37	* 2. Redistributions in binary form must reproduce the above copyright
38	* notice, this list of conditions and the following disclaimer in the
39	* documentation and/or other materials provided with the distribution.
40	* 3. All advertising materials mentioning features or use of this software
41	* must display the following acknowledgement:
42	* This product includes software developed by the University of
43	* California, Berkeley and its contributors.
44	* 4. Neither the name of the University nor the names of its contributors
45	* may be used to endorse or promote products derived from this software
46	* without specific prior written permission.
47	*
48	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58	* SUCH DAMAGE.
59	*
60	* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
61	*/
62	/*
63	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64	* support for mandatory and extensible security protections. This notice
65	* is included in support of clause 2.2 (b) of the Apple Public License,
66	* Version 2.0.
67	*/
68
69	#include "tcp_includes.h"
70
71	#include <sys/param.h>
72	#include <sys/systm.h>
73	#include <sys/kernel.h>
74	#include <sys/sysctl.h>
75	#include <sys/malloc.h>
76	#include <sys/mbuf.h>
77	#include <sys/domain.h>
78	#include <sys/proc.h>
79	#include <sys/kauth.h>
80	#include <sys/socket.h>
81	#include <sys/socketvar.h>
82	#include <sys/protosw.h>
83	#include <sys/random.h>
84	#include <sys/syslog.h>
85	#include <sys/mcache.h>
86	#include <kern/locks.h>
87	#include <kern/zalloc.h>
88
89	#include <dev/random/randomdev.h>
90
91	#include <net/route.h>
92	#include <net/if.h>
93	#include <net/content_filter.h>
94	#include <net/ntstat.h>
95	#include <net/multi_layer_pkt_log.h>
96
97	#define tcp_minmssoverload fring
98	#define _IP_VHL
99	#include <netinet/in.h>
100	#include <netinet/in_systm.h>
101	#include <netinet/ip.h>
102	#include <netinet/ip_icmp.h>
103	#include <netinet/ip6.h>
104	#include <netinet/icmp6.h>
105	#include <netinet/in_pcb.h>
106	#include <netinet6/in6_pcb.h>
107	#include <netinet/in_var.h>
108	#include <netinet/ip_var.h>
109	#include <netinet/icmp_var.h>
110	#include <netinet6/ip6_var.h>
111	#include <netinet/mptcp_var.h>
112	#include <netinet/tcp.h>
113	#include <netinet/tcp_fsm.h>
114	#include <netinet/tcp_seq.h>
115	#include <netinet/tcp_timer.h>
116	#include <netinet/tcp_var.h>
117	#include <netinet/tcp_cc.h>
118	#include <netinet/tcp_cache.h>
119	#include <kern/thread_call.h>
120
121	#include <netinet6/tcp6_var.h>
122	#include <netinet/tcpip.h>
123	#if TCPDEBUG
124	#include <netinet/tcp_debug.h>
125	#endif
126	#include <netinet/tcp_log.h>
127
128	#include <netinet6/ip6protosw.h>
129
130	#if IPSEC
131	#include <netinet6/ipsec.h>
132	#include <netinet6/ipsec6.h>
133	#endif /* IPSEC */
134
135	#if NECP
136	#include <net/necp.h>
137	#endif /* NECP */
138
139	#undef tcp_minmssoverload
140
141	#include <net/sockaddr_utils.h>
142
143	#include <corecrypto/ccaes.h>
144	#include <libkern/crypto/aes.h>
145	#include <libkern/crypto/md5.h>
146	#include <sys/kdebug.h>
147	#include <mach/sdt.h>
148	#include <pexpert/pexpert.h>
149
150	#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) \| 2))
151
152	static tcp_cc tcp_ccgen;
153
154	extern struct tcptimerlist tcp_timer_list;
155	extern struct tcptailq tcp_tw_tailq;
156
157	extern int tcp_awdl_rtobase;
158
159	SYSCTL_SKMEM_TCP_INT(TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW \| CTLFLAG_LOCKED,
160	int, tcp_mssdflt, TCP_MSS, "Default TCP Maximum Segment Size");
161
162	SYSCTL_SKMEM_TCP_INT(TCPCTL_V6MSSDFLT, v6mssdflt,
163	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_v6mssdflt, TCP6_MSS,
164	"Default TCP Maximum Segment Size for IPv6");
165
166	int tcp_sysctl_fastopenkey(struct sysctl_oid , void* , int*,
167	struct sysctl_req *);
168	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key, CTLTYPE_STRING \| CTLFLAG_WR,
169	`0`, `0`, tcp_sysctl_fastopenkey, "S", "TCP Fastopen key");
170
171	/ Current count of half-open TFO connections /
172	int tcp_tfo_halfcnt = `0`;
173
174	/ Maximum of half-open TFO connection backlog /
175	SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen_backlog,
176	CTLFLAG_RW \| CTLFLAG_LOCKED, int, tcp_tfo_backlog, `10`,
177	"Backlog queue for half-open TFO connections");
178
179	SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen, CTLFLAG_RW \| CTLFLAG_LOCKED,
180	int, tcp_fastopen, TCP_FASTOPEN_CLIENT \| TCP_FASTOPEN_SERVER,
181	"Enable TCP Fastopen (RFC 7413)");
182
183	SYSCTL_SKMEM_TCP_INT(OID_AUTO, now_init, CTLFLAG_RD \| CTLFLAG_LOCKED,
184	uint32_t, tcp_now_init, `0`, "Initial tcp now value");
185
186	SYSCTL_SKMEM_TCP_INT(OID_AUTO, microuptime_init, CTLFLAG_RD \| CTLFLAG_LOCKED,
187	uint32_t, tcp_microuptime_init, `0`, "Initial tcp uptime value in micro seconds");
188
189	/*
190	* Minimum MSS we accept and use. This prevents DoS attacks where
191	* we are forced to a ridiculous low MSS like 20 and send hundreds
192	* of packets instead of one. The effect scales with the available
193	* bandwidth and quickly saturates the CPU and network interface
194	* with packet generation and sending. Set to zero to disable MINMSS
195	* checking. This setting prevents us from sending too small packets.
196	*/
197	SYSCTL_SKMEM_TCP_INT(OID_AUTO, minmss, CTLFLAG_RW \| CTLFLAG_LOCKED,
198	int, tcp_minmss, TCP_MINMSS, "Minmum TCP Maximum Segment Size");
199
200	SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD \| CTLFLAG_LOCKED,
201	&tcbinfo.ipi_count, `0`, "Number of active PCBs");
202
203	SYSCTL_INT(_net_inet_tcp, OID_AUTO, tw_pcbcount, CTLFLAG_RD \| CTLFLAG_LOCKED,
204	&tcbinfo.ipi_twcount, `0`, "Number of pcbs in time-wait state");
205
206	SYSCTL_SKMEM_TCP_INT(OID_AUTO, icmp_may_rst, CTLFLAG_RW \| CTLFLAG_LOCKED,
207	static int, icmp_may_rst, `1`,
208	"Certain ICMP unreachable messages may abort connections in SYN_SENT");
209
210	static int tcp_strict_rfc1948 = `0`;
211	static int tcp_isn_reseed_interval = `0`;
212	int tcp_do_timestamps = `1`;
213	#if (DEVELOPMENT \|\| DEBUG)
214	SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW \| CTLFLAG_LOCKED,
215	&tcp_strict_rfc1948, `0`, "Determines if RFC1948 is followed exactly");
216
217	SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval,
218	CTLFLAG_RW \| CTLFLAG_LOCKED,
219	&tcp_isn_reseed_interval, `0`, "Seconds between reseeding of ISN secret");
220
221	SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_timestamps,
222	CTLFLAG_RW \| CTLFLAG_LOCKED, &tcp_do_timestamps, `0`, "enable TCP timestamps");
223	#endif /* (DEVELOPMENT \|\| DEBUG) */
224
225	SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_min, CTLFLAG_RW \| CTLFLAG_LOCKED,
226	int, tcp_TCPTV_MIN, `100`, "min rtt value allowed");
227
228	SYSCTL_SKMEM_TCP_INT(OID_AUTO, rexmt_slop, CTLFLAG_RW,
229	int, tcp_rexmt_slop, TCPTV_REXMTSLOP, "Slop added to retransmit timeout");
230
231	SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_ports, CTLFLAG_RW \| CTLFLAG_LOCKED,
232	__private_extern__ int, tcp_use_randomport, `0`,
233	"Randomize TCP port numbers");
234
235	SYSCTL_SKMEM_TCP_INT(OID_AUTO, win_scale_factor, CTLFLAG_RW \| CTLFLAG_LOCKED,
236	__private_extern__ int, tcp_win_scale, `3`, "Window scaling factor");
237
238	#if (DEVELOPMENT \|\| DEBUG)
239	SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
240	CTLFLAG_RW \| CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, `1`,
241	"Initalize RTT from route cache");
242	#else
243	SYSCTL_SKMEM_TCP_INT(OID_AUTO, init_rtt_from_cache,
244	CTLFLAG_RD \| CTLFLAG_LOCKED, static int, tcp_init_rtt_from_cache, `1`,
245	"Initalize RTT from route cache");
246	#endif /* (DEVELOPMENT \|\| DEBUG) */
247
248	static int tso_debug = `0`;
249	SYSCTL_INT(_net_inet_tcp, OID_AUTO, tso_debug, CTLFLAG_RW \| CTLFLAG_LOCKED,
250	&tso_debug, `0`, "TSO verbosity");
251
252	static int tcp_rxt_seg_max = `1024`;
253	SYSCTL_INT(_net_inet_tcp, OID_AUTO, rxt_seg_max, CTLFLAG_RW \| CTLFLAG_LOCKED,
254	&tcp_rxt_seg_max, `0`, "");
255
256	static unsigned long tcp_rxt_seg_drop = `0`;
257	SYSCTL_ULONG(_net_inet_tcp, OID_AUTO, rxt_seg_drop, CTLFLAG_RD \| CTLFLAG_LOCKED,
258	&tcp_rxt_seg_drop, "");
259
260	static void tcp_notify(struct inpcb , int*);
261
262	static KALLOC_TYPE_DEFINE(tcp_bwmeas_zone, struct bwmeas, NET_KT_DEFAULT);
263	KALLOC_TYPE_DEFINE(tcp_reass_zone, struct tseg_qent, NET_KT_DEFAULT);
264	KALLOC_TYPE_DEFINE(tcp_rxt_seg_zone, struct tcp_rxt_seg, NET_KT_DEFAULT);
265
266	extern int slowlink_wsize; / window correction for slow links /
267	extern int path_mtu_discovery;
268
269	uint32_t tcp_now_remainder_us = `0`; / remaining micro seconds for tcp_now /
270
271	static void tcp_sbrcv_grow_rwin(struct tcpcb tp, struct* sockbuf *sb);
272
273	#define TCP_BWMEAS_BURST_MINSIZE 6
274	#define TCP_BWMEAS_BURST_MAXSIZE 25
275
276	/*
277	* Target size of TCP PCB hash tables. Must be a power of two.
278	*
279	* Note that this can be overridden by the kernel environment
280	* variable net.inet.tcp.tcbhashsize
281	*/
282	#ifndef TCBHASHSIZE
283	#define TCBHASHSIZE CONFIG_TCBHASHSIZE
284	#endif
285
286	__private_extern__ int tcp_tcbhashsize = TCBHASHSIZE;
287	SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD \| CTLFLAG_LOCKED,
288	&tcp_tcbhashsize, `0`, "Size of TCP control-block hashtable");
289
290	/*
291	* This is the actual shape of what we allocate using the zone
292	* allocator. Doing it this way allows us to protect both structures
293	* using the same generation count, and also eliminates the overhead
294	* of allocating tcpcbs separately. By hiding the structure here,
295	* we avoid changing most of the rest of the code (although it needs
296	* to be changed, eventually, for greater efficiency).
297	*/
298	#define ALIGNMENT 32
299	struct inp_tp {
300	struct inpcb inp;
301	struct tcpcb tcb __attribute__((aligned(ALIGNMENT)));
302	};
303	#undef ALIGNMENT
304
305	static KALLOC_TYPE_DEFINE(tcpcbzone, struct inp_tp, NET_KT_DEFAULT);
306
307	int get_inpcb_str_size(void);
308	int get_tcp_str_size(void);
309
310	os_log_t tcp_mpkl_log_object = NULL;
311
312	static void tcpcb_to_otcpcb(struct tcpcb , struct* otcpcb *);
313
314	int tcp_notsent_lowat_check(struct socket *so);
315	static void tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
316	struct if_lim_perf_stat *stat);
317	static void tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
318	struct if_tcp_ecn_perf_stat *stat);
319
320	static aes_encrypt_ctx tfo_ctx; / Crypto-context for TFO /
321
322	void
323	tcp_tfo_gen_cookie(struct inpcb inp, u_char out, size_t blk_size)
324	{
325	u_char in[CCAES_BLOCK_SIZE];
326	int isipv6 = inp->inp_vflag & INP_IPV6;
327
328	VERIFY(blk_size == CCAES_BLOCK_SIZE);
329
330	bzero(s: &in[`0`], CCAES_BLOCK_SIZE);
331	bzero(s: &out[`0`], CCAES_BLOCK_SIZE);
332
333	if (isipv6) {
334	memcpy(dst: in, src: &inp->in6p_faddr, n: sizeof(struct in6_addr));
335	} else {
336	memcpy(dst: in, src: &inp->inp_faddr, n: sizeof(struct in_addr));
337	}
338
339	aes_encrypt_cbc(in_blk: in, NULL, num_blk: `1`, out_blk: out, cx: &tfo_ctx);
340	}
341
342	__private_extern__ int
343	tcp_sysctl_fastopenkey(__unused struct sysctl_oid oidp, __unused void* *arg1,
344	__unused int arg2, struct sysctl_req *req)
345	{
346	int error = `0`;
347	/*
348	* TFO-key is expressed as a string in hex format
349	* +1 to account for the \0 char
350	* +1 because sysctl_io_string() expects a string length but the sysctl command
351	* now includes the terminating \0 in newlen -- see rdar://77205344
352	*/
353	char keystring[TCP_FASTOPEN_KEYLEN * `2` + `2`];
354	u_int32_t key[TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)];
355	int i;
356
357	/*
358	* sysctl_io_string copies keystring into the oldptr of the sysctl_req.
359	* Make sure everything is zero, to avoid putting garbage in there or
360	* leaking the stack.
361	*/
362	bzero(s: keystring, n: sizeof(keystring));
363
364	error = sysctl_io_string(req, pValue: keystring, valueSize: sizeof(keystring), trunc: `0`, NULL);
365	if (error) {
366	os_log(OS_LOG_DEFAULT,
367	"%s: sysctl_io_string() error %d, req->newlen %lu, sizeof(keystring) %lu",
368	__func__, error, req->newlen, sizeof(keystring));
369	goto exit;
370	}
371	if (req->newptr == USER_ADDR_NULL) {
372	goto exit;
373	}
374
375	if (strlen(s: keystring) != TCP_FASTOPEN_KEYLEN * `2`) {
376	os_log(OS_LOG_DEFAULT,
377	"%s: strlen(keystring) %lu != TCP_FASTOPEN_KEYLEN * 2 %u, newlen %lu",
378	__func__, strlen(keystring), TCP_FASTOPEN_KEYLEN * `2`, req->newlen);
379	error = EINVAL;
380	goto exit;
381	}
382
383	for (i = `0`; i < (TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)); i++) {
384	/*
385	* We jump over the keystring in 8-character (4 byte in hex)
386	* steps
387	*/
388	if (sscanf(&keystring[i * `8`], "%8x", &key[i]) != `1`) {
389	error = EINVAL;
390	os_log(OS_LOG_DEFAULT,
391	"%s: sscanf() != 1, error EINVAL", __func__);
392	goto exit;
393	}
394	}
395
396	aes_encrypt_key128(key: (u_char *)key, cx: &tfo_ctx);
397
398	exit:
399	return error;
400	}
401
402	int
403	get_inpcb_str_size(void)
404	{
405	return sizeof(struct inpcb);
406	}
407
408	int
409	get_tcp_str_size(void)
410	{
411	return sizeof(struct tcpcb);
412	}
413
static int scale_to_powerof2(int size);

/*
 * This helper routine returns one of the following scaled value of size:
 * 1. Rounded down power of two value of size if the size value passed as
 *    argument is not a power of two and the rounded up value overflows
 *    a (positive) int.
 * OR
 * 2. Rounded up power of two value of size if the size value passed as
 *    argument is not a power of two and the rounded up value does not overflow
 * OR
 * 3. Same value as argument size if it is already a power of two.
 *
 * A size of 0 (or, defensively, a negative size) yields 1.
 *
 * All bit manipulation is done on unsigned values: left-shifting a signed
 * int into its sign bit is undefined behaviour and previously let an
 * "overflowed" result come back negative instead of being rounded down.
 */
static int
scale_to_powerof2(int size)
{
	unsigned int value, top, doubled;

	/* Handle special case of size = 0 (and reject negative input) */
	if (size <= 0) {
		return 1;
	}

	value = (unsigned int)size;
	if ((value & (value - 1)) == 0) {
		/* Already a power of two */
		return size;
	}

	/*
	 * Clear out least significant set bit till we are left with
	 * the highest set bit, i.e. the rounded down power of two.
	 */
	top = value;
	while ((top & (top - 1)) != 0) {
		top &= top - 1;
	}

	/* Check for overflow of a positive int when rounding up */
	doubled = top << 1;
	if (doubled > (~0u >> 1)) {
		return (int)top;
	}
	return (int)doubled;
}
453
/*
 * Round the floating point to the next integer
 * Eg. 1.3 will round up to 2.
 *
 * The result saturates to the uint32_t range: values that are negative
 * (or NaN) return 0 and values at or above UINT32_MAX return UINT32_MAX.
 * Converting an out-of-range double to an integer is undefined behaviour,
 * so those inputs are filtered before any cast.
 */
uint32_t
tcp_ceil(double a)
{
	double truncated;

	/* Written as !(a > 0) so that NaN also takes this branch. */
	if (!(a > 0.0)) {
		return 0;
	}
	if (a >= (double)UINT32_MAX) {
		return UINT32_MAX;
	}

	truncated = (double)(uint32_t)a;
	/* Add one when truncation dropped a fractional part. */
	return (uint32_t)(truncated + (truncated < a));
}
464
/*
 * Round val to the nearest multiple of round.
 *
 * Round up or down based on the middle. Meaning, if we round upon a
 * multiple of 10, 16 will round to 20 and 14 will round to 10.
 *
 * A round of 0 returns val unchanged instead of dividing by zero.  The
 * computation works on the remainder so that large values that round
 * down are not corrupted by a wrapping "val + round / 2".  A result that
 * rounds up past UINT32_MAX still wraps.
 */
uint32_t
tcp_round_to(uint32_t val, uint32_t round)
{
	uint32_t rem, floor_val;

	if (round == 0) {
		return val;
	}

	rem = val % round;
	floor_val = val - rem;

	/* Equivalent to (rem + round / 2) >= round, without the overflow. */
	if (rem >= round - (round / 2)) {
		return floor_val + round;
	}
	return floor_val;
}
474
/*
 * Round up to the next multiple of base.
 * Eg. for a base of 64, 65 will become 128,
 * 2896 will become 2944.
 *
 * Values that are already a multiple of base are returned unchanged.
 * A base of 0 returns val unchanged instead of dividing by zero.  The
 * result is built from the remainder so that "val + base" cannot wrap
 * before the division; a result larger than UINT32_MAX still wraps.
 */
uint32_t
tcp_round_up(uint32_t val, uint32_t base)
{
	uint32_t rem;

	if (base == 0 || base == 1) {
		return val;
	}

	rem = val % base;
	if (rem == 0) {
		return val;
	}

	return (val - rem) + base;
}
489
490	static void
491	tcp_tfo_init(void)
492	{
493	u_char key[TCP_FASTOPEN_KEYLEN];
494
495	read_frandom(buffer: key, numBytes: sizeof(key));
496	aes_encrypt_key128(key, cx: &tfo_ctx);
497	}
498
499	/*
500	* Tcp initialization
501	*/
502	void
503	tcp_init(struct protosw pp, struct* domain *dp)
504	{
505	#pragma unused(dp)
506	static int tcp_initialized = `0`;
507	struct inpcbinfo *pcbinfo;
508
509	VERIFY((pp->pr_flags & (PR_INITIALIZED \| PR_ATTACHED)) == PR_ATTACHED);
510
511	if (tcp_initialized) {
512	return;
513	}
514	tcp_initialized = `1`;
515
516	#if DEBUG \|\| DEVELOPMENT
517	(void) PE_parse_boot_argn("tcp_rxt_seg_max", &tcp_rxt_seg_max,
518	sizeof(tcp_rxt_seg_max));
519	#endif /* DEBUG \|\| DEVELOPMENT */
520
521	tcp_ccgen = `1`;
522	tcp_keepinit = TCPTV_KEEP_INIT;
523	tcp_keepidle = TCPTV_KEEP_IDLE;
524	tcp_keepintvl = TCPTV_KEEPINTVL;
525	tcp_keepcnt = TCPTV_KEEPCNT;
526	tcp_maxpersistidle = TCPTV_KEEP_IDLE;
527	tcp_msl = TCPTV_MSL;
528
529	microuptime(tv: &tcp_uptime);
530	read_frandom(buffer: &tcp_now, numBytes: sizeof(tcp_now));
531
532	/ Starts tcp internal clock at a random value /
533	tcp_now = tcp_now & `0x3fffffff`;
534
535	/ expose initial uptime/now via systcl for utcp to keep time sync /
536	tcp_now_init = tcp_now;
537	tcp_microuptime_init =
538	(uint32_t)(tcp_uptime.tv_usec + (tcp_uptime.tv_sec * USEC_PER_SEC));
539	SYSCTL_SKMEM_UPDATE_FIELD(tcp.microuptime_init, tcp_microuptime_init);
540	SYSCTL_SKMEM_UPDATE_FIELD(tcp.now_init, tcp_now_init);
541
542	tcp_tfo_init();
543
544	LIST_INIT(&tcb);
545	tcbinfo.ipi_listhead = &tcb;
546
547	pcbinfo = &tcbinfo;
548
549	/*
550	* allocate group, lock attributes and lock for tcp pcb mutexes
551	*/
552	pcbinfo->ipi_lock_grp = lck_grp_alloc_init(grp_name: "tcppcb",
553	LCK_GRP_ATTR_NULL);
554	lck_attr_setdefault(attr: &pcbinfo->ipi_lock_attr);
555	lck_rw_init(lck: &pcbinfo->ipi_lock, grp: pcbinfo->ipi_lock_grp,
556	attr: &pcbinfo->ipi_lock_attr);
557
558	if (tcp_tcbhashsize == `0`) {
559	/ Set to default /
560	tcp_tcbhashsize = `512`;
561	}
562
563	if (!powerof2(tcp_tcbhashsize)) {
564	int old_hash_size = tcp_tcbhashsize;
565	tcp_tcbhashsize = scale_to_powerof2(size: tcp_tcbhashsize);
566	/ Lower limit of 16 /
567	if (tcp_tcbhashsize < `16`) {
568	tcp_tcbhashsize = `16`;
569	}
570	printf("WARNING: TCB hash size not a power of 2, "
571	"scaled from %d to %d.\n",
572	old_hash_size,
573	tcp_tcbhashsize);
574	}
575
576	tcbinfo.ipi_hashbase = hashinit(count: tcp_tcbhashsize, M_PCB,
577	hashmask: &tcbinfo.ipi_hashmask);
578	tcbinfo.ipi_porthashbase = hashinit(count: tcp_tcbhashsize, M_PCB,
579	hashmask: &tcbinfo.ipi_porthashmask);
580	tcbinfo.ipi_zone = tcpcbzone;
581
582	tcbinfo.ipi_gc = tcp_gc;
583	tcbinfo.ipi_timer = tcp_itimer;
584	in_pcbinfo_attach(&tcbinfo);
585
586	#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
587	if (max_protohdr < TCP_MINPROTOHDR) {
588	max_protohdr = (int)P2ROUNDUP(TCP_MINPROTOHDR, sizeof(uint32_t));
589	}
590	if (max_linkhdr + max_protohdr > MCLBYTES) {
591	panic("tcp_init");
592	}
593	#undef TCP_MINPROTOHDR
594
595	/ Initialize time wait and timer lists /
596	TAILQ_INIT(&tcp_tw_tailq);
597
598	bzero(s: &tcp_timer_list, n: sizeof(tcp_timer_list));
599	LIST_INIT(&tcp_timer_list.lhead);
600	/*
601	* allocate group and attribute for the tcp timer list
602	*/
603	tcp_timer_list.mtx_grp = lck_grp_alloc_init(grp_name: "tcptimerlist",
604	LCK_GRP_ATTR_NULL);
605	lck_mtx_init(lck: &tcp_timer_list.mtx, grp: tcp_timer_list.mtx_grp,
606	LCK_ATTR_NULL);
607
608	tcp_timer_list.call = thread_call_allocate(func: tcp_run_timerlist, NULL);
609	if (tcp_timer_list.call == NULL) {
610	panic("failed to allocate call entry 1 in tcp_init");
611	}
612
613	/ Initialize TCP Cache /
614	tcp_cache_init();
615
616	tcp_mpkl_log_object = MPKL_CREATE_LOGOBJECT("com.apple.xnu.tcp");
617	if (tcp_mpkl_log_object == NULL) {
618	panic("MPKL_CREATE_LOGOBJECT failed");
619	}
620
621	if (PE_parse_boot_argn(arg_string: "tcp_log", arg_ptr: &tcp_log_enable_flags, max_arg: sizeof(tcp_log_enable_flags))) {
622	os_log(OS_LOG_DEFAULT, "tcp_init: set tcp_log_enable_flags to 0x%x", tcp_log_enable_flags);
623	}
624
625	/*
626	* If more than 4GB of actual memory is available, increase the
627	* maximum allowed receive and send socket buffer size.
628	*/
629	if (mem_actual >= (`1ULL` << (GBSHIFT + `2`))) {
630	tcp_autorcvbuf_max = `4` * `1024` * `1024`;
631	tcp_autosndbuf_max = `4` * `1024` * `1024`;
632
633	SYSCTL_SKMEM_UPDATE_FIELD(tcp.autorcvbufmax, tcp_autorcvbuf_max);
634	SYSCTL_SKMEM_UPDATE_FIELD(tcp.autosndbufmax, tcp_autosndbuf_max);
635	}
636
637	/ Initialize the TCP CCA array /
638	tcp_cc_init();
639	}
640
641	/*
642	* Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
643	* tcp_template used to store this data in mbufs, but we now recopy it out
644	* of the tcpcb each time to conserve mbufs.
645	*/
646	void
647	tcp_fillheaders(struct mbuf m, struct* tcpcb tp, void* ip_ptr, void* *tcp_ptr)
648	{
649	struct inpcb *inp = tp->t_inpcb;
650	struct tcphdr tcp_hdr = (struct* tcphdr *)tcp_ptr;
651
652	if ((inp->inp_vflag & INP_IPV6) != `0`) {
653	struct ip6_hdr *ip6;
654
655	ip6 = (struct ip6_hdr *)ip_ptr;
656	ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) \|
657	(inp->inp_flow & IPV6_FLOWINFO_MASK);
658	ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) \|
659	(IPV6_VERSION & IPV6_VERSION_MASK);
660	ip6->ip6_plen = htons(sizeof(struct tcphdr));
661	ip6->ip6_nxt = IPPROTO_TCP;
662	ip6->ip6_hlim = `0`;
663	ip6->ip6_src = inp->in6p_laddr;
664	ip6->ip6_dst = inp->in6p_faddr;
665	if (m->m_flags & M_PKTHDR) {
666	uint32_t lifscope = inp->inp_lifscope != `0` ? inp->inp_lifscope : inp->inp_fifscope;
667	uint32_t fifscope = inp->inp_fifscope != `0` ? inp->inp_fifscope : inp->inp_lifscope;
668	ip6_output_setsrcifscope(m, lifscope, NULL);
669	ip6_output_setdstifscope(m, fifscope, NULL);
670	}
671	tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr,
672	htonl(sizeof(struct tcphdr) + IPPROTO_TCP));
673	} else {
674	struct ip ip = (struct* ip *) ip_ptr;
675
676	ip->ip_vhl = IP_VHL_BORING;
677	ip->ip_tos = `0`;
678	ip->ip_len = `0`;
679	ip->ip_id = `0`;
680	ip->ip_off = `0`;
681	ip->ip_ttl = `0`;
682	ip->ip_sum = `0`;
683	ip->ip_p = IPPROTO_TCP;
684	ip->ip_src = inp->inp_laddr;
685	ip->ip_dst = inp->inp_faddr;
686	tcp_hdr->th_sum =
687	in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
688	htons(sizeof(struct tcphdr) + IPPROTO_TCP));
689	}
690
691	tcp_hdr->th_sport = inp->inp_lport;
692	tcp_hdr->th_dport = inp->inp_fport;
693	tcp_hdr->th_seq = `0`;
694	tcp_hdr->th_ack = `0`;
695	tcp_hdr->th_x2 = `0`;
696	tcp_hdr->th_off = `5`;
697	tcp_hdr->th_flags = `0`;
698	tcp_hdr->th_win = `0`;
699	tcp_hdr->th_urp = `0`;
700	}
701
702	/*
703	* Create template to be used to send tcp packets on a connection.
704	* Allocates an mbuf and fills in a skeletal tcp/ip header. The only
705	* use for this function is in keepalives, which use tcp_respond.
706	*/
707	struct tcptemp *
708	tcp_maketemplate(struct tcpcb tp, struct* mbuf **mp)
709	{
710	struct mbuf *m;
711	struct tcptemp *n;
712
713	*mp = m = m_get(M_DONTWAIT, MT_HEADER);
714	if (m == NULL) {
715	return NULL;
716	}
717	m->m_len = sizeof(struct tcptemp);
718	n = mtod(m, struct tcptemp *);
719
720	tcp_fillheaders(m, tp, ip_ptr: (void )&n->tt_ipgen, tcp_ptr: (void* *)&n->tt_t);
721	return n;
722	}
723
724	/*
725	* Send a single message to the TCP at address specified by
726	* the given TCP/IP header. If m == 0, then we make a copy
727	* of the tcpiphdr at ti and send directly to the addressed host.
728	* This is used to force keep alive messages out using the TCP
729	* template for a connection. If flags are given then we send
730	* a message back to the TCP which originated the * segment ti,
731	* and discard the mbuf containing it and any other attached mbufs.
732	*
733	* In any case the ack and sequence number of the transmitted
734	* segment are as specified by the parameters.
735	*
736	* NOTE: If m != NULL, then ti must point to inside the mbuf.
737	*/
738	void
739	tcp_respond(struct tcpcb tp, void* ipgen, struct* tcphdr th, struct* mbuf *m,
740	tcp_seq ack, tcp_seq seq, uint8_t flags, struct tcp_respond_args *tra)
741	{
742	uint16_t tlen;
743	int win = `0`;
744	struct route *ro = `0`;
745	struct route sro;
746	struct ip *ip;
747	struct tcphdr *nth;
748	struct route_in6 *ro6 = `0`;
749	struct route_in6 sro6;
750	struct ip6_hdr *ip6;
751	int isipv6;
752	struct ifnet *outif;
753	int sotc = SO_TC_UNSPEC;
754	bool check_qos_marking_again = FALSE;
755	uint32_t sifscope = IFSCOPE_NONE, fifscope = IFSCOPE_NONE;
756
757	isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == `6`;
758	ip6 = ipgen;
759	ip = ipgen;
760
761	if (tp) {
762	check_qos_marking_again = tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE ? FALSE : TRUE;
763	sifscope = tp->t_inpcb->inp_lifscope;
764	fifscope = tp->t_inpcb->inp_fifscope;
765	if (!(flags & TH_RST)) {
766	win = tcp_sbspace(tp);
767	if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale) {
768	win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
769	}
770	}
771	if (isipv6) {
772	ro6 = &tp->t_inpcb->in6p_route;
773	} else {
774	ro = &tp->t_inpcb->inp_route;
775	}
776	} else {
777	if (isipv6) {
778	ro6 = &sro6;
779	bzero(s: ro6, n: sizeof(*ro6));
780	} else {
781	ro = &sro;
782	bzero(s: ro, n: sizeof(*ro));
783	}
784	}
785	if (m == `0`) {
786	m = m_gethdr(M_DONTWAIT, MT_HEADER); / MAC-OK /
787	if (m == NULL) {
788	return;
789	}
790	tlen = `0`;
791	m->m_data += max_linkhdr;
792	if (isipv6) {
793	VERIFY((MHLEN - max_linkhdr) >=
794	(sizeof(ip6) + sizeof(nth)));
795	bcopy(src: (caddr_t)ip6, mtod(m, caddr_t),
796	n: sizeof(struct ip6_hdr));
797	ip6 = mtod(m, struct ip6_hdr *);
798	nth = (struct tcphdr )(void* *)(ip6 + `1`);
799	} else {
800	VERIFY((MHLEN - max_linkhdr) >=
801	(sizeof(ip) + sizeof(nth)));
802	bcopy(src: (caddr_t)ip, mtod(m, caddr_t), n: sizeof(struct ip));
803	ip = mtod(m, struct ip *);
804	nth = (struct tcphdr )(void* *)(ip + `1`);
805	}
806	bcopy(src: (caddr_t)th, dst: (caddr_t)nth, n: sizeof(struct tcphdr));
807	#if MPTCP
808	if ((tp) && (tp->t_mpflags & TMPF_RESET)) {
809	flags = (TH_RST \| TH_ACK);
810	} else
811	#endif
812	flags = TH_ACK;
813	} else {
814	m_freem(m->m_next);
815	m->m_next = `0`;
816	m->m_data = (uintptr_t)ipgen;
817	/ m_len is set later /
818	tlen = `0`;
819	#define xchg(a, b, type) { type t; t = a; a = b; b = t; }
820	if (isipv6) {
821	ip6_getsrcifaddr_info(m, &sifscope, NULL);
822	ip6_getdstifaddr_info(m, &fifscope, NULL);
823	if (!in6_embedded_scope) {
824	m->m_pkthdr.pkt_flags &= ~PKTF_IFAINFO;
825	}
826	/ Expect 32-bit aligned IP on strict-align platforms /
827	IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6);
828	xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
829	nth = (struct tcphdr )(void* *)(ip6 + `1`);
830	} else {
831	/ Expect 32-bit aligned IP on strict-align platforms /
832	IP_HDR_STRICT_ALIGNMENT_CHECK(ip);
833	xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
834	nth = (struct tcphdr )(void* *)(ip + `1`);
835	}
836	if (th != nth) {
837	/*
838	* this is usually a case when an extension header
839	* exists between the IPv6 header and the
840	* TCP header.
841	*/
842	nth->th_sport = th->th_sport;
843	nth->th_dport = th->th_dport;
844	}
845	xchg(nth->th_dport, nth->th_sport, n_short);
846	#undef xchg
847	}
848	if (isipv6) {
849	ip6->ip6_plen = htons((u_short)(sizeof(struct tcphdr) +
850	tlen));
851	tlen += sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
852	ip6_output_setsrcifscope(m, sifscope, NULL);
853	ip6_output_setdstifscope(m, fifscope, NULL);
854	} else {
855	tlen += sizeof(struct tcpiphdr);
856	ip->ip_len = tlen;
857	ip->ip_ttl = (uint8_t)ip_defttl;
858	}
859	m->m_len = tlen;
860	m->m_pkthdr.len = tlen;
861	m->m_pkthdr.rcvif = `0`;
862	if (tra->keep_alive) {
863	m->m_pkthdr.pkt_flags \|= PKTF_KEEPALIVE;
864	}
865
866	nth->th_seq = htonl(seq);
867	nth->th_ack = htonl(ack);
868	nth->th_x2 = `0`;
869	nth->th_off = sizeof(struct tcphdr) >> `2`;
870	nth->th_flags = flags;
871	if (tp) {
872	nth->th_win = htons((u_short) (win >> tp->rcv_scale));
873	} else {
874	nth->th_win = htons((u_short)win);
875	}
876	nth->th_urp = `0`;
877	if (isipv6) {
878	nth->th_sum = `0`;
879	nth->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst,
880	htonl((tlen - sizeof(struct ip6_hdr)) + IPPROTO_TCP));
881	m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
882	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
883	ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
884	ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
885	} else {
886	nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
887	htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
888	m->m_pkthdr.csum_flags = CSUM_TCP;
889	m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
890	}
891	#if TCPDEBUG
892	if (tp == NULL \|\| (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
893	tcp_trace(TA_OUTPUT, `0`, tp, mtod(m, void *), th, `0`);
894	}
895	#endif
896
897	#if NECP
898	necp_mark_packet_from_socket(packet: m, inp: tp ? tp->t_inpcb : NULL, policy_id: `0`, route_rule_id: `0`, skip_policy_id: `0`, pass_flags: `0`);
899	#endif /* NECP */
900
901	#if IPSEC
902	if (tp != NULL && tp->t_inpcb->inp_sp != NULL &&
903	ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != `0`) {
904	m_freem(m);
905	return;
906	}
907	#endif
908
909	if (tp != NULL) {
910	u_int32_t svc_flags = `0`;
911	if (isipv6) {
912	svc_flags \|= PKT_SCF_IPV6;
913	}
914	sotc = tp->t_inpcb->inp_socket->so_traffic_class;
915	if ((flags & TH_RST) == `0`) {
916	set_packet_service_class(m, tp->t_inpcb->inp_socket,
917	sotc, svc_flags);
918	} else {
919	m_set_service_class(m, MBUF_SC_BK_SYS);
920	}
921
922	/ Embed flowhash and flow control flags /
923	m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
924	m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash;
925	m->m_pkthdr.pkt_flags \|= (PKTF_FLOW_ID \| PKTF_FLOW_LOCALSRC \| PKTF_FLOW_ADV);
926	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
927	m->m_pkthdr.tx_tcp_pid = tp->t_inpcb->inp_socket->last_pid;
928	m->m_pkthdr.tx_tcp_e_pid = tp->t_inpcb->inp_socket->e_pid;
929
930	if (flags & TH_RST) {
931	m->m_pkthdr.comp_gencnt = tp->t_comp_gencnt;
932	}
933	} else {
934	if (flags & TH_RST) {
935	m->m_pkthdr.comp_gencnt = TCP_ACK_COMPRESSION_DUMMY;
936	m_set_service_class(m, MBUF_SC_BK_SYS);
937	}
938	}
939
940	if (isipv6) {
941	struct ip6_out_args ip6oa;
942	bzero(s: &ip6oa, n: sizeof(ip6oa));
943	ip6oa.ip6oa_boundif = tra->ifscope;
944	ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF \| IP6OAF_BOUND_SRCADDR;
945	ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
946	ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
947
948	if (tra->ifscope != IFSCOPE_NONE) {
949	ip6oa.ip6oa_flags \|= IP6OAF_BOUND_IF;
950	}
951	if (tra->nocell) {
952	ip6oa.ip6oa_flags \|= IP6OAF_NO_CELLULAR;
953	}
954	if (tra->noexpensive) {
955	ip6oa.ip6oa_flags \|= IP6OAF_NO_EXPENSIVE;
956	}
957	if (tra->noconstrained) {
958	ip6oa.ip6oa_flags \|= IP6OAF_NO_CONSTRAINED;
959	}
960	if (tra->awdl_unrestricted) {
961	ip6oa.ip6oa_flags \|= IP6OAF_AWDL_UNRESTRICTED;
962	}
963	if (tra->intcoproc_allowed) {
964	ip6oa.ip6oa_flags \|= IP6OAF_INTCOPROC_ALLOWED;
965	}
966	if (tra->management_allowed) {
967	ip6oa.ip6oa_flags \|= IP6OAF_MANAGEMENT_ALLOWED;
968	}
969	ip6oa.ip6oa_sotc = sotc;
970	if (tp != NULL) {
971	if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
972	ip6oa.ip6oa_flags \|= IP6OAF_QOSMARKING_ALLOWED;
973	}
974	ip6oa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
975	if (check_qos_marking_again) {
976	ip6oa.ip6oa_flags \|= IP6OAF_REDO_QOSMARKING_POLICY;
977	}
978	ip6oa.ip6oa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
979	}
980	(void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
981	NULL, &ip6oa);
982
983	if (check_qos_marking_again) {
984	struct inpcb *inp = tp->t_inpcb;
985	inp->inp_policyresult.results.qos_marking_gencount = ip6oa.qos_marking_gencount;
986	if (ip6oa.ip6oa_flags & IP6OAF_QOSMARKING_ALLOWED) {
987	inp->inp_socket->so_flags1 \|= SOF1_QOSMARKING_ALLOWED;
988	} else {
989	inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
990	}
991	}
992
993	if (tp != NULL && ro6 != NULL && ro6->ro_rt != NULL &&
994	(outif = ro6->ro_rt->rt_ifp) !=
995	tp->t_inpcb->in6p_last_outifp) {
996	tp->t_inpcb->in6p_last_outifp = outif;
997	#if SKYWALK
998	if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
999	netns_set_ifnet(token: &tp->t_inpcb->inp_netns_token,
1000	ifp: tp->t_inpcb->in6p_last_outifp);
1001	}
1002	#endif /* SKYWALK */
1003	}
1004
1005	if (ro6 == &sro6) {
1006	ROUTE_RELEASE(ro6);
1007	}
1008	} else {
1009	struct ip_out_args ipoa;
1010	bzero(s: &ipoa, n: sizeof(ipoa));
1011	ipoa.ipoa_boundif = tra->ifscope;
1012	ipoa.ipoa_flags = IPOAF_SELECT_SRCIF \| IPOAF_BOUND_SRCADDR;
1013	ipoa.ipoa_sotc = SO_TC_UNSPEC;
1014	ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
1015
1016	if (tra->ifscope != IFSCOPE_NONE) {
1017	ipoa.ipoa_flags \|= IPOAF_BOUND_IF;
1018	}
1019	if (tra->nocell) {
1020	ipoa.ipoa_flags \|= IPOAF_NO_CELLULAR;
1021	}
1022	if (tra->noexpensive) {
1023	ipoa.ipoa_flags \|= IPOAF_NO_EXPENSIVE;
1024	}
1025	if (tra->noconstrained) {
1026	ipoa.ipoa_flags \|= IPOAF_NO_CONSTRAINED;
1027	}
1028	if (tra->awdl_unrestricted) {
1029	ipoa.ipoa_flags \|= IPOAF_AWDL_UNRESTRICTED;
1030	}
1031	if (tra->management_allowed) {
1032	ipoa.ipoa_flags \|= IPOAF_MANAGEMENT_ALLOWED;
1033	}
1034	ipoa.ipoa_sotc = sotc;
1035	if (tp != NULL) {
1036	if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED)) {
1037	ipoa.ipoa_flags \|= IPOAF_QOSMARKING_ALLOWED;
1038	}
1039	if (!(tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_POLICY_OVERRIDE)) {
1040	ipoa.ipoa_flags \|= IPOAF_REDO_QOSMARKING_POLICY;
1041	}
1042	ipoa.qos_marking_gencount = tp->t_inpcb->inp_policyresult.results.qos_marking_gencount;
1043	ipoa.ipoa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
1044	}
1045	if (ro != &sro) {
1046	/ Copy the cached route and take an extra reference /
1047	inp_route_copyout(tp->t_inpcb, &sro);
1048	}
1049	/*
1050	* For consistency, pass a local route copy.
1051	*/
1052	(void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);
1053
1054	if (check_qos_marking_again) {
1055	struct inpcb *inp = tp->t_inpcb;
1056	inp->inp_policyresult.results.qos_marking_gencount = ipoa.qos_marking_gencount;
1057	if (ipoa.ipoa_flags & IPOAF_QOSMARKING_ALLOWED) {
1058	inp->inp_socket->so_flags1 \|= SOF1_QOSMARKING_ALLOWED;
1059	} else {
1060	inp->inp_socket->so_flags1 &= ~SOF1_QOSMARKING_ALLOWED;
1061	}
1062	}
1063	if (tp != NULL && sro.ro_rt != NULL &&
1064	(outif = sro.ro_rt->rt_ifp) !=
1065	tp->t_inpcb->inp_last_outifp) {
1066	tp->t_inpcb->inp_last_outifp = outif;
1067	#if SKYWALK
1068	if (NETNS_TOKEN_VALID(&tp->t_inpcb->inp_netns_token)) {
1069	netns_set_ifnet(token: &tp->t_inpcb->inp_netns_token, ifp: outif);
1070	}
1071	#endif /* SKYWALK */
1072	}
1073	if (ro != &sro) {
1074	/ Synchronize cached PCB route /
1075	inp_route_copyin(tp->t_inpcb, &sro);
1076	} else {
1077	ROUTE_RELEASE(&sro);
1078	}
1079	}
1080	}
1081
1082	/*
1083	* Create a new TCP control block, making an
1084	* empty reassembly queue and hooking it to the argument
1085	* protocol control block. The `inp' parameter must have
1086	* come from the zone allocator set up in tcp_init().
1087	*/
1088	struct tcpcb *
1089	tcp_newtcpcb(struct inpcb *inp)
1090	{
1091	struct inp_tp *it;
1092	struct tcpcb *tp;
1093	struct socket *so = inp->inp_socket;
1094	int isipv6 = (inp->inp_vflag & INP_IPV6) != `0`;
1095	uint32_t random_32;
1096
1097	calculate_tcp_clock();
1098
1099	if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == `0`) {
1100	it = (struct inp_tp )(void* *)inp;
1101	tp = &it->tcb;
1102	} else {
1103	tp = (struct tcpcb )(void* *)inp->inp_saved_ppcb;
1104	}
1105
1106	bzero(s: (char ) tp, n: sizeof(struct* tcpcb));
1107	LIST_INIT(&tp->t_segq);
1108	tp->t_maxseg = tp->t_maxopd = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
1109
1110	tp->t_flags = TF_REQ_SCALE \| (tcp_do_timestamps ? TF_REQ_TSTMP : `0`);
1111	tp->t_flagsext \|= TF_SACK_ENABLE;
1112
1113	TAILQ_INIT(&tp->snd_holes);
1114	SLIST_INIT(&tp->t_rxt_segments);
1115	SLIST_INIT(&tp->t_notify_ack);
1116	tp->t_inpcb = inp;
1117	/*
1118	* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
1119	* rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
1120	* reasonable initial retransmit time.
1121	*/
1122	tp->t_srtt = TCPTV_SRTTBASE;
1123	tp->t_rttvar =
1124	((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / `4`;
1125	tp->t_rttmin = tcp_TCPTV_MIN;
1126	tp->t_rxtcur = TCPTV_RTOBASE;
1127
1128	if (tcp_use_newreno) {
1129	/ use newreno by default /
1130	tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
1131	#if (DEVELOPMENT \|\| DEBUG)
1132	} else if (tcp_use_ledbat) {
1133	/ use ledbat for testing /
1134	tp->tcp_cc_index = TCP_CC_ALGO_BACKGROUND_INDEX;
1135	#endif
1136	} else {
1137	tp->tcp_cc_index = TCP_CC_ALGO_CUBIC_INDEX;
1138	}
1139
1140	tcp_cc_allocate_state(tp);
1141
1142	if (CC_ALGO(tp)->init != NULL) {
1143	CC_ALGO(tp)->init(tp);
1144	}
1145
1146	/ Initialize rledbat if we are using recv_bg /
1147	if (tcp_rledbat == `1` && TCP_RECV_BG(inp->inp_socket) &&
1148	tcp_cc_rledbat.init != NULL) {
1149	tcp_cc_rledbat.init(tp);
1150	}
1151
1152	tp->snd_cwnd = tcp_initial_cwnd(tp);
1153	tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1154	tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1155	tp->t_rcvtime = tcp_now;
1156	tp->tentry.timer_start = tcp_now;
1157	tp->rcv_unackwin = tcp_now;
1158	tp->t_persist_timeout = tcp_max_persist_timeout;
1159	tp->t_persist_stop = `0`;
1160	tp->t_flagsext \|= TF_RCVUNACK_WAITSS;
1161	tp->t_rexmtthresh = (uint8_t)tcprexmtthresh;
1162	tp->rfbuf_ts = tcp_now;
1163	tp->rfbuf_space = tcp_initial_cwnd(tp);
1164	tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
1165
1166	/ Enable bandwidth measurement on this connection /
1167	tp->t_flagsext \|= TF_MEASURESNDBW;
1168	if (tp->t_bwmeas == NULL) {
1169	tp->t_bwmeas = tcp_bwmeas_alloc(tp);
1170	if (tp->t_bwmeas == NULL) {
1171	tp->t_flagsext &= ~TF_MEASURESNDBW;
1172	}
1173	}
1174
1175	/ Clear time wait tailq entry /
1176	tp->t_twentry.tqe_next = NULL;
1177	tp->t_twentry.tqe_prev = NULL;
1178
1179	read_frandom(buffer: &random_32, numBytes: sizeof(random_32));
1180	tp->t_comp_gencnt = random_32;
1181	if (tp->t_comp_gencnt <= TCP_ACK_COMPRESSION_DUMMY) {
1182	tp->t_comp_gencnt = TCP_ACK_COMPRESSION_DUMMY + `1`;
1183	}
1184	tp->t_comp_lastinc = tcp_now;
1185
1186	if (__probable(tcp_randomize_timestamps)) {
1187	tp->t_ts_offset = random_32;
1188	}
1189
1190	/ Initialize Accurate ECN state /
1191	tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_feature_disabled;
1192	tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_feature_disabled;
1193
1194	/*
1195	* IPv4 TTL initialization is necessary for an IPv6 socket as well,
1196	* because the socket may be bound to an IPv6 wildcard address,
1197	* which may match an IPv4-mapped IPv6 address.
1198	*/
1199	inp->inp_ip_ttl = (uint8_t)ip_defttl;
1200	inp->inp_ppcb = (caddr_t)tp;
1201	return tp; / XXX /
1202	}
1203
1204	/*
1205	* Drop a TCP connection, reporting
1206	* the specified error. If connection is synchronized,
1207	* then send a RST to peer.
1208	*/
1209	struct tcpcb *
1210	tcp_drop(struct tcpcb tp, int* errno)
1211	{
1212	struct socket *so = tp->t_inpcb->inp_socket;
1213	#if CONFIG_DTRACE
1214	struct inpcb *inp = tp->t_inpcb;
1215	#endif
1216
1217	if (TCPS_HAVERCVDSYN(tp->t_state)) {
1218	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1219	struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1220	TCP_LOG_STATE(tp, TCPS_CLOSED);
1221	tp->t_state = TCPS_CLOSED;
1222	(void) tcp_output(tp);
1223	tcpstat.tcps_drops++;
1224	} else {
1225	tcpstat.tcps_conndrops++;
1226	}
1227	if (errno == ETIMEDOUT && tp->t_softerror) {
1228	errno = tp->t_softerror;
1229	}
1230	so->so_error = (u_short)errno;
1231
1232	TCP_LOG_CONNECTION_SUMMARY(tp);
1233
1234	return tcp_close(tp);
1235	}
1236
1237	void
1238	tcp_getrt_rtt(struct tcpcb tp, struct* rtentry *rt)
1239	{
1240	u_int32_t rtt = rt->rt_rmx.rmx_rtt;
1241	int isnetlocal = (tp->t_flags & TF_LOCAL);
1242
1243	TCP_LOG_RTM_RTT(tp, rt);
1244
1245	if (rtt != `0` && tcp_init_rtt_from_cache != `0`) {
1246	/*
1247	* XXX the lock bit for RTT indicates that the value
1248	* is also a minimum value; this is subject to time.
1249	*/
1250	if (rt->rt_rmx.rmx_locks & RTV_RTT) {
1251	tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
1252	} else {
1253	tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN :
1254	TCPTV_REXMTMIN;
1255	}
1256
1257	tp->t_srtt =
1258	rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1259	tcpstat.tcps_usedrtt++;
1260
1261	if (rt->rt_rmx.rmx_rttvar) {
1262	tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
1263	(RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1264	tcpstat.tcps_usedrttvar++;
1265	} else {
1266	/ default variation is +- 1 rtt /
1267	tp->t_rttvar =
1268	tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
1269	}
1270
1271	/*
1272	* The RTO formula in the route metric case is based on:
1273	* srtt + 4 * rttvar
1274	* modulo the min, max and slop
1275	*/
1276	TCPT_RANGESET(tp->t_rxtcur,
1277	TCP_REXMTVAL(tp),
1278	tp->t_rttmin, TCPTV_REXMTMAX,
1279	TCP_ADD_REXMTSLOP(tp));
1280	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_srtt == `0` &&
1281	tp->t_rxtshift == `0`) {
1282	struct ifnet *ifp = rt->rt_ifp;
1283
1284	if (ifp != NULL && (ifp->if_eflags & IFEF_AWDL) != `0`) {
1285	/*
1286	* AWDL needs a special value for the default initial retransmission timeout
1287	*/
1288	if (tcp_awdl_rtobase > tcp_TCPTV_MIN) {
1289	tp->t_rttvar = ((tcp_awdl_rtobase - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / `4`;
1290	} else {
1291	tp->t_rttvar = ((tcp_TCPTV_MIN - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / `4`;
1292	}
1293	TCPT_RANGESET(tp->t_rxtcur,
1294	TCP_REXMTVAL(tp),
1295	tp->t_rttmin, TCPTV_REXMTMAX,
1296	TCP_ADD_REXMTSLOP(tp));
1297	}
1298	}
1299
1300	TCP_LOG_RTT_INFO(tp);
1301	}
1302
1303	static inline void
1304	tcp_create_ifnet_stats_per_flow(struct tcpcb *tp,
1305	struct ifnet_stats_per_flow *ifs)
1306	{
1307	struct inpcb *inp;
1308	struct socket *so;
1309	if (tp == NULL \|\| ifs == NULL) {
1310	return;
1311	}
1312
1313	bzero(s: ifs, n: sizeof(*ifs));
1314	inp = tp->t_inpcb;
1315	so = inp->inp_socket;
1316
1317	ifs->ipv4 = (inp->inp_vflag & INP_IPV6) ? `0` : `1`;
1318	ifs->local = (tp->t_flags & TF_LOCAL) ? `1` : `0`;
1319	ifs->connreset = (so->so_error == ECONNRESET) ? `1` : `0`;
1320	ifs->conntimeout = (so->so_error == ETIMEDOUT) ? `1` : `0`;
1321	ifs->ecn_flags = tp->ecn_flags;
1322	ifs->txretransmitbytes = tp->t_stat.txretransmitbytes;
1323	ifs->rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
1324	ifs->rxmitpkts = tp->t_stat.rxmitpkts;
1325	ifs->rcvoopack = tp->t_rcvoopack;
1326	ifs->pawsdrop = tp->t_pawsdrop;
1327	ifs->sack_recovery_episodes = tp->t_sack_recovery_episode;
1328	ifs->reordered_pkts = tp->t_reordered_pkts;
1329	ifs->dsack_sent = tp->t_dsack_sent;
1330	ifs->dsack_recvd = tp->t_dsack_recvd;
1331	ifs->srtt = tp->t_srtt;
1332	ifs->rttupdated = tp->t_rttupdated;
1333	ifs->rttvar = tp->t_rttvar;
1334	ifs->rttmin = get_base_rtt(tp);
1335	if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_sndbw_max > `0`) {
1336	ifs->bw_sndbw_max = tp->t_bwmeas->bw_sndbw_max;
1337	} else {
1338	ifs->bw_sndbw_max = `0`;
1339	}
1340	if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_rcvbw_max > `0`) {
1341	ifs->bw_rcvbw_max = tp->t_bwmeas->bw_rcvbw_max;
1342	} else {
1343	ifs->bw_rcvbw_max = `0`;
1344	}
1345	ifs->bk_txpackets = so->so_tc_stats[MBUF_TC_BK].txpackets;
1346	ifs->txpackets = inp->inp_stat->txpackets;
1347	ifs->rxpackets = inp->inp_stat->rxpackets;
1348	}
1349
1350	static inline void
1351	tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
1352	struct if_tcp_ecn_perf_stat *stat)
1353	{
1354	u_int64_t curval, oldval;
1355	stat->total_txpkts += ifs->txpackets;
1356	stat->total_rxpkts += ifs->rxpackets;
1357	stat->total_rxmitpkts += ifs->rxmitpkts;
1358	stat->total_oopkts += ifs->rcvoopack;
1359	stat->total_reorderpkts += (ifs->reordered_pkts +
1360	ifs->pawsdrop + ifs->dsack_sent + ifs->dsack_recvd);
1361
1362	/ Average RTT /
1363	curval = ifs->srtt >> TCP_RTT_SHIFT;
1364	if (curval > `0` && ifs->rttupdated >= `16`) {
1365	if (stat->rtt_avg == `0`) {
1366	stat->rtt_avg = curval;
1367	} else {
1368	oldval = stat->rtt_avg;
1369	stat->rtt_avg = ((oldval << `4`) - oldval + curval) >> `4`;
1370	}
1371	}
1372
1373	/ RTT variance /
1374	curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1375	if (curval > `0` && ifs->rttupdated >= `16`) {
1376	if (stat->rtt_var == `0`) {
1377	stat->rtt_var = curval;
1378	} else {
1379	oldval = stat->rtt_var;
1380	stat->rtt_var =
1381	((oldval << `4`) - oldval + curval) >> `4`;
1382	}
1383	}
1384
1385	/ SACK episodes /
1386	stat->sack_episodes += ifs->sack_recovery_episodes;
1387	if (ifs->connreset) {
1388	stat->rst_drop++;
1389	}
1390	}
1391
1392	static inline void
1393	tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
1394	struct if_lim_perf_stat *stat)
1395	{
1396	u_int64_t curval, oldval;
1397
1398	stat->lim_total_txpkts += ifs->txpackets;
1399	stat->lim_total_rxpkts += ifs->rxpackets;
1400	stat->lim_total_retxpkts += ifs->rxmitpkts;
1401	stat->lim_total_oopkts += ifs->rcvoopack;
1402
1403	if (ifs->bw_sndbw_max > `0`) {
1404	/ convert from bytes per ms to bits per second /
1405	ifs->bw_sndbw_max *= `8000`;
1406	stat->lim_ul_max_bandwidth = MAX(stat->lim_ul_max_bandwidth,
1407	ifs->bw_sndbw_max);
1408	}
1409
1410	if (ifs->bw_rcvbw_max > `0`) {
1411	/ convert from bytes per ms to bits per second /
1412	ifs->bw_rcvbw_max *= `8000`;
1413	stat->lim_dl_max_bandwidth = MAX(stat->lim_dl_max_bandwidth,
1414	ifs->bw_rcvbw_max);
1415	}
1416
1417	/ Average RTT /
1418	curval = ifs->srtt >> TCP_RTT_SHIFT;
1419	if (curval > `0` && ifs->rttupdated >= `16`) {
1420	if (stat->lim_rtt_average == `0`) {
1421	stat->lim_rtt_average = curval;
1422	} else {
1423	oldval = stat->lim_rtt_average;
1424	stat->lim_rtt_average =
1425	((oldval << `4`) - oldval + curval) >> `4`;
1426	}
1427	}
1428
1429	/ RTT variance /
1430	curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1431	if (curval > `0` && ifs->rttupdated >= `16`) {
1432	if (stat->lim_rtt_variance == `0`) {
1433	stat->lim_rtt_variance = curval;
1434	} else {
1435	oldval = stat->lim_rtt_variance;
1436	stat->lim_rtt_variance =
1437	((oldval << `4`) - oldval + curval) >> `4`;
1438	}
1439	}
1440
1441	if (stat->lim_rtt_min == `0`) {
1442	stat->lim_rtt_min = ifs->rttmin;
1443	} else {
1444	stat->lim_rtt_min = MIN(stat->lim_rtt_min, ifs->rttmin);
1445	}
1446
1447	/ connection timeouts /
1448	stat->lim_conn_attempts++;
1449	if (ifs->conntimeout) {
1450	stat->lim_conn_timeouts++;
1451	}
1452
1453	/ bytes sent using background delay-based algorithms /
1454	stat->lim_bk_txpkts += ifs->bk_txpackets;
1455	}
1456
1457	/*
1458	* Close a TCP control block:
1459	* discard all space held by the tcp
1460	* discard internet protocol block
1461	* wake up any sleepers
1462	*/
1463	struct tcpcb *
1464	tcp_close(struct tcpcb *tp)
1465	{
1466	struct inpcb *inp = tp->t_inpcb;
1467	struct socket *so = inp->inp_socket;
1468	int isipv6 = (inp->inp_vflag & INP_IPV6) != `0`;
1469	struct route *ro;
1470	struct rtentry *rt;
1471	int dosavessthresh;
1472	struct ifnet_stats_per_flow ifs;
1473
1474	/ tcp_close was called previously, bail /
1475	if (inp->inp_ppcb == NULL) {
1476	return NULL;
1477	}
1478
1479	tcp_del_fsw_flow(tp);
1480
1481	tcp_canceltimers(tp);
1482	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE \| DBG_FUNC_START, tp, `0`, `0`, `0`, `0`);
1483
1484	/*
1485	* If another thread for this tcp is currently in ip (indicated by
1486	* the TF_SENDINPROG flag), defer the cleanup until after it returns
1487	* back to tcp. This is done to serialize the close until after all
1488	* pending output is finished, in order to avoid having the PCB be
1489	* detached and the cached route cleaned, only for ip to cache the
1490	* route back into the PCB again. Note that we've cleared all the
1491	* timers at this point. Set TF_CLOSING to indicate to tcp_output()
1492	* that is should call us again once it returns from ip; at that
1493	* point both flags should be cleared and we can proceed further
1494	* with the cleanup.
1495	*/
1496	if ((tp->t_flags & TF_CLOSING) \|\|
1497	inp->inp_sndinprog_cnt > `0`) {
1498	tp->t_flags \|= TF_CLOSING;
1499	return NULL;
1500	}
1501
1502	TCP_LOG_CONNECTION_SUMMARY(tp);
1503
1504	DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1505	struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1506
1507	ro = (isipv6 ? (struct route *)&inp->in6p_route : &inp->inp_route);
1508	rt = ro->ro_rt;
1509	if (rt != NULL) {
1510	RT_LOCK_SPIN(rt);
1511	}
1512
1513	/*
1514	* If we got enough samples through the srtt filter,
1515	* save the rtt and rttvar in the routing entry.
1516	* 'Enough' is arbitrarily defined as the 16 samples.
1517	* 16 samples is enough for the srtt filter to converge
1518	* to within 5% of the correct value; fewer samples and
1519	* we could save a very bogus rtt.
1520	*
1521	* Don't update the default route's characteristics and don't
1522	* update anything that the user "locked".
1523	*/
1524	if (tp->t_rttupdated >= `16`) {
1525	u_int32_t i = `0`;
1526	bool log_rtt = false;
1527
1528	if (isipv6) {
1529	struct sockaddr_in6 *sin6;
1530
1531	if (rt == NULL) {
1532	goto no_valid_rt;
1533	}
1534	sin6 = SIN6(rt_key(rt));
1535	if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr)) {
1536	goto no_valid_rt;
1537	}
1538	} else if (ROUTE_UNUSABLE(ro) \|\|
1539	SIN(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) {
1540	DTRACE_TCP4(state__change, void, NULL,
1541	struct inpcb , inp, struct* tcpcb *, tp,
1542	int32_t, TCPS_CLOSED);
1543	TCP_LOG_STATE(tp, TCPS_CLOSED);
1544	tp->t_state = TCPS_CLOSED;
1545	goto no_valid_rt;
1546	}
1547
1548	RT_LOCK_ASSERT_HELD(rt);
1549	if ((rt->rt_rmx.rmx_locks & RTV_RTT) == `0`) {
1550	i = tp->t_srtt *
1551	(RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1552	if (rt->rt_rmx.rmx_rtt && i) {
1553	/*
1554	* filter this update to half the old & half
1555	* the new values, converting scale.
1556	* See route.h and tcp_var.h for a
1557	* description of the scaling constants.
1558	*/
1559	rt->rt_rmx.rmx_rtt =
1560	(rt->rt_rmx.rmx_rtt + i) / `2`;
1561	} else {
1562	rt->rt_rmx.rmx_rtt = i;
1563	}
1564	tcpstat.tcps_cachedrtt++;
1565	log_rtt = true;
1566	}
1567	if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == `0`) {
1568	i = tp->t_rttvar *
1569	(RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1570	if (rt->rt_rmx.rmx_rttvar && i) {
1571	rt->rt_rmx.rmx_rttvar =
1572	(rt->rt_rmx.rmx_rttvar + i) / `2`;
1573	} else {
1574	rt->rt_rmx.rmx_rttvar = i;
1575	}
1576	tcpstat.tcps_cachedrttvar++;
1577	log_rtt = true;
1578	}
1579	if (log_rtt) {
1580	TCP_LOG_RTM_RTT(tp, rt);
1581	TCP_LOG_RTT_INFO(tp);
1582	}
1583	/*
1584	* The old comment here said:
1585	* update the pipelimit (ssthresh) if it has been updated
1586	* already or if a pipesize was specified & the threshhold
1587	* got below half the pipesize. I.e., wait for bad news
1588	* before we start updating, then update on both good
1589	* and bad news.
1590	*
1591	* But we want to save the ssthresh even if no pipesize is
1592	* specified explicitly in the route, because such
1593	* connections still have an implicit pipesize specified
1594	* by the global tcp_sendspace. In the absence of a reliable
1595	* way to calculate the pipesize, it will have to do.
1596	*/
1597	i = tp->snd_ssthresh;
1598	if (rt->rt_rmx.rmx_sendpipe != `0`) {
1599	dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / `2`);
1600	} else {
1601	dosavessthresh = (i < so->so_snd.sb_hiwat / `2`);
1602	}
1603	if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == `0` &&
1604	i != `0` && rt->rt_rmx.rmx_ssthresh != `0`) \|\|
1605	dosavessthresh) {
1606	/*
1607	* convert the limit from user data bytes to
1608	* packets then to packet data bytes.
1609	*/
1610	i = (i + tp->t_maxseg / `2`) / tp->t_maxseg;
1611	if (i < `2`) {
1612	i = `2`;
1613	}
1614	i *= (u_int32_t)(tp->t_maxseg +
1615	isipv6 ? sizeof(struct ip6_hdr) +
1616	sizeof(struct tcphdr) :
1617	sizeof(struct tcpiphdr));
1618	if (rt->rt_rmx.rmx_ssthresh) {
1619	rt->rt_rmx.rmx_ssthresh =
1620	(rt->rt_rmx.rmx_ssthresh + i) / `2`;
1621	} else {
1622	rt->rt_rmx.rmx_ssthresh = i;
1623	}
1624	tcpstat.tcps_cachedssthresh++;
1625	}
1626	}
1627
1628	/*
1629	* Mark route for deletion if no information is cached.
1630	*/
1631	if (rt != NULL && (so->so_flags & SOF_OVERFLOW)) {
1632	if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
1633	rt->rt_rmx.rmx_rtt == `0`) {
1634	rt->rt_flags \|= RTF_DELCLONE;
1635	}
1636	}
1637
1638	no_valid_rt:
1639	if (rt != NULL) {
1640	RT_UNLOCK(rt);
1641	}
1642
1643	/ free the reassembly queue, if any /
1644	(void) tcp_freeq(tp);
1645
1646	/ performance stats per interface /
1647	tcp_create_ifnet_stats_per_flow(tp, ifs: &ifs);
1648	tcp_update_stats_per_flow(&ifs, inp->inp_last_outifp);
1649
1650	tcp_free_sackholes(tp);
1651	tcp_notify_ack_free(tp);
1652
1653	inp_decr_sndbytes_allunsent(so, tp->snd_una);
1654
1655	if (tp->t_bwmeas != NULL) {
1656	tcp_bwmeas_free(tp);
1657	}
1658	tcp_rxtseg_clean(tp);
1659	/ Free the packet list /
1660	if (tp->t_pktlist_head != NULL) {
1661	m_freem_list(tp->t_pktlist_head);
1662	}
1663	TCP_PKTLIST_CLEAR(tp);
1664
1665	if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) {
1666	inp->inp_saved_ppcb = (caddr_t) tp;
1667	}
1668
1669	TCP_LOG_STATE(tp, TCPS_CLOSED);
1670	tp->t_state = TCPS_CLOSED;
1671
1672	/*
1673	* Issue a wakeup before detach so that we don't miss
1674	* a wakeup
1675	*/
1676	sodisconnectwakeup(so);
1677
1678	/*
1679	* Make sure to clear the TCP Keep Alive Offload as it is
1680	* ref counted on the interface
1681	*/
1682	tcp_clear_keep_alive_offload(so);
1683
1684	/*
1685	* If this is a socket that does not want to wakeup the device
1686	* for it's traffic, the application might need to know that the
1687	* socket is closed, send a notification.
1688	*/
1689	if ((so->so_options & SO_NOWAKEFROMSLEEP) &&
1690	inp->inp_state != INPCB_STATE_DEAD &&
1691	!(inp->inp_flags2 & INP2_TIMEWAIT)) {
1692	socket_post_kev_msg_closed(so);
1693	}
1694
1695	if (CC_ALGO(tp)->cleanup != NULL) {
1696	CC_ALGO(tp)->cleanup(tp);
1697	}
1698
1699	tp->tcp_cc_index = TCP_CC_ALGO_NONE;
1700
1701	if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.cleanup != NULL) {
1702	tcp_cc_rledbat.cleanup(tp);
1703	}
1704
1705	/ Can happen if we close the socket before receiving the third ACK /
1706	if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
1707	OSDecrementAtomic(&tcp_tfo_halfcnt);
1708
1709	/ Panic if something has gone terribly wrong. /
1710	VERIFY(tcp_tfo_halfcnt >= `0`);
1711
1712	tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
1713	}
1714
1715	if (SOCK_CHECK_DOM(so, PF_INET6)) {
1716	in6_pcbdetach(inp);
1717	} else {
1718	in_pcbdetach(inp);
1719	}
1720
1721	/*
1722	* Call soisdisconnected after detach because it might unlock the socket
1723	*/
1724	soisdisconnected(so);
1725	tcpstat.tcps_closed++;
1726	KERNEL_DEBUG(DBG_FNC_TCP_CLOSE \| DBG_FUNC_END,
1727	tcpstat.tcps_closed, `0`, `0`, `0`, `0`);
1728	return NULL;
1729	}
1730
1731	int
1732	tcp_freeq(struct tcpcb *tp)
1733	{
1734	struct tseg_qent *q;
1735	int rv = `0`;
1736	int count = `0`;
1737
1738	while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
1739	LIST_REMOVE(q, tqe_q);
1740	tp->t_reassq_mbcnt -= _MSIZE + (q->tqe_m->m_flags & M_EXT) ?
1741	q->tqe_m->m_ext.ext_size : `0`;
1742	m_freem(q->tqe_m);
1743	zfree(tcp_reass_zone, q);
1744	rv = `1`;
1745	count++;
1746	}
1747	tp->t_reassqlen = `0`;
1748	if (count > `0`) {
1749	OSAddAtomic(-count, &tcp_reass_total_qlen);
1750	}
1751	return rv;
1752	}
1753
1754
1755	void
1756	tcp_drain(void)
1757	{
1758	struct inpcb *inp;
1759	struct tcpcb *tp;
1760
1761	if (!lck_rw_try_lock_exclusive(lck: &tcbinfo.ipi_lock)) {
1762	return;
1763	}
1764
1765	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1766	if (in_pcb_checkstate(inp, WNT_ACQUIRE, `0`) !=
1767	WNT_STOPUSING) {
1768	socket_lock(so: inp->inp_socket, refcount: `1`);
1769	if (in_pcb_checkstate(inp, WNT_RELEASE, `1`)
1770	== WNT_STOPUSING) {
1771	/ lost a race, try the next one /
1772	socket_unlock(so: inp->inp_socket, refcount: `1`);
1773	continue;
1774	}
1775	tp = intotcpcb(inp);
1776
1777	so_drain_extended_bk_idle(inp->inp_socket);
1778
1779	socket_unlock(so: inp->inp_socket, refcount: `1`);
1780	}
1781	}
1782	lck_rw_done(lck: &tcbinfo.ipi_lock);
1783	}
1784
1785	/*
1786	* Notify a tcp user of an asynchronous error;
1787	* store error as soft error, but wake up user
1788	* (for now, won't do anything until can select for soft error).
1789	*
1790	* Do not wake up user since there currently is no mechanism for
1791	* reporting soft errors (yet - a kqueue filter may be added).
1792	*/
1793	static void
1794	tcp_notify(struct inpcb inp, int* error)
1795	{
1796	struct tcpcb *tp;
1797
1798	if (inp == NULL \|\| (inp->inp_state == INPCB_STATE_DEAD)) {
1799	return; / pcb is gone already /
1800	}
1801	tp = (struct tcpcb *)inp->inp_ppcb;
1802
1803	VERIFY(tp != NULL);
1804	/*
1805	* Ignore some errors if we are hooked up.
1806	* If connection hasn't completed, has retransmitted several times,
1807	* and receives a second error, give up now. This is better
1808	* than waiting a long time to establish a connection that
1809	* can never complete.
1810	*/
1811	if (tp->t_state == TCPS_ESTABLISHED &&
1812	(error == EHOSTUNREACH \|\| error == ENETUNREACH \|\|
1813	error == EHOSTDOWN)) {
1814	if (inp->inp_route.ro_rt) {
1815	rtfree(inp->inp_route.ro_rt);
1816	inp->inp_route.ro_rt = (struct rtentry *)NULL;
1817	}
1818	} else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > `3` &&
1819	tp->t_softerror) {
1820	tcp_drop(tp, errno: error);
1821	} else {
1822	tp->t_softerror = error;
1823	}
1824	}
1825
1826	struct bwmeas *
1827	tcp_bwmeas_alloc(struct tcpcb *tp)
1828	{
1829	struct bwmeas *elm;
1830	elm = zalloc_flags(tcp_bwmeas_zone, Z_ZERO \| Z_WAITOK);
1831	elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE;
1832	elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg;
1833	return elm;
1834	}
1835
1836	void
1837	tcp_bwmeas_free(struct tcpcb *tp)
1838	{
1839	zfree(tcp_bwmeas_zone, tp->t_bwmeas);
1840	tp->t_bwmeas = NULL;
1841	tp->t_flagsext &= ~(TF_MEASURESNDBW);
1842	}
1843
1844	int
1845	get_tcp_inp_list(struct inpcb *inp_list, int* n, inp_gen_t gencnt)
1846	{
1847	struct tcpcb *tp;
1848	struct inpcb *inp;
1849	int i = `0`;
1850
1851	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1852	if (inp->inp_gencnt <= gencnt &&
1853	inp->inp_state != INPCB_STATE_DEAD) {
1854	inp_list[i++] = inp;
1855	}
1856	if (i >= n) {
1857	break;
1858	}
1859	}
1860
1861	TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) {
1862	inp = tp->t_inpcb;
1863	if (inp->inp_gencnt <= gencnt &&
1864	inp->inp_state != INPCB_STATE_DEAD) {
1865	inp_list[i++] = inp;
1866	}
1867	if (i >= n) {
1868	break;
1869	}
1870	}
1871	return i;
1872	}
1873
1874	/*
1875	* tcpcb_to_otcpcb copies specific bits of a tcpcb to a otcpcb format.
1876	* The otcpcb data structure is passed to user space and must not change.
1877	*/
1878	static void
1879	tcpcb_to_otcpcb(struct tcpcb tp, struct* otcpcb *otp)
1880	{
1881	otp->t_segq = (uint32_t)VM_KERNEL_ADDRHASH(tp->t_segq.lh_first);
1882	otp->t_dupacks = tp->t_dupacks;
1883	otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
1884	otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
1885	otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
1886	otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
1887	otp->t_inpcb =
1888	(_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRHASH(tp->t_inpcb);
1889	otp->t_state = tp->t_state;
1890	otp->t_flags = tp->t_flags;
1891	otp->t_force = (tp->t_flagsext & TF_FORCE) ? `1` : `0`;
1892	otp->snd_una = tp->snd_una;
1893	otp->snd_max = tp->snd_max;
1894	otp->snd_nxt = tp->snd_nxt;
1895	otp->snd_up = tp->snd_up;
1896	otp->snd_wl1 = tp->snd_wl1;
1897	otp->snd_wl2 = tp->snd_wl2;
1898	otp->iss = tp->iss;
1899	otp->irs = tp->irs;
1900	otp->rcv_nxt = tp->rcv_nxt;
1901	otp->rcv_adv = tp->rcv_adv;
1902	otp->rcv_wnd = tp->rcv_wnd;
1903	otp->rcv_up = tp->rcv_up;
1904	otp->snd_wnd = tp->snd_wnd;
1905	otp->snd_cwnd = tp->snd_cwnd;
1906	otp->snd_ssthresh = tp->snd_ssthresh;
1907	otp->t_maxopd = tp->t_maxopd;
1908	otp->t_rcvtime = tp->t_rcvtime;
1909	otp->t_starttime = tp->t_starttime;
1910	otp->t_rtttime = tp->t_rtttime;
1911	otp->t_rtseq = tp->t_rtseq;
1912	otp->t_rxtcur = tp->t_rxtcur;
1913	otp->t_maxseg = tp->t_maxseg;
1914	otp->t_srtt = tp->t_srtt;
1915	otp->t_rttvar = tp->t_rttvar;
1916	otp->t_rxtshift = tp->t_rxtshift;
1917	otp->t_rttmin = tp->t_rttmin;
1918	otp->t_rttupdated = tp->t_rttupdated;
1919	otp->max_sndwnd = tp->max_sndwnd;
1920	otp->t_softerror = tp->t_softerror;
1921	otp->t_oobflags = tp->t_oobflags;
1922	otp->t_iobc = tp->t_iobc;
1923	otp->snd_scale = tp->snd_scale;
1924	otp->rcv_scale = tp->rcv_scale;
1925	otp->request_r_scale = tp->request_r_scale;
1926	otp->requested_s_scale = tp->requested_s_scale;
1927	otp->ts_recent = tp->ts_recent;
1928	otp->ts_recent_age = tp->ts_recent_age;
1929	otp->last_ack_sent = tp->last_ack_sent;
1930	otp->cc_send = `0`;
1931	otp->cc_recv = `0`;
1932	otp->snd_recover = tp->snd_recover;
1933	otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1934	otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1935	otp->t_badrxtwin = `0`;
1936	}
1937
1938	static int
1939	tcp_pcblist SYSCTL_HANDLER_ARGS
1940	{
1941	#pragma unused(oidp, arg1, arg2)
1942	int error, i = `0`, n, sz;
1943	struct inpcb **inp_list;
1944	inp_gen_t gencnt;
1945	struct xinpgen xig;
1946
1947	/*
1948	* The process of preparing the TCB list is too time-consuming and
1949	* resource-intensive to repeat twice on every request.
1950	*/
1951	lck_rw_lock_shared(lck: &tcbinfo.ipi_lock);
1952	if (req->oldptr == USER_ADDR_NULL) {
1953	n = tcbinfo.ipi_count;
1954	req->oldidx = `2` * (sizeof(xig))
1955	+ (n + n / `8`) * sizeof(struct xtcpcb);
1956	lck_rw_done(lck: &tcbinfo.ipi_lock);
1957	return `0`;
1958	}
1959
1960	if (req->newptr != USER_ADDR_NULL) {
1961	lck_rw_done(lck: &tcbinfo.ipi_lock);
1962	return EPERM;
1963	}
1964
1965	/*
1966	* OK, now we're committed to doing something.
1967	*/
1968	gencnt = tcbinfo.ipi_gencnt;
1969	sz = n = tcbinfo.ipi_count;
1970
1971	bzero(s: &xig, n: sizeof(xig));
1972	xig.xig_len = sizeof(xig);
1973	xig.xig_count = n;
1974	xig.xig_gen = gencnt;
1975	xig.xig_sogen = so_gencnt;
1976	error = SYSCTL_OUT(req, &xig, sizeof(xig));
1977	if (error) {
1978	lck_rw_done(lck: &tcbinfo.ipi_lock);
1979	return error;
1980	}
1981	/*
1982	* We are done if there is no pcb
1983	*/
1984	if (n == `0`) {
1985	lck_rw_done(lck: &tcbinfo.ipi_lock);
1986	return `0`;
1987	}
1988
1989	inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
1990	if (inp_list == NULL) {
1991	lck_rw_done(lck: &tcbinfo.ipi_lock);
1992	return ENOMEM;
1993	}
1994
1995	n = get_tcp_inp_list(inp_list, n, gencnt);
1996
1997	error = `0`;
1998	for (i = `0`; i < n; i++) {
1999	struct xtcpcb xt;
2000	caddr_t inp_ppcb;
2001	struct inpcb *inp;
2002
2003	inp = inp_list[i];
2004
2005	if (in_pcb_checkstate(inp, WNT_ACQUIRE, `0`) == WNT_STOPUSING) {
2006	continue;
2007	}
2008	socket_lock(so: inp->inp_socket, refcount: `1`);
2009	if (in_pcb_checkstate(inp, WNT_RELEASE, `1`) == WNT_STOPUSING) {
2010	socket_unlock(so: inp->inp_socket, refcount: `1`);
2011	continue;
2012	}
2013	if (inp->inp_gencnt > gencnt) {
2014	socket_unlock(so: inp->inp_socket, refcount: `1`);
2015	continue;
2016	}
2017
2018	bzero(s: &xt, n: sizeof(xt));
2019	xt.xt_len = sizeof(xt);
2020	/ XXX should avoid extra copy /
2021	inpcb_to_compat(inp, &xt.xt_inp);
2022	inp_ppcb = inp->inp_ppcb;
2023	if (inp_ppcb != NULL) {
2024	tcpcb_to_otcpcb(tp: (struct tcpcb )(void* *)inp_ppcb,
2025	otp: &xt.xt_tp);
2026	} else {
2027	bzero(s: (char ) &xt.xt_tp, n: sizeof*(xt.xt_tp));
2028	}
2029	if (inp->inp_socket) {
2030	sotoxsocket(so: inp->inp_socket, xso: &xt.xt_socket);
2031	}
2032
2033	socket_unlock(so: inp->inp_socket, refcount: `1`);
2034
2035	error = SYSCTL_OUT(req, &xt, sizeof(xt));
2036	}
2037	if (!error) {
2038	/*
2039	* Give the user an updated idea of our state.
2040	* If the generation differs from what we told
2041	* her before, she knows that something happened
2042	* while we were processing this request, and it
2043	* might be necessary to retry.
2044	*/
2045	bzero(s: &xig, n: sizeof(xig));
2046	xig.xig_len = sizeof(xig);
2047	xig.xig_gen = tcbinfo.ipi_gencnt;
2048	xig.xig_sogen = so_gencnt;
2049	xig.xig_count = tcbinfo.ipi_count;
2050	error = SYSCTL_OUT(req, &xig, sizeof(xig));
2051	}
2052
2053	lck_rw_done(lck: &tcbinfo.ipi_lock);
2054	kfree_type(struct inpcb *, sz, inp_list);
2055	return error;
2056	}
2057
2058	SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
2059	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, `0`, `0`,
2060	tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
2061
2062	#if XNU_TARGET_OS_OSX
2063
2064	static void
2065	tcpcb_to_xtcpcb64(struct tcpcb tp, struct* xtcpcb64 *otp)
2066	{
2067	otp->t_segq = (uint32_t)VM_KERNEL_ADDRHASH(tp->t_segq.lh_first);
2068	otp->t_dupacks = tp->t_dupacks;
2069	otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
2070	otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
2071	otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
2072	otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
2073	otp->t_state = tp->t_state;
2074	otp->t_flags = tp->t_flags;
2075	otp->t_force = (tp->t_flagsext & TF_FORCE) ? `1` : `0`;
2076	otp->snd_una = tp->snd_una;
2077	otp->snd_max = tp->snd_max;
2078	otp->snd_nxt = tp->snd_nxt;
2079	otp->snd_up = tp->snd_up;
2080	otp->snd_wl1 = tp->snd_wl1;
2081	otp->snd_wl2 = tp->snd_wl2;
2082	otp->iss = tp->iss;
2083	otp->irs = tp->irs;
2084	otp->rcv_nxt = tp->rcv_nxt;
2085	otp->rcv_adv = tp->rcv_adv;
2086	otp->rcv_wnd = tp->rcv_wnd;
2087	otp->rcv_up = tp->rcv_up;
2088	otp->snd_wnd = tp->snd_wnd;
2089	otp->snd_cwnd = tp->snd_cwnd;
2090	otp->snd_ssthresh = tp->snd_ssthresh;
2091	otp->t_maxopd = tp->t_maxopd;
2092	otp->t_rcvtime = tp->t_rcvtime;
2093	otp->t_starttime = tp->t_starttime;
2094	otp->t_rtttime = tp->t_rtttime;
2095	otp->t_rtseq = tp->t_rtseq;
2096	otp->t_rxtcur = tp->t_rxtcur;
2097	otp->t_maxseg = tp->t_maxseg;
2098	otp->t_srtt = tp->t_srtt;
2099	otp->t_rttvar = tp->t_rttvar;
2100	otp->t_rxtshift = tp->t_rxtshift;
2101	otp->t_rttmin = tp->t_rttmin;
2102	otp->t_rttupdated = tp->t_rttupdated;
2103	otp->max_sndwnd = tp->max_sndwnd;
2104	otp->t_softerror = tp->t_softerror;
2105	otp->t_oobflags = tp->t_oobflags;
2106	otp->t_iobc = tp->t_iobc;
2107	otp->snd_scale = tp->snd_scale;
2108	otp->rcv_scale = tp->rcv_scale;
2109	otp->request_r_scale = tp->request_r_scale;
2110	otp->requested_s_scale = tp->requested_s_scale;
2111	otp->ts_recent = tp->ts_recent;
2112	otp->ts_recent_age = tp->ts_recent_age;
2113	otp->last_ack_sent = tp->last_ack_sent;
2114	otp->cc_send = `0`;
2115	otp->cc_recv = `0`;
2116	otp->snd_recover = tp->snd_recover;
2117	otp->snd_cwnd_prev = tp->snd_cwnd_prev;
2118	otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
2119	otp->t_badrxtwin = `0`;
2120	}
2121
2122
2123	static int
2124	tcp_pcblist64 SYSCTL_HANDLER_ARGS
2125	{
2126	#pragma unused(oidp, arg1, arg2)
2127	int error, i = `0`, n, sz;
2128	struct inpcb **inp_list;
2129	inp_gen_t gencnt;
2130	struct xinpgen xig;
2131
2132	/*
2133	* The process of preparing the TCB list is too time-consuming and
2134	* resource-intensive to repeat twice on every request.
2135	*/
2136	lck_rw_lock_shared(lck: &tcbinfo.ipi_lock);
2137	if (req->oldptr == USER_ADDR_NULL) {
2138	n = tcbinfo.ipi_count;
2139	req->oldidx = `2` * (sizeof(xig))
2140	+ (n + n / `8`) * sizeof(struct xtcpcb64);
2141	lck_rw_done(lck: &tcbinfo.ipi_lock);
2142	return `0`;
2143	}
2144
2145	if (req->newptr != USER_ADDR_NULL) {
2146	lck_rw_done(lck: &tcbinfo.ipi_lock);
2147	return EPERM;
2148	}
2149
2150	/*
2151	* OK, now we're committed to doing something.
2152	*/
2153	gencnt = tcbinfo.ipi_gencnt;
2154	sz = n = tcbinfo.ipi_count;
2155
2156	bzero(s: &xig, n: sizeof(xig));
2157	xig.xig_len = sizeof(xig);
2158	xig.xig_count = n;
2159	xig.xig_gen = gencnt;
2160	xig.xig_sogen = so_gencnt;
2161	error = SYSCTL_OUT(req, &xig, sizeof(xig));
2162	if (error) {
2163	lck_rw_done(lck: &tcbinfo.ipi_lock);
2164	return error;
2165	}
2166	/*
2167	* We are done if there is no pcb
2168	*/
2169	if (n == `0`) {
2170	lck_rw_done(lck: &tcbinfo.ipi_lock);
2171	return `0`;
2172	}
2173
2174	inp_list = kalloc_type(struct inpcb *, n, Z_WAITOK);
2175	if (inp_list == NULL) {
2176	lck_rw_done(lck: &tcbinfo.ipi_lock);
2177	return ENOMEM;
2178	}
2179
2180	n = get_tcp_inp_list(inp_list, n, gencnt);
2181
2182	error = `0`;
2183	for (i = `0`; i < n; i++) {
2184	struct xtcpcb64 xt;
2185	struct inpcb *inp;
2186
2187	inp = inp_list[i];
2188
2189	if (in_pcb_checkstate(inp, WNT_ACQUIRE, `0`) == WNT_STOPUSING) {
2190	continue;
2191	}
2192	socket_lock(so: inp->inp_socket, refcount: `1`);
2193	if (in_pcb_checkstate(inp, WNT_RELEASE, `1`) == WNT_STOPUSING) {
2194	socket_unlock(so: inp->inp_socket, refcount: `1`);
2195	continue;
2196	}
2197	if (inp->inp_gencnt > gencnt) {
2198	socket_unlock(so: inp->inp_socket, refcount: `1`);
2199	continue;
2200	}
2201
2202	bzero(s: &xt, n: sizeof(xt));
2203	xt.xt_len = sizeof(xt);
2204	inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
2205	xt.xt_inpcb.inp_ppcb =
2206	(uint64_t)VM_KERNEL_ADDRHASH(inp->inp_ppcb);
2207	if (inp->inp_ppcb != NULL) {
2208	tcpcb_to_xtcpcb64(tp: (struct tcpcb *)inp->inp_ppcb,
2209	otp: &xt);
2210	}
2211	if (inp->inp_socket) {
2212	sotoxsocket64(so: inp->inp_socket,
2213	xso: &xt.xt_inpcb.xi_socket);
2214	}
2215
2216	socket_unlock(so: inp->inp_socket, refcount: `1`);
2217
2218	error = SYSCTL_OUT(req, &xt, sizeof(xt));
2219	}
2220	if (!error) {
2221	/*
2222	* Give the user an updated idea of our state.
2223	* If the generation differs from what we told
2224	* her before, she knows that something happened
2225	* while we were processing this request, and it
2226	* might be necessary to retry.
2227	*/
2228	bzero(s: &xig, n: sizeof(xig));
2229	xig.xig_len = sizeof(xig);
2230	xig.xig_gen = tcbinfo.ipi_gencnt;
2231	xig.xig_sogen = so_gencnt;
2232	xig.xig_count = tcbinfo.ipi_count;
2233	error = SYSCTL_OUT(req, &xig, sizeof(xig));
2234	}
2235
2236	lck_rw_done(lck: &tcbinfo.ipi_lock);
2237	kfree_type(struct inpcb *, sz, inp_list);
2238	return error;
2239	}
2240
2241	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64,
2242	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, `0`, `0`,
2243	tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");
2244
2245	#endif /* XNU_TARGET_OS_OSX */
2246
2247	static int
2248	tcp_pcblist_n SYSCTL_HANDLER_ARGS
2249	{
2250	#pragma unused(oidp, arg1, arg2)
2251	int error = `0`;
2252
2253	error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);
2254
2255	return error;
2256	}
2257
2258
2259	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n,
2260	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED, `0`, `0`,
2261	tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
2262
2263	static int
2264	tcp_progress_indicators SYSCTL_HANDLER_ARGS
2265	{
2266	#pragma unused(oidp, arg1, arg2)
2267
2268	return ntstat_tcp_progress_indicators(req);
2269	}
2270
2271	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress,
2272	CTLTYPE_STRUCT \| CTLFLAG_RW \| CTLFLAG_LOCKED \| CTLFLAG_ANYBODY, `0`, `0`,
2273	tcp_progress_indicators, "S", "Various items that indicate the current state of progress on the link");
2274
2275
2276	static int
2277	tcp_progress_probe_enable SYSCTL_HANDLER_ARGS
2278	{
2279	#pragma unused(oidp, arg1, arg2)
2280
2281	return ntstat_tcp_progress_enable(req);
2282	}
2283
2284	SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress_enable,
2285	CTLTYPE_STRUCT \| CTLFLAG_RW \| CTLFLAG_LOCKED \| CTLFLAG_ANYBODY, `0`, `0`,
2286	tcp_progress_probe_enable, "S", "Enable/disable TCP keepalive probing on the specified link(s)");
2287
2288
2289	__private_extern__ void
2290	tcp_get_ports_used(ifnet_t ifp, int protocol, uint32_t flags,
2291	bitstr_t *bitfield)
2292	{
2293	inpcb_get_ports_used(ifp, protocol, flags, bitfield,
2294	&tcbinfo);
2295	}
2296
2297	__private_extern__ uint32_t
2298	tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
2299	{
2300	return inpcb_count_opportunistic(ifindex, &tcbinfo, flags);
2301	}
2302
2303	__private_extern__ uint32_t
2304	tcp_find_anypcb_byaddr(struct ifaddr *ifa)
2305	{
2306	#if SKYWALK
2307	if (netns_is_enabled()) {
2308	return netns_find_anyres_byaddr(ifa, IPPROTO_TCP);
2309	} else
2310	#endif /* SKYWALK */
2311	return inpcb_find_anypcb_byaddr(ifa, &tcbinfo);
2312	}
2313
2314	static void
2315	tcp_handle_msgsize(struct ip ip, struct* inpcb *inp)
2316	{
2317	struct rtentry *rt = NULL;
2318	u_short ifscope = IFSCOPE_NONE;
2319	int mtu;
2320	struct sockaddr_in icmpsrc = {
2321	.sin_len = sizeof(struct sockaddr_in),
2322	.sin_family = AF_INET, .sin_port = `0`, .sin_addr = { .s_addr = `0` },
2323	.sin_zero = { `0`, `0`, `0`, `0`, `0`, `0`, `0`, `0` }
2324	};
2325	struct icmp *icp = NULL;
2326
2327	icp = (struct icmp )(void* *)
2328	((caddr_t)ip - offsetof(struct icmp, icmp_ip));
2329
2330	icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
2331
2332	/*
2333	* MTU discovery:
2334	* If we got a needfrag and there is a host route to the
2335	* original destination, and the MTU is not locked, then
2336	* set the MTU in the route to the suggested new value
2337	* (if given) and then notify as usual. The ULPs will
2338	* notice that the MTU has changed and adapt accordingly.
2339	* If no new MTU was suggested, then we guess a new one
2340	* less than the current value. If the new MTU is
2341	* unreasonably small (defined by sysctl tcp_minmss), then
2342	* we reset the MTU to the interface value and enable the
2343	* lock bit, indicating that we are no longer doing MTU
2344	* discovery.
2345	*/
2346	if (ROUTE_UNUSABLE(&(inp->inp_route)) == false) {
2347	rt = inp->inp_route.ro_rt;
2348	}
2349
2350	/*
2351	* icmp6_mtudisc_update scopes the routing lookup
2352	* to the incoming interface (delivered from mbuf
2353	* packet header.
2354	* That is mostly ok but for asymmetric networks
2355	* that may be an issue.
2356	* Frag needed OR Packet too big really communicates
2357	* MTU for the out data path.
2358	* Take the interface scope from cached route or
2359	* the last outgoing interface from inp
2360	*/
2361	if (rt != NULL) {
2362	ifscope = (rt->rt_ifp != NULL) ?
2363	rt->rt_ifp->if_index : IFSCOPE_NONE;
2364	} else {
2365	ifscope = (inp->inp_last_outifp != NULL) ?
2366	inp->inp_last_outifp->if_index : IFSCOPE_NONE;
2367	}
2368
2369	if ((rt == NULL) \|\|
2370	!(rt->rt_flags & RTF_HOST) \|\|
2371	(rt->rt_flags & (RTF_CLONING \| RTF_PRCLONING))) {
2372	rt = rtalloc1_scoped(SA(&icmpsrc), `0`, RTF_CLONING \| RTF_PRCLONING, ifscope);
2373	} else if (rt) {
2374	RT_LOCK(rt);
2375	rtref(rt);
2376	RT_UNLOCK(rt);
2377	}
2378
2379	if (rt != NULL) {
2380	RT_LOCK(rt);
2381	if ((rt->rt_flags & RTF_HOST) &&
2382	!(rt->rt_rmx.rmx_locks & RTV_MTU)) {
2383	mtu = ntohs(icp->icmp_nextmtu);
2384	/*
2385	* XXX Stock BSD has changed the following
2386	* to compare with icp->icmp_ip.ip_len
2387	* to converge faster when sent packet
2388	* < route's MTU. We may want to adopt
2389	* that change.
2390	*/
2391	if (mtu == `0`) {
2392	mtu = ip_next_mtu(rt->rt_rmx.
2393	rmx_mtu, `1`);
2394	}
2395	#if DEBUG_MTUDISC
2396	printf("MTU for %s reduced to %d\n",
2397	inet_ntop(AF_INET,
2398	&icmpsrc.sin_addr, ipv4str,
2399	sizeof(ipv4str)), mtu);
2400	#endif
2401	if (mtu < max(a: `296`, b: (tcp_minmss +
2402	sizeof(struct tcpiphdr)))) {
2403	rt->rt_rmx.rmx_locks \|= RTV_MTU;
2404	} else if (rt->rt_rmx.rmx_mtu > mtu) {
2405	rt->rt_rmx.rmx_mtu = mtu;
2406	}
2407	}
2408	RT_UNLOCK(rt);
2409	rtfree(rt);
2410	}
2411	}
2412
2413	void
2414	tcp_ctlinput(int cmd, struct sockaddr sa, void* vip, __unused struct* ifnet *ifp)
2415	{
2416	tcp_seq icmp_tcp_seq;
2417	struct ipctlparam *ctl_param = vip;
2418	struct ip *ip = NULL;
2419	struct mbuf *m = NULL;
2420	struct in_addr faddr;
2421	struct inpcb *inp;
2422	struct tcpcb *tp;
2423	struct tcphdr *th;
2424	struct icmp *icp;
2425	size_t off;
2426	#if SKYWALK
2427	union sockaddr_in_4_6 sock_laddr;
2428	struct protoctl_ev_val prctl_ev_val;
2429	#endif /* SKYWALK */
2430	void (notify)(struct* inpcb , int*) = tcp_notify;
2431
2432	if (ctl_param != NULL) {
2433	ip = ctl_param->ipc_icmp_ip;
2434	icp = ctl_param->ipc_icmp;
2435	m = ctl_param->ipc_m;
2436	off = ctl_param->ipc_off;
2437	} else {
2438	ip = NULL;
2439	icp = NULL;
2440	m = NULL;
2441	off = `0`;
2442	}
2443
2444	faddr = SIN(sa)->sin_addr;
2445	if (sa->sa_family != AF_INET \|\| faddr.s_addr == INADDR_ANY) {
2446	return;
2447	}
2448
2449	if ((unsigned)cmd >= PRC_NCMDS) {
2450	return;
2451	}
2452
2453	/ Source quench is deprecated /
2454	if (cmd == PRC_QUENCH) {
2455	return;
2456	}
2457
2458	if (cmd == PRC_MSGSIZE) {
2459	notify = tcp_mtudisc;
2460	} else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB \|\|
2461	cmd == PRC_UNREACH_PORT \|\| cmd == PRC_UNREACH_PROTOCOL \|\|
2462	cmd == PRC_TIMXCEED_INTRANS) && ip) {
2463	notify = tcp_drop_syn_sent;
2464	}
2465	/*
2466	* Hostdead is ugly because it goes linearly through all PCBs.
2467	* XXX: We never get this from ICMP, otherwise it makes an
2468	* excellent DoS attack on machines with many connections.
2469	*/
2470	else if (cmd == PRC_HOSTDEAD) {
2471	ip = NULL;
2472	} else if (inetctlerrmap[cmd] == `0` && !PRC_IS_REDIRECT(cmd)) {
2473	return;
2474	}
2475
2476	#if SKYWALK
2477	bzero(s: &prctl_ev_val, n: sizeof(prctl_ev_val));
2478	bzero(s: &sock_laddr, n: sizeof(sock_laddr));
2479	#endif /* SKYWALK */
2480
2481	if (ip == NULL) {
2482	in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
2483	#if SKYWALK
2484	protoctl_event_enqueue_nwk_wq_entry(ifp, NULL,
2485	p_raddr: sa, lport: `0`, rport: `0`, IPPROTO_TCP, protoctl_event_code: cmd, NULL);
2486	#endif /* SKYWALK */
2487	return;
2488	}
2489
2490	/ Check if we can safely get the sport, dport and the sequence number from the tcp header. /
2491	if (m == NULL \|\|
2492	(m->m_len < off + (sizeof(unsigned short) + sizeof(unsigned short) + sizeof(tcp_seq)))) {
2493	/ Insufficient length /
2494	return;
2495	}
2496
2497	th = (struct tcphdr)(void*)(mtod(m, uint8_t) + off);
2498	icmp_tcp_seq = ntohl(th->th_seq);
2499
2500	inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
2501	ip->ip_src, th->th_sport, `0`, NULL);
2502
2503	if (inp == NULL \|\|
2504	inp->inp_socket == NULL) {
2505	#if SKYWALK
2506	if (cmd == PRC_MSGSIZE) {
2507	prctl_ev_val.val = ntohs(icp->icmp_nextmtu);
2508	}
2509	prctl_ev_val.tcp_seq_number = icmp_tcp_seq;
2510
2511	sock_laddr.sin.sin_family = AF_INET;
2512	sock_laddr.sin.sin_len = sizeof(sock_laddr.sin);
2513	sock_laddr.sin.sin_addr = ip->ip_src;
2514
2515	protoctl_event_enqueue_nwk_wq_entry(ifp,
2516	SA(&sock_laddr), p_raddr: sa,
2517	lport: th->th_sport, rport: th->th_dport, IPPROTO_TCP,
2518	protoctl_event_code: cmd, p_protoctl_ev_val: &prctl_ev_val);
2519	#endif /* SKYWALK */
2520	return;
2521	}
2522
2523	socket_lock(so: inp->inp_socket, refcount: `1`);
2524	if (in_pcb_checkstate(inp, WNT_RELEASE, `1`) ==
2525	WNT_STOPUSING) {
2526	socket_unlock(so: inp->inp_socket, refcount: `1`);
2527	return;
2528	}
2529
2530	if (PRC_IS_REDIRECT(cmd)) {
2531	/ signal EHOSTDOWN, as it flushes the cached route /
2532	(*notify)(inp, EHOSTDOWN);
2533	} else {
2534	tp = intotcpcb(inp);
2535	if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2536	SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2537	if (cmd == PRC_MSGSIZE) {
2538	tcp_handle_msgsize(ip, inp);
2539	}
2540
2541	(*notify)(inp, inetctlerrmap[cmd]);
2542	}
2543	}
2544	socket_unlock(so: inp->inp_socket, refcount: `1`);
2545	}
2546
2547	void
2548	tcp6_ctlinput(int cmd, struct sockaddr sa, void* d, __unused struct* ifnet *ifp)
2549	{
2550	tcp_seq icmp_tcp_seq;
2551	struct in6_addr *dst;
2552	void (notify)(struct* inpcb , int*) = tcp_notify;
2553	struct ip6_hdr *ip6;
2554	struct mbuf *m;
2555	struct inpcb *inp;
2556	struct tcpcb *tp;
2557	struct icmp6_hdr *icmp6;
2558	struct ip6ctlparam *ip6cp = NULL;
2559	const struct sockaddr_in6 *sa6_src = NULL;
2560	unsigned int mtu;
2561	unsigned int off;
2562
2563	struct tcp_ports {
2564	uint16_t th_sport;
2565	uint16_t th_dport;
2566	} t_ports;
2567	#if SKYWALK
2568	union sockaddr_in_4_6 sock_laddr;
2569	struct protoctl_ev_val prctl_ev_val;
2570	#endif /* SKYWALK */
2571
2572	if (sa->sa_family != AF_INET6 \|\|
2573	sa->sa_len != sizeof(struct sockaddr_in6)) {
2574	return;
2575	}
2576
2577	/ Source quench is deprecated /
2578	if (cmd == PRC_QUENCH) {
2579	return;
2580	}
2581
2582	if ((unsigned)cmd >= PRC_NCMDS) {
2583	return;
2584	}
2585
2586	/ if the parameter is from icmp6, decode it. /
2587	if (d != NULL) {
2588	ip6cp = (struct ip6ctlparam *)d;
2589	icmp6 = ip6cp->ip6c_icmp6;
2590	m = ip6cp->ip6c_m;
2591	ip6 = ip6cp->ip6c_ip6;
2592	off = ip6cp->ip6c_off;
2593	sa6_src = ip6cp->ip6c_src;
2594	dst = ip6cp->ip6c_finaldst;
2595	} else {
2596	m = NULL;
2597	ip6 = NULL;
2598	off = `0`; / fool gcc /
2599	sa6_src = &sa6_any;
2600	dst = NULL;
2601	}
2602
2603	if (cmd == PRC_MSGSIZE) {
2604	notify = tcp_mtudisc;
2605	} else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB \|\|
2606	cmd == PRC_UNREACH_PORT \|\| cmd == PRC_TIMXCEED_INTRANS) &&
2607	ip6 != NULL) {
2608	notify = tcp_drop_syn_sent;
2609	}
2610	/*
2611	* Hostdead is ugly because it goes linearly through all PCBs.
2612	* XXX: We never get this from ICMP, otherwise it makes an
2613	* excellent DoS attack on machines with many connections.
2614	*/
2615	else if (cmd == PRC_HOSTDEAD) {
2616	ip6 = NULL;
2617	} else if (inet6ctlerrmap[cmd] == `0` && !PRC_IS_REDIRECT(cmd)) {
2618	return;
2619	}
2620
2621	#if SKYWALK
2622	bzero(s: &prctl_ev_val, n: sizeof(prctl_ev_val));
2623	bzero(s: &sock_laddr, n: sizeof(sock_laddr));
2624	#endif /* SKYWALK */
2625
2626	if (ip6 == NULL) {
2627	in6_pcbnotify(&tcbinfo, sa, `0`, SA(sa6_src), `0`, cmd, NULL, notify);
2628	#if SKYWALK
2629	protoctl_event_enqueue_nwk_wq_entry(ifp, NULL, p_raddr: sa,
2630	lport: `0`, rport: `0`, IPPROTO_TCP, protoctl_event_code: cmd, NULL);
2631	#endif /* SKYWALK */
2632	return;
2633	}
2634
2635	/ Check if we can safely get the ports from the tcp hdr /
2636	if (m == NULL \|\|
2637	(m->m_pkthdr.len <
2638	(int32_t) (off + sizeof(struct tcp_ports)))) {
2639	return;
2640	}
2641	bzero(s: &t_ports, n: sizeof(struct tcp_ports));
2642	m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports);
2643
2644	off += sizeof(struct tcp_ports);
2645	if (m->m_pkthdr.len < (int32_t) (off + sizeof(tcp_seq))) {
2646	return;
2647	}
2648	m_copydata(m, off, sizeof(tcp_seq), (caddr_t)&icmp_tcp_seq);
2649	icmp_tcp_seq = ntohl(icmp_tcp_seq);
2650
2651	if (cmd == PRC_MSGSIZE) {
2652	mtu = ntohl(icmp6->icmp6_mtu);
2653	/*
2654	* If no alternative MTU was proposed, or the proposed
2655	* MTU was too small, set to the min.
2656	*/
2657	if (mtu < IPV6_MMTU) {
2658	mtu = IPV6_MMTU - `8`;
2659	}
2660	}
2661
2662	inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_dst, t_ports.th_dport, ip6_input_getdstifscope(m),
2663	&ip6->ip6_src, t_ports.th_sport, ip6_input_getsrcifscope(m), `0`, NULL);
2664
2665	if (inp == NULL \|\|
2666	inp->inp_socket == NULL) {
2667	#if SKYWALK
2668	if (cmd == PRC_MSGSIZE) {
2669	prctl_ev_val.val = mtu;
2670	}
2671	prctl_ev_val.tcp_seq_number = icmp_tcp_seq;
2672
2673	sock_laddr.sin6.sin6_family = AF_INET6;
2674	sock_laddr.sin6.sin6_len = sizeof(sock_laddr.sin6);
2675	sock_laddr.sin6.sin6_addr = ip6->ip6_src;
2676
2677	protoctl_event_enqueue_nwk_wq_entry(ifp,
2678	SA(&sock_laddr), p_raddr: sa,
2679	lport: t_ports.th_sport, rport: t_ports.th_dport, IPPROTO_TCP,
2680	protoctl_event_code: cmd, p_protoctl_ev_val: &prctl_ev_val);
2681	#endif /* SKYWALK */
2682	return;
2683	}
2684
2685	socket_lock(so: inp->inp_socket, refcount: `1`);
2686	if (in_pcb_checkstate(inp, WNT_RELEASE, `1`) ==
2687	WNT_STOPUSING) {
2688	socket_unlock(so: inp->inp_socket, refcount: `1`);
2689	return;
2690	}
2691
2692	if (PRC_IS_REDIRECT(cmd)) {
2693	/ signal EHOSTDOWN, as it flushes the cached route /
2694	(*notify)(inp, EHOSTDOWN);
2695	} else {
2696	tp = intotcpcb(inp);
2697	if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2698	SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2699	if (cmd == PRC_MSGSIZE) {
2700	/*
2701	* Only process the offered MTU if it
2702	* is smaller than the current one.
2703	*/
2704	if (mtu < tp->t_maxseg +
2705	(sizeof(struct tcphdr) + sizeof(struct ip6_hdr))) {
2706	(*notify)(inp, inetctlerrmap[cmd]);
2707	}
2708	} else {
2709	(*notify)(inp, inetctlerrmap[cmd]);
2710	}
2711	}
2712	}
2713	socket_unlock(so: inp->inp_socket, refcount: `1`);
2714	}
2715
2716
2717	/*
2718	* Following is where TCP initial sequence number generation occurs.
2719	*
2720	* There are two places where we must use initial sequence numbers:
2721	* 1. In SYN-ACK packets.
2722	* 2. In SYN packets.
2723	*
2724	* The ISNs in SYN-ACK packets have no monotonicity requirement,
2725	* and should be as unpredictable as possible to avoid the possibility
2726	* of spoofing and/or connection hijacking. To satisfy this
2727	* requirement, SYN-ACK ISNs are generated via the arc4random()
2728	* function. If exact RFC 1948 compliance is requested via sysctl,
2729	* these ISNs will be generated just like those in SYN packets.
2730	*
2731	* The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
2732	* depends on this property. In addition, these ISNs should be
2733	* unguessable so as to prevent connection hijacking. To satisfy
2734	* the requirements of this situation, the algorithm outlined in
2735	* RFC 1948 is used to generate sequence numbers.
2736	*
2737	* For more information on the theory of operation, please see
2738	* RFC 1948.
2739	*
2740	* Implementation details:
2741	*
2742	* Time is based off the system timer, and is corrected so that it
2743	* increases by one megabyte per second. This allows for proper
2744	* recycling on high speed LANs while still leaving over an hour
2745	* before rollover.
2746	*
2747	* Two sysctls control the generation of ISNs:
2748	*
2749	* net.inet.tcp.isn_reseed_interval controls the number of seconds
2750	* between seeding of isn_secret. This is normally set to zero,
2751	* as reseeding should not be necessary.
2752	*
2753	* net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
2754	* strictly. When strict compliance is requested, reseeding is
2755	* disabled and SYN-ACKs will be generated in the same manner as
2756	* SYNs. Strict mode is disabled by default.
2757	*
2758	*/
2759
2760	#define ISN_BYTES_PER_SECOND 1048576
2761
2762	tcp_seq
2763	tcp_new_isn(struct tcpcb *tp)
2764	{
2765	u_int32_t md5_buffer[`4`];
2766	tcp_seq new_isn;
2767	struct timeval timenow;
2768	u_char isn_secret[`32`];
2769	long isn_last_reseed = `0`;
2770	MD5_CTX isn_ctx;
2771
2772	/ Use arc4random for SYN-ACKs when not in exact RFC1948 mode. /
2773	if (((tp->t_state == TCPS_LISTEN) \|\| (tp->t_state == TCPS_TIME_WAIT)) &&
2774	tcp_strict_rfc1948 == `0`)
2775	#ifdef __APPLE__
2776	{ return RandomULong(); }
2777	#else
2778	{ return arc4random(); }
2779	#endif
2780	getmicrotime(&timenow);
2781
2782	/ Seed if this is the first use, reseed if requested. /
2783	if ((isn_last_reseed == `0`) \|\|
2784	((tcp_strict_rfc1948 == `0`) && (tcp_isn_reseed_interval > `0`) &&
2785	(((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval * hz)
2786	< (u_int)timenow.tv_sec))) {
2787	#ifdef __APPLE__
2788	read_frandom(buffer: &isn_secret, numBytes: sizeof(isn_secret));
2789	#else
2790	read_random_unlimited(&isn_secret, sizeof(isn_secret));
2791	#endif
2792	isn_last_reseed = timenow.tv_sec;
2793	}
2794
2795	/ Compute the md5 hash and return the ISN. /
2796	MD5Init(&isn_ctx);
2797	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport,
2798	sizeof(u_short));
2799	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport,
2800	sizeof(u_short));
2801	if ((tp->t_inpcb->inp_vflag & INP_IPV6) != `0`) {
2802	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
2803	sizeof(struct in6_addr));
2804	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
2805	sizeof(struct in6_addr));
2806	} else {
2807	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
2808	sizeof(struct in_addr));
2809	MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
2810	sizeof(struct in_addr));
2811	}
2812	MD5Update(&isn_ctx, (u_char ) &isn_secret, sizeof*(isn_secret));
2813	MD5Final((u_char *) &md5_buffer, &isn_ctx);
2814	new_isn = (tcp_seq) md5_buffer[`0`];
2815	new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
2816	return new_isn;
2817	}
2818
2819
2820	/*
2821	* When a specific ICMP unreachable message is received and the
2822	* connection state is SYN-SENT, drop the connection. This behavior
2823	* is controlled by the icmp_may_rst sysctl.
2824	*/
2825	void
2826	tcp_drop_syn_sent(struct inpcb inp, int* errno)
2827	{
2828	struct tcpcb *tp = intotcpcb(inp);
2829
2830	if (tp && tp->t_state == TCPS_SYN_SENT) {
2831	tcp_drop(tp, errno);
2832	}
2833	}
2834
2835	/*
2836	* When `need fragmentation' ICMP is received, update our idea of the MSS
2837	* based on the new value in the route. Also nudge TCP to send something,
2838	* since we know the packet we just sent was dropped.
2839	* This duplicates some code in the tcp_mss() function in tcp_input.c.
2840	*/
2841	void
2842	tcp_mtudisc(struct inpcb inp, __unused int* errno)
2843	{
2844	struct tcpcb *tp = intotcpcb(inp);
2845	struct rtentry *rt;
2846	struct socket *so = inp->inp_socket;
2847	int mss;
2848	u_int32_t mtu;
2849	u_int32_t protoHdrOverhead = sizeof(struct tcpiphdr);
2850	int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != `0`;
2851
2852	/*
2853	* Nothing left to send after the socket is defunct or TCP is in the closed state
2854	*/
2855	if ((so->so_state & SS_DEFUNCT) \|\| (tp != NULL && tp->t_state == TCPS_CLOSED)) {
2856	return;
2857	}
2858
2859	if (isipv6) {
2860	protoHdrOverhead = sizeof(struct ip6_hdr) +
2861	sizeof(struct tcphdr);
2862	}
2863
2864	if (tp != NULL) {
2865	if (isipv6) {
2866	rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2867	} else {
2868	rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2869	}
2870	if (!rt \|\| !rt->rt_rmx.rmx_mtu) {
2871	tp->t_maxopd = tp->t_maxseg =
2872	isipv6 ? tcp_v6mssdflt :
2873	tcp_mssdflt;
2874
2875	/ Route locked during lookup above /
2876	if (rt != NULL) {
2877	RT_UNLOCK(rt);
2878	}
2879	return;
2880	}
2881	mtu = rt->rt_rmx.rmx_mtu;
2882
2883	/ Route locked during lookup above /
2884	RT_UNLOCK(rt);
2885
2886	#if NECP
2887	// Adjust MTU if necessary.
2888	mtu = necp_socket_get_effective_mtu(inp, current_mtu: mtu);
2889	#endif /* NECP */
2890	mss = mtu - protoHdrOverhead;
2891
2892	if (tp->t_maxopd) {
2893	mss = min(a: mss, b: tp->t_maxopd);
2894	}
2895	/*
2896	* XXX - The above conditional probably violates the TCP
2897	* spec. The problem is that, since we don't know the
2898	* other end's MSS, we are supposed to use a conservative
2899	* default. But, if we do that, then MTU discovery will
2900	* never actually take place, because the conservative
2901	* default is much less than the MTUs typically seen
2902	* on the Internet today. For the moment, we'll sweep
2903	* this under the carpet.
2904	*
2905	* The conservative default might not actually be a problem
2906	* if the only case this occurs is when sending an initial
2907	* SYN with options and data to a host we've never talked
2908	* to before. Then, they will reply with an MSS value which
2909	* will get recorded and the new parameters should get
2910	* recomputed. For Further Study.
2911	*/
2912	if (tp->t_maxopd <= mss) {
2913	return;
2914	}
2915	tp->t_maxopd = mss;
2916
2917	if ((tp->t_flags & (TF_REQ_TSTMP \| TF_NOOPT)) == TF_REQ_TSTMP &&
2918	(tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP) {
2919	mss -= TCPOLEN_TSTAMP_APPA;
2920	}
2921
2922	#if MPTCP
2923	mss -= mptcp_adj_mss(tp, TRUE);
2924	#endif
2925	if (so->so_snd.sb_hiwat < mss) {
2926	mss = so->so_snd.sb_hiwat;
2927	}
2928
2929	tp->t_maxseg = mss;
2930
2931	ASSERT(tp->t_maxseg);
2932
2933	/*
2934	* Reset the slow-start flight size as it may depends on the
2935	* new MSS
2936	*/
2937	if (CC_ALGO(tp)->cwnd_init != NULL) {
2938	CC_ALGO(tp)->cwnd_init(tp);
2939	}
2940
2941	if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.rwnd_init != NULL) {
2942	tcp_cc_rledbat.rwnd_init(tp);
2943	}
2944
2945	tcpstat.tcps_mturesent++;
2946	tp->t_rtttime = `0`;
2947	tp->snd_nxt = tp->snd_una;
2948	tcp_output(tp);
2949	}
2950	}
2951
2952	/*
2953	* Look-up the routing entry to the peer of this inpcb. If no route
2954	* is found and it cannot be allocated the return NULL. This routine
2955	* is called by TCP routines that access the rmx structure and by tcp_mss
2956	* to get the interface MTU. If a route is found, this routine will
2957	* hold the rtentry lock; the caller is responsible for unlocking.
2958	*/
2959	struct rtentry *
2960	tcp_rtlookup(struct inpcb inp, unsigned* int input_ifscope)
2961	{
2962	struct route *ro;
2963	struct rtentry *rt;
2964	struct tcpcb *tp;
2965
2966	LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2967
2968	ro = &inp->inp_route;
2969	if ((rt = ro->ro_rt) != NULL) {
2970	RT_LOCK(rt);
2971	}
2972
2973	if (ROUTE_UNUSABLE(ro)) {
2974	if (rt != NULL) {
2975	RT_UNLOCK(rt);
2976	rt = NULL;
2977	}
2978	ROUTE_RELEASE(ro);
2979	/ No route yet, so try to acquire one /
2980	if (inp->inp_faddr.s_addr != INADDR_ANY) {
2981	unsigned int ifscope;
2982
2983	ro->ro_dst.sa_family = AF_INET;
2984	ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
2985	SIN(&ro->ro_dst)->sin_addr = inp->inp_faddr;
2986
2987	/*
2988	* If the socket was bound to an interface, then
2989	* the bound-to-interface takes precedence over
2990	* the inbound interface passed in by the caller
2991	* (if we get here as part of the output path then
2992	* input_ifscope is IFSCOPE_NONE).
2993	*/
2994	ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2995	inp->inp_boundifp->if_index : input_ifscope;
2996
2997	rtalloc_scoped(ro, ifscope);
2998	if ((rt = ro->ro_rt) != NULL) {
2999	RT_LOCK(rt);
3000	}
3001	}
3002	}
3003	if (rt != NULL) {
3004	RT_LOCK_ASSERT_HELD(rt);
3005	}
3006
3007	/*
3008	* Update MTU discovery determination. Don't do it if:
3009	* 1) it is disabled via the sysctl
3010	* 2) the route isn't up
3011	* 3) the MTU is locked (if it is, then discovery has been
3012	* disabled)
3013	*/
3014
3015	tp = intotcpcb(inp);
3016
3017	if (!path_mtu_discovery \|\| ((rt != NULL) &&
3018	(!(rt->rt_flags & RTF_UP) \|\| (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
3019	tp->t_flags &= ~TF_PMTUD;
3020	} else {
3021	tp->t_flags \|= TF_PMTUD;
3022	}
3023
3024	if (rt != NULL && rt->rt_ifp != NULL) {
3025	somultipages(inp->inp_socket,
3026	(rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
3027	tcp_set_tso(tp, ifp: rt->rt_ifp);
3028	soif2kcl(inp->inp_socket,
3029	(rt->rt_ifp->if_eflags & IFEF_2KCL));
3030	tcp_set_ecn(tp, ifp: rt->rt_ifp);
3031	if (inp->inp_last_outifp == NULL) {
3032	inp->inp_last_outifp = rt->rt_ifp;
3033	#if SKYWALK
3034	if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3035	netns_set_ifnet(token: &inp->inp_netns_token,
3036	ifp: inp->inp_last_outifp);
3037	}
3038	#endif /* SKYWALK */
3039	}
3040	}
3041
3042	/ Note if the peer is local /
3043	if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
3044	(rt->rt_gateway->sa_family == AF_LINK \|\|
3045	rt->rt_ifp->if_flags & IFF_LOOPBACK \|\|
3046	in_localaddr(inp->inp_faddr))) {
3047	tp->t_flags \|= TF_LOCAL;
3048	}
3049
3050	/*
3051	* Caller needs to call RT_UNLOCK(rt).
3052	*/
3053	return rt;
3054	}
3055
3056	struct rtentry *
3057	tcp_rtlookup6(struct inpcb inp, unsigned* int input_ifscope)
3058	{
3059	struct route_in6 *ro6;
3060	struct rtentry *rt;
3061	struct tcpcb *tp;
3062
3063	LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
3064
3065	ro6 = &inp->in6p_route;
3066	if ((rt = ro6->ro_rt) != NULL) {
3067	RT_LOCK(rt);
3068	}
3069
3070	if (ROUTE_UNUSABLE(ro6)) {
3071	if (rt != NULL) {
3072	RT_UNLOCK(rt);
3073	rt = NULL;
3074	}
3075	ROUTE_RELEASE(ro6);
3076	/ No route yet, so try to acquire one /
3077	if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
3078	struct sockaddr_in6 *dst6;
3079	unsigned int ifscope;
3080
3081	dst6 = SIN6(&ro6->ro_dst);
3082	dst6->sin6_family = AF_INET6;
3083	dst6->sin6_len = sizeof(*dst6);
3084	dst6->sin6_addr = inp->in6p_faddr;
3085
3086	/*
3087	* If the socket was bound to an interface, then
3088	* the bound-to-interface takes precedence over
3089	* the inbound interface passed in by the caller
3090	* (if we get here as part of the output path then
3091	* input_ifscope is IFSCOPE_NONE).
3092	*/
3093	ifscope = (inp->inp_flags & INP_BOUND_IF) ?
3094	inp->inp_boundifp->if_index : input_ifscope;
3095
3096	rtalloc_scoped((struct route *)ro6, ifscope);
3097	if ((rt = ro6->ro_rt) != NULL) {
3098	RT_LOCK(rt);
3099	}
3100	}
3101	}
3102	if (rt != NULL) {
3103	RT_LOCK_ASSERT_HELD(rt);
3104	}
3105
3106	/*
3107	* Update path MTU Discovery determination
3108	* while looking up the route:
3109	* 1) we have a valid route to the destination
3110	* 2) the MTU is not locked (if it is, then discovery has been
3111	* disabled)
3112	*/
3113
3114
3115	tp = intotcpcb(inp);
3116
3117	/*
3118	* Update MTU discovery determination. Don't do it if:
3119	* 1) it is disabled via the sysctl
3120	* 2) the route isn't up
3121	* 3) the MTU is locked (if it is, then discovery has been
3122	* disabled)
3123	*/
3124
3125	if (!path_mtu_discovery \|\| ((rt != NULL) &&
3126	(!(rt->rt_flags & RTF_UP) \|\| (rt->rt_rmx.rmx_locks & RTV_MTU)))) {
3127	tp->t_flags &= ~TF_PMTUD;
3128	} else {
3129	tp->t_flags \|= TF_PMTUD;
3130	}
3131
3132	if (rt != NULL && rt->rt_ifp != NULL) {
3133	somultipages(inp->inp_socket,
3134	(rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
3135	tcp_set_tso(tp, ifp: rt->rt_ifp);
3136	soif2kcl(inp->inp_socket,
3137	(rt->rt_ifp->if_eflags & IFEF_2KCL));
3138	tcp_set_ecn(tp, ifp: rt->rt_ifp);
3139	if (inp->inp_last_outifp == NULL) {
3140	inp->inp_last_outifp = rt->rt_ifp;
3141	#if SKYWALK
3142	if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
3143	netns_set_ifnet(token: &inp->inp_netns_token,
3144	ifp: inp->inp_last_outifp);
3145	}
3146	#endif /* SKYWALK */
3147	}
3148
3149	/ Note if the peer is local /
3150	if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
3151	(IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) \|\|
3152	IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) \|\|
3153	rt->rt_gateway->sa_family == AF_LINK \|\|
3154	in6_localaddr(&inp->in6p_faddr))) {
3155	tp->t_flags \|= TF_LOCAL;
3156	}
3157	}
3158
3159	/*
3160	* Caller needs to call RT_UNLOCK(rt).
3161	*/
3162	return rt;
3163	}
3164
#if IPSEC
/*
 * Compute the ESP/AH header size for TCP, including the outer IP header.
 *
 * A scratch mbuf is filled with the connection's IP + TCP header template
 * and handed to ipsec6_hdrsiz() or ipsec4_hdrsiz() depending on the
 * address family of the pcb; the mbuf is freed before returning.
 *
 * Returns 0 when tp is NULL, when it has no inpcb, or when no mbuf could
 * be allocated.
 */
size_t
ipsec_hdrsiz_tcp(struct tcpcb *tp)
{
	struct inpcb *inp;
	struct mbuf *m;
	size_t hdrsiz;

	if (tp == NULL) {
		return 0;
	}
	inp = tp->t_inpcb;
	if (inp == NULL) {
		return 0;
	}

	MGETHDR(m, M_DONTWAIT, MT_DATA);        /* MAC-OK */
	if (m == NULL) {
		return 0;
	}

	if ((inp->inp_vflag & INP_IPV6) == 0) {
		struct ip *ip = mtod(m, struct ip *);
		struct tcphdr *th = (struct tcphdr *)(ip + 1);

		m->m_len = sizeof(struct tcpiphdr);
		m->m_pkthdr.len = m->m_len;
		tcp_fillheaders(m, tp, ip, th);
		hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	} else {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
		struct tcphdr *th = (struct tcphdr *)(void *)(ip6 + 1);

		m->m_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
		m->m_pkthdr.len = m->m_len;
		tcp_fillheaders(m, tp, ip6, th);
		hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
	}

	m_free(m);
	return hdrsiz;
}
#endif /* IPSEC */
3203
3204	int
3205	tcp_lock(struct socket so, int* refcount, void *lr)
3206	{
3207	void *lr_saved;
3208
3209	if (lr == NULL) {
3210	lr_saved = __builtin_return_address(`0`);
3211	} else {
3212	lr_saved = lr;
3213	}
3214
3215	retry:
3216	if (so->so_pcb != NULL) {
3217	if (so->so_flags & SOF_MP_SUBFLOW) {
3218	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3219	struct socket *mp_so = mptetoso(mpte: mp_tp->mpt_mpte);
3220
3221	socket_lock(so: mp_so, refcount);
3222
3223	/*
3224	* Check if we became non-MPTCP while waiting for the lock.
3225	* If yes, we have to retry to grab the right lock.
3226	*/
3227	if (!(so->so_flags & SOF_MP_SUBFLOW)) {
3228	socket_unlock(so: mp_so, refcount);
3229	goto retry;
3230	}
3231	} else {
3232	lck_mtx_lock(lck: &((struct inpcb *)so->so_pcb)->inpcb_mtx);
3233
3234	if (so->so_flags & SOF_MP_SUBFLOW) {
3235	/*
3236	* While waiting for the lock, we might have
3237	* become MPTCP-enabled (see mptcp_subflow_socreate).
3238	*/
3239	lck_mtx_unlock(lck: &((struct inpcb *)so->so_pcb)->inpcb_mtx);
3240	goto retry;
3241	}
3242	}
3243	} else {
3244	panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s",
3245	so, lr_saved, solockhistory_nr(so));
3246	/ NOTREACHED /
3247	}
3248
3249	if (so->so_usecount < `0`) {
3250	panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s",
3251	so, so->so_pcb, lr_saved, so->so_usecount,
3252	solockhistory_nr(so));
3253	/ NOTREACHED /
3254	}
3255	if (refcount) {
3256	so->so_usecount++;
3257	}
3258	so->lock_lr[so->next_lock_lr] = lr_saved;
3259	so->next_lock_lr = (so->next_lock_lr + `1`) % SO_LCKDBG_MAX;
3260	return `0`;
3261	}
3262
3263	int
3264	tcp_unlock(struct socket so, int* refcount, void *lr)
3265	{
3266	void *lr_saved;
3267
3268	if (lr == NULL) {
3269	lr_saved = __builtin_return_address(`0`);
3270	} else {
3271	lr_saved = lr;
3272	}
3273
3274	#ifdef MORE_TCPLOCK_DEBUG
3275	printf("tcp_unlock: so=0x%llx sopcb=0x%llx lock=0x%llx ref=%x "
3276	"lr=0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so),
3277	(uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb),
3278	(uint64_t)VM_KERNEL_ADDRPERM(&(sotoinpcb(so)->inpcb_mtx)),
3279	so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved));
3280	#endif
3281	if (refcount) {
3282	so->so_usecount--;
3283	}
3284
3285	if (so->so_usecount < `0`) {
3286	panic("tcp_unlock: so=%p usecount=%x lrh= %s",
3287	so, so->so_usecount, solockhistory_nr(so));
3288	/ NOTREACHED /
3289	}
3290	if (so->so_pcb == NULL) {
3291	panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s",
3292	so, so->so_usecount, lr_saved, solockhistory_nr(so));
3293	/ NOTREACHED /
3294	} else {
3295	so->unlock_lr[so->next_unlock_lr] = lr_saved;
3296	so->next_unlock_lr = (so->next_unlock_lr + `1`) % SO_LCKDBG_MAX;
3297
3298	if (so->so_flags & SOF_MP_SUBFLOW) {
3299	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3300	struct socket *mp_so = mptetoso(mpte: mp_tp->mpt_mpte);
3301
3302	socket_lock_assert_owned(so: mp_so);
3303
3304	socket_unlock(so: mp_so, refcount);
3305	} else {
3306	LCK_MTX_ASSERT(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3307	LCK_MTX_ASSERT_OWNED);
3308	lck_mtx_unlock(lck: &((struct inpcb *)so->so_pcb)->inpcb_mtx);
3309	}
3310	}
3311	return `0`;
3312	}
3313
3314	lck_mtx_t *
3315	tcp_getlock(struct socket so, int* flags)
3316	{
3317	struct inpcb *inp = sotoinpcb(so);
3318
3319	if (so->so_pcb) {
3320	if (so->so_usecount < `0`) {
3321	panic("tcp_getlock: so=%p usecount=%x lrh= %s",
3322	so, so->so_usecount, solockhistory_nr(so));
3323	}
3324
3325	if (so->so_flags & SOF_MP_SUBFLOW) {
3326	struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3327	struct socket *mp_so = mptetoso(mpte: mp_tp->mpt_mpte);
3328
3329	return mp_so->so_proto->pr_getlock(mp_so, flags);
3330	} else {
3331	return &inp->inpcb_mtx;
3332	}
3333	} else {
3334	panic("tcp_getlock: so=%p NULL so_pcb %s",
3335	so, solockhistory_nr(so));
3336	return so->so_proto->pr_domain->dom_mtx;
3337	}
3338	}
3339
3340	/*
3341	* Determine if we can grow the recieve socket buffer to avoid sending
3342	* a zero window update to the peer. We allow even socket buffers that
3343	* have fixed size (set by the application) to grow if the resource
3344	* constraints are met. They will also be trimmed after the application
3345	* reads data.
3346	*/
3347	static void
3348	tcp_sbrcv_grow_rwin(struct tcpcb tp, struct* sockbuf *sb)
3349	{
3350	u_int32_t rcvbufinc = tp->t_maxseg << `4`;
3351	u_int32_t rcvbuf = sb->sb_hiwat;
3352	struct socket *so = tp->t_inpcb->inp_socket;
3353
3354	if (tcp_recv_bg == `1` \|\| IS_TCP_RECV_BG(so)) {
3355	return;
3356	}
3357
3358	if (tcp_do_autorcvbuf == `1` &&
3359	(tp->t_flags & TF_SLOWLINK) == `0` &&
3360	(so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == `0` &&
3361	(rcvbuf - sb->sb_cc) < rcvbufinc &&
3362	rcvbuf < tcp_autorcvbuf_max &&
3363	(sb->sb_idealsize > `0` &&
3364	sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
3365	sbreserve(sb,
3366	cc: min(a: (sb->sb_hiwat + rcvbufinc), b: tcp_autorcvbuf_max));
3367	}
3368	}
3369
3370	int32_t
3371	tcp_sbspace(struct tcpcb *tp)
3372	{
3373	struct socket *so = tp->t_inpcb->inp_socket;
3374	struct sockbuf *sb = &so->so_rcv;
3375	u_int32_t rcvbuf;
3376	int32_t space;
3377	int32_t pending = `0`;
3378
3379	if (so->so_flags & SOF_MP_SUBFLOW) {
3380	/ We still need to grow TCP's buffer to have a BDP-estimate /
3381	tcp_sbrcv_grow_rwin(tp, sb);
3382
3383	return mptcp_sbspace(tptomptp(tp));
3384	}
3385
3386	tcp_sbrcv_grow_rwin(tp, sb);
3387
3388	/ hiwat might have changed /
3389	rcvbuf = sb->sb_hiwat;
3390
3391	space = ((int32_t) imin(a: (rcvbuf - sb->sb_cc),
3392	b: (sb->sb_mbmax - sb->sb_mbcnt)));
3393	if (space < `0`) {
3394	space = `0`;
3395	}
3396
3397	#if CONTENT_FILTER
3398	/ Compensate for data being processed by content filters /
3399	pending = cfil_sock_data_space(sb);
3400	#endif /* CONTENT_FILTER */
3401	if (pending > space) {
3402	space = `0`;
3403	} else {
3404	space -= pending;
3405	}
3406
3407	/*
3408	* Avoid increasing window size if the current window
3409	* is already very low, we could be in "persist" mode and
3410	* we could break some apps (see rdar://5409343)
3411	*/
3412
3413	if (space < tp->t_maxseg) {
3414	return space;
3415	}
3416
3417	/ Clip window size for slower link /
3418
3419	if (((tp->t_flags & TF_SLOWLINK) != `0`) && slowlink_wsize > `0`) {
3420	return imin(a: space, b: slowlink_wsize);
3421	}
3422
3423	return space;
3424	}
3425	/*
3426	* Checks TCP Segment Offloading capability for a given connection
3427	* and interface pair.
3428	*/
3429	void
3430	tcp_set_tso(struct tcpcb tp, struct* ifnet *ifp)
3431	{
3432	struct inpcb *inp;
3433	int isipv6;
3434	struct ifnet *tunnel_ifp = NULL;
3435	#define IFNET_TSO_MASK (IFNET_TSO_IPV6 \| IFNET_TSO_IPV4)
3436
3437	tp->t_flags &= ~TF_TSO;
3438
3439	/*
3440	* Bail if there's a non-TSO-capable filter on the interface.
3441	*/
3442	if (ifp == NULL \|\| ifp->if_flt_no_tso_count > `0`) {
3443	return;
3444	}
3445
3446	inp = tp->t_inpcb;
3447	isipv6 = (inp->inp_vflag & INP_IPV6) != `0`;
3448
3449	#if MPTCP
3450	/*
3451	* We can't use TSO if this tcpcb belongs to an MPTCP session.
3452	*/
3453	if (inp->inp_socket->so_flags & SOF_MP_SUBFLOW) {
3454	return;
3455	}
3456	#endif
3457	/*
3458	* We can't use TSO if the TSO capability of the tunnel interface does
3459	* not match the capability of another interface known by TCP
3460	*/
3461	if (inp->inp_policyresult.results.result == NECP_KERNEL_POLICY_RESULT_IP_TUNNEL) {
3462	u_int tunnel_if_index = inp->inp_policyresult.results.result_parameter.tunnel_interface_index;
3463
3464	if (tunnel_if_index != `0`) {
3465	ifnet_head_lock_shared();
3466	tunnel_ifp = ifindex2ifnet[tunnel_if_index];
3467	ifnet_head_done();
3468	}
3469
3470	if (tunnel_ifp == NULL) {
3471	return;
3472	}
3473
3474	if ((ifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3475	if (tso_debug > `0`) {
3476	os_log(OS_LOG_DEFAULT,
3477	"%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with ifp %s",
3478	__func__,
3479	ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3480	tunnel_ifp->if_xname, ifp->if_xname);
3481	}
3482	return;
3483	}
3484	if (inp->inp_last_outifp != NULL &&
3485	(inp->inp_last_outifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3486	if (tso_debug > `0`) {
3487	os_log(OS_LOG_DEFAULT,
3488	"%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_last_outifp %s",
3489	__func__,
3490	ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3491	tunnel_ifp->if_xname, inp->inp_last_outifp->if_xname);
3492	}
3493	return;
3494	}
3495	if ((inp->inp_flags & INP_BOUND_IF) && inp->inp_boundifp != NULL &&
3496	(inp->inp_boundifp->if_hwassist & IFNET_TSO_MASK) != (tunnel_ifp->if_hwassist & IFNET_TSO_MASK)) {
3497	if (tso_debug > `0`) {
3498	os_log(OS_LOG_DEFAULT,
3499	"%s: %u > %u TSO 0 tunnel_ifp %s hwassist mismatch with inp_boundifp %s",
3500	__func__,
3501	ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
3502	tunnel_ifp->if_xname, inp->inp_boundifp->if_xname);
3503	}
3504	return;
3505	}
3506	}
3507
3508	if (isipv6) {
3509	if (ifp->if_hwassist & IFNET_TSO_IPV6) {
3510	tp->t_flags \|= TF_TSO;
3511	if (ifp->if_tso_v6_mtu != `0`) {
3512	tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
3513	} else {
3514	tp->tso_max_segment_size = TCP_MAXWIN;
3515	}
3516	}
3517	} else {
3518	if (ifp->if_hwassist & IFNET_TSO_IPV4) {
3519	tp->t_flags \|= TF_TSO;
3520	if (ifp->if_tso_v4_mtu != `0`) {
3521	tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
3522	} else {
3523	tp->tso_max_segment_size = TCP_MAXWIN;
3524	}
3525	if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
3526	tp->tso_max_segment_size -=
3527	CLAT46_HDR_EXPANSION_OVERHD;
3528	}
3529	}
3530	}
3531
3532	if (tso_debug > `1`) {
3533	os_log(OS_LOG_DEFAULT, "%s: %u > %u TSO %d ifp %s",
3534	__func__,
3535	ntohs(tp->t_inpcb->inp_lport),
3536	ntohs(tp->t_inpcb->inp_fport),
3537	(tp->t_flags & TF_TSO) != `0`,
3538	ifp != NULL ? ifp->if_xname : "<NULL>");
3539	}
3540	}
3541
3542	#define TIMEVAL_TO_TCPHZ(_tv_) ((uint32_t)((_tv_).tv_sec * TCP_RETRANSHZ + \
3543	(_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC))
3544
3545	/*
3546	* Function to calculate the tcp clock. The tcp clock will get updated
3547	* at the boundaries of the tcp layer. This is done at 3 places:
3548	* 1. Right before processing an input tcp packet
3549	* 2. Whenever a connection wants to access the network using tcp_usrreqs
3550	* 3. When a tcp timer fires or before tcp slow timeout
3551	*
3552	*/
3553
3554	void
3555	calculate_tcp_clock(void)
3556	{
3557	struct timeval tv = tcp_uptime;
3558	struct timeval interval = {.tv_sec = `0`, .tv_usec = TCP_RETRANSHZ_TO_USEC};
3559	struct timeval now, hold_now;
3560	uint32_t incr = `0`;
3561
3562	microuptime(tv: &now);
3563
3564	/*
3565	* Update coarse-grained networking timestamp (in sec.); the idea
3566	* is to update the counter returnable via net_uptime() when
3567	* we read time.
3568	*/
3569	net_update_uptime_with_time(&now);
3570
3571	timevaladd(t1: &tv, t2: &interval);
3572	if (timevalcmp(&now, &tv, >)) {
3573	/ time to update the clock /
3574	lck_spin_lock(lck: &tcp_uptime_lock);
3575	if (timevalcmp(&tcp_uptime, &now, >=)) {
3576	/ clock got updated while waiting for the lock /
3577	lck_spin_unlock(lck: &tcp_uptime_lock);
3578	return;
3579	}
3580
3581	microuptime(tv: &now);
3582	hold_now = now;
3583	tv = tcp_uptime;
3584	timevalsub(t1: &now, t2: &tv);
3585
3586	incr = TIMEVAL_TO_TCPHZ(now);
3587
3588	/ Account for the previous remainder /
3589	uint32_t remaining_us = (now.tv_usec % TCP_RETRANSHZ_TO_USEC) +
3590	tcp_now_remainder_us;
3591	if (remaining_us >= TCP_RETRANSHZ_TO_USEC) {
3592	incr += (remaining_us / TCP_RETRANSHZ_TO_USEC);
3593	}
3594
3595	if (incr > `0`) {
3596	tcp_uptime = hold_now;
3597	tcp_now_remainder_us = remaining_us % TCP_RETRANSHZ_TO_USEC;
3598	tcp_now += incr;
3599	}
3600
3601	lck_spin_unlock(lck: &tcp_uptime_lock);
3602	}
3603	}
3604
3605	/*
3606	* Compute receive window scaling that we are going to request
3607	* for this connection based on sb_hiwat. Try to leave some
3608	* room to potentially increase the window size upto a maximum
3609	* defined by the constant tcp_autorcvbuf_max.
3610	*/
3611	void
3612	tcp_set_max_rwinscale(struct tcpcb tp, struct* socket *so)
3613	{
3614	uint32_t maxsockbufsize;
3615
3616	tp->request_r_scale = MAX((uint8_t)tcp_win_scale, tp->request_r_scale);
3617	maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != `0`) ?
3618	so->so_rcv.sb_hiwat : tcp_autorcvbuf_max;
3619
3620	/*
3621	* Window scale should not exceed what is needed
3622	* to send the max receive window size; adding 1 to TCP_MAXWIN
3623	* ensures that.
3624	*/
3625	while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
3626	((TCP_MAXWIN + `1`) << tp->request_r_scale) < maxsockbufsize) {
3627	tp->request_r_scale++;
3628	}
3629	tp->request_r_scale = MIN(tp->request_r_scale, TCP_MAX_WINSHIFT);
3630	}
3631
3632	int
3633	tcp_notsent_lowat_check(struct socket *so)
3634	{
3635	struct inpcb *inp = sotoinpcb(so);
3636	struct tcpcb *tp = NULL;
3637	int notsent = `0`;
3638
3639	if (inp != NULL) {
3640	tp = intotcpcb(inp);
3641	}
3642
3643	if (tp == NULL) {
3644	return `0`;
3645	}
3646
3647	notsent = so->so_snd.sb_cc -
3648	(tp->snd_nxt - tp->snd_una);
3649
3650	/*
3651	* When we send a FIN or SYN, not_sent can be negative.
3652	* In that case also we need to send a write event to the
3653	* process if it is waiting. In the FIN case, it will
3654	* get an error from send because cantsendmore will be set.
3655	*/
3656	if (notsent <= tp->t_notsent_lowat) {
3657	return `1`;
3658	}
3659
3660	/*
3661	* When Nagle's algorithm is not disabled, it is better
3662	* to wakeup the client until there is atleast one
3663	* maxseg of data to write.
3664	*/
3665	if ((tp->t_flags & TF_NODELAY) == `0` &&
3666	notsent > `0` && notsent < tp->t_maxseg) {
3667	return `1`;
3668	}
3669	return `0`;
3670	}
3671
3672	void
3673	tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3674	{
3675	struct tcp_rxt_seg rxseg = NULL, prev = NULL, *next = NULL;
3676	uint16_t rxcount = `0`;
3677
3678	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3679	tp->t_dsack_lastuna = tp->snd_una;
3680	}
3681	/*
3682	* First check if there is a segment already existing for this
3683	* sequence space.
3684	*/
3685
3686	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3687	if (SEQ_GT(rxseg->rx_start, start)) {
3688	break;
3689	}
3690	prev = rxseg;
3691	}
3692	next = rxseg;
3693
3694	/ check if prev seg is for this sequence /
3695	if (prev != NULL && SEQ_LEQ(prev->rx_start, start) &&
3696	SEQ_GEQ(prev->rx_end, end)) {
3697	prev->rx_count++;
3698	return;
3699	}
3700
3701	/*
3702	* There are a couple of possibilities at this point.
3703	* 1. prev overlaps with the beginning of this sequence
3704	* 2. next overlaps with the end of this sequence
3705	* 3. there is no overlap.
3706	*/
3707
3708	if (prev != NULL && SEQ_GT(prev->rx_end, start)) {
3709	if (prev->rx_start == start && SEQ_GT(end, prev->rx_end)) {
3710	start = prev->rx_end + `1`;
3711	prev->rx_count++;
3712	} else {
3713	prev->rx_end = (start - `1`);
3714	rxcount = prev->rx_count;
3715	}
3716	}
3717
3718	if (next != NULL && SEQ_LT(next->rx_start, end)) {
3719	if (SEQ_LEQ(next->rx_end, end)) {
3720	end = next->rx_start - `1`;
3721	next->rx_count++;
3722	} else {
3723	next->rx_start = end + `1`;
3724	rxcount = next->rx_count;
3725	}
3726	}
3727	if (!SEQ_LT(start, end)) {
3728	return;
3729	}
3730
3731	if (tcp_rxt_seg_max > `0` && tp->t_rxt_seg_count >= tcp_rxt_seg_max) {
3732	rxseg = SLIST_FIRST(&tp->t_rxt_segments);
3733	if (prev == rxseg) {
3734	prev = NULL;
3735	}
3736	SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3737	tcp_rxt_seg, rx_link);
3738
3739	tcp_rxt_seg_drop++;
3740	tp->t_rxt_seg_drop++;
3741	TCP_LOG(tp, "removed rxseg list overflow %u:%u ",
3742	rxseg->rx_start, rxseg->rx_end);
3743	zfree(tcp_rxt_seg_zone, rxseg);
3744
3745	tp->t_rxt_seg_count -= `1`;
3746	}
3747
3748	rxseg = zalloc_flags(tcp_rxt_seg_zone, Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
3749	rxseg->rx_start = start;
3750	rxseg->rx_end = end;
3751	rxseg->rx_count = rxcount + `1`;
3752
3753	if (prev != NULL) {
3754	SLIST_INSERT_AFTER(prev, rxseg, rx_link);
3755	} else {
3756	SLIST_INSERT_HEAD(&tp->t_rxt_segments, rxseg, rx_link);
3757	}
3758	tp->t_rxt_seg_count += `1`;
3759	}
3760
3761	struct tcp_rxt_seg *
3762	tcp_rxtseg_find(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3763	{
3764	struct tcp_rxt_seg *rxseg;
3765
3766	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3767	return NULL;
3768	}
3769
3770	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3771	if (SEQ_LEQ(rxseg->rx_start, start) &&
3772	SEQ_GEQ(rxseg->rx_end, end)) {
3773	return rxseg;
3774	}
3775	if (SEQ_GT(rxseg->rx_start, start)) {
3776	break;
3777	}
3778	}
3779	return NULL;
3780	}
3781
3782	void
3783	tcp_rxtseg_set_spurious(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3784	{
3785	struct tcp_rxt_seg *rxseg;
3786
3787	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3788	return;
3789	}
3790
3791	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3792	if (SEQ_GEQ(rxseg->rx_start, start) &&
3793	SEQ_LEQ(rxseg->rx_end, end)) {
3794	/*
3795	* If the segment was retransmitted only once, mark it as
3796	* spurious.
3797	*/
3798	if (rxseg->rx_count == `1`) {
3799	rxseg->rx_flags \|= TCP_RXT_SPURIOUS;
3800	}
3801	}
3802
3803	if (SEQ_GEQ(rxseg->rx_start, end)) {
3804	break;
3805	}
3806	}
3807	return;
3808	}
3809
3810	void
3811	tcp_rxtseg_clean(struct tcpcb *tp)
3812	{
3813	struct tcp_rxt_seg rxseg, next;
3814
3815	SLIST_FOREACH_SAFE(rxseg, &tp->t_rxt_segments, rx_link, next) {
3816	SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3817	tcp_rxt_seg, rx_link);
3818	zfree(tcp_rxt_seg_zone, rxseg);
3819	}
3820	tp->t_rxt_seg_count = `0`;
3821	tp->t_dsack_lastuna = tp->snd_max;
3822	}
3823
3824	boolean_t
3825	tcp_rxtseg_detect_bad_rexmt(struct tcpcb *tp, tcp_seq th_ack)
3826	{
3827	boolean_t bad_rexmt;
3828	struct tcp_rxt_seg *rxseg;
3829
3830	if (SLIST_EMPTY(&tp->t_rxt_segments)) {
3831	return FALSE;
3832	}
3833
3834	/*
3835	* If all of the segments in this window are not cumulatively
3836	* acknowledged, then there can still be undetected packet loss.
3837	* Do not restore congestion window in that case.
3838	*/
3839	if (SEQ_LT(th_ack, tp->snd_recover)) {
3840	return FALSE;
3841	}
3842
3843	bad_rexmt = TRUE;
3844	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3845	if (!(rxseg->rx_flags & TCP_RXT_SPURIOUS)) {
3846	bad_rexmt = FALSE;
3847	break;
3848	}
3849	}
3850	return bad_rexmt;
3851	}
3852
3853	u_int32_t
3854	tcp_rxtseg_total_size(struct tcpcb *tp)
3855	{
3856	struct tcp_rxt_seg *rxseg;
3857	u_int32_t total_size = `0`;
3858
3859	SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3860	total_size += (rxseg->rx_end - rxseg->rx_start) + `1`;
3861	}
3862	return total_size;
3863	}
3864
3865	void
3866	tcp_get_connectivity_status(struct tcpcb *tp,
3867	struct tcp_conn_status *connstatus)
3868	{
3869	if (tp == NULL \|\| connstatus == NULL) {
3870	return;
3871	}
3872	bzero(s: connstatus, n: sizeof(*connstatus));
3873	if (tp->t_rxtshift >= TCP_CONNECTIVITY_PROBES_MAX) {
3874	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
3875	connstatus->write_probe_failed = `1`;
3876	} else {
3877	connstatus->conn_probe_failed = `1`;
3878	}
3879	}
3880	if (tp->t_rtimo_probes >= TCP_CONNECTIVITY_PROBES_MAX) {
3881	connstatus->read_probe_failed = `1`;
3882	}
3883	if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL &&
3884	(tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)) {
3885	connstatus->probe_activated = `1`;
3886	}
3887	}
3888
3889	boolean_t
3890	tfo_enabled(const struct tcpcb *tp)
3891	{
3892	return (tp->t_flagsext & TF_FASTOPEN)? TRUE : FALSE;
3893	}
3894
3895	void
3896	tcp_disable_tfo(struct tcpcb *tp)
3897	{
3898	tp->t_flagsext &= ~TF_FASTOPEN;
3899	}
3900
3901	static struct mbuf *
3902	tcp_make_keepalive_frame(struct tcpcb tp, struct* ifnet *ifp,
3903	boolean_t is_probe)
3904	{
3905	struct inpcb *inp = tp->t_inpcb;
3906	struct tcphdr *th;
3907	u_int8_t *data;
3908	int win = `0`;
3909	struct mbuf *m;
3910
3911	/*
3912	* The code assumes the IP + TCP headers fit in an mbuf packet header
3913	*/
3914	_CASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN);
3915	_CASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN);
3916
3917	MGETHDR(m, M_WAIT, MT_HEADER);
3918	if (m == NULL) {
3919	return NULL;
3920	}
3921	m->m_pkthdr.pkt_proto = IPPROTO_TCP;
3922
3923	data = mbuf_datastart(mbuf: m);
3924
3925	if (inp->inp_vflag & INP_IPV4) {
3926	bzero(s: data, n: sizeof(struct ip) + sizeof(struct tcphdr));
3927	th = (struct tcphdr )(void* ) (data + sizeof(struct* ip));
3928	m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
3929	m->m_pkthdr.len = m->m_len;
3930	} else {
3931	VERIFY(inp->inp_vflag & INP_IPV6);
3932
3933	bzero(s: data, n: sizeof(struct ip6_hdr)
3934	+ sizeof(struct tcphdr));
3935	th = (struct tcphdr )(void* )(data + sizeof(struct* ip6_hdr));
3936	m->m_len = sizeof(struct ip6_hdr) +
3937	sizeof(struct tcphdr);
3938	m->m_pkthdr.len = m->m_len;
3939	}
3940
3941	tcp_fillheaders(m, tp, ip_ptr: data, tcp_ptr: th);
3942
3943	if (inp->inp_vflag & INP_IPV4) {
3944	struct ip *ip;
3945
3946	ip = (__typeof__(ip))(void *)data;
3947
3948	ip->ip_id = rfc6864 ? `0` : ip_randomid((uint64_t)m);
3949	ip->ip_off = htons(IP_DF);
3950	ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
3951	ip->ip_ttl = inp->inp_ip_ttl;
3952	ip->ip_tos \|= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);
3953	ip->ip_sum = in_cksum_hdr(ip);
3954	} else {
3955	struct ip6_hdr *ip6;
3956
3957	ip6 = (__typeof__(ip6))(void *)data;
3958
3959	ip6->ip6_plen = htons(sizeof(struct tcphdr));
3960	ip6->ip6_hlim = in6_selecthlim(inp, ifp);
3961	ip6->ip6_flow = ip6->ip6_flow & ~IPV6_FLOW_ECN_MASK;
3962
3963	if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
3964	ip6->ip6_src.s6_addr16[`1`] = `0`;
3965	}
3966	if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
3967	ip6->ip6_dst.s6_addr16[`1`] = `0`;
3968	}
3969	}
3970	th->th_flags = TH_ACK;
3971
3972	win = tcp_sbspace(tp);
3973	if (win > ((int32_t)TCP_MAXWIN << tp->rcv_scale)) {
3974	win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
3975	}
3976	th->th_win = htons((u_short) (win >> tp->rcv_scale));
3977
3978	if (is_probe) {
3979	th->th_seq = htonl(tp->snd_una - `1`);
3980	} else {
3981	th->th_seq = htonl(tp->snd_una);
3982	}
3983	th->th_ack = htonl(tp->rcv_nxt);
3984
3985	/ Force recompute TCP checksum to be the final value /
3986	th->th_sum = `0`;
3987	if (inp->inp_vflag & INP_IPV4) {
3988	th->th_sum = inet_cksum(m, IPPROTO_TCP,
3989	sizeof(struct ip), sizeof(struct tcphdr));
3990	} else {
3991	th->th_sum = inet6_cksum(m, IPPROTO_TCP,
3992	sizeof(struct ip6_hdr), sizeof(struct tcphdr));
3993	}
3994
3995	return m;
3996	}
3997
3998	void
3999	tcp_fill_keepalive_offload_frames(ifnet_t ifp,
4000	struct ifnet_keepalive_offload_frame *frames_array,
4001	u_int32_t frames_array_count, size_t frame_data_offset,
4002	u_int32_t *used_frames_count)
4003	{
4004	struct inpcb *inp;
4005	inp_gen_t gencnt;
4006	u_int32_t frame_index = *used_frames_count;
4007
4008	if (ifp == NULL \|\| frames_array == NULL \|\|
4009	frames_array_count == `0` \|\|
4010	frame_index >= frames_array_count \|\|
4011	frame_data_offset >= IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4012	return;
4013	}
4014
4015	/*
4016	* This function is called outside the regular TCP processing
4017	* so we need to update the TCP clock.
4018	*/
4019	calculate_tcp_clock();
4020
4021	lck_rw_lock_shared(lck: &tcbinfo.ipi_lock);
4022	gencnt = tcbinfo.ipi_gencnt;
4023	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
4024	struct socket *so;
4025	struct ifnet_keepalive_offload_frame *frame;
4026	struct mbuf *m = NULL;
4027	struct tcpcb *tp = intotcpcb(inp);
4028
4029	if (frame_index >= frames_array_count) {
4030	break;
4031	}
4032
4033	if (inp->inp_gencnt > gencnt \|\|
4034	inp->inp_state == INPCB_STATE_DEAD) {
4035	continue;
4036	}
4037
4038	if ((so = inp->inp_socket) == NULL \|\|
4039	(so->so_state & SS_DEFUNCT)) {
4040	continue;
4041	}
4042	/*
4043	* check for keepalive offload flag without socket
4044	* lock to avoid a deadlock
4045	*/
4046	if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
4047	continue;
4048	}
4049
4050	if (!(inp->inp_vflag & (INP_IPV4 \| INP_IPV6))) {
4051	continue;
4052	}
4053	if (inp->inp_ppcb == NULL \|\|
4054	in_pcb_checkstate(inp, WNT_ACQUIRE, `0`) == WNT_STOPUSING) {
4055	continue;
4056	}
4057	socket_lock(so, refcount: `1`);
4058	/ Release the want count /
4059	if (inp->inp_ppcb == NULL \|\|
4060	(in_pcb_checkstate(inp, WNT_RELEASE, `1`) == WNT_STOPUSING)) {
4061	socket_unlock(so, refcount: `1`);
4062	continue;
4063	}
4064	if ((inp->inp_vflag & INP_IPV4) &&
4065	(inp->inp_laddr.s_addr == INADDR_ANY \|\|
4066	inp->inp_faddr.s_addr == INADDR_ANY)) {
4067	socket_unlock(so, refcount: `1`);
4068	continue;
4069	}
4070	if ((inp->inp_vflag & INP_IPV6) &&
4071	(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) \|\|
4072	IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))) {
4073	socket_unlock(so, refcount: `1`);
4074	continue;
4075	}
4076	if (inp->inp_lport == `0` \|\| inp->inp_fport == `0`) {
4077	socket_unlock(so, refcount: `1`);
4078	continue;
4079	}
4080	if (inp->inp_last_outifp == NULL \|\|
4081	inp->inp_last_outifp->if_index != ifp->if_index) {
4082	socket_unlock(so, refcount: `1`);
4083	continue;
4084	}
4085	if ((inp->inp_vflag & INP_IPV4) && frame_data_offset +
4086	sizeof(struct ip) + sizeof(struct tcphdr) >
4087	IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4088	socket_unlock(so, refcount: `1`);
4089	continue;
4090	} else if (!(inp->inp_vflag & INP_IPV4) && frame_data_offset +
4091	sizeof(struct ip6_hdr) + sizeof(struct tcphdr) >
4092	IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
4093	socket_unlock(so, refcount: `1`);
4094	continue;
4095	}
4096	/*
4097	* There is no point in waking up the device for connections
4098	* that are not established. Long lived connection are meant
4099	* for processes that will sent and receive data
4100	*/
4101	if (tp->t_state != TCPS_ESTABLISHED) {
4102	socket_unlock(so, refcount: `1`);
4103	continue;
4104	}
4105	/*
4106	* This inp has all the information that is needed to
4107	* generate an offload frame.
4108	*/
4109	frame = &frames_array[frame_index];
4110	frame->type = IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP;
4111	frame->ether_type = (inp->inp_vflag & INP_IPV4) ?
4112	IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 :
4113	IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6;
4114	frame->interval = (uint16_t)(tp->t_keepidle > `0` ? tp->t_keepidle :
4115	tcp_keepidle);
4116	frame->keep_cnt = (uint8_t)TCP_CONN_KEEPCNT(tp);
4117	frame->keep_retry = (uint16_t)TCP_CONN_KEEPINTVL(tp);
4118	if (so->so_options & SO_NOWAKEFROMSLEEP) {
4119	frame->flags \|=
4120	IFNET_KEEPALIVE_OFFLOAD_FLAG_NOWAKEFROMSLEEP;
4121	}
4122	frame->local_port = ntohs(inp->inp_lport);
4123	frame->remote_port = ntohs(inp->inp_fport);
4124	frame->local_seq = tp->snd_nxt;
4125	frame->remote_seq = tp->rcv_nxt;
4126	if (inp->inp_vflag & INP_IPV4) {
4127	ASSERT(frame_data_offset + sizeof(struct ip) + sizeof(struct tcphdr) <= UINT8_MAX);
4128	frame->length = (uint8_t)(frame_data_offset +
4129	sizeof(struct ip) + sizeof(struct tcphdr));
4130	frame->reply_length = frame->length;
4131
4132	frame->addr_length = sizeof(struct in_addr);
4133	bcopy(src: &inp->inp_laddr, dst: frame->local_addr,
4134	n: sizeof(struct in_addr));
4135	bcopy(src: &inp->inp_faddr, dst: frame->remote_addr,
4136	n: sizeof(struct in_addr));
4137	} else {
4138	struct in6_addr *ip6;
4139
4140	ASSERT(frame_data_offset + sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= UINT8_MAX);
4141	frame->length = (uint8_t)(frame_data_offset +
4142	sizeof(struct ip6_hdr) + sizeof(struct tcphdr));
4143	frame->reply_length = frame->length;
4144
4145	frame->addr_length = sizeof(struct in6_addr);
4146	ip6 = (struct in6_addr )(void* *)frame->local_addr;
4147	bcopy(src: &inp->in6p_laddr, dst: ip6, n: sizeof(struct in6_addr));
4148	if (IN6_IS_SCOPE_EMBED(ip6)) {
4149	ip6->s6_addr16[`1`] = `0`;
4150	}
4151
4152	ip6 = (struct in6_addr )(void* *)frame->remote_addr;
4153	bcopy(src: &inp->in6p_faddr, dst: ip6, n: sizeof(struct in6_addr));
4154	if (IN6_IS_SCOPE_EMBED(ip6)) {
4155	ip6->s6_addr16[`1`] = `0`;
4156	}
4157	}
4158
4159	/*
4160	* First the probe
4161	*/
4162	m = tcp_make_keepalive_frame(tp, ifp, TRUE);
4163	if (m == NULL) {
4164	socket_unlock(so, refcount: `1`);
4165	continue;
4166	}
4167	bcopy(src: m_mtod_current(m), dst: frame->data + frame_data_offset, n: m->m_len);
4168	m_freem(m);
4169
4170	/*
4171	* Now the response packet to incoming probes
4172	*/
4173	m = tcp_make_keepalive_frame(tp, ifp, FALSE);
4174	if (m == NULL) {
4175	socket_unlock(so, refcount: `1`);
4176	continue;
4177	}
4178	bcopy(src: m_mtod_current(m), dst: frame->reply_data + frame_data_offset,
4179	n: m->m_len);
4180	m_freem(m);
4181
4182	frame_index++;
4183	socket_unlock(so, refcount: `1`);
4184	}
4185	lck_rw_done(lck: &tcbinfo.ipi_lock);
4186	*used_frames_count = frame_index;
4187	}
4188
4189	static bool
4190	inp_matches_kao_frame(ifnet_t ifp, struct ifnet_keepalive_offload_frame *frame,
4191	struct inpcb *inp)
4192	{
4193	if (inp->inp_ppcb == NULL) {
4194	return false;
4195	}
4196	/ Release the want count /
4197	if (in_pcb_checkstate(inp, WNT_RELEASE, `1`) == WNT_STOPUSING) {
4198	return false;
4199	}
4200	if (inp->inp_last_outifp == NULL \|\|
4201	inp->inp_last_outifp->if_index != ifp->if_index) {
4202	return false;
4203	}
4204	if (frame->local_port != ntohs(inp->inp_lport) \|\|
4205	frame->remote_port != ntohs(inp->inp_fport)) {
4206	return false;
4207	}
4208	if (inp->inp_vflag & INP_IPV4) {
4209	if (memcmp(s1: &inp->inp_laddr, s2: frame->local_addr,
4210	n: sizeof(struct in_addr)) != `0` \|\|
4211	memcmp(s1: &inp->inp_faddr, s2: frame->remote_addr,
4212	n: sizeof(struct in_addr)) != `0`) {
4213	return false;
4214	}
4215	} else if (inp->inp_vflag & INP_IPV6) {
4216	if (memcmp(s1: &inp->inp_laddr, s2: frame->local_addr,
4217	n: sizeof(struct in6_addr)) != `0` \|\|
4218	memcmp(s1: &inp->inp_faddr, s2: frame->remote_addr,
4219	n: sizeof(struct in6_addr)) != `0`) {
4220	return false;
4221	}
4222	} else {
4223	return false;
4224	}
4225	return true;
4226	}
4227
4228	int
4229	tcp_notify_kao_timeout(ifnet_t ifp,
4230	struct ifnet_keepalive_offload_frame *frame)
4231	{
4232	struct inpcb *inp = NULL;
4233	struct socket *so = NULL;
4234	bool found = false;
4235
4236	/*
4237	* Unlock the list before posting event on the matching socket
4238	*/
4239	lck_rw_lock_shared(lck: &tcbinfo.ipi_lock);
4240
4241	LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
4242	if ((so = inp->inp_socket) == NULL \|\|
4243	(so->so_state & SS_DEFUNCT)) {
4244	continue;
4245	}
4246	if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
4247	continue;
4248	}
4249	if (!(inp->inp_vflag & (INP_IPV4 \| INP_IPV6))) {
4250	continue;
4251	}
4252	if (inp->inp_ppcb == NULL \|\|
4253	in_pcb_checkstate(inp, WNT_ACQUIRE, `0`) == WNT_STOPUSING) {
4254	continue;
4255	}
4256	socket_lock(so, refcount: `1`);
4257	if (inp_matches_kao_frame(ifp, frame, inp)) {
4258	/*
4259	* Keep the matching socket locked
4260	*/
4261	found = true;
4262	break;
4263	}
4264	socket_unlock(so, refcount: `1`);
4265	}
4266	lck_rw_done(lck: &tcbinfo.ipi_lock);
4267
4268	if (found) {
4269	ASSERT(inp != NULL);
4270	ASSERT(so != NULL);
4271	ASSERT(so == inp->inp_socket);
4272	/*
4273	* Drop the TCP connection like tcptimers() does
4274	*/
4275	struct tcpcb *tp = inp->inp_ppcb;
4276
4277	tcpstat.tcps_keepdrops++;
4278	soevent(so,
4279	hint: (SO_FILT_HINT_LOCKED \| SO_FILT_HINT_TIMEOUT));
4280	tp = tcp_drop(tp, ETIMEDOUT);
4281
4282	tcpstat.tcps_ka_offload_drops++;
4283	os_log_info(OS_LOG_DEFAULT, "%s: dropped lport %u fport %u\n",
4284	__func__, frame->local_port, frame->remote_port);
4285
4286	socket_unlock(so, refcount: `1`);
4287	}
4288
4289	return `0`;
4290	}
4291
4292	errno_t
4293	tcp_notify_ack_id_valid(struct tcpcb tp, struct* socket *so,
4294	u_int32_t notify_id)
4295	{
4296	struct tcp_notify_ack_marker *elm;
4297
4298	if (so->so_snd.sb_cc == `0`) {
4299	return ENOBUFS;
4300	}
4301
4302	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
4303	/ Duplicate id is not allowed /
4304	if (elm->notify_id == notify_id) {
4305	return EINVAL;
4306	}
4307	/ Duplicate position is not allowed /
4308	if (elm->notify_snd_una == tp->snd_una + so->so_snd.sb_cc) {
4309	return EINVAL;
4310	}
4311	}
4312	return `0`;
4313	}
4314
4315	errno_t
4316	tcp_add_notify_ack_marker(struct tcpcb *tp, u_int32_t notify_id)
4317	{
4318	struct tcp_notify_ack_marker nm, elm = NULL;
4319	struct socket *so = tp->t_inpcb->inp_socket;
4320
4321	nm = kalloc_type(struct tcp_notify_ack_marker, M_WAIT \| Z_ZERO);
4322	if (nm == NULL) {
4323	return ENOMEM;
4324	}
4325	nm->notify_id = notify_id;
4326	nm->notify_snd_una = tp->snd_una + so->so_snd.sb_cc;
4327
4328	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
4329	if (SEQ_GT(nm->notify_snd_una, elm->notify_snd_una)) {
4330	break;
4331	}
4332	}
4333
4334	if (elm == NULL) {
4335	VERIFY(SLIST_EMPTY(&tp->t_notify_ack));
4336	SLIST_INSERT_HEAD(&tp->t_notify_ack, nm, notify_next);
4337	} else {
4338	SLIST_INSERT_AFTER(elm, nm, notify_next);
4339	}
4340	tp->t_notify_ack_count++;
4341	return `0`;
4342	}
4343
4344	void
4345	tcp_notify_ack_free(struct tcpcb *tp)
4346	{
4347	struct tcp_notify_ack_marker elm, next;
4348	if (SLIST_EMPTY(&tp->t_notify_ack)) {
4349	return;
4350	}
4351
4352	SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
4353	SLIST_REMOVE(&tp->t_notify_ack, elm, tcp_notify_ack_marker,
4354	notify_next);
4355	kfree_type(struct tcp_notify_ack_marker, elm);
4356	}
4357	SLIST_INIT(&tp->t_notify_ack);
4358	tp->t_notify_ack_count = `0`;
4359	}
4360
4361	inline void
4362	tcp_notify_acknowledgement(struct tcpcb tp, struct* socket *so)
4363	{
4364	struct tcp_notify_ack_marker *elm;
4365
4366	elm = SLIST_FIRST(&tp->t_notify_ack);
4367	if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4368	soevent(so, SO_FILT_HINT_LOCKED \| SO_FILT_HINT_NOTIFY_ACK);
4369	}
4370	}
4371
4372	void
4373	tcp_get_notify_ack_count(struct tcpcb *tp,
4374	struct tcp_notify_ack_complete *retid)
4375	{
4376	struct tcp_notify_ack_marker *elm;
4377	uint32_t complete = `0`;
4378
4379	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
4380	if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4381	ASSERT(complete < UINT32_MAX);
4382	complete++;
4383	} else {
4384	break;
4385	}
4386	}
4387	retid->notify_pending = tp->t_notify_ack_count - complete;
4388	retid->notify_complete_count = min(TCP_MAX_NOTIFY_ACK, b: complete);
4389	}
4390
4391	void
4392	tcp_get_notify_ack_ids(struct tcpcb *tp,
4393	struct tcp_notify_ack_complete *retid)
4394	{
4395	size_t i = `0`;
4396	struct tcp_notify_ack_marker elm, next;
4397
4398	SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
4399	if (i >= retid->notify_complete_count) {
4400	break;
4401	}
4402	if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4403	retid->notify_complete_id[i++] = elm->notify_id;
4404	SLIST_REMOVE(&tp->t_notify_ack, elm,
4405	tcp_notify_ack_marker, notify_next);
4406	kfree_type(struct tcp_notify_ack_marker, elm);
4407	tp->t_notify_ack_count--;
4408	} else {
4409	break;
4410	}
4411	}
4412	}
4413
4414	bool
4415	tcp_notify_ack_active(struct socket *so)
4416	{
4417	if ((SOCK_DOM(so) == PF_INET \|\| SOCK_DOM(so) == PF_INET6) &&
4418	SOCK_TYPE(so) == SOCK_STREAM) {
4419	struct tcpcb *tp = intotcpcb(sotoinpcb(so));
4420
4421	if (!SLIST_EMPTY(&tp->t_notify_ack)) {
4422	struct tcp_notify_ack_marker *elm;
4423	elm = SLIST_FIRST(&tp->t_notify_ack);
4424	if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
4425	return true;
4426	}
4427	}
4428	}
4429	return false;
4430	}
4431
4432	inline int32_t
4433	inp_get_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
4434	{
4435	struct inpcb *inp = sotoinpcb(so);
4436	struct tcpcb *tp = intotcpcb(inp);
4437
4438	if ((so->so_snd.sb_flags & SB_SNDBYTE_CNT) &&
4439	so->so_snd.sb_cc > `0`) {
4440	int32_t unsent, sent;
4441	sent = tp->snd_max - th_ack;
4442	if (tp->t_flags & TF_SENTFIN) {
4443	sent--;
4444	}
4445	unsent = so->so_snd.sb_cc - sent;
4446	return unsent;
4447	}
4448	return `0`;
4449	}
4450
4451	uint8_t
4452	tcp_get_ace(struct tcphdr *th)
4453	{
4454	uint8_t ace = `0`;
4455	if (th->th_flags & TH_ECE) {
4456	ace += `1`;
4457	}
4458	if (th->th_flags & TH_CWR) {
4459	ace += `2`;
4460	}
4461	if (th->th_x2 & (TH_AE >> `8`)) {
4462	ace += `4`;
4463	}
4464
4465	return ace;
4466	}
4467
/*
 * Increment the per-flow counter `_stat_' in either the IPv4 or the IPv6
 * statistics of the interface. The interface is taken from a variable
 * named `ifp' that must be in scope at the point of use.
 *
 * Wrapped in do { } while (0) so that the macro followed by ';' forms a
 * single statement and stays correct inside an unbraced if/else.
 */
#define IFP_PER_FLOW_STAT(_ipv4_, _stat_) do { \
	if (_ipv4_) { \
	        ifp->if_ipv4_stat->_stat_++; \
	} else { \
	        ifp->if_ipv6_stat->_stat_++; \
	} \
} while (0)
4475
/*
 * True when every bit of TE_ECN_ON is set in `_flags_'. The argument is
 * parenthesised so that an expression such as (a | b) is masked as a
 * whole.
 */
#define FLOW_ECN_ENABLED(_flags_) \
	(((_flags_) & (TE_ECN_ON)) == (TE_ECN_ON))
4478
4479	void
4480	tcp_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
4481	struct ifnet *ifp)
4482	{
4483	if (ifp == NULL \|\| !IF_FULLY_ATTACHED(ifp)) {
4484	return;
4485	}
4486
4487	ifnet_lock_shared(ifp);
4488	if (ifs->ecn_flags & TE_SETUPSENT) {
4489	if (ifs->ecn_flags & TE_CLIENT_SETUP) {
4490	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_client_setup);
4491	if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4492	IFP_PER_FLOW_STAT(ifs->ipv4,
4493	ecn_client_success);
4494	} else if (ifs->ecn_flags & TE_LOST_SYN) {
4495	IFP_PER_FLOW_STAT(ifs->ipv4,
4496	ecn_syn_lost);
4497	} else {
4498	IFP_PER_FLOW_STAT(ifs->ipv4,
4499	ecn_peer_nosupport);
4500	}
4501	} else {
4502	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_server_setup);
4503	if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4504	IFP_PER_FLOW_STAT(ifs->ipv4,
4505	ecn_server_success);
4506	} else if (ifs->ecn_flags & TE_LOST_SYN) {
4507	IFP_PER_FLOW_STAT(ifs->ipv4,
4508	ecn_synack_lost);
4509	} else {
4510	IFP_PER_FLOW_STAT(ifs->ipv4,
4511	ecn_peer_nosupport);
4512	}
4513	}
4514	} else {
4515	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off_conn);
4516	}
4517	if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4518	if (ifs->ecn_flags & TE_RECV_ECN_CE) {
4519	tcpstat.tcps_ecn_conn_recv_ce++;
4520	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ce);
4521	}
4522	if (ifs->ecn_flags & TE_RECV_ECN_ECE) {
4523	tcpstat.tcps_ecn_conn_recv_ece++;
4524	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ece);
4525	}
4526	if (ifs->ecn_flags & (TE_RECV_ECN_CE \| TE_RECV_ECN_ECE)) {
4527	if (ifs->txretransmitbytes > `0` \|\|
4528	ifs->rxoutoforderbytes > `0`) {
4529	tcpstat.tcps_ecn_conn_pl_ce++;
4530	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plce);
4531	} else {
4532	tcpstat.tcps_ecn_conn_nopl_ce++;
4533	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_noplce);
4534	}
4535	} else {
4536	if (ifs->txretransmitbytes > `0` \|\|
4537	ifs->rxoutoforderbytes > `0`) {
4538	tcpstat.tcps_ecn_conn_plnoce++;
4539	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plnoce);
4540	}
4541	}
4542	}
4543
4544	/ Other stats are interesting for non-local connections only /
4545	if (ifs->local) {
4546	ifnet_lock_done(ifp);
4547	return;
4548	}
4549
4550	if (ifs->ipv4) {
4551	ifp->if_ipv4_stat->timestamp = net_uptime();
4552	if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4553	tcp_flow_ecn_perf_stats(ifs, stat: &ifp->if_ipv4_stat->ecn_on);
4554	} else {
4555	tcp_flow_ecn_perf_stats(ifs, stat: &ifp->if_ipv4_stat->ecn_off);
4556	}
4557	} else {
4558	ifp->if_ipv6_stat->timestamp = net_uptime();
4559	if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4560	tcp_flow_ecn_perf_stats(ifs, stat: &ifp->if_ipv6_stat->ecn_on);
4561	} else {
4562	tcp_flow_ecn_perf_stats(ifs, stat: &ifp->if_ipv6_stat->ecn_off);
4563	}
4564	}
4565
4566	if (ifs->rxmit_drop) {
4567	if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
4568	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_on.rxmit_drop);
4569	} else {
4570	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off.rxmit_drop);
4571	}
4572	}
4573	if (ifs->ecn_fallback_synloss) {
4574	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_synloss);
4575	}
4576	if (ifs->ecn_fallback_droprst) {
4577	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprst);
4578	}
4579	if (ifs->ecn_fallback_droprxmt) {
4580	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprxmt);
4581	}
4582	if (ifs->ecn_fallback_ce) {
4583	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_ce);
4584	}
4585	if (ifs->ecn_fallback_reorder) {
4586	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_reorder);
4587	}
4588	if (ifs->ecn_recv_ce > `0`) {
4589	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ce);
4590	}
4591	if (ifs->ecn_recv_ece > `0`) {
4592	IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ece);
4593	}
4594
4595	tcp_flow_lim_stats(ifs, stat: &ifp->if_lim_stat);
4596	ifnet_lock_done(ifp);
4597	}
4598
4599	#if SKYWALK
4600
4601	#include <skywalk/core/skywalk_var.h>
4602	#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
4603
4604	void
4605	tcp_add_fsw_flow(struct tcpcb tp, struct* ifnet *ifp)
4606	{
4607	struct inpcb *inp = tp->t_inpcb;
4608	struct socket *so = inp->inp_socket;
4609	uuid_t fsw_uuid;
4610	struct nx_flow_req nfr;
4611	int err;
4612
4613	if (!NX_FSW_TCP_RX_AGG_ENABLED()) {
4614	return;
4615	}
4616
4617	if (ifp == NULL \|\| kern_nexus_get_flowswitch_instance(ifp, nx_uuid: fsw_uuid)) {
4618	TCP_LOG_FSW_FLOW(tp, "skip ifp no fsw");
4619	return;
4620	}
4621
4622	memset(s: &nfr, c: `0`, n: sizeof(nfr));
4623
4624	if (inp->inp_vflag & INP_IPV4) {
4625	ASSERT(!(inp->inp_laddr.s_addr == INADDR_ANY \|\|
4626	inp->inp_faddr.s_addr == INADDR_ANY \|\|
4627	IN_MULTICAST(ntohl(inp->inp_laddr.s_addr)) \|\|
4628	IN_MULTICAST(ntohl(inp->inp_faddr.s_addr))));
4629	nfr.nfr_saddr.sin.sin_len = sizeof(struct sockaddr_in);
4630	nfr.nfr_saddr.sin.sin_family = AF_INET;
4631	nfr.nfr_saddr.sin.sin_port = inp->inp_lport;
4632	memcpy(dst: &nfr.nfr_saddr.sin.sin_addr, src: &inp->inp_laddr,
4633	n: sizeof(struct in_addr));
4634	nfr.nfr_daddr.sin.sin_len = sizeof(struct sockaddr_in);
4635	nfr.nfr_daddr.sin.sin_family = AF_INET;
4636	nfr.nfr_daddr.sin.sin_port = inp->inp_fport;
4637	memcpy(dst: &nfr.nfr_daddr.sin.sin_addr, src: &inp->inp_faddr,
4638	n: sizeof(struct in_addr));
4639	} else {
4640	ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) \|\|
4641	IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) \|\|
4642	IN6_IS_ADDR_MULTICAST(&inp->in6p_laddr) \|\|
4643	IN6_IS_ADDR_MULTICAST(&inp->in6p_faddr)));
4644	nfr.nfr_saddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
4645	nfr.nfr_saddr.sin6.sin6_family = AF_INET6;
4646	nfr.nfr_saddr.sin6.sin6_port = inp->inp_lport;
4647	memcpy(dst: &nfr.nfr_saddr.sin6.sin6_addr, src: &inp->in6p_laddr,
4648	n: sizeof(struct in6_addr));
4649	nfr.nfr_daddr.sin6.sin6_len = sizeof(struct sockaddr_in6);
4650	nfr.nfr_daddr.sin.sin_family = AF_INET6;
4651	nfr.nfr_daddr.sin6.sin6_port = inp->inp_fport;
4652	memcpy(dst: &nfr.nfr_daddr.sin6.sin6_addr, src: &inp->in6p_faddr,
4653	n: sizeof(struct in6_addr));
4654	/ clear embedded scope ID /
4655	if (IN6_IS_SCOPE_EMBED(&nfr.nfr_saddr.sin6.sin6_addr)) {
4656	nfr.nfr_saddr.sin6.sin6_addr.s6_addr16[`1`] = `0`;
4657	}
4658	if (IN6_IS_SCOPE_EMBED(&nfr.nfr_daddr.sin6.sin6_addr)) {
4659	nfr.nfr_daddr.sin6.sin6_addr.s6_addr16[`1`] = `0`;
4660	}
4661	}
4662
4663	nfr.nfr_nx_port = `1`;
4664	nfr.nfr_ip_protocol = IPPROTO_TCP;
4665	nfr.nfr_transport_protocol = IPPROTO_TCP;
4666	nfr.nfr_flags = NXFLOWREQF_ASIS;
4667	nfr.nfr_epid = (so != NULL ? so->last_pid : `0`);
4668	if (NETNS_TOKEN_VALID(&inp->inp_netns_token)) {
4669	nfr.nfr_port_reservation = inp->inp_netns_token;
4670	nfr.nfr_flags \|= NXFLOWREQF_EXT_PORT_RSV;
4671	}
4672	ASSERT(inp->inp_flowhash != `0`);
4673	nfr.nfr_inp_flowhash = inp->inp_flowhash;
4674
4675	uuid_generate_random(out: nfr.nfr_flow_uuid);
4676	err = kern_nexus_flow_add(ncd: kern_nexus_shared_controller(), nx_uuid: fsw_uuid,
4677	data: &nfr, data_len: sizeof(nfr));
4678
4679	if (err == `0`) {
4680	uuid_copy(dst: tp->t_fsw_uuid, src: fsw_uuid);
4681	uuid_copy(dst: tp->t_flow_uuid, src: nfr.nfr_flow_uuid);
4682	}
4683
4684	TCP_LOG_FSW_FLOW(tp, "add err %d\n", err);
4685	}
4686
4687	void
4688	tcp_del_fsw_flow(struct tcpcb *tp)
4689	{
4690	if (uuid_is_null(uu: tp->t_fsw_uuid) \|\| uuid_is_null(uu: tp->t_flow_uuid)) {
4691	return;
4692	}
4693
4694	struct nx_flow_req nfr;
4695	uuid_copy(dst: nfr.nfr_flow_uuid, src: tp->t_flow_uuid);
4696
4697	/ It's possible for this call to fail if the nexus has detached /
4698	int err = kern_nexus_flow_del(ncd: kern_nexus_shared_controller(),
4699	nx_uuid: tp->t_fsw_uuid, data: &nfr, data_len: sizeof(nfr));
4700	VERIFY(err == `0` \|\| err == ENOENT \|\| err == ENXIO);
4701
4702	uuid_clear(uu: tp->t_fsw_uuid);
4703	uuid_clear(uu: tp->t_flow_uuid);
4704
4705	TCP_LOG_FSW_FLOW(tp, "del err %d\n", err);
4706	}
4707
4708	#endif /* SKYWALK */
4709

Browse the source code of xnu/bsd/netinet/tcp_subr.c