1 | /* |
2 | * Copyright (c) 2000-2022 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | /* |
29 | * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 |
30 | * The Regents of the University of California. All rights reserved. |
31 | * |
32 | * Redistribution and use in source and binary forms, with or without |
33 | * modification, are permitted provided that the following conditions |
34 | * are met: |
35 | * 1. Redistributions of source code must retain the above copyright |
36 | * notice, this list of conditions and the following disclaimer. |
37 | * 2. Redistributions in binary form must reproduce the above copyright |
38 | * notice, this list of conditions and the following disclaimer in the |
39 | * documentation and/or other materials provided with the distribution. |
40 | * 3. All advertising materials mentioning features or use of this software |
41 | * must display the following acknowledgement: |
42 | * This product includes software developed by the University of |
43 | * California, Berkeley and its contributors. |
44 | * 4. Neither the name of the University nor the names of its contributors |
45 | * may be used to endorse or promote products derived from this software |
46 | * without specific prior written permission. |
47 | * |
48 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
49 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
50 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
51 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
52 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
53 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
54 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
55 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
56 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
57 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
58 | * SUCH DAMAGE. |
59 | * |
60 | * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95 |
61 | * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $ |
62 | */ |
63 | /* |
64 | * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce |
65 | * support for mandatory and extensible security protections. This notice |
66 | * is included in support of clause 2.2 (b) of the Apple Public License, |
67 | * Version 2.0. |
68 | */ |
69 | |
70 | #include "tcp_includes.h" |
71 | |
72 | #include <sys/param.h> |
73 | #include <sys/systm.h> |
74 | #include <sys/kernel.h> |
75 | #include <sys/sysctl.h> |
76 | #include <sys/malloc.h> |
77 | #include <sys/mbuf.h> |
78 | #include <sys/proc.h> /* for proc0 declaration */ |
79 | #include <sys/protosw.h> |
80 | #include <sys/socket.h> |
81 | #include <sys/socketvar.h> |
82 | #include <sys/syslog.h> |
83 | #include <sys/mcache.h> |
84 | #include <sys/kauth.h> |
85 | #include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */ |
86 | |
87 | #include <machine/endian.h> |
88 | |
89 | #include <net/if.h> |
90 | #include <net/if_types.h> |
91 | #include <net/route.h> |
92 | #include <net/ntstat.h> |
93 | #include <net/content_filter.h> |
94 | #include <net/dlil.h> |
95 | #include <net/multi_layer_pkt_log.h> |
96 | |
97 | #include <netinet/in.h> |
98 | #include <netinet/in_systm.h> |
99 | #include <netinet/ip.h> |
100 | #include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */ |
101 | #include <netinet/in_var.h> |
102 | #include <netinet/icmp_var.h> /* for ICMP_BANDLIM */ |
103 | #include <netinet/in_pcb.h> |
104 | #include <netinet/ip_var.h> |
105 | #include <mach/sdt.h> |
106 | #include <netinet/ip6.h> |
107 | #include <netinet/icmp6.h> |
108 | #include <netinet6/nd6.h> |
109 | #include <netinet6/ip6_var.h> |
110 | #include <netinet6/in6_pcb.h> |
111 | #include <netinet/tcp.h> |
112 | #include <netinet/tcp_cache.h> |
113 | #include <netinet/tcp_fsm.h> |
114 | #include <netinet/tcp_seq.h> |
115 | #include <netinet/tcp_timer.h> |
116 | #include <netinet/tcp_var.h> |
117 | #include <netinet/tcp_cc.h> |
118 | #include <dev/random/randomdev.h> |
119 | #include <kern/zalloc.h> |
120 | #include <netinet6/tcp6_var.h> |
121 | #include <netinet/tcpip.h> |
122 | #if TCPDEBUG |
123 | #include <netinet/tcp_debug.h> |
u_char tcp_saveipgen[40]; /* must be the size of the largest IP header, currently IPv6 */
125 | struct tcphdr tcp_savetcp; |
126 | #endif /* TCPDEBUG */ |
127 | #include <netinet/tcp_log.h> |
128 | |
129 | #if IPSEC |
130 | #include <netinet6/ipsec.h> |
131 | #include <netinet6/ipsec6.h> |
132 | #include <netkey/key.h> |
133 | #endif /*IPSEC*/ |
134 | |
135 | #include <sys/kdebug.h> |
136 | #if MPTCP |
137 | #include <netinet/mptcp_var.h> |
138 | #include <netinet/mptcp.h> |
139 | #include <netinet/mptcp_opt.h> |
140 | #endif /* MPTCP */ |
141 | |
142 | #include <corecrypto/ccaes.h> |
143 | #include <net/sockaddr_utils.h> |
144 | |
145 | #define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0) |
146 | #define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2) |
147 | #define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8)) |
148 | #define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8)) |
149 | |
150 | #define TCP_RTT_HISTORY_EXPIRE_TIME (60 * TCP_RETRANSHZ) |
151 | #define TCP_RECV_THROTTLE_WIN (5 * TCP_RETRANSHZ) |
152 | #define TCP_STRETCHACK_ENABLE_PKTCNT 2000 |
153 | |
154 | struct tcpstat tcpstat; |
155 | |
156 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, flow_control_response, |
157 | CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_flow_control_response, 1, |
158 | "Improved response to Flow-control events" ); |
159 | |
160 | static int log_in_vain = 0; |
161 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain, |
162 | CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0, |
163 | "Log all incoming TCP connections" ); |
164 | |
165 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_strategy, |
166 | CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ack_strategy, TCP_ACK_STRATEGY_MODERN, |
167 | "Revised TCP ACK-strategy, avoiding stretch-ACK implementation" ); |
168 | |
169 | static int blackhole = 0; |
170 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole, |
171 | CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0, |
172 | "Do not send RST when dropping refused connections" ); |
173 | |
/* TODO - remove once uTCP stops using it */
175 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, aggressive_rcvwnd_inc, |
176 | CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_aggressive_rcvwnd_inc, 1, |
177 | "Be more aggressive about increasing the receive-window." ); |
178 | |
179 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, delayed_ack, |
180 | CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_delack_enabled, 3, |
181 | "Delay ACK to try and piggyback it onto a data packet" ); |
182 | |
183 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED, |
184 | int, tcp_recv_bg, 0, "Receive background" ); |
185 | |
186 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, drop_synfin, |
187 | CTLFLAG_RW | CTLFLAG_LOCKED, static int, drop_synfin, 1, |
188 | "Drop TCP packets with SYN+FIN set" ); |
189 | |
190 | SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW | CTLFLAG_LOCKED, 0, |
191 | "TCP Segment Reassembly Queue" ); |
192 | |
193 | static int tcp_reass_overflows = 0; |
194 | SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows, |
195 | CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_overflows, 0, |
196 | "Global number of TCP segment reassembly queue overflows" ); |
197 | |
198 | int tcp_reass_total_qlen = 0; |
199 | SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, qlen, |
200 | CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_total_qlen, 0, |
201 | "Total number of TCP segments in reassembly queues" ); |
202 | |
203 | |
204 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED, |
205 | __private_extern__ int, slowlink_wsize, 8192, |
206 | "Maximum advertised window size for slowlink" ); |
207 | |
208 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, maxseg_unacked, |
209 | CTLFLAG_RW | CTLFLAG_LOCKED, int, maxseg_unacked, 8, |
210 | "Maximum number of outstanding segments left unacked" ); |
211 | |
212 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED, |
213 | int, tcp_do_rfc3465, 1, "" ); |
214 | |
215 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3465_lim2, |
216 | CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_rfc3465_lim2, 1, |
217 | "Appropriate bytes counting w/ L=2*SMSS" ); |
218 | |
219 | int rtt_samples_per_slot = 20; |
220 | |
221 | int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH; |
222 | u_int32_t tcp_autorcvbuf_inc_shift = 3; |
223 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, recv_allowed_iaj, |
224 | CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_allowed_iaj, ALLOWED_IAJ, |
    "Allowed inter-packet arrival jitter");
226 | |
227 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, doautorcvbuf, |
228 | CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_do_autorcvbuf, 1, |
229 | "Enable automatic socket buffer tuning" ); |
230 | |
231 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, autotunereorder, |
232 | CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_autotune_reorder, 1, |
233 | "Enable automatic socket buffer tuning even when reordering is present" ); |
234 | |
235 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, autorcvbufmax, |
236 | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN, u_int32_t, tcp_autorcvbuf_max, 2 * 1024 * 1024, |
237 | "Maximum receive socket buffer size" ); |
238 | |
239 | int tcp_disable_access_to_stats = 1; |
240 | SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_access_to_stats, |
241 | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_disable_access_to_stats, 0, |
242 | "Disable access to tcpstat" ); |
243 | |
244 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, challengeack_limit, |
245 | CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_challengeack_limit, 10, |
246 | "Maximum number of challenge ACKs per connection per second" ); |
247 | |
248 | /* TO BE REMOVED */ |
249 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_rfc5961, |
250 | CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_do_rfc5961, 1, |
251 | "Enable/Disable full RFC 5961 compliance" ); |
252 | |
253 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_better_lr, |
254 | CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_better_lr, 1, |
255 | "Improved TCP Loss Recovery" ); |
256 | |
257 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, use_min_curr_rtt, |
258 | CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_use_min_curr_rtt, 1, |
259 | "Use a min of k=4 RTT samples for congestion controllers" ); |
260 | |
261 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, awdl_rtobase, |
262 | CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_awdl_rtobase, 100, |
263 | "Initial RTO for AWDL interface" ); |
264 | |
265 | extern int tcp_acc_iaj_high; |
266 | extern int tcp_acc_iaj_react_limit; |
267 | extern int tcp_fin_timeout; |
268 | |
269 | uint8_t tcprexmtthresh = 3; |
270 | |
271 | u_int32_t tcp_now; |
272 | struct timeval tcp_uptime; /* uptime when tcp_now was last updated */ |
273 | |
/* Used to synchronize updates to tcp_now */
275 | static LCK_GRP_DECLARE(tcp_uptime_mtx_grp, "tcpuptime" ); |
276 | LCK_SPIN_DECLARE(tcp_uptime_lock, &tcp_uptime_mtx_grp); |
277 | |
278 | struct inpcbhead tcb; |
279 | #define tcb6 tcb /* for KAME src sync over BSD*'s */ |
280 | struct inpcbinfo tcbinfo; |
281 | |
282 | static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *, |
283 | struct tcpopt *); |
284 | static void tcp_finalize_options(struct tcpcb *, struct tcpopt *, unsigned int); |
285 | static void tcp_pulloutofband(struct socket *, |
286 | struct tcphdr *, struct mbuf *, int); |
287 | static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq); |
288 | static inline unsigned int tcp_maxmtu(struct rtentry *); |
289 | static inline int tcp_stretch_ack_enable(struct tcpcb *tp, int thflags); |
290 | static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int); |
291 | |
292 | #if TRAFFIC_MGT |
293 | static inline void compute_iaj(struct tcpcb *tp); |
294 | static inline void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj); |
295 | #endif /* TRAFFIC_MGT */ |
296 | |
297 | static inline unsigned int tcp_maxmtu6(struct rtentry *); |
298 | unsigned int get_maxmtu(struct rtentry *); |
299 | |
300 | static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb, |
301 | struct tcpopt *to, uint32_t tlen); |
302 | void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb); |
303 | static void tcp_sbsnd_trim(struct sockbuf *sbsnd); |
304 | static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp); |
305 | static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb, |
306 | u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max); |
307 | static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th); |
308 | static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, |
309 | struct tcphdr *th); |
310 | static void tcp_compute_rcv_rtt(struct tcpcb *tp, struct tcpopt *to, |
311 | struct tcphdr *th); |
312 | static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th); |
313 | static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, |
314 | struct tcpopt *to); |
315 | /* |
316 | * Constants used for resizing receive socket buffer |
317 | * when timestamps are not supported |
318 | */ |
319 | #define TCPTV_RCVNOTS_QUANTUM 100 |
320 | #define TCP_RCVNOTS_BYTELEVEL 204800 |
321 | |
322 | /* |
323 | * Constants used for limiting early retransmits |
324 | * to 10 per minute. |
325 | */ |
326 | #define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */ |
327 | #define TCP_EARLY_REXMT_LIMIT 10 |
328 | |
329 | #define log_in_vain_log( a ) { log a; } |
330 | |
331 | int tcp_rcvunackwin = TCPTV_UNACKWIN; |
332 | int tcp_maxrcvidle = TCPTV_MAXRCVIDLE; |
333 | SYSCTL_SKMEM_TCP_INT(OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED, |
334 | int, tcp_rcvsspktcnt, TCP_RCV_SS_PKTCOUNT, "packets to be seen before receiver stretches acks" ); |
335 | |
336 | #define DELAY_ACK(tp, th) \ |
337 | (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th)) |
338 | |
339 | static int tcp_dropdropablreq(struct socket *head); |
340 | static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th); |
341 | static void update_base_rtt(struct tcpcb *tp, uint32_t rtt); |
342 | void tcp_set_background_cc(struct socket *so); |
343 | void tcp_set_foreground_cc(struct socket *so); |
344 | static void tcp_set_new_cc(struct socket *so, uint8_t cc_index); |
345 | static void tcp_bwmeas_check(struct tcpcb *tp); |
346 | |
347 | #if TRAFFIC_MGT |
348 | void |
349 | reset_acc_iaj(struct tcpcb *tp) |
350 | { |
351 | tp->acc_iaj = 0; |
352 | CLEAR_IAJ_STATE(tp); |
353 | } |
354 | |
355 | static inline void |
356 | update_iaj_state(struct tcpcb *tp, int size, int rst_size) |
357 | { |
358 | if (rst_size > 0) { |
359 | tp->iaj_size = 0; |
360 | } |
361 | if (tp->iaj_size == 0 || size >= tp->iaj_size) { |
362 | tp->iaj_size = size; |
363 | tp->iaj_rcv_ts = tcp_now; |
364 | tp->iaj_small_pkt = 0; |
365 | } |
366 | } |
367 | |
/* For every 64-bit unsigned integer (v), this function finds the
 * largest 32-bit integer n such that n*n <= v. This takes at most 32 iterations
 * irrespective of the value of v and does not involve multiplications.
 */
372 | static inline uint32_t |
373 | isqrt(uint64_t val) |
374 | { |
375 | uint32_t sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100}; |
376 | uint64_t temp, g = 0, b = 1 << 31, bshft = 31; |
377 | if (val <= 100) { |
378 | for (g = 0; g <= 10; ++g) { |
379 | if (sqrt_cache[g] > val) { |
380 | g--; |
381 | break; |
382 | } else if (sqrt_cache[g] == val) { |
383 | break; |
384 | } |
385 | } |
386 | } else { |
387 | do { |
388 | temp = (((g << 1) + b) << (bshft--)); |
389 | if (val >= temp) { |
390 | g += b; |
391 | val -= temp; |
392 | } |
393 | b >>= 1; |
394 | } while (b > 0 && val > 0); |
395 | } |
396 | return (uint32_t)g; |
397 | } |
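/*
 * Illustrative examples of the above (derived from the code, for reference):
 * for val <= 100 the result comes straight from sqrt_cache, e.g. isqrt(10)
 * returns 3; for larger values the shift-and-subtract loop produces one
 * result bit per iteration, e.g. isqrt(150) returns 12 (12*12 = 144 <= 150
 * < 13*13 = 169).
 */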
398 | |
399 | static inline void |
400 | compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj) |
401 | { |
402 | /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds, |
403 | * throttle the receive window to a minimum of MIN_IAJ_WIN packets |
404 | */ |
405 | #define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit) |
406 | #define IAJ_DIV_SHIFT 4 |
407 | #define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1)) |
408 | |
409 | uint32_t allowed_iaj, acc_iaj = 0; |
410 | |
411 | /* Using 64-bit storage for the inter-arrival jitter deviation, |
412 | * to avoid accidentally rolling over if the inter-arrival time exceeds 62 seconds. |
413 | */ |
414 | int64_t mean, temp, cur_iaj_dev; |
415 | |
416 | cur_iaj_dev = (cur_iaj - tp->avg_iaj); |
417 | |
418 | /* Allow a jitter of "allowed_iaj" milliseconds. Some connections |
419 | * may have a constant jitter more than that. We detect this by |
420 | * using standard deviation. |
421 | */ |
422 | allowed_iaj = tp->avg_iaj + tp->std_dev_iaj; |
423 | if (allowed_iaj < tcp_allowed_iaj) { |
424 | allowed_iaj = tcp_allowed_iaj; |
425 | } |
426 | |
	/* Initially when the connection starts, the sender's congestion
428 | * window is small. During this period we avoid throttling a |
429 | * connection because we do not have a good starting point for |
430 | * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over |
431 | * the first few packets. |
432 | */ |
433 | if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) { |
434 | if (cur_iaj <= allowed_iaj) { |
435 | if (tp->acc_iaj >= 2) { |
436 | acc_iaj = tp->acc_iaj - 2; |
437 | } else { |
438 | acc_iaj = 0; |
439 | } |
440 | } else { |
441 | acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj); |
442 | } |
443 | |
444 | if (acc_iaj > MAX_ACC_IAJ) { |
445 | acc_iaj = MAX_ACC_IAJ; |
446 | } |
447 | tp->acc_iaj = acc_iaj; |
448 | } |
449 | |
	/* Compute a weighted average where the history has a weight of
	 * 15 out of 16 and the current value has a weight of 1 out of 16.
	 * This makes the average change slowly, so a single short-term
	 * measurement cannot swing it by much.
	 *
	 * The addition of IAJ_ROUNDUP_CONST (8) helps to round the value
	 * up instead of down.
	 */
457 | tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj) |
458 | + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT; |
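	/* Equivalently, avg_iaj_new = (15 * avg_iaj + cur_iaj + 8) / 16, an EWMA
	 * with gain 1/16; e.g. avg_iaj = 16 ms and cur_iaj = 48 ms gives a new
	 * average of 18 ms.
	 */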
459 | |
460 | /* Compute Root-mean-square of deviation where mean is a weighted |
461 | * average as described above. |
462 | */ |
463 | temp = tp->std_dev_iaj * tp->std_dev_iaj; |
464 | mean = (((temp << IAJ_DIV_SHIFT) - temp) |
465 | + (cur_iaj_dev * cur_iaj_dev) |
466 | + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT; |
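	/* The same 15/16 vs 1/16 weighting is applied to the squared deviation:
	 * mean = (15 * std_dev_iaj^2 + cur_iaj_dev^2 + 8) / 16; the new
	 * std_dev_iaj below is its integer square root.
	 */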
467 | |
	tp->std_dev_iaj = isqrt(mean);
469 | |
470 | DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj, |
471 | uint32_t, allowed_iaj); |
472 | |
473 | return; |
474 | } |
475 | |
476 | static inline void |
477 | compute_iaj(struct tcpcb *tp) |
478 | { |
	compute_iaj_meat(tp, (tcp_now - tp->iaj_rcv_ts));
480 | } |
481 | #endif /* TRAFFIC_MGT */ |
482 | |
/*
 * Perform a rate-limit check, per connection and per second.
 * tp->t_challengeack_last is the last time the one-second window was reset.
 * tp->t_challengeack_count is the number of challenge ACKs sent within that second.
 * Return TRUE if we should not send the ACK due to rate limiting.
 * Return FALSE if it is still OK to send a challenge ACK.
 */
490 | static boolean_t |
491 | tcp_is_ack_ratelimited(struct tcpcb *tp) |
492 | { |
493 | boolean_t ret = TRUE; |
494 | uint32_t now = tcp_now; |
495 | int32_t diff = 0; |
496 | |
	diff = timer_diff(now, 0, tp->t_challengeack_last, 0);
	/* If this is the first time or diff > 1000 ms,
	 * update challengeack_last and reset the
	 * current count of ACKs
	 */
502 | if (tp->t_challengeack_last == 0 || diff >= 1000) { |
503 | tp->t_challengeack_last = now; |
504 | tp->t_challengeack_count = 0; |
505 | ret = FALSE; |
506 | } else if (tp->t_challengeack_count < tcp_challengeack_limit) { |
507 | ret = FALSE; |
508 | } |
509 | |
510 | /* Careful about wrap-around */ |
511 | if (ret == FALSE && (tp->t_challengeack_count + 1 > 0)) { |
512 | tp->t_challengeack_count++; |
513 | } |
514 | |
515 | return ret; |
516 | } |
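/*
 * Example of the rate limit above, with the default tcp_challengeack_limit
 * of 10: within a given one-second window the first 10 calls return FALSE
 * (the challenge ACK may be sent) and bump t_challengeack_count; later
 * calls return TRUE until 1000 ms have passed since t_challengeack_last,
 * at which point the window and the counter are reset.
 */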
517 | |
/* Check whether enough data has been acknowledged since
 * the bandwidth measurement was started
 */
521 | static void |
522 | tcp_bwmeas_check(struct tcpcb *tp) |
523 | { |
524 | int32_t bw_meas_bytes; |
525 | uint32_t bw, bytes, elapsed_time; |
526 | |
527 | if (SEQ_LEQ(tp->snd_una, tp->t_bwmeas->bw_start)) { |
528 | return; |
529 | } |
530 | |
531 | bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start; |
532 | if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) && |
533 | bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) { |
534 | bytes = bw_meas_bytes; |
535 | elapsed_time = tcp_now - tp->t_bwmeas->bw_ts; |
536 | if (elapsed_time > 0) { |
537 | bw = bytes / elapsed_time; |
538 | if (bw > 0) { |
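				/* Smooth the send-bandwidth estimate with an EWMA:
				 * bw_sndbw = (7 * bw_sndbw + bw) / 8, where the sample
				 * bw is in bytes per tcp_now tick (1/TCP_RETRANSHZ of a second).
				 */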
539 | if (tp->t_bwmeas->bw_sndbw > 0) { |
540 | tp->t_bwmeas->bw_sndbw = |
541 | (((tp->t_bwmeas->bw_sndbw << 3) |
542 | - tp->t_bwmeas->bw_sndbw) |
543 | + bw) >> 3; |
544 | } else { |
545 | tp->t_bwmeas->bw_sndbw = bw; |
546 | } |
547 | |
548 | /* Store the maximum value */ |
549 | if (tp->t_bwmeas->bw_sndbw_max == 0) { |
550 | tp->t_bwmeas->bw_sndbw_max = |
551 | tp->t_bwmeas->bw_sndbw; |
552 | } else { |
553 | tp->t_bwmeas->bw_sndbw_max = |
					    max(tp->t_bwmeas->bw_sndbw,
					    tp->t_bwmeas->bw_sndbw_max);
556 | } |
557 | } |
558 | } |
559 | tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS); |
560 | } |
561 | } |
562 | |
563 | static int |
564 | tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m, |
565 | struct ifnet *ifp, int *dowakeup) |
566 | { |
567 | struct tseg_qent *q; |
568 | struct tseg_qent *p = NULL; |
569 | struct tseg_qent *nq; |
570 | struct tseg_qent *te = NULL; |
571 | struct inpcb *inp = tp->t_inpcb; |
572 | struct socket *so = inp->inp_socket; |
573 | int flags = 0; |
574 | uint32_t qlimit; |
575 | boolean_t cell = IFNET_IS_CELLULAR(ifp); |
576 | boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp)); |
577 | boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp)); |
578 | boolean_t dsack_set = FALSE; |
579 | |
580 | /* |
581 | * If the reassembly queue already has entries or if we are going |
582 | * to add a new one, then the connection has reached a loss state. |
583 | * Reset the stretch-ack algorithm at this point. |
584 | */ |
585 | tcp_reset_stretch_ack(tp); |
586 | tp->t_forced_acks = TCP_FORCED_ACKS_COUNT; |
587 | |
588 | #if TRAFFIC_MGT |
589 | if (tp->acc_iaj > 0) { |
590 | reset_acc_iaj(tp); |
591 | } |
592 | #endif /* TRAFFIC_MGT */ |
593 | |
594 | if (th->th_seq != tp->rcv_nxt) { |
595 | struct mbuf *tmp = m; |
596 | while (tmp != NULL) { |
			if (mbuf_class_under_pressure(tmp)) {
598 | m_freem(m); |
599 | tcp_reass_overflows++; |
600 | tcpstat.tcps_rcvmemdrop++; |
601 | *tlenp = 0; |
602 | return 0; |
603 | } |
604 | |
605 | tmp = tmp->m_next; |
606 | } |
607 | } |
608 | |
609 | /* |
610 | * Limit the number of segments in the reassembly queue to prevent |
611 | * holding on to too many segments (and thus running out of mbufs). |
 * Make sure to let through the missing segment that caused this
 * queue to build up. Always keep one global queue entry spare to be
 * able to process that missing segment.
615 | */ |
	qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
	    (tcp_autorcvbuf_max >> 10));
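	/* For example, with the default tcp_autorcvbuf_max of 2 MB the ceiling is
	 * 2048 entries; a 128 KB receive buffer allows 128 entries, while very
	 * small buffers still get the floor of 100 entries.
	 */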
618 | if (th->th_seq != tp->rcv_nxt && |
619 | (tp->t_reassqlen + 1) >= qlimit) { |
620 | tcp_reass_overflows++; |
621 | tcpstat.tcps_rcvmemdrop++; |
622 | m_freem(m); |
623 | *tlenp = 0; |
624 | return 0; |
625 | } |
626 | |
627 | /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */ |
628 | te = zalloc_flags(tcp_reass_zone, Z_WAITOK | Z_NOFAIL); |
629 | tp->t_reassqlen++; |
630 | OSIncrementAtomic(&tcp_reass_total_qlen); |
631 | |
632 | /* |
633 | * Find a segment which begins after this one does. |
634 | */ |
635 | LIST_FOREACH(q, &tp->t_segq, tqe_q) { |
636 | if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) { |
637 | break; |
638 | } |
639 | p = q; |
640 | } |
641 | |
642 | /* |
643 | * If there is a preceding segment, it may provide some of |
644 | * our data already. If so, drop the data from the incoming |
645 | * segment. If it provides all of our data, drop us. |
646 | */ |
647 | if (p != NULL) { |
648 | int i; |
649 | /* conversion to int (in i) handles seq wraparound */ |
650 | i = p->tqe_th->th_seq + p->tqe_len - th->th_seq; |
651 | if (i > 0) { |
652 | if (i > 1) { |
653 | /* |
			 * Note duplicate data sequence numbers
655 | * to report in DSACK option |
656 | */ |
657 | tp->t_dsack_lseq = th->th_seq; |
658 | tp->t_dsack_rseq = th->th_seq + |
			    min(i, *tlenp);
660 | |
661 | /* |
662 | * Report only the first part of partial/ |
663 | * non-contiguous duplicate sequence space |
664 | */ |
665 | dsack_set = TRUE; |
666 | } |
667 | if (i >= *tlenp) { |
668 | tcpstat.tcps_rcvduppack++; |
669 | tcpstat.tcps_rcvdupbyte += *tlenp; |
670 | if (nstat_collect) { |
					nstat_route_rx(inp->inp_route.ro_rt,
					    1, *tlenp,
					    NSTAT_RX_FLAG_DUPLICATE);
674 | INP_ADD_STAT(inp, cell, wifi, wired, |
675 | rxpackets, 1); |
676 | INP_ADD_STAT(inp, cell, wifi, wired, |
677 | rxbytes, *tlenp); |
678 | tp->t_stat.rxduplicatebytes += *tlenp; |
679 | inp_set_activity_bitmap(inp); |
680 | } |
681 | m_freem(m); |
682 | zfree(tcp_reass_zone, te); |
683 | te = NULL; |
684 | tp->t_reassqlen--; |
685 | OSDecrementAtomic(&tcp_reass_total_qlen); |
686 | /* |
687 | * Try to present any queued data |
688 | * at the left window edge to the user. |
689 | * This is needed after the 3-WHS |
690 | * completes. |
691 | */ |
692 | goto present; |
693 | } |
694 | m_adj(m, i); |
695 | *tlenp -= i; |
696 | th->th_seq += i; |
697 | } |
698 | } |
699 | |
700 | if (th->th_seq != tp->rcv_nxt) { |
701 | tp->t_rcvoopack++; |
702 | tcpstat.tcps_rcvoopack++; |
703 | tcpstat.tcps_rcvoobyte += *tlenp; |
704 | if (nstat_collect) { |
705 | tp->t_stat.rxoutoforderbytes += *tlenp; |
706 | } |
707 | } |
708 | |
709 | if (nstat_collect) { |
		nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
		    NSTAT_RX_FLAG_OUT_OF_ORDER);
712 | INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1); |
713 | INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp); |
714 | inp_set_activity_bitmap(inp); |
715 | } |
716 | |
717 | /* |
718 | * While we overlap succeeding segments trim them or, |
719 | * if they are completely covered, dequeue them. |
720 | */ |
721 | while (q) { |
722 | int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq; |
723 | if (i <= 0) { |
724 | break; |
725 | } |
726 | |
727 | /* |
728 | * Report only the first part of partial/non-contiguous |
729 | * duplicate segment in dsack option. The variable |
730 | * dsack_set will be true if a previous entry has some of |
731 | * the duplicate sequence space. |
732 | */ |
733 | if (i > 1 && !dsack_set) { |
734 | if (tp->t_dsack_lseq == 0) { |
735 | tp->t_dsack_lseq = q->tqe_th->th_seq; |
736 | tp->t_dsack_rseq = |
				    tp->t_dsack_lseq + min(i, q->tqe_len);
738 | } else { |
				/*
				 * This segment overlaps data in multiple
				 * entries in the reassembly queue; move
				 * the right sequence number further.
				 */
744 | tp->t_dsack_rseq = |
				    tp->t_dsack_rseq + min(i, q->tqe_len);
746 | } |
747 | } |
748 | if (i < q->tqe_len) { |
749 | q->tqe_th->th_seq += i; |
750 | q->tqe_len -= i; |
751 | m_adj(q->tqe_m, i); |
752 | break; |
753 | } |
754 | |
755 | nq = LIST_NEXT(q, tqe_q); |
756 | LIST_REMOVE(q, tqe_q); |
		tp->t_reassq_mbcnt -= _MSIZE + ((q->tqe_m->m_flags & M_EXT) ?
		    q->tqe_m->m_ext.ext_size : 0);
759 | m_freem(q->tqe_m); |
760 | zfree(tcp_reass_zone, q); |
761 | tp->t_reassqlen--; |
762 | OSDecrementAtomic(&tcp_reass_total_qlen); |
763 | q = nq; |
764 | } |
765 | |
766 | /* Insert the new segment queue entry into place. */ |
767 | te->tqe_m = m; |
768 | te->tqe_th = th; |
769 | te->tqe_len = *tlenp; |
770 | |
	tp->t_reassq_mbcnt += _MSIZE + ((m->m_flags & M_EXT) ? m->m_ext.ext_size : 0);
772 | |
773 | if (p == NULL) { |
774 | LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q); |
775 | } else { |
776 | LIST_INSERT_AFTER(p, te, tqe_q); |
777 | } |
778 | |
779 | present: |
780 | /* |
781 | * Present data to user, advancing rcv_nxt through |
782 | * completed sequence space. |
783 | */ |
784 | if (!TCPS_HAVEESTABLISHED(tp->t_state)) { |
785 | return 0; |
786 | } |
787 | q = LIST_FIRST(&tp->t_segq); |
788 | if (!q || q->tqe_th->th_seq != tp->rcv_nxt) { |
789 | return 0; |
790 | } |
791 | |
792 | /* |
793 | * If there is already another thread doing reassembly for this |
794 | * connection, it is better to let it finish the job -- |
795 | * (radar 16316196) |
796 | */ |
797 | if (tp->t_flagsext & TF_REASS_INPROG) { |
798 | return 0; |
799 | } |
800 | |
801 | tp->t_flagsext |= TF_REASS_INPROG; |
802 | /* lost packet was recovered, so ooo data can be returned */ |
803 | tcpstat.tcps_recovered_pkts++; |
804 | |
805 | do { |
806 | tp->rcv_nxt += q->tqe_len; |
807 | flags = q->tqe_th->th_flags & TH_FIN; |
808 | LIST_REMOVE(q, tqe_q); |
		tp->t_reassq_mbcnt -= _MSIZE + ((q->tqe_m->m_flags & M_EXT) ?
		    q->tqe_m->m_ext.ext_size : 0);
811 | if (so->so_state & SS_CANTRCVMORE) { |
812 | m_freem(q->tqe_m); |
813 | } else { |
814 | so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */ |
815 | if (q->tqe_th->th_flags & TH_PUSH) { |
816 | tp->t_flagsext |= TF_LAST_IS_PSH; |
817 | } else { |
818 | tp->t_flagsext &= ~TF_LAST_IS_PSH; |
819 | } |
820 | |
			if (sbappendstream_rcvdemux(so, q->tqe_m)) {
822 | *dowakeup = 1; |
823 | } |
824 | } |
825 | zfree(tcp_reass_zone, q); |
826 | tp->t_reassqlen--; |
827 | OSDecrementAtomic(&tcp_reass_total_qlen); |
828 | q = LIST_FIRST(&tp->t_segq); |
829 | } while (q && q->tqe_th->th_seq == tp->rcv_nxt); |
830 | tp->t_flagsext &= ~TF_REASS_INPROG; |
831 | |
832 | if ((inp->inp_vflag & INP_IPV6) != 0) { |
833 | KERNEL_DEBUG(DBG_LAYER_BEG, |
834 | ((inp->inp_fport << 16) | inp->inp_lport), |
835 | (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) | |
836 | (inp->in6p_faddr.s6_addr16[0] & 0xffff)), |
837 | 0, 0, 0); |
838 | } else { |
839 | KERNEL_DEBUG(DBG_LAYER_BEG, |
840 | ((inp->inp_fport << 16) | inp->inp_lport), |
841 | (((inp->inp_laddr.s_addr & 0xffff) << 16) | |
842 | (inp->inp_faddr.s_addr & 0xffff)), |
843 | 0, 0, 0); |
844 | } |
845 | |
846 | return flags; |
847 | } |
848 | |
849 | /* |
850 | * Reduce congestion window -- used when ECN is seen or when a tail loss |
851 | * probe recovers the last packet. |
852 | */ |
853 | static void |
854 | tcp_reduce_congestion_window(struct tcpcb *tp) |
855 | { |
856 | /* |
857 | * If the current tcp cc module has |
858 | * defined a hook for tasks to run |
859 | * before entering FR, call it |
860 | */ |
861 | if (CC_ALGO(tp)->pre_fr != NULL) { |
862 | CC_ALGO(tp)->pre_fr(tp); |
863 | } |
864 | ENTER_FASTRECOVERY(tp); |
865 | if (tp->t_flags & TF_SENTFIN) { |
866 | tp->snd_recover = tp->snd_max - 1; |
867 | } else { |
868 | tp->snd_recover = tp->snd_max; |
869 | } |
870 | tp->t_timer[TCPT_REXMT] = 0; |
871 | tp->t_timer[TCPT_PTO] = 0; |
872 | tp->t_rtttime = 0; |
873 | if (tp->t_flagsext & TF_CWND_NONVALIDATED) { |
874 | tcp_cc_adjust_nonvalidated_cwnd(tp); |
875 | } else { |
876 | tp->snd_cwnd = tp->snd_ssthresh + |
877 | tp->t_maxseg * tcprexmtthresh; |
878 | } |
879 | } |
880 | |
881 | /* |
 * This function is called upon reception of data on a socket. Its purpose is
 * to handle the adaptive keepalive timers that monitor whether the connection
 * is making progress: first the adaptive read-timer, second the TFO probe-timer.
885 | * |
886 | * The application wants to get an event if there is a stall during read. |
887 | * Set the initial keepalive timeout to be equal to twice RTO. |
888 | * |
889 | * If the outgoing interface is in marginal conditions, we need to |
890 | * enable read probes for that too. |
891 | */ |
892 | static inline void |
893 | tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen) |
894 | { |
895 | struct ifnet *outifp = tp->t_inpcb->inp_last_outifp; |
896 | |
897 | if ((tp->t_adaptive_rtimo > 0 || |
898 | (outifp != NULL && |
899 | (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY))) |
900 | && tlen > 0 && |
901 | tp->t_state == TCPS_ESTABLISHED) { |
902 | tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, |
903 | (TCP_REXMTVAL(tp) << 1)); |
904 | tp->t_flagsext |= TF_DETECT_READSTALL; |
905 | tp->t_rtimo_probes = 0; |
906 | } |
907 | } |
908 | |
909 | inline void |
910 | tcp_keepalive_reset(struct tcpcb *tp) |
911 | { |
912 | tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, |
913 | TCP_CONN_KEEPIDLE(tp)); |
914 | tp->t_flagsext &= ~(TF_DETECT_READSTALL); |
915 | tp->t_rtimo_probes = 0; |
916 | } |
917 | |
918 | void |
919 | tcp_set_finwait_timeout(struct tcpcb *tp) |
920 | { |
921 | /* |
922 | * Starting the TCPT_2MSL timer is contrary to the |
923 | * specification, but if we don't get a FIN |
924 | * we'll hang forever. |
925 | */ |
926 | ASSERT(tp->t_state == TCPS_FIN_WAIT_2); |
927 | ASSERT((tp->t_inpcb->inp_socket->so_state & (SS_CANTRCVMORE)) == SS_CANTRCVMORE); |
928 | |
929 | if (tcp_fin_timeout > 0 && |
930 | tcp_fin_timeout < TCP_CONN_MAXIDLE(tp)) { |
931 | tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, tcp_fin_timeout); |
932 | } else { |
933 | tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, TCP_CONN_MAXIDLE(tp)); |
934 | } |
935 | } |
936 | |
937 | /* |
938 | * TCP input routine, follows pages 65-76 of the |
939 | * protocol specification dated September, 1981 very closely. |
940 | */ |
941 | int |
942 | tcp6_input(struct mbuf **mp, int *offp, int proto) |
943 | { |
944 | #pragma unused(proto) |
945 | struct mbuf *m = *mp; |
946 | uint32_t ia6_flags; |
947 | struct ifnet *ifp = m->m_pkthdr.rcvif; |
948 | |
949 | IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE); |
950 | |
951 | /* Expect 32-bit aligned data pointer on strict-align platforms */ |
952 | MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); |
953 | |
954 | /* |
955 | * draft-itojun-ipv6-tcp-to-anycast |
956 | * better place to put this in? |
957 | */ |
958 | if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) { |
959 | if (ia6_flags & IN6_IFF_ANYCAST) { |
960 | struct ip6_hdr *ip6; |
961 | |
962 | ip6 = mtod(m, struct ip6_hdr *); |
963 | icmp6_error(m, ICMP6_DST_UNREACH, |
964 | ICMP6_DST_UNREACH_ADDR, |
965 | (int)((caddr_t)&ip6->ip6_dst - (caddr_t)ip6)); |
966 | |
967 | IF_TCP_STATINC(ifp, icmp6unreach); |
968 | |
969 | return IPPROTO_DONE; |
970 | } |
971 | } |
972 | |
973 | tcp_input(m, *offp); |
974 | return IPPROTO_DONE; |
975 | } |
976 | |
977 | static void |
978 | tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv, |
979 | u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max) |
980 | { |
981 | /* newsize should not exceed max */ |
	newsize = min(newsize, rcvbuf_max);
983 | |
984 | /* The receive window scale negotiated at the |
985 | * beginning of the connection will also set a |
986 | * limit on the socket buffer size |
987 | */ |
	newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
989 | |
990 | /* Set new socket buffer size */ |
991 | if (newsize > sbrcv->sb_hiwat && |
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), rcvbuf_max);
995 | |
996 | /* Again check the limit set by the advertised |
997 | * window scale |
998 | */ |
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
1000 | TCP_MAXWIN << tp->rcv_scale); |
1001 | } |
1002 | } |
1003 | |
1004 | /* |
1005 | * This function is used to grow a receive socket buffer. It |
1006 | * will take into account system-level memory usage and the |
1007 | * bandwidth available on the link to make a decision. |
1008 | */ |
1009 | static void |
1010 | tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv, |
1011 | struct tcpopt *to, uint32_t pktlen) |
1012 | { |
1013 | struct socket *so = sbrcv->sb_so; |
1014 | |
1015 | /* |
1016 | * Do not grow the receive socket buffer if |
1017 | * - auto resizing is disabled, globally or on this socket |
1018 | * - the high water mark already reached the maximum |
1019 | * - the stream is in background and receive side is being |
1020 | * throttled |
1021 | */ |
1022 | if (tcp_do_autorcvbuf == 0 || |
1023 | (sbrcv->sb_flags & SB_AUTOSIZE) == 0 || |
1024 | sbrcv->sb_hiwat >= tcp_autorcvbuf_max || |
1025 | (tp->t_flagsext & TF_RECV_THROTTLE) || |
1026 | (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) || |
1027 | (!tcp_autotune_reorder && !LIST_EMPTY(&tp->t_segq))) { |
1028 | /* Can not resize the socket buffer, just return */ |
1029 | goto out; |
1030 | } |
1031 | |
1032 | if (!TSTMP_SUPPORTED(tp)) { |
1033 | /* |
1034 | * Timestamp option is not supported on this connection, |
1035 | * use receiver's RTT. Socket buffer grows based on the |
1036 | * BDP of the link. |
1037 | */ |
1038 | if (TSTMP_GEQ(tcp_now, |
1039 | tp->rfbuf_ts + (tp->rcv_srtt >> TCP_RTT_SHIFT))) { |
1040 | tp->rfbuf_cnt += pktlen; |
1041 | if (tp->rfbuf_cnt > tp->rfbuf_space) { |
1042 | int32_t rcvbuf_inc; |
1043 | uint32_t idealsize; |
1044 | |
1045 | /* |
				 * Increase the receive buffer aggressively if we
				 * received more than 150% of what was received
				 * in the previous round, because that means the
				 * sender is in TCP slow-start and we need to give
				 * it more space so that a small receive window
				 * does not limit the sender.
1052 | */ |
1053 | if (tp->rfbuf_cnt > tp->rfbuf_space + (tp->rfbuf_space >> 1)) { |
1054 | rcvbuf_inc = (tp->rfbuf_cnt << 2) - sbrcv->sb_hiwat; |
1055 | idealsize = (tp->rfbuf_cnt << 2); |
1056 | } else { |
1057 | rcvbuf_inc = (tp->rfbuf_cnt << 1) - sbrcv->sb_hiwat; |
1058 | idealsize = (tp->rfbuf_cnt << 1); |
1059 | } |
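				/* E.g. rfbuf_space = 200 KB and rfbuf_cnt = 320 KB (more than
				 * a 50% jump, so the sender is likely still in slow-start):
				 * target four times the last round, idealsize = 1280 KB;
				 * otherwise target only twice the last round.
				 */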
1060 | |
1061 | if (rcvbuf_inc > 0) { |
1062 | rcvbuf_inc = |
1063 | (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg; |
1064 | |
1065 | tcp_sbrcv_reserve(tp, sbrcv, |
					    sbrcv->sb_hiwat + rcvbuf_inc,
					    idealsize, tcp_autorcvbuf_max);
1068 | |
1069 | tp->rfbuf_space = tp->rfbuf_cnt; |
1070 | } |
1071 | } |
1072 | goto out; |
1073 | } else { |
1074 | tp->rfbuf_cnt += pktlen; |
1075 | return; |
1076 | } |
1077 | } else if (to->to_tsecr != 0) { |
1078 | /* |
		 * If the timestamp shows that one RTT has
		 * completed, we can stop counting the bytes.
		 * We consider increasing the socket buffer if
		 * the bandwidth measured in the last RTT is more
		 * than half of sb_hiwat; this helps to scale the
		 * buffer according to the bandwidth on the link.
1086 | */ |
1087 | if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) { |
1088 | tp->rfbuf_cnt += pktlen; |
1089 | |
1090 | if (tp->rfbuf_cnt > tp->rfbuf_space) { |
1091 | int32_t rcvbuf_inc; |
1092 | uint32_t idealsize; |
1093 | |
1094 | if (tp->rfbuf_cnt > tp->rfbuf_space + (tp->rfbuf_space >> 1)) { |
1095 | rcvbuf_inc = (tp->rfbuf_cnt << 2) - sbrcv->sb_hiwat; |
1096 | idealsize = (tp->rfbuf_cnt << 2); |
1097 | } else { |
1098 | rcvbuf_inc = (tp->rfbuf_cnt << 1) - sbrcv->sb_hiwat; |
1099 | idealsize = (tp->rfbuf_cnt << 1); |
1100 | } |
1101 | |
1102 | tp->rfbuf_space = tp->rfbuf_cnt; |
1103 | |
1104 | if (rcvbuf_inc > 0) { |
1105 | rcvbuf_inc = |
1106 | (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg; |
1107 | |
1108 | tcp_sbrcv_reserve(tp, sbrcv, |
					    sbrcv->sb_hiwat + rcvbuf_inc,
					    idealsize, tcp_autorcvbuf_max);
1111 | } |
1112 | } |
1113 | /* Measure instantaneous receive bandwidth */ |
1114 | if (tp->t_bwmeas != NULL && tp->rfbuf_cnt > 0 && |
1115 | TSTMP_GT(tcp_now, tp->rfbuf_ts)) { |
1116 | u_int32_t rcv_bw; |
1117 | rcv_bw = tp->rfbuf_cnt / |
1118 | (int)(tcp_now - tp->rfbuf_ts); |
1119 | if (tp->t_bwmeas->bw_rcvbw_max == 0) { |
1120 | tp->t_bwmeas->bw_rcvbw_max = rcv_bw; |
1121 | } else { |
1122 | tp->t_bwmeas->bw_rcvbw_max = max( |
					    tp->t_bwmeas->bw_rcvbw_max, rcv_bw);
1124 | } |
1125 | } |
1126 | goto out; |
1127 | } else { |
1128 | tp->rfbuf_cnt += pktlen; |
1129 | return; |
1130 | } |
1131 | } |
1132 | out: |
1133 | /* Restart the measurement */ |
1134 | tp->rfbuf_ts = tcp_now; |
1135 | tp->rfbuf_cnt = 0; |
1136 | return; |
1137 | } |
1138 | |
1139 | /* This function will trim the excess space added to the socket buffer |
1140 | * to help a slow-reading app. The ideal-size of a socket buffer depends |
1141 | * on the link bandwidth or it is set by an application and we aim to |
1142 | * reach that size. |
1143 | */ |
1144 | void |
1145 | tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv) |
1146 | { |
1147 | if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 && |
1148 | sbrcv->sb_hiwat > sbrcv->sb_idealsize) { |
1149 | int32_t trim; |
1150 | /* compute the difference between ideal and current sizes */ |
1151 | u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize; |
1152 | |
1153 | /* Compute the maximum advertised window for |
1154 | * this connection. |
1155 | */ |
1156 | u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt; |
1157 | |
1158 | /* How much can we trim the receive socket buffer? |
1159 | * 1. it can not be trimmed beyond the max rcv win advertised |
1160 | * 2. if possible, leave 1/16 of bandwidth*delay to |
1161 | * avoid closing the win completely |
1162 | */ |
		u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
1164 | |
1165 | /* Sometimes leave can be zero, in that case leave at least |
1166 | * a few segments worth of space. |
1167 | */ |
1168 | if (leave == 0) { |
1169 | leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift; |
1170 | } |
1171 | |
1172 | trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave); |
		trim = imin(trim, (int32_t)diff);
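		/* E.g. sb_hiwat = 512 KB, sb_idealsize = 256 KB, sb_cc = 100 KB and
		 * advwin = 64 KB: leave = 64 KB, trim = 512 - (100 + 64) = 348 KB,
		 * clamped to diff = 256 KB, which shrinks the buffer back to its
		 * 256 KB ideal size.
		 */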
1174 | |
1175 | if (trim > 0) { |
			sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
1177 | } |
1178 | } |
1179 | } |
1180 | |
1181 | /* We may need to trim the send socket buffer size for two reasons: |
1182 | * 1. if the rtt seen on the connection is climbing up, we do not |
1183 | * want to fill the buffers any more. |
1184 | * 2. if the congestion win on the socket backed off, there is no need |
1185 | * to hold more mbufs for that connection than what the cwnd will allow. |
1186 | */ |
1187 | void |
1188 | tcp_sbsnd_trim(struct sockbuf *sbsnd) |
1189 | { |
1190 | if (((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) == |
1191 | (SB_AUTOSIZE | SB_TRIM)) && |
1192 | (sbsnd->sb_idealsize > 0) && |
1193 | (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) { |
1194 | u_int32_t trim = 0; |
1195 | if (sbsnd->sb_cc <= sbsnd->sb_idealsize) { |
1196 | trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize; |
1197 | } else { |
1198 | trim = sbsnd->sb_hiwat - sbsnd->sb_cc; |
1199 | } |
		sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
1201 | } |
1202 | if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize) { |
1203 | sbsnd->sb_flags &= ~(SB_TRIM); |
1204 | } |
1205 | } |
1206 | |
1207 | /* |
1208 | * If timestamp option was not negotiated on this connection |
1209 | * and this connection is on the receiving side of a stream |
1210 | * then we can not measure the delay on the link accurately. |
1211 | * Instead of enabling automatic receive socket buffer |
1212 | * resizing, just give more space to the receive socket buffer. |
1213 | */ |
1214 | static inline void |
1215 | tcp_sbrcv_tstmp_check(struct tcpcb *tp) |
1216 | { |
1217 | struct socket *so = tp->t_inpcb->inp_socket; |
1218 | u_int32_t newsize = 2 * tcp_recvspace; |
1219 | struct sockbuf *sbrcv = &so->so_rcv; |
1220 | |
1221 | if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) != |
1222 | (TF_REQ_TSTMP | TF_RCVD_TSTMP) && |
1223 | (sbrcv->sb_flags & SB_AUTOSIZE) != 0) { |
		tcp_sbrcv_reserve(tp, sbrcv, newsize, 0, newsize);
1225 | } |
1226 | } |
1227 | |
1228 | /* A receiver will evaluate the flow of packets on a connection |
1229 | * to see if it can reduce ack traffic. The receiver will start |
1230 | * stretching acks if all of the following conditions are met: |
1231 | * 1. tcp_delack_enabled is set to 3 |
 * 2. If the bytes received in the last 100ms are greater than a threshold
1233 | * defined by maxseg_unacked |
1234 | * 3. If the connection has not been idle for tcp_maxrcvidle period. |
1235 | * 4. If the connection has seen enough packets to let the slow-start |
1236 | * finish after connection establishment or after some packet loss. |
1237 | * |
1238 | * The receiver will stop stretching acks if there is congestion/reordering |
 * as indicated by packets on the reassembly queue or by ECN. If the delayed-ack
1240 | * timer fires while stretching acks, it means that the packet flow has gone |
1241 | * below the threshold defined by maxseg_unacked and the receiver will stop |
1242 | * stretching acks. The receiver gets no indication when slow-start is completed |
1243 | * or when the connection reaches an idle state. That is why we use |
1244 | * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle |
1245 | * state. |
1246 | */ |
1247 | static inline int |
1248 | tcp_stretch_ack_enable(struct tcpcb *tp, int thflags) |
1249 | { |
1250 | if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) && |
1251 | TSTMP_GEQ(tp->rcv_unackwin, tcp_now)) { |
1252 | tp->t_flags |= TF_STREAMING_ON; |
1253 | } else { |
1254 | tp->t_flags &= ~TF_STREAMING_ON; |
1255 | } |
1256 | |
1257 | /* If there has been an idle time, reset streaming detection */ |
1258 | if (TSTMP_GT(tcp_now, tp->rcv_unackwin + tcp_maxrcvidle)) { |
1259 | tp->t_flags &= ~TF_STREAMING_ON; |
1260 | } |
1261 | |
1262 | /* |
1263 | * If there are flags other than TH_ACK set, reset streaming |
1264 | * detection |
1265 | */ |
1266 | if (thflags & ~TH_ACK) { |
1267 | tp->t_flags &= ~TF_STREAMING_ON; |
1268 | } |
1269 | |
1270 | if (tp->t_flagsext & TF_DISABLE_STRETCHACK) { |
1271 | if (tp->rcv_nostrack_pkts >= TCP_STRETCHACK_ENABLE_PKTCNT) { |
1272 | tp->t_flagsext &= ~TF_DISABLE_STRETCHACK; |
1273 | tp->rcv_nostrack_pkts = 0; |
1274 | tp->rcv_nostrack_ts = 0; |
1275 | } else { |
1276 | tp->rcv_nostrack_pkts++; |
1277 | } |
1278 | } |
1279 | |
1280 | if (!(tp->t_flagsext & (TF_NOSTRETCHACK | TF_DISABLE_STRETCHACK)) && |
1281 | (tp->t_flags & TF_STREAMING_ON) && |
1282 | (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) || |
1283 | (tp->rcv_waitforss >= tcp_rcvsspktcnt))) { |
1284 | return 1; |
1285 | } |
1286 | |
1287 | return 0; |
1288 | } |
1289 | |
1290 | /* |
1291 | * Reset the state related to stretch-ack algorithm. This will make |
1292 | * the receiver generate an ack every other packet. The receiver |
1293 | * will start re-evaluating the rate at which packets come to decide |
1294 | * if it can benefit by lowering the ack traffic. |
1295 | */ |
1296 | void |
1297 | tcp_reset_stretch_ack(struct tcpcb *tp) |
1298 | { |
1299 | tp->t_flags &= ~(TF_STRETCHACK | TF_STREAMING_ON); |
1300 | tp->rcv_by_unackwin = 0; |
1301 | tp->rcv_by_unackhalfwin = 0; |
1302 | tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; |
1303 | |
1304 | /* |
1305 | * When there is packet loss or packet re-ordering or CWR due to |
1306 | * ECN, the sender's congestion window is reduced. In these states, |
1307 | * generate an ack for every other packet for some time to allow |
1308 | * the sender's congestion window to grow. |
1309 | */ |
1310 | tp->t_flagsext |= TF_RCVUNACK_WAITSS; |
1311 | tp->rcv_waitforss = 0; |
1312 | } |
1313 | |
1314 | /* |
 * The last packet was a retransmission; check if this ack
 * indicates that the retransmission was spurious.
 *
 * If the connection supports timestamps, we use them to
 * detect whether the last retransmit was needed. Otherwise,
 * if the ACK arrived within an RTT/2 window, it
 * was a mistake to do the retransmit in the first place.
1322 | * |
1323 | * This function will return 1 if it is a spurious retransmit, |
1324 | * 0 otherwise. |
1325 | */ |
1326 | int |
1327 | tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th, |
1328 | struct tcpopt *to, u_int32_t rxtime) |
1329 | { |
1330 | int32_t tdiff, bad_rexmt_win; |
1331 | bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); |
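	/* t_srtt is kept scaled by 2^TCP_RTT_SHIFT, so shifting by
	 * TCP_RTT_SHIFT + 1 yields roughly half the smoothed RTT in tcp_now ticks.
	 */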
1332 | |
1333 | /* If the ack has ECN CE bit, then cwnd has to be adjusted */ |
1334 | if ((TCP_ACC_ECN_ON(tp) && tp->t_delta_ce_packets > 0) || |
1335 | (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE))) { |
1336 | return 0; |
1337 | } |
1338 | if (TSTMP_SUPPORTED(tp)) { |
1339 | if (rxtime > 0 && (to->to_flags & TOF_TS) && to->to_tsecr != 0 && |
1340 | TSTMP_LT(to->to_tsecr, rxtime)) { |
1341 | return 1; |
1342 | } |
1343 | } else { |
1344 | if ((tp->t_rxtshift == 1 || (tp->t_flagsext & TF_SENT_TLPROBE)) && |
1345 | rxtime > 0) { |
1346 | tdiff = (int32_t)(tcp_now - rxtime); |
1347 | if (tdiff < bad_rexmt_win) { |
1348 | return 1; |
1349 | } |
1350 | } |
1351 | } |
1352 | return 0; |
1353 | } |
1354 | |
1355 | |
1356 | /* |
1357 | * Restore congestion window state if a spurious timeout |
1358 | * was detected. |
1359 | */ |
1360 | static void |
1361 | tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th) |
1362 | { |
1363 | if (TSTMP_SUPPORTED(tp)) { |
1364 | u_int32_t fsize, acked; |
1365 | fsize = tp->snd_max - th->th_ack; |
1366 | acked = BYTES_ACKED(th, tp); |
1367 | |
1368 | /* |
1369 | * Implement bad retransmit recovery as |
1370 | * described in RFC 4015. |
1371 | */ |
1372 | tp->snd_ssthresh = tp->snd_ssthresh_prev; |
1373 | |
1374 | /* Initialize cwnd to the initial window */ |
1375 | if (CC_ALGO(tp)->cwnd_init != NULL) { |
1376 | CC_ALGO(tp)->cwnd_init(tp); |
1377 | } |
1378 | |
		tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
1380 | } else { |
1381 | tp->snd_cwnd = tp->snd_cwnd_prev; |
1382 | tp->snd_ssthresh = tp->snd_ssthresh_prev; |
1383 | if (tp->t_flags & TF_WASFRECOVERY) { |
1384 | ENTER_FASTRECOVERY(tp); |
1385 | } |
1386 | |
1387 | /* Do not use the loss flight size in this case */ |
1388 | tp->t_lossflightsize = 0; |
1389 | } |
	tp->snd_cwnd = max(tp->snd_cwnd, tcp_initial_cwnd(tp));
1391 | tp->snd_recover = tp->snd_recover_prev; |
1392 | tp->snd_nxt = tp->snd_max; |
1393 | |
1394 | /* Fix send socket buffer to reflect the change in cwnd */ |
1395 | tcp_bad_rexmt_fix_sndbuf(tp); |
1396 | |
1397 | /* |
1398 | * This RTT might reflect the extra delay induced |
1399 | * by the network. Skip using this sample for RTO |
1400 | * calculation and mark the connection so we can |
1401 | * recompute RTT when the next eligible sample is |
1402 | * found. |
1403 | */ |
1404 | tp->t_flagsext |= TF_RECOMPUTE_RTT; |
1405 | tp->t_badrexmt_time = tcp_now; |
1406 | tp->t_rtttime = 0; |
1407 | } |
1408 | |
1409 | /* |
 * If the previous packet was sent by the retransmission timer and it was
 * not needed, then restore the congestion window to the state before that
 * transmission.
1413 | * |
1414 | * If the last packet was sent in tail loss probe timeout, check if that |
1415 | * recovered the last packet. If so, that will indicate a real loss and |
1416 | * the congestion window needs to be lowered. |
1417 | */ |
1418 | static void |
1419 | tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) |
1420 | { |
1421 | if (tp->t_rxtshift > 0 && |
	    tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
1423 | ++tcpstat.tcps_sndrexmitbad; |
1424 | tcp_bad_rexmt_restore_state(tp, th); |
		tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
1426 | } else if ((tp->t_flagsext & TF_SENT_TLPROBE) && tp->t_tlphighrxt > 0 && |
1427 | SEQ_GEQ(th->th_ack, tp->t_tlphighrxt) && |
	    !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
1429 | /* |
1430 | * The tail loss probe recovered the last packet and |
1431 | * we need to adjust the congestion window to take |
1432 | * this loss into account. |
1433 | */ |
1434 | ++tcpstat.tcps_tlp_recoverlastpkt; |
1435 | if (!IN_FASTRECOVERY(tp)) { |
1436 | tcp_reduce_congestion_window(tp); |
1437 | EXIT_FASTRECOVERY(tp); |
1438 | } |
		tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
1440 | } else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) { |
1441 | /* |
1442 | * All of the retransmitted segments were duplicated, this |
1443 | * can be an indication of bad fast retransmit. |
1444 | */ |
1445 | tcpstat.tcps_dsack_badrexmt++; |
1446 | tcp_bad_rexmt_restore_state(tp, th); |
		tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
1448 | tcp_rxtseg_clean(tp); |
1449 | } |
1450 | tp->t_flagsext &= ~(TF_SENT_TLPROBE); |
1451 | tp->t_tlphighrxt = 0; |
1452 | tp->t_tlpstart = 0; |
1453 | |
1454 | /* |
1455 | * check if the latest ack was for a segment sent during PMTU |
1456 | * blackhole detection. If the timestamp on the ack is before |
1457 | * PMTU blackhole detection, then revert the size of the max |
1458 | * segment to previous size. |
1459 | */ |
1460 | if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) && |
1461 | tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) { |
1462 | if ((to->to_flags & TOF_TS) && to->to_tsecr != 0 |
1463 | && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) { |
1464 | tcp_pmtud_revert_segment_size(tp); |
1465 | } |
1466 | } |
1467 | if (tp->t_pmtud_start_ts > 0) { |
1468 | tp->t_pmtud_start_ts = 0; |
1469 | } |
1470 | |
1471 | tp->t_pmtud_lastseg_size = 0; |
1472 | } |
1473 | |
1474 | /* |
1475 | * Check if early retransmit can be attempted according to RFC 5827. |
1476 | * |
1477 | * If packet reordering is detected on a connection, fast recovery will |
1478 | * be delayed until it is clear that the packet was lost and not reordered. |
1479 | * But reordering detection is done only when SACK is enabled. |
1480 | * |
1481 | * On connections that do not support SACK, there is a limit on the number |
1482 | * of early retransmits that can be done per minute. This limit is needed |
1483 | * to make sure that too many packets are not retransmitted when there is |
1484 | * packet reordering. |
1485 | */ |
1486 | static void |
1487 | tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th) |
1488 | { |
1489 | u_int32_t obytes, snd_off; |
1490 | int32_t snd_len; |
1491 | struct socket *so = tp->t_inpcb->inp_socket; |
1492 | |
1493 | if ((SACK_ENABLED(tp) || tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) && |
1494 | SEQ_GT(tp->snd_max, tp->snd_una) && |
1495 | (tp->t_dupacks == 1 || (SACK_ENABLED(tp) && !TAILQ_EMPTY(&tp->snd_holes)))) { |
1496 | /* |
1497 | * If there are only a few outstanding |
1498 | * segments on the connection, we might need |
1499 | * to lower the retransmit threshold. This |
1500 | * will allow us to do Early Retransmit as |
1501 | * described in RFC 5827. |
1502 | */ |
1503 | if (SACK_ENABLED(tp) && |
1504 | !TAILQ_EMPTY(&tp->snd_holes)) { |
1505 | obytes = (tp->snd_max - tp->snd_fack) + |
1506 | tp->sackhint.sack_bytes_rexmit; |
1507 | } else { |
1508 | obytes = (tp->snd_max - tp->snd_una); |
1509 | } |
1510 | |
1511 | /* |
1512 | * In order to lower retransmit threshold the |
1513 | * following two conditions must be met. |
1514 | * 1. the amount of outstanding data is less |
1515 | * than 4*SMSS bytes |
1516 | * 2. there is no unsent data ready for |
1517 | * transmission or the advertised window |
1518 | * will limit sending new segments. |
1519 | */ |
1520 | snd_off = tp->snd_max - tp->snd_una; |
		snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
1522 | if (obytes < (tp->t_maxseg << 2) && |
1523 | snd_len <= 0) { |
1524 | u_int32_t osegs; |
1525 | |
1526 | osegs = obytes / tp->t_maxseg; |
1527 | if ((osegs * tp->t_maxseg) < obytes) { |
1528 | osegs++; |
1529 | } |
1530 | |
1531 | /* |
1532 | * Since the connection might have already |
1533 | * received some dupacks, we add them to |
* the outstanding segments count to get
1535 | * the correct retransmit threshold. |
1536 | * |
1537 | * By checking for early retransmit after |
1538 | * receiving some duplicate acks when SACK |
1539 | * is supported, the connection will |
1540 | * enter fast recovery even if multiple |
1541 | * segments are lost in the same window. |
1542 | */ |
1543 | osegs += tp->t_dupacks; |
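/*
 * RFC 5827 Early Retransmit: with fewer than four segments
 * (counting the dupacks above) outstanding, the standard
 * three-dupack threshold may never be reached, so lower it to
 * roughly (outstanding segments - 1), clamped between the
 * dupacks already seen and the default tcprexmtthresh.
 */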
1544 | if (osegs < 4) { |
1545 | tp->t_rexmtthresh = |
1546 | ((osegs - 1) > 1) ? ((uint8_t)osegs - 1) : 1; |
1547 | tp->t_rexmtthresh = |
1548 | MIN(tp->t_rexmtthresh, tcprexmtthresh); |
1549 | tp->t_rexmtthresh = |
1550 | MAX(tp->t_rexmtthresh, |
1551 | tp->t_dupacks > UINT8_MAX ? UINT8_MAX : (uint8_t)tp->t_dupacks); |
1552 | |
1553 | if (tp->t_early_rexmt_count == 0) { |
1554 | tp->t_early_rexmt_win = tcp_now; |
1555 | } |
1556 | |
1557 | if (tp->t_flagsext & TF_SENT_TLPROBE) { |
1558 | tcpstat.tcps_tlp_recovery++; |
tcp_ccdbg_trace(tp, th,
TCP_CC_TLP_RECOVERY);
1561 | } else { |
1562 | tcpstat.tcps_early_rexmt++; |
1563 | tp->t_early_rexmt_count++; |
tcp_ccdbg_trace(tp, th,
TCP_CC_EARLY_RETRANSMIT);
1566 | } |
1567 | } |
1568 | } |
1569 | } |
1570 | |
1571 | /* |
1572 | * If we ever sent a TLP probe, the acknowledgement will trigger |
1573 | * early retransmit because the value of snd_fack will be close |
1574 | * to snd_max. This will take care of adjustments to the |
* congestion window. So we can reset the TF_SENT_TLPROBE flag.
1576 | */ |
1577 | tp->t_flagsext &= ~(TF_SENT_TLPROBE); |
1578 | tp->t_tlphighrxt = 0; |
1579 | tp->t_tlpstart = 0; |
1580 | } |
1581 | |
1582 | static boolean_t |
1583 | tcp_tfo_syn(struct tcpcb *tp, struct tcpopt *to) |
1584 | { |
1585 | u_char out[CCAES_BLOCK_SIZE]; |
1586 | unsigned char len; |
1587 | |
1588 | if (!(to->to_flags & (TOF_TFO | TOF_TFOREQ)) || |
1589 | !(tcp_fastopen & TCP_FASTOPEN_SERVER)) { |
1590 | return FALSE; |
1591 | } |
1592 | |
1593 | if ((to->to_flags & TOF_TFOREQ)) { |
1594 | tp->t_tfo_flags |= TFO_F_OFFER_COOKIE; |
1595 | |
1596 | tp->t_tfo_stats |= TFO_S_COOKIEREQ_RECV; |
1597 | tcpstat.tcps_tfo_cookie_req_rcv++; |
1598 | return FALSE; |
1599 | } |
1600 | |
1601 | /* Ok, then it must be an offered cookie. We need to check that ... */ |
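/*
 * tcp_tfo_gen_cookie() regenerates the cookie we would hand out for
 * this peer. to_tfo is expected to point at the TFO option's length
 * byte, so the offered cookie's length is the option length minus
 * TCPOLEN_FASTOPEN_REQ and its bytes follow immediately after.
 */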
tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
1603 | |
1604 | len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ; |
1605 | to->to_tfo++; |
if (memcmp(out, to->to_tfo, len)) {
1607 | /* Cookies are different! Let's return and offer a new cookie */ |
1608 | tp->t_tfo_flags |= TFO_F_OFFER_COOKIE; |
1609 | |
1610 | tp->t_tfo_stats |= TFO_S_COOKIE_INVALID; |
1611 | tcpstat.tcps_tfo_cookie_invalid++; |
1612 | return FALSE; |
1613 | } |
1614 | |
1615 | if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) { |
1616 | /* Need to decrement again as we just increased it... */ |
1617 | OSDecrementAtomic(&tcp_tfo_halfcnt); |
1618 | return FALSE; |
1619 | } |
1620 | |
1621 | tp->t_tfo_flags |= TFO_F_COOKIE_VALID; |
1622 | |
1623 | tp->t_tfo_stats |= TFO_S_SYNDATA_RCV; |
1624 | tcpstat.tcps_tfo_syn_data_rcv++; |
1625 | |
1626 | return TRUE; |
1627 | } |
1628 | |
1629 | static void |
1630 | tcp_tfo_synack(struct tcpcb *tp, struct tcpopt *to) |
1631 | { |
1632 | if (to->to_flags & TOF_TFO) { |
1633 | unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ; |
1634 | |
1635 | /* |
1636 | * If this happens, things have gone terribly wrong. len should |
1637 | * have been checked in tcp_dooptions. |
1638 | */ |
1639 | VERIFY(len <= TFO_COOKIE_LEN_MAX); |
1640 | |
1641 | to->to_tfo++; |
1642 | |
tcp_cache_set_cookie(tp, to->to_tfo, len);
1644 | tcp_heuristic_tfo_success(tp); |
1645 | |
1646 | tp->t_tfo_stats |= TFO_S_COOKIE_RCV; |
1647 | tcpstat.tcps_tfo_cookie_rcv++; |
1648 | if (tp->t_tfo_flags & TFO_F_COOKIE_SENT) { |
1649 | tcpstat.tcps_tfo_cookie_wrong++; |
1650 | tp->t_tfo_stats |= TFO_S_COOKIE_WRONG; |
1651 | } |
1652 | } else { |
1653 | /* |
1654 | * Thus, no cookie in the response, but we either asked for one |
1655 | * or sent SYN+DATA. Now, we need to check whether we had to |
1656 | * rexmit the SYN. If that's the case, it's better to start |
* backing off TFO-cookie requests.
1658 | */ |
1659 | if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && |
1660 | tp->t_tfo_flags & TFO_F_SYN_LOSS) { |
1661 | tp->t_tfo_stats |= TFO_S_SYN_LOSS; |
1662 | tcpstat.tcps_tfo_syn_loss++; |
1663 | |
1664 | tcp_heuristic_tfo_loss(tp); |
1665 | } else { |
1666 | if (tp->t_tfo_flags & TFO_F_COOKIE_REQ) { |
1667 | tp->t_tfo_stats |= TFO_S_NO_COOKIE_RCV; |
1668 | tcpstat.tcps_tfo_no_cookie_rcv++; |
1669 | } |
1670 | |
1671 | tcp_heuristic_tfo_success(tp); |
1672 | } |
1673 | } |
1674 | } |
1675 | |
1676 | static void |
1677 | tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen) |
1678 | { |
1679 | if (tlen != 0) { |
1680 | return; |
1681 | } |
1682 | |
1683 | tp->t_tfo_probe_state = TFO_PROBE_PROBING; |
1684 | |
1685 | /* |
1686 | * We send the probe out rather quickly (after one RTO). It does not |
1687 | * really hurt that much, it's only one additional segment on the wire. |
1688 | */ |
1689 | tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp))); |
1690 | } |
1691 | |
1692 | static void |
1693 | tcp_tfo_rcv_data(struct tcpcb *tp) |
1694 | { |
1695 | /* Transition from PROBING to NONE as data has been received */ |
1696 | if (tp->t_tfo_probe_state >= TFO_PROBE_PROBING) { |
1697 | tp->t_tfo_probe_state = TFO_PROBE_NONE; |
1698 | } |
1699 | } |
1700 | |
1701 | static void |
1702 | tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th) |
1703 | { |
1704 | if (tp->t_tfo_probe_state == TFO_PROBE_PROBING && |
1705 | tp->t_tfo_probes > 0) { |
1706 | if (th->th_seq == tp->rcv_nxt) { |
1707 | /* No hole, so stop probing */ |
1708 | tp->t_tfo_probe_state = TFO_PROBE_NONE; |
1709 | } else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) { |
1710 | /* There is a hole! Wait a bit for data... */ |
1711 | tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA; |
1712 | tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, |
1713 | TCP_REXMTVAL(tp)); |
1714 | } |
1715 | } |
1716 | } |
1717 | |
1718 | /* |
1719 | * Update snd_wnd information. |
1720 | */ |
1721 | static inline bool |
1722 | tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th, |
1723 | u_int32_t tiwin, int tlen) |
1724 | { |
1725 | /* Don't look at the window if there is no ACK flag */ |
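/*
 * RFC 793/9293 window-update check: accept the advertised window
 * only from a segment that is not older than the one that last
 * updated it, i.e. snd_wl1 < th_seq, or snd_wl1 == th_seq and
 * snd_wl2 < th_ack, or both match and the advertised window grew.
 */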
1726 | if ((thflags & TH_ACK) && |
1727 | (SEQ_LT(tp->snd_wl1, th->th_seq) || |
1728 | (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) || |
1729 | (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) { |
1730 | /* keep track of pure window updates */ |
1731 | if (tlen == 0 && |
1732 | tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) { |
1733 | tcpstat.tcps_rcvwinupd++; |
1734 | } |
1735 | tp->snd_wnd = tiwin; |
1736 | tp->snd_wl1 = th->th_seq; |
1737 | tp->snd_wl2 = th->th_ack; |
1738 | if (tp->snd_wnd > tp->max_sndwnd) { |
1739 | tp->max_sndwnd = tp->snd_wnd; |
1740 | } |
1741 | |
1742 | if (tp->t_inpcb->inp_socket->so_flags & SOF_MP_SUBFLOW) { |
1743 | mptcp_update_window_wakeup(tp); |
1744 | } |
1745 | return true; |
1746 | } |
1747 | return false; |
1748 | } |
1749 | |
1750 | static void |
1751 | tcp_handle_wakeup(struct socket *so, int read_wakeup, int write_wakeup) |
1752 | { |
1753 | if (read_wakeup != 0) { |
1754 | sorwakeup(so); |
1755 | } |
1756 | if (write_wakeup != 0) { |
1757 | sowwakeup(so); |
1758 | } |
1759 | } |
1760 | |
1761 | static void |
1762 | tcp_update_snd_una(struct tcpcb *tp, uint32_t ack) |
1763 | { |
1764 | tp->snd_una = ack; |
1765 | if (SACK_ENABLED(tp) && SEQ_LT(tp->send_highest_sack, tp->snd_una)) { |
1766 | tp->send_highest_sack = tp->snd_una; |
1767 | |
1768 | /* If we move our marker, we need to start fresh */ |
1769 | tp->t_new_dupacks = 0; |
1770 | } |
1771 | } |
1772 | |
1773 | static bool |
1774 | tcp_syn_data_valid(struct tcpcb *tp, struct tcphdr *tcp_hdr, int tlen) |
1775 | { |
1776 | /* No data? */ |
1777 | if (tlen <= 0) { |
1778 | return false; |
1779 | } |
1780 | |
1781 | /* Not the right sequence-number? */ |
1782 | if (tcp_hdr->th_seq != tp->irs) { |
1783 | return false; |
1784 | } |
1785 | |
/* If more than 2GB has already been received, the sequence numbers may have wrapped and th_seq == irs would be ambiguous */
1787 | if (tp->t_inpcb->inp_stat->rxbytes > INT32_MAX) { |
1788 | return false; |
1789 | } |
1790 | |
1791 | return true; |
1792 | } |
1793 | |
1794 | /* Process IP-ECN codepoints on received packets and update receive side counters */ |
1795 | static void |
1796 | tcp_input_ip_ecn(struct tcpcb *tp, struct inpcb *inp, uint32_t tlen, uint32_t segment_count, uint8_t ip_ecn) |
1797 | { |
1798 | switch (ip_ecn) { |
1799 | case IPTOS_ECN_ECT1: |
1800 | tp->ecn_flags |= TE_ACO_ECT1; |
1801 | tp->t_rcv_ect1_bytes += tlen; |
1802 | break; |
1803 | case IPTOS_ECN_ECT0: |
1804 | tp->ecn_flags |= TE_ACO_ECT0; |
1805 | tp->t_rcv_ect0_bytes += tlen; |
1806 | break; |
1807 | case IPTOS_ECN_CE: |
1808 | tp->t_rcv_ce_packets += segment_count; |
1809 | tp->t_rcv_ce_bytes += tlen; |
1810 | tp->t_ecn_recv_ce++; |
1811 | tcpstat.tcps_ecn_recv_ce++; |
1812 | INP_INC_IFNET_STAT(inp, ecn_recv_ce); |
1813 | break; |
1814 | default: |
1815 | /* No counter for Not-ECT */ |
1816 | break; |
1817 | } |
1818 | } |
1819 | |
1820 | /* Process SYN packet that wishes to negotiate Accurate ECN */ |
1821 | static void |
1822 | tcp_input_process_accecn_syn(struct tcpcb *tp, int ace_flags, uint8_t ip_ecn) |
1823 | { |
1824 | switch (ace_flags) { |
1825 | case (0 | 0 | 0): |
1826 | /* No ECN */ |
1827 | tp->t_server_accecn_state = tcp_connection_server_no_ecn_requested; |
1828 | break; |
1829 | case (0 | TH_CWR | TH_ECE): |
1830 | /* Legacy ECN-setup */ |
1831 | tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT); |
1832 | tp->t_server_accecn_state = tcp_connection_server_classic_ecn_requested; |
1833 | break; |
1834 | case (TH_ACE): |
1835 | /* Accurate ECN */ |
1836 | if (TCP_ACC_ECN_ENABLED(tp)) { |
1837 | switch (ip_ecn) { |
1838 | case IPTOS_ECN_NOTECT: |
1839 | tp->ecn_flags |= TE_ACE_SETUP_NON_ECT; |
1840 | break; |
1841 | case IPTOS_ECN_ECT1: |
1842 | tp->ecn_flags |= TE_ACE_SETUP_ECT1; |
1843 | break; |
1844 | case IPTOS_ECN_ECT0: |
1845 | tp->ecn_flags |= TE_ACE_SETUP_ECT0; |
1846 | break; |
1847 | case IPTOS_ECN_CE: |
1848 | tp->ecn_flags |= TE_ACE_SETUP_CE; |
1849 | break; |
1850 | } |
1851 | /* |
1852 | * We are not yet committing to send IP ECT packets when |
1853 | * Accurate ECN is enabled |
1854 | */ |
1855 | tp->ecn_flags |= (TE_ACE_SETUPRECEIVED); |
1856 | |
1857 | /* Initialize ECT byte counter to 1 to distinguish zeroing of options */ |
1858 | tp->t_rcv_ect1_bytes = tp->t_rcv_ect0_bytes = 1; |
1859 | tp->t_snd_ect1_bytes = tp->t_snd_ect0_bytes = 1; |
1860 | tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_requested; |
1861 | } else { |
1862 | /* |
1863 | * If AccECN is not enabled, ignore |
1864 | * the TH_AE bit and do Legacy ECN-setup |
1865 | */ |
1866 | tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT); |
1867 | } |
break;
default:
1869 | /* Forward Compatibility */ |
1870 | /* Accurate ECN */ |
1871 | if (TCP_ACC_ECN_ENABLED(tp)) { |
1872 | switch (ip_ecn) { |
1873 | case IPTOS_ECN_NOTECT: |
1874 | tp->ecn_flags |= TE_ACE_SETUP_NON_ECT; |
1875 | break; |
1876 | case IPTOS_ECN_ECT1: |
1877 | tp->ecn_flags |= TE_ACE_SETUP_ECT1; |
1878 | break; |
1879 | case IPTOS_ECN_ECT0: |
1880 | tp->ecn_flags |= TE_ACE_SETUP_ECT0; |
1881 | break; |
1882 | case IPTOS_ECN_CE: |
1883 | tp->ecn_flags |= TE_ACE_SETUP_CE; |
1884 | break; |
1885 | } |
1886 | /* |
1887 | * We are not yet committing to send IP ECT packets when |
1888 | * Accurate ECN is enabled |
1889 | */ |
1890 | tp->ecn_flags |= (TE_ACE_SETUPRECEIVED); |
1891 | |
1892 | /* Initialize ECT byte counter to 1 to distinguish zeroing of options */ |
1893 | tp->t_rcv_ect1_bytes = tp->t_rcv_ect0_bytes = 1; |
1894 | tp->t_snd_ect1_bytes = tp->t_snd_ect0_bytes = 1; |
1895 | tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_requested; |
1896 | } |
1897 | break; |
1898 | } |
1899 | } |
1900 | |
1901 | void |
1902 | tcp_input(struct mbuf *m, int off0) |
1903 | { |
1904 | int exiting_fr = 0; |
1905 | struct tcphdr *th; |
1906 | struct ip *ip = NULL; |
1907 | struct inpcb *inp; |
1908 | u_char *optp = NULL; |
1909 | int optlen = 0; |
1910 | int tlen, off; |
1911 | int drop_hdrlen; |
1912 | struct tcpcb *tp = 0; |
1913 | int thflags; |
1914 | struct socket *so = 0; |
1915 | int todrop, acked, ourfinisacked, needoutput = 0; |
1916 | int read_wakeup = 0; |
1917 | int write_wakeup = 0; |
1918 | struct in_addr laddr; |
1919 | struct in6_addr laddr6; |
1920 | int dropsocket = 0; |
1921 | int iss = 0, nosock = 0; |
1922 | u_int32_t tiwin, sack_bytes_acked = 0, sack_bytes_newly_acked = 0; |
1923 | struct tcpopt to; /* options in this segment */ |
1924 | #if TCPDEBUG |
1925 | short ostate = 0; |
1926 | #endif |
1927 | u_char ip_ecn = IPTOS_ECN_NOTECT; |
1928 | unsigned int ifscope; |
1929 | uint8_t isconnected, isdisconnected; |
1930 | struct ifnet *ifp = m->m_pkthdr.rcvif; |
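/*
 * seg_cnt is the number of TCP segments coalesced into this mbuf
 * chain (e.g. by LRO/GRO); default to a single segment when the
 * driver did not set it.
 */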
1931 | int segment_count = m->m_pkthdr.seg_cnt ? : 1; |
1932 | int win; |
1933 | u_int16_t pf_tag = 0; |
1934 | #if MPTCP |
1935 | struct mptcb *mp_tp = NULL; |
1936 | #endif /* MPTCP */ |
1937 | boolean_t cell = IFNET_IS_CELLULAR(ifp); |
1938 | boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp)); |
1939 | boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp)); |
1940 | boolean_t recvd_dsack = FALSE; |
1941 | struct tcp_respond_args tra; |
1942 | int prev_t_state; |
1943 | boolean_t check_cfil = cfil_filter_present(); |
1944 | bool findpcb_iterated = false; |
1945 | /* |
1946 | * The mbuf may be freed after it has been added to the receive socket |
1947 | * buffer or the reassembly queue, so we reinitialize th to point to a |
1948 | * safe copy of the TCP header |
1949 | */ |
1950 | struct tcphdr saved_tcphdr = {}; |
1951 | /* |
1952 | * Save copy of the IPv4/IPv6 header. |
1953 | * Note: use array of uint32_t to silence compiler warning when casting |
1954 | * to a struct ip6_hdr pointer. |
1955 | */ |
1956 | #define MAX_IPWORDS ((sizeof(struct ip) + MAX_IPOPTLEN) / sizeof(uint32_t)) |
1957 | uint32_t saved_hdr[MAX_IPWORDS]; |
1958 | |
1959 | #define TCP_INC_VAR(stat, npkts) do { \ |
1960 | stat += npkts; \ |
1961 | } while (0) |
1962 | |
1963 | if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) { |
1964 | segment_count = 1; |
1965 | } |
1966 | TCP_INC_VAR(tcpstat.tcps_rcvtotal, segment_count); |
1967 | |
1968 | struct ip6_hdr *ip6 = NULL; |
1969 | int isipv6; |
1970 | struct proc *kernel_proc = current_proc(); |
1971 | |
1972 | KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0); |
1973 | |
1974 | isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; |
bzero((char *)&to, sizeof(to));
1976 | |
1977 | m_add_crumb(m, PKT_CRUMB_TCP_INPUT); |
1978 | |
1979 | if (m->m_flags & M_PKTHDR) { |
1980 | pf_tag = m_pftag(m)->pftag_tag; |
1981 | } |
1982 | |
1983 | if (isipv6) { |
1984 | /* |
1985 | * Expect 32-bit aligned data pointer on |
1986 | * strict-align platforms |
1987 | */ |
1988 | MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); |
1989 | |
1990 | /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */ |
1991 | ip6 = mtod(m, struct ip6_hdr *); |
1992 | tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0; |
1993 | th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0); |
1994 | |
1995 | if (tcp_input_checksum(AF_INET6, m, th, off0, tlen)) { |
1996 | TCP_LOG_DROP_PKT(ip6, th, ifp, "IPv6 bad tcp checksum" ); |
1997 | goto dropnosock; |
1998 | } |
1999 | |
2000 | KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), |
2001 | (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])), |
2002 | th->th_seq, th->th_ack, th->th_win); |
2003 | /* |
2004 | * Be proactive about unspecified IPv6 address in source. |
2005 | * As we use all-zero to indicate unbounded/unconnected pcb, |
2006 | * unspecified IPv6 address can be used to confuse us. |
2007 | * |
* Note that packets with an unspecified IPv6 destination are
2009 | * already dropped in ip6_input. |
2010 | */ |
2011 | if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { |
2012 | /* XXX stat */ |
2013 | IF_TCP_STATINC(ifp, unspecv6); |
2014 | TCP_LOG_DROP_PKT(ip6, th, ifp, "src IPv6 address unspecified" ); |
2015 | goto dropnosock; |
2016 | } |
2017 | DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, |
2018 | struct ip6_hdr *, ip6, struct tcpcb *, NULL, |
2019 | struct tcphdr *, th); |
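/*
 * In the host-order ip6_flow word the Traffic Class occupies bits
 * 20-27, so shifting right by 20 leaves the two ECN bits in the
 * low-order positions for the mask below.
 */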
2020 | |
2021 | ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; |
2022 | } else { |
2023 | /* |
2024 | * Get IP and TCP header together in first mbuf. |
2025 | * Note: IP leaves IP header in first mbuf. |
2026 | */ |
2027 | if (off0 > sizeof(struct ip)) { |
2028 | ip_stripoptions(m); |
2029 | off0 = sizeof(struct ip); |
2030 | } |
2031 | if (m->m_len < sizeof(struct tcpiphdr)) { |
2032 | if ((m = m_pullup(m, sizeof(struct tcpiphdr))) == 0) { |
2033 | tcpstat.tcps_rcvshort++; |
2034 | return; |
2035 | } |
2036 | } |
2037 | |
2038 | /* Expect 32-bit aligned data pointer on strict-align platforms */ |
2039 | MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m); |
2040 | |
2041 | ip = mtod(m, struct ip *); |
2042 | th = (struct tcphdr *)(void *)((caddr_t)ip + off0); |
2043 | tlen = ip->ip_len; |
2044 | |
2045 | if (tcp_input_checksum(AF_INET, m, th, off0, tlen)) { |
2046 | TCP_LOG_DROP_PKT(ip, th, ifp, "IPv4 bad tcp checksum" ); |
2047 | goto dropnosock; |
2048 | } |
2049 | |
2050 | /* Re-initialization for later version check */ |
2051 | ip->ip_v = IPVERSION; |
2052 | ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK); |
2053 | |
2054 | DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL, |
2055 | struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th); |
2056 | |
2057 | KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport), |
2058 | (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), |
2059 | th->th_seq, th->th_ack, th->th_win); |
2060 | } |
2061 | |
2062 | #define TCP_LOG_HDR (isipv6 ? (void *)ip6 : (void *)ip) |
2063 | |
2064 | /* |
2065 | * Check that TCP offset makes sense, |
2066 | * pull out TCP options and adjust length. |
2067 | */ |
2068 | off = th->th_off << 2; |
2069 | if (off < sizeof(struct tcphdr) || off > tlen) { |
2070 | tcpstat.tcps_rcvbadoff++; |
2071 | IF_TCP_STATINC(ifp, badformat); |
2072 | TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "bad tcp offset" ); |
2073 | goto dropnosock; |
2074 | } |
2075 | tlen -= off; /* tlen is used instead of ti->ti_len */ |
2076 | if (off > sizeof(struct tcphdr)) { |
2077 | if (isipv6) { |
2078 | IP6_EXTHDR_CHECK(m, off0, off, return ); |
2079 | ip6 = mtod(m, struct ip6_hdr *); |
2080 | th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0); |
2081 | } else { |
2082 | if (m->m_len < sizeof(struct ip) + off) { |
2083 | if ((m = m_pullup(m, sizeof(struct ip) + off)) == 0) { |
2084 | tcpstat.tcps_rcvshort++; |
2085 | return; |
2086 | } |
2087 | ip = mtod(m, struct ip *); |
2088 | th = (struct tcphdr *)(void *)((caddr_t)ip + off0); |
2089 | } |
2090 | } |
2091 | optlen = off - sizeof(struct tcphdr); |
2092 | optp = (u_char *)(th + 1); |
2093 | /* |
2094 | * Do quick retrieval of timestamp options ("options |
2095 | * prediction?"). If timestamp is the only option and it's |
2096 | * formatted as recommended in RFC 1323 appendix A, we |
2097 | * quickly get the values now and not bother calling |
2098 | * tcp_dooptions(), etc. |
2099 | */ |
2100 | if ((optlen == TCPOLEN_TSTAMP_APPA || |
2101 | (optlen > TCPOLEN_TSTAMP_APPA && |
2102 | optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) && |
2103 | *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) && |
2104 | (th->th_flags & TH_SYN) == 0) { |
2105 | to.to_flags |= TOF_TS; |
2106 | to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4)); |
2107 | to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8)); |
2108 | optp = NULL; /* we've parsed the options */ |
2109 | } |
2110 | } |
2111 | thflags = th->th_flags; |
2112 | |
2113 | /* |
2114 | * Drop all packets with both the SYN and FIN bits set. |
2115 | * This prevents e.g. nmap from identifying the TCP/IP stack. |
2116 | * |
2117 | * This is a violation of the TCP specification. |
2118 | */ |
2119 | if ((thflags & (TH_SYN | TH_FIN)) == (TH_SYN | TH_FIN)) { |
2120 | IF_TCP_STATINC(ifp, synfin); |
2121 | TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "drop SYN FIN" ); |
2122 | goto dropnosock; |
2123 | } |
2124 | |
2125 | /* |
2126 | * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options, |
2127 | * until after ip6_savecontrol() is called and before other functions |
2128 | * which don't want those proto headers. |
2129 | * Because ip6_savecontrol() is going to parse the mbuf to |
2130 | * search for data to be passed up to user-land, it wants mbuf |
2131 | * parameters to be unchanged. |
2132 | */ |
2133 | drop_hdrlen = off0 + off; |
2134 | |
2135 | /* Since this is an entry point for input processing of tcp packets, we |
2136 | * can update the tcp clock here. |
2137 | */ |
2138 | calculate_tcp_clock(); |
2139 | |
2140 | /* |
2141 | * Record the interface where this segment arrived on; this does not |
2142 | * affect normal data output (for non-detached TCP) as it provides a |
2143 | * hint about which route and interface to use for sending in the |
2144 | * absence of a PCB, when scoped routing (and thus source interface |
2145 | * selection) are enabled. |
2146 | */ |
2147 | if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL) { |
2148 | ifscope = IFSCOPE_NONE; |
2149 | } else { |
2150 | ifscope = m->m_pkthdr.rcvif->if_index; |
2151 | } |
2152 | |
2153 | /* |
2154 | * Convert TCP protocol specific fields to host format. |
2155 | */ |
2156 | |
2157 | #if BYTE_ORDER != BIG_ENDIAN |
2158 | NTOHL(th->th_seq); |
2159 | NTOHL(th->th_ack); |
2160 | NTOHS(th->th_win); |
2161 | NTOHS(th->th_urp); |
2162 | #endif |
2163 | |
2164 | /* |
2165 | * Locate pcb for segment. |
2166 | */ |
2167 | findpcb: |
2168 | |
2169 | isconnected = FALSE; |
2170 | isdisconnected = FALSE; |
2171 | |
2172 | if (isipv6) { |
2173 | inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, ip6_input_getsrcifscope(m), |
2174 | &ip6->ip6_dst, th->th_dport, ip6_input_getdstifscope(m), 1, |
2175 | m->m_pkthdr.rcvif); |
2176 | } else { |
2177 | inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport, |
2178 | ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif); |
2179 | } |
2180 | |
2181 | /* |
2182 | * Use the interface scope information from the PCB for outbound |
2183 | * segments. If the PCB isn't present and if scoped routing is |
2184 | * enabled, tcp_respond will use the scope of the interface where |
2185 | * the segment arrived on. |
2186 | */ |
2187 | if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) { |
2188 | ifscope = inp->inp_boundifp->if_index; |
2189 | } |
2190 | |
2191 | /* |
2192 | * If the state is CLOSED (i.e., TCB does not exist) then |
2193 | * all data in the incoming segment is discarded. |
2194 | * If the TCB exists but is in CLOSED state, it is embryonic, |
2195 | * but should either do a listen or a connect soon. |
2196 | */ |
2197 | if (inp == NULL) { |
2198 | if (log_in_vain) { |
2199 | char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN]; |
2200 | |
2201 | if (isipv6) { |
2202 | inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf)); |
2203 | inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf)); |
2204 | } else { |
2205 | inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf)); |
2206 | inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf)); |
2207 | } |
2208 | switch (log_in_vain) { |
2209 | case 1: |
2210 | if (thflags & TH_SYN) { |
2211 | log(LOG_INFO, |
2212 | "Connection attempt to TCP %s:%d from %s:%d\n" , |
2213 | dbuf, ntohs(th->th_dport), |
2214 | sbuf, |
2215 | ntohs(th->th_sport)); |
2216 | } |
2217 | break; |
2218 | case 2: |
2219 | log(LOG_INFO, |
2220 | "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n" , |
2221 | dbuf, ntohs(th->th_dport), sbuf, |
2222 | ntohs(th->th_sport), thflags); |
2223 | break; |
2224 | case 3: |
2225 | case 4: |
2226 | if ((thflags & TH_SYN) && !(thflags & TH_ACK) && |
2227 | !(m->m_flags & (M_BCAST | M_MCAST)) && |
2228 | ((isipv6 && !in6_are_addr_equal_scoped(&ip6->ip6_dst, &ip6->ip6_src, ip6_input_getdstifscope(m), ip6_input_getsrcifscope(m))) || |
2229 | (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))) { |
2230 | log_in_vain_log((LOG_INFO, |
2231 | "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n" , |
2232 | dbuf, ntohs(th->th_dport), |
2233 | sbuf, |
2234 | ntohs(th->th_sport))); |
2235 | } |
2236 | break; |
2237 | default: |
2238 | break; |
2239 | } |
2240 | } |
2241 | if (blackhole) { |
2242 | if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) { |
2243 | switch (blackhole) { |
2244 | case 1: |
2245 | if (thflags & TH_SYN) { |
2246 | TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 1 syn for closed port" ); |
2247 | goto dropnosock; |
2248 | } |
2249 | break; |
2250 | case 2: |
2251 | TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 2 closed port" ); |
2252 | goto dropnosock; |
2253 | default: |
2254 | TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole closed port" ); |
2255 | goto dropnosock; |
2256 | } |
2257 | } |
2258 | } |
2259 | IF_TCP_STATINC(ifp, noconnnolist); |
2260 | TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "closed port" ); |
2261 | goto dropwithresetnosock; |
2262 | } |
2263 | so = inp->inp_socket; |
2264 | if (so == NULL) { |
/* This case shouldn't happen, as the socket shouldn't be NULL
* unless inp_state is set to INPCB_STATE_DEAD.
* Just in case, we pretend we didn't find the socket if we hit this case,
* as this isn't cause for a panic (the socket might be leaked, however)...
2269 | */ |
2270 | inp = NULL; |
2271 | #if TEMPDEBUG |
2272 | printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n" , inp); |
2273 | #endif |
2274 | TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp_socket NULL" ); |
2275 | goto dropnosock; |
2276 | } |
2277 | |
socket_lock(so, 1);
2279 | if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { |
socket_unlock(so, 1);
2281 | inp = NULL; // pretend we didn't find it |
2282 | TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp state WNT_STOPUSING" ); |
2283 | goto dropnosock; |
2284 | } |
2285 | |
2286 | if (!isipv6 && inp->inp_faddr.s_addr != INADDR_ANY) { |
2287 | if (inp->inp_faddr.s_addr != ip->ip_src.s_addr || |
2288 | inp->inp_laddr.s_addr != ip->ip_dst.s_addr || |
2289 | inp->inp_fport != th->th_sport || |
2290 | inp->inp_lport != th->th_dport) { |
2291 | os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n" , |
2292 | __func__, |
2293 | ntohs(inp->inp_fport), ntohs(th->th_sport), |
2294 | ntohs(inp->inp_lport), ntohs(th->th_dport)); |
2295 | if (findpcb_iterated) { |
2296 | goto drop; |
2297 | } |
2298 | findpcb_iterated = true; |
socket_unlock(so, 1);
2300 | inp = NULL; |
2301 | goto findpcb; |
2302 | } |
2303 | } else if (isipv6 && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { |
2304 | if (!in6_are_addr_equal_scoped(&inp->in6p_faddr, &ip6->ip6_src, inp->inp_fifscope, ip6_input_getsrcifscope(m)) || |
2305 | !in6_are_addr_equal_scoped(&inp->in6p_laddr, &ip6->ip6_dst, inp->inp_lifscope, ip6_input_getdstifscope(m)) || |
2306 | inp->inp_fport != th->th_sport || |
2307 | inp->inp_lport != th->th_dport) { |
2308 | os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n" , |
2309 | __func__, |
2310 | ntohs(inp->inp_fport), ntohs(th->th_sport), |
2311 | ntohs(inp->inp_lport), ntohs(th->th_dport)); |
2312 | if (findpcb_iterated) { |
2313 | goto drop; |
2314 | } |
2315 | findpcb_iterated = true; |
socket_unlock(so, 1);
2317 | inp = NULL; |
2318 | goto findpcb; |
2319 | } |
2320 | } |
2321 | |
2322 | tp = intotcpcb(inp); |
2323 | if (tp == NULL) { |
2324 | IF_TCP_STATINC(ifp, noconnlist); |
2325 | TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "tp is NULL" ); |
2326 | goto dropwithreset; |
2327 | } |
2328 | |
2329 | /* Now that we found the tcpcb, we can adjust the TCP timestamp */ |
2330 | if (to.to_flags & TOF_TS) { |
2331 | to.to_tsecr -= tp->t_ts_offset; |
2332 | } |
2333 | |
2334 | if (tp->t_state == TCPS_CLOSED) { |
2335 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "tp state TCPS_CLOSED" ); |
2336 | goto drop; |
2337 | } |
2338 | |
2339 | #if NECP |
2340 | if (so->so_state & SS_ISCONNECTED) { |
2341 | // Connected TCP sockets have a fully-bound local and remote, |
2342 | // so the policy check doesn't need to override addresses |
if (!necp_socket_is_allowed_to_send_recv(inp, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
2344 | TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false); |
2345 | IF_TCP_STATINC(ifp, badformat); |
2346 | goto drop; |
2347 | } |
2348 | } else { |
2349 | /* |
2350 | * If the proc_uuid_policy table has been updated since the last use |
2351 | * of the listening socket (i.e., the proc_uuid_policy_table_gencount |
2352 | * has been updated), the flags in the socket may be out of date. |
2353 | * If INP2_WANT_APP_POLICY is stale, inbound packets may |
2354 | * be dropped by NECP if the socket should now match a per-app |
2355 | * exception policy. |
2356 | * In order to avoid this refresh the proc_uuid_policy state to |
2357 | * potentially recalculate the socket's flags before checking |
2358 | * with NECP. |
2359 | */ |
2360 | (void) inp_update_policy(inp); |
2361 | |
2362 | if (isipv6) { |
if (!necp_socket_is_allowed_to_send_recv_v6(inp,
th->th_dport, th->th_sport, &ip6->ip6_dst,
&ip6->ip6_src, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
2366 | TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false); |
2367 | IF_TCP_STATINC(ifp, badformat); |
2368 | goto drop; |
2369 | } |
2370 | } else { |
if (!necp_socket_is_allowed_to_send_recv_v4(inp,
th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src,
ifp, pf_tag, NULL, NULL, NULL, NULL)) {
2374 | TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false); |
2375 | IF_TCP_STATINC(ifp, badformat); |
2376 | goto drop; |
2377 | } |
2378 | } |
2379 | } |
2380 | #endif /* NECP */ |
2381 | |
2382 | prev_t_state = tp->t_state; |
2383 | |
2384 | /* If none of the FIN|SYN|RST|ACK flag is set, drop */ |
2385 | if ((thflags & TH_ACCEPT) == 0) { |
2386 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 TH_ACCEPT == 0" ); |
2387 | goto drop; |
2388 | } |
2389 | |
2390 | /* Unscale the window into a 32-bit value. */ |
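/* Per RFC 7323, the window field carried in a SYN segment is never scaled. */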
2391 | if ((thflags & TH_SYN) == 0) { |
2392 | tiwin = th->th_win << tp->snd_scale; |
2393 | } else { |
2394 | tiwin = th->th_win; |
2395 | } |
2396 | |
2397 | /* Avoid processing packets while closing a listen socket */ |
2398 | if (tp->t_state == TCPS_LISTEN && |
2399 | (so->so_options & SO_ACCEPTCONN) == 0) { |
2400 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "closing a listening socket" ); |
2401 | goto drop; |
2402 | } |
2403 | |
2404 | if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) { |
2405 | soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_WAKE_PKT); |
2406 | } |
2407 | |
2408 | if (so->so_options & (SO_DEBUG | SO_ACCEPTCONN)) { |
2409 | #if TCPDEBUG |
2410 | if (so->so_options & SO_DEBUG) { |
2411 | ostate = tp->t_state; |
2412 | if (isipv6) { |
2413 | bcopy((char *)ip6, (char *)tcp_saveipgen, |
2414 | sizeof(*ip6)); |
2415 | } else { |
2416 | bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip)); |
2417 | } |
2418 | tcp_savetcp = *th; |
2419 | } |
2420 | #endif |
2421 | if (so->so_options & SO_ACCEPTCONN) { |
2422 | struct tcpcb *tp0 = tp; |
2423 | struct socket *so2; |
2424 | struct socket *oso; |
2425 | struct sockaddr_storage from; |
2426 | struct sockaddr_storage to2; |
2427 | struct inpcb *oinp = sotoinpcb(so); |
2428 | struct ifnet *head_ifscope; |
2429 | bool head_nocell, head_recvanyif, |
2430 | head_noexpensive, head_awdl_unrestricted, |
2431 | head_intcoproc_allowed, head_external_port, |
2432 | head_noconstrained, head_management_allowed; |
2433 | |
2434 | /* Get listener's bound-to-interface, if any */ |
2435 | head_ifscope = (inp->inp_flags & INP_BOUND_IF) ? |
2436 | inp->inp_boundifp : NULL; |
2437 | /* Get listener's no-cellular information, if any */ |
2438 | head_nocell = INP_NO_CELLULAR(inp); |
2439 | /* Get listener's recv-any-interface, if any */ |
2440 | head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF); |
2441 | /* Get listener's no-expensive information, if any */ |
2442 | head_noexpensive = INP_NO_EXPENSIVE(inp); |
2443 | head_noconstrained = INP_NO_CONSTRAINED(inp); |
2444 | head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp); |
2445 | head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp); |
2446 | head_external_port = (inp->inp_flags2 & INP2_EXTERNAL_PORT); |
2447 | head_management_allowed = INP_MANAGEMENT_ALLOWED(inp); |
2448 | |
2449 | /* |
2450 | * If the state is LISTEN then ignore segment if it contains an RST. |
2451 | * If the segment contains an ACK then it is bad and send a RST. |
2452 | * If it does not contain a SYN then it is not interesting; drop it. |
2453 | * If it is from this socket, drop it, it must be forged. |
2454 | */ |
2455 | if ((thflags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) { |
2456 | IF_TCP_STATINC(ifp, listbadsyn); |
2457 | |
2458 | if (thflags & TH_RST) { |
2459 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, |
2460 | thflags & TH_SYN ? "ignore SYN with RST" : "ignore RST" ); |
2461 | goto drop; |
2462 | } |
2463 | if (thflags & TH_ACK) { |
2464 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, |
2465 | thflags & TH_SYN ? "bad SYN with ACK" : "bad ACK" ); |
2466 | tp = NULL; |
2467 | tcpstat.tcps_badsyn++; |
2468 | goto dropwithreset; |
2469 | } |
2470 | |
2471 | /* We come here if there is no SYN set */ |
2472 | tcpstat.tcps_badsyn++; |
2473 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN" ); |
2474 | goto drop; |
2475 | } |
2476 | KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START, 0, 0, 0, 0, 0); |
2477 | if (th->th_dport == th->th_sport) { |
2478 | if (isipv6) { |
2479 | if (in6_are_addr_equal_scoped(&ip6->ip6_dst, &ip6->ip6_src, ip6_input_getdstifscope(m), ip6_input_getsrcifscope(m))) { |
2480 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same port" ); |
2481 | goto drop; |
2482 | } |
2483 | } else if (ip->ip_dst.s_addr == ip->ip_src.s_addr) { |
2484 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same IPv4 address" ); |
2485 | goto drop; |
2486 | } |
2487 | } |
2488 | /* |
2489 | * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN |
2490 | * in_broadcast() should never return true on a received |
2491 | * packet with M_BCAST not set. |
2492 | * |
2493 | * Packets with a multicast source address should also |
2494 | * be discarded. |
2495 | */ |
2496 | if (m->m_flags & (M_BCAST | M_MCAST)) { |
2497 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mbuf M_BCAST | M_MCAST" ); |
2498 | goto drop; |
2499 | } |
2500 | if (isipv6) { |
2501 | if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || |
2502 | IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { |
2503 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "IN6_IS_ADDR_MULTICAST" ); |
2504 | goto drop; |
2505 | } |
2506 | } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || |
2507 | IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || |
2508 | ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || |
2509 | in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { |
2510 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "multicast or broadcast address" ); |
2511 | goto drop; |
2512 | } |
2513 | |
2514 | |
2515 | /* |
2516 | * If deprecated address is forbidden, |
2517 | * we do not accept SYN to deprecated interface |
2518 | * address to prevent any new inbound connection from |
2519 | * getting established. |
2520 | * When we do not accept SYN, we send a TCP RST, |
2521 | * with deprecated source address (instead of dropping |
2522 | * it). We compromise it as it is much better for peer |
2523 | * to send a RST, and RST will be the final packet |
2524 | * for the exchange. |
2525 | * |
2526 | * If we do not forbid deprecated addresses, we accept |
2527 | * the SYN packet. RFC 4862 forbids dropping SYN in |
2528 | * this case. |
2529 | */ |
2530 | if (isipv6 && !ip6_use_deprecated) { |
2531 | uint32_t ia6_flags; |
2532 | |
2533 | if (ip6_getdstifaddr_info(m, NULL, |
2534 | &ia6_flags) == 0) { |
2535 | if (ia6_flags & IN6_IFF_DEPRECATED) { |
2536 | tp = NULL; |
2537 | IF_TCP_STATINC(ifp, deprecate6); |
2538 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "deprecated IPv6 address" ); |
2539 | goto dropwithreset; |
2540 | } |
2541 | } |
2542 | } |
2543 | if (so->so_filt || check_cfil) { |
2544 | if (isipv6) { |
2545 | struct sockaddr_in6 *sin6 = SIN6(&from); |
2546 | |
2547 | sin6->sin6_len = sizeof(*sin6); |
2548 | sin6->sin6_family = AF_INET6; |
2549 | sin6->sin6_port = th->th_sport; |
2550 | sin6->sin6_flowinfo = 0; |
2551 | sin6->sin6_addr = ip6->ip6_src; |
2552 | sin6->sin6_scope_id = 0; |
2553 | |
2554 | sin6 = SIN6(&to2); |
2555 | |
2556 | sin6->sin6_len = sizeof(struct sockaddr_in6); |
2557 | sin6->sin6_family = AF_INET6; |
2558 | sin6->sin6_port = th->th_dport; |
2559 | sin6->sin6_flowinfo = 0; |
2560 | sin6->sin6_addr = ip6->ip6_dst; |
2561 | sin6->sin6_scope_id = 0; |
2562 | } else { |
2563 | struct sockaddr_in *sin = SIN(&from); |
2564 | |
2565 | sin->sin_len = sizeof(*sin); |
2566 | sin->sin_family = AF_INET; |
2567 | sin->sin_port = th->th_sport; |
2568 | sin->sin_addr = ip->ip_src; |
2569 | |
2570 | sin = SIN(&to2); |
2571 | |
2572 | sin->sin_len = sizeof(struct sockaddr_in); |
2573 | sin->sin_family = AF_INET; |
2574 | sin->sin_port = th->th_dport; |
2575 | sin->sin_addr = ip->ip_dst; |
2576 | } |
2577 | } |
2578 | |
2579 | if (so->so_filt) { |
so2 = sonewconn(so, 0, SA(&from));
} else {
so2 = sonewconn(so, 0, NULL);
}
if (so2 == 0) {
tcpstat.tcps_listendrop++;
if (tcp_dropdropablreq(so)) {
if (so->so_filt) {
so2 = sonewconn(so, 0, SA(&from));
} else {
so2 = sonewconn(so, 0, NULL);
2591 | } |
2592 | } |
2593 | if (!so2) { |
2594 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " listen drop" ); |
2595 | goto drop; |
2596 | } |
2597 | } |
2598 | |
2599 | /* Point "inp" and "tp" in tandem to new socket */ |
2600 | inp = (struct inpcb *)so2->so_pcb; |
2601 | tp = intotcpcb(inp); |
2602 | |
2603 | oso = so; |
socket_unlock(so, 0); /* Unlock but keep a reference on listener for now */
2605 | |
2606 | so = so2; |
socket_lock(so, 1);
2608 | /* |
2609 | * Mark socket as temporary until we're |
2610 | * committed to keeping it. The code at |
2611 | * ``drop'' and ``dropwithreset'' check the |
2612 | * flag dropsocket to see if the temporary |
2613 | * socket created here should be discarded. |
2614 | * We mark the socket as discardable until |
2615 | * we're committed to it below in TCPS_LISTEN. |
2616 | * There are some error conditions in which we |
2617 | * have to drop the temporary socket. |
2618 | */ |
2619 | dropsocket++; |
2620 | /* |
2621 | * Inherit INP_BOUND_IF from listener; testing if |
2622 | * head_ifscope is non-NULL is sufficient, since it |
2623 | * can only be set to a non-zero value earlier if |
2624 | * the listener has such a flag set. |
2625 | */ |
2626 | if (head_ifscope != NULL) { |
2627 | inp->inp_flags |= INP_BOUND_IF; |
2628 | inp->inp_boundifp = head_ifscope; |
2629 | } else { |
2630 | inp->inp_flags &= ~INP_BOUND_IF; |
2631 | } |
2632 | /* |
2633 | * Inherit restrictions from listener. |
2634 | */ |
2635 | if (head_nocell) { |
2636 | inp_set_nocellular(inp); |
2637 | } |
2638 | if (head_noexpensive) { |
2639 | inp_set_noexpensive(inp); |
2640 | } |
2641 | if (head_noconstrained) { |
2642 | inp_set_noconstrained(inp); |
2643 | } |
2644 | if (head_awdl_unrestricted) { |
2645 | inp_set_awdl_unrestricted(inp); |
2646 | } |
2647 | if (head_intcoproc_allowed) { |
2648 | inp_set_intcoproc_allowed(inp); |
2649 | } |
2650 | if (head_management_allowed) { |
2651 | inp_set_management_allowed(inp); |
2652 | } |
2653 | /* |
2654 | * Inherit {IN,IN6}_RECV_ANYIF from listener. |
2655 | */ |
2656 | if (head_recvanyif) { |
2657 | inp->inp_flags |= INP_RECV_ANYIF; |
2658 | } else { |
2659 | inp->inp_flags &= ~INP_RECV_ANYIF; |
2660 | } |
2661 | |
2662 | if (head_external_port) { |
2663 | inp->inp_flags2 |= INP2_EXTERNAL_PORT; |
2664 | } |
2665 | if (isipv6) { |
2666 | inp->in6p_laddr = ip6->ip6_dst; |
2667 | inp->inp_lifscope = in6_addr2scopeid(ifp, &inp->in6p_laddr); |
2668 | in6_verify_ifscope(&ip6->ip6_dst, inp->inp_lifscope); |
2669 | } else { |
2670 | inp->inp_vflag &= ~INP_IPV6; |
2671 | inp->inp_vflag |= INP_IPV4; |
2672 | inp->inp_laddr = ip->ip_dst; |
2673 | } |
2674 | inp->inp_lport = th->th_dport; |
2675 | if (in_pcbinshash(inp, 0) != 0) { |
2676 | /* |
2677 | * Undo the assignments above if we failed to |
2678 | * put the PCB on the hash lists. |
2679 | */ |
2680 | if (isipv6) { |
2681 | inp->in6p_laddr = in6addr_any; |
2682 | inp->inp_lifscope = IFSCOPE_NONE; |
2683 | } else { |
2684 | inp->inp_laddr.s_addr = INADDR_ANY; |
2685 | } |
2686 | #if SKYWALK |
netns_release(&inp->inp_netns_token);
2688 | #endif /* SKYWALK */ |
2689 | inp->inp_lport = 0; |
socket_lock(oso, 0); /* release ref on parent */
socket_unlock(oso, 1);
2692 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " in_pcbinshash failed" ); |
2693 | goto drop; |
2694 | } |
socket_lock(oso, 0);
2696 | if (isipv6) { |
2697 | /* |
2698 | * Inherit socket options from the listening |
2699 | * socket. |
2700 | * Note that in6p_inputopts are not (even |
2701 | * should not be) copied, since it stores |
2702 | * previously received options and is used to |
2703 | * detect if each new option is different than |
2704 | * the previous one and hence should be passed |
2705 | * to a user. |
2706 | * If we copied in6p_inputopts, a user would |
2707 | * not be able to receive options just after |
2708 | * calling the accept system call. |
2709 | */ |
2710 | inp->inp_flags |= |
2711 | oinp->inp_flags & INP_CONTROLOPTS; |
2712 | if (oinp->in6p_outputopts) { |
2713 | inp->in6p_outputopts = |
2714 | ip6_copypktopts(oinp->in6p_outputopts, |
2715 | Z_NOWAIT); |
2716 | } |
2717 | } else { |
2718 | inp->inp_options = ip_srcroute(); |
2719 | inp->inp_ip_tos = oinp->inp_ip_tos; |
2720 | } |
2721 | #if IPSEC |
2722 | /* copy old policy into new socket's */ |
2723 | if (sotoinpcb(oso)->inp_sp) { |
2724 | int error = 0; |
2725 | /* Is it a security hole here to silently fail to copy the policy? */ |
2726 | if (inp->inp_sp == NULL) { |
2727 | error = ipsec_init_policy(so, &inp->inp_sp); |
2728 | } |
2729 | if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) { |
2730 | printf("tcp_input: could not copy policy\n" ); |
2731 | } |
2732 | } |
2733 | #endif |
2734 | /* inherit states from the listener */ |
2735 | DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, |
2736 | struct tcpcb *, tp, int32_t, TCPS_LISTEN); |
2737 | TCP_LOG_STATE(tp, TCPS_LISTEN); |
2738 | tp->t_state = TCPS_LISTEN; |
2739 | tp->t_flags |= tp0->t_flags & (TF_NOPUSH | TF_NOOPT | TF_NODELAY); |
2740 | tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP | TF_NOTIMEWAIT | TF_FASTOPEN)); |
2741 | tp->t_keepinit = tp0->t_keepinit; |
2742 | tp->t_keepcnt = tp0->t_keepcnt; |
2743 | tp->t_keepintvl = tp0->t_keepintvl; |
2744 | tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo; |
2745 | tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo; |
2746 | tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl; |
2747 | if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) { |
2748 | tp->t_notsent_lowat = tp0->t_notsent_lowat; |
2749 | } |
2750 | tp->t_inpcb->inp_flags2 |= |
2751 | tp0->t_inpcb->inp_flags2 & INP2_KEEPALIVE_OFFLOAD; |
2752 | |
2753 | /* now drop the reference on the listener */ |
socket_unlock(oso, 1);
2755 | |
2756 | tcp_set_max_rwinscale(tp, so); |
2757 | |
2758 | #if CONTENT_FILTER |
2759 | if (check_cfil) { |
int error = cfil_sock_attach(so2, SA(&to2), SA(&from), CFS_CONNECTION_DIR_IN);
2761 | if (error != 0) { |
2762 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " cfil_sock_attach failed" ); |
2763 | goto drop; |
2764 | } |
2765 | } |
2766 | #endif /* CONTENT_FILTER */ |
2767 | |
2768 | KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END, 0, 0, 0, 0, 0); |
2769 | } |
2770 | } |
2771 | socket_lock_assert_owned(so); |
2772 | |
2773 | /* |
2774 | * Packet accounting should not be done on listening socket |
2775 | */ |
2776 | if (th->th_flags & TH_SYN) { |
2777 | (void) os_add_overflow(1, tp->t_syn_rcvd, &tp->t_syn_rcvd); |
2778 | } |
2779 | if (th->th_flags & TH_FIN) { |
2780 | (void) os_add_overflow(1, tp->t_fin_rcvd, &tp->t_fin_rcvd); |
2781 | } |
2782 | if (th->th_flags & TH_RST) { |
2783 | (void) os_add_overflow(1, tp->t_rst_rcvd, &tp->t_rst_rcvd); |
2784 | } |
2785 | TCP_LOG_TH_FLAGS(TCP_LOG_HDR, th, tp, false, ifp); |
2786 | |
2787 | if (net_mpklog_enabled && (m->m_pkthdr.rcvif->if_xflags & IFXF_MPK_LOG)) { |
2788 | MPKL_TCP_INPUT(tcp_mpkl_log_object, |
2789 | ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport), |
2790 | th->th_seq, th->th_ack, tlen, thflags, |
2791 | so->last_pid, so->so_log_seqn++); |
2792 | } |
2793 | |
2794 | if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) { |
2795 | /* |
2796 | * Evaluate the rate of arrival of packets to see if the |
2797 | * receiver can reduce the ack traffic. The algorithm to |
2798 | * stretch acks will be enabled if the connection meets |
2799 | * certain criteria defined in tcp_stretch_ack_enable function. |
2800 | */ |
2801 | if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) { |
2802 | TCP_INC_VAR(tp->rcv_waitforss, segment_count); |
2803 | } |
2804 | if (tcp_stretch_ack_enable(tp, thflags)) { |
2805 | tp->t_flags |= TF_STRETCHACK; |
2806 | tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS); |
2807 | tp->rcv_waitforss = 0; |
2808 | } else { |
2809 | tp->t_flags &= ~(TF_STRETCHACK); |
2810 | } |
2811 | if (TSTMP_GT(tp->rcv_unackwin - (tcp_rcvunackwin >> 1), tcp_now)) { |
2812 | tp->rcv_by_unackhalfwin += (tlen + off); |
2813 | tp->rcv_by_unackwin += (tlen + off); |
2814 | } else { |
2815 | tp->rcv_unackwin = tcp_now + tcp_rcvunackwin; |
2816 | tp->rcv_by_unackwin = tp->rcv_by_unackhalfwin + tlen + off; |
2817 | tp->rcv_by_unackhalfwin = tlen + off; |
2818 | } |
2819 | } |
2820 | |
2821 | if (tp->t_state == TCPS_ESTABLISHED && BYTES_ACKED(th, tp) > 0) { |
2822 | if (tp->ecn_flags & TE_SENDIPECT) { |
2823 | /* |
2824 | * Data sent with ECT has been acknowledged, calculate |
2825 | * packets approx. by dividing by MSS. This is done to |
2826 | * count MSS sized packets in case packets are aggregated |
2827 | * by GRO/LRO. |
2828 | */ |
uint32_t bytes_acked = tcp_round_to(BYTES_ACKED(th, tp), tp->t_maxseg);
tp->t_ecn_capable_packets_acked += max(1, (bytes_acked / tp->t_maxseg));
2831 | } |
2832 | } |
2833 | |
2834 | /* Accurate ECN has different semantics for TH_CWR. */ |
2835 | if (!TCP_ACC_ECN_ENABLED(tp)) { |
2836 | /* |
2837 | * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't |
2838 | * bother doing extensive checks for state and whatnot. |
2839 | */ |
2840 | if (thflags & TH_CWR) { |
2841 | tp->ecn_flags &= ~TE_SENDECE; |
2842 | tp->t_ecn_recv_cwr++; |
2843 | } |
2844 | } |
2845 | |
2846 | /* |
2847 | * Accurate ECN feedback |
2848 | * 1. Process peer's feedback in received TCP thflags and update s.cep |
2849 | * 2. Process IP ECN bits and update r.cep for CE marked pure ACKs |
2850 | * or valid data packets |
2851 | * |
2852 | */ |
2853 | if (TCP_ACC_ECN_ON(tp) && tp->t_state == TCPS_ESTABLISHED) { |
2854 | /* |
2855 | * Update s.cep if bytes have been acknowledged |
2856 | * otherwise, this ACK has already been superseded. |
2857 | */ |
2858 | uint8_t ace = tcp_get_ace(th); |
2859 | if (BYTES_ACKED(th, tp) > 0) { |
2860 | /* Congestion was experienced if delta_cep > 0 */ |
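/*
 * The ACE field is a 3-bit count of CE-marked packets and wraps
 * modulo TCP_ACE_DIV; the arithmetic below recovers the
 * non-negative increment relative to the CE count we last
 * accounted for.
 */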
2861 | tp->t_delta_ce_packets = (ace + TCP_ACE_DIV - (tp->t_snd_ce_packets % TCP_ACE_DIV)) % TCP_ACE_DIV; |
2862 | tp->t_snd_ce_packets += tp->t_delta_ce_packets; |
2863 | } |
2864 | /* Update receive side counters */ |
2865 | if (tlen == 0 || (tlen > 0 && |
2866 | SEQ_GEQ(th->th_seq, tp->last_ack_sent) && |
2867 | SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd))) { |
tcp_input_ip_ecn(tp, inp, (uint32_t)tlen, (uint32_t)segment_count, ip_ecn);
2869 | } |
2870 | |
2871 | /* Test for ACE bleaching, initial value of ace should be non-zero */ |
2872 | if (th->th_seq == tp->iss + 1 && ace == 0) { |
2873 | tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_ace_bleaching_detected; |
2874 | } |
2875 | } else { |
2876 | /* |
2877 | * Explicit Congestion Notification - Flag that we need to send ECE if |
2878 | * + The IP Congestion experienced flag was set. |
2879 | * + Socket is in established state |
2880 | * + We negotiated ECN in the TCP setup |
2881 | * + This isn't a pure ack (tlen > 0) |
2882 | * + The data is in the valid window |
2883 | * |
2884 | * TE_SENDECE will be cleared when we receive a packet with TH_CWR set. |
2885 | */ |
2886 | if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED && |
2887 | TCP_ECN_ENABLED(tp) && tlen > 0 && |
2888 | SEQ_GEQ(th->th_seq, tp->last_ack_sent) && |
2889 | SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { |
2890 | tp->t_ecn_recv_ce++; |
2891 | tcpstat.tcps_ecn_recv_ce++; |
2892 | INP_INC_IFNET_STAT(inp, ecn_recv_ce); |
2893 | /* Mark this connection as it received CE from network */ |
2894 | tp->ecn_flags |= TE_RECV_ECN_CE; |
2895 | tp->ecn_flags |= TE_SENDECE; |
2896 | } |
2897 | } |
2898 | |
2899 | /* |
2900 | * If we received an explicit notification of congestion in |
2901 | * ip tos ecn bits or by the CWR bit in TCP header flags, reset |
2902 | * the ack-stretching state. We need to handle ECN notification if |
2903 | * an ECN setup SYN was sent even once. |
2904 | */ |
2905 | if (tp->t_state == TCPS_ESTABLISHED && |
2906 | (tp->ecn_flags & TE_SETUPSENT) && |
2907 | (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR))) { |
2908 | tcp_reset_stretch_ack(tp); |
2909 | tp->t_forced_acks = TCP_FORCED_ACKS_COUNT; |
2910 | CLEAR_IAJ_STATE(tp); |
2911 | } |
2912 | |
2913 | if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED && |
2914 | !TCP_ECN_ENABLED(tp) && !(tp->ecn_flags & TE_CEHEURI_SET)) { |
2915 | tcpstat.tcps_ecn_fallback_ce++; |
2916 | tcp_heuristic_ecn_aggressive(tp); |
2917 | tp->ecn_flags |= TE_CEHEURI_SET; |
2918 | } |
2919 | |
2920 | if (tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) && |
2921 | ip_ecn == IPTOS_ECN_CE && !(tp->ecn_flags & TE_CEHEURI_SET)) { |
2922 | if (inp->inp_stat->rxpackets < ECN_MIN_CE_PROBES) { |
2923 | tp->t_ecn_recv_ce_pkt++; |
2924 | } else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) { |
2925 | tcpstat.tcps_ecn_fallback_ce++; |
2926 | tcp_heuristic_ecn_aggressive(tp); |
2927 | tp->ecn_flags |= TE_CEHEURI_SET; |
2928 | INP_INC_IFNET_STAT(inp, ecn_fallback_ce); |
2929 | } else { |
2930 | /* We tracked the first ECN_MIN_CE_PROBES segments, we |
2931 | * now know that the path is good. |
2932 | */ |
2933 | tp->ecn_flags |= TE_CEHEURI_SET; |
2934 | } |
2935 | } |
2936 | |
2937 | /* Update rcvtime as a new segment was received on the connection */ |
2938 | tp->t_rcvtime = tcp_now; |
2939 | |
2940 | /* |
2941 | * Segment received on connection. |
2942 | * Reset idle time and keep-alive timer. |
2943 | */ |
2944 | if (TCPS_HAVEESTABLISHED(tp->t_state)) { |
2945 | tcp_keepalive_reset(tp); |
2946 | |
2947 | if (tp->t_mpsub) { |
2948 | mptcp_reset_keepalive(tp); |
2949 | } |
2950 | } |
2951 | |
2952 | /* |
2953 | * Process options if not in LISTEN state, |
2954 | * else do it below (after getting remote address). |
2955 | */ |
2956 | if (tp->t_state != TCPS_LISTEN && optp) { |
2957 | tcp_dooptions(tp, optp, optlen, th, &to); |
2958 | } |
2959 | #if MPTCP |
2960 | if (tp->t_state != TCPS_LISTEN && (so->so_flags & SOF_MP_SUBFLOW)) { |
2961 | mptcp_insert_rmap(tp, m, th); |
2962 | } |
2963 | #endif /* MPTCP */ |
2964 | if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { |
2965 | if (!(thflags & TH_ACK) || |
2966 | (SEQ_GT(th->th_ack, tp->iss) && |
2967 | SEQ_LEQ(th->th_ack, tp->snd_max))) { |
2968 | tcp_finalize_options(tp, &to, ifscope); |
2969 | } |
2970 | } |
2971 | |
2972 | #if TRAFFIC_MGT |
2973 | /* |
2974 | * Compute inter-packet arrival jitter. According to RFC 3550, |
2975 | * inter-packet arrival jitter is defined as the difference in |
2976 | * packet spacing at the receiver compared to the sender for a |
2977 | * pair of packets. When two packets of maximum segment size come |
2978 | * one after the other with consecutive sequence numbers, we |
2979 | * consider them as packets sent together at the sender and use |
2980 | * them as a pair to compute inter-packet arrival jitter. This |
2981 | * metric indicates the delay induced by the network components due |
2982 | * to queuing in edge/access routers. |
2983 | */ |
2984 | if (tp->t_state == TCPS_ESTABLISHED && |
2985 | (thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK | TH_ECE | TH_PUSH)) == TH_ACK && |
2986 | ((tp->t_flags & TF_NEEDFIN) == 0) && |
2987 | ((to.to_flags & TOF_TS) == 0 || |
2988 | TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && |
2989 | th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) { |
2990 | int seg_size = tlen; |
2991 | if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) { |
2992 | TCP_INC_VAR(tp->iaj_pktcnt, segment_count); |
2993 | } |
2994 | |
2995 | if (tp->iaj_size == 0 || seg_size > tp->iaj_size || |
2996 | (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) { |
2997 | /* |
2998 | * State related to inter-arrival jitter is |
2999 | * uninitialized or we are trying to find a good |
3000 | * first packet to start computing the metric |
3001 | */ |
update_iaj_state(tp, seg_size, 0);
3003 | } else { |
3004 | if (seg_size == tp->iaj_size) { |
3005 | /* |
3006 | * Compute inter-arrival jitter taking |
3007 | * this packet as the second packet |
3008 | */ |
3009 | compute_iaj(tp); |
3010 | } |
3011 | if (seg_size < tp->iaj_size) { |
3012 | /* |
3013 | * There is a smaller packet in the stream. |
3014 | * Some times the maximum size supported |
3015 | * on a path can change if there is a new |
3016 | * link with smaller MTU. The receiver will |
3017 | * not know about this change. If there |
3018 | * are too many packets smaller than |
3019 | * iaj_size, we try to learn the iaj_size |
3020 | * again. |
3021 | */ |
3022 | TCP_INC_VAR(tp->iaj_small_pkt, segment_count); |
3023 | if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) { |
update_iaj_state(tp, seg_size, 1);
3025 | } else { |
3026 | CLEAR_IAJ_STATE(tp); |
3027 | } |
3028 | } else { |
update_iaj_state(tp, seg_size, 0);
3030 | } |
3031 | } |
3032 | } else { |
3033 | CLEAR_IAJ_STATE(tp); |
3034 | } |
3035 | #endif /* TRAFFIC_MGT */ |
3036 | |
3037 | /* |
3038 | * Header prediction: check for the two common cases |
3039 | * of a uni-directional data xfer. If the packet has |
3040 | * no control flags, is in-sequence, the window didn't |
3041 | * change and we're not retransmitting, it's a |
3042 | * candidate. If the length is zero and the ack moved |
3043 | * forward, we're the sender side of the xfer. Just |
3044 | * free the data acked & wake any higher level process |
3045 | * that was blocked waiting for space. If the length |
3046 | * is non-zero and the ack didn't move, we're the |
3047 | * receiver side. If we're getting packets in-order |
3048 | * (the reassembly queue is empty), add the data to |
3049 | * the socket buffer and note that we need a delayed ack. |
3050 | * Make sure that the hidden state-flags are also off. |
3051 | * Since we check for TCPS_ESTABLISHED above, it can only |
3052 | * be TH_NEEDSYN. |
3053 | */ |
3054 | if (tp->t_state == TCPS_ESTABLISHED && |
3055 | !(so->so_state & SS_CANTRCVMORE) && |
3056 | (thflags & TH_FLAGS) == TH_ACK && |
3057 | ((tp->t_flags & TF_NEEDFIN) == 0) && |
3058 | ((to.to_flags & TOF_TS) == 0 || |
3059 | TSTMP_GEQ(to.to_tsval, tp->ts_recent)) && |
3060 | th->th_seq == tp->rcv_nxt && |
3061 | tiwin && tiwin == tp->snd_wnd && |
3062 | tp->snd_nxt == tp->snd_max) { |
3063 | /* |
3064 | * If last ACK falls within this segment's sequence numbers, |
3065 | * record the timestamp. |
3066 | * NOTE that the test is modified according to the latest |
3067 | * proposal of the tcplw@cray.com list (Braden 1993/04/26). |
3068 | */ |
3069 | if ((to.to_flags & TOF_TS) != 0 && |
3070 | SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { |
3071 | tp->ts_recent_age = tcp_now; |
3072 | tp->ts_recent = to.to_tsval; |
3073 | } |
3074 | |
3075 | /* |
3076 | * We increment t_unacksegs_ce for both data segments |
3077 | * and pure ACKs for Accurate ECN |
3078 | */ |
3079 | if (TCP_ACC_ECN_ON(tp) && ip_ecn == IPTOS_ECN_CE) { |
3080 | TCP_INC_VAR(tp->t_unacksegs_ce, segment_count); |
3081 | } |
3082 | |
3083 | if (tlen == 0) { |
3084 | if (SEQ_GT(th->th_ack, tp->snd_una) && |
3085 | SEQ_LEQ(th->th_ack, tp->snd_max) && |
3086 | tp->snd_cwnd >= tp->snd_ssthresh && |
3087 | (!IN_FASTRECOVERY(tp) && |
3088 | ((!(SACK_ENABLED(tp)) && |
3089 | tp->t_dupacks < tp->t_rexmtthresh) || |
3090 | (SACK_ENABLED(tp) && to.to_nsacks == 0 && |
3091 | TAILQ_EMPTY(&tp->snd_holes))))) { |
3092 | /* |
3093 | * this is a pure ack for outstanding data. |
3094 | */ |
3095 | ++tcpstat.tcps_predack; |
3096 | |
tcp_bad_rexmt_check(tp, th, &to);
3098 | |
3099 | /* Recalculate the RTT */ |
tcp_compute_rtt(tp, &to, th);
3101 | |
3102 | VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una)); |
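/*
 * BYTES_ACKED() boils down to th_ack - snd_una: the amount of
 * previously unacknowledged data newly covered by this ACK.
 */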
3103 | acked = BYTES_ACKED(th, tp); |
3104 | tcpstat.tcps_rcvackpack++; |
3105 | tcpstat.tcps_rcvackbyte += acked; |
3106 | |
3107 | /* |
3108 | * Handle an ack that is in sequence during |
3109 | * congestion avoidance phase. The |
3110 | * calculations in this function |
3111 | * assume that snd_una is not updated yet. |
3112 | */ |
3113 | if (CC_ALGO(tp)->congestion_avd != NULL) { |
3114 | CC_ALGO(tp)->congestion_avd(tp, th); |
3115 | } |
tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
sbdrop(&so->so_snd, acked);
tcp_sbsnd_trim(&so->so_snd);
3119 | |
3120 | if (SEQ_GT(tp->snd_una, tp->snd_recover) && |
3121 | SEQ_LEQ(th->th_ack, tp->snd_recover)) { |
3122 | tp->snd_recover = th->th_ack - 1; |
3123 | } |
3124 | |
tcp_update_snd_una(tp, th->th_ack);
3126 | |
3127 | TCP_RESET_REXMT_STATE(tp); |
3128 | |
3129 | /* |
3130 | * pull snd_wl2 up to prevent seq wrap relative |
3131 | * to th_ack. |
3132 | */ |
3133 | tp->snd_wl2 = th->th_ack; |
3134 | |
3135 | if (tp->t_dupacks > 0) { |
3136 | tp->t_dupacks = 0; |
3137 | tp->t_rexmtthresh = tcprexmtthresh; |
3138 | tp->t_new_dupacks = 0; |
3139 | } |
3140 | |
3141 | tp->sackhint.sack_bytes_acked = 0; |
3142 | |
3143 | /* |
3144 | * If all outstanding data are acked, stop |
3145 | * retransmit timer, otherwise restart timer |
3146 | * using current (possibly backed-off) value. |
3147 | * If process is waiting for space, |
3148 | * wakeup/selwakeup/signal. If data |
3149 | * are ready to send, let tcp_output |
3150 | * decide between more output or persist. |
3151 | */ |
3152 | if (tp->snd_una == tp->snd_max) { |
3153 | tp->t_timer[TCPT_REXMT] = 0; |
3154 | tp->t_timer[TCPT_PTO] = 0; |
3155 | } else if (tp->t_timer[TCPT_PERSIST] == 0) { |
3156 | tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur); |
3157 | } |
3158 | if (!SLIST_EMPTY(&tp->t_rxt_segments) && |
3159 | !TCP_DSACK_SEQ_IN_WINDOW(tp, |
3160 | tp->t_dsack_lastuna, tp->snd_una)) { |
3161 | tcp_rxtseg_clean(tp); |
3162 | } |
3163 | |
3164 | if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && |
3165 | tp->t_bwmeas != NULL) { |
3166 | tcp_bwmeas_check(tp); |
3167 | } |
3168 | |
3169 | write_wakeup = 1; |
3170 | if (!SLIST_EMPTY(&tp->t_notify_ack)) { |
3171 | tcp_notify_acknowledgement(tp, so); |
3172 | } |
3173 | |
3174 | if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) { |
3175 | (void) tcp_output(tp); |
3176 | } |
3177 | |
3178 | tcp_tfo_rcv_ack(tp, th); |
3179 | |
3180 | m_freem(m); |
3181 | |
3182 | tcp_check_timer_state(tp); |
3183 | |
3184 | tcp_handle_wakeup(so, read_wakeup, write_wakeup); |
3185 | |
socket_unlock(so, 1);
3187 | KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); |
3188 | return; |
3189 | } |
3190 | } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) && |
3191 | tlen <= tcp_sbspace(tp)) { |
3192 | /* |
3193 | * this is a pure, in-sequence data packet |
3194 | * with nothing on the reassembly queue and |
3195 | * we have enough buffer space to take it. |
3196 | */ |
3197 | |
3198 | /* Clean receiver SACK report if present */ |
3199 | if (SACK_ENABLED(tp) && tp->rcv_numsacks) { |
3200 | tcp_clean_sackreport(tp); |
3201 | } |
3202 | ++tcpstat.tcps_preddat; |
3203 | tp->rcv_nxt += tlen; |
3204 | /* Update highest received sequence and its timestamp */ |
3205 | if (SEQ_LT(tp->rcv_high, tp->rcv_nxt)) { |
3206 | tp->rcv_high = tp->rcv_nxt; |
3207 | if (to.to_flags & TOF_TS) { |
3208 | tp->tsv_high = to.to_tsval; |
3209 | } |
3210 | } |
3211 | |
3212 | /* |
3213 | * Pull snd_wl1 up to prevent seq wrap relative to |
3214 | * th_seq. |
3215 | */ |
3216 | tp->snd_wl1 = th->th_seq; |
3217 | /* |
3218 | * Pull rcv_up up to prevent seq wrap relative to |
3219 | * rcv_nxt. |
3220 | */ |
3221 | tp->rcv_up = tp->rcv_nxt; |
3222 | TCP_INC_VAR(tcpstat.tcps_rcvpack, segment_count); |
3223 | tcpstat.tcps_rcvbyte += tlen; |
3224 | if (nstat_collect) { |
3225 | INP_ADD_STAT(inp, cell, wifi, wired, |
3226 | rxpackets, 1); |
3227 | INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, |
3228 | tlen); |
3229 | inp_set_activity_bitmap(inp); |
3230 | } |
3231 | |
3232 | /* Calculate the RTT on the receiver */ |
tcp_compute_rcv_rtt(tp, &to, th);
3234 | |
tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
3236 | if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.data_rcvd != NULL) { |
3237 | tcp_cc_rledbat.data_rcvd(tp, th, &to, tlen); |
3238 | } |
3239 | |
3240 | /* |
3241 | * Add data to socket buffer. |
3242 | */ |
3243 | so_recv_data_stat(so, m, 0); |
3244 | m_adj(m, drop_hdrlen); /* delayed header drop */ |
3245 | |
3246 | if (isipv6) { |
memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
3248 | ip6 = (struct ip6_hdr *)&saved_hdr[0]; |
3249 | } else { |
memcpy(&saved_hdr, ip, ip->ip_hl << 2);
3251 | ip = (struct ip *)&saved_hdr[0]; |
3252 | } |
memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
3254 | |
3255 | if (th->th_flags & TH_PUSH) { |
3256 | tp->t_flagsext |= TF_LAST_IS_PSH; |
3257 | } else { |
3258 | tp->t_flagsext &= ~TF_LAST_IS_PSH; |
3259 | } |
3260 | |
3261 | if (sbappendstream_rcvdemux(so, m)) { |
3262 | mptcp_handle_input(so); |
3263 | read_wakeup = 1; |
3264 | } |
3265 | th = &saved_tcphdr; |
3266 | |
3267 | if (isipv6) { |
3268 | KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), |
3269 | (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])), |
3270 | th->th_seq, th->th_ack, th->th_win); |
3271 | } else { |
3272 | KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), |
3273 | (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), |
3274 | th->th_seq, th->th_ack, th->th_win); |
3275 | } |
3276 | TCP_INC_VAR(tp->t_unacksegs, segment_count); |
3277 | if (DELAY_ACK(tp, th)) { |
3278 | if ((tp->t_flags & TF_DELACK) == 0) { |
3279 | tp->t_flags |= TF_DELACK; |
3280 | tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); |
3281 | } |
3282 | } else { |
3283 | tp->t_flags |= TF_ACKNOW; |
3284 | tcp_output(tp); |
3285 | } |
3286 | |
3287 | tcp_adaptive_rwtimo_check(tp, tlen); |
3288 | |
3289 | if (tlen > 0) { |
3290 | tcp_tfo_rcv_data(tp); |
3291 | } |
3292 | |
3293 | tcp_check_timer_state(tp); |
3294 | |
3295 | tcp_handle_wakeup(so, read_wakeup, write_wakeup); |
3296 | |
socket_unlock(so, 1);
3298 | KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); |
3299 | return; |
3300 | } |
3301 | } |
3302 | |
3303 | /* |
3304 | * Calculate amount of space in receive window, |
3305 | * and then do TCP input processing. |
3306 | * Receive window is amount of space in rcv queue, |
3307 | * but not less than advertised window. |
3308 | */ |
3309 | socket_lock_assert_owned(so); |
3310 | win = tcp_sbspace(tp); |
3311 | if (win < 0) { |
3312 | win = 0; |
3313 | } else { /* clip rcv window to 4K for modems */ |
3314 | if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) { |
win = min(win, slowlink_wsize);
3316 | } |
3317 | } |
tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
3319 | #if MPTCP |
3320 | /* |
3321 | * Ensure that the subflow receive window isn't greater |
3322 | * than the connection level receive window. |
3323 | */ |
3324 | if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && (mp_tp = tptomptp(tp))) { |
socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
3326 | int64_t recwin_conn = (int64_t)(mp_tp->mpt_rcvadv - mp_tp->mpt_rcvnxt); |
3327 | |
3328 | VERIFY(recwin_conn < INT32_MAX && recwin_conn > INT32_MIN); |
3329 | if (recwin_conn > 0 && tp->rcv_wnd > (uint32_t)recwin_conn) { |
3330 | tp->rcv_wnd = (uint32_t)recwin_conn; |
3331 | tcpstat.tcps_mp_reducedwin++; |
3332 | } |
3333 | } |
3334 | #endif /* MPTCP */ |
3335 | |
3336 | switch (tp->t_state) { |
3337 | /* |
3338 | * Initialize tp->rcv_nxt, and tp->irs, select an initial |
3339 | * tp->iss, and send a segment: |
3340 | * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK> |
3341 | * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss. |
3342 | * Fill in remote peer address fields if not previously specified. |
3343 | * Enter SYN_RECEIVED state, and process any other fields of this |
3344 | * segment in this state. |
3345 | */ |
3346 | case TCPS_LISTEN: { |
3347 | struct sockaddr_in *sin; |
3348 | struct sockaddr_in6 *sin6; |
3349 | |
3350 | socket_lock_assert_owned(so); |
3351 | |
3352 | /* Clear the logging flags inherited from the listening socket */ |
3353 | inp->inp_log_flags = 0; |
3354 | inp->inp_flags2 |= INP2_LOGGED_SUMMARY; |
3355 | |
3356 | if (isipv6) { |
3357 | sin6 = kalloc_type(struct sockaddr_in6, Z_NOWAIT | Z_ZERO); |
3358 | if (sin6 == NULL) { |
3359 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN kalloc_type failed" ); |
3360 | goto drop; |
3361 | } |
3362 | sin6->sin6_family = AF_INET6; |
3363 | sin6->sin6_len = sizeof(*sin6); |
3364 | sin6->sin6_addr = ip6->ip6_src; |
3365 | sin6->sin6_port = th->th_sport; |
3366 | if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) { |
3367 | sin6->sin6_scope_id = ip6_input_getsrcifscope(m); |
3368 | } |
3369 | laddr6 = inp->in6p_laddr; |
3370 | uint32_t lifscope = inp->inp_lifscope; |
3371 | if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { |
3372 | inp->in6p_laddr = ip6->ip6_dst; |
3373 | inp->inp_lifscope = in6_addr2scopeid(ifp, &inp->in6p_laddr); |
3374 | in6_verify_ifscope(&inp->in6p_laddr, inp->inp_lifscope); |
3375 | } |
3376 | if (in6_pcbconnect(inp, SA(sin6), kernel_proc)) { |
3377 | inp->in6p_laddr = laddr6; |
3378 | kfree_type(struct sockaddr_in6, sin6); |
3379 | inp->inp_lifscope = lifscope; |
3380 | in6_verify_ifscope(&inp->in6p_laddr, inp->inp_lifscope); |
3381 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in6_pcbconnect failed" ); |
3382 | goto drop; |
3383 | } |
3384 | kfree_type(struct sockaddr_in6, sin6); |
3385 | } else { |
3386 | socket_lock_assert_owned(so); |
3387 | sin = kalloc_type(struct sockaddr_in, Z_NOWAIT); |
3388 | if (sin == NULL) { |
3389 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN kalloc_type failed" ); |
3390 | goto drop; |
3391 | } |
3392 | sin->sin_family = AF_INET; |
3393 | sin->sin_len = sizeof(*sin); |
3394 | sin->sin_addr = ip->ip_src; |
3395 | sin->sin_port = th->th_sport; |
bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
3397 | laddr = inp->inp_laddr; |
3398 | if (inp->inp_laddr.s_addr == INADDR_ANY) { |
3399 | inp->inp_laddr = ip->ip_dst; |
3400 | } |
3401 | if (in_pcbconnect(inp, SA(sin), kernel_proc, IFSCOPE_NONE, NULL)) { |
3402 | inp->inp_laddr = laddr; |
3403 | kfree_type(struct sockaddr_in, sin); |
3404 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in_pcbconnect failed" ); |
3405 | goto drop; |
3406 | } |
3407 | kfree_type(struct sockaddr_in, sin); |
3408 | } |
3409 | |
3410 | tcp_dooptions(tp, optp, optlen, th, &to); |
3411 | tcp_finalize_options(tp, &to, ifscope); |
3412 | |
if (tfo_enabled(tp) && tcp_tfo_syn(tp, &to)) {
3414 | isconnected = TRUE; |
3415 | } |
3416 | |
3417 | if (iss) { |
3418 | tp->iss = iss; |
3419 | } else { |
3420 | tp->iss = tcp_new_isn(tp); |
3421 | } |
3422 | tp->irs = th->th_seq; |
3423 | tcp_sendseqinit(tp); |
3424 | tcp_rcvseqinit(tp); |
3425 | tp->snd_recover = tp->snd_una; |
3426 | /* |
3427 | * Initialization of the tcpcb for transaction; |
3428 | * set SND.WND = SEG.WND, |
3429 | * initialize CCsend and CCrecv. |
3430 | */ |
3431 | tp->snd_wnd = tiwin; /* initial send-window */ |
3432 | tp->max_sndwnd = tp->snd_wnd; |
3433 | tp->t_flags |= TF_ACKNOW; |
3434 | tp->t_unacksegs = 0; |
3435 | tp->t_unacksegs_ce = 0; |
3436 | DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, |
3437 | struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); |
3438 | TCP_LOG_STATE(tp, TCPS_SYN_RECEIVED); |
3439 | tp->t_state = TCPS_SYN_RECEIVED; |
3440 | tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, |
3441 | TCP_CONN_KEEPINIT(tp)); |
3442 | tp->t_connect_time = tcp_now; |
3443 | dropsocket = 0; /* committed to socket */ |
3444 | |
3445 | if (inp->inp_flowhash == 0) { |
3446 | inp_calc_flowhash(inp); |
3447 | ASSERT(inp->inp_flowhash != 0); |
3448 | } |
3449 | /* update flowinfo - RFC 6437 */ |
3450 | if (inp->inp_flow == 0 && |
3451 | inp->in6p_flags & IN6P_AUTOFLOWLABEL) { |
3452 | inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; |
3453 | inp->inp_flow |= |
3454 | (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); |
3455 | } |
3456 | |
3457 | /* reset the incomp processing flag */ |
3458 | so->so_flags &= ~(SOF_INCOMP_INPROGRESS); |
3459 | tcpstat.tcps_accepts++; |
3460 | |
3461 | int ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE; |
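/*
 * ace_flags assembles the 3-bit ACE field (AE|CWR|ECE): the AE bit
 * is carried in th_x2 (the former reserved bits), so it is shifted
 * in above the regular flag byte before masking with TH_ACE.
 */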
3462 | tcp_input_process_accecn_syn(tp, ace_flags, ip_ecn); |
3463 | |
3464 | /* |
3465 | * The address and connection state are finalized |
3466 | */ |
3467 | TCP_LOG_CONNECT(tp, false, 0); |
3468 | |
3469 | tcp_add_fsw_flow(tp, ifp); |
3470 | |
3471 | goto trimthenstep6; |
3472 | } |
3473 | |
3474 | /* |
3475 | * If the state is SYN_RECEIVED and the seg contains an ACK, |
3476 | * but not for our SYN/ACK, send a RST. |
3477 | */ |
3478 | case TCPS_SYN_RECEIVED: |
3479 | if ((thflags & TH_ACK) && |
3480 | (SEQ_LEQ(th->th_ack, tp->snd_una) || |
3481 | SEQ_GT(th->th_ack, tp->snd_max))) { |
3482 | IF_TCP_STATINC(ifp, ooopacket); |
3483 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad ACK" ); |
3484 | goto dropwithreset; |
3485 | } |
3486 | |
3487 | /* |
* In SYN_RECEIVED state, if we recv some SYNs with
3489 | * window scale and others without, window scaling should |
3490 | * be disabled. Otherwise the window advertised will be |
3491 | * lower if we assume scaling and the other end does not. |
3492 | */ |
3493 | if ((thflags & TH_SYN) && |
3494 | (tp->irs == th->th_seq) && |
3495 | !(to.to_flags & TOF_SCALE)) { |
3496 | tp->t_flags &= ~TF_RCVD_SCALE; |
3497 | } |
3498 | break; |
3499 | |
3500 | /* |
3501 | * If the state is SYN_SENT: |
3502 | * if seg contains an ACK, but not for our SYN, drop the input. |
3503 | * if seg contains a RST, then drop the connection. |
3504 | * if seg does not contain SYN, then drop it. |
3505 | * Otherwise this is an acceptable SYN segment |
3506 | * initialize tp->rcv_nxt and tp->irs |
3507 | * if seg contains ack then advance tp->snd_una |
3508 | * if SYN has been acked change to ESTABLISHED else SYN_RCVD state |
3509 | * arrange for segment to be acked (eventually) |
3510 | * continue processing rest of data/controls, beginning with URG |
3511 | */ |
3512 | case TCPS_SYN_SENT: |
3513 | if ((thflags & TH_ACK) && |
3514 | (SEQ_LEQ(th->th_ack, tp->iss) || |
3515 | SEQ_GT(th->th_ack, tp->snd_max))) { |
3516 | IF_TCP_STATINC(ifp, ooopacket); |
3517 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT bad ACK" ); |
3518 | goto dropwithreset; |
3519 | } |
3520 | if (thflags & TH_RST) { |
3521 | if ((thflags & TH_ACK) != 0) { |
3522 | if (tfo_enabled(tp) && |
3523 | !(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE)) { |
3524 | tcp_heuristic_tfo_rst(tp); |
3525 | } |
3526 | if ((tp->ecn_flags & (TE_SETUPSENT | TE_RCVD_SYN_RST)) == TE_SETUPSENT || |
3527 | (tp->ecn_flags & (TE_ACE_SETUPSENT | TE_RCVD_SYN_RST)) == TE_ACE_SETUPSENT) { |
3528 | /* |
3529 | * On local connections, send |
3530 | * non-ECN syn one time before |
3531 | * dropping the connection |
3532 | */ |
3533 | if (tp->t_flags & TF_LOCAL) { |
3534 | tp->ecn_flags |= TE_RCVD_SYN_RST; |
3535 | goto drop; |
3536 | } else { |
3537 | tcp_heuristic_ecn_synrst(tp); |
3538 | } |
3539 | } |
3540 | soevent(so, |
(SO_FILT_HINT_LOCKED |
3542 | SO_FILT_HINT_CONNRESET)); |
3543 | tp = tcp_drop(tp, ECONNREFUSED); |
3544 | } |
3545 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT got RST" ); |
3546 | goto drop; |
3547 | } |
3548 | if ((thflags & TH_SYN) == 0) { |
3549 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT no SYN" ); |
3550 | goto drop; |
3551 | } |
3552 | tp->snd_wnd = th->th_win; /* initial send window */ |
3553 | tp->max_sndwnd = tp->snd_wnd; |
3554 | |
3555 | tp->irs = th->th_seq; |
3556 | tcp_rcvseqinit(tp); |
3557 | if (thflags & TH_ACK) { |
3558 | /* Client processes SYN-ACK */ |
3559 | tcpstat.tcps_connects++; |
3560 | |
3561 | const uint32_t ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE; |
3562 | |
3563 | if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) { |
3564 | /* Receiving Any|0|1 is classic ECN-setup SYN-ACK */ |
3565 | tp->ecn_flags |= TE_SETUPRECEIVED; |
3566 | if (TCP_ECN_ENABLED(tp)) { |
3567 | tcp_heuristic_ecn_success(tp); |
3568 | tcpstat.tcps_ecn_client_success++; |
3569 | } |
3570 | |
3571 | if (tp->ecn_flags & TE_ACE_SETUPSENT) { |
3572 | /* |
3573 | * Sent AccECN SYN but received classic ECN SYN-ACK |
3574 | * Set classic ECN related flags |
3575 | */ |
3576 | tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT); |
3577 | tp->ecn_flags &= ~TE_ACE_SETUPSENT; |
3578 | if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) { |
3579 | tp->t_client_accecn_state = tcp_connection_client_classic_ecn_available; |
3580 | } |
3581 | } |
3582 | } else if (TCP_ACC_ECN_ENABLED(tp) && ace_flags != 0 && |
3583 | ace_flags != TH_ACE) { |
3584 | /* Initialize sender side packet & byte counters */ |
3585 | tp->t_snd_ce_packets = 5; |
3586 | tp->t_snd_ect1_bytes = tp->t_snd_ect0_bytes = 1; |
3587 | tp->t_snd_ce_bytes = 0; |
3588 | tp->ecn_flags |= TE_ACE_FINAL_ACK_3WHS; |
3589 | /* |
3590 | * Client received AccECN SYN-ACK that reflects the state (ECN) |
3591 | * in which SYN packet was delivered. This helps to detect if |
3592 | * there was mangling of the SYN packet on the path. Currently, we |
3593 | * only send Not-ECT on SYN packets. So, we should set Not-ECT in |
3594 | * all packets if we receive any encoding other than 0|TH_CWR|0. |
3595 | * If 0|0|0 and 1|1|1 were received, fail Accurate ECN negotiation |
3596 | * by not setting TE_ACE_SETUPRECEIVED. |
3597 | */ |
3598 | switch (ace_flags) { |
3599 | case (0 | TH_CWR | 0): |
3600 | /* Non-ECT SYN was delivered */ |
3601 | tp->ecn_flags |= TE_ACE_SETUPRECEIVED; |
3602 | tcpstat.tcps_ecn_ace_syn_not_ect++; |
3603 | tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success; |
3604 | break; |
3605 | case (0 | TH_CWR | TH_ECE): |
3606 | /* ECT1 SYN was delivered */ |
3607 | tp->ecn_flags |= TE_ACE_SETUPRECEIVED; |
3608 | /* Mangling detected, set Non-ECT on outgoing packets */ |
3609 | tp->ecn_flags &= ~TE_SENDIPECT; |
3610 | tcpstat.tcps_ecn_ace_syn_ect1++; |
3611 | tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected; |
3612 | break; |
3613 | case (TH_AE | 0 | 0): |
3614 | /* ECT0 SYN was delivered */ |
3615 | tp->ecn_flags |= TE_ACE_SETUPRECEIVED; |
3616 | /* Mangling detected, set Non-ECT on outgoing packets */ |
3617 | tp->ecn_flags &= ~TE_SENDIPECT; |
3618 | tcpstat.tcps_ecn_ace_syn_ect0++; |
3619 | tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected; |
3620 | break; |
3621 | case (TH_AE | TH_CWR | 0): |
3622 | /* CE SYN was delivered */ |
3623 | tp->ecn_flags |= TE_ACE_SETUPRECEIVED; |
3624 | /* Mangling detected, set Non-ECT on outgoing packets */ |
3625 | tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected; |
3626 | tp->ecn_flags &= ~TE_SENDIPECT; |
3627 | /* |
3628 | * Although we don't send ECT SYN yet, it is possible that |
3629 | * a network element changed Not-ECT to ECT and later there |
3630 | * was congestion at another network element that set it to CE. |
3631 | * To keep it simple, we will consider this as a congestion event |
3632 | * for the congestion controller. |
3633 | * If a TCP client in AccECN mode receives CE feedback in the TCP |
3634 | * flags of a SYN/ACK, it MUST NOT increment s.cep. |
3635 | */ |
3636 | tcpstat.tcps_ecn_ace_syn_ce++; |
3637 | break; |
3638 | default: |
3639 | break; |
3640 | } |
3641 | if (TCP_ECN_ENABLED(tp)) { |
3642 | tcp_heuristic_ecn_success(tp); |
3643 | tcpstat.tcps_ecn_client_success++; |
3644 | } |
3645 | /* |
3646 | * A TCP client in AccECN mode MUST feed back which of the 4 |
3647 | * possible values of the IP-ECN field that was received in the |
3648 | * SYN/ACK. Set the setup flag for final ACK accordingly. |
3649 | * We will initialize r.cep, r.e1b, r.e0b first and then increment |
3650 | * if CE was set on the IP-ECN field of the SYN-ACK. |
3651 | */ |
3652 | tp->t_rcv_ce_packets = 5; |
3653 | tp->t_rcv_ect0_bytes = tp->t_rcv_ect1_bytes = 1; |
3654 | tp->t_rcv_ce_bytes = 0; |
3655 | |
3656 | /* Increment packet & byte counters based on IP-ECN */ |
tcp_input_ip_ecn(tp, inp, (uint32_t)tlen, (uint32_t)segment_count, ip_ecn);
3658 | |
3659 | switch (ip_ecn) { |
3660 | case IPTOS_ECN_NOTECT: |
3661 | /* Not-ECT SYN-ACK was received */ |
3662 | tp->ecn_flags |= TE_ACE_SETUP_NON_ECT; |
3663 | break; |
3664 | case IPTOS_ECN_ECT1: |
3665 | /* ECT1 SYN-ACK was received */ |
3666 | tp->ecn_flags |= TE_ACE_SETUP_ECT1; |
3667 | break; |
3668 | case IPTOS_ECN_ECT0: |
3669 | /* ECT0 SYN-ACK was received */ |
3670 | tp->ecn_flags |= TE_ACE_SETUP_ECT0; |
3671 | break; |
3672 | case IPTOS_ECN_CE: |
3673 | tp->ecn_flags |= TE_ACE_SETUP_CE; |
3674 | break; |
3675 | } |
3676 | } else { |
3677 | if ((tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) && |
3678 | tp->t_rxtshift == 0) { |
3679 | tcp_heuristic_ecn_success(tp); |
3680 | tcpstat.tcps_ecn_not_supported++; |
3681 | } |
3682 | if ((tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) && |
3683 | tp->t_rxtshift > 0) { |
3684 | tcp_heuristic_ecn_loss(tp); |
3685 | } |
3686 | |
3687 | /* non-ECN-setup SYN-ACK */ |
3688 | tp->ecn_flags &= ~TE_SENDIPECT; |
3689 | /* |
3690 | * If Accurate ECN SYN was retransmitted twice and non-ECN SYN-ACK |
3691 | * was received, then we consider it as Accurate ECN blackholing |
3692 | */ |
3693 | if ((tp->ecn_flags & TE_LOST_SYN) && tp->t_rxtshift <= 2 && |
3694 | tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) { |
3695 | tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_blackholed; |
3696 | } |
3697 | /* |
3698 | * If SYN wasn't retransmitted twice yet, the server supports neither classic nor |
3699 | * accurate ECN SYN-ACK. Accurate ECN should already be disabled for both half connections |
3700 | * as TE_ACE_SETUPRECEIVED flag is not set. |
3701 | */ |
3702 | if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) { |
3703 | tp->t_client_accecn_state = tcp_connection_client_ecn_not_available; |
3704 | } |
3705 | } |
3706 | |
3707 | /* Do window scaling on this connection? */ |
3708 | if (TCP_WINDOW_SCALE_ENABLED(tp)) { |
3709 | tp->snd_scale = tp->requested_s_scale; |
3710 | tp->rcv_scale = tp->request_r_scale; |
3711 | } |
3712 | |
uint32_t recwin = min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
3714 | if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.get_rlwin != NULL) { |
3715 | /* For a LBE receiver, also use rledbat_win */ |
3716 | uint32_t rledbat_win = tcp_cc_rledbat.get_rlwin(tp); |
3717 | if (rledbat_win > 0) { |
recwin = min(recwin, rledbat_win);
3719 | } |
3720 | } |
3721 | tp->rcv_adv += recwin; |
3722 | |
3723 | tp->snd_una++; /* SYN is acked */ |
3724 | if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { |
3725 | tp->snd_nxt = tp->snd_una; |
3726 | } |
3727 | |
3728 | /* |
3729 | * We have sent more in the SYN than what is being |
3730 | * acked. (e.g., TFO) |
3731 | * We should restart the sending from what the receiver |
3732 | * has acknowledged immediately. |
3733 | */ |
3734 | if (SEQ_GT(tp->snd_nxt, th->th_ack)) { |
3735 | /* |
3736 | * rdar://problem/33214601 |
3737 | * There is a middlebox that acks all but one |
3738 | * byte and still drops the data. |
3739 | */ |
3740 | if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) && |
3741 | (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) && |
3742 | tp->snd_max == th->th_ack + 1 && |
3743 | tp->snd_max > tp->snd_una + 1) { |
3744 | tcp_heuristic_tfo_middlebox(tp); |
3745 | |
3746 | so->so_error = ENODATA; |
3747 | soevent(so, |
(SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
3749 | |
3750 | tp->t_tfo_stats |= TFO_S_ONE_BYTE_PROXY; |
3751 | } |
3752 | |
3753 | tp->snd_max = tp->snd_nxt = th->th_ack; |
3754 | } |
3755 | |
3756 | /* |
3757 | * If there's data, delay ACK; if there's also a FIN |
3758 | * ACKNOW will be turned on later. |
3759 | */ |
3760 | TCP_INC_VAR(tp->t_unacksegs, segment_count); |
3761 | if (TCP_ACC_ECN_ON(tp) && ip_ecn == IPTOS_ECN_CE) { |
3762 | TCP_INC_VAR(tp->t_unacksegs_ce, segment_count); |
3763 | } |
3764 | if (DELAY_ACK(tp, th) && tlen != 0) { |
3765 | if ((tp->t_flags & TF_DELACK) == 0) { |
3766 | tp->t_flags |= TF_DELACK; |
3767 | tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack); |
3768 | } |
3769 | } else { |
3770 | tp->t_flags |= TF_ACKNOW; |
3771 | } |
3772 | /* |
3773 | * Received <SYN,ACK> in SYN_SENT[*] state. |
3774 | * Transitions: |
3775 | * SYN_SENT --> ESTABLISHED |
3776 | * SYN_SENT* --> FIN_WAIT_1 |
3777 | */ |
3778 | tp->t_starttime = tcp_now; |
3779 | tcp_sbrcv_tstmp_check(tp); |
3780 | if (tp->t_flags & TF_NEEDFIN) { |
3781 | DTRACE_TCP4(state__change, void, NULL, |
3782 | struct inpcb *, inp, |
3783 | struct tcpcb *, tp, int32_t, |
3784 | TCPS_FIN_WAIT_1); |
3785 | TCP_LOG_STATE(tp, TCPS_FIN_WAIT_1); |
3786 | tp->t_state = TCPS_FIN_WAIT_1; |
3787 | tp->t_flags &= ~TF_NEEDFIN; |
3788 | thflags &= ~TH_SYN; |
3789 | |
3790 | TCP_LOG_CONNECTION_SUMMARY(tp); |
3791 | } else { |
3792 | DTRACE_TCP4(state__change, void, NULL, |
3793 | struct inpcb *, inp, struct tcpcb *, |
3794 | tp, int32_t, TCPS_ESTABLISHED); |
3795 | TCP_LOG_STATE(tp, TCPS_ESTABLISHED); |
3796 | tp->t_state = TCPS_ESTABLISHED; |
3797 | tp->t_timer[TCPT_KEEP] = |
3798 | OFFSET_FROM_START(tp, |
3799 | TCP_CONN_KEEPIDLE(tp)); |
3800 | if (nstat_collect) { |
3801 | nstat_route_connect_success( |
inp->inp_route.ro_rt);
3803 | } |
3804 | TCP_LOG_CONNECTED(tp, 0); |
3805 | /* |
3806 | * The SYN is acknowledged but una is not |
3807 | * updated yet. So pass the value of |
3808 | * ack to compute sndbytes correctly |
3809 | */ |
3810 | inp_count_sndbytes(inp, th->th_ack); |
3811 | } |
3812 | tp->t_forced_acks = TCP_FORCED_ACKS_COUNT; |
3813 | #if MPTCP |
3814 | /* |
3815 | * Do not send the connect notification for additional |
3816 | * subflows until ACK for 3-way handshake arrives. |
3817 | */ |
3818 | if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) && |
3819 | (tp->t_mpflags & TMPF_SENT_JOIN)) { |
3820 | isconnected = FALSE; |
3821 | } else |
3822 | #endif /* MPTCP */ |
3823 | isconnected = TRUE; |
3824 | |
3825 | if ((tp->t_tfo_flags & (TFO_F_COOKIE_REQ | TFO_F_COOKIE_SENT)) || |
3826 | (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT)) { |
tcp_tfo_synack(tp, &to);
3828 | |
3829 | if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) && |
3830 | SEQ_LT(tp->snd_una, th->th_ack)) { |
3831 | tp->t_tfo_stats |= TFO_S_SYN_DATA_ACKED; |
3832 | tcpstat.tcps_tfo_syn_data_acked++; |
3833 | #if MPTCP |
3834 | if (so->so_flags & SOF_MP_SUBFLOW) { |
3835 | so->so_flags1 |= SOF1_TFO_REWIND; |
3836 | } |
3837 | #endif |
3838 | tcp_tfo_rcv_probe(tp, tlen); |
3839 | } |
3840 | } |
3841 | } else { |
3842 | /* |
3843 | * Received initial SYN in SYN-SENT[*] state => simul- |
3844 | * taneous open. |
3845 | * Do 3-way handshake: |
3846 | * SYN-SENT -> SYN-RECEIVED |
3847 | * SYN-SENT* -> SYN-RECEIVED* |
3848 | */ |
3849 | tp->t_flags |= TF_ACKNOW; |
3850 | tp->t_timer[TCPT_REXMT] = 0; |
3851 | DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, |
3852 | struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED); |
3853 | TCP_LOG_STATE(tp, TCPS_SYN_RECEIVED); |
3854 | tp->t_state = TCPS_SYN_RECEIVED; |
3855 | |
3856 | /* |
3857 | * During simultaneous open, TFO should not be used. |
* So, we disable it here to prevent data from being
* sent on the SYN/ACK.
3860 | */ |
3861 | tcp_disable_tfo(tp); |
3862 | } |
3863 | |
3864 | trimthenstep6: |
3865 | /* |
3866 | * Advance th->th_seq to correspond to first data byte. |
3867 | * If data, trim to stay within window, |
3868 | * dropping FIN if necessary. |
3869 | */ |
3870 | th->th_seq++; |
3871 | if (tlen > tp->rcv_wnd) { |
3872 | todrop = tlen - tp->rcv_wnd; |
3873 | m_adj(m, -todrop); |
3874 | tlen = tp->rcv_wnd; |
3875 | thflags &= ~TH_FIN; |
3876 | tcpstat.tcps_rcvpackafterwin++; |
3877 | tcpstat.tcps_rcvbyteafterwin += todrop; |
3878 | } |
3879 | tp->snd_wl1 = th->th_seq - 1; |
3880 | tp->rcv_up = th->th_seq; |
3881 | /* |
3882 | * Client side of transaction: already sent SYN and data. |
3883 | * If the remote host used T/TCP to validate the SYN, |
3884 | * our data will be ACK'd; if so, enter normal data segment |
3885 | * processing in the middle of step 5, ack processing. |
3886 | * Otherwise, goto step 6. |
3887 | */ |
3888 | if (thflags & TH_ACK) { |
3889 | goto process_ACK; |
3890 | } |
3891 | goto step6; |
3892 | /* |
3893 | * If the state is LAST_ACK or CLOSING or TIME_WAIT: |
3894 | * do normal processing. |
3895 | * |
3896 | * NB: Leftover from RFC1644 T/TCP. Cases to be reused later. |
3897 | */ |
3898 | case TCPS_LAST_ACK: |
3899 | case TCPS_CLOSING: |
3900 | case TCPS_TIME_WAIT: |
3901 | break; /* continue normal processing */ |
3902 | |
3903 | /* Received a SYN while connection is already established. |
3904 | * This is a "half open connection and other anomalies" described |
* in RFC793 page 34: send an ACK so the remote end resets the connection
* or recovers by adjusting its sequence numbering. Sending an ACK is
3907 | * in accordance with RFC 5961 Section 4.2 |
3908 | * |
3909 | * For Accurate ECN, if we receive a packet with SYN in ESTABLISHED |
3910 | * state, we don't send the handshake encoding. |
3911 | */ |
3912 | case TCPS_ESTABLISHED: |
3913 | if (thflags & TH_SYN && tlen <= 0) { |
3914 | /* Drop the packet silently if we have reached the limit */ |
3915 | if (tcp_is_ack_ratelimited(tp)) { |
3916 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited" ); |
3917 | goto drop; |
3918 | } else { |
3919 | /* Send challenge ACK */ |
3920 | tcpstat.tcps_synchallenge++; |
3921 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK" ); |
3922 | goto dropafterack; |
3923 | } |
3924 | } |
3925 | break; |
3926 | } |
3927 | |
3928 | /* |
3929 | * States other than LISTEN or SYN_SENT. |
3930 | * First check the RST flag and sequence number since reset segments |
3931 | * are exempt from the timestamp and connection count tests. This |
3932 | * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix |
3933 | * below which allowed reset segments in half the sequence space |
* to fall through and be processed (which gives forged reset
3935 | * segments with a random sequence number a 50 percent chance of |
3936 | * killing a connection). |
3937 | * Then check timestamp, if present. |
3938 | * Then check the connection count, if present. |
3939 | * Then check that at least some bytes of segment are within |
3940 | * receive window. If segment begins before rcv_nxt, |
3941 | * drop leading data (and SYN); if nothing left, just ack. |
3942 | * |
3943 | * |
3944 | * If the RST bit is set, check the sequence number to see |
3945 | * if this is a valid reset segment. |
3946 | * RFC 793 page 37: |
3947 | * In all states except SYN-SENT, all reset (RST) segments |
3948 | * are validated by checking their SEQ-fields. A reset is |
3949 | * valid if its sequence number is in the window. |
3950 | * Note: this does not take into account delayed ACKs, so |
3951 | * we should test against last_ack_sent instead of rcv_nxt. |
3952 | * The sequence number in the reset segment is normally an |
* echo of our outgoing acknowledgement numbers, but some hosts
3954 | * send a reset with the sequence number at the rightmost edge |
3955 | * of our receive window, and we have to handle this case. |
3956 | * Note 2: Paul Watson's paper "Slipping in the Window" has shown |
3957 | * that brute force RST attacks are possible. To combat this, |
3958 | * we use a much stricter check while in the ESTABLISHED state, |
3959 | * only accepting RSTs where the sequence number is equal to |
3960 | * last_ack_sent. In all other states (the states in which a |
3961 | * RST is more likely), the more permissive check is used. |
3962 | * RFC 5961 Section 3.2: if the RST bit is set, sequence # is |
3963 | * within the receive window and last_ack_sent == seq, |
3964 | * then reset the connection. Otherwise if the seq doesn't |
3965 | * match last_ack_sent, TCP must send challenge ACK. Perform |
3966 | * rate limitation when sending the challenge ACK. |
* If we have multiple segments in flight, the initial reset
3968 | * segment sequence numbers will be to the left of last_ack_sent, |
3969 | * but they will eventually catch up. |
3970 | * In any case, it never made sense to trim reset segments to |
3971 | * fit the receive window since RFC 1122 says: |
3972 | * 4.2.2.12 RST Segment: RFC-793 Section 3.4 |
3973 | * |
3974 | * A TCP SHOULD allow a received RST segment to include data. |
3975 | * |
3976 | * DISCUSSION |
3977 | * It has been suggested that a RST segment could contain |
3978 | * ASCII text that encoded and explained the cause of the |
3979 | * RST. No standard has yet been established for such |
3980 | * data. |
3981 | * |
3982 | * If the reset segment passes the sequence number test examine |
3983 | * the state: |
3984 | * SYN_RECEIVED STATE: |
3985 | * If passive open, return to LISTEN state. |
3986 | * If active open, inform user that connection was refused. |
3987 | * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES: |
3988 | * Inform user that connection was reset, and close tcb. |
3989 | * CLOSING, LAST_ACK STATES: |
3990 | * Close the tcb. |
3991 | * TIME_WAIT STATE: |
3992 | * Drop the segment - see Stevens, vol. 2, p. 964 and |
3993 | * RFC 1337. |
3994 | * |
3995 | * Radar 4803931: Allows for the case where we ACKed the FIN but |
3996 | * there is already a RST in flight from the peer. |
3997 | * In that case, accept the RST for non-established |
3998 | * state if it's one off from last_ack_sent. |
3999 | * |
4000 | */ |
4001 | if (thflags & TH_RST) { |
4002 | if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && |
4003 | SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || |
4004 | (tp->rcv_wnd == 0 && |
4005 | ((tp->last_ack_sent == th->th_seq) || |
4006 | ((tp->last_ack_sent - 1) == th->th_seq)))) { |
4007 | if (tp->last_ack_sent == th->th_seq) { |
4008 | switch (tp->t_state) { |
4009 | case TCPS_SYN_RECEIVED: |
4010 | IF_TCP_STATINC(ifp, rstinsynrcv); |
4011 | so->so_error = ECONNREFUSED; |
4012 | goto close; |
4013 | |
4014 | case TCPS_ESTABLISHED: |
4015 | if ((TCP_ECN_ENABLED(tp) || TCP_ACC_ECN_ON(tp)) && |
4016 | tp->snd_una == tp->iss + 1 && |
4017 | SEQ_GT(tp->snd_max, tp->snd_una)) { |
4018 | /* |
4019 | * If the first data packet on an |
* ECN connection receives a RST,
4021 | * increment the heuristic |
4022 | */ |
4023 | tcp_heuristic_ecn_droprst(tp); |
4024 | } |
4025 | OS_FALLTHROUGH; |
4026 | case TCPS_FIN_WAIT_1: |
4027 | case TCPS_CLOSE_WAIT: |
4028 | case TCPS_FIN_WAIT_2: |
4029 | so->so_error = ECONNRESET; |
4030 | close: |
4031 | soevent(so, |
(SO_FILT_HINT_LOCKED |
4033 | SO_FILT_HINT_CONNRESET)); |
4034 | |
4035 | tcpstat.tcps_drops++; |
4036 | tp = tcp_close(tp); |
4037 | break; |
4038 | |
4039 | case TCPS_CLOSING: |
4040 | case TCPS_LAST_ACK: |
4041 | tp = tcp_close(tp); |
4042 | break; |
4043 | |
4044 | case TCPS_TIME_WAIT: |
4045 | break; |
4046 | } |
4047 | } else { |
4048 | tcpstat.tcps_badrst++; |
4049 | /* Drop if we have reached the ACK limit */ |
4050 | if (tcp_is_ack_ratelimited(tp)) { |
4051 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited" ); |
4052 | goto drop; |
4053 | } else { |
4054 | /* Send challenge ACK */ |
4055 | tcpstat.tcps_rstchallenge++; |
4056 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK" ); |
4057 | goto dropafterack; |
4058 | } |
4059 | } |
4060 | } |
4061 | goto drop; |
4062 | } |
4063 | |
4064 | /* |
4065 | * RFC 1323 PAWS: If we have a timestamp reply on this segment |
4066 | * and it's less than ts_recent, drop it. |
4067 | */ |
4068 | if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent && |
4069 | TSTMP_LT(to.to_tsval, tp->ts_recent)) { |
4070 | /* Check to see if ts_recent is over 24 days old. */ |
4071 | if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) { |
4072 | /* |
4073 | * Invalidate ts_recent. If this segment updates |
4074 | * ts_recent, the age will be reset later and ts_recent |
4075 | * will get a valid value. If it does not, setting |
4076 | * ts_recent to zero will at least satisfy the |
4077 | * requirement that zero be placed in the timestamp |
4078 | * echo reply when ts_recent isn't valid. The |
4079 | * age isn't reset until we get a valid ts_recent |
4080 | * because we don't want out-of-order segments to be |
4081 | * dropped when ts_recent is old. |
4082 | */ |
4083 | tp->ts_recent = 0; |
4084 | } else { |
4085 | tcpstat.tcps_rcvduppack++; |
4086 | tcpstat.tcps_rcvdupbyte += tlen; |
4087 | tp->t_pawsdrop++; |
4088 | tcpstat.tcps_pawsdrop++; |
4089 | |
4090 | /* |
4091 | * PAWS-drop when ECN is being used? That indicates |
4092 | * that ECT-marked packets take a different path, with |
4093 | * different congestion-characteristics. |
4094 | * |
4095 | * Only fallback when we did send less than 2GB as PAWS |
4096 | * really has no reason to kick in earlier. |
4097 | */ |
4098 | if ((TCP_ECN_ENABLED(tp) || TCP_ACC_ECN_ON(tp)) && |
4099 | inp->inp_stat->rxbytes < 2147483648) { |
4100 | INP_INC_IFNET_STAT(inp, ecn_fallback_reorder); |
4101 | tcpstat.tcps_ecn_fallback_reorder++; |
4102 | tcp_heuristic_ecn_aggressive(tp); |
4103 | } |
4104 | |
4105 | if (nstat_collect) { |
nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
    1, tlen, NSTAT_RX_FLAG_DUPLICATE);
4108 | INP_ADD_STAT(inp, cell, wifi, wired, |
4109 | rxpackets, 1); |
4110 | INP_ADD_STAT(inp, cell, wifi, wired, |
4111 | rxbytes, tlen); |
4112 | tp->t_stat.rxduplicatebytes += tlen; |
4113 | inp_set_activity_bitmap(inp); |
4114 | } |
4115 | if (tlen > 0) { |
4116 | goto dropafterack; |
4117 | } |
4118 | goto drop; |
4119 | } |
4120 | } |
4121 | |
4122 | /* |
4123 | * In the SYN-RECEIVED state, validate that the packet belongs to |
4124 | * this connection before trimming the data to fit the receive |
4125 | * window. Check the sequence number versus IRS since we know |
4126 | * the sequence numbers haven't wrapped. This is a partial fix |
4127 | * for the "LAND" DoS attack. |
4128 | */ |
4129 | if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) { |
4130 | IF_TCP_STATINC(ifp, dospacket); |
4131 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad SEQ" ); |
4132 | goto dropwithreset; |
4133 | } |
4134 | |
4135 | /* |
4136 | * Check if there is old data at the beginning of the window |
4137 | * i.e. the sequence number is before rcv_nxt |
4138 | */ |
4139 | todrop = tp->rcv_nxt - th->th_seq; |
4140 | if (todrop > 0) { |
4141 | boolean_t is_syn_set = FALSE; |
4142 | |
4143 | if (thflags & TH_SYN) { |
4144 | is_syn_set = TRUE; |
4145 | thflags &= ~TH_SYN; |
4146 | th->th_seq++; |
4147 | if (th->th_urp > 1) { |
4148 | th->th_urp--; |
4149 | } else { |
4150 | thflags &= ~TH_URG; |
4151 | } |
4152 | todrop--; |
4153 | } |
4154 | /* |
4155 | * Following if statement from Stevens, vol. 2, p. 960. |
4156 | * The amount of duplicate data is greater than or equal |
4157 | * to the size of the segment - entire segment is duplicate |
4158 | */ |
4159 | if (todrop > tlen |
4160 | || (todrop == tlen && (thflags & TH_FIN) == 0)) { |
4161 | /* |
4162 | * Any valid FIN must be to the left of the window. |
4163 | * At this point the FIN must be a duplicate or out |
4164 | * of sequence; drop it. |
4165 | */ |
4166 | thflags &= ~TH_FIN; |
4167 | |
4168 | /* |
4169 | * Send an ACK to resynchronize and drop any data. |
4170 | * But keep on processing for RST or ACK. |
4171 | * |
4172 | * If the SYN bit was originally set, then only send |
4173 | * an ACK if we are not rate-limiting this connection. |
4174 | */ |
4175 | if (is_syn_set) { |
4176 | if (!tcp_is_ack_ratelimited(tp)) { |
4177 | tcpstat.tcps_synchallenge++; |
4178 | tp->t_flags |= TF_ACKNOW; |
4179 | } |
4180 | } else { |
4181 | tp->t_flags |= TF_ACKNOW; |
4182 | } |
4183 | |
4184 | if (todrop == 1) { |
4185 | /* This could be a keepalive */ |
4186 | soevent(so, SO_FILT_HINT_LOCKED | |
4187 | SO_FILT_HINT_KEEPALIVE); |
4188 | } |
4189 | todrop = tlen; |
4190 | tcpstat.tcps_rcvduppack++; |
4191 | tcpstat.tcps_rcvdupbyte += todrop; |
4192 | } else { |
4193 | tcpstat.tcps_rcvpartduppack++; |
4194 | tcpstat.tcps_rcvpartdupbyte += todrop; |
4195 | } |
4196 | |
4197 | if (todrop > 1) { |
4198 | /* |
4199 | * Note the duplicate data sequence space so that |
4200 | * it can be reported in DSACK option. |
4201 | */ |
4202 | tp->t_dsack_lseq = th->th_seq; |
4203 | tp->t_dsack_rseq = th->th_seq + todrop; |
4204 | tp->t_flags |= TF_ACKNOW; |
4205 | } |
4206 | if (nstat_collect) { |
nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
    todrop, NSTAT_RX_FLAG_DUPLICATE);
4209 | INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1); |
4210 | INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop); |
4211 | tp->t_stat.rxduplicatebytes += todrop; |
4212 | inp_set_activity_bitmap(inp); |
4213 | } |
4214 | drop_hdrlen += todrop; /* drop from the top afterwards */ |
4215 | th->th_seq += todrop; |
4216 | tlen -= todrop; |
4217 | if (th->th_urp > todrop) { |
4218 | th->th_urp -= todrop; |
4219 | } else { |
4220 | thflags &= ~TH_URG; |
4221 | th->th_urp = 0; |
4222 | } |
4223 | } |
4224 | |
4225 | /* |
4226 | * If new data are received on a connection after the user |
4227 | * processes are gone, then RST the other end. |
4228 | * Send also a RST when we received a data segment after we've |
4229 | * sent our FIN when the socket is defunct. |
4230 | * Note that an MPTCP subflow socket would have SS_NOFDREF set |
4231 | * by default. So, if it's an MPTCP-subflow we rather check the |
4232 | * MPTCP-level's socket state for SS_NOFDREF. |
4233 | */ |
4234 | if (tlen) { |
4235 | boolean_t close_it = FALSE; |
4236 | |
4237 | if (!(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF) && |
4238 | tp->t_state > TCPS_CLOSE_WAIT) { |
4239 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SS_NOFDREF" ); |
4240 | close_it = TRUE; |
4241 | } |
4242 | |
if ((so->so_flags & SOF_MP_SUBFLOW) && (mptetoso(tptomptp(tp)->mpt_mpte)->so_state & SS_NOFDREF) &&
4244 | tp->t_state > TCPS_CLOSE_WAIT) { |
4245 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_MP_SUBFLOW SS_NOFDREF" ); |
4246 | close_it = TRUE; |
4247 | } |
4248 | |
4249 | if ((so->so_flags & SOF_DEFUNCT) && tp->t_state > TCPS_FIN_WAIT_1) { |
4250 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_DEFUNCT" ); |
4251 | close_it = TRUE; |
4252 | } |
4253 | |
4254 | if (so->so_state & SS_CANTRCVMORE) { |
4255 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SS_CANTRCVMORE" ); |
4256 | close_it = TRUE; |
4257 | } |
4258 | |
4259 | if (close_it) { |
4260 | tp = tcp_close(tp); |
4261 | tcpstat.tcps_rcvafterclose++; |
4262 | IF_TCP_STATINC(ifp, cleanup); |
4263 | goto dropwithreset; |
4264 | } |
4265 | } |
4266 | |
4267 | /* |
4268 | * If segment ends after window, drop trailing data |
4269 | * (and PUSH and FIN); if nothing left, just ACK. |
4270 | */ |
4271 | todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); |
4272 | if (todrop > 0) { |
4273 | tcpstat.tcps_rcvpackafterwin++; |
4274 | if (todrop >= tlen) { |
4275 | tcpstat.tcps_rcvbyteafterwin += tlen; |
4276 | /* |
4277 | * If a new connection request is received |
4278 | * while in TIME_WAIT, drop the old connection |
4279 | * and start over if the sequence numbers |
4280 | * are above the previous ones. |
4281 | */ |
4282 | if (thflags & TH_SYN && |
4283 | tp->t_state == TCPS_TIME_WAIT && |
4284 | SEQ_GT(th->th_seq, tp->rcv_nxt)) { |
4285 | iss = tcp_new_isn(tp); |
4286 | tp = tcp_close(tp); |
socket_unlock(so, 1);
4288 | goto findpcb; |
4289 | } |
4290 | /* |
4291 | * If window is closed can only take segments at |
4292 | * window edge, and have to drop data and PUSH from |
4293 | * incoming segments. Continue processing, but |
4294 | * remember to ack. Otherwise, drop segment |
4295 | * and ack. |
4296 | */ |
4297 | if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { |
4298 | tp->t_flags |= TF_ACKNOW; |
4299 | tcpstat.tcps_rcvwinprobe++; |
4300 | } else { |
4301 | goto dropafterack; |
4302 | } |
4303 | } else { |
4304 | tcpstat.tcps_rcvbyteafterwin += todrop; |
4305 | } |
4306 | m_adj(m, -todrop); |
4307 | tlen -= todrop; |
4308 | thflags &= ~(TH_PUSH | TH_FIN); |
4309 | } |
4310 | |
4311 | /* |
4312 | * If last ACK falls within this segment's sequence numbers, |
4313 | * record its timestamp. |
4314 | * NOTE: |
4315 | * 1) That the test incorporates suggestions from the latest |
4316 | * proposal of the tcplw@cray.com list (Braden 1993/04/26). |
4317 | * 2) That updating only on newer timestamps interferes with |
4318 | * our earlier PAWS tests, so this check should be solely |
4319 | * predicated on the sequence space of this segment. |
4320 | * 3) That we modify the segment boundary check to be |
4321 | * Last.ACK.Sent <= SEG.SEQ + SEG.Len |
4322 | * instead of RFC1323's |
4323 | * Last.ACK.Sent < SEG.SEQ + SEG.Len, |
4324 | * This modified check allows us to overcome RFC1323's |
4325 | * limitations as described in Stevens TCP/IP Illustrated |
4326 | * Vol. 2 p.869. In such cases, we can still calculate the |
4327 | * RTT correctly when RCV.NXT == Last.ACK.Sent. |
4328 | */ |
4329 | if ((to.to_flags & TOF_TS) != 0 && |
4330 | SEQ_LEQ(th->th_seq, tp->last_ack_sent) && |
4331 | SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + |
4332 | ((thflags & (TH_SYN | TH_FIN)) != 0))) { |
4333 | tp->ts_recent_age = tcp_now; |
4334 | tp->ts_recent = to.to_tsval; |
4335 | } |
4336 | |
4337 | /* |
4338 | * Stevens: If a SYN is in the window, then this is an |
4339 | * error and we send an RST and drop the connection. |
4340 | * |
4341 | * RFC 5961 Section 4.2 |
4342 | * Send challenge ACK for any SYN in synchronized state |
4343 | * Perform rate limitation in doing so. |
4344 | */ |
4345 | if (thflags & TH_SYN) { |
if (!tcp_syn_data_valid(tp, th, tlen)) {
4347 | tcpstat.tcps_badsyn++; |
4348 | /* Drop if we have reached ACK limit */ |
4349 | if (tcp_is_ack_ratelimited(tp)) { |
4350 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN rate limited" ); |
4351 | goto drop; |
4352 | } else { |
4353 | /* Send challenge ACK */ |
4354 | tcpstat.tcps_synchallenge++; |
4355 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN challenge ack" ); |
4356 | goto dropafterack; |
4357 | } |
4358 | } else { |
4359 | /* |
4360 | * Received SYN (/ACK) with data. |
4361 | * Move sequence number along to process the data. |
4362 | */ |
4363 | th->th_seq++; |
4364 | thflags &= ~TH_SYN; |
4365 | } |
4366 | } |
4367 | |
4368 | /* |
4369 | * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN |
4370 | * flag is on (half-synchronized state), then queue data for |
4371 | * later processing; else drop segment and return. |
4372 | */ |
4373 | if ((thflags & TH_ACK) == 0) { |
4374 | if (tp->t_state == TCPS_SYN_RECEIVED) { |
4375 | if ((tfo_enabled(tp))) { |
4376 | /* |
4377 | * So, we received a valid segment while in |
4378 | * SYN-RECEIVED. |
* As this cannot be an RST (see the check a bit
* higher up), and it does not have the ACK-flag
4381 | * set, we want to retransmit the SYN/ACK. |
4382 | * Thus, we have to reset snd_nxt to snd_una to |
4383 | * trigger the going back to sending of the |
4384 | * SYN/ACK. This is more consistent with the |
4385 | * behavior of tcp_output(), which expects |
4386 | * to send the segment that is pointed to by |
4387 | * snd_nxt. |
4388 | */ |
4389 | tp->snd_nxt = tp->snd_una; |
4390 | |
4391 | /* |
4392 | * We need to make absolutely sure that we are |
4393 | * going to reply upon a duplicate SYN-segment. |
4394 | */ |
4395 | if (th->th_flags & TH_SYN) { |
4396 | needoutput = 1; |
4397 | } |
4398 | } |
4399 | /* Process this same as newly received Accurate ECN SYN */ |
4400 | int ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE; |
4401 | tcp_input_process_accecn_syn(tp, ace_flags, ip_ecn); |
4402 | |
4403 | goto step6; |
4404 | } else if (tp->t_flags & TF_ACKNOW) { |
4405 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK" ); |
4406 | goto dropafterack; |
4407 | } else { |
4408 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK" ); |
4409 | goto drop; |
4410 | } |
4411 | } |
4412 | |
4413 | /* |
4414 | * Ack processing. |
4415 | */ |
4416 | |
4417 | switch (tp->t_state) { |
4418 | /* |
4419 | * In SYN_RECEIVED state, the ack ACKs our SYN, so enter |
4420 | * ESTABLISHED state and continue processing. |
4421 | * The ACK was checked above. |
4422 | */ |
4423 | case TCPS_SYN_RECEIVED: |
4424 | |
4425 | tcpstat.tcps_connects++; |
4426 | |
4427 | /* Do window scaling? */ |
4428 | if (TCP_WINDOW_SCALE_ENABLED(tp)) { |
4429 | tp->snd_scale = tp->requested_s_scale; |
4430 | tp->rcv_scale = tp->request_r_scale; |
4431 | tp->snd_wnd = th->th_win << tp->snd_scale; |
4432 | tp->max_sndwnd = tp->snd_wnd; |
4433 | tiwin = tp->snd_wnd; |
4434 | } |
4435 | /* |
4436 | * Make transitions: |
4437 | * SYN-RECEIVED -> ESTABLISHED |
4438 | * SYN-RECEIVED* -> FIN-WAIT-1 |
4439 | */ |
4440 | tp->t_starttime = tcp_now; |
4441 | tcp_sbrcv_tstmp_check(tp); |
4442 | if (tp->t_flags & TF_NEEDFIN) { |
4443 | DTRACE_TCP4(state__change, void, NULL, |
4444 | struct inpcb *, inp, |
4445 | struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1); |
4446 | TCP_LOG_STATE(tp, TCPS_FIN_WAIT_1); |
4447 | tp->t_state = TCPS_FIN_WAIT_1; |
4448 | tp->t_flags &= ~TF_NEEDFIN; |
4449 | |
4450 | TCP_LOG_CONNECTION_SUMMARY(tp); |
4451 | } else { |
4452 | DTRACE_TCP4(state__change, void, NULL, |
4453 | struct inpcb *, inp, |
4454 | struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED); |
4455 | TCP_LOG_STATE(tp, TCPS_ESTABLISHED); |
4456 | tp->t_state = TCPS_ESTABLISHED; |
4457 | tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, |
4458 | TCP_CONN_KEEPIDLE(tp)); |
4459 | if (nstat_collect) { |
4460 | nstat_route_connect_success( |
tp->t_inpcb->inp_route.ro_rt);
4462 | } |
4463 | TCP_LOG_CONNECTED(tp, 0); |
4464 | /* |
4465 | * The SYN is acknowledged but una is not updated |
4466 | * yet. So pass the value of ack to compute |
4467 | * sndbytes correctly |
4468 | */ |
4469 | inp_count_sndbytes(inp, th->th_ack); |
4470 | } |
4471 | tp->t_forced_acks = TCP_FORCED_ACKS_COUNT; |
4472 | |
4473 | VERIFY(LIST_EMPTY(&tp->t_segq)); |
4474 | tp->snd_wl1 = th->th_seq - 1; |
4475 | |
4476 | /* |
4477 | * AccECN server in SYN-RCVD state received an ACK with |
4478 | * SYN=0, process handshake encoding present in the ACK for SYN-ACK |
4479 | * and update receive side counters. |
4480 | */ |
4481 | if (TCP_ACC_ECN_ON(tp) && (thflags & (TH_SYN | TH_ACK)) == TH_ACK) { |
4482 | const uint32_t ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE; |
4483 | if (tlen == 0 && to.to_nsacks == 0) { |
4484 | /* |
4485 | * ACK for SYN-ACK reflects the state (ECN) in which SYN-ACK packet |
4486 | * was delivered. Use Table 4 of Accurate ECN draft to decode only |
4487 | * when a pure ACK with no SACK block is received. |
4488 | * 0|0|0 will fail Accurate ECN negotiation and disable ECN. |
4489 | */ |
4490 | switch (ace_flags) { |
4491 | case (0 | TH_CWR | 0): |
4492 | /* Non-ECT SYN-ACK was delivered */ |
4493 | tp->t_snd_ce_packets = 5; |
4494 | if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { |
4495 | tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success; |
4496 | } |
4497 | break; |
4498 | case (0 | TH_CWR | TH_ECE): |
4499 | /* ECT1 SYN-ACK was delivered, mangling detected */ |
4500 | OS_FALLTHROUGH; |
4501 | case (TH_AE | 0 | 0): |
4502 | /* ECT0 SYN-ACK was delivered, mangling detected */ |
4503 | tp->t_snd_ce_packets = 5; |
4504 | if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { |
4505 | tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success_ect_mangling_detected; |
4506 | } |
4507 | break; |
4508 | case (TH_AE | TH_CWR | 0): |
4509 | /* |
4510 | * CE SYN-ACK was delivered, even though mangling happened, |
* CE could indicate congestion at a node after mangling occurred.
4512 | * Set cwnd to 2 segments |
4513 | */ |
4514 | tp->t_snd_ce_packets = 6; |
4515 | tp->snd_cwnd = 2 * tp->t_maxseg; |
4516 | if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { |
4517 | tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success_ect_mangling_detected; |
4518 | } |
4519 | break; |
4520 | case (0 | 0 | 0): |
4521 | /* Disable ECN, as ACE fields were zeroed */ |
4522 | tp->ecn_flags &= ~(TE_SETUPRECEIVED | TE_SENDIPECT | |
4523 | TE_SENDCWR | TE_ACE_SETUPRECEIVED); |
4524 | /* |
4525 | * Since last ACK has no ECN flag set and TE_LOST_SYNACK is set, this is in response |
4526 | * to the second (non-ECN setup) SYN-ACK retransmission. In such a case, we assume |
4527 | * that AccECN SYN-ACK was blackholed. |
4528 | */ |
4529 | if ((tp->ecn_flags & TE_LOST_SYNACK) && tp->t_rxtshift <= 2 && |
4530 | (tp->t_server_accecn_state == tcp_connection_server_classic_ecn_requested || |
4531 | tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested)) { |
4532 | tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_blackholed; |
4533 | } |
4534 | /* |
4535 | * SYN-ACK hasn't been retransmitted twice yet, so this could likely mean bleaching of ACE |
4536 | * on the path from client to server on last ACK. |
4537 | */ |
4538 | if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) { |
4539 | tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_ace_bleaching_detected; |
4540 | } |
4541 | break; |
4542 | default: |
4543 | /* Unused values for forward compatibility */ |
4544 | tp->t_snd_ce_packets = 5; |
4545 | break; |
4546 | } |
4547 | } |
4548 | /* Increment receive side counters based on IP-ECN */ |
tcp_input_ip_ecn(tp, inp, (uint32_t)tlen, (uint32_t)segment_count, ip_ecn);
4550 | } |
4551 | |
4552 | #if MPTCP |
4553 | /* |
4554 | * Do not send the connect notification for additional subflows |
4555 | * until ACK for 3-way handshake arrives. |
4556 | */ |
4557 | if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) && |
4558 | (tp->t_mpflags & TMPF_SENT_JOIN)) { |
4559 | isconnected = FALSE; |
4560 | } else |
4561 | #endif /* MPTCP */ |
4562 | isconnected = TRUE; |
4563 | if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) { |
4564 | /* Done this when receiving the SYN */ |
4565 | isconnected = FALSE; |
4566 | |
4567 | OSDecrementAtomic(&tcp_tfo_halfcnt); |
4568 | |
4569 | /* Panic if something has gone terribly wrong. */ |
4570 | VERIFY(tcp_tfo_halfcnt >= 0); |
4571 | |
4572 | tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID; |
4573 | } |
4574 | |
4575 | /* |
* If there is data in the send-queue (e.g., TFO is being
* used, or connectx+data has been done), then falling
* through would handle this ACK as if data had been
* acknowledged. We have to prevent this, and we do so by
* increasing snd_una by 1, so that the SYN is not counted
* as data (snd_una++ is also done in SYN_SENT state as part
* of the regular TCP stack).
4583 | * |
4584 | * In case there is data on this ack as well, the data will be |
4585 | * handled by the label "dodata" right after step6. |
4586 | */ |
4587 | if (so->so_snd.sb_cc) { |
4588 | tp->snd_una++; /* SYN is acked */ |
4589 | if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { |
4590 | tp->snd_nxt = tp->snd_una; |
4591 | } |
4592 | |
4593 | /* |
4594 | * No duplicate-ACK handling is needed. So, we |
4595 | * directly advance to processing the ACK (aka, |
4596 | * updating the RTT estimation,...) |
4597 | * |
* But, we first need to handle any SACKs,
4599 | * because TFO will start sending data with the |
4600 | * SYN/ACK, so it might be that the client |
4601 | * includes a SACK with its ACK. |
4602 | */ |
4603 | if (SACK_ENABLED(tp) && |
4604 | (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes))) { |
4605 | tcp_sack_doack(tp, &to, th, &sack_bytes_acked, &sack_bytes_newly_acked); |
4606 | } |
4607 | |
4608 | goto process_ACK; |
4609 | } |
4610 | |
4611 | OS_FALLTHROUGH; |
4612 | |
4613 | /* |
4614 | * In ESTABLISHED state: drop duplicate ACKs; ACK out of range |
4615 | * ACKs. If the ack is in the range |
4616 | * tp->snd_una < th->th_ack <= tp->snd_max |
4617 | * then advance tp->snd_una to th->th_ack and drop |
4618 | * data from the retransmission queue. If this ACK reflects |
4619 | * more up to date window information we update our window information. |
4620 | */ |
4621 | case TCPS_ESTABLISHED: |
4622 | case TCPS_FIN_WAIT_1: |
4623 | case TCPS_FIN_WAIT_2: |
4624 | case TCPS_CLOSE_WAIT: |
4625 | case TCPS_CLOSING: |
4626 | case TCPS_LAST_ACK: |
4627 | case TCPS_TIME_WAIT: |
4628 | if (SEQ_GT(th->th_ack, tp->snd_max)) { |
4629 | tcpstat.tcps_rcvacktoomuch++; |
4630 | if (tcp_is_ack_ratelimited(tp)) { |
4631 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 rcvacktoomuch" ); |
4632 | goto drop; |
4633 | } else { |
4634 | goto dropafterack; |
4635 | } |
4636 | } |
4637 | if (SEQ_LT(th->th_ack, tp->snd_una - tp->max_sndwnd)) { |
4638 | if (tcp_is_ack_ratelimited(tp)) { |
4639 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad ACK" ); |
4640 | goto drop; |
4641 | } else { |
4642 | goto dropafterack; |
4643 | } |
4644 | } |
4645 | if (SACK_ENABLED(tp) && to.to_nsacks > 0) { |
4646 | recvd_dsack = tcp_sack_process_dsack(tp, &to, th); |
4647 | /* |
4648 | * If DSACK is received and this packet has no |
4649 | * other SACK information, it can be dropped. |
4650 | * We do not want to treat it as a duplicate ack. |
4651 | */ |
4652 | if (recvd_dsack && |
4653 | SEQ_LEQ(th->th_ack, tp->snd_una) && |
4654 | to.to_nsacks == 0) { |
tcp_bad_rexmt_check(tp, th, &to);
4656 | goto drop; |
4657 | } |
4658 | } |
4659 | |
4660 | if (SACK_ENABLED(tp) && |
4661 | (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes))) { |
4662 | tcp_sack_doack(tp, &to, th, &sack_bytes_acked, &sack_bytes_newly_acked); |
4663 | } |
4664 | |
4665 | #if MPTCP |
4666 | if (tp->t_mpuna && SEQ_GEQ(th->th_ack, tp->t_mpuna)) { |
4667 | if (tp->t_mpflags & TMPF_PREESTABLISHED) { |
4668 | /* MP TCP establishment succeeded */ |
4669 | tp->t_mpuna = 0; |
4670 | if (tp->t_mpflags & TMPF_JOINED_FLOW) { |
4671 | if (tp->t_mpflags & TMPF_SENT_JOIN) { |
4672 | tp->t_mpflags &= |
4673 | ~TMPF_PREESTABLISHED; |
4674 | tp->t_mpflags |= |
4675 | TMPF_MPTCP_TRUE; |
4676 | |
4677 | tp->t_timer[TCPT_JACK_RXMT] = 0; |
4678 | tp->t_mprxtshift = 0; |
4679 | isconnected = TRUE; |
4680 | } else { |
4681 | isconnected = FALSE; |
4682 | } |
4683 | } else { |
4684 | isconnected = TRUE; |
4685 | } |
4686 | } |
4687 | } |
4688 | #endif /* MPTCP */ |
4689 | |
4690 | tcp_tfo_rcv_ack(tp, th); |
4691 | |
4692 | /* |
4693 | * If we have outstanding data (other than |
4694 | * a window probe), this is a completely |
4695 | * duplicate ack and the ack is the biggest we've seen. |
4696 | * |
* We need to accommodate a window change on duplicate ACKs
* to interoperate with operating systems that update the
* window during SACK recovery.
4700 | */ |
4701 | if (SEQ_LEQ(th->th_ack, tp->snd_una)) { |
4702 | if (tlen == 0 && (tiwin == tp->snd_wnd || |
4703 | (to.to_nsacks > 0 && sack_bytes_acked > 0))) { |
4704 | uint32_t old_dupacks; |
4705 | /* |
4706 | * If both ends send FIN at the same time, |
4707 | * then the ack will be a duplicate ack |
4708 | * but we have to process the FIN. Check |
4709 | * for this condition and process the FIN |
4710 | * instead of the dupack |
4711 | */ |
4712 | if ((thflags & TH_FIN) && |
4713 | !TCPS_HAVERCVDFIN(tp->t_state)) { |
4714 | break; |
4715 | } |
4716 | process_dupack: |
4717 | old_dupacks = tp->t_dupacks; |
4718 | #if MPTCP |
4719 | /* |
4720 | * MPTCP options that are ignored must |
4721 | * not be treated as duplicate ACKs. |
4722 | */ |
4723 | if (to.to_flags & TOF_MPTCP) { |
4724 | goto drop; |
4725 | } |
4726 | |
4727 | if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) { |
4728 | break; |
4729 | } |
4730 | #endif /* MPTCP */ |
4731 | /* |
4732 | * If a duplicate acknowledgement was seen |
4733 | * after ECN, it indicates packet loss in |
4734 | * addition to ECN. Reset INRECOVERY flag |
4735 | * so that we can process partial acks |
4736 | * correctly |
4737 | */ |
4738 | if (tp->ecn_flags & TE_INRECOVERY) { |
4739 | tp->ecn_flags &= ~TE_INRECOVERY; |
4740 | } |
4741 | |
4742 | tcpstat.tcps_rcvdupack++; |
4743 | if (SACK_ENABLED(tp) && tcp_do_better_lr) { |
tp->t_dupacks += max(1, sack_bytes_acked / tp->t_maxseg);
4745 | } else { |
4746 | ++tp->t_dupacks; |
4747 | } |
4748 | |
4749 | tp->sackhint.sack_bytes_acked += sack_bytes_acked; |
4750 | |
4751 | if (SACK_ENABLED(tp) && tcp_do_better_lr) { |
4752 | tp->t_new_dupacks += (sack_bytes_newly_acked / tp->t_maxseg); |
4753 | |
4754 | if (tp->t_new_dupacks >= tp->t_rexmtthresh && IN_FASTRECOVERY(tp)) { |
4755 | /* Let's restart the retransmission */ |
4756 | tcp_sack_lost_rexmit(tp); |
4757 | |
4758 | /* |
4759 | * If the current tcp cc module has |
4760 | * defined a hook for tasks to run |
4761 | * before entering FR, call it |
4762 | */ |
4763 | if (CC_ALGO(tp)->pre_fr != NULL) { |
4764 | CC_ALGO(tp)->pre_fr(tp); |
4765 | } |
4766 | |
4767 | ENTER_FASTRECOVERY(tp); |
4768 | |
4769 | if (tp->t_flags & TF_SENTFIN) { |
4770 | tp->snd_recover = tp->snd_max - 1; |
4771 | } else { |
4772 | tp->snd_recover = tp->snd_max; |
4773 | } |
4774 | tp->t_rtttime = 0; |
4775 | /* |
* An Accurate ECN sender MUST NOT set CWR to indicate
* that it has received and responded to indications
* of congestion. The ACE field, which overloads the CWR bit,
* instead carries counters that are continuously updated.
4780 | */ |
4781 | if (!TCP_ACC_ECN_ON(tp) && TCP_ECN_ENABLED(tp)) { |
4782 | tp->ecn_flags |= TE_SENDCWR; |
4783 | } |
4784 | |
4785 | if (tp->t_flagsext & TF_CWND_NONVALIDATED) { |
4786 | tcp_cc_adjust_nonvalidated_cwnd(tp); |
4787 | } else { |
4788 | tp->snd_cwnd = tp->snd_ssthresh; |
4789 | } |
4790 | } |
4791 | } |
4792 | |
4793 | /* |
4794 | * Check if we need to reset the limit on |
4795 | * early retransmit |
4796 | */ |
4797 | if (tp->t_early_rexmt_count > 0 && |
4798 | TSTMP_GEQ(tcp_now, |
4799 | (tp->t_early_rexmt_win + |
4800 | TCP_EARLY_REXMT_WIN))) { |
4801 | tp->t_early_rexmt_count = 0; |
4802 | } |
4803 | |
4804 | /* |
4805 | * Is early retransmit needed? We check for |
4806 | * this when the connection is waiting for |
4807 | * duplicate acks to enter fast recovery. |
4808 | */ |
4809 | if (!IN_FASTRECOVERY(tp)) { |
4810 | tcp_early_rexmt_check(tp, th); |
4811 | } |
4812 | |
4813 | /* |
4814 | * If we've seen exactly rexmt threshold |
4815 | * of duplicate acks, assume a packet |
4816 | * has been dropped and retransmit it. |
4817 | * Kludge snd_nxt & the congestion |
4818 | * window so we send only this one |
4819 | * packet. |
4820 | * |
4821 | * We know we're losing at the current |
4822 | * window size so do congestion avoidance |
4823 | * (set ssthresh to half the current window |
4824 | * and pull our congestion window back to |
4825 | * the new ssthresh). |
4826 | * |
4827 | * Dup acks mean that packets have left the |
4828 | * network (they're now cached at the receiver) |
4829 | * so bump cwnd by the amount in the receiver |
4830 | * to keep a constant cwnd packets in the |
4831 | * network. |
4832 | */ |
4833 | if (tp->t_timer[TCPT_REXMT] == 0 || |
4834 | (th->th_ack != tp->snd_una && sack_bytes_acked == 0)) { |
4835 | tp->t_dupacks = 0; |
4836 | tp->t_rexmtthresh = tcprexmtthresh; |
4837 | tp->t_new_dupacks = 0; |
4838 | } else if ((tp->t_dupacks > tp->t_rexmtthresh && (!tcp_do_better_lr || old_dupacks >= tp->t_rexmtthresh)) || |
4839 | IN_FASTRECOVERY(tp)) { |
4840 | /* |
4841 | * If this connection was seeing packet |
4842 | * reordering, then recovery might be |
4843 | * delayed to disambiguate between |
4844 | * reordering and loss |
4845 | */ |
4846 | if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) && |
4847 | (tp->t_flagsext & |
4848 | (TF_PKTS_REORDERED | TF_DELAY_RECOVERY)) == |
4849 | (TF_PKTS_REORDERED | TF_DELAY_RECOVERY)) { |
4850 | /* |
4851 | * Since the SACK information is already |
4852 | * updated, this ACK will be dropped |
4853 | */ |
4854 | break; |
4855 | } |
4856 | |
4857 | /* |
4858 | * Dup acks mean that packets have left the |
4859 | * network (they're now cached at the receiver) |
4860 | * so bump cwnd by the amount in the receiver |
4861 | * to keep a constant cwnd packets in the |
4862 | * network. |
4863 | */ |
4864 | if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp)) { |
4865 | int awnd; |
4866 | |
4867 | /* |
4868 | * Compute the amount of data in flight first. |
4869 | * We can inject new data into the pipe iff |
* we have less than snd_ssthresh worth of data in
4871 | * flight. |
4872 | */ |
4873 | awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit; |
4874 | if (awnd < tp->snd_ssthresh) { |
4875 | tp->snd_cwnd += tp->t_maxseg; |
4876 | if (tp->snd_cwnd > tp->snd_ssthresh) { |
4877 | tp->snd_cwnd = tp->snd_ssthresh; |
4878 | } |
4879 | } |
4880 | } else { |
4881 | tp->snd_cwnd += tp->t_maxseg; |
4882 | } |
4883 | |
4884 | /* Process any window updates */ |
4885 | if (tiwin > tp->snd_wnd) { |
4886 | tcp_update_window(tp, thflags, |
4887 | th, tiwin, tlen); |
4888 | } |
tcp_ccdbg_trace(tp, th,
    TCP_CC_IN_FASTRECOVERY);
4891 | |
4892 | (void) tcp_output(tp); |
4893 | |
4894 | goto drop; |
4895 | } else if ((!tcp_do_better_lr && tp->t_dupacks == tp->t_rexmtthresh) || |
4896 | (tcp_do_better_lr && tp->t_dupacks >= tp->t_rexmtthresh)) { |
4897 | tcp_seq onxt = tp->snd_nxt; |
4898 | |
4899 | /* |
4900 | * If we're doing sack, check to |
4901 | * see if we're already in sack |
4902 | * recovery. If we're not doing sack, |
4903 | * check to see if we're in newreno |
4904 | * recovery. |
4905 | */ |
4906 | if (SACK_ENABLED(tp)) { |
4907 | if (IN_FASTRECOVERY(tp)) { |
4908 | tp->t_dupacks = 0; |
4909 | break; |
4910 | } else if (tp->t_flagsext & TF_DELAY_RECOVERY) { |
4911 | break; |
4912 | } |
4913 | } else { |
4914 | if (SEQ_LEQ(th->th_ack, tp->snd_recover)) { |
4915 | tp->t_dupacks = 0; |
4916 | break; |
4917 | } |
4918 | } |
4919 | if (tp->t_flags & TF_SENTFIN) { |
4920 | tp->snd_recover = tp->snd_max - 1; |
4921 | } else { |
4922 | tp->snd_recover = tp->snd_max; |
4923 | } |
4924 | tp->t_timer[TCPT_PTO] = 0; |
4925 | tp->t_rtttime = 0; |
4926 | |
4927 | /* |
4928 | * If the connection has seen pkt |
4929 | * reordering, delay recovery until |
4930 | * it is clear that the packet |
4931 | * was lost. |
4932 | */ |
4933 | if (SACK_ENABLED(tp) && |
4934 | (tp->t_flagsext & |
4935 | (TF_PKTS_REORDERED | TF_DELAY_RECOVERY)) |
4936 | == TF_PKTS_REORDERED && |
4937 | !IN_FASTRECOVERY(tp) && |
4938 | tp->t_reorderwin > 0 && |
4939 | (tp->t_state == TCPS_ESTABLISHED || |
4940 | tp->t_state == TCPS_FIN_WAIT_1)) { |
4941 | tp->t_timer[TCPT_DELAYFR] = |
4942 | OFFSET_FROM_START(tp, |
4943 | tp->t_reorderwin); |
4944 | tp->t_flagsext |= TF_DELAY_RECOVERY; |
4945 | tcpstat.tcps_delay_recovery++; |
tcp_ccdbg_trace(tp, th,
    TCP_CC_DELAY_FASTRECOVERY);
4948 | break; |
4949 | } |
4950 | |
4951 | tcp_rexmt_save_state(tp); |
4952 | /* |
4953 | * If the current tcp cc module has |
4954 | * defined a hook for tasks to run |
4955 | * before entering FR, call it |
4956 | */ |
4957 | if (CC_ALGO(tp)->pre_fr != NULL) { |
4958 | CC_ALGO(tp)->pre_fr(tp); |
4959 | } |
4960 | ENTER_FASTRECOVERY(tp); |
4961 | tp->t_timer[TCPT_REXMT] = 0; |
4962 | if (!TCP_ACC_ECN_ON(tp) && TCP_ECN_ENABLED(tp)) { |
4963 | tp->ecn_flags |= TE_SENDCWR; |
4964 | } |
4965 | |
4966 | if (SACK_ENABLED(tp)) { |
4967 | tcpstat.tcps_sack_recovery_episode++; |
4968 | tp->t_sack_recovery_episode++; |
4969 | tp->sack_newdata = tp->snd_nxt; |
4970 | if (tcp_do_better_lr) { |
4971 | tp->snd_cwnd = tp->snd_ssthresh; |
4972 | } else { |
4973 | tp->snd_cwnd = tp->t_maxseg; |
4974 | } |
4975 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
4976 | |
4977 | /* Process any window updates */ |
4978 | if (tiwin > tp->snd_wnd) { |
4979 | tcp_update_window(tp, thflags, th, tiwin, tlen); |
4980 | } |
4981 | |
tcp_ccdbg_trace(tp, th, TCP_CC_ENTER_FASTRECOVERY);
4983 | (void) tcp_output(tp); |
4984 | goto drop; |
4985 | } |
4986 | tp->snd_nxt = th->th_ack; |
4987 | tp->snd_cwnd = tp->t_maxseg; |
4988 | |
4989 | /* Process any window updates */ |
4990 | if (tiwin > tp->snd_wnd) { |
4991 | tcp_update_window(tp, thflags, th, tiwin, tlen); |
4992 | } |
4993 | |
4994 | (void) tcp_output(tp); |
4995 | if (tp->t_flagsext & TF_CWND_NONVALIDATED) { |
4996 | tcp_cc_adjust_nonvalidated_cwnd(tp); |
4997 | } else { |
4998 | tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks; |
4999 | } |
5000 | if (SEQ_GT(onxt, tp->snd_nxt)) { |
5001 | tp->snd_nxt = onxt; |
5002 | } |
5003 | |
tcp_ccdbg_trace(tp, th, TCP_CC_ENTER_FASTRECOVERY);
5005 | goto drop; |
5006 | } else if (ALLOW_LIMITED_TRANSMIT(tp) && |
5007 | (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) && |
5008 | (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) { |
5009 | u_int32_t incr = (tp->t_maxseg * tp->t_dupacks); |
5010 | |
5011 | /* Use Limited Transmit algorithm on the first two |
5012 | * duplicate acks when there is new data to transmit |
5013 | */ |
5014 | tp->snd_cwnd += incr; |
5015 | tcpstat.tcps_limited_txt++; |
5016 | (void) tcp_output(tp); |
5017 | |
tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT);
5019 | |
5020 | /* Reset snd_cwnd back to normal */ |
5021 | tp->snd_cwnd -= incr; |
5022 | } |
5023 | } |
5024 | break; |
5025 | } |
5026 | /* |
5027 | * If the congestion window was inflated to account |
5028 | * for the other side's cached packets, retract it. |
5029 | */ |
5030 | if (IN_FASTRECOVERY(tp)) { |
5031 | if (SEQ_LT(th->th_ack, tp->snd_recover)) { |
5032 | /* |
5033 | * If we received an ECE and entered |
5034 | * recovery, the subsequent ACKs should |
5035 | * not be treated as partial acks. |
5036 | */ |
5037 | if (tp->ecn_flags & TE_INRECOVERY) { |
5038 | goto process_ACK; |
5039 | } |
5040 | |
5041 | if (SACK_ENABLED(tp)) { |
5042 | tcp_sack_partialack(tp, th); |
5043 | } else { |
5044 | tcp_newreno_partial_ack(tp, th); |
5045 | } |
tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK);
5047 | } else { |
5048 | if (tcp_cubic_minor_fixes) { |
5049 | exiting_fr = 1; |
5050 | } |
5051 | EXIT_FASTRECOVERY(tp); |
5052 | if (CC_ALGO(tp)->post_fr != NULL) { |
5053 | CC_ALGO(tp)->post_fr(tp, th); |
5054 | } |
5055 | tp->t_pipeack = 0; |
5056 | tcp_clear_pipeack_state(tp); |
tcp_ccdbg_trace(tp, th,
    TCP_CC_EXIT_FASTRECOVERY);
5059 | } |
5060 | } else if ((tp->t_flagsext & |
5061 | (TF_PKTS_REORDERED | TF_DELAY_RECOVERY)) |
5062 | == (TF_PKTS_REORDERED | TF_DELAY_RECOVERY)) { |
5063 | /* |
* If the ACK acknowledges up to snd_recover, or if
* it acknowledges all of the snd holes, exit
* recovery and cancel the timer. Otherwise,
* this is a partial ACK; wait for the recovery timer
* to fire before entering recovery. The snd_holes have
* already been updated.
5070 | */ |
5071 | if (SEQ_GEQ(th->th_ack, tp->snd_recover) || |
5072 | TAILQ_EMPTY(&tp->snd_holes)) { |
5073 | tp->t_timer[TCPT_DELAYFR] = 0; |
5074 | tp->t_flagsext &= ~TF_DELAY_RECOVERY; |
5075 | EXIT_FASTRECOVERY(tp); |
tcp_ccdbg_trace(tp, th,
    TCP_CC_EXIT_FASTRECOVERY);
5078 | } |
5079 | } else { |
5080 | /* |
5081 | * We were not in fast recovery. Reset the |
5082 | * duplicate ack counter. |
5083 | */ |
5084 | tp->t_dupacks = 0; |
5085 | tp->t_rexmtthresh = tcprexmtthresh; |
5086 | tp->t_new_dupacks = 0; |
5087 | } |
5088 | |
5089 | process_ACK: |
5090 | VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una)); |
5091 | acked = BYTES_ACKED(th, tp); |
5092 | tcpstat.tcps_rcvackpack++; |
5093 | tcpstat.tcps_rcvackbyte += acked; |
5094 | |
5095 | /* |
5096 | * If the last packet was a retransmit, make sure |
5097 | * it was not spurious. |
5098 | * |
5099 | * This will also take care of congestion window |
5100 | * adjustment if a last packet was recovered due to a |
5101 | * tail loss probe. |
5102 | */ |
tcp_bad_rexmt_check(tp, th, &to);
5104 | |
5105 | /* Recalculate the RTT */ |
tcp_compute_rtt(tp, &to, th);
5107 | |
5108 | /* |
5109 | * If all outstanding data is acked, stop retransmit |
5110 | * timer and remember to restart (more output or persist). |
5111 | * If there is more data to be acked, restart retransmit |
5112 | * timer, using current (possibly backed-off) value. |
5113 | */ |
5114 | TCP_RESET_REXMT_STATE(tp); |
5115 | TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), |
5116 | tp->t_rttmin, TCPTV_REXMTMAX, |
5117 | TCP_ADD_REXMTSLOP(tp)); |
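/*
 * Rough sketch of the arithmetic above: TCP_REXMTVAL() is
 * approximately srtt + 4 * rttvar (the RFC 6298 retransmission
 * timeout), and TCPT_RANGESET() clamps the result between t_rttmin
 * and TCPTV_REXMTMAX, with the TCP_ADD_REXMTSLOP() term contributing
 * a small amount of additional slop.
 */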
5118 | if (th->th_ack == tp->snd_max) { |
5119 | tp->t_timer[TCPT_REXMT] = 0; |
5120 | tp->t_timer[TCPT_PTO] = 0; |
5121 | needoutput = 1; |
5122 | } else if (tp->t_timer[TCPT_PERSIST] == 0) { |
5123 | tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, |
5124 | tp->t_rxtcur); |
5125 | } |
5126 | |
5127 | if ((prev_t_state == TCPS_SYN_SENT || |
5128 | prev_t_state == TCPS_SYN_RECEIVED) && |
5129 | tp->t_state == TCPS_ESTABLISHED) { |
5130 | TCP_LOG_RTT_INFO(tp); |
5131 | } |
5132 | |
5133 | /* |
5134 | * If no data (only SYN) was ACK'd, skip rest of ACK |
5135 | * processing. |
5136 | */ |
5137 | if (acked == 0) { |
5138 | goto step6; |
5139 | } |
5140 | |
5141 | /* |
5142 | * When outgoing data has been acked (except the SYN+data), we |
5143 | * mark this connection as "sending good" for TFO. |
5144 | */ |
5145 | if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) && |
5146 | !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) && |
5147 | !(th->th_flags & TH_SYN)) { |
5148 | tp->t_tfo_flags |= TFO_F_NO_SNDPROBING; |
5149 | } |
5150 | |
5151 | /* |
5152 | * Accurate ECN uses delta_cep to determine a congestion |
5153 | * event if new CE counts were received. |
5154 | * For classic ECN, congestion event is receiving TH_ECE. |
5155 | */ |
5156 | if ((tp->ecn_flags & TE_SENDIPECT)) { |
5157 | if (TCP_ACC_ECN_ON(tp)) { |
5158 | if (!IN_FASTRECOVERY(tp) && tp->t_delta_ce_packets > 0) { |
5159 | tcp_reduce_congestion_window(tp); |
5160 | tp->ecn_flags |= (TE_INRECOVERY); |
5161 | /* update the stats */ |
5162 | tcpstat.tcps_ecn_ace_recv_ce += tp->t_delta_ce_packets; |
5163 | tp->t_ecn_capable_packets_marked += tp->t_delta_ce_packets; |
tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
5165 | } |
5166 | } else if (TCP_ECN_ENABLED(tp) && (thflags & TH_ECE)) { |
5167 | /* |
5168 | * Reduce the congestion window if we haven't |
5169 | * done so. |
5170 | */ |
5171 | if (!IN_FASTRECOVERY(tp)) { |
5172 | tcp_reduce_congestion_window(tp); |
5173 | tp->ecn_flags |= (TE_INRECOVERY | TE_SENDCWR); |
5174 | /* |
5175 | * Also note that the connection received |
* ECE at least once. We increment
5177 | * t_ecn_capable_packets_marked when we first |
5178 | * enter fast recovery. |
5179 | */ |
5180 | tp->ecn_flags |= TE_RECV_ECN_ECE; |
5181 | INP_INC_IFNET_STAT(inp, ecn_recv_ece); |
5182 | tcpstat.tcps_ecn_recv_ece++; |
5183 | tp->t_ecn_capable_packets_marked++; |
tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
5185 | } |
5186 | } |
5187 | } |
5188 | |
5189 | /* |
5190 | * When new data is acked, open the congestion window. |
5191 | * The specifics of how this is achieved are up to the |
5192 | * congestion control algorithm in use for this connection. |
5193 | * |
5194 | * The calculations in this function assume that snd_una is |
5195 | * not updated yet. |
5196 | */ |
5197 | if (!IN_FASTRECOVERY(tp) && !exiting_fr) { |
5198 | if (CC_ALGO(tp)->ack_rcvd != NULL) { |
5199 | CC_ALGO(tp)->ack_rcvd(tp, th); |
5200 | } |
tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD);
5202 | } |
5203 | if (acked > so->so_snd.sb_cc) { |
5204 | tp->snd_wnd -= so->so_snd.sb_cc; |
sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
5206 | ourfinisacked = 1; |
5207 | } else { |
sbdrop(&so->so_snd, acked);
tcp_sbsnd_trim(&so->so_snd);
5210 | tp->snd_wnd -= acked; |
5211 | ourfinisacked = 0; |
5212 | } |
5213 | /* detect una wraparound */ |
5214 | if (!IN_FASTRECOVERY(tp) && |
5215 | SEQ_GT(tp->snd_una, tp->snd_recover) && |
5216 | SEQ_LEQ(th->th_ack, tp->snd_recover)) { |
5217 | tp->snd_recover = th->th_ack - 1; |
5218 | } |
5219 | |
5220 | if (IN_FASTRECOVERY(tp) && |
5221 | SEQ_GEQ(th->th_ack, tp->snd_recover)) { |
5222 | EXIT_FASTRECOVERY(tp); |
5223 | } |
5224 | |
tcp_update_snd_una(tp, th->th_ack);
5226 | |
5227 | if (SACK_ENABLED(tp)) { |
5228 | if (SEQ_GT(tp->snd_una, tp->snd_recover)) { |
5229 | tp->snd_recover = tp->snd_una; |
5230 | } |
5231 | } |
5232 | if (SEQ_LT(tp->snd_nxt, tp->snd_una)) { |
5233 | tp->snd_nxt = tp->snd_una; |
5234 | } |
5235 | if (!SLIST_EMPTY(&tp->t_rxt_segments) && |
5236 | !TCP_DSACK_SEQ_IN_WINDOW(tp, tp->t_dsack_lastuna, |
5237 | tp->snd_una)) { |
5238 | tcp_rxtseg_clean(tp); |
5239 | } |
5240 | if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 && |
5241 | tp->t_bwmeas != NULL) { |
5242 | tcp_bwmeas_check(tp); |
5243 | } |
5244 | |
5245 | write_wakeup = 1; |
5246 | |
5247 | if (!SLIST_EMPTY(&tp->t_notify_ack)) { |
5248 | tcp_notify_acknowledgement(tp, so); |
5249 | } |
5250 | |
5251 | switch (tp->t_state) { |
5252 | /* |
5253 | * In FIN_WAIT_1 STATE in addition to the processing |
5254 | * for the ESTABLISHED state if our FIN is now acknowledged |
5255 | * then enter FIN_WAIT_2. |
5256 | */ |
5257 | case TCPS_FIN_WAIT_1: |
5258 | if (ourfinisacked) { |
5259 | /* |
5260 | * If we can't receive any more |
5261 | * data, then closing user can proceed. |
5262 | * Starting the TCPT_2MSL timer is contrary to the |
5263 | * specification, but if we don't get a FIN |
5264 | * we'll hang forever. |
5265 | */ |
5266 | DTRACE_TCP4(state__change, void, NULL, |
5267 | struct inpcb *, inp, |
5268 | struct tcpcb *, tp, |
5269 | int32_t, TCPS_FIN_WAIT_2); |
5270 | TCP_LOG_STATE(tp, TCPS_FIN_WAIT_2); |
5271 | tp->t_state = TCPS_FIN_WAIT_2; |
5272 | if (so->so_state & SS_CANTRCVMORE) { |
5273 | isconnected = FALSE; |
5274 | isdisconnected = TRUE; |
5275 | tcp_set_finwait_timeout(tp); |
5276 | } |
5277 | /* |
5278 | * fall through and make sure we also recognize |
5279 | * data ACKed with the FIN |
5280 | */ |
5281 | } |
5282 | break; |
5283 | |
5284 | /* |
5285 | * In CLOSING STATE in addition to the processing for |
5286 | * the ESTABLISHED state if the ACK acknowledges our FIN |
5287 | * then enter the TIME-WAIT state, otherwise ignore |
5288 | * the segment. |
5289 | */ |
5290 | case TCPS_CLOSING: |
5291 | if (ourfinisacked) { |
5292 | DTRACE_TCP4(state__change, void, NULL, |
5293 | struct inpcb *, inp, |
5294 | struct tcpcb *, tp, |
5295 | int32_t, TCPS_TIME_WAIT); |
5296 | TCP_LOG_STATE(tp, TCPS_TIME_WAIT); |
5297 | tp->t_state = TCPS_TIME_WAIT; |
5298 | tcp_canceltimers(tp); |
5299 | if (tp->t_flagsext & TF_NOTIMEWAIT) { |
5300 | tp->t_flags |= TF_CLOSING; |
5301 | } else { |
add_to_time_wait(tp, 2 * tcp_msl);
5303 | } |
5304 | isconnected = FALSE; |
5305 | isdisconnected = TRUE; |
5306 | } |
5307 | break; |
5308 | |
5309 | /* |
5310 | * In LAST_ACK, we may still be waiting for data to drain |
5311 | * and/or to be acked, as well as for the ack of our FIN. |
5312 | * If our FIN is now acknowledged, delete the TCB, |
5313 | * enter the closed state and return. |
5314 | */ |
5315 | case TCPS_LAST_ACK: |
5316 | if (ourfinisacked) { |
5317 | tp = tcp_close(tp); |
5318 | goto drop; |
5319 | } |
5320 | break; |
5321 | |
5322 | /* |
5323 | * In TIME_WAIT state the only thing that should arrive |
5324 | * is a retransmission of the remote FIN. Acknowledge |
5325 | * it and restart the finack timer. |
5326 | */ |
5327 | case TCPS_TIME_WAIT: |
add_to_time_wait(tp, 2 * tcp_msl);
5329 | goto dropafterack; |
5330 | } |
5331 | |
5332 | /* |
5333 | * If there is a SACK option on the ACK and we |
5334 | * haven't seen any duplicate acks before, count |
5335 | * it as a duplicate ack even if the cumulative |
5336 | * ack is advanced. If the receiver delayed an |
5337 | * ack and detected loss afterwards, then the ack |
5338 | * will advance cumulative ack and will also have |
5339 | * a SACK option. So counting it as one duplicate |
5340 | * ack is ok. |
5341 | */ |
5342 | if (tp->t_state == TCPS_ESTABLISHED && |
5343 | SACK_ENABLED(tp) && sack_bytes_acked > 0 && |
5344 | to.to_nsacks > 0 && tp->t_dupacks == 0 && |
5345 | SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 && |
5346 | !(tp->t_flagsext & TF_PKTS_REORDERED)) { |
5347 | tcpstat.tcps_sack_ackadv++; |
5348 | goto process_dupack; |
5349 | } |
5350 | } |
5351 | |
5352 | step6: |
5353 | /* |
5354 | * Update window information. |
5355 | */ |
5356 | if (tcp_update_window(tp, thflags, th, tiwin, tlen)) { |
5357 | needoutput = 1; |
5358 | } |
5359 | |
5360 | /* |
5361 | * Process segments with URG. |
5362 | */ |
5363 | if ((thflags & TH_URG) && th->th_urp && |
5364 | TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
5365 | /* |
5366 | * This is a kludge, but if we receive and accept |
5367 | * random urgent pointers, we'll crash in |
5368 | * soreceive. It's hard to imagine someone |
5369 | * actually wanting to send this much urgent data. |
5370 | */ |
5371 | if (th->th_urp + so->so_rcv.sb_cc > sb_max) { |
5372 | th->th_urp = 0; /* XXX */ |
5373 | thflags &= ~TH_URG; /* XXX */ |
5374 | goto dodata; /* XXX */ |
5375 | } |
5376 | /* |
5377 | * If this segment advances the known urgent pointer, |
5378 | * then mark the data stream. This should not happen |
5379 | * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since |
5380 | * a FIN has been received from the remote side. |
5381 | * In these states we ignore the URG. |
5382 | * |
5383 | * According to RFC961 (Assigned Protocols), |
5384 | * the urgent pointer points to the last octet |
5385 | * of urgent data. We continue, however, |
5386 | * to consider it to indicate the first octet |
5387 | * of data past the urgent section as the original |
5388 | * spec states (in one of two places). |
5389 | */ |
5390 | if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) { |
5391 | tp->rcv_up = th->th_seq + th->th_urp; |
5392 | so->so_oobmark = so->so_rcv.sb_cc + |
5393 | (tp->rcv_up - tp->rcv_nxt) - 1; |
5394 | if (so->so_oobmark == 0) { |
5395 | so->so_state |= SS_RCVATMARK; |
5396 | } |
5397 | sohasoutofband(so); |
5398 | tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA); |
5399 | } |
5400 | /* |
* Remove the out-of-band data so it doesn't get presented to the user.
* This can happen independently of advancing the URG pointer,
5403 | * but if two URG's are pending at once, some out-of-band |
5404 | * data may creep in... ick. |
5405 | */ |
5406 | if (th->th_urp <= (u_int32_t)tlen |
5407 | #if SO_OOBINLINE |
5408 | && (so->so_options & SO_OOBINLINE) == 0 |
5409 | #endif |
5410 | ) { |
5411 | tcp_pulloutofband(so, th, m, |
5412 | drop_hdrlen); /* hdr drop is delayed */ |
5413 | } |
5414 | } else { |
5415 | /* |
5416 | * If no out of band data is expected, |
5417 | * pull receive urgent pointer along |
5418 | * with the receive window. |
5419 | */ |
5420 | if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) { |
5421 | tp->rcv_up = tp->rcv_nxt; |
5422 | } |
5423 | } |
5424 | dodata: |
5425 | |
/* Set socket's connect or disconnect state correctly before doing data.
5427 | * The following might unlock the socket if there is an upcall or a socket |
5428 | * filter. |
5429 | */ |
5430 | if (isconnected) { |
5431 | soisconnected(so); |
5432 | } else if (isdisconnected) { |
5433 | soisdisconnected(so); |
5434 | } |
5435 | |
/* Let's check the state of the pcb just to make sure that it did not get closed
 * while we were unlocked above.
5438 | */ |
5439 | if (inp->inp_state == INPCB_STATE_DEAD) { |
5440 | /* Just drop the packet that we are processing and return */ |
5441 | TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "INPCB_STATE_DEAD" ); |
5442 | goto drop; |
5443 | } |
5444 | |
5445 | /* |
5446 | * Process the segment text, merging it into the TCP sequencing queue, |
5447 | * and arranging for acknowledgment of receipt if necessary. |
5448 | * This process logically involves adjusting tp->rcv_wnd as data |
5449 | * is presented to the user (this happens in tcp_usrreq.c, |
5450 | * case PRU_RCVD). If a FIN has already been received on this |
5451 | * connection then we just ignore the text. |
5452 | * |
5453 | * If we are in SYN-received state and got a valid TFO cookie, we want |
5454 | * to process the data. |
5455 | */ |
5456 | if ((tlen || (thflags & TH_FIN)) && |
5457 | TCPS_HAVERCVDFIN(tp->t_state) == 0 && |
5458 | (TCPS_HAVEESTABLISHED(tp->t_state) || |
5459 | (tp->t_state == TCPS_SYN_RECEIVED && |
5460 | (tp->t_tfo_flags & TFO_F_COOKIE_VALID)))) { |
5461 | tcp_seq save_start = th->th_seq; |
5462 | tcp_seq save_end = th->th_seq + tlen; |
5463 | m_adj(m, drop_hdrlen); /* delayed header drop */ |
5464 | /* |
5465 | * Insert segment which includes th into TCP reassembly queue |
5466 | * with control block tp. Set thflags to whether reassembly now |
5467 | * includes a segment with FIN. This handles the common case |
5468 | * inline (segment is the next to be received on an established |
5469 | * connection, and the queue is empty), avoiding linkage into |
5470 | * and removal from the queue and repetition of various |
5471 | * conversions. |
5472 | * Set DELACK for segments received in order, but ack |
5473 | * immediately when segments are out of order (so |
5474 | * fast retransmit can work). |
5475 | */ |
5476 | if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) { |
5477 | TCP_INC_VAR(tp->t_unacksegs, segment_count); |
5478 | |
5479 | /* Calculate the RTT on the receiver */ |
tcp_compute_rcv_rtt(tp, &to, th);
5481 | |
5482 | if (DELAY_ACK(tp, th) && |
5483 | ((tp->t_flags & TF_ACKNOW) == 0)) { |
5484 | if ((tp->t_flags & TF_DELACK) == 0) { |
5485 | tp->t_flags |= TF_DELACK; |
5486 | tp->t_timer[TCPT_DELACK] = |
5487 | OFFSET_FROM_START(tp, tcp_delack); |
5488 | } |
5489 | } else { |
5490 | tp->t_flags |= TF_ACKNOW; |
5491 | } |
5492 | tp->rcv_nxt += tlen; |
5493 | /* Update highest received sequence and its timestamp */ |
5494 | if (SEQ_LT(tp->rcv_high, tp->rcv_nxt)) { |
5495 | tp->rcv_high = tp->rcv_nxt; |
5496 | if (to.to_flags & TOF_TS) { |
5497 | tp->tsv_high = to.to_tsval; |
5498 | } |
5499 | } |
5500 | |
5501 | thflags = th->th_flags & TH_FIN; |
5502 | TCP_INC_VAR(tcpstat.tcps_rcvpack, segment_count); |
5503 | tcpstat.tcps_rcvbyte += tlen; |
5504 | if (nstat_collect) { |
5505 | INP_ADD_STAT(inp, cell, wifi, wired, |
5506 | rxpackets, 1); |
5507 | INP_ADD_STAT(inp, cell, wifi, wired, |
5508 | rxbytes, tlen); |
5509 | inp_set_activity_bitmap(inp); |
5510 | } |
tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
5512 | if (TCP_USE_RLEDBAT(tp, so) && |
5513 | tcp_cc_rledbat.data_rcvd != NULL) { |
5514 | tcp_cc_rledbat.data_rcvd(tp, th, &to, tlen); |
5515 | } |
5516 | |
5517 | so_recv_data_stat(so, m, drop_hdrlen); |
5518 | |
5519 | if (isipv6) { |
memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
ip6 = (struct ip6_hdr *)&saved_hdr[0];
} else {
memcpy(&saved_hdr, ip, ip->ip_hl << 2);
ip = (struct ip *)&saved_hdr[0];
}
memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
5527 | |
5528 | if (th->th_flags & TH_PUSH) { |
5529 | tp->t_flagsext |= TF_LAST_IS_PSH; |
5530 | } else { |
5531 | tp->t_flagsext &= ~TF_LAST_IS_PSH; |
5532 | } |
5533 | |
5534 | if (sbappendstream_rcvdemux(so, m)) { |
5535 | read_wakeup = 1; |
5536 | } |
5537 | th = &saved_tcphdr; |
5538 | } else { |
5539 | if (isipv6) { |
memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
ip6 = (struct ip6_hdr *)&saved_hdr[0];
} else {
memcpy(&saved_hdr, ip, ip->ip_hl << 2);
ip = (struct ip *)&saved_hdr[0];
5545 | } |
5546 | |
5547 | /* Update highest received sequence and its timestamp */ |
5548 | if (SEQ_LT(tp->rcv_high, th->th_seq + tlen)) { |
5549 | tp->rcv_high = th->th_seq + tlen; |
5550 | if (to.to_flags & TOF_TS) { |
5551 | tp->tsv_high = to.to_tsval; |
5552 | } |
5553 | } |
5554 | |
5555 | /* |
5556 | * Calculate the RTT on the receiver, |
5557 | * even if OOO segment is received. |
5558 | */ |
tcp_compute_rcv_rtt(tp, &to, th);
5560 | |
5561 | if (tcp_autotune_reorder) { |
tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
5563 | } |
5564 | if (TCP_USE_RLEDBAT(tp, so) && |
5565 | tcp_cc_rledbat.data_rcvd != NULL) { |
5566 | tcp_cc_rledbat.data_rcvd(tp, th, &to, tlen); |
5567 | } |
5568 | |
memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
thflags = tcp_reass(tp, th, &tlen, m, ifp, &read_wakeup);
5571 | th = &saved_tcphdr; |
5572 | tp->t_flags |= TF_ACKNOW; |
5573 | } |
5574 | |
5575 | if ((tlen > 0 || (th->th_flags & TH_FIN)) && SACK_ENABLED(tp)) { |
5576 | if (th->th_flags & TH_FIN) { |
5577 | save_end++; |
5578 | } |
tcp_update_sack_list(tp, save_start, save_end);
5580 | } |
5581 | |
5582 | tcp_adaptive_rwtimo_check(tp, tlen); |
5583 | |
5584 | if (tlen > 0) { |
5585 | tcp_tfo_rcv_data(tp); |
5586 | } |
5587 | |
5588 | if (tp->t_flags & TF_DELACK) { |
5589 | if (isipv6) { |
5590 | KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), |
5591 | (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])), |
5592 | th->th_seq, th->th_ack, th->th_win); |
5593 | } else { |
5594 | KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport), |
5595 | (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)), |
5596 | th->th_seq, th->th_ack, th->th_win); |
5597 | } |
5598 | } |
5599 | } else { |
5600 | if ((so->so_flags & SOF_MP_SUBFLOW) && tlen == 0 && |
5601 | (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) && |
5602 | (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) { |
5603 | m_adj(m, drop_hdrlen); /* delayed header drop */ |
5604 | /* |
5605 | * 0-length DATA_FIN. The rlen is actually 0. We special-case the |
5606 | * byte consumed by the dfin in mptcp_input and mptcp_reass_present |
5607 | */ |
5608 | m->m_pkthdr.mp_rlen = 0; |
5609 | mptcp_input(tptomptp(tp)->mpt_mpte, m); |
5610 | tp->t_flags |= TF_ACKNOW; |
5611 | } else { |
5612 | m_freem(m); |
5613 | } |
5614 | thflags &= ~TH_FIN; |
5615 | } |
5616 | /* |
* We increment t_unacksegs_ce for both data segments and pure ACKs.
* There is no need to increment it if a FIN has already been received.
5619 | */ |
5620 | if (TCP_ACC_ECN_ON(tp) && TCPS_HAVEESTABLISHED(tp->t_state) && |
5621 | TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
5622 | if (ip_ecn == IPTOS_ECN_CE) { |
5623 | TCP_INC_VAR(tp->t_unacksegs_ce, segment_count); |
5624 | } |
5625 | /* |
5626 | * Send an ACK immediately if there is a change in IP ECN |
5627 | * from non-CE to CE. |
5628 | * If new data is delivered, then ACK for every 2 CE marks, |
5629 | * otherwise ACK for every 3 CE marks |
5630 | */ |
5631 | if ((ip_ecn == IPTOS_ECN_CE && ip_ecn != tp->t_prev_ip_ecn) || |
5632 | (tp->t_unacksegs_ce >= 2 && tp->last_ack_sent != tp->rcv_nxt) || |
5633 | tp->t_unacksegs_ce >= 3) { |
5634 | tp->t_flags |= TF_ACKNOW; |
5635 | } |
5636 | tp->t_prev_ip_ecn = ip_ecn; |
5637 | } |
5638 | /* |
5639 | * If FIN is received ACK the FIN and let the user know |
5640 | * that the connection is closing. |
5641 | */ |
5642 | if (thflags & TH_FIN) { |
5643 | if (TCPS_HAVERCVDFIN(tp->t_state) == 0) { |
5644 | socantrcvmore(so); |
5645 | /* |
5646 | * If connection is half-synchronized |
5647 | * (ie NEEDSYN flag on) then delay ACK, |
5648 | * so it may be piggybacked when SYN is sent. |
5649 | * Otherwise, since we received a FIN then no |
5650 | * more input can be expected, send ACK now. |
5651 | */ |
5652 | TCP_INC_VAR(tp->t_unacksegs, segment_count); |
5653 | tp->t_flags |= TF_ACKNOW; |
5654 | tp->rcv_nxt++; |
5655 | } |
5656 | switch (tp->t_state) { |
5657 | /* |
5658 | * In SYN_RECEIVED and ESTABLISHED STATES |
5659 | * enter the CLOSE_WAIT state. |
5660 | */ |
5661 | case TCPS_SYN_RECEIVED: |
5662 | tp->t_starttime = tcp_now; |
5663 | OS_FALLTHROUGH; |
5664 | case TCPS_ESTABLISHED: |
5665 | DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, |
5666 | struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT); |
5667 | TCP_LOG_STATE(tp, TCPS_CLOSE_WAIT); |
5668 | tp->t_state = TCPS_CLOSE_WAIT; |
5669 | break; |
5670 | |
5671 | /* |
5672 | * If still in FIN_WAIT_1 STATE FIN has not been acked so |
5673 | * enter the CLOSING state. |
5674 | */ |
5675 | case TCPS_FIN_WAIT_1: |
5676 | DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp, |
5677 | struct tcpcb *, tp, int32_t, TCPS_CLOSING); |
5678 | TCP_LOG_STATE(tp, TCPS_CLOSING); |
5679 | tp->t_state = TCPS_CLOSING; |
5680 | break; |
5681 | |
5682 | /* |
5683 | * In FIN_WAIT_2 state enter the TIME_WAIT state, |
5684 | * starting the time-wait timer, turning off the other |
5685 | * standard timers. |
5686 | */ |
5687 | case TCPS_FIN_WAIT_2: |
5688 | DTRACE_TCP4(state__change, void, NULL, |
5689 | struct inpcb *, inp, |
5690 | struct tcpcb *, tp, |
5691 | int32_t, TCPS_TIME_WAIT); |
5692 | TCP_LOG_STATE(tp, TCPS_TIME_WAIT); |
5693 | tp->t_state = TCPS_TIME_WAIT; |
5694 | tcp_canceltimers(tp); |
5695 | tp->t_flags |= TF_ACKNOW; |
5696 | if (tp->t_flagsext & TF_NOTIMEWAIT) { |
5697 | tp->t_flags |= TF_CLOSING; |
5698 | } else { |
add_to_time_wait(tp, 2 * tcp_msl);
5700 | } |
5701 | soisdisconnected(so); |
5702 | break; |
5703 | |
5704 | /* |
5705 | * In TIME_WAIT state restart the 2 MSL time_wait timer. |
5706 | */ |
5707 | case TCPS_TIME_WAIT: |
add_to_time_wait(tp, 2 * tcp_msl);
5709 | break; |
5710 | } |
5711 | } |
5712 | #if TCPDEBUG |
5713 | if (so->so_options & SO_DEBUG) { |
5714 | tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen, |
5715 | &tcp_savetcp, 0); |
5716 | } |
5717 | #endif |
5718 | |
5719 | if (read_wakeup) { |
5720 | mptcp_handle_input(so); |
5721 | } |
5722 | |
5723 | /* |
5724 | * Return any desired output. |
5725 | */ |
5726 | if (needoutput || (tp->t_flags & TF_ACKNOW)) { |
5727 | (void) tcp_output(tp); |
5728 | } |
5729 | |
5730 | tcp_check_timer_state(tp); |
5731 | |
5732 | tcp_handle_wakeup(so, read_wakeup, write_wakeup); |
5733 | |
socket_unlock(so, 1);
5735 | KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); |
5736 | return; |
5737 | |
5738 | dropafterack: |
5739 | /* |
5740 | * Generate an ACK dropping incoming segment if it occupies |
5741 | * sequence space, where the ACK reflects our state. |
5742 | * |
5743 | * We can now skip the test for the RST flag since all |
5744 | * paths to this code happen after packets containing |
5745 | * RST have been dropped. |
5746 | * |
5747 | * In the SYN-RECEIVED state, don't send an ACK unless the |
5748 | * segment we received passes the SYN-RECEIVED ACK test. |
5749 | * If it fails send a RST. This breaks the loop in the |
5750 | * "LAND" DoS attack, and also prevents an ACK storm |
5751 | * between two listening ports that have been sent forged |
5752 | * SYN segments, each with the source address of the other. |
5753 | */ |
5754 | if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && |
5755 | (SEQ_GT(tp->snd_una, th->th_ack) || |
5756 | SEQ_GT(th->th_ack, tp->snd_max))) { |
5757 | IF_TCP_STATINC(ifp, dospacket); |
5758 | goto dropwithreset; |
5759 | } |
5760 | #if TCPDEBUG |
5761 | if (so->so_options & SO_DEBUG) { |
5762 | tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, |
5763 | &tcp_savetcp, 0); |
5764 | } |
5765 | #endif |
5766 | m_freem(m); |
5767 | tp->t_flags |= TF_ACKNOW; |
5768 | |
5769 | (void) tcp_output(tp); |
5770 | |
5771 | tcp_handle_wakeup(so, read_wakeup, write_wakeup); |
5772 | |
5773 | /* Don't need to check timer state as we should have done it during tcp_output */ |
socket_unlock(so, 1);
5775 | KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); |
5776 | return; |
5777 | dropwithresetnosock: |
5778 | nosock = 1; |
5779 | dropwithreset: |
5780 | /* |
5781 | * Generate a RST, dropping incoming segment. |
5782 | * Make ACK acceptable to originator of segment. |
5783 | * Don't bother to respond if destination was broadcast/multicast. |
5784 | */ |
5785 | if ((thflags & TH_RST) || m->m_flags & (M_BCAST | M_MCAST)) { |
5786 | goto drop; |
5787 | } |
5788 | if (isipv6) { |
5789 | if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) || |
5790 | IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) { |
5791 | goto drop; |
5792 | } |
5793 | } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || |
5794 | IN_MULTICAST(ntohl(ip->ip_src.s_addr)) || |
5795 | ip->ip_src.s_addr == htonl(INADDR_BROADCAST) || |
5796 | in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) { |
5797 | goto drop; |
5798 | } |
5799 | /* IPv6 anycast check is done at tcp6_input() */ |
5800 | |
5801 | #if TCPDEBUG |
5802 | if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { |
5803 | tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, |
5804 | &tcp_savetcp, 0); |
5805 | } |
5806 | #endif |
bzero(&tra, sizeof(tra));
5808 | tra.ifscope = ifscope; |
5809 | tra.awdl_unrestricted = 1; |
5810 | tra.intcoproc_allowed = 1; |
5811 | tra.management_allowed = 1; |
5812 | if (thflags & TH_ACK) { |
5813 | /* mtod() below is safe as long as hdr dropping is delayed */ |
5814 | tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, |
5815 | TH_RST, &tra); |
5816 | } else { |
5817 | if (thflags & TH_SYN) { |
5818 | tlen++; |
5819 | } |
5820 | /* mtod() below is safe as long as hdr dropping is delayed */ |
5821 | tcp_respond(tp, mtod(m, void *), th, m, th->th_seq + tlen, |
5822 | (tcp_seq)0, TH_RST | TH_ACK, &tra); |
5823 | } |
5824 | /* destroy temporarily created socket */ |
5825 | if (dropsocket) { |
5826 | (void) soabort(so); |
socket_unlock(so, 1);
5828 | } else if ((inp != NULL) && (nosock == 0)) { |
5829 | tcp_handle_wakeup(so, read_wakeup, write_wakeup); |
5830 | |
socket_unlock(so, 1);
5832 | } |
5833 | KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); |
5834 | return; |
5835 | dropnosock: |
5836 | nosock = 1; |
5837 | drop: |
5838 | /* |
5839 | * Drop space held by incoming segment and return. |
5840 | */ |
5841 | #if TCPDEBUG |
5842 | if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) { |
5843 | tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen, |
5844 | &tcp_savetcp, 0); |
5845 | } |
5846 | #endif |
5847 | m_freem(m); |
5848 | /* destroy temporarily created socket */ |
5849 | if (dropsocket) { |
5850 | (void) soabort(so); |
socket_unlock(so, 1);
5852 | } else if (nosock == 0) { |
5853 | tcp_handle_wakeup(so, read_wakeup, write_wakeup); |
5854 | |
socket_unlock(so, 1);
5856 | } |
5857 | KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0); |
5858 | return; |
5859 | } |
5860 | |
5861 | /* |
5862 | * Parse TCP options and place in tcpopt. |
5863 | */ |
5864 | static void |
5865 | tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th, |
5866 | struct tcpopt *to) |
5867 | { |
5868 | u_short mss = 0; |
5869 | uint8_t opt, optlen; |
5870 | |
5871 | for (; cnt > 0; cnt -= optlen, cp += optlen) { |
5872 | opt = cp[0]; |
5873 | if (opt == TCPOPT_EOL) { |
5874 | break; |
5875 | } |
5876 | if (opt == TCPOPT_NOP) { |
5877 | optlen = 1; |
5878 | } else { |
5879 | if (cnt < 2) { |
5880 | break; |
5881 | } |
5882 | optlen = cp[1]; |
5883 | if (optlen < 2 || optlen > cnt) { |
5884 | break; |
5885 | } |
5886 | } |
5887 | switch (opt) { |
5888 | default: |
5889 | continue; |
5890 | |
5891 | case TCPOPT_MAXSEG: |
5892 | if (optlen != TCPOLEN_MAXSEG) { |
5893 | continue; |
5894 | } |
5895 | if (!(th->th_flags & TH_SYN)) { |
5896 | continue; |
5897 | } |
bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
5899 | NTOHS(mss); |
5900 | to->to_mss = mss; |
5901 | to->to_flags |= TOF_MSS; |
5902 | break; |
5903 | |
5904 | case TCPOPT_WINDOW: |
5905 | if (optlen != TCPOLEN_WINDOW) { |
5906 | continue; |
5907 | } |
5908 | if (!(th->th_flags & TH_SYN)) { |
5909 | continue; |
5910 | } |
5911 | to->to_flags |= TOF_SCALE; |
5912 | to->to_requested_s_scale = MIN(cp[2], TCP_MAX_WINSHIFT); |
5913 | break; |
5914 | |
5915 | case TCPOPT_TIMESTAMP: |
5916 | if (optlen != TCPOLEN_TIMESTAMP) { |
5917 | continue; |
5918 | } |
5919 | to->to_flags |= TOF_TS; |
bcopy((char *)cp + 2,
    (char *)&to->to_tsval, sizeof(to->to_tsval));
NTOHL(to->to_tsval);
bcopy((char *)cp + 6,
    (char *)&to->to_tsecr, sizeof(to->to_tsecr));
5925 | NTOHL(to->to_tsecr); |
5926 | to->to_tsecr -= tp->t_ts_offset; |
5927 | /* Re-enable sending Timestamps if we received them */ |
5928 | if (!(tp->t_flags & TF_REQ_TSTMP) && tcp_do_timestamps) { |
5929 | tp->t_flags |= TF_REQ_TSTMP; |
5930 | } |
5931 | break; |
5932 | case TCPOPT_SACK_PERMITTED: |
5933 | if (optlen != TCPOLEN_SACK_PERMITTED) { |
5934 | continue; |
5935 | } |
5936 | if (th->th_flags & TH_SYN) { |
5937 | to->to_flags |= TOF_SACK; |
5938 | } |
5939 | break; |
5940 | case TCPOPT_SACK: |
5941 | if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) { |
5942 | continue; |
5943 | } |
5944 | to->to_nsacks = (optlen - 2) / TCPOLEN_SACK; |
5945 | to->to_sacks = cp + 2; |
5946 | tcpstat.tcps_sack_rcv_blocks++; |
5947 | |
5948 | break; |
5949 | case TCPOPT_FASTOPEN: |
5950 | if (optlen == TCPOLEN_FASTOPEN_REQ) { |
5951 | if (tp->t_state != TCPS_LISTEN) { |
5952 | continue; |
5953 | } |
5954 | |
5955 | to->to_flags |= TOF_TFOREQ; |
5956 | } else { |
5957 | if (optlen < TCPOLEN_FASTOPEN_REQ || |
5958 | (optlen - TCPOLEN_FASTOPEN_REQ) > TFO_COOKIE_LEN_MAX || |
5959 | (optlen - TCPOLEN_FASTOPEN_REQ) < TFO_COOKIE_LEN_MIN) { |
5960 | continue; |
5961 | } |
5962 | if (tp->t_state != TCPS_LISTEN && |
5963 | tp->t_state != TCPS_SYN_SENT) { |
5964 | continue; |
5965 | } |
5966 | |
5967 | to->to_flags |= TOF_TFO; |
5968 | to->to_tfo = cp + 1; |
5969 | } |
5970 | |
5971 | break; |
5972 | #if MPTCP |
5973 | case TCPOPT_MULTIPATH: |
5974 | tcp_do_mptcp_options(tp, cp, th, to, optlen); |
5975 | break; |
5976 | #endif /* MPTCP */ |
5977 | } |
5978 | } |
5979 | } |
5980 | |
5981 | static void |
5982 | tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope) |
5983 | { |
5984 | if (to->to_flags & TOF_TS) { |
5985 | tp->t_flags |= TF_RCVD_TSTMP; |
5986 | tp->ts_recent = to->to_tsval; |
5987 | tp->ts_recent_age = tcp_now; |
5988 | } |
5989 | if (to->to_flags & TOF_MSS) { |
5990 | tcp_mss(tp, to->to_mss, ifscope); |
5991 | } |
5992 | if (SACK_ENABLED(tp)) { |
5993 | if (!(to->to_flags & TOF_SACK)) { |
5994 | tp->t_flagsext &= ~(TF_SACK_ENABLE); |
5995 | } else { |
5996 | tp->t_flags |= TF_SACK_PERMIT; |
5997 | } |
5998 | } |
5999 | if (to->to_flags & TOF_SCALE) { |
6000 | tp->t_flags |= TF_RCVD_SCALE; |
6001 | tp->requested_s_scale = to->to_requested_s_scale; |
6002 | |
6003 | /* Re-enable window scaling, if the option is received */ |
6004 | if (tp->request_r_scale > 0) { |
6005 | tp->t_flags |= TF_REQ_SCALE; |
6006 | } |
6007 | } |
6008 | } |
6009 | |
6010 | /* |
6011 | * Pull out of band byte out of a segment so |
6012 | * it doesn't appear in the user's data queue. |
6013 | * It is still reflected in the segment length for |
6014 | * sequencing purposes. |
6015 | * |
* @param off the header length whose drop has been delayed (drop_hdrlen)
6017 | */ |
6018 | static void |
6019 | tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off) |
6020 | { |
6021 | int cnt = off + th->th_urp - 1; |
6022 | |
6023 | while (cnt >= 0) { |
6024 | if (m->m_len > cnt) { |
6025 | char *cp = mtod(m, caddr_t) + cnt; |
6026 | struct tcpcb *tp = sototcpcb(so); |
6027 | |
6028 | tp->t_iobc = *cp; |
6029 | tp->t_oobflags |= TCPOOB_HAVEDATA; |
bcopy(cp + 1, cp, (unsigned)(m->m_len - cnt - 1));
6031 | m->m_len--; |
6032 | if (m->m_flags & M_PKTHDR) { |
6033 | m->m_pkthdr.len--; |
6034 | } |
6035 | return; |
6036 | } |
6037 | cnt -= m->m_len; |
6038 | m = m->m_next; |
6039 | if (m == 0) { |
6040 | break; |
6041 | } |
6042 | } |
6043 | panic("tcp_pulloutofband" ); |
6044 | } |
6045 | |
6046 | uint32_t |
6047 | get_base_rtt(struct tcpcb *tp) |
6048 | { |
6049 | struct rtentry *rt = tp->t_inpcb->inp_route.ro_rt; |
6050 | return (rt == NULL) ? 0 : rt->rtt_min; |
6051 | } |
6052 | |
6053 | static void |
6054 | update_curr_rtt(struct tcpcb * tp, uint32_t rtt) |
6055 | { |
6056 | tp->curr_rtt_index = (tp->curr_rtt_index + 1) % NCURR_RTT_HIST; |
6057 | tp->curr_rtt_hist[tp->curr_rtt_index] = rtt; |
6058 | |
6059 | /* forget the old value and update minimum */ |
6060 | tp->curr_rtt_min = 0; |
6061 | for (int i = 0; i < NCURR_RTT_HIST; ++i) { |
6062 | if (tp->curr_rtt_hist[i] != 0 && (tp->curr_rtt_min == 0 || |
6063 | tp->curr_rtt_hist[i] < tp->curr_rtt_min)) { |
6064 | tp->curr_rtt_min = tp->curr_rtt_hist[i]; |
6065 | } |
6066 | } |
6067 | } |
6068 | |
/* Each value of the RTT history represents the minimum RTT seen in a minute.
 * We keep up to NRTT_HIST minutes' worth of history.
6071 | */ |
6072 | void |
6073 | update_base_rtt(struct tcpcb *tp, uint32_t rtt) |
6074 | { |
6075 | u_int32_t base_rtt, i; |
6076 | struct rtentry *rt; |
6077 | |
6078 | if ((rt = tp->t_inpcb->inp_route.ro_rt) == NULL) { |
6079 | return; |
6080 | } |
6081 | if (rt->rtt_expire_ts == 0) { |
6082 | RT_LOCK_SPIN(rt); |
6083 | if (rt->rtt_expire_ts != 0) { |
6084 | RT_UNLOCK(rt); |
6085 | goto update; |
6086 | } |
6087 | rt->rtt_expire_ts = tcp_now; |
6088 | rt->rtt_index = 0; |
6089 | rt->rtt_hist[0] = rtt; |
6090 | rt->rtt_min = rtt; |
6091 | RT_UNLOCK(rt); |
6092 | |
6093 | tp->curr_rtt_index = 0; |
6094 | tp->curr_rtt_hist[0] = rtt; |
6095 | tp->curr_rtt_min = rtt; |
6096 | return; |
6097 | } |
6098 | update: |
6099 | #if TRAFFIC_MGT |
6100 | /* |
6101 | * If the recv side is being throttled, check if the |
6102 | * current RTT is closer to the base RTT seen in |
* the first (most recent) two slots. If so, unthrottle the stream.
6104 | */ |
6105 | if ((tp->t_flagsext & TF_RECV_THROTTLE) && |
6106 | (int)(tcp_now - tp->t_recv_throttle_ts) >= TCP_RECV_THROTTLE_WIN) { |
6107 | base_rtt = rt->rtt_min; |
6108 | if (tp->t_rttcur <= (base_rtt + target_qdelay)) { |
6109 | tp->t_flagsext &= ~TF_RECV_THROTTLE; |
6110 | tp->t_recv_throttle_ts = 0; |
6111 | } |
6112 | } |
6113 | #endif /* TRAFFIC_MGT */ |
6114 | |
6115 | /* Update the next current RTT sample */ |
6116 | update_curr_rtt(tp, rtt); |
6117 | |
6118 | if ((int)(tcp_now - rt->rtt_expire_ts) >= |
6119 | TCP_RTT_HISTORY_EXPIRE_TIME) { |
6120 | RT_LOCK_SPIN(rt); |
6121 | /* check the condition again to avoid race */ |
6122 | if ((int)(tcp_now - rt->rtt_expire_ts) >= |
6123 | TCP_RTT_HISTORY_EXPIRE_TIME) { |
6124 | /* Set the base rtt to 0 for idle periods */ |
6125 | uint32_t times = MIN((tcp_now - rt->rtt_expire_ts) / |
6126 | TCP_RTT_HISTORY_EXPIRE_TIME, NRTT_HIST + 1); |
6127 | |
6128 | for (i = rt->rtt_index + 1; i < rt->rtt_index + times; i++) { |
6129 | rt->rtt_hist[i % NRTT_HIST] = 0; |
6130 | } |
6131 | |
6132 | rt->rtt_index = i % NRTT_HIST; |
6133 | rt->rtt_hist[rt->rtt_index] = rtt; |
6134 | rt->rtt_expire_ts = tcp_now; |
6135 | } else { |
rt->rtt_hist[rt->rtt_index] =
    min(rt->rtt_hist[rt->rtt_index], rtt);
6138 | } |
6139 | /* forget the old value and update minimum */ |
6140 | rt->rtt_min = 0; |
6141 | for (i = 0; i < NRTT_HIST; ++i) { |
6142 | if (rt->rtt_hist[i] != 0 && |
6143 | (rt->rtt_min == 0 || |
6144 | rt->rtt_hist[i] < rt->rtt_min)) { |
6145 | rt->rtt_min = rt->rtt_hist[i]; |
6146 | } |
6147 | } |
6148 | RT_UNLOCK(rt); |
6149 | } else { |
rt->rtt_hist[rt->rtt_index] =
    min(rt->rtt_hist[rt->rtt_index], rtt);
6152 | if (rt->rtt_min == 0) { |
6153 | rt->rtt_min = rtt; |
6154 | } else { |
rt->rtt_min = min(rt->rtt_min, rtt);
6156 | } |
6157 | } |
6158 | } |
6159 | |
6160 | /* |
6161 | * If we have a timestamp reply, update smoothed RTT. If no timestamp is |
6162 | * present but transmit timer is running and timed sequence number was |
6163 | * acked, update smoothed RTT. |
6164 | * |
6165 | * If timestamps are supported, a receiver can update RTT even if |
6166 | * there is no outstanding data. |
6167 | * |
* Some boxes send broken timestamp replies during the SYN+ACK phase;
* ignore timestamps of 0 or we could calculate a huge RTT and blow up
6170 | * the retransmit timer. |
6171 | */ |
6172 | static void |
6173 | tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) |
6174 | { |
6175 | int rtt = 0; |
6176 | VERIFY(to != NULL && th != NULL); |
6177 | if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) { |
6178 | u_int32_t pipe_ack_val; |
6179 | rtt = tcp_now - tp->t_rtttime; |
6180 | if (rtt == 0) { |
6181 | /* |
6182 | * Make adjustment for sub ms RTT when |
6183 | * timestamps are not used. |
6184 | */ |
6185 | rtt = 1; |
6186 | } |
6187 | /* |
6188 | * Compute pipe ack -- the amount of data acknowledged |
6189 | * in the last RTT -- only works for sender |
6190 | */ |
6191 | if (SEQ_GT(th->th_ack, tp->t_pipeack_lastuna)) { |
6192 | pipe_ack_val = th->th_ack - tp->t_pipeack_lastuna; |
6193 | /* Update the sample */ |
6194 | tp->t_pipeack_sample[tp->t_pipeack_ind++] = |
6195 | pipe_ack_val; |
6196 | tp->t_pipeack_ind %= TCP_PIPEACK_SAMPLE_COUNT; |
6197 | |
6198 | /* Compute the max of the pipeack samples */ |
6199 | pipe_ack_val = tcp_get_max_pipeack(tp); |
6200 | tp->t_pipeack = (pipe_ack_val > |
6201 | tcp_initial_cwnd(tp)) ? |
6202 | pipe_ack_val : 0; |
6203 | } |
6204 | /* start another measurement */ |
6205 | tp->t_rtttime = 0; |
6206 | } |
6207 | if (((to->to_flags & TOF_TS) != 0) && |
6208 | (to->to_tsecr != 0) && |
6209 | TSTMP_GEQ(tcp_now, to->to_tsecr)) { |
6210 | tcp_xmit_timer(tp, (tcp_now - to->to_tsecr), |
6211 | to->to_tsecr, th->th_ack); |
6212 | } else if (rtt > 0) { |
6213 | tcp_xmit_timer(tp, rtt, 0, th->th_ack); |
6214 | } |
6215 | } |
6216 | |
6217 | static void |
6218 | tcp_compute_rcv_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) |
6219 | { |
6220 | uint32_t rtt = 0, delta = 0; |
6221 | VERIFY(to != NULL && th != NULL); |
6222 | |
6223 | /* Calculate RTT */ |
6224 | if (((to->to_flags & TOF_TS) != 0) && (to->to_tsecr != 0) && |
6225 | TSTMP_GEQ(tcp_now, to->to_tsecr)) { |
6226 | /* Timestamp is supported */ |
6227 | rtt = tcp_now - to->to_tsecr; |
6228 | if (rtt == 0) { |
6229 | /* Make adjustment for sub ms RTT */ |
6230 | rtt = 1; |
6231 | } |
6232 | } else if ((to->to_flags & TOF_TS) == 0) { |
6233 | /* |
* Timestamps are not supported; one RTT is roughly
* the time to receive one full window of data.
* Currently, RTT calculated this way is only used
* for auto-tuning.
6238 | */ |
6239 | if (tp->rcv_rtt_est_ts != 0) { |
6240 | if (SEQ_LT(tp->rcv_nxt, tp->rcv_rtt_est_seq)) { |
6241 | /* Haven't received a full window yet */ |
6242 | return; |
6243 | } else { |
6244 | rtt = tcp_now - tp->rcv_rtt_est_ts; |
6245 | if (rtt == 0) { |
6246 | /* Make adjustment for sub ms RTT */ |
6247 | rtt = 1; |
6248 | } |
6249 | } |
6250 | } else { |
6251 | /* Use default value when no RTT measurement */ |
6252 | rtt = TCPTV_RCVNOTS_QUANTUM; |
6253 | } |
6254 | /* Restart the measurement */ |
6255 | tp->rcv_rtt_est_ts = tcp_now; |
6256 | tp->rcv_rtt_est_seq = tp->rcv_nxt + tp->rcv_wnd; |
6257 | } |
6258 | |
6259 | /* Update receiver's SRTT */ |
6260 | if (tp->rcv_srtt != 0) { |
6261 | /* |
6262 | * Use the smoothed rtt formula, |
6263 | * (srtt = rtt/8 + srtt*7/8) in fixed point |
6264 | */ |
6265 | delta = (rtt << TCP_DELTA_SHIFT) |
6266 | - (tp->rcv_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); |
6267 | |
6268 | if ((tp->rcv_srtt += delta) <= 0) { |
6269 | tp->rcv_srtt = 1; |
6270 | } |
6271 | } else { |
6272 | /* No previous measurement */ |
6273 | tp->rcv_srtt = rtt << TCP_RTT_SHIFT; |
6274 | } |
6275 | |
6276 | /* |
6277 | * For current RTT, base RTT and current RTT over k samples, |
6278 | * we are using the same state for both sender and receiver |
6279 | * as the most recent sample is always updated before any |
6280 | * other processing, i.e. the sender will not end up with |
6281 | * a high RTT due to the receiver. |
6282 | */ |
6283 | tp->t_rttcur = rtt; |
6284 | update_base_rtt(tp, rtt); |
6285 | } |
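
/*
 * Worked example (illustrative only, assuming TCP_RTT_SHIFT == 5 and
 * TCP_DELTA_SHIFT == 2 as in the BSD-derived headers): with a previous
 * rcv_srtt of 100 ms (stored scaled as 100 << 5 == 3200) and a new
 * sample of 60 ms,
 *
 *      delta    = (60 << 2) - (3200 >> 3) = 240 - 400 = -160
 *      rcv_srtt = 3200 - 160 = 3040       =>  3040 >> 5 = 95 ms
 *
 * which matches srtt = rtt/8 + srtt*7/8 = 7.5 + 87.5 = 95 ms.
 */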
6286 | |
6287 | /* |
6288 | * Collect new round-trip time estimate and update averages and |
6289 | * current timeout. |
6290 | */ |
6291 | static void |
6292 | tcp_xmit_timer(struct tcpcb *tp, int rtt, |
6293 | u_int32_t tsecr, tcp_seq th_ack) |
6294 | { |
6295 | VERIFY(rtt >= 0); |
6296 | int delta; |
6297 | int old_srtt = tp->t_srtt; |
6298 | int old_rttvar = tp->t_rttvar; |
6299 | bool log_rtt = false; |
6300 | |
6301 | if (rtt == 0) { |
6302 | /* |
6303 | * As rtt has millisecond precision, |
6304 | * make adjustment for sub ms RTT |
6305 | */ |
6306 | rtt = 1; |
6307 | } |
6308 | |
6309 | if (rtt > 4 * TCPTV_MSL) { |
6310 | TCP_LOG(tp, "%s: rtt is %d - maxing it at 4 x MSL\n", __func__, rtt); |
6311 | /* |
6312 | * We compute RTT either based on the time-to-ACK a packet |
6313 | * (if TSval is disabled) or based on the TSecr value. |
6314 | * If there is a middlebox messing up the TSecr value, we can |
6315 | * end up having HUGE rtt values, causing all kinds of problems. |
6316 | * Let's protect against this by capping RTT to 4*MSL |
6317 | * (60 seconds). |
6318 | */ |
6319 | rtt = 4 * TCPTV_MSL; |
6320 | } |
6321 | |
6322 | /* |
6323 | * On AWDL interface, the initial RTT measurement on SYN |
6324 | * can be wrong due to peer caching. Avoid the first RTT |
6325 | * measurement as it might skew up the RTO. |
6326 | * <rdar://problem/28739046> |
6327 | */ |
6328 | if (tp->t_inpcb->inp_last_outifp != NULL && |
6329 | (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_AWDL) && |
6330 | th_ack == tp->iss + 1) { |
6331 | return; |
6332 | } |
6333 | |
6334 | if (tp->t_flagsext & TF_RECOMPUTE_RTT) { |
6335 | if (SEQ_GT(th_ack, tp->snd_una) && |
6336 | SEQ_LEQ(th_ack, tp->snd_max) && |
6337 | (tsecr == 0 || |
6338 | TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) { |
6339 | /* |
6340 | * We received a new ACK after a |
6341 | * spurious timeout. Adapt retransmission |
6342 | * timer as described in rfc 4015. |
6343 | */ |
6344 | tp->t_flagsext &= ~(TF_RECOMPUTE_RTT); |
6345 | tp->t_badrexmt_time = 0; |
6346 | tp->t_srtt = max(tp->t_srtt_prev, rtt); |
6347 | tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT; |
6348 | tp->t_rttvar = max(tp->t_rttvar_prev, (rtt >> 1)); |
6349 | tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT; |
6350 | |
6351 | if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) { |
6352 | tp->t_rttbest = tp->t_srtt + tp->t_rttvar; |
6353 | } |
6354 | |
6355 | goto compute_rto; |
6356 | } else { |
6357 | return; |
6358 | } |
6359 | } |
6360 | |
6361 | tcpstat.tcps_rttupdated++; |
6362 | tp->t_rttupdated++; |
6363 | |
6364 | tp->t_rttcur = rtt; |
6365 | update_base_rtt(tp, rtt); |
6366 | |
6367 | if (tp->t_srtt != 0) { |
6368 | /* |
6369 | * srtt is stored as fixed point with 5 bits after the |
6370 | * binary point (i.e., scaled by 32). The following magic |
6371 | * is equivalent to the smoothing algorithm in rfc793 with |
6372 | * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed |
6373 | * point). |
6374 | * |
6375 | * FreeBSD adjusts rtt to origin 0 by subtracting 1 |
6376 | * from the provided rtt value. This was required because |
6377 | * of the way t_rtttime was initialized to 1 before. |
6378 | * Since we changed t_rtttime to be based on |
6379 | * tcp_now, this extra adjustment is not needed. |
6380 | */ |
6381 | delta = (rtt << TCP_DELTA_SHIFT) |
6382 | - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); |
6383 | |
6384 | if ((tp->t_srtt += delta) <= 0) { |
6385 | tp->t_srtt = 1; |
6386 | } |
6387 | |
6388 | /* |
6389 | * We accumulate a smoothed rtt variance (actually, a |
6390 | * smoothed mean difference), then set the retransmit |
6391 | * timer to smoothed rtt + 4 times the smoothed variance. |
6392 | * rttvar is stored as fixed point with 4 bits after the |
6393 | * binary point (scaled by 16). The following is |
6394 | * equivalent to rfc793 smoothing with an alpha of .75 |
6395 | * (rttvar = rttvar*3/4 + |delta| / 4). This replaces |
6396 | * rfc793's wired-in beta. |
6397 | */ |
6398 | if (delta < 0) { |
6399 | delta = -delta; |
6400 | } |
6401 | delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); |
6402 | if ((tp->t_rttvar += delta) <= 0) { |
6403 | tp->t_rttvar = 1; |
6404 | } |
6405 | if (tp->t_rttbest == 0 || |
6406 | tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) { |
6407 | tp->t_rttbest = tp->t_srtt + tp->t_rttvar; |
6408 | } |
6409 | } else { |
6410 | /* |
6411 | * No rtt measurement yet - use the unsmoothed rtt. |
6412 | * Set the variance to half the rtt (so our first |
6413 | * retransmit happens at 3*rtt). |
6414 | */ |
6415 | tp->t_srtt = rtt << TCP_RTT_SHIFT; |
6416 | tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); |
6417 | tp->t_rttbest = tp->t_srtt + tp->t_rttvar; |
6418 | |
6419 | /* Initialize the receive SRTT */ |
6420 | if (tp->rcv_srtt == 0) { |
6421 | tp->rcv_srtt = tp->t_srtt; |
6422 | } |
6423 | } |
6424 | |
6425 | compute_rto: |
6426 | nstat_route_rtt(tp->t_inpcb->inp_route.ro_rt, tp->t_srtt, |
6427 | tp->t_rttvar); |
6428 | |
6429 | /* |
6430 | * the retransmit should happen at rtt + 4 * rttvar. |
6431 | * Because of the way we do the smoothing, srtt and rttvar |
6432 | * will each average +1/2 tick of bias. When we compute |
6433 | * the retransmit timer, we want 1/2 tick of rounding and |
6434 | * 1 extra tick because of +-1/2 tick uncertainty in the |
6435 | * firing of the timer. The bias will give us exactly the |
6436 | * 1.5 tick we need. But, because the bias is |
6437 | * statistical, we have to test that we don't drop below |
6438 | * the minimum feasible timer (which is 2 ticks). |
6439 | */ |
6440 | TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), |
6441 | max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX, |
6442 | TCP_ADD_REXMTSLOP(tp)); |
6443 | |
6444 | /* |
6445 | * We received an ack for a packet that wasn't retransmitted; |
6446 | * it is probably safe to discard any error indications we've |
6447 | * received recently. This isn't quite right, but close enough |
6448 | * for now (a route might have failed after we sent a segment, |
6449 | * and the return path might not be symmetrical). |
6450 | */ |
6451 | tp->t_softerror = 0; |
6452 | |
6453 | if (log_rtt) { |
6454 | TCP_LOG_RTT_INFO(tp); |
6455 | } |
6456 | |
6457 | TCP_LOG_RTT_CHANGE(tp, old_srtt, old_rttvar); |
6458 | } |
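
/*
 * Illustrative sketch (not from the original sources): in unscaled
 * terms the timer above aims for srtt + 4 * rttvar.  With srtt == 100 ms
 * and rttvar == 20 ms that is 180 ms, which TCPT_RANGESET() then clamps
 * to lie between max(t_rttmin, rtt + 2) and TCPTV_REXMTMAX before the
 * per-connection retransmit slop is added.
 */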
6459 | |
6460 | static inline unsigned int |
6461 | tcp_maxmtu(struct rtentry *rt) |
6462 | { |
6463 | unsigned int maxmtu; |
6464 | int interface_mtu = 0; |
6465 | |
6466 | RT_LOCK_ASSERT_HELD(rt); |
6467 | interface_mtu = rt->rt_ifp->if_mtu; |
6468 | |
6469 | if (rt_key(rt)->sa_family == AF_INET && |
6470 | INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) { |
6471 | interface_mtu = IN6_LINKMTU(rt->rt_ifp); |
6472 | /* Further adjust the size for CLAT46 expansion */ |
6473 | interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD; |
6474 | } |
6475 | |
6476 | if (rt->rt_rmx.rmx_mtu == 0) { |
6477 | maxmtu = interface_mtu; |
6478 | } else { |
6479 | maxmtu = MIN(rt->rt_rmx.rmx_mtu, interface_mtu); |
6480 | } |
6481 | |
6482 | return maxmtu; |
6483 | } |
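
/*
 * Example of the selection above (illustrative only): on a 1500-byte
 * MTU interface, a route with rmx_mtu == 0 yields 1500, while a route
 * that cached rmx_mtu == 1400 (e.g. from path MTU discovery) yields
 * MIN(1400, 1500) == 1400.  For CLAT46 the IPv6 link MTU is used and
 * reduced by CLAT46_HDR_EXPANSION_OVERHD to leave room for the
 * IPv4 to IPv6 header growth.
 */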
6484 | |
6485 | static inline unsigned int |
6486 | tcp_maxmtu6(struct rtentry *rt) |
6487 | { |
6488 | unsigned int maxmtu; |
6489 | struct nd_ifinfo *ndi = NULL; |
6490 | |
6491 | RT_LOCK_ASSERT_HELD(rt); |
6492 | if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized) { |
6493 | ndi = NULL; |
6494 | } |
6495 | if (ndi != NULL) { |
6496 | lck_mtx_lock(&ndi->lock); |
6497 | } |
6498 | if (rt->rt_rmx.rmx_mtu == 0) { |
6499 | maxmtu = IN6_LINKMTU(rt->rt_ifp); |
6500 | } else { |
6501 | maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp)); |
6502 | } |
6503 | if (ndi != NULL) { |
6504 | lck_mtx_unlock(&ndi->lock); |
6505 | } |
6506 | |
6507 | return maxmtu; |
6508 | } |
6509 | |
6510 | unsigned int |
6511 | get_maxmtu(struct rtentry *rt) |
6512 | { |
6513 | unsigned int maxmtu = 0; |
6514 | |
6515 | RT_LOCK_ASSERT_NOTHELD(rt); |
6516 | |
6517 | RT_LOCK(rt); |
6518 | |
6519 | if (rt_key(rt)->sa_family == AF_INET6) { |
6520 | maxmtu = tcp_maxmtu6(rt); |
6521 | } else { |
6522 | maxmtu = tcp_maxmtu(rt); |
6523 | } |
6524 | |
6525 | RT_UNLOCK(rt); |
6526 | |
6527 | return maxmtu; |
6528 | } |
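
/*
 * Hypothetical usage sketch (not part of the original sources): a
 * caller holding a reference to rt, but not its lock, can simply do
 *
 *      unsigned int mtu = get_maxmtu(rt);
 *
 * get_maxmtu() asserts the lock is not already held, takes it,
 * dispatches to tcp_maxmtu() or tcp_maxmtu6() based on the route's
 * address family, and drops the lock before returning.
 */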
6529 | |
6530 | /* |
6531 | * Determine a reasonable value for maxseg size. |
6532 | * If the route is known, check route for mtu. |
6533 | * If none, use an mss that can be handled on the outgoing |
6534 | * interface without forcing IP to fragment; if bigger than |
6535 | * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES |
6536 | * to utilize large mbufs. If no route is found, route has no mtu, |
6537 | * or the destination isn't local, use a default, hopefully conservative |
6538 | * size (usually 512 or the default IP max size, but no more than the mtu |
6539 | * of the interface), as we can't discover anything about intervening |
6540 | * gateways or networks. We also initialize the congestion/slow start |
6541 | * window. While looking at the routing entry, we also initialize |
6542 | * other path-dependent parameters from pre-set or cached values |
6543 | * in the routing entry. |
6544 | * |
6545 | * Also take into account the space needed for options that we |
6546 | * send regularly. Make maxseg shorter by that amount to assure |
6547 | * that we can send maxseg amount of data even when the options |
6548 | * are present. Store the upper limit of the length of options plus |
6549 | * data in maxopd. |
6550 | * |
6551 | * NOTE that this routine is only called when we process an incoming |
6552 | * segment, for outgoing segments only tcp_mssopt is called. |
6553 | * |
6554 | */ |
6555 | void |
6556 | tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope) |
6557 | { |
6558 | struct rtentry *rt; |
6559 | struct ifnet *ifp; |
6560 | int rtt, mss; |
6561 | uint32_t bufsize; |
6562 | struct inpcb *inp; |
6563 | struct socket *so; |
6564 | int origoffer = offer; |
6565 | int isnetlocal = 0; |
6566 | int isipv6; |
6567 | int min_protoh; |
6568 | |
6569 | inp = tp->t_inpcb; |
6570 | |
6571 | so = inp->inp_socket; |
6572 | /* |
6573 | * Nothing left to send after the socket is defunct or TCP is in the closed state |
6574 | */ |
6575 | if ((so->so_state & SS_DEFUNCT) || tp->t_state == TCPS_CLOSED) { |
6576 | return; |
6577 | } |
6578 | |
6579 | isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0; |
6580 | min_protoh = isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) |
6581 | : sizeof(struct tcpiphdr); |
6582 | |
6583 | if (isipv6) { |
6584 | rt = tcp_rtlookup6(inp, input_ifscope); |
6585 | } else { |
6586 | rt = tcp_rtlookup(inp, input_ifscope); |
6587 | } |
6588 | isnetlocal = (tp->t_flags & TF_LOCAL); |
6589 | |
6590 | if (rt == NULL) { |
6591 | tp->t_maxopd = tp->t_maxseg = isipv6 ? tcp_v6mssdflt : tcp_mssdflt; |
6592 | return; |
6593 | } |
6594 | ifp = rt->rt_ifp; |
6595 | /* |
6596 | * Slower link window correction: |
6597 | * If a value is specified for slowlink_wsize use it for |
6598 | * PPP links believed to be on a serial modem (speed <128Kbps). |
6599 | * Excludes 9600bps as it is the default value advertised |
6600 | * by pseudo-devices over ppp. |
6601 | */ |
6602 | if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 && |
6603 | ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) { |
6604 | tp->t_flags |= TF_SLOWLINK; |
6605 | } |
6606 | |
6607 | /* |
6608 | * Offer == -1 means that we didn't receive SYN yet. Use 0 then. |
6609 | */ |
6610 | if (offer == -1) { |
6611 | offer = rt->rt_rmx.rmx_filler[0]; |
6612 | } |
6613 | /* |
6614 | * Offer == 0 means that there was no MSS on the SYN segment, |
6615 | * in this case we use tcp_mssdflt. |
6616 | */ |
6617 | if (offer == 0) { |
6618 | offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt; |
6619 | } else { |
6620 | /* |
6621 | * Prevent DoS attack with too small MSS. Round up |
6622 | * to at least minmss. |
6623 | */ |
6624 | offer = max(offer, tcp_minmss); |
6625 | /* |
6626 | * Sanity check: make sure that maxopd will be large |
6627 | * enough to allow some data on segments even if |
6628 | * all the option space is used (40 bytes). Otherwise |
6629 | * funny things may happen in tcp_output. |
6630 | */ |
6631 | offer = max(offer, 64); |
6632 | } |
6633 | rt->rt_rmx.rmx_filler[0] = offer; |
6634 | |
6635 | /* |
6636 | * While we're here, check if there's an initial rtt |
6637 | * or rttvar. Convert from the route-table units |
6638 | * to scaled multiples of the slow timeout timer. |
6639 | */ |
6640 | if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) { |
6641 | tcp_getrt_rtt(tp, rt); |
6642 | } else { |
6643 | tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN; |
6644 | } |
6645 | |
6646 | mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt)); |
6647 | |
6648 | #if NECP |
6649 | // At this point, the mss is just the MTU. Adjust if necessary. |
6650 | mss = necp_socket_get_effective_mtu(inp, mss); |
6651 | #endif /* NECP */ |
6652 | |
6653 | mss -= min_protoh; |
6654 | |
6655 | if (rt->rt_rmx.rmx_mtu == 0) { |
6656 | if (isipv6) { |
6657 | if (!isnetlocal) { |
6658 | mss = min(mss, tcp_v6mssdflt); |
6659 | } |
6660 | } else if (!isnetlocal) { |
6661 | mss = min(mss, tcp_mssdflt); |
6662 | } |
6663 | } |
6664 | |
6665 | mss = min(mss, offer); |
6666 | /* |
6667 | * maxopd stores the maximum length of data AND options |
6668 | * in a segment; maxseg is the amount of data in a normal |
6669 | * segment. We need to store this value (maxopd) apart |
6670 | * from maxseg, because now every segment carries options |
6671 | * and thus we normally have somewhat less data in segments. |
6672 | */ |
6673 | tp->t_maxopd = mss; |
6674 | |
6675 | /* |
6676 | * origoffer == -1 indicates that no segments were received yet. |
6677 | * In this case we just guess. |
6678 | */ |
6679 | if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP && |
6680 | (origoffer == -1 || |
6681 | (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) { |
6682 | mss -= TCPOLEN_TSTAMP_APPA; |
6683 | } |
6684 | |
6685 | #if MPTCP |
6686 | mss -= mptcp_adj_mss(tp, FALSE); |
6687 | #endif /* MPTCP */ |
6688 | tp->t_maxseg = mss; |
6689 | |
6690 | /* |
6691 | * If there's a pipesize (ie loopback), change the socket |
6692 | * buffer to that size only if it's bigger than the current |
6693 | * sockbuf size. Make the socket buffers an integral |
6694 | * number of mss units; if the mss is larger than |
6695 | * the socket buffer, decrease the mss. |
6696 | */ |
6697 | #if RTV_SPIPE |
6698 | bufsize = rt->rt_rmx.rmx_sendpipe; |
6699 | if (bufsize < so->so_snd.sb_hiwat) |
6700 | #endif |
6701 | bufsize = so->so_snd.sb_hiwat; |
6702 | if (bufsize < mss) { |
6703 | mss = bufsize; |
6704 | } else { |
6705 | bufsize = (((bufsize + mss - 1) / mss) * mss); |
6706 | (void)sbreserve(&so->so_snd, bufsize); |
6707 | } |
6708 | tp->t_maxseg = mss; |
6709 | |
6710 | ASSERT(tp->t_maxseg); |
6711 | |
6712 | /* |
6713 | * Update MSS using recommendation from link status report. This is |
6714 | * temporary |
6715 | */ |
6716 | tcp_update_mss_locked(so, ifp); |
6717 | |
6718 | #if RTV_RPIPE |
6719 | bufsize = rt->rt_rmx.rmx_recvpipe; |
6720 | if (bufsize < so->so_rcv.sb_hiwat) |
6721 | #endif |
6722 | bufsize = so->so_rcv.sb_hiwat; |
6723 | if (bufsize > mss) { |
6724 | bufsize = (((bufsize + mss - 1) / mss) * mss); |
6725 | (void)sbreserve(&so->so_rcv, bufsize); |
6726 | } |
6727 | |
6728 | set_tcp_stream_priority(so); |
6729 | |
6730 | if (rt->rt_rmx.rmx_ssthresh) { |
6731 | /* |
6732 | * There's some sort of gateway or interface |
6733 | * buffer limit on the path. Use this to set |
6734 | * slow-start threshold, but set the threshold to |
6735 | * no less than 2*mss. |
6736 | */ |
6737 | tp->snd_ssthresh = max(2 * mss, rt->rt_rmx.rmx_ssthresh); |
6738 | tcpstat.tcps_usedssthresh++; |
6739 | } else { |
6740 | tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT; |
6741 | } |
6742 | |
6743 | /* |
6744 | * Set the slow-start flight size depending on whether this |
6745 | * is a local network or not. |
6746 | */ |
6747 | if (CC_ALGO(tp)->cwnd_init != NULL) { |
6748 | CC_ALGO(tp)->cwnd_init(tp); |
6749 | } |
6750 | |
6751 | tcp_ccdbg_trace(tp, NULL, TCP_CC_CWND_INIT); |
6752 | |
6753 | if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.rwnd_init != NULL) { |
6754 | tcp_cc_rledbat.rwnd_init(tp); |
6755 | } |
6756 | |
6757 | /* Route locked during lookup above */ |
6758 | RT_UNLOCK(rt); |
6759 | } |
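
/*
 * Worked example (illustrative only): over IPv4 on a 1500-byte MTU
 * path, min_protoh is sizeof(struct tcpiphdr) == 40, so the MTU-derived
 * mss is 1460.  If the peer offered 1400, t_maxopd becomes
 * min(1460, 1400) == 1400, and with timestamps in use t_maxseg is
 * further reduced by TCPOLEN_TSTAMP_APPA (12) to 1388 (plus any MPTCP
 * adjustment) before the socket-buffer rounding above.
 */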
6760 | |
6761 | /* |
6762 | * Determine the MSS option to send on an outgoing SYN. |
6763 | */ |
6764 | int |
6765 | tcp_mssopt(struct tcpcb *tp) |
6766 | { |
6767 | struct rtentry *rt; |
6768 | int mss; |
6769 | int isipv6; |
6770 | int min_protoh; |
6771 | |
6772 | isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0; |
6773 | min_protoh = isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr) |
6774 | : sizeof(struct tcpiphdr); |
6775 | |
6776 | if (isipv6) { |
6777 | rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE); |
6778 | } else { |
6779 | rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE); |
6780 | } |
6781 | if (rt == NULL) { |
6782 | return isipv6 ? tcp_v6mssdflt : tcp_mssdflt; |
6783 | } |
6784 | /* |
6785 | * Slower link window correction: |
6786 | * If a value is specified for slowlink_wsize use it for PPP links |
6787 | * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as |
6788 | * it is the default value advertised by pseudo-devices over ppp. |
6789 | */ |
6790 | if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 && |
6791 | rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) { |
6792 | tp->t_flags |= TF_SLOWLINK; |
6793 | } |
6794 | |
6795 | mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt)); |
6796 | /* Route locked during lookup above */ |
6797 | RT_UNLOCK(rt); |
6798 | |
6799 | #if NECP |
6800 | // At this point, the mss is just the MTU. Adjust if necessary. |
6801 | mss = necp_socket_get_effective_mtu(inp: tp->t_inpcb, current_mtu: mss); |
6802 | #endif /* NECP */ |
6803 | |
6804 | return mss - min_protoh; |
6805 | } |
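
/*
 * Illustrative example (not from the original sources): for an IPv6
 * path with a 1500-byte MTU, min_protoh is 40 + 20 == 60 bytes of
 * fixed IPv6 and TCP headers, so the MSS option advertised on the SYN
 * would be 1500 - 60 == 1440.
 */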
6806 | |
6807 | /* |
6808 | * When a partial ack arrives, force the retransmission of the |
6809 | * next unacknowledged segment. Do not clear tp->t_dupacks. |
6810 | * By setting snd_nxt to th_ack, this forces retransmission timer to |
6811 | * be started again. |
6812 | */ |
6813 | static void |
6814 | tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th) |
6815 | { |
6816 | tcp_seq onxt = tp->snd_nxt; |
6817 | u_int32_t ocwnd = tp->snd_cwnd; |
6818 | tp->t_timer[TCPT_REXMT] = 0; |
6819 | tp->t_timer[TCPT_PTO] = 0; |
6820 | tp->t_rtttime = 0; |
6821 | tp->snd_nxt = th->th_ack; |
6822 | /* |
6823 | * Set snd_cwnd to one segment beyond acknowledged offset |
6824 | * (tp->snd_una has not yet been updated when this function |
6825 | * is called) |
6826 | */ |
6827 | tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp); |
6828 | (void) tcp_output(tp); |
6829 | tp->snd_cwnd = ocwnd; |
6830 | if (SEQ_GT(onxt, tp->snd_nxt)) { |
6831 | tp->snd_nxt = onxt; |
6832 | } |
6833 | /* |
6834 | * Partial window deflation. Relies on fact that tp->snd_una |
6835 | * not updated yet. |
6836 | */ |
6837 | if (tp->snd_cwnd > BYTES_ACKED(th, tp)) { |
6838 | tp->snd_cwnd -= BYTES_ACKED(th, tp); |
6839 | } else { |
6840 | tp->snd_cwnd = 0; |
6841 | } |
6842 | tp->snd_cwnd += tp->t_maxseg; |
6843 | } |
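
/*
 * Worked example (illustrative only): with t_maxseg == 1000, an old
 * cwnd of 10000 and a partial ACK covering 3000 bytes, the function
 * temporarily sets cwnd to 1000 + 3000 so that tcp_output() emits just
 * the segment at the new snd_una, restores cwnd to 10000, and then
 * deflates it to 10000 - 3000 + 1000 == 8000.
 */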
6844 | |
6845 | /* |
6846 | * Drop a random TCP connection that hasn't been serviced yet and |
6847 | * is eligible for discard. There is a one in qlen chance that |
6848 | * we will return a null, saying that there are no droppable |
6849 | * requests. In this case, the protocol specific code should drop |
6850 | * the new request. This ensures fairness. |
6851 | * |
6852 | * The listening TCP socket "head" must be locked |
6853 | */ |
6854 | static int |
6855 | tcp_dropdropablreq(struct socket *head) |
6856 | { |
6857 | struct socket *so, *sonext; |
6858 | unsigned int j, qlen; |
6859 | static uint32_t rnd = 0; |
6860 | static uint64_t old_runtime; |
6861 | static unsigned int cur_cnt, old_cnt; |
6862 | uint64_t now_sec, i; |
6863 | struct inpcb *inp = NULL; |
6864 | struct tcpcb *tp; |
6865 | |
6866 | if ((head->so_options & SO_ACCEPTCONN) == 0) { |
6867 | return 0; |
6868 | } |
6869 | |
6870 | if (TAILQ_EMPTY(&head->so_incomp)) { |
6871 | return 0; |
6872 | } |
6873 | |
6874 | so_acquire_accept_list(head, NULL); |
6875 | socket_unlock(head, 0); |
6876 | |
6877 | /* |
6878 | * Check if there is any socket in the incomp queue |
6879 | * that is closed because of a reset from the peer and is |
6880 | * waiting to be garbage collected. If so, pick that as |
6881 | * the victim |
6882 | */ |
6883 | TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) { |
6884 | inp = sotoinpcb(so); |
6885 | tp = intotcpcb(inp); |
6886 | if (tp != NULL && tp->t_state == TCPS_CLOSED && |
6887 | so->so_head != NULL && |
6888 | (so->so_state & (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE)) == |
6889 | (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE)) { |
6890 | /* |
6891 | * The listen socket is already locked but we |
6892 | * can lock this socket here without lock ordering |
6893 | * issues because it is in the incomp queue and |
6894 | * is not visible to others. |
6895 | */ |
6896 | if (socket_try_lock(so)) { |
6897 | so->so_usecount++; |
6898 | goto found_victim; |
6899 | } else { |
6900 | continue; |
6901 | } |
6902 | } |
6903 | } |
6904 | |
6905 | so = TAILQ_FIRST(&head->so_incomp); |
6906 | |
6907 | now_sec = net_uptime(); |
6908 | if ((i = (now_sec - old_runtime)) != 0) { |
6909 | old_runtime = now_sec; |
6910 | old_cnt = cur_cnt / i; |
6911 | cur_cnt = 0; |
6912 | } |
6913 | |
6914 | qlen = head->so_incqlen; |
6915 | if (rnd == 0) { |
6916 | rnd = RandomULong(); |
6917 | } |
6918 | |
6919 | if (++cur_cnt > qlen || old_cnt > qlen) { |
6920 | rnd = (314159 * rnd + 66329) & 0xffff; |
6921 | j = ((qlen + 1) * rnd) >> 16; |
6922 | |
6923 | while (j-- && so) { |
6924 | so = TAILQ_NEXT(so, so_list); |
6925 | } |
6926 | } |
6927 | /* Find a connection that is not already closing (or being served) */ |
6928 | while (so) { |
6929 | inp = (struct inpcb *)so->so_pcb; |
6930 | |
6931 | sonext = TAILQ_NEXT(so, so_list); |
6932 | |
6933 | if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) { |
6934 | /* |
6935 | * Avoid the issue of a socket being accepted |
6936 | * by one input thread and being dropped by |
6937 | * another input thread. If we can't get a hold |
6938 | * on this mutex, then grab the next socket in |
6939 | * line. |
6940 | */ |
6941 | if (socket_try_lock(so)) { |
6942 | so->so_usecount++; |
6943 | if ((so->so_usecount == 2) && |
6944 | (so->so_state & SS_INCOMP) && |
6945 | !(so->so_flags & SOF_INCOMP_INPROGRESS)) { |
6946 | break; |
6947 | } else { |
6948 | /* |
6949 | * don't use if being accepted or |
6950 | * used in any other way |
6951 | */ |
6952 | in_pcb_checkstate(inp, WNT_RELEASE, 1); |
6953 | socket_unlock(so, 1); |
6954 | } |
6955 | } else { |
6956 | /* |
6957 | * do not try to lock the inp in |
6958 | * in_pcb_checkstate because the lock |
6959 | * is already held in some other thread. |
6960 | * Only drop the inp_wntcnt reference. |
6961 | */ |
6962 | in_pcb_checkstate(inp, WNT_RELEASE, 1); |
6963 | } |
6964 | } |
6965 | so = sonext; |
6966 | } |
6967 | if (so == NULL) { |
6968 | socket_lock(head, 0); |
6969 | so_release_accept_list(head); |
6970 | return 0; |
6971 | } |
6972 | |
6973 | /* Makes sure socket is still in the right state to be discarded */ |
6974 | |
6975 | if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) { |
6976 | socket_unlock(so, 1); |
6977 | socket_lock(head, 0); |
6978 | so_release_accept_list(head); |
6979 | return 0; |
6980 | } |
6981 | |
6982 | found_victim: |
6983 | if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) { |
6984 | /* do not discard: that socket is being accepted */ |
6985 | socket_unlock(so, 1); |
6986 | socket_lock(head, 0); |
6987 | so_release_accept_list(head); |
6988 | return 0; |
6989 | } |
6990 | |
6991 | socket_lock(head, 0); |
6992 | TAILQ_REMOVE(&head->so_incomp, so, so_list); |
6993 | head->so_incqlen--; |
6994 | head->so_qlen--; |
6995 | so->so_state &= ~SS_INCOMP; |
6996 | so->so_flags |= SOF_OVERFLOW; |
6997 | so->so_head = NULL; |
6998 | so_release_accept_list(head); |
6999 | socket_unlock(head, 0); |
7000 | |
7001 | socket_lock_assert_owned(so); |
7002 | tp = sototcpcb(so); |
7003 | |
7004 | tcp_close(tp); |
7005 | if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) { |
7006 | /* |
7007 | * Someone has a wantcnt on this pcb. Since WNT_ACQUIRE |
7008 | * doesn't require a lock, it could have happened while |
7009 | * we are holding the lock. This pcb will have to |
7010 | * be garbage collected later. |
7011 | * Release the reference held for so_incomp queue |
7012 | */ |
7013 | VERIFY(so->so_usecount > 0); |
7014 | so->so_usecount--; |
7015 | socket_unlock(so, 1); |
7016 | } else { |
7017 | /* |
7018 | * Unlock this socket and leave the reference on. |
7019 | * We need to acquire the pcbinfo lock in order to |
7020 | * fully dispose of it |
7021 | */ |
7022 | socket_unlock(so, 0); |
7023 | |
7024 | lck_rw_lock_exclusive(&tcbinfo.ipi_lock); |
7025 | |
7026 | socket_lock(so, 0); |
7027 | /* Release the reference held for so_incomp queue */ |
7028 | VERIFY(so->so_usecount > 0); |
7029 | so->so_usecount--; |
7030 | |
7031 | if (so->so_usecount != 1 || |
7032 | (inp->inp_wantcnt > 0 && |
7033 | inp->inp_wantcnt != WNT_STOPUSING)) { |
7034 | /* |
7035 | * There is an extra wantcount or usecount |
7036 | * that must have been added when the socket |
7037 | * was unlocked. This socket will have to be |
7038 | * garbage collected later |
7039 | */ |
7040 | socket_unlock(so, 1); |
7041 | } else { |
7042 | /* Drop the reference held for this function */ |
7043 | VERIFY(so->so_usecount > 0); |
7044 | so->so_usecount--; |
7045 | |
7046 | in_pcbdispose(inp); |
7047 | } |
7048 | lck_rw_done(&tcbinfo.ipi_lock); |
7049 | } |
7050 | tcpstat.tcps_drops++; |
7051 | |
7052 | socket_lock(head, 0); |
7053 | return 1; |
7054 | } |
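
/*
 * Sketch of the victim selection above (illustrative numbers): rnd is
 * advanced with the linear congruential step
 * (314159 * rnd + 66329) & 0xffff, and j = ((qlen + 1) * rnd) >> 16
 * maps that 16-bit value onto an index in [0, qlen].  For example,
 * qlen == 15 and rnd == 0x8000 give j = (16 * 32768) >> 16 == 8, i.e.
 * roughly the middle of the incomplete-connection queue.
 */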
7055 | |
7056 | /* Set background congestion control on a socket */ |
7057 | void |
7058 | tcp_set_background_cc(struct socket *so) |
7059 | { |
7060 | tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX); |
7061 | } |
7062 | |
7063 | /* Set foreground congestion control on a socket */ |
7064 | void |
7065 | tcp_set_foreground_cc(struct socket *so) |
7066 | { |
7067 | if (tcp_use_newreno) { |
7068 | tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX); |
7069 | #if (DEVELOPMENT || DEBUG) |
7070 | } else if (tcp_use_ledbat) { |
7071 | /* Only used for testing */ |
7072 | tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX); |
7073 | #endif |
7074 | } else { |
7075 | tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX); |
7076 | } |
7077 | } |
7078 | |
7079 | static void |
7080 | tcp_set_new_cc(struct socket *so, uint8_t cc_index) |
7081 | { |
7082 | struct inpcb *inp = sotoinpcb(so); |
7083 | struct tcpcb *tp = intotcpcb(inp); |
7084 | |
7085 | if (tp->tcp_cc_index != cc_index) { |
7086 | if (CC_ALGO(tp)->cleanup != NULL) { |
7087 | CC_ALGO(tp)->cleanup(tp); |
7088 | } |
7089 | tp->tcp_cc_index = cc_index; |
7090 | |
7091 | tcp_cc_allocate_state(tp); |
7092 | |
7093 | if (CC_ALGO(tp)->switch_to != NULL) { |
7094 | CC_ALGO(tp)->switch_to(tp); |
7095 | } |
7096 | |
7097 | tcp_ccdbg_trace(tp, NULL, TCP_CC_CHANGE_ALGO); |
7098 | } |
7099 | } |
7100 | |
7101 | void |
7102 | tcp_set_recv_bg(struct socket *so) |
7103 | { |
7104 | if (!IS_TCP_RECV_BG(so)) { |
7105 | so->so_flags1 |= SOF1_TRAFFIC_MGT_TCP_RECVBG; |
7106 | |
7107 | struct inpcb *inp = sotoinpcb(so); |
7108 | struct tcpcb *tp = intotcpcb(inp); |
7109 | |
7110 | if (TCP_RLEDBAT_ENABLED(tp) && tcp_cc_rledbat.switch_to != NULL) { |
7111 | tcp_cc_rledbat.switch_to(tp); |
7112 | } |
7113 | } |
7114 | } |
7115 | |
7116 | void |
7117 | tcp_clear_recv_bg(struct socket *so) |
7118 | { |
7119 | if (IS_TCP_RECV_BG(so)) { |
7120 | so->so_flags1 &= ~(SOF1_TRAFFIC_MGT_TCP_RECVBG); |
7121 | } |
7122 | } |
7123 | |
7124 | void |
7125 | inp_fc_throttle_tcp(struct inpcb *inp) |
7126 | { |
7127 | struct tcpcb *tp = inp->inp_ppcb; |
7128 | |
7129 | if (!tcp_flow_control_response) { |
7130 | return; |
7131 | } |
7132 | |
7133 | /* |
7134 | * Back off the slow-start threshold and enter |
7135 | * congestion avoidance phase |
7136 | */ |
7137 | if (CC_ALGO(tp)->pre_fr != NULL) { |
7138 | CC_ALGO(tp)->pre_fr(tp); |
7139 | } |
7140 | } |
7141 | |
7142 | void |
7143 | inp_fc_unthrottle_tcp(struct inpcb *inp) |
7144 | { |
7145 | struct tcpcb *tp = inp->inp_ppcb; |
7146 | |
7147 | if (tcp_flow_control_response) { |
7148 | if (CC_ALGO(tp)->post_fr != NULL) { |
7149 | CC_ALGO(tp)->post_fr(tp, NULL); |
7150 | } |
7151 | |
7152 | tp->t_bytes_acked = 0; |
7153 | |
7154 | /* |
7155 | * Reset retransmit shift as we know that the reason |
7156 | * for delay in sending a packet is due to flow |
7157 | * control on the outgoing interface. There is no need |
7158 | * to backoff retransmit timer. |
7159 | */ |
7160 | TCP_RESET_REXMT_STATE(tp); |
7161 | |
7162 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
7163 | |
7164 | /* |
7165 | * Start the output stream again. Since we are |
7166 | * not retransmitting data, do not reset the |
7167 | * retransmit timer or rtt calculation. |
7168 | */ |
7169 | tcp_output(tp); |
7170 | return; |
7171 | } |
7172 | |
7173 | /* |
7174 | * Back off the slow-start threshold and enter |
7175 | * congestion avoidance phase |
7176 | */ |
7177 | if (CC_ALGO(tp)->pre_fr != NULL) { |
7178 | CC_ALGO(tp)->pre_fr(tp); |
7179 | } |
7180 | |
7181 | tp->snd_cwnd = tp->snd_ssthresh; |
7182 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
7183 | /* |
7184 | * Restart counting for ABC as we changed the |
7185 | * congestion window just now. |
7186 | */ |
7187 | tp->t_bytes_acked = 0; |
7188 | |
7189 | /* Reset retransmit shift as we know that the reason |
7190 | * for delay in sending a packet is due to flow |
7191 | * control on the outgoing interface. There is no need |
7192 | * to backoff retransmit timer. |
7193 | */ |
7194 | TCP_RESET_REXMT_STATE(tp); |
7195 | |
7196 | /* |
7197 | * Start the output stream again. Since we are |
7198 | * not retransmitting data, do not reset the |
7199 | * retransmit timer or rtt calculation. |
7200 | */ |
7201 | tcp_output(tp); |
7202 | } |
7203 | |
7204 | static int |
7205 | tcp_getstat SYSCTL_HANDLER_ARGS |
7206 | { |
7207 | #pragma unused(oidp, arg1, arg2) |
7208 | |
7209 | int error; |
7210 | struct tcpstat *stat; |
7211 | stat = &tcpstat; |
7212 | #if XNU_TARGET_OS_OSX |
7213 | struct tcpstat zero_stat; |
7214 | |
7215 | if (tcp_disable_access_to_stats && |
7216 | !kauth_cred_issuser(kauth_cred_get())) { |
7217 | bzero(&zero_stat, sizeof(zero_stat)); |
7218 | stat = &zero_stat; |
7219 | } |
7220 | |
7221 | #endif /* XNU_TARGET_OS_OSX */ |
7222 | |
7223 | if (req->oldptr == 0) { |
7224 | req->oldlen = (size_t)sizeof(struct tcpstat); |
7225 | } |
7226 | |
7227 | error = SYSCTL_OUT(req, stat, MIN(sizeof(tcpstat), req->oldlen)); |
7228 | |
7229 | return error; |
7230 | } |
7231 | |
7232 | /* |
7233 | * Checksum extended TCP header and data. |
7234 | */ |
7235 | int |
7236 | tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen) |
7237 | { |
7238 | struct ifnet *ifp = m->m_pkthdr.rcvif; |
7239 | |
7240 | switch (af) { |
7241 | case AF_INET: { |
7242 | struct ip *ip = mtod(m, struct ip *); |
7243 | struct ipovly *ipov = (struct ipovly *)ip; |
7244 | |
7245 | /* ip_stripoptions() must have been called before we get here */ |
7246 | ASSERT((ip->ip_hl << 2) == sizeof(*ip)); |
7247 | |
7248 | if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) || |
7249 | (m->m_pkthdr.pkt_flags & PKTF_LOOP)) && |
7250 | (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) { |
7251 | if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { |
7252 | th->th_sum = m->m_pkthdr.csum_rx_val; |
7253 | } else { |
7254 | uint32_t sum = m->m_pkthdr.csum_rx_val; |
7255 | uint32_t start = m->m_pkthdr.csum_rx_start; |
7256 | int32_t trailer = (m_pktlen(m) - (off + tlen)); |
7257 | |
7258 | /* |
7259 | * Perform 1's complement adjustment of octets |
7260 | * that got included/excluded in the hardware- |
7261 | * calculated checksum value. Ignore cases |
7262 | * where the value already includes the entire |
7263 | * IP header span, as the sum for those octets |
7264 | * would already be 0 by the time we get here; |
7265 | * IP has already performed its header checksum |
7266 | * checks. If we do need to adjust, restore |
7267 | * the original fields in the IP header when |
7268 | * computing the adjustment value. Also take |
7269 | * care of any trailing bytes and subtract out |
7270 | * their partial sum. |
7271 | */ |
7272 | ASSERT(trailer >= 0); |
7273 | if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) && |
7274 | ((start != 0 && start != off) || trailer)) { |
7275 | uint32_t swbytes = (uint32_t)trailer; |
7276 | |
7277 | if (start < off) { |
7278 | ip->ip_len += sizeof(*ip); |
7279 | #if BYTE_ORDER != BIG_ENDIAN |
7280 | HTONS(ip->ip_len); |
7281 | HTONS(ip->ip_off); |
7282 | #endif /* BYTE_ORDER != BIG_ENDIAN */ |
7283 | } |
7284 | /* callee folds in sum */ |
7285 | sum = m_adj_sum16(m, start, off, |
7286 | tlen, sum); |
7287 | if (off > start) { |
7288 | swbytes += (off - start); |
7289 | } else { |
7290 | swbytes += (start - off); |
7291 | } |
7292 | |
7293 | if (start < off) { |
7294 | #if BYTE_ORDER != BIG_ENDIAN |
7295 | NTOHS(ip->ip_off); |
7296 | NTOHS(ip->ip_len); |
7297 | #endif /* BYTE_ORDER != BIG_ENDIAN */ |
7298 | ip->ip_len -= sizeof(*ip); |
7299 | } |
7300 | |
7301 | if (swbytes != 0) { |
7302 | tcp_in_cksum_stats(swbytes); |
7303 | } |
7304 | if (trailer != 0) { |
7305 | m_adj(m, -trailer); |
7306 | } |
7307 | } |
7308 | |
7309 | /* callee folds in sum */ |
7310 | th->th_sum = in_pseudo(ip->ip_src.s_addr, |
7311 | ip->ip_dst.s_addr, |
7312 | sum + htonl(tlen + IPPROTO_TCP)); |
7313 | } |
7314 | th->th_sum ^= 0xffff; |
7315 | } else { |
7316 | uint16_t ip_sum; |
7317 | int len; |
7318 | char b[9]; |
7319 | |
7320 | bcopy(ipov->ih_x1, b, sizeof(ipov->ih_x1)); |
7321 | bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); |
7322 | ip_sum = ipov->ih_len; |
7323 | ipov->ih_len = (u_short)tlen; |
7324 | #if BYTE_ORDER != BIG_ENDIAN |
7325 | HTONS(ipov->ih_len); |
7326 | #endif |
7327 | len = sizeof(struct ip) + tlen; |
7328 | th->th_sum = in_cksum(m, len); |
7329 | bcopy(b, ipov->ih_x1, sizeof(ipov->ih_x1)); |
7330 | ipov->ih_len = ip_sum; |
7331 | |
7332 | tcp_in_cksum_stats(len); |
7333 | } |
7334 | break; |
7335 | } |
7336 | case AF_INET6: { |
7337 | struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); |
7338 | |
7339 | if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) || |
7340 | (m->m_pkthdr.pkt_flags & PKTF_LOOP)) && |
7341 | (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) { |
7342 | if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) { |
7343 | th->th_sum = m->m_pkthdr.csum_rx_val; |
7344 | } else { |
7345 | uint32_t sum = m->m_pkthdr.csum_rx_val; |
7346 | uint32_t start = m->m_pkthdr.csum_rx_start; |
7347 | int32_t trailer = (m_pktlen(m) - (off + tlen)); |
7348 | |
7349 | /* |
7350 | * Perform 1's complement adjustment of octets |
7351 | * that got included/excluded in the hardware- |
7352 | * calculated checksum value. Also take care |
7353 | * of any trailing bytes and subtract out their |
7354 | * partial sum. |
7355 | */ |
7356 | ASSERT(trailer >= 0); |
7357 | if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) && |
7358 | (start != off || trailer != 0)) { |
7359 | uint16_t s = 0, d = 0; |
7360 | uint32_t swbytes = (uint32_t)trailer; |
7361 | |
7362 | if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) { |
7363 | s = ip6->ip6_src.s6_addr16[1]; |
7364 | ip6->ip6_src.s6_addr16[1] = 0; |
7365 | } |
7366 | if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) { |
7367 | d = ip6->ip6_dst.s6_addr16[1]; |
7368 | ip6->ip6_dst.s6_addr16[1] = 0; |
7369 | } |
7370 | |
7371 | /* callee folds in sum */ |
7372 | sum = m_adj_sum16(m, start, off, |
7373 | tlen, sum); |
7374 | if (off > start) { |
7375 | swbytes += (off - start); |
7376 | } else { |
7377 | swbytes += (start - off); |
7378 | } |
7379 | |
7380 | if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) { |
7381 | ip6->ip6_src.s6_addr16[1] = s; |
7382 | } |
7383 | if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) { |
7384 | ip6->ip6_dst.s6_addr16[1] = d; |
7385 | } |
7386 | |
7387 | if (swbytes != 0) { |
7388 | tcp_in6_cksum_stats(swbytes); |
7389 | } |
7390 | if (trailer != 0) { |
7391 | m_adj(m, -trailer); |
7392 | } |
7393 | } |
7394 | |
7395 | th->th_sum = in6_pseudo( |
7396 | &ip6->ip6_src, &ip6->ip6_dst, |
7397 | sum + htonl(tlen + IPPROTO_TCP)); |
7398 | } |
7399 | th->th_sum ^= 0xffff; |
7400 | } else { |
7401 | tcp_in6_cksum_stats(tlen); |
7402 | th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen); |
7403 | } |
7404 | break; |
7405 | } |
7406 | default: |
7407 | VERIFY(0); |
7408 | /* NOTREACHED */ |
7409 | } |
7410 | |
7411 | if (th->th_sum != 0) { |
7412 | tcpstat.tcps_rcvbadsum++; |
7413 | IF_TCP_STATINC(ifp, badformat); |
7414 | return -1; |
7415 | } |
7416 | |
7417 | return 0; |
7418 | } |
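
/*
 * Note on the convention used above (descriptive, not new behaviour):
 * both the hardware-assisted paths and the software in_cksum()/
 * in6_cksum() paths are arranged so that a packet with a correct TCP
 * checksum leaves th->th_sum equal to 0 -- the hardware value is
 * folded with the pseudo-header and then XORed with 0xffff -- which is
 * why the single "th->th_sum != 0" test at the end suffices.
 */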
7419 | |
7420 | #define DUMP_BUF_CHK() { \ |
7421 | clen -= k; \ |
7422 | if (clen < 1) \ |
7423 | goto done; \ |
7424 | c += k; \ |
7425 | } |
7426 | |
7427 | int |
7428 | dump_tcp_reass_qlen(char *str, int str_len) |
7429 | { |
7430 | char *c = str; |
7431 | int k, clen = str_len; |
7432 | |
7433 | if (tcp_reass_total_qlen != 0) { |
7434 | k = scnprintf(c, clen, "\ntcp reass qlen %d\n", tcp_reass_total_qlen); |
7435 | DUMP_BUF_CHK(); |
7436 | } |
7437 | |
7438 | done: |
7439 | return str_len - clen; |
7440 | } |
7441 | |
7442 | uint32_t |
7443 | tcp_reass_qlen_space(struct socket *so) |
7444 | { |
7445 | uint32_t space = 0; |
7446 | struct inpcb *inp = sotoinpcb(so); |
7447 | |
7448 | if (inp != NULL) { |
7449 | struct tcpcb *tp = intotcpcb(inp); |
7450 | |
7451 | if (tp != NULL) { |
7452 | space = tp->t_reassq_mbcnt; |
7453 | } |
7454 | } |
7455 | return space; |
7456 | } |
7457 | |
7458 | |
7459 | SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats, |
7460 | CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat, |
7461 | "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)"); |
7462 | |
7463 | static int |
7464 | sysctl_rexmtthresh SYSCTL_HANDLER_ARGS |
7465 | { |
7466 | #pragma unused(arg1, arg2) |
7467 | |
7468 | int error, val = tcprexmtthresh; |
7469 | |
7470 | error = sysctl_handle_int(oidp, &val, 0, req); |
7471 | if (error || !req->newptr) { |
7472 | return error; |
7473 | } |
7474 | |
7475 | /* |
7476 | * Constrain the number of duplicate ACKs |
7477 | * to consider for TCP fast retransmit |
7478 | * to either 2 or 3 |
7479 | */ |
7480 | |
7481 | if (val < 2 || val > 3) { |
7482 | return EINVAL; |
7483 | } |
7484 | |
7485 | tcprexmtthresh = (uint8_t)val; |
7486 | |
7487 | return 0; |
7488 | } |
7489 | |
7490 | SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT | CTLFLAG_RW | |
7491 | CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I", |
7492 | "Duplicate ACK Threshold for Fast Retransmit"); |
7493 | |