1/*
2 * Copyright (c) 2000-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_input.c,v 1.107.2.16 2001/08/22 00:59:12 silby Exp $
62 */
63/*
64 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
65 * support for mandatory and extensible security protections. This notice
66 * is included in support of clause 2.2 (b) of the Apple Public License,
67 * Version 2.0.
68 */
69
70#include "tcp_includes.h"
71
72#include <sys/param.h>
73#include <sys/systm.h>
74#include <sys/kernel.h>
75#include <sys/sysctl.h>
76#include <sys/malloc.h>
77#include <sys/mbuf.h>
78#include <sys/proc.h> /* for proc0 declaration */
79#include <sys/protosw.h>
80#include <sys/socket.h>
81#include <sys/socketvar.h>
82#include <sys/syslog.h>
83#include <sys/mcache.h>
84#include <sys/kauth.h>
85#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
86
87#include <machine/endian.h>
88
89#include <net/if.h>
90#include <net/if_types.h>
91#include <net/route.h>
92#include <net/ntstat.h>
93#include <net/content_filter.h>
94#include <net/dlil.h>
95#include <net/multi_layer_pkt_log.h>
96
97#include <netinet/in.h>
98#include <netinet/in_systm.h>
99#include <netinet/ip.h>
100#include <netinet/ip_icmp.h> /* for ICMP_BANDLIM */
101#include <netinet/in_var.h>
102#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
103#include <netinet/in_pcb.h>
104#include <netinet/ip_var.h>
105#include <mach/sdt.h>
106#include <netinet/ip6.h>
107#include <netinet/icmp6.h>
108#include <netinet6/nd6.h>
109#include <netinet6/ip6_var.h>
110#include <netinet6/in6_pcb.h>
111#include <netinet/tcp.h>
112#include <netinet/tcp_cache.h>
113#include <netinet/tcp_fsm.h>
114#include <netinet/tcp_seq.h>
115#include <netinet/tcp_timer.h>
116#include <netinet/tcp_var.h>
117#include <netinet/tcp_cc.h>
118#include <dev/random/randomdev.h>
119#include <kern/zalloc.h>
120#include <netinet6/tcp6_var.h>
121#include <netinet/tcpip.h>
122#if TCPDEBUG
123#include <netinet/tcp_debug.h>
u_char tcp_saveipgen[40]; /* must be the size of the largest IP header, currently IPv6 */
125struct tcphdr tcp_savetcp;
126#endif /* TCPDEBUG */
127#include <netinet/tcp_log.h>
128
129#if IPSEC
130#include <netinet6/ipsec.h>
131#include <netinet6/ipsec6.h>
132#include <netkey/key.h>
133#endif /*IPSEC*/
134
135#include <sys/kdebug.h>
136#if MPTCP
137#include <netinet/mptcp_var.h>
138#include <netinet/mptcp.h>
139#include <netinet/mptcp_opt.h>
140#endif /* MPTCP */
141
142#include <corecrypto/ccaes.h>
143#include <net/sockaddr_utils.h>
144
145#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETTCP, 0)
146#define DBG_LAYER_END NETDBG_CODE(DBG_NETTCP, 2)
147#define DBG_FNC_TCP_INPUT NETDBG_CODE(DBG_NETTCP, (3 << 8))
148#define DBG_FNC_TCP_NEWCONN NETDBG_CODE(DBG_NETTCP, (7 << 8))
149
150#define TCP_RTT_HISTORY_EXPIRE_TIME (60 * TCP_RETRANSHZ)
151#define TCP_RECV_THROTTLE_WIN (5 * TCP_RETRANSHZ)
152#define TCP_STRETCHACK_ENABLE_PKTCNT 2000
153
154struct tcpstat tcpstat;
155
156SYSCTL_SKMEM_TCP_INT(OID_AUTO, flow_control_response,
157 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_flow_control_response, 1,
158 "Improved response to Flow-control events");
159
160static int log_in_vain = 0;
161SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_in_vain,
162 CTLFLAG_RW | CTLFLAG_LOCKED, &log_in_vain, 0,
163 "Log all incoming TCP connections");
164
165SYSCTL_SKMEM_TCP_INT(OID_AUTO, ack_strategy,
166 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_ack_strategy, TCP_ACK_STRATEGY_MODERN,
167 "Revised TCP ACK-strategy, avoiding stretch-ACK implementation");
168
169static int blackhole = 0;
170SYSCTL_INT(_net_inet_tcp, OID_AUTO, blackhole,
171 CTLFLAG_RW | CTLFLAG_LOCKED, &blackhole, 0,
172 "Do not send RST when dropping refused connections");
173
174/* TODO - remove once uTCP stopped using it */
175SYSCTL_SKMEM_TCP_INT(OID_AUTO, aggressive_rcvwnd_inc,
176 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_aggressive_rcvwnd_inc, 1,
177 "Be more aggressive about increasing the receive-window.");
178
179SYSCTL_SKMEM_TCP_INT(OID_AUTO, delayed_ack,
180 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_delack_enabled, 3,
181 "Delay ACK to try and piggyback it onto a data packet");
182
183SYSCTL_SKMEM_TCP_INT(OID_AUTO, recvbg, CTLFLAG_RW | CTLFLAG_LOCKED,
184 int, tcp_recv_bg, 0, "Receive background");
185
186SYSCTL_SKMEM_TCP_INT(OID_AUTO, drop_synfin,
187 CTLFLAG_RW | CTLFLAG_LOCKED, static int, drop_synfin, 1,
188 "Drop TCP packets with SYN+FIN set");
189
190SYSCTL_NODE(_net_inet_tcp, OID_AUTO, reass, CTLFLAG_RW | CTLFLAG_LOCKED, 0,
191 "TCP Segment Reassembly Queue");
192
193static int tcp_reass_overflows = 0;
194SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, overflows,
195 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_overflows, 0,
196 "Global number of TCP segment reassembly queue overflows");
197
198int tcp_reass_total_qlen = 0;
199SYSCTL_INT(_net_inet_tcp_reass, OID_AUTO, qlen,
200 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_reass_total_qlen, 0,
201 "Total number of TCP segments in reassembly queues");
202
203
204SYSCTL_SKMEM_TCP_INT(OID_AUTO, slowlink_wsize, CTLFLAG_RW | CTLFLAG_LOCKED,
205 __private_extern__ int, slowlink_wsize, 8192,
206 "Maximum advertised window size for slowlink");
207
208SYSCTL_SKMEM_TCP_INT(OID_AUTO, maxseg_unacked,
209 CTLFLAG_RW | CTLFLAG_LOCKED, int, maxseg_unacked, 8,
210 "Maximum number of outstanding segments left unacked");
211
212SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3465, CTLFLAG_RW | CTLFLAG_LOCKED,
213 int, tcp_do_rfc3465, 1, "");
214
215SYSCTL_SKMEM_TCP_INT(OID_AUTO, rfc3465_lim2,
216 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_rfc3465_lim2, 1,
217 "Appropriate bytes counting w/ L=2*SMSS");
218
219int rtt_samples_per_slot = 20;
220
221int tcp_acc_iaj_high_thresh = ACC_IAJ_HIGH_THRESH;
222u_int32_t tcp_autorcvbuf_inc_shift = 3;
223SYSCTL_SKMEM_TCP_INT(OID_AUTO, recv_allowed_iaj,
224 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_allowed_iaj, ALLOWED_IAJ,
225 "Allowed inter-packet arrival jiter");
226
227SYSCTL_SKMEM_TCP_INT(OID_AUTO, doautorcvbuf,
228 CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_do_autorcvbuf, 1,
229 "Enable automatic socket buffer tuning");
230
231SYSCTL_SKMEM_TCP_INT(OID_AUTO, autotunereorder,
232 CTLFLAG_RW | CTLFLAG_LOCKED, u_int32_t, tcp_autotune_reorder, 1,
233 "Enable automatic socket buffer tuning even when reordering is present");
234
235SYSCTL_SKMEM_TCP_INT(OID_AUTO, autorcvbufmax,
236 CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_KERN, u_int32_t, tcp_autorcvbuf_max, 2 * 1024 * 1024,
237 "Maximum receive socket buffer size");
238
239int tcp_disable_access_to_stats = 1;
240SYSCTL_INT(_net_inet_tcp, OID_AUTO, disable_access_to_stats,
241 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_disable_access_to_stats, 0,
242 "Disable access to tcpstat");
243
244SYSCTL_SKMEM_TCP_INT(OID_AUTO, challengeack_limit,
245 CTLFLAG_RW | CTLFLAG_LOCKED, uint32_t, tcp_challengeack_limit, 10,
246 "Maximum number of challenge ACKs per connection per second");
247
248/* TO BE REMOVED */
249SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_rfc5961,
250 CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_do_rfc5961, 1,
251 "Enable/Disable full RFC 5961 compliance");
252
253SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_better_lr,
254 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_do_better_lr, 1,
255 "Improved TCP Loss Recovery");
256
257SYSCTL_SKMEM_TCP_INT(OID_AUTO, use_min_curr_rtt,
258 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_use_min_curr_rtt, 1,
259 "Use a min of k=4 RTT samples for congestion controllers");
260
261SYSCTL_SKMEM_TCP_INT(OID_AUTO, awdl_rtobase,
262 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_awdl_rtobase, 100,
263 "Initial RTO for AWDL interface");
264
265extern int tcp_acc_iaj_high;
266extern int tcp_acc_iaj_react_limit;
267extern int tcp_fin_timeout;
268
269uint8_t tcprexmtthresh = 3;
270
271u_int32_t tcp_now;
272struct timeval tcp_uptime; /* uptime when tcp_now was last updated */
273
/* Used to synchronize updates to tcp_now */
275static LCK_GRP_DECLARE(tcp_uptime_mtx_grp, "tcpuptime");
276LCK_SPIN_DECLARE(tcp_uptime_lock, &tcp_uptime_mtx_grp);
277
278struct inpcbhead tcb;
279#define tcb6 tcb /* for KAME src sync over BSD*'s */
280struct inpcbinfo tcbinfo;
281
282static void tcp_dooptions(struct tcpcb *, u_char *, int, struct tcphdr *,
283 struct tcpopt *);
284static void tcp_finalize_options(struct tcpcb *, struct tcpopt *, unsigned int);
285static void tcp_pulloutofband(struct socket *,
286 struct tcphdr *, struct mbuf *, int);
287static void tcp_xmit_timer(struct tcpcb *, int, u_int32_t, tcp_seq);
288static inline unsigned int tcp_maxmtu(struct rtentry *);
289static inline int tcp_stretch_ack_enable(struct tcpcb *tp, int thflags);
290static inline void tcp_adaptive_rwtimo_check(struct tcpcb *, int);
291
292#if TRAFFIC_MGT
293static inline void compute_iaj(struct tcpcb *tp);
294static inline void compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj);
295#endif /* TRAFFIC_MGT */
296
297static inline unsigned int tcp_maxmtu6(struct rtentry *);
298unsigned int get_maxmtu(struct rtentry *);
299
300static void tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sb,
301 struct tcpopt *to, uint32_t tlen);
302void tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sb);
303static void tcp_sbsnd_trim(struct sockbuf *sbsnd);
304static inline void tcp_sbrcv_tstmp_check(struct tcpcb *tp);
305static inline void tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sb,
306 u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max);
307static void tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th);
308static void tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to,
309 struct tcphdr *th);
310static void tcp_compute_rcv_rtt(struct tcpcb *tp, struct tcpopt *to,
311 struct tcphdr *th);
312static void tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th);
313static void tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th,
314 struct tcpopt *to);
315/*
316 * Constants used for resizing receive socket buffer
317 * when timestamps are not supported
318 */
319#define TCPTV_RCVNOTS_QUANTUM 100
320#define TCP_RCVNOTS_BYTELEVEL 204800
321
322/*
323 * Constants used for limiting early retransmits
324 * to 10 per minute.
325 */
326#define TCP_EARLY_REXMT_WIN (60 * TCP_RETRANSHZ) /* 60 seconds */
327#define TCP_EARLY_REXMT_LIMIT 10
328
329#define log_in_vain_log( a ) { log a; }
330
331int tcp_rcvunackwin = TCPTV_UNACKWIN;
332int tcp_maxrcvidle = TCPTV_MAXRCVIDLE;
333SYSCTL_SKMEM_TCP_INT(OID_AUTO, rcvsspktcnt, CTLFLAG_RW | CTLFLAG_LOCKED,
334 int, tcp_rcvsspktcnt, TCP_RCV_SS_PKTCOUNT, "packets to be seen before receiver stretches acks");
335
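/*
 * Delay the ACK only when the current congestion-control module provides a
 * delay_ack hook and that hook agrees for this segment.
 */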
336#define DELAY_ACK(tp, th) \
337 (CC_ALGO(tp)->delay_ack != NULL && CC_ALGO(tp)->delay_ack(tp, th))
338
339static int tcp_dropdropablreq(struct socket *head);
340static void tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th);
341static void update_base_rtt(struct tcpcb *tp, uint32_t rtt);
342void tcp_set_background_cc(struct socket *so);
343void tcp_set_foreground_cc(struct socket *so);
344static void tcp_set_new_cc(struct socket *so, uint8_t cc_index);
345static void tcp_bwmeas_check(struct tcpcb *tp);
346
347#if TRAFFIC_MGT
348void
349reset_acc_iaj(struct tcpcb *tp)
350{
351 tp->acc_iaj = 0;
352 CLEAR_IAJ_STATE(tp);
353}
354
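/*
 * Track the segment size used as the baseline for inter-arrival jitter
 * measurements: optionally reset it, then remember the largest segment seen
 * so far along with its arrival time, clearing the small-packet count.
 */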
355static inline void
356update_iaj_state(struct tcpcb *tp, int size, int rst_size)
357{
358 if (rst_size > 0) {
359 tp->iaj_size = 0;
360 }
361 if (tp->iaj_size == 0 || size >= tp->iaj_size) {
362 tp->iaj_size = size;
363 tp->iaj_rcv_ts = tcp_now;
364 tp->iaj_small_pkt = 0;
365 }
366}
367
/* For every 64-bit unsigned integer (v), this function will find the
369 * largest 32-bit integer n such that (n*n <= v). This takes at most 32 iterations
370 * irrespective of the value of v and does not involve multiplications.
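 * For example, isqrt(152) returns 12, because 12*12 = 144 <= 152 < 169 = 13*13.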
371 */
372static inline uint32_t
373isqrt(uint64_t val)
374{
375 uint32_t sqrt_cache[11] = {0, 1, 4, 9, 16, 25, 36, 49, 64, 81, 100};
	uint64_t temp, g = 0, b = 1ULL << 31, bshft = 31;
377 if (val <= 100) {
378 for (g = 0; g <= 10; ++g) {
379 if (sqrt_cache[g] > val) {
380 g--;
381 break;
382 } else if (sqrt_cache[g] == val) {
383 break;
384 }
385 }
386 } else {
387 do {
388 temp = (((g << 1) + b) << (bshft--));
389 if (val >= temp) {
390 g += b;
391 val -= temp;
392 }
393 b >>= 1;
394 } while (b > 0 && val > 0);
395 }
396 return (uint32_t)g;
397}
398
399static inline void
400compute_iaj_meat(struct tcpcb *tp, uint32_t cur_iaj)
401{
402 /* When accumulated IAJ reaches MAX_ACC_IAJ in milliseconds,
403 * throttle the receive window to a minimum of MIN_IAJ_WIN packets
404 */
405#define MAX_ACC_IAJ (tcp_acc_iaj_high_thresh + tcp_acc_iaj_react_limit)
406#define IAJ_DIV_SHIFT 4
407#define IAJ_ROUNDUP_CONST (1 << (IAJ_DIV_SHIFT - 1))
408
409 uint32_t allowed_iaj, acc_iaj = 0;
410
411 /* Using 64-bit storage for the inter-arrival jitter deviation,
412 * to avoid accidentally rolling over if the inter-arrival time exceeds 62 seconds.
413 */
414 int64_t mean, temp, cur_iaj_dev;
415
416 cur_iaj_dev = (cur_iaj - tp->avg_iaj);
417
418 /* Allow a jitter of "allowed_iaj" milliseconds. Some connections
419 * may have a constant jitter more than that. We detect this by
420 * using standard deviation.
421 */
422 allowed_iaj = tp->avg_iaj + tp->std_dev_iaj;
423 if (allowed_iaj < tcp_allowed_iaj) {
424 allowed_iaj = tcp_allowed_iaj;
425 }
426
	/* Initially when the connection starts, the sender's congestion
428 * window is small. During this period we avoid throttling a
429 * connection because we do not have a good starting point for
430 * allowed_iaj. IAJ_IGNORE_PKTCNT is used to quietly gloss over
431 * the first few packets.
432 */
433 if (tp->iaj_pktcnt > IAJ_IGNORE_PKTCNT) {
434 if (cur_iaj <= allowed_iaj) {
435 if (tp->acc_iaj >= 2) {
436 acc_iaj = tp->acc_iaj - 2;
437 } else {
438 acc_iaj = 0;
439 }
440 } else {
441 acc_iaj = tp->acc_iaj + (cur_iaj - allowed_iaj);
442 }
443
444 if (acc_iaj > MAX_ACC_IAJ) {
445 acc_iaj = MAX_ACC_IAJ;
446 }
447 tp->acc_iaj = acc_iaj;
448 }
449
	/* Compute a weighted average where the history has a weight of
	 * 15 out of 16 and the current value has a weight of 1 out of 16,
	 * which smooths out short-term variations in the measurements.
	 *
	 * The addition of IAJ_ROUNDUP_CONST (8) makes the result round
	 * up instead of down.
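	 *
	 * i.e., avg_iaj_new = (15 * avg_iaj_old + cur_iaj + 8) / 16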
456 */
457 tp->avg_iaj = (((tp->avg_iaj << IAJ_DIV_SHIFT) - tp->avg_iaj)
458 + cur_iaj + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
459
460 /* Compute Root-mean-square of deviation where mean is a weighted
461 * average as described above.
462 */
463 temp = tp->std_dev_iaj * tp->std_dev_iaj;
464 mean = (((temp << IAJ_DIV_SHIFT) - temp)
465 + (cur_iaj_dev * cur_iaj_dev)
466 + IAJ_ROUNDUP_CONST) >> IAJ_DIV_SHIFT;
467
	tp->std_dev_iaj = isqrt(mean);
469
470 DTRACE_TCP3(iaj, struct tcpcb *, tp, uint32_t, cur_iaj,
471 uint32_t, allowed_iaj);
472
473 return;
474}
475
476static inline void
477compute_iaj(struct tcpcb *tp)
478{
	compute_iaj_meat(tp, (tcp_now - tp->iaj_rcv_ts));
480}
481#endif /* TRAFFIC_MGT */
482
483/*
 * Perform a per-connection, per-second rate-limit check for challenge ACKs.
 * tp->t_challengeack_last is the time the current 1-second window started
 * tp->t_challengeack_count is the number of challenge ACKs sent within that window
 * Return TRUE if we shouldn't send the ACK due to rate limitation
 * Return FALSE if it is still ok to send a challenge ACK
489 */
490static boolean_t
491tcp_is_ack_ratelimited(struct tcpcb *tp)
492{
493 boolean_t ret = TRUE;
494 uint32_t now = tcp_now;
495 int32_t diff = 0;
496
	diff = timer_diff(now, 0, tp->t_challengeack_last, 0);
498 /* If it is first time or diff > 1000ms,
499 * update the challengeack_last and reset the
500 * current count of ACKs
501 */
502 if (tp->t_challengeack_last == 0 || diff >= 1000) {
503 tp->t_challengeack_last = now;
504 tp->t_challengeack_count = 0;
505 ret = FALSE;
506 } else if (tp->t_challengeack_count < tcp_challengeack_limit) {
507 ret = FALSE;
508 }
509
510 /* Careful about wrap-around */
511 if (ret == FALSE && (tp->t_challengeack_count + 1 > 0)) {
512 tp->t_challengeack_count++;
513 }
514
515 return ret;
516}
517
/* Check whether enough data has been acknowledged since the
 * bandwidth measurement was started
520 */
521static void
522tcp_bwmeas_check(struct tcpcb *tp)
523{
524 int32_t bw_meas_bytes;
525 uint32_t bw, bytes, elapsed_time;
526
527 if (SEQ_LEQ(tp->snd_una, tp->t_bwmeas->bw_start)) {
528 return;
529 }
530
531 bw_meas_bytes = tp->snd_una - tp->t_bwmeas->bw_start;
532 if ((tp->t_flagsext & TF_BWMEAS_INPROGRESS) &&
533 bw_meas_bytes >= (int32_t)(tp->t_bwmeas->bw_size)) {
534 bytes = bw_meas_bytes;
535 elapsed_time = tcp_now - tp->t_bwmeas->bw_ts;
536 if (elapsed_time > 0) {
537 bw = bytes / elapsed_time;
538 if (bw > 0) {
539 if (tp->t_bwmeas->bw_sndbw > 0) {
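					/* EWMA with gain 1/8: bw_sndbw = (7 * bw_sndbw_old + bw) / 8 */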
540 tp->t_bwmeas->bw_sndbw =
541 (((tp->t_bwmeas->bw_sndbw << 3)
542 - tp->t_bwmeas->bw_sndbw)
543 + bw) >> 3;
544 } else {
545 tp->t_bwmeas->bw_sndbw = bw;
546 }
547
548 /* Store the maximum value */
549 if (tp->t_bwmeas->bw_sndbw_max == 0) {
550 tp->t_bwmeas->bw_sndbw_max =
551 tp->t_bwmeas->bw_sndbw;
552 } else {
553 tp->t_bwmeas->bw_sndbw_max =
					    max(tp->t_bwmeas->bw_sndbw,
					    tp->t_bwmeas->bw_sndbw_max);
556 }
557 }
558 }
559 tp->t_flagsext &= ~(TF_BWMEAS_INPROGRESS);
560 }
561}
562
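/*
 * Insert the segment described by th/m into the per-connection reassembly
 * queue, trimming any overlap with segments already queued and recording
 * duplicate ranges for DSACK. When the data at the left edge of the window
 * becomes contiguous, it is handed to the socket layer and rcv_nxt is
 * advanced. Returns the TH_FIN flag of a delivered FIN, 0 otherwise.
 */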
563static int
564tcp_reass(struct tcpcb *tp, struct tcphdr *th, int *tlenp, struct mbuf *m,
565 struct ifnet *ifp, int *dowakeup)
566{
567 struct tseg_qent *q;
568 struct tseg_qent *p = NULL;
569 struct tseg_qent *nq;
570 struct tseg_qent *te = NULL;
571 struct inpcb *inp = tp->t_inpcb;
572 struct socket *so = inp->inp_socket;
573 int flags = 0;
574 uint32_t qlimit;
575 boolean_t cell = IFNET_IS_CELLULAR(ifp);
576 boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
577 boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
578 boolean_t dsack_set = FALSE;
579
580 /*
581 * If the reassembly queue already has entries or if we are going
582 * to add a new one, then the connection has reached a loss state.
583 * Reset the stretch-ack algorithm at this point.
584 */
585 tcp_reset_stretch_ack(tp);
586 tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
587
588#if TRAFFIC_MGT
589 if (tp->acc_iaj > 0) {
590 reset_acc_iaj(tp);
591 }
592#endif /* TRAFFIC_MGT */
593
594 if (th->th_seq != tp->rcv_nxt) {
595 struct mbuf *tmp = m;
596 while (tmp != NULL) {
			if (mbuf_class_under_pressure(tmp)) {
598 m_freem(m);
599 tcp_reass_overflows++;
600 tcpstat.tcps_rcvmemdrop++;
601 *tlenp = 0;
602 return 0;
603 }
604
605 tmp = tmp->m_next;
606 }
607 }
608
609 /*
610 * Limit the number of segments in the reassembly queue to prevent
611 * holding on to too many segments (and thus running out of mbufs).
612 * Make sure to let the missing segment through which caused this
613 * queue. Always keep one global queue entry spare to be able to
614 * process the missing segment.
615 */
	qlimit = min(max(100, so->so_rcv.sb_hiwat >> 10),
	    (tcp_autorcvbuf_max >> 10));
618 if (th->th_seq != tp->rcv_nxt &&
619 (tp->t_reassqlen + 1) >= qlimit) {
620 tcp_reass_overflows++;
621 tcpstat.tcps_rcvmemdrop++;
622 m_freem(m);
623 *tlenp = 0;
624 return 0;
625 }
626
627 /* Allocate a new queue entry. If we can't, just drop the pkt. XXX */
628 te = zalloc_flags(tcp_reass_zone, Z_WAITOK | Z_NOFAIL);
629 tp->t_reassqlen++;
630 OSIncrementAtomic(&tcp_reass_total_qlen);
631
632 /*
633 * Find a segment which begins after this one does.
634 */
635 LIST_FOREACH(q, &tp->t_segq, tqe_q) {
636 if (SEQ_GT(q->tqe_th->th_seq, th->th_seq)) {
637 break;
638 }
639 p = q;
640 }
641
642 /*
643 * If there is a preceding segment, it may provide some of
644 * our data already. If so, drop the data from the incoming
645 * segment. If it provides all of our data, drop us.
646 */
647 if (p != NULL) {
648 int i;
649 /* conversion to int (in i) handles seq wraparound */
650 i = p->tqe_th->th_seq + p->tqe_len - th->th_seq;
651 if (i > 0) {
652 if (i > 1) {
653 /*
				 * Note duplicate data sequence numbers
655 * to report in DSACK option
656 */
657 tp->t_dsack_lseq = th->th_seq;
658 tp->t_dsack_rseq = th->th_seq +
				    min(i, *tlenp);
660
661 /*
662 * Report only the first part of partial/
663 * non-contiguous duplicate sequence space
664 */
665 dsack_set = TRUE;
666 }
667 if (i >= *tlenp) {
668 tcpstat.tcps_rcvduppack++;
669 tcpstat.tcps_rcvdupbyte += *tlenp;
670 if (nstat_collect) {
				nstat_route_rx(inp->inp_route.ro_rt,
				    1, *tlenp,
				    NSTAT_RX_FLAG_DUPLICATE);
674 INP_ADD_STAT(inp, cell, wifi, wired,
675 rxpackets, 1);
676 INP_ADD_STAT(inp, cell, wifi, wired,
677 rxbytes, *tlenp);
678 tp->t_stat.rxduplicatebytes += *tlenp;
679 inp_set_activity_bitmap(inp);
680 }
681 m_freem(m);
682 zfree(tcp_reass_zone, te);
683 te = NULL;
684 tp->t_reassqlen--;
685 OSDecrementAtomic(&tcp_reass_total_qlen);
686 /*
687 * Try to present any queued data
688 * at the left window edge to the user.
689 * This is needed after the 3-WHS
690 * completes.
691 */
692 goto present;
693 }
694 m_adj(m, i);
695 *tlenp -= i;
696 th->th_seq += i;
697 }
698 }
699
700 if (th->th_seq != tp->rcv_nxt) {
701 tp->t_rcvoopack++;
702 tcpstat.tcps_rcvoopack++;
703 tcpstat.tcps_rcvoobyte += *tlenp;
704 if (nstat_collect) {
705 tp->t_stat.rxoutoforderbytes += *tlenp;
706 }
707 }
708
709 if (nstat_collect) {
		nstat_route_rx(inp->inp_route.ro_rt, 1, *tlenp,
		    NSTAT_RX_FLAG_OUT_OF_ORDER);
712 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
713 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, *tlenp);
714 inp_set_activity_bitmap(inp);
715 }
716
717 /*
718 * While we overlap succeeding segments trim them or,
719 * if they are completely covered, dequeue them.
720 */
721 while (q) {
722 int i = (th->th_seq + *tlenp) - q->tqe_th->th_seq;
723 if (i <= 0) {
724 break;
725 }
726
727 /*
728 * Report only the first part of partial/non-contiguous
729 * duplicate segment in dsack option. The variable
730 * dsack_set will be true if a previous entry has some of
731 * the duplicate sequence space.
732 */
733 if (i > 1 && !dsack_set) {
734 if (tp->t_dsack_lseq == 0) {
735 tp->t_dsack_lseq = q->tqe_th->th_seq;
736 tp->t_dsack_rseq =
				    tp->t_dsack_lseq + min(i, q->tqe_len);
738 } else {
739 /*
				 * this segment overlaps data in multiple
741 * entries in the reassembly queue, move
742 * the right sequence number further.
743 */
744 tp->t_dsack_rseq =
				    tp->t_dsack_rseq + min(i, q->tqe_len);
746 }
747 }
748 if (i < q->tqe_len) {
749 q->tqe_th->th_seq += i;
750 q->tqe_len -= i;
751 m_adj(q->tqe_m, i);
752 break;
753 }
754
755 nq = LIST_NEXT(q, tqe_q);
756 LIST_REMOVE(q, tqe_q);
757 tp->t_reassq_mbcnt -= _MSIZE + (q->tqe_m->m_flags & M_EXT) ?
758 q->tqe_m->m_ext.ext_size : 0;
759 m_freem(q->tqe_m);
760 zfree(tcp_reass_zone, q);
761 tp->t_reassqlen--;
762 OSDecrementAtomic(&tcp_reass_total_qlen);
763 q = nq;
764 }
765
766 /* Insert the new segment queue entry into place. */
767 te->tqe_m = m;
768 te->tqe_th = th;
769 te->tqe_len = *tlenp;
770
771 tp->t_reassq_mbcnt += _MSIZE + (m->m_flags & M_EXT) ? m->m_ext.ext_size : 0;
772
773 if (p == NULL) {
774 LIST_INSERT_HEAD(&tp->t_segq, te, tqe_q);
775 } else {
776 LIST_INSERT_AFTER(p, te, tqe_q);
777 }
778
779present:
780 /*
781 * Present data to user, advancing rcv_nxt through
782 * completed sequence space.
783 */
784 if (!TCPS_HAVEESTABLISHED(tp->t_state)) {
785 return 0;
786 }
787 q = LIST_FIRST(&tp->t_segq);
788 if (!q || q->tqe_th->th_seq != tp->rcv_nxt) {
789 return 0;
790 }
791
792 /*
793 * If there is already another thread doing reassembly for this
794 * connection, it is better to let it finish the job --
795 * (radar 16316196)
796 */
797 if (tp->t_flagsext & TF_REASS_INPROG) {
798 return 0;
799 }
800
801 tp->t_flagsext |= TF_REASS_INPROG;
802 /* lost packet was recovered, so ooo data can be returned */
803 tcpstat.tcps_recovered_pkts++;
804
805 do {
806 tp->rcv_nxt += q->tqe_len;
807 flags = q->tqe_th->th_flags & TH_FIN;
808 LIST_REMOVE(q, tqe_q);
809 tp->t_reassq_mbcnt -= _MSIZE + (q->tqe_m->m_flags & M_EXT) ?
810 q->tqe_m->m_ext.ext_size : 0;
811 if (so->so_state & SS_CANTRCVMORE) {
812 m_freem(q->tqe_m);
813 } else {
814 so_recv_data_stat(so, q->tqe_m, 0); /* XXXX */
815 if (q->tqe_th->th_flags & TH_PUSH) {
816 tp->t_flagsext |= TF_LAST_IS_PSH;
817 } else {
818 tp->t_flagsext &= ~TF_LAST_IS_PSH;
819 }
820
			if (sbappendstream_rcvdemux(so, q->tqe_m)) {
822 *dowakeup = 1;
823 }
824 }
825 zfree(tcp_reass_zone, q);
826 tp->t_reassqlen--;
827 OSDecrementAtomic(&tcp_reass_total_qlen);
828 q = LIST_FIRST(&tp->t_segq);
829 } while (q && q->tqe_th->th_seq == tp->rcv_nxt);
830 tp->t_flagsext &= ~TF_REASS_INPROG;
831
832 if ((inp->inp_vflag & INP_IPV6) != 0) {
833 KERNEL_DEBUG(DBG_LAYER_BEG,
834 ((inp->inp_fport << 16) | inp->inp_lport),
835 (((inp->in6p_laddr.s6_addr16[0] & 0xffff) << 16) |
836 (inp->in6p_faddr.s6_addr16[0] & 0xffff)),
837 0, 0, 0);
838 } else {
839 KERNEL_DEBUG(DBG_LAYER_BEG,
840 ((inp->inp_fport << 16) | inp->inp_lport),
841 (((inp->inp_laddr.s_addr & 0xffff) << 16) |
842 (inp->inp_faddr.s_addr & 0xffff)),
843 0, 0, 0);
844 }
845
846 return flags;
847}
848
849/*
850 * Reduce congestion window -- used when ECN is seen or when a tail loss
851 * probe recovers the last packet.
852 */
853static void
854tcp_reduce_congestion_window(struct tcpcb *tp)
855{
856 /*
857 * If the current tcp cc module has
858 * defined a hook for tasks to run
859 * before entering FR, call it
860 */
861 if (CC_ALGO(tp)->pre_fr != NULL) {
862 CC_ALGO(tp)->pre_fr(tp);
863 }
864 ENTER_FASTRECOVERY(tp);
865 if (tp->t_flags & TF_SENTFIN) {
866 tp->snd_recover = tp->snd_max - 1;
867 } else {
868 tp->snd_recover = tp->snd_max;
869 }
870 tp->t_timer[TCPT_REXMT] = 0;
871 tp->t_timer[TCPT_PTO] = 0;
872 tp->t_rtttime = 0;
873 if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
874 tcp_cc_adjust_nonvalidated_cwnd(tp);
875 } else {
876 tp->snd_cwnd = tp->snd_ssthresh +
877 tp->t_maxseg * tcprexmtthresh;
878 }
879}
880
881/*
 * This function is called upon reception of data on a socket. Its purpose is
883 * to handle the adaptive keepalive timers that monitor whether the connection
884 * is making progress. First the adaptive read-timer, second the TFO probe-timer.
885 *
886 * The application wants to get an event if there is a stall during read.
887 * Set the initial keepalive timeout to be equal to twice RTO.
888 *
889 * If the outgoing interface is in marginal conditions, we need to
890 * enable read probes for that too.
891 */
892static inline void
893tcp_adaptive_rwtimo_check(struct tcpcb *tp, int tlen)
894{
895 struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
896
897 if ((tp->t_adaptive_rtimo > 0 ||
898 (outifp != NULL &&
899 (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY)))
900 && tlen > 0 &&
901 tp->t_state == TCPS_ESTABLISHED) {
902 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
903 (TCP_REXMTVAL(tp) << 1));
904 tp->t_flagsext |= TF_DETECT_READSTALL;
905 tp->t_rtimo_probes = 0;
906 }
907}
908
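/*
 * Re-arm the keepalive timer with the configured idle time and clear any
 * adaptive read-stall detection state.
 */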
909inline void
910tcp_keepalive_reset(struct tcpcb *tp)
911{
912 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
913 TCP_CONN_KEEPIDLE(tp));
914 tp->t_flagsext &= ~(TF_DETECT_READSTALL);
915 tp->t_rtimo_probes = 0;
916}
917
918void
919tcp_set_finwait_timeout(struct tcpcb *tp)
920{
921 /*
922 * Starting the TCPT_2MSL timer is contrary to the
923 * specification, but if we don't get a FIN
924 * we'll hang forever.
925 */
926 ASSERT(tp->t_state == TCPS_FIN_WAIT_2);
927 ASSERT((tp->t_inpcb->inp_socket->so_state & (SS_CANTRCVMORE)) == SS_CANTRCVMORE);
928
929 if (tcp_fin_timeout > 0 &&
930 tcp_fin_timeout < TCP_CONN_MAXIDLE(tp)) {
931 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, tcp_fin_timeout);
932 } else {
933 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp, TCP_CONN_MAXIDLE(tp));
934 }
935}
936
937/*
938 * TCP input routine, follows pages 65-76 of the
939 * protocol specification dated September, 1981 very closely.
940 */
941int
942tcp6_input(struct mbuf **mp, int *offp, int proto)
943{
944#pragma unused(proto)
945 struct mbuf *m = *mp;
946 uint32_t ia6_flags;
947 struct ifnet *ifp = m->m_pkthdr.rcvif;
948
949 IP6_EXTHDR_CHECK(m, *offp, sizeof(struct tcphdr), return IPPROTO_DONE);
950
951 /* Expect 32-bit aligned data pointer on strict-align platforms */
952 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
953
954 /*
955 * draft-itojun-ipv6-tcp-to-anycast
956 * better place to put this in?
957 */
958 if (ip6_getdstifaddr_info(m, NULL, &ia6_flags) == 0) {
959 if (ia6_flags & IN6_IFF_ANYCAST) {
960 struct ip6_hdr *ip6;
961
962 ip6 = mtod(m, struct ip6_hdr *);
963 icmp6_error(m, ICMP6_DST_UNREACH,
964 ICMP6_DST_UNREACH_ADDR,
965 (int)((caddr_t)&ip6->ip6_dst - (caddr_t)ip6));
966
967 IF_TCP_STATINC(ifp, icmp6unreach);
968
969 return IPPROTO_DONE;
970 }
971 }
972
973 tcp_input(m, *offp);
974 return IPPROTO_DONE;
975}
976
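/*
 * Grow the receive socket buffer to newsize, clamped by rcvbuf_max and by
 * the largest window the negotiated receive window scale can advertise, and
 * update the ideal size accordingly.
 */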
977static void
978tcp_sbrcv_reserve(struct tcpcb *tp, struct sockbuf *sbrcv,
979 u_int32_t newsize, u_int32_t idealsize, u_int32_t rcvbuf_max)
980{
981 /* newsize should not exceed max */
	newsize = min(newsize, rcvbuf_max);
983
984 /* The receive window scale negotiated at the
985 * beginning of the connection will also set a
986 * limit on the socket buffer size
987 */
	newsize = min(newsize, TCP_MAXWIN << tp->rcv_scale);
989
990 /* Set new socket buffer size */
991 if (newsize > sbrcv->sb_hiwat &&
	    (sbreserve(sbrcv, newsize) == 1)) {
		sbrcv->sb_idealsize = min(max(sbrcv->sb_idealsize,
		    (idealsize != 0) ? idealsize : newsize), rcvbuf_max);
995
996 /* Again check the limit set by the advertised
997 * window scale
998 */
		sbrcv->sb_idealsize = min(sbrcv->sb_idealsize,
1000 TCP_MAXWIN << tp->rcv_scale);
1001 }
1002}
1003
1004/*
1005 * This function is used to grow a receive socket buffer. It
1006 * will take into account system-level memory usage and the
1007 * bandwidth available on the link to make a decision.
1008 */
1009static void
1010tcp_sbrcv_grow(struct tcpcb *tp, struct sockbuf *sbrcv,
1011 struct tcpopt *to, uint32_t pktlen)
1012{
1013 struct socket *so = sbrcv->sb_so;
1014
1015 /*
1016 * Do not grow the receive socket buffer if
1017 * - auto resizing is disabled, globally or on this socket
1018 * - the high water mark already reached the maximum
1019 * - the stream is in background and receive side is being
1020 * throttled
1021 */
1022 if (tcp_do_autorcvbuf == 0 ||
1023 (sbrcv->sb_flags & SB_AUTOSIZE) == 0 ||
1024 sbrcv->sb_hiwat >= tcp_autorcvbuf_max ||
1025 (tp->t_flagsext & TF_RECV_THROTTLE) ||
1026 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) ||
1027 (!tcp_autotune_reorder && !LIST_EMPTY(&tp->t_segq))) {
1028 /* Can not resize the socket buffer, just return */
1029 goto out;
1030 }
1031
1032 if (!TSTMP_SUPPORTED(tp)) {
1033 /*
1034 * Timestamp option is not supported on this connection,
1035 * use receiver's RTT. Socket buffer grows based on the
1036 * BDP of the link.
1037 */
1038 if (TSTMP_GEQ(tcp_now,
1039 tp->rfbuf_ts + (tp->rcv_srtt >> TCP_RTT_SHIFT))) {
1040 tp->rfbuf_cnt += pktlen;
1041 if (tp->rfbuf_cnt > tp->rfbuf_space) {
1042 int32_t rcvbuf_inc;
1043 uint32_t idealsize;
1044
1045 /*
1046 * Increase receive-buffer aggressively if we
1047 * received more than 150% of what was received
1048 * in the previous round. Because, that means
1049 * the sender is in TCP slow-start and so
1050 * we need to give it more space to not be
1051 * limiting the sender with a small receive-window.
1052 */
1053 if (tp->rfbuf_cnt > tp->rfbuf_space + (tp->rfbuf_space >> 1)) {
1054 rcvbuf_inc = (tp->rfbuf_cnt << 2) - sbrcv->sb_hiwat;
1055 idealsize = (tp->rfbuf_cnt << 2);
1056 } else {
1057 rcvbuf_inc = (tp->rfbuf_cnt << 1) - sbrcv->sb_hiwat;
1058 idealsize = (tp->rfbuf_cnt << 1);
1059 }
1060
1061 if (rcvbuf_inc > 0) {
1062 rcvbuf_inc =
1063 (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
1064
1065 tcp_sbrcv_reserve(tp, sbrcv,
					    sbrcv->sb_hiwat + rcvbuf_inc,
					    idealsize, tcp_autorcvbuf_max);
1068
1069 tp->rfbuf_space = tp->rfbuf_cnt;
1070 }
1071 }
1072 goto out;
1073 } else {
1074 tp->rfbuf_cnt += pktlen;
1075 return;
1076 }
1077 } else if (to->to_tsecr != 0) {
1078 /*
1079 * If the timestamp shows that one RTT has
1080 * completed, we can stop counting the
1081 * bytes. Here we consider increasing
1082 * the socket buffer if the bandwidth measured in
1083 * last rtt, is more than half of sb_hiwat, this will
1084 * help to scale the buffer according to the bandwidth
1085 * on the link.
1086 */
1087 if (TSTMP_GEQ(to->to_tsecr, tp->rfbuf_ts)) {
1088 tp->rfbuf_cnt += pktlen;
1089
1090 if (tp->rfbuf_cnt > tp->rfbuf_space) {
1091 int32_t rcvbuf_inc;
1092 uint32_t idealsize;
1093
1094 if (tp->rfbuf_cnt > tp->rfbuf_space + (tp->rfbuf_space >> 1)) {
1095 rcvbuf_inc = (tp->rfbuf_cnt << 2) - sbrcv->sb_hiwat;
1096 idealsize = (tp->rfbuf_cnt << 2);
1097 } else {
1098 rcvbuf_inc = (tp->rfbuf_cnt << 1) - sbrcv->sb_hiwat;
1099 idealsize = (tp->rfbuf_cnt << 1);
1100 }
1101
1102 tp->rfbuf_space = tp->rfbuf_cnt;
1103
1104 if (rcvbuf_inc > 0) {
1105 rcvbuf_inc =
1106 (rcvbuf_inc / tp->t_maxseg) * tp->t_maxseg;
1107
1108 tcp_sbrcv_reserve(tp, sbrcv,
					    sbrcv->sb_hiwat + rcvbuf_inc,
					    idealsize, tcp_autorcvbuf_max);
1111 }
1112 }
1113 /* Measure instantaneous receive bandwidth */
1114 if (tp->t_bwmeas != NULL && tp->rfbuf_cnt > 0 &&
1115 TSTMP_GT(tcp_now, tp->rfbuf_ts)) {
1116 u_int32_t rcv_bw;
1117 rcv_bw = tp->rfbuf_cnt /
1118 (int)(tcp_now - tp->rfbuf_ts);
1119 if (tp->t_bwmeas->bw_rcvbw_max == 0) {
1120 tp->t_bwmeas->bw_rcvbw_max = rcv_bw;
1121 } else {
1122 tp->t_bwmeas->bw_rcvbw_max = max(
					tp->t_bwmeas->bw_rcvbw_max, rcv_bw);
1124 }
1125 }
1126 goto out;
1127 } else {
1128 tp->rfbuf_cnt += pktlen;
1129 return;
1130 }
1131 }
1132out:
1133 /* Restart the measurement */
1134 tp->rfbuf_ts = tcp_now;
1135 tp->rfbuf_cnt = 0;
1136 return;
1137}
1138
1139/* This function will trim the excess space added to the socket buffer
1140 * to help a slow-reading app. The ideal-size of a socket buffer depends
1141 * on the link bandwidth or it is set by an application and we aim to
1142 * reach that size.
1143 */
1144void
1145tcp_sbrcv_trim(struct tcpcb *tp, struct sockbuf *sbrcv)
1146{
1147 if (tcp_do_autorcvbuf == 1 && sbrcv->sb_idealsize > 0 &&
1148 sbrcv->sb_hiwat > sbrcv->sb_idealsize) {
1149 int32_t trim;
1150 /* compute the difference between ideal and current sizes */
1151 u_int32_t diff = sbrcv->sb_hiwat - sbrcv->sb_idealsize;
1152
1153 /* Compute the maximum advertised window for
1154 * this connection.
1155 */
1156 u_int32_t advwin = tp->rcv_adv - tp->rcv_nxt;
1157
1158 /* How much can we trim the receive socket buffer?
1159 * 1. it can not be trimmed beyond the max rcv win advertised
1160 * 2. if possible, leave 1/16 of bandwidth*delay to
1161 * avoid closing the win completely
1162 */
		u_int32_t leave = max(advwin, (sbrcv->sb_idealsize >> 4));
1164
1165 /* Sometimes leave can be zero, in that case leave at least
1166 * a few segments worth of space.
1167 */
1168 if (leave == 0) {
1169 leave = tp->t_maxseg << tcp_autorcvbuf_inc_shift;
1170 }
1171
1172 trim = sbrcv->sb_hiwat - (sbrcv->sb_cc + leave);
		trim = imin(trim, (int32_t)diff);
1174
1175 if (trim > 0) {
			sbreserve(sbrcv, (sbrcv->sb_hiwat - trim));
1177 }
1178 }
1179}
1180
1181/* We may need to trim the send socket buffer size for two reasons:
1182 * 1. if the rtt seen on the connection is climbing up, we do not
1183 * want to fill the buffers any more.
1184 * 2. if the congestion win on the socket backed off, there is no need
1185 * to hold more mbufs for that connection than what the cwnd will allow.
1186 */
1187void
1188tcp_sbsnd_trim(struct sockbuf *sbsnd)
1189{
1190 if (((sbsnd->sb_flags & (SB_AUTOSIZE | SB_TRIM)) ==
1191 (SB_AUTOSIZE | SB_TRIM)) &&
1192 (sbsnd->sb_idealsize > 0) &&
1193 (sbsnd->sb_hiwat > sbsnd->sb_idealsize)) {
1194 u_int32_t trim = 0;
1195 if (sbsnd->sb_cc <= sbsnd->sb_idealsize) {
1196 trim = sbsnd->sb_hiwat - sbsnd->sb_idealsize;
1197 } else {
1198 trim = sbsnd->sb_hiwat - sbsnd->sb_cc;
1199 }
		sbreserve(sbsnd, (sbsnd->sb_hiwat - trim));
1201 }
1202 if (sbsnd->sb_hiwat <= sbsnd->sb_idealsize) {
1203 sbsnd->sb_flags &= ~(SB_TRIM);
1204 }
1205}
1206
1207/*
1208 * If timestamp option was not negotiated on this connection
1209 * and this connection is on the receiving side of a stream
 * then we cannot measure the delay on the link accurately.
1211 * Instead of enabling automatic receive socket buffer
1212 * resizing, just give more space to the receive socket buffer.
1213 */
1214static inline void
1215tcp_sbrcv_tstmp_check(struct tcpcb *tp)
1216{
1217 struct socket *so = tp->t_inpcb->inp_socket;
1218 u_int32_t newsize = 2 * tcp_recvspace;
1219 struct sockbuf *sbrcv = &so->so_rcv;
1220
1221 if ((tp->t_flags & (TF_REQ_TSTMP | TF_RCVD_TSTMP)) !=
1222 (TF_REQ_TSTMP | TF_RCVD_TSTMP) &&
1223 (sbrcv->sb_flags & SB_AUTOSIZE) != 0) {
		tcp_sbrcv_reserve(tp, sbrcv, newsize, 0, newsize);
1225 }
1226}
1227
1228/* A receiver will evaluate the flow of packets on a connection
1229 * to see if it can reduce ack traffic. The receiver will start
1230 * stretching acks if all of the following conditions are met:
1231 * 1. tcp_delack_enabled is set to 3
1232 * 2. If the bytes received in the last 100ms is greater than a threshold
1233 * defined by maxseg_unacked
1234 * 3. If the connection has not been idle for tcp_maxrcvidle period.
1235 * 4. If the connection has seen enough packets to let the slow-start
1236 * finish after connection establishment or after some packet loss.
1237 *
1238 * The receiver will stop stretching acks if there is congestion/reordering
1239 * as indicated by packets on reassembly queue or an ECN. If the delayed-ack
1240 * timer fires while stretching acks, it means that the packet flow has gone
1241 * below the threshold defined by maxseg_unacked and the receiver will stop
1242 * stretching acks. The receiver gets no indication when slow-start is completed
1243 * or when the connection reaches an idle state. That is why we use
1244 * tcp_rcvsspktcnt to cover slow-start and tcp_maxrcvidle to identify idle
1245 * state.
1246 */
1247static inline int
1248tcp_stretch_ack_enable(struct tcpcb *tp, int thflags)
1249{
1250 if (tp->rcv_by_unackwin >= (maxseg_unacked * tp->t_maxseg) &&
1251 TSTMP_GEQ(tp->rcv_unackwin, tcp_now)) {
1252 tp->t_flags |= TF_STREAMING_ON;
1253 } else {
1254 tp->t_flags &= ~TF_STREAMING_ON;
1255 }
1256
1257 /* If there has been an idle time, reset streaming detection */
1258 if (TSTMP_GT(tcp_now, tp->rcv_unackwin + tcp_maxrcvidle)) {
1259 tp->t_flags &= ~TF_STREAMING_ON;
1260 }
1261
1262 /*
1263 * If there are flags other than TH_ACK set, reset streaming
1264 * detection
1265 */
1266 if (thflags & ~TH_ACK) {
1267 tp->t_flags &= ~TF_STREAMING_ON;
1268 }
1269
1270 if (tp->t_flagsext & TF_DISABLE_STRETCHACK) {
1271 if (tp->rcv_nostrack_pkts >= TCP_STRETCHACK_ENABLE_PKTCNT) {
1272 tp->t_flagsext &= ~TF_DISABLE_STRETCHACK;
1273 tp->rcv_nostrack_pkts = 0;
1274 tp->rcv_nostrack_ts = 0;
1275 } else {
1276 tp->rcv_nostrack_pkts++;
1277 }
1278 }
1279
1280 if (!(tp->t_flagsext & (TF_NOSTRETCHACK | TF_DISABLE_STRETCHACK)) &&
1281 (tp->t_flags & TF_STREAMING_ON) &&
1282 (!(tp->t_flagsext & TF_RCVUNACK_WAITSS) ||
1283 (tp->rcv_waitforss >= tcp_rcvsspktcnt))) {
1284 return 1;
1285 }
1286
1287 return 0;
1288}
1289
1290/*
1291 * Reset the state related to stretch-ack algorithm. This will make
1292 * the receiver generate an ack every other packet. The receiver
1293 * will start re-evaluating the rate at which packets come to decide
1294 * if it can benefit by lowering the ack traffic.
1295 */
1296void
1297tcp_reset_stretch_ack(struct tcpcb *tp)
1298{
1299 tp->t_flags &= ~(TF_STRETCHACK | TF_STREAMING_ON);
1300 tp->rcv_by_unackwin = 0;
1301 tp->rcv_by_unackhalfwin = 0;
1302 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
1303
1304 /*
1305 * When there is packet loss or packet re-ordering or CWR due to
1306 * ECN, the sender's congestion window is reduced. In these states,
1307 * generate an ack for every other packet for some time to allow
1308 * the sender's congestion window to grow.
1309 */
1310 tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1311 tp->rcv_waitforss = 0;
1312}
1313
1314/*
1315 * The last packet was a retransmission, check if this ack
1316 * indicates that the retransmission was spurious.
1317 *
1318 * If the connection supports timestamps, we could use it to
1319 * detect if the last retransmit was not needed. Otherwise,
 * if we see that the ACK arrived within an RTT/2 window, the
 * retransmit was a mistake in the first place.
1322 *
1323 * This function will return 1 if it is a spurious retransmit,
1324 * 0 otherwise.
1325 */
1326int
1327tcp_detect_bad_rexmt(struct tcpcb *tp, struct tcphdr *th,
1328 struct tcpopt *to, u_int32_t rxtime)
1329{
1330 int32_t tdiff, bad_rexmt_win;
1331 bad_rexmt_win = (tp->t_srtt >> (TCP_RTT_SHIFT + 1));
1332
1333 /* If the ack has ECN CE bit, then cwnd has to be adjusted */
1334 if ((TCP_ACC_ECN_ON(tp) && tp->t_delta_ce_packets > 0) ||
1335 (TCP_ECN_ENABLED(tp) && (th->th_flags & TH_ECE))) {
1336 return 0;
1337 }
1338 if (TSTMP_SUPPORTED(tp)) {
1339 if (rxtime > 0 && (to->to_flags & TOF_TS) && to->to_tsecr != 0 &&
1340 TSTMP_LT(to->to_tsecr, rxtime)) {
1341 return 1;
1342 }
1343 } else {
1344 if ((tp->t_rxtshift == 1 || (tp->t_flagsext & TF_SENT_TLPROBE)) &&
1345 rxtime > 0) {
1346 tdiff = (int32_t)(tcp_now - rxtime);
1347 if (tdiff < bad_rexmt_win) {
1348 return 1;
1349 }
1350 }
1351 }
1352 return 0;
1353}
1354
1355
1356/*
1357 * Restore congestion window state if a spurious timeout
1358 * was detected.
1359 */
1360static void
1361tcp_bad_rexmt_restore_state(struct tcpcb *tp, struct tcphdr *th)
1362{
1363 if (TSTMP_SUPPORTED(tp)) {
1364 u_int32_t fsize, acked;
1365 fsize = tp->snd_max - th->th_ack;
1366 acked = BYTES_ACKED(th, tp);
1367
1368 /*
1369 * Implement bad retransmit recovery as
1370 * described in RFC 4015.
1371 */
1372 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1373
1374 /* Initialize cwnd to the initial window */
1375 if (CC_ALGO(tp)->cwnd_init != NULL) {
1376 CC_ALGO(tp)->cwnd_init(tp);
1377 }
1378
		tp->snd_cwnd = fsize + min(acked, tp->snd_cwnd);
1380 } else {
1381 tp->snd_cwnd = tp->snd_cwnd_prev;
1382 tp->snd_ssthresh = tp->snd_ssthresh_prev;
1383 if (tp->t_flags & TF_WASFRECOVERY) {
1384 ENTER_FASTRECOVERY(tp);
1385 }
1386
1387 /* Do not use the loss flight size in this case */
1388 tp->t_lossflightsize = 0;
1389 }
	tp->snd_cwnd = max(tp->snd_cwnd, tcp_initial_cwnd(tp));
1391 tp->snd_recover = tp->snd_recover_prev;
1392 tp->snd_nxt = tp->snd_max;
1393
1394 /* Fix send socket buffer to reflect the change in cwnd */
1395 tcp_bad_rexmt_fix_sndbuf(tp);
1396
1397 /*
1398 * This RTT might reflect the extra delay induced
1399 * by the network. Skip using this sample for RTO
1400 * calculation and mark the connection so we can
1401 * recompute RTT when the next eligible sample is
1402 * found.
1403 */
1404 tp->t_flagsext |= TF_RECOMPUTE_RTT;
1405 tp->t_badrexmt_time = tcp_now;
1406 tp->t_rtttime = 0;
1407}
1408
1409/*
1410 * If the previous packet was sent in retransmission timer, and it was
1411 * not needed, then restore the congestion window to the state before that
1412 * transmission.
1413 *
1414 * If the last packet was sent in tail loss probe timeout, check if that
1415 * recovered the last packet. If so, that will indicate a real loss and
1416 * the congestion window needs to be lowered.
1417 */
1418static void
1419tcp_bad_rexmt_check(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
1420{
1421 if (tp->t_rxtshift > 0 &&
	    tcp_detect_bad_rexmt(tp, th, to, tp->t_rxtstart)) {
1423 ++tcpstat.tcps_sndrexmitbad;
1424 tcp_bad_rexmt_restore_state(tp, th);
		tcp_ccdbg_trace(tp, th, TCP_CC_BAD_REXMT_RECOVERY);
1426 } else if ((tp->t_flagsext & TF_SENT_TLPROBE) && tp->t_tlphighrxt > 0 &&
1427 SEQ_GEQ(th->th_ack, tp->t_tlphighrxt) &&
	    !tcp_detect_bad_rexmt(tp, th, to, tp->t_tlpstart)) {
1429 /*
1430 * The tail loss probe recovered the last packet and
1431 * we need to adjust the congestion window to take
1432 * this loss into account.
1433 */
1434 ++tcpstat.tcps_tlp_recoverlastpkt;
1435 if (!IN_FASTRECOVERY(tp)) {
1436 tcp_reduce_congestion_window(tp);
1437 EXIT_FASTRECOVERY(tp);
1438 }
		tcp_ccdbg_trace(tp, th, TCP_CC_TLP_RECOVER_LASTPACKET);
1440 } else if (tcp_rxtseg_detect_bad_rexmt(tp, th->th_ack)) {
1441 /*
1442 * All of the retransmitted segments were duplicated, this
1443 * can be an indication of bad fast retransmit.
1444 */
1445 tcpstat.tcps_dsack_badrexmt++;
1446 tcp_bad_rexmt_restore_state(tp, th);
		tcp_ccdbg_trace(tp, th, TCP_CC_DSACK_BAD_REXMT);
1448 tcp_rxtseg_clean(tp);
1449 }
1450 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1451 tp->t_tlphighrxt = 0;
1452 tp->t_tlpstart = 0;
1453
1454 /*
1455 * check if the latest ack was for a segment sent during PMTU
1456 * blackhole detection. If the timestamp on the ack is before
1457 * PMTU blackhole detection, then revert the size of the max
1458 * segment to previous size.
1459 */
1460 if (tp->t_rxtshift > 0 && (tp->t_flags & TF_BLACKHOLE) &&
1461 tp->t_pmtud_start_ts > 0 && TSTMP_SUPPORTED(tp)) {
1462 if ((to->to_flags & TOF_TS) && to->to_tsecr != 0
1463 && TSTMP_LT(to->to_tsecr, tp->t_pmtud_start_ts)) {
1464 tcp_pmtud_revert_segment_size(tp);
1465 }
1466 }
1467 if (tp->t_pmtud_start_ts > 0) {
1468 tp->t_pmtud_start_ts = 0;
1469 }
1470
1471 tp->t_pmtud_lastseg_size = 0;
1472}
1473
1474/*
1475 * Check if early retransmit can be attempted according to RFC 5827.
1476 *
1477 * If packet reordering is detected on a connection, fast recovery will
1478 * be delayed until it is clear that the packet was lost and not reordered.
1479 * But reordering detection is done only when SACK is enabled.
1480 *
1481 * On connections that do not support SACK, there is a limit on the number
1482 * of early retransmits that can be done per minute. This limit is needed
1483 * to make sure that too many packets are not retransmitted when there is
1484 * packet reordering.
1485 */
1486static void
1487tcp_early_rexmt_check(struct tcpcb *tp, struct tcphdr *th)
1488{
1489 u_int32_t obytes, snd_off;
1490 int32_t snd_len;
1491 struct socket *so = tp->t_inpcb->inp_socket;
1492
1493 if ((SACK_ENABLED(tp) || tp->t_early_rexmt_count < TCP_EARLY_REXMT_LIMIT) &&
1494 SEQ_GT(tp->snd_max, tp->snd_una) &&
1495 (tp->t_dupacks == 1 || (SACK_ENABLED(tp) && !TAILQ_EMPTY(&tp->snd_holes)))) {
1496 /*
1497 * If there are only a few outstanding
1498 * segments on the connection, we might need
1499 * to lower the retransmit threshold. This
1500 * will allow us to do Early Retransmit as
1501 * described in RFC 5827.
1502 */
1503 if (SACK_ENABLED(tp) &&
1504 !TAILQ_EMPTY(&tp->snd_holes)) {
1505 obytes = (tp->snd_max - tp->snd_fack) +
1506 tp->sackhint.sack_bytes_rexmit;
1507 } else {
1508 obytes = (tp->snd_max - tp->snd_una);
1509 }
1510
1511 /*
1512 * In order to lower retransmit threshold the
1513 * following two conditions must be met.
1514 * 1. the amount of outstanding data is less
1515 * than 4*SMSS bytes
1516 * 2. there is no unsent data ready for
1517 * transmission or the advertised window
1518 * will limit sending new segments.
1519 */
1520 snd_off = tp->snd_max - tp->snd_una;
		snd_len = min(so->so_snd.sb_cc, tp->snd_wnd) - snd_off;
1522 if (obytes < (tp->t_maxseg << 2) &&
1523 snd_len <= 0) {
1524 u_int32_t osegs;
1525
1526 osegs = obytes / tp->t_maxseg;
1527 if ((osegs * tp->t_maxseg) < obytes) {
1528 osegs++;
1529 }
1530
1531 /*
1532 * Since the connection might have already
1533 * received some dupacks, we add them to
1534 * to the outstanding segments count to get
1535 * the correct retransmit threshold.
1536 *
1537 * By checking for early retransmit after
1538 * receiving some duplicate acks when SACK
1539 * is supported, the connection will
1540 * enter fast recovery even if multiple
1541 * segments are lost in the same window.
1542 */
1543 osegs += tp->t_dupacks;
1544 if (osegs < 4) {
1545 tp->t_rexmtthresh =
1546 ((osegs - 1) > 1) ? ((uint8_t)osegs - 1) : 1;
1547 tp->t_rexmtthresh =
1548 MIN(tp->t_rexmtthresh, tcprexmtthresh);
1549 tp->t_rexmtthresh =
1550 MAX(tp->t_rexmtthresh,
1551 tp->t_dupacks > UINT8_MAX ? UINT8_MAX : (uint8_t)tp->t_dupacks);
1552
1553 if (tp->t_early_rexmt_count == 0) {
1554 tp->t_early_rexmt_win = tcp_now;
1555 }
1556
1557 if (tp->t_flagsext & TF_SENT_TLPROBE) {
1558 tcpstat.tcps_tlp_recovery++;
1559 tcp_ccdbg_trace(tp, th,
					    TCP_CC_TLP_RECOVERY);
1561 } else {
1562 tcpstat.tcps_early_rexmt++;
1563 tp->t_early_rexmt_count++;
1564 tcp_ccdbg_trace(tp, th,
					    TCP_CC_EARLY_RETRANSMIT);
1566 }
1567 }
1568 }
1569 }
1570
1571 /*
1572 * If we ever sent a TLP probe, the acknowledgement will trigger
1573 * early retransmit because the value of snd_fack will be close
1574 * to snd_max. This will take care of adjustments to the
 * congestion window. So we can reset the TF_SENT_TLPROBE flag.
1576 */
1577 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1578 tp->t_tlphighrxt = 0;
1579 tp->t_tlpstart = 0;
1580}
1581
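/*
 * Validate the TCP Fast Open option on an incoming SYN. Data on the SYN is
 * accepted (TRUE) only when a cookie is present, it matches the cookie we
 * generate for this peer and the half-open TFO backlog still has room. On a
 * plain cookie request or a cookie mismatch we arrange to offer a fresh
 * cookie in the SYN-ACK and return FALSE.
 */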
1582static boolean_t
1583tcp_tfo_syn(struct tcpcb *tp, struct tcpopt *to)
1584{
1585 u_char out[CCAES_BLOCK_SIZE];
1586 unsigned char len;
1587
1588 if (!(to->to_flags & (TOF_TFO | TOF_TFOREQ)) ||
1589 !(tcp_fastopen & TCP_FASTOPEN_SERVER)) {
1590 return FALSE;
1591 }
1592
1593 if ((to->to_flags & TOF_TFOREQ)) {
1594 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1595
1596 tp->t_tfo_stats |= TFO_S_COOKIEREQ_RECV;
1597 tcpstat.tcps_tfo_cookie_req_rcv++;
1598 return FALSE;
1599 }
1600
1601 /* Ok, then it must be an offered cookie. We need to check that ... */
	tcp_tfo_gen_cookie(tp->t_inpcb, out, sizeof(out));
1603
1604 len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1605 to->to_tfo++;
	if (memcmp(out, to->to_tfo, len)) {
1607 /* Cookies are different! Let's return and offer a new cookie */
1608 tp->t_tfo_flags |= TFO_F_OFFER_COOKIE;
1609
1610 tp->t_tfo_stats |= TFO_S_COOKIE_INVALID;
1611 tcpstat.tcps_tfo_cookie_invalid++;
1612 return FALSE;
1613 }
1614
1615 if (OSIncrementAtomic(&tcp_tfo_halfcnt) >= tcp_tfo_backlog) {
1616 /* Need to decrement again as we just increased it... */
1617 OSDecrementAtomic(&tcp_tfo_halfcnt);
1618 return FALSE;
1619 }
1620
1621 tp->t_tfo_flags |= TFO_F_COOKIE_VALID;
1622
1623 tp->t_tfo_stats |= TFO_S_SYNDATA_RCV;
1624 tcpstat.tcps_tfo_syn_data_rcv++;
1625
1626 return TRUE;
1627}
1628
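/*
 * Process the TFO-related contents of a received SYN-ACK: cache a returned
 * cookie and record success, or, when no cookie came back, update the TFO
 * heuristics (backing off if the SYN had to be retransmitted).
 */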
1629static void
1630tcp_tfo_synack(struct tcpcb *tp, struct tcpopt *to)
1631{
1632 if (to->to_flags & TOF_TFO) {
1633 unsigned char len = *to->to_tfo - TCPOLEN_FASTOPEN_REQ;
1634
1635 /*
1636 * If this happens, things have gone terribly wrong. len should
1637 * have been checked in tcp_dooptions.
1638 */
1639 VERIFY(len <= TFO_COOKIE_LEN_MAX);
1640
1641 to->to_tfo++;
1642
		tcp_cache_set_cookie(tp, to->to_tfo, len);
1644 tcp_heuristic_tfo_success(tp);
1645
1646 tp->t_tfo_stats |= TFO_S_COOKIE_RCV;
1647 tcpstat.tcps_tfo_cookie_rcv++;
1648 if (tp->t_tfo_flags & TFO_F_COOKIE_SENT) {
1649 tcpstat.tcps_tfo_cookie_wrong++;
1650 tp->t_tfo_stats |= TFO_S_COOKIE_WRONG;
1651 }
1652 } else {
1653 /*
1654 * Thus, no cookie in the response, but we either asked for one
1655 * or sent SYN+DATA. Now, we need to check whether we had to
1656 * rexmit the SYN. If that's the case, it's better to start
		 * backing off TFO-cookie requests.
1658 */
1659 if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
1660 tp->t_tfo_flags & TFO_F_SYN_LOSS) {
1661 tp->t_tfo_stats |= TFO_S_SYN_LOSS;
1662 tcpstat.tcps_tfo_syn_loss++;
1663
1664 tcp_heuristic_tfo_loss(tp);
1665 } else {
1666 if (tp->t_tfo_flags & TFO_F_COOKIE_REQ) {
1667 tp->t_tfo_stats |= TFO_S_NO_COOKIE_RCV;
1668 tcpstat.tcps_tfo_no_cookie_rcv++;
1669 }
1670
1671 tcp_heuristic_tfo_success(tp);
1672 }
1673 }
1674}
1675
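/*
 * Called with the length of data just received on a TFO connection; if
 * nothing arrived, enter the TFO probing state and arm the keepalive timer
 * so that a probe goes out after one RTO.
 */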
1676static void
1677tcp_tfo_rcv_probe(struct tcpcb *tp, int tlen)
1678{
1679 if (tlen != 0) {
1680 return;
1681 }
1682
1683 tp->t_tfo_probe_state = TFO_PROBE_PROBING;
1684
1685 /*
1686 * We send the probe out rather quickly (after one RTO). It does not
1687 * really hurt that much, it's only one additional segment on the wire.
1688 */
1689 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp, (TCP_REXMTVAL(tp)));
1690}
1691
1692static void
1693tcp_tfo_rcv_data(struct tcpcb *tp)
1694{
1695 /* Transition from PROBING to NONE as data has been received */
1696 if (tp->t_tfo_probe_state >= TFO_PROBE_PROBING) {
1697 tp->t_tfo_probe_state = TFO_PROBE_NONE;
1698 }
1699}
1700
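/*
 * While in the TFO probing state, use the peer's sequence number to decide
 * whether the SYN data was received (no hole: stop probing) or is still
 * missing (hole ahead of rcv_nxt: keep waiting and re-arm the timer).
 */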
1701static void
1702tcp_tfo_rcv_ack(struct tcpcb *tp, struct tcphdr *th)
1703{
1704 if (tp->t_tfo_probe_state == TFO_PROBE_PROBING &&
1705 tp->t_tfo_probes > 0) {
1706 if (th->th_seq == tp->rcv_nxt) {
1707 /* No hole, so stop probing */
1708 tp->t_tfo_probe_state = TFO_PROBE_NONE;
1709 } else if (SEQ_GT(th->th_seq, tp->rcv_nxt)) {
1710 /* There is a hole! Wait a bit for data... */
1711 tp->t_tfo_probe_state = TFO_PROBE_WAIT_DATA;
1712 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1713 TCP_REXMTVAL(tp));
1714 }
1715 }
1716}
1717
1718/*
1719 * Update snd_wnd information.
1720 */
1721static inline bool
1722tcp_update_window(struct tcpcb *tp, int thflags, struct tcphdr * th,
1723 u_int32_t tiwin, int tlen)
1724{
1725 /* Don't look at the window if there is no ACK flag */
1726 if ((thflags & TH_ACK) &&
1727 (SEQ_LT(tp->snd_wl1, th->th_seq) ||
1728 (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
1729 (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
1730 /* keep track of pure window updates */
1731 if (tlen == 0 &&
1732 tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
1733 tcpstat.tcps_rcvwinupd++;
1734 }
1735 tp->snd_wnd = tiwin;
1736 tp->snd_wl1 = th->th_seq;
1737 tp->snd_wl2 = th->th_ack;
1738 if (tp->snd_wnd > tp->max_sndwnd) {
1739 tp->max_sndwnd = tp->snd_wnd;
1740 }
1741
1742 if (tp->t_inpcb->inp_socket->so_flags & SOF_MP_SUBFLOW) {
1743 mptcp_update_window_wakeup(tp);
1744 }
1745 return true;
1746 }
1747 return false;
1748}
1749
1750static void
1751tcp_handle_wakeup(struct socket *so, int read_wakeup, int write_wakeup)
1752{
1753 if (read_wakeup != 0) {
1754 sorwakeup(so);
1755 }
1756 if (write_wakeup != 0) {
1757 sowwakeup(so);
1758 }
1759}
1760
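/*
 * Advance snd_una to the newly ACKed sequence number; with SACK, also drag
 * send_highest_sack forward so it never trails snd_una, and restart the
 * t_new_dupacks count when that marker moves.
 */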
1761static void
1762tcp_update_snd_una(struct tcpcb *tp, uint32_t ack)
1763{
1764 tp->snd_una = ack;
1765 if (SACK_ENABLED(tp) && SEQ_LT(tp->send_highest_sack, tp->snd_una)) {
1766 tp->send_highest_sack = tp->snd_una;
1767
1768 /* If we move our marker, we need to start fresh */
1769 tp->t_new_dupacks = 0;
1770 }
1771}
1772
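/*
 * Validate data received together with a SYN (TFO): the segment must carry
 * data, its sequence number must equal the initial receive sequence number
 * (irs), and we must not already have received so much (more than 2GB) that
 * the comparison could be the result of sequence-number wrap-around.
 */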
1773static bool
1774tcp_syn_data_valid(struct tcpcb *tp, struct tcphdr *tcp_hdr, int tlen)
1775{
1776 /* No data? */
1777 if (tlen <= 0) {
1778 return false;
1779 }
1780
1781 /* Not the right sequence-number? */
1782 if (tcp_hdr->th_seq != tp->irs) {
1783 return false;
1784 }
1785
1786 /* The sequence number may have wrapped if more than 2GB was already received; don't trust the check above in that case */
1787 if (tp->t_inpcb->inp_stat->rxbytes > INT32_MAX) {
1788 return false;
1789 }
1790
1791 return true;
1792}
1793
1794/* Process IP-ECN codepoints on received packets and update receive side counters */
1795static void
1796tcp_input_ip_ecn(struct tcpcb *tp, struct inpcb *inp, uint32_t tlen, uint32_t segment_count, uint8_t ip_ecn)
1797{
1798 switch (ip_ecn) {
1799 case IPTOS_ECN_ECT1:
1800 tp->ecn_flags |= TE_ACO_ECT1;
1801 tp->t_rcv_ect1_bytes += tlen;
1802 break;
1803 case IPTOS_ECN_ECT0:
1804 tp->ecn_flags |= TE_ACO_ECT0;
1805 tp->t_rcv_ect0_bytes += tlen;
1806 break;
1807 case IPTOS_ECN_CE:
1808 tp->t_rcv_ce_packets += segment_count;
1809 tp->t_rcv_ce_bytes += tlen;
1810 tp->t_ecn_recv_ce++;
1811 tcpstat.tcps_ecn_recv_ce++;
1812 INP_INC_IFNET_STAT(inp, ecn_recv_ce);
1813 break;
1814 default:
1815 /* No counter for Not-ECT */
1816 break;
1817 }
1818}
1819
1820/* Process SYN packet that wishes to negotiate Accurate ECN */
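/*
 * ace_flags holds the three ACE bits (AE|CWR|ECE) from the SYN: 0 means the
 * client requested no ECN, CWR|ECE is a classic RFC 3168 ECN-setup SYN, and
 * all three bits set (TH_ACE) requests Accurate ECN. Any other non-zero
 * combination is treated like an AccECN request for forward compatibility,
 * which is what the default case below implements.
 */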
1821static void
1822tcp_input_process_accecn_syn(struct tcpcb *tp, int ace_flags, uint8_t ip_ecn)
1823{
1824 switch (ace_flags) {
1825 case (0 | 0 | 0):
1826 /* No ECN */
1827 tp->t_server_accecn_state = tcp_connection_server_no_ecn_requested;
1828 break;
1829 case (0 | TH_CWR | TH_ECE):
1830 /* Legacy ECN-setup */
1831 tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
1832 tp->t_server_accecn_state = tcp_connection_server_classic_ecn_requested;
1833 break;
1834 case (TH_ACE):
1835 /* Accurate ECN */
1836 if (TCP_ACC_ECN_ENABLED(tp)) {
1837 switch (ip_ecn) {
1838 case IPTOS_ECN_NOTECT:
1839 tp->ecn_flags |= TE_ACE_SETUP_NON_ECT;
1840 break;
1841 case IPTOS_ECN_ECT1:
1842 tp->ecn_flags |= TE_ACE_SETUP_ECT1;
1843 break;
1844 case IPTOS_ECN_ECT0:
1845 tp->ecn_flags |= TE_ACE_SETUP_ECT0;
1846 break;
1847 case IPTOS_ECN_CE:
1848 tp->ecn_flags |= TE_ACE_SETUP_CE;
1849 break;
1850 }
1851 /*
1852 * We are not yet committing to send IP ECT packets when
1853 * Accurate ECN is enabled
1854 */
1855 tp->ecn_flags |= (TE_ACE_SETUPRECEIVED);
1856
1857 /* Initialize ECT byte counter to 1 to distinguish zeroing of options */
1858 tp->t_rcv_ect1_bytes = tp->t_rcv_ect0_bytes = 1;
1859 tp->t_snd_ect1_bytes = tp->t_snd_ect0_bytes = 1;
1860 tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_requested;
1861 } else {
1862 /*
1863 * If AccECN is not enabled, ignore
1864 * the TH_AE bit and do Legacy ECN-setup
1865 */
1866 tp->ecn_flags |= (TE_SETUPRECEIVED | TE_SENDIPECT);
1867 }
break;
1868 default:
1869 /* Forward Compatibility */
1870 /* Accurate ECN */
1871 if (TCP_ACC_ECN_ENABLED(tp)) {
1872 switch (ip_ecn) {
1873 case IPTOS_ECN_NOTECT:
1874 tp->ecn_flags |= TE_ACE_SETUP_NON_ECT;
1875 break;
1876 case IPTOS_ECN_ECT1:
1877 tp->ecn_flags |= TE_ACE_SETUP_ECT1;
1878 break;
1879 case IPTOS_ECN_ECT0:
1880 tp->ecn_flags |= TE_ACE_SETUP_ECT0;
1881 break;
1882 case IPTOS_ECN_CE:
1883 tp->ecn_flags |= TE_ACE_SETUP_CE;
1884 break;
1885 }
1886 /*
1887 * We are not yet committing to send IP ECT packets when
1888 * Accurate ECN is enabled
1889 */
1890 tp->ecn_flags |= (TE_ACE_SETUPRECEIVED);
1891
1892 /* Initialize ECT byte counter to 1 to distinguish zeroing of options */
1893 tp->t_rcv_ect1_bytes = tp->t_rcv_ect0_bytes = 1;
1894 tp->t_snd_ect1_bytes = tp->t_snd_ect0_bytes = 1;
1895 tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_requested;
1896 }
1897 break;
1898 }
1899}
1900
1901void
1902tcp_input(struct mbuf *m, int off0)
1903{
1904 int exiting_fr = 0;
1905 struct tcphdr *th;
1906 struct ip *ip = NULL;
1907 struct inpcb *inp;
1908 u_char *optp = NULL;
1909 int optlen = 0;
1910 int tlen, off;
1911 int drop_hdrlen;
1912 struct tcpcb *tp = 0;
1913 int thflags;
1914 struct socket *so = 0;
1915 int todrop, acked, ourfinisacked, needoutput = 0;
1916 int read_wakeup = 0;
1917 int write_wakeup = 0;
1918 struct in_addr laddr;
1919 struct in6_addr laddr6;
1920 int dropsocket = 0;
1921 int iss = 0, nosock = 0;
1922 u_int32_t tiwin, sack_bytes_acked = 0, sack_bytes_newly_acked = 0;
1923 struct tcpopt to; /* options in this segment */
1924#if TCPDEBUG
1925 short ostate = 0;
1926#endif
1927 u_char ip_ecn = IPTOS_ECN_NOTECT;
1928 unsigned int ifscope;
1929 uint8_t isconnected, isdisconnected;
1930 struct ifnet *ifp = m->m_pkthdr.rcvif;
1931 int segment_count = m->m_pkthdr.seg_cnt ? : 1;
1932 int win;
1933 u_int16_t pf_tag = 0;
1934#if MPTCP
1935 struct mptcb *mp_tp = NULL;
1936#endif /* MPTCP */
1937 boolean_t cell = IFNET_IS_CELLULAR(ifp);
1938 boolean_t wifi = (!cell && IFNET_IS_WIFI(ifp));
1939 boolean_t wired = (!wifi && IFNET_IS_WIRED(ifp));
1940 boolean_t recvd_dsack = FALSE;
1941 struct tcp_respond_args tra;
1942 int prev_t_state;
1943 boolean_t check_cfil = cfil_filter_present();
1944 bool findpcb_iterated = false;
1945 /*
1946 * The mbuf may be freed after it has been added to the receive socket
1947 * buffer or the reassembly queue, so we reinitialize th to point to a
1948 * safe copy of the TCP header
1949 */
1950 struct tcphdr saved_tcphdr = {};
1951 /*
1952 * Save copy of the IPv4/IPv6 header.
1953 * Note: use array of uint32_t to silence compiler warning when casting
1954 * to a struct ip6_hdr pointer.
1955 */
1956#define MAX_IPWORDS ((sizeof(struct ip) + MAX_IPOPTLEN) / sizeof(uint32_t))
1957 uint32_t saved_hdr[MAX_IPWORDS];
1958
1959#define TCP_INC_VAR(stat, npkts) do { \
1960 stat += npkts; \
1961} while (0)
1962
1963 if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) {
1964 segment_count = 1;
1965 }
1966 TCP_INC_VAR(tcpstat.tcps_rcvtotal, segment_count);
1967
1968 struct ip6_hdr *ip6 = NULL;
1969 int isipv6;
1970 struct proc *kernel_proc = current_proc();
1971
1972 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1973
1974 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0;
1975 bzero((char *)&to, sizeof(to));
1976
1977 m_add_crumb(m, PKT_CRUMB_TCP_INPUT);
1978
1979 if (m->m_flags & M_PKTHDR) {
1980 pf_tag = m_pftag(m)->pftag_tag;
1981 }
1982
1983 if (isipv6) {
1984 /*
1985 * Expect 32-bit aligned data pointer on
1986 * strict-align platforms
1987 */
1988 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
1989
1990 /* IP6_EXTHDR_CHECK() is already done at tcp6_input() */
1991 ip6 = mtod(m, struct ip6_hdr *);
1992 tlen = sizeof(*ip6) + ntohs(ip6->ip6_plen) - off0;
1993 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
1994
1995 if (tcp_input_checksum(AF_INET6, m, th, off0, tlen)) {
1996 TCP_LOG_DROP_PKT(ip6, th, ifp, "IPv6 bad tcp checksum");
1997 goto dropnosock;
1998 }
1999
2000 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
2001 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
2002 th->th_seq, th->th_ack, th->th_win);
2003 /*
2004 * Be proactive about unspecified IPv6 address in source.
2005 * As we use all-zero to indicate unbounded/unconnected pcb,
2006 * unspecified IPv6 address can be used to confuse us.
2007 *
2008 * Note that packets with an unspecified IPv6 destination are
2009 * already dropped in ip6_input.
2010 */
2011 if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
2012 /* XXX stat */
2013 IF_TCP_STATINC(ifp, unspecv6);
2014 TCP_LOG_DROP_PKT(ip6, th, ifp, "src IPv6 address unspecified");
2015 goto dropnosock;
2016 }
2017 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
2018 struct ip6_hdr *, ip6, struct tcpcb *, NULL,
2019 struct tcphdr *, th);
2020
2021 ip_ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
2022 } else {
2023 /*
2024 * Get IP and TCP header together in first mbuf.
2025 * Note: IP leaves IP header in first mbuf.
2026 */
2027 if (off0 > sizeof(struct ip)) {
2028 ip_stripoptions(m);
2029 off0 = sizeof(struct ip);
2030 }
2031 if (m->m_len < sizeof(struct tcpiphdr)) {
2032 if ((m = m_pullup(m, sizeof(struct tcpiphdr))) == 0) {
2033 tcpstat.tcps_rcvshort++;
2034 return;
2035 }
2036 }
2037
2038 /* Expect 32-bit aligned data pointer on strict-align platforms */
2039 MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
2040
2041 ip = mtod(m, struct ip *);
2042 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
2043 tlen = ip->ip_len;
2044
2045 if (tcp_input_checksum(AF_INET, m, th, off0, tlen)) {
2046 TCP_LOG_DROP_PKT(ip, th, ifp, "IPv4 bad tcp checksum");
2047 goto dropnosock;
2048 }
2049
2050 /* Re-initialization for later version check */
2051 ip->ip_v = IPVERSION;
2052 ip_ecn = (ip->ip_tos & IPTOS_ECN_MASK);
2053
2054 DTRACE_TCP5(receive, struct mbuf *, m, struct inpcb *, NULL,
2055 struct ip *, ip, struct tcpcb *, NULL, struct tcphdr *, th);
2056
2057 KERNEL_DEBUG(DBG_LAYER_BEG, ((th->th_dport << 16) | th->th_sport),
2058 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
2059 th->th_seq, th->th_ack, th->th_win);
2060 }
2061
2062#define TCP_LOG_HDR (isipv6 ? (void *)ip6 : (void *)ip)
2063
2064 /*
2065 * Check that TCP offset makes sense,
2066 * pull out TCP options and adjust length.
2067 */
2068 off = th->th_off << 2;
2069 if (off < sizeof(struct tcphdr) || off > tlen) {
2070 tcpstat.tcps_rcvbadoff++;
2071 IF_TCP_STATINC(ifp, badformat);
2072 TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "bad tcp offset");
2073 goto dropnosock;
2074 }
2075 tlen -= off; /* tlen is used instead of ti->ti_len */
2076 if (off > sizeof(struct tcphdr)) {
2077 if (isipv6) {
2078 IP6_EXTHDR_CHECK(m, off0, off, return );
2079 ip6 = mtod(m, struct ip6_hdr *);
2080 th = (struct tcphdr *)(void *)((caddr_t)ip6 + off0);
2081 } else {
2082 if (m->m_len < sizeof(struct ip) + off) {
2083 if ((m = m_pullup(m, sizeof(struct ip) + off)) == 0) {
2084 tcpstat.tcps_rcvshort++;
2085 return;
2086 }
2087 ip = mtod(m, struct ip *);
2088 th = (struct tcphdr *)(void *)((caddr_t)ip + off0);
2089 }
2090 }
2091 optlen = off - sizeof(struct tcphdr);
2092 optp = (u_char *)(th + 1);
2093 /*
2094 * Do quick retrieval of timestamp options ("options
2095 * prediction?"). If timestamp is the only option and it's
2096 * formatted as recommended in RFC 1323 appendix A, we
2097 * quickly get the values now and not bother calling
2098 * tcp_dooptions(), etc.
2099 */
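/*
 * The RFC 1323 Appendix A encoding is NOP, NOP, TIMESTAMP, 10 followed by
 * the two 32-bit timestamp fields, so TSval lives at byte offset 4 and TSecr
 * at byte offset 8 of the option space; the direct loads below depend on
 * exactly that layout.
 */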
2100 if ((optlen == TCPOLEN_TSTAMP_APPA ||
2101 (optlen > TCPOLEN_TSTAMP_APPA &&
2102 optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
2103 *(u_int32_t *)(void *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
2104 (th->th_flags & TH_SYN) == 0) {
2105 to.to_flags |= TOF_TS;
2106 to.to_tsval = ntohl(*(u_int32_t *)(void *)(optp + 4));
2107 to.to_tsecr = ntohl(*(u_int32_t *)(void *)(optp + 8));
2108 optp = NULL; /* we've parsed the options */
2109 }
2110 }
2111 thflags = th->th_flags;
2112
2113 /*
2114 * Drop all packets with both the SYN and FIN bits set.
2115 * This prevents e.g. nmap from identifying the TCP/IP stack.
2116 *
2117 * This is a violation of the TCP specification.
2118 */
2119 if ((thflags & (TH_SYN | TH_FIN)) == (TH_SYN | TH_FIN)) {
2120 IF_TCP_STATINC(ifp, synfin);
2121 TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "drop SYN FIN");
2122 goto dropnosock;
2123 }
2124
2125 /*
2126 * Delay dropping TCP, IP headers, IPv6 ext headers, and TCP options,
2127 * until after ip6_savecontrol() is called and before other functions
2128 * which don't want those proto headers.
2129 * Because ip6_savecontrol() is going to parse the mbuf to
2130 * search for data to be passed up to user-land, it wants mbuf
2131 * parameters to be unchanged.
2132 */
2133 drop_hdrlen = off0 + off;
2134
2135 /* Since this is an entry point for input processing of tcp packets, we
2136 * can update the tcp clock here.
2137 */
2138 calculate_tcp_clock();
2139
2140 /*
2141 * Record the interface on which this segment arrived; this does not
2142 * affect normal data output (for non-detached TCP) as it provides a
2143 * hint about which route and interface to use for sending in the
2144 * absence of a PCB, when scoped routing (and thus source interface
2145 * selection) are enabled.
2146 */
2147 if ((m->m_pkthdr.pkt_flags & PKTF_LOOP) || m->m_pkthdr.rcvif == NULL) {
2148 ifscope = IFSCOPE_NONE;
2149 } else {
2150 ifscope = m->m_pkthdr.rcvif->if_index;
2151 }
2152
2153 /*
2154 * Convert TCP protocol specific fields to host format.
2155 */
2156
2157#if BYTE_ORDER != BIG_ENDIAN
2158 NTOHL(th->th_seq);
2159 NTOHL(th->th_ack);
2160 NTOHS(th->th_win);
2161 NTOHS(th->th_urp);
2162#endif
2163
2164 /*
2165 * Locate pcb for segment.
2166 */
2167findpcb:
2168
2169 isconnected = FALSE;
2170 isdisconnected = FALSE;
2171
2172 if (isipv6) {
2173 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_src, th->th_sport, ip6_input_getsrcifscope(m),
2174 &ip6->ip6_dst, th->th_dport, ip6_input_getdstifscope(m), 1,
2175 m->m_pkthdr.rcvif);
2176 } else {
2177 inp = in_pcblookup_hash(&tcbinfo, ip->ip_src, th->th_sport,
2178 ip->ip_dst, th->th_dport, 1, m->m_pkthdr.rcvif);
2179 }
2180
2181 /*
2182 * Use the interface scope information from the PCB for outbound
2183 * segments. If the PCB isn't present and if scoped routing is
2184 * enabled, tcp_respond will use the scope of the interface where
2185 * the segment arrived on.
2186 */
2187 if (inp != NULL && (inp->inp_flags & INP_BOUND_IF)) {
2188 ifscope = inp->inp_boundifp->if_index;
2189 }
2190
2191 /*
2192 * If the state is CLOSED (i.e., TCB does not exist) then
2193 * all data in the incoming segment is discarded.
2194 * If the TCB exists but is in CLOSED state, it is embryonic,
2195 * but should either do a listen or a connect soon.
2196 */
2197 if (inp == NULL) {
2198 if (log_in_vain) {
2199 char dbuf[MAX_IPv6_STR_LEN], sbuf[MAX_IPv6_STR_LEN];
2200
2201 if (isipv6) {
2202 inet_ntop(AF_INET6, &ip6->ip6_dst, dbuf, sizeof(dbuf));
2203 inet_ntop(AF_INET6, &ip6->ip6_src, sbuf, sizeof(sbuf));
2204 } else {
2205 inet_ntop(AF_INET, &ip->ip_dst, dbuf, sizeof(dbuf));
2206 inet_ntop(AF_INET, &ip->ip_src, sbuf, sizeof(sbuf));
2207 }
2208 switch (log_in_vain) {
2209 case 1:
2210 if (thflags & TH_SYN) {
2211 log(LOG_INFO,
2212 "Connection attempt to TCP %s:%d from %s:%d\n",
2213 dbuf, ntohs(th->th_dport),
2214 sbuf,
2215 ntohs(th->th_sport));
2216 }
2217 break;
2218 case 2:
2219 log(LOG_INFO,
2220 "Connection attempt to TCP %s:%d from %s:%d flags:0x%x\n",
2221 dbuf, ntohs(th->th_dport), sbuf,
2222 ntohs(th->th_sport), thflags);
2223 break;
2224 case 3:
2225 case 4:
2226 if ((thflags & TH_SYN) && !(thflags & TH_ACK) &&
2227 !(m->m_flags & (M_BCAST | M_MCAST)) &&
2228 ((isipv6 && !in6_are_addr_equal_scoped(&ip6->ip6_dst, &ip6->ip6_src, ip6_input_getdstifscope(m), ip6_input_getsrcifscope(m))) ||
2229 (!isipv6 && ip->ip_dst.s_addr != ip->ip_src.s_addr))) {
2230 log_in_vain_log((LOG_INFO,
2231 "Stealth Mode connection attempt to TCP %s:%d from %s:%d\n",
2232 dbuf, ntohs(th->th_dport),
2233 sbuf,
2234 ntohs(th->th_sport)));
2235 }
2236 break;
2237 default:
2238 break;
2239 }
2240 }
2241 if (blackhole) {
2242 if (m->m_pkthdr.rcvif && m->m_pkthdr.rcvif->if_type != IFT_LOOP) {
2243 switch (blackhole) {
2244 case 1:
2245 if (thflags & TH_SYN) {
2246 TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 1 syn for closed port");
2247 goto dropnosock;
2248 }
2249 break;
2250 case 2:
2251 TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole 2 closed port");
2252 goto dropnosock;
2253 default:
2254 TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "blackhole closed port");
2255 goto dropnosock;
2256 }
2257 }
2258 }
2259 IF_TCP_STATINC(ifp, noconnnolist);
2260 TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "closed port");
2261 goto dropwithresetnosock;
2262 }
2263 so = inp->inp_socket;
2264 if (so == NULL) {
2265 /* This case shouldn't happen, as the socket shouldn't be NULL
2266 * unless inp_state is set to INPCB_STATE_DEAD.
2267 * Just in case, we pretend we didn't find the socket if we hit this case,
2268 * as this isn't cause for a panic (the socket might be leaked, however)...
2269 */
2270 inp = NULL;
2271#if TEMPDEBUG
2272 printf("tcp_input: no more socket for inp=%x. This shouldn't happen\n", inp);
2273#endif
2274 TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp_socket NULL");
2275 goto dropnosock;
2276 }
2277
2278 socket_lock(so, 1);
2279 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2280 socket_unlock(so, 1);
2281 inp = NULL; // pretend we didn't find it
2282 TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "inp state WNT_STOPUSING");
2283 goto dropnosock;
2284 }
2285
2286 if (!isipv6 && inp->inp_faddr.s_addr != INADDR_ANY) {
2287 if (inp->inp_faddr.s_addr != ip->ip_src.s_addr ||
2288 inp->inp_laddr.s_addr != ip->ip_dst.s_addr ||
2289 inp->inp_fport != th->th_sport ||
2290 inp->inp_lport != th->th_dport) {
2291 os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n",
2292 __func__,
2293 ntohs(inp->inp_fport), ntohs(th->th_sport),
2294 ntohs(inp->inp_lport), ntohs(th->th_dport));
2295 if (findpcb_iterated) {
2296 goto drop;
2297 }
2298 findpcb_iterated = true;
2299 socket_unlock(so, 1);
2300 inp = NULL;
2301 goto findpcb;
2302 }
2303 } else if (isipv6 && !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
2304 if (!in6_are_addr_equal_scoped(&inp->in6p_faddr, &ip6->ip6_src, inp->inp_fifscope, ip6_input_getsrcifscope(m)) ||
2305 !in6_are_addr_equal_scoped(&inp->in6p_laddr, &ip6->ip6_dst, inp->inp_lifscope, ip6_input_getdstifscope(m)) ||
2306 inp->inp_fport != th->th_sport ||
2307 inp->inp_lport != th->th_dport) {
2308 os_log_error(OS_LOG_DEFAULT, "%s 5-tuple does not match: %u:%u %u:%u\n",
2309 __func__,
2310 ntohs(inp->inp_fport), ntohs(th->th_sport),
2311 ntohs(inp->inp_lport), ntohs(th->th_dport));
2312 if (findpcb_iterated) {
2313 goto drop;
2314 }
2315 findpcb_iterated = true;
2316 socket_unlock(so, 1);
2317 inp = NULL;
2318 goto findpcb;
2319 }
2320 }
2321
2322 tp = intotcpcb(inp);
2323 if (tp == NULL) {
2324 IF_TCP_STATINC(ifp, noconnlist);
2325 TCP_LOG_DROP_PKT(TCP_LOG_HDR, th, ifp, "tp is NULL");
2326 goto dropwithreset;
2327 }
2328
2329 /* Now that we found the tcpcb, we can adjust the TCP timestamp */
2330 if (to.to_flags & TOF_TS) {
2331 to.to_tsecr -= tp->t_ts_offset;
2332 }
2333
2334 if (tp->t_state == TCPS_CLOSED) {
2335 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "tp state TCPS_CLOSED");
2336 goto drop;
2337 }
2338
2339#if NECP
2340 if (so->so_state & SS_ISCONNECTED) {
2341 // Connected TCP sockets have a fully-bound local and remote,
2342 // so the policy check doesn't need to override addresses
2343 if (!necp_socket_is_allowed_to_send_recv(inp, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
2344 TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
2345 IF_TCP_STATINC(ifp, badformat);
2346 goto drop;
2347 }
2348 } else {
2349 /*
2350 * If the proc_uuid_policy table has been updated since the last use
2351 * of the listening socket (i.e., the proc_uuid_policy_table_gencount
2352 * has been updated), the flags in the socket may be out of date.
2353 * If INP2_WANT_APP_POLICY is stale, inbound packets may
2354 * be dropped by NECP if the socket should now match a per-app
2355 * exception policy.
2356 * In order to avoid this refresh the proc_uuid_policy state to
2357 * potentially recalculate the socket's flags before checking
2358 * with NECP.
2359 */
2360 (void) inp_update_policy(inp);
2361
2362 if (isipv6) {
2363 if (!necp_socket_is_allowed_to_send_recv_v6(inp,
2364 th->th_dport, th->th_sport, &ip6->ip6_dst,
2365 &ip6->ip6_src, ifp, pf_tag, NULL, NULL, NULL, NULL)) {
2366 TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
2367 IF_TCP_STATINC(ifp, badformat);
2368 goto drop;
2369 }
2370 } else {
2371 if (!necp_socket_is_allowed_to_send_recv_v4(inp,
2372 th->th_dport, th->th_sport, &ip->ip_dst, &ip->ip_src,
2373 ifp, pf_tag, NULL, NULL, NULL, NULL)) {
2374 TCP_LOG_DROP_NECP(TCP_LOG_HDR, th, intotcpcb(inp), false);
2375 IF_TCP_STATINC(ifp, badformat);
2376 goto drop;
2377 }
2378 }
2379 }
2380#endif /* NECP */
2381
2382 prev_t_state = tp->t_state;
2383
2384 /* If none of the FIN|SYN|RST|ACK flag is set, drop */
2385 if ((thflags & TH_ACCEPT) == 0) {
2386 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 TH_ACCEPT == 0");
2387 goto drop;
2388 }
2389
2390 /* Unscale the window into a 32-bit value. */
2391 if ((thflags & TH_SYN) == 0) {
2392 tiwin = th->th_win << tp->snd_scale;
2393 } else {
2394 tiwin = th->th_win;
2395 }
2396
2397 /* Avoid processing packets while closing a listen socket */
2398 if (tp->t_state == TCPS_LISTEN &&
2399 (so->so_options & SO_ACCEPTCONN) == 0) {
2400 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "closing a listening socket");
2401 goto drop;
2402 }
2403
2404 if ((m->m_flags & M_PKTHDR) && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
2405 soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_WAKE_PKT);
2406 }
2407
2408 if (so->so_options & (SO_DEBUG | SO_ACCEPTCONN)) {
2409#if TCPDEBUG
2410 if (so->so_options & SO_DEBUG) {
2411 ostate = tp->t_state;
2412 if (isipv6) {
2413 bcopy((char *)ip6, (char *)tcp_saveipgen,
2414 sizeof(*ip6));
2415 } else {
2416 bcopy((char *)ip, (char *)tcp_saveipgen, sizeof(*ip));
2417 }
2418 tcp_savetcp = *th;
2419 }
2420#endif
2421 if (so->so_options & SO_ACCEPTCONN) {
2422 struct tcpcb *tp0 = tp;
2423 struct socket *so2;
2424 struct socket *oso;
2425 struct sockaddr_storage from;
2426 struct sockaddr_storage to2;
2427 struct inpcb *oinp = sotoinpcb(so);
2428 struct ifnet *head_ifscope;
2429 bool head_nocell, head_recvanyif,
2430 head_noexpensive, head_awdl_unrestricted,
2431 head_intcoproc_allowed, head_external_port,
2432 head_noconstrained, head_management_allowed;
2433
2434 /* Get listener's bound-to-interface, if any */
2435 head_ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2436 inp->inp_boundifp : NULL;
2437 /* Get listener's no-cellular information, if any */
2438 head_nocell = INP_NO_CELLULAR(inp);
2439 /* Get listener's recv-any-interface, if any */
2440 head_recvanyif = (inp->inp_flags & INP_RECV_ANYIF);
2441 /* Get listener's no-expensive information, if any */
2442 head_noexpensive = INP_NO_EXPENSIVE(inp);
2443 head_noconstrained = INP_NO_CONSTRAINED(inp);
2444 head_awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
2445 head_intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
2446 head_external_port = (inp->inp_flags2 & INP2_EXTERNAL_PORT);
2447 head_management_allowed = INP_MANAGEMENT_ALLOWED(inp);
2448
2449 /*
2450 * If the state is LISTEN then ignore segment if it contains an RST.
2451 * If the segment contains an ACK then it is bad and send a RST.
2452 * If it does not contain a SYN then it is not interesting; drop it.
2453 * If it is from this socket, drop it, it must be forged.
2454 */
2455 if ((thflags & (TH_RST | TH_ACK | TH_SYN)) != TH_SYN) {
2456 IF_TCP_STATINC(ifp, listbadsyn);
2457
2458 if (thflags & TH_RST) {
2459 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false,
2460 thflags & TH_SYN ? "ignore SYN with RST" : "ignore RST");
2461 goto drop;
2462 }
2463 if (thflags & TH_ACK) {
2464 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false,
2465 thflags & TH_SYN ? "bad SYN with ACK" : "bad ACK");
2466 tp = NULL;
2467 tcpstat.tcps_badsyn++;
2468 goto dropwithreset;
2469 }
2470
2471 /* We come here if there is no SYN set */
2472 tcpstat.tcps_badsyn++;
2473 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad SYN");
2474 goto drop;
2475 }
2476 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_START, 0, 0, 0, 0, 0);
2477 if (th->th_dport == th->th_sport) {
2478 if (isipv6) {
2479 if (in6_are_addr_equal_scoped(&ip6->ip6_dst, &ip6->ip6_src, ip6_input_getdstifscope(m), ip6_input_getsrcifscope(m))) {
2480 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same port");
2481 goto drop;
2482 }
2483 } else if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
2484 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad tuple same IPv4 address");
2485 goto drop;
2486 }
2487 }
2488 /*
2489 * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN.
2490 * in_broadcast() should never return true on a received
2491 * packet with M_BCAST not set.
2492 *
2493 * Packets with a multicast source address should also
2494 * be discarded.
2495 */
2496 if (m->m_flags & (M_BCAST | M_MCAST)) {
2497 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "mbuf M_BCAST | M_MCAST");
2498 goto drop;
2499 }
2500 if (isipv6) {
2501 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
2502 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
2503 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "IN6_IS_ADDR_MULTICAST");
2504 goto drop;
2505 }
2506 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
2507 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
2508 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
2509 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
2510 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "multicast or broadcast address");
2511 goto drop;
2512 }
2513
2514
2515 /*
2516 * If deprecated address is forbidden,
2517 * we do not accept SYN to deprecated interface
2518 * address to prevent any new inbound connection from
2519 * getting established.
2520 * When we do not accept SYN, we send a TCP RST,
2521 * with deprecated source address (instead of dropping
2522 * it). This is a compromise: it is much better for the peer
2523 * to receive a RST, and the RST will be the final packet
2524 * of the exchange.
2525 *
2526 * If we do not forbid deprecated addresses, we accept
2527 * the SYN packet. RFC 4862 forbids dropping SYN in
2528 * this case.
2529 */
2530 if (isipv6 && !ip6_use_deprecated) {
2531 uint32_t ia6_flags;
2532
2533 if (ip6_getdstifaddr_info(m, NULL,
2534 &ia6_flags) == 0) {
2535 if (ia6_flags & IN6_IFF_DEPRECATED) {
2536 tp = NULL;
2537 IF_TCP_STATINC(ifp, deprecate6);
2538 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "deprecated IPv6 address");
2539 goto dropwithreset;
2540 }
2541 }
2542 }
2543 if (so->so_filt || check_cfil) {
2544 if (isipv6) {
2545 struct sockaddr_in6 *sin6 = SIN6(&from);
2546
2547 sin6->sin6_len = sizeof(*sin6);
2548 sin6->sin6_family = AF_INET6;
2549 sin6->sin6_port = th->th_sport;
2550 sin6->sin6_flowinfo = 0;
2551 sin6->sin6_addr = ip6->ip6_src;
2552 sin6->sin6_scope_id = 0;
2553
2554 sin6 = SIN6(&to2);
2555
2556 sin6->sin6_len = sizeof(struct sockaddr_in6);
2557 sin6->sin6_family = AF_INET6;
2558 sin6->sin6_port = th->th_dport;
2559 sin6->sin6_flowinfo = 0;
2560 sin6->sin6_addr = ip6->ip6_dst;
2561 sin6->sin6_scope_id = 0;
2562 } else {
2563 struct sockaddr_in *sin = SIN(&from);
2564
2565 sin->sin_len = sizeof(*sin);
2566 sin->sin_family = AF_INET;
2567 sin->sin_port = th->th_sport;
2568 sin->sin_addr = ip->ip_src;
2569
2570 sin = SIN(&to2);
2571
2572 sin->sin_len = sizeof(struct sockaddr_in);
2573 sin->sin_family = AF_INET;
2574 sin->sin_port = th->th_dport;
2575 sin->sin_addr = ip->ip_dst;
2576 }
2577 }
2578
2579 if (so->so_filt) {
2580 so2 = sonewconn(so, 0, SA(&from));
2581 } else {
2582 so2 = sonewconn(so, 0, NULL);
2583 }
2584 if (so2 == 0) {
2585 tcpstat.tcps_listendrop++;
2586 if (tcp_dropdropablreq(so)) {
2587 if (so->so_filt) {
2588 so2 = sonewconn(so, 0, SA(&from));
2589 } else {
2590 so2 = sonewconn(so, 0, NULL);
2591 }
2592 }
2593 if (!so2) {
2594 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " listen drop");
2595 goto drop;
2596 }
2597 }
2598
2599 /* Point "inp" and "tp" in tandem to new socket */
2600 inp = (struct inpcb *)so2->so_pcb;
2601 tp = intotcpcb(inp);
2602
2603 oso = so;
2604 socket_unlock(so, 0); /* Unlock but keep a reference on listener for now */
2605
2606 so = so2;
2607 socket_lock(so, 1);
2608 /*
2609 * Mark socket as temporary until we're
2610 * committed to keeping it. The code at
2611 * ``drop'' and ``dropwithreset'' checks the
2612 * flag dropsocket to see if the temporary
2613 * socket created here should be discarded.
2614 * We mark the socket as discardable until
2615 * we're committed to it below in TCPS_LISTEN.
2616 * There are some error conditions in which we
2617 * have to drop the temporary socket.
2618 */
2619 dropsocket++;
2620 /*
2621 * Inherit INP_BOUND_IF from listener; testing if
2622 * head_ifscope is non-NULL is sufficient, since it
2623 * can only be set to a non-zero value earlier if
2624 * the listener has such a flag set.
2625 */
2626 if (head_ifscope != NULL) {
2627 inp->inp_flags |= INP_BOUND_IF;
2628 inp->inp_boundifp = head_ifscope;
2629 } else {
2630 inp->inp_flags &= ~INP_BOUND_IF;
2631 }
2632 /*
2633 * Inherit restrictions from listener.
2634 */
2635 if (head_nocell) {
2636 inp_set_nocellular(inp);
2637 }
2638 if (head_noexpensive) {
2639 inp_set_noexpensive(inp);
2640 }
2641 if (head_noconstrained) {
2642 inp_set_noconstrained(inp);
2643 }
2644 if (head_awdl_unrestricted) {
2645 inp_set_awdl_unrestricted(inp);
2646 }
2647 if (head_intcoproc_allowed) {
2648 inp_set_intcoproc_allowed(inp);
2649 }
2650 if (head_management_allowed) {
2651 inp_set_management_allowed(inp);
2652 }
2653 /*
2654 * Inherit {IN,IN6}_RECV_ANYIF from listener.
2655 */
2656 if (head_recvanyif) {
2657 inp->inp_flags |= INP_RECV_ANYIF;
2658 } else {
2659 inp->inp_flags &= ~INP_RECV_ANYIF;
2660 }
2661
2662 if (head_external_port) {
2663 inp->inp_flags2 |= INP2_EXTERNAL_PORT;
2664 }
2665 if (isipv6) {
2666 inp->in6p_laddr = ip6->ip6_dst;
2667 inp->inp_lifscope = in6_addr2scopeid(ifp, &inp->in6p_laddr);
2668 in6_verify_ifscope(&ip6->ip6_dst, inp->inp_lifscope);
2669 } else {
2670 inp->inp_vflag &= ~INP_IPV6;
2671 inp->inp_vflag |= INP_IPV4;
2672 inp->inp_laddr = ip->ip_dst;
2673 }
2674 inp->inp_lport = th->th_dport;
2675 if (in_pcbinshash(inp, 0) != 0) {
2676 /*
2677 * Undo the assignments above if we failed to
2678 * put the PCB on the hash lists.
2679 */
2680 if (isipv6) {
2681 inp->in6p_laddr = in6addr_any;
2682 inp->inp_lifscope = IFSCOPE_NONE;
2683 } else {
2684 inp->inp_laddr.s_addr = INADDR_ANY;
2685 }
2686#if SKYWALK
2687 netns_release(&inp->inp_netns_token);
2688#endif /* SKYWALK */
2689 inp->inp_lport = 0;
2690 socket_lock(oso, 0); /* release ref on parent */
2691 socket_unlock(oso, 1);
2692 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " in_pcbinshash failed");
2693 goto drop;
2694 }
2695 socket_lock(oso, 0);
2696 if (isipv6) {
2697 /*
2698 * Inherit socket options from the listening
2699 * socket.
2700 * Note that in6p_inputopts is not (and
2701 * should not be) copied, since it stores
2702 * previously received options and is used to
2703 * detect if each new option is different than
2704 * the previous one and hence should be passed
2705 * to a user.
2706 * If we copied in6p_inputopts, a user would
2707 * not be able to receive options just after
2708 * calling the accept system call.
2709 */
2710 inp->inp_flags |=
2711 oinp->inp_flags & INP_CONTROLOPTS;
2712 if (oinp->in6p_outputopts) {
2713 inp->in6p_outputopts =
2714 ip6_copypktopts(oinp->in6p_outputopts,
2715 Z_NOWAIT);
2716 }
2717 } else {
2718 inp->inp_options = ip_srcroute();
2719 inp->inp_ip_tos = oinp->inp_ip_tos;
2720 }
2721#if IPSEC
2722 /* copy old policy into new socket's */
2723 if (sotoinpcb(oso)->inp_sp) {
2724 int error = 0;
2725 /* Is it a security hole here to silently fail to copy the policy? */
2726 if (inp->inp_sp == NULL) {
2727 error = ipsec_init_policy(so, &inp->inp_sp);
2728 }
2729 if (error != 0 || ipsec_copy_policy(sotoinpcb(oso)->inp_sp, inp->inp_sp)) {
2730 printf("tcp_input: could not copy policy\n");
2731 }
2732 }
2733#endif
2734 /* inherit states from the listener */
2735 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
2736 struct tcpcb *, tp, int32_t, TCPS_LISTEN);
2737 TCP_LOG_STATE(tp, TCPS_LISTEN);
2738 tp->t_state = TCPS_LISTEN;
2739 tp->t_flags |= tp0->t_flags & (TF_NOPUSH | TF_NOOPT | TF_NODELAY);
2740 tp->t_flagsext |= (tp0->t_flagsext & (TF_RXTFINDROP | TF_NOTIMEWAIT | TF_FASTOPEN));
2741 tp->t_keepinit = tp0->t_keepinit;
2742 tp->t_keepcnt = tp0->t_keepcnt;
2743 tp->t_keepintvl = tp0->t_keepintvl;
2744 tp->t_adaptive_wtimo = tp0->t_adaptive_wtimo;
2745 tp->t_adaptive_rtimo = tp0->t_adaptive_rtimo;
2746 tp->t_inpcb->inp_ip_ttl = tp0->t_inpcb->inp_ip_ttl;
2747 if ((so->so_flags & SOF_NOTSENT_LOWAT) != 0) {
2748 tp->t_notsent_lowat = tp0->t_notsent_lowat;
2749 }
2750 tp->t_inpcb->inp_flags2 |=
2751 tp0->t_inpcb->inp_flags2 & INP2_KEEPALIVE_OFFLOAD;
2752
2753 /* now drop the reference on the listener */
2754 socket_unlock(oso, 1);
2755
2756 tcp_set_max_rwinscale(tp, so);
2757
2758#if CONTENT_FILTER
2759 if (check_cfil) {
2760 int error = cfil_sock_attach(so2, SA(&to2), SA(&from), CFS_CONNECTION_DIR_IN);
2761 if (error != 0) {
2762 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " cfil_sock_attach failed");
2763 goto drop;
2764 }
2765 }
2766#endif /* CONTENT_FILTER */
2767
2768 KERNEL_DEBUG(DBG_FNC_TCP_NEWCONN | DBG_FUNC_END, 0, 0, 0, 0, 0);
2769 }
2770 }
2771 socket_lock_assert_owned(so);
2772
2773 /*
2774 * Packet accounting should not be done on the listening socket; at this
2774 * point tp refers to the accepted connection if a new socket was created above
2775 */
2776 if (th->th_flags & TH_SYN) {
2777 (void) os_add_overflow(1, tp->t_syn_rcvd, &tp->t_syn_rcvd);
2778 }
2779 if (th->th_flags & TH_FIN) {
2780 (void) os_add_overflow(1, tp->t_fin_rcvd, &tp->t_fin_rcvd);
2781 }
2782 if (th->th_flags & TH_RST) {
2783 (void) os_add_overflow(1, tp->t_rst_rcvd, &tp->t_rst_rcvd);
2784 }
2785 TCP_LOG_TH_FLAGS(TCP_LOG_HDR, th, tp, false, ifp);
2786
2787 if (net_mpklog_enabled && (m->m_pkthdr.rcvif->if_xflags & IFXF_MPK_LOG)) {
2788 MPKL_TCP_INPUT(tcp_mpkl_log_object,
2789 ntohs(tp->t_inpcb->inp_lport), ntohs(tp->t_inpcb->inp_fport),
2790 th->th_seq, th->th_ack, tlen, thflags,
2791 so->last_pid, so->so_log_seqn++);
2792 }
2793
2794 if (tp->t_state == TCPS_ESTABLISHED && tlen > 0) {
2795 /*
2796 * Evaluate the rate of arrival of packets to see if the
2797 * receiver can reduce the ack traffic. The algorithm to
2798 * stretch acks will be enabled if the connection meets
2799 * certain criteria defined in the tcp_stretch_ack_enable() function.
2800 */
2801 if ((tp->t_flagsext & TF_RCVUNACK_WAITSS) != 0) {
2802 TCP_INC_VAR(tp->rcv_waitforss, segment_count);
2803 }
2804 if (tcp_stretch_ack_enable(tp, thflags)) {
2805 tp->t_flags |= TF_STRETCHACK;
2806 tp->t_flagsext &= ~(TF_RCVUNACK_WAITSS);
2807 tp->rcv_waitforss = 0;
2808 } else {
2809 tp->t_flags &= ~(TF_STRETCHACK);
2810 }
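/*
 * rcv_by_unackwin counts the bytes seen during the current unack window and
 * rcv_by_unackhalfwin only those from its most recent half; when the window
 * expires, the half-window total seeds the next window so the rate estimate
 * decays smoothly instead of dropping to zero.
 */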
2811 if (TSTMP_GT(tp->rcv_unackwin - (tcp_rcvunackwin >> 1), tcp_now)) {
2812 tp->rcv_by_unackhalfwin += (tlen + off);
2813 tp->rcv_by_unackwin += (tlen + off);
2814 } else {
2815 tp->rcv_unackwin = tcp_now + tcp_rcvunackwin;
2816 tp->rcv_by_unackwin = tp->rcv_by_unackhalfwin + tlen + off;
2817 tp->rcv_by_unackhalfwin = tlen + off;
2818 }
2819 }
2820
2821 if (tp->t_state == TCPS_ESTABLISHED && BYTES_ACKED(th, tp) > 0) {
2822 if (tp->ecn_flags & TE_SENDIPECT) {
2823 /*
2824 * Data sent with ECT has been acknowledged, calculate
2825 * packets approx. by dividing by MSS. This is done to
2826 * count MSS sized packets in case packets are aggregated
2827 * by GRO/LRO.
2828 */
2829 uint32_t bytes_acked = tcp_round_to(BYTES_ACKED(th, tp), tp->t_maxseg);
2830 tp->t_ecn_capable_packets_acked += max(1, (bytes_acked / tp->t_maxseg));
2831 }
2832 }
2833
2834 /* Accurate ECN has different semantics for TH_CWR. */
2835 if (!TCP_ACC_ECN_ENABLED(tp)) {
2836 /*
2837 * Clear TE_SENDECE if TH_CWR is set. This is harmless, so we don't
2838 * bother doing extensive checks for state and whatnot.
2839 */
2840 if (thflags & TH_CWR) {
2841 tp->ecn_flags &= ~TE_SENDECE;
2842 tp->t_ecn_recv_cwr++;
2843 }
2844 }
2845
2846 /*
2847 * Accurate ECN feedback
2848 * 1. Process peer's feedback in received TCP thflags and update s.cep
2849 * 2. Process IP ECN bits and update r.cep for CE marked pure ACKs
2850 * or valid data packets
2851 *
2852 */
2853 if (TCP_ACC_ECN_ON(tp) && tp->t_state == TCPS_ESTABLISHED) {
2854 /*
2855 * Update s.cep if bytes have been acknowledged
2856 * otherwise, this ACK has already been superseded.
2857 */
2858 uint8_t ace = tcp_get_ace(th);
2859 if (BYTES_ACKED(th, tp) > 0) {
2860 /* Congestion was experienced if delta_cep > 0 */
2861 tp->t_delta_ce_packets = (ace + TCP_ACE_DIV - (tp->t_snd_ce_packets % TCP_ACE_DIV)) % TCP_ACE_DIV;
2862 tp->t_snd_ce_packets += tp->t_delta_ce_packets;
2863 }
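/*
 * Worked example of the delta computation above, with TCP_ACE_DIV being the
 * 3-bit ACE modulus (8): if t_snd_ce_packets is 13 (13 % 8 == 5) and the
 * peer's ACE field reports 7, then delta = (7 + 8 - 5) % 8 == 2, i.e. two
 * newly CE-marked packets are accounted for.
 */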
2864 /* Update receive side counters */
2865 if (tlen == 0 || (tlen > 0 &&
2866 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2867 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd))) {
2868 tcp_input_ip_ecn(tp, inp, (uint32_t)tlen, (uint32_t)segment_count, ip_ecn);
2869 }
2870
2871 /* Test for ACE bleaching, initial value of ace should be non-zero */
2872 if (th->th_seq == tp->iss + 1 && ace == 0) {
2873 tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_ace_bleaching_detected;
2874 }
2875 } else {
2876 /*
2877 * Explicit Congestion Notification - Flag that we need to send ECE if
2878 * + The IP Congestion experienced flag was set.
2879 * + Socket is in established state
2880 * + We negotiated ECN in the TCP setup
2881 * + This isn't a pure ack (tlen > 0)
2882 * + The data is in the valid window
2883 *
2884 * TE_SENDECE will be cleared when we receive a packet with TH_CWR set.
2885 */
2886 if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2887 TCP_ECN_ENABLED(tp) && tlen > 0 &&
2888 SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
2889 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
2890 tp->t_ecn_recv_ce++;
2891 tcpstat.tcps_ecn_recv_ce++;
2892 INP_INC_IFNET_STAT(inp, ecn_recv_ce);
2893 /* Mark this connection as it received CE from network */
2894 tp->ecn_flags |= TE_RECV_ECN_CE;
2895 tp->ecn_flags |= TE_SENDECE;
2896 }
2897 }
2898
2899 /*
2900 * If we received an explicit notification of congestion in
2901 * ip tos ecn bits or by the CWR bit in TCP header flags, reset
2902 * the ack-stretching state. We need to handle ECN notification if
2903 * an ECN setup SYN was sent even once.
2904 */
2905 if (tp->t_state == TCPS_ESTABLISHED &&
2906 (tp->ecn_flags & TE_SETUPSENT) &&
2907 (ip_ecn == IPTOS_ECN_CE || (thflags & TH_CWR))) {
2908 tcp_reset_stretch_ack(tp);
2909 tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
2910 CLEAR_IAJ_STATE(tp);
2911 }
2912
2913 if (ip_ecn == IPTOS_ECN_CE && tp->t_state == TCPS_ESTABLISHED &&
2914 !TCP_ECN_ENABLED(tp) && !(tp->ecn_flags & TE_CEHEURI_SET)) {
2915 tcpstat.tcps_ecn_fallback_ce++;
2916 tcp_heuristic_ecn_aggressive(tp);
2917 tp->ecn_flags |= TE_CEHEURI_SET;
2918 }
2919
2920 if (tp->t_state == TCPS_ESTABLISHED && TCP_ECN_ENABLED(tp) &&
2921 ip_ecn == IPTOS_ECN_CE && !(tp->ecn_flags & TE_CEHEURI_SET)) {
2922 if (inp->inp_stat->rxpackets < ECN_MIN_CE_PROBES) {
2923 tp->t_ecn_recv_ce_pkt++;
2924 } else if (tp->t_ecn_recv_ce_pkt > ECN_MAX_CE_RATIO) {
2925 tcpstat.tcps_ecn_fallback_ce++;
2926 tcp_heuristic_ecn_aggressive(tp);
2927 tp->ecn_flags |= TE_CEHEURI_SET;
2928 INP_INC_IFNET_STAT(inp, ecn_fallback_ce);
2929 } else {
2930 /* We tracked the first ECN_MIN_CE_PROBES segments, so we
2931 * now know that the path is good.
2932 */
2933 tp->ecn_flags |= TE_CEHEURI_SET;
2934 }
2935 }
2936
2937 /* Update rcvtime as a new segment was received on the connection */
2938 tp->t_rcvtime = tcp_now;
2939
2940 /*
2941 * Segment received on connection.
2942 * Reset idle time and keep-alive timer.
2943 */
2944 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
2945 tcp_keepalive_reset(tp);
2946
2947 if (tp->t_mpsub) {
2948 mptcp_reset_keepalive(tp);
2949 }
2950 }
2951
2952 /*
2953 * Process options if not in LISTEN state,
2954 * else do it below (after getting remote address).
2955 */
2956 if (tp->t_state != TCPS_LISTEN && optp) {
2957 tcp_dooptions(tp, optp, optlen, th, &to);
2958 }
2959#if MPTCP
2960 if (tp->t_state != TCPS_LISTEN && (so->so_flags & SOF_MP_SUBFLOW)) {
2961 mptcp_insert_rmap(tp, m, th);
2962 }
2963#endif /* MPTCP */
2964 if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) {
2965 if (!(thflags & TH_ACK) ||
2966 (SEQ_GT(th->th_ack, tp->iss) &&
2967 SEQ_LEQ(th->th_ack, tp->snd_max))) {
2968 tcp_finalize_options(tp, &to, ifscope);
2969 }
2970 }
2971
2972#if TRAFFIC_MGT
2973 /*
2974 * Compute inter-packet arrival jitter. According to RFC 3550,
2975 * inter-packet arrival jitter is defined as the difference in
2976 * packet spacing at the receiver compared to the sender for a
2977 * pair of packets. When two packets of maximum segment size come
2978 * one after the other with consecutive sequence numbers, we
2979 * consider them as packets sent together at the sender and use
2980 * them as a pair to compute inter-packet arrival jitter. This
2981 * metric indicates the delay induced by the network components due
2982 * to queuing in edge/access routers.
2983 */
2984 if (tp->t_state == TCPS_ESTABLISHED &&
2985 (thflags & (TH_SYN | TH_FIN | TH_RST | TH_URG | TH_ACK | TH_ECE | TH_PUSH)) == TH_ACK &&
2986 ((tp->t_flags & TF_NEEDFIN) == 0) &&
2987 ((to.to_flags & TOF_TS) == 0 ||
2988 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
2989 th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
2990 int seg_size = tlen;
2991 if (tp->iaj_pktcnt <= IAJ_IGNORE_PKTCNT) {
2992 TCP_INC_VAR(tp->iaj_pktcnt, segment_count);
2993 }
2994
2995 if (tp->iaj_size == 0 || seg_size > tp->iaj_size ||
2996 (seg_size == tp->iaj_size && tp->iaj_rcv_ts == 0)) {
2997 /*
2998 * State related to inter-arrival jitter is
2999 * uninitialized or we are trying to find a good
3000 * first packet to start computing the metric
3001 */
3002 update_iaj_state(tp, seg_size, 0);
3003 } else {
3004 if (seg_size == tp->iaj_size) {
3005 /*
3006 * Compute inter-arrival jitter taking
3007 * this packet as the second packet
3008 */
3009 compute_iaj(tp);
3010 }
3011 if (seg_size < tp->iaj_size) {
3012 /*
3013 * There is a smaller packet in the stream.
3014 * Sometimes the maximum size supported
3015 * on a path can change if there is a new
3016 * link with smaller MTU. The receiver will
3017 * not know about this change. If there
3018 * are too many packets smaller than
3019 * iaj_size, we try to learn the iaj_size
3020 * again.
3021 */
3022 TCP_INC_VAR(tp->iaj_small_pkt, segment_count);
3023 if (tp->iaj_small_pkt > RESET_IAJ_SIZE_THRESH) {
3024 update_iaj_state(tp, seg_size, 1);
3025 } else {
3026 CLEAR_IAJ_STATE(tp);
3027 }
3028 } else {
3029 update_iaj_state(tp, seg_size, 0);
3030 }
3031 }
3032 } else {
3033 CLEAR_IAJ_STATE(tp);
3034 }
3035#endif /* TRAFFIC_MGT */
3036
3037 /*
3038 * Header prediction: check for the two common cases
3039 * of a uni-directional data xfer. If the packet has
3040 * no control flags, is in-sequence, the window didn't
3041 * change and we're not retransmitting, it's a
3042 * candidate. If the length is zero and the ack moved
3043 * forward, we're the sender side of the xfer. Just
3044 * free the data acked & wake any higher level process
3045 * that was blocked waiting for space. If the length
3046 * is non-zero and the ack didn't move, we're the
3047 * receiver side. If we're getting packets in-order
3048 * (the reassembly queue is empty), add the data to
3049 * the socket buffer and note that we need a delayed ack.
3050 * Make sure that the hidden state-flags are also off.
3051 * Since we check for TCPS_ESTABLISHED above, it can only
3052 * be TH_NEEDSYN.
3053 */
3054 if (tp->t_state == TCPS_ESTABLISHED &&
3055 !(so->so_state & SS_CANTRCVMORE) &&
3056 (thflags & TH_FLAGS) == TH_ACK &&
3057 ((tp->t_flags & TF_NEEDFIN) == 0) &&
3058 ((to.to_flags & TOF_TS) == 0 ||
3059 TSTMP_GEQ(to.to_tsval, tp->ts_recent)) &&
3060 th->th_seq == tp->rcv_nxt &&
3061 tiwin && tiwin == tp->snd_wnd &&
3062 tp->snd_nxt == tp->snd_max) {
3063 /*
3064 * If last ACK falls within this segment's sequence numbers,
3065 * record the timestamp.
3066 * NOTE that the test is modified according to the latest
3067 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
3068 */
3069 if ((to.to_flags & TOF_TS) != 0 &&
3070 SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
3071 tp->ts_recent_age = tcp_now;
3072 tp->ts_recent = to.to_tsval;
3073 }
3074
3075 /*
3076 * We increment t_unacksegs_ce for both data segments
3077 * and pure ACKs for Accurate ECN
3078 */
3079 if (TCP_ACC_ECN_ON(tp) && ip_ecn == IPTOS_ECN_CE) {
3080 TCP_INC_VAR(tp->t_unacksegs_ce, segment_count);
3081 }
3082
3083 if (tlen == 0) {
3084 if (SEQ_GT(th->th_ack, tp->snd_una) &&
3085 SEQ_LEQ(th->th_ack, tp->snd_max) &&
3086 tp->snd_cwnd >= tp->snd_ssthresh &&
3087 (!IN_FASTRECOVERY(tp) &&
3088 ((!(SACK_ENABLED(tp)) &&
3089 tp->t_dupacks < tp->t_rexmtthresh) ||
3090 (SACK_ENABLED(tp) && to.to_nsacks == 0 &&
3091 TAILQ_EMPTY(&tp->snd_holes))))) {
3092 /*
3093 * this is a pure ack for outstanding data.
3094 */
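/*
 * The predicate above admits only the simple case: the ACK advances snd_una
 * without passing snd_max, the sender is in congestion avoidance (cwnd >=
 * ssthresh), not in fast recovery, and there is no duplicate-ACK or SACK
 * state (no SACK blocks, no holes) that would call for loss-recovery
 * processing.
 */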
3095 ++tcpstat.tcps_predack;
3096
3097 tcp_bad_rexmt_check(tp, th, &to);
3098
3099 /* Recalculate the RTT */
3100 tcp_compute_rtt(tp, &to, th);
3101
3102 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
3103 acked = BYTES_ACKED(th, tp);
3104 tcpstat.tcps_rcvackpack++;
3105 tcpstat.tcps_rcvackbyte += acked;
3106
3107 /*
3108 * Handle an ack that is in sequence during
3109 * congestion avoidance phase. The
3110 * calculations in this function
3111 * assume that snd_una is not updated yet.
3112 */
3113 if (CC_ALGO(tp)->congestion_avd != NULL) {
3114 CC_ALGO(tp)->congestion_avd(tp, th);
3115 }
3116 tcp_ccdbg_trace(tp, th, TCP_CC_INSEQ_ACK_RCVD);
3117 sbdrop(&so->so_snd, acked);
3118 tcp_sbsnd_trim(&so->so_snd);
3119
3120 if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
3121 SEQ_LEQ(th->th_ack, tp->snd_recover)) {
3122 tp->snd_recover = th->th_ack - 1;
3123 }
3124
3125 tcp_update_snd_una(tp, th->th_ack);
3126
3127 TCP_RESET_REXMT_STATE(tp);
3128
3129 /*
3130 * pull snd_wl2 up to prevent seq wrap relative
3131 * to th_ack.
3132 */
3133 tp->snd_wl2 = th->th_ack;
3134
3135 if (tp->t_dupacks > 0) {
3136 tp->t_dupacks = 0;
3137 tp->t_rexmtthresh = tcprexmtthresh;
3138 tp->t_new_dupacks = 0;
3139 }
3140
3141 tp->sackhint.sack_bytes_acked = 0;
3142
3143 /*
3144 * If all outstanding data are acked, stop
3145 * retransmit timer, otherwise restart timer
3146 * using current (possibly backed-off) value.
3147 * If process is waiting for space,
3148 * wakeup/selwakeup/signal. If data
3149 * are ready to send, let tcp_output
3150 * decide between more output or persist.
3151 */
3152 if (tp->snd_una == tp->snd_max) {
3153 tp->t_timer[TCPT_REXMT] = 0;
3154 tp->t_timer[TCPT_PTO] = 0;
3155 } else if (tp->t_timer[TCPT_PERSIST] == 0) {
3156 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
3157 }
3158 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
3159 !TCP_DSACK_SEQ_IN_WINDOW(tp,
3160 tp->t_dsack_lastuna, tp->snd_una)) {
3161 tcp_rxtseg_clean(tp);
3162 }
3163
3164 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
3165 tp->t_bwmeas != NULL) {
3166 tcp_bwmeas_check(tp);
3167 }
3168
3169 write_wakeup = 1;
3170 if (!SLIST_EMPTY(&tp->t_notify_ack)) {
3171 tcp_notify_acknowledgement(tp, so);
3172 }
3173
3174 if ((so->so_snd.sb_cc) || (tp->t_flags & TF_ACKNOW)) {
3175 (void) tcp_output(tp);
3176 }
3177
3178 tcp_tfo_rcv_ack(tp, th);
3179
3180 m_freem(m);
3181
3182 tcp_check_timer_state(tp);
3183
3184 tcp_handle_wakeup(so, read_wakeup, write_wakeup);
3185
3186 socket_unlock(so, 1);
3187 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3188 return;
3189 }
3190 } else if (th->th_ack == tp->snd_una && LIST_EMPTY(&tp->t_segq) &&
3191 tlen <= tcp_sbspace(tp)) {
3192 /*
3193 * this is a pure, in-sequence data packet
3194 * with nothing on the reassembly queue and
3195 * we have enough buffer space to take it.
3196 */
3197
3198 /* Clean receiver SACK report if present */
3199 if (SACK_ENABLED(tp) && tp->rcv_numsacks) {
3200 tcp_clean_sackreport(tp);
3201 }
3202 ++tcpstat.tcps_preddat;
3203 tp->rcv_nxt += tlen;
3204 /* Update highest received sequence and its timestamp */
3205 if (SEQ_LT(tp->rcv_high, tp->rcv_nxt)) {
3206 tp->rcv_high = tp->rcv_nxt;
3207 if (to.to_flags & TOF_TS) {
3208 tp->tsv_high = to.to_tsval;
3209 }
3210 }
3211
3212 /*
3213 * Pull snd_wl1 up to prevent seq wrap relative to
3214 * th_seq.
3215 */
3216 tp->snd_wl1 = th->th_seq;
3217 /*
3218 * Pull rcv_up up to prevent seq wrap relative to
3219 * rcv_nxt.
3220 */
3221 tp->rcv_up = tp->rcv_nxt;
3222 TCP_INC_VAR(tcpstat.tcps_rcvpack, segment_count);
3223 tcpstat.tcps_rcvbyte += tlen;
3224 if (nstat_collect) {
3225 INP_ADD_STAT(inp, cell, wifi, wired,
3226 rxpackets, 1);
3227 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes,
3228 tlen);
3229 inp_set_activity_bitmap(inp);
3230 }
3231
3232 /* Calculate the RTT on the receiver */
3233 tcp_compute_rcv_rtt(tp, &to, th);
3234
3235 tcp_sbrcv_grow(tp, &so->so_rcv, &to, tlen);
3236 if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.data_rcvd != NULL) {
3237 tcp_cc_rledbat.data_rcvd(tp, th, &to, tlen);
3238 }
3239
3240 /*
3241 * Add data to socket buffer.
3242 */
3243 so_recv_data_stat(so, m, 0);
3244 m_adj(m, drop_hdrlen); /* delayed header drop */
3245
3246 if (isipv6) {
3247 memcpy(&saved_hdr, ip6, sizeof(struct ip6_hdr));
3248 ip6 = (struct ip6_hdr *)&saved_hdr[0];
3249 } else {
3250 memcpy(&saved_hdr, ip, ip->ip_hl << 2);
3251 ip = (struct ip *)&saved_hdr[0];
3252 }
3253 memcpy(&saved_tcphdr, th, sizeof(struct tcphdr));
3254
3255 if (th->th_flags & TH_PUSH) {
3256 tp->t_flagsext |= TF_LAST_IS_PSH;
3257 } else {
3258 tp->t_flagsext &= ~TF_LAST_IS_PSH;
3259 }
3260
3261 if (sbappendstream_rcvdemux(so, m)) {
3262 mptcp_handle_input(so);
3263 read_wakeup = 1;
3264 }
3265 th = &saved_tcphdr;
3266
3267 if (isipv6) {
3268 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
3269 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
3270 th->th_seq, th->th_ack, th->th_win);
3271 } else {
3272 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
3273 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
3274 th->th_seq, th->th_ack, th->th_win);
3275 }
3276 TCP_INC_VAR(tp->t_unacksegs, segment_count);
3277 if (DELAY_ACK(tp, th)) {
3278 if ((tp->t_flags & TF_DELACK) == 0) {
3279 tp->t_flags |= TF_DELACK;
3280 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3281 }
3282 } else {
3283 tp->t_flags |= TF_ACKNOW;
3284 tcp_output(tp);
3285 }
3286
3287 tcp_adaptive_rwtimo_check(tp, tlen);
3288
3289 if (tlen > 0) {
3290 tcp_tfo_rcv_data(tp);
3291 }
3292
3293 tcp_check_timer_state(tp);
3294
3295 tcp_handle_wakeup(so, read_wakeup, write_wakeup);
3296
3297 socket_unlock(so, 1);
3298 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
3299 return;
3300 }
3301 }
3302
3303 /*
3304 * Calculate amount of space in receive window,
3305 * and then do TCP input processing.
3306 * Receive window is amount of space in rcv queue,
3307 * but not less than advertised window.
3308 */
3309 socket_lock_assert_owned(so);
3310 win = tcp_sbspace(tp);
3311 if (win < 0) {
3312 win = 0;
3313 } else { /* clip rcv window to 4K for modems */
3314 if (tp->t_flags & TF_SLOWLINK && slowlink_wsize > 0) {
3315 win = min(win, slowlink_wsize);
3316 }
3317 }
3318 tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
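/*
 * Never shrink the offered window: rcv_adv - rcv_nxt is the part of the
 * previously advertised window the peer is still entitled to use, so
 * rcv_wnd is kept at least that large even if socket-buffer space has
 * fallen below it.
 */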
3319#if MPTCP
3320 /*
3321 * Ensure that the subflow receive window isn't greater
3322 * than the connection level receive window.
3323 */
3324 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) && (mp_tp = tptomptp(tp))) {
3325 socket_lock_assert_owned(mptetoso(mp_tp->mpt_mpte));
3326 int64_t recwin_conn = (int64_t)(mp_tp->mpt_rcvadv - mp_tp->mpt_rcvnxt);
3327
3328 VERIFY(recwin_conn < INT32_MAX && recwin_conn > INT32_MIN);
3329 if (recwin_conn > 0 && tp->rcv_wnd > (uint32_t)recwin_conn) {
3330 tp->rcv_wnd = (uint32_t)recwin_conn;
3331 tcpstat.tcps_mp_reducedwin++;
3332 }
3333 }
3334#endif /* MPTCP */
3335
3336 switch (tp->t_state) {
3337 /*
3338 * Initialize tp->rcv_nxt, and tp->irs, select an initial
3339 * tp->iss, and send a segment:
3340 * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
3341 * Also initialize tp->snd_nxt to tp->iss+1 and tp->snd_una to tp->iss.
3342 * Fill in remote peer address fields if not previously specified.
3343 * Enter SYN_RECEIVED state, and process any other fields of this
3344 * segment in this state.
3345 */
3346 case TCPS_LISTEN: {
3347 struct sockaddr_in *sin;
3348 struct sockaddr_in6 *sin6;
3349
3350 socket_lock_assert_owned(so);
3351
3352 /* Clear the logging flags inherited from the listening socket */
3353 inp->inp_log_flags = 0;
3354 inp->inp_flags2 |= INP2_LOGGED_SUMMARY;
3355
3356 if (isipv6) {
3357 sin6 = kalloc_type(struct sockaddr_in6, Z_NOWAIT | Z_ZERO);
3358 if (sin6 == NULL) {
3359 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN kalloc_type failed");
3360 goto drop;
3361 }
3362 sin6->sin6_family = AF_INET6;
3363 sin6->sin6_len = sizeof(*sin6);
3364 sin6->sin6_addr = ip6->ip6_src;
3365 sin6->sin6_port = th->th_sport;
3366 if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
3367 sin6->sin6_scope_id = ip6_input_getsrcifscope(m);
3368 }
3369 laddr6 = inp->in6p_laddr;
3370 uint32_t lifscope = inp->inp_lifscope;
3371 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
3372 inp->in6p_laddr = ip6->ip6_dst;
3373 inp->inp_lifscope = in6_addr2scopeid(ifp, &inp->in6p_laddr);
3374 in6_verify_ifscope(&inp->in6p_laddr, inp->inp_lifscope);
3375 }
3376 if (in6_pcbconnect(inp, SA(sin6), kernel_proc)) {
3377 inp->in6p_laddr = laddr6;
3378 kfree_type(struct sockaddr_in6, sin6);
3379 inp->inp_lifscope = lifscope;
3380 in6_verify_ifscope(&inp->in6p_laddr, inp->inp_lifscope);
3381 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in6_pcbconnect failed");
3382 goto drop;
3383 }
3384 kfree_type(struct sockaddr_in6, sin6);
3385 } else {
3386 socket_lock_assert_owned(so);
3387 sin = kalloc_type(struct sockaddr_in, Z_NOWAIT);
3388 if (sin == NULL) {
3389 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "LISTEN kalloc_type failed");
3390 goto drop;
3391 }
3392 sin->sin_family = AF_INET;
3393 sin->sin_len = sizeof(*sin);
3394 sin->sin_addr = ip->ip_src;
3395 sin->sin_port = th->th_sport;
3396 bzero((caddr_t)sin->sin_zero, sizeof(sin->sin_zero));
3397 laddr = inp->inp_laddr;
3398 if (inp->inp_laddr.s_addr == INADDR_ANY) {
3399 inp->inp_laddr = ip->ip_dst;
3400 }
3401 if (in_pcbconnect(inp, SA(sin), kernel_proc, IFSCOPE_NONE, NULL)) {
3402 inp->inp_laddr = laddr;
3403 kfree_type(struct sockaddr_in, sin);
3404 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, " LISTEN in_pcbconnect failed");
3405 goto drop;
3406 }
3407 kfree_type(struct sockaddr_in, sin);
3408 }
3409
3410 tcp_dooptions(tp, optp, optlen, th, &to);
3411 tcp_finalize_options(tp, &to, ifscope);
3412
3413 if (tfo_enabled(tp) && tcp_tfo_syn(tp, &to)) {
3414 isconnected = TRUE;
3415 }
3416
3417 if (iss) {
3418 tp->iss = iss;
3419 } else {
3420 tp->iss = tcp_new_isn(tp);
3421 }
3422 tp->irs = th->th_seq;
3423 tcp_sendseqinit(tp);
3424 tcp_rcvseqinit(tp);
3425 tp->snd_recover = tp->snd_una;
3426 /*
3427 * Initialization of the tcpcb for transaction;
3428 * set SND.WND = SEG.WND,
3429 * initialize CCsend and CCrecv.
3430 */
3431 tp->snd_wnd = tiwin; /* initial send-window */
3432 tp->max_sndwnd = tp->snd_wnd;
3433 tp->t_flags |= TF_ACKNOW;
3434 tp->t_unacksegs = 0;
3435 tp->t_unacksegs_ce = 0;
3436 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3437 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3438 TCP_LOG_STATE(tp, TCPS_SYN_RECEIVED);
3439 tp->t_state = TCPS_SYN_RECEIVED;
3440 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
3441 TCP_CONN_KEEPINIT(tp));
3442 tp->t_connect_time = tcp_now;
3443 dropsocket = 0; /* committed to socket */
3444
3445 if (inp->inp_flowhash == 0) {
3446 inp_calc_flowhash(inp);
3447 ASSERT(inp->inp_flowhash != 0);
3448 }
3449 /* update flowinfo - RFC 6437 */
3450 if (inp->inp_flow == 0 &&
3451 inp->in6p_flags & IN6P_AUTOFLOWLABEL) {
3452 inp->inp_flow &= ~IPV6_FLOWLABEL_MASK;
3453 inp->inp_flow |=
3454 (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
3455 }
3456
3457 /* reset the incomp processing flag */
3458 so->so_flags &= ~(SOF_INCOMP_INPROGRESS);
3459 tcpstat.tcps_accepts++;
3460
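/*
 * The AE bit sits in th_x2 (the former reserved bits), so shifting th_x2
 * left by 8 aligns it above CWR and ECE from thflags; masking with TH_ACE
 * then recovers the full 3-bit ACE field carried by the SYN.
 */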
3461 int ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE;
3462 tcp_input_process_accecn_syn(tp, ace_flags, ip_ecn);
3463
3464 /*
3465 * The address and connection state are finalized
3466 */
3467 TCP_LOG_CONNECT(tp, false, 0);
3468
3469 tcp_add_fsw_flow(tp, ifp);
3470
3471 goto trimthenstep6;
3472 }
3473
3474 /*
3475 * If the state is SYN_RECEIVED and the seg contains an ACK,
3476 * but not for our SYN/ACK, send a RST.
3477 */
3478 case TCPS_SYN_RECEIVED:
3479 if ((thflags & TH_ACK) &&
3480 (SEQ_LEQ(th->th_ack, tp->snd_una) ||
3481 SEQ_GT(th->th_ack, tp->snd_max))) {
3482 IF_TCP_STATINC(ifp, ooopacket);
3483 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad ACK");
3484 goto dropwithreset;
3485 }
3486
3487 /*
3488 * In SYN_RECEIVED state, if we recv some SYNs with
3489 * window scale and others without, window scaling should
3490 * be disabled. Otherwise the window advertised will be
3491 * lower if we assume scaling and the other end does not.
3492 */
3493 if ((thflags & TH_SYN) &&
3494 (tp->irs == th->th_seq) &&
3495 !(to.to_flags & TOF_SCALE)) {
3496 tp->t_flags &= ~TF_RCVD_SCALE;
3497 }
3498 break;
3499
3500 /*
3501 * If the state is SYN_SENT:
3502 * if seg contains an ACK, but not for our SYN, drop the input.
3503 * if seg contains a RST, then drop the connection.
3504 * if seg does not contain SYN, then drop it.
3505 * Otherwise this is an acceptable SYN segment
3506 * initialize tp->rcv_nxt and tp->irs
3507 * if seg contains ack then advance tp->snd_una
3508 * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
3509 * arrange for segment to be acked (eventually)
3510 * continue processing rest of data/controls, beginning with URG
3511 */
3512 case TCPS_SYN_SENT:
3513 if ((thflags & TH_ACK) &&
3514 (SEQ_LEQ(th->th_ack, tp->iss) ||
3515 SEQ_GT(th->th_ack, tp->snd_max))) {
3516 IF_TCP_STATINC(ifp, ooopacket);
3517 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT bad ACK");
3518 goto dropwithreset;
3519 }
3520 if (thflags & TH_RST) {
3521 if ((thflags & TH_ACK) != 0) {
3522 if (tfo_enabled(tp) &&
3523 !(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE)) {
3524 tcp_heuristic_tfo_rst(tp);
3525 }
3526 if ((tp->ecn_flags & (TE_SETUPSENT | TE_RCVD_SYN_RST)) == TE_SETUPSENT ||
3527 (tp->ecn_flags & (TE_ACE_SETUPSENT | TE_RCVD_SYN_RST)) == TE_ACE_SETUPSENT) {
3528 /*
3529 * On local connections, send
3530 * non-ECN syn one time before
3531 * dropping the connection
3532 */
3533 if (tp->t_flags & TF_LOCAL) {
3534 tp->ecn_flags |= TE_RCVD_SYN_RST;
3535 goto drop;
3536 } else {
3537 tcp_heuristic_ecn_synrst(tp);
3538 }
3539 }
3540 soevent(so,
3541 (SO_FILT_HINT_LOCKED |
3542 SO_FILT_HINT_CONNRESET));
3543 tp = tcp_drop(tp, ECONNREFUSED);
3544 }
3545 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT got RST");
3546 goto drop;
3547 }
3548 if ((thflags & TH_SYN) == 0) {
3549 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_SENT no SYN");
3550 goto drop;
3551 }
3552 tp->snd_wnd = th->th_win; /* initial send window */
3553 tp->max_sndwnd = tp->snd_wnd;
3554
3555 tp->irs = th->th_seq;
3556 tcp_rcvseqinit(tp);
3557 if (thflags & TH_ACK) {
3558 /* Client processes SYN-ACK */
3559 tcpstat.tcps_connects++;
3560
3561 const uint32_t ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE;
3562
3563 if ((thflags & (TH_ECE | TH_CWR)) == (TH_ECE)) {
3564 /* Receiving Any|0|1 is classic ECN-setup SYN-ACK */
3565 tp->ecn_flags |= TE_SETUPRECEIVED;
3566 if (TCP_ECN_ENABLED(tp)) {
3567 tcp_heuristic_ecn_success(tp);
3568 tcpstat.tcps_ecn_client_success++;
3569 }
3570
3571 if (tp->ecn_flags & TE_ACE_SETUPSENT) {
3572 /*
3573 * Sent AccECN SYN but received classic ECN SYN-ACK
3574 * Set classic ECN related flags
3575 */
3576 tp->ecn_flags |= (TE_SETUPSENT | TE_SENDIPECT);
3577 tp->ecn_flags &= ~TE_ACE_SETUPSENT;
3578 if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) {
3579 tp->t_client_accecn_state = tcp_connection_client_classic_ecn_available;
3580 }
3581 }
3582 } else if (TCP_ACC_ECN_ENABLED(tp) && ace_flags != 0 &&
3583 ace_flags != TH_ACE) {
3584 /* Initialize sender side packet & byte counters */
3585 tp->t_snd_ce_packets = 5;
3586 tp->t_snd_ect1_bytes = tp->t_snd_ect0_bytes = 1;
3587 tp->t_snd_ce_bytes = 0;
3588 tp->ecn_flags |= TE_ACE_FINAL_ACK_3WHS;
3589 /*
3590 * Client received AccECN SYN-ACK that reflects the state (ECN)
3591 * in which SYN packet was delivered. This helps to detect if
3592 * there was mangling of the SYN packet on the path. Currently, we
3593 * only send Not-ECT on SYN packets. So, we should set Not-ECT in
3594 * all packets if we receive any encoding other than 0|TH_CWR|0.
3595 * If 0|0|0 and 1|1|1 were received, fail Accurate ECN negotiation
3596 * by not setting TE_ACE_SETUPRECEIVED.
3597 */
3598 switch (ace_flags) {
3599 case (0 | TH_CWR | 0):
3600 /* Non-ECT SYN was delivered */
3601 tp->ecn_flags |= TE_ACE_SETUPRECEIVED;
3602 tcpstat.tcps_ecn_ace_syn_not_ect++;
3603 tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success;
3604 break;
3605 case (0 | TH_CWR | TH_ECE):
3606 /* ECT1 SYN was delivered */
3607 tp->ecn_flags |= TE_ACE_SETUPRECEIVED;
3608 /* Mangling detected, set Non-ECT on outgoing packets */
3609 tp->ecn_flags &= ~TE_SENDIPECT;
3610 tcpstat.tcps_ecn_ace_syn_ect1++;
3611 tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected;
3612 break;
3613 case (TH_AE | 0 | 0):
3614 /* ECT0 SYN was delivered */
3615 tp->ecn_flags |= TE_ACE_SETUPRECEIVED;
3616 /* Mangling detected, set Non-ECT on outgoing packets */
3617 tp->ecn_flags &= ~TE_SENDIPECT;
3618 tcpstat.tcps_ecn_ace_syn_ect0++;
3619 tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected;
3620 break;
3621 case (TH_AE | TH_CWR | 0):
3622 /* CE SYN was delivered */
3623 tp->ecn_flags |= TE_ACE_SETUPRECEIVED;
3624 /* Mangling detected, set Non-ECT on outgoing packets */
3625 tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_success_ect_mangling_detected;
3626 tp->ecn_flags &= ~TE_SENDIPECT;
3627 /*
3628 * Although we don't send ECT SYN yet, it is possible that
3629 * a network element changed Not-ECT to ECT and later there
3630 * was congestion at another network element that set it to CE.
3631 * To keep it simple, we will consider this as a congestion event
3632 * for the congestion controller.
3633 * If a TCP client in AccECN mode receives CE feedback in the TCP
3634 * flags of a SYN/ACK, it MUST NOT increment s.cep.
3635 */
3636 tcpstat.tcps_ecn_ace_syn_ce++;
3637 break;
3638 default:
3639 break;
3640 }
3641 if (TCP_ECN_ENABLED(tp)) {
3642 tcp_heuristic_ecn_success(tp);
3643 tcpstat.tcps_ecn_client_success++;
3644 }
3645 /*
3646 * A TCP client in AccECN mode MUST feed back which of the 4
3647 * possible values of the IP-ECN field that was received in the
3648 * SYN/ACK. Set the setup flag for final ACK accordingly.
3649 * We will initialize r.cep, r.e1b, r.e0b first and then increment
3650 * if CE was set on the IP-ECN field of the SYN-ACK.
3651 */
3652 tp->t_rcv_ce_packets = 5;
3653 tp->t_rcv_ect0_bytes = tp->t_rcv_ect1_bytes = 1;
3654 tp->t_rcv_ce_bytes = 0;
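/*
 * Starting the CE packet counter at 5 mirrors the initial ACE value
 * required by the Accurate ECN draft, so a zeroed or bleached field can
 * be told apart from a genuine counter; the ECT byte counters likewise
 * start at a small non-zero offset.
 */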
3655
3656 /* Increment packet & byte counters based on IP-ECN */
3657 tcp_input_ip_ecn(tp, inp, (uint32_t)tlen, (uint32_t)segment_count, ip_ecn);
3658
3659 switch (ip_ecn) {
3660 case IPTOS_ECN_NOTECT:
3661 /* Not-ECT SYN-ACK was received */
3662 tp->ecn_flags |= TE_ACE_SETUP_NON_ECT;
3663 break;
3664 case IPTOS_ECN_ECT1:
3665 /* ECT1 SYN-ACK was received */
3666 tp->ecn_flags |= TE_ACE_SETUP_ECT1;
3667 break;
3668 case IPTOS_ECN_ECT0:
3669 /* ECT0 SYN-ACK was received */
3670 tp->ecn_flags |= TE_ACE_SETUP_ECT0;
3671 break;
3672 case IPTOS_ECN_CE:
3673 tp->ecn_flags |= TE_ACE_SETUP_CE;
3674 break;
3675 }
3676 } else {
3677 if ((tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) &&
3678 tp->t_rxtshift == 0) {
3679 tcp_heuristic_ecn_success(tp);
3680 tcpstat.tcps_ecn_not_supported++;
3681 }
3682 if ((tp->ecn_flags & (TE_SETUPSENT | TE_ACE_SETUPSENT)) &&
3683 tp->t_rxtshift > 0) {
3684 tcp_heuristic_ecn_loss(tp);
3685 }
3686
3687 /* non-ECN-setup SYN-ACK */
3688 tp->ecn_flags &= ~TE_SENDIPECT;
3689 /*
3690 * If Accurate ECN SYN was retransmitted twice and non-ECN SYN-ACK
3691 * was received, then we consider it as Accurate ECN blackholing
3692 */
3693 if ((tp->ecn_flags & TE_LOST_SYN) && tp->t_rxtshift <= 2 &&
3694 tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) {
3695 tp->t_client_accecn_state = tcp_connection_client_accurate_ecn_negotiation_blackholed;
3696 }
3697 /*
3698 * If SYN wasn't retransmitted twice yet, the server supports neither classic nor
3699 * accurate ECN SYN-ACK. Accurate ECN should already be disabled for both half connections
3700 * as TE_ACE_SETUPRECEIVED flag is not set.
3701 */
3702 if (tp->t_client_accecn_state == tcp_connection_client_accurate_ecn_feature_enabled) {
3703 tp->t_client_accecn_state = tcp_connection_client_ecn_not_available;
3704 }
3705 }
3706
3707 /* Do window scaling on this connection? */
3708 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
3709 tp->snd_scale = tp->requested_s_scale;
3710 tp->rcv_scale = tp->request_r_scale;
3711 }
3712
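/*
 * Compute the receive window we are about to advertise (capped by the
 * scaled maximum and, for a low-priority rledbat receiver, by its
 * window) and move rcv_adv, the right edge advertised so far, past it.
 */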
3713 uint32_t recwin = min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale);
3714 if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.get_rlwin != NULL) {
3715 /* For a LBE receiver, also use rledbat_win */
3716 uint32_t rledbat_win = tcp_cc_rledbat.get_rlwin(tp);
3717 if (rledbat_win > 0) {
3718 recwin = min(recwin, rledbat_win);
3719 }
3720 }
3721 tp->rcv_adv += recwin;
3722
3723 tp->snd_una++; /* SYN is acked */
3724 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
3725 tp->snd_nxt = tp->snd_una;
3726 }
3727
3728 /*
3729 * We have sent more in the SYN than what is being
3730 * acked. (e.g., TFO)
3731 * We should restart the sending from what the receiver
3732 * has acknowledged immediately.
3733 */
3734 if (SEQ_GT(tp->snd_nxt, th->th_ack)) {
3735 /*
3736 * rdar://problem/33214601
3737 * There is a middlebox that acks all but one
3738 * byte and still drops the data.
3739 */
3740 if (!(tp->t_flagsext & TF_FASTOPEN_FORCE_ENABLE) &&
3741 (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
3742 tp->snd_max == th->th_ack + 1 &&
3743 tp->snd_max > tp->snd_una + 1) {
3744 tcp_heuristic_tfo_middlebox(tp);
3745
3746 so->so_error = ENODATA;
3747 soevent(so,
3748 (SO_FILT_HINT_LOCKED | SO_FILT_HINT_MP_SUB_ERROR));
3749
3750 tp->t_tfo_stats |= TFO_S_ONE_BYTE_PROXY;
3751 }
3752
3753 tp->snd_max = tp->snd_nxt = th->th_ack;
3754 }
3755
3756 /*
3757 * If there's data, delay ACK; if there's also a FIN
3758 * ACKNOW will be turned on later.
3759 */
3760 TCP_INC_VAR(tp->t_unacksegs, segment_count);
3761 if (TCP_ACC_ECN_ON(tp) && ip_ecn == IPTOS_ECN_CE) {
3762 TCP_INC_VAR(tp->t_unacksegs_ce, segment_count);
3763 }
3764 if (DELAY_ACK(tp, th) && tlen != 0) {
3765 if ((tp->t_flags & TF_DELACK) == 0) {
3766 tp->t_flags |= TF_DELACK;
3767 tp->t_timer[TCPT_DELACK] = OFFSET_FROM_START(tp, tcp_delack);
3768 }
3769 } else {
3770 tp->t_flags |= TF_ACKNOW;
3771 }
3772 /*
3773 * Received <SYN,ACK> in SYN_SENT[*] state.
3774 * Transitions:
3775 * SYN_SENT --> ESTABLISHED
3776 * SYN_SENT* --> FIN_WAIT_1
3777 */
3778 tp->t_starttime = tcp_now;
3779 tcp_sbrcv_tstmp_check(tp);
3780 if (tp->t_flags & TF_NEEDFIN) {
3781 DTRACE_TCP4(state__change, void, NULL,
3782 struct inpcb *, inp,
3783 struct tcpcb *, tp, int32_t,
3784 TCPS_FIN_WAIT_1);
3785 TCP_LOG_STATE(tp, TCPS_FIN_WAIT_1);
3786 tp->t_state = TCPS_FIN_WAIT_1;
3787 tp->t_flags &= ~TF_NEEDFIN;
3788 thflags &= ~TH_SYN;
3789
3790 TCP_LOG_CONNECTION_SUMMARY(tp);
3791 } else {
3792 DTRACE_TCP4(state__change, void, NULL,
3793 struct inpcb *, inp, struct tcpcb *,
3794 tp, int32_t, TCPS_ESTABLISHED);
3795 TCP_LOG_STATE(tp, TCPS_ESTABLISHED);
3796 tp->t_state = TCPS_ESTABLISHED;
3797 tp->t_timer[TCPT_KEEP] =
3798 OFFSET_FROM_START(tp,
3799 TCP_CONN_KEEPIDLE(tp));
3800 if (nstat_collect) {
3801 nstat_route_connect_success(
3802 inp->inp_route.ro_rt);
3803 }
3804 TCP_LOG_CONNECTED(tp, 0);
3805 /*
3806 * The SYN is acknowledged but una is not
3807 * updated yet. So pass the value of
3808 * ack to compute sndbytes correctly
3809 */
3810 inp_count_sndbytes(inp, th->th_ack);
3811 }
3812 tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
3813#if MPTCP
3814 /*
3815 * Do not send the connect notification for additional
3816 * subflows until ACK for 3-way handshake arrives.
3817 */
3818 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
3819 (tp->t_mpflags & TMPF_SENT_JOIN)) {
3820 isconnected = FALSE;
3821 } else
3822#endif /* MPTCP */
3823 isconnected = TRUE;
3824
3825 if ((tp->t_tfo_flags & (TFO_F_COOKIE_REQ | TFO_F_COOKIE_SENT)) ||
3826 (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT)) {
3827 tcp_tfo_synack(tp, &to);
3828
3829 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
3830 SEQ_LT(tp->snd_una, th->th_ack)) {
3831 tp->t_tfo_stats |= TFO_S_SYN_DATA_ACKED;
3832 tcpstat.tcps_tfo_syn_data_acked++;
3833#if MPTCP
3834 if (so->so_flags & SOF_MP_SUBFLOW) {
3835 so->so_flags1 |= SOF1_TFO_REWIND;
3836 }
3837#endif
3838 tcp_tfo_rcv_probe(tp, tlen);
3839 }
3840 }
3841 } else {
3842 /*
3843 * Received initial SYN in SYN-SENT[*] state => simul-
3844 * taneous open.
3845 * Do 3-way handshake:
3846 * SYN-SENT -> SYN-RECEIVED
3847 * SYN-SENT* -> SYN-RECEIVED*
3848 */
3849 tp->t_flags |= TF_ACKNOW;
3850 tp->t_timer[TCPT_REXMT] = 0;
3851 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
3852 struct tcpcb *, tp, int32_t, TCPS_SYN_RECEIVED);
3853 TCP_LOG_STATE(tp, TCPS_SYN_RECEIVED);
3854 tp->t_state = TCPS_SYN_RECEIVED;
3855
3856 /*
3857 * During simultaneous open, TFO should not be used.
3858 * So, we disable it here to prevent data from being
3859 * sent on the SYN/ACK.
3860 */
3861 tcp_disable_tfo(tp);
3862 }
3863
3864trimthenstep6:
3865 /*
3866 * Advance th->th_seq to correspond to first data byte.
3867 * If data, trim to stay within window,
3868 * dropping FIN if necessary.
3869 */
3870 th->th_seq++;
3871 if (tlen > tp->rcv_wnd) {
3872 todrop = tlen - tp->rcv_wnd;
3873 m_adj(m, -todrop);
3874 tlen = tp->rcv_wnd;
3875 thflags &= ~TH_FIN;
3876 tcpstat.tcps_rcvpackafterwin++;
3877 tcpstat.tcps_rcvbyteafterwin += todrop;
3878 }
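/*
 * Prime snd_wl1 just below the peer's initial sequence number so the
 * first real segment always looks "newer" to the window-update check
 * and refreshes snd_wnd; the receive urgent pointer starts in line
 * with rcv_nxt.
 */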
3879 tp->snd_wl1 = th->th_seq - 1;
3880 tp->rcv_up = th->th_seq;
3881 /*
3882 * Client side of transaction: already sent SYN and data.
3883 * If the remote host used T/TCP to validate the SYN,
3884 * our data will be ACK'd; if so, enter normal data segment
3885 * processing in the middle of step 5, ack processing.
3886 * Otherwise, goto step 6.
3887 */
3888 if (thflags & TH_ACK) {
3889 goto process_ACK;
3890 }
3891 goto step6;
3892 /*
3893 * If the state is LAST_ACK or CLOSING or TIME_WAIT:
3894 * do normal processing.
3895 *
3896 * NB: Leftover from RFC1644 T/TCP. Cases to be reused later.
3897 */
3898 case TCPS_LAST_ACK:
3899 case TCPS_CLOSING:
3900 case TCPS_TIME_WAIT:
3901 break; /* continue normal processing */
3902
3903 /* Received a SYN while connection is already established.
3904 * This is a "half open connection and other anomalies" described
3905 * in RFC 793 page 34; send an ACK so the remote resets the connection
3906 * or recovers by adjusting its sequence numbering. Sending an ACK is
3907 * in accordance with RFC 5961 Section 4.2
3908 *
3909 * For Accurate ECN, if we receive a packet with SYN in ESTABLISHED
3910 * state, we don't send the handshake encoding.
3911 */
3912 case TCPS_ESTABLISHED:
3913 if (thflags & TH_SYN && tlen <= 0) {
3914 /* Drop the packet silently if we have reached the limit */
3915 if (tcp_is_ack_ratelimited(tp)) {
3916 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited");
3917 goto drop;
3918 } else {
3919 /* Send challenge ACK */
3920 tcpstat.tcps_synchallenge++;
3921 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK");
3922 goto dropafterack;
3923 }
3924 }
3925 break;
3926 }
3927
3928 /*
3929 * States other than LISTEN or SYN_SENT.
3930 * First check the RST flag and sequence number since reset segments
3931 * are exempt from the timestamp and connection count tests. This
3932 * fixes a bug introduced by the Stevens, vol. 2, p. 960 bugfix
3933 * below which allowed reset segments in half the sequence space
3934 * to fall through and be processed (which gives forged reset
3935 * segments with a random sequence number a 50 percent chance of
3936 * killing a connection).
3937 * Then check timestamp, if present.
3938 * Then check the connection count, if present.
3939 * Then check that at least some bytes of segment are within
3940 * receive window. If segment begins before rcv_nxt,
3941 * drop leading data (and SYN); if nothing left, just ack.
3942 *
3943 *
3944 * If the RST bit is set, check the sequence number to see
3945 * if this is a valid reset segment.
3946 * RFC 793 page 37:
3947 * In all states except SYN-SENT, all reset (RST) segments
3948 * are validated by checking their SEQ-fields. A reset is
3949 * valid if its sequence number is in the window.
3950 * Note: this does not take into account delayed ACKs, so
3951 * we should test against last_ack_sent instead of rcv_nxt.
3952 * The sequence number in the reset segment is normally an
3953 * echo of our outgoing acknowledgement numbers, but some hosts
3954 * send a reset with the sequence number at the rightmost edge
3955 * of our receive window, and we have to handle this case.
3956 * Note 2: Paul Watson's paper "Slipping in the Window" has shown
3957 * that brute force RST attacks are possible. To combat this,
3958 * we use a much stricter check while in the ESTABLISHED state,
3959 * only accepting RSTs where the sequence number is equal to
3960 * last_ack_sent. In all other states (the states in which a
3961 * RST is more likely), the more permissive check is used.
3962 * RFC 5961 Section 3.2: if the RST bit is set, sequence # is
3963 * within the receive window and last_ack_sent == seq,
3964 * then reset the connection. Otherwise if the seq doesn't
3965 * match last_ack_sent, TCP must send challenge ACK. Perform
3966 * rate limitation when sending the challenge ACK.
3967 * If we have multiple segments in flight, the initial reset
3968 * segment sequence numbers will be to the left of last_ack_sent,
3969 * but they will eventually catch up.
3970 * In any case, it never made sense to trim reset segments to
3971 * fit the receive window since RFC 1122 says:
3972 * 4.2.2.12 RST Segment: RFC-793 Section 3.4
3973 *
3974 * A TCP SHOULD allow a received RST segment to include data.
3975 *
3976 * DISCUSSION
3977 * It has been suggested that a RST segment could contain
3978 * ASCII text that encoded and explained the cause of the
3979 * RST. No standard has yet been established for such
3980 * data.
3981 *
3982 * If the reset segment passes the sequence number test examine
3983 * the state:
3984 * SYN_RECEIVED STATE:
3985 * If passive open, return to LISTEN state.
3986 * If active open, inform user that connection was refused.
3987 * ESTABLISHED, FIN_WAIT_1, FIN_WAIT_2, CLOSE_WAIT STATES:
3988 * Inform user that connection was reset, and close tcb.
3989 * CLOSING, LAST_ACK STATES:
3990 * Close the tcb.
3991 * TIME_WAIT STATE:
3992 * Drop the segment - see Stevens, vol. 2, p. 964 and
3993 * RFC 1337.
3994 *
3995 * Radar 4803931: Allows for the case where we ACKed the FIN but
3996 * there is already a RST in flight from the peer.
3997 * In that case, accept the RST for non-established
3998 * state if it's one off from last_ack_sent.
3999 *
4000 */
4001 if (thflags & TH_RST) {
4002 if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
4003 SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
4004 (tp->rcv_wnd == 0 &&
4005 ((tp->last_ack_sent == th->th_seq) ||
4006 ((tp->last_ack_sent - 1) == th->th_seq)))) {
4007 if (tp->last_ack_sent == th->th_seq) {
4008 switch (tp->t_state) {
4009 case TCPS_SYN_RECEIVED:
4010 IF_TCP_STATINC(ifp, rstinsynrcv);
4011 so->so_error = ECONNREFUSED;
4012 goto close;
4013
4014 case TCPS_ESTABLISHED:
4015 if ((TCP_ECN_ENABLED(tp) || TCP_ACC_ECN_ON(tp)) &&
4016 tp->snd_una == tp->iss + 1 &&
4017 SEQ_GT(tp->snd_max, tp->snd_una)) {
4018 /*
4019 * If the first data packet on an
4020 * ECN connection receives a RST,
4021 * increment the heuristic
4022 */
4023 tcp_heuristic_ecn_droprst(tp);
4024 }
4025 OS_FALLTHROUGH;
4026 case TCPS_FIN_WAIT_1:
4027 case TCPS_CLOSE_WAIT:
4028 case TCPS_FIN_WAIT_2:
4029 so->so_error = ECONNRESET;
4030close:
4031 soevent(so,
4032 (SO_FILT_HINT_LOCKED |
4033 SO_FILT_HINT_CONNRESET));
4034
4035 tcpstat.tcps_drops++;
4036 tp = tcp_close(tp);
4037 break;
4038
4039 case TCPS_CLOSING:
4040 case TCPS_LAST_ACK:
4041 tp = tcp_close(tp);
4042 break;
4043
4044 case TCPS_TIME_WAIT:
4045 break;
4046 }
4047 } else {
4048 tcpstat.tcps_badrst++;
4049 /* Drop if we have reached the ACK limit */
4050 if (tcp_is_ack_ratelimited(tp)) {
4051 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 rate limited");
4052 goto drop;
4053 } else {
4054 /* Send challenge ACK */
4055 tcpstat.tcps_rstchallenge++;
4056 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "ESTABLISHED rfc5961 challenge ACK");
4057 goto dropafterack;
4058 }
4059 }
4060 }
4061 goto drop;
4062 }
4063
4064 /*
4065 * RFC 1323 PAWS: If we have a timestamp reply on this segment
4066 * and it's less than ts_recent, drop it.
4067 */
4068 if ((to.to_flags & TOF_TS) != 0 && tp->ts_recent &&
4069 TSTMP_LT(to.to_tsval, tp->ts_recent)) {
4070 /* Check to see if ts_recent is over 24 days old. */
4071 if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
4072 /*
4073 * Invalidate ts_recent. If this segment updates
4074 * ts_recent, the age will be reset later and ts_recent
4075 * will get a valid value. If it does not, setting
4076 * ts_recent to zero will at least satisfy the
4077 * requirement that zero be placed in the timestamp
4078 * echo reply when ts_recent isn't valid. The
4079 * age isn't reset until we get a valid ts_recent
4080 * because we don't want out-of-order segments to be
4081 * dropped when ts_recent is old.
4082 */
4083 tp->ts_recent = 0;
4084 } else {
4085 tcpstat.tcps_rcvduppack++;
4086 tcpstat.tcps_rcvdupbyte += tlen;
4087 tp->t_pawsdrop++;
4088 tcpstat.tcps_pawsdrop++;
4089
4090 /*
4091 * PAWS-drop when ECN is being used? That indicates
4092 * that ECT-marked packets take a different path, with
4093 * different congestion-characteristics.
4094 *
4095 * Only fallback when we did send less than 2GB as PAWS
4096 * really has no reason to kick in earlier.
4097 */
4098 if ((TCP_ECN_ENABLED(tp) || TCP_ACC_ECN_ON(tp)) &&
4099 inp->inp_stat->rxbytes < 2147483648) {
4100 INP_INC_IFNET_STAT(inp, ecn_fallback_reorder);
4101 tcpstat.tcps_ecn_fallback_reorder++;
4102 tcp_heuristic_ecn_aggressive(tp);
4103 }
4104
4105 if (nstat_collect) {
4106 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt,
4107 1, tlen, NSTAT_RX_FLAG_DUPLICATE);
4108 INP_ADD_STAT(inp, cell, wifi, wired,
4109 rxpackets, 1);
4110 INP_ADD_STAT(inp, cell, wifi, wired,
4111 rxbytes, tlen);
4112 tp->t_stat.rxduplicatebytes += tlen;
4113 inp_set_activity_bitmap(inp);
4114 }
4115 if (tlen > 0) {
4116 goto dropafterack;
4117 }
4118 goto drop;
4119 }
4120 }
4121
4122 /*
4123 * In the SYN-RECEIVED state, validate that the packet belongs to
4124 * this connection before trimming the data to fit the receive
4125 * window. Check the sequence number versus IRS since we know
4126 * the sequence numbers haven't wrapped. This is a partial fix
4127 * for the "LAND" DoS attack.
4128 */
4129 if (tp->t_state == TCPS_SYN_RECEIVED && SEQ_LT(th->th_seq, tp->irs)) {
4130 IF_TCP_STATINC(ifp, dospacket);
4131 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SYN_RECEIVED bad SEQ");
4132 goto dropwithreset;
4133 }
4134
4135 /*
4136 * Check if there is old data at the beginning of the window
4137 * i.e. the sequence number is before rcv_nxt
4138 */
4139 todrop = tp->rcv_nxt - th->th_seq;
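/*
 * e.g. with rcv_nxt = 1000 and a segment covering [900, 1200),
 * todrop = 100: those first 100 bytes were already delivered and are
 * stripped below before the remainder of the segment is processed.
 */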
4140 if (todrop > 0) {
4141 boolean_t is_syn_set = FALSE;
4142
4143 if (thflags & TH_SYN) {
4144 is_syn_set = TRUE;
4145 thflags &= ~TH_SYN;
4146 th->th_seq++;
4147 if (th->th_urp > 1) {
4148 th->th_urp--;
4149 } else {
4150 thflags &= ~TH_URG;
4151 }
4152 todrop--;
4153 }
4154 /*
4155 * Following if statement from Stevens, vol. 2, p. 960.
4156 * The amount of duplicate data is greater than or equal
4157 * to the size of the segment - entire segment is duplicate
4158 */
4159 if (todrop > tlen
4160 || (todrop == tlen && (thflags & TH_FIN) == 0)) {
4161 /*
4162 * Any valid FIN must be to the left of the window.
4163 * At this point the FIN must be a duplicate or out
4164 * of sequence; drop it.
4165 */
4166 thflags &= ~TH_FIN;
4167
4168 /*
4169 * Send an ACK to resynchronize and drop any data.
4170 * But keep on processing for RST or ACK.
4171 *
4172 * If the SYN bit was originally set, then only send
4173 * an ACK if we are not rate-limiting this connection.
4174 */
4175 if (is_syn_set) {
4176 if (!tcp_is_ack_ratelimited(tp)) {
4177 tcpstat.tcps_synchallenge++;
4178 tp->t_flags |= TF_ACKNOW;
4179 }
4180 } else {
4181 tp->t_flags |= TF_ACKNOW;
4182 }
4183
4184 if (todrop == 1) {
4185 /* This could be a keepalive */
4186 soevent(so, SO_FILT_HINT_LOCKED |
4187 SO_FILT_HINT_KEEPALIVE);
4188 }
4189 todrop = tlen;
4190 tcpstat.tcps_rcvduppack++;
4191 tcpstat.tcps_rcvdupbyte += todrop;
4192 } else {
4193 tcpstat.tcps_rcvpartduppack++;
4194 tcpstat.tcps_rcvpartdupbyte += todrop;
4195 }
4196
4197 if (todrop > 1) {
4198 /*
4199 * Note the duplicate data sequence space so that
4200 * it can be reported in DSACK option.
4201 */
4202 tp->t_dsack_lseq = th->th_seq;
4203 tp->t_dsack_rseq = th->th_seq + todrop;
4204 tp->t_flags |= TF_ACKNOW;
4205 }
4206 if (nstat_collect) {
4207 nstat_route_rx(tp->t_inpcb->inp_route.ro_rt, 1,
4208 todrop, NSTAT_RX_FLAG_DUPLICATE);
4209 INP_ADD_STAT(inp, cell, wifi, wired, rxpackets, 1);
4210 INP_ADD_STAT(inp, cell, wifi, wired, rxbytes, todrop);
4211 tp->t_stat.rxduplicatebytes += todrop;
4212 inp_set_activity_bitmap(inp);
4213 }
4214 drop_hdrlen += todrop; /* drop from the top afterwards */
4215 th->th_seq += todrop;
4216 tlen -= todrop;
4217 if (th->th_urp > todrop) {
4218 th->th_urp -= todrop;
4219 } else {
4220 thflags &= ~TH_URG;
4221 th->th_urp = 0;
4222 }
4223 }
4224
4225 /*
4226 * If new data are received on a connection after the user
4227 * processes are gone, then RST the other end.
4228 * Send also a RST when we received a data segment after we've
4229 * sent our FIN when the socket is defunct.
4230 * Note that an MPTCP subflow socket would have SS_NOFDREF set
4231 * by default. So, if it's an MPTCP-subflow we rather check the
4232 * MPTCP-level's socket state for SS_NOFDREF.
4233 */
4234 if (tlen) {
4235 boolean_t close_it = FALSE;
4236
4237 if (!(so->so_flags & SOF_MP_SUBFLOW) && (so->so_state & SS_NOFDREF) &&
4238 tp->t_state > TCPS_CLOSE_WAIT) {
4239 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SS_NOFDREF");
4240 close_it = TRUE;
4241 }
4242
4243 if ((so->so_flags & SOF_MP_SUBFLOW) && (mptetoso(tptomptp(tp)->mpt_mpte)->so_state & SS_NOFDREF) &&
4244 tp->t_state > TCPS_CLOSE_WAIT) {
4245 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_MP_SUBFLOW SS_NOFDREF");
4246 close_it = TRUE;
4247 }
4248
4249 if ((so->so_flags & SOF_DEFUNCT) && tp->t_state > TCPS_FIN_WAIT_1) {
4250 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SOF_DEFUNCT");
4251 close_it = TRUE;
4252 }
4253
4254 if (so->so_state & SS_CANTRCVMORE) {
4255 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "SS_CANTRCVMORE");
4256 close_it = TRUE;
4257 }
4258
4259 if (close_it) {
4260 tp = tcp_close(tp);
4261 tcpstat.tcps_rcvafterclose++;
4262 IF_TCP_STATINC(ifp, cleanup);
4263 goto dropwithreset;
4264 }
4265 }
4266
4267 /*
4268 * If segment ends after window, drop trailing data
4269 * (and PUSH and FIN); if nothing left, just ACK.
4270 */
4271 todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
4272 if (todrop > 0) {
4273 tcpstat.tcps_rcvpackafterwin++;
4274 if (todrop >= tlen) {
4275 tcpstat.tcps_rcvbyteafterwin += tlen;
4276 /*
4277 * If a new connection request is received
4278 * while in TIME_WAIT, drop the old connection
4279 * and start over if the sequence numbers
4280 * are above the previous ones.
4281 */
4282 if (thflags & TH_SYN &&
4283 tp->t_state == TCPS_TIME_WAIT &&
4284 SEQ_GT(th->th_seq, tp->rcv_nxt)) {
4285 iss = tcp_new_isn(tp);
4286 tp = tcp_close(tp);
4287 socket_unlock(so, 1);
4288 goto findpcb;
4289 }
4290 /*
4291 * If window is closed can only take segments at
4292 * window edge, and have to drop data and PUSH from
4293 * incoming segments. Continue processing, but
4294 * remember to ack. Otherwise, drop segment
4295 * and ack.
4296 */
4297 if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
4298 tp->t_flags |= TF_ACKNOW;
4299 tcpstat.tcps_rcvwinprobe++;
4300 } else {
4301 goto dropafterack;
4302 }
4303 } else {
4304 tcpstat.tcps_rcvbyteafterwin += todrop;
4305 }
4306 m_adj(m, -todrop);
4307 tlen -= todrop;
4308 thflags &= ~(TH_PUSH | TH_FIN);
4309 }
4310
4311 /*
4312 * If last ACK falls within this segment's sequence numbers,
4313 * record its timestamp.
4314 * NOTE:
4315 * 1) That the test incorporates suggestions from the latest
4316 * proposal of the tcplw@cray.com list (Braden 1993/04/26).
4317 * 2) That updating only on newer timestamps interferes with
4318 * our earlier PAWS tests, so this check should be solely
4319 * predicated on the sequence space of this segment.
4320 * 3) That we modify the segment boundary check to be
4321 * Last.ACK.Sent <= SEG.SEQ + SEG.Len
4322 * instead of RFC1323's
4323 * Last.ACK.Sent < SEG.SEQ + SEG.Len,
4324 * This modified check allows us to overcome RFC1323's
4325 * limitations as described in Stevens TCP/IP Illustrated
4326 * Vol. 2 p.869. In such cases, we can still calculate the
4327 * RTT correctly when RCV.NXT == Last.ACK.Sent.
4328 */
4329 if ((to.to_flags & TOF_TS) != 0 &&
4330 SEQ_LEQ(th->th_seq, tp->last_ack_sent) &&
4331 SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
4332 ((thflags & (TH_SYN | TH_FIN)) != 0))) {
4333 tp->ts_recent_age = tcp_now;
4334 tp->ts_recent = to.to_tsval;
4335 }
4336
4337 /*
4338 * Stevens: If a SYN is in the window, then this is an
4339 * error and we send an RST and drop the connection.
4340 *
4341 * RFC 5961 Section 4.2
4342 * Send challenge ACK for any SYN in synchronized state
4343 * Perform rate limitation in doing so.
4344 */
4345 if (thflags & TH_SYN) {
4346 if (!tcp_syn_data_valid(tp, th, tlen)) {
4347 tcpstat.tcps_badsyn++;
4348 /* Drop if we have reached ACK limit */
4349 if (tcp_is_ack_ratelimited(tp)) {
4350 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN rate limited");
4351 goto drop;
4352 } else {
4353 /* Send challenge ACK */
4354 tcpstat.tcps_synchallenge++;
4355 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad SYN challenge ack");
4356 goto dropafterack;
4357 }
4358 } else {
4359 /*
4360 * Received SYN (/ACK) with data.
4361 * Move sequence number along to process the data.
4362 */
4363 th->th_seq++;
4364 thflags &= ~TH_SYN;
4365 }
4366 }
4367
4368 /*
4369 * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN
4370 * flag is on (half-synchronized state), then queue data for
4371 * later processing; else drop segment and return.
4372 */
4373 if ((thflags & TH_ACK) == 0) {
4374 if (tp->t_state == TCPS_SYN_RECEIVED) {
4375 if ((tfo_enabled(tp))) {
4376 /*
4377 * So, we received a valid segment while in
4378 * SYN-RECEIVED.
4379 * As this cannot be an RST (see the check a bit
4380 * higher up), and it does not have the ACK flag
4381 * set, we want to retransmit the SYN/ACK.
4382 * Thus, we have to reset snd_nxt to snd_una to
4383 * trigger the going back to sending of the
4384 * SYN/ACK. This is more consistent with the
4385 * behavior of tcp_output(), which expects
4386 * to send the segment that is pointed to by
4387 * snd_nxt.
4388 */
4389 tp->snd_nxt = tp->snd_una;
4390
4391 /*
4392 * We need to make absolutely sure that we are
4393 * going to reply upon a duplicate SYN-segment.
4394 */
4395 if (th->th_flags & TH_SYN) {
4396 needoutput = 1;
4397 }
4398 }
4399 /* Process this same as newly received Accurate ECN SYN */
4400 int ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE;
4401 tcp_input_process_accecn_syn(tp, ace_flags, ip_ecn);
4402
4403 goto step6;
4404 } else if (tp->t_flags & TF_ACKNOW) {
4405 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK");
4406 goto dropafterack;
4407 } else {
4408 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "bad ACK");
4409 goto drop;
4410 }
4411 }
4412
4413 /*
4414 * Ack processing.
4415 */
4416
4417 switch (tp->t_state) {
4418 /*
4419 * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
4420 * ESTABLISHED state and continue processing.
4421 * The ACK was checked above.
4422 */
4423 case TCPS_SYN_RECEIVED:
4424
4425 tcpstat.tcps_connects++;
4426
4427 /* Do window scaling? */
4428 if (TCP_WINDOW_SCALE_ENABLED(tp)) {
4429 tp->snd_scale = tp->requested_s_scale;
4430 tp->rcv_scale = tp->request_r_scale;
4431 tp->snd_wnd = th->th_win << tp->snd_scale;
4432 tp->max_sndwnd = tp->snd_wnd;
4433 tiwin = tp->snd_wnd;
4434 }
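/*
 * e.g. with snd_scale = 7, a raw 16-bit window of 512 advertises
 * 512 << 7 = 65536 bytes of send window.
 */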
4435 /*
4436 * Make transitions:
4437 * SYN-RECEIVED -> ESTABLISHED
4438 * SYN-RECEIVED* -> FIN-WAIT-1
4439 */
4440 tp->t_starttime = tcp_now;
4441 tcp_sbrcv_tstmp_check(tp);
4442 if (tp->t_flags & TF_NEEDFIN) {
4443 DTRACE_TCP4(state__change, void, NULL,
4444 struct inpcb *, inp,
4445 struct tcpcb *, tp, int32_t, TCPS_FIN_WAIT_1);
4446 TCP_LOG_STATE(tp, TCPS_FIN_WAIT_1);
4447 tp->t_state = TCPS_FIN_WAIT_1;
4448 tp->t_flags &= ~TF_NEEDFIN;
4449
4450 TCP_LOG_CONNECTION_SUMMARY(tp);
4451 } else {
4452 DTRACE_TCP4(state__change, void, NULL,
4453 struct inpcb *, inp,
4454 struct tcpcb *, tp, int32_t, TCPS_ESTABLISHED);
4455 TCP_LOG_STATE(tp, TCPS_ESTABLISHED);
4456 tp->t_state = TCPS_ESTABLISHED;
4457 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
4458 TCP_CONN_KEEPIDLE(tp));
4459 if (nstat_collect) {
4460 nstat_route_connect_success(
4461 tp->t_inpcb->inp_route.ro_rt);
4462 }
4463 TCP_LOG_CONNECTED(tp, 0);
4464 /*
4465 * The SYN is acknowledged but una is not updated
4466 * yet. So pass the value of ack to compute
4467 * sndbytes correctly
4468 */
4469 inp_count_sndbytes(inp, th->th_ack);
4470 }
4471 tp->t_forced_acks = TCP_FORCED_ACKS_COUNT;
4472
4473 VERIFY(LIST_EMPTY(&tp->t_segq));
4474 tp->snd_wl1 = th->th_seq - 1;
4475
4476 /*
4477 * AccECN server in SYN-RCVD state received an ACK with
4478 * SYN=0, process handshake encoding present in the ACK for SYN-ACK
4479 * and update receive side counters.
4480 */
4481 if (TCP_ACC_ECN_ON(tp) && (thflags & (TH_SYN | TH_ACK)) == TH_ACK) {
4482 const uint32_t ace_flags = ((th->th_x2 << 8) | thflags) & TH_ACE;
4483 if (tlen == 0 && to.to_nsacks == 0) {
4484 /*
4485 * ACK for SYN-ACK reflects the state (ECN) in which SYN-ACK packet
4486 * was delivered. Use Table 4 of Accurate ECN draft to decode only
4487 * when a pure ACK with no SACK block is received.
4488 * 0|0|0 will fail Accurate ECN negotiation and disable ECN.
4489 */
4490 switch (ace_flags) {
4491 case (0 | TH_CWR | 0):
4492 /* Non-ECT SYN-ACK was delivered */
4493 tp->t_snd_ce_packets = 5;
4494 if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) {
4495 tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success;
4496 }
4497 break;
4498 case (0 | TH_CWR | TH_ECE):
4499 /* ECT1 SYN-ACK was delivered, mangling detected */
4500 OS_FALLTHROUGH;
4501 case (TH_AE | 0 | 0):
4502 /* ECT0 SYN-ACK was delivered, mangling detected */
4503 tp->t_snd_ce_packets = 5;
4504 if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) {
4505 tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success_ect_mangling_detected;
4506 }
4507 break;
4508 case (TH_AE | TH_CWR | 0):
4509 /*
4510 * CE SYN-ACK was delivered, even though mangling happened,
4511 * CE could indicate congestion at a node after mangling occurred.
4512 * Set cwnd to 2 segments
4513 */
4514 tp->t_snd_ce_packets = 6;
4515 tp->snd_cwnd = 2 * tp->t_maxseg;
4516 if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) {
4517 tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_success_ect_mangling_detected;
4518 }
4519 break;
4520 case (0 | 0 | 0):
4521 /* Disable ECN, as ACE fields were zeroed */
4522 tp->ecn_flags &= ~(TE_SETUPRECEIVED | TE_SENDIPECT |
4523 TE_SENDCWR | TE_ACE_SETUPRECEIVED);
4524 /*
4525 * Since last ACK has no ECN flag set and TE_LOST_SYNACK is set, this is in response
4526 * to the second (non-ECN setup) SYN-ACK retransmission. In such a case, we assume
4527 * that AccECN SYN-ACK was blackholed.
4528 */
4529 if ((tp->ecn_flags & TE_LOST_SYNACK) && tp->t_rxtshift <= 2 &&
4530 (tp->t_server_accecn_state == tcp_connection_server_classic_ecn_requested ||
4531 tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested)) {
4532 tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_negotiation_blackholed;
4533 }
4534 /*
4535 * SYN-ACK hasn't been retransmitted twice yet, so this could likely mean bleaching of ACE
4536 * on the path from client to server on last ACK.
4537 */
4538 if (tp->t_server_accecn_state == tcp_connection_server_accurate_ecn_requested) {
4539 tp->t_server_accecn_state = tcp_connection_server_accurate_ecn_ace_bleaching_detected;
4540 }
4541 break;
4542 default:
4543 /* Unused values for forward compatibility */
4544 tp->t_snd_ce_packets = 5;
4545 break;
4546 }
4547 }
4548 /* Increment receive side counters based on IP-ECN */
4549 tcp_input_ip_ecn(tp, inp, (uint32_t)tlen, (uint32_t)segment_count, ip_ecn);
4550 }
4551
4552#if MPTCP
4553 /*
4554 * Do not send the connect notification for additional subflows
4555 * until ACK for 3-way handshake arrives.
4556 */
4557 if ((!(tp->t_mpflags & TMPF_MPTCP_TRUE)) &&
4558 (tp->t_mpflags & TMPF_SENT_JOIN)) {
4559 isconnected = FALSE;
4560 } else
4561#endif /* MPTCP */
4562 isconnected = TRUE;
4563 if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
4564 /* Done this when receiving the SYN */
4565 isconnected = FALSE;
4566
4567 OSDecrementAtomic(&tcp_tfo_halfcnt);
4568
4569 /* Panic if something has gone terribly wrong. */
4570 VERIFY(tcp_tfo_halfcnt >= 0);
4571
4572 tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
4573 }
4574
4575 /*
4576 * In case there is data in the send-queue (e.g., TFO is being
4577 * used, or connectx+data has been done), then if we would
4578 * "FALLTHROUGH", we would handle this ACK as if data has been
4579 * acknowledged. But, we have to prevent this. And this
4580 * can be prevented by increasing snd_una by 1, so that the
4581 * SYN is not considered as data (snd_una++ is actually also
4582 * done in SYN_SENT-state as part of the regular TCP stack).
4583 *
4584 * In case there is data on this ack as well, the data will be
4585 * handled by the label "dodata" right after step6.
4586 */
4587 if (so->so_snd.sb_cc) {
4588 tp->snd_una++; /* SYN is acked */
4589 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
4590 tp->snd_nxt = tp->snd_una;
4591 }
4592
4593 /*
4594 * No duplicate-ACK handling is needed. So, we
4595 * directly advance to processing the ACK (aka,
4596 * updating the RTT estimation,...)
4597 *
4598 * But, we first need to handle eventual SACKs,
4599 * because TFO will start sending data with the
4600 * SYN/ACK, so it might be that the client
4601 * includes a SACK with its ACK.
4602 */
4603 if (SACK_ENABLED(tp) &&
4604 (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes))) {
4605 tcp_sack_doack(tp, &to, th, &sack_bytes_acked, &sack_bytes_newly_acked);
4606 }
4607
4608 goto process_ACK;
4609 }
4610
4611 OS_FALLTHROUGH;
4612
4613 /*
4614 * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
4615 * ACKs. If the ack is in the range
4616 * tp->snd_una < th->th_ack <= tp->snd_max
4617 * then advance tp->snd_una to th->th_ack and drop
4618 * data from the retransmission queue. If this ACK reflects
4619 * more up to date window information we update our window information.
4620 */
4621 case TCPS_ESTABLISHED:
4622 case TCPS_FIN_WAIT_1:
4623 case TCPS_FIN_WAIT_2:
4624 case TCPS_CLOSE_WAIT:
4625 case TCPS_CLOSING:
4626 case TCPS_LAST_ACK:
4627 case TCPS_TIME_WAIT:
4628 if (SEQ_GT(th->th_ack, tp->snd_max)) {
4629 tcpstat.tcps_rcvacktoomuch++;
4630 if (tcp_is_ack_ratelimited(tp)) {
4631 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 rcvacktoomuch");
4632 goto drop;
4633 } else {
4634 goto dropafterack;
4635 }
4636 }
4637 if (SEQ_LT(th->th_ack, tp->snd_una - tp->max_sndwnd)) {
4638 if (tcp_is_ack_ratelimited(tp)) {
4639 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "rfc5961 bad ACK");
4640 goto drop;
4641 } else {
4642 goto dropafterack;
4643 }
4644 }
4645 if (SACK_ENABLED(tp) && to.to_nsacks > 0) {
4646 recvd_dsack = tcp_sack_process_dsack(tp, &to, th);
4647 /*
4648 * If DSACK is received and this packet has no
4649 * other SACK information, it can be dropped.
4650 * We do not want to treat it as a duplicate ack.
4651 */
4652 if (recvd_dsack &&
4653 SEQ_LEQ(th->th_ack, tp->snd_una) &&
4654 to.to_nsacks == 0) {
4655 tcp_bad_rexmt_check(tp, th, &to);
4656 goto drop;
4657 }
4658 }
4659
4660 if (SACK_ENABLED(tp) &&
4661 (to.to_nsacks > 0 || !TAILQ_EMPTY(&tp->snd_holes))) {
4662 tcp_sack_doack(tp, &to, th, &sack_bytes_acked, &sack_bytes_newly_acked);
4663 }
4664
4665#if MPTCP
4666 if (tp->t_mpuna && SEQ_GEQ(th->th_ack, tp->t_mpuna)) {
4667 if (tp->t_mpflags & TMPF_PREESTABLISHED) {
4668 /* MP TCP establishment succeeded */
4669 tp->t_mpuna = 0;
4670 if (tp->t_mpflags & TMPF_JOINED_FLOW) {
4671 if (tp->t_mpflags & TMPF_SENT_JOIN) {
4672 tp->t_mpflags &=
4673 ~TMPF_PREESTABLISHED;
4674 tp->t_mpflags |=
4675 TMPF_MPTCP_TRUE;
4676
4677 tp->t_timer[TCPT_JACK_RXMT] = 0;
4678 tp->t_mprxtshift = 0;
4679 isconnected = TRUE;
4680 } else {
4681 isconnected = FALSE;
4682 }
4683 } else {
4684 isconnected = TRUE;
4685 }
4686 }
4687 }
4688#endif /* MPTCP */
4689
4690 tcp_tfo_rcv_ack(tp, th);
4691
4692 /*
4693 * If we have outstanding data (other than
4694 * a window probe), this is a completely
4695 * duplicate ack and the ack is the biggest we've seen.
4696 *
4697 * Need to accommodate a change in window on duplicate acks
4698 * to allow operating systems that update window during
4699 * recovery with SACK
4700 */
4701 if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
4702 if (tlen == 0 && (tiwin == tp->snd_wnd ||
4703 (to.to_nsacks > 0 && sack_bytes_acked > 0))) {
4704 uint32_t old_dupacks;
4705 /*
4706 * If both ends send FIN at the same time,
4707 * then the ack will be a duplicate ack
4708 * but we have to process the FIN. Check
4709 * for this condition and process the FIN
4710 * instead of the dupack
4711 */
4712 if ((thflags & TH_FIN) &&
4713 !TCPS_HAVERCVDFIN(tp->t_state)) {
4714 break;
4715 }
4716process_dupack:
4717 old_dupacks = tp->t_dupacks;
4718#if MPTCP
4719 /*
4720 * MPTCP options that are ignored must
4721 * not be treated as duplicate ACKs.
4722 */
4723 if (to.to_flags & TOF_MPTCP) {
4724 goto drop;
4725 }
4726
4727 if ((isconnected) && (tp->t_mpflags & TMPF_JOINED_FLOW)) {
4728 break;
4729 }
4730#endif /* MPTCP */
4731 /*
4732 * If a duplicate acknowledgement was seen
4733 * after ECN, it indicates packet loss in
4734 * addition to ECN. Reset INRECOVERY flag
4735 * so that we can process partial acks
4736 * correctly
4737 */
4738 if (tp->ecn_flags & TE_INRECOVERY) {
4739 tp->ecn_flags &= ~TE_INRECOVERY;
4740 }
4741
4742 tcpstat.tcps_rcvdupack++;
4743 if (SACK_ENABLED(tp) && tcp_do_better_lr) {
4744 tp->t_dupacks += max(1, sack_bytes_acked / tp->t_maxseg);
4745 } else {
4746 ++tp->t_dupacks;
4747 }
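/*
 * With the improved loss-recovery path, every MSS worth of SACKed data
 * counts as a duplicate ACK, so a single ACK that SACKs several
 * segments can reach the dupack threshold by itself.
 */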
4748
4749 tp->sackhint.sack_bytes_acked += sack_bytes_acked;
4750
4751 if (SACK_ENABLED(tp) && tcp_do_better_lr) {
4752 tp->t_new_dupacks += (sack_bytes_newly_acked / tp->t_maxseg);
4753
4754 if (tp->t_new_dupacks >= tp->t_rexmtthresh && IN_FASTRECOVERY(tp)) {
4755 /* Let's restart the retransmission */
4756 tcp_sack_lost_rexmit(tp);
4757
4758 /*
4759 * If the current tcp cc module has
4760 * defined a hook for tasks to run
4761 * before entering FR, call it
4762 */
4763 if (CC_ALGO(tp)->pre_fr != NULL) {
4764 CC_ALGO(tp)->pre_fr(tp);
4765 }
4766
4767 ENTER_FASTRECOVERY(tp);
4768
4769 if (tp->t_flags & TF_SENTFIN) {
4770 tp->snd_recover = tp->snd_max - 1;
4771 } else {
4772 tp->snd_recover = tp->snd_max;
4773 }
4774 tp->t_rtttime = 0;
4775 /*
4776 * Accurate ECN Sender MUST NOT set CWR to indicate
4777 * it has received and responded to indications
4778 * of congestion. ACE field is used to reflect counters
4779 * that are continuously updated, overloading the CWR bit.
4780 */
4781 if (!TCP_ACC_ECN_ON(tp) && TCP_ECN_ENABLED(tp)) {
4782 tp->ecn_flags |= TE_SENDCWR;
4783 }
4784
4785 if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
4786 tcp_cc_adjust_nonvalidated_cwnd(tp);
4787 } else {
4788 tp->snd_cwnd = tp->snd_ssthresh;
4789 }
4790 }
4791 }
4792
4793 /*
4794 * Check if we need to reset the limit on
4795 * early retransmit
4796 */
4797 if (tp->t_early_rexmt_count > 0 &&
4798 TSTMP_GEQ(tcp_now,
4799 (tp->t_early_rexmt_win +
4800 TCP_EARLY_REXMT_WIN))) {
4801 tp->t_early_rexmt_count = 0;
4802 }
4803
4804 /*
4805 * Is early retransmit needed? We check for
4806 * this when the connection is waiting for
4807 * duplicate acks to enter fast recovery.
4808 */
4809 if (!IN_FASTRECOVERY(tp)) {
4810 tcp_early_rexmt_check(tp, th);
4811 }
4812
4813 /*
4814 * If we've seen exactly rexmt threshold
4815 * of duplicate acks, assume a packet
4816 * has been dropped and retransmit it.
4817 * Kludge snd_nxt & the congestion
4818 * window so we send only this one
4819 * packet.
4820 *
4821 * We know we're losing at the current
4822 * window size so do congestion avoidance
4823 * (set ssthresh to half the current window
4824 * and pull our congestion window back to
4825 * the new ssthresh).
4826 *
4827 * Dup acks mean that packets have left the
4828 * network (they're now cached at the receiver)
4829 * so bump cwnd by the amount in the receiver
4830 * to keep a constant cwnd packets in the
4831 * network.
4832 */
4833 if (tp->t_timer[TCPT_REXMT] == 0 ||
4834 (th->th_ack != tp->snd_una && sack_bytes_acked == 0)) {
4835 tp->t_dupacks = 0;
4836 tp->t_rexmtthresh = tcprexmtthresh;
4837 tp->t_new_dupacks = 0;
4838 } else if ((tp->t_dupacks > tp->t_rexmtthresh && (!tcp_do_better_lr || old_dupacks >= tp->t_rexmtthresh)) ||
4839 IN_FASTRECOVERY(tp)) {
4840 /*
4841 * If this connection was seeing packet
4842 * reordering, then recovery might be
4843 * delayed to disambiguate between
4844 * reordering and loss
4845 */
4846 if (SACK_ENABLED(tp) && !IN_FASTRECOVERY(tp) &&
4847 (tp->t_flagsext &
4848 (TF_PKTS_REORDERED | TF_DELAY_RECOVERY)) ==
4849 (TF_PKTS_REORDERED | TF_DELAY_RECOVERY)) {
4850 /*
4851 * Since the SACK information is already
4852 * updated, this ACK will be dropped
4853 */
4854 break;
4855 }
4856
4857 /*
4858 * Dup acks mean that packets have left the
4859 * network (they're now cached at the receiver)
4860 * so bump cwnd by the amount in the receiver
4861 * to keep a constant cwnd packets in the
4862 * network.
4863 */
4864 if (SACK_ENABLED(tp) && IN_FASTRECOVERY(tp)) {
4865 int awnd;
4866
4867 /*
4868 * Compute the amount of data in flight first.
4869 * We can inject new data into the pipe iff
4870 * we have less than snd_ssthresh worth of data in
4871 * flight.
4872 */
4873 awnd = (tp->snd_nxt - tp->snd_fack) + tp->sackhint.sack_bytes_rexmit;
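/*
 * awnd approximates the data actually in flight ("pipe"): bytes sent
 * beyond the forward-most SACKed data plus what has been retransmitted,
 * similar in spirit to the pipe estimate of RFC 6675.
 */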
4874 if (awnd < tp->snd_ssthresh) {
4875 tp->snd_cwnd += tp->t_maxseg;
4876 if (tp->snd_cwnd > tp->snd_ssthresh) {
4877 tp->snd_cwnd = tp->snd_ssthresh;
4878 }
4879 }
4880 } else {
4881 tp->snd_cwnd += tp->t_maxseg;
4882 }
4883
4884 /* Process any window updates */
4885 if (tiwin > tp->snd_wnd) {
4886 tcp_update_window(tp, thflags,
4887 th, tiwin, tlen);
4888 }
4889 tcp_ccdbg_trace(tp, th,
4890 TCP_CC_IN_FASTRECOVERY);
4891
4892 (void) tcp_output(tp);
4893
4894 goto drop;
4895 } else if ((!tcp_do_better_lr && tp->t_dupacks == tp->t_rexmtthresh) ||
4896 (tcp_do_better_lr && tp->t_dupacks >= tp->t_rexmtthresh)) {
4897 tcp_seq onxt = tp->snd_nxt;
4898
4899 /*
4900 * If we're doing sack, check to
4901 * see if we're already in sack
4902 * recovery. If we're not doing sack,
4903 * check to see if we're in newreno
4904 * recovery.
4905 */
4906 if (SACK_ENABLED(tp)) {
4907 if (IN_FASTRECOVERY(tp)) {
4908 tp->t_dupacks = 0;
4909 break;
4910 } else if (tp->t_flagsext & TF_DELAY_RECOVERY) {
4911 break;
4912 }
4913 } else {
4914 if (SEQ_LEQ(th->th_ack, tp->snd_recover)) {
4915 tp->t_dupacks = 0;
4916 break;
4917 }
4918 }
4919 if (tp->t_flags & TF_SENTFIN) {
4920 tp->snd_recover = tp->snd_max - 1;
4921 } else {
4922 tp->snd_recover = tp->snd_max;
4923 }
4924 tp->t_timer[TCPT_PTO] = 0;
4925 tp->t_rtttime = 0;
4926
4927 /*
4928 * If the connection has seen pkt
4929 * reordering, delay recovery until
4930 * it is clear that the packet
4931 * was lost.
4932 */
4933 if (SACK_ENABLED(tp) &&
4934 (tp->t_flagsext &
4935 (TF_PKTS_REORDERED | TF_DELAY_RECOVERY))
4936 == TF_PKTS_REORDERED &&
4937 !IN_FASTRECOVERY(tp) &&
4938 tp->t_reorderwin > 0 &&
4939 (tp->t_state == TCPS_ESTABLISHED ||
4940 tp->t_state == TCPS_FIN_WAIT_1)) {
4941 tp->t_timer[TCPT_DELAYFR] =
4942 OFFSET_FROM_START(tp,
4943 tp->t_reorderwin);
4944 tp->t_flagsext |= TF_DELAY_RECOVERY;
4945 tcpstat.tcps_delay_recovery++;
4946 tcp_ccdbg_trace(tp, th,
4947 TCP_CC_DELAY_FASTRECOVERY);
4948 break;
4949 }
4950
4951 tcp_rexmt_save_state(tp);
4952 /*
4953 * If the current tcp cc module has
4954 * defined a hook for tasks to run
4955 * before entering FR, call it
4956 */
4957 if (CC_ALGO(tp)->pre_fr != NULL) {
4958 CC_ALGO(tp)->pre_fr(tp);
4959 }
4960 ENTER_FASTRECOVERY(tp);
4961 tp->t_timer[TCPT_REXMT] = 0;
4962 if (!TCP_ACC_ECN_ON(tp) && TCP_ECN_ENABLED(tp)) {
4963 tp->ecn_flags |= TE_SENDCWR;
4964 }
4965
4966 if (SACK_ENABLED(tp)) {
4967 tcpstat.tcps_sack_recovery_episode++;
4968 tp->t_sack_recovery_episode++;
4969 tp->sack_newdata = tp->snd_nxt;
4970 if (tcp_do_better_lr) {
4971 tp->snd_cwnd = tp->snd_ssthresh;
4972 } else {
4973 tp->snd_cwnd = tp->t_maxseg;
4974 }
4975 tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
4976
4977 /* Process any window updates */
4978 if (tiwin > tp->snd_wnd) {
4979 tcp_update_window(tp, thflags, th, tiwin, tlen);
4980 }
4981
4982 tcp_ccdbg_trace(tp, th, TCP_CC_ENTER_FASTRECOVERY);
4983 (void) tcp_output(tp);
4984 goto drop;
4985 }
4986 tp->snd_nxt = th->th_ack;
4987 tp->snd_cwnd = tp->t_maxseg;
4988
4989 /* Process any window updates */
4990 if (tiwin > tp->snd_wnd) {
4991 tcp_update_window(tp, thflags, th, tiwin, tlen);
4992 }
4993
4994 (void) tcp_output(tp);
4995 if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
4996 tcp_cc_adjust_nonvalidated_cwnd(tp);
4997 } else {
4998 tp->snd_cwnd = tp->snd_ssthresh + tp->t_maxseg * tp->t_dupacks;
4999 }
5000 if (SEQ_GT(onxt, tp->snd_nxt)) {
5001 tp->snd_nxt = onxt;
5002 }
5003
5004 tcp_ccdbg_trace(tp, th, TCP_CC_ENTER_FASTRECOVERY);
5005 goto drop;
5006 } else if (ALLOW_LIMITED_TRANSMIT(tp) &&
5007 (!(SACK_ENABLED(tp)) || sack_bytes_acked > 0) &&
5008 (so->so_snd.sb_cc - (tp->snd_max - tp->snd_una)) > 0) {
5009 u_int32_t incr = (tp->t_maxseg * tp->t_dupacks);
5010
5011 /* Use Limited Transmit algorithm on the first two
5012 * duplicate acks when there is new data to transmit
5013 */
5014 tp->snd_cwnd += incr;
5015 tcpstat.tcps_limited_txt++;
5016 (void) tcp_output(tp);
5017
5018 tcp_ccdbg_trace(tp, th, TCP_CC_LIMITED_TRANSMIT);
5019
5020 /* Reset snd_cwnd back to normal */
5021 tp->snd_cwnd -= incr;
5022 }
5023 }
5024 break;
5025 }
5026 /*
5027 * If the congestion window was inflated to account
5028 * for the other side's cached packets, retract it.
5029 */
5030 if (IN_FASTRECOVERY(tp)) {
5031 if (SEQ_LT(th->th_ack, tp->snd_recover)) {
5032 /*
5033 * If we received an ECE and entered
5034 * recovery, the subsequent ACKs should
5035 * not be treated as partial acks.
5036 */
5037 if (tp->ecn_flags & TE_INRECOVERY) {
5038 goto process_ACK;
5039 }
5040
5041 if (SACK_ENABLED(tp)) {
5042 tcp_sack_partialack(tp, th);
5043 } else {
5044 tcp_newreno_partial_ack(tp, th);
5045 }
5046 tcp_ccdbg_trace(tp, th, TCP_CC_PARTIAL_ACK);
5047 } else {
5048 if (tcp_cubic_minor_fixes) {
5049 exiting_fr = 1;
5050 }
5051 EXIT_FASTRECOVERY(tp);
5052 if (CC_ALGO(tp)->post_fr != NULL) {
5053 CC_ALGO(tp)->post_fr(tp, th);
5054 }
5055 tp->t_pipeack = 0;
5056 tcp_clear_pipeack_state(tp);
5057 tcp_ccdbg_trace(tp, th,
5058 TCP_CC_EXIT_FASTRECOVERY);
5059 }
5060 } else if ((tp->t_flagsext &
5061 (TF_PKTS_REORDERED | TF_DELAY_RECOVERY))
5062 == (TF_PKTS_REORDERED | TF_DELAY_RECOVERY)) {
5063 /*
5064 * If the ack acknowledges up to snd_recover or if
5065 * it acknowledges all the snd holes, exit
5066 * recovery and cancel the timer. Otherwise,
5067 * this is a partial ack. Wait for recovery timer
5068 * to enter recovery. The snd_holes have already
5069 * been updated.
5070 */
5071 if (SEQ_GEQ(th->th_ack, tp->snd_recover) ||
5072 TAILQ_EMPTY(&tp->snd_holes)) {
5073 tp->t_timer[TCPT_DELAYFR] = 0;
5074 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
5075 EXIT_FASTRECOVERY(tp);
5076 tcp_ccdbg_trace(tp, th,
5077 TCP_CC_EXIT_FASTRECOVERY);
5078 }
5079 } else {
5080 /*
5081 * We were not in fast recovery. Reset the
5082 * duplicate ack counter.
5083 */
5084 tp->t_dupacks = 0;
5085 tp->t_rexmtthresh = tcprexmtthresh;
5086 tp->t_new_dupacks = 0;
5087 }
5088
5089process_ACK:
5090 VERIFY(SEQ_GEQ(th->th_ack, tp->snd_una));
5091 acked = BYTES_ACKED(th, tp);
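/*
 * BYTES_ACKED() is th_ack - snd_una: the amount of new sequence space
 * (possibly including a SYN or FIN) covered by this ACK.
 */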
5092 tcpstat.tcps_rcvackpack++;
5093 tcpstat.tcps_rcvackbyte += acked;
5094
5095 /*
5096 * If the last packet was a retransmit, make sure
5097 * it was not spurious.
5098 *
5099 * This will also take care of congestion window
5100 * adjustment if a last packet was recovered due to a
5101 * tail loss probe.
5102 */
5103 tcp_bad_rexmt_check(tp, th, &to);
5104
5105 /* Recalculate the RTT */
5106 tcp_compute_rtt(tp, &to, th);
5107
5108 /*
5109 * If all outstanding data is acked, stop retransmit
5110 * timer and remember to restart (more output or persist).
5111 * If there is more data to be acked, restart retransmit
5112 * timer, using current (possibly backed-off) value.
5113 */
5114 TCP_RESET_REXMT_STATE(tp);
5115 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
5116 tp->t_rttmin, TCPTV_REXMTMAX,
5117 TCP_ADD_REXMTSLOP(tp));
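/*
 * The retransmit state is reset and the RTO recomputed from the current
 * smoothed RTT estimate, clamped between t_rttmin and TCPTV_REXMTMAX
 * plus the per-connection slop.
 */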
5118 if (th->th_ack == tp->snd_max) {
5119 tp->t_timer[TCPT_REXMT] = 0;
5120 tp->t_timer[TCPT_PTO] = 0;
5121 needoutput = 1;
5122 } else if (tp->t_timer[TCPT_PERSIST] == 0) {
5123 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp,
5124 tp->t_rxtcur);
5125 }
5126
5127 if ((prev_t_state == TCPS_SYN_SENT ||
5128 prev_t_state == TCPS_SYN_RECEIVED) &&
5129 tp->t_state == TCPS_ESTABLISHED) {
5130 TCP_LOG_RTT_INFO(tp);
5131 }
5132
5133 /*
5134 * If no data (only SYN) was ACK'd, skip rest of ACK
5135 * processing.
5136 */
5137 if (acked == 0) {
5138 goto step6;
5139 }
5140
5141 /*
5142 * When outgoing data has been acked (except the SYN+data), we
5143 * mark this connection as "sending good" for TFO.
5144 */
5145 if ((tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
5146 !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
5147 !(th->th_flags & TH_SYN)) {
5148 tp->t_tfo_flags |= TFO_F_NO_SNDPROBING;
5149 }
5150
5151 /*
5152 * Accurate ECN uses delta_cep to determine a congestion
5153 * event if new CE counts were received.
5154 * For classic ECN, congestion event is receiving TH_ECE.
5155 */
5156 if ((tp->ecn_flags & TE_SENDIPECT)) {
5157 if (TCP_ACC_ECN_ON(tp)) {
5158 if (!IN_FASTRECOVERY(tp) && tp->t_delta_ce_packets > 0) {
5159 tcp_reduce_congestion_window(tp);
5160 tp->ecn_flags |= (TE_INRECOVERY);
5161 /* update the stats */
5162 tcpstat.tcps_ecn_ace_recv_ce += tp->t_delta_ce_packets;
5163 tp->t_ecn_capable_packets_marked += tp->t_delta_ce_packets;
5164 tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
5165 }
5166 } else if (TCP_ECN_ENABLED(tp) && (thflags & TH_ECE)) {
5167 /*
5168 * Reduce the congestion window if we haven't
5169 * done so.
5170 */
5171 if (!IN_FASTRECOVERY(tp)) {
5172 tcp_reduce_congestion_window(tp);
5173 tp->ecn_flags |= (TE_INRECOVERY | TE_SENDCWR);
5174 /*
5175 * Also note that the connection received
5176 * ECE at least once. We increment
5177 * t_ecn_capable_packets_marked when we first
5178 * enter fast recovery.
5179 */
5180 tp->ecn_flags |= TE_RECV_ECN_ECE;
5181 INP_INC_IFNET_STAT(inp, ecn_recv_ece);
5182 tcpstat.tcps_ecn_recv_ece++;
5183 tp->t_ecn_capable_packets_marked++;
5184 tcp_ccdbg_trace(tp, th, TCP_CC_ECN_RCVD);
5185 }
5186 }
5187 }
5188
5189 /*
5190 * When new data is acked, open the congestion window.
5191 * The specifics of how this is achieved are up to the
5192 * congestion control algorithm in use for this connection.
5193 *
5194 * The calculations in this function assume that snd_una is
5195 * not updated yet.
5196 */
5197 if (!IN_FASTRECOVERY(tp) && !exiting_fr) {
5198 if (CC_ALGO(tp)->ack_rcvd != NULL) {
5199 CC_ALGO(tp)->ack_rcvd(tp, th);
5200 }
5201 tcp_ccdbg_trace(tp, th, TCP_CC_ACK_RCVD);
5202 }
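/*
 * If the ACK covers more sequence space than is sitting in the send
 * buffer, the extra sequence number can only be our FIN, so note that
 * our FIN has been acknowledged.
 */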
5203 if (acked > so->so_snd.sb_cc) {
5204 tp->snd_wnd -= so->so_snd.sb_cc;
5205 sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
5206 ourfinisacked = 1;
5207 } else {
5208 sbdrop(&so->so_snd, acked);
5209 tcp_sbsnd_trim(&so->so_snd);
5210 tp->snd_wnd -= acked;
5211 ourfinisacked = 0;
5212 }
5213 /* detect una wraparound */
5214 if (!IN_FASTRECOVERY(tp) &&
5215 SEQ_GT(tp->snd_una, tp->snd_recover) &&
5216 SEQ_LEQ(th->th_ack, tp->snd_recover)) {
5217 tp->snd_recover = th->th_ack - 1;
5218 }
5219
5220 if (IN_FASTRECOVERY(tp) &&
5221 SEQ_GEQ(th->th_ack, tp->snd_recover)) {
5222 EXIT_FASTRECOVERY(tp);
5223 }
5224
5225 tcp_update_snd_una(tp, ack: th->th_ack);
5226
5227 if (SACK_ENABLED(tp)) {
5228 if (SEQ_GT(tp->snd_una, tp->snd_recover)) {
5229 tp->snd_recover = tp->snd_una;
5230 }
5231 }
5232 if (SEQ_LT(tp->snd_nxt, tp->snd_una)) {
5233 tp->snd_nxt = tp->snd_una;
5234 }
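		/*
		 * Once the DSACK reference point (t_dsack_lastuna) has fallen
		 * outside the window relative to snd_una, the list of
		 * retransmitted segments is no longer useful; release it.
		 */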
5235 if (!SLIST_EMPTY(&tp->t_rxt_segments) &&
5236 !TCP_DSACK_SEQ_IN_WINDOW(tp, tp->t_dsack_lastuna,
5237 tp->snd_una)) {
5238 tcp_rxtseg_clean(tp);
5239 }
5240 if ((tp->t_flagsext & TF_MEASURESNDBW) != 0 &&
5241 tp->t_bwmeas != NULL) {
5242 tcp_bwmeas_check(tp);
5243 }
5244
5245 write_wakeup = 1;
5246
5247 if (!SLIST_EMPTY(&tp->t_notify_ack)) {
5248 tcp_notify_acknowledgement(tp, so);
5249 }
5250
5251 switch (tp->t_state) {
5252 /*
5253 * In FIN_WAIT_1 STATE in addition to the processing
5254 * for the ESTABLISHED state if our FIN is now acknowledged
5255 * then enter FIN_WAIT_2.
5256 */
5257 case TCPS_FIN_WAIT_1:
5258 if (ourfinisacked) {
5259 /*
5260 * If we can't receive any more
5261 * data, then closing user can proceed.
5262 * Starting the TCPT_2MSL timer is contrary to the
5263 * specification, but if we don't get a FIN
5264 * we'll hang forever.
5265 */
5266 DTRACE_TCP4(state__change, void, NULL,
5267 struct inpcb *, inp,
5268 struct tcpcb *, tp,
5269 int32_t, TCPS_FIN_WAIT_2);
5270 TCP_LOG_STATE(tp, TCPS_FIN_WAIT_2);
5271 tp->t_state = TCPS_FIN_WAIT_2;
5272 if (so->so_state & SS_CANTRCVMORE) {
5273 isconnected = FALSE;
5274 isdisconnected = TRUE;
5275 tcp_set_finwait_timeout(tp);
5276 }
5277 /*
5278 * fall through and make sure we also recognize
5279 * data ACKed with the FIN
5280 */
5281 }
5282 break;
5283
5284 /*
5285 * In CLOSING STATE in addition to the processing for
5286 * the ESTABLISHED state if the ACK acknowledges our FIN
5287 * then enter the TIME-WAIT state, otherwise ignore
5288 * the segment.
5289 */
5290 case TCPS_CLOSING:
5291 if (ourfinisacked) {
5292 DTRACE_TCP4(state__change, void, NULL,
5293 struct inpcb *, inp,
5294 struct tcpcb *, tp,
5295 int32_t, TCPS_TIME_WAIT);
5296 TCP_LOG_STATE(tp, TCPS_TIME_WAIT);
5297 tp->t_state = TCPS_TIME_WAIT;
5298 tcp_canceltimers(tp);
5299 if (tp->t_flagsext & TF_NOTIMEWAIT) {
5300 tp->t_flags |= TF_CLOSING;
5301 } else {
5302 add_to_time_wait(tp, delay: 2 * tcp_msl);
5303 }
5304 isconnected = FALSE;
5305 isdisconnected = TRUE;
5306 }
5307 break;
5308
5309 /*
5310 * In LAST_ACK, we may still be waiting for data to drain
5311 * and/or to be acked, as well as for the ack of our FIN.
5312 * If our FIN is now acknowledged, delete the TCB,
5313 * enter the closed state and return.
5314 */
5315 case TCPS_LAST_ACK:
5316 if (ourfinisacked) {
5317 tp = tcp_close(tp);
5318 goto drop;
5319 }
5320 break;
5321
5322 /*
5323 * In TIME_WAIT state the only thing that should arrive
5324 * is a retransmission of the remote FIN. Acknowledge
5325 * it and restart the finack timer.
5326 */
5327 case TCPS_TIME_WAIT:
5328 add_to_time_wait(tp, delay: 2 * tcp_msl);
5329 goto dropafterack;
5330 }
5331
5332 /*
5333 * If there is a SACK option on the ACK and we
5334 * haven't seen any duplicate acks before, count
5335 * it as a duplicate ack even if the cumulative
5336 * ack is advanced. If the receiver delayed an
5337 * ack and detected loss afterwards, then the ack
5338 * will advance cumulative ack and will also have
5339 * a SACK option. So counting it as one duplicate
5340 * ack is ok.
5341 */
5342 if (tp->t_state == TCPS_ESTABLISHED &&
5343 SACK_ENABLED(tp) && sack_bytes_acked > 0 &&
5344 to.to_nsacks > 0 && tp->t_dupacks == 0 &&
5345 SEQ_LEQ(th->th_ack, tp->snd_una) && tlen == 0 &&
5346 !(tp->t_flagsext & TF_PKTS_REORDERED)) {
5347 tcpstat.tcps_sack_ackadv++;
5348 goto process_dupack;
5349 }
5350 }
5351
5352step6:
5353 /*
5354 * Update window information.
5355 */
5356 if (tcp_update_window(tp, thflags, th, tiwin, tlen)) {
5357 needoutput = 1;
5358 }
5359
5360 /*
5361 * Process segments with URG.
5362 */
5363 if ((thflags & TH_URG) && th->th_urp &&
5364 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5365 /*
5366 * This is a kludge, but if we receive and accept
5367 * random urgent pointers, we'll crash in
5368 * soreceive. It's hard to imagine someone
5369 * actually wanting to send this much urgent data.
5370 */
5371 if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
5372 th->th_urp = 0; /* XXX */
5373 thflags &= ~TH_URG; /* XXX */
5374 goto dodata; /* XXX */
5375 }
5376 /*
5377 * If this segment advances the known urgent pointer,
5378 * then mark the data stream. This should not happen
5379 * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
5380 * a FIN has been received from the remote side.
5381 * In these states we ignore the URG.
5382 *
5383 * According to RFC961 (Assigned Protocols),
5384 * the urgent pointer points to the last octet
5385 * of urgent data. We continue, however,
5386 * to consider it to indicate the first octet
5387 * of data past the urgent section as the original
5388 * spec states (in one of two places).
5389 */
5390 if (SEQ_GT(th->th_seq + th->th_urp, tp->rcv_up)) {
5391 tp->rcv_up = th->th_seq + th->th_urp;
5392 so->so_oobmark = so->so_rcv.sb_cc +
5393 (tp->rcv_up - tp->rcv_nxt) - 1;
5394 if (so->so_oobmark == 0) {
5395 so->so_state |= SS_RCVATMARK;
5396 }
5397 sohasoutofband(so);
5398 tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
5399 }
5400 /*
5401		 * Remove out-of-band data so it doesn't get presented to the user.
5402 * This can happen independent of advancing the URG pointer,
5403 * but if two URG's are pending at once, some out-of-band
5404 * data may creep in... ick.
5405 */
5406 if (th->th_urp <= (u_int32_t)tlen
5407#if SO_OOBINLINE
5408 && (so->so_options & SO_OOBINLINE) == 0
5409#endif
5410 ) {
5411 tcp_pulloutofband(so, th, m,
5412 drop_hdrlen); /* hdr drop is delayed */
5413 }
5414 } else {
5415 /*
5416 * If no out of band data is expected,
5417 * pull receive urgent pointer along
5418 * with the receive window.
5419 */
5420 if (SEQ_GT(tp->rcv_nxt, tp->rcv_up)) {
5421 tp->rcv_up = tp->rcv_nxt;
5422 }
5423 }
5424dodata:
5425
5426	/* Set the socket's connect or disconnect state correctly before processing data.
5427 * The following might unlock the socket if there is an upcall or a socket
5428 * filter.
5429 */
5430 if (isconnected) {
5431 soisconnected(so);
5432 } else if (isdisconnected) {
5433 soisdisconnected(so);
5434 }
5435
5436	/* Let's check the state of the pcb just to make sure that it did not get closed
5437 * when we unlocked above
5438 */
5439 if (inp->inp_state == INPCB_STATE_DEAD) {
5440 /* Just drop the packet that we are processing and return */
5441 TCP_LOG_DROP_PCB(TCP_LOG_HDR, th, tp, false, "INPCB_STATE_DEAD");
5442 goto drop;
5443 }
5444
5445 /*
5446 * Process the segment text, merging it into the TCP sequencing queue,
5447 * and arranging for acknowledgment of receipt if necessary.
5448 * This process logically involves adjusting tp->rcv_wnd as data
5449 * is presented to the user (this happens in tcp_usrreq.c,
5450 * case PRU_RCVD). If a FIN has already been received on this
5451 * connection then we just ignore the text.
5452 *
5453 * If we are in SYN-received state and got a valid TFO cookie, we want
5454 * to process the data.
5455 */
5456 if ((tlen || (thflags & TH_FIN)) &&
5457 TCPS_HAVERCVDFIN(tp->t_state) == 0 &&
5458 (TCPS_HAVEESTABLISHED(tp->t_state) ||
5459 (tp->t_state == TCPS_SYN_RECEIVED &&
5460 (tp->t_tfo_flags & TFO_F_COOKIE_VALID)))) {
5461 tcp_seq save_start = th->th_seq;
5462 tcp_seq save_end = th->th_seq + tlen;
5463 m_adj(m, drop_hdrlen); /* delayed header drop */
5464 /*
5465 * Insert segment which includes th into TCP reassembly queue
5466 * with control block tp. Set thflags to whether reassembly now
5467 * includes a segment with FIN. This handles the common case
5468 * inline (segment is the next to be received on an established
5469 * connection, and the queue is empty), avoiding linkage into
5470 * and removal from the queue and repetition of various
5471 * conversions.
5472 * Set DELACK for segments received in order, but ack
5473 * immediately when segments are out of order (so
5474 * fast retransmit can work).
5475 */
5476 if (th->th_seq == tp->rcv_nxt && LIST_EMPTY(&tp->t_segq)) {
5477 TCP_INC_VAR(tp->t_unacksegs, segment_count);
5478
5479 /* Calculate the RTT on the receiver */
5480 tcp_compute_rcv_rtt(tp, to: &to, th);
5481
5482 if (DELAY_ACK(tp, th) &&
5483 ((tp->t_flags & TF_ACKNOW) == 0)) {
5484 if ((tp->t_flags & TF_DELACK) == 0) {
5485 tp->t_flags |= TF_DELACK;
5486 tp->t_timer[TCPT_DELACK] =
5487 OFFSET_FROM_START(tp, tcp_delack);
5488 }
5489 } else {
5490 tp->t_flags |= TF_ACKNOW;
5491 }
5492 tp->rcv_nxt += tlen;
5493 /* Update highest received sequence and its timestamp */
5494 if (SEQ_LT(tp->rcv_high, tp->rcv_nxt)) {
5495 tp->rcv_high = tp->rcv_nxt;
5496 if (to.to_flags & TOF_TS) {
5497 tp->tsv_high = to.to_tsval;
5498 }
5499 }
5500
5501 thflags = th->th_flags & TH_FIN;
5502 TCP_INC_VAR(tcpstat.tcps_rcvpack, segment_count);
5503 tcpstat.tcps_rcvbyte += tlen;
5504 if (nstat_collect) {
5505 INP_ADD_STAT(inp, cell, wifi, wired,
5506 rxpackets, 1);
5507 INP_ADD_STAT(inp, cell, wifi, wired,
5508 rxbytes, tlen);
5509 inp_set_activity_bitmap(inp);
5510 }
5511 tcp_sbrcv_grow(tp, sbrcv: &so->so_rcv, to: &to, pktlen: tlen);
5512 if (TCP_USE_RLEDBAT(tp, so) &&
5513 tcp_cc_rledbat.data_rcvd != NULL) {
5514 tcp_cc_rledbat.data_rcvd(tp, th, &to, tlen);
5515 }
5516
5517 so_recv_data_stat(so, m, drop_hdrlen);
5518
5519 if (isipv6) {
5520 memcpy(dst: &saved_hdr, src: ip6, n: sizeof(struct ip6_hdr));
5521 ip6 = (struct ip6_hdr *)&saved_hdr[0];
5522 } else {
5523 memcpy(dst: &saved_hdr, src: ip, n: ip->ip_hl << 2);
5524 ip = (struct ip *)&saved_hdr[0];
5525 }
5526 memcpy(dst: &saved_tcphdr, src: th, n: sizeof(struct tcphdr));
5527
5528 if (th->th_flags & TH_PUSH) {
5529 tp->t_flagsext |= TF_LAST_IS_PSH;
5530 } else {
5531 tp->t_flagsext &= ~TF_LAST_IS_PSH;
5532 }
5533
5534 if (sbappendstream_rcvdemux(so, m)) {
5535 read_wakeup = 1;
5536 }
5537 th = &saved_tcphdr;
5538 } else {
5539 if (isipv6) {
5540 memcpy(dst: &saved_hdr, src: ip6, n: sizeof(struct ip6_hdr));
5541 ip6 = (struct ip6_hdr *)&saved_hdr[0];
5542 } else {
5543 memcpy(dst: &saved_hdr, src: ip, n: ip->ip_hl << 2);
5544 ip = (struct ip *)&saved_hdr[0];
5545 }
5546
5547 /* Update highest received sequence and its timestamp */
5548 if (SEQ_LT(tp->rcv_high, th->th_seq + tlen)) {
5549 tp->rcv_high = th->th_seq + tlen;
5550 if (to.to_flags & TOF_TS) {
5551 tp->tsv_high = to.to_tsval;
5552 }
5553 }
5554
5555 /*
5556 * Calculate the RTT on the receiver,
5557 * even if OOO segment is received.
5558 */
5559 tcp_compute_rcv_rtt(tp, to: &to, th);
5560
5561 if (tcp_autotune_reorder) {
5562 tcp_sbrcv_grow(tp, sbrcv: &so->so_rcv, to: &to, pktlen: tlen);
5563 }
5564 if (TCP_USE_RLEDBAT(tp, so) &&
5565 tcp_cc_rledbat.data_rcvd != NULL) {
5566 tcp_cc_rledbat.data_rcvd(tp, th, &to, tlen);
5567 }
5568
5569 memcpy(dst: &saved_tcphdr, src: th, n: sizeof(struct tcphdr));
5570 thflags = tcp_reass(tp, th, tlenp: &tlen, m, ifp, dowakeup: &read_wakeup);
5571 th = &saved_tcphdr;
5572 tp->t_flags |= TF_ACKNOW;
5573 }
5574
5575 if ((tlen > 0 || (th->th_flags & TH_FIN)) && SACK_ENABLED(tp)) {
5576 if (th->th_flags & TH_FIN) {
5577 save_end++;
5578 }
5579 tcp_update_sack_list(tp, rcv_laststart: save_start, rcv_lastend: save_end);
5580 }
5581
5582 tcp_adaptive_rwtimo_check(tp, tlen);
5583
5584 if (tlen > 0) {
5585 tcp_tfo_rcv_data(tp);
5586 }
5587
5588 if (tp->t_flags & TF_DELACK) {
5589 if (isipv6) {
5590 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
5591 (((ip6->ip6_src.s6_addr16[0]) << 16) | (ip6->ip6_dst.s6_addr16[0])),
5592 th->th_seq, th->th_ack, th->th_win);
5593 } else {
5594 KERNEL_DEBUG(DBG_LAYER_END, ((th->th_dport << 16) | th->th_sport),
5595 (((ip->ip_src.s_addr & 0xffff) << 16) | (ip->ip_dst.s_addr & 0xffff)),
5596 th->th_seq, th->th_ack, th->th_win);
5597 }
5598 }
5599 } else {
5600 if ((so->so_flags & SOF_MP_SUBFLOW) && tlen == 0 &&
5601 (m->m_pkthdr.pkt_flags & PKTF_MPTCP_DFIN) &&
5602 (m->m_pkthdr.pkt_flags & PKTF_MPTCP)) {
5603 m_adj(m, drop_hdrlen); /* delayed header drop */
5604 /*
5605 * 0-length DATA_FIN. The rlen is actually 0. We special-case the
5606			 * byte consumed by the DFIN in mptcp_input and mptcp_reass_present.
5607 */
5608 m->m_pkthdr.mp_rlen = 0;
5609 mptcp_input(tptomptp(tp)->mpt_mpte, m);
5610 tp->t_flags |= TF_ACKNOW;
5611 } else {
5612 m_freem(m);
5613 }
5614 thflags &= ~TH_FIN;
5615 }
5616 /*
5617	 * We increment t_unacksegs_ce for both data segments and pure ACKs.
5618 * No need to increment if a FIN has already been received.
5619 */
5620 if (TCP_ACC_ECN_ON(tp) && TCPS_HAVEESTABLISHED(tp->t_state) &&
5621 TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5622 if (ip_ecn == IPTOS_ECN_CE) {
5623 TCP_INC_VAR(tp->t_unacksegs_ce, segment_count);
5624 }
5625 /*
5626 * Send an ACK immediately if there is a change in IP ECN
5627 * from non-CE to CE.
5628		 * If new data is delivered, then ACK for every 2 CE marks;
5629		 * otherwise ACK for every 3 CE marks.
5630 */
5631 if ((ip_ecn == IPTOS_ECN_CE && ip_ecn != tp->t_prev_ip_ecn) ||
5632 (tp->t_unacksegs_ce >= 2 && tp->last_ack_sent != tp->rcv_nxt) ||
5633 tp->t_unacksegs_ce >= 3) {
5634 tp->t_flags |= TF_ACKNOW;
5635 }
5636 tp->t_prev_ip_ecn = ip_ecn;
5637 }
5638 /*
5639 * If FIN is received ACK the FIN and let the user know
5640 * that the connection is closing.
5641 */
5642 if (thflags & TH_FIN) {
5643 if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
5644 socantrcvmore(so);
5645 /*
5646 * If connection is half-synchronized
5647 * (ie NEEDSYN flag on) then delay ACK,
5648 * so it may be piggybacked when SYN is sent.
5649 * Otherwise, since we received a FIN then no
5650 * more input can be expected, send ACK now.
5651 */
5652 TCP_INC_VAR(tp->t_unacksegs, segment_count);
5653 tp->t_flags |= TF_ACKNOW;
5654 tp->rcv_nxt++;
5655 }
5656 switch (tp->t_state) {
5657 /*
5658 * In SYN_RECEIVED and ESTABLISHED STATES
5659 * enter the CLOSE_WAIT state.
5660 */
5661 case TCPS_SYN_RECEIVED:
5662 tp->t_starttime = tcp_now;
5663 OS_FALLTHROUGH;
5664 case TCPS_ESTABLISHED:
5665 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
5666 struct tcpcb *, tp, int32_t, TCPS_CLOSE_WAIT);
5667 TCP_LOG_STATE(tp, TCPS_CLOSE_WAIT);
5668 tp->t_state = TCPS_CLOSE_WAIT;
5669 break;
5670
5671 /*
5672 * If still in FIN_WAIT_1 STATE FIN has not been acked so
5673 * enter the CLOSING state.
5674 */
5675 case TCPS_FIN_WAIT_1:
5676 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
5677 struct tcpcb *, tp, int32_t, TCPS_CLOSING);
5678 TCP_LOG_STATE(tp, TCPS_CLOSING);
5679 tp->t_state = TCPS_CLOSING;
5680 break;
5681
5682 /*
5683 * In FIN_WAIT_2 state enter the TIME_WAIT state,
5684 * starting the time-wait timer, turning off the other
5685 * standard timers.
5686 */
5687 case TCPS_FIN_WAIT_2:
5688 DTRACE_TCP4(state__change, void, NULL,
5689 struct inpcb *, inp,
5690 struct tcpcb *, tp,
5691 int32_t, TCPS_TIME_WAIT);
5692 TCP_LOG_STATE(tp, TCPS_TIME_WAIT);
5693 tp->t_state = TCPS_TIME_WAIT;
5694 tcp_canceltimers(tp);
5695 tp->t_flags |= TF_ACKNOW;
5696 if (tp->t_flagsext & TF_NOTIMEWAIT) {
5697 tp->t_flags |= TF_CLOSING;
5698 } else {
5699 add_to_time_wait(tp, delay: 2 * tcp_msl);
5700 }
5701 soisdisconnected(so);
5702 break;
5703
5704 /*
5705 * In TIME_WAIT state restart the 2 MSL time_wait timer.
5706 */
5707 case TCPS_TIME_WAIT:
5708 add_to_time_wait(tp, delay: 2 * tcp_msl);
5709 break;
5710 }
5711 }
5712#if TCPDEBUG
5713 if (so->so_options & SO_DEBUG) {
5714 tcp_trace(TA_INPUT, ostate, tp, (void *)tcp_saveipgen,
5715 &tcp_savetcp, 0);
5716 }
5717#endif
5718
5719 if (read_wakeup) {
5720 mptcp_handle_input(so);
5721 }
5722
5723 /*
5724 * Return any desired output.
5725 */
5726 if (needoutput || (tp->t_flags & TF_ACKNOW)) {
5727 (void) tcp_output(tp);
5728 }
5729
5730 tcp_check_timer_state(tp);
5731
5732 tcp_handle_wakeup(so, read_wakeup, write_wakeup);
5733
5734 socket_unlock(so, refcount: 1);
5735 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
5736 return;
5737
5738dropafterack:
5739 /*
5740 * Generate an ACK dropping incoming segment if it occupies
5741 * sequence space, where the ACK reflects our state.
5742 *
5743 * We can now skip the test for the RST flag since all
5744 * paths to this code happen after packets containing
5745 * RST have been dropped.
5746 *
5747 * In the SYN-RECEIVED state, don't send an ACK unless the
5748 * segment we received passes the SYN-RECEIVED ACK test.
5749 * If it fails send a RST. This breaks the loop in the
5750 * "LAND" DoS attack, and also prevents an ACK storm
5751 * between two listening ports that have been sent forged
5752 * SYN segments, each with the source address of the other.
5753 */
5754 if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
5755 (SEQ_GT(tp->snd_una, th->th_ack) ||
5756 SEQ_GT(th->th_ack, tp->snd_max))) {
5757 IF_TCP_STATINC(ifp, dospacket);
5758 goto dropwithreset;
5759 }
5760#if TCPDEBUG
5761 if (so->so_options & SO_DEBUG) {
5762 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
5763 &tcp_savetcp, 0);
5764 }
5765#endif
5766 m_freem(m);
5767 tp->t_flags |= TF_ACKNOW;
5768
5769 (void) tcp_output(tp);
5770
5771 tcp_handle_wakeup(so, read_wakeup, write_wakeup);
5772
5773 /* Don't need to check timer state as we should have done it during tcp_output */
5774 socket_unlock(so, refcount: 1);
5775 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
5776 return;
5777dropwithresetnosock:
5778 nosock = 1;
5779dropwithreset:
5780 /*
5781 * Generate a RST, dropping incoming segment.
5782 * Make ACK acceptable to originator of segment.
5783 * Don't bother to respond if destination was broadcast/multicast.
5784 */
5785 if ((thflags & TH_RST) || m->m_flags & (M_BCAST | M_MCAST)) {
5786 goto drop;
5787 }
5788 if (isipv6) {
5789 if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) ||
5790 IN6_IS_ADDR_MULTICAST(&ip6->ip6_src)) {
5791 goto drop;
5792 }
5793 } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) ||
5794 IN_MULTICAST(ntohl(ip->ip_src.s_addr)) ||
5795 ip->ip_src.s_addr == htonl(INADDR_BROADCAST) ||
5796 in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) {
5797 goto drop;
5798 }
5799 /* IPv6 anycast check is done at tcp6_input() */
5800
5801#if TCPDEBUG
5802 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
5803 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
5804 &tcp_savetcp, 0);
5805 }
5806#endif
5807 bzero(s: &tra, n: sizeof(tra));
5808 tra.ifscope = ifscope;
5809 tra.awdl_unrestricted = 1;
5810 tra.intcoproc_allowed = 1;
5811 tra.management_allowed = 1;
5812 if (thflags & TH_ACK) {
5813 /* mtod() below is safe as long as hdr dropping is delayed */
5814 tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack,
5815 TH_RST, &tra);
5816 } else {
5817 if (thflags & TH_SYN) {
5818 tlen++;
5819 }
5820 /* mtod() below is safe as long as hdr dropping is delayed */
5821 tcp_respond(tp, mtod(m, void *), th, m, th->th_seq + tlen,
5822 (tcp_seq)0, TH_RST | TH_ACK, &tra);
5823 }
5824 /* destroy temporarily created socket */
5825 if (dropsocket) {
5826 (void) soabort(so);
5827 socket_unlock(so, refcount: 1);
5828 } else if ((inp != NULL) && (nosock == 0)) {
5829 tcp_handle_wakeup(so, read_wakeup, write_wakeup);
5830
5831 socket_unlock(so, refcount: 1);
5832 }
5833 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
5834 return;
5835dropnosock:
5836 nosock = 1;
5837drop:
5838 /*
5839 * Drop space held by incoming segment and return.
5840 */
5841#if TCPDEBUG
5842 if (tp == 0 || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
5843 tcp_trace(TA_DROP, ostate, tp, (void *)tcp_saveipgen,
5844 &tcp_savetcp, 0);
5845 }
5846#endif
5847 m_freem(m);
5848 /* destroy temporarily created socket */
5849 if (dropsocket) {
5850 (void) soabort(so);
5851 socket_unlock(so, refcount: 1);
5852 } else if (nosock == 0) {
5853 tcp_handle_wakeup(so, read_wakeup, write_wakeup);
5854
5855 socket_unlock(so, refcount: 1);
5856 }
5857 KERNEL_DEBUG(DBG_FNC_TCP_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
5858 return;
5859}
5860
5861/*
5862 * Parse TCP options and place in tcpopt.
5863 */
5864static void
5865tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcphdr *th,
5866 struct tcpopt *to)
5867{
5868 u_short mss = 0;
5869 uint8_t opt, optlen;
5870
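	/*
	 * Each option is either a single kind octet (EOL, NOP) or a kind
	 * octet followed by a length octet that counts the entire option,
	 * including the kind and length octets themselves.
	 */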
5871 for (; cnt > 0; cnt -= optlen, cp += optlen) {
5872 opt = cp[0];
5873 if (opt == TCPOPT_EOL) {
5874 break;
5875 }
5876 if (opt == TCPOPT_NOP) {
5877 optlen = 1;
5878 } else {
5879 if (cnt < 2) {
5880 break;
5881 }
5882 optlen = cp[1];
5883 if (optlen < 2 || optlen > cnt) {
5884 break;
5885 }
5886 }
5887 switch (opt) {
5888 default:
5889 continue;
5890
5891 case TCPOPT_MAXSEG:
5892 if (optlen != TCPOLEN_MAXSEG) {
5893 continue;
5894 }
5895 if (!(th->th_flags & TH_SYN)) {
5896 continue;
5897 }
5898 bcopy(src: (char *) cp + 2, dst: (char *) &mss, n: sizeof(mss));
5899 NTOHS(mss);
5900 to->to_mss = mss;
5901 to->to_flags |= TOF_MSS;
5902 break;
5903
5904 case TCPOPT_WINDOW:
5905 if (optlen != TCPOLEN_WINDOW) {
5906 continue;
5907 }
5908 if (!(th->th_flags & TH_SYN)) {
5909 continue;
5910 }
5911 to->to_flags |= TOF_SCALE;
5912 to->to_requested_s_scale = MIN(cp[2], TCP_MAX_WINSHIFT);
5913 break;
5914
5915 case TCPOPT_TIMESTAMP:
5916 if (optlen != TCPOLEN_TIMESTAMP) {
5917 continue;
5918 }
5919 to->to_flags |= TOF_TS;
5920 bcopy(src: (char *)cp + 2,
5921 dst: (char *)&to->to_tsval, n: sizeof(to->to_tsval));
5922 NTOHL(to->to_tsval);
5923 bcopy(src: (char *)cp + 6,
5924 dst: (char *)&to->to_tsecr, n: sizeof(to->to_tsecr));
5925 NTOHL(to->to_tsecr);
5926 to->to_tsecr -= tp->t_ts_offset;
5927 /* Re-enable sending Timestamps if we received them */
5928 if (!(tp->t_flags & TF_REQ_TSTMP) && tcp_do_timestamps) {
5929 tp->t_flags |= TF_REQ_TSTMP;
5930 }
5931 break;
5932 case TCPOPT_SACK_PERMITTED:
5933 if (optlen != TCPOLEN_SACK_PERMITTED) {
5934 continue;
5935 }
5936 if (th->th_flags & TH_SYN) {
5937 to->to_flags |= TOF_SACK;
5938 }
5939 break;
5940 case TCPOPT_SACK:
5941 if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0) {
5942 continue;
5943 }
5944 to->to_nsacks = (optlen - 2) / TCPOLEN_SACK;
5945 to->to_sacks = cp + 2;
5946 tcpstat.tcps_sack_rcv_blocks++;
5947
5948 break;
5949 case TCPOPT_FASTOPEN:
5950 if (optlen == TCPOLEN_FASTOPEN_REQ) {
5951 if (tp->t_state != TCPS_LISTEN) {
5952 continue;
5953 }
5954
5955 to->to_flags |= TOF_TFOREQ;
5956 } else {
5957 if (optlen < TCPOLEN_FASTOPEN_REQ ||
5958 (optlen - TCPOLEN_FASTOPEN_REQ) > TFO_COOKIE_LEN_MAX ||
5959 (optlen - TCPOLEN_FASTOPEN_REQ) < TFO_COOKIE_LEN_MIN) {
5960 continue;
5961 }
5962 if (tp->t_state != TCPS_LISTEN &&
5963 tp->t_state != TCPS_SYN_SENT) {
5964 continue;
5965 }
5966
5967 to->to_flags |= TOF_TFO;
5968 to->to_tfo = cp + 1;
5969 }
5970
5971 break;
5972#if MPTCP
5973 case TCPOPT_MULTIPATH:
5974 tcp_do_mptcp_options(tp, cp, th, to, optlen);
5975 break;
5976#endif /* MPTCP */
5977 }
5978 }
5979}
5980
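/*
 * Apply the options parsed by tcp_dooptions() to the connection state:
 * record the peer's timestamp, MSS, SACK permission and window-scale values.
 */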
5981static void
5982tcp_finalize_options(struct tcpcb *tp, struct tcpopt *to, unsigned int ifscope)
5983{
5984 if (to->to_flags & TOF_TS) {
5985 tp->t_flags |= TF_RCVD_TSTMP;
5986 tp->ts_recent = to->to_tsval;
5987 tp->ts_recent_age = tcp_now;
5988 }
5989 if (to->to_flags & TOF_MSS) {
5990 tcp_mss(tp, to->to_mss, ifscope);
5991 }
5992 if (SACK_ENABLED(tp)) {
5993 if (!(to->to_flags & TOF_SACK)) {
5994 tp->t_flagsext &= ~(TF_SACK_ENABLE);
5995 } else {
5996 tp->t_flags |= TF_SACK_PERMIT;
5997 }
5998 }
5999 if (to->to_flags & TOF_SCALE) {
6000 tp->t_flags |= TF_RCVD_SCALE;
6001 tp->requested_s_scale = to->to_requested_s_scale;
6002
6003 /* Re-enable window scaling, if the option is received */
6004 if (tp->request_r_scale > 0) {
6005 tp->t_flags |= TF_REQ_SCALE;
6006 }
6007 }
6008}
6009
6010/*
6011 * Pull out of band byte out of a segment so
6012 * it doesn't appear in the user's data queue.
6013 * It is still reflected in the segment length for
6014 * sequencing purposes.
6015 *
6016 * @param off  header length whose drop has been delayed
6017 */
6018static void
6019tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m, int off)
6020{
6021 int cnt = off + th->th_urp - 1;
6022
6023 while (cnt >= 0) {
6024 if (m->m_len > cnt) {
6025 char *cp = mtod(m, caddr_t) + cnt;
6026 struct tcpcb *tp = sototcpcb(so);
6027
6028 tp->t_iobc = *cp;
6029 tp->t_oobflags |= TCPOOB_HAVEDATA;
6030 bcopy(src: cp + 1, dst: cp, n: (unsigned)(m->m_len - cnt - 1));
6031 m->m_len--;
6032 if (m->m_flags & M_PKTHDR) {
6033 m->m_pkthdr.len--;
6034 }
6035 return;
6036 }
6037 cnt -= m->m_len;
6038 m = m->m_next;
6039 if (m == 0) {
6040 break;
6041 }
6042 }
6043 panic("tcp_pulloutofband");
6044}
6045
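/*
 * Return the minimum RTT cached on the route, or 0 when no route entry
 * is available.
 */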
6046uint32_t
6047get_base_rtt(struct tcpcb *tp)
6048{
6049 struct rtentry *rt = tp->t_inpcb->inp_route.ro_rt;
6050 return (rt == NULL) ? 0 : rt->rtt_min;
6051}
6052
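/*
 * Store the newest RTT sample in the connection's ring of recent samples
 * and recompute the minimum across the ring.
 */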
6053static void
6054update_curr_rtt(struct tcpcb * tp, uint32_t rtt)
6055{
6056 tp->curr_rtt_index = (tp->curr_rtt_index + 1) % NCURR_RTT_HIST;
6057 tp->curr_rtt_hist[tp->curr_rtt_index] = rtt;
6058
6059 /* forget the old value and update minimum */
6060 tp->curr_rtt_min = 0;
6061 for (int i = 0; i < NCURR_RTT_HIST; ++i) {
6062 if (tp->curr_rtt_hist[i] != 0 && (tp->curr_rtt_min == 0 ||
6063 tp->curr_rtt_hist[i] < tp->curr_rtt_min)) {
6064 tp->curr_rtt_min = tp->curr_rtt_hist[i];
6065 }
6066 }
6067}
6068
6069/* Each value of RTT base represents the minimum RTT seen in a minute.
6070 * We keep up to N_RTT_BASE minutes' worth of history.
6071 */
6072void
6073update_base_rtt(struct tcpcb *tp, uint32_t rtt)
6074{
6075 u_int32_t base_rtt, i;
6076 struct rtentry *rt;
6077
6078 if ((rt = tp->t_inpcb->inp_route.ro_rt) == NULL) {
6079 return;
6080 }
6081 if (rt->rtt_expire_ts == 0) {
6082 RT_LOCK_SPIN(rt);
6083 if (rt->rtt_expire_ts != 0) {
6084 RT_UNLOCK(rt);
6085 goto update;
6086 }
6087 rt->rtt_expire_ts = tcp_now;
6088 rt->rtt_index = 0;
6089 rt->rtt_hist[0] = rtt;
6090 rt->rtt_min = rtt;
6091 RT_UNLOCK(rt);
6092
6093 tp->curr_rtt_index = 0;
6094 tp->curr_rtt_hist[0] = rtt;
6095 tp->curr_rtt_min = rtt;
6096 return;
6097 }
6098update:
6099#if TRAFFIC_MGT
6100 /*
6101 * If the recv side is being throttled, check if the
6102	 * current RTT is close to the base RTT seen in the
6103	 * first (most recent) two slots. If so, unthrottle the stream.
6104 */
6105 if ((tp->t_flagsext & TF_RECV_THROTTLE) &&
6106 (int)(tcp_now - tp->t_recv_throttle_ts) >= TCP_RECV_THROTTLE_WIN) {
6107 base_rtt = rt->rtt_min;
6108 if (tp->t_rttcur <= (base_rtt + target_qdelay)) {
6109 tp->t_flagsext &= ~TF_RECV_THROTTLE;
6110 tp->t_recv_throttle_ts = 0;
6111 }
6112 }
6113#endif /* TRAFFIC_MGT */
6114
6115 /* Update the next current RTT sample */
6116 update_curr_rtt(tp, rtt);
6117
6118 if ((int)(tcp_now - rt->rtt_expire_ts) >=
6119 TCP_RTT_HISTORY_EXPIRE_TIME) {
6120 RT_LOCK_SPIN(rt);
6121 /* check the condition again to avoid race */
6122 if ((int)(tcp_now - rt->rtt_expire_ts) >=
6123 TCP_RTT_HISTORY_EXPIRE_TIME) {
6124 /* Set the base rtt to 0 for idle periods */
6125 uint32_t times = MIN((tcp_now - rt->rtt_expire_ts) /
6126 TCP_RTT_HISTORY_EXPIRE_TIME, NRTT_HIST + 1);
6127
6128 for (i = rt->rtt_index + 1; i < rt->rtt_index + times; i++) {
6129 rt->rtt_hist[i % NRTT_HIST] = 0;
6130 }
6131
6132 rt->rtt_index = i % NRTT_HIST;
6133 rt->rtt_hist[rt->rtt_index] = rtt;
6134 rt->rtt_expire_ts = tcp_now;
6135 } else {
6136 rt->rtt_hist[rt->rtt_index] =
6137 min(a: rt->rtt_hist[rt->rtt_index], b: rtt);
6138 }
6139 /* forget the old value and update minimum */
6140 rt->rtt_min = 0;
6141 for (i = 0; i < NRTT_HIST; ++i) {
6142 if (rt->rtt_hist[i] != 0 &&
6143 (rt->rtt_min == 0 ||
6144 rt->rtt_hist[i] < rt->rtt_min)) {
6145 rt->rtt_min = rt->rtt_hist[i];
6146 }
6147 }
6148 RT_UNLOCK(rt);
6149 } else {
6150 rt->rtt_hist[rt->rtt_index] =
6151 min(a: rt->rtt_hist[rt->rtt_index], b: rtt);
6152 if (rt->rtt_min == 0) {
6153 rt->rtt_min = rtt;
6154 } else {
6155 rt->rtt_min = min(a: rt->rtt_min, b: rtt);
6156 }
6157 }
6158}
6159
6160/*
6161 * If we have a timestamp reply, update smoothed RTT. If no timestamp is
6162 * present but transmit timer is running and timed sequence number was
6163 * acked, update smoothed RTT.
6164 *
6165 * If timestamps are supported, a receiver can update RTT even if
6166 * there is no outstanding data.
6167 *
6168 * Some boxes send broken timestamp replies during the SYN+ACK phase,
6169 * ignore timestamps of 0, or we could calculate a huge RTT and blow up
6170 * the retransmit timer.
6171 */
6172static void
6173tcp_compute_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
6174{
6175 int rtt = 0;
6176 VERIFY(to != NULL && th != NULL);
6177 if (tp->t_rtttime != 0 && SEQ_GT(th->th_ack, tp->t_rtseq)) {
6178 u_int32_t pipe_ack_val;
6179 rtt = tcp_now - tp->t_rtttime;
6180 if (rtt == 0) {
6181 /*
6182 * Make adjustment for sub ms RTT when
6183 * timestamps are not used.
6184 */
6185 rtt = 1;
6186 }
6187 /*
6188 * Compute pipe ack -- the amount of data acknowledged
6189 * in the last RTT -- only works for sender
6190 */
6191 if (SEQ_GT(th->th_ack, tp->t_pipeack_lastuna)) {
6192 pipe_ack_val = th->th_ack - tp->t_pipeack_lastuna;
6193 /* Update the sample */
6194 tp->t_pipeack_sample[tp->t_pipeack_ind++] =
6195 pipe_ack_val;
6196 tp->t_pipeack_ind %= TCP_PIPEACK_SAMPLE_COUNT;
6197
6198 /* Compute the max of the pipeack samples */
6199 pipe_ack_val = tcp_get_max_pipeack(tp);
6200 tp->t_pipeack = (pipe_ack_val >
6201 tcp_initial_cwnd(tp)) ?
6202 pipe_ack_val : 0;
6203 }
6204 /* start another measurement */
6205 tp->t_rtttime = 0;
6206 }
6207 if (((to->to_flags & TOF_TS) != 0) &&
6208 (to->to_tsecr != 0) &&
6209 TSTMP_GEQ(tcp_now, to->to_tsecr)) {
6210 tcp_xmit_timer(tp, (tcp_now - to->to_tsecr),
6211 to->to_tsecr, th->th_ack);
6212 } else if (rtt > 0) {
6213 tcp_xmit_timer(tp, rtt, 0, th->th_ack);
6214 }
6215}
6216
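/*
 * Compute an RTT estimate on the receiver: use the echoed timestamp when
 * available, otherwise approximate one RTT as the time needed to receive
 * a full window of data (see the comments below).
 */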
6217static void
6218tcp_compute_rcv_rtt(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th)
6219{
6220 uint32_t rtt = 0, delta = 0;
6221 VERIFY(to != NULL && th != NULL);
6222
6223 /* Calculate RTT */
6224 if (((to->to_flags & TOF_TS) != 0) && (to->to_tsecr != 0) &&
6225 TSTMP_GEQ(tcp_now, to->to_tsecr)) {
6226 /* Timestamp is supported */
6227 rtt = tcp_now - to->to_tsecr;
6228 if (rtt == 0) {
6229 /* Make adjustment for sub ms RTT */
6230 rtt = 1;
6231 }
6232 } else if ((to->to_flags & TOF_TS) == 0) {
6233 /*
6234		 * Timestamps are not supported; one RTT is roughly
6235		 * the time to receive one full window of data.
6236 * Currently, RTT calculated this way is only used
6237 * for auto-tuning.
6238 */
6239 if (tp->rcv_rtt_est_ts != 0) {
6240 if (SEQ_LT(tp->rcv_nxt, tp->rcv_rtt_est_seq)) {
6241 /* Haven't received a full window yet */
6242 return;
6243 } else {
6244 rtt = tcp_now - tp->rcv_rtt_est_ts;
6245 if (rtt == 0) {
6246 /* Make adjustment for sub ms RTT */
6247 rtt = 1;
6248 }
6249 }
6250 } else {
6251 /* Use default value when no RTT measurement */
6252 rtt = TCPTV_RCVNOTS_QUANTUM;
6253 }
6254 /* Restart the measurement */
6255 tp->rcv_rtt_est_ts = tcp_now;
6256 tp->rcv_rtt_est_seq = tp->rcv_nxt + tp->rcv_wnd;
6257 }
6258
6259 /* Update receiver's SRTT */
6260 if (tp->rcv_srtt != 0) {
6261 /*
6262 * Use the smoothed rtt formula,
6263 * (srtt = rtt/8 + srtt*7/8) in fixed point
6264 */
6265 delta = (rtt << TCP_DELTA_SHIFT)
6266 - (tp->rcv_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
6267
6268 if ((tp->rcv_srtt += delta) <= 0) {
6269 tp->rcv_srtt = 1;
6270 }
6271 } else {
6272 /* No previous measurement */
6273 tp->rcv_srtt = rtt << TCP_RTT_SHIFT;
6274 }
6275
6276 /*
6277 * For current RTT, base RTT and current RTT over k samples,
6278 * we are using the same state for both sender and receiver
6279 * as the most recent sample is always updated before any
6280 * other processing, i.e. the sender will not end up with
6281 * a high RTT due to the receiver.
6282 */
6283 tp->t_rttcur = rtt;
6284 update_base_rtt(tp, rtt);
6285}
6286
6287/*
6288 * Collect new round-trip time estimate and update averages and
6289 * current timeout.
6290 */
6291static void
6292tcp_xmit_timer(struct tcpcb *tp, int rtt,
6293 u_int32_t tsecr, tcp_seq th_ack)
6294{
6295 VERIFY(rtt >= 0);
6296 int delta;
6297 int old_srtt = tp->t_srtt;
6298 int old_rttvar = tp->t_rttvar;
6299 bool log_rtt = false;
6300
6301 if (rtt == 0) {
6302 /*
6303 * As rtt has millisecond precision,
6304 * make adjustment for sub ms RTT
6305 */
6306 rtt = 1;
6307 }
6308
6309 if (rtt > 4 * TCPTV_MSL) {
6310 TCP_LOG(tp, "%s: rtt is %d - maxing it at 4 x MSL\n", __func__, rtt);
6311 /*
6312 * We compute RTT either based on the time-to-ACK a packet,
6313 * if TSval is disabled or based on the TSecr value.
6314 * If there is a middlebox messing up the TSecr value, we can
6315 * end up having HUGE rtt values, causing all kinds of problems.
6316 * Let's protect against this by capping RTT to 4*MSL
6317		 * (60 seconds).
6318 */
6319 rtt = 4 * TCPTV_MSL;
6320 }
6321
6322 /*
6323 * On AWDL interface, the initial RTT measurement on SYN
6324 * can be wrong due to peer caching. Avoid the first RTT
6325 * measurement as it might skew up the RTO.
6326 * <rdar://problem/28739046>
6327 */
6328 if (tp->t_inpcb->inp_last_outifp != NULL &&
6329 (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_AWDL) &&
6330 th_ack == tp->iss + 1) {
6331 return;
6332 }
6333
6334 if (tp->t_flagsext & TF_RECOMPUTE_RTT) {
6335 if (SEQ_GT(th_ack, tp->snd_una) &&
6336 SEQ_LEQ(th_ack, tp->snd_max) &&
6337 (tsecr == 0 ||
6338 TSTMP_GEQ(tsecr, tp->t_badrexmt_time))) {
6339 /*
6340 * We received a new ACK after a
6341 * spurious timeout. Adapt retransmission
6342 * timer as described in rfc 4015.
6343 */
6344 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
6345 tp->t_badrexmt_time = 0;
6346 tp->t_srtt = max(a: tp->t_srtt_prev, b: rtt);
6347 tp->t_srtt = tp->t_srtt << TCP_RTT_SHIFT;
6348 tp->t_rttvar = max(a: tp->t_rttvar_prev, b: (rtt >> 1));
6349 tp->t_rttvar = tp->t_rttvar << TCP_RTTVAR_SHIFT;
6350
6351 if (tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) {
6352 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6353 }
6354
6355 goto compute_rto;
6356 } else {
6357 return;
6358 }
6359 }
6360
6361 tcpstat.tcps_rttupdated++;
6362 tp->t_rttupdated++;
6363
6364 tp->t_rttcur = rtt;
6365 update_base_rtt(tp, rtt);
6366
6367 if (tp->t_srtt != 0) {
6368 /*
6369 * srtt is stored as fixed point with 5 bits after the
6370 * binary point (i.e., scaled by 32). The following magic
6371 * is equivalent to the smoothing algorithm in rfc793 with
6372 * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
6373 * point).
6374 *
6375		 * FreeBSD adjusts rtt to origin 0 by subtracting 1
6376		 * from the provided rtt value. This was required because
6377		 * of the way t_rtttime was initialized to 1 before.
6378 * Since we changed t_rtttime to be based on
6379 * tcp_now, this extra adjustment is not needed.
6380 */
6381 delta = (rtt << TCP_DELTA_SHIFT)
6382 - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT));
6383
6384 if ((tp->t_srtt += delta) <= 0) {
6385 tp->t_srtt = 1;
6386 }
6387
6388 /*
6389 * We accumulate a smoothed rtt variance (actually, a
6390 * smoothed mean difference), then set the retransmit
6391 * timer to smoothed rtt + 4 times the smoothed variance.
6392 * rttvar is stored as fixed point with 4 bits after the
6393 * binary point (scaled by 16). The following is
6394 * equivalent to rfc793 smoothing with an alpha of .75
6395 * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
6396 * rfc793's wired-in beta.
6397 */
6398 if (delta < 0) {
6399 delta = -delta;
6400 }
6401 delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT);
6402 if ((tp->t_rttvar += delta) <= 0) {
6403 tp->t_rttvar = 1;
6404 }
6405 if (tp->t_rttbest == 0 ||
6406 tp->t_rttbest > (tp->t_srtt + tp->t_rttvar)) {
6407 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6408 }
6409 } else {
6410 /*
6411 * No rtt measurement yet - use the unsmoothed rtt.
6412 * Set the variance to half the rtt (so our first
6413 * retransmit happens at 3*rtt).
6414 */
6415 tp->t_srtt = rtt << TCP_RTT_SHIFT;
6416 tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1);
6417 tp->t_rttbest = tp->t_srtt + tp->t_rttvar;
6418
6419 /* Initialize the receive SRTT */
6420 if (tp->rcv_srtt == 0) {
6421 tp->rcv_srtt = tp->t_srtt;
6422 }
6423 }
6424
6425compute_rto:
6426 nstat_route_rtt(rte: tp->t_inpcb->inp_route.ro_rt, rtt: tp->t_srtt,
6427 rtt_var: tp->t_rttvar);
6428
6429 /*
6430 * the retransmit should happen at rtt + 4 * rttvar.
6431 * Because of the way we do the smoothing, srtt and rttvar
6432 * will each average +1/2 tick of bias. When we compute
6433 * the retransmit timer, we want 1/2 tick of rounding and
6434 * 1 extra tick because of +-1/2 tick uncertainty in the
6435 * firing of the timer. The bias will give us exactly the
6436 * 1.5 tick we need. But, because the bias is
6437 * statistical, we have to test that we don't drop below
6438 * the minimum feasible timer (which is 2 ticks).
6439 */
6440 TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
6441 max(tp->t_rttmin, rtt + 2), TCPTV_REXMTMAX,
6442 TCP_ADD_REXMTSLOP(tp));
6443
6444 /*
6445 * We received an ack for a packet that wasn't retransmitted;
6446 * it is probably safe to discard any error indications we've
6447 * received recently. This isn't quite right, but close enough
6448 * for now (a route might have failed after we sent a segment,
6449 * and the return path might not be symmetrical).
6450 */
6451 tp->t_softerror = 0;
6452
6453 if (log_rtt) {
6454 TCP_LOG_RTT_INFO(tp);
6455 }
6456
6457 TCP_LOG_RTT_CHANGE(tp, old_srtt, old_rttvar);
6458}
6459
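/*
 * Return the usable path MTU for an IPv4 route: the route's cached MTU if
 * one is set, capped by the interface MTU (adjusted downward when CLAT46
 * translation will expand the headers).
 */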
6460static inline unsigned int
6461tcp_maxmtu(struct rtentry *rt)
6462{
6463 unsigned int maxmtu;
6464 int interface_mtu = 0;
6465
6466 RT_LOCK_ASSERT_HELD(rt);
6467 interface_mtu = rt->rt_ifp->if_mtu;
6468
6469 if (rt_key(rt)->sa_family == AF_INET &&
6470 INTF_ADJUST_MTU_FOR_CLAT46(rt->rt_ifp)) {
6471 interface_mtu = IN6_LINKMTU(rt->rt_ifp);
6472 /* Further adjust the size for CLAT46 expansion */
6473 interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
6474 }
6475
6476 if (rt->rt_rmx.rmx_mtu == 0) {
6477 maxmtu = interface_mtu;
6478 } else {
6479 maxmtu = MIN(rt->rt_rmx.rmx_mtu, interface_mtu);
6480 }
6481
6482 return maxmtu;
6483}
6484
6485static inline unsigned int
6486tcp_maxmtu6(struct rtentry *rt)
6487{
6488 unsigned int maxmtu;
6489 struct nd_ifinfo *ndi = NULL;
6490
6491 RT_LOCK_ASSERT_HELD(rt);
6492 if ((ndi = ND_IFINFO(rt->rt_ifp)) != NULL && !ndi->initialized) {
6493 ndi = NULL;
6494 }
6495 if (ndi != NULL) {
6496 lck_mtx_lock(lck: &ndi->lock);
6497 }
6498 if (rt->rt_rmx.rmx_mtu == 0) {
6499 maxmtu = IN6_LINKMTU(rt->rt_ifp);
6500 } else {
6501 maxmtu = MIN(rt->rt_rmx.rmx_mtu, IN6_LINKMTU(rt->rt_ifp));
6502 }
6503 if (ndi != NULL) {
6504 lck_mtx_unlock(lck: &ndi->lock);
6505 }
6506
6507 return maxmtu;
6508}
6509
6510unsigned int
6511get_maxmtu(struct rtentry *rt)
6512{
6513 unsigned int maxmtu = 0;
6514
6515 RT_LOCK_ASSERT_NOTHELD(rt);
6516
6517 RT_LOCK(rt);
6518
6519 if (rt_key(rt)->sa_family == AF_INET6) {
6520 maxmtu = tcp_maxmtu6(rt);
6521 } else {
6522 maxmtu = tcp_maxmtu(rt);
6523 }
6524
6525 RT_UNLOCK(rt);
6526
6527 return maxmtu;
6528}
6529
6530/*
6531 * Determine a reasonable value for maxseg size.
6532 * If the route is known, check route for mtu.
6533 * If none, use an mss that can be handled on the outgoing
6534 * interface without forcing IP to fragment; if bigger than
6535 * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
6536 * to utilize large mbufs. If no route is found, route has no mtu,
6537 * or the destination isn't local, use a default, hopefully conservative
6538 * size (usually 512 or the default IP max size, but no more than the mtu
6539 * of the interface), as we can't discover anything about intervening
6540 * gateways or networks. We also initialize the congestion/slow start
6541 * window. While looking at the routing entry, we also initialize
6542 * other path-dependent parameters from pre-set or cached values
6543 * in the routing entry.
6544 *
6545 * Also take into account the space needed for options that we
6546 * send regularly. Make maxseg shorter by that amount to assure
6547 * that we can send maxseg amount of data even when the options
6548 * are present. Store the upper limit of the length of options plus
6549 * data in maxopd.
6550 *
6551 * NOTE that this routine is only called when we process an incoming
6552 * segment, for outgoing segments only tcp_mssopt is called.
6553 *
6554 */
6555void
6556tcp_mss(struct tcpcb *tp, int offer, unsigned int input_ifscope)
6557{
6558 struct rtentry *rt;
6559 struct ifnet *ifp;
6560 int rtt, mss;
6561 uint32_t bufsize;
6562 struct inpcb *inp;
6563 struct socket *so;
6564 int origoffer = offer;
6565 int isnetlocal = 0;
6566 int isipv6;
6567 int min_protoh;
6568
6569 inp = tp->t_inpcb;
6570
6571 so = inp->inp_socket;
6572 /*
6573 * Nothing left to send after the socket is defunct or TCP is in the closed state
6574 */
6575 if ((so->so_state & SS_DEFUNCT) || tp->t_state == TCPS_CLOSED) {
6576 return;
6577 }
6578
6579 isipv6 = ((inp->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
6580 min_protoh = isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr)
6581 : sizeof(struct tcpiphdr);
6582
6583 if (isipv6) {
6584 rt = tcp_rtlookup6(inp, input_ifscope);
6585 } else {
6586 rt = tcp_rtlookup(inp, input_ifscope);
6587 }
6588 isnetlocal = (tp->t_flags & TF_LOCAL);
6589
6590 if (rt == NULL) {
6591 tp->t_maxopd = tp->t_maxseg = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
6592 return;
6593 }
6594 ifp = rt->rt_ifp;
6595 /*
6596 * Slower link window correction:
6597	 * If a value is specified for slowlink_wsize, use it for
6598	 * PPP links believed to be on a serial modem (speed <128Kbps).
6599	 * Excludes 9600bps as it is the default value advertised
6600 * by pseudo-devices over ppp.
6601 */
6602 if (ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
6603 ifp->if_baudrate > 9600 && ifp->if_baudrate <= 128000) {
6604 tp->t_flags |= TF_SLOWLINK;
6605 }
6606
6607 /*
6608	 * Offer == -1 means that we didn't receive a SYN yet; use the value cached in the route.
6609 */
6610 if (offer == -1) {
6611 offer = rt->rt_rmx.rmx_filler[0];
6612 }
6613 /*
6614 * Offer == 0 means that there was no MSS on the SYN segment,
6615 * in this case we use tcp_mssdflt.
6616 */
6617 if (offer == 0) {
6618 offer = isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
6619 } else {
6620 /*
6621 * Prevent DoS attack with too small MSS. Round up
6622 * to at least minmss.
6623 */
6624 offer = max(a: offer, b: tcp_minmss);
6625 /*
6626 * Sanity check: make sure that maxopd will be large
6627		 * enough to allow some data on segments even if
6628		 * all the option space is used (40 bytes). Otherwise
6629 * funny things may happen in tcp_output.
6630 */
6631 offer = max(a: offer, b: 64);
6632 }
6633 rt->rt_rmx.rmx_filler[0] = offer;
6634
6635 /*
6636 * While we're here, check if there's an initial rtt
6637 * or rttvar. Convert from the route-table units
6638 * to scaled multiples of the slow timeout timer.
6639 */
6640 if (tp->t_srtt == 0 && (rtt = rt->rt_rmx.rmx_rtt) != 0) {
6641 tcp_getrt_rtt(tp, rt);
6642 } else {
6643 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN : TCPTV_REXMTMIN;
6644 }
6645
6646 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
6647
6648#if NECP
6649 // At this point, the mss is just the MTU. Adjust if necessary.
6650 mss = necp_socket_get_effective_mtu(inp, current_mtu: mss);
6651#endif /* NECP */
6652
6653 mss -= min_protoh;
6654
6655 if (rt->rt_rmx.rmx_mtu == 0) {
6656 if (isipv6) {
6657 if (!isnetlocal) {
6658 mss = min(a: mss, b: tcp_v6mssdflt);
6659 }
6660 } else if (!isnetlocal) {
6661 mss = min(a: mss, b: tcp_mssdflt);
6662 }
6663 }
6664
6665 mss = min(a: mss, b: offer);
6666 /*
6667 * maxopd stores the maximum length of data AND options
6668 * in a segment; maxseg is the amount of data in a normal
6669 * segment. We need to store this value (maxopd) apart
6670 * from maxseg, because now every segment carries options
6671 * and thus we normally have somewhat less data in segments.
6672 */
6673 tp->t_maxopd = mss;
6674
6675 /*
6676	 * origoffer == -1 indicates that no segments were received yet.
6677 * In this case we just guess.
6678 */
6679 if ((tp->t_flags & (TF_REQ_TSTMP | TF_NOOPT)) == TF_REQ_TSTMP &&
6680 (origoffer == -1 ||
6681 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)) {
6682 mss -= TCPOLEN_TSTAMP_APPA;
6683 }
6684
6685#if MPTCP
6686 mss -= mptcp_adj_mss(tp, FALSE);
6687#endif /* MPTCP */
6688 tp->t_maxseg = mss;
6689
6690 /*
6691 * If there's a pipesize (ie loopback), change the socket
6692 * buffer to that size only if it's bigger than the current
6693 * sockbuf size. Make the socket buffers an integral
6694 * number of mss units; if the mss is larger than
6695 * the socket buffer, decrease the mss.
6696 */
6697#if RTV_SPIPE
6698 bufsize = rt->rt_rmx.rmx_sendpipe;
6699 if (bufsize < so->so_snd.sb_hiwat)
6700#endif
6701 bufsize = so->so_snd.sb_hiwat;
6702 if (bufsize < mss) {
6703 mss = bufsize;
6704 } else {
6705 bufsize = (((bufsize + mss - 1) / mss) * mss);
6706 (void)sbreserve(sb: &so->so_snd, cc: bufsize);
6707 }
6708 tp->t_maxseg = mss;
6709
6710 ASSERT(tp->t_maxseg);
6711
6712 /*
6713 * Update MSS using recommendation from link status report. This is
6714	 * temporary.
6715 */
6716 tcp_update_mss_locked(so, ifp);
6717
6718#if RTV_RPIPE
6719 bufsize = rt->rt_rmx.rmx_recvpipe;
6720 if (bufsize < so->so_rcv.sb_hiwat)
6721#endif
6722 bufsize = so->so_rcv.sb_hiwat;
6723 if (bufsize > mss) {
6724 bufsize = (((bufsize + mss - 1) / mss) * mss);
6725 (void)sbreserve(sb: &so->so_rcv, cc: bufsize);
6726 }
6727
6728 set_tcp_stream_priority(so);
6729
6730 if (rt->rt_rmx.rmx_ssthresh) {
6731 /*
6732 * There's some sort of gateway or interface
6733 * buffer limit on the path. Use this to set
6734 * slow-start threshold, but set the threshold to
6735 * no less than 2*mss.
6736 */
6737 tp->snd_ssthresh = max(a: 2 * mss, b: rt->rt_rmx.rmx_ssthresh);
6738 tcpstat.tcps_usedssthresh++;
6739 } else {
6740 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
6741 }
6742
6743 /*
6744 * Set the slow-start flight size depending on whether this
6745 * is a local network or not.
6746 */
6747 if (CC_ALGO(tp)->cwnd_init != NULL) {
6748 CC_ALGO(tp)->cwnd_init(tp);
6749 }
6750
6751 tcp_ccdbg_trace(tp, NULL, event: TCP_CC_CWND_INIT);
6752
6753 if (TCP_USE_RLEDBAT(tp, so) && tcp_cc_rledbat.rwnd_init != NULL) {
6754 tcp_cc_rledbat.rwnd_init(tp);
6755 }
6756
6757 /* Route locked during lookup above */
6758 RT_UNLOCK(rt);
6759}
6760
6761/*
6762 * Determine the MSS option to send on an outgoing SYN.
6763 */
6764int
6765tcp_mssopt(struct tcpcb *tp)
6766{
6767 struct rtentry *rt;
6768 int mss;
6769 int isipv6;
6770 int min_protoh;
6771
6772 isipv6 = ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) ? 1 : 0;
6773 min_protoh = isipv6 ? sizeof(struct ip6_hdr) + sizeof(struct tcphdr)
6774 : sizeof(struct tcpiphdr);
6775
6776 if (isipv6) {
6777 rt = tcp_rtlookup6(tp->t_inpcb, IFSCOPE_NONE);
6778 } else {
6779 rt = tcp_rtlookup(tp->t_inpcb, IFSCOPE_NONE);
6780 }
6781 if (rt == NULL) {
6782 return isipv6 ? tcp_v6mssdflt : tcp_mssdflt;
6783 }
6784 /*
6785 * Slower link window correction:
6786 * If a value is specified for slowlink_wsize, use it for PPP links
6787 * believed to be on a serial modem (speed <128Kbps). Excludes 9600bps as
6788 * it is the default value advertised by pseudo-devices over ppp.
6789 */
6790 if (rt->rt_ifp->if_type == IFT_PPP && slowlink_wsize > 0 &&
6791 rt->rt_ifp->if_baudrate > 9600 && rt->rt_ifp->if_baudrate <= 128000) {
6792 tp->t_flags |= TF_SLOWLINK;
6793 }
6794
6795 mss = (isipv6 ? tcp_maxmtu6(rt) : tcp_maxmtu(rt));
6796 /* Route locked during lookup above */
6797 RT_UNLOCK(rt);
6798
6799#if NECP
6800 // At this point, the mss is just the MTU. Adjust if necessary.
6801 mss = necp_socket_get_effective_mtu(inp: tp->t_inpcb, current_mtu: mss);
6802#endif /* NECP */
6803
6804 return mss - min_protoh;
6805}
6806
6807/*
6808 * When a partial ack arrives, force the retransmission of the
6809 * next unacknowledged segment. Do not clear tp->t_dupacks.
6810 * By setting snd_nxt to th_ack, this forces the retransmission timer to
6811 * be started again.
6812 */
6813static void
6814tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
6815{
6816 tcp_seq onxt = tp->snd_nxt;
6817 u_int32_t ocwnd = tp->snd_cwnd;
6818 tp->t_timer[TCPT_REXMT] = 0;
6819 tp->t_timer[TCPT_PTO] = 0;
6820 tp->t_rtttime = 0;
6821 tp->snd_nxt = th->th_ack;
6822 /*
6823 * Set snd_cwnd to one segment beyond acknowledged offset
6824 * (tp->snd_una has not yet been updated when this function
6825 * is called)
6826 */
6827 tp->snd_cwnd = tp->t_maxseg + BYTES_ACKED(th, tp);
6828 (void) tcp_output(tp);
6829 tp->snd_cwnd = ocwnd;
6830 if (SEQ_GT(onxt, tp->snd_nxt)) {
6831 tp->snd_nxt = onxt;
6832 }
6833 /*
6834 * Partial window deflation. Relies on fact that tp->snd_una
6835 * not updated yet.
6836 */
6837 if (tp->snd_cwnd > BYTES_ACKED(th, tp)) {
6838 tp->snd_cwnd -= BYTES_ACKED(th, tp);
6839 } else {
6840 tp->snd_cwnd = 0;
6841 }
6842 tp->snd_cwnd += tp->t_maxseg;
6843}
6844
6845/*
6846 * Drop a random TCP connection that hasn't been serviced yet and
6847 * is eligible for discard. There is a one in qlen chance that
6848 * we will return a null, saying that there are no droppable
6849 * requests. In this case, the protocol-specific code should drop
6850 * the new request. This ensures fairness.
6851 *
6852 * The listening TCP socket "head" must be locked
6853 */
6854static int
6855tcp_dropdropablreq(struct socket *head)
6856{
6857 struct socket *so, *sonext;
6858 unsigned int j, qlen;
6859 static uint32_t rnd = 0;
6860 static uint64_t old_runtime;
6861 static unsigned int cur_cnt, old_cnt;
6862 uint64_t now_sec, i;
6863 struct inpcb *inp = NULL;
6864 struct tcpcb *tp;
6865
6866 if ((head->so_options & SO_ACCEPTCONN) == 0) {
6867 return 0;
6868 }
6869
6870 if (TAILQ_EMPTY(&head->so_incomp)) {
6871 return 0;
6872 }
6873
6874 so_acquire_accept_list(head, NULL);
6875 socket_unlock(so: head, refcount: 0);
6876
6877 /*
6878 * Check if there is any socket in the incomp queue
6879 * that is closed because of a reset from the peer and is
6880 * waiting to be garbage collected. If so, pick that as
6881 * the victim
6882 */
6883 TAILQ_FOREACH_SAFE(so, &head->so_incomp, so_list, sonext) {
6884 inp = sotoinpcb(so);
6885 tp = intotcpcb(inp);
6886 if (tp != NULL && tp->t_state == TCPS_CLOSED &&
6887 so->so_head != NULL &&
6888 (so->so_state & (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE)) ==
6889 (SS_INCOMP | SS_CANTSENDMORE | SS_CANTRCVMORE)) {
6890 /*
6891 * The listen socket is already locked but we
6892 * can lock this socket here without lock ordering
6893 * issues because it is in the incomp queue and
6894 * is not visible to others.
6895 */
6896 if (socket_try_lock(so)) {
6897 so->so_usecount++;
6898 goto found_victim;
6899 } else {
6900 continue;
6901 }
6902 }
6903 }
6904
6905 so = TAILQ_FIRST(&head->so_incomp);
6906
6907 now_sec = net_uptime();
6908 if ((i = (now_sec - old_runtime)) != 0) {
6909 old_runtime = now_sec;
6910 old_cnt = cur_cnt / i;
6911 cur_cnt = 0;
6912 }
6913
6914 qlen = head->so_incqlen;
6915 if (rnd == 0) {
6916 rnd = RandomULong();
6917 }
6918
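	/*
	 * Advance a simple linear-congruential generator and use its 16-bit
	 * state to pick a pseudo-random victim index in [0, qlen].
	 */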
6919 if (++cur_cnt > qlen || old_cnt > qlen) {
6920 rnd = (314159 * rnd + 66329) & 0xffff;
6921 j = ((qlen + 1) * rnd) >> 16;
6922
6923 while (j-- && so) {
6924 so = TAILQ_NEXT(so, so_list);
6925 }
6926 }
6927 /* Find a connection that is not already closing (or being served) */
6928 while (so) {
6929 inp = (struct inpcb *)so->so_pcb;
6930
6931 sonext = TAILQ_NEXT(so, so_list);
6932
6933 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) != WNT_STOPUSING) {
6934 /*
6935 * Avoid the issue of a socket being accepted
6936 * by one input thread and being dropped by
6937 * another input thread. If we can't get a hold
6938 * on this mutex, then grab the next socket in
6939 * line.
6940 */
6941 if (socket_try_lock(so)) {
6942 so->so_usecount++;
6943 if ((so->so_usecount == 2) &&
6944 (so->so_state & SS_INCOMP) &&
6945 !(so->so_flags & SOF_INCOMP_INPROGRESS)) {
6946 break;
6947 } else {
6948 /*
6949 * don't use if being accepted or
6950 * used in any other way
6951 */
6952 in_pcb_checkstate(inp, WNT_RELEASE, 1);
6953 socket_unlock(so, refcount: 1);
6954 }
6955 } else {
6956 /*
6957 * do not try to lock the inp in
6958 * in_pcb_checkstate because the lock
6959 * is already held in some other thread.
6960 * Only drop the inp_wntcnt reference.
6961 */
6962 in_pcb_checkstate(inp, WNT_RELEASE, 1);
6963 }
6964 }
6965 so = sonext;
6966 }
6967 if (so == NULL) {
6968 socket_lock(so: head, refcount: 0);
6969 so_release_accept_list(head);
6970 return 0;
6971 }
6972
6973 /* Makes sure socket is still in the right state to be discarded */
6974
6975 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
6976 socket_unlock(so, refcount: 1);
6977 socket_lock(so: head, refcount: 0);
6978 so_release_accept_list(head);
6979 return 0;
6980 }
6981
6982found_victim:
6983 if (so->so_usecount != 2 || !(so->so_state & SS_INCOMP)) {
6984 /* do not discard: that socket is being accepted */
6985 socket_unlock(so, refcount: 1);
6986 socket_lock(so: head, refcount: 0);
6987 so_release_accept_list(head);
6988 return 0;
6989 }
6990
6991 socket_lock(so: head, refcount: 0);
6992 TAILQ_REMOVE(&head->so_incomp, so, so_list);
6993 head->so_incqlen--;
6994 head->so_qlen--;
6995 so->so_state &= ~SS_INCOMP;
6996 so->so_flags |= SOF_OVERFLOW;
6997 so->so_head = NULL;
6998 so_release_accept_list(head);
6999 socket_unlock(so: head, refcount: 0);
7000
7001 socket_lock_assert_owned(so);
7002 tp = sototcpcb(so);
7003
7004 tcp_close(tp);
7005 if (inp->inp_wantcnt > 0 && inp->inp_wantcnt != WNT_STOPUSING) {
7006 /*
7007		 * Someone has a wantcnt on this pcb. Since WNT_ACQUIRE
7008 * doesn't require a lock, it could have happened while
7009 * we are holding the lock. This pcb will have to
7010 * be garbage collected later.
7011 * Release the reference held for so_incomp queue
7012 */
7013 VERIFY(so->so_usecount > 0);
7014 so->so_usecount--;
7015 socket_unlock(so, refcount: 1);
7016 } else {
7017 /*
7018 * Unlock this socket and leave the reference on.
7019 * We need to acquire the pcbinfo lock in order to
7020		 * fully dispose of it.
7021 */
7022 socket_unlock(so, refcount: 0);
7023
7024 lck_rw_lock_exclusive(lck: &tcbinfo.ipi_lock);
7025
7026 socket_lock(so, refcount: 0);
7027 /* Release the reference held for so_incomp queue */
7028 VERIFY(so->so_usecount > 0);
7029 so->so_usecount--;
7030
7031 if (so->so_usecount != 1 ||
7032 (inp->inp_wantcnt > 0 &&
7033 inp->inp_wantcnt != WNT_STOPUSING)) {
7034 /*
7035 * There is an extra wantcount or usecount
7036 * that must have been added when the socket
7037 * was unlocked. This socket will have to be
7038 * garbage collected later
7039 */
7040 socket_unlock(so, refcount: 1);
7041 } else {
7042 /* Drop the reference held for this function */
7043 VERIFY(so->so_usecount > 0);
7044 so->so_usecount--;
7045
7046 in_pcbdispose(inp);
7047 }
7048 lck_rw_done(lck: &tcbinfo.ipi_lock);
7049 }
7050 tcpstat.tcps_drops++;
7051
7052 socket_lock(so: head, refcount: 0);
7053 return 1;
7054}
7055
7056/* Set background congestion control on a socket */
7057void
7058tcp_set_background_cc(struct socket *so)
7059{
7060 tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
7061}
7062
7063/* Set foreground congestion control on a socket */
7064void
7065tcp_set_foreground_cc(struct socket *so)
7066{
7067 if (tcp_use_newreno) {
7068 tcp_set_new_cc(so, TCP_CC_ALGO_NEWRENO_INDEX);
7069#if (DEVELOPMENT || DEBUG)
7070 } else if (tcp_use_ledbat) {
7071 /* Only used for testing */
7072 tcp_set_new_cc(so, TCP_CC_ALGO_BACKGROUND_INDEX);
7073#endif
7074 } else {
7075 tcp_set_new_cc(so, TCP_CC_ALGO_CUBIC_INDEX);
7076 }
7077}
7078
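/*
 * Switch the connection to the congestion control algorithm identified by
 * cc_index: let the current algorithm clean up, allocate state for the new
 * one, and invoke its switch_to handler.
 */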
7079static void
7080tcp_set_new_cc(struct socket *so, uint8_t cc_index)
7081{
7082 struct inpcb *inp = sotoinpcb(so);
7083 struct tcpcb *tp = intotcpcb(inp);
7084
7085 if (tp->tcp_cc_index != cc_index) {
7086 if (CC_ALGO(tp)->cleanup != NULL) {
7087 CC_ALGO(tp)->cleanup(tp);
7088 }
7089 tp->tcp_cc_index = cc_index;
7090
7091 tcp_cc_allocate_state(tp);
7092
7093 if (CC_ALGO(tp)->switch_to != NULL) {
7094 CC_ALGO(tp)->switch_to(tp);
7095 }
7096
7097 tcp_ccdbg_trace(tp, NULL, event: TCP_CC_CHANGE_ALGO);
7098 }
7099}
7100
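/*
 * Mark the socket's receive side as background traffic; when receive-side
 * LEDBAT is enabled, switch the receive-window machinery over to rledbat.
 */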
7101void
7102tcp_set_recv_bg(struct socket *so)
7103{
7104 if (!IS_TCP_RECV_BG(so)) {
7105 so->so_flags1 |= SOF1_TRAFFIC_MGT_TCP_RECVBG;
7106
7107 struct inpcb *inp = sotoinpcb(so);
7108 struct tcpcb *tp = intotcpcb(inp);
7109
7110 if (TCP_RLEDBAT_ENABLED(tp) && tcp_cc_rledbat.switch_to != NULL) {
7111 tcp_cc_rledbat.switch_to(tp);
7112 }
7113 }
7114}
7115
7116void
7117tcp_clear_recv_bg(struct socket *so)
7118{
7119 if (IS_TCP_RECV_BG(so)) {
7120 so->so_flags1 &= ~(SOF1_TRAFFIC_MGT_TCP_RECVBG);
7121 }
7122}
7123
void
inp_fc_throttle_tcp(struct inpcb *inp)
{
	struct tcpcb *tp = inp->inp_ppcb;

	if (!tcp_flow_control_response) {
		return;
	}

	/*
	 * Back off the slow-start threshold and enter
	 * congestion avoidance phase
	 */
	if (CC_ALGO(tp)->pre_fr != NULL) {
		CC_ALGO(tp)->pre_fr(tp);
	}
}

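/*
 * Flow-control unthrottle notification for a TCP connection: the outgoing
 * interface that had asserted flow control can accept packets again.
 * Restore the congestion state, reset the retransmit state, and restart
 * the output stream.
 */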
void
inp_fc_unthrottle_tcp(struct inpcb *inp)
{
	struct tcpcb *tp = inp->inp_ppcb;

	if (tcp_flow_control_response) {
		if (CC_ALGO(tp)->post_fr != NULL) {
			CC_ALGO(tp)->post_fr(tp, NULL);
		}

		tp->t_bytes_acked = 0;

		/*
		 * Reset the retransmit shift as we know that the reason
		 * for the delay in sending a packet is flow control on
		 * the outgoing interface. There is no need to back off
		 * the retransmit timer.
		 */
		TCP_RESET_REXMT_STATE(tp);

		tp->t_flagsext &= ~TF_CWND_NONVALIDATED;

		/*
		 * Start the output stream again. Since we are
		 * not retransmitting data, do not reset the
		 * retransmit timer or rtt calculation.
		 */
		tcp_output(tp);
		return;
	}

	/*
	 * Back off the slow-start threshold and enter
	 * congestion avoidance phase
	 */
	if (CC_ALGO(tp)->pre_fr != NULL) {
		CC_ALGO(tp)->pre_fr(tp);
	}

	tp->snd_cwnd = tp->snd_ssthresh;
	tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
	/*
	 * Restart counting for ABC as we changed the
	 * congestion window just now.
	 */
	tp->t_bytes_acked = 0;

	/*
	 * Reset the retransmit shift as we know that the reason
	 * for the delay in sending a packet is flow control on
	 * the outgoing interface. There is no need to back off
	 * the retransmit timer.
	 */
	TCP_RESET_REXMT_STATE(tp);

	/*
	 * Start the output stream again. Since we are
	 * not retransmitting data, do not reset the
	 * retransmit timer or rtt calculation.
	 */
	tcp_output(tp);
}

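/*
 * sysctl handler that exports the global TCP statistics (struct tcpstat).
 * On macOS, when access to the statistics is restricted, non-privileged
 * callers receive a zeroed copy instead.
 */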
static int
tcp_getstat SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)

	int error;
	struct tcpstat *stat;
	stat = &tcpstat;
#if XNU_TARGET_OS_OSX
	struct tcpstat zero_stat;

	if (tcp_disable_access_to_stats &&
	    !kauth_cred_issuser(kauth_cred_get())) {
		bzero(&zero_stat, sizeof(zero_stat));
		stat = &zero_stat;
	}

#endif /* XNU_TARGET_OS_OSX */

	if (req->oldptr == 0) {
		req->oldlen = (size_t)sizeof(struct tcpstat);
	}

	error = SYSCTL_OUT(req, stat, MIN(sizeof(tcpstat), req->oldlen));

	return error;
}

/*
 * Checksum extended TCP header and data.
 */
int
tcp_input_checksum(int af, struct mbuf *m, struct tcphdr *th, int off, int tlen)
{
	struct ifnet *ifp = m->m_pkthdr.rcvif;

	switch (af) {
	case AF_INET: {
		struct ip *ip = mtod(m, struct ip *);
		struct ipovly *ipov = (struct ipovly *)ip;

		/* ip_stripoptions() must have been called before we get here */
		ASSERT((ip->ip_hl << 2) == sizeof(*ip));

		if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
		    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
		    (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				th->th_sum = m->m_pkthdr.csum_rx_val;
			} else {
				uint32_t sum = m->m_pkthdr.csum_rx_val;
				uint32_t start = m->m_pkthdr.csum_rx_start;
				int32_t trailer = (m_pktlen(m) - (off + tlen));

				/*
				 * Perform 1's complement adjustment of octets
				 * that got included/excluded in the hardware-
				 * calculated checksum value. Ignore cases
				 * where the value already includes the entire
				 * IP header span, as the sum for those octets
				 * would already be 0 by the time we get here;
				 * IP has already performed its header checksum
				 * checks. If we do need to adjust, restore
				 * the original fields in the IP header when
				 * computing the adjustment value. Also take
				 * care of any trailing bytes and subtract out
				 * their partial sum.
				 */
				ASSERT(trailer >= 0);
				if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
				    ((start != 0 && start != off) || trailer)) {
					uint32_t swbytes = (uint32_t)trailer;

					if (start < off) {
						ip->ip_len += sizeof(*ip);
#if BYTE_ORDER != BIG_ENDIAN
						HTONS(ip->ip_len);
						HTONS(ip->ip_off);
#endif /* BYTE_ORDER != BIG_ENDIAN */
					}
					/* callee folds in sum */
					sum = m_adj_sum16(m, start, off,
					    tlen, sum);
					if (off > start) {
						swbytes += (off - start);
					} else {
						swbytes += (start - off);
					}

					if (start < off) {
#if BYTE_ORDER != BIG_ENDIAN
						NTOHS(ip->ip_off);
						NTOHS(ip->ip_len);
#endif /* BYTE_ORDER != BIG_ENDIAN */
						ip->ip_len -= sizeof(*ip);
					}

					if (swbytes != 0) {
						tcp_in_cksum_stats(swbytes);
					}
					if (trailer != 0) {
						m_adj(m, -trailer);
					}
				}

				/* callee folds in sum */
				th->th_sum = in_pseudo(ip->ip_src.s_addr,
				    ip->ip_dst.s_addr,
				    sum + htonl(tlen + IPPROTO_TCP));
			}
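			/*
			 * th_sum now holds the folded one's-complement sum
			 * over the pseudo-header and TCP segment, including
			 * the received checksum field; complementing it below
			 * means a valid segment leaves a residual of zero,
			 * which is what the final check in this function
			 * tests for.
			 */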
			th->th_sum ^= 0xffff;
		} else {
			uint16_t ip_sum;
			int len;
			char b[9];

			bcopy(ipov->ih_x1, b, sizeof(ipov->ih_x1));
			bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
			ip_sum = ipov->ih_len;
			ipov->ih_len = (u_short)tlen;
#if BYTE_ORDER != BIG_ENDIAN
			HTONS(ipov->ih_len);
#endif
			len = sizeof(struct ip) + tlen;
			th->th_sum = in_cksum(m, len);
			bcopy(b, ipov->ih_x1, sizeof(ipov->ih_x1));
			ipov->ih_len = ip_sum;

			tcp_in_cksum_stats(len);
		}
		break;
	}
	case AF_INET6: {
		struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);

		if ((hwcksum_rx || (ifp->if_flags & IFF_LOOPBACK) ||
		    (m->m_pkthdr.pkt_flags & PKTF_LOOP)) &&
		    (m->m_pkthdr.csum_flags & CSUM_DATA_VALID)) {
			if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) {
				th->th_sum = m->m_pkthdr.csum_rx_val;
			} else {
				uint32_t sum = m->m_pkthdr.csum_rx_val;
				uint32_t start = m->m_pkthdr.csum_rx_start;
				int32_t trailer = (m_pktlen(m) - (off + tlen));

				/*
				 * Perform 1's complement adjustment of octets
				 * that got included/excluded in the hardware-
				 * calculated checksum value. Also take care
				 * of any trailing bytes and subtract out their
				 * partial sum.
				 */
				ASSERT(trailer >= 0);
				if ((m->m_pkthdr.csum_flags & CSUM_PARTIAL) &&
				    (start != off || trailer != 0)) {
					uint16_t s = 0, d = 0;
					uint32_t swbytes = (uint32_t)trailer;

					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
						s = ip6->ip6_src.s6_addr16[1];
						ip6->ip6_src.s6_addr16[1] = 0;
					}
					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
						d = ip6->ip6_dst.s6_addr16[1];
						ip6->ip6_dst.s6_addr16[1] = 0;
					}

					/* callee folds in sum */
					sum = m_adj_sum16(m, start, off,
					    tlen, sum);
					if (off > start) {
						swbytes += (off - start);
					} else {
						swbytes += (start - off);
					}

					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src)) {
						ip6->ip6_src.s6_addr16[1] = s;
					}
					if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst)) {
						ip6->ip6_dst.s6_addr16[1] = d;
					}

					if (swbytes != 0) {
						tcp_in6_cksum_stats(swbytes);
					}
					if (trailer != 0) {
						m_adj(m, -trailer);
					}
				}

				th->th_sum = in6_pseudo(
					&ip6->ip6_src, &ip6->ip6_dst,
					sum + htonl(tlen + IPPROTO_TCP));
			}
			th->th_sum ^= 0xffff;
		} else {
			tcp_in6_cksum_stats(tlen);
			th->th_sum = in6_cksum(m, IPPROTO_TCP, off, tlen);
		}
		break;
	}
	default:
		VERIFY(0);
		/* NOTREACHED */
	}

	if (th->th_sum != 0) {
		tcpstat.tcps_rcvbadsum++;
		IF_TCP_STATINC(ifp, badformat);
		return -1;
	}

	return 0;
}

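/*
 * Helper for dump_tcp_reass_qlen(): after each scnprintf() into the buffer,
 * account for the bytes written (k), bail out to the done label when the
 * buffer is exhausted, and advance the output cursor.
 */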
#define DUMP_BUF_CHK() { \
	clen -= k; \
	if (clen < 1) \
		goto done; \
	c += k; \
}

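/*
 * Append the current total TCP reassembly queue length, if non-zero, to the
 * provided string buffer and return the number of bytes written.
 */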
int
dump_tcp_reass_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;

	if (tcp_reass_total_qlen != 0) {
		k = scnprintf(c, clen, "\ntcp reass qlen %d\n", tcp_reass_total_qlen);
		DUMP_BUF_CHK();
	}

done:
	return str_len - clen;
}

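/*
 * Return the amount of mbuf space currently consumed by the socket's TCP
 * reassembly queue, or 0 if the socket has no PCB or TCP control block.
 */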
uint32_t
tcp_reass_qlen_space(struct socket *so)
{
	uint32_t space = 0;
	struct inpcb *inp = sotoinpcb(so);

	if (inp != NULL) {
		struct tcpcb *tp = intotcpcb(inp);

		if (tp != NULL) {
			space = tp->t_reassq_mbcnt;
		}
	}
	return space;
}

SYSCTL_PROC(_net_inet_tcp, TCPCTL_STATS, stats,
    CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, tcp_getstat,
    "S,tcpstat", "TCP statistics (struct tcpstat, netinet/tcp_var.h)");
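
/*
 * A minimal userspace sketch (an illustration, not part of this file) of
 * reading the statistics exported above through the "net.inet.tcp.stats"
 * node with sysctlbyname(3), assuming struct tcpstat from
 * <netinet/tcp_var.h> is visible to the caller:
 *
 *	#include <stdio.h>
 *	#include <sys/sysctl.h>
 *	#include <netinet/tcp_var.h>
 *
 *	struct tcpstat ts;
 *	size_t len = sizeof(ts);
 *	if (sysctlbyname("net.inet.tcp.stats", &ts, &len, NULL, 0) == 0)
 *		printf("connections dropped: %u\n", ts.tcps_drops);
 */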
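
/*
 * sysctl handler for the net.inet.tcp.rexmt_thresh node registered below:
 * validates and updates tcprexmtthresh, the duplicate-ACK threshold that
 * triggers TCP fast retransmit.
 */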
static int
sysctl_rexmtthresh SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)

	int error, val = tcprexmtthresh;

	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || !req->newptr) {
		return error;
	}

	/*
	 * Constrain the number of duplicate ACKs
	 * to consider for TCP fast retransmit
	 * to either 2 or 3
	 */

	if (val < 2 || val > 3) {
		return EINVAL;
	}

	tcprexmtthresh = (uint8_t)val;

	return 0;
}

SYSCTL_PROC(_net_inet_tcp, OID_AUTO, rexmt_thresh, CTLTYPE_INT | CTLFLAG_RW |
    CTLFLAG_LOCKED, &tcprexmtthresh, 0, &sysctl_rexmtthresh, "I",
    "Duplicate ACK Threshold for Fast Retransmit");
