1/*
2 * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
61 */
62/*
63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64 * support for mandatory and extensible security protections. This notice
65 * is included in support of clause 2.2 (b) of the Apple Public License,
66 * Version 2.0.
67 */
68
69#include <sys/param.h>
70#include <sys/systm.h>
71#include <sys/callout.h>
72#include <sys/kernel.h>
73#include <sys/sysctl.h>
74#include <sys/malloc.h>
75#include <sys/mbuf.h>
76#include <sys/domain.h>
77#include <sys/proc.h>
78#include <sys/kauth.h>
79#include <sys/socket.h>
80#include <sys/socketvar.h>
81#include <sys/protosw.h>
82#include <sys/random.h>
83#include <sys/syslog.h>
84#include <sys/mcache.h>
85#include <kern/locks.h>
86#include <kern/zalloc.h>
87
88#include <dev/random/randomdev.h>
89
90#include <net/route.h>
91#include <net/if.h>
92#include <net/content_filter.h>
93#include <net/ntstat.h>
94
95#define tcp_minmssoverload fring
96#define _IP_VHL
97#include <netinet/in.h>
98#include <netinet/in_systm.h>
99#include <netinet/ip.h>
100#include <netinet/ip_icmp.h>
101#if INET6
102#include <netinet/ip6.h>
103#include <netinet/icmp6.h>
104#endif
105#include <netinet/in_pcb.h>
106#if INET6
107#include <netinet6/in6_pcb.h>
108#endif
109#include <netinet/in_var.h>
110#include <netinet/ip_var.h>
111#include <netinet/icmp_var.h>
112#if INET6
113#include <netinet6/ip6_var.h>
114#endif
115#include <netinet/mptcp_var.h>
116#include <netinet/tcp.h>
117#include <netinet/tcp_fsm.h>
118#include <netinet/tcp_seq.h>
119#include <netinet/tcp_timer.h>
120#include <netinet/tcp_var.h>
121#include <netinet/tcp_cc.h>
122#include <netinet/tcp_cache.h>
123#include <kern/thread_call.h>
124
125#if INET6
126#include <netinet6/tcp6_var.h>
127#endif
128#include <netinet/tcpip.h>
129#if TCPDEBUG
130#include <netinet/tcp_debug.h>
131#endif
132#include <netinet6/ip6protosw.h>
133
134#if IPSEC
135#include <netinet6/ipsec.h>
136#if INET6
137#include <netinet6/ipsec6.h>
138#endif
139#endif /* IPSEC */
140
141#if NECP
142#include <net/necp.h>
143#endif /* NECP */
144
145#undef tcp_minmssoverload
146
147#if CONFIG_MACF_NET
148#include <security/mac_framework.h>
149#endif /* CONFIG_MACF_NET */
150
151#include <corecrypto/ccaes.h>
152#include <libkern/crypto/aes.h>
153#include <libkern/crypto/md5.h>
154#include <sys/kdebug.h>
155#include <mach/sdt.h>
156
157#include <netinet/lro_ext.h>
158
159#define DBG_FNC_TCP_CLOSE NETDBG_CODE(DBG_NETTCP, ((5 << 8) | 2))
160
161static tcp_cc tcp_ccgen;
162extern int tcp_lq_overflow;
163
164extern struct tcptimerlist tcp_timer_list;
165extern struct tcptailq tcp_tw_tailq;
166
167SYSCTL_SKMEM_TCP_INT(TCPCTL_MSSDFLT, mssdflt, CTLFLAG_RW | CTLFLAG_LOCKED,
168 int, tcp_mssdflt, TCP_MSS, "Default TCP Maximum Segment Size");
169
170#if INET6
171SYSCTL_SKMEM_TCP_INT(TCPCTL_V6MSSDFLT, v6mssdflt,
172 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_v6mssdflt, TCP6_MSS,
173 "Default TCP Maximum Segment Size for IPv6");
174#endif
175
176int tcp_sysctl_fastopenkey(struct sysctl_oid *, void *, int,
177 struct sysctl_req *);
178SYSCTL_PROC(_net_inet_tcp, OID_AUTO, fastopen_key, CTLTYPE_STRING | CTLFLAG_WR,
179 0, 0, tcp_sysctl_fastopenkey, "S", "TCP Fastopen key");
180
181/* Current count of half-open TFO connections */
182int tcp_tfo_halfcnt = 0;
183
184/* Maximum backlog of half-open TFO connections */
185SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen_backlog,
186 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_tfo_backlog, 10,
187 "Backlog queue for half-open TFO connections");
188
189SYSCTL_SKMEM_TCP_INT(OID_AUTO, fastopen, CTLFLAG_RW | CTLFLAG_LOCKED,
190 int, tcp_fastopen, TCP_FASTOPEN_CLIENT | TCP_FASTOPEN_SERVER,
191 "Enable TCP Fastopen (RFC 7413)");
192
193SYSCTL_SKMEM_TCP_INT(OID_AUTO, now_init, CTLFLAG_RD | CTLFLAG_LOCKED,
194 uint32_t, tcp_now_init, 0, "Initial tcp now value");
195
196SYSCTL_SKMEM_TCP_INT(OID_AUTO, microuptime_init, CTLFLAG_RD | CTLFLAG_LOCKED,
197 uint32_t, tcp_microuptime_init, 0, "Initial tcp uptime value in microseconds");
198
199/*
200 * Minimum MSS we accept and use. This prevents DoS attacks where
201 * we are forced to a ridiculously low MSS like 20 and send hundreds
202 * of packets instead of one. The effect scales with the available
203 * bandwidth and quickly saturates the CPU and network interface
204 * with packet generation and sending. Set to zero to disable MINMSS
205 * checking. This setting prevents us from sending overly small packets.
206 */
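/*
 * Illustration (not part of the original comment): with a forced MSS of
 * 20 bytes, a 64 KB write takes roughly 65536 / 20 ~= 3277 segments,
 * versus about 45 segments at a 1460-byte MSS, so per-packet header and
 * processing overhead dominates.
 */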
207SYSCTL_SKMEM_TCP_INT(OID_AUTO, minmss, CTLFLAG_RW | CTLFLAG_LOCKED,
208 int, tcp_minmss, TCP_MINMSS, "Minimum TCP Maximum Segment Size");
209int tcp_do_rfc1323 = 1;
210#if (DEVELOPMENT || DEBUG)
211SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1323, rfc1323,
212 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1323, 0,
213 "Enable rfc1323 (high performance TCP) extensions");
214#endif /* (DEVELOPMENT || DEBUG) */
215
216/* Not used */
217static int tcp_do_rfc1644 = 0;
218SYSCTL_INT(_net_inet_tcp, TCPCTL_DO_RFC1644, rfc1644,
219 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_do_rfc1644, 0,
220 "Enable rfc1644 (TTCP) extensions");
221
222SYSCTL_SKMEM_TCP_INT(OID_AUTO, do_tcpdrain, CTLFLAG_RW | CTLFLAG_LOCKED,
223 static int, do_tcpdrain, 0,
224 "Enable tcp_drain routine for extra help when low on mbufs");
225
226SYSCTL_INT(_net_inet_tcp, OID_AUTO, pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
227 &tcbinfo.ipi_count, 0, "Number of active PCBs");
228
229SYSCTL_INT(_net_inet_tcp, OID_AUTO, tw_pcbcount, CTLFLAG_RD | CTLFLAG_LOCKED,
230 &tcbinfo.ipi_twcount, 0, "Number of pcbs in time-wait state");
231
232SYSCTL_SKMEM_TCP_INT(OID_AUTO, icmp_may_rst, CTLFLAG_RW | CTLFLAG_LOCKED,
233 static int, icmp_may_rst, 1,
234 "Certain ICMP unreachable messages may abort connections in SYN_SENT");
235
236static int tcp_strict_rfc1948 = 0;
237static int tcp_isn_reseed_interval = 0;
238#if (DEVELOPMENT || DEBUG)
239SYSCTL_INT(_net_inet_tcp, OID_AUTO, strict_rfc1948, CTLFLAG_RW | CTLFLAG_LOCKED,
240 &tcp_strict_rfc1948, 0, "Determines if RFC1948 is followed exactly");
241
242SYSCTL_INT(_net_inet_tcp, OID_AUTO, isn_reseed_interval,
243 CTLFLAG_RW | CTLFLAG_LOCKED,
244 &tcp_isn_reseed_interval, 0, "Seconds between reseeding of ISN secret");
245#endif /* (DEVELOPMENT || DEBUG) */
246
247SYSCTL_SKMEM_TCP_INT(OID_AUTO, rtt_min, CTLFLAG_RW | CTLFLAG_LOCKED,
248 int, tcp_TCPTV_MIN, 100, "Minimum RTT value allowed");
249
250SYSCTL_SKMEM_TCP_INT(OID_AUTO, rexmt_slop, CTLFLAG_RW,
251 int, tcp_rexmt_slop, TCPTV_REXMTSLOP, "Slop added to retransmit timeout");
252
253SYSCTL_SKMEM_TCP_INT(OID_AUTO, randomize_ports, CTLFLAG_RW | CTLFLAG_LOCKED,
254 __private_extern__ int, tcp_use_randomport, 0,
255 "Randomize TCP port numbers");
256
257SYSCTL_SKMEM_TCP_INT(OID_AUTO, win_scale_factor, CTLFLAG_RW | CTLFLAG_LOCKED,
258 __private_extern__ int, tcp_win_scale, 3, "Window scaling factor");
259
260static void tcp_cleartaocache(void);
261static void tcp_notify(struct inpcb *, int);
262
263struct zone *sack_hole_zone;
264struct zone *tcp_reass_zone;
265struct zone *tcp_bwmeas_zone;
266struct zone *tcp_rxt_seg_zone;
267
268extern int slowlink_wsize; /* window correction for slow links */
269extern int path_mtu_discovery;
270
271static void tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb);
272
273#define TCP_BWMEAS_BURST_MINSIZE 6
274#define TCP_BWMEAS_BURST_MAXSIZE 25
275
276static uint32_t bwmeas_elm_size;
277
278/*
279 * Target size of TCP PCB hash tables. Must be a power of two.
280 *
281 * Note that this can be overridden by the kernel environment
282 * variable net.inet.tcp.tcbhashsize
283 */
284#ifndef TCBHASHSIZE
285#define TCBHASHSIZE CONFIG_TCBHASHSIZE
286#endif
287
288__private_extern__ int tcp_tcbhashsize = TCBHASHSIZE;
289SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcbhashsize, CTLFLAG_RD | CTLFLAG_LOCKED,
290 &tcp_tcbhashsize, 0, "Size of TCP control-block hashtable");
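/*
 * Note (descriptive only): if the configured value is not a power of two,
 * tcp_init() below rescales it with scale_to_powerof2() and enforces a
 * floor of 16 hash buckets.
 */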
291
292/*
293 * This is the actual shape of what we allocate using the zone
294 * allocator. Doing it this way allows us to protect both structures
295 * using the same generation count, and also eliminates the overhead
296 * of allocating tcpcbs separately. By hiding the structure here,
297 * we avoid changing most of the rest of the code (although it needs
298 * to be changed, eventually, for greater efficiency).
299 */
300#define ALIGNMENT 32
301struct inp_tp {
302 struct inpcb inp;
303 struct tcpcb tcb __attribute__((aligned(ALIGNMENT)));
304};
305#undef ALIGNMENT
306
307int get_inpcb_str_size(void);
308int get_tcp_str_size(void);
309
310static void tcpcb_to_otcpcb(struct tcpcb *, struct otcpcb *);
311
312static lck_attr_t *tcp_uptime_mtx_attr = NULL;
313static lck_grp_t *tcp_uptime_mtx_grp = NULL;
314static lck_grp_attr_t *tcp_uptime_mtx_grp_attr = NULL;
315int tcp_notsent_lowat_check(struct socket *so);
316static void tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
317 struct if_lim_perf_stat *stat);
318static void tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
319 struct if_tcp_ecn_perf_stat *stat);
320
321static aes_encrypt_ctx tfo_ctx; /* Crypto-context for TFO */
322
323void
324tcp_tfo_gen_cookie(struct inpcb *inp, u_char *out, size_t blk_size)
325{
326 u_char in[CCAES_BLOCK_SIZE];
327#if INET6
328 int isipv6 = inp->inp_vflag & INP_IPV6;
329#endif
330
331 VERIFY(blk_size == CCAES_BLOCK_SIZE);
332
333 bzero(&in[0], CCAES_BLOCK_SIZE);
334 bzero(&out[0], CCAES_BLOCK_SIZE);
335
336#if INET6
337 if (isipv6)
338 memcpy(in, &inp->in6p_faddr, sizeof(struct in6_addr));
339 else
340#endif /* INET6 */
341 memcpy(in, &inp->inp_faddr, sizeof(struct in_addr));
342
343 aes_encrypt_cbc(in, NULL, 1, out, &tfo_ctx);
344}
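/*
 * Summary of the above (descriptive only): the TFO cookie is the AES-128
 * encryption, under the key installed via the fastopen_key sysctl or
 * tcp_tfo_init(), of a single 16-byte block containing the zero-padded
 * peer address (IPv4 or IPv6).
 */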
345
346__private_extern__ int
347tcp_sysctl_fastopenkey(__unused struct sysctl_oid *oidp, __unused void *arg1,
348 __unused int arg2, struct sysctl_req *req)
349{
350 int error = 0;
351 /*
352 * TFO-key is expressed as a string in hex format
353 * (+1 to account for \0 char)
354 */
355 char keystring[TCP_FASTOPEN_KEYLEN * 2 + 1];
356 u_int32_t key[TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)];
357 int i;
358
359 /* -1, because newlen is len without the terminating \0 character */
360 if (req->newlen != (sizeof(keystring) - 1)) {
361 error = EINVAL;
362 goto exit;
363 }
364
365 /*
366 * sysctl_io_string copies keystring into the oldptr of the sysctl_req.
367 * Make sure everything is zero, to avoid putting garbage in there or
368 * leaking the stack.
369 */
370 bzero(keystring, sizeof(keystring));
371
372 error = sysctl_io_string(req, keystring, sizeof(keystring), 0, NULL);
373 if (error)
374 goto exit;
375
376 for (i = 0; i < (TCP_FASTOPEN_KEYLEN / sizeof(u_int32_t)); i++) {
377 /*
378 * We step through the keystring in 8-character chunks
379 * (4 bytes encoded as hex)
380 */
381 if (sscanf(&keystring[i * 8], "%8x", &key[i]) != 1) {
382 error = EINVAL;
383 goto exit;
384 }
385 }
386
387 aes_encrypt_key128((u_char *)key, &tfo_ctx);
388
389exit:
390 return (error);
391}
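/*
 * Usage sketch (illustrative, not part of the original source): the key
 * must be written as exactly TCP_FASTOPEN_KEYLEN * 2 hex characters and
 * is parsed as consecutive 32-bit words. With a 16-byte key, e.g.:
 *
 *	sysctl net.inet.tcp.fastopen_key=000102030405060708090a0b0c0d0e0f
 */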
392
393int
394get_inpcb_str_size(void)
395{
396 return (sizeof(struct inpcb));
397}
398
399int
400get_tcp_str_size(void)
401{
402 return (sizeof(struct tcpcb));
403}
404
405static int scale_to_powerof2(int size);
406
407/*
408 * This helper routine returns one of the following scaled values of size:
409 * 1. The power of two rounded down from size, if size is not already a
410 * power of two and rounding up would overflow.
411 * OR
412 * 2. The power of two rounded up from size, if size is not already a
413 * power of two and rounding up does not overflow.
414 * OR
415 * 3. size itself, if it is already a power of two.
416 */
417static int
418scale_to_powerof2(int size)
{
419 /* Handle special case of size = 0 */
420 int ret = size ? size : 1;
421
422 if (!powerof2(ret)) {
423 while (!powerof2(size)) {
424 /*
425 * Clear out least significant
426 * set bit till size is left with
427 * its highest set bit at which point
428 * it is rounded down power of two.
429 */
430 size = size & (size - 1);
431 }
432
433 /* Check for overflow when rounding up */
434 if (0 == (size << 1)) {
435 ret = size;
436 } else {
437 ret = size << 1;
438 }
439 }
440
441 return (ret);
442}
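/*
 * Worked examples (illustrative only):
 *	scale_to_powerof2(0)   -> 1	(special case)
 *	scale_to_powerof2(5)   -> 8	(rounded up)
 *	scale_to_powerof2(512) -> 512	(already a power of two)
 *	scale_to_powerof2(600) -> 1024	(rounded up)
 */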
443
444static void
445tcp_tfo_init(void)
446{
447 u_char key[TCP_FASTOPEN_KEYLEN];
448
449 read_frandom(key, sizeof(key));
450 aes_encrypt_key128(key, &tfo_ctx);
451}
452
453/*
454 * TCP initialization
455 */
456void
457tcp_init(struct protosw *pp, struct domain *dp)
458{
459#pragma unused(dp)
460 static int tcp_initialized = 0;
461 vm_size_t str_size;
462 struct inpcbinfo *pcbinfo;
463
464 VERIFY((pp->pr_flags & (PR_INITIALIZED|PR_ATTACHED)) == PR_ATTACHED);
465
466 if (tcp_initialized)
467 return;
468 tcp_initialized = 1;
469
470 tcp_ccgen = 1;
471 tcp_cleartaocache();
472
473 tcp_keepinit = TCPTV_KEEP_INIT;
474 tcp_keepidle = TCPTV_KEEP_IDLE;
475 tcp_keepintvl = TCPTV_KEEPINTVL;
476 tcp_keepcnt = TCPTV_KEEPCNT;
477 tcp_maxpersistidle = TCPTV_KEEP_IDLE;
478 tcp_msl = TCPTV_MSL;
479
480 microuptime(&tcp_uptime);
481 read_frandom(&tcp_now, sizeof(tcp_now));
482
483 /* Starts tcp internal clock at a random value */
484 tcp_now = tcp_now & 0x3fffffff;
485
486 /* expose initial uptime/now via sysctl for utcp to keep time sync */
487 tcp_now_init = tcp_now;
488 tcp_microuptime_init =
489 tcp_uptime.tv_usec + (tcp_uptime.tv_sec * USEC_PER_SEC);
490 SYSCTL_SKMEM_UPDATE_FIELD(tcp.microuptime_init, tcp_microuptime_init);
491 SYSCTL_SKMEM_UPDATE_FIELD(tcp.now_init, tcp_now_init);
492
493 tcp_tfo_init();
494
495 LIST_INIT(&tcb);
496 tcbinfo.ipi_listhead = &tcb;
497
498 pcbinfo = &tcbinfo;
499 /*
500 * allocate lock group attribute and group for tcp pcb mutexes
501 */
502 pcbinfo->ipi_lock_grp_attr = lck_grp_attr_alloc_init();
503 pcbinfo->ipi_lock_grp = lck_grp_alloc_init("tcppcb",
504 pcbinfo->ipi_lock_grp_attr);
505
506 /*
507 * allocate the lock attribute for tcp pcb mutexes
508 */
509 pcbinfo->ipi_lock_attr = lck_attr_alloc_init();
510
511 if ((pcbinfo->ipi_lock = lck_rw_alloc_init(pcbinfo->ipi_lock_grp,
512 pcbinfo->ipi_lock_attr)) == NULL) {
513 panic("%s: unable to allocate PCB lock\n", __func__);
514 /* NOTREACHED */
515 }
516
517 if (tcp_tcbhashsize == 0) {
518 /* Set to default */
519 tcp_tcbhashsize = 512;
520 }
521
522 if (!powerof2(tcp_tcbhashsize)) {
523 int old_hash_size = tcp_tcbhashsize;
524 tcp_tcbhashsize = scale_to_powerof2(tcp_tcbhashsize);
525 /* Lower limit of 16 */
526 if (tcp_tcbhashsize < 16) {
527 tcp_tcbhashsize = 16;
528 }
529 printf("WARNING: TCB hash size not a power of 2, "
530 "scaled from %d to %d.\n",
531 old_hash_size,
532 tcp_tcbhashsize);
533 }
534
535 tcbinfo.ipi_hashbase = hashinit(tcp_tcbhashsize, M_PCB,
536 &tcbinfo.ipi_hashmask);
537 tcbinfo.ipi_porthashbase = hashinit(tcp_tcbhashsize, M_PCB,
538 &tcbinfo.ipi_porthashmask);
539 str_size = P2ROUNDUP(sizeof(struct inp_tp), sizeof(u_int64_t));
540 tcbinfo.ipi_zone = zinit(str_size, 120000*str_size, 8192, "tcpcb");
541 zone_change(tcbinfo.ipi_zone, Z_CALLERACCT, FALSE);
542 zone_change(tcbinfo.ipi_zone, Z_EXPAND, TRUE);
543
544 tcbinfo.ipi_gc = tcp_gc;
545 tcbinfo.ipi_timer = tcp_itimer;
546 in_pcbinfo_attach(&tcbinfo);
547
548 str_size = P2ROUNDUP(sizeof(struct sackhole), sizeof(u_int64_t));
549 sack_hole_zone = zinit(str_size, 120000*str_size, 8192,
550 "sack_hole zone");
551 zone_change(sack_hole_zone, Z_CALLERACCT, FALSE);
552 zone_change(sack_hole_zone, Z_EXPAND, TRUE);
553
554 str_size = P2ROUNDUP(sizeof(struct tseg_qent), sizeof(u_int64_t));
555 tcp_reass_zone = zinit(str_size, (nmbclusters >> 4) * str_size,
556 0, "tcp_reass_zone");
557 if (tcp_reass_zone == NULL) {
558 panic("%s: failed allocating tcp_reass_zone", __func__);
559 /* NOTREACHED */
560 }
561 zone_change(tcp_reass_zone, Z_CALLERACCT, FALSE);
562 zone_change(tcp_reass_zone, Z_EXPAND, TRUE);
563
564 bwmeas_elm_size = P2ROUNDUP(sizeof(struct bwmeas), sizeof(u_int64_t));
565 tcp_bwmeas_zone = zinit(bwmeas_elm_size, (100 * bwmeas_elm_size), 0,
566 "tcp_bwmeas_zone");
567 if (tcp_bwmeas_zone == NULL) {
568 panic("%s: failed allocating tcp_bwmeas_zone", __func__);
569 /* NOTREACHED */
570 }
571 zone_change(tcp_bwmeas_zone, Z_CALLERACCT, FALSE);
572 zone_change(tcp_bwmeas_zone, Z_EXPAND, TRUE);
573
574 str_size = P2ROUNDUP(sizeof(struct tcp_ccstate), sizeof(u_int64_t));
575 tcp_cc_zone = zinit(str_size, 20000 * str_size, 0, "tcp_cc_zone");
576 zone_change(tcp_cc_zone, Z_CALLERACCT, FALSE);
577 zone_change(tcp_cc_zone, Z_EXPAND, TRUE);
578
579 str_size = P2ROUNDUP(sizeof(struct tcp_rxt_seg), sizeof(u_int64_t));
580 tcp_rxt_seg_zone = zinit(str_size, 10000 * str_size, 0,
581 "tcp_rxt_seg_zone");
582 zone_change(tcp_rxt_seg_zone, Z_CALLERACCT, FALSE);
583 zone_change(tcp_rxt_seg_zone, Z_EXPAND, TRUE);
584
585#if INET6
586#define TCP_MINPROTOHDR (sizeof(struct ip6_hdr) + sizeof(struct tcphdr))
587#else /* INET6 */
588#define TCP_MINPROTOHDR (sizeof(struct tcpiphdr))
589#endif /* INET6 */
590 if (max_protohdr < TCP_MINPROTOHDR) {
591 _max_protohdr = TCP_MINPROTOHDR;
592 _max_protohdr = max_protohdr; /* round it up */
593 }
594 if (max_linkhdr + max_protohdr > MCLBYTES)
595 panic("tcp_init");
596#undef TCP_MINPROTOHDR
597
598 /* Initialize time wait and timer lists */
599 TAILQ_INIT(&tcp_tw_tailq);
600
601 bzero(&tcp_timer_list, sizeof(tcp_timer_list));
602 LIST_INIT(&tcp_timer_list.lhead);
603 /*
604 * allocate lock group attribute, group and attribute for
605 * the tcp timer list
606 */
607 tcp_timer_list.mtx_grp_attr = lck_grp_attr_alloc_init();
608 tcp_timer_list.mtx_grp = lck_grp_alloc_init("tcptimerlist",
609 tcp_timer_list.mtx_grp_attr);
610 tcp_timer_list.mtx_attr = lck_attr_alloc_init();
611 if ((tcp_timer_list.mtx = lck_mtx_alloc_init(tcp_timer_list.mtx_grp,
612 tcp_timer_list.mtx_attr)) == NULL) {
613 panic("failed to allocate memory for tcp_timer_list.mtx\n");
614 }
615 tcp_timer_list.call = thread_call_allocate(tcp_run_timerlist, NULL);
616 if (tcp_timer_list.call == NULL) {
617 panic("failed to allocate call entry 1 in tcp_init\n");
618 }
619
620 /*
621 * allocate lock group attribute, group and attribute for
622 * tcp_uptime_lock
623 */
624 tcp_uptime_mtx_grp_attr = lck_grp_attr_alloc_init();
625 tcp_uptime_mtx_grp = lck_grp_alloc_init("tcpuptime",
626 tcp_uptime_mtx_grp_attr);
627 tcp_uptime_mtx_attr = lck_attr_alloc_init();
628 tcp_uptime_lock = lck_spin_alloc_init(tcp_uptime_mtx_grp,
629 tcp_uptime_mtx_attr);
630
631 /* Initialize TCP LRO data structures */
632 tcp_lro_init();
633
634 /* Initialize TCP Cache */
635 tcp_cache_init();
636
637 /*
638 * If more than 60 MB of mbuf pool is available, increase the
639 * maximum allowed receive and send socket buffer size.
640 */
641 if (nmbclusters > 30720) {
642 tcp_autorcvbuf_max = 2 * 1024 * 1024;
643 tcp_autosndbuf_max = 2 * 1024 * 1024;
644
645 SYSCTL_SKMEM_UPDATE_FIELD(tcp.autorcvbufmax, tcp_autorcvbuf_max);
646 SYSCTL_SKMEM_UPDATE_FIELD(tcp.autosndbufmax, tcp_autosndbuf_max);
647 }
648}
649
650/*
651 * Fill in the IP and TCP headers for an outgoing packet, given the tcpcb.
652 * tcp_template used to store this data in mbufs, but we now recopy it out
653 * of the tcpcb each time to conserve mbufs.
654 */
655void
656tcp_fillheaders(struct tcpcb *tp, void *ip_ptr, void *tcp_ptr)
657{
658 struct inpcb *inp = tp->t_inpcb;
659 struct tcphdr *tcp_hdr = (struct tcphdr *)tcp_ptr;
660
661#if INET6
662 if ((inp->inp_vflag & INP_IPV6) != 0) {
663 struct ip6_hdr *ip6;
664
665 ip6 = (struct ip6_hdr *)ip_ptr;
666 ip6->ip6_flow = (ip6->ip6_flow & ~IPV6_FLOWINFO_MASK) |
667 (inp->inp_flow & IPV6_FLOWINFO_MASK);
668 ip6->ip6_vfc = (ip6->ip6_vfc & ~IPV6_VERSION_MASK) |
669 (IPV6_VERSION & IPV6_VERSION_MASK);
670 ip6->ip6_plen = htons(sizeof(struct tcphdr));
671 ip6->ip6_nxt = IPPROTO_TCP;
672 ip6->ip6_hlim = 0;
673 ip6->ip6_src = inp->in6p_laddr;
674 ip6->ip6_dst = inp->in6p_faddr;
675 tcp_hdr->th_sum = in6_pseudo(&inp->in6p_laddr, &inp->in6p_faddr,
676 htonl(sizeof (struct tcphdr) + IPPROTO_TCP));
677 } else
678#endif
679 {
680 struct ip *ip = (struct ip *) ip_ptr;
681
682 ip->ip_vhl = IP_VHL_BORING;
683 ip->ip_tos = 0;
684 ip->ip_len = 0;
685 ip->ip_id = 0;
686 ip->ip_off = 0;
687 ip->ip_ttl = 0;
688 ip->ip_sum = 0;
689 ip->ip_p = IPPROTO_TCP;
690 ip->ip_src = inp->inp_laddr;
691 ip->ip_dst = inp->inp_faddr;
692 tcp_hdr->th_sum =
693 in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
694 htons(sizeof(struct tcphdr) + IPPROTO_TCP));
695 }
696
697 tcp_hdr->th_sport = inp->inp_lport;
698 tcp_hdr->th_dport = inp->inp_fport;
699 tcp_hdr->th_seq = 0;
700 tcp_hdr->th_ack = 0;
701 tcp_hdr->th_x2 = 0;
702 tcp_hdr->th_off = 5;
703 tcp_hdr->th_flags = 0;
704 tcp_hdr->th_win = 0;
705 tcp_hdr->th_urp = 0;
706}
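/*
 * Note (descriptive only): th_sum above is pre-loaded with the
 * pseudo-header checksum (addresses, protocol and a header-only TCP
 * length), so the final checksum over the TCP header and any payload can
 * be folded in later, in software or by checksum offload.
 */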
707
708/*
709 * Create template to be used to send tcp packets on a connection.
710 * Allocates an mbuf and fills in a skeletal tcp/ip header. The only
711 * use for this function is in keepalives, which use tcp_respond.
712 */
713struct tcptemp *
714tcp_maketemplate(struct tcpcb *tp)
715{
716 struct mbuf *m;
717 struct tcptemp *n;
718
719 m = m_get(M_DONTWAIT, MT_HEADER);
720 if (m == NULL)
721 return (0);
722 m->m_len = sizeof(struct tcptemp);
723 n = mtod(m, struct tcptemp *);
724
725 tcp_fillheaders(tp, (void *)&n->tt_ipgen, (void *)&n->tt_t);
726 return (n);
727}
728
729/*
730 * Send a single message to the TCP at address specified by
731 * the given TCP/IP header. If m == 0, then we make a copy
732 * of the tcpiphdr at ti and send directly to the addressed host.
733 * This is used to force keep alive messages out using the TCP
734 * template for a connection. If flags are given then we send
735 * a message back to the TCP which originated the segment ti,
736 * and discard the mbuf containing it and any other attached mbufs.
737 *
738 * In any case the ack and sequence number of the transmitted
739 * segment are as specified by the parameters.
740 *
741 * NOTE: If m != NULL, then ti must point to *inside* the mbuf.
742 */
743void
744tcp_respond(struct tcpcb *tp, void *ipgen, struct tcphdr *th, struct mbuf *m,
745 tcp_seq ack, tcp_seq seq, int flags, struct tcp_respond_args *tra)
746{
747 int tlen;
748 int win = 0;
749 struct route *ro = 0;
750 struct route sro;
751 struct ip *ip;
752 struct tcphdr *nth;
753#if INET6
754 struct route_in6 *ro6 = 0;
755 struct route_in6 sro6;
756 struct ip6_hdr *ip6;
757 int isipv6;
758#endif /* INET6 */
759 struct ifnet *outif;
760 int sotc = SO_TC_UNSPEC;
761
762#if INET6
763 isipv6 = IP_VHL_V(((struct ip *)ipgen)->ip_vhl) == 6;
764 ip6 = ipgen;
765#endif /* INET6 */
766 ip = ipgen;
767
768 if (tp) {
769 if (!(flags & TH_RST)) {
770 win = tcp_sbspace(tp);
771 if (win > (int32_t)TCP_MAXWIN << tp->rcv_scale)
772 win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
773 }
774#if INET6
775 if (isipv6)
776 ro6 = &tp->t_inpcb->in6p_route;
777 else
778#endif /* INET6 */
779 ro = &tp->t_inpcb->inp_route;
780 } else {
781#if INET6
782 if (isipv6) {
783 ro6 = &sro6;
784 bzero(ro6, sizeof(*ro6));
785 } else
786#endif /* INET6 */
787 {
788 ro = &sro;
789 bzero(ro, sizeof(*ro));
790 }
791 }
792 if (m == 0) {
793 m = m_gethdr(M_DONTWAIT, MT_HEADER); /* MAC-OK */
794 if (m == NULL)
795 return;
796 tlen = 0;
797 m->m_data += max_linkhdr;
798#if INET6
799 if (isipv6) {
800 VERIFY((MHLEN - max_linkhdr) >=
801 (sizeof (*ip6) + sizeof (*nth)));
802 bcopy((caddr_t)ip6, mtod(m, caddr_t),
803 sizeof(struct ip6_hdr));
804 ip6 = mtod(m, struct ip6_hdr *);
805 nth = (struct tcphdr *)(void *)(ip6 + 1);
806 } else
807#endif /* INET6 */
808 {
809 VERIFY((MHLEN - max_linkhdr) >=
810 (sizeof (*ip) + sizeof (*nth)));
811 bcopy((caddr_t)ip, mtod(m, caddr_t), sizeof(struct ip));
812 ip = mtod(m, struct ip *);
813 nth = (struct tcphdr *)(void *)(ip + 1);
814 }
815 bcopy((caddr_t)th, (caddr_t)nth, sizeof(struct tcphdr));
816#if MPTCP
817 if ((tp) && (tp->t_mpflags & TMPF_RESET))
818 flags = (TH_RST | TH_ACK);
819 else
820#endif
821 flags = TH_ACK;
822 } else {
823 m_freem(m->m_next);
824 m->m_next = 0;
825 m->m_data = (caddr_t)ipgen;
826 /* m_len is set later */
827 tlen = 0;
828#define xchg(a, b, type) { type t; t = a; a = b; b = t; }
829#if INET6
830 if (isipv6) {
831 /* Expect 32-bit aligned IP on strict-align platforms */
832 IP6_HDR_STRICT_ALIGNMENT_CHECK(ip6);
833 xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
834 nth = (struct tcphdr *)(void *)(ip6 + 1);
835 } else
836#endif /* INET6 */
837 {
838 /* Expect 32-bit aligned IP on strict-align platforms */
839 IP_HDR_STRICT_ALIGNMENT_CHECK(ip);
840 xchg(ip->ip_dst.s_addr, ip->ip_src.s_addr, n_long);
841 nth = (struct tcphdr *)(void *)(ip + 1);
842 }
843 if (th != nth) {
844 /*
845 * This is usually the case when an extension header
846 * exists between the IPv6 header and the
847 * TCP header.
848 */
849 nth->th_sport = th->th_sport;
850 nth->th_dport = th->th_dport;
851 }
852 xchg(nth->th_dport, nth->th_sport, n_short);
853#undef xchg
854 }
855#if INET6
856 if (isipv6) {
857 ip6->ip6_plen = htons((u_short)(sizeof (struct tcphdr) +
858 tlen));
859 tlen += sizeof (struct ip6_hdr) + sizeof (struct tcphdr);
860 } else
861#endif
862 {
863 tlen += sizeof (struct tcpiphdr);
864 ip->ip_len = tlen;
865 ip->ip_ttl = ip_defttl;
866 }
867 m->m_len = tlen;
868 m->m_pkthdr.len = tlen;
869 m->m_pkthdr.rcvif = 0;
870#if CONFIG_MACF_NET
871 if (tp != NULL && tp->t_inpcb != NULL) {
872 /*
873 * Packet is associated with a socket, so allow the
874 * label of the response to reflect the socket label.
875 */
876 mac_mbuf_label_associate_inpcb(tp->t_inpcb, m);
877 } else {
878 /*
879 * Packet is not associated with a socket, so possibly
880 * update the label in place.
881 */
882 mac_netinet_tcp_reply(m);
883 }
884#endif
885
886 nth->th_seq = htonl(seq);
887 nth->th_ack = htonl(ack);
888 nth->th_x2 = 0;
889 nth->th_off = sizeof (struct tcphdr) >> 2;
890 nth->th_flags = flags;
891 if (tp)
892 nth->th_win = htons((u_short) (win >> tp->rcv_scale));
893 else
894 nth->th_win = htons((u_short)win);
895 nth->th_urp = 0;
896#if INET6
897 if (isipv6) {
898 nth->th_sum = 0;
899 nth->th_sum = in6_pseudo(&ip6->ip6_src, &ip6->ip6_dst,
900 htonl((tlen - sizeof (struct ip6_hdr)) + IPPROTO_TCP));
901 m->m_pkthdr.csum_flags = CSUM_TCPIPV6;
902 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
903 ip6->ip6_hlim = in6_selecthlim(tp ? tp->t_inpcb : NULL,
904 ro6 && ro6->ro_rt ? ro6->ro_rt->rt_ifp : NULL);
905 } else
906#endif /* INET6 */
907 {
908 nth->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr,
909 htons((u_short)(tlen - sizeof(struct ip) + ip->ip_p)));
910 m->m_pkthdr.csum_flags = CSUM_TCP;
911 m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
912 }
913#if TCPDEBUG
914 if (tp == NULL || (tp->t_inpcb->inp_socket->so_options & SO_DEBUG))
915 tcp_trace(TA_OUTPUT, 0, tp, mtod(m, void *), th, 0);
916#endif
917
918#if NECP
919 necp_mark_packet_from_socket(m, tp ? tp->t_inpcb : NULL, 0, 0, 0);
920#endif /* NECP */
921
922#if IPSEC
923 if (tp != NULL && tp->t_inpcb->inp_sp != NULL &&
924 ipsec_setsocket(m, tp ? tp->t_inpcb->inp_socket : NULL) != 0) {
925 m_freem(m);
926 return;
927 }
928#endif
929
930 if (tp != NULL) {
931 u_int32_t svc_flags = 0;
932 if (isipv6) {
933 svc_flags |= PKT_SCF_IPV6;
934 }
935 sotc = tp->t_inpcb->inp_socket->so_traffic_class;
936 set_packet_service_class(m, tp->t_inpcb->inp_socket,
937 sotc, svc_flags);
938
939 /* Embed flowhash and flow control flags */
940 m->m_pkthdr.pkt_flowsrc = FLOWSRC_INPCB;
941 m->m_pkthdr.pkt_flowid = tp->t_inpcb->inp_flowhash;
942 m->m_pkthdr.pkt_flags |= (PKTF_FLOW_ID | PKTF_FLOW_LOCALSRC | PKTF_FLOW_ADV);
943 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
944 m->m_pkthdr.tx_tcp_pid = tp->t_inpcb->inp_socket->last_pid;
945 m->m_pkthdr.tx_tcp_e_pid = tp->t_inpcb->inp_socket->e_pid;
946 }
947
948#if INET6
949 if (isipv6) {
950 struct ip6_out_args ip6oa;
951 bzero(&ip6oa, sizeof(ip6oa));
952 ip6oa.ip6oa_boundif = tra->ifscope;
953 ip6oa.ip6oa_flags = IP6OAF_SELECT_SRCIF | IP6OAF_BOUND_SRCADDR;
954 ip6oa.ip6oa_sotc = SO_TC_UNSPEC;
955 ip6oa.ip6oa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
956
957 if (tra->ifscope != IFSCOPE_NONE)
958 ip6oa.ip6oa_flags |= IP6OAF_BOUND_IF;
959 if (tra->nocell)
960 ip6oa.ip6oa_flags |= IP6OAF_NO_CELLULAR;
961 if (tra->noexpensive)
962 ip6oa.ip6oa_flags |= IP6OAF_NO_EXPENSIVE;
963 if (tra->awdl_unrestricted)
964 ip6oa.ip6oa_flags |= IP6OAF_AWDL_UNRESTRICTED;
965 if (tra->intcoproc_allowed)
966 ip6oa.ip6oa_flags |= IP6OAF_INTCOPROC_ALLOWED;
967 ip6oa.ip6oa_sotc = sotc;
968 if (tp != NULL) {
969 if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED))
970 ip6oa.ip6oa_flags |= IP6OAF_QOSMARKING_ALLOWED;
971 ip6oa.ip6oa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
972 }
973 (void) ip6_output(m, NULL, ro6, IPV6_OUTARGS, NULL,
974 NULL, &ip6oa);
975
976 if (tp != NULL && ro6 != NULL && ro6->ro_rt != NULL &&
977 (outif = ro6->ro_rt->rt_ifp) !=
978 tp->t_inpcb->in6p_last_outifp) {
979 tp->t_inpcb->in6p_last_outifp = outif;
980 }
981
982 if (ro6 == &sro6)
983 ROUTE_RELEASE(ro6);
984 } else
985#endif /* INET6 */
986 {
987 struct ip_out_args ipoa;
988 bzero(&ipoa, sizeof(ipoa));
989 ipoa.ipoa_boundif = tra->ifscope;
990 ipoa.ipoa_flags = IPOAF_SELECT_SRCIF | IPOAF_BOUND_SRCADDR;
991 ipoa.ipoa_sotc = SO_TC_UNSPEC;
992 ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
993
994 if (tra->ifscope != IFSCOPE_NONE)
995 ipoa.ipoa_flags |= IPOAF_BOUND_IF;
996 if (tra->nocell)
997 ipoa.ipoa_flags |= IPOAF_NO_CELLULAR;
998 if (tra->noexpensive)
999 ipoa.ipoa_flags |= IPOAF_NO_EXPENSIVE;
1000 if (tra->awdl_unrestricted)
1001 ipoa.ipoa_flags |= IPOAF_AWDL_UNRESTRICTED;
1002 ipoa.ipoa_sotc = sotc;
1003 if (tp != NULL) {
1004 if ((tp->t_inpcb->inp_socket->so_flags1 & SOF1_QOSMARKING_ALLOWED))
1005 ipoa.ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
1006 ipoa.ipoa_netsvctype = tp->t_inpcb->inp_socket->so_netsvctype;
1007 }
1008 if (ro != &sro) {
1009 /* Copy the cached route and take an extra reference */
1010 inp_route_copyout(tp->t_inpcb, &sro);
1011 }
1012 /*
1013 * For consistency, pass a local route copy.
1014 */
1015 (void) ip_output(m, NULL, &sro, IP_OUTARGS, NULL, &ipoa);
1016
1017 if (tp != NULL && sro.ro_rt != NULL &&
1018 (outif = sro.ro_rt->rt_ifp) !=
1019 tp->t_inpcb->inp_last_outifp) {
1020 tp->t_inpcb->inp_last_outifp = outif;
1021
1022 }
1023 if (ro != &sro) {
1024 /* Synchronize cached PCB route */
1025 inp_route_copyin(tp->t_inpcb, &sro);
1026 } else {
1027 ROUTE_RELEASE(&sro);
1028 }
1029 }
1030}
1031
1032/*
1033 * Create a new TCP control block, making an
1034 * empty reassembly queue and hooking it to the argument
1035 * protocol control block. The `inp' parameter must have
1036 * come from the zone allocator set up in tcp_init().
1037 */
1038struct tcpcb *
1039tcp_newtcpcb(struct inpcb *inp)
1040{
1041 struct inp_tp *it;
1042 struct tcpcb *tp;
1043 struct socket *so = inp->inp_socket;
1044#if INET6
1045 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1046#endif /* INET6 */
1047
1048 calculate_tcp_clock();
1049
1050 if ((so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER) == 0) {
1051 it = (struct inp_tp *)(void *)inp;
1052 tp = &it->tcb;
1053 } else {
1054 tp = (struct tcpcb *)(void *)inp->inp_saved_ppcb;
1055 }
1056
1057 bzero((char *) tp, sizeof(struct tcpcb));
1058 LIST_INIT(&tp->t_segq);
1059 tp->t_maxseg = tp->t_maxopd =
1060#if INET6
1061 isipv6 ? tcp_v6mssdflt :
1062#endif /* INET6 */
1063 tcp_mssdflt;
1064
1065 if (tcp_do_rfc1323)
1066 tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP);
1067 if (tcp_do_sack)
1068 tp->t_flagsext |= TF_SACK_ENABLE;
1069
1070 TAILQ_INIT(&tp->snd_holes);
1071 SLIST_INIT(&tp->t_rxt_segments);
1072 SLIST_INIT(&tp->t_notify_ack);
1073 tp->t_inpcb = inp;
1074 /*
1075 * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
1076 * rtt estimate. Set rttvar so that srtt + 4 * rttvar gives
1077 * reasonable initial retransmit time.
1078 */
1079 tp->t_srtt = TCPTV_SRTTBASE;
1080 tp->t_rttvar =
1081 ((TCPTV_RTOBASE - TCPTV_SRTTBASE) << TCP_RTTVAR_SHIFT) / 4;
1082 tp->t_rttmin = tcp_TCPTV_MIN;
1083 tp->t_rxtcur = TCPTV_RTOBASE;
1084
1085 if (tcp_use_newreno)
1086 /* use newreno only when explicitly enabled */
1087 tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX;
1088 else
1089 tp->tcp_cc_index = TCP_CC_ALGO_CUBIC_INDEX;
1090
1091 tcp_cc_allocate_state(tp);
1092
1093 if (CC_ALGO(tp)->init != NULL)
1094 CC_ALGO(tp)->init(tp);
1095
1096 tp->snd_cwnd = TCP_CC_CWND_INIT_BYTES;
1097 tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1098 tp->snd_ssthresh_prev = TCP_MAXWIN << TCP_MAX_WINSHIFT;
1099 tp->t_rcvtime = tcp_now;
1100 tp->tentry.timer_start = tcp_now;
1101 tp->t_persist_timeout = tcp_max_persist_timeout;
1102 tp->t_persist_stop = 0;
1103 tp->t_flagsext |= TF_RCVUNACK_WAITSS;
1104 tp->t_rexmtthresh = tcprexmtthresh;
1105
1106 /* Enable bandwidth measurement on this connection */
1107 tp->t_flagsext |= TF_MEASURESNDBW;
1108 if (tp->t_bwmeas == NULL) {
1109 tp->t_bwmeas = tcp_bwmeas_alloc(tp);
1110 if (tp->t_bwmeas == NULL)
1111 tp->t_flagsext &= ~TF_MEASURESNDBW;
1112 }
1113
1114 /* Clear time wait tailq entry */
1115 tp->t_twentry.tqe_next = NULL;
1116 tp->t_twentry.tqe_prev = NULL;
1117
1118 /*
1119 * IPv4 TTL initialization is necessary for an IPv6 socket as well,
1120 * because the socket may be bound to an IPv6 wildcard address,
1121 * which may match an IPv4-mapped IPv6 address.
1122 */
1123 inp->inp_ip_ttl = ip_defttl;
1124 inp->inp_ppcb = (caddr_t)tp;
1125 return (tp); /* XXX */
1126}
1127
1128/*
1129 * Drop a TCP connection, reporting
1130 * the specified error. If connection is synchronized,
1131 * then send a RST to peer.
1132 */
1133struct tcpcb *
1134tcp_drop(struct tcpcb *tp, int errno)
1135{
1136 struct socket *so = tp->t_inpcb->inp_socket;
1137#if CONFIG_DTRACE
1138 struct inpcb *inp = tp->t_inpcb;
1139#endif
1140
1141 if (TCPS_HAVERCVDSYN(tp->t_state)) {
1142 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1143 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1144 tp->t_state = TCPS_CLOSED;
1145 (void) tcp_output(tp);
1146 tcpstat.tcps_drops++;
1147 } else
1148 tcpstat.tcps_conndrops++;
1149 if (errno == ETIMEDOUT && tp->t_softerror)
1150 errno = tp->t_softerror;
1151 so->so_error = errno;
1152 return (tcp_close(tp));
1153}
1154
1155void
1156tcp_getrt_rtt(struct tcpcb *tp, struct rtentry *rt)
1157{
1158 u_int32_t rtt = rt->rt_rmx.rmx_rtt;
1159 int isnetlocal = (tp->t_flags & TF_LOCAL);
1160
1161 if (rtt != 0) {
1162 /*
1163 * XXX the lock bit for RTT indicates that the value
1164 * is also a minimum value; this is subject to time.
1165 */
1166 if (rt->rt_rmx.rmx_locks & RTV_RTT)
1167 tp->t_rttmin = rtt / (RTM_RTTUNIT / TCP_RETRANSHZ);
1168 else
1169 tp->t_rttmin = isnetlocal ? tcp_TCPTV_MIN :
1170 TCPTV_REXMTMIN;
1171 tp->t_srtt =
1172 rtt / (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1173 tcpstat.tcps_usedrtt++;
1174 if (rt->rt_rmx.rmx_rttvar) {
1175 tp->t_rttvar = rt->rt_rmx.rmx_rttvar /
1176 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1177 tcpstat.tcps_usedrttvar++;
1178 } else {
1179 /* default variation is +- 1 rtt */
1180 tp->t_rttvar =
1181 tp->t_srtt * TCP_RTTVAR_SCALE / TCP_RTT_SCALE;
1182 }
1183 TCPT_RANGESET(tp->t_rxtcur,
1184 ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1,
1185 tp->t_rttmin, TCPTV_REXMTMAX,
1186 TCP_ADD_REXMTSLOP(tp));
1187 }
1188}
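/*
 * Note on units (descriptive only): route metrics such as rmx_rtt and
 * rmx_rttvar are expressed in RTM_RTTUNIT fractions of a second, while
 * the tcpcb keeps RTT in TCP_RETRANSHZ ticks scaled by TCP_RTT_SCALE /
 * TCP_RTTVAR_SCALE; the divisions above convert between the two.
 */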
1189
1190static inline void
1191tcp_create_ifnet_stats_per_flow(struct tcpcb *tp,
1192 struct ifnet_stats_per_flow *ifs)
1193{
1194 struct inpcb *inp;
1195 struct socket *so;
1196 if (tp == NULL || ifs == NULL)
1197 return;
1198
1199 bzero(ifs, sizeof(*ifs));
1200 inp = tp->t_inpcb;
1201 so = inp->inp_socket;
1202
1203 ifs->ipv4 = (inp->inp_vflag & INP_IPV6) ? 0 : 1;
1204 ifs->local = (tp->t_flags & TF_LOCAL) ? 1 : 0;
1205 ifs->connreset = (so->so_error == ECONNRESET) ? 1 : 0;
1206 ifs->conntimeout = (so->so_error == ETIMEDOUT) ? 1 : 0;
1207 ifs->ecn_flags = tp->ecn_flags;
1208 ifs->txretransmitbytes = tp->t_stat.txretransmitbytes;
1209 ifs->rxoutoforderbytes = tp->t_stat.rxoutoforderbytes;
1210 ifs->rxmitpkts = tp->t_stat.rxmitpkts;
1211 ifs->rcvoopack = tp->t_rcvoopack;
1212 ifs->pawsdrop = tp->t_pawsdrop;
1213 ifs->sack_recovery_episodes = tp->t_sack_recovery_episode;
1214 ifs->reordered_pkts = tp->t_reordered_pkts;
1215 ifs->dsack_sent = tp->t_dsack_sent;
1216 ifs->dsack_recvd = tp->t_dsack_recvd;
1217 ifs->srtt = tp->t_srtt;
1218 ifs->rttupdated = tp->t_rttupdated;
1219 ifs->rttvar = tp->t_rttvar;
1220 ifs->rttmin = get_base_rtt(tp);
1221 if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_sndbw_max > 0) {
1222 ifs->bw_sndbw_max = tp->t_bwmeas->bw_sndbw_max;
1223 } else {
1224 ifs->bw_sndbw_max = 0;
1225 }
1226 if (tp->t_bwmeas != NULL && tp->t_bwmeas->bw_rcvbw_max > 0) {
1227 ifs->bw_rcvbw_max = tp->t_bwmeas->bw_rcvbw_max;
1228 } else {
1229 ifs->bw_rcvbw_max = 0;
1230 }
1231 ifs->bk_txpackets = so->so_tc_stats[MBUF_TC_BK].txpackets;
1232 ifs->txpackets = inp->inp_stat->txpackets;
1233 ifs->rxpackets = inp->inp_stat->rxpackets;
1234}
1235
1236static inline void
1237tcp_flow_ecn_perf_stats(struct ifnet_stats_per_flow *ifs,
1238 struct if_tcp_ecn_perf_stat *stat)
1239{
1240 u_int64_t curval, oldval;
1241 stat->total_txpkts += ifs->txpackets;
1242 stat->total_rxpkts += ifs->rxpackets;
1243 stat->total_rxmitpkts += ifs->rxmitpkts;
1244 stat->total_oopkts += ifs->rcvoopack;
1245 stat->total_reorderpkts += (ifs->reordered_pkts +
1246 ifs->pawsdrop + ifs->dsack_sent + ifs->dsack_recvd);
1247
1248 /* Average RTT */
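	/* Smoothed as an EWMA: avg = (15 * avg + sample) / 16 */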
1249 curval = ifs->srtt >> TCP_RTT_SHIFT;
1250 if (curval > 0 && ifs->rttupdated >= 16) {
1251 if (stat->rtt_avg == 0) {
1252 stat->rtt_avg = curval;
1253 } else {
1254 oldval = stat->rtt_avg;
1255 stat->rtt_avg = ((oldval << 4) - oldval + curval) >> 4;
1256 }
1257 }
1258
1259 /* RTT variance */
1260 curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1261 if (curval > 0 && ifs->rttupdated >= 16) {
1262 if (stat->rtt_var == 0) {
1263 stat->rtt_var = curval;
1264 } else {
1265 oldval = stat->rtt_var;
1266 stat->rtt_var =
1267 ((oldval << 4) - oldval + curval) >> 4;
1268 }
1269 }
1270
1271 /* SACK episodes */
1272 stat->sack_episodes += ifs->sack_recovery_episodes;
1273 if (ifs->connreset)
1274 stat->rst_drop++;
1275}
1276
1277static inline void
1278tcp_flow_lim_stats(struct ifnet_stats_per_flow *ifs,
1279 struct if_lim_perf_stat *stat)
1280{
1281 u_int64_t curval, oldval;
1282
1283 stat->lim_total_txpkts += ifs->txpackets;
1284 stat->lim_total_rxpkts += ifs->rxpackets;
1285 stat->lim_total_retxpkts += ifs->rxmitpkts;
1286 stat->lim_total_oopkts += ifs->rcvoopack;
1287
1288 if (ifs->bw_sndbw_max > 0) {
1289 /* convert from bytes per ms to bits per second */
1290 ifs->bw_sndbw_max *= 8000;
1291 stat->lim_ul_max_bandwidth = max(stat->lim_ul_max_bandwidth,
1292 ifs->bw_sndbw_max);
1293 }
1294
1295 if (ifs->bw_rcvbw_max > 0) {
1296 /* convert from bytes per ms to bits per second */
1297 ifs->bw_rcvbw_max *= 8000;
1298 stat->lim_dl_max_bandwidth = max(stat->lim_dl_max_bandwidth,
1299 ifs->bw_rcvbw_max);
1300 }
1301
1302 /* Average RTT */
1303 curval = ifs->srtt >> TCP_RTT_SHIFT;
1304 if (curval > 0 && ifs->rttupdated >= 16) {
1305 if (stat->lim_rtt_average == 0) {
1306 stat->lim_rtt_average = curval;
1307 } else {
1308 oldval = stat->lim_rtt_average;
1309 stat->lim_rtt_average =
1310 ((oldval << 4) - oldval + curval) >> 4;
1311 }
1312 }
1313
1314 /* RTT variance */
1315 curval = ifs->rttvar >> TCP_RTTVAR_SHIFT;
1316 if (curval > 0 && ifs->rttupdated >= 16) {
1317 if (stat->lim_rtt_variance == 0) {
1318 stat->lim_rtt_variance = curval;
1319 } else {
1320 oldval = stat->lim_rtt_variance;
1321 stat->lim_rtt_variance =
1322 ((oldval << 4) - oldval + curval) >> 4;
1323 }
1324 }
1325
1326 if (stat->lim_rtt_min == 0) {
1327 stat->lim_rtt_min = ifs->rttmin;
1328 } else {
1329 stat->lim_rtt_min = min(stat->lim_rtt_min, ifs->rttmin);
1330 }
1331
1332 /* connection timeouts */
1333 stat->lim_conn_attempts++;
1334 if (ifs->conntimeout)
1335 stat->lim_conn_timeouts++;
1336
1337 /* bytes sent using background delay-based algorithms */
1338 stat->lim_bk_txpkts += ifs->bk_txpackets;
1339
1340}
1341
1342/*
1343 * Close a TCP control block:
1344 * discard all space held by the tcp
1345 * discard internet protocol block
1346 * wake up any sleepers
1347 */
1348struct tcpcb *
1349tcp_close(struct tcpcb *tp)
1350{
1351 struct inpcb *inp = tp->t_inpcb;
1352 struct socket *so = inp->inp_socket;
1353#if INET6
1354 int isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
1355#endif /* INET6 */
1356 struct route *ro;
1357 struct rtentry *rt;
1358 int dosavessthresh;
1359 struct ifnet_stats_per_flow ifs;
1360
1361 /* tcp_close was called previously, bail */
1362 if (inp->inp_ppcb == NULL)
1363 return (NULL);
1364
1365 tcp_canceltimers(tp);
1366 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_START, tp, 0, 0, 0, 0);
1367
1368 /*
1369 * If another thread for this tcp is currently in ip (indicated by
1370 * the TF_SENDINPROG flag), defer the cleanup until after it returns
1371 * back to tcp. This is done to serialize the close until after all
1372 * pending output is finished, in order to avoid having the PCB be
1373 * detached and the cached route cleaned, only for ip to cache the
1374 * route back into the PCB again. Note that we've cleared all the
1375 * timers at this point. Set TF_CLOSING to indicate to tcp_output()
1376 * that it should call us again once it returns from ip; at that
1377 * point both flags should be cleared and we can proceed further
1378 * with the cleanup.
1379 */
1380 if ((tp->t_flags & TF_CLOSING) ||
1381 inp->inp_sndinprog_cnt > 0) {
1382 tp->t_flags |= TF_CLOSING;
1383 return (NULL);
1384 }
1385
1386 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
1387 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
1388
1389#if INET6
1390 ro = (isipv6 ? (struct route *)&inp->in6p_route : &inp->inp_route);
1391#else
1392 ro = &inp->inp_route;
1393#endif
1394 rt = ro->ro_rt;
1395 if (rt != NULL)
1396 RT_LOCK_SPIN(rt);
1397
1398 /*
1399 * If we got enough samples through the srtt filter,
1400 * save the rtt and rttvar in the routing entry.
1401 * 'Enough' is arbitrarily defined as 16 samples.
1402 * 16 samples is enough for the srtt filter to converge
1403 * to within 5% of the correct value; fewer samples and
1404 * we could save a very bogus rtt.
1405 *
1406 * Don't update the default route's characteristics and don't
1407 * update anything that the user "locked".
1408 */
1409 if (tp->t_rttupdated >= 16) {
1410 u_int32_t i = 0;
1411
1412#if INET6
1413 if (isipv6) {
1414 struct sockaddr_in6 *sin6;
1415
1416 if (rt == NULL)
1417 goto no_valid_rt;
1418 sin6 = (struct sockaddr_in6 *)(void *)rt_key(rt);
1419 if (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr))
1420 goto no_valid_rt;
1421 }
1422 else
1423#endif /* INET6 */
1424 if (ROUTE_UNUSABLE(ro) ||
1425 SIN(rt_key(rt))->sin_addr.s_addr == INADDR_ANY) {
1426 DTRACE_TCP4(state__change, void, NULL,
1427 struct inpcb *, inp, struct tcpcb *, tp,
1428 int32_t, TCPS_CLOSED);
1429 tp->t_state = TCPS_CLOSED;
1430 goto no_valid_rt;
1431 }
1432
1433 RT_LOCK_ASSERT_HELD(rt);
1434 if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
1435 i = tp->t_srtt *
1436 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTT_SCALE));
1437 if (rt->rt_rmx.rmx_rtt && i)
1438 /*
1439 * filter this update to half the old & half
1440 * the new values, converting scale.
1441 * See route.h and tcp_var.h for a
1442 * description of the scaling constants.
1443 */
1444 rt->rt_rmx.rmx_rtt =
1445 (rt->rt_rmx.rmx_rtt + i) / 2;
1446 else
1447 rt->rt_rmx.rmx_rtt = i;
1448 tcpstat.tcps_cachedrtt++;
1449 }
1450 if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
1451 i = tp->t_rttvar *
1452 (RTM_RTTUNIT / (TCP_RETRANSHZ * TCP_RTTVAR_SCALE));
1453 if (rt->rt_rmx.rmx_rttvar && i)
1454 rt->rt_rmx.rmx_rttvar =
1455 (rt->rt_rmx.rmx_rttvar + i) / 2;
1456 else
1457 rt->rt_rmx.rmx_rttvar = i;
1458 tcpstat.tcps_cachedrttvar++;
1459 }
1460 /*
1461 * The old comment here said:
1462 * update the pipelimit (ssthresh) if it has been updated
1463 * already or if a pipesize was specified & the threshold
1464 * got below half the pipesize. I.e., wait for bad news
1465 * before we start updating, then update on both good
1466 * and bad news.
1467 *
1468 * But we want to save the ssthresh even if no pipesize is
1469 * specified explicitly in the route, because such
1470 * connections still have an implicit pipesize specified
1471 * by the global tcp_sendspace. In the absence of a reliable
1472 * way to calculate the pipesize, it will have to do.
1473 */
1474 i = tp->snd_ssthresh;
1475 if (rt->rt_rmx.rmx_sendpipe != 0)
1476 dosavessthresh = (i < rt->rt_rmx.rmx_sendpipe / 2);
1477 else
1478 dosavessthresh = (i < so->so_snd.sb_hiwat / 2);
1479 if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
1480 i != 0 && rt->rt_rmx.rmx_ssthresh != 0) ||
1481 dosavessthresh) {
1482 /*
1483 * convert the limit from user data bytes to
1484 * packets then to packet data bytes.
1485 */
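			/*
			 * Example (illustrative only): an ssthresh of 65535
			 * bytes with a 1460-byte MSS becomes
			 * (65535 + 730) / 1460 = 45 segments, cached as
			 * 45 * (1460 + 40) = 67500 bytes of packet data on an
			 * IPv4 connection.
			 */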
1486 i = (i + tp->t_maxseg / 2) / tp->t_maxseg;
1487 if (i < 2)
1488 i = 2;
1489 i *= (u_int32_t)(tp->t_maxseg +
1490#if INET6
1491 (isipv6 ? sizeof (struct ip6_hdr) +
1492 sizeof (struct tcphdr) :
1493 sizeof (struct tcpiphdr)));
#else /* INET6 */
1494 sizeof (struct tcpiphdr));
#endif /* INET6 */
1495 if (rt->rt_rmx.rmx_ssthresh)
1496 rt->rt_rmx.rmx_ssthresh =
1497 (rt->rt_rmx.rmx_ssthresh + i) / 2;
1498 else
1499 rt->rt_rmx.rmx_ssthresh = i;
1500 tcpstat.tcps_cachedssthresh++;
1501 }
1502 }
1503
1504 /*
1505 * Mark route for deletion if no information is cached.
1506 */
1507 if (rt != NULL && (so->so_flags & SOF_OVERFLOW) && tcp_lq_overflow) {
1508 if (!(rt->rt_rmx.rmx_locks & RTV_RTT) &&
1509 rt->rt_rmx.rmx_rtt == 0) {
1510 rt->rt_flags |= RTF_DELCLONE;
1511 }
1512 }
1513
1514no_valid_rt:
1515 if (rt != NULL)
1516 RT_UNLOCK(rt);
1517
1518 /* free the reassembly queue, if any */
1519 (void) tcp_freeq(tp);
1520
1521 /* performance stats per interface */
1522 tcp_create_ifnet_stats_per_flow(tp, &ifs);
1523 tcp_update_stats_per_flow(&ifs, inp->inp_last_outifp);
1524
1525 tcp_free_sackholes(tp);
1526 tcp_notify_ack_free(tp);
1527
1528 inp_decr_sndbytes_allunsent(so, tp->snd_una);
1529
1530 if (tp->t_bwmeas != NULL) {
1531 tcp_bwmeas_free(tp);
1532 }
1533 tcp_rxtseg_clean(tp);
1534 /* Free the packet list */
1535 if (tp->t_pktlist_head != NULL)
1536 m_freem_list(tp->t_pktlist_head);
1537 TCP_PKTLIST_CLEAR(tp);
1538
1539 if (so->so_flags1 & SOF1_CACHED_IN_SOCK_LAYER)
1540 inp->inp_saved_ppcb = (caddr_t) tp;
1541
1542 tp->t_state = TCPS_CLOSED;
1543
1544 /*
1545 * Issue a wakeup before detach so that we don't miss
1546 * a wakeup
1547 */
1548 sodisconnectwakeup(so);
1549
1550 /*
1551 * Clean up any LRO state
1552 */
1553 if (tp->t_flagsext & TF_LRO_OFFLOADED) {
1554 tcp_lro_remove_state(inp->inp_laddr, inp->inp_faddr,
1555 inp->inp_lport, inp->inp_fport);
1556 tp->t_flagsext &= ~TF_LRO_OFFLOADED;
1557 }
1558
1559 /*
1560 * If this is a socket that does not want to wake up the device
1561 * for its traffic, the application might need to know that the
1562 * socket is closed, so send a notification.
1563 */
1564 if ((so->so_options & SO_NOWAKEFROMSLEEP) &&
1565 inp->inp_state != INPCB_STATE_DEAD &&
1566 !(inp->inp_flags2 & INP2_TIMEWAIT))
1567 socket_post_kev_msg_closed(so);
1568
1569 if (CC_ALGO(tp)->cleanup != NULL) {
1570 CC_ALGO(tp)->cleanup(tp);
1571 }
1572
1573 if (tp->t_ccstate != NULL) {
1574 zfree(tcp_cc_zone, tp->t_ccstate);
1575 tp->t_ccstate = NULL;
1576 }
1577 tp->tcp_cc_index = TCP_CC_ALGO_NONE;
1578
1579 /* Can happen if we close the socket before receiving the third ACK */
1580 if ((tp->t_tfo_flags & TFO_F_COOKIE_VALID)) {
1581 OSDecrementAtomic(&tcp_tfo_halfcnt);
1582
1583 /* Panic if something has gone terribly wrong. */
1584 VERIFY(tcp_tfo_halfcnt >= 0);
1585
1586 tp->t_tfo_flags &= ~TFO_F_COOKIE_VALID;
1587 }
1588
1589#if INET6
1590 if (SOCK_CHECK_DOM(so, PF_INET6))
1591 in6_pcbdetach(inp);
1592 else
1593#endif /* INET6 */
1594 in_pcbdetach(inp);
1595
1596 /*
1597 * Call soisdisconnected after detach because it might unlock the socket
1598 */
1599 soisdisconnected(so);
1600 tcpstat.tcps_closed++;
1601 KERNEL_DEBUG(DBG_FNC_TCP_CLOSE | DBG_FUNC_END,
1602 tcpstat.tcps_closed, 0, 0, 0, 0);
1603 return (NULL);
1604}
1605
1606int
1607tcp_freeq(struct tcpcb *tp)
1608{
1609 struct tseg_qent *q;
1610 int rv = 0;
1611
1612 while ((q = LIST_FIRST(&tp->t_segq)) != NULL) {
1613 LIST_REMOVE(q, tqe_q);
1614 m_freem(q->tqe_m);
1615 zfree(tcp_reass_zone, q);
1616 rv = 1;
1617 }
1618 tp->t_reassqlen = 0;
1619 return (rv);
1620}
1621
1622
1623/*
1624 * Walk the tcpcbs, if any, and flush each reassembly queue
1625 * when do_tcpdrain is enabled.
1626 * Also defunct any extended-background-idle sockets.
1627 * Skip this pass if the pcbinfo lock is in use; it will be done next time.
1628 */
1629void
1630tcp_drain(void)
1631{
1632 struct inpcb *inp;
1633 struct tcpcb *tp;
1634
1635 if (!lck_rw_try_lock_exclusive(tcbinfo.ipi_lock))
1636 return;
1637
1638 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1639 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) !=
1640 WNT_STOPUSING) {
1641 socket_lock(inp->inp_socket, 1);
1642 if (in_pcb_checkstate(inp, WNT_RELEASE, 1)
1643 == WNT_STOPUSING) {
1644 /* lost a race, try the next one */
1645 socket_unlock(inp->inp_socket, 1);
1646 continue;
1647 }
1648 tp = intotcpcb(inp);
1649
1650 if (do_tcpdrain)
1651 tcp_freeq(tp);
1652
1653 so_drain_extended_bk_idle(inp->inp_socket);
1654
1655 socket_unlock(inp->inp_socket, 1);
1656 }
1657 }
1658 lck_rw_done(tcbinfo.ipi_lock);
1659
1660}
1661
1662/*
1663 * Notify a tcp user of an asynchronous error;
1664 * store the error as a soft error.
1665 *
1666 * Do not wake up the user, since there is currently no mechanism for
1667 * reporting soft errors (yet - a kqueue filter may be added).
1669 */
1670static void
1671tcp_notify(struct inpcb *inp, int error)
1672{
1673 struct tcpcb *tp;
1674
1675 if (inp == NULL || (inp->inp_state == INPCB_STATE_DEAD))
1676 return; /* pcb is gone already */
1677
1678 tp = (struct tcpcb *)inp->inp_ppcb;
1679
1680 VERIFY(tp != NULL);
1681 /*
1682 * Ignore some errors if we are hooked up.
1683 * If connection hasn't completed, has retransmitted several times,
1684 * and receives a second error, give up now. This is better
1685 * than waiting a long time to establish a connection that
1686 * can never complete.
1687 */
1688 if (tp->t_state == TCPS_ESTABLISHED &&
1689 (error == EHOSTUNREACH || error == ENETUNREACH ||
1690 error == EHOSTDOWN)) {
1691 if (inp->inp_route.ro_rt) {
1692 rtfree(inp->inp_route.ro_rt);
1693 inp->inp_route.ro_rt = (struct rtentry *)NULL;
1694 }
1695 } else if (tp->t_state < TCPS_ESTABLISHED && tp->t_rxtshift > 3 &&
1696 tp->t_softerror)
1697 tcp_drop(tp, error);
1698 else
1699 tp->t_softerror = error;
1700#if 0
1701 wakeup((caddr_t) &so->so_timeo);
1702 sorwakeup(so);
1703 sowwakeup(so);
1704#endif
1705}
1706
1707struct bwmeas *
1708tcp_bwmeas_alloc(struct tcpcb *tp)
1709{
1710 struct bwmeas *elm;
1711 elm = zalloc(tcp_bwmeas_zone);
1712 if (elm == NULL)
1713 return (elm);
1714
1715 bzero(elm, bwmeas_elm_size);
1716 elm->bw_minsizepkts = TCP_BWMEAS_BURST_MINSIZE;
1717 elm->bw_minsize = elm->bw_minsizepkts * tp->t_maxseg;
1718 return (elm);
1719}
1720
1721void
1722tcp_bwmeas_free(struct tcpcb *tp)
1723{
1724 zfree(tcp_bwmeas_zone, tp->t_bwmeas);
1725 tp->t_bwmeas = NULL;
1726 tp->t_flagsext &= ~(TF_MEASURESNDBW);
1727}
1728
1729int
1730get_tcp_inp_list(struct inpcb **inp_list, int n, inp_gen_t gencnt)
1731{
1732 struct tcpcb *tp;
1733 struct inpcb *inp;
1734 int i = 0;
1735
1736 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
1737 if (inp->inp_gencnt <= gencnt &&
1738 inp->inp_state != INPCB_STATE_DEAD)
1739 inp_list[i++] = inp;
1740 if (i >= n)
1741 break;
1742 }
1743
1744 TAILQ_FOREACH(tp, &tcp_tw_tailq, t_twentry) {
1745 inp = tp->t_inpcb;
1746 if (inp->inp_gencnt <= gencnt &&
1747 inp->inp_state != INPCB_STATE_DEAD)
1748 inp_list[i++] = inp;
1749 if (i >= n)
1750 break;
1751 }
1752 return (i);
1753}
1754
1755/*
1756 * tcpcb_to_otcpcb copies specific bits of a tcpcb to an otcpcb format.
1757 * The otcpcb data structure is passed to user space and must not change.
1758 */
1759static void
1760tcpcb_to_otcpcb(struct tcpcb *tp, struct otcpcb *otp)
1761{
1762 otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first);
1763 otp->t_dupacks = tp->t_dupacks;
1764 otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
1765 otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
1766 otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
1767 otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
1768 otp->t_inpcb =
1769 (_TCPCB_PTR(struct inpcb *))VM_KERNEL_ADDRPERM(tp->t_inpcb);
1770 otp->t_state = tp->t_state;
1771 otp->t_flags = tp->t_flags;
1772 otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
1773 otp->snd_una = tp->snd_una;
1774 otp->snd_max = tp->snd_max;
1775 otp->snd_nxt = tp->snd_nxt;
1776 otp->snd_up = tp->snd_up;
1777 otp->snd_wl1 = tp->snd_wl1;
1778 otp->snd_wl2 = tp->snd_wl2;
1779 otp->iss = tp->iss;
1780 otp->irs = tp->irs;
1781 otp->rcv_nxt = tp->rcv_nxt;
1782 otp->rcv_adv = tp->rcv_adv;
1783 otp->rcv_wnd = tp->rcv_wnd;
1784 otp->rcv_up = tp->rcv_up;
1785 otp->snd_wnd = tp->snd_wnd;
1786 otp->snd_cwnd = tp->snd_cwnd;
1787 otp->snd_ssthresh = tp->snd_ssthresh;
1788 otp->t_maxopd = tp->t_maxopd;
1789 otp->t_rcvtime = tp->t_rcvtime;
1790 otp->t_starttime = tp->t_starttime;
1791 otp->t_rtttime = tp->t_rtttime;
1792 otp->t_rtseq = tp->t_rtseq;
1793 otp->t_rxtcur = tp->t_rxtcur;
1794 otp->t_maxseg = tp->t_maxseg;
1795 otp->t_srtt = tp->t_srtt;
1796 otp->t_rttvar = tp->t_rttvar;
1797 otp->t_rxtshift = tp->t_rxtshift;
1798 otp->t_rttmin = tp->t_rttmin;
1799 otp->t_rttupdated = tp->t_rttupdated;
1800 otp->max_sndwnd = tp->max_sndwnd;
1801 otp->t_softerror = tp->t_softerror;
1802 otp->t_oobflags = tp->t_oobflags;
1803 otp->t_iobc = tp->t_iobc;
1804 otp->snd_scale = tp->snd_scale;
1805 otp->rcv_scale = tp->rcv_scale;
1806 otp->request_r_scale = tp->request_r_scale;
1807 otp->requested_s_scale = tp->requested_s_scale;
1808 otp->ts_recent = tp->ts_recent;
1809 otp->ts_recent_age = tp->ts_recent_age;
1810 otp->last_ack_sent = tp->last_ack_sent;
1811 otp->cc_send = 0;
1812 otp->cc_recv = 0;
1813 otp->snd_recover = tp->snd_recover;
1814 otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1815 otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1816 otp->t_badrxtwin = 0;
1817}
1818
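/*
 * sysctl handler that exports a snapshot of all TCP PCBs to user space
 * as xtcpcb records, bracketed by xinpgen generation markers so the
 * caller can detect changes made while the list was being copied out.
 */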
1819static int
1820tcp_pcblist SYSCTL_HANDLER_ARGS
1821{
1822#pragma unused(oidp, arg1, arg2)
1823 int error, i = 0, n;
1824 struct inpcb **inp_list;
1825 inp_gen_t gencnt;
1826 struct xinpgen xig;
1827
1828 /*
1829 * The process of preparing the TCB list is too time-consuming and
1830 * resource-intensive to repeat twice on every request.
1831 */
1832 lck_rw_lock_shared(tcbinfo.ipi_lock);
1833 if (req->oldptr == USER_ADDR_NULL) {
1834 n = tcbinfo.ipi_count;
1835 req->oldidx = 2 * (sizeof(xig))
1836 + (n + n/8) * sizeof(struct xtcpcb);
1837 lck_rw_done(tcbinfo.ipi_lock);
1838 return (0);
1839 }
1840
1841 if (req->newptr != USER_ADDR_NULL) {
1842 lck_rw_done(tcbinfo.ipi_lock);
1843 return (EPERM);
1844 }
1845
1846 /*
1847 * OK, now we're committed to doing something.
1848 */
1849 gencnt = tcbinfo.ipi_gencnt;
1850 n = tcbinfo.ipi_count;
1851
1852 bzero(&xig, sizeof(xig));
1853 xig.xig_len = sizeof(xig);
1854 xig.xig_count = n;
1855 xig.xig_gen = gencnt;
1856 xig.xig_sogen = so_gencnt;
1857 error = SYSCTL_OUT(req, &xig, sizeof(xig));
1858 if (error) {
1859 lck_rw_done(tcbinfo.ipi_lock);
1860 return (error);
1861 }
1862 /*
1863 * We are done if there is no pcb
1864 */
1865 if (n == 0) {
1866 lck_rw_done(tcbinfo.ipi_lock);
1867 return (0);
1868 }
1869
1870 inp_list = _MALLOC(n * sizeof (*inp_list), M_TEMP, M_WAITOK);
1871 if (inp_list == 0) {
1872 lck_rw_done(tcbinfo.ipi_lock);
1873 return (ENOMEM);
1874 }
1875
1876 n = get_tcp_inp_list(inp_list, n, gencnt);
1877
1878 error = 0;
1879 for (i = 0; i < n; i++) {
1880 struct xtcpcb xt;
1881 caddr_t inp_ppcb;
1882 struct inpcb *inp;
1883
1884 inp = inp_list[i];
1885
1886 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
1887 continue;
1888 socket_lock(inp->inp_socket, 1);
1889 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
1890 socket_unlock(inp->inp_socket, 1);
1891 continue;
1892 }
1893 if (inp->inp_gencnt > gencnt) {
1894 socket_unlock(inp->inp_socket, 1);
1895 continue;
1896 }
1897
1898 bzero(&xt, sizeof(xt));
1899 xt.xt_len = sizeof(xt);
1900 /* XXX should avoid extra copy */
1901 inpcb_to_compat(inp, &xt.xt_inp);
1902 inp_ppcb = inp->inp_ppcb;
1903 if (inp_ppcb != NULL) {
1904 tcpcb_to_otcpcb((struct tcpcb *)(void *)inp_ppcb,
1905 &xt.xt_tp);
1906 } else {
1907 bzero((char *) &xt.xt_tp, sizeof(xt.xt_tp));
1908 }
1909 if (inp->inp_socket)
1910 sotoxsocket(inp->inp_socket, &xt.xt_socket);
1911
1912 socket_unlock(inp->inp_socket, 1);
1913
1914 error = SYSCTL_OUT(req, &xt, sizeof(xt));
1915 }
1916 if (!error) {
1917 /*
1918 * Give the user an updated idea of our state.
1919 * If the generation differs from what we told
1920 * her before, she knows that something happened
1921 * while we were processing this request, and it
1922 * might be necessary to retry.
1923 */
1924 bzero(&xig, sizeof(xig));
1925 xig.xig_len = sizeof(xig);
1926 xig.xig_gen = tcbinfo.ipi_gencnt;
1927 xig.xig_sogen = so_gencnt;
1928 xig.xig_count = tcbinfo.ipi_count;
1929 error = SYSCTL_OUT(req, &xig, sizeof(xig));
1930 }
1931 FREE(inp_list, M_TEMP);
1932 lck_rw_done(tcbinfo.ipi_lock);
1933 return (error);
1934}
1935
1936SYSCTL_PROC(_net_inet_tcp, TCPCTL_PCBLIST, pcblist,
1937 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
1938 tcp_pcblist, "S,xtcpcb", "List of active TCP connections");
1939
1940#if !CONFIG_EMBEDDED
1941
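/*
 * tcpcb_to_xtcpcb64 copies specific fields of a tcpcb into the 64-bit
 * xtcpcb64 export format passed to user space.
 */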
1942static void
1943tcpcb_to_xtcpcb64(struct tcpcb *tp, struct xtcpcb64 *otp)
1944{
1945 otp->t_segq = (uint32_t)VM_KERNEL_ADDRPERM(tp->t_segq.lh_first);
1946 otp->t_dupacks = tp->t_dupacks;
1947 otp->t_timer[TCPT_REXMT_EXT] = tp->t_timer[TCPT_REXMT];
1948 otp->t_timer[TCPT_PERSIST_EXT] = tp->t_timer[TCPT_PERSIST];
1949 otp->t_timer[TCPT_KEEP_EXT] = tp->t_timer[TCPT_KEEP];
1950 otp->t_timer[TCPT_2MSL_EXT] = tp->t_timer[TCPT_2MSL];
1951 otp->t_state = tp->t_state;
1952 otp->t_flags = tp->t_flags;
1953 otp->t_force = (tp->t_flagsext & TF_FORCE) ? 1 : 0;
1954 otp->snd_una = tp->snd_una;
1955 otp->snd_max = tp->snd_max;
1956 otp->snd_nxt = tp->snd_nxt;
1957 otp->snd_up = tp->snd_up;
1958 otp->snd_wl1 = tp->snd_wl1;
1959 otp->snd_wl2 = tp->snd_wl2;
1960 otp->iss = tp->iss;
1961 otp->irs = tp->irs;
1962 otp->rcv_nxt = tp->rcv_nxt;
1963 otp->rcv_adv = tp->rcv_adv;
1964 otp->rcv_wnd = tp->rcv_wnd;
1965 otp->rcv_up = tp->rcv_up;
1966 otp->snd_wnd = tp->snd_wnd;
1967 otp->snd_cwnd = tp->snd_cwnd;
1968 otp->snd_ssthresh = tp->snd_ssthresh;
1969 otp->t_maxopd = tp->t_maxopd;
1970 otp->t_rcvtime = tp->t_rcvtime;
1971 otp->t_starttime = tp->t_starttime;
1972 otp->t_rtttime = tp->t_rtttime;
1973 otp->t_rtseq = tp->t_rtseq;
1974 otp->t_rxtcur = tp->t_rxtcur;
1975 otp->t_maxseg = tp->t_maxseg;
1976 otp->t_srtt = tp->t_srtt;
1977 otp->t_rttvar = tp->t_rttvar;
1978 otp->t_rxtshift = tp->t_rxtshift;
1979 otp->t_rttmin = tp->t_rttmin;
1980 otp->t_rttupdated = tp->t_rttupdated;
1981 otp->max_sndwnd = tp->max_sndwnd;
1982 otp->t_softerror = tp->t_softerror;
1983 otp->t_oobflags = tp->t_oobflags;
1984 otp->t_iobc = tp->t_iobc;
1985 otp->snd_scale = tp->snd_scale;
1986 otp->rcv_scale = tp->rcv_scale;
1987 otp->request_r_scale = tp->request_r_scale;
1988 otp->requested_s_scale = tp->requested_s_scale;
1989 otp->ts_recent = tp->ts_recent;
1990 otp->ts_recent_age = tp->ts_recent_age;
1991 otp->last_ack_sent = tp->last_ack_sent;
1992 otp->cc_send = 0;
1993 otp->cc_recv = 0;
1994 otp->snd_recover = tp->snd_recover;
1995 otp->snd_cwnd_prev = tp->snd_cwnd_prev;
1996 otp->snd_ssthresh_prev = tp->snd_ssthresh_prev;
1997 otp->t_badrxtwin = 0;
1998}
1999
2000
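/*
 * 64-bit variant of tcp_pcblist: exports the TCP PCB list to user space
 * as xtcpcb64 records.
 */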
2001static int
2002tcp_pcblist64 SYSCTL_HANDLER_ARGS
2003{
2004#pragma unused(oidp, arg1, arg2)
2005 int error, i = 0, n;
2006 struct inpcb **inp_list;
2007 inp_gen_t gencnt;
2008 struct xinpgen xig;
2009
2010 /*
2011 * The process of preparing the TCB list is too time-consuming and
2012 * resource-intensive to repeat twice on every request.
2013 */
2014 lck_rw_lock_shared(tcbinfo.ipi_lock);
2015 if (req->oldptr == USER_ADDR_NULL) {
2016 n = tcbinfo.ipi_count;
2017 req->oldidx = 2 * (sizeof(xig))
2018 + (n + n/8) * sizeof(struct xtcpcb64);
2019 lck_rw_done(tcbinfo.ipi_lock);
2020 return (0);
2021 }
2022
2023 if (req->newptr != USER_ADDR_NULL) {
2024 lck_rw_done(tcbinfo.ipi_lock);
2025 return (EPERM);
2026 }
2027
2028 /*
2029 * OK, now we're committed to doing something.
2030 */
2031 gencnt = tcbinfo.ipi_gencnt;
2032 n = tcbinfo.ipi_count;
2033
2034 bzero(&xig, sizeof(xig));
2035 xig.xig_len = sizeof(xig);
2036 xig.xig_count = n;
2037 xig.xig_gen = gencnt;
2038 xig.xig_sogen = so_gencnt;
2039 error = SYSCTL_OUT(req, &xig, sizeof(xig));
2040 if (error) {
2041 lck_rw_done(tcbinfo.ipi_lock);
2042 return (error);
2043 }
2044 /*
2045 * We are done if there is no pcb
2046 */
2047 if (n == 0) {
2048 lck_rw_done(tcbinfo.ipi_lock);
2049 return (0);
2050 }
2051
2052 inp_list = _MALLOC(n * sizeof (*inp_list), M_TEMP, M_WAITOK);
2053 if (inp_list == 0) {
2054 lck_rw_done(tcbinfo.ipi_lock);
2055 return (ENOMEM);
2056 }
2057
2058 n = get_tcp_inp_list(inp_list, n, gencnt);
2059
2060 error = 0;
2061 for (i = 0; i < n; i++) {
2062 struct xtcpcb64 xt;
2063 struct inpcb *inp;
2064
2065 inp = inp_list[i];
2066
2067 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
2068 continue;
2069 socket_lock(inp->inp_socket, 1);
2070 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2071 socket_unlock(inp->inp_socket, 1);
2072 continue;
2073 }
2074 if (inp->inp_gencnt > gencnt) {
2075 socket_unlock(inp->inp_socket, 1);
2076 continue;
2077 }
2078
2079 bzero(&xt, sizeof(xt));
2080 xt.xt_len = sizeof(xt);
2081 inpcb_to_xinpcb64(inp, &xt.xt_inpcb);
2082 xt.xt_inpcb.inp_ppcb =
2083 (uint64_t)VM_KERNEL_ADDRPERM(inp->inp_ppcb);
2084 if (inp->inp_ppcb != NULL)
2085 tcpcb_to_xtcpcb64((struct tcpcb *)inp->inp_ppcb,
2086 &xt);
2087 if (inp->inp_socket)
2088 sotoxsocket64(inp->inp_socket,
2089 &xt.xt_inpcb.xi_socket);
2090
2091 socket_unlock(inp->inp_socket, 1);
2092
2093 error = SYSCTL_OUT(req, &xt, sizeof(xt));
2094 }
2095 if (!error) {
2096 /*
2097 * Give the user an updated idea of our state.
2098 * If the generation differs from what we told
2099 * her before, she knows that something happened
2100 * while we were processing this request, and it
2101 * might be necessary to retry.
2102 */
2103 bzero(&xig, sizeof(xig));
2104 xig.xig_len = sizeof(xig);
2105 xig.xig_gen = tcbinfo.ipi_gencnt;
2106 xig.xig_sogen = so_gencnt;
2107 xig.xig_count = tcbinfo.ipi_count;
2108 error = SYSCTL_OUT(req, &xig, sizeof(xig));
2109 }
2110 FREE(inp_list, M_TEMP);
2111 lck_rw_done(tcbinfo.ipi_lock);
2112 return (error);
2113}
2114
2115SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist64,
2116 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2117 tcp_pcblist64, "S,xtcpcb64", "List of active TCP connections");
2118
2119#endif /* !CONFIG_EMBEDDED */
2120
2121static int
2122tcp_pcblist_n SYSCTL_HANDLER_ARGS
2123{
2124#pragma unused(oidp, arg1, arg2)
2125 int error = 0;
2126
2127 error = get_pcblist_n(IPPROTO_TCP, req, &tcbinfo);
2128
2129 return (error);
2130}
2131
2132
2133SYSCTL_PROC(_net_inet_tcp, OID_AUTO, pcblist_n,
2134 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0,
2135 tcp_pcblist_n, "S,xtcpcb_n", "List of active TCP connections");
2136
2137static int
2138tcp_progress_indicators SYSCTL_HANDLER_ARGS
2139{
2140#pragma unused(oidp, arg1, arg2)
2141
2142 return (ntstat_tcp_progress_indicators(req));
2143}
2144
2145SYSCTL_PROC(_net_inet_tcp, OID_AUTO, progress,
2146 CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY, 0, 0,
2147 tcp_progress_indicators, "S", "Various items that indicate the current state of progress on the link");
2148
2149
2150__private_extern__ void
2151tcp_get_ports_used(uint32_t ifindex, int protocol, uint32_t flags,
2152 bitstr_t *bitfield)
2153{
2154 inpcb_get_ports_used(ifindex, protocol, flags, bitfield,
2155 &tcbinfo);
2156}
2157
2158__private_extern__ uint32_t
2159tcp_count_opportunistic(unsigned int ifindex, u_int32_t flags)
2160{
2161 return (inpcb_count_opportunistic(ifindex, &tcbinfo, flags));
2162}
2163
2164__private_extern__ uint32_t
2165tcp_find_anypcb_byaddr(struct ifaddr *ifa)
2166{
2167 return (inpcb_find_anypcb_byaddr(ifa, &tcbinfo));
2168}
2169
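/*
 * Handle an ICMP "fragmentation needed" message: find (or allocate) a
 * host route to the original destination and, if the MTU is not locked,
 * lower the route MTU to the value suggested by the message. If that
 * value is unreasonably small, lock the MTU instead to stop further
 * MTU discovery on this route.
 */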
2170static void
2171tcp_handle_msgsize(struct ip *ip, struct inpcb *inp)
2172{
2173 struct rtentry *rt = NULL;
2174 u_short ifscope = IFSCOPE_NONE;
2175 int mtu;
2176 struct sockaddr_in icmpsrc = {
2177 sizeof (struct sockaddr_in),
2178 AF_INET, 0, { 0 },
2179 { 0, 0, 0, 0, 0, 0, 0, 0 } };
2180 struct icmp *icp = NULL;
2181
2182 icp = (struct icmp *)(void *)
2183 ((caddr_t)ip - offsetof(struct icmp, icmp_ip));
2184
2185 icmpsrc.sin_addr = icp->icmp_ip.ip_dst;
2186
2187 /*
2188 * MTU discovery:
2189 * If we got a needfrag and there is a host route to the
2190 * original destination, and the MTU is not locked, then
2191 * set the MTU in the route to the suggested new value
2192 * (if given) and then notify as usual. The ULPs will
2193 * notice that the MTU has changed and adapt accordingly.
2194 * If no new MTU was suggested, then we guess a new one
2195 * less than the current value. If the new MTU is
2196 * unreasonably small (defined by sysctl tcp_minmss), then
2197 * we reset the MTU to the interface value and enable the
2198 * lock bit, indicating that we are no longer doing MTU
2199 * discovery.
2200 */
2201 if (ROUTE_UNUSABLE(&(inp->inp_route)) == false)
2202 rt = inp->inp_route.ro_rt;
2203
2204 /*
2205	 * icmp6_mtudisc_update scopes the routing lookup
2206	 * to the incoming interface (delivered from the mbuf
2207	 * packet header).
2208	 * That is mostly OK, but for asymmetric networks
2209	 * it may be an issue.
2210	 * Frag needed OR Packet too big really communicates
2211	 * the MTU for the outbound data path.
2212	 * Take the interface scope from the cached route or
2213	 * the last outgoing interface from the inp.
2214 */
2215 if (rt != NULL)
2216 ifscope = (rt->rt_ifp != NULL) ?
2217 rt->rt_ifp->if_index : IFSCOPE_NONE;
2218 else
2219 ifscope = (inp->inp_last_outifp != NULL) ?
2220 inp->inp_last_outifp->if_index : IFSCOPE_NONE;
2221
2222 if ((rt == NULL) ||
2223 !(rt->rt_flags & RTF_HOST) ||
2224 (rt->rt_flags & (RTF_CLONING | RTF_PRCLONING))) {
2225 rt = rtalloc1_scoped((struct sockaddr *)&icmpsrc, 0,
2226 RTF_CLONING | RTF_PRCLONING, ifscope);
2227 } else if (rt) {
2228 RT_LOCK(rt);
2229 rtref(rt);
2230 RT_UNLOCK(rt);
2231 }
2232
2233 if (rt != NULL) {
2234 RT_LOCK(rt);
2235 if ((rt->rt_flags & RTF_HOST) &&
2236 !(rt->rt_rmx.rmx_locks & RTV_MTU)) {
2237 mtu = ntohs(icp->icmp_nextmtu);
2238 /*
2239 * XXX Stock BSD has changed the following
2240 * to compare with icp->icmp_ip.ip_len
2241 * to converge faster when sent packet
2242 * < route's MTU. We may want to adopt
2243 * that change.
2244 */
2245 if (mtu == 0)
2246 mtu = ip_next_mtu(rt->rt_rmx.
2247 rmx_mtu, 1);
2248#if DEBUG_MTUDISC
2249 printf("MTU for %s reduced to %d\n",
2250 inet_ntop(AF_INET,
2251 &icmpsrc.sin_addr, ipv4str,
2252 sizeof (ipv4str)), mtu);
2253#endif
2254 if (mtu < max(296, (tcp_minmss +
2255 sizeof (struct tcpiphdr)))) {
2256 rt->rt_rmx.rmx_locks |= RTV_MTU;
2257 } else if (rt->rt_rmx.rmx_mtu > mtu) {
2258 rt->rt_rmx.rmx_mtu = mtu;
2259 }
2260 }
2261 RT_UNLOCK(rt);
2262 rtfree(rt);
2263 }
2264}
2265
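/*
 * Protocol control-input handler: translate ICMP errors delivered by IP
 * into per-connection actions (MTU discovery for PRC_MSGSIZE, dropping
 * SYN-SENT connections for certain unreachables, soft errors otherwise),
 * after checking that the quoted TCP sequence number falls inside the
 * connection's send window.
 */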
2266void
2267tcp_ctlinput(int cmd, struct sockaddr *sa, void *vip, __unused struct ifnet *ifp)
2268{
2269 tcp_seq icmp_tcp_seq;
2270 struct ip *ip = vip;
2271 struct in_addr faddr;
2272 struct inpcb *inp;
2273 struct tcpcb *tp;
2274 struct tcphdr *th;
2275 struct icmp *icp;
2276 void (*notify)(struct inpcb *, int) = tcp_notify;
2277
2278 faddr = ((struct sockaddr_in *)(void *)sa)->sin_addr;
2279 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY)
2280 return;
2281
2282 if ((unsigned)cmd >= PRC_NCMDS)
2283 return;
2284
2285 /* Source quench is deprecated */
2286 if (cmd == PRC_QUENCH)
2287 return;
2288
2289 if (cmd == PRC_MSGSIZE)
2290 notify = tcp_mtudisc;
2291 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2292 cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL ||
2293 cmd == PRC_TIMXCEED_INTRANS) && ip)
2294 notify = tcp_drop_syn_sent;
2295 /*
2296 * Hostdead is ugly because it goes linearly through all PCBs.
2297 * XXX: We never get this from ICMP, otherwise it would make
2298 * an excellent DoS attack on machines with many connections.
2299 */
2300 else if (cmd == PRC_HOSTDEAD)
2301 ip = NULL;
2302 else if (inetctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd))
2303 return;
2304
2305
2306 if (ip == NULL) {
2307 in_pcbnotifyall(&tcbinfo, faddr, inetctlerrmap[cmd], notify);
2308 return;
2309 }
2310
2311 icp = (struct icmp *)(void *)
2312 ((caddr_t)ip - offsetof(struct icmp, icmp_ip));
2313 th = (struct tcphdr *)(void *)((caddr_t)ip + (IP_VHL_HL(ip->ip_vhl) << 2));
2314 icmp_tcp_seq = ntohl(th->th_seq);
2315
2316 inp = in_pcblookup_hash(&tcbinfo, faddr, th->th_dport,
2317 ip->ip_src, th->th_sport, 0, NULL);
2318
2319 if (inp == NULL ||
2320 inp->inp_socket == NULL) {
2321 return;
2322 }
2323
2324 socket_lock(inp->inp_socket, 1);
2325 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2326 WNT_STOPUSING) {
2327 socket_unlock(inp->inp_socket, 1);
2328 return;
2329 }
2330
2331 if (PRC_IS_REDIRECT(cmd)) {
2332 /* signal EHOSTDOWN, as it flushes the cached route */
2333 (*notify)(inp, EHOSTDOWN);
2334 } else {
2335 tp = intotcpcb(inp);
2336 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2337 SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2338 if (cmd == PRC_MSGSIZE)
2339 tcp_handle_msgsize(ip, inp);
2340
2341 (*notify)(inp, inetctlerrmap[cmd]);
2342 }
2343 }
2344 socket_unlock(inp->inp_socket, 1);
2345}
2346
2347#if INET6
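/*
 * IPv6 counterpart of tcp_ctlinput: handle ICMPv6 errors, including
 * Packet Too Big, for TCP connections.
 */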
2348void
2349tcp6_ctlinput(int cmd, struct sockaddr *sa, void *d, __unused struct ifnet *ifp)
2350{
2351 tcp_seq icmp_tcp_seq;
2352 struct in6_addr *dst;
2353 struct tcphdr *th;
2354 void (*notify)(struct inpcb *, int) = tcp_notify;
2355 struct ip6_hdr *ip6;
2356 struct mbuf *m;
2357 struct inpcb *inp;
2358 struct tcpcb *tp;
2359 struct icmp6_hdr *icmp6;
2360 struct ip6ctlparam *ip6cp = NULL;
2361 const struct sockaddr_in6 *sa6_src = NULL;
2362 unsigned int mtu;
2363 unsigned int off;
2364
2365 if (sa->sa_family != AF_INET6 ||
2366 sa->sa_len != sizeof(struct sockaddr_in6))
2367 return;
2368
2369 /* Source quench is deprecated */
2370 if (cmd == PRC_QUENCH)
2371 return;
2372
2373 if ((unsigned)cmd >= PRC_NCMDS)
2374 return;
2375
2376 /* if the parameter is from icmp6, decode it. */
2377 if (d != NULL) {
2378 ip6cp = (struct ip6ctlparam *)d;
2379 icmp6 = ip6cp->ip6c_icmp6;
2380 m = ip6cp->ip6c_m;
2381 ip6 = ip6cp->ip6c_ip6;
2382 off = ip6cp->ip6c_off;
2383 sa6_src = ip6cp->ip6c_src;
2384 dst = ip6cp->ip6c_finaldst;
2385 } else {
2386 m = NULL;
2387 ip6 = NULL;
2388 off = 0; /* fool gcc */
2389 sa6_src = &sa6_any;
2390 dst = NULL;
2391 }
2392
2393 if (cmd == PRC_MSGSIZE)
2394 notify = tcp_mtudisc;
2395 else if (icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB ||
2396 cmd == PRC_UNREACH_PORT || cmd == PRC_TIMXCEED_INTRANS) &&
2397 ip6 != NULL)
2398 notify = tcp_drop_syn_sent;
2399 /*
2400 * Hostdead is ugly because it goes linearly through all PCBs.
2401 * XXX: We never get this from ICMP, otherwise it would make
2402 * an excellent DoS attack on machines with many connections.
2403 */
2404 else if (cmd == PRC_HOSTDEAD)
2405 ip6 = NULL;
2406 else if (inet6ctlerrmap[cmd] == 0 && !PRC_IS_REDIRECT(cmd))
2407 return;
2408
2409
2410 if (ip6 == NULL) {
2411 in6_pcbnotify(&tcbinfo, sa, 0, (struct sockaddr *)(size_t)sa6_src,
2412 0, cmd, NULL, notify);
2413 return;
2414 }
2415
2416 if (m == NULL ||
2417 (m->m_pkthdr.len < (int32_t) (off + offsetof(struct tcphdr, th_ack))))
2418 return;
2419
2420 th = (struct tcphdr *)(void *)mtodo(m, off);
2421 icmp_tcp_seq = ntohl(th->th_seq);
2422
2423 if (cmd == PRC_MSGSIZE) {
2424 mtu = ntohl(icmp6->icmp6_mtu);
2425 /*
2426 * If no alternative MTU was proposed, or the proposed
2427		 * MTU was too small, set it to the minimum.
2428 */
2429 if (mtu < IPV6_MMTU)
2430 mtu = IPV6_MMTU - 8;
2431 }
2432
2433 inp = in6_pcblookup_hash(&tcbinfo, &ip6->ip6_dst, th->th_dport,
2434 &ip6->ip6_src, th->th_sport, 0, NULL);
2435
2436 if (inp == NULL ||
2437 inp->inp_socket == NULL) {
2438 return;
2439 }
2440
2441 socket_lock(inp->inp_socket, 1);
2442 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) ==
2443 WNT_STOPUSING) {
2444 socket_unlock(inp->inp_socket, 1);
2445 return;
2446 }
2447
2448 if (PRC_IS_REDIRECT(cmd)) {
2449 /* signal EHOSTDOWN, as it flushes the cached route */
2450 (*notify)(inp, EHOSTDOWN);
2451 } else {
2452 tp = intotcpcb(inp);
2453 if (SEQ_GEQ(icmp_tcp_seq, tp->snd_una) &&
2454 SEQ_LT(icmp_tcp_seq, tp->snd_max)) {
2455 if (cmd == PRC_MSGSIZE) {
2456 /*
2457 * Only process the offered MTU if it
2458 * is smaller than the current one.
2459 */
2460 if (mtu < tp->t_maxseg +
2461 (sizeof (*th) + sizeof (*ip6)))
2462 (*notify)(inp, inetctlerrmap[cmd]);
2463 } else
2464 (*notify)(inp, inetctlerrmap[cmd]);
2465 }
2466 }
2467 socket_unlock(inp->inp_socket, 1);
2468}
2469#endif /* INET6 */
2470
2471
2472/*
2473 * Following is where TCP initial sequence number generation occurs.
2474 *
2475 * There are two places where we must use initial sequence numbers:
2476 * 1. In SYN-ACK packets.
2477 * 2. In SYN packets.
2478 *
2479 * The ISNs in SYN-ACK packets have no monotonicity requirement,
2480 * and should be as unpredictable as possible to avoid the possibility
2481 * of spoofing and/or connection hijacking. To satisfy this
2482 * requirement, SYN-ACK ISNs are generated via the arc4random()
2483 * function. If exact RFC 1948 compliance is requested via sysctl,
2484 * these ISNs will be generated just like those in SYN packets.
2485 *
2486 * The ISNs in SYN packets must be monotonic; TIME_WAIT recycling
2487 * depends on this property. In addition, these ISNs should be
2488 * unguessable so as to prevent connection hijacking. To satisfy
2489 * the requirements of this situation, the algorithm outlined in
2490 * RFC 1948 is used to generate sequence numbers.
2491 *
2492 * For more information on the theory of operation, please see
2493 * RFC 1948.
2494 *
2495 * Implementation details:
2496 *
2497 * Time is based on the system timer, and is corrected so that it
2498 * increases by one megabyte per second. This allows for proper
2499 * recycling on high speed LANs while still leaving over an hour
2500 * before rollover.
2501 *
2502 * Two sysctls control the generation of ISNs:
2503 *
2504 * net.inet.tcp.isn_reseed_interval controls the number of seconds
2505 * between seeding of isn_secret. This is normally set to zero,
2506 * as reseeding should not be necessary.
2507 *
2508 * net.inet.tcp.strict_rfc1948 controls whether RFC 1948 is followed
2509 * strictly. When strict compliance is requested, reseeding is
2510 * disabled and SYN-ACKs will be generated in the same manner as
2511 * SYNs. Strict mode is disabled by default.
2512 *
2513 */
2514
2515#define ISN_BYTES_PER_SECOND 1048576
2516
2517tcp_seq
2518tcp_new_isn(struct tcpcb *tp)
2519{
2520 u_int32_t md5_buffer[4];
2521 tcp_seq new_isn;
2522 struct timeval timenow;
2523 u_char isn_secret[32];
2524 int isn_last_reseed = 0;
2525 MD5_CTX isn_ctx;
2526
2527 /* Use arc4random for SYN-ACKs when not in exact RFC1948 mode. */
2528 if (((tp->t_state == TCPS_LISTEN) || (tp->t_state == TCPS_TIME_WAIT)) &&
2529 tcp_strict_rfc1948 == 0)
2530#ifdef __APPLE__
2531 return (RandomULong());
2532#else
2533 return (arc4random());
2534#endif
2535 getmicrotime(&timenow);
2536
2537 /* Seed if this is the first use, reseed if requested. */
2538 if ((isn_last_reseed == 0) ||
2539 ((tcp_strict_rfc1948 == 0) && (tcp_isn_reseed_interval > 0) &&
2540 (((u_int)isn_last_reseed + (u_int)tcp_isn_reseed_interval*hz)
2541 < (u_int)timenow.tv_sec))) {
2542#ifdef __APPLE__
2543 read_frandom(&isn_secret, sizeof(isn_secret));
2544#else
2545 read_random_unlimited(&isn_secret, sizeof(isn_secret));
2546#endif
2547 isn_last_reseed = timenow.tv_sec;
2548 }
2549
2550 /* Compute the md5 hash and return the ISN. */
2551 MD5Init(&isn_ctx);
2552 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_fport,
2553 sizeof(u_short));
2554 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_lport,
2555 sizeof(u_short));
2556#if INET6
2557 if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) {
2558 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_faddr,
2559 sizeof(struct in6_addr));
2560 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->in6p_laddr,
2561 sizeof(struct in6_addr));
2562 } else
2563#endif
2564 {
2565 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_faddr,
2566 sizeof(struct in_addr));
2567 MD5Update(&isn_ctx, (u_char *) &tp->t_inpcb->inp_laddr,
2568 sizeof(struct in_addr));
2569 }
2570 MD5Update(&isn_ctx, (u_char *) &isn_secret, sizeof(isn_secret));
2571 MD5Final((u_char *) &md5_buffer, &isn_ctx);
2572 new_isn = (tcp_seq) md5_buffer[0];
2573 new_isn += timenow.tv_sec * (ISN_BYTES_PER_SECOND / hz);
2574 return (new_isn);
2575}
2576
2577
2578/*
2579 * When a specific ICMP unreachable message is received and the
2580 * connection state is SYN-SENT, drop the connection. This behavior
2581 * is controlled by the icmp_may_rst sysctl.
2582 */
2583void
2584tcp_drop_syn_sent(struct inpcb *inp, int errno)
2585{
2586 struct tcpcb *tp = intotcpcb(inp);
2587
2588 if (tp && tp->t_state == TCPS_SYN_SENT)
2589 tcp_drop(tp, errno);
2590}
2591
2592/*
2593 * When `need fragmentation' ICMP is received, update our idea of the MSS
2594 * based on the new value in the route. Also nudge TCP to send something,
2595 * since we know the packet we just sent was dropped.
2596 * This duplicates some code in the tcp_mss() function in tcp_input.c.
2597 */
2598void
2599tcp_mtudisc(
2600 struct inpcb *inp,
2601 __unused int errno
2602)
2603{
2604 struct tcpcb *tp = intotcpcb(inp);
2605 struct rtentry *rt;
2606 struct rmxp_tao *taop;
2607 struct socket *so = inp->inp_socket;
2608 int offered;
2609 int mss;
2610 u_int32_t mtu;
2611 u_int32_t protoHdrOverhead = sizeof (struct tcpiphdr);
2612#if INET6
2613 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
2614
2615 if (isipv6)
2616 protoHdrOverhead = sizeof(struct ip6_hdr) +
2617 sizeof(struct tcphdr);
2618#endif /* INET6 */
2619
2620 if (tp) {
2621#if INET6
2622 if (isipv6)
2623 rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2624 else
2625#endif /* INET6 */
2626 rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2627 if (!rt || !rt->rt_rmx.rmx_mtu) {
2628 tp->t_maxopd = tp->t_maxseg =
2629#if INET6
2630 isipv6 ? tcp_v6mssdflt :
2631#endif /* INET6 */
2632 tcp_mssdflt;
2633
2634 /* Route locked during lookup above */
2635 if (rt != NULL)
2636 RT_UNLOCK(rt);
2637 return;
2638 }
2639 taop = rmx_taop(rt->rt_rmx);
2640 offered = taop->tao_mssopt;
2641 mtu = rt->rt_rmx.rmx_mtu;
2642
2643 /* Route locked during lookup above */
2644 RT_UNLOCK(rt);
2645
2646#if NECP
2647 // Adjust MTU if necessary.
2648 mtu = necp_socket_get_effective_mtu(inp, mtu);
2649#endif /* NECP */
2650 mss = mtu - protoHdrOverhead;
2651
2652 if (offered)
2653 mss = min(mss, offered);
2654 /*
2655 * XXX - The above conditional probably violates the TCP
2656 * spec. The problem is that, since we don't know the
2657 * other end's MSS, we are supposed to use a conservative
2658 * default. But, if we do that, then MTU discovery will
2659 * never actually take place, because the conservative
2660 * default is much less than the MTUs typically seen
2661 * on the Internet today. For the moment, we'll sweep
2662 * this under the carpet.
2663 *
2664 * The conservative default might not actually be a problem
2665 * if the only case this occurs is when sending an initial
2666 * SYN with options and data to a host we've never talked
2667 * to before. Then, they will reply with an MSS value which
2668 * will get recorded and the new parameters should get
2669 * recomputed. For Further Study.
2670 */
2671 if (tp->t_maxopd <= mss)
2672 return;
2673 tp->t_maxopd = mss;
2674
2675 if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
2676 (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
2677 mss -= TCPOLEN_TSTAMP_APPA;
2678
2679#if MPTCP
2680 mss -= mptcp_adj_mss(tp, TRUE);
2681#endif
2682 if (so->so_snd.sb_hiwat < mss)
2683 mss = so->so_snd.sb_hiwat;
2684
2685 tp->t_maxseg = mss;
2686
2687 ASSERT(tp->t_maxseg);
2688
2689 /*
2690	 * Reset the slow-start flight size as it may depend on the
2691	 * new MSS.
2692 */
2693 if (CC_ALGO(tp)->cwnd_init != NULL)
2694 CC_ALGO(tp)->cwnd_init(tp);
2695 tcpstat.tcps_mturesent++;
2696 tp->t_rtttime = 0;
2697 tp->snd_nxt = tp->snd_una;
2698 tcp_output(tp);
2699 }
2700}
2701
2702/*
2703 * Look up the routing entry to the peer of this inpcb. If no route
2704 * is found and one cannot be allocated, return NULL. This routine
2705 * is called by TCP routines that access the rmx structure and by tcp_mss
2706 * to get the interface MTU. If a route is found, this routine will
2707 * hold the rtentry lock; the caller is responsible for unlocking.
2708 */
2709struct rtentry *
2710tcp_rtlookup(struct inpcb *inp, unsigned int input_ifscope)
2711{
2712 struct route *ro;
2713 struct rtentry *rt;
2714 struct tcpcb *tp;
2715
2716 LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2717
2718 ro = &inp->inp_route;
2719 if ((rt = ro->ro_rt) != NULL)
2720 RT_LOCK(rt);
2721
2722 if (ROUTE_UNUSABLE(ro)) {
2723 if (rt != NULL) {
2724 RT_UNLOCK(rt);
2725 rt = NULL;
2726 }
2727 ROUTE_RELEASE(ro);
2728 /* No route yet, so try to acquire one */
2729 if (inp->inp_faddr.s_addr != INADDR_ANY) {
2730 unsigned int ifscope;
2731
2732 ro->ro_dst.sa_family = AF_INET;
2733 ro->ro_dst.sa_len = sizeof(struct sockaddr_in);
2734 ((struct sockaddr_in *)(void *)&ro->ro_dst)->sin_addr =
2735 inp->inp_faddr;
2736
2737 /*
2738 * If the socket was bound to an interface, then
2739 * the bound-to-interface takes precedence over
2740 * the inbound interface passed in by the caller
2741 * (if we get here as part of the output path then
2742 * input_ifscope is IFSCOPE_NONE).
2743 */
2744 ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2745 inp->inp_boundifp->if_index : input_ifscope;
2746
2747 rtalloc_scoped(ro, ifscope);
2748 if ((rt = ro->ro_rt) != NULL)
2749 RT_LOCK(rt);
2750 }
2751 }
2752 if (rt != NULL)
2753 RT_LOCK_ASSERT_HELD(rt);
2754
2755 /*
2756 * Update MTU discovery determination. Don't do it if:
2757 * 1) it is disabled via the sysctl
2758 * 2) the route isn't up
2759 * 3) the MTU is locked (if it is, then discovery has been
2760 * disabled)
2761 */
2762
2763 tp = intotcpcb(inp);
2764
2765 if (!path_mtu_discovery || ((rt != NULL) &&
2766 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
2767 tp->t_flags &= ~TF_PMTUD;
2768 else
2769 tp->t_flags |= TF_PMTUD;
2770
2771 if (rt != NULL && rt->rt_ifp != NULL) {
2772 somultipages(inp->inp_socket,
2773 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
2774 tcp_set_tso(tp, rt->rt_ifp);
2775 soif2kcl(inp->inp_socket,
2776 (rt->rt_ifp->if_eflags & IFEF_2KCL));
2777 tcp_set_ecn(tp, rt->rt_ifp);
2778 if (inp->inp_last_outifp == NULL) {
2779 inp->inp_last_outifp = rt->rt_ifp;
2780
2781 }
2782 }
2783
2784 /* Note if the peer is local */
2785 if (rt != NULL && !(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
2786 (rt->rt_gateway->sa_family == AF_LINK ||
2787 rt->rt_ifp->if_flags & IFF_LOOPBACK ||
2788 in_localaddr(inp->inp_faddr))) {
2789 tp->t_flags |= TF_LOCAL;
2790 }
2791
2792 /*
2793 * Caller needs to call RT_UNLOCK(rt).
2794 */
2795 return (rt);
2796}
2797
2798#if INET6
2799struct rtentry *
2800tcp_rtlookup6(struct inpcb *inp, unsigned int input_ifscope)
2801{
2802 struct route_in6 *ro6;
2803 struct rtentry *rt;
2804 struct tcpcb *tp;
2805
2806 LCK_MTX_ASSERT(rnh_lock, LCK_MTX_ASSERT_NOTOWNED);
2807
2808 ro6 = &inp->in6p_route;
2809 if ((rt = ro6->ro_rt) != NULL)
2810 RT_LOCK(rt);
2811
2812 if (ROUTE_UNUSABLE(ro6)) {
2813 if (rt != NULL) {
2814 RT_UNLOCK(rt);
2815 rt = NULL;
2816 }
2817 ROUTE_RELEASE(ro6);
2818 /* No route yet, so try to acquire one */
2819 if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) {
2820 struct sockaddr_in6 *dst6;
2821 unsigned int ifscope;
2822
2823 dst6 = (struct sockaddr_in6 *)&ro6->ro_dst;
2824 dst6->sin6_family = AF_INET6;
2825 dst6->sin6_len = sizeof(*dst6);
2826 dst6->sin6_addr = inp->in6p_faddr;
2827
2828 /*
2829 * If the socket was bound to an interface, then
2830 * the bound-to-interface takes precedence over
2831 * the inbound interface passed in by the caller
2832 * (if we get here as part of the output path then
2833 * input_ifscope is IFSCOPE_NONE).
2834 */
2835 ifscope = (inp->inp_flags & INP_BOUND_IF) ?
2836 inp->inp_boundifp->if_index : input_ifscope;
2837
2838 rtalloc_scoped((struct route *)ro6, ifscope);
2839 if ((rt = ro6->ro_rt) != NULL)
2840 RT_LOCK(rt);
2841 }
2842 }
2843 if (rt != NULL)
2844 RT_LOCK_ASSERT_HELD(rt);
2845
2854
2855 tp = intotcpcb(inp);
2856
2857 /*
2858 * Update MTU discovery determination. Don't do it if:
2859 * 1) it is disabled via the sysctl
2860 * 2) the route isn't up
2861 * 3) the MTU is locked (if it is, then discovery has been
2862 * disabled)
2863 */
2864
2865 if (!path_mtu_discovery || ((rt != NULL) &&
2866 (!(rt->rt_flags & RTF_UP) || (rt->rt_rmx.rmx_locks & RTV_MTU))))
2867 tp->t_flags &= ~TF_PMTUD;
2868 else
2869 tp->t_flags |= TF_PMTUD;
2870
2871 if (rt != NULL && rt->rt_ifp != NULL) {
2872 somultipages(inp->inp_socket,
2873 (rt->rt_ifp->if_hwassist & IFNET_MULTIPAGES));
2874 tcp_set_tso(tp, rt->rt_ifp);
2875 soif2kcl(inp->inp_socket,
2876 (rt->rt_ifp->if_eflags & IFEF_2KCL));
2877 tcp_set_ecn(tp, rt->rt_ifp);
2878 if (inp->inp_last_outifp == NULL) {
2879 inp->inp_last_outifp = rt->rt_ifp;
2880 }
2881
2882 /* Note if the peer is local */
2883 if (!(rt->rt_ifp->if_flags & IFF_POINTOPOINT) &&
2884 (IN6_IS_ADDR_LOOPBACK(&inp->in6p_faddr) ||
2885 IN6_IS_ADDR_LINKLOCAL(&inp->in6p_faddr) ||
2886 rt->rt_gateway->sa_family == AF_LINK ||
2887 in6_localaddr(&inp->in6p_faddr))) {
2888 tp->t_flags |= TF_LOCAL;
2889 }
2890 }
2891
2892 /*
2893 * Caller needs to call RT_UNLOCK(rt).
2894 */
2895 return (rt);
2896}
2897#endif /* INET6 */
2898
2899#if IPSEC
2900/* compute ESP/AH header size for TCP, including outer IP header. */
2901size_t
2902ipsec_hdrsiz_tcp(struct tcpcb *tp)
2903{
2904 struct inpcb *inp;
2905 struct mbuf *m;
2906 size_t hdrsiz;
2907 struct ip *ip;
2908#if INET6
2909 struct ip6_hdr *ip6 = NULL;
2910#endif /* INET6 */
2911 struct tcphdr *th;
2912
2913 if ((tp == NULL) || ((inp = tp->t_inpcb) == NULL))
2914 return (0);
2915 MGETHDR(m, M_DONTWAIT, MT_DATA); /* MAC-OK */
2916 if (!m)
2917 return (0);
2918
2919#if INET6
2920 if ((inp->inp_vflag & INP_IPV6) != 0) {
2921 ip6 = mtod(m, struct ip6_hdr *);
2922 th = (struct tcphdr *)(void *)(ip6 + 1);
2923 m->m_pkthdr.len = m->m_len =
2924 sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
2925 tcp_fillheaders(tp, ip6, th);
2926 hdrsiz = ipsec6_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
2927 } else
2928#endif /* INET6 */
2929 {
2930 ip = mtod(m, struct ip *);
2931 th = (struct tcphdr *)(ip + 1);
2932 m->m_pkthdr.len = m->m_len = sizeof(struct tcpiphdr);
2933 tcp_fillheaders(tp, ip, th);
2934 hdrsiz = ipsec4_hdrsiz(m, IPSEC_DIR_OUTBOUND, inp);
2935 }
2936 m_free(m);
2937 return (hdrsiz);
2938}
2939#endif /* IPSEC */
2940
2941/*
2942 * Return a pointer to the cached information about the remote host.
2943 * The cached information is stored in the protocol-specific part of
2944 * the route metrics.
2945 */
2946struct rmxp_tao *
2947tcp_gettaocache(struct inpcb *inp)
2948{
2949 struct rtentry *rt;
2950 struct rmxp_tao *taop;
2951
2952#if INET6
2953 if ((inp->inp_vflag & INP_IPV6) != 0)
2954 rt = tcp_rtlookup6(inp, IFSCOPE_NONE);
2955 else
2956#endif /* INET6 */
2957 rt = tcp_rtlookup(inp, IFSCOPE_NONE);
2958
2959 /* Make sure this is a host route and is up. */
2960 if (rt == NULL ||
2961 (rt->rt_flags & (RTF_UP|RTF_HOST)) != (RTF_UP|RTF_HOST)) {
2962 /* Route locked during lookup above */
2963 if (rt != NULL)
2964 RT_UNLOCK(rt);
2965 return (NULL);
2966 }
2967
2968 taop = rmx_taop(rt->rt_rmx);
2969 /* Route locked during lookup above */
2970 RT_UNLOCK(rt);
2971 return (taop);
2972}
2973
2974/*
2975 * Clear all the TAO cache entries, called from tcp_init.
2976 *
2977 * XXX
2978 * This routine is just an empty one, because we assume that the
2979 * routing tables are initialized at the same time as TCP, so there is
2980 * nothing in the cache left over.
2981 */
2982static void
2983tcp_cleartaocache(void)
2984{
2985}
2986
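/*
 * Per-socket lock function for TCP: take the MPTCP session lock for
 * MPTCP subflows or the inpcb mutex otherwise, retrying if the socket's
 * MPTCP status changes while waiting, and optionally take a use-count
 * reference.
 */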
2987int
2988tcp_lock(struct socket *so, int refcount, void *lr)
2989{
2990 void *lr_saved;
2991
2992 if (lr == NULL)
2993 lr_saved = __builtin_return_address(0);
2994 else
2995 lr_saved = lr;
2996
2997retry:
2998 if (so->so_pcb != NULL) {
2999 if (so->so_flags & SOF_MP_SUBFLOW) {
3000 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3001 VERIFY(mp_tp);
3002
3003 mpte_lock_assert_notheld(mp_tp->mpt_mpte);
3004
3005 mpte_lock(mp_tp->mpt_mpte);
3006
3007 /*
3008 * Check if we became non-MPTCP while waiting for the lock.
3009 * If yes, we have to retry to grab the right lock.
3010 */
3011 if (!(so->so_flags & SOF_MP_SUBFLOW)) {
3012 mpte_unlock(mp_tp->mpt_mpte);
3013 goto retry;
3014 }
3015 } else {
3016 lck_mtx_lock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3017
3018 if (so->so_flags & SOF_MP_SUBFLOW) {
3019 /*
3020 * While waiting for the lock, we might have
3021 * become MPTCP-enabled (see mptcp_subflow_socreate).
3022 */
3023 lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3024 goto retry;
3025 }
3026 }
3027 } else {
3028 panic("tcp_lock: so=%p NO PCB! lr=%p lrh= %s\n",
3029 so, lr_saved, solockhistory_nr(so));
3030 /* NOTREACHED */
3031 }
3032
3033 if (so->so_usecount < 0) {
3034 panic("tcp_lock: so=%p so_pcb=%p lr=%p ref=%x lrh= %s\n",
3035 so, so->so_pcb, lr_saved, so->so_usecount,
3036 solockhistory_nr(so));
3037 /* NOTREACHED */
3038 }
3039 if (refcount)
3040 so->so_usecount++;
3041 so->lock_lr[so->next_lock_lr] = lr_saved;
3042 so->next_lock_lr = (so->next_lock_lr+1) % SO_LCKDBG_MAX;
3043 return (0);
3044}
3045
3046int
3047tcp_unlock(struct socket *so, int refcount, void *lr)
3048{
3049 void *lr_saved;
3050
3051 if (lr == NULL)
3052 lr_saved = __builtin_return_address(0);
3053 else
3054 lr_saved = lr;
3055
3056#ifdef MORE_TCPLOCK_DEBUG
3057 printf("tcp_unlock: so=0x%llx sopcb=0x%llx lock=0x%llx ref=%x "
3058 "lr=0x%llx\n", (uint64_t)VM_KERNEL_ADDRPERM(so),
3059 (uint64_t)VM_KERNEL_ADDRPERM(so->so_pcb),
3060 (uint64_t)VM_KERNEL_ADDRPERM(&(sotoinpcb(so)->inpcb_mtx)),
3061 so->so_usecount, (uint64_t)VM_KERNEL_ADDRPERM(lr_saved));
3062#endif
3063 if (refcount)
3064 so->so_usecount--;
3065
3066 if (so->so_usecount < 0) {
3067 panic("tcp_unlock: so=%p usecount=%x lrh= %s\n",
3068 so, so->so_usecount, solockhistory_nr(so));
3069 /* NOTREACHED */
3070 }
3071 if (so->so_pcb == NULL) {
3072 panic("tcp_unlock: so=%p NO PCB usecount=%x lr=%p lrh= %s\n",
3073 so, so->so_usecount, lr_saved, solockhistory_nr(so));
3074 /* NOTREACHED */
3075 } else {
3076 so->unlock_lr[so->next_unlock_lr] = lr_saved;
3077 so->next_unlock_lr = (so->next_unlock_lr+1) % SO_LCKDBG_MAX;
3078
3079 if (so->so_flags & SOF_MP_SUBFLOW) {
3080 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3081
3082 VERIFY(mp_tp);
3083 mpte_lock_assert_held(mp_tp->mpt_mpte);
3084
3085 mpte_unlock(mp_tp->mpt_mpte);
3086 } else {
3087 LCK_MTX_ASSERT(&((struct inpcb *)so->so_pcb)->inpcb_mtx,
3088 LCK_MTX_ASSERT_OWNED);
3089 lck_mtx_unlock(&((struct inpcb *)so->so_pcb)->inpcb_mtx);
3090 }
3091 }
3092 return (0);
3093}
3094
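/*
 * Return the mutex protecting this TCP socket: the MPTCP session lock
 * for subflows, the inpcb mutex otherwise.
 */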
3095lck_mtx_t *
3096tcp_getlock(struct socket *so, int flags)
3097{
3098 struct inpcb *inp = sotoinpcb(so);
3099
3100 if (so->so_pcb) {
3101 if (so->so_usecount < 0)
3102 panic("tcp_getlock: so=%p usecount=%x lrh= %s\n",
3103 so, so->so_usecount, solockhistory_nr(so));
3104
3105 if (so->so_flags & SOF_MP_SUBFLOW) {
3106 struct mptcb *mp_tp = tptomptp(sototcpcb(so));
3107
3108 return (mpte_getlock(mp_tp->mpt_mpte, flags));
3109 } else {
3110 return (&inp->inpcb_mtx);
3111 }
3112 } else {
3113 panic("tcp_getlock: so=%p NULL so_pcb %s\n",
3114 so, solockhistory_nr(so));
3115 return (so->so_proto->pr_domain->dom_mtx);
3116 }
3117}
3118
3119/*
3120 * Determine if we can grow the receive socket buffer to avoid sending
3121 * a zero window update to the peer. We allow even socket buffers that
3122 * have a fixed size (set by the application) to grow if the resource
3123 * constraints are met. They will also be trimmed after the application
3124 * reads data.
3125 */
3126static void
3127tcp_sbrcv_grow_rwin(struct tcpcb *tp, struct sockbuf *sb)
3128{
3129 u_int32_t rcvbufinc = tp->t_maxseg << 4;
3130 u_int32_t rcvbuf = sb->sb_hiwat;
3131 struct socket *so = tp->t_inpcb->inp_socket;
3132
3133 if (tcp_recv_bg == 1 || IS_TCP_RECV_BG(so))
3134 return;
3135 /*
3136 * If message delivery is enabled, do not count
3137 * unordered bytes in receive buffer towards hiwat
3138 */
3139 if (so->so_flags & SOF_ENABLE_MSGS)
3140 rcvbuf = rcvbuf - so->so_msg_state->msg_uno_bytes;
3141
3142 if (tcp_do_autorcvbuf == 1 &&
3143 tcp_cansbgrow(sb) &&
3144 (tp->t_flags & TF_SLOWLINK) == 0 &&
3145 (so->so_flags1 & SOF1_EXTEND_BK_IDLE_WANTED) == 0 &&
3146 (rcvbuf - sb->sb_cc) < rcvbufinc &&
3147 rcvbuf < tcp_autorcvbuf_max &&
3148 (sb->sb_idealsize > 0 &&
3149 sb->sb_hiwat <= (sb->sb_idealsize + rcvbufinc))) {
3150 sbreserve(sb,
3151 min((sb->sb_hiwat + rcvbufinc), tcp_autorcvbuf_max));
3152 }
3153}
3154
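/*
 * Compute the receive window space to advertise: the free space in the
 * receive socket buffer (after a possible buffer grow), reduced by any
 * content-filter backlog and clipped for slow links.
 */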
3155int32_t
3156tcp_sbspace(struct tcpcb *tp)
3157{
3158 struct socket *so = tp->t_inpcb->inp_socket;
3159 struct sockbuf *sb = &so->so_rcv;
3160 u_int32_t rcvbuf;
3161 int32_t space;
3162 int32_t pending = 0;
3163
3164 tcp_sbrcv_grow_rwin(tp, sb);
3165
3166 /* hiwat might have changed */
3167 rcvbuf = sb->sb_hiwat;
3168
3169 /*
3170 * If message delivery is enabled, do not count
3171 * unordered bytes in receive buffer towards hiwat mark.
3172 * This value is used to return correct rwnd that does
3173 * not reflect the extra unordered bytes added to the
3174 * receive socket buffer.
3175 */
3176 if (so->so_flags & SOF_ENABLE_MSGS)
3177 rcvbuf = rcvbuf - so->so_msg_state->msg_uno_bytes;
3178
3179 space = ((int32_t) imin((rcvbuf - sb->sb_cc),
3180 (sb->sb_mbmax - sb->sb_mbcnt)));
3181 if (space < 0)
3182 space = 0;
3183
3184#if CONTENT_FILTER
3185 /* Compensate for data being processed by content filters */
3186 pending = cfil_sock_data_space(sb);
3187#endif /* CONTENT_FILTER */
3188 if (pending > space)
3189 space = 0;
3190 else
3191 space -= pending;
3192
3193 /*
3194	 * Avoid increasing the window size if the current window
3195	 * is already very low; we could be in "persist" mode and
3196	 * could break some apps (see rdar://5409343).
3197 */
3198
3199 if (space < tp->t_maxseg)
3200 return (space);
3201
3202 /* Clip window size for slower link */
3203
3204 if (((tp->t_flags & TF_SLOWLINK) != 0) && slowlink_wsize > 0)
3205 return (imin(space, slowlink_wsize));
3206
3207 return (space);
3208}
3209/*
3210 * Checks TCP Segment Offloading capability for a given connection
3211 * and interface pair.
3212 */
3213void
3214tcp_set_tso(struct tcpcb *tp, struct ifnet *ifp)
3215{
3216#if INET6
3217 struct inpcb *inp;
3218 int isipv6;
3219#endif /* INET6 */
3220#if MPTCP
3221 /*
3222 * We can't use TSO if this tcpcb belongs to an MPTCP session.
3223 */
3224 if (tp->t_mpflags & TMPF_MPTCP_TRUE) {
3225 tp->t_flags &= ~TF_TSO;
3226 return;
3227 }
3228#endif
3229#if INET6
3230 inp = tp->t_inpcb;
3231 isipv6 = (inp->inp_vflag & INP_IPV6) != 0;
3232
3233 if (isipv6) {
3234 if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV6)) {
3235 tp->t_flags |= TF_TSO;
3236 if (ifp->if_tso_v6_mtu != 0)
3237 tp->tso_max_segment_size = ifp->if_tso_v6_mtu;
3238 else
3239 tp->tso_max_segment_size = TCP_MAXWIN;
3240 } else
3241 tp->t_flags &= ~TF_TSO;
3242
3243 } else
3244#endif /* INET6 */
3245
3246 {
3247 if (ifp && (ifp->if_hwassist & IFNET_TSO_IPV4)) {
3248 tp->t_flags |= TF_TSO;
3249 if (ifp->if_tso_v4_mtu != 0)
3250 tp->tso_max_segment_size = ifp->if_tso_v4_mtu;
3251 else
3252 tp->tso_max_segment_size = TCP_MAXWIN;
3253 } else
3254 tp->t_flags &= ~TF_TSO;
3255 }
3256}
3257
3258#define TIMEVAL_TO_TCPHZ(_tv_) ((_tv_).tv_sec * TCP_RETRANSHZ + \
3259 (_tv_).tv_usec / TCP_RETRANSHZ_TO_USEC)
3260
3261/*
3262 * Function to calculate the tcp clock. The tcp clock will get updated
3263 * at the boundaries of the tcp layer. This is done at 3 places:
3264 * 1. Right before processing an input tcp packet
3265 * 2. Whenever a connection wants to access the network using tcp_usrreqs
3266 * 3. When a tcp timer fires or before tcp slow timeout
3267 *
3268 */
3269
3270void
3271calculate_tcp_clock(void)
3272{
3273 struct timeval tv = tcp_uptime;
3274 struct timeval interval = {0, TCP_RETRANSHZ_TO_USEC};
3275 struct timeval now, hold_now;
3276 uint32_t incr = 0;
3277
3278 microuptime(&now);
3279
3280 /*
3281 * Update coarse-grained networking timestamp (in sec.); the idea
3282 * is to update the counter returnable via net_uptime() when
3283 * we read time.
3284 */
3285 net_update_uptime_with_time(&now);
3286
3287 timevaladd(&tv, &interval);
3288 if (timevalcmp(&now, &tv, >)) {
3289 /* time to update the clock */
3290 lck_spin_lock(tcp_uptime_lock);
3291 if (timevalcmp(&tcp_uptime, &now, >=)) {
3292 /* clock got updated while waiting for the lock */
3293 lck_spin_unlock(tcp_uptime_lock);
3294 return;
3295 }
3296
3297 microuptime(&now);
3298 hold_now = now;
3299 tv = tcp_uptime;
3300 timevalsub(&now, &tv);
3301
3302 incr = TIMEVAL_TO_TCPHZ(now);
3303 if (incr > 0) {
3304 tcp_uptime = hold_now;
3305 tcp_now += incr;
3306 }
3307
3308 lck_spin_unlock(tcp_uptime_lock);
3309 }
3310}
3311
3312/*
3313 * Compute receive window scaling that we are going to request
3314 * for this connection based on sb_hiwat. Try to leave some
3315 * room to potentially increase the window size up to a maximum
3316 * defined by the constant tcp_autorcvbuf_max.
3317 */
3318void
3319tcp_set_max_rwinscale(struct tcpcb *tp, struct socket *so, struct ifnet *ifp)
3320{
3321 uint32_t maxsockbufsize;
3322 uint32_t rcvbuf_max;
3323
3324 if (!tcp_do_rfc1323) {
3325 tp->request_r_scale = 0;
3326 return;
3327 }
3328
3329 /*
3330 * When we start a connection and don't know about the interface, set
3331 * the scaling factor simply to the max - we can always announce less.
3332 */
3333 if (!ifp || (IFNET_IS_CELLULAR(ifp) && (ifp->if_eflags & IFEF_3CA)))
3334 rcvbuf_max = (tcp_autorcvbuf_max << 1);
3335 else
3336 rcvbuf_max = tcp_autorcvbuf_max;
3337
3338 tp->request_r_scale = max(tcp_win_scale, tp->request_r_scale);
3339 maxsockbufsize = ((so->so_rcv.sb_flags & SB_USRSIZE) != 0) ?
3340 so->so_rcv.sb_hiwat : rcvbuf_max;
3341
3342 while (tp->request_r_scale < TCP_MAX_WINSHIFT &&
3343 (TCP_MAXWIN << tp->request_r_scale) < maxsockbufsize)
3344 tp->request_r_scale++;
3345 tp->request_r_scale = min(tp->request_r_scale, TCP_MAX_WINSHIFT);
3346
3347}
3348
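/*
 * Return 1 when the amount of unsent data is at or below the
 * connection's not-sent low-water mark (or, with Nagle enabled, below
 * one maxseg), so the socket should be reported writable; 0 otherwise.
 */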
3349int
3350tcp_notsent_lowat_check(struct socket *so)
3351{
3352 struct inpcb *inp = sotoinpcb(so);
3353 struct tcpcb *tp = NULL;
3354 int notsent = 0;
3355
3356 if (inp != NULL) {
3357 tp = intotcpcb(inp);
3358 }
3359
3360 if (tp == NULL) {
3361 return (0);
3362 }
3363
3364 notsent = so->so_snd.sb_cc -
3365 (tp->snd_nxt - tp->snd_una);
3366
3367 /*
3368 * When we send a FIN or SYN, not_sent can be negative.
3369 * In that case we also need to send a write event to the
3370 * process if it is waiting. In the FIN case, it will
3371 * get an error from send because cantsendmore will be set.
3372 */
3373 if (notsent <= tp->t_notsent_lowat) {
3374 return (1);
3375 }
3376
3377 /*
3378 * When Nagle's algorithm is not disabled, it is better
3379 * to keep waking up the client until there is at least one
3380 * maxseg of data to write.
3381 */
3382 if ((tp->t_flags & TF_NODELAY) == 0 &&
3383 notsent > 0 && notsent < tp->t_maxseg) {
3384 return (1);
3385 }
3386 return (0);
3387}
3388
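/*
 * Record the retransmitted sequence range [start, end], merging it with
 * any overlapping entries already on t_rxt_segments. The per-range
 * counts are later used for DSACK processing and spurious-retransmission
 * detection.
 */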
3389void
3390tcp_rxtseg_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3391{
3392 struct tcp_rxt_seg *rxseg = NULL, *prev = NULL, *next = NULL;
3393 u_int32_t rxcount = 0;
3394
3395 if (SLIST_EMPTY(&tp->t_rxt_segments))
3396 tp->t_dsack_lastuna = tp->snd_una;
3397 /*
3398 * First check if there is a segment already existing for this
3399 * sequence space.
3400 */
3401
3402 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3403 if (SEQ_GT(rxseg->rx_start, start))
3404 break;
3405 prev = rxseg;
3406 }
3407 next = rxseg;
3408
3409 /* check if prev seg is for this sequence */
3410 if (prev != NULL && SEQ_LEQ(prev->rx_start, start) &&
3411 SEQ_GEQ(prev->rx_end, end)) {
3412 prev->rx_count++;
3413 return;
3414 }
3415
3416 /*
3417 * There are a couple of possibilities at this point.
3418 * 1. prev overlaps with the beginning of this sequence
3419 * 2. next overlaps with the end of this sequence
3420 * 3. there is no overlap.
3421 */
3422
3423 if (prev != NULL && SEQ_GT(prev->rx_end, start)) {
3424 if (prev->rx_start == start && SEQ_GT(end, prev->rx_end)) {
3425 start = prev->rx_end + 1;
3426 prev->rx_count++;
3427 } else {
3428 prev->rx_end = (start - 1);
3429 rxcount = prev->rx_count;
3430 }
3431 }
3432
3433 if (next != NULL && SEQ_LT(next->rx_start, end)) {
3434 if (SEQ_LEQ(next->rx_end, end)) {
3435 end = next->rx_start - 1;
3436 next->rx_count++;
3437 } else {
3438 next->rx_start = end + 1;
3439 rxcount = next->rx_count;
3440 }
3441 }
3442 if (!SEQ_LT(start, end))
3443 return;
3444
3445 rxseg = (struct tcp_rxt_seg *) zalloc(tcp_rxt_seg_zone);
3446 if (rxseg == NULL) {
3447 return;
3448 }
3449 bzero(rxseg, sizeof(*rxseg));
3450 rxseg->rx_start = start;
3451 rxseg->rx_end = end;
3452 rxseg->rx_count = rxcount + 1;
3453
3454 if (prev != NULL) {
3455 SLIST_INSERT_AFTER(prev, rxseg, rx_link);
3456 } else {
3457 SLIST_INSERT_HEAD(&tp->t_rxt_segments, rxseg, rx_link);
3458 }
3459}
3460
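/*
 * Return the retransmit-segment record that fully covers [start, end],
 * or NULL if there is none.
 */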
3461struct tcp_rxt_seg *
3462tcp_rxtseg_find(struct tcpcb *tp, tcp_seq start, tcp_seq end)
3463{
3464 struct tcp_rxt_seg *rxseg;
3465 if (SLIST_EMPTY(&tp->t_rxt_segments))
3466 return (NULL);
3467
3468 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3469 if (SEQ_LEQ(rxseg->rx_start, start) &&
3470 SEQ_GEQ(rxseg->rx_end, end))
3471 return (rxseg);
3472 if (SEQ_GT(rxseg->rx_start, start))
3473 break;
3474 }
3475 return (NULL);
3476}
3477
3478void
3479tcp_rxtseg_clean(struct tcpcb *tp)
3480{
3481 struct tcp_rxt_seg *rxseg, *next;
3482
3483 SLIST_FOREACH_SAFE(rxseg, &tp->t_rxt_segments, rx_link, next) {
3484 SLIST_REMOVE(&tp->t_rxt_segments, rxseg,
3485 tcp_rxt_seg, rx_link);
3486 zfree(tcp_rxt_seg_zone, rxseg);
3487 }
3488 tp->t_dsack_lastuna = tp->snd_max;
3489}
3490
3491boolean_t
3492tcp_rxtseg_detect_bad_rexmt(struct tcpcb *tp, tcp_seq th_ack)
3493{
3494 boolean_t bad_rexmt;
3495 struct tcp_rxt_seg *rxseg;
3496
3497 if (SLIST_EMPTY(&tp->t_rxt_segments))
3498 return (FALSE);
3499
3500 /*
3501 * If all of the segments in this window are not cumulatively
3502 * acknowledged, then there can still be undetected packet loss.
3503 * Do not restore congestion window in that case.
3504 */
3505 if (SEQ_LT(th_ack, tp->snd_recover))
3506 return (FALSE);
3507
3508 bad_rexmt = TRUE;
3509 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3510 if (rxseg->rx_count > 1 ||
3511 !(rxseg->rx_flags & TCP_RXT_SPURIOUS)) {
3512 bad_rexmt = FALSE;
3513 break;
3514 }
3515 }
3516 return (bad_rexmt);
3517}
3518
3519boolean_t
3520tcp_rxtseg_dsack_for_tlp(struct tcpcb *tp)
3521{
3522 boolean_t dsack_for_tlp = FALSE;
3523 struct tcp_rxt_seg *rxseg;
3524 if (SLIST_EMPTY(&tp->t_rxt_segments))
3525 return (FALSE);
3526
3527 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3528 if (rxseg->rx_count == 1 &&
3529 SLIST_NEXT(rxseg, rx_link) == NULL &&
3530 (rxseg->rx_flags & TCP_RXT_DSACK_FOR_TLP)) {
3531 dsack_for_tlp = TRUE;
3532 break;
3533 }
3534 }
3535 return (dsack_for_tlp);
3536}
3537
3538u_int32_t
3539tcp_rxtseg_total_size(struct tcpcb *tp)
3540{
3541 struct tcp_rxt_seg *rxseg;
3542 u_int32_t total_size = 0;
3543
3544 SLIST_FOREACH(rxseg, &tp->t_rxt_segments, rx_link) {
3545 total_size += (rxseg->rx_end - rxseg->rx_start) + 1;
3546 }
3547 return (total_size);
3548}
3549
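/*
 * Report connectivity-probe status: whether write/connect or read probes
 * have exceeded TCP_CONNECTIVITY_PROBES_MAX, and whether probing is
 * active on the last outgoing interface.
 */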
3550void
3551tcp_get_connectivity_status(struct tcpcb *tp,
3552 struct tcp_conn_status *connstatus)
3553{
3554 if (tp == NULL || connstatus == NULL)
3555 return;
3556 bzero(connstatus, sizeof(*connstatus));
3557 if (tp->t_rxtshift >= TCP_CONNECTIVITY_PROBES_MAX) {
3558 if (TCPS_HAVEESTABLISHED(tp->t_state)) {
3559 connstatus->write_probe_failed = 1;
3560 } else {
3561 connstatus->conn_probe_failed = 1;
3562 }
3563 }
3564 if (tp->t_rtimo_probes >= TCP_CONNECTIVITY_PROBES_MAX)
3565 connstatus->read_probe_failed = 1;
3566 if (tp->t_inpcb != NULL && tp->t_inpcb->inp_last_outifp != NULL &&
3567 (tp->t_inpcb->inp_last_outifp->if_eflags & IFEF_PROBE_CONNECTIVITY))
3568 connstatus->probe_activated = 1;
3569}
3570
3571boolean_t
3572tfo_enabled(const struct tcpcb *tp)
3573{
3574 return ((tp->t_flagsext & TF_FASTOPEN)? TRUE : FALSE);
3575}
3576
3577void
3578tcp_disable_tfo(struct tcpcb *tp)
3579{
3580 tp->t_flagsext &= ~TF_FASTOPEN;
3581}
3582
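/*
 * Build a minimal IPv4/IPv6 + TCP ACK segment for the keep-alive offload
 * path. When is_probe is TRUE the sequence number is snd_una - 1 so the
 * peer is forced to reply with an ACK.
 */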
3583static struct mbuf *
3584tcp_make_keepalive_frame(struct tcpcb *tp, struct ifnet *ifp,
3585 boolean_t is_probe)
3586{
3587 struct inpcb *inp = tp->t_inpcb;
3588 struct tcphdr *th;
3589 u_int8_t *data;
3590 int win = 0;
3591 struct mbuf *m;
3592
3593 /*
3594 * The code assumes the IP + TCP headers fit in an mbuf packet header
3595 */
3596 _CASSERT(sizeof(struct ip) + sizeof(struct tcphdr) <= _MHLEN);
3597 _CASSERT(sizeof(struct ip6_hdr) + sizeof(struct tcphdr) <= _MHLEN);
3598
3599 MGETHDR(m, M_WAIT, MT_HEADER);
3600 if (m == NULL) {
3601 return (NULL);
3602 }
3603 m->m_pkthdr.pkt_proto = IPPROTO_TCP;
3604
3605 data = mbuf_datastart(m);
3606
3607 if (inp->inp_vflag & INP_IPV4) {
3608 bzero(data, sizeof(struct ip) + sizeof(struct tcphdr));
3609 th = (struct tcphdr *)(void *) (data + sizeof(struct ip));
3610 m->m_len = sizeof(struct ip) + sizeof(struct tcphdr);
3611 m->m_pkthdr.len = m->m_len;
3612 } else {
3613 VERIFY(inp->inp_vflag & INP_IPV6);
3614
3615 bzero(data, sizeof(struct ip6_hdr)
3616 + sizeof(struct tcphdr));
3617 th = (struct tcphdr *)(void *)(data + sizeof(struct ip6_hdr));
3618 m->m_len = sizeof(struct ip6_hdr) +
3619 sizeof(struct tcphdr);
3620 m->m_pkthdr.len = m->m_len;
3621 }
3622
3623 tcp_fillheaders(tp, data, th);
3624
3625 if (inp->inp_vflag & INP_IPV4) {
3626 struct ip *ip;
3627
3628 ip = (__typeof__(ip))(void *)data;
3629
3630 ip->ip_id = rfc6864 ? 0 : ip_randomid();
3631 ip->ip_off = htons(IP_DF);
3632 ip->ip_len = htons(sizeof(struct ip) + sizeof(struct tcphdr));
3633 ip->ip_ttl = inp->inp_ip_ttl;
3634 ip->ip_tos |= (inp->inp_ip_tos & ~IPTOS_ECN_MASK);
3635 ip->ip_sum = in_cksum_hdr(ip);
3636 } else {
3637 struct ip6_hdr *ip6;
3638
3639 ip6 = (__typeof__(ip6))(void *)data;
3640
3641 ip6->ip6_plen = htons(sizeof(struct tcphdr));
3642 ip6->ip6_hlim = in6_selecthlim(inp, ifp);
3643 ip6->ip6_flow = ip6->ip6_flow & ~IPV6_FLOW_ECN_MASK;
3644
3645 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_src))
3646 ip6->ip6_src.s6_addr16[1] = 0;
3647 if (IN6_IS_SCOPE_EMBED(&ip6->ip6_dst))
3648 ip6->ip6_dst.s6_addr16[1] = 0;
3649 }
3650 th->th_flags = TH_ACK;
3651
3652 win = tcp_sbspace(tp);
3653 if (win > ((int32_t)TCP_MAXWIN << tp->rcv_scale))
3654 win = (int32_t)TCP_MAXWIN << tp->rcv_scale;
3655 th->th_win = htons((u_short) (win >> tp->rcv_scale));
3656
3657 if (is_probe) {
3658 th->th_seq = htonl(tp->snd_una - 1);
3659 } else {
3660 th->th_seq = htonl(tp->snd_una);
3661 }
3662 th->th_ack = htonl(tp->rcv_nxt);
3663
3664 /* Force recompute TCP checksum to be the final value */
3665 th->th_sum = 0;
3666 if (inp->inp_vflag & INP_IPV4) {
3667 th->th_sum = inet_cksum(m, IPPROTO_TCP,
3668 sizeof(struct ip), sizeof(struct tcphdr));
3669 } else {
3670 th->th_sum = inet6_cksum(m, IPPROTO_TCP,
3671 sizeof(struct ip6_hdr), sizeof(struct tcphdr));
3672 }
3673
3674 return (m);
3675}
3676
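/*
 * For every established TCP connection on ifp with keep-alive offload
 * enabled, fill one entry of frames_array with a pre-built probe frame
 * and the corresponding reply frame, and report how many entries were
 * used.
 */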
3677void
3678tcp_fill_keepalive_offload_frames(ifnet_t ifp,
3679 struct ifnet_keepalive_offload_frame *frames_array,
3680 u_int32_t frames_array_count, size_t frame_data_offset,
3681 u_int32_t *used_frames_count)
3682{
3683 struct inpcb *inp;
3684 inp_gen_t gencnt;
3685 u_int32_t frame_index = *used_frames_count;
3686
3687 if (ifp == NULL || frames_array == NULL ||
3688 frames_array_count == 0 ||
3689 frame_index >= frames_array_count ||
3690 frame_data_offset >= IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE)
3691 return;
3692
3693 /*
3694 * This function is called outside the regular TCP processing
3695 * so we need to update the TCP clock.
3696 */
3697 calculate_tcp_clock();
3698
3699 lck_rw_lock_shared(tcbinfo.ipi_lock);
3700 gencnt = tcbinfo.ipi_gencnt;
3701 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) {
3702 struct socket *so;
3703 struct ifnet_keepalive_offload_frame *frame;
3704 struct mbuf *m = NULL;
3705 struct tcpcb *tp = intotcpcb(inp);
3706
3707 if (frame_index >= frames_array_count)
3708 break;
3709
3710 if (inp->inp_gencnt > gencnt ||
3711 inp->inp_state == INPCB_STATE_DEAD)
3712 continue;
3713
3714 if ((so = inp->inp_socket) == NULL ||
3715 (so->so_state & SS_DEFUNCT))
3716 continue;
3717 /*
3718 * check for keepalive offload flag without socket
3719 * lock to avoid a deadlock
3720 */
		if (!(inp->inp_flags2 & INP2_KEEPALIVE_OFFLOAD)) {
			continue;
		}

		if (!(inp->inp_vflag & (INP_IPV4 | INP_IPV6))) {
			continue;
		}
		if (inp->inp_ppcb == NULL ||
		    in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
			continue;
		socket_lock(so, 1);
		/* Release the want count */
		if (inp->inp_ppcb == NULL ||
		    (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
			socket_unlock(so, 1);
			continue;
		}
		if ((inp->inp_vflag & INP_IPV4) &&
		    (inp->inp_laddr.s_addr == INADDR_ANY ||
		    inp->inp_faddr.s_addr == INADDR_ANY)) {
			socket_unlock(so, 1);
			continue;
		}
		if ((inp->inp_vflag & INP_IPV6) &&
		    (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ||
		    IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr))) {
			socket_unlock(so, 1);
			continue;
		}
		if (inp->inp_lport == 0 || inp->inp_fport == 0) {
			socket_unlock(so, 1);
			continue;
		}
		if (inp->inp_last_outifp == NULL ||
		    inp->inp_last_outifp->if_index != ifp->if_index) {
			socket_unlock(so, 1);
			continue;
		}
		if ((inp->inp_vflag & INP_IPV4) && frame_data_offset +
		    sizeof(struct ip) + sizeof(struct tcphdr) >
		    IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
			socket_unlock(so, 1);
			continue;
		} else if (!(inp->inp_vflag & INP_IPV4) && frame_data_offset +
		    sizeof(struct ip6_hdr) + sizeof(struct tcphdr) >
		    IFNET_KEEPALIVE_OFFLOAD_FRAME_DATA_SIZE) {
			socket_unlock(so, 1);
			continue;
		}
		/*
		 * There is no point in waking up the device for connections
		 * that are not established.  Long-lived connections are
		 * meant for processes that will send and receive data.
		 */
		if (tp->t_state != TCPS_ESTABLISHED) {
			socket_unlock(so, 1);
			continue;
		}
		/*
		 * This inp has all the information that is needed to
		 * generate an offload frame.
		 */
		frame = &frames_array[frame_index];
		frame->type = IFNET_KEEPALIVE_OFFLOAD_FRAME_TCP;
		frame->ether_type = (inp->inp_vflag & INP_IPV4) ?
		    IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV4 :
		    IFNET_KEEPALIVE_OFFLOAD_FRAME_ETHERTYPE_IPV6;
		frame->interval = tp->t_keepidle > 0 ? tp->t_keepidle :
		    tcp_keepidle;
		frame->keep_cnt = TCP_CONN_KEEPCNT(tp);
		frame->keep_retry = TCP_CONN_KEEPINTVL(tp);
		frame->local_port = ntohs(inp->inp_lport);
		frame->remote_port = ntohs(inp->inp_fport);
		frame->local_seq = tp->snd_nxt;
		frame->remote_seq = tp->rcv_nxt;
		if (inp->inp_vflag & INP_IPV4) {
			frame->length = frame_data_offset +
			    sizeof(struct ip) + sizeof(struct tcphdr);
			frame->reply_length = frame->length;

			frame->addr_length = sizeof(struct in_addr);
			bcopy(&inp->inp_laddr, frame->local_addr,
			    sizeof(struct in_addr));
			bcopy(&inp->inp_faddr, frame->remote_addr,
			    sizeof(struct in_addr));
		} else {
			struct in6_addr *ip6;

			frame->length = frame_data_offset +
			    sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
			frame->reply_length = frame->length;

			frame->addr_length = sizeof(struct in6_addr);
			ip6 = (struct in6_addr *)(void *)frame->local_addr;
			bcopy(&inp->in6p_laddr, ip6, sizeof(struct in6_addr));
			if (IN6_IS_SCOPE_EMBED(ip6))
				ip6->s6_addr16[1] = 0;

			ip6 = (struct in6_addr *)(void *)frame->remote_addr;
			bcopy(&inp->in6p_faddr, ip6, sizeof(struct in6_addr));
			if (IN6_IS_SCOPE_EMBED(ip6))
				ip6->s6_addr16[1] = 0;
		}

		/*
		 * First the probe
		 */
		m = tcp_make_keepalive_frame(tp, ifp, TRUE);
		if (m == NULL) {
			socket_unlock(so, 1);
			continue;
		}
		bcopy(m->m_data, frame->data + frame_data_offset,
		    m->m_len);
		m_freem(m);

		/*
		 * Now the response packet to incoming probes
		 */
		m = tcp_make_keepalive_frame(tp, ifp, FALSE);
		if (m == NULL) {
			socket_unlock(so, 1);
			continue;
		}
		bcopy(m->m_data, frame->reply_data + frame_data_offset,
		    m->m_len);
		m_freem(m);

		frame_index++;
		socket_unlock(so, 1);
	}
	lck_rw_done(tcbinfo.ipi_lock);
	*used_frames_count = frame_index;
}

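/*
 * Validate a notification ID for this connection: ENOBUFS when the
 * send buffer is empty (there is nothing to mark), EINVAL when the ID
 * or the marker position duplicates an existing marker, 0 otherwise.
 */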
errno_t
tcp_notify_ack_id_valid(struct tcpcb *tp, struct socket *so,
    u_int32_t notify_id)
{
	struct tcp_notify_ack_marker *elm;

	if (so->so_snd.sb_cc == 0)
		return (ENOBUFS);

	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
		/* Duplicate id is not allowed */
		if (elm->notify_id == notify_id)
			return (EINVAL);
		/* Duplicate position is not allowed */
		if (elm->notify_snd_una == tp->snd_una + so->so_snd.sb_cc)
			return (EINVAL);
	}
	return (0);
}

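/*
 * Allocate a notification marker that completes once all data currently
 * in the send buffer (snd_una + sb_cc) has been acknowledged, and link
 * it onto the connection's t_notify_ack list.
 */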
errno_t
tcp_add_notify_ack_marker(struct tcpcb *tp, u_int32_t notify_id)
{
	struct tcp_notify_ack_marker *nm, *elm = NULL;
	struct socket *so = tp->t_inpcb->inp_socket;

	MALLOC(nm, struct tcp_notify_ack_marker *, sizeof (*nm),
	    M_TEMP, M_WAIT | M_ZERO);
	if (nm == NULL)
		return (ENOMEM);
	nm->notify_id = notify_id;
	nm->notify_snd_una = tp->snd_una + so->so_snd.sb_cc;

	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
		if (SEQ_GT(nm->notify_snd_una, elm->notify_snd_una))
			break;
	}

	if (elm == NULL) {
		VERIFY(SLIST_EMPTY(&tp->t_notify_ack));
		SLIST_INSERT_HEAD(&tp->t_notify_ack, nm, notify_next);
	} else {
		SLIST_INSERT_AFTER(elm, nm, notify_next);
	}
	tp->t_notify_ack_count++;
	return (0);
}

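/*
 * Unlink and free every notification marker on the connection and
 * reset the marker count.
 */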
void
tcp_notify_ack_free(struct tcpcb *tp)
{
	struct tcp_notify_ack_marker *elm, *next;
	if (SLIST_EMPTY(&tp->t_notify_ack))
		return;

	SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
		SLIST_REMOVE(&tp->t_notify_ack, elm, tcp_notify_ack_marker,
		    notify_next);
		FREE(elm, M_TEMP);
	}
	SLIST_INIT(&tp->t_notify_ack);
	tp->t_notify_ack_count = 0;
}

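/*
 * Post SO_FILT_HINT_NOTIFY_ACK on the socket when snd_una has reached
 * the marker at the head of the list.  The caller must ensure that
 * t_notify_ack is non-empty, as the head of the list is dereferenced
 * unconditionally.
 */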
inline void
tcp_notify_acknowledgement(struct tcpcb *tp, struct socket *so)
{
	struct tcp_notify_ack_marker *elm;

	elm = SLIST_FIRST(&tp->t_notify_ack);
	if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
		soevent(so, SO_FILT_HINT_LOCKED | SO_FILT_HINT_NOTIFY_ACK);
	}
}

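/*
 * Report how many notification markers at the front of the list have
 * been covered by snd_una (complete) and how many are still pending.
 * The completed count handed back is capped at TCP_MAX_NOTIFY_ACK.
 */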
void
tcp_get_notify_ack_count(struct tcpcb *tp,
    struct tcp_notify_ack_complete *retid)
{
	struct tcp_notify_ack_marker *elm;
	size_t complete = 0;

	SLIST_FOREACH(elm, &tp->t_notify_ack, notify_next) {
		if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una))
			complete++;
		else
			break;
	}
	retid->notify_pending = tp->t_notify_ack_count - complete;
	retid->notify_complete_count = min(TCP_MAX_NOTIFY_ACK, complete);
}

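/*
 * Copy the IDs of completed markers into retid->notify_complete_id,
 * unlinking and freeing each marker as it is reported.  Stops at the
 * first marker not yet covered by snd_una, or once
 * notify_complete_count entries have been filled in.
 */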
void
tcp_get_notify_ack_ids(struct tcpcb *tp,
    struct tcp_notify_ack_complete *retid)
{
	size_t i = 0;
	struct tcp_notify_ack_marker *elm, *next;

	SLIST_FOREACH_SAFE(elm, &tp->t_notify_ack, notify_next, next) {
		if (i >= retid->notify_complete_count)
			break;
		if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una)) {
			retid->notify_complete_id[i++] = elm->notify_id;
			SLIST_REMOVE(&tp->t_notify_ack, elm,
			    tcp_notify_ack_marker, notify_next);
			FREE(elm, M_TEMP);
			tp->t_notify_ack_count--;
		} else {
			break;
		}
	}
}

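/*
 * Returns true when the socket is a TCP socket and the first
 * notification marker on its list has already been acknowledged.
 */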
bool
tcp_notify_ack_active(struct socket *so)
{
	if ((SOCK_DOM(so) == PF_INET || SOCK_DOM(so) == PF_INET6) &&
	    SOCK_TYPE(so) == SOCK_STREAM) {
		struct tcpcb *tp = intotcpcb(sotoinpcb(so));

		if (!SLIST_EMPTY(&tp->t_notify_ack)) {
			struct tcp_notify_ack_marker *elm;
			elm = SLIST_FIRST(&tp->t_notify_ack);
			if (SEQ_GEQ(tp->snd_una, elm->notify_snd_una))
				return (true);
		}
	}
	return (false);
}

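/*
 * Given the cumulative ACK th_ack, return the number of bytes in the
 * send buffer that have not been transmitted yet, excluding the FIN
 * sequence when one has been sent.  Only meaningful when the send
 * buffer tracks byte counts (SB_SNDBYTE_CNT); returns 0 otherwise.
 */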
inline int32_t
inp_get_sndbytes_allunsent(struct socket *so, u_int32_t th_ack)
{
	struct inpcb *inp = sotoinpcb(so);
	struct tcpcb *tp = intotcpcb(inp);

	if ((so->so_snd.sb_flags & SB_SNDBYTE_CNT) &&
	    so->so_snd.sb_cc > 0) {
		int32_t unsent, sent;
		sent = tp->snd_max - th_ack;
		if (tp->t_flags & TF_SENTFIN)
			sent--;
		unsent = so->so_snd.sb_cc - sent;
		return (unsent);
	}
	return (0);
}

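/*
 * Bump a per-interface counter in either the IPv4 or the IPv6 extended
 * statistics block, selected by the address family of the flow.
 */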
#define IFP_PER_FLOW_STAT(_ipv4_, _stat_) { \
	if (_ipv4_) { \
		ifp->if_ipv4_stat->_stat_++; \
	} else { \
		ifp->if_ipv6_stat->_stat_++; \
	} \
}

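/* True when ECN was successfully negotiated on the flow (TE_ECN_ON set) */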
#define FLOW_ECN_ENABLED(_flags_) \
    ((_flags_ & (TE_ECN_ON)) == (TE_ECN_ON))

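/*
 * Fold the per-flow statistics collected in ifs into the per-interface
 * counters of ifp.  ECN negotiation outcome and CE/ECE counters are
 * updated for every flow; the performance and fallback counters are
 * skipped for flows marked local.
 */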
void
tcp_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	if (ifp == NULL || !IF_FULLY_ATTACHED(ifp))
		return;

	ifnet_lock_shared(ifp);
	if (ifs->ecn_flags & TE_SETUPSENT) {
		if (ifs->ecn_flags & TE_CLIENT_SETUP) {
			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_client_setup);
			if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
				IFP_PER_FLOW_STAT(ifs->ipv4,
				    ecn_client_success);
			} else if (ifs->ecn_flags & TE_LOST_SYN) {
				IFP_PER_FLOW_STAT(ifs->ipv4,
				    ecn_syn_lost);
			} else {
				IFP_PER_FLOW_STAT(ifs->ipv4,
				    ecn_peer_nosupport);
			}
		} else {
			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_server_setup);
			if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
				IFP_PER_FLOW_STAT(ifs->ipv4,
				    ecn_server_success);
			} else if (ifs->ecn_flags & TE_LOST_SYN) {
				IFP_PER_FLOW_STAT(ifs->ipv4,
				    ecn_synack_lost);
			} else {
				IFP_PER_FLOW_STAT(ifs->ipv4,
				    ecn_peer_nosupport);
			}
		}
	} else {
		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off_conn);
	}
	if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
		if (ifs->ecn_flags & TE_RECV_ECN_CE) {
			tcpstat.tcps_ecn_conn_recv_ce++;
			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ce);
		}
		if (ifs->ecn_flags & TE_RECV_ECN_ECE) {
			tcpstat.tcps_ecn_conn_recv_ece++;
			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_recv_ece);
		}
		if (ifs->ecn_flags & (TE_RECV_ECN_CE | TE_RECV_ECN_ECE)) {
			if (ifs->txretransmitbytes > 0 ||
			    ifs->rxoutoforderbytes > 0) {
				tcpstat.tcps_ecn_conn_pl_ce++;
				IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plce);
			} else {
				tcpstat.tcps_ecn_conn_nopl_ce++;
				IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_noplce);
			}
		} else {
			if (ifs->txretransmitbytes > 0 ||
			    ifs->rxoutoforderbytes > 0) {
				tcpstat.tcps_ecn_conn_plnoce++;
				IFP_PER_FLOW_STAT(ifs->ipv4, ecn_conn_plnoce);
			}
		}
	}

	/* Other stats are interesting for non-local connections only */
	if (ifs->local) {
		ifnet_lock_done(ifp);
		return;
	}

	if (ifs->ipv4) {
		ifp->if_ipv4_stat->timestamp = net_uptime();
		if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_on);
		} else {
			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv4_stat->ecn_off);
		}
	} else {
		ifp->if_ipv6_stat->timestamp = net_uptime();
		if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_on);
		} else {
			tcp_flow_ecn_perf_stats(ifs, &ifp->if_ipv6_stat->ecn_off);
		}
	}

	if (ifs->rxmit_drop) {
		if (FLOW_ECN_ENABLED(ifs->ecn_flags)) {
			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_on.rxmit_drop);
		} else {
			IFP_PER_FLOW_STAT(ifs->ipv4, ecn_off.rxmit_drop);
		}
	}
	if (ifs->ecn_fallback_synloss)
		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_synloss);
	if (ifs->ecn_fallback_droprst)
		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprst);
	if (ifs->ecn_fallback_droprxmt)
		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_droprxmt);
	if (ifs->ecn_fallback_ce)
		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_ce);
	if (ifs->ecn_fallback_reorder)
		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_fallback_reorder);
	if (ifs->ecn_recv_ce > 0)
		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ce);
	if (ifs->ecn_recv_ece > 0)
		IFP_PER_FLOW_STAT(ifs->ipv4, ecn_recv_ece);

	tcp_flow_lim_stats(ifs, &ifp->if_lim_stat);
	ifnet_lock_done(ifp);
}
