1/*
2 * Copyright (c) 2000-2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)tcp_timer.c 8.2 (Berkeley) 5/24/95
61 * $FreeBSD: src/sys/netinet/tcp_timer.c,v 1.34.2.11 2001/08/22 00:59:12 silby Exp $
62 */
63
64
65#include <sys/param.h>
66#include <sys/systm.h>
67#include <sys/kernel.h>
68#include <sys/mbuf.h>
69#include <sys/sysctl.h>
70#include <sys/socket.h>
71#include <sys/socketvar.h>
72#include <sys/protosw.h>
73#include <sys/domain.h>
74#include <sys/mcache.h>
75#include <sys/queue.h>
76#include <kern/locks.h>
77#include <kern/cpu_number.h> /* before tcp_seq.h, for tcp_random18() */
78#include <mach/boolean.h>
79
80#include <net/route.h>
81#include <net/if_var.h>
82#include <net/ntstat.h>
83
84#include <netinet/in.h>
85#include <netinet/in_systm.h>
86#include <netinet/in_pcb.h>
87#if INET6
88#include <netinet6/in6_pcb.h>
89#endif
90#include <netinet/ip_var.h>
91#include <netinet/tcp.h>
92#include <netinet/tcp_cache.h>
93#include <netinet/tcp_fsm.h>
94#include <netinet/tcp_seq.h>
95#include <netinet/tcp_timer.h>
96#include <netinet/tcp_var.h>
97#include <netinet/tcp_cc.h>
98#if INET6
99#include <netinet6/tcp6_var.h>
100#endif
101#include <netinet/tcpip.h>
102#if TCPDEBUG
103#include <netinet/tcp_debug.h>
104#endif
105#include <sys/kdebug.h>
106#include <mach/sdt.h>
107#include <netinet/mptcp_var.h>
108
109/* Max number of times a stretch ack can be delayed on a connection */
110#define TCP_STRETCHACK_DELAY_THRESHOLD 5
111
112/*
113 * If the host processor has been sleeping for too long, this is the threshold
114 * used to avoid sending stale retransmissions.
115 */
116#define TCP_SLEEP_TOO_LONG (10 * 60 * 1000) /* 10 minutes in ms */
117
118/* tcp timer list */
119struct tcptimerlist tcp_timer_list;
120
121/* List of pcbs in timewait state, protected by tcbinfo's ipi_lock */
122struct tcptailq tcp_tw_tailq;
123
124static int
125sysctl_msec_to_ticks SYSCTL_HANDLER_ARGS
126{
127#pragma unused(arg2)
128 int error, s, tt;
129
130 tt = *(int *)arg1;
	s = tt * 1000 / TCP_RETRANSHZ;
132
133 error = sysctl_handle_int(oidp, &s, 0, req);
134 if (error || !req->newptr)
135 return (error);
136
137 tt = s * TCP_RETRANSHZ / 1000;
138 if (tt < 1)
139 return (EINVAL);
140
141 *(int *)arg1 = tt;
142 SYSCTL_SKMEM_UPDATE_AT_OFFSET(arg2, *(int*)arg1);
143 return (0);
144}
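
/*
 * A minimal sketch, for illustration only, of the conversion performed by
 * sysctl_msec_to_ticks() above, assuming the usual TCP_RETRANSHZ of 1000
 * (one internal tick per millisecond). The function and variable names
 * below are hypothetical.
 */
#if 0 /* illustration only, never compiled */
static int
example_msec_to_ticks(int msecs, int *ticks_out)
{
	const int hz = 1000;		/* stand-in for TCP_RETRANSHZ */
	int ticks = msecs * hz / 1000;	/* same arithmetic as the handler */

	if (ticks < 1)
		return (EINVAL);	/* sub-tick values are rejected */
	*ticks_out = ticks;		/* with hz == 1000, ticks == msecs */
	return (0);
}
#endif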
145
146#if SYSCTL_SKMEM
147int tcp_keepinit = TCPTV_KEEP_INIT;
148SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
149 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
150 &tcp_keepinit, offsetof(skmem_sysctl, tcp.keepinit),
151 sysctl_msec_to_ticks, "I", "");
152
153int tcp_keepidle = TCPTV_KEEP_IDLE;
154SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
155 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
156 &tcp_keepidle, offsetof(skmem_sysctl, tcp.keepidle),
157 sysctl_msec_to_ticks, "I", "");
158
159int tcp_keepintvl = TCPTV_KEEPINTVL;
160SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
161 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
162 &tcp_keepintvl, offsetof(skmem_sysctl, tcp.keepintvl),
163 sysctl_msec_to_ticks, "I", "");
164
165SYSCTL_SKMEM_TCP_INT(OID_AUTO, keepcnt,
166 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
167 int, tcp_keepcnt, TCPTV_KEEPCNT, "number of times to repeat keepalive");
168
169int tcp_msl = TCPTV_MSL;
170SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
171 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
172 &tcp_msl, offsetof(skmem_sysctl, tcp.msl),
173 sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
174#else /* SYSCTL_SKMEM */
175int tcp_keepinit;
176SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINIT, keepinit,
177 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
178 &tcp_keepinit, 0, sysctl_msec_to_ticks, "I", "");
179
180int tcp_keepidle;
181SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPIDLE, keepidle,
182 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
183 &tcp_keepidle, 0, sysctl_msec_to_ticks, "I", "");
184
185int tcp_keepintvl;
186SYSCTL_PROC(_net_inet_tcp, TCPCTL_KEEPINTVL, keepintvl,
187 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
188 &tcp_keepintvl, 0, sysctl_msec_to_ticks, "I", "");
189
190int tcp_keepcnt;
191SYSCTL_INT(_net_inet_tcp, OID_AUTO, keepcnt,
192 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
193 &tcp_keepcnt, 0, "number of times to repeat keepalive");
194
195int tcp_msl;
196SYSCTL_PROC(_net_inet_tcp, OID_AUTO, msl,
197 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
198 &tcp_msl, 0, sysctl_msec_to_ticks, "I", "Maximum segment lifetime");
199#endif /* SYSCTL_SKMEM */
200
201/*
202 * Avoid DoS via TCP Robustness in Persist Condition
203 * (see http://www.ietf.org/id/draft-ananth-tcpm-persist-02.txt)
204 * by allowing a system wide maximum persistence timeout value when in
205 * Zero Window Probe mode.
206 *
 * Expressed in milliseconds to be consistent with other timeout-related
 * values; the corresponding TCP socket option is in seconds.
209 */
210#if SYSCTL_SKMEM
211u_int32_t tcp_max_persist_timeout = 0;
212SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
213 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
214 &tcp_max_persist_timeout, offsetof(skmem_sysctl, tcp.max_persist_timeout),
215 sysctl_msec_to_ticks, "I", "Maximum persistence timeout for ZWP");
216#else /* SYSCTL_SKMEM */
217u_int32_t tcp_max_persist_timeout = 0;
218SYSCTL_PROC(_net_inet_tcp, OID_AUTO, max_persist_timeout,
219 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
220 &tcp_max_persist_timeout, 0, sysctl_msec_to_ticks, "I",
221 "Maximum persistence timeout for ZWP");
222#endif /* SYSCTL_SKMEM */
223
224SYSCTL_SKMEM_TCP_INT(OID_AUTO, always_keepalive,
225 CTLFLAG_RW | CTLFLAG_LOCKED, static int, always_keepalive, 0,
226 "Assume SO_KEEPALIVE on all TCP connections");
227
228/*
229 * This parameter determines how long the timer list will stay in fast or
230 * quick mode even though all connections are idle. In this state, the
231 * timer will run more frequently anticipating new data.
232 */
233SYSCTL_SKMEM_TCP_INT(OID_AUTO, timer_fastmode_idlemax,
234 CTLFLAG_RW | CTLFLAG_LOCKED, int, timer_fastmode_idlemax,
235 TCP_FASTMODE_IDLERUN_MAX, "Maximum idle generations in fast mode");
236
237/*
238 * See tcp_syn_backoff[] for interval values between SYN retransmits;
 * the value set below defines the number of retransmits before we
 * disable the timestamp and window scaling options during subsequent
 * SYN retransmits. Setting it to 0 disables dropping those
 * two options.
243 */
244SYSCTL_SKMEM_TCP_INT(OID_AUTO, broken_peer_syn_rexmit_thres,
245 CTLFLAG_RW | CTLFLAG_LOCKED, static int, tcp_broken_peer_syn_rxmit_thres,
246 10, "Number of retransmitted SYNs before disabling RFC 1323 "
247 "options on local connections");
248
249static int tcp_timer_advanced = 0;
250SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_timer_advanced,
251 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_timer_advanced, 0,
252 "Number of times one of the timers was advanced");
253
254static int tcp_resched_timerlist = 0;
255SYSCTL_INT(_net_inet_tcp, OID_AUTO, tcp_resched_timerlist,
256 CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_resched_timerlist, 0,
257 "Number of times timer list was rescheduled as part of processing a packet");
258
259SYSCTL_SKMEM_TCP_INT(OID_AUTO, pmtud_blackhole_detection,
260 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_pmtud_black_hole_detect, 1,
261 "Path MTU Discovery Black Hole Detection");
262
263SYSCTL_SKMEM_TCP_INT(OID_AUTO, pmtud_blackhole_mss,
264 CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_pmtud_black_hole_mss, 1200,
265 "Path MTU Discovery Black Hole Detection lowered MSS");
266
267static u_int32_t tcp_mss_rec_medium = 1200;
268static u_int32_t tcp_mss_rec_low = 512;
269
270#define TCP_REPORT_STATS_INTERVAL 43200 /* 12 hours, in seconds */
271int tcp_report_stats_interval = TCP_REPORT_STATS_INTERVAL;
272
273/* performed garbage collection of "used" sockets */
274static boolean_t tcp_gc_done = FALSE;
275
276/* max idle probes */
277int tcp_maxpersistidle = TCPTV_KEEP_IDLE;
278
279/*
 * The TCP delayed-ACK timer is set to 100 ms. Since the timer list in
 * fast mode is processed no more often than every 100 ms, the delayed
 * ACK timer will fire somewhere between 100 and 200 ms.
283 */
284int tcp_delack = TCP_RETRANSHZ / 10;
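
/*
 * A worked sketch, for illustration only, of the timing claim in the
 * comment above: the list runs every "quantum" ms in fast mode, so a
 * delayed ACK armed with a 100 ms timeout fires on the first run at or
 * past its deadline, i.e. between 100 and 200 ms later. Names below are
 * hypothetical.
 */
#if 0 /* illustration only, never compiled */
static uint32_t
example_delack_fire_time(uint32_t phase, uint32_t timeout, uint32_t quantum)
{
	/*
	 * phase: ms until the next timer-list run after the ACK is delayed,
	 * anywhere in (0, quantum]; subsequent runs are quantum ms apart.
	 */
	uint32_t t = phase;

	while (t < timeout)
		t += quantum;
	return (t);	/* lands in [timeout, timeout + quantum) */
}
#endif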
285
286#if MPTCP
287/*
288 * MP_JOIN retransmission of 3rd ACK will be every 500 msecs without backoff
289 */
290int tcp_jack_rxmt = TCP_RETRANSHZ / 2;
291#endif /* MPTCP */
292
293static boolean_t tcp_itimer_done = FALSE;
294
295static void tcp_remove_timer(struct tcpcb *tp);
296static void tcp_sched_timerlist(uint32_t offset);
297static u_int32_t tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *mode,
298 u_int16_t probe_if_index);
299static void tcp_sched_timers(struct tcpcb *tp);
300static inline void tcp_set_lotimer_index(struct tcpcb *);
301__private_extern__ void tcp_remove_from_time_wait(struct inpcb *inp);
302static inline void tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp);
303__private_extern__ void tcp_report_stats(void);
304
305static u_int64_t tcp_last_report_time;
306
307/*
308 * Structure to store previously reported stats so that we can send
309 * incremental changes in each report interval.
310 */
311struct tcp_last_report_stats {
312 u_int32_t tcps_connattempt;
313 u_int32_t tcps_accepts;
314 u_int32_t tcps_ecn_client_setup;
315 u_int32_t tcps_ecn_server_setup;
316 u_int32_t tcps_ecn_client_success;
317 u_int32_t tcps_ecn_server_success;
318 u_int32_t tcps_ecn_not_supported;
319 u_int32_t tcps_ecn_lost_syn;
320 u_int32_t tcps_ecn_lost_synack;
321 u_int32_t tcps_ecn_recv_ce;
322 u_int32_t tcps_ecn_recv_ece;
323 u_int32_t tcps_ecn_sent_ece;
324 u_int32_t tcps_ecn_conn_recv_ce;
325 u_int32_t tcps_ecn_conn_recv_ece;
326 u_int32_t tcps_ecn_conn_plnoce;
327 u_int32_t tcps_ecn_conn_pl_ce;
328 u_int32_t tcps_ecn_conn_nopl_ce;
329 u_int32_t tcps_ecn_fallback_synloss;
330 u_int32_t tcps_ecn_fallback_reorder;
331 u_int32_t tcps_ecn_fallback_ce;
332
333 /* TFO-related statistics */
334 u_int32_t tcps_tfo_syn_data_rcv;
335 u_int32_t tcps_tfo_cookie_req_rcv;
336 u_int32_t tcps_tfo_cookie_sent;
337 u_int32_t tcps_tfo_cookie_invalid;
338 u_int32_t tcps_tfo_cookie_req;
339 u_int32_t tcps_tfo_cookie_rcv;
340 u_int32_t tcps_tfo_syn_data_sent;
341 u_int32_t tcps_tfo_syn_data_acked;
342 u_int32_t tcps_tfo_syn_loss;
343 u_int32_t tcps_tfo_blackhole;
344 u_int32_t tcps_tfo_cookie_wrong;
345 u_int32_t tcps_tfo_no_cookie_rcv;
346 u_int32_t tcps_tfo_heuristics_disable;
347 u_int32_t tcps_tfo_sndblackhole;
348
349 /* MPTCP-related statistics */
350 u_int32_t tcps_mptcp_handover_attempt;
351 u_int32_t tcps_mptcp_interactive_attempt;
352 u_int32_t tcps_mptcp_aggregate_attempt;
353 u_int32_t tcps_mptcp_fp_handover_attempt;
354 u_int32_t tcps_mptcp_fp_interactive_attempt;
355 u_int32_t tcps_mptcp_fp_aggregate_attempt;
356 u_int32_t tcps_mptcp_heuristic_fallback;
357 u_int32_t tcps_mptcp_fp_heuristic_fallback;
358 u_int32_t tcps_mptcp_handover_success_wifi;
359 u_int32_t tcps_mptcp_handover_success_cell;
360 u_int32_t tcps_mptcp_interactive_success;
361 u_int32_t tcps_mptcp_aggregate_success;
362 u_int32_t tcps_mptcp_fp_handover_success_wifi;
363 u_int32_t tcps_mptcp_fp_handover_success_cell;
364 u_int32_t tcps_mptcp_fp_interactive_success;
365 u_int32_t tcps_mptcp_fp_aggregate_success;
366 u_int32_t tcps_mptcp_handover_cell_from_wifi;
367 u_int32_t tcps_mptcp_handover_wifi_from_cell;
368 u_int32_t tcps_mptcp_interactive_cell_from_wifi;
369 u_int64_t tcps_mptcp_handover_cell_bytes;
370 u_int64_t tcps_mptcp_interactive_cell_bytes;
371 u_int64_t tcps_mptcp_aggregate_cell_bytes;
372 u_int64_t tcps_mptcp_handover_all_bytes;
373 u_int64_t tcps_mptcp_interactive_all_bytes;
374 u_int64_t tcps_mptcp_aggregate_all_bytes;
375 u_int32_t tcps_mptcp_back_to_wifi;
376 u_int32_t tcps_mptcp_wifi_proxy;
377 u_int32_t tcps_mptcp_cell_proxy;
378 u_int32_t tcps_mptcp_triggered_cell;
379};
380
381
382/* Returns true if the timer is on the timer list */
383#define TIMER_IS_ON_LIST(tp) ((tp)->t_flags & TF_TIMER_ONLIST)
384
/* Run the TCP timerlist at least once every hour */
386#define TCP_TIMERLIST_MAX_OFFSET (60 * 60 * TCP_RETRANSHZ)
387
388
389static void add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay);
390static boolean_t tcp_garbage_collect(struct inpcb *, int);
391
392#define TIMERENTRY_TO_TP(te) ((struct tcpcb *)((uintptr_t)te - offsetof(struct tcpcb, tentry.le.le_next)))
393
394#define VERIFY_NEXT_LINK(elm,field) do { \
395 if (LIST_NEXT((elm),field) != NULL && \
396 LIST_NEXT((elm),field)->field.le_prev != \
397 &((elm)->field.le_next)) \
398 panic("Bad link elm %p next->prev != elm", (elm)); \
399} while(0)
400
401#define VERIFY_PREV_LINK(elm,field) do { \
402 if (*(elm)->field.le_prev != (elm)) \
403 panic("Bad link elm %p prev->next != elm", (elm)); \
404} while(0)
405
406#define TCP_SET_TIMER_MODE(mode, i) do { \
407 if (IS_TIMER_HZ_10MS(i)) \
408 (mode) |= TCP_TIMERLIST_10MS_MODE; \
409 else if (IS_TIMER_HZ_100MS(i)) \
410 (mode) |= TCP_TIMERLIST_100MS_MODE; \
411 else \
412 (mode) |= TCP_TIMERLIST_500MS_MODE; \
413} while(0)
414
415#if (DEVELOPMENT || DEBUG)
416SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_medium,
417 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_medium, 0,
418 "Medium MSS based on recommendation in link status report");
419SYSCTL_UINT(_net_inet_tcp, OID_AUTO, mss_rec_low,
420 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_mss_rec_low, 0,
421 "Low MSS based on recommendation in link status report");
422
423static int32_t tcp_change_mss_recommended = 0;
424static int
425sysctl_change_mss_recommended SYSCTL_HANDLER_ARGS
426{
427#pragma unused(oidp, arg1, arg2)
428 int i, err = 0, changed = 0;
429 struct ifnet *ifp;
430 struct if_link_status ifsr;
431 struct if_cellular_status_v1 *new_cell_sr;
432 err = sysctl_io_number(req, tcp_change_mss_recommended,
433 sizeof (int32_t), &i, &changed);
434 if (changed) {
435 ifnet_head_lock_shared();
436 TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
437 if (IFNET_IS_CELLULAR(ifp)) {
438 bzero(&ifsr, sizeof (ifsr));
439 new_cell_sr = &ifsr.ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
440 ifsr.ifsr_version = IF_CELLULAR_STATUS_REPORT_CURRENT_VERSION;
441 ifsr.ifsr_len = sizeof(*new_cell_sr);
442
443 /* Set MSS recommended */
444 new_cell_sr->valid_bitmask |= IF_CELL_UL_MSS_RECOMMENDED_VALID;
445 new_cell_sr->mss_recommended = i;
				err = ifnet_link_status_report(ifp, new_cell_sr, sizeof (*new_cell_sr));
447 if (err == 0) {
448 tcp_change_mss_recommended = i;
449 } else {
450 break;
451 }
452 }
453 }
454 ifnet_head_done();
455 }
456 return (err);
457}
458
459SYSCTL_PROC(_net_inet_tcp, OID_AUTO, change_mss_recommended,
460 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_change_mss_recommended,
461 0, sysctl_change_mss_recommended, "IU", "Change MSS recommended");
462
463SYSCTL_INT(_net_inet_tcp, OID_AUTO, report_stats_interval,
464 CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_report_stats_interval, 0,
465 "Report stats interval");
466#endif /* (DEVELOPMENT || DEBUG) */
467
/*
 * Inline function to compare two timers. If the sign bit of the difference
 * flips, it is safe to assume that one of the timers has wrapped around.
 * By doing a signed comparison, wraparound is handled such that the value
 * whose sign bit was reset is treated as being ahead of the other.
 */
inline int32_t
timer_diff(uint32_t t1, uint32_t toff1, uint32_t t2, uint32_t toff2)
{
	return (int32_t)((t1 + toff1) - (t2 + toff2));
}
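
/*
 * A small sketch, for illustration only, of why the signed cast in
 * timer_diff() keeps orderings correct across the 32-bit wrap of the
 * millisecond clock. The timestamps below are made up.
 */
#if 0 /* illustration only, never compiled */
static void
example_timer_diff_wraparound(void)
{
	uint32_t before_wrap = 0xfffffff0;	/* 16 ticks before the wrap */
	uint32_t after_wrap = 0x0000000a;	/* 10 ticks after the wrap */

	/*
	 * (int32_t)(after_wrap - before_wrap) == 26, so the post-wrap
	 * timestamp is correctly seen as 26 ticks later; a plain unsigned
	 * comparison would have claimed the opposite ordering.
	 */
	int32_t d = timer_diff(after_wrap, 0, before_wrap, 0);
	(void)d;
}
#endif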
478
479/*
480 * Add to tcp timewait list, delay is given in milliseconds.
481 */
482static void
483add_to_time_wait_locked(struct tcpcb *tp, uint32_t delay)
484{
485 struct inpcbinfo *pcbinfo = &tcbinfo;
486 struct inpcb *inp = tp->t_inpcb;
487 uint32_t timer;
488
489 /* pcb list should be locked when we get here */
490 LCK_RW_ASSERT(pcbinfo->ipi_lock, LCK_RW_ASSERT_EXCLUSIVE);
491
492 /* We may get here multiple times, so check */
493 if (!(inp->inp_flags2 & INP2_TIMEWAIT)) {
494 pcbinfo->ipi_twcount++;
495 inp->inp_flags2 |= INP2_TIMEWAIT;
496
497 /* Remove from global inp list */
498 LIST_REMOVE(inp, inp_list);
499 } else {
500 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
501 }
502
503 /* Compute the time at which this socket can be closed */
504 timer = tcp_now + delay;
505
506 /* We will use the TCPT_2MSL timer for tracking this delay */
507
508 if (TIMER_IS_ON_LIST(tp))
509 tcp_remove_timer(tp);
510 tp->t_timer[TCPT_2MSL] = timer;
511
512 TAILQ_INSERT_TAIL(&tcp_tw_tailq, tp, t_twentry);
513}
514
515void
516add_to_time_wait(struct tcpcb *tp, uint32_t delay)
517{
518 struct inpcbinfo *pcbinfo = &tcbinfo;
519 if (tp->t_inpcb->inp_socket->so_options & SO_NOWAKEFROMSLEEP)
520 socket_post_kev_msg_closed(tp->t_inpcb->inp_socket);
521
522 /* 19182803: Notify nstat that connection is closing before waiting. */
523 nstat_pcb_detach(tp->t_inpcb);
524
525 if (!lck_rw_try_lock_exclusive(pcbinfo->ipi_lock)) {
526 socket_unlock(tp->t_inpcb->inp_socket, 0);
527 lck_rw_lock_exclusive(pcbinfo->ipi_lock);
528 socket_lock(tp->t_inpcb->inp_socket, 0);
529 }
530 add_to_time_wait_locked(tp, delay);
531 lck_rw_done(pcbinfo->ipi_lock);
532
533 inpcb_gc_sched(pcbinfo, INPCB_TIMER_LAZY);
534}
535
536/* If this is on time wait queue, remove it. */
537void
538tcp_remove_from_time_wait(struct inpcb *inp)
539{
540 struct tcpcb *tp = intotcpcb(inp);
541 if (inp->inp_flags2 & INP2_TIMEWAIT)
542 TAILQ_REMOVE(&tcp_tw_tailq, tp, t_twentry);
543}
544
545static boolean_t
546tcp_garbage_collect(struct inpcb *inp, int istimewait)
547{
548 boolean_t active = FALSE;
549 struct socket *so, *mp_so = NULL;
550 struct tcpcb *tp;
551
552 so = inp->inp_socket;
553 tp = intotcpcb(inp);
554
555 if (so->so_flags & SOF_MP_SUBFLOW) {
556 mp_so = mptetoso(tptomptp(tp)->mpt_mpte);
557 if (!socket_try_lock(mp_so)) {
558 mp_so = NULL;
559 active = TRUE;
560 goto out;
561 }
562 mp_so->so_usecount++;
563 }
564
565 /*
566 * Skip if still in use or busy; it would have been more efficient
567 * if we were to test so_usecount against 0, but this isn't possible
568 * due to the current implementation of tcp_dropdropablreq() where
569 * overflow sockets that are eligible for garbage collection have
570 * their usecounts set to 1.
571 */
572 if (!lck_mtx_try_lock_spin(&inp->inpcb_mtx)) {
573 active = TRUE;
574 goto out;
575 }
576
577 /* Check again under the lock */
578 if (so->so_usecount > 1) {
579 if (inp->inp_wantcnt == WNT_STOPUSING)
580 active = TRUE;
581 lck_mtx_unlock(&inp->inpcb_mtx);
582 goto out;
583 }
584
585 if (istimewait && TSTMP_GEQ(tcp_now, tp->t_timer[TCPT_2MSL]) &&
586 tp->t_state != TCPS_CLOSED) {
587 /* Become a regular mutex */
588 lck_mtx_convert_spin(&inp->inpcb_mtx);
589 tcp_close(tp);
590 }
591
592 /*
593 * Overflowed socket dropped from the listening queue? Do this
594 * only if we are called to clean up the time wait slots, since
595 * tcp_dropdropablreq() considers a socket to have been fully
596 * dropped after add_to_time_wait() is finished.
597 * Also handle the case of connections getting closed by the peer
598 * while in the queue as seen with rdar://6422317
599 *
600 */
601 if (so->so_usecount == 1 &&
602 ((istimewait && (so->so_flags & SOF_OVERFLOW)) ||
603 ((tp != NULL) && (tp->t_state == TCPS_CLOSED) &&
604 (so->so_head != NULL) &&
605 ((so->so_state & (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE)) ==
606 (SS_INCOMP|SS_CANTSENDMORE|SS_CANTRCVMORE))))) {
607
608 if (inp->inp_state != INPCB_STATE_DEAD) {
609 /* Become a regular mutex */
610 lck_mtx_convert_spin(&inp->inpcb_mtx);
611#if INET6
612 if (SOCK_CHECK_DOM(so, PF_INET6))
613 in6_pcbdetach(inp);
614 else
615#endif /* INET6 */
616 in_pcbdetach(inp);
617 }
618 VERIFY(so->so_usecount > 0);
619 so->so_usecount--;
620 if (inp->inp_wantcnt == WNT_STOPUSING)
621 active = TRUE;
622 lck_mtx_unlock(&inp->inpcb_mtx);
623 goto out;
624 } else if (inp->inp_wantcnt != WNT_STOPUSING) {
625 lck_mtx_unlock(&inp->inpcb_mtx);
626 active = FALSE;
627 goto out;
628 }
629
630 /*
631 * We get here because the PCB is no longer searchable
632 * (WNT_STOPUSING); detach (if needed) and dispose if it is dead
633 * (usecount is 0). This covers all cases, including overflow
634 * sockets and those that are considered as "embryonic",
635 * i.e. created by sonewconn() in TCP input path, and have
636 * not yet been committed. For the former, we reduce the usecount
637 * to 0 as done by the code above. For the latter, the usecount
	 * would have been reduced to 0 as part of calling soabort() when the
639 * socket is dropped at the end of tcp_input().
640 */
641 if (so->so_usecount == 0) {
642 DTRACE_TCP4(state__change, void, NULL, struct inpcb *, inp,
643 struct tcpcb *, tp, int32_t, TCPS_CLOSED);
644 /* Become a regular mutex */
645 lck_mtx_convert_spin(&inp->inpcb_mtx);
646
647 /*
648 * If this tp still happens to be on the timer list,
649 * take it out
650 */
651 if (TIMER_IS_ON_LIST(tp)) {
652 tcp_remove_timer(tp);
653 }
654
655 if (inp->inp_state != INPCB_STATE_DEAD) {
656#if INET6
657 if (SOCK_CHECK_DOM(so, PF_INET6))
658 in6_pcbdetach(inp);
659 else
660#endif /* INET6 */
661 in_pcbdetach(inp);
662 }
663
664 if (mp_so) {
665 mptcp_subflow_del(tptomptp(tp)->mpt_mpte, tp->t_mpsub);
666
667 /* so is now unlinked from mp_so - let's drop the lock */
668 socket_unlock(mp_so, 1);
669 mp_so = NULL;
670 }
671
672 in_pcbdispose(inp);
673 active = FALSE;
674 goto out;
675 }
676
677 lck_mtx_unlock(&inp->inpcb_mtx);
678 active = TRUE;
679
680out:
681 if (mp_so)
682 socket_unlock(mp_so, 1);
683
684 return (active);
685}
686
687/*
688 * TCP garbage collector callback (inpcb_timer_func_t).
689 *
 * Counts the pcbs that will need to be gc-ed soon (via the inpcbinfo
 * gc request counters); a non-zero count will keep the timer active.
692 */
693void
694tcp_gc(struct inpcbinfo *ipi)
695{
696 struct inpcb *inp, *nxt;
697 struct tcpcb *tw_tp, *tw_ntp;
698#if TCPDEBUG
699 int ostate;
700#endif
701#if KDEBUG
702 static int tws_checked = 0;
703#endif
704
705 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_START, 0, 0, 0, 0, 0);
706
707 /*
708 * Update tcp_now here as it may get used while
709 * processing the slow timer.
710 */
711 calculate_tcp_clock();
712
713 /*
714 * Garbage collect socket/tcpcb: We need to acquire the list lock
715 * exclusively to do this
716 */
717
718 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
719 /* don't sweat it this time; cleanup was done last time */
720 if (tcp_gc_done == TRUE) {
721 tcp_gc_done = FALSE;
722 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END,
723 tws_checked, cur_tw_slot, 0, 0, 0);
724 /* Lock upgrade failed, give up this round */
725 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
726 return;
727 }
728 /* Upgrade failed, lost lock now take it again exclusive */
729 lck_rw_lock_exclusive(ipi->ipi_lock);
730 }
731 tcp_gc_done = TRUE;
732
733 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
734 if (tcp_garbage_collect(inp, 0))
735 atomic_add_32(&ipi->ipi_gc_req.intimer_fast, 1);
736 }
737
738 /* Now cleanup the time wait ones */
739 TAILQ_FOREACH_SAFE(tw_tp, &tcp_tw_tailq, t_twentry, tw_ntp) {
740 /*
741 * We check the timestamp here without holding the
742 * socket lock for better performance. If there are
743 * any pcbs in time-wait, the timer will get rescheduled.
744 * Hence some error in this check can be tolerated.
745 *
746 * Sometimes a socket on time-wait queue can be closed if
747 * 2MSL timer expired but the application still has a
748 * usecount on it.
749 */
750 if (tw_tp->t_state == TCPS_CLOSED ||
751 TSTMP_GEQ(tcp_now, tw_tp->t_timer[TCPT_2MSL])) {
752 if (tcp_garbage_collect(tw_tp->t_inpcb, 1))
753 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
754 }
755 }
756
757 /* take into account pcbs that are still in time_wait_slots */
758 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, ipi->ipi_twcount);
759
760 lck_rw_done(ipi->ipi_lock);
761
762 /* Clean up the socache while we are here */
763 if (so_cache_timer())
764 atomic_add_32(&ipi->ipi_gc_req.intimer_lazy, 1);
765
766 KERNEL_DEBUG(DBG_FNC_TCP_SLOW | DBG_FUNC_END, tws_checked,
767 cur_tw_slot, 0, 0, 0);
768
769 return;
770}
771
772/*
773 * Cancel all timers for TCP tp.
774 */
775void
776tcp_canceltimers(struct tcpcb *tp)
777{
778 int i;
779
780 tcp_remove_timer(tp);
781 for (i = 0; i < TCPT_NTIMERS; i++)
782 tp->t_timer[i] = 0;
783 tp->tentry.timer_start = tcp_now;
784 tp->tentry.index = TCPT_NONE;
785}
786
787int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
788 { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
789
790int tcp_backoff[TCP_MAXRXTSHIFT + 1] =
791 { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };
792
793static int tcp_totbackoff = 511; /* sum of tcp_backoff[] */
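
/*
 * A small sketch, for illustration only, of how the backoff table is used:
 * the retransmit timeout is the base RTO scaled by tcp_backoff[t_rxtshift]
 * (see the TCPT_REXMT case below), so with a hypothetical 1-second base the
 * successive timeouts would be 1, 2, 4, ... seconds, capped at 64. The loop
 * merely confirms that tcp_totbackoff matches the sum of the table.
 */
#if 0 /* illustration only, never compiled */
static int
example_total_backoff(void)
{
	int i, sum = 0;

	for (i = 0; i <= TCP_MAXRXTSHIFT; i++)
		sum += tcp_backoff[i];

	/* 1 + 2 + 4 + 8 + 16 + 32 + 7 * 64 == 511 == tcp_totbackoff */
	return (sum);
}
#endif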
794
795void
796tcp_rexmt_save_state(struct tcpcb *tp)
797{
798 u_int32_t fsize;
799 if (TSTMP_SUPPORTED(tp)) {
800 /*
801 * Since timestamps are supported on the connection,
		 * we can do recovery as described in RFC 4015.
803 */
804 fsize = tp->snd_max - tp->snd_una;
805 tp->snd_ssthresh_prev = max(fsize, tp->snd_ssthresh);
806 tp->snd_recover_prev = tp->snd_recover;
807 } else {
808 /*
809 * Timestamp option is not supported on this connection.
810 * Record ssthresh and cwnd so they can
811 * be recovered if this turns out to be a "bad" retransmit.
812 * A retransmit is considered "bad" if an ACK for this
813 * segment is received within RTT/2 interval; the assumption
814 * here is that the ACK was already in flight. See
815 * "On Estimating End-to-End Network Path Properties" by
816 * Allman and Paxson for more details.
817 */
818 tp->snd_cwnd_prev = tp->snd_cwnd;
819 tp->snd_ssthresh_prev = tp->snd_ssthresh;
820 tp->snd_recover_prev = tp->snd_recover;
821 if (IN_FASTRECOVERY(tp))
822 tp->t_flags |= TF_WASFRECOVERY;
823 else
824 tp->t_flags &= ~TF_WASFRECOVERY;
825 }
826 tp->t_srtt_prev = (tp->t_srtt >> TCP_RTT_SHIFT) + 2;
827 tp->t_rttvar_prev = (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
828 tp->t_flagsext &= ~(TF_RECOMPUTE_RTT);
829}
830
831/*
832 * Revert to the older segment size if there is an indication that PMTU
833 * blackhole detection was not needed.
834 */
835void
836tcp_pmtud_revert_segment_size(struct tcpcb *tp)
837{
838 int32_t optlen;
839
840 VERIFY(tp->t_pmtud_saved_maxopd > 0);
841 tp->t_flags |= TF_PMTUD;
842 tp->t_flags &= ~TF_BLACKHOLE;
843 optlen = tp->t_maxopd - tp->t_maxseg;
844 tp->t_maxopd = tp->t_pmtud_saved_maxopd;
845 tp->t_maxseg = tp->t_maxopd - optlen;
846
847 /*
848 * Reset the slow-start flight size as it
849 * may depend on the new MSS
850 */
851 if (CC_ALGO(tp)->cwnd_init != NULL)
852 CC_ALGO(tp)->cwnd_init(tp);
853 tp->t_pmtud_start_ts = 0;
854 tcpstat.tcps_pmtudbh_reverted++;
855
856 /* change MSS according to recommendation, if there was one */
857 tcp_update_mss_locked(tp->t_inpcb->inp_socket, NULL);
858}
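
/*
 * A worked sketch, for illustration only, of the option-length bookkeeping
 * used when the MSS is clamped for blackhole detection and later reverted
 * by the function above. The numbers (1460-byte maxopd, 12 bytes of TCP
 * options) are hypothetical.
 */
#if 0 /* illustration only, never compiled */
static void
example_blackhole_mss_clamp(void)
{
	int maxopd = 1460, maxseg = 1448;	/* 12 bytes of options */
	int optlen = maxopd - maxseg;		/* == 12 */
	int saved_maxopd = maxopd;

	maxopd = 1200;				/* tcp_pmtud_black_hole_mss */
	maxseg = maxopd - optlen;		/* == 1188 while probing */

	maxopd = saved_maxopd;			/* revert on success */
	maxseg = maxopd - optlen;		/* back to 1448 */
	(void)maxseg;
}
#endif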
859
860/*
861 * TCP timer processing.
862 */
863struct tcpcb *
864tcp_timers(struct tcpcb *tp, int timer)
865{
866 int32_t rexmt, optlen = 0, idle_time = 0;
867 struct socket *so;
868 struct tcptemp *t_template;
869#if TCPDEBUG
870 int ostate;
871#endif
872
873#if INET6
874 int isipv6 = (tp->t_inpcb->inp_vflag & INP_IPV4) == 0;
875#endif /* INET6 */
876 u_int64_t accsleep_ms;
877 u_int32_t last_sleep_ms = 0;
878
879 so = tp->t_inpcb->inp_socket;
880 idle_time = tcp_now - tp->t_rcvtime;
881
882 switch (timer) {
883
884 /*
885 * 2 MSL timeout in shutdown went off. If we're closed but
886 * still waiting for peer to close and connection has been idle
887 * too long, or if 2MSL time is up from TIME_WAIT or FIN_WAIT_2,
888 * delete connection control block.
	 * Otherwise (this case shouldn't happen), check again in a bit;
	 * we keep the socket on the main list in that case.
891 */
892 case TCPT_2MSL:
893 tcp_free_sackholes(tp);
894 if (tp->t_state != TCPS_TIME_WAIT &&
895 tp->t_state != TCPS_FIN_WAIT_2 &&
896 ((idle_time > 0) && (idle_time < TCP_CONN_MAXIDLE(tp)))) {
897 tp->t_timer[TCPT_2MSL] = OFFSET_FROM_START(tp,
898 (u_int32_t)TCP_CONN_KEEPINTVL(tp));
899 } else {
900 tp = tcp_close(tp);
901 return(tp);
902 }
903 break;
904
905 /*
906 * Retransmission timer went off. Message has not
907 * been acked within retransmit interval. Back off
908 * to a longer retransmit interval and retransmit one segment.
909 */
910 case TCPT_REXMT:
911 absolutetime_to_nanoseconds(mach_absolutetime_asleep,
912 &accsleep_ms);
913 accsleep_ms = accsleep_ms / 1000000UL;
914 if (accsleep_ms > tp->t_accsleep_ms)
915 last_sleep_ms = accsleep_ms - tp->t_accsleep_ms;
916 /*
917 * Drop a connection in the retransmit timer
918 * 1. If we have retransmitted more than TCP_MAXRXTSHIFT
919 * times
920 * 2. If the time spent in this retransmission episode is
921 * more than the time limit set with TCP_RXT_CONNDROPTIME
922 * socket option
923 * 3. If TCP_RXT_FINDROP socket option was set and
924 * we have already retransmitted the FIN 3 times without
925 * receiving an ack
926 */
927 if (++tp->t_rxtshift > TCP_MAXRXTSHIFT ||
928 (tp->t_rxt_conndroptime > 0 && tp->t_rxtstart > 0 &&
929 (tcp_now - tp->t_rxtstart) >= tp->t_rxt_conndroptime) ||
930 ((tp->t_flagsext & TF_RXTFINDROP) != 0 &&
931 (tp->t_flags & TF_SENTFIN) != 0 && tp->t_rxtshift >= 4) ||
932 (tp->t_rxtshift > 4 && last_sleep_ms >= TCP_SLEEP_TOO_LONG)) {
933 if (tp->t_state == TCPS_ESTABLISHED &&
934 tp->t_rxt_minimum_timeout > 0) {
935 /*
936 * Avoid dropping a connection if minimum
937 * timeout is set and that time did not
938 * pass. We will retry sending
939 * retransmissions at the maximum interval
940 */
941 if (TSTMP_LT(tcp_now, (tp->t_rxtstart +
942 tp->t_rxt_minimum_timeout))) {
943 tp->t_rxtshift = TCP_MAXRXTSHIFT - 1;
944 goto retransmit_packet;
945 }
946 }
947 if ((tp->t_flagsext & TF_RXTFINDROP) != 0) {
948 tcpstat.tcps_rxtfindrop++;
949 } else if (last_sleep_ms >= TCP_SLEEP_TOO_LONG) {
950 tcpstat.tcps_drop_after_sleep++;
951 } else {
952 tcpstat.tcps_timeoutdrop++;
953 }
954 if (tp->t_rxtshift >= TCP_MAXRXTSHIFT) {
955 if (TCP_ECN_ENABLED(tp)) {
956 INP_INC_IFNET_STAT(tp->t_inpcb,
957 ecn_on.rxmit_drop);
958 } else {
959 INP_INC_IFNET_STAT(tp->t_inpcb,
960 ecn_off.rxmit_drop);
961 }
962 }
963 tp->t_rxtshift = TCP_MAXRXTSHIFT;
964 postevent(so, 0, EV_TIMEOUT);
965 soevent(so,
966 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
967
968 if (TCP_ECN_ENABLED(tp) &&
969 tp->t_state == TCPS_ESTABLISHED)
970 tcp_heuristic_ecn_droprxmt(tp);
971
972 tp = tcp_drop(tp, tp->t_softerror ?
973 tp->t_softerror : ETIMEDOUT);
974
975 break;
976 }
977retransmit_packet:
978 tcpstat.tcps_rexmttimeo++;
979 tp->t_accsleep_ms = accsleep_ms;
980
981 if (tp->t_rxtshift == 1 &&
982 tp->t_state == TCPS_ESTABLISHED) {
983 /* Set the time at which retransmission started. */
984 tp->t_rxtstart = tcp_now;
985
986 /*
987 * if this is the first retransmit timeout, save
988 * the state so that we can recover if the timeout
989 * is spurious.
990 */
991 tcp_rexmt_save_state(tp);
992 }
993#if MPTCP
994 if ((tp->t_rxtshift >= mptcp_fail_thresh) &&
995 (tp->t_state == TCPS_ESTABLISHED) &&
996 (tp->t_mpflags & TMPF_MPTCP_TRUE))
997 mptcp_act_on_txfail(so);
998
999 if (so->so_flags & SOF_MP_SUBFLOW) {
1000 struct mptses *mpte = tptomptp(tp)->mpt_mpte;
1001
1002 mptcp_check_subflows_and_add(mpte);
1003 }
1004#endif /* MPTCP */
1005
1006 if (tp->t_adaptive_wtimo > 0 &&
1007 tp->t_rxtshift > tp->t_adaptive_wtimo &&
1008 TCPS_HAVEESTABLISHED(tp->t_state)) {
1009 /* Send an event to the application */
1010 soevent(so,
1011 (SO_FILT_HINT_LOCKED|
1012 SO_FILT_HINT_ADAPTIVE_WTIMO));
1013 }
1014
1015 /*
1016 * If this is a retransmit timeout after PTO, the PTO
1017 * was not effective
1018 */
1019 if (tp->t_flagsext & TF_SENT_TLPROBE) {
1020 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1021 tcpstat.tcps_rto_after_pto++;
1022 }
1023
1024 if (tp->t_flagsext & TF_DELAY_RECOVERY) {
1025 /*
1026 * Retransmit timer fired before entering recovery
1027 * on a connection with packet re-ordering. This
1028 * suggests that the reordering metrics computed
1029 * are not accurate.
1030 */
1031 tp->t_reorderwin = 0;
1032 tp->t_timer[TCPT_DELAYFR] = 0;
1033 tp->t_flagsext &= ~(TF_DELAY_RECOVERY);
1034 }
1035
1036 if (tp->t_state == TCPS_SYN_RECEIVED)
1037 tcp_disable_tfo(tp);
1038
1039 if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
1040 (tp->t_tfo_stats & TFO_S_SYN_DATA_SENT) &&
1041 !(tp->t_tfo_flags & TFO_F_NO_SNDPROBING) &&
1042 ((tp->t_state != TCPS_SYN_SENT && tp->t_rxtshift > 1) ||
1043 tp->t_rxtshift > 4)) {
1044 /*
			 * For regular retransmissions, the first one is sent as a
			 * tail-loss probe. Thus, if rxtshift > 1, we have already
			 * sent the segment a total of 3 times.
			 *
			 * In SYN-SENT state there is no tail-loss probe, so we have
			 * to let rxtshift go up to 3.
1052 */
1053 tcp_heuristic_tfo_middlebox(tp);
1054
1055 so->so_error = ENODATA;
1056 sorwakeup(so);
1057 sowwakeup(so);
1058
1059 tp->t_tfo_stats |= TFO_S_SEND_BLACKHOLE;
1060 tcpstat.tcps_tfo_sndblackhole++;
1061 }
1062
1063 if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
1064 (tp->t_tfo_stats & TFO_S_SYN_DATA_ACKED) &&
1065 tp->t_rxtshift > 3) {
1066 if (TSTMP_GT(tp->t_sndtime - 10 * TCP_RETRANSHZ, tp->t_rcvtime)) {
1067 tcp_heuristic_tfo_middlebox(tp);
1068
1069 so->so_error = ENODATA;
1070 sorwakeup(so);
1071 sowwakeup(so);
1072 }
1073 }
1074
1075 if (tp->t_state == TCPS_SYN_SENT) {
1076 rexmt = TCP_REXMTVAL(tp) * tcp_syn_backoff[tp->t_rxtshift];
1077 tp->t_stat.synrxtshift = tp->t_rxtshift;
1078
1079 /* When retransmitting, disable TFO */
1080 if (tfo_enabled(tp) &&
1081 (!(so->so_flags1 & SOF1_DATA_AUTHENTICATED) ||
1082 (tp->t_flagsext & TF_FASTOPEN_HEUR))) {
1083 tp->t_flagsext &= ~TF_FASTOPEN;
1084 tp->t_tfo_flags |= TFO_F_SYN_LOSS;
1085 }
1086 } else {
1087 rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
1088 }
1089
1090 TCPT_RANGESET(tp->t_rxtcur, rexmt, tp->t_rttmin, TCPTV_REXMTMAX,
1091 TCP_ADD_REXMTSLOP(tp));
1092 tp->t_timer[TCPT_REXMT] = OFFSET_FROM_START(tp, tp->t_rxtcur);
1093
1094 if (INP_WAIT_FOR_IF_FEEDBACK(tp->t_inpcb))
1095 goto fc_output;
1096
1097 tcp_free_sackholes(tp);
1098 /*
1099 * Check for potential Path MTU Discovery Black Hole
1100 */
1101 if (tcp_pmtud_black_hole_detect &&
1102 !(tp->t_flagsext & TF_NOBLACKHOLE_DETECTION) &&
1103 (tp->t_state == TCPS_ESTABLISHED)) {
1104 if ((tp->t_flags & TF_PMTUD) &&
1105 ((tp->t_flags & TF_MAXSEGSNT)
1106 || tp->t_pmtud_lastseg_size > tcp_pmtud_black_hole_mss) &&
1107 tp->t_rxtshift == 2) {
1108 /*
1109 * Enter Path MTU Black-hole Detection mechanism:
1110 * - Disable Path MTU Discovery (IP "DF" bit).
1111 * - Reduce MTU to lower value than what we
1112 * negotiated with the peer.
1113 */
1114 /* Disable Path MTU Discovery for now */
1115 tp->t_flags &= ~TF_PMTUD;
1116 /* Record that we may have found a black hole */
1117 tp->t_flags |= TF_BLACKHOLE;
1118 optlen = tp->t_maxopd - tp->t_maxseg;
1119 /* Keep track of previous MSS */
1120 tp->t_pmtud_saved_maxopd = tp->t_maxopd;
1121 tp->t_pmtud_start_ts = tcp_now;
1122 if (tp->t_pmtud_start_ts == 0)
1123 tp->t_pmtud_start_ts++;
1124 /* Reduce the MSS to intermediary value */
1125 if (tp->t_maxopd > tcp_pmtud_black_hole_mss) {
1126 tp->t_maxopd = tcp_pmtud_black_hole_mss;
1127 } else {
1128 tp->t_maxopd = /* use the default MSS */
1129#if INET6
1130 isipv6 ? tcp_v6mssdflt :
1131#endif /* INET6 */
1132 tcp_mssdflt;
1133 }
1134 tp->t_maxseg = tp->t_maxopd - optlen;
1135
1136 /*
1137 * Reset the slow-start flight size
1138 * as it may depend on the new MSS
1139 */
1140 if (CC_ALGO(tp)->cwnd_init != NULL)
1141 CC_ALGO(tp)->cwnd_init(tp);
1142 tp->snd_cwnd = tp->t_maxseg;
1143 }
1144 /*
1145 * If further retransmissions are still
1146 * unsuccessful with a lowered MTU, maybe this
1147 * isn't a Black Hole and we restore the previous
1148 * MSS and blackhole detection flags.
1149 */
1150 else {
1151
1152 if ((tp->t_flags & TF_BLACKHOLE) &&
1153 (tp->t_rxtshift > 4)) {
1154 tcp_pmtud_revert_segment_size(tp);
1155 tp->snd_cwnd = tp->t_maxseg;
1156 }
1157 }
1158 }
1159
1160
1161 /*
		 * Disable RFC 1323 and RFC 1644 if we haven't got any
1163 * response to our SYN (after we reach the threshold)
1164 * to work-around some broken terminal servers (most of
1165 * which have hopefully been retired) that have bad VJ
1166 * header compression code which trashes TCP segments
1167 * containing unknown-to-them TCP options.
1168 * Do this only on non-local connections.
1169 */
1170 if (tp->t_state == TCPS_SYN_SENT &&
1171 tp->t_rxtshift == tcp_broken_peer_syn_rxmit_thres)
1172 tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_REQ_CC);
1173
1174 /*
1175 * If losing, let the lower level know and try for
1176 * a better route. Also, if we backed off this far,
1177 * our srtt estimate is probably bogus. Clobber it
1178 * so we'll take the next rtt measurement as our srtt;
1179 * move the current srtt into rttvar to keep the current
1180 * retransmit times until then.
1181 */
1182 if (tp->t_rxtshift > TCP_MAXRXTSHIFT / 4) {
1183#if INET6
1184 if (isipv6)
1185 in6_losing(tp->t_inpcb);
1186 else
1187#endif /* INET6 */
1188 in_losing(tp->t_inpcb);
1189 tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT);
1190 tp->t_srtt = 0;
1191 }
1192 tp->snd_nxt = tp->snd_una;
1193 /*
1194 * Note: We overload snd_recover to function also as the
1195 * snd_last variable described in RFC 2582
1196 */
1197 tp->snd_recover = tp->snd_max;
1198 /*
1199 * Force a segment to be sent.
1200 */
1201 tp->t_flags |= TF_ACKNOW;
1202
1203 /* If timing a segment in this window, stop the timer */
1204 tp->t_rtttime = 0;
1205
1206 if (!IN_FASTRECOVERY(tp) && tp->t_rxtshift == 1)
1207 tcpstat.tcps_tailloss_rto++;
1208
1209
1210 /*
1211 * RFC 5681 says: when a TCP sender detects segment loss
1212 * using retransmit timer and the given segment has already
1213 * been retransmitted by way of the retransmission timer at
1214 * least once, the value of ssthresh is held constant
1215 */
1216 if (tp->t_rxtshift == 1 &&
1217 CC_ALGO(tp)->after_timeout != NULL) {
1218 CC_ALGO(tp)->after_timeout(tp);
1219 /*
1220 * CWR notifications are to be sent on new data
1221 * right after Fast Retransmits and ECE
1222 * notification receipts.
1223 */
1224 if (TCP_ECN_ENABLED(tp))
1225 tp->ecn_flags |= TE_SENDCWR;
1226 }
1227
1228 EXIT_FASTRECOVERY(tp);
1229
1230 /* Exit cwnd non validated phase */
1231 tp->t_flagsext &= ~TF_CWND_NONVALIDATED;
1232
1233
1234fc_output:
1235 tcp_ccdbg_trace(tp, NULL, TCP_CC_REXMT_TIMEOUT);
1236
1237 (void) tcp_output(tp);
1238 break;
1239
1240 /*
	 * Persistence timer into zero window.
1242 * Force a byte to be output, if possible.
1243 */
1244 case TCPT_PERSIST:
1245 tcpstat.tcps_persisttimeo++;
1246 /*
1247 * Hack: if the peer is dead/unreachable, we do not
1248 * time out if the window is closed. After a full
1249 * backoff, drop the connection if the idle time
1250 * (no responses to probes) reaches the maximum
1251 * backoff that we would use if retransmitting.
1252 *
1253 * Drop the connection if we reached the maximum allowed time for
1254 * Zero Window Probes without a non-zero update from the peer.
1255 * See rdar://5805356
1256 */
1257 if ((tp->t_rxtshift == TCP_MAXRXTSHIFT &&
1258 (idle_time >= tcp_maxpersistidle ||
1259 idle_time >= TCP_REXMTVAL(tp) * tcp_totbackoff)) ||
1260 ((tp->t_persist_stop != 0) &&
1261 TSTMP_LEQ(tp->t_persist_stop, tcp_now))) {
1262 tcpstat.tcps_persistdrop++;
1263 postevent(so, 0, EV_TIMEOUT);
1264 soevent(so,
1265 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
1266 tp = tcp_drop(tp, ETIMEDOUT);
1267 break;
1268 }
1269 tcp_setpersist(tp);
1270 tp->t_flagsext |= TF_FORCE;
1271 (void) tcp_output(tp);
1272 tp->t_flagsext &= ~TF_FORCE;
1273 break;
1274
1275 /*
1276 * Keep-alive timer went off; send something
1277 * or drop connection if idle for too long.
1278 */
1279 case TCPT_KEEP:
1280 tcpstat.tcps_keeptimeo++;
1281#if MPTCP
1282 /*
		 * Regular TCP connections do not send keepalives after closing.
		 * MPTCP must not do so either once it has sent Data FINs.
1285 */
1286 struct mptcb *mp_tp = tptomptp(tp);
1287 if ((tp->t_mpflags & TMPF_MPTCP_TRUE) &&
1288 (tp->t_state > TCPS_ESTABLISHED)) {
1289 goto dropit;
1290 } else if (mp_tp != NULL) {
1291 if ((mptcp_ok_to_keepalive(mp_tp) == 0))
1292 goto dropit;
1293 }
1294#endif /* MPTCP */
1295 if (tp->t_state < TCPS_ESTABLISHED)
1296 goto dropit;
1297 if ((always_keepalive ||
1298 (tp->t_inpcb->inp_socket->so_options & SO_KEEPALIVE) ||
1299 (tp->t_flagsext & TF_DETECT_READSTALL) ||
1300 (tp->t_tfo_probe_state == TFO_PROBE_PROBING)) &&
1301 (tp->t_state <= TCPS_CLOSING || tp->t_state == TCPS_FIN_WAIT_2)) {
1302 if (idle_time >= TCP_CONN_KEEPIDLE(tp) + TCP_CONN_MAXIDLE(tp))
1303 goto dropit;
1304 /*
1305 * Send a packet designed to force a response
1306 * if the peer is up and reachable:
1307 * either an ACK if the connection is still alive,
1308 * or an RST if the peer has closed the connection
1309 * due to timeout or reboot.
1310 * Using sequence number tp->snd_una-1
1311 * causes the transmitted zero-length segment
1312 * to lie outside the receive window;
1313 * by the protocol spec, this requires the
1314 * correspondent TCP to respond.
1315 */
1316 tcpstat.tcps_keepprobe++;
1317 t_template = tcp_maketemplate(tp);
1318 if (t_template) {
1319 struct inpcb *inp = tp->t_inpcb;
1320 struct tcp_respond_args tra;
1321
1322 bzero(&tra, sizeof(tra));
1323 tra.nocell = INP_NO_CELLULAR(inp);
1324 tra.noexpensive = INP_NO_EXPENSIVE(inp);
1325 tra.awdl_unrestricted = INP_AWDL_UNRESTRICTED(inp);
1326 tra.intcoproc_allowed = INP_INTCOPROC_ALLOWED(inp);
1327 if (tp->t_inpcb->inp_flags & INP_BOUND_IF)
1328 tra.ifscope = tp->t_inpcb->inp_boundifp->if_index;
1329 else
1330 tra.ifscope = IFSCOPE_NONE;
1331 tcp_respond(tp, t_template->tt_ipgen,
1332 &t_template->tt_t, (struct mbuf *)NULL,
1333 tp->rcv_nxt, tp->snd_una - 1, 0, &tra);
1334 (void) m_free(dtom(t_template));
1335 if (tp->t_flagsext & TF_DETECT_READSTALL)
1336 tp->t_rtimo_probes++;
1337 }
1338 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1339 TCP_CONN_KEEPINTVL(tp));
1340 } else {
1341 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
1342 TCP_CONN_KEEPIDLE(tp));
1343 }
1344 if (tp->t_flagsext & TF_DETECT_READSTALL) {
1345 struct ifnet *outifp = tp->t_inpcb->inp_last_outifp;
1346 bool reenable_probe = false;
1347 /*
1348 * The keep alive packets sent to detect a read
1349 * stall did not get a response from the
1350 * peer. Generate more keep-alives to confirm this.
1351 * If the number of probes sent reaches the limit,
1352 * generate an event.
1353 */
1354 if (tp->t_adaptive_rtimo > 0) {
1355 if (tp->t_rtimo_probes > tp->t_adaptive_rtimo) {
1356 /* Generate an event */
1357 soevent(so,
1358 (SO_FILT_HINT_LOCKED |
1359 SO_FILT_HINT_ADAPTIVE_RTIMO));
1360 tcp_keepalive_reset(tp);
1361 } else {
1362 reenable_probe = true;
1363 }
1364 } else if (outifp != NULL &&
1365 (outifp->if_eflags & IFEF_PROBE_CONNECTIVITY) &&
1366 tp->t_rtimo_probes <= TCP_CONNECTIVITY_PROBES_MAX) {
1367 reenable_probe = true;
1368 } else {
1369 tp->t_flagsext &= ~TF_DETECT_READSTALL;
1370 }
1371 if (reenable_probe) {
1372 int ind = min(tp->t_rtimo_probes,
1373 TCP_MAXRXTSHIFT);
1374 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(
1375 tp, tcp_backoff[ind] * TCP_REXMTVAL(tp));
1376 }
1377 }
1378 if (tp->t_tfo_probe_state == TFO_PROBE_PROBING) {
1379 int ind;
1380
1381 tp->t_tfo_probes++;
1382 ind = min(tp->t_tfo_probes, TCP_MAXRXTSHIFT);
1383
1384 /*
			 * We take the minimum of the time set by regular
			 * keepalive (see above) and the backed-off RTO. That
			 * way we back off in case of packet loss but will never
			 * time out more slowly than regular keepalive despite
			 * the backing off.
1390 */
1391 tp->t_timer[TCPT_KEEP] = min(OFFSET_FROM_START(
1392 tp, tcp_backoff[ind] * TCP_REXMTVAL(tp)),
1393 tp->t_timer[TCPT_KEEP]);
1394 } else if (!(tp->t_tfo_flags & TFO_F_HEURISTIC_DONE) &&
1395 tp->t_tfo_probe_state == TFO_PROBE_WAIT_DATA) {
1396 /* Still no data! Let's assume a TFO-error and err out... */
1397 tcp_heuristic_tfo_middlebox(tp);
1398
1399 so->so_error = ENODATA;
1400 sorwakeup(so);
1401 tp->t_tfo_stats |= TFO_S_RECV_BLACKHOLE;
1402 tcpstat.tcps_tfo_blackhole++;
1403 }
1404 break;
1405 case TCPT_DELACK:
1406 if (tcp_delack_enabled && (tp->t_flags & TF_DELACK)) {
1407 tp->t_flags &= ~TF_DELACK;
1408 tp->t_timer[TCPT_DELACK] = 0;
1409 tp->t_flags |= TF_ACKNOW;
1410
1411 /*
			 * If the delayed ACK timer fired while stretching
			 * ACKs, count the number of times the streaming
			 * detection was not correct. If this exceeds a
			 * threshold, disable stretch ACK on this
			 * connection.
1417 *
1418 * Also, go back to acking every other packet.
1419 */
1420 if ((tp->t_flags & TF_STRETCHACK)) {
1421 if (tp->t_unacksegs > 1 &&
1422 tp->t_unacksegs < maxseg_unacked)
1423 tp->t_stretchack_delayed++;
1424
1425 if (tp->t_stretchack_delayed >
1426 TCP_STRETCHACK_DELAY_THRESHOLD) {
1427 tp->t_flagsext |= TF_DISABLE_STRETCHACK;
1428 /*
1429 * Note the time at which stretch
1430 * ack was disabled automatically
1431 */
1432 tp->rcv_nostrack_ts = tcp_now;
1433 tcpstat.tcps_nostretchack++;
1434 tp->t_stretchack_delayed = 0;
1435 tp->rcv_nostrack_pkts = 0;
1436 }
1437 tcp_reset_stretch_ack(tp);
1438 }
1439
1440 /*
1441 * If we are measuring inter packet arrival jitter
1442 * for throttling a connection, this delayed ack
1443 * might be the reason for accumulating some
1444 * jitter. So let's restart the measurement.
1445 */
1446 CLEAR_IAJ_STATE(tp);
1447
1448 tcpstat.tcps_delack++;
1449 (void) tcp_output(tp);
1450 }
1451 break;
1452
1453#if MPTCP
1454 case TCPT_JACK_RXMT:
1455 if ((tp->t_state == TCPS_ESTABLISHED) &&
1456 (tp->t_mpflags & TMPF_PREESTABLISHED) &&
1457 (tp->t_mpflags & TMPF_JOINED_FLOW)) {
1458 if (++tp->t_mprxtshift > TCP_MAXRXTSHIFT) {
1459 tcpstat.tcps_timeoutdrop++;
1460 postevent(so, 0, EV_TIMEOUT);
1461 soevent(so,
1462 (SO_FILT_HINT_LOCKED|
1463 SO_FILT_HINT_TIMEOUT));
1464 tp = tcp_drop(tp, tp->t_softerror ?
1465 tp->t_softerror : ETIMEDOUT);
1466 break;
1467 }
1468 tcpstat.tcps_join_rxmts++;
1469 tp->t_mpflags |= TMPF_SND_JACK;
1470 tp->t_flags |= TF_ACKNOW;
1471
1472 /*
			 * For simplicity, no backoff is implemented for this
			 * corner case.
1475 */
1476 (void) tcp_output(tp);
1477 }
1478 break;
1479#endif /* MPTCP */
1480
1481 case TCPT_PTO:
1482 {
1483 int32_t snd_len;
1484 tp->t_flagsext &= ~(TF_SENT_TLPROBE);
1485
1486 /*
1487 * Check if the connection is in the right state to
1488 * send a probe
1489 */
1490 if (tp->t_state != TCPS_ESTABLISHED ||
1491 (tp->t_rxtshift > 0 && !(tp->t_flagsext & TF_PROBING)) ||
1492 tp->snd_max == tp->snd_una ||
1493 !SACK_ENABLED(tp) ||
1494 !TAILQ_EMPTY(&tp->snd_holes) ||
1495 IN_FASTRECOVERY(tp))
1496 break;
1497
1498 /*
1499 * If there is no new data to send or if the
1500 * connection is limited by receive window then
1501 * retransmit the last segment, otherwise send
1502 * new data.
1503 */
1504 snd_len = min(so->so_snd.sb_cc, tp->snd_wnd)
1505 - (tp->snd_max - tp->snd_una);
1506 if (snd_len > 0) {
1507 tp->snd_nxt = tp->snd_max;
1508 } else {
1509 snd_len = min((tp->snd_max - tp->snd_una),
1510 tp->t_maxseg);
1511 tp->snd_nxt = tp->snd_max - snd_len;
1512 }
1513
1514 tcpstat.tcps_pto++;
1515 if (tp->t_flagsext & TF_PROBING)
1516 tcpstat.tcps_probe_if++;
1517
1518 /* If timing a segment in this window, stop the timer */
1519 tp->t_rtttime = 0;
1520 /* Note that tail loss probe is being sent */
1521 tp->t_flagsext |= TF_SENT_TLPROBE;
1522 tp->t_tlpstart = tcp_now;
1523
1524 tp->snd_cwnd += tp->t_maxseg;
1525
1526 /*
1527 * When tail-loss-probe fires, we reset the RTO timer, because
1528 * a probe just got sent, so we are good to push out the timer.
1529 *
1530 * Set to 0 to ensure that tcp_output() will reschedule it
1531 */
1532 tp->t_timer[TCPT_REXMT] = 0;
1533
		(void) tcp_output(tp);
1535 tp->snd_cwnd -= tp->t_maxseg;
1536
1537 tp->t_tlphighrxt = tp->snd_nxt;
1538 break;
1539 }
1540 case TCPT_DELAYFR:
1541 tp->t_flagsext &= ~TF_DELAY_RECOVERY;
1542
1543 /*
1544 * Don't do anything if one of the following is true:
1545 * - the connection is already in recovery
1546 * - sequence until snd_recover has been acknowledged.
1547 * - retransmit timeout has fired
1548 */
1549 if (IN_FASTRECOVERY(tp) ||
1550 SEQ_GEQ(tp->snd_una, tp->snd_recover) ||
1551 tp->t_rxtshift > 0)
1552 break;
1553
1554 VERIFY(SACK_ENABLED(tp));
1555 tcp_rexmt_save_state(tp);
1556 if (CC_ALGO(tp)->pre_fr != NULL) {
1557 CC_ALGO(tp)->pre_fr(tp);
1558 if (TCP_ECN_ENABLED(tp))
1559 tp->ecn_flags |= TE_SENDCWR;
1560 }
1561 ENTER_FASTRECOVERY(tp);
1562
1563 tp->t_timer[TCPT_REXMT] = 0;
1564 tcpstat.tcps_sack_recovery_episode++;
1565 tp->t_sack_recovery_episode++;
1566 tp->sack_newdata = tp->snd_nxt;
1567 tp->snd_cwnd = tp->t_maxseg;
1568 tcp_ccdbg_trace(tp, NULL, TCP_CC_ENTER_FASTRECOVERY);
1569 (void) tcp_output(tp);
1570 break;
1571 dropit:
1572 tcpstat.tcps_keepdrops++;
1573 postevent(so, 0, EV_TIMEOUT);
1574 soevent(so,
1575 (SO_FILT_HINT_LOCKED|SO_FILT_HINT_TIMEOUT));
1576 tp = tcp_drop(tp, ETIMEDOUT);
1577 break;
1578 }
1579#if TCPDEBUG
1580 if (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)
1581 tcp_trace(TA_USER, ostate, tp, (void *)0, (struct tcphdr *)0,
1582 PRU_SLOWTIMO);
1583#endif
1584 return (tp);
1585}
1586
1587/* Remove a timer entry from timer list */
1588void
1589tcp_remove_timer(struct tcpcb *tp)
1590{
1591 struct tcptimerlist *listp = &tcp_timer_list;
1592
1593 socket_lock_assert_owned(tp->t_inpcb->inp_socket);
1594 if (!(TIMER_IS_ON_LIST(tp))) {
1595 return;
1596 }
1597 lck_mtx_lock(listp->mtx);
1598
1599 /* Check if pcb is on timer list again after acquiring the lock */
1600 if (!(TIMER_IS_ON_LIST(tp))) {
1601 lck_mtx_unlock(listp->mtx);
1602 return;
1603 }
1604
1605 if (listp->next_te != NULL && listp->next_te == &tp->tentry)
1606 listp->next_te = LIST_NEXT(&tp->tentry, le);
1607
1608 LIST_REMOVE(&tp->tentry, le);
1609 tp->t_flags &= ~(TF_TIMER_ONLIST);
1610
1611 listp->entries--;
1612
1613 tp->tentry.le.le_next = NULL;
1614 tp->tentry.le.le_prev = NULL;
1615 lck_mtx_unlock(listp->mtx);
1616}
1617
1618/*
1619 * Function to check if the timerlist needs to be rescheduled to run
1620 * the timer entry correctly. Basically, this is to check if we can avoid
1621 * taking the list lock.
1622 */
1623
1624static boolean_t
1625need_to_resched_timerlist(u_int32_t runtime, u_int16_t mode)
1626{
1627 struct tcptimerlist *listp = &tcp_timer_list;
1628 int32_t diff;
1629
1630 /*
1631 * If the list is being processed then the state of the list is
1632 * in flux. In this case always acquire the lock and set the state
1633 * correctly.
1634 */
1635 if (listp->running)
1636 return (TRUE);
1637
1638 if (!listp->scheduled)
1639 return (TRUE);
1640
1641 diff = timer_diff(listp->runtime, 0, runtime, 0);
1642 if (diff <= 0) {
1643 /* The list is going to run before this timer */
1644 return (FALSE);
1645 } else {
1646 if (mode & TCP_TIMERLIST_10MS_MODE) {
1647 if (diff <= TCP_TIMER_10MS_QUANTUM)
1648 return (FALSE);
1649 } else if (mode & TCP_TIMERLIST_100MS_MODE) {
1650 if (diff <= TCP_TIMER_100MS_QUANTUM)
1651 return (FALSE);
1652 } else {
1653 if (diff <= TCP_TIMER_500MS_QUANTUM)
1654 return (FALSE);
1655 }
1656 }
1657 return (TRUE);
1658}
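
/*
 * A condensed sketch, for illustration only, of the reschedule rule that
 * need_to_resched_timerlist() implements: a new timer only forces a
 * reschedule when it would fire earlier than the currently scheduled run
 * by more than that timer's quantum (10, 100 or 500 ms). Names below are
 * hypothetical.
 */
#if 0 /* illustration only, never compiled */
static boolean_t
example_resched_rule(uint32_t scheduled_run, uint32_t new_runtime,
    int32_t quantum)
{
	int32_t diff = timer_diff(scheduled_run, 0, new_runtime, 0);

	if (diff <= 0)
		return (FALSE);		/* the list runs first anyway */
	return (diff > quantum);	/* else piggy-back on that run */
}
#endif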
1659
1660void
1661tcp_sched_timerlist(uint32_t offset)
1662{
1663 uint64_t deadline = 0;
1664 struct tcptimerlist *listp = &tcp_timer_list;
1665
1666 LCK_MTX_ASSERT(listp->mtx, LCK_MTX_ASSERT_OWNED);
1667
1668 offset = min(offset, TCP_TIMERLIST_MAX_OFFSET);
1669 listp->runtime = tcp_now + offset;
1670 listp->schedtime = tcp_now;
1671 if (listp->runtime == 0) {
1672 listp->runtime++;
1673 offset++;
1674 }
1675
1676 clock_interval_to_deadline(offset, USEC_PER_SEC, &deadline);
1677
1678 thread_call_enter_delayed(listp->call, deadline);
1679 listp->scheduled = TRUE;
1680}
1681
1682/*
1683 * Function to run the timers for a connection.
1684 *
1685 * Returns the offset of next timer to be run for this connection which
1686 * can be used to reschedule the timerlist.
1687 *
1688 * te_mode is an out parameter that indicates the modes of active
1689 * timers for this connection.
1690 */
1691u_int32_t
1692tcp_run_conn_timer(struct tcpcb *tp, u_int16_t *te_mode,
1693 u_int16_t probe_if_index)
1694{
1695 struct socket *so;
1696 u_int16_t i = 0, index = TCPT_NONE, lo_index = TCPT_NONE;
1697 u_int32_t timer_val, offset = 0, lo_timer = 0;
1698 int32_t diff;
1699 boolean_t needtorun[TCPT_NTIMERS];
1700 int count = 0;
1701
1702 VERIFY(tp != NULL);
1703 bzero(needtorun, sizeof(needtorun));
1704 *te_mode = 0;
1705
1706 socket_lock(tp->t_inpcb->inp_socket, 1);
1707
1708 so = tp->t_inpcb->inp_socket;
1709 /* Release the want count on inp */
1710 if (in_pcb_checkstate(tp->t_inpcb, WNT_RELEASE, 1)
1711 == WNT_STOPUSING) {
1712 if (TIMER_IS_ON_LIST(tp)) {
1713 tcp_remove_timer(tp);
1714 }
1715
1716 /* Looks like the TCP connection got closed while we
		 * were waiting for the lock. Done.
1718 */
1719 goto done;
1720 }
1721
1722 /*
1723 * If this connection is over an interface that needs to
1724 * be probed, send probe packets to reinitiate communication.
1725 */
1726 if (probe_if_index > 0 && tp->t_inpcb->inp_last_outifp != NULL &&
1727 tp->t_inpcb->inp_last_outifp->if_index == probe_if_index) {
1728 tp->t_flagsext |= TF_PROBING;
1729 tcp_timers(tp, TCPT_PTO);
1730 tp->t_timer[TCPT_PTO] = 0;
1731 tp->t_flagsext &= ~TF_PROBING;
1732 }
1733
1734 /*
1735 * Since the timer thread needs to wait for tcp lock, it may race
1736 * with another thread that can cancel or reschedule the timer
1737 * that is about to run. Check if we need to run anything.
1738 */
1739 if ((index = tp->tentry.index) == TCPT_NONE)
1740 goto done;
1741
1742 timer_val = tp->t_timer[index];
1743
1744 diff = timer_diff(tp->tentry.runtime, 0, tcp_now, 0);
1745 if (diff > 0) {
1746 if (tp->tentry.index != TCPT_NONE) {
1747 offset = diff;
1748 *(te_mode) = tp->tentry.mode;
1749 }
1750 goto done;
1751 }
1752
1753 tp->t_timer[index] = 0;
1754 if (timer_val > 0) {
1755 tp = tcp_timers(tp, index);
1756 if (tp == NULL)
1757 goto done;
1758 }
1759
1760 /*
1761 * Check if there are any other timers that need to be run.
1762 * While doing it, adjust the timer values wrt tcp_now.
1763 */
1764 tp->tentry.mode = 0;
1765 for (i = 0; i < TCPT_NTIMERS; ++i) {
1766 if (tp->t_timer[i] != 0) {
1767 diff = timer_diff(tp->tentry.timer_start,
1768 tp->t_timer[i], tcp_now, 0);
1769 if (diff <= 0) {
1770 needtorun[i] = TRUE;
1771 count++;
1772 } else {
1773 tp->t_timer[i] = diff;
1774 needtorun[i] = FALSE;
1775 if (lo_timer == 0 || diff < lo_timer) {
1776 lo_timer = diff;
1777 lo_index = i;
1778 }
1779 TCP_SET_TIMER_MODE(tp->tentry.mode, i);
1780 }
1781 }
1782 }
1783
1784 tp->tentry.timer_start = tcp_now;
1785 tp->tentry.index = lo_index;
1786 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
1787
1788 if (tp->tentry.index != TCPT_NONE) {
1789 tp->tentry.runtime = tp->tentry.timer_start +
1790 tp->t_timer[tp->tentry.index];
1791 if (tp->tentry.runtime == 0)
1792 tp->tentry.runtime++;
1793 }
1794
1795 if (count > 0) {
1796 /* run any other timers outstanding at this time. */
1797 for (i = 0; i < TCPT_NTIMERS; ++i) {
1798 if (needtorun[i]) {
1799 tp->t_timer[i] = 0;
1800 tp = tcp_timers(tp, i);
1801 if (tp == NULL) {
1802 offset = 0;
1803 *(te_mode) = 0;
1804 goto done;
1805 }
1806 }
1807 }
1808 tcp_set_lotimer_index(tp);
1809 }
1810
1811 if (tp->tentry.index < TCPT_NONE) {
1812 offset = tp->t_timer[tp->tentry.index];
1813 *(te_mode) = tp->tentry.mode;
1814 }
1815
1816done:
1817 if (tp != NULL && tp->tentry.index == TCPT_NONE) {
1818 tcp_remove_timer(tp);
1819 offset = 0;
1820 }
1821
1822 socket_unlock(so, 1);
1823	return (offset);
1824}
1825
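/*
 * Thread-call callback that walks the global timer list. For every
 * entry that is due, run the connection's timers via
 * tcp_run_conn_timer() and collect the earliest offset and the modes
 * of the remaining timers, so that the list can be rescheduled with
 * the appropriate quantum (10ms, 100ms or 500ms).
 */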
1826void
1827tcp_run_timerlist(void * arg1, void * arg2)
1828{
1829#pragma unused(arg1, arg2)
1830 struct tcptimerentry *te, *next_te;
1831 struct tcptimerlist *listp = &tcp_timer_list;
1832 struct tcpcb *tp;
1833 uint32_t next_timer = 0; /* offset of the next timer on the list */
1834 u_int16_t te_mode = 0; /* modes of all active timers in a tcpcb */
1835	u_int16_t list_mode = 0; /* union of the modes of all tcpcbs */
1836 uint32_t active_count = 0;
1837
1838 calculate_tcp_clock();
1839
1840 lck_mtx_lock(listp->mtx);
1841
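	/*
	 * Account for how late this run is relative to the time the list
	 * was scheduled for; the buckets below feed the
	 * tcps_timer_drift_* statistics.
	 */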
1842 int32_t drift = tcp_now - listp->runtime;
1843 if (drift <= 1) {
1844 tcpstat.tcps_timer_drift_le_1_ms++;
1845 } else if (drift <= 10) {
1846 tcpstat.tcps_timer_drift_le_10_ms++;
1847 } else if (drift <= 20) {
1848 tcpstat.tcps_timer_drift_le_20_ms++;
1849 } else if (drift <= 50) {
1850 tcpstat.tcps_timer_drift_le_50_ms++;
1851 } else if (drift <= 100) {
1852 tcpstat.tcps_timer_drift_le_100_ms++;
1853 } else if (drift <= 200) {
1854 tcpstat.tcps_timer_drift_le_200_ms++;
1855 } else if (drift <= 500) {
1856 tcpstat.tcps_timer_drift_le_500_ms++;
1857 } else if (drift <= 1000) {
1858 tcpstat.tcps_timer_drift_le_1000_ms++;
1859 } else {
1860 tcpstat.tcps_timer_drift_gt_1000_ms++;
1861 }
1862
1863 listp->running = TRUE;
1864
1865 LIST_FOREACH_SAFE(te, &listp->lhead, le, next_te) {
1866 uint32_t offset = 0;
1867 uint32_t runtime = te->runtime;
1868 if (te->index < TCPT_NONE && TSTMP_GT(runtime, tcp_now)) {
1869 offset = timer_diff(runtime, 0, tcp_now, 0);
1870 if (next_timer == 0 || offset < next_timer) {
1871 next_timer = offset;
1872 }
1873 list_mode |= te->mode;
1874 continue;
1875 }
1876
1877 tp = TIMERENTRY_TO_TP(te);
1878
1879 /*
1880 * Acquire an inp wantcnt on the inpcb so that the socket
1881 * won't get detached even if tcp_close is called
1882 */
1883 if (in_pcb_checkstate(tp->t_inpcb, WNT_ACQUIRE, 0)
1884 == WNT_STOPUSING) {
1885 /*
1886			 * Somehow this pcb went into a dead state while
1887			 * on the timer list; just take it off the list.
1888 * Since the timer list entry pointers are
1889 * protected by the timer list lock, we can
1890 * do it here without the socket lock.
1891 */
1892 if (TIMER_IS_ON_LIST(tp)) {
1893 tp->t_flags &= ~(TF_TIMER_ONLIST);
1894 LIST_REMOVE(&tp->tentry, le);
1895 listp->entries--;
1896
1897 tp->tentry.le.le_next = NULL;
1898 tp->tentry.le.le_prev = NULL;
1899 }
1900 continue;
1901 }
1902 active_count++;
1903
1904 /*
1905 * Store the next timerentry pointer before releasing the
1906 * list lock. If that entry has to be removed when we
1907 * release the lock, this pointer will be updated to the
1908 * element after that.
1909 */
1910 listp->next_te = next_te;
1911
1912 VERIFY_NEXT_LINK(&tp->tentry, le);
1913 VERIFY_PREV_LINK(&tp->tentry, le);
1914
1915 lck_mtx_unlock(listp->mtx);
1916
1917 offset = tcp_run_conn_timer(tp, &te_mode,
1918 listp->probe_if_index);
1919
1920 lck_mtx_lock(listp->mtx);
1921
1922 next_te = listp->next_te;
1923 listp->next_te = NULL;
1924
1925 if (offset > 0 && te_mode != 0) {
1926 list_mode |= te_mode;
1927
1928 if (next_timer == 0 || offset < next_timer)
1929 next_timer = offset;
1930 }
1931 }
1932
1933 if (!LIST_EMPTY(&listp->lhead)) {
1934 u_int16_t next_mode = 0;
1935 if ((list_mode & TCP_TIMERLIST_10MS_MODE) ||
1936 (listp->pref_mode & TCP_TIMERLIST_10MS_MODE))
1937 next_mode = TCP_TIMERLIST_10MS_MODE;
1938 else if ((list_mode & TCP_TIMERLIST_100MS_MODE) ||
1939 (listp->pref_mode & TCP_TIMERLIST_100MS_MODE))
1940 next_mode = TCP_TIMERLIST_100MS_MODE;
1941 else
1942 next_mode = TCP_TIMERLIST_500MS_MODE;
1943
1944 if (next_mode != TCP_TIMERLIST_500MS_MODE) {
1945 listp->idleruns = 0;
1946 } else {
1947 /*
1948			 * The next required mode is the slow mode, but if
1949 * the last one was a faster mode and we did not
1950 * have enough idle runs, repeat the last mode.
1951 *
1952 * We try to keep the timer list in fast mode for
1953 * some idle time in expectation of new data.
1954 */
1955 if (listp->mode != next_mode &&
1956 listp->idleruns < timer_fastmode_idlemax) {
1957 listp->idleruns++;
1958 next_mode = listp->mode;
1959 next_timer = TCP_TIMER_100MS_QUANTUM;
1960 } else {
1961 listp->idleruns = 0;
1962 }
1963 }
1964 listp->mode = next_mode;
1965 if (listp->pref_offset != 0)
1966 next_timer = min(listp->pref_offset, next_timer);
1967
1968 if (listp->mode == TCP_TIMERLIST_500MS_MODE)
1969 next_timer = max(next_timer,
1970 TCP_TIMER_500MS_QUANTUM);
1971
1972 tcp_sched_timerlist(next_timer);
1973 } else {
1974 /*
1975		 * The list is empty, so nothing needs to run soon; still
1976		 * reschedule periodically at the coarsest granularity.
1977 */
1978 tcp_sched_timerlist(TCP_TIMERLIST_MAX_OFFSET);
1979 }
1980
1981 listp->running = FALSE;
1982 listp->pref_mode = 0;
1983 listp->pref_offset = 0;
1984 listp->probe_if_index = 0;
1985
1986 lck_mtx_unlock(listp->mtx);
1987}
1988
1989/*
1990 * Function to check if the timerlist needs to be rescheduled to run this
1991 * connection's timers correctly.
1992 */
1993void
1994tcp_sched_timers(struct tcpcb *tp)
1995{
1996 struct tcptimerentry *te = &tp->tentry;
1997 u_int16_t index = te->index;
1998 u_int16_t mode = te->mode;
1999 struct tcptimerlist *listp = &tcp_timer_list;
2000 int32_t offset = 0;
2001 boolean_t list_locked = FALSE;
2002
2003 if (tp->t_inpcb->inp_state == INPCB_STATE_DEAD) {
2004 /* Just return without adding the dead pcb to the list */
2005 if (TIMER_IS_ON_LIST(tp)) {
2006 tcp_remove_timer(tp);
2007 }
2008 return;
2009 }
2010
2011 if (index == TCPT_NONE) {
2012 /* Nothing to run */
2013 tcp_remove_timer(tp);
2014 return;
2015 }
2016
2017 /*
2018	 * Compute the offset at which the next timer for this connection
2019 * has to run.
2020 */
2021 offset = timer_diff(te->runtime, 0, tcp_now, 0);
2022 if (offset <= 0) {
2023 offset = 1;
2024 tcp_timer_advanced++;
2025 }
2026
2027 if (!TIMER_IS_ON_LIST(tp)) {
2028 if (!list_locked) {
2029 lck_mtx_lock(listp->mtx);
2030 list_locked = TRUE;
2031 }
2032
2033 if (!TIMER_IS_ON_LIST(tp)) {
2034 LIST_INSERT_HEAD(&listp->lhead, te, le);
2035 tp->t_flags |= TF_TIMER_ONLIST;
2036
2037 listp->entries++;
2038 if (listp->entries > listp->maxentries)
2039 listp->maxentries = listp->entries;
2040
2041 /* if the list is not scheduled, just schedule it */
2042 if (!listp->scheduled)
2043 goto schedule;
2044 }
2045 }
2046
2047 /*
2048 * Timer entry is currently on the list, check if the list needs
2049 * to be rescheduled.
2050 */
2051 if (need_to_resched_timerlist(te->runtime, mode)) {
2052 tcp_resched_timerlist++;
2053
2054 if (!list_locked) {
2055 lck_mtx_lock(listp->mtx);
2056 list_locked = TRUE;
2057 }
2058
2059 VERIFY_NEXT_LINK(te, le);
2060 VERIFY_PREV_LINK(te, le);
2061
2062 if (listp->running) {
2063 listp->pref_mode |= mode;
2064 if (listp->pref_offset == 0 ||
2065 offset < listp->pref_offset) {
2066 listp->pref_offset = offset;
2067 }
2068 } else {
2069 /*
2070			 * The list could have been rescheduled while
2071			 * this thread was waiting for the lock.
2072 */
2073 if (listp->scheduled) {
2074 int32_t diff;
2075 diff = timer_diff(listp->runtime, 0,
2076 tcp_now, offset);
2077 if (diff <= 0)
2078 goto done;
2079 else
2080 goto schedule;
2081 } else {
2082 goto schedule;
2083 }
2084 }
2085 }
2086 goto done;
2087
2088schedule:
2089 /*
2090 * Since a connection with timers is getting scheduled, the timer
2091	 * list moves from idle to active state and that is why idleruns is
2092	 * reset.
2093 */
2094 if (mode & TCP_TIMERLIST_10MS_MODE) {
2095 listp->mode = TCP_TIMERLIST_10MS_MODE;
2096 listp->idleruns = 0;
2097 offset = min(offset, TCP_TIMER_10MS_QUANTUM);
2098 } else if (mode & TCP_TIMERLIST_100MS_MODE) {
2099 if (listp->mode > TCP_TIMERLIST_100MS_MODE)
2100 listp->mode = TCP_TIMERLIST_100MS_MODE;
2101 listp->idleruns = 0;
2102 offset = min(offset, TCP_TIMER_100MS_QUANTUM);
2103 }
2104 tcp_sched_timerlist(offset);
2105
2106done:
2107 if (list_locked)
2108 lck_mtx_unlock(listp->mtx);
2109
2110 return;
2111}
2112
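/*
 * Scan all timers on this connection, record the index of the one
 * that fires first along with the union of the active timer modes,
 * and recompute tentry.runtime accordingly.
 */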
2113static inline void
2114tcp_set_lotimer_index(struct tcpcb *tp)
2115{
2116 uint16_t i, lo_index = TCPT_NONE, mode = 0;
2117 uint32_t lo_timer = 0;
2118 for (i = 0; i < TCPT_NTIMERS; ++i) {
2119 if (tp->t_timer[i] != 0) {
2120 TCP_SET_TIMER_MODE(mode, i);
2121 if (lo_timer == 0 || tp->t_timer[i] < lo_timer) {
2122 lo_timer = tp->t_timer[i];
2123 lo_index = i;
2124 }
2125 }
2126 }
2127 tp->tentry.index = lo_index;
2128 tp->tentry.mode = mode;
2129 VERIFY(tp->tentry.index == TCPT_NONE || tp->tentry.mode > 0);
2130
2131 if (tp->tentry.index != TCPT_NONE) {
2132 tp->tentry.runtime = tp->tentry.timer_start
2133 + tp->t_timer[tp->tentry.index];
2134 if (tp->tentry.runtime == 0)
2135 tp->tentry.runtime++;
2136 }
2137}
2138
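/*
 * Recompute the earliest timer for this connection and reschedule the
 * timer list if necessary. Expects the socket lock to be held; does
 * nothing for connections in TIME_WAIT.
 */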
2139void
2140tcp_check_timer_state(struct tcpcb *tp)
2141{
2142 socket_lock_assert_owned(tp->t_inpcb->inp_socket);
2143
2144 if (tp->t_inpcb->inp_flags2 & INP2_TIMEWAIT)
2145 return;
2146
2147 tcp_set_lotimer_index(tp);
2148
2149 tcp_sched_timers(tp);
2150 return;
2151}
2152
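/*
 * Report the delta of a counter since the previous report (the 32-bit
 * and 64-bit variants below). The unsigned subtraction yields the
 * correct delta even if the counter wrapped; if the signed result is
 * still negative (e.g. the counter was reset), report zero for this
 * interval.
 */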
2153static inline void
2154tcp_cumulative_stat(u_int32_t cur, u_int32_t *prev, u_int32_t *dest)
2155{
2156 /* handle wrap around */
2157 int32_t diff = (int32_t) (cur - *prev);
2158 if (diff > 0)
2159 *dest = diff;
2160 else
2161 *dest = 0;
2162 *prev = cur;
2163 return;
2164}
2165
2166static inline void
2167tcp_cumulative_stat64(u_int64_t cur, u_int64_t *prev, u_int64_t *dest)
2168{
2169 /* handle wrap around */
2170 int64_t diff = (int64_t) (cur - *prev);
2171 if (diff > 0)
2172 *dest = diff;
2173 else
2174 *dest = 0;
2175 *prev = cur;
2176 return;
2177}
2178
2179__private_extern__ void
2180tcp_report_stats(void)
2181{
2182 struct nstat_sysinfo_data data;
2183 struct sockaddr_in dst;
2184 struct sockaddr_in6 dst6;
2185 struct rtentry *rt = NULL;
2186 static struct tcp_last_report_stats prev;
2187 u_int64_t var, uptime;
2188
2189#define stat data.u.tcp_stats
2190 if (((uptime = net_uptime()) - tcp_last_report_time) <
2191 tcp_report_stats_interval)
2192 return;
2193
2194 tcp_last_report_time = uptime;
2195
2196 bzero(&data, sizeof(data));
2197 data.flags = NSTAT_SYSINFO_TCP_STATS;
2198
2199 bzero(&dst, sizeof(dst));
2200 dst.sin_len = sizeof(dst);
2201 dst.sin_family = AF_INET;
2202
2203 /* ipv4 avg rtt */
2204 lck_mtx_lock(rnh_lock);
2205 rt = rt_lookup(TRUE, (struct sockaddr *)&dst, NULL,
2206 rt_tables[AF_INET], IFSCOPE_NONE);
2207 lck_mtx_unlock(rnh_lock);
2208 if (rt != NULL) {
2209 RT_LOCK(rt);
2210 if (rt_primary_default(rt, rt_key(rt)) &&
2211 rt->rt_stats != NULL) {
2212 stat.ipv4_avgrtt = rt->rt_stats->nstat_avg_rtt;
2213 }
2214 RT_UNLOCK(rt);
2215 rtfree(rt);
2216 rt = NULL;
2217 }
2218
2219 /* ipv6 avg rtt */
2220 bzero(&dst6, sizeof(dst6));
2221 dst6.sin6_len = sizeof(dst6);
2222 dst6.sin6_family = AF_INET6;
2223
2224 lck_mtx_lock(rnh_lock);
2225	rt = rt_lookup(TRUE, (struct sockaddr *)&dst6, NULL,
2226 rt_tables[AF_INET6], IFSCOPE_NONE);
2227 lck_mtx_unlock(rnh_lock);
2228 if (rt != NULL) {
2229 RT_LOCK(rt);
2230 if (rt_primary_default(rt, rt_key(rt)) &&
2231 rt->rt_stats != NULL) {
2232 stat.ipv6_avgrtt = rt->rt_stats->nstat_avg_rtt;
2233 }
2234 RT_UNLOCK(rt);
2235 rtfree(rt);
2236 rt = NULL;
2237 }
2238
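	/*
	 * The loss/reorder rates below are percentages in Q10 fixed
	 * point: the numerator is shifted left by 10 before dividing,
	 * so a reported value of 1024 corresponds to 1%.
	 */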
2239 /* send packet loss rate, shift by 10 for precision */
2240 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_sndrexmitpack > 0) {
2241 var = tcpstat.tcps_sndrexmitpack << 10;
2242 stat.send_plr = (var * 100) / tcpstat.tcps_sndpack;
2243 }
2244
2245 /* recv packet loss rate, shift by 10 for precision */
2246 if (tcpstat.tcps_rcvpack > 0 && tcpstat.tcps_recovered_pkts > 0) {
2247 var = tcpstat.tcps_recovered_pkts << 10;
2248 stat.recv_plr = (var * 100) / tcpstat.tcps_rcvpack;
2249 }
2250
2251 /* RTO after tail loss, shift by 10 for precision */
2252 if (tcpstat.tcps_sndrexmitpack > 0
2253 && tcpstat.tcps_tailloss_rto > 0) {
2254 var = tcpstat.tcps_tailloss_rto << 10;
2255 stat.send_tlrto_rate =
2256 (var * 100) / tcpstat.tcps_sndrexmitpack;
2257 }
2258
2259 /* packet reordering */
2260 if (tcpstat.tcps_sndpack > 0 && tcpstat.tcps_reordered_pkts > 0) {
2261 var = tcpstat.tcps_reordered_pkts << 10;
2262 stat.send_reorder_rate =
2263 (var * 100) / tcpstat.tcps_sndpack;
2264 }
2265
2266 if (tcp_ecn_outbound == 1)
2267 stat.ecn_client_enabled = 1;
2268 if (tcp_ecn_inbound == 1)
2269 stat.ecn_server_enabled = 1;
2270 tcp_cumulative_stat(tcpstat.tcps_connattempt,
2271 &prev.tcps_connattempt, &stat.connection_attempts);
2272 tcp_cumulative_stat(tcpstat.tcps_accepts,
2273 &prev.tcps_accepts, &stat.connection_accepts);
2274 tcp_cumulative_stat(tcpstat.tcps_ecn_client_setup,
2275 &prev.tcps_ecn_client_setup, &stat.ecn_client_setup);
2276 tcp_cumulative_stat(tcpstat.tcps_ecn_server_setup,
2277 &prev.tcps_ecn_server_setup, &stat.ecn_server_setup);
2278 tcp_cumulative_stat(tcpstat.tcps_ecn_client_success,
2279 &prev.tcps_ecn_client_success, &stat.ecn_client_success);
2280 tcp_cumulative_stat(tcpstat.tcps_ecn_server_success,
2281 &prev.tcps_ecn_server_success, &stat.ecn_server_success);
2282 tcp_cumulative_stat(tcpstat.tcps_ecn_not_supported,
2283 &prev.tcps_ecn_not_supported, &stat.ecn_not_supported);
2284 tcp_cumulative_stat(tcpstat.tcps_ecn_lost_syn,
2285 &prev.tcps_ecn_lost_syn, &stat.ecn_lost_syn);
2286 tcp_cumulative_stat(tcpstat.tcps_ecn_lost_synack,
2287 &prev.tcps_ecn_lost_synack, &stat.ecn_lost_synack);
2288 tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ce,
2289 &prev.tcps_ecn_recv_ce, &stat.ecn_recv_ce);
2290 tcp_cumulative_stat(tcpstat.tcps_ecn_recv_ece,
2291 &prev.tcps_ecn_recv_ece, &stat.ecn_recv_ece);
2294 tcp_cumulative_stat(tcpstat.tcps_ecn_sent_ece,
2295 &prev.tcps_ecn_sent_ece, &stat.ecn_sent_ece);
2298 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ce,
2299 &prev.tcps_ecn_conn_recv_ce, &stat.ecn_conn_recv_ce);
2300 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_recv_ece,
2301 &prev.tcps_ecn_conn_recv_ece, &stat.ecn_conn_recv_ece);
2302 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_plnoce,
2303 &prev.tcps_ecn_conn_plnoce, &stat.ecn_conn_plnoce);
2304 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_pl_ce,
2305 &prev.tcps_ecn_conn_pl_ce, &stat.ecn_conn_pl_ce);
2306 tcp_cumulative_stat(tcpstat.tcps_ecn_conn_nopl_ce,
2307 &prev.tcps_ecn_conn_nopl_ce, &stat.ecn_conn_nopl_ce);
2308 tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_synloss,
2309 &prev.tcps_ecn_fallback_synloss, &stat.ecn_fallback_synloss);
2310 tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_reorder,
2311 &prev.tcps_ecn_fallback_reorder, &stat.ecn_fallback_reorder);
2312 tcp_cumulative_stat(tcpstat.tcps_ecn_fallback_ce,
2313 &prev.tcps_ecn_fallback_ce, &stat.ecn_fallback_ce);
2314 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_rcv,
2315 &prev.tcps_tfo_syn_data_rcv, &stat.tfo_syn_data_rcv);
2316 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req_rcv,
2317 &prev.tcps_tfo_cookie_req_rcv, &stat.tfo_cookie_req_rcv);
2318 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_sent,
2319 &prev.tcps_tfo_cookie_sent, &stat.tfo_cookie_sent);
2320 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_invalid,
2321 &prev.tcps_tfo_cookie_invalid, &stat.tfo_cookie_invalid);
2322 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_req,
2323 &prev.tcps_tfo_cookie_req, &stat.tfo_cookie_req);
2324 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_rcv,
2325 &prev.tcps_tfo_cookie_rcv, &stat.tfo_cookie_rcv);
2326 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_sent,
2327 &prev.tcps_tfo_syn_data_sent, &stat.tfo_syn_data_sent);
2328 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_data_acked,
2329 &prev.tcps_tfo_syn_data_acked, &stat.tfo_syn_data_acked);
2330 tcp_cumulative_stat(tcpstat.tcps_tfo_syn_loss,
2331 &prev.tcps_tfo_syn_loss, &stat.tfo_syn_loss);
2332 tcp_cumulative_stat(tcpstat.tcps_tfo_blackhole,
2333 &prev.tcps_tfo_blackhole, &stat.tfo_blackhole);
2334 tcp_cumulative_stat(tcpstat.tcps_tfo_cookie_wrong,
2335 &prev.tcps_tfo_cookie_wrong, &stat.tfo_cookie_wrong);
2336 tcp_cumulative_stat(tcpstat.tcps_tfo_no_cookie_rcv,
2337 &prev.tcps_tfo_no_cookie_rcv, &stat.tfo_no_cookie_rcv);
2338 tcp_cumulative_stat(tcpstat.tcps_tfo_heuristics_disable,
2339 &prev.tcps_tfo_heuristics_disable, &stat.tfo_heuristics_disable);
2340 tcp_cumulative_stat(tcpstat.tcps_tfo_sndblackhole,
2341 &prev.tcps_tfo_sndblackhole, &stat.tfo_sndblackhole);
2342
2343
2344 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_attempt,
2345 &prev.tcps_mptcp_handover_attempt , &stat.mptcp_handover_attempt);
2346 tcp_cumulative_stat(tcpstat.tcps_mptcp_interactive_attempt,
2347 &prev.tcps_mptcp_interactive_attempt , &stat.mptcp_interactive_attempt);
2348 tcp_cumulative_stat(tcpstat.tcps_mptcp_aggregate_attempt,
2349 &prev.tcps_mptcp_aggregate_attempt , &stat.mptcp_aggregate_attempt);
2350 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_handover_attempt,
2351 &prev.tcps_mptcp_fp_handover_attempt , &stat.mptcp_fp_handover_attempt);
2352 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_interactive_attempt,
2353 &prev.tcps_mptcp_fp_interactive_attempt , &stat.mptcp_fp_interactive_attempt);
2354 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_aggregate_attempt,
2355 &prev.tcps_mptcp_fp_aggregate_attempt , &stat.mptcp_fp_aggregate_attempt);
2356 tcp_cumulative_stat(tcpstat.tcps_mptcp_heuristic_fallback,
2357 &prev.tcps_mptcp_heuristic_fallback , &stat.mptcp_heuristic_fallback);
2358 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_heuristic_fallback,
2359 &prev.tcps_mptcp_fp_heuristic_fallback , &stat.mptcp_fp_heuristic_fallback);
2360 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_success_wifi,
2361 &prev.tcps_mptcp_handover_success_wifi , &stat.mptcp_handover_success_wifi);
2362 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_success_cell,
2363 &prev.tcps_mptcp_handover_success_cell , &stat.mptcp_handover_success_cell);
2364 tcp_cumulative_stat(tcpstat.tcps_mptcp_interactive_success,
2365 &prev.tcps_mptcp_interactive_success , &stat.mptcp_interactive_success);
2366 tcp_cumulative_stat(tcpstat.tcps_mptcp_aggregate_success,
2367 &prev.tcps_mptcp_aggregate_success , &stat.mptcp_aggregate_success);
2368 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_handover_success_wifi,
2369 &prev.tcps_mptcp_fp_handover_success_wifi , &stat.mptcp_fp_handover_success_wifi);
2370 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_handover_success_cell,
2371 &prev.tcps_mptcp_fp_handover_success_cell , &stat.mptcp_fp_handover_success_cell);
2372 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_interactive_success,
2373 &prev.tcps_mptcp_fp_interactive_success , &stat.mptcp_fp_interactive_success);
2374 tcp_cumulative_stat(tcpstat.tcps_mptcp_fp_aggregate_success,
2375 &prev.tcps_mptcp_fp_aggregate_success , &stat.mptcp_fp_aggregate_success);
2376 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_cell_from_wifi,
2377 &prev.tcps_mptcp_handover_cell_from_wifi , &stat.mptcp_handover_cell_from_wifi);
2378 tcp_cumulative_stat(tcpstat.tcps_mptcp_handover_wifi_from_cell,
2379 &prev.tcps_mptcp_handover_wifi_from_cell , &stat.mptcp_handover_wifi_from_cell);
2380 tcp_cumulative_stat(tcpstat.tcps_mptcp_interactive_cell_from_wifi,
2381 &prev.tcps_mptcp_interactive_cell_from_wifi , &stat.mptcp_interactive_cell_from_wifi);
2382 tcp_cumulative_stat64(tcpstat.tcps_mptcp_handover_cell_bytes,
2383 &prev.tcps_mptcp_handover_cell_bytes , &stat.mptcp_handover_cell_bytes);
2384 tcp_cumulative_stat64(tcpstat.tcps_mptcp_interactive_cell_bytes,
2385 &prev.tcps_mptcp_interactive_cell_bytes , &stat.mptcp_interactive_cell_bytes);
2386 tcp_cumulative_stat64(tcpstat.tcps_mptcp_aggregate_cell_bytes,
2387 &prev.tcps_mptcp_aggregate_cell_bytes , &stat.mptcp_aggregate_cell_bytes);
2388 tcp_cumulative_stat64(tcpstat.tcps_mptcp_handover_all_bytes,
2389 &prev.tcps_mptcp_handover_all_bytes , &stat.mptcp_handover_all_bytes);
2390 tcp_cumulative_stat64(tcpstat.tcps_mptcp_interactive_all_bytes,
2391 &prev.tcps_mptcp_interactive_all_bytes , &stat.mptcp_interactive_all_bytes);
2392 tcp_cumulative_stat64(tcpstat.tcps_mptcp_aggregate_all_bytes,
2393 &prev.tcps_mptcp_aggregate_all_bytes , &stat.mptcp_aggregate_all_bytes);
2394 tcp_cumulative_stat(tcpstat.tcps_mptcp_back_to_wifi,
2395 &prev.tcps_mptcp_back_to_wifi , &stat.mptcp_back_to_wifi);
2396 tcp_cumulative_stat(tcpstat.tcps_mptcp_wifi_proxy,
2397 &prev.tcps_mptcp_wifi_proxy , &stat.mptcp_wifi_proxy);
2398 tcp_cumulative_stat(tcpstat.tcps_mptcp_cell_proxy,
2399 &prev.tcps_mptcp_cell_proxy , &stat.mptcp_cell_proxy);
2400 tcp_cumulative_stat(tcpstat.tcps_mptcp_triggered_cell,
2401 &prev.tcps_mptcp_triggered_cell, &stat.mptcp_triggered_cell);
2402
2403 nstat_sysinfo_send_data(&data);
2404
2405#undef stat
2406}
2407
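/*
 * Request keepalive probes on connections going over the given
 * interface: record the interface index on the timer list and
 * reschedule the list to run within the next 10ms (unless it is
 * already running or will fire sooner), so that tcp_run_conn_timer()
 * can send the probes.
 */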
2408void
2409tcp_interface_send_probe(u_int16_t probe_if_index)
2410{
2411 int32_t offset = 0;
2412 struct tcptimerlist *listp = &tcp_timer_list;
2413
2414 /* Make sure TCP clock is up to date */
2415 calculate_tcp_clock();
2416
2417 lck_mtx_lock(listp->mtx);
2418 if (listp->probe_if_index > 0) {
2419 tcpstat.tcps_probe_if_conflict++;
2420 goto done;
2421 }
2422
2423 listp->probe_if_index = probe_if_index;
2424 if (listp->running)
2425 goto done;
2426
2427 /*
2428 * Reschedule the timerlist to run within the next 10ms, which is
2429 * the fastest that we can do.
2430 */
2431 offset = TCP_TIMER_10MS_QUANTUM;
2432 if (listp->scheduled) {
2433 int32_t diff;
2434 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2435 if (diff <= 0) {
2436 /* The timer will fire sooner than what's needed */
2437 goto done;
2438 }
2439 }
2440 listp->mode = TCP_TIMERLIST_10MS_MODE;
2441 listp->idleruns = 0;
2442
2443 tcp_sched_timerlist(offset);
2444
2445done:
2446 lck_mtx_unlock(listp->mtx);
2447 return;
2448}
2449
2450/*
2451 * Enable read probes on this connection, if:
2452 * - it is in the established state
2453 * - it doesn't have any data outstanding
2454 * - the outgoing ifp matches
2455 * - we have not already sent any read probes
2456 */
2457static void
2458tcp_enable_read_probe(struct tcpcb *tp, struct ifnet *ifp)
2459{
2460 if (tp->t_state == TCPS_ESTABLISHED &&
2461 tp->snd_max == tp->snd_una &&
2462 tp->t_inpcb->inp_last_outifp == ifp &&
2463 !(tp->t_flagsext & TF_DETECT_READSTALL) &&
2464 tp->t_rtimo_probes == 0) {
2465 tp->t_flagsext |= TF_DETECT_READSTALL;
2466 tp->t_rtimo_probes = 0;
2467 tp->t_timer[TCPT_KEEP] = OFFSET_FROM_START(tp,
2468 TCP_TIMER_10MS_QUANTUM);
2469 if (tp->tentry.index == TCPT_NONE) {
2470 tp->tentry.index = TCPT_KEEP;
2471 tp->tentry.runtime = tcp_now +
2472 TCP_TIMER_10MS_QUANTUM;
2473 } else {
2474 int32_t diff = 0;
2475
2476			/* Reset the runtime to be within the next 10ms */
2477 diff = timer_diff(tp->tentry.runtime, 0,
2478 tcp_now, TCP_TIMER_10MS_QUANTUM);
2479 if (diff > 0) {
2480 tp->tentry.index = TCPT_KEEP;
2481 tp->tentry.runtime = tcp_now +
2482 TCP_TIMER_10MS_QUANTUM;
2483 if (tp->tentry.runtime == 0)
2484 tp->tentry.runtime++;
2485 }
2486 }
2487 }
2488}
2489
2490/*
2491 * Disable read probe and reset the keep alive timer
2492 */
2493static void
2494tcp_disable_read_probe(struct tcpcb *tp)
2495{
2496 if (tp->t_adaptive_rtimo == 0 &&
2497 ((tp->t_flagsext & TF_DETECT_READSTALL) ||
2498 tp->t_rtimo_probes > 0)) {
2499 tcp_keepalive_reset(tp);
2500
2501 if (tp->t_mpsub)
2502 mptcp_reset_keepalive(tp);
2503 }
2504}
2505
2506/*
2507 * Reschedule the tcp timerlist in the next 10ms to re-enable read/write
2508 * probes on connections going over a particular interface.
2509 */
2510void
2511tcp_probe_connectivity(struct ifnet *ifp, u_int32_t enable)
2512{
2513 int32_t offset;
2514 struct tcptimerlist *listp = &tcp_timer_list;
2515 struct inpcbinfo *pcbinfo = &tcbinfo;
2516 struct inpcb *inp, *nxt;
2517
2518 if (ifp == NULL)
2519 return;
2520
2521 /* update clock */
2522 calculate_tcp_clock();
2523
2524 /*
2525	 * Enable or disable the keepalive (read probe) timer on all
2526	 * connections that are active/established on this interface.
2527 */
2528 lck_rw_lock_shared(pcbinfo->ipi_lock);
2529
2530 LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, nxt) {
2531 struct tcpcb *tp = NULL;
2532 if (in_pcb_checkstate(inp, WNT_ACQUIRE, 0) ==
2533 WNT_STOPUSING)
2534 continue;
2535
2536 /* Acquire lock to look at the state of the connection */
2537 socket_lock(inp->inp_socket, 1);
2538
2539 /* Release the want count */
2540 if (inp->inp_ppcb == NULL ||
2541 (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING)) {
2542 socket_unlock(inp->inp_socket, 1);
2543 continue;
2544 }
2545 tp = intotcpcb(inp);
2546 if (enable)
2547 tcp_enable_read_probe(tp, ifp);
2548 else
2549 tcp_disable_read_probe(tp);
2550
2551 socket_unlock(inp->inp_socket, 1);
2552 }
2553 lck_rw_done(pcbinfo->ipi_lock);
2554
2555 lck_mtx_lock(listp->mtx);
2556 if (listp->running) {
2557 listp->pref_mode |= TCP_TIMERLIST_10MS_MODE;
2558 goto done;
2559 }
2560
2561 /* Reschedule within the next 10ms */
2562 offset = TCP_TIMER_10MS_QUANTUM;
2563 if (listp->scheduled) {
2564 int32_t diff;
2565 diff = timer_diff(listp->runtime, 0, tcp_now, offset);
2566 if (diff <= 0) {
2567 /* The timer will fire sooner than what's needed */
2568 goto done;
2569 }
2570 }
2571 listp->mode = TCP_TIMERLIST_10MS_MODE;
2572 listp->idleruns = 0;
2573
2574 tcp_sched_timerlist(offset);
2575done:
2576 lck_mtx_unlock(listp->mtx);
2577 return;
2578}
2579
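/*
 * Apply the uplink MSS recommended by the cellular driver: shrink
 * t_maxopd (and hence t_maxseg) to the medium or low recommendation,
 * caching the previous value so it can be restored when the
 * recommendation goes back to "none".
 */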
2580inline void
2581tcp_update_mss_core(struct tcpcb *tp, struct ifnet *ifp)
2582{
2583 struct if_cellular_status_v1 *ifsr;
2584 u_int32_t optlen;
2585 ifsr = &ifp->if_link_status->ifsr_u.ifsr_cell.if_cell_u.if_status_v1;
2586 if (ifsr->valid_bitmask & IF_CELL_UL_MSS_RECOMMENDED_VALID) {
2587 optlen = tp->t_maxopd - tp->t_maxseg;
2588
2589 if (ifsr->mss_recommended ==
2590 IF_CELL_UL_MSS_RECOMMENDED_NONE &&
2591 tp->t_cached_maxopd > 0 &&
2592 tp->t_maxopd < tp->t_cached_maxopd) {
2593 tp->t_maxopd = tp->t_cached_maxopd;
2594 tcpstat.tcps_mss_to_default++;
2595 } else if (ifsr->mss_recommended ==
2596 IF_CELL_UL_MSS_RECOMMENDED_MEDIUM &&
2597 tp->t_maxopd > tcp_mss_rec_medium) {
2598 tp->t_cached_maxopd = tp->t_maxopd;
2599 tp->t_maxopd = tcp_mss_rec_medium;
2600 tcpstat.tcps_mss_to_medium++;
2601 } else if (ifsr->mss_recommended ==
2602 IF_CELL_UL_MSS_RECOMMENDED_LOW &&
2603 tp->t_maxopd > tcp_mss_rec_low) {
2604 tp->t_cached_maxopd = tp->t_maxopd;
2605 tp->t_maxopd = tcp_mss_rec_low;
2606 tcpstat.tcps_mss_to_low++;
2607 }
2608 tp->t_maxseg = tp->t_maxopd - optlen;
2609
2610 /*
2611		 * Clear the cached value if it is the same as the current one.
2612 */
2613 if (tp->t_maxopd == tp->t_cached_maxopd)
2614 tp->t_cached_maxopd = 0;
2615 }
2616}
2617
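/*
 * Called with the socket lock held. The recommended-MSS adjustment is
 * applied only on cellular interfaces, for connections that are at
 * most in CLOSE_WAIT state and are not doing PMTU blackhole detection.
 */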
2618void
2619tcp_update_mss_locked(struct socket *so, struct ifnet *ifp)
2620{
2621 struct inpcb *inp = sotoinpcb(so);
2622 struct tcpcb *tp = intotcpcb(inp);
2623
2624 if (ifp == NULL && (ifp = inp->inp_last_outifp) == NULL)
2625 return;
2626
2627 if (!IFNET_IS_CELLULAR(ifp)) {
2628 /*
2629 * This optimization is implemented for cellular
2630 * networks only
2631 */
2632 return;
2633 }
2634	if (tp->t_state <= TCPS_CLOSE_WAIT) {
2635 /*
2636 * If the connection is currently doing or has done PMTU
2637 * blackhole detection, do not change the MSS
2638 */
2639 if (tp->t_flags & TF_BLACKHOLE)
2640 return;
2641 if (ifp->if_link_status == NULL)
2642 return;
2643 tcp_update_mss_core(tp, ifp);
2644 }
2645}
2646
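/*
 * Interface timer for the TCP PCB list: walks all TCP PCBs to update
 * the extended background-idle state, refresh the MSS when requested,
 * and defunct system-initiated background sockets on cellular
 * interfaces whose link quality has dropped to the abort threshold.
 */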
2647void
2648tcp_itimer(struct inpcbinfo *ipi)
2649{
2650 struct inpcb *inp, *nxt;
2651
2652 if (lck_rw_try_lock_exclusive(ipi->ipi_lock) == FALSE) {
2653 if (tcp_itimer_done == TRUE) {
2654 tcp_itimer_done = FALSE;
2655 atomic_add_32(&ipi->ipi_timer_req.intimer_fast, 1);
2656 return;
2657 }
2658		/* Could not get the lock immediately, block and take it exclusively */
2659 lck_rw_lock_exclusive(ipi->ipi_lock);
2660 }
2661 tcp_itimer_done = TRUE;
2662
2663 LIST_FOREACH_SAFE(inp, &tcb, inp_list, nxt) {
2664 struct socket *so;
2665 struct ifnet *ifp;
2666
2667 if (inp->inp_ppcb == NULL ||
2668 in_pcb_checkstate(inp, WNT_ACQUIRE, 0) == WNT_STOPUSING)
2669 continue;
2670 so = inp->inp_socket;
2671 ifp = inp->inp_last_outifp;
2672 socket_lock(so, 1);
2673 if (in_pcb_checkstate(inp, WNT_RELEASE, 1) == WNT_STOPUSING) {
2674 socket_unlock(so, 1);
2675 continue;
2676 }
2677 so_check_extended_bk_idle_time(so);
2678 if (ipi->ipi_flags & INPCBINFO_UPDATE_MSS) {
2679 tcp_update_mss_locked(so, NULL);
2680 }
2681 socket_unlock(so, 1);
2682
2683 /*
2684 * Defunct all system-initiated background sockets if the
2685 * socket is using the cellular interface and the interface
2686 * has its LQM set to abort.
2687 */
2688 if ((ipi->ipi_flags & INPCBINFO_HANDLE_LQM_ABORT) &&
2689 IS_SO_TC_BACKGROUNDSYSTEM(so->so_traffic_class) &&
2690 ifp != NULL && IFNET_IS_CELLULAR(ifp) &&
2691 (ifp->if_interface_state.valid_bitmask &
2692 IF_INTERFACE_STATE_LQM_STATE_VALID) &&
2693 ifp->if_interface_state.lqm_state ==
2694 IFNET_LQM_THRESH_ABORT) {
2695 socket_defunct(current_proc(), so,
2696 SHUTDOWN_SOCKET_LEVEL_DISCONNECT_ALL);
2697 }
2698 }
2699
2700 ipi->ipi_flags &= ~(INPCBINFO_UPDATE_MSS | INPCBINFO_HANDLE_LQM_ABORT);
2701 lck_rw_done(ipi->ipi_lock);
2702}
2703