1 | /* |
2 | * Copyright (c) 2013-2017 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <sys/param.h> |
30 | #include <sys/systm.h> |
31 | #include <sys/kernel.h> |
32 | #include <sys/syslog.h> |
33 | #include <sys/protosw.h> |
34 | #include <sys/socketvar.h> |
35 | #include <sys/kern_control.h> |
36 | #include <sys/domain.h> |
37 | |
38 | #include <netinet/in.h> |
39 | #include <netinet/tcp.h> |
40 | #include <netinet/tcp_var.h> |
41 | #include <netinet/tcp_cc.h> |
42 | #include <mach/sdt.h> |
43 | #include <libkern/OSAtomic.h> |
44 | |
/*
 * Snapshot of one congestion-control event, enqueued to the userspace
 * debug client via ctl_enqueuedata() in tcp_ccdbg_trace().  The layout is
 * read by userspace consumers of the kernel control socket, so field order
 * and the explicit padding fields (ccd_unused_*) must stay stable.
 */
struct tcp_cc_debug_state {
	u_int64_t ccd_tsns;		/* event timestamp, nanoseconds */
	char ccd_srcaddr[INET6_ADDRSTRLEN]; /* local address, presentation form */
	uint16_t ccd_srcport;		/* local port, host byte order */
	char ccd_destaddr[INET6_ADDRSTRLEN]; /* foreign address, presentation form */
	uint16_t ccd_destport;		/* foreign port, host byte order */
	uint32_t ccd_snd_cwnd;		/* tp->snd_cwnd */
	uint32_t ccd_snd_wnd;		/* tp->snd_wnd */
	uint32_t ccd_snd_ssthresh;	/* tp->snd_ssthresh */
	uint32_t ccd_pipeack;		/* tp->t_pipeack */
	uint32_t ccd_rttcur;		/* tp->t_rttcur */
	uint32_t ccd_rxtcur;		/* tp->t_rxtcur */
	uint32_t ccd_srtt;		/* tp->t_srtt >> TCP_RTT_SHIFT */
	uint32_t ccd_event;		/* event identifier passed by the caller */
	uint32_t ccd_sndcc;		/* bytes queued in the send socket buffer */
	uint32_t ccd_sndhiwat;		/* send socket buffer high-water mark */
	uint32_t ccd_bytes_acked;	/* tp->t_bytes_acked */
	u_int8_t ccd_cc_index;		/* which CC algorithm the union describes */
	u_int8_t ccd_unused_1__;	/* explicit padding */
	u_int16_t ccd_unused_2__;	/* explicit padding */
	union {
		/* valid when ccd_cc_index == TCP_CC_ALGO_CUBIC_INDEX */
		struct {
			uint32_t ccd_last_max;
			uint32_t ccd_tcp_win;
			uint32_t ccd_target_win;
			uint32_t ccd_avg_lastmax;
			uint32_t ccd_mean_deviation;
		} cubic_state;
		/* valid when ccd_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX */
		struct {
			u_int32_t led_base_rtt;	/* get_base_rtt(tp) */
		} ledbat_state;
	} u;
};
78 | |
/*
 * net.inet.tcp.cc_debug: when non-zero, tcp_ccdbg_trace() streams per-event
 * congestion-control records to the attached debug control socket.
 */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, cc_debug, CTLFLAG_RW | CTLFLAG_LOCKED,
	int, tcp_cc_debug, 0, "Enable debug data collection");

/* Read-only per-algorithm socket counters, exported from each CC module. */
extern struct tcp_cc_algo tcp_cc_newreno;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, newreno_sockets,
	CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_newreno.num_sockets,
	0, "Number of sockets using newreno");

extern struct tcp_cc_algo tcp_cc_ledbat;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, background_sockets,
	CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_ledbat.num_sockets,
	0, "Number of sockets using background transport");

extern struct tcp_cc_algo tcp_cc_cubic;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, cubic_sockets,
	CTLFLAG_RD | CTLFLAG_LOCKED, &tcp_cc_cubic.num_sockets,
	0, "Number of sockets using cubic");

/* net.inet.tcp.use_newreno: force NewReno as the default algorithm. */
SYSCTL_SKMEM_TCP_INT(OID_AUTO, use_newreno,
	CTLFLAG_RW | CTLFLAG_LOCKED, int, tcp_use_newreno, 0,
	"Use TCP NewReno by default");

/* Gates the draft-ietf-tcpm-newcwv check in tcp_cc_is_cwnd_nonvalidated(). */
static int tcp_check_cwnd_nonvalidated = 1;
#if (DEBUG || DEVELOPMENT)
SYSCTL_INT(_net_inet_tcp, OID_AUTO, cwnd_nonvalidated,
	CTLFLAG_RW | CTLFLAG_LOCKED, &tcp_check_cwnd_nonvalidated, 0,
	"Check if congestion window is non-validated");
#endif /* (DEBUG || DEVELOPMENT) */
107 | |
/*
 * Set a send socket buffer's ideal size, clamped into
 * [tcp_sendspace, tcp_autosndbuf_max].
 *
 * Fixed macro hygiene: the previous definition ignored its "size" argument
 * (reading tp->snd_ssthresh from the caller's scope instead), left its
 * arguments unparenthesized, and was not do-while wrapped.  Both call sites
 * in this file pass tp->snd_ssthresh, so behavior is unchanged.
 */
#define SET_SNDSB_IDEAL_SIZE(sndsb, size) do {				\
	(sndsb)->sb_idealsize = min(max(tcp_sendspace, (size)),		\
	    tcp_autosndbuf_max);					\
} while (0)
111 | |
/* Array containing pointers to currently implemented TCP CC algorithms */
struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT];
/* Zone backing per-connection CC state (allocated for cubic in
 * tcp_cc_allocate_state()) */
struct zone *tcp_cc_zone;

/* Information for collecting TCP debug information using control socket */
#define TCP_CCDEBUG_CONTROL_NAME "com.apple.network.tcp_ccdebug"
/* Sentinel meaning "no debug client attached" */
#define TCP_CCDBG_NOUNIT 0xffffffff
static kern_ctl_ref tcp_ccdbg_ctlref = NULL;
/* Unit of the single permitted debug client; updated with CAS */
volatile UInt32 tcp_ccdbg_unit = TCP_CCDBG_NOUNIT;

void tcp_cc_init(void);
static void tcp_cc_control_register(void);
static errno_t tcp_ccdbg_control_connect(kern_ctl_ref kctl,
    struct sockaddr_ctl *sac, void **uinfo);
static errno_t tcp_ccdbg_control_disconnect(kern_ctl_ref kctl,
    u_int32_t unit, void *uinfo);
/* Placeholder algorithm with every callback NULL (index TCP_CC_ALGO_NONE) */
static struct tcp_cc_algo tcp_cc_algo_none;
129 | /* |
130 | * Initialize TCP congestion control algorithms. |
131 | */ |
132 | |
133 | void |
134 | tcp_cc_init(void) |
135 | { |
136 | bzero(&tcp_cc_algo_list, sizeof(tcp_cc_algo_list)); |
137 | bzero(&tcp_cc_algo_none, sizeof(tcp_cc_algo_none)); |
138 | |
139 | tcp_cc_algo_list[TCP_CC_ALGO_NONE] = &tcp_cc_algo_none; |
140 | tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno; |
141 | tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat; |
142 | tcp_cc_algo_list[TCP_CC_ALGO_CUBIC_INDEX] = &tcp_cc_cubic; |
143 | |
144 | tcp_cc_control_register(); |
145 | } |
146 | |
147 | static void |
148 | tcp_cc_control_register(void) |
149 | { |
150 | struct kern_ctl_reg ccdbg_control; |
151 | errno_t err; |
152 | |
153 | bzero(&ccdbg_control, sizeof(ccdbg_control)); |
154 | strlcpy(ccdbg_control.ctl_name, TCP_CCDEBUG_CONTROL_NAME, |
155 | sizeof(ccdbg_control.ctl_name)); |
156 | ccdbg_control.ctl_connect = tcp_ccdbg_control_connect; |
157 | ccdbg_control.ctl_disconnect = tcp_ccdbg_control_disconnect; |
158 | ccdbg_control.ctl_flags |= CTL_FLAG_PRIVILEGED; |
159 | ccdbg_control.ctl_flags |= CTL_FLAG_REG_SOCK_STREAM; |
160 | |
161 | err = ctl_register(&ccdbg_control, &tcp_ccdbg_ctlref); |
162 | if (err != 0) { |
163 | log(LOG_ERR, "failed to register tcp_cc debug control" ); |
164 | } |
165 | } |
166 | |
167 | /* Allow only one socket to connect at any time for debugging */ |
168 | static errno_t |
169 | tcp_ccdbg_control_connect(kern_ctl_ref kctl, struct sockaddr_ctl *sac, |
170 | void **uinfo) |
171 | { |
172 | #pragma unused(kctl) |
173 | #pragma unused(uinfo) |
174 | |
175 | UInt32 old_value = TCP_CCDBG_NOUNIT; |
176 | UInt32 new_value = sac->sc_unit; |
177 | |
178 | if (tcp_ccdbg_unit != old_value) |
179 | return (EALREADY); |
180 | |
181 | if (OSCompareAndSwap(old_value, new_value, &tcp_ccdbg_unit)) |
182 | return (0); |
183 | else |
184 | return (EALREADY); |
185 | } |
186 | |
/*
 * Release the debug unit when the attached client disconnects, returning
 * tcp_ccdbg_unit to the TCP_CCDBG_NOUNIT sentinel.  Always returns 0.
 */
static errno_t
tcp_ccdbg_control_disconnect(kern_ctl_ref kctl, u_int32_t unit, void *uinfo)
{
#pragma unused(kctl, unit, uinfo)

	if (unit == tcp_ccdbg_unit) {
		UInt32 old_value = tcp_ccdbg_unit;
		UInt32 new_value = TCP_CCDBG_NOUNIT;
		/* Already released (e.g. a racing disconnect) — nothing to do. */
		if (tcp_ccdbg_unit == new_value)
			return (0);

		/* CAS fails only if the value changed since old_value was read. */
		if (!OSCompareAndSwap(old_value, new_value,
			&tcp_ccdbg_unit))
			log(LOG_DEBUG,
			    "failed to disconnect tcp_cc debug control");
	}
	return (0);
}
205 | |
/*
 * Record a congestion-control event for connection "tp".  When the
 * cc_debug sysctl is set, a tcp_cc_debug_state snapshot is enqueued on the
 * debug kernel control socket; a DTrace probe fires in all cases.
 */
inline void
tcp_ccdbg_trace(struct tcpcb *tp, struct tcphdr *th, int32_t event)
{
#if !CONFIG_DTRACE
#pragma unused(th)
#endif /* !CONFIG_DTRACE */
	struct inpcb *inp = tp->t_inpcb;

	/*
	 * NOTE(review): TCP_CCDBG_NOUNIT (0xffffffff) also satisfies
	 * "tcp_ccdbg_unit > 0", so with cc_debug set and no client attached
	 * this branch is still taken; presumably ctl_enqueuedata() then
	 * fails on the invalid unit — confirm.
	 */
	if (tcp_cc_debug && tcp_ccdbg_unit > 0) {
		struct tcp_cc_debug_state dbg_state;
		struct timespec tv;

		bzero(&dbg_state, sizeof(dbg_state));

		nanotime(&tv);
		/* Event timestamp in nanoseconds */
		dbg_state.ccd_tsns = (tv.tv_sec * 1000000000) + tv.tv_nsec;
		/* Local and foreign endpoints, converted to text form */
		inet_ntop(SOCK_DOM(inp->inp_socket),
		    ((SOCK_DOM(inp->inp_socket) == PF_INET) ?
		    (void *)&inp->inp_laddr.s_addr :
		    (void *)&inp->in6p_laddr), dbg_state.ccd_srcaddr,
		    sizeof(dbg_state.ccd_srcaddr));
		dbg_state.ccd_srcport = ntohs(inp->inp_lport);
		inet_ntop(SOCK_DOM(inp->inp_socket),
		    ((SOCK_DOM(inp->inp_socket) == PF_INET) ?
		    (void *)&inp->inp_faddr.s_addr :
		    (void *)&inp->in6p_faddr), dbg_state.ccd_destaddr,
		    sizeof(dbg_state.ccd_destaddr));
		dbg_state.ccd_destport = ntohs(inp->inp_fport);

		/* Core window/RTT state copied straight from the tcpcb */
		dbg_state.ccd_snd_cwnd = tp->snd_cwnd;
		dbg_state.ccd_snd_wnd = tp->snd_wnd;
		dbg_state.ccd_snd_ssthresh = tp->snd_ssthresh;
		dbg_state.ccd_pipeack = tp->t_pipeack;
		dbg_state.ccd_rttcur = tp->t_rttcur;
		dbg_state.ccd_rxtcur = tp->t_rxtcur;
		dbg_state.ccd_srtt = tp->t_srtt >> TCP_RTT_SHIFT;
		dbg_state.ccd_event = event;
		dbg_state.ccd_sndcc = inp->inp_socket->so_snd.sb_cc;
		dbg_state.ccd_sndhiwat = inp->inp_socket->so_snd.sb_hiwat;
		dbg_state.ccd_bytes_acked = tp->t_bytes_acked;
		dbg_state.ccd_cc_index = tp->tcp_cc_index;
		/* Algorithm-specific extra state for the active CC module */
		switch (tp->tcp_cc_index) {
		case TCP_CC_ALGO_CUBIC_INDEX:
			dbg_state.u.cubic_state.ccd_last_max =
			    tp->t_ccstate->cub_last_max;
			dbg_state.u.cubic_state.ccd_tcp_win =
			    tp->t_ccstate->cub_tcp_win;
			dbg_state.u.cubic_state.ccd_target_win =
			    tp->t_ccstate->cub_target_win;
			dbg_state.u.cubic_state.ccd_avg_lastmax =
			    tp->t_ccstate->cub_avg_lastmax;
			dbg_state.u.cubic_state.ccd_mean_deviation =
			    tp->t_ccstate->cub_mean_dev;
			break;
		case TCP_CC_ALGO_BACKGROUND_INDEX:
			dbg_state.u.ledbat_state.led_base_rtt =
			    get_base_rtt(tp);
			break;
		default:
			break;
		}

		/* Best effort: the enqueue return value is deliberately ignored */
		ctl_enqueuedata(tcp_ccdbg_ctlref, tcp_ccdbg_unit,
		    &dbg_state, sizeof(dbg_state), 0);
	}
	DTRACE_TCP5(cc, void, NULL, struct inpcb *, inp,
	    struct tcpcb *, tp, struct tcphdr *, th, int32_t, event);
}
275 | |
276 | void tcp_cc_resize_sndbuf(struct tcpcb *tp) |
277 | { |
278 | struct sockbuf *sb; |
279 | /* |
280 | * If the send socket buffer size is bigger than ssthresh, |
281 | * it is time to trim it because we do not want to hold |
282 | * too many mbufs in the socket buffer |
283 | */ |
284 | sb = &tp->t_inpcb->inp_socket->so_snd; |
285 | if (sb->sb_hiwat > tp->snd_ssthresh && |
286 | (sb->sb_flags & SB_AUTOSIZE)) { |
287 | if (sb->sb_idealsize > tp->snd_ssthresh) { |
288 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); |
289 | } |
290 | sb->sb_flags |= SB_TRIM; |
291 | } |
292 | } |
293 | |
294 | void tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp) |
295 | { |
296 | struct sockbuf *sb; |
297 | sb = &tp->t_inpcb->inp_socket->so_snd; |
298 | if ((sb->sb_flags & (SB_TRIM|SB_AUTOSIZE)) == (SB_TRIM|SB_AUTOSIZE)) { |
299 | /* |
300 | * If there was a retransmission that was not necessary |
301 | * then the size of socket buffer can be restored to |
302 | * what it was before |
303 | */ |
304 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); |
305 | if (sb->sb_hiwat <= sb->sb_idealsize) { |
306 | sbreserve(sb, sb->sb_idealsize); |
307 | sb->sb_flags &= ~SB_TRIM; |
308 | } |
309 | } |
310 | } |
311 | |
/*
 * Calculate initial cwnd according to RFC 3390.
 *
 * Keep the old ss_fltsz sysctl for ABI compatibility reasons,
 * but it will be overridden when the tcp_do_rfc3390 sysctl is set.
 */
318 | void |
319 | tcp_cc_cwnd_init_or_reset(struct tcpcb *tp) |
320 | { |
321 | if (tp->t_flags & TF_LOCAL) { |
322 | tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; |
323 | } else { |
324 | /* initial congestion window according to RFC 3390 */ |
325 | if (tcp_do_rfc3390) |
326 | tp->snd_cwnd = min(4 * tp->t_maxseg, |
327 | max(2 * tp->t_maxseg, TCP_CC_CWND_INIT_BYTES)); |
328 | else |
329 | tp->snd_cwnd = tp->t_maxseg * ss_fltsz; |
330 | } |
331 | } |
332 | |
/*
 * Indicate whether this ack should be delayed.
 * Here is the explanation for different settings of tcp_delack_enabled:
 * - when set to 1, the behavior is the same as when set to 2. We kept this
 *   for binary compatibility.
 * - when set to 2, will "ack every other packet"
 *	- if our last ack wasn't a 0-sized window.
 *	- if the peer hasn't sent us a TH_PUSH data packet (radar 3649245).
 *	  If TH_PUSH is set, take this as a clue that we need to ACK
 *	  with no delay. This helps higher level protocols who
 *	  won't send us more data even if the window is open
 *	  because their last "segment" hasn't been ACKed
 * - when set to 3, will do "streaming detection"
 *	- if we receive more than "maxseg_unacked" full packets
 *	  in the last 100 ms
 *	- if the connection is not in slow-start or idle or
 *	  loss/recovery states
 *	- if those criteria aren't met, it will ack every other packet.
 */
352 | int |
353 | tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th) |
354 | { |
355 | switch (tcp_delack_enabled) { |
356 | case 1: |
357 | case 2: |
358 | if ((tp->t_flags & TF_RXWIN0SENT) == 0 && |
359 | (th->th_flags & TH_PUSH) == 0 && |
360 | (tp->t_unacksegs == 1)) |
361 | return(1); |
362 | break; |
363 | case 3: |
364 | if ((tp->t_flags & TF_RXWIN0SENT) == 0 && |
365 | (th->th_flags & TH_PUSH) == 0 && |
366 | ((tp->t_unacksegs == 1) || |
367 | ((tp->t_flags & TF_STRETCHACK) != 0 && |
368 | tp->t_unacksegs < (maxseg_unacked)))) |
369 | return(1); |
370 | break; |
371 | } |
372 | return(0); |
373 | } |
374 | |
375 | void |
376 | tcp_cc_allocate_state(struct tcpcb *tp) |
377 | { |
378 | if (tp->tcp_cc_index == TCP_CC_ALGO_CUBIC_INDEX && |
379 | tp->t_ccstate == NULL) { |
380 | tp->t_ccstate = (struct tcp_ccstate *)zalloc(tcp_cc_zone); |
381 | |
382 | /* |
383 | * If we could not allocate memory for congestion control |
384 | * state, revert to using TCP NewReno as it does not |
385 | * require any state |
386 | */ |
387 | if (tp->t_ccstate == NULL) |
388 | tp->tcp_cc_index = TCP_CC_ALGO_NEWRENO_INDEX; |
389 | else |
390 | bzero(tp->t_ccstate, sizeof(*tp->t_ccstate)); |
391 | } |
392 | } |
393 | |
394 | /* |
395 | * If stretch ack was disabled automatically on long standing connections, |
396 | * re-evaluate the situation after 15 minutes to enable it. |
397 | */ |
398 | #define TCP_STRETCHACK_DISABLE_WIN (15 * 60 * TCP_RETRANSHZ) |
399 | void |
400 | tcp_cc_after_idle_stretchack(struct tcpcb *tp) |
401 | { |
402 | int32_t tdiff; |
403 | |
404 | if (!(tp->t_flagsext & TF_DISABLE_STRETCHACK)) |
405 | return; |
406 | |
407 | tdiff = timer_diff(tcp_now, 0, tp->rcv_nostrack_ts, 0); |
408 | if (tdiff < 0) |
409 | tdiff = -tdiff; |
410 | |
411 | if (tdiff > TCP_STRETCHACK_DISABLE_WIN) { |
412 | tp->t_flagsext &= ~TF_DISABLE_STRETCHACK; |
413 | tp->t_stretchack_delayed = 0; |
414 | |
415 | tcp_reset_stretch_ack(tp); |
416 | } |
417 | } |
418 | |
/*
 * Detect if the congestion window is non-validated according to
 * draft-ietf-tcpm-newcwv-07.
 */
423 | |
424 | inline uint32_t |
425 | tcp_cc_is_cwnd_nonvalidated(struct tcpcb *tp) |
426 | { |
427 | struct socket *so = tp->t_inpcb->inp_socket; |
428 | if (tp->t_pipeack == 0 || tcp_check_cwnd_nonvalidated == 0) { |
429 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
430 | return (0); |
431 | } |
432 | |
433 | /* |
434 | * The congestion window is validated if the number of bytes acked |
435 | * is more than half of the current window or if there is more |
436 | * data to send in the send socket buffer |
437 | */ |
438 | if (tp->t_pipeack >= (tp->snd_cwnd >> 1) || |
439 | (so != NULL && so->so_snd.sb_cc > tp->snd_cwnd)) |
440 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
441 | else |
442 | tp->t_flagsext |= TF_CWND_NONVALIDATED; |
443 | return (tp->t_flagsext & TF_CWND_NONVALIDATED); |
444 | } |
445 | |
446 | /* |
447 | * Adjust congestion window in response to congestion in non-validated |
448 | * phase. |
449 | */ |
450 | inline void |
451 | tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp) |
452 | { |
453 | tp->t_pipeack = tcp_get_max_pipeack(tp); |
454 | tcp_clear_pipeack_state(tp); |
455 | tp->snd_cwnd = (max(tp->t_pipeack, tp->t_lossflightsize) >> 1); |
456 | tp->snd_cwnd = max(tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES); |
457 | tp->snd_cwnd += tp->t_maxseg * tcprexmtthresh; |
458 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
459 | } |
460 | |
/*
 * Return the maximum of all the pipeack samples
 * (there are TCP_PIPEACK_SAMPLE_COUNT of them).
 */
466 | inline u_int32_t |
467 | tcp_get_max_pipeack(struct tcpcb *tp) |
468 | { |
469 | u_int32_t max_pipeack = 0; |
470 | max_pipeack = (tp->t_pipeack_sample[0] > tp->t_pipeack_sample[1]) ? |
471 | tp->t_pipeack_sample[0] : tp->t_pipeack_sample[1]; |
472 | max_pipeack = (tp->t_pipeack_sample[2] > max_pipeack) ? |
473 | tp->t_pipeack_sample[2] : max_pipeack; |
474 | |
475 | return (max_pipeack); |
476 | } |
477 | |
478 | inline void |
479 | tcp_clear_pipeack_state(struct tcpcb *tp) |
480 | { |
481 | bzero(tp->t_pipeack_sample, sizeof(tp->t_pipeack_sample)); |
482 | tp->t_pipeack_ind = 0; |
483 | tp->t_lossflightsize = 0; |
484 | } |
485 | |