1 | /* |
2 | * Copyright (c) 2013-2021 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include "tcp_includes.h" |
30 | |
31 | #include <sys/param.h> |
32 | #include <sys/kernel.h> |
33 | #include <sys/syslog.h> |
34 | #include <sys/kern_control.h> |
35 | #include <sys/domain.h> |
36 | |
37 | #include <netinet/in.h> |
38 | #include <mach/sdt.h> |
39 | #include <libkern/OSAtomic.h> |
40 | |
41 | #include <libkern/OSTypes.h> |
42 | |
43 | extern struct tcp_cc_algo tcp_cc_newreno; |
44 | extern struct tcp_cc_algo tcp_cc_ledbat; |
45 | extern struct tcp_cc_algo tcp_cc_cubic; |
46 | |
/*
 * Clamp the send socket buffer's ideal size into the range
 * [tcp_sendspace, tcp_autosndbuf_max], based on the caller-supplied
 * target size (callers pass tp->snd_ssthresh).
 *
 * Fixed: the macro previously ignored its 'size' argument and read
 * tp->snd_ssthresh directly from the caller's scope; it now uses the
 * parameter (behavior unchanged for existing callers).  Wrapped in
 * do/while (0) so it expands safely as a single statement, and the
 * arguments are parenthesized.
 */
#define SET_SNDSB_IDEAL_SIZE(sndsb, size) do {                          \
	(sndsb)->sb_idealsize = min(max(tcp_sendspace, (size)),         \
	    tcp_autosndbuf_max);                                        \
} while (0)
50 | |
51 | /* Array containing pointers to currently implemented TCP CC algorithms */ |
52 | struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; |
53 | |
54 | static struct tcp_cc_algo tcp_cc_algo_none; |
55 | /* |
56 | * Initialize TCP congestion control algorithms. |
57 | */ |
58 | |
59 | void |
60 | tcp_cc_init(void) |
61 | { |
62 | bzero(s: &tcp_cc_algo_list, n: sizeof(tcp_cc_algo_list)); |
63 | bzero(s: &tcp_cc_algo_none, n: sizeof(tcp_cc_algo_none)); |
64 | |
65 | tcp_cc_algo_list[TCP_CC_ALGO_NONE] = &tcp_cc_algo_none; |
66 | tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno; |
67 | tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat; |
68 | tcp_cc_algo_list[TCP_CC_ALGO_CUBIC_INDEX] = &tcp_cc_cubic; |
69 | |
70 | tcp_ccdbg_control_register(); |
71 | } |
72 | |
73 | void |
74 | tcp_cc_resize_sndbuf(struct tcpcb *tp) |
75 | { |
76 | struct sockbuf *sb; |
77 | /* |
78 | * If the send socket buffer size is bigger than ssthresh, |
79 | * it is time to trim it because we do not want to hold |
80 | * too many mbufs in the socket buffer |
81 | */ |
82 | sb = &tp->t_inpcb->inp_socket->so_snd; |
83 | if (sb->sb_hiwat > tp->snd_ssthresh && |
84 | (sb->sb_flags & SB_AUTOSIZE)) { |
85 | if (sb->sb_idealsize > tp->snd_ssthresh) { |
86 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); |
87 | } |
88 | sb->sb_flags |= SB_TRIM; |
89 | } |
90 | } |
91 | |
92 | void |
93 | tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp) |
94 | { |
95 | struct sockbuf *sb; |
96 | sb = &tp->t_inpcb->inp_socket->so_snd; |
97 | if ((sb->sb_flags & (SB_TRIM | SB_AUTOSIZE)) == (SB_TRIM | SB_AUTOSIZE)) { |
98 | /* |
99 | * If there was a retransmission that was not necessary |
100 | * then the size of socket buffer can be restored to |
101 | * what it was before |
102 | */ |
103 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); |
104 | if (sb->sb_hiwat <= sb->sb_idealsize) { |
105 | sbreserve(sb, cc: sb->sb_idealsize); |
106 | sb->sb_flags &= ~SB_TRIM; |
107 | } |
108 | } |
109 | } |
110 | |
111 | /* |
112 | * Calculate initial cwnd according to RFC3390. |
113 | */ |
114 | void |
115 | tcp_cc_cwnd_init_or_reset(struct tcpcb *tp) |
116 | { |
117 | if (tp->t_flags & TF_LOCAL) { |
118 | tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; |
119 | } else { |
120 | if (tcp_cubic_minor_fixes) { |
121 | tp->snd_cwnd = tcp_initial_cwnd(tp); |
122 | } else { |
123 | /* initial congestion window according to RFC 3390 */ |
124 | tp->snd_cwnd = min(a: 4 * tp->t_maxseg, |
125 | b: max(a: 2 * tp->t_maxseg, TCP_CC_CWND_INIT_BYTES)); |
126 | } |
127 | } |
128 | } |
129 | |
130 | /* |
131 | * Indicate whether this ack should be delayed. |
132 | * Here is the explanation for different settings of tcp_delack_enabled: |
133 | * - when set to 1, the behavior is same as when set to 2. We kept this |
134 | * for binary compatibility. |
135 | * - when set to 2, will "ack every other packet" |
136 | * - if our last ack wasn't a 0-sized window. |
137 | * - if the peer hasn't sent us a TH_PUSH data packet (radar 3649245). |
138 | * If TH_PUSH is set, take this as a clue that we need to ACK |
139 | * with no delay. This helps higher level protocols who |
140 | * won't send us more data even if the window is open |
141 | * because their last "segment" hasn't been ACKed |
142 | * - when set to 3, will do "streaming detection" |
143 | * - if we receive more than "maxseg_unacked" full packets |
144 | * in the last 100ms |
145 | * - if the connection is not in slow-start or idle or |
146 | * loss/recovery states |
147 | * - if those criteria aren't met, it will ack every other packet. |
148 | */ |
/*
 * Decide whether the ACK for the segment 'th' on connection 'tp' may be
 * delayed.  Returns 1 to delay, 0 to ACK immediately.  Policy is selected
 * by the tcp_delack_enabled sysctl (see the block comment above).
 */
int
tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th)
{
	switch (tcp_delack_enabled) {
	case 1:         /* kept for binary compatibility; behaves like 2 */
	case 2:
		/*
		 * "Ack every other packet": delay only if our last
		 * announced window was non-zero, the peer did not set
		 * TH_PUSH, and exactly one segment is awaiting an ACK.
		 */
		if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
		    (th->th_flags & TH_PUSH) == 0 &&
		    (tp->t_unacksegs == 1)) {
			return 1;
		}
		break;
	case 3:
		if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) {
			/*
			 * Legacy streaming detection: also delay while a
			 * stretch-ACK burst is in progress, as long as fewer
			 * than maxseg_unacked full segments are pending.
			 */
			if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
			    (th->th_flags & TH_PUSH) == 0 &&
			    ((tp->t_unacksegs == 1) ||
			    ((tp->t_flags & TF_STRETCHACK) &&
			    tp->t_unacksegs < maxseg_unacked))) {
				return 1;
			}
		} else {
			uint32_t recwin;

			/* Get the receive-window we would announce */
			recwin = tcp_sbspace(tp);
			if (recwin > (uint32_t)(TCP_MAXWIN << tp->rcv_scale)) {
				recwin = (uint32_t)(TCP_MAXWIN << tp->rcv_scale);
			}

			/* Delay ACK, if:
			 *
			 * 1. We are not sending a zero-window
			 * 2. We are not forcing fast ACKs
			 * 3. We have more than the low-water mark in receive-buffer
			 * 4. The receive-window is not increasing
			 * 5. We have less than or equal of an MSS unacked or
			 *    Window actually has been growing larger than the initial value by half of it.
			 *    (this makes sure that during ramp-up we ACK every second MSS
			 *    until we pass the tcp_recvspace * 1.5-threshold)
			 * 6. We haven't waited for half a BDP
			 * 7. The amount of unacked data is less than the maximum ACK-burst (256 MSS)
			 *    We try to avoid having the sender end up hitting huge ACK-ranges.
			 *
			 * (a note on 6: The receive-window is
			 * roughly 2 BDP. Thus, recwin / 4 means half a BDP and
			 * thus we enforce an ACK roughly twice per RTT - even
			 * if the app does not read)
			 */
			if ((tp->t_flags & TF_RXWIN0SENT) == 0 &&
			    tp->t_forced_acks == 0 &&
			    tp->t_inpcb->inp_socket->so_rcv.sb_cc > tp->t_inpcb->inp_socket->so_rcv.sb_lowat &&
			    recwin <= tp->t_last_recwin &&
			    (tp->rcv_nxt - tp->last_ack_sent <= tp->t_maxseg ||
			    recwin > (uint32_t)(tcp_recvspace + (tcp_recvspace >> 1))) &&
			    (tp->rcv_nxt - tp->last_ack_sent) < (recwin >> 2) &&
			    (tp->rcv_nxt - tp->last_ack_sent) < 256 * tp->t_maxseg) {
				tp->t_stat.acks_delayed++;
				return 1;
			}
		}
		break;
	}
	/* Any other setting (including 0): never delay the ACK */
	return 0;
}
214 | |
215 | void |
216 | tcp_cc_allocate_state(struct tcpcb *tp) |
217 | { |
218 | if ((tp->tcp_cc_index == TCP_CC_ALGO_CUBIC_INDEX || |
219 | tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) && |
220 | tp->t_ccstate == NULL) { |
221 | tp->t_ccstate = &tp->_t_ccstate; |
222 | |
223 | bzero(s: tp->t_ccstate, n: sizeof(*tp->t_ccstate)); |
224 | } |
225 | } |
226 | |
227 | /* |
228 | * If stretch ack was disabled automatically on long standing connections, |
229 | * re-evaluate the situation after 15 minutes to enable it. |
230 | */ |
231 | #define TCP_STRETCHACK_DISABLE_WIN (15 * 60 * TCP_RETRANSHZ) |
232 | void |
233 | tcp_cc_after_idle_stretchack(struct tcpcb *tp) |
234 | { |
235 | struct tcp_globals *globals; |
236 | int32_t tdiff; |
237 | |
238 | if (!(tp->t_flagsext & TF_DISABLE_STRETCHACK)) { |
239 | return; |
240 | } |
241 | |
242 | globals = tcp_get_globals(tp); |
243 | tdiff = timer_diff(t1: tcp_globals_now(globals), toff1: 0, t2: tp->rcv_nostrack_ts, toff2: 0); |
244 | if (tdiff < 0) { |
245 | tdiff = -tdiff; |
246 | } |
247 | |
248 | if (tdiff > TCP_STRETCHACK_DISABLE_WIN) { |
249 | tp->t_flagsext &= ~TF_DISABLE_STRETCHACK; |
250 | tp->t_stretchack_delayed = 0; |
251 | |
252 | tcp_reset_stretch_ack(tp); |
253 | } |
254 | } |
255 | |
256 | /* |
257 | * Detect if the congestion window is non-validated according to |
258 | * draft-ietf-tcpm-newcwv-07 |
259 | */ |
260 | inline uint32_t |
261 | tcp_cc_is_cwnd_nonvalidated(struct tcpcb *tp) |
262 | { |
263 | struct socket *so = tp->t_inpcb->inp_socket; |
264 | |
265 | if (tp->t_pipeack == 0) { |
266 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
267 | return 0; |
268 | } |
269 | |
270 | /* |
271 | * The congestion window is validated if the number of bytes acked |
272 | * is more than half of the current window or if there is more |
273 | * data to send in the send socket buffer |
274 | */ |
275 | if (tp->t_pipeack >= (tp->snd_cwnd >> 1) || |
276 | (so != NULL && so->so_snd.sb_cc > tp->snd_cwnd)) { |
277 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
278 | } else { |
279 | tp->t_flagsext |= TF_CWND_NONVALIDATED; |
280 | } |
281 | |
282 | return tp->t_flagsext & TF_CWND_NONVALIDATED; |
283 | } |
284 | |
285 | /* |
286 | * Adjust congestion window in response to congestion in non-validated |
287 | * phase. |
288 | */ |
289 | inline void |
290 | tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp) |
291 | { |
292 | tp->t_pipeack = tcp_get_max_pipeack(tp); |
293 | tcp_clear_pipeack_state(tp); |
294 | tp->snd_cwnd = (max(a: tp->t_pipeack, b: tp->t_lossflightsize) >> 1); |
295 | if (tcp_cubic_minor_fixes) { |
296 | tp->snd_cwnd = max(a: tp->snd_cwnd, b: tp->t_maxseg); |
297 | } else { |
298 | tp->snd_cwnd = max(a: tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES); |
299 | } |
300 | tp->snd_cwnd += tp->t_maxseg * tcprexmtthresh; |
301 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
302 | } |
303 | |
304 | /* |
305 | * Return maximum of all the pipeack samples. Since the number of samples |
306 | * TCP_PIPEACK_SAMPLE_COUNT is 3 at this time, it will be simpler to do |
307 | * a comparision. We should change ths if the number of samples increases. |
308 | */ |
309 | inline uint32_t |
310 | tcp_get_max_pipeack(struct tcpcb *tp) |
311 | { |
312 | uint32_t max_pipeack = 0; |
313 | max_pipeack = (tp->t_pipeack_sample[0] > tp->t_pipeack_sample[1]) ? |
314 | tp->t_pipeack_sample[0] : tp->t_pipeack_sample[1]; |
315 | max_pipeack = (tp->t_pipeack_sample[2] > max_pipeack) ? |
316 | tp->t_pipeack_sample[2] : max_pipeack; |
317 | |
318 | return max_pipeack; |
319 | } |
320 | |
321 | inline void |
322 | tcp_clear_pipeack_state(struct tcpcb *tp) |
323 | { |
324 | bzero(s: tp->t_pipeack_sample, n: sizeof(tp->t_pipeack_sample)); |
325 | tp->t_pipeack_ind = 0; |
326 | tp->t_lossflightsize = 0; |
327 | } |
328 | |