/*
 * Copyright (c) 2020-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include "tcp_includes.h"

/*
 * This file implements an LBE (less-than-best-effort) congestion control
 * algorithm that computes the receive window of a background transport,
 * using the same algorithm as LEDBAT++ (ledbat-plus-plus).
 */

#define GAIN_CONSTANT (16)
#define TCP_BASE_RTT_INTERVAL (60 * TCP_RETRANSHZ)

void tcp_rledbat_init(struct tcpcb *tp);
void tcp_rledbat_cleanup(struct tcpcb *tp);
void tcp_rledbat_rwnd_init(struct tcpcb *tp);
void tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t segment_len);
uint32_t tcp_rledbat_get_rlwin(struct tcpcb *tp);
void tcp_rledbat_after_idle(struct tcpcb *tp);
void tcp_rledbat_switch_to(struct tcpcb *tp);

struct tcp_rcv_cc_algo tcp_cc_rledbat = {
    .name = "rledbat",
    .init = tcp_rledbat_init,
    .cleanup = tcp_rledbat_cleanup,
    .rwnd_init = tcp_rledbat_rwnd_init,
    .data_rcvd = tcp_rledbat_data_rcvd,
    .get_rlwin = tcp_rledbat_get_rlwin,
    .after_idle = tcp_rledbat_after_idle,
    .switch_to = tcp_rledbat_switch_to,
};
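
/*
 * Rough sketch (an assumption for illustration, not defined in this file) of
 * how the receive-side CC callbacks above are expected to be driven by the
 * TCP input path:
 *
 *     tcp_cc_rledbat.init(tp);                      rledbat enabled on tp
 *     tcp_cc_rledbat.rwnd_init(tp);                 connection established
 *     tcp_cc_rledbat.data_rcvd(tp, th, &to, len);   per received data segment
 *     win = tcp_cc_rledbat.get_rlwin(tp);           when advertising a window
 *     tcp_cc_rledbat.after_idle(tp);                after an idle period
 *     tcp_cc_rledbat.cleanup(tp);                   connection teardown
 */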

static inline void
rledbat_clear_state(struct tcpcb *tp)
{
    tp->t_rlstate.num_slowdown_events = 0;
    tp->t_rlstate.slowdown_ts = 0;
    tp->t_rlstate.slowdown_begin = 0;
    tp->t_rlstate.rcvd_bytes = 0;
    tp->t_rlstate.md_rcvd_bytes = 0;
    tp->t_rlstate.drained_bytes = 0;
}

void
tcp_rledbat_init(struct tcpcb *tp)
{
    os_atomic_inc(&tcp_cc_rledbat.num_sockets, relaxed);
    rledbat_clear_state(tp);

    tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
    tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
}

void
tcp_rledbat_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
    os_atomic_dec(&tcp_cc_rledbat.num_sockets, relaxed);
}

/*
 * Initialize the receive window for a connection
 */
void
tcp_rledbat_rwnd_init(struct tcpcb *tp)
{
    tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;

    /* If the ssthresh hasn't been set, do it now */
    if (tp->t_rlstate.ssthresh == 0) {
        tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
    }
}

/*
 * Compute the gain denominator:
 * MIN(16, ceil(2 * TARGET / base_rtt))
 */
static uint32_t
rledbat_gain(uint32_t base_rtt)
{
    return MIN(GAIN_CONSTANT, tcp_ceil(2 * target_qdelay /
        (double)base_rtt));
}
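
/*
 * Worked example (hypothetical values, for illustration only): with
 * target_qdelay = 60 ms and base_rtt = 10 ms,
 *     gain = MIN(16, ceil(2 * 60 / 10)) = MIN(16, 12) = 12,
 * so the window grows by roughly one segment per 12 * MSS bytes received,
 * i.e. slow start ramps about 12x slower than the standard doubling.
 * When base_rtt >= 2 * target_qdelay the denominator bottoms out at 1
 * (standard slow-start pace); very small base RTTs are capped at 16.
 */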

/*
 * Congestion avoidance for ledbat++
 */
static void
rledbat_congestion_avd(struct tcpcb *tp, uint32_t segment_len,
    uint32_t base_rtt, uint32_t curr_rtt, uint32_t now)
{
    uint32_t update = 0;
    /*
     * Set the next slowdown time, i.e. 9 times the duration
     * of the previous slowdown, except for the initial slowdown.
     *
     * Updated: we now slow down once every 60s, based on our
     * base-RTT measurement interval.
     */
    if (tp->t_rlstate.slowdown_ts == 0) {
        uint32_t slowdown_duration = TCP_BASE_RTT_INTERVAL;
        if (tp->t_rlstate.num_slowdown_events > 0) {
            if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
                /*
                 * Special case for slowdowns (other than the initial one)
                 * where the window doesn't recover fully to the previous
                 * ssthresh
                 */
                slowdown_duration *= 2;
            }
        }
        tp->t_rlstate.slowdown_ts = now + slowdown_duration;

        /* Reset the start */
        tp->t_rlstate.slowdown_begin = 0;

        /* On exiting slow start due to higher qdelay, cap the ssthresh */
        if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
            tp->t_rlstate.ssthresh = tp->t_rlstate.win;
        }
    }
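
    /*
     * Illustrative schedule (for clarity, not an additional rule): the first
     * slowdown is armed for 60s (TCP_BASE_RTT_INTERVAL) from now.  When a
     * later slowdown is scheduled while the window still sits below the
     * previous ssthresh (it never recovered), the interval is doubled to
     * 120s.  A multiplicative decrease or a loss-triggered reduction defers
     * an already-armed slowdown by another 60s (see below), so slowdowns do
     * not pile on top of other reductions.
     */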

    if (curr_rtt <= base_rtt + (uint32_t)target_qdelay) {
        /* Additive increase */
        tp->t_rlstate.rcvd_bytes += segment_len;
        if (tp->t_rlstate.rcvd_bytes >= tp->t_rlstate.win) {
            update = tp->t_maxseg;
            tp->t_rlstate.rcvd_bytes -= tp->t_rlstate.win;
            /*
             * Move the background slow-start threshold to the current
             * window so that the next time (after some idle period),
             * we can attempt to slow-start up to here if there is
             * no increase in rtt
             */
            if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
                tp->t_rlstate.ssthresh = tp->t_rlstate.win;
            }
            tp->t_rlstate.win += update;
            tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win, tp->t_maxseg),
                TCP_MAXWIN << tp->rcv_scale);
        }
    } else {
        /*
         * If we are still within 1 RTT of the previous reduction
         * due to loss, do nothing
         */
        if (now < tp->t_rlstate.reduction_end) {
            return;
        }
        /*
         * Multiplicative decrease:
         *   W -= min(W * (qdelay/target - 1), W/2)          (per RTT)
         * Expressed per bytes received, this becomes
         *   W -= min((qdelay/target - 1), 1/2) * bytes_rcvd
         */
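        /*
         * Worked example (hypothetical numbers): with target_qdelay = 60 ms
         * and a measured qdelay of curr_rtt - base_rtt = 90 ms, the factor is
         *     min(90/60 - 1, 1/2) = min(0.5, 0.5) = 0.5,
         * so one window's worth of received bytes shrinks W by W/2.  A milder
         * qdelay of 72 ms gives min(0.2, 0.5) = 0.2, i.e. a 20% reduction per
         * window.  The decrease is applied at most once per window's worth of
         * bytes (md_rcvd_bytes), and W never drops below bg_ss_fltsz segments.
         */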
        uint32_t qdelay = curr_rtt > base_rtt ?
            (curr_rtt - base_rtt) : 0;

        tp->t_rlstate.md_rcvd_bytes += segment_len;
        if (tp->t_rlstate.md_rcvd_bytes >= tp->t_rlstate.win) {
            update = (uint32_t)(MIN(((double)qdelay / target_qdelay - 1), 0.5) *
                (double)tp->t_rlstate.win);
            tp->t_rlstate.md_rcvd_bytes -= tp->t_rlstate.win;
            tp->t_rlstate.win -= update;

            if (tp->t_rlstate.win < bg_ss_fltsz * tp->t_maxseg) {
                tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
            }

            tp->t_rlstate.win = tcp_round_to(tp->t_rlstate.win, tp->t_maxseg);
            /*
             * Lower the background slow-start threshold so that the
             * connection will stay in the congestion avoidance phase
             */
            if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
                tp->t_rlstate.ssthresh = tp->t_rlstate.win;
            }

            if (tp->t_rlstate.slowdown_ts != 0) {
                /* As the window has been reduced, defer the slowdown. */
                tp->t_rlstate.slowdown_ts = now + TCP_BASE_RTT_INTERVAL;
            }
        }
    }
}

/*
 * Update the window based on the ledbat++ algorithm
 */
void
tcp_rledbat_data_rcvd(struct tcpcb *tp, struct tcphdr *th,
    struct tcpopt *to, uint32_t segment_len)
{
    uint32_t update = 0;
    const uint32_t base_rtt = get_base_rtt(tp);
    const uint32_t curr_rtt = tcp_use_min_curr_rtt ? tp->curr_rtt_min :
        tp->t_rttcur;
    const uint32_t srtt = tp->rcv_srtt >> TCP_RTT_SHIFT;
    const uint32_t ss_target = (uint32_t)(3 * target_qdelay / 4);
    tp->t_rlstate.drained_bytes += segment_len;
    struct tcp_globals *globals = tcp_get_globals(tp);

    /*
     * Slowdown period - the first slowdown comes 2 RTTs after we exit
     * the initial slow start.  Subsequent slowdowns come after 9 times
     * the previous slowdown's duration.
     *
     * Updated: slowdown periods now occur once every 60s, unless they
     * are deferred.
     */
    if (tp->t_rlstate.slowdown_ts != 0 &&
        tcp_globals_now(globals) >= tp->t_rlstate.slowdown_ts) {
        if (tp->t_rlstate.slowdown_begin == 0) {
            tp->t_rlstate.slowdown_begin = tcp_globals_now(globals);
            tp->t_rlstate.num_slowdown_events++;
        }
        if (tcp_globals_now(globals) < tp->t_rlstate.slowdown_ts + (2 * srtt)) {
            /* Set rwnd to 2 packets and return */
            if (tp->t_rlstate.win > bg_ss_fltsz * tp->t_maxseg) {
                if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
                    tp->t_rlstate.ssthresh = tp->t_rlstate.win;
                }
                tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
                /* Reset total bytes received */
                tp->t_rlstate.rcvd_bytes = 0;
            }
            return;
        }
    }

    /*
     * Detect retransmissions first by checking whether the currently
     * received sequence is smaller than the largest seen so far while
     * its timestamp is at least as large.  Reduce the window based on
     * fast recovery only once per effective RTT.
     *
     * Note: as we are detecting retransmissions (not packet loss),
     * we give some leeway before the next window reduction.
     */
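    /*
     * Illustrative case (hypothetical numbers): if rcv_high is 20000 and a
     * segment arrives with th_seq = 10000, segment_len = 1000 and a TS value
     * no older than tsv_high, then SEQ_LT(11000, 20000) and TSTMP_GEQ() both
     * hold, so the segment is treated as a retransmission of already-covered
     * sequence space and the window is halved (at most once per 2 * srtt).
     */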
    if (SEQ_LT(th->th_seq + segment_len, tp->rcv_high) &&
        TSTMP_GEQ(to->to_tsval, tp->tsv_high)) {
        if (tcp_globals_now(globals) < tp->t_rlstate.reduction_end) {
            /* Still need to wait for the reduction period to elapse */
            return;
        }

        uint32_t win = tp->t_rlstate.win / 2;
        win = tcp_round_to(win, tp->t_maxseg);
        if (win < 2 * tp->t_maxseg) {
            win = 2 * tp->t_maxseg;
        }
        tp->t_rlstate.ssthresh = win;
        tp->t_rlstate.win = win;

        /* Reset the received bytes */
        tp->t_rlstate.rcvd_bytes = 0;
        tp->t_rlstate.md_rcvd_bytes = 0;

        /* Update the reduction end time */
        tp->t_rlstate.reduction_end = tcp_globals_now(globals) + 2 * srtt;

        if (tp->t_rlstate.slowdown_ts != 0) {
            /* As the window has been halved, defer the slowdown. */
            tp->t_rlstate.slowdown_ts = tcp_globals_now(globals) +
                TCP_BASE_RTT_INTERVAL;
        }
        return;
    }

    /* Now we can do slow start or congestion avoidance */
    if (curr_rtt == 0 || base_rtt == 0) {
        update = MIN(segment_len, TCP_CC_CWND_INIT_PKTS *
            tp->t_maxseg);
        tp->t_rlstate.win += update;
        tp->t_rlstate.win = min(tp->t_rlstate.win,
            TCP_MAXWIN << tp->rcv_scale);
    } else if (tp->t_rlstate.win < tp->t_rlstate.ssthresh &&
        ((tp->t_rlstate.num_slowdown_events > 0 &&
        curr_rtt <= (base_rtt + ((uint32_t)target_qdelay << 1))) ||
        curr_rtt <= (base_rtt + ss_target))) {
        /*
         * Modified slow start with a dynamic GAIN.
         * If the queuing delay is larger than 3/4 of the target
         * delay, exit slow start, but only during the initial slow start.
         * After the initial slow start, during CA, window growth
         * will be bounded by ssthresh.
         *
         * We enter slow start again only after a slowdown event,
         * and in that case we want to allow the window to grow.  The
         * check against target_qdelay is only a safety net in case
         * the queuing delay increases to more than twice the target.
         */
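        /*
         * Worked example (hypothetical values): with target_qdelay = 60 ms,
         * ss_target = 45 ms.  During the initial slow start we keep growing
         * while curr_rtt <= base_rtt + 45 ms; once the queuing delay exceeds
         * that, this branch is skipped and congestion avoidance caps ssthresh
         * at the current window.  After a slowdown event the bound relaxes to
         * base_rtt + 120 ms so the window can climb back toward ssthresh.
         * Growth itself is roughly one segment per rledbat_gain(base_rtt) *
         * MSS bytes received, capped at TCP_CC_CWND_INIT_PKTS segments per
         * step.
         */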
        tp->t_rlstate.rcvd_bytes += segment_len;
        uint32_t gain_factor = rledbat_gain(base_rtt);
        if (tp->t_rlstate.rcvd_bytes >= tp->t_maxseg * gain_factor) {
            update = MIN(tp->t_rlstate.rcvd_bytes / gain_factor,
                TCP_CC_CWND_INIT_PKTS * tp->t_maxseg);
            tp->t_rlstate.rcvd_bytes = 0;
            tp->t_rlstate.win += update;
            tp->t_rlstate.win = min(tcp_round_to(tp->t_rlstate.win, tp->t_maxseg),
                TCP_MAXWIN << tp->rcv_scale);
        }

        /* Reset the next slowdown timestamp */
        if (tp->t_rlstate.slowdown_ts != 0) {
            tp->t_rlstate.slowdown_ts = 0;
        }
    } else {
        /* Congestion avoidance */
        rledbat_congestion_avd(tp, segment_len, base_rtt, curr_rtt,
            tcp_globals_now(globals));
    }
}

uint32_t
tcp_rledbat_get_rlwin(struct tcpcb *tp)
{
    /*
     * The advertised window tracks rlwin: when rlwin has grown, or has
     * shrunk by less than the bytes drained from the receive buffer,
     * advertise it directly; otherwise shrink the advertised window by
     * at most the drained bytes at a time.
     */
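    /*
     * Worked example (hypothetical numbers): if win_ws was 64 KB, the
     * algorithm dropped win to 32 KB and only 4 KB were drained since the
     * last call, the advertised window becomes 60 KB now and then keeps
     * stepping down by the bytes drained on each subsequent call until it
     * reaches 32 KB.  This keeps the advertised window from shrinking
     * faster than the application consumes data.
     */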
    if (tp->t_rlstate.win > tp->t_rlstate.win_ws ||
        tp->t_rlstate.win_ws - tp->t_rlstate.win < tp->t_rlstate.drained_bytes) {
        tp->t_rlstate.win_ws = tp->t_rlstate.win;
    } else if (tp->t_rlstate.win < tp->t_rlstate.win_ws) {
        /*
         * rlwin is smaller, decrease the advertised window
         * only by the drained bytes at a time
         */
        tp->t_rlstate.win_ws = tp->t_rlstate.win_ws -
            tp->t_rlstate.drained_bytes;
    }
    tp->t_rlstate.drained_bytes = 0;
    /* Round up to the receive window scale */
    tp->t_rlstate.win_ws = tcp_round_up(tp->t_rlstate.win_ws, 1 << tp->rcv_scale);

    return tp->t_rlstate.win_ws;
}

/*
 * Function to handle connections that have been idle for
 * some time.  Slow start to get the ack "clock" running again.
 * Clear the base history after the idle time.
 */
void
tcp_rledbat_after_idle(struct tcpcb *tp)
{
    rledbat_clear_state(tp);
    /* Reset the rledbat window */
    tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
}

void
tcp_rledbat_switch_to(struct tcpcb *tp)
{
    rledbat_clear_state(tp);
    uint32_t recwin = 0;

    if (tp->t_rlstate.win == 0) {
        /*
         * Use half of the previous window; the algorithm
         * will quickly reduce the window further if there is
         * still high queuing delay.
         */
        int32_t win = tcp_sbspace(tp);
        if (win < 0) {
            win = 0;
        }

        recwin = MAX(win, (int)(tp->rcv_adv - tp->rcv_nxt));
        recwin = recwin / 2;
    } else {
        /*
         * Reduce the window by half from the previous value,
         * but it should be at least 64K
         */
        recwin = MAX(tp->t_rlstate.win / 2, TCP_MAXWIN);
    }

    recwin = tcp_round_to(recwin, tp->t_maxseg);
    if (recwin < bg_ss_fltsz * tp->t_maxseg) {
        recwin = bg_ss_fltsz * tp->t_maxseg;
    }
    tp->t_rlstate.win = recwin;

    /* ssthresh should be at most the initial value */
    if (tp->t_rlstate.ssthresh == 0) {
        tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
    } else {
        tp->t_rlstate.ssthresh = MIN(tp->t_rlstate.ssthresh,
            TCP_MAXWIN << TCP_MAX_WINSHIFT);
    }
}