tcp_rledbat.c source code [xnu/bsd/netinet/tcp_rledbat.c]

1	/*
2	* Copyright (c) 2020-2021 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29	#include "tcp_includes.h"
30
31	/*
32	* This file implements a LBE congestion control algorithm
33	* to compute the receive window of a background transport
34	* which uses same algorithm as ledbat-plus-plus.
35	*/
36
37	#define GAIN_CONSTANT (16)
38	#define TCP_BASE_RTT_INTERVAL (60 * TCP_RETRANSHZ)
39
40	void tcp_rledbat_init(struct tcpcb *tp);
41	void tcp_rledbat_cleanup(struct tcpcb *tp);
42	void tcp_rledbat_rwnd_init(struct tcpcb *tp);
43	void tcp_rledbat_data_rcvd(struct tcpcb tp, struct* tcphdr *th,
44	struct tcpopt *to, uint32_t segment_len);
45	uint32_t tcp_rledbat_get_rlwin(struct tcpcb *tp);
46	void tcp_rledbat_after_idle(struct tcpcb *tp);
47	void tcp_rledbat_switch_to(struct tcpcb *tp);
48
49	struct tcp_rcv_cc_algo tcp_cc_rledbat = {
50	.name = "rledbat",
51	.init = tcp_rledbat_init,
52	.cleanup = tcp_rledbat_cleanup,
53	.rwnd_init = tcp_rledbat_rwnd_init,
54	.data_rcvd = tcp_rledbat_data_rcvd,
55	.get_rlwin = tcp_rledbat_get_rlwin,
56	.after_idle = tcp_rledbat_after_idle,
57	.switch_to = tcp_rledbat_switch_to,
58	};
59
60	static inline void
61	rledbat_clear_state(struct tcpcb *tp)
62	{
63	tp->t_rlstate.num_slowdown_events = `0`;
64	tp->t_rlstate.slowdown_ts = `0`;
65	tp->t_rlstate.slowdown_begin = `0`;
66	tp->t_rlstate.rcvd_bytes = `0`;
67	tp->t_rlstate.md_rcvd_bytes = `0`;
68	tp->t_rlstate.drained_bytes = `0`;
69	}
70
71	void
72	tcp_rledbat_init(struct tcpcb *tp)
73	{
74	os_atomic_inc(&tcp_cc_rledbat.num_sockets, relaxed);
75	rledbat_clear_state(tp);
76
77	tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
78	tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
79	}
80
81	void
82	tcp_rledbat_cleanup(struct tcpcb *tp)
83	{
84	#pragma unused(tp)
85	os_atomic_dec(&tcp_cc_rledbat.num_sockets, relaxed);
86	}
87
88	/*
89	* Initialize the receive window for a connection
90	*/
91	void
92	tcp_rledbat_rwnd_init(struct tcpcb *tp)
93	{
94	tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
95
96	/ If the ssthresh hasn't been set, do it now /
97	if (tp->t_rlstate.ssthresh == `0`) {
98	tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
99	}
100	}
101
102	/*
103	* Compute the denominator
104	* MIN(16, ceil(2 * TARGET / base))
105	*/
106	static uint32_t
107	rledbat_gain(uint32_t base_rtt)
108	{
109	return MIN(GAIN_CONSTANT, tcp_ceil(`2` * target_qdelay /
110	(double)base_rtt));
111	}
112
113	/*
114	* Congestion avoidance for ledbat++
115	*/
116	static void
117	rledbat_congestion_avd(struct tcpcb *tp, uint32_t segment_len,
118	uint32_t base_rtt, uint32_t curr_rtt, uint32_t now)
119	{
120	uint32_t update = `0`;
121	/*
122	* Set the next slowdown time i.e. 9 times the duration
123	* of previous slowdown except the initial slowdown.
124	*
125	* Updated: we will slowdown once in 60s based on our
126	* base RTT interval.
127	*/
128	if (tp->t_rlstate.slowdown_ts == `0`) {
129	uint32_t slowdown_duration = TCP_BASE_RTT_INTERVAL;
130	if (tp->t_rlstate.num_slowdown_events > `0`) {
131	if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
132	/*
133	* Special case for slowdowns (other than initial)
134	* where cwnd doesn't recover fully to previous
135	* ssthresh
136	*/
137	slowdown_duration *= `2`;
138	}
139	}
140	tp->t_rlstate.slowdown_ts = now + slowdown_duration;
141
142	/ Reset the start /
143	tp->t_rlstate.slowdown_begin = `0`;
144
145	/ On exit slow start due to higher qdelay, cap the ssthresh /
146	if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
147	tp->t_rlstate.ssthresh = tp->t_rlstate.win;
148	}
149	}
150
151	if (curr_rtt <= base_rtt + (uint32_t)target_qdelay) {
152	/ Additive increase /
153	tp->t_rlstate.rcvd_bytes += segment_len;
154	if (tp->t_rlstate.rcvd_bytes >= tp->t_rlstate.win) {
155	update = tp->t_maxseg;
156	tp->t_rlstate.rcvd_bytes -= tp->t_rlstate.win;
157	/*
158	* Move background slow-start threshold to current
159	* congestion window so that the next time (after some idle
160	* period), we can attempt to do slow-start till here if there
161	* is no increase in rtt
162	*/
163	if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
164	tp->t_rlstate.ssthresh = tp->t_rlstate.win;
165	}
166	tp->t_rlstate.win += update;
167	tp->t_rlstate.win = min(a: tcp_round_to(val: tp->t_rlstate.win, round: tp->t_maxseg),
168	TCP_MAXWIN << tp->rcv_scale);
169	}
170	} else {
171	/*
172	* If we are still within 1 RTT of previous reduction
173	* due to loss, do nothing
174	*/
175	if (now < tp->t_rlstate.reduction_end) {
176	return;
177	}
178	/*
179	* Multiplicative decrease
180	* W -= min(W * (qdelay/target - 1), W/2) (per RTT)
181	* To calculate per bytes acked, it becomes
182	* W -= min((qdelay/target - 1), 1/2) * bytes_acked
183	*/
184	uint32_t qdelay = curr_rtt > base_rtt ?
185	(curr_rtt - base_rtt) : `0`;
186
187	tp->t_rlstate.md_rcvd_bytes += segment_len;
188	if (tp->t_rlstate.md_rcvd_bytes >= tp->t_rlstate.win) {
189	update = (uint32_t)(MIN(((double)qdelay / target_qdelay - `1`), `0.5`) *
190	(double)tp->t_rlstate.win);
191	tp->t_rlstate.md_rcvd_bytes -= tp->t_rlstate.win;
192	tp->t_rlstate.win -= update;
193
194	if (tp->t_rlstate.win < bg_ss_fltsz * tp->t_maxseg) {
195	tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
196	}
197
198	tp->t_rlstate.win = tcp_round_to(val: tp->t_rlstate.win, round: tp->t_maxseg);
199	/*
200	* Lower background slow-start threshold so that the connection
201	* will stay in congestion avoidance phase
202	*/
203	if (tp->t_rlstate.ssthresh > tp->t_rlstate.win) {
204	tp->t_rlstate.ssthresh = tp->t_rlstate.win;
205	}
206
207	if (tp->t_rlstate.slowdown_ts != `0`) {
208	/ As the window has been reduced, defer the slowdown. /
209	tp->t_rlstate.slowdown_ts = now + TCP_BASE_RTT_INTERVAL;
210	}
211	}
212	}
213	}
214
215	/*
216	* Update win based on ledbat++ algo
217	*/
218	void
219	tcp_rledbat_data_rcvd(struct tcpcb tp, struct* tcphdr *th,
220	struct tcpopt *to, uint32_t segment_len)
221	{
222	uint32_t update = `0`;
223	const uint32_t base_rtt = get_base_rtt(tp);
224	const uint32_t curr_rtt = tcp_use_min_curr_rtt ? tp->curr_rtt_min :
225	tp->t_rttcur;
226	const uint32_t srtt = tp->rcv_srtt >> TCP_RTT_SHIFT;
227	const uint32_t ss_target = (uint32_t)(`3` * target_qdelay / `4`);
228	tp->t_rlstate.drained_bytes += segment_len;
229	struct tcp_globals *globals = tcp_get_globals(tp);
230
231	/*
232	* Slowdown period - first slowdown
233	* is 2RTT after we exit initial slow start.
234	* Subsequent slowdowns are after 9 times the
235	* previous slow down durations.
236	*
237	* Updated: slowdown periods are once
238	* every 60s unless they are deferred.
239	*/
240	if (tp->t_rlstate.slowdown_ts != `0` &&
241	tcp_globals_now(globals) >= tp->t_rlstate.slowdown_ts) {
242	if (tp->t_rlstate.slowdown_begin == `0`) {
243	tp->t_rlstate.slowdown_begin = tcp_globals_now(globals);
244	tp->t_rlstate.num_slowdown_events++;
245	}
246	if (tcp_globals_now(globals) < tp->t_rlstate.slowdown_ts + (`2` * srtt)) {
247	// Set rwnd to 2 packets and return
248	if (tp->t_rlstate.win > bg_ss_fltsz * tp->t_maxseg) {
249	if (tp->t_rlstate.ssthresh < tp->t_rlstate.win) {
250	tp->t_rlstate.ssthresh = tp->t_rlstate.win;
251	}
252	tp->t_rlstate.win = bg_ss_fltsz * tp->t_maxseg;
253	/ Reset total bytes acked /
254	tp->t_rlstate.rcvd_bytes = `0`;
255	}
256	return;
257	}
258	}
259
260	/*
261	* Detect retransmissions first by checking if the current
262	* received sequence is smaller than largest and its
263	* timestamp is higher than the largest so far. Reduce
264	* win based on fast recovery only once per effective RTT.
265	*
266	* Note: As we are detecting retransmissions (not packet loss),
267	* we are giving some leeway for the next window reduction.
268	*/
269	if (SEQ_LT(th->th_seq + segment_len, tp->rcv_high) &&
270	TSTMP_GEQ(to->to_tsval, tp->tsv_high)) {
271	if (tcp_globals_now(globals) < tp->t_rlstate.reduction_end) {
272	/ still need to wait for reduction end to elapse /
273	return;
274	}
275
276	uint32_t win = tp->t_rlstate.win / `2`;
277	win = tcp_round_to(val: win, round: tp->t_maxseg);
278	if (win < `2` * tp->t_maxseg) {
279	win = `2` * tp->t_maxseg;
280	}
281	tp->t_rlstate.ssthresh = win;
282	tp->t_rlstate.win = win;
283
284	/ Reset the received bytes /
285	tp->t_rlstate.rcvd_bytes = `0`;
286	tp->t_rlstate.md_rcvd_bytes = `0`;
287
288	/ Update the reduction end time /
289	tp->t_rlstate.reduction_end = tcp_globals_now(globals) + `2` * srtt;
290
291	if (tp->t_rlstate.slowdown_ts != `0`) {
292	/ As the window has been halved, defer the slowdown. /
293	tp->t_rlstate.slowdown_ts = tcp_globals_now(globals) +
294	TCP_BASE_RTT_INTERVAL;
295	}
296	return;
297	}
298
299	/ Now we can do slow start or CA /
300	if (curr_rtt == `0` \|\| base_rtt == `0`) {
301	update = MIN(segment_len, TCP_CC_CWND_INIT_PKTS *
302	tp->t_maxseg);
303	tp->t_rlstate.win += update;
304	tp->t_rlstate.win = min(a: tp->t_rlstate.win,
305	TCP_MAXWIN << tp->rcv_scale);
306	} else if (tp->t_rlstate.win < tp->t_rlstate.ssthresh &&
307	((tp->t_rlstate.num_slowdown_events > `0` &&
308	curr_rtt <= (base_rtt + ((uint32_t)target_qdelay << `1`))) \|\|
309	curr_rtt <= (base_rtt + ss_target))) {
310	/*
311	* Modified slow start with a dynamic GAIN
312	* If the queuing delay is larger than 3/4 of the target
313	* delay, exit slow start, iff, it is the initial slow start.
314	* After the initial slow start, during CA, window growth
315	* will be bound by ssthresh.
316	*
317	* We enter slow start again only after a slowdown event
318	* and in that case, we want to allow the window to grow. The
319	* check for target_qdelay is only a safety net in case
320	* the queuing delay increases more than twice.
321	*/
322	tp->t_rlstate.rcvd_bytes += segment_len;
323	uint32_t gain_factor = rledbat_gain(base_rtt);
324	if (tp->t_rlstate.rcvd_bytes >= tp->t_maxseg * gain_factor) {
325	update = MIN(tp->t_rlstate.rcvd_bytes / gain_factor,
326	TCP_CC_CWND_INIT_PKTS * tp->t_maxseg);
327	tp->t_rlstate.rcvd_bytes = `0`;
328	tp->t_rlstate.win += update;
329	tp->t_rlstate.win = min(a: tcp_round_to(val: tp->t_rlstate.win, round: tp->t_maxseg),
330	TCP_MAXWIN << tp->rcv_scale);
331	}
332
333	/ Reset the next slowdown timestamp /
334	if (tp->t_rlstate.slowdown_ts != `0`) {
335	tp->t_rlstate.slowdown_ts = `0`;
336	}
337	} else {
338	/ Congestion avoidance /
339	rledbat_congestion_avd(tp, segment_len, base_rtt, curr_rtt, now: tcp_globals_now(globals));
340	}
341	}
342
343	uint32_t
344	tcp_rledbat_get_rlwin(struct tcpcb *tp)
345	{
346	/ rlwin is either greater or smaller by at most drained bytes /
347	if (tp->t_rlstate.win > tp->t_rlstate.win_ws \|\|
348	tp->t_rlstate.win_ws - tp->t_rlstate.win < tp->t_rlstate.drained_bytes) {
349	tp->t_rlstate.win_ws = tp->t_rlstate.win;
350	} else if (tp->t_rlstate.win < tp->t_rlstate.win_ws) {
351	/*
352	* rlwin is smaller, decrease the advertised window
353	* only by drained bytes at a time
354	*/
355	tp->t_rlstate.win_ws = tp->t_rlstate.win_ws -
356	tp->t_rlstate.drained_bytes;
357	}
358	tp->t_rlstate.drained_bytes = `0`;
359	/ Round up to the receive window scale /
360	tp->t_rlstate.win_ws = tcp_round_up(val: tp->t_rlstate.win_ws, base: `1` << tp->rcv_scale);
361
362	return tp->t_rlstate.win_ws;
363	}
364
365	/*
366	* Function to handle connections that have been idle for
367	* some time. Slow start to get ack "clock" running again.
368	* Clear base history after idle time.
369	*/
370	void
371	tcp_rledbat_after_idle(struct tcpcb *tp)
372	{
373	rledbat_clear_state(tp);
374	/ Reset the rledbat window /
375	tp->t_rlstate.win = tp->t_maxseg * bg_ss_fltsz;
376	}
377
378	void
379	tcp_rledbat_switch_to(struct tcpcb *tp)
380	{
381	rledbat_clear_state(tp);
382	uint32_t recwin = `0`;
383
384	if (tp->t_rlstate.win == `0`) {
385	/*
386	* Use half of previous window, the algorithm
387	* will quickly reduce the window if there is still
388	* high queueing delay.
389	*/
390	int32_t win = tcp_sbspace(tp);
391	if (win < `0`) {
392	win = `0`;
393	}
394
395	recwin = MAX(win, (int)(tp->rcv_adv - tp->rcv_nxt));
396	recwin = recwin / `2`;
397	} else {
398	/*
399	* Reduce the window by half from the previous value
400	* but it should be at least 64K
401	*/
402	recwin = MAX(tp->t_rlstate.win / `2`, TCP_MAXWIN);
403	}
404
405	recwin = tcp_round_to(val: recwin, round: tp->t_maxseg);
406	if (recwin < bg_ss_fltsz * tp->t_maxseg) {
407	recwin = bg_ss_fltsz * tp->t_maxseg;
408	}
409	tp->t_rlstate.win = recwin;
410
411	/ ssthresh should be at most the inital value /
412	if (tp->t_rlstate.ssthresh == `0`) {
413	tp->t_rlstate.ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
414	} else {
415	tp->t_rlstate.ssthresh = MIN(tp->t_rlstate.ssthresh,
416	TCP_MAXWIN << TCP_MAX_WINSHIFT);
417	}
418	}
419

Browse the source code of xnu/bsd/netinet/tcp_rledbat.c