/*
 * Copyright (c) 2013-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include "tcp_includes.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/syslog.h>

#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/ip_var.h>

static int tcp_cubic_init(struct tcpcb *tp);
static int tcp_cubic_cleanup(struct tcpcb *tp);
static void tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp);
static void tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_pre_fr(struct tcpcb *tp);
static void tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_after_timeout(struct tcpcb *tp);
static int tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th);
static void tcp_cubic_switch_cc(struct tcpcb *tp);
static uint32_t tcp_cubic_update(struct tcpcb *tp, uint32_t rtt);
static inline void tcp_cubic_clear_state(struct tcpcb *tp);

extern float cbrtf(float x);

struct tcp_cc_algo tcp_cc_cubic = {
	.name = "cubic",
	.init = tcp_cubic_init,
	.cleanup = tcp_cubic_cleanup,
	.cwnd_init = tcp_cubic_cwnd_init_or_reset,
	.congestion_avd = tcp_cubic_congestion_avd,
	.ack_rcvd = tcp_cubic_ack_rcvd,
	.pre_fr = tcp_cubic_pre_fr,
	.post_fr = tcp_cubic_post_fr,
	.after_idle = tcp_cubic_cwnd_init_or_reset,
	.after_timeout = tcp_cubic_after_timeout,
	.delay_ack = tcp_cubic_delay_ack,
	.switch_to = tcp_cubic_switch_cc
};

static float tcp_cubic_backoff = 0.2f; /* multiplicative decrease factor */
static float tcp_cubic_coeff = 0.4f;
static float tcp_cubic_fast_convergence_factor = 0.875f;

static float tcp_cubic_beta = 0.8f;
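
/*
 * Note: tcp_cubic_backoff is the multiplicative-decrease fraction, i.e.
 * 1 - tcp_cubic_beta.  When tcp_cubic_rfc_compliant is set (see
 * tcp_cubic_init() below), beta = 0.7 and the fast-convergence factor
 * (1 + beta) / 2 = 0.85 match RFC 8312; otherwise the historical, more
 * aggressive defaults above are kept.
 */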

static int
tcp_cubic_init(struct tcpcb *tp)
{
	os_atomic_inc(&tcp_cc_cubic.num_sockets, relaxed);

	if (tcp_cubic_rfc_compliant) {
		tcp_cubic_backoff = 0.3f; /* multiplicative decrease factor */
		tcp_cubic_fast_convergence_factor = 0.85f;
		tcp_cubic_beta = 0.7f;
	} else {
		tcp_cubic_backoff = 0.2f; /* multiplicative decrease factor */
		tcp_cubic_fast_convergence_factor = 0.875f;
		tcp_cubic_beta = 0.8f;
	}

	VERIFY(tp->t_ccstate != NULL);
	tcp_cubic_clear_state(tp);
	return 0;
}

static int
tcp_cubic_cleanup(struct tcpcb *tp)
{
#pragma unused(tp)
	os_atomic_dec(&tcp_cc_cubic.num_sockets, relaxed);
	return 0;
}

/*
 * Initialize the congestion window at the beginning of a connection or
 * after idle time
 */
static void
tcp_cubic_cwnd_init_or_reset(struct tcpcb *tp)
{
	VERIFY(tp->t_ccstate != NULL);

	tcp_cubic_clear_state(tp);
	tcp_cc_cwnd_init_or_reset(tp);
	tp->t_pipeack = 0;
	tcp_clear_pipeack_state(tp);

	/* Start counting bytes for RFC 3465 again */
	tp->t_bytes_acked = 0;

	/*
	 * The slow-start threshold could get initialized to a lower value
	 * when there is a cached value in the route metrics. In that case,
	 * the connection can enter congestion avoidance without any packet
	 * loss and Cubic will enter steady-state too early. It is better
	 * to always probe to find the initial slow-start threshold.
	 */
	if (tp->t_inpcb->inp_stat->txbytes <= tcp_initial_cwnd(tp) &&
	    tp->snd_ssthresh < (TCP_MAXWIN << TCP_MAX_WINSHIFT)) {
		tp->snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT;
	}

	/* Initialize cubic last max to be same as ssthresh */
	tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
}

/*
 * Compute the target congestion window for the next RTT according to
 * the cubic equation when an ack is received.
 *
 * W(t) = C(t-K)^3 + W(last_max)
 */
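/*
 * In the implementation below, C is tcp_cubic_coeff, W(last_max) is
 * cub_last_max and K is the time at which the cubic curve returns to the
 * window where the last loss occurred:
 *
 *	K = cbrtf((cub_last_max - cwnd) / t_maxseg / tcp_cubic_coeff)
 *
 * K is computed in seconds and stored scaled to TCP_RETRANSHZ ticks as
 * cub_epoch_period.
 */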
static uint32_t
tcp_cubic_update(struct tcpcb *tp, uint32_t rtt)
{
	struct tcp_globals *globals = tcp_get_globals(tp);
	float K, var;
	uint32_t elapsed_time, win;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_ccstate->cub_last_max == 0) {
		tp->t_ccstate->cub_last_max = tp->snd_ssthresh;
	}

	if (tp->t_ccstate->cub_epoch_start == 0) {
		/*
		 * This is the beginning of a new epoch, initialize some of
		 * the variables that we need to use for computing the
		 * congestion window later.
		 */
		tp->t_ccstate->cub_epoch_start = tcp_globals_now(globals);
		if (tp->t_ccstate->cub_epoch_start == 0) {
			tp->t_ccstate->cub_epoch_start = 1;
		}
		if (win < tp->t_ccstate->cub_last_max) {
			/*
			 * Compute the cubic epoch period; this is the time
			 * period that the window will take to increase to
			 * last_max again after backing off due to loss.
			 */
			if (tcp_cubic_minor_fixes) {
				K = ((float)tp->t_ccstate->cub_last_max - win) / tp->t_maxseg / tcp_cubic_coeff;
			} else {
				K = (tp->t_ccstate->cub_last_max - win) / tp->t_maxseg / tcp_cubic_coeff;
			}
			K = cbrtf(K);
			tp->t_ccstate->cub_epoch_period = K * TCP_RETRANSHZ;
			/* Origin point */
			tp->t_ccstate->cub_origin_point = tp->t_ccstate->cub_last_max;
		} else {
			tp->t_ccstate->cub_epoch_period = 0;
			tp->t_ccstate->cub_origin_point = win;
		}
	}

	VERIFY(tp->t_ccstate->cub_origin_point > 0);
	/*
	 * Compute the target window for the next RTT using smoothed RTT
	 * as an estimate for next RTT.
	 */
	elapsed_time = timer_diff(tcp_globals_now(globals), 0, tp->t_ccstate->cub_epoch_start, 0);

	if (tcp_cubic_use_minrtt) {
		elapsed_time += max(tcp_cubic_use_minrtt, rtt);
	} else {
		elapsed_time += rtt;
	}
	var = (elapsed_time - tp->t_ccstate->cub_epoch_period) / TCP_RETRANSHZ;
	var = var * var * var * (tcp_cubic_coeff * tp->t_maxseg);

	return (uint32_t)(tp->t_ccstate->cub_origin_point + var);
}
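
/*
 * Illustrative example (hypothetical numbers): with tcp_cubic_coeff = 0.4
 * and a window that is 100 segments below cub_last_max,
 * K = cbrtf(100 / 0.4) ~= 6.3, so cub_epoch_period is roughly
 * 6.3 * TCP_RETRANSHZ ticks before the cubic curve crosses the previous
 * maximum again.
 */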

/*
 * Standard TCP utilizes bandwidth well in low RTT and low BDP connections
 * even when there is some packet loss. Enabling TCP mode will help Cubic
 * to achieve this kind of utilization.
 *
 * But if there is a bottleneck link in the path with a fixed size queue
 * and fixed bandwidth, TCP Cubic will help to reduce packet loss at this
 * link because of the steady-state behavior. Using average and mean
 * absolute deviation of W(lastmax), we try to detect if the congestion
 * window is close to the bottleneck bandwidth. In that case, disabling
 * TCP mode will help to minimize packet loss at this link.
 *
 * Disable TCP mode if the W(lastmax) (the window where previous packet
 * loss happened) is within a small range from the average last max
 * calculated.
 */
#define TCP_CUBIC_ENABLE_TCPMODE(_tp_) \
	((!soissrcrealtime((_tp_)->t_inpcb->inp_socket) && \
	(_tp_)->t_ccstate->cub_mean_dev > ((_tp_)->t_maxseg << 1)) ? 1 : 0)
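
/*
 * In other words, TCP-friendly mode stays enabled only while the socket is
 * not a realtime source and the recent W(lastmax) samples are noisy (mean
 * absolute deviation above 2 * MSS); a stable W(lastmax) suggests a fixed
 * bottleneck, so TCP mode is turned off there to limit packet loss.
 */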

/*
 * Compute the window growth if standard TCP (AIMD) was used with
 * a backoff of 0.5 and additive increase of 1 packet per RTT.
 *
 * TCP window at time t can be calculated using the following equation
 * with tcp_beta_cubic:
 *
 * W(t) <- Wmax * tcp_beta_cubic + 3 * ((1 - tcp_beta_cubic)/(1 + tcp_beta_cubic)) * t/RTT
 *
 */
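/*
 * With the non-RFC-compliant beta of 0.8 this additive-increase term works
 * out to 3 * (0.2 / 1.8) = 1/3 MSS per RTT; with the RFC 8312 beta of 0.7
 * it is 3 * (0.3 / 1.7) ~= 0.53 MSS per RTT (see ai_factor below).
 */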
static uint32_t
tcp_cubic_tcpwin(struct tcpcb *tp, struct tcphdr *th)
{
	if (tp->t_ccstate->cub_tcp_win == 0) {
		/*
		 * Start of the epoch, we set the tcp_win to whatever Cubic
		 * decided at the beginning of the epoch.
		 */
		tp->t_ccstate->cub_tcp_win = min(tp->snd_cwnd, tp->snd_wnd);
		if (tcp_cubic_minor_fixes) {
			tp->t_ccstate->cub_tcp_bytes_acked = BYTES_ACKED(th, tp);
		} else {
			tp->t_ccstate->cub_tcp_bytes_acked = 0;
		}
	} else {
		tp->t_ccstate->cub_tcp_bytes_acked += BYTES_ACKED(th, tp);

		if (tcp_cubic_minor_fixes) {
			/*
			 * Increase by ai_factor * MSS, once per RTT. Counting bytes_acked
			 * against the snd_cwnd represents exactly one RTT at full rate.
			 */
			while (tp->t_ccstate->cub_tcp_bytes_acked >= tp->snd_cwnd) {
				/* Enough bytes have been ACK'd for TCP to do AIMD */
				tp->t_ccstate->cub_tcp_bytes_acked -= tp->snd_cwnd;

				if (tp->snd_cwnd >= tp->t_ccstate->cub_last_max || !tcp_cubic_rfc_compliant) {
					tp->t_ccstate->cub_tcp_win += tp->t_maxseg;
				} else {
					/* Increase-rate from Section 4.2, RFC 8312 */
					float ai_factor = (float)3 * (1 - tcp_cubic_beta) / (1 + tcp_cubic_beta);

					tp->t_ccstate->cub_tcp_win += (uint32_t)(tp->t_maxseg * ai_factor);
				}
			}
		} else {
			if (tp->t_ccstate->cub_tcp_bytes_acked >= tp->t_ccstate->cub_tcp_win) {
				tp->t_ccstate->cub_tcp_bytes_acked -= tp->t_ccstate->cub_tcp_win;
				tp->t_ccstate->cub_tcp_win += tp->t_maxseg;
			}
		}
	}
	return tp->t_ccstate->cub_tcp_win;
}

/*
 * Handle an in-sequence ack during congestion avoidance phase.
 */
static void
tcp_cubic_congestion_avd(struct tcpcb *tp, struct tcphdr *th)
{
	uint32_t cubic_target_win, tcp_win, rtt;
	uint64_t incr_win = UINT32_MAX;

	/* Do not increase congestion window in non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
		return;
	}

	tp->t_bytes_acked += BYTES_ACKED(th, tp);

	rtt = get_base_rtt(tp);
	/*
	 * First compute cubic window. If cubic variables are not
	 * initialized (after coming out of recovery), this call will
	 * initialize them.
	 */
	cubic_target_win = tcp_cubic_update(tp, rtt);

	/* Compute TCP window if a multiplicative decrease of 0.2 is used */
	tcp_win = tcp_cubic_tcpwin(tp, th);

	if (tp->snd_cwnd < tcp_win && tcp_cubic_minor_fixes == 0 && TCP_CUBIC_ENABLE_TCPMODE(tp)) {
		/* this connection is in TCP-friendly region */
		if (tp->t_bytes_acked >= tp->snd_cwnd) {
			tp->t_bytes_acked -= tp->snd_cwnd;
			tp->snd_cwnd = min(tcp_win, TCP_MAXWIN << tp->snd_scale);
		}
	} else {
		if (cubic_target_win > tp->snd_cwnd) {
			/*
			 * The target win is computed for the next RTT.
			 * To reach this value, cwnd will have to be updated
			 * one segment at a time. Compute how many bytes
			 * need to be acknowledged before we can increase
			 * the cwnd by one segment.
			 */
			incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg;
			incr_win /= (cubic_target_win - tp->snd_cwnd);
			if (!tcp_cubic_minor_fixes) {
				if (incr_win > 0 && tp->t_bytes_acked >= incr_win) {
					tp->t_bytes_acked -= incr_win;
					tp->snd_cwnd =
					    min((tp->snd_cwnd + tp->t_maxseg),
					    TCP_MAXWIN << tp->snd_scale);
				}
			}
		}
	}
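
	/*
	 * Illustrative example (hypothetical numbers): if cubic_target_win is
	 * 10% above snd_cwnd, then incr_win = snd_cwnd * t_maxseg /
	 * (cubic_target_win - snd_cwnd) = 10 * t_maxseg, so cwnd grows by one
	 * MSS per 10 MSS worth of ACKed data and reaches the target after
	 * roughly one cwnd of acknowledgements.
	 */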

	if (tcp_cubic_minor_fixes) {
		tcp_win = tcp_round_to(tcp_win, tp->t_maxseg);

		if (tp->snd_cwnd < tcp_win) {
			uint64_t tcp_incr_win;

			tcp_incr_win = (uint64_t)tp->snd_cwnd * tp->t_maxseg;
			tcp_incr_win /= (tcp_win - tp->snd_cwnd);

			if (tcp_incr_win < incr_win) {
				/* this connection is in TCP-friendly region */
				incr_win = tcp_incr_win;
			}
		}

		if (incr_win > 0 && tp->t_bytes_acked >= incr_win) {
			tp->t_bytes_acked -= incr_win;
			tp->snd_cwnd = min(tp->snd_cwnd + tp->t_maxseg, TCP_MAXWIN << tp->snd_scale);
		}
	}
}

static void
tcp_cubic_ack_rcvd(struct tcpcb *tp, struct tcphdr *th)
{
	/* Do not increase the congestion window in non-validated phase */
	if (tcp_cc_is_cwnd_nonvalidated(tp) != 0) {
		return;
	}

	if (tp->snd_cwnd >= tp->snd_ssthresh) {
		/* Congestion avoidance phase */
		tcp_cubic_congestion_avd(tp, th);
	} else {
		/*
		 * Use 2*SMSS as limit on increment as suggested
		 * by RFC 3465 section 2.3
		 */
		uint32_t acked, abc_lim, incr;

		acked = BYTES_ACKED(th, tp);
		if (tcp_cubic_minor_fixes) {
			/*
			 * Maximum burst-size is limited to the initial congestion-window.
			 * We know that the network can survive this kind of burst.
			 */
			abc_lim = tcp_initial_cwnd(tp);
		} else {
			abc_lim = (tp->snd_nxt == tp->snd_max) ? 2 * tp->t_maxseg : tp->t_maxseg;
		}
		incr = min(acked, abc_lim);

		tp->snd_cwnd += incr;
		tp->snd_cwnd = min(tp->snd_cwnd, TCP_MAXWIN << tp->snd_scale);
	}
}

static void
tcp_cubic_pre_fr(struct tcpcb *tp)
{
	uint32_t win, avg;
	int32_t dev;

	tp->t_ccstate->cub_epoch_start = 0;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;

	win = min(tp->snd_cwnd, tp->snd_wnd);
	if (tp->t_flagsext & TF_CWND_NONVALIDATED) {
		tp->t_lossflightsize = tp->snd_max - tp->snd_una;
		if (tcp_flow_control_response) {
			win = max(tp->t_pipeack, tp->t_lossflightsize);
		} else {
			win = (max(tp->t_pipeack, tp->t_lossflightsize)) >> 1;
		}
	} else {
		tp->t_lossflightsize = 0;
	}
	/*
	 * Note the congestion window at which packet loss occurred as
	 * cub_last_max.
	 *
	 * If the congestion window is less than the last max window when
	 * loss occurred, it indicates that capacity available in the
	 * network has gone down. This can happen if a new flow has started
	 * and it is capturing some of the bandwidth. To reach convergence
	 * quickly, back off a little more.
	 */
	if (win < tp->t_ccstate->cub_last_max && tcp_cubic_minor_fixes) {
		tp->t_ccstate->cub_last_max = (uint32_t)((float)win * tcp_cubic_fast_convergence_factor);
	} else {
		tp->t_ccstate->cub_last_max = win;
	}

	if (tp->t_ccstate->cub_last_max == 0) {
		/*
		 * If last_max is zero because snd_wnd is zero or for
		 * any other reason, initialize it to the amount of data
		 * in flight
		 */
		tp->t_ccstate->cub_last_max = tp->snd_max - tp->snd_una;
	}

	/*
	 * Compute average and mean absolute deviation of the
	 * window at which packet loss occurred.
	 */
	if (tp->t_ccstate->cub_avg_lastmax == 0) {
		tp->t_ccstate->cub_avg_lastmax = tp->t_ccstate->cub_last_max;
	} else {
		/*
		 * Average is computed by taking 63 parts of
		 * history and one part of the most recent value
		 */
		avg = tp->t_ccstate->cub_avg_lastmax;
		avg = (avg << 6) - avg;
		tp->t_ccstate->cub_avg_lastmax =
		    (avg + tp->t_ccstate->cub_last_max) >> 6;
	}
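
	/*
	 * Equivalently: cub_avg_lastmax = (63 * cub_avg_lastmax +
	 * cub_last_max) / 64, a fixed-point moving average with gain 1/64.
	 */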

	/* calculate deviation from average */
	dev = tp->t_ccstate->cub_avg_lastmax - tp->t_ccstate->cub_last_max;

	/* Take the absolute value */
	if (dev < 0) {
		dev = -dev;
	}

	if (tp->t_ccstate->cub_mean_dev == 0) {
		tp->t_ccstate->cub_mean_dev = dev;
	} else {
		dev = dev + ((tp->t_ccstate->cub_mean_dev << 4)
		    - tp->t_ccstate->cub_mean_dev);
		tp->t_ccstate->cub_mean_dev = dev >> 4;
	}
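
	/*
	 * Equivalently: cub_mean_dev = (15 * cub_mean_dev + |dev|) / 16, an
	 * exponentially weighted moving average of the absolute deviation.
	 */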

	/* Backoff congestion window by tcp_cubic_backoff factor */
	win = (uint32_t)(win - (win * tcp_cubic_backoff));
	win = tcp_round_to(win, tp->t_maxseg);
	if (win < 2 * tp->t_maxseg) {
		win = 2 * tp->t_maxseg;
	}
	tp->snd_ssthresh = win;
	tcp_cc_resize_sndbuf(tp);
}

static void
tcp_cubic_post_fr(struct tcpcb *tp, struct tcphdr *th)
{
	uint32_t flight_size = 0;
	uint32_t ack;

	if (th != NULL) {
		ack = th->th_ack;
	} else {
		ack = tp->snd_una;
	}

	if (SEQ_LEQ(ack, tp->snd_max) && (!tcp_cubic_minor_fixes || tcp_flow_control_response)) {
		flight_size = tp->snd_max - ack;
	} else if (tcp_cubic_minor_fixes) {
		/*
		 * Cubic Minor Fixes: snd_max - th_ack is a very bad estimate
		 * of the flight size. Either the app is sending at full speed and
		 * flight_size *is* snd_ssthresh, or the app is not sending at full
		 * speed and congestion-window validation would have kicked in earlier.
		 *
		 * Except that for the latter, snd_ssthresh is way too high.
		 * When we exit recovery we will burst a lot of data out...
		 *
		 * So, tcp_flow_control_response brings us back to the old behavior.
		 * Too many feature-flags...
		 */
		flight_size = tp->snd_ssthresh;
	}

	/*
	 * Cubic Minor Fixes: t_lossflightsize is always 0, because of
	 * EXIT_FASTRECOVERY. This here is basically dead code...
	 */
	if (SACK_ENABLED(tp) && tp->t_lossflightsize > 0 && !tcp_cubic_minor_fixes) {
		uint32_t total_rxt_size = 0, ncwnd;
		/*
		 * When SACK is enabled, the number of retransmitted bytes
		 * can be counted more accurately.
		 */
		total_rxt_size = tcp_rxtseg_total_size(tp);
		ncwnd = max(tp->t_pipeack, tp->t_lossflightsize);
		if (total_rxt_size <= ncwnd) {
			ncwnd = ncwnd - total_rxt_size;
		}

		/*
		 * To avoid sending a large burst at the end of recovery
		 * set a max limit on ncwnd
		 */
		ncwnd = min(ncwnd, (tp->t_maxseg << 6));
		ncwnd = ncwnd >> 1;
		flight_size = max(ncwnd, flight_size);
	}
	/*
	 * Complete ack. The current window was inflated for fast recovery.
	 * It has to be deflated post recovery.
	 *
	 * Window inflation should have left us with approximately
	 * snd_ssthresh outstanding data. If the flight size is zero or one
	 * segment, make the congestion window at least as big as 2 segments
	 * to avoid delayed acknowledgements. This is according to RFC 6582.
	 */
	if (flight_size < tp->snd_ssthresh) {
		tp->snd_cwnd = max(flight_size, tp->t_maxseg) + tp->t_maxseg;
	} else {
		tp->snd_cwnd = tp->snd_ssthresh;
	}

	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;
}

static void
tcp_cubic_after_timeout(struct tcpcb *tp)
{
	VERIFY(tp->t_ccstate != NULL);

	/*
	 * Avoid adjusting the congestion window due to SYN retransmissions.
	 * If more than one byte (the SYN) is outstanding, the window still
	 * needs to be adjusted.
	 */
	if (tp->t_state < TCPS_ESTABLISHED &&
	    ((int)(tp->snd_max - tp->snd_una) <= 1)) {
		return;
	}

	if (!IN_FASTRECOVERY(tp)) {
		tcp_cubic_clear_state(tp);
		tcp_cubic_pre_fr(tp);
	}

	/*
	 * Close the congestion window down to one segment as a retransmit
	 * timeout might indicate severe congestion.
	 */
	tp->snd_cwnd = tp->t_maxseg;
}

static int
tcp_cubic_delay_ack(struct tcpcb *tp, struct tcphdr *th)
{
	return tcp_cc_delay_ack(tp, th);
}

/*
 * When switching from a different CC it is better for Cubic to start
 * fresh. The state required for Cubic calculation might be stale and it
 * might not represent the current state of the network. If it starts as
 * a new connection it will probe and learn the existing network conditions.
 */
static void
tcp_cubic_switch_cc(struct tcpcb *tp)
{
	tcp_cubic_cwnd_init_or_reset(tp);

	os_atomic_inc(&tcp_cc_cubic.num_sockets, relaxed);
}

static inline void
tcp_cubic_clear_state(struct tcpcb *tp)
{
	tp->t_ccstate->cub_last_max = 0;
	tp->t_ccstate->cub_epoch_start = 0;
	tp->t_ccstate->cub_origin_point = 0;
	tp->t_ccstate->cub_tcp_win = 0;
	tp->t_ccstate->cub_tcp_bytes_acked = 0;
	tp->t_ccstate->cub_epoch_period = 0;
}