| 1 | /* |
| 2 | * Copyright (c) 2013-2021 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | |
| 29 | #include "tcp_includes.h" |
| 30 | |
| 31 | #include <sys/param.h> |
| 32 | #include <sys/kernel.h> |
| 33 | #include <sys/syslog.h> |
| 34 | #include <sys/kern_control.h> |
| 35 | #include <sys/domain.h> |
| 36 | |
| 37 | #include <netinet/in.h> |
| 38 | #include <mach/sdt.h> |
| 39 | #include <libkern/OSAtomic.h> |
| 40 | |
| 41 | #include <libkern/OSTypes.h> |
| 42 | |
/* Congestion-control algorithm descriptors; each one is defined in another file. */
extern struct tcp_cc_algo tcp_cc_cubic;
extern struct tcp_cc_algo tcp_cc_ledbat;
extern struct tcp_cc_algo tcp_cc_newreno;
| 46 | |
/*
 * Set the ideal size of send socket buffer `sndsb` to `size`, raised to at
 * least tcp_sendspace and capped at tcp_autosndbuf_max.
 *
 * `size` is now actually used (the previous definition ignored it and
 * silently read `tp->snd_ssthresh` from the caller's scope), `sndsb` is
 * parenthesized, and the body is wrapped in do/while(0) so that
 * `SET_SNDSB_IDEAL_SIZE(sb, x);` is exactly one statement.
 */
#define SET_SNDSB_IDEAL_SIZE(sndsb, size) \
	do { \
	        (sndsb)->sb_idealsize = min(max(tcp_sendspace, (size)), \
	            tcp_autosndbuf_max); \
	} while (0)
| 50 | |
| 51 | /* Array containing pointers to currently implemented TCP CC algorithms */ |
| 52 | struct tcp_cc_algo* tcp_cc_algo_list[TCP_CC_ALGO_COUNT]; |
| 53 | |
| 54 | static struct tcp_cc_algo tcp_cc_algo_none; |
| 55 | /* |
| 56 | * Initialize TCP congestion control algorithms. |
| 57 | */ |
| 58 | |
| 59 | void |
| 60 | tcp_cc_init(void) |
| 61 | { |
| 62 | bzero(s: &tcp_cc_algo_list, n: sizeof(tcp_cc_algo_list)); |
| 63 | bzero(s: &tcp_cc_algo_none, n: sizeof(tcp_cc_algo_none)); |
| 64 | |
| 65 | tcp_cc_algo_list[TCP_CC_ALGO_NONE] = &tcp_cc_algo_none; |
| 66 | tcp_cc_algo_list[TCP_CC_ALGO_NEWRENO_INDEX] = &tcp_cc_newreno; |
| 67 | tcp_cc_algo_list[TCP_CC_ALGO_BACKGROUND_INDEX] = &tcp_cc_ledbat; |
| 68 | tcp_cc_algo_list[TCP_CC_ALGO_CUBIC_INDEX] = &tcp_cc_cubic; |
| 69 | |
| 70 | tcp_ccdbg_control_register(); |
| 71 | } |
| 72 | |
| 73 | void |
| 74 | tcp_cc_resize_sndbuf(struct tcpcb *tp) |
| 75 | { |
| 76 | struct sockbuf *sb; |
| 77 | /* |
| 78 | * If the send socket buffer size is bigger than ssthresh, |
| 79 | * it is time to trim it because we do not want to hold |
| 80 | * too many mbufs in the socket buffer |
| 81 | */ |
| 82 | sb = &tp->t_inpcb->inp_socket->so_snd; |
| 83 | if (sb->sb_hiwat > tp->snd_ssthresh && |
| 84 | (sb->sb_flags & SB_AUTOSIZE)) { |
| 85 | if (sb->sb_idealsize > tp->snd_ssthresh) { |
| 86 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); |
| 87 | } |
| 88 | sb->sb_flags |= SB_TRIM; |
| 89 | } |
| 90 | } |
| 91 | |
| 92 | void |
| 93 | tcp_bad_rexmt_fix_sndbuf(struct tcpcb *tp) |
| 94 | { |
| 95 | struct sockbuf *sb; |
| 96 | sb = &tp->t_inpcb->inp_socket->so_snd; |
| 97 | if ((sb->sb_flags & (SB_TRIM | SB_AUTOSIZE)) == (SB_TRIM | SB_AUTOSIZE)) { |
| 98 | /* |
| 99 | * If there was a retransmission that was not necessary |
| 100 | * then the size of socket buffer can be restored to |
| 101 | * what it was before |
| 102 | */ |
| 103 | SET_SNDSB_IDEAL_SIZE(sb, tp->snd_ssthresh); |
| 104 | if (sb->sb_hiwat <= sb->sb_idealsize) { |
| 105 | sbreserve(sb, cc: sb->sb_idealsize); |
| 106 | sb->sb_flags &= ~SB_TRIM; |
| 107 | } |
| 108 | } |
| 109 | } |
| 110 | |
| 111 | /* |
| 112 | * Calculate initial cwnd according to RFC3390. |
| 113 | */ |
| 114 | void |
| 115 | tcp_cc_cwnd_init_or_reset(struct tcpcb *tp) |
| 116 | { |
| 117 | if (tp->t_flags & TF_LOCAL) { |
| 118 | tp->snd_cwnd = tp->t_maxseg * ss_fltsz_local; |
| 119 | } else { |
| 120 | if (tcp_cubic_minor_fixes) { |
| 121 | tp->snd_cwnd = tcp_initial_cwnd(tp); |
| 122 | } else { |
| 123 | /* initial congestion window according to RFC 3390 */ |
| 124 | tp->snd_cwnd = min(a: 4 * tp->t_maxseg, |
| 125 | b: max(a: 2 * tp->t_maxseg, TCP_CC_CWND_INIT_BYTES)); |
| 126 | } |
| 127 | } |
| 128 | } |
| 129 | |
| 130 | /* |
| 131 | * Indicate whether this ack should be delayed. |
| 132 | * Here is the explanation for different settings of tcp_delack_enabled: |
| 133 | * - when set to 1, the behavior is same as when set to 2. We kept this |
| 134 | * for binary compatibility. |
| 135 | * - when set to 2, will "ack every other packet" |
| 136 | * - if our last ack wasn't a 0-sized window. |
| 137 | * - if the peer hasn't sent us a TH_PUSH data packet (radar 3649245). |
| 138 | * If TH_PUSH is set, take this as a clue that we need to ACK |
| 139 | * with no delay. This helps higher level protocols who |
| 140 | * won't send us more data even if the window is open |
| 141 | * because their last "segment" hasn't been ACKed |
| 142 | * - when set to 3, will do "streaming detection" |
| 143 | * - if we receive more than "maxseg_unacked" full packets |
| 144 | * in the last 100ms |
| 145 | * - if the connection is not in slow-start or idle or |
| 146 | * loss/recovery states |
| 147 | * - if those criteria aren't met, it will ack every other packet. |
| 148 | */ |
| 149 | int |
| 150 | tcp_cc_delay_ack(struct tcpcb *tp, struct tcphdr *th) |
| 151 | { |
| 152 | switch (tcp_delack_enabled) { |
| 153 | case 1: |
| 154 | case 2: |
| 155 | if ((tp->t_flags & TF_RXWIN0SENT) == 0 && |
| 156 | (th->th_flags & TH_PUSH) == 0 && |
| 157 | (tp->t_unacksegs == 1)) { |
| 158 | return 1; |
| 159 | } |
| 160 | break; |
| 161 | case 3: |
| 162 | if (tcp_ack_strategy == TCP_ACK_STRATEGY_LEGACY) { |
| 163 | if ((tp->t_flags & TF_RXWIN0SENT) == 0 && |
| 164 | (th->th_flags & TH_PUSH) == 0 && |
| 165 | ((tp->t_unacksegs == 1) || |
| 166 | ((tp->t_flags & TF_STRETCHACK) && |
| 167 | tp->t_unacksegs < maxseg_unacked))) { |
| 168 | return 1; |
| 169 | } |
| 170 | } else { |
| 171 | uint32_t recwin; |
| 172 | |
| 173 | /* Get the receive-window we would announce */ |
| 174 | recwin = tcp_sbspace(tp); |
| 175 | if (recwin > (uint32_t)(TCP_MAXWIN << tp->rcv_scale)) { |
| 176 | recwin = (uint32_t)(TCP_MAXWIN << tp->rcv_scale); |
| 177 | } |
| 178 | |
| 179 | /* Delay ACK, if: |
| 180 | * |
| 181 | * 1. We are not sending a zero-window |
| 182 | * 2. We are not forcing fast ACKs |
| 183 | * 3. We have more than the low-water mark in receive-buffer |
| 184 | * 4. The receive-window is not increasing |
| 185 | * 5. We have less than or equal of an MSS unacked or |
| 186 | * Window actually has been growing larger than the initial value by half of it. |
| 187 | * (this makes sure that during ramp-up we ACK every second MSS |
| 188 | * until we pass the tcp_recvspace * 1.5-threshold) |
| 189 | * 6. We haven't waited for half a BDP |
| 190 | * 7. The amount of unacked data is less than the maximum ACK-burst (256 MSS) |
| 191 | * We try to avoid having the sender end up hitting huge ACK-ranges. |
| 192 | * |
| 193 | * (a note on 6: The receive-window is |
| 194 | * roughly 2 BDP. Thus, recwin / 4 means half a BDP and |
| 195 | * thus we enforce an ACK roughly twice per RTT - even |
| 196 | * if the app does not read) |
| 197 | */ |
| 198 | if ((tp->t_flags & TF_RXWIN0SENT) == 0 && |
| 199 | tp->t_forced_acks == 0 && |
| 200 | tp->t_inpcb->inp_socket->so_rcv.sb_cc > tp->t_inpcb->inp_socket->so_rcv.sb_lowat && |
| 201 | recwin <= tp->t_last_recwin && |
| 202 | (tp->rcv_nxt - tp->last_ack_sent <= tp->t_maxseg || |
| 203 | recwin > (uint32_t)(tcp_recvspace + (tcp_recvspace >> 1))) && |
| 204 | (tp->rcv_nxt - tp->last_ack_sent) < (recwin >> 2) && |
| 205 | (tp->rcv_nxt - tp->last_ack_sent) < 256 * tp->t_maxseg) { |
| 206 | tp->t_stat.acks_delayed++; |
| 207 | return 1; |
| 208 | } |
| 209 | } |
| 210 | break; |
| 211 | } |
| 212 | return 0; |
| 213 | } |
| 214 | |
| 215 | void |
| 216 | tcp_cc_allocate_state(struct tcpcb *tp) |
| 217 | { |
| 218 | if ((tp->tcp_cc_index == TCP_CC_ALGO_CUBIC_INDEX || |
| 219 | tp->tcp_cc_index == TCP_CC_ALGO_BACKGROUND_INDEX) && |
| 220 | tp->t_ccstate == NULL) { |
| 221 | tp->t_ccstate = &tp->_t_ccstate; |
| 222 | |
| 223 | bzero(s: tp->t_ccstate, n: sizeof(*tp->t_ccstate)); |
| 224 | } |
| 225 | } |
| 226 | |
| 227 | /* |
| 228 | * If stretch ack was disabled automatically on long standing connections, |
| 229 | * re-evaluate the situation after 15 minutes to enable it. |
| 230 | */ |
| 231 | #define TCP_STRETCHACK_DISABLE_WIN (15 * 60 * TCP_RETRANSHZ) |
| 232 | void |
| 233 | tcp_cc_after_idle_stretchack(struct tcpcb *tp) |
| 234 | { |
| 235 | struct tcp_globals *globals; |
| 236 | int32_t tdiff; |
| 237 | |
| 238 | if (!(tp->t_flagsext & TF_DISABLE_STRETCHACK)) { |
| 239 | return; |
| 240 | } |
| 241 | |
| 242 | globals = tcp_get_globals(tp); |
| 243 | tdiff = timer_diff(t1: tcp_globals_now(globals), toff1: 0, t2: tp->rcv_nostrack_ts, toff2: 0); |
| 244 | if (tdiff < 0) { |
| 245 | tdiff = -tdiff; |
| 246 | } |
| 247 | |
| 248 | if (tdiff > TCP_STRETCHACK_DISABLE_WIN) { |
| 249 | tp->t_flagsext &= ~TF_DISABLE_STRETCHACK; |
| 250 | tp->t_stretchack_delayed = 0; |
| 251 | |
| 252 | tcp_reset_stretch_ack(tp); |
| 253 | } |
| 254 | } |
| 255 | |
| 256 | /* |
| 257 | * Detect if the congestion window is non-validated according to |
| 258 | * draft-ietf-tcpm-newcwv-07 |
| 259 | */ |
| 260 | inline uint32_t |
| 261 | tcp_cc_is_cwnd_nonvalidated(struct tcpcb *tp) |
| 262 | { |
| 263 | struct socket *so = tp->t_inpcb->inp_socket; |
| 264 | |
| 265 | if (tp->t_pipeack == 0) { |
| 266 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
| 267 | return 0; |
| 268 | } |
| 269 | |
| 270 | /* |
| 271 | * The congestion window is validated if the number of bytes acked |
| 272 | * is more than half of the current window or if there is more |
| 273 | * data to send in the send socket buffer |
| 274 | */ |
| 275 | if (tp->t_pipeack >= (tp->snd_cwnd >> 1) || |
| 276 | (so != NULL && so->so_snd.sb_cc > tp->snd_cwnd)) { |
| 277 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
| 278 | } else { |
| 279 | tp->t_flagsext |= TF_CWND_NONVALIDATED; |
| 280 | } |
| 281 | |
| 282 | return tp->t_flagsext & TF_CWND_NONVALIDATED; |
| 283 | } |
| 284 | |
| 285 | /* |
| 286 | * Adjust congestion window in response to congestion in non-validated |
| 287 | * phase. |
| 288 | */ |
| 289 | inline void |
| 290 | tcp_cc_adjust_nonvalidated_cwnd(struct tcpcb *tp) |
| 291 | { |
| 292 | tp->t_pipeack = tcp_get_max_pipeack(tp); |
| 293 | tcp_clear_pipeack_state(tp); |
| 294 | tp->snd_cwnd = (max(a: tp->t_pipeack, b: tp->t_lossflightsize) >> 1); |
| 295 | if (tcp_cubic_minor_fixes) { |
| 296 | tp->snd_cwnd = max(a: tp->snd_cwnd, b: tp->t_maxseg); |
| 297 | } else { |
| 298 | tp->snd_cwnd = max(a: tp->snd_cwnd, TCP_CC_CWND_INIT_BYTES); |
| 299 | } |
| 300 | tp->snd_cwnd += tp->t_maxseg * tcprexmtthresh; |
| 301 | tp->t_flagsext &= ~TF_CWND_NONVALIDATED; |
| 302 | } |
| 303 | |
| 304 | /* |
| 305 | * Return maximum of all the pipeack samples. Since the number of samples |
| 306 | * TCP_PIPEACK_SAMPLE_COUNT is 3 at this time, it will be simpler to do |
| 307 | * a comparision. We should change ths if the number of samples increases. |
| 308 | */ |
| 309 | inline uint32_t |
| 310 | tcp_get_max_pipeack(struct tcpcb *tp) |
| 311 | { |
| 312 | uint32_t max_pipeack = 0; |
| 313 | max_pipeack = (tp->t_pipeack_sample[0] > tp->t_pipeack_sample[1]) ? |
| 314 | tp->t_pipeack_sample[0] : tp->t_pipeack_sample[1]; |
| 315 | max_pipeack = (tp->t_pipeack_sample[2] > max_pipeack) ? |
| 316 | tp->t_pipeack_sample[2] : max_pipeack; |
| 317 | |
| 318 | return max_pipeack; |
| 319 | } |
| 320 | |
| 321 | inline void |
| 322 | tcp_clear_pipeack_state(struct tcpcb *tp) |
| 323 | { |
| 324 | bzero(s: tp->t_pipeack_sample, n: sizeof(tp->t_pipeack_sample)); |
| 325 | tp->t_pipeack_ind = 0; |
| 326 | tp->t_lossflightsize = 0; |
| 327 | } |
| 328 | |