/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * The migration of a flow queue between the different states is summarized
 * in the state diagram below (RFC 8290).
 *
 * +-----------------+                +------------------+
 * |                 |     Empty      |                  |
 * |      Empty      |<---------------+       Old        +----+
 * |                 |                |                  |    |
 * +-------+---------+                +------------------+    |
 *         |                            ^       ^             |Credits
 *         |Arrival                     |       |             |Exhausted
 *         v                            |       |             |
 * +-----------------+                  |       |             |
 * |                 |    Empty or      |       |             |
 * |       New       +------------------+       +-------------+
 * |                 |  Credits Exhausted
 * +-----------------+
 *
 * In this implementation of FQ-CODEL, a flow queue is a dynamically
 * allocated object. An active flow queue goes through the above cycle of
 * state transitions very often. To avoid the cost of frequently allocating
 * and freeing flow queue objects, this implementation retains the flow
 * queue object in the [Empty] state on an Empty flow queue list, with an
 * active reference in the flow queue hash table. The flow queue objects on
 * the Empty flow queue list have an associated age and are purged
 * accordingly.
 */

#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/systm.h>
#include <sys/syslog.h>
#include <sys/proc.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/kauth.h>
#include <sys/sdt.h>
#include <kern/zalloc.h>
#include <netinet/in.h>

#include <net/classq/classq.h>
#include <net/classq/if_classq.h>
#include <net/pktsched/pktsched.h>
#include <net/pktsched/pktsched_fq_codel.h>
#include <net/classq/classq_fq_codel.h>

#include <netinet/tcp_var.h>

#define FQ_ZONE_MAX (32 * 1024) /* across all interfaces */

#define DTYPE_NODROP 0 /* no drop */
#define DTYPE_FORCED 1 /* a "forced" drop */
#define DTYPE_EARLY 2 /* an "unforced" (early) drop */

static uint32_t pkt_compressor = 1;
static uint64_t l4s_ce_threshold = 0; /* in usec */
static uint32_t l4s_local_ce_report = 0;
static uint64_t pkt_pacing_leeway = 0; /* in usec */
static uint64_t max_pkt_pacing_interval = 3 * NSEC_PER_SEC;
static uint64_t l4s_min_delay_threshold = 20 * NSEC_PER_MSEC; /* 20 ms */
#if (DEBUG || DEVELOPMENT)
SYSCTL_NODE(_net_classq, OID_AUTO, flow_q, CTLFLAG_RW | CTLFLAG_LOCKED,
    0, "FQ-CODEL parameters");

SYSCTL_UINT(_net_classq_flow_q, OID_AUTO, pkt_compressor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pkt_compressor, 0, "enable pkt compression");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, l4s_ce_threshold,
    CTLFLAG_RW | CTLFLAG_LOCKED, &l4s_ce_threshold,
    "L4S CE threshold");

SYSCTL_UINT(_net_classq_flow_q, OID_AUTO, l4s_local_ce_report,
    CTLFLAG_RW | CTLFLAG_LOCKED, &l4s_local_ce_report, 0,
    "enable L4S local CE report");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, pkt_pacing_leeway,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pkt_pacing_leeway, "packet pacing leeway");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, max_pkt_pacing_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &max_pkt_pacing_interval, "max packet pacing interval");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, l4s_min_delay_threshold,
    CTLFLAG_RW | CTLFLAG_LOCKED, &l4s_min_delay_threshold, "l4s min delay threshold");
#endif /* (DEBUG || DEVELOPMENT) */

void
fq_codel_init(void)
{
	_CASSERT(AQM_KTRACE_AON_FLOW_HIGH_DELAY == 0x8300004);
	_CASSERT(AQM_KTRACE_AON_THROTTLE == 0x8300008);
	_CASSERT(AQM_KTRACE_AON_FLOW_OVERWHELMING == 0x830000c);
	_CASSERT(AQM_KTRACE_AON_FLOW_DQ_STALL == 0x8300010);

	_CASSERT(AQM_KTRACE_STATS_FLOW_ENQUEUE == 0x8310004);
	_CASSERT(AQM_KTRACE_STATS_FLOW_DEQUEUE == 0x8310008);
	_CASSERT(AQM_KTRACE_STATS_FLOW_CTL == 0x831000c);
	_CASSERT(AQM_KTRACE_STATS_FLOW_ALLOC == 0x8310010);
	_CASSERT(AQM_KTRACE_STATS_FLOW_DESTROY == 0x8310014);
	_CASSERT(AQM_KTRACE_STATS_FLOW_REPORT_CE == 0x8310018);
	_CASSERT(AQM_KTRACE_STATS_GET_QLEN == 0x831001c);
	_CASSERT(AQM_KTRACE_TX_NOT_READY == 0x8310020);
	_CASSERT(AQM_KTRACE_TX_PACEMAKER == 0x8310024);
}

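/*
 * Allocate a zero-filled flow queue object and initialize its packet queue
 * for the given packet type, along with the dequeue-chain head/tail used
 * for batched dequeues.
 */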
fq_t *
fq_alloc(classq_pkt_type_t ptype)
{
	fq_t *fq = NULL;

	fq = kalloc_type(fq_t, Z_WAITOK_ZERO);
	if (ptype == QP_MBUF) {
		MBUFQ_INIT(&fq->fq_mbufq);
	}
#if SKYWALK
	else {
		VERIFY(ptype == QP_PACKET);
		KPKTQ_INIT(&fq->fq_kpktq);
	}
#endif /* SKYWALK */
	CLASSQ_PKT_INIT(&fq->fq_dq_head);
	CLASSQ_PKT_INIT(&fq->fq_dq_tail);
	fq->fq_in_dqlist = false;

	return fq;
}

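/*
 * Free a flow queue object. The caller must have already removed it from
 * the active/empty flow lists and drained all of its packets, as enforced
 * by the assertions below.
 */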
void
fq_destroy(fq_t *fq, classq_pkt_type_t ptype)
{
	VERIFY(!fq->fq_in_dqlist);
	VERIFY(fq_empty(fq, ptype));
	VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW |
	    FQF_EMPTY_FLOW)));
	VERIFY(fq->fq_bytes == 0);
	kfree_type(fq_t, fq);
}

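/*
 * Flag the flow as experiencing a dequeue stall: no packet has been dequeued
 * from it for at least one update interval even though it is holding a
 * meaningful amount of data (FQ_MIN_FC_THRESHOLD_BYTES or more).
 */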
static inline void
fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl,
    u_int64_t *now)
{
	u_int64_t maxgetqtime, update_interval;
	if (FQ_IS_DELAY_HIGH(flowq) || flowq->fq_getqtime == 0 ||
	    fq_empty(flowq, fqs->fqs_ptype) ||
	    flowq->fq_bytes < FQ_MIN_FC_THRESHOLD_BYTES) {
		return;
	}

	update_interval = FQ_UPDATE_INTERVAL(flowq);
	maxgetqtime = flowq->fq_getqtime + update_interval;
	if ((*now) > maxgetqtime) {
		/*
		 * There was no dequeue for an update interval's worth of
		 * time, which means that the queue is stalled.
		 */
		FQ_SET_DELAY_HIGH(flowq);
		fq_cl->fcl_stat.fcl_dequeue_stall++;
		os_log_error(OS_LOG_DEFAULT, "%s: num: %d, "
		    "scidx: %d, flow: 0x%x, iface: %s grp: %hhu", __func__,
		    fq_cl->fcl_stat.fcl_dequeue_stall, flowq->fq_sc_index,
		    flowq->fq_flowhash, if_name(fqs->fqs_ifq->ifcq_ifp),
		    FQ_GROUP(flowq)->fqg_index);
		KDBG(AQM_KTRACE_AON_FLOW_DQ_STALL, flowq->fq_flowhash,
		    AQM_KTRACE_FQ_GRP_SC_IDX(flowq), flowq->fq_bytes,
		    (*now) - flowq->fq_getqtime);
	}
}

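/*
 * Drop a single packet from the head of the flow queue and account for it
 * in the interface drop statistics.
 */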
void
fq_head_drop(fq_if_t *fqs, fq_t *fq)
{
	pktsched_pkt_t pkt;
	volatile uint32_t *pkt_flags;
	uint64_t *pkt_timestamp;
	struct ifclassq *ifq = fqs->fqs_ifq;

	_PKTSCHED_PKT_INIT(&pkt);
	fq_getq_flow_internal(fqs, fq, &pkt);
	if (pkt.pktsched_pkt_mbuf == NULL) {
		return;
	}

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL, NULL);

	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	IFCQ_DROP_ADD(ifq, 1, pktsched_get_pkt_len(&pkt));
	IFCQ_CONVERT_LOCK(ifq);
	pktsched_free_pkt(&pkt);
}


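/*
 * Opportunistic packet "compression": if the packet being enqueued carries
 * the same non-zero comp_gencnt as the packet currently at the tail of the
 * flow queue, the tail packet is superseded. The old packet is removed and
 * freed, its queue accounting is undone, and its timestamp is inherited by
 * the new packet. Returns CLASSQEQ_COMPRESSED when a packet was replaced,
 * 0 otherwise.
 */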
static int
fq_compressor(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl,
    pktsched_pkt_t *pkt)
{
	classq_pkt_type_t ptype = fqs->fqs_ptype;
	uint32_t comp_gencnt = 0;
	uint64_t *pkt_timestamp;
	uint64_t old_timestamp = 0;
	uint32_t old_pktlen = 0;
	struct ifclassq *ifq = fqs->fqs_ifq;

	if (__improbable(pkt_compressor == 0)) {
		return 0;
	}

	pktsched_get_pkt_vars(pkt, NULL, &pkt_timestamp, NULL, NULL, NULL,
	    &comp_gencnt, NULL);

	if (comp_gencnt == 0) {
		return 0;
	}

	fq_cl->fcl_stat.fcl_pkts_compressible++;

	if (fq_empty(fq, fqs->fqs_ptype)) {
		return 0;
	}

	if (ptype == QP_MBUF) {
		struct mbuf *m = MBUFQ_LAST(&fq->fq_mbufq);

		if (comp_gencnt != m->m_pkthdr.comp_gencnt) {
			return 0;
		}

		/* If we got this far, we should merge/replace the segment */
		MBUFQ_REMOVE(&fq->fq_mbufq, m);
		old_pktlen = m_pktlen(m);
		old_timestamp = m->m_pkthdr.pkt_timestamp;

		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
		m_freem(m);
	}
#if SKYWALK
	else {
		struct __kern_packet *kpkt = KPKTQ_LAST(&fq->fq_kpktq);

		if (comp_gencnt != kpkt->pkt_comp_gencnt) {
			return 0;
		}

		/* If we got this far, we should merge/replace the segment */
		KPKTQ_REMOVE(&fq->fq_kpktq, kpkt);
		old_pktlen = kpkt->pkt_length;
		old_timestamp = kpkt->pkt_timestamp;

		IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
		pp_free_packet(*(struct kern_pbufpool **)(uintptr_t)&
		    (((struct __kern_quantum *)kpkt)->qum_pp),
		    (uint64_t)kpkt);
	}
#endif /* SKYWALK */

	fq->fq_bytes -= old_pktlen;
	fq_cl->fcl_stat.fcl_byte_cnt -= old_pktlen;
	fq_cl->fcl_stat.fcl_pkt_cnt--;
	IFCQ_DEC_LEN(ifq);
	IFCQ_DEC_BYTES(ifq, old_pktlen);

	FQ_GRP_DEC_LEN(fq);
	FQ_GRP_DEC_BYTES(fq, old_pktlen);

	*pkt_timestamp = old_timestamp;

	return CLASSQEQ_COMPRESSED;
}

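/*
 * Enqueue a packet (or a chain of packets) onto its flow queue. This is
 * where flow classification, dequeue-stall detection, flow-control
 * advisories, queue-limit drops and packet compression are applied before
 * the packet is linked onto the flow queue and the flow is placed on the
 * new-flows list if it is not already active.
 */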
int
fq_addq(fq_if_t *fqs, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt,
    fq_if_classq_t *fq_cl)
{
	int droptype = DTYPE_NODROP, fc_adv = 0, ret = CLASSQEQ_SUCCESS;
	u_int64_t now;
	fq_t *fq = NULL;
	uint64_t *pkt_timestamp;
	volatile uint32_t *pkt_flags;
	uint32_t pkt_flowid, cnt;
	uint8_t pkt_proto, pkt_flowsrc;
	fq_tfc_type_t tfc_type = FQ_TFC_C;

	cnt = pkt->pktsched_pcnt;
	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid,
	    &pkt_flowsrc, &pkt_proto, NULL, NULL);

	/*
	 * XXX Not walking the chain to set this flag on every packet.
	 * This flag is only used for debugging. Nothing is affected if it's
	 * not set.
	 */
	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		/* See comments in <rdar://problem/14040693> */
		VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED));
		*pkt_flags |= PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (ifclassq_enable_l4s) {
		tfc_type = pktsched_is_pkt_l4s(pkt) ? FQ_TFC_L4S : FQ_TFC_C;
	}

	/*
	 * Timestamps for every packet must be set prior to entering this path.
	 */
	now = *pkt_timestamp;
	ASSERT(now > 0);

	/* find the flowq for this packet */
	fq = fq_if_hash_pkt(fqs, fq_grp, pkt_flowid, pktsched_get_pkt_svc(pkt),
	    now, true, tfc_type);
	if (__improbable(fq == NULL)) {
		DTRACE_IP1(memfail__drop, fq_if_t *, fqs);
		/* drop the packet if we could not allocate a flow queue */
		fq_cl->fcl_stat.fcl_drop_memfailure += cnt;
		return CLASSQEQ_DROP;
	}
	VERIFY(fq->fq_group == fq_grp);
	VERIFY(fqs->fqs_ptype == pkt->pktsched_ptype);

	KDBG(AQM_KTRACE_STATS_FLOW_ENQUEUE, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq),
	    fq->fq_bytes, pktsched_get_pkt_len(pkt));

	fq_detect_dequeue_stall(fqs, fq, fq_cl, &now);

	/*
	 * Skip the dropping part if it's L4S. Flow control or ECN marking decision
	 * will be made at dequeue time.
	 */
	if (ifclassq_enable_l4s && tfc_type == FQ_TFC_L4S) {
		fq_cl->fcl_stat.fcl_l4s_pkts++;
		droptype = DTYPE_NODROP;
	}

	if (__improbable(FQ_IS_DELAY_HIGH(fq) || FQ_IS_OVERWHELMING(fq))) {
		if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) &&
		    (*pkt_flags & PKTF_FLOW_ADV)) {
			fc_adv = 1;
			/*
			 * If the flow is suspended or it is not
			 * TCP/QUIC, drop the chain.
			 */
			if ((pkt_proto != IPPROTO_TCP) &&
			    (pkt_proto != IPPROTO_QUIC)) {
				droptype = DTYPE_EARLY;
				fq_cl->fcl_stat.fcl_drop_early += cnt;
				IFCQ_DROP_ADD(fqs->fqs_ifq, cnt, pktsched_get_pkt_len(pkt));
			}
			DTRACE_IP6(flow__adv, fq_if_t *, fqs,
			    fq_if_classq_t *, fq_cl, fq_t *, fq,
			    int, droptype, pktsched_pkt_t *, pkt,
			    uint32_t, cnt);
		} else {
			/*
			 * Need to drop packets to make room for the new
			 * ones. Try to drop from the head of the queue
			 * instead of the latest packets.
			 */
			if (!fq_empty(fq, fqs->fqs_ptype)) {
				uint32_t i;

				for (i = 0; i < cnt; i++) {
					fq_head_drop(fqs, fq);
				}
				droptype = DTYPE_NODROP;
			} else {
				droptype = DTYPE_EARLY;
			}
			fq_cl->fcl_stat.fcl_drop_early += cnt;

			DTRACE_IP6(no__flow__adv, fq_if_t *, fqs,
			    fq_if_classq_t *, fq_cl, fq_t *, fq,
			    int, droptype, pktsched_pkt_t *, pkt,
			    uint32_t, cnt);
		}
	}

	/* Set the return code correctly */
	if (__improbable(fc_adv == 1 && droptype != DTYPE_FORCED)) {
		if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) {
			fq->fq_flags |= FQF_FLOWCTL_ON;
			/* deliver flow control advisory error */
			if (droptype == DTYPE_NODROP) {
				ret = CLASSQEQ_SUCCESS_FC;
			} else {
				/* dropped due to flow control */
				ret = CLASSQEQ_DROP_FC;
			}
		} else {
			/*
			 * if we could not flow control the flow, it is
			 * better to drop
			 */
			droptype = DTYPE_FORCED;
			ret = CLASSQEQ_DROP_FC;
			fq_cl->fcl_stat.fcl_flow_control_fail++;
		}
		DTRACE_IP3(fc__ret, fq_if_t *, fqs, int, droptype, int, ret);
	}

	/*
	 * If the queue length hits the queue limit, drop the same number of
	 * packets from the front of the flow that currently holds the
	 * maximum number of bytes. This penalizes heavy and unresponsive
	 * flows, and it also avoids a tail drop.
	 */
	if (__improbable(droptype == DTYPE_NODROP &&
	    fq_if_at_drop_limit(fqs))) {
		uint32_t i;

		if (fqs->fqs_large_flow == fq) {
			/*
			 * Drop from the head of the current fq. Since a
			 * new packet will be added to the tail, it is ok
			 * to leave fq in place.
			 */
			DTRACE_IP5(large__flow, fq_if_t *, fqs,
			    fq_if_classq_t *, fq_cl, fq_t *, fq,
			    pktsched_pkt_t *, pkt, uint32_t, cnt);

			for (i = 0; i < cnt; i++) {
				fq_head_drop(fqs, fq);
			}
			fq_cl->fcl_stat.fcl_drop_overflow += cnt;

			/*
			 * TCP and QUIC will react to the loss of those head dropped pkts
			 * and adjust send rate.
			 */
			if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) &&
			    (*pkt_flags & PKTF_FLOW_ADV) &&
			    (pkt_proto != IPPROTO_TCP) &&
			    (pkt_proto != IPPROTO_QUIC)) {
				if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) {
					fq->fq_flags |= FQF_FLOWCTL_ON;
					FQ_SET_OVERWHELMING(fq);
					fq_cl->fcl_stat.fcl_overwhelming++;
					/* deliver flow control advisory error */
					ret = CLASSQEQ_SUCCESS_FC;
				}
			}
		} else {
			if (fqs->fqs_large_flow == NULL) {
				droptype = DTYPE_FORCED;
				fq_cl->fcl_stat.fcl_drop_overflow += cnt;
				ret = CLASSQEQ_DROP;

				DTRACE_IP5(no__large__flow, fq_if_t *, fqs,
				    fq_if_classq_t *, fq_cl, fq_t *, fq,
				    pktsched_pkt_t *, pkt, uint32_t, cnt);

				/*
				 * if this fq was freshly created and there
				 * is nothing to enqueue, move it to empty list
				 */
				if (fq_empty(fq, fqs->fqs_ptype) &&
				    !(fq->fq_flags & (FQF_NEW_FLOW |
				    FQF_OLD_FLOW))) {
					fq_if_move_to_empty_flow(fqs, fq_cl,
					    fq, now);
					fq = NULL;
				}
			} else {
				DTRACE_IP5(different__large__flow,
				    fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
				    fq_t *, fq, pktsched_pkt_t *, pkt,
				    uint32_t, cnt);

				for (i = 0; i < cnt; i++) {
					fq_if_drop_packet(fqs, now);
				}
			}
		}
	}

	fq_cl->fcl_flags &= ~FCL_PACED;

	if (__probable(droptype == DTYPE_NODROP)) {
		uint32_t chain_len = pktsched_get_pkt_len(pkt);
		int ret_compress = 0;

		/*
		 * We do not compress if we are enqueuing a chain.
		 * Traversing the chain to look for acks would defeat the
		 * purpose of batch enqueueing.
		 */
		if (cnt == 1) {
			ret_compress = fq_compressor(fqs, fq, fq_cl, pkt);
			if (ret_compress == CLASSQEQ_COMPRESSED) {
				fq_cl->fcl_stat.fcl_pkts_compressed++;
			}
		}
		DTRACE_IP5(fq_enqueue, fq_if_t *, fqs, fq_if_classq_t *, fq_cl,
		    fq_t *, fq, pktsched_pkt_t *, pkt, uint32_t, cnt);
		fq_enqueue(fq, pkt->pktsched_pkt, pkt->pktsched_tail, cnt,
		    pkt->pktsched_ptype);

		fq->fq_bytes += chain_len;
		fq_cl->fcl_stat.fcl_byte_cnt += chain_len;
		fq_cl->fcl_stat.fcl_pkt_cnt += cnt;

		/*
		 * check if this queue will qualify to be the next
		 * victim queue
		 */
		fq_if_is_flow_heavy(fqs, fq);
	} else {
		DTRACE_IP3(fq_drop, fq_if_t *, fqs, int, droptype, int, ret);
		return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP;
	}

	/*
	 * If the queue is not currently active, add it to the end of new
	 * flows list for that service class.
	 */
	if ((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) == 0) {
		VERIFY(STAILQ_NEXT(fq, fq_actlink) == NULL);
		STAILQ_INSERT_TAIL(&fq_cl->fcl_new_flows, fq, fq_actlink);
		fq->fq_flags |= FQF_NEW_FLOW;

		fq_cl->fcl_stat.fcl_newflows_cnt++;

		fq->fq_deficit = fq_cl->fcl_quantum;
	}
	return ret;
}

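/*
 * Dequeue one packet from the flow queue and update the flow, class and
 * interface queue accounting. Unlike fq_getq_flow(), this does not update
 * any of the delay (CoDel) state.
 */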
void
fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt)
{
	classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p);
	uint32_t plen;
	fq_if_classq_t *fq_cl;
	struct ifclassq *ifq = fqs->fqs_ifq;

	fq_dequeue(fq, &p, fqs->fqs_ptype);
	if (p.cp_ptype == QP_INVALID) {
		VERIFY(p.cp_mbuf == NULL);
		return;
	}

	fq->fq_next_tx_time = FQ_INVALID_TX_TS;

	pktsched_pkt_encap(pkt, &p);
	plen = pktsched_get_pkt_len(pkt);

	VERIFY(fq->fq_bytes >= plen);
	fq->fq_bytes -= plen;

	fq_cl = &FQ_CLASSQ(fq);
	fq_cl->fcl_stat.fcl_byte_cnt -= plen;
	fq_cl->fcl_stat.fcl_pkt_cnt--;
	fq_cl->fcl_flags &= ~FCL_PACED;

	IFCQ_DEC_LEN(ifq);
	IFCQ_DEC_BYTES(ifq, plen);

	FQ_GRP_DEC_LEN(fq);
	FQ_GRP_DEC_BYTES(fq, plen);

	/* Reset getqtime so that we don't count idle times */
	if (fq_empty(fq, fqs->fqs_ptype)) {
		fq->fq_getqtime = 0;
	}
}

/*
 * fq_get_next_tx_time returns FQ_INVALID_TX_TS when there is no tx time in fq
 */
static uint64_t
fq_get_next_tx_time(fq_if_t *fqs, fq_t *fq)
{
	uint64_t tx_time = FQ_INVALID_TX_TS;

	/*
	 * Check the cached value in fq
	 */
	if (fq->fq_next_tx_time != FQ_INVALID_TX_TS) {
		return fq->fq_next_tx_time;
	}

	switch (fqs->fqs_ptype) {
	case QP_MBUF: {
		struct mbuf *m;
		if ((m = MBUFQ_FIRST(&fq->fq_mbufq)) != NULL) {
			struct m_tag *tag;
			tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
			    KERNEL_TAG_TYPE_AQM);
			if (tag != NULL) {
				tx_time = *(uint64_t *)tag->m_tag_data;
			}
		}
		break;
	}
	case QP_PACKET: {
		struct __kern_packet *p = KPKTQ_FIRST(&fq->fq_kpktq);
		if (__probable(p != NULL && (p->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0)) {
			tx_time = p->pkt_com_opt->__po_pkt_tx_time;
		}
		break;
	}
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * Cache the tx time in fq. The cache will be cleared after a dequeue
	 * or drop from the fq.
	 */
	fq->fq_next_tx_time = tx_time;

	return tx_time;
}


/*
 * fq_tx_time_ready returns true when the fq is empty, so that it does not
 * affect the caller's logic for handling an empty flow.
 */
boolean_t
fq_tx_time_ready(fq_if_t *fqs, fq_t *fq, uint64_t now, uint64_t *ready_time)
{
	uint64_t pkt_tx_time;
	fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);

	if (!ifclassq_enable_pacing || !ifclassq_enable_l4s || fq->fq_tfc_type != FQ_TFC_L4S) {
		return TRUE;
	}

	pkt_tx_time = fq_get_next_tx_time(fqs, fq);
	if (ready_time != NULL) {
		*ready_time = pkt_tx_time;
	}

	if (pkt_tx_time <= now + pkt_pacing_leeway ||
	    pkt_tx_time == FQ_INVALID_TX_TS) {
		return TRUE;
	}

	/*
	 * Ignore the tx time if it's scheduled too far in the future
	 */
	if (pkt_tx_time > max_pkt_pacing_interval + now) {
		fq_cl->fcl_stat.fcl_ignore_tx_time++;
		return TRUE;
	}

	ASSERT(pkt_tx_time != FQ_INVALID_TX_TS);
	KDBG(AQM_KTRACE_TX_NOT_READY, fq->fq_flowhash,
	    AQM_KTRACE_FQ_GRP_SC_IDX(fq), now, pkt_tx_time);
	return FALSE;
}

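/*
 * Dequeue one packet from the flow queue and run the CoDel-style delay
 * tracking on it: per-class min/max/avg queueing delay statistics, L4S CE
 * marking/reporting, high-delay detection and flow-control feedback.
 */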
void
fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt, uint64_t now)
{
	fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);
	int64_t qdelay = 0;
	volatile uint32_t *pkt_flags;
	uint64_t *pkt_timestamp, pkt_tx_time = 0, pacing_delay = 0;
	uint64_t fq_min_delay_threshold = FQ_TARGET_DELAY(fq);
	uint8_t pkt_flowsrc;
	boolean_t l4s_pkt;

	fq_getq_flow_internal(fqs, fq, pkt);
	if (pkt->pktsched_ptype == QP_INVALID) {
		VERIFY(pkt->pktsched_pkt_mbuf == NULL);
		return;
	}

	pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, &pkt_flowsrc,
	    NULL, NULL, &pkt_tx_time);
	l4s_pkt = pktsched_is_pkt_l4s(pkt);
	if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
		if (pkt_tx_time > *pkt_timestamp) {
			pacing_delay = pkt_tx_time - *pkt_timestamp;
			fq_cl->fcl_stat.fcl_paced_pkts++;
			DTRACE_SKYWALK3(aqm__pacing__delta, uint64_t, now - pkt_tx_time,
			    fq_if_t *, fqs, fq_t *, fq);
		}
#if (DEVELOPMENT || DEBUG)
		else {
			DTRACE_SKYWALK5(aqm__miss__pacing__delay, uint64_t, *pkt_timestamp,
			    uint64_t, pkt_tx_time, uint64_t, now, fq_if_t *,
			    fqs, fq_t *, fq);
		}
#endif // (DEVELOPMENT || DEBUG)
	}

	/* this will compute qdelay in nanoseconds */
	if (now > *pkt_timestamp) {
		qdelay = now - *pkt_timestamp;
	}

	/* Update min/max/avg qdelay for the respective class */
	if (fq_cl->fcl_stat.fcl_min_qdelay == 0 ||
	    (qdelay > 0 && (u_int64_t)qdelay < fq_cl->fcl_stat.fcl_min_qdelay)) {
		fq_cl->fcl_stat.fcl_min_qdelay = qdelay;
	}

	if (fq_cl->fcl_stat.fcl_max_qdelay == 0 ||
	    (qdelay > 0 && (u_int64_t)qdelay > fq_cl->fcl_stat.fcl_max_qdelay)) {
		fq_cl->fcl_stat.fcl_max_qdelay = qdelay;
	}

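	/*
	 * Maintain a running average of the queueing delay:
	 *   avg' = (avg * num_dequeues + qdelay) / (num_dequeues + 1)
	 * The dequeue statistics are reset if the dequeue counter would
	 * overflow, and the average falls back to the latest sample if the
	 * intermediate product would overflow.
	 */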
	uint64_t num_dequeues = fq_cl->fcl_stat.fcl_dequeue;

	if (num_dequeues == 0) {
		fq_cl->fcl_stat.fcl_avg_qdelay = qdelay;
	} else if (qdelay > 0) {
		uint64_t res = 0;
		if (os_add_overflow(num_dequeues, 1, &res)) {
			/* Reset the dequeue num and dequeue bytes */
			fq_cl->fcl_stat.fcl_dequeue = num_dequeues = 0;
			fq_cl->fcl_stat.fcl_dequeue_bytes = 0;
			fq_cl->fcl_stat.fcl_avg_qdelay = qdelay;
			os_log_info(OS_LOG_DEFAULT, "%s: dequeue num overflow, "
			    "flow: 0x%x, iface: %s", __func__, fq->fq_flowhash,
			    if_name(fqs->fqs_ifq->ifcq_ifp));
		} else {
			uint64_t product = 0;
			if (os_mul_overflow(fq_cl->fcl_stat.fcl_avg_qdelay,
			    num_dequeues, &product) || os_add_overflow(product, qdelay, &res)) {
				fq_cl->fcl_stat.fcl_avg_qdelay = qdelay;
			} else {
				fq_cl->fcl_stat.fcl_avg_qdelay = res /
				    (num_dequeues + 1);
			}
		}
	}

	fq->fq_pkts_since_last_report++;
	if (ifclassq_enable_l4s && l4s_pkt) {
		/*
		 * A safeguard to make sure that L4S does not build up a huge
		 * queue if we encounter unexpected problems (e.g., if ACKs
		 * don't arrive in a timely manner due to congestion on the
		 * reverse path).
		 */
		fq_min_delay_threshold = l4s_min_delay_threshold;

		if ((l4s_ce_threshold != 0 && qdelay > l4s_ce_threshold + pacing_delay) ||
		    (l4s_ce_threshold == 0 && qdelay > FQ_TARGET_DELAY(fq) + pacing_delay)) {
			DTRACE_SKYWALK4(aqm__mark__ce, uint64_t, qdelay, uint64_t, pacing_delay,
			    fq_if_t *, fqs, fq_t *, fq);
			KDBG(AQM_KTRACE_STATS_FLOW_REPORT_CE, fq->fq_flowhash,
			    AQM_KTRACE_FQ_GRP_SC_IDX(fq), qdelay, pacing_delay);
			/*
			 * The packet buffer that pktsched_mark_ecn writes to can be pageable.
			 * Since it is not safe to write to pageable memory while preemption
			 * is disabled, convert the spin lock into a mutex.
			 */
			IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
			if (__improbable(l4s_local_ce_report != 0) &&
			    (*pkt_flags & PKTF_FLOW_ADV) != 0 &&
			    fq_if_report_ce(fqs, pkt, 1, fq->fq_pkts_since_last_report)) {
				fq->fq_pkts_since_last_report = 0;
				fq_cl->fcl_stat.fcl_ce_reported++;
			} else if (pktsched_mark_ecn(pkt) == 0) {
				fq_cl->fcl_stat.fcl_ce_marked++;
			} else {
				fq_cl->fcl_stat.fcl_ce_mark_failures++;
			}
		}
	}

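	/*
	 * Exclude the intentional pacing delay from the measured queueing
	 * delay before comparing it against the delay threshold below.
	 */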
	ASSERT(pacing_delay <= INT64_MAX);
	qdelay = MAX(0, qdelay - (int64_t)pacing_delay);
	if (fq->fq_min_qdelay == 0 ||
	    (u_int64_t)qdelay < fq->fq_min_qdelay) {
		fq->fq_min_qdelay = qdelay;
	}

	if (now >= fq->fq_updatetime) {
		if (fq->fq_min_qdelay > fq_min_delay_threshold) {
			if (!FQ_IS_DELAY_HIGH(fq)) {
				FQ_SET_DELAY_HIGH(fq);
				os_log_error(OS_LOG_DEFAULT,
				    "%s: scidx: %d, %llu, flow: 0x%x, "
				    "iface: %s, grp: %hhu\n", __func__, fq->fq_sc_index,
				    fq->fq_min_qdelay, fq->fq_flowhash,
				    if_name(fqs->fqs_ifq->ifcq_ifp),
				    FQ_GROUP(fq)->fqg_index);
			}
		} else {
			FQ_CLEAR_DELAY_HIGH(fq);
		}
		/* Reset measured queue delay and update time */
		fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
		fq->fq_min_qdelay = 0;
	}

	if (fqs->fqs_large_flow != fq || !fq_if_almost_at_drop_limit(fqs)) {
		FQ_CLEAR_OVERWHELMING(fq);
	}
	if (!FQ_IS_DELAY_HIGH(fq) || fq_empty(fq, fqs->fqs_ptype)) {
		FQ_CLEAR_DELAY_HIGH(fq);
	}

	if ((fq->fq_flags & FQF_FLOWCTL_ON) &&
	    !FQ_IS_DELAY_HIGH(fq) && !FQ_IS_OVERWHELMING(fq)) {
		fq_if_flow_feedback(fqs, fq, fq_cl);
	}

	if (fq_empty(fq, fqs->fqs_ptype)) {
		/* Reset getqtime so that we don't count idle times */
		fq->fq_getqtime = 0;
	} else {
		fq->fq_getqtime = now;
	}
	fq_if_is_flow_heavy(fqs, fq);

	*pkt_timestamp = 0;
	switch (pkt->pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}