/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/types.h>
#include <sys/param.h>
#include <kern/zalloc.h>
#include <net/ethernet.h>
#include <net/if_var.h>
#include <net/if.h>
#include <net/classq/classq.h>
#include <net/classq/classq_fq_codel.h>
#include <net/pktsched/pktsched_fq_codel.h>
#include <os/log.h>
#include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
#include <mach/thread_act.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>

#define FQ_CODEL_DEFAULT_QUANTUM 1500

#define FQ_CODEL_QUANTUM_BK_SYS(_q)     (_q)
#define FQ_CODEL_QUANTUM_BK(_q)         (_q)
#define FQ_CODEL_QUANTUM_BE(_q)         (_q)
#define FQ_CODEL_QUANTUM_RD(_q)         (_q)
#define FQ_CODEL_QUANTUM_OAM(_q)        (_q)
#define FQ_CODEL_QUANTUM_AV(_q)         ((_q) * 2)
#define FQ_CODEL_QUANTUM_RV(_q)         ((_q) * 2)
#define FQ_CODEL_QUANTUM_VI(_q)         ((_q) * 2)
#define FQ_CODEL_QUANTUM_VO(_q)         (((_q) * 2) / 5)
#define FQ_CODEL_QUANTUM_CTL(_q)        (((_q) * 2) / 5)
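
/*
 * Illustrative values only: with the default quantum of 1500 bytes, the
 * per-class DRR quantums above work out to roughly
 *   BK_SYS/BK/BE/RD/OAM: 1500 bytes
 *   AV/RV/VI:            3000 bytes (2x; a bigger share per DRR round)
 *   VO/CTL:              600 bytes  (2/5x; a small quantum bounds how much
 *                                    a voice/control round can send)
 */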

static KALLOC_TYPE_DEFINE(fq_if_zone, fq_if_t, NET_KT_DEFAULT);
static KALLOC_TYPE_DEFINE(fq_if_grp_zone, fq_if_group_t, NET_KT_DEFAULT);

SYSCTL_NODE(_net_classq, OID_AUTO, fq_codel, CTLFLAG_RW | CTLFLAG_LOCKED,
    0, "FQ-CODEL parameters");

SYSCTL_INT(_net_classq_fq_codel, OID_AUTO, fq_enable_pacing, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ifclassq_enable_pacing, 0, "Enable pacing");

static uint64_t fq_empty_purge_delay = FQ_EMPTY_PURGE_DELAY;
#if (DEVELOPMENT || DEBUG)
SYSCTL_QUAD(_net_classq_fq_codel, OID_AUTO, fq_empty_purge_delay, CTLFLAG_RW |
    CTLFLAG_LOCKED, &fq_empty_purge_delay, "Empty flow queue purge delay (ns)");
#endif /* DEVELOPMENT || DEBUG */

unsigned int ifclassq_enable_pacing = 1;

typedef STAILQ_HEAD(, flowq) flowq_dqlist_t;

static fq_if_t *fq_if_alloc(struct ifclassq *, classq_pkt_type_t);
static void fq_if_destroy(fq_if_t *fqs);
static void fq_if_classq_init(fq_if_group_t *fqg, uint32_t priority,
    uint32_t quantum, uint32_t drr_max, uint32_t svc_class);
static void fq_if_dequeue(fq_if_t *, fq_if_classq_t *, uint32_t,
    int64_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
    uint32_t *, flowq_dqlist_t *, bool, uint64_t, bool *, uint64_t *);
void fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat);
static void fq_if_purge(fq_if_t *);
static void fq_if_purge_classq(fq_if_t *, fq_if_classq_t *);
static void fq_if_purge_flow(fq_if_t *, fq_t *, uint32_t *, uint32_t *,
    uint64_t);
static void fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl);
static void fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl,
    fq_t *fq, uint64_t now);
static void fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq);
static void fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now,
    bool purge_all);
static inline void fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now);
static int fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq,
    mbuf_svc_class_t svc, u_int32_t maxpktcnt, u_int32_t maxbytecnt,
    classq_pkt_t *first_packet, classq_pkt_t *last_packet, u_int32_t *retpktcnt,
    u_int32_t *retbytecnt, uint8_t grp_idx);
static void fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp,
    cqrq_stat_sc_t *stat, uint64_t now);
static void fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp);
static inline boolean_t fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx);
static void fq_if_destroy_grps(fq_if_t *fqs);

uint32_t fq_codel_drr_max_values[FQ_IF_MAX_CLASSES] = {
    [FQ_IF_CTL_INDEX]    = 8,
    [FQ_IF_VO_INDEX]     = 8,
    [FQ_IF_VI_INDEX]     = 6,
    [FQ_IF_RV_INDEX]     = 6,
    [FQ_IF_AV_INDEX]     = 6,
    [FQ_IF_OAM_INDEX]    = 4,
    [FQ_IF_RD_INDEX]     = 4,
    [FQ_IF_BE_INDEX]     = 4,
    [FQ_IF_BK_INDEX]     = 2,
    [FQ_IF_BK_SYS_INDEX] = 2,
};

#define FQ_CODEL_DRR_MAX(_s) fq_codel_drr_max_values[FQ_IF_##_s##_INDEX]

static boolean_t fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state);
static void fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state dst_state, fq_if_state src_state);
static void fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state);
static int fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state, fq_if_group_t **selected_grp);
static void fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state dst_state, fq_if_state src_state);

static boolean_t fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state);
static void fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state dst_state, fq_if_state src_state);
static void fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state);
static int fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state state, fq_if_group_t **selected_grp);
static void fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri,
    fq_if_state dst_state, fq_if_state src_state);

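/*
 * Two bitmap-ops flavors select how the per-group priority-state bitmaps
 * are scanned (a summary inferred from the implementations below): the
 * plain ops ignore the priority argument and operate on a group's entire
 * bitmap at once, while the _sc_ ops touch only the single priority bit
 * passed in. The driver-managed scheduler installs the _sc_ flavor (see
 * fq_if_setup_ifclassq) because the driver dequeues one service class at
 * a time.
 */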
bitmap_ops_t fq_if_grps_bitmap_ops =
{
    .ffs   = fq_if_grps_bitmap_ffs,
    .zeros = fq_if_grps_bitmap_zeros,
    .cpy   = fq_if_grps_bitmap_cpy,
    .clr   = fq_if_grps_bitmap_clr,
    .move  = fq_if_grps_bitmap_move,
};

bitmap_ops_t fq_if_grps_sc_bitmap_ops =
{
    .ffs   = fq_if_grps_sc_bitmap_ffs,
    .zeros = fq_if_grps_sc_bitmap_zeros,
    .cpy   = fq_if_grps_sc_bitmap_cpy,
    .clr   = fq_if_grps_sc_bitmap_clr,
    .move  = fq_if_grps_sc_bitmap_move,
};

void
pktsched_fq_init(void)
{
    PE_parse_boot_argn("ifclassq_enable_pacing", &ifclassq_enable_pacing,
        sizeof(ifclassq_enable_pacing));

    /* format looks like ifcq_drr_max=8,8,6,... (one value per priority index) */
    char buf[(FQ_IF_MAX_CLASSES) * 3];
    size_t i, len, pri_index = 0;
    uint32_t drr = 0;
    if (!PE_parse_boot_arg_str("ifcq_drr_max", buf, sizeof(buf))) {
        return;
    }

    len = strlen(buf);
    for (i = 0; i < len + 1 && pri_index < FQ_IF_MAX_CLASSES; i++) {
        if (buf[i] != ',' && buf[i] != '\0') {
            VERIFY(buf[i] >= '0' && buf[i] <= '9');
            drr = drr * 10 + buf[i] - '0';
            continue;
        }
        fq_codel_drr_max_values[pri_index] = drr;
        pri_index += 1;
        drr = 0;
    }
}

#define FQ_IF_FLOW_HASH_ID(_flowid_) \
    (((_flowid_) >> FQ_IF_HASH_TAG_SHIFT) & FQ_IF_HASH_TAG_MASK)

#define FQ_IF_CLASSQ_IDLE(_fcl_) \
    (STAILQ_EMPTY(&(_fcl_)->fcl_new_flows) && \
    STAILQ_EMPTY(&(_fcl_)->fcl_old_flows))

typedef void (*fq_if_append_pkt_t)(classq_pkt_t *, classq_pkt_t *);
typedef boolean_t (*fq_getq_flow_t)(fq_if_t *, fq_if_classq_t *, fq_t *,
    int64_t, uint32_t, classq_pkt_t *, classq_pkt_t *, uint32_t *,
    uint32_t *, boolean_t *, uint64_t);

static void
fq_if_append_mbuf(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
    pkt->cp_mbuf->m_nextpkt = next_pkt->cp_mbuf;
}

static inline uint64_t
fq_codel_get_time(void)
{
    struct timespec ts;
    uint64_t now;

    nanouptime(&ts);
    now = ((uint64_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
    return now;
}

#if SKYWALK
static void
fq_if_append_pkt(classq_pkt_t *pkt, classq_pkt_t *next_pkt)
{
    pkt->cp_kpkt->pkt_nextpkt = next_pkt->cp_kpkt;
}
#endif /* SKYWALK */

#if SKYWALK
static boolean_t
fq_getq_flow_kpkt(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
    uint32_t plen;
    pktsched_pkt_t pkt;
    boolean_t limit_reached = FALSE;
    struct ifclassq *ifq = fqs->fqs_ifq;
    struct ifnet *ifp = ifq->ifcq_ifp;

    /*
     * Assert to make sure pflags is part of PKT_F_COMMON_MASK;
     * all common flags need to be declared in that mask.
     */
    while (fq->fq_deficit > 0 && limit_reached == FALSE &&
        !KPKTQ_EMPTY(&fq->fq_kpktq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
        _PKTSCHED_PKT_INIT(&pkt);
        fq_getq_flow(fqs, fq, &pkt, now);
        ASSERT(pkt.pktsched_ptype == QP_PACKET);

        plen = pktsched_get_pkt_len(&pkt);
        fq->fq_deficit -= plen;
        if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
            pkt.pktsched_pkt_kpkt->pkt_pflags |= PKT_F_NEW_FLOW;
            fq->fq_flags &= ~FQF_FRESH_FLOW;
        }

        if (head->cp_kpkt == NULL) {
            *head = pkt.pktsched_pkt;
        } else {
            ASSERT(tail->cp_kpkt != NULL);
            ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
            tail->cp_kpkt->pkt_nextpkt = pkt.pktsched_pkt_kpkt;
        }
        *tail = pkt.pktsched_pkt;
        tail->cp_kpkt->pkt_nextpkt = NULL;
        fq_cl->fcl_stat.fcl_dequeue++;
        fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
        *pkt_cnt += 1;
        *byte_cnt += plen;

        ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

        /* Check if the limit is reached */
        if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
            limit_reached = TRUE;
        }
    }
    KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
        AQM_KTRACE_FQ_GRP_SC_IDX(fq),
        fq->fq_bytes, fq->fq_min_qdelay);

    *qempty = KPKTQ_EMPTY(&fq->fq_kpktq);
    return limit_reached;
}
#endif /* SKYWALK */

static boolean_t
fq_getq_flow_mbuf(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    int64_t byte_limit, uint32_t pkt_limit, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t *byte_cnt, uint32_t *pkt_cnt,
    boolean_t *qempty, uint64_t now)
{
    u_int32_t plen;
    pktsched_pkt_t pkt;
    boolean_t limit_reached = FALSE;
    struct ifclassq *ifq = fqs->fqs_ifq;
    struct ifnet *ifp = ifq->ifcq_ifp;

    while (fq->fq_deficit > 0 && limit_reached == FALSE &&
        !MBUFQ_EMPTY(&fq->fq_mbufq) && fq_tx_time_ready(fqs, fq, now, NULL)) {
        _PKTSCHED_PKT_INIT(&pkt);
        fq_getq_flow(fqs, fq, &pkt, now);
        ASSERT(pkt.pktsched_ptype == QP_MBUF);

        plen = pktsched_get_pkt_len(&pkt);
        fq->fq_deficit -= plen;

        if (__improbable((fq->fq_flags & FQF_FRESH_FLOW) != 0)) {
            pkt.pktsched_pkt_mbuf->m_pkthdr.pkt_flags |= PKTF_NEW_FLOW;
            fq->fq_flags &= ~FQF_FRESH_FLOW;
        }

        if (head->cp_mbuf == NULL) {
            *head = pkt.pktsched_pkt;
        } else {
            ASSERT(tail->cp_mbuf != NULL);
            ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
            tail->cp_mbuf->m_nextpkt = pkt.pktsched_pkt_mbuf;
        }
        *tail = pkt.pktsched_pkt;
        tail->cp_mbuf->m_nextpkt = NULL;
        fq_cl->fcl_stat.fcl_dequeue++;
        fq_cl->fcl_stat.fcl_dequeue_bytes += plen;
        *pkt_cnt += 1;
        *byte_cnt += plen;

        ifclassq_set_packet_metadata(ifq, ifp, &pkt.pktsched_pkt);

        /* Check if the limit is reached */
        if (*pkt_cnt >= pkt_limit || *byte_cnt >= byte_limit) {
            limit_reached = TRUE;
        }
    }
    KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
        AQM_KTRACE_FQ_GRP_SC_IDX(fq),
        fq->fq_bytes, fq->fq_min_qdelay);

    *qempty = MBUFQ_EMPTY(&fq->fq_mbufq);
    return limit_reached;
}

static void
fq_if_pacemaker_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
    struct ifnet *ifp = (struct ifnet *)arg0;
    ASSERT(ifp != NULL);

    ifnet_start_ignore_delay(ifp);
}

fq_if_t *
fq_if_alloc(struct ifclassq *ifq, classq_pkt_type_t ptype)
{
    fq_if_t *fqs;

    ASSERT(ifq->ifcq_ifp != NULL);
    fqs = zalloc_flags(fq_if_zone, Z_WAITOK | Z_ZERO);
    fqs->fqs_ifq = ifq;
    fqs->fqs_ptype = ptype;

    /* Configure packet drop limit across all queues */
    fqs->fqs_pkt_droplimit = IFCQ_PKT_DROP_LIMIT(ifq);
    STAILQ_INIT(&fqs->fqs_fclist);
    TAILQ_INIT(&fqs->fqs_empty_list);
    TAILQ_INIT(&fqs->fqs_combined_grp_list);
    fqs->fqs_pacemaker_tcall = thread_call_allocate_with_options(fq_if_pacemaker_tcall,
        (thread_call_param_t)(ifq->ifcq_ifp), THREAD_CALL_PRIORITY_KERNEL,
        THREAD_CALL_OPTIONS_ONCE);
    ASSERT(fqs->fqs_pacemaker_tcall != NULL);

    return fqs;
}

void
fq_if_destroy(fq_if_t *fqs)
{
    struct ifnet *ifp = fqs->fqs_ifq->ifcq_ifp;
    thread_call_t tcall = fqs->fqs_pacemaker_tcall;

    VERIFY(ifp != NULL);
    ASSERT(tcall != NULL);
    IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
    LCK_MTX_ASSERT(&ifp->if_start_lock, LCK_MTX_ASSERT_NOTOWNED);
    IFCQ_CONVERT_LOCK(fqs->fqs_ifq);

    /*
     * Since we are holding the IFCQ lock here, another thread cannot enter
     * AQM and schedule a pacemaker call. So we do not need a sleep-wait
     * loop here; cancel-wait and free should succeed in one call.
     */
    thread_call_cancel_wait(tcall);
    ASSERT(thread_call_free(tcall));

    fq_if_purge(fqs);
    fq_if_destroy_grps(fqs);

    fqs->fqs_ifq = NULL;
    zfree(fq_if_zone, fqs);
}

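/*
 * Map a packet's service class to a class-queue priority index (a summary
 * of the switch below): in driver-managed mode only four traffic classes
 * are exposed (BK, BE, VI, VO), so the ten service classes are folded onto
 * those; otherwise each service class gets its own index.
 */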
static inline uint8_t
fq_if_service_to_priority(fq_if_t *fqs, mbuf_svc_class_t svc)
{
    uint8_t pri;

    if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
        switch (svc) {
        case MBUF_SC_BK_SYS:
        case MBUF_SC_BK:
            pri = FQ_IF_BK_INDEX;
            break;
        case MBUF_SC_BE:
        case MBUF_SC_RD:
        case MBUF_SC_OAM:
            pri = FQ_IF_BE_INDEX;
            break;
        case MBUF_SC_AV:
        case MBUF_SC_RV:
        case MBUF_SC_VI:
        case MBUF_SC_SIG:
            pri = FQ_IF_VI_INDEX;
            break;
        case MBUF_SC_VO:
        case MBUF_SC_CTL:
            pri = FQ_IF_VO_INDEX;
            break;
        default:
            pri = FQ_IF_BE_INDEX; /* Use best effort by default */
            break;
        }
        return pri;
    }

    /* scheduler is not managed by the driver */
    switch (svc) {
    case MBUF_SC_BK_SYS:
        pri = FQ_IF_BK_SYS_INDEX;
        break;
    case MBUF_SC_BK:
        pri = FQ_IF_BK_INDEX;
        break;
    case MBUF_SC_BE:
        pri = FQ_IF_BE_INDEX;
        break;
    case MBUF_SC_RD:
        pri = FQ_IF_RD_INDEX;
        break;
    case MBUF_SC_OAM:
        pri = FQ_IF_OAM_INDEX;
        break;
    case MBUF_SC_AV:
        pri = FQ_IF_AV_INDEX;
        break;
    case MBUF_SC_RV:
        pri = FQ_IF_RV_INDEX;
        break;
    case MBUF_SC_VI:
        pri = FQ_IF_VI_INDEX;
        break;
    case MBUF_SC_SIG:
        pri = FQ_IF_SIG_INDEX;
        break;
    case MBUF_SC_VO:
        pri = FQ_IF_VO_INDEX;
        break;
    case MBUF_SC_CTL:
        pri = FQ_IF_CTL_INDEX;
        break;
    default:
        pri = FQ_IF_BE_INDEX; /* Use best effort by default */
        break;
    }
    return pri;
}

void
fq_if_classq_init(fq_if_group_t *fqg, uint32_t pri, uint32_t quantum,
    uint32_t drr_max, uint32_t svc_class)
{
    fq_if_classq_t *fq_cl;
    VERIFY(pri < FQ_IF_MAX_CLASSES);
    fq_cl = &fqg->fqg_classq[pri];

    VERIFY(fq_cl->fcl_quantum == 0);
    VERIFY(quantum != 0);
    fq_cl->fcl_quantum = quantum;
    fq_cl->fcl_pri = pri;
    fq_cl->fcl_drr_max = drr_max;
    fq_cl->fcl_service_class = svc_class;
    fq_cl->fcl_next_tx_time = 0;
    fq_cl->fcl_flags = 0;
    STAILQ_INIT(&fq_cl->fcl_new_flows);
    STAILQ_INIT(&fq_cl->fcl_old_flows);
}

int
fq_if_enqueue_classq(struct ifclassq *ifq, classq_pkt_t *head,
    classq_pkt_t *tail, uint32_t cnt, uint32_t bytes, boolean_t *pdrop)
{
    uint8_t pri, grp_idx = 0;
    fq_if_t *fqs;
    fq_if_classq_t *fq_cl;
    fq_if_group_t *fq_group;
    int ret;
    mbuf_svc_class_t svc;
    pktsched_pkt_t pkt;

    pktsched_pkt_encap_chain(&pkt, head, tail, cnt, bytes);

    fqs = (fq_if_t *)ifq->ifcq_disc;
    svc = pktsched_get_pkt_svc(&pkt);
#if SKYWALK
    if (head->cp_ptype == QP_PACKET) {
        grp_idx = head->cp_kpkt->pkt_qset_idx;
    }
#endif /* SKYWALK */
    pri = fq_if_service_to_priority(fqs, svc);
    VERIFY(pri < FQ_IF_MAX_CLASSES);

    IFCQ_LOCK_SPIN(ifq);
    fq_group = fq_if_find_grp(fqs, grp_idx);
    fq_cl = &fq_group->fqg_classq[pri];

    if (__improbable(svc == MBUF_SC_BK_SYS && fqs->fqs_throttle == 1)) {
        IFCQ_UNLOCK(ifq);
        /* BK_SYS is currently throttled */
        os_atomic_inc(&fq_cl->fcl_stat.fcl_throttle_drops, relaxed);
        pktsched_free_pkt(&pkt);
        *pdrop = TRUE;
        ret = EQSUSPENDED;
        goto done;
    }

    ASSERT(pkt.pktsched_ptype == fqs->fqs_ptype);
    ret = fq_addq(fqs, fq_group, &pkt, fq_cl);
    if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
        if (((fq_group->fqg_bitmaps[FQ_IF_ER] | fq_group->fqg_bitmaps[FQ_IF_EB]) &
            (1 << pri)) == 0) {
            /*
             * this class is in neither the ER nor the EB bitmap;
             * mark it as IB
             */
            pktsched_bit_set(pri, &fq_group->fqg_bitmaps[FQ_IF_IB]);
        }
    }

    if (__improbable(ret != 0)) {
        if (ret == CLASSQEQ_SUCCESS_FC) {
            /* packet enqueued, return advisory feedback */
            ret = EQFULL;
            *pdrop = FALSE;
        } else if (ret == CLASSQEQ_COMPRESSED) {
            ret = 0;
            *pdrop = FALSE;
        } else {
            IFCQ_UNLOCK(ifq);
            *pdrop = TRUE;
            pktsched_free_pkt(&pkt);
            switch (ret) {
            case CLASSQEQ_DROP:
                ret = ENOBUFS;
                goto done;
            case CLASSQEQ_DROP_FC:
                ret = EQFULL;
                goto done;
            case CLASSQEQ_DROP_SP:
                ret = EQSUSPENDED;
                goto done;
            default:
                VERIFY(0);
                /* NOTREACHED */
                __builtin_unreachable();
            }
            /* NOTREACHED */
            __builtin_unreachable();
        }
    } else {
        *pdrop = FALSE;
    }
    IFCQ_ADD_LEN(ifq, cnt);
    IFCQ_INC_BYTES(ifq, bytes);

    FQS_GRP_ADD_LEN(fqs, grp_idx, cnt);
    FQS_GRP_INC_BYTES(fqs, grp_idx, bytes);

    IFCQ_UNLOCK(ifq);
done:
#if DEBUG || DEVELOPMENT
    if (__improbable((ret == EQFULL) && (ifclassq_flow_control_adv == 0))) {
        ret = 0;
    }
#endif /* DEBUG || DEVELOPMENT */
    return ret;
}

void
fq_if_dequeue_classq(struct ifclassq *ifq, classq_pkt_t *pkt, uint8_t grp_idx)
{
    (void) fq_if_dequeue_classq_multi(ifq, 1,
        CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
}

void
fq_if_dequeue_sc_classq(struct ifclassq *ifq, mbuf_svc_class_t svc,
    classq_pkt_t *pkt, uint8_t grp_idx)
{
    (void) fq_if_dequeue_sc_classq_multi(ifq, svc, 1,
        CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, pkt, NULL, NULL, NULL, grp_idx);
}

static inline void
fq_dqlist_add(flowq_dqlist_t *fq_dqlist_head, fq_t *fq)
{
    ASSERT(fq->fq_dq_head.cp_mbuf == NULL);
    ASSERT(!fq->fq_in_dqlist);
    STAILQ_INSERT_TAIL(fq_dqlist_head, fq, fq_dqlink);
    fq->fq_in_dqlist = true;
}

static inline void
fq_dqlist_remove(flowq_dqlist_t *fq_dqlist_head, fq_t *fq, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
    ASSERT(fq->fq_in_dqlist);
    if (fq->fq_dq_head.cp_mbuf == NULL) {
        goto done;
    }

    if (head->cp_mbuf == NULL) {
        *head = fq->fq_dq_head;
    } else {
        ASSERT(tail->cp_mbuf != NULL);

        switch (ptype) {
        case QP_MBUF:
            ASSERT(tail->cp_mbuf->m_nextpkt == NULL);
            tail->cp_mbuf->m_nextpkt = fq->fq_dq_head.cp_mbuf;
            ASSERT(fq->fq_dq_tail.cp_mbuf->m_nextpkt == NULL);
            break;
#if SKYWALK
        case QP_PACKET:
            ASSERT(tail->cp_kpkt->pkt_nextpkt == NULL);
            tail->cp_kpkt->pkt_nextpkt = fq->fq_dq_head.cp_kpkt;
            ASSERT(fq->fq_dq_tail.cp_kpkt->pkt_nextpkt == NULL);
            break;
#endif /* SKYWALK */
        default:
            VERIFY(0);
            /* NOTREACHED */
            __builtin_unreachable();
        }
    }
    *tail = fq->fq_dq_tail;
done:
    STAILQ_REMOVE(fq_dqlist_head, fq, flowq, fq_dqlink);
    CLASSQ_PKT_INIT(&fq->fq_dq_head);
    CLASSQ_PKT_INIT(&fq->fq_dq_tail);
    fq->fq_in_dqlist = false;
}

static inline void
fq_dqlist_get_packet_list(flowq_dqlist_t *fq_dqlist_head, classq_pkt_t *head,
    classq_pkt_t *tail, classq_pkt_type_t ptype)
{
    fq_t *fq, *tfq;

    STAILQ_FOREACH_SAFE(fq, fq_dqlist_head, fq_dqlink, tfq) {
        fq_dqlist_remove(fq_dqlist_head, fq, head, tail, ptype);
    }
}

static int
fq_if_grps_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
    fq_if_group_t **selected_grp)
{
#pragma unused(pri)

    fq_if_group_t *grp;
    uint32_t highest_pri = FQ_IF_MAX_CLASSES;
    int ret_pri = 0;

    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        uint32_t cur_pri = pktsched_ffs(grp->fqg_bitmaps[state]);
        /* bitmap is empty in this case */
        if (cur_pri == 0) {
            continue;
        }
        if (cur_pri <= highest_pri) {
            highest_pri = cur_pri;
            ret_pri = cur_pri;
            *selected_grp = grp;
        }
    }
    return ret_pri;
}

static boolean_t
fq_if_grps_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
{
#pragma unused(pri)

    fq_if_group_t *grp;

    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        if (grp->fqg_bitmaps[state] != 0) {
            return FALSE;
        }
    }
    return TRUE;
}

static void
fq_if_grps_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
    fq_if_state src_state)
{
#pragma unused(pri)

    fq_if_group_t *grp;
    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        grp->fqg_bitmaps[dst_state] = grp->fqg_bitmaps[src_state];
    }
}

static void
fq_if_grps_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
{
#pragma unused(pri)

    fq_if_group_t *grp;
    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        grp->fqg_bitmaps[state] = 0;
    }
}

static void
fq_if_grps_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
    fq_if_state src_state)
{
#pragma unused(pri)

    fq_if_group_t *grp;
    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        grp->fqg_bitmaps[dst_state] =
            grp->fqg_bitmaps[dst_state] | grp->fqg_bitmaps[src_state];
        grp->fqg_bitmaps[src_state] = 0;
    }
}

static int
fq_if_grps_sc_bitmap_ffs(fq_grp_tailq_t *grp_list, int pri, fq_if_state state,
    fq_if_group_t **selected_grp)
{
    fq_if_group_t *grp;
    int ret_pri = 0;

    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
            /* +1 to match the semantics of pktsched_ffs */
            ret_pri = pri + 1;
            *selected_grp = grp;
            break;
        }
    }

    return ret_pri;
}

static boolean_t
fq_if_grps_sc_bitmap_zeros(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
{
    fq_if_group_t *grp;

    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        if (pktsched_bit_tst(pri, &grp->fqg_bitmaps[state])) {
            return FALSE;
        }
    }
    return TRUE;
}

static void
fq_if_grps_sc_bitmap_cpy(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
    fq_if_state src_state)
{
    fq_if_group_t *grp;

    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        pktsched_bit_cpy(pri, &grp->fqg_bitmaps[dst_state],
            &grp->fqg_bitmaps[src_state]);
    }
}

static void
fq_if_grps_sc_bitmap_clr(fq_grp_tailq_t *grp_list, int pri, fq_if_state state)
{
    fq_if_group_t *grp;

    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        pktsched_bit_clr(pri, &grp->fqg_bitmaps[state]);
    }
}

static void
fq_if_grps_sc_bitmap_move(fq_grp_tailq_t *grp_list, int pri, fq_if_state dst_state,
    fq_if_state src_state)
{
    fq_if_group_t *grp;

    TAILQ_FOREACH(grp, grp_list, fqg_grp_link) {
        pktsched_bit_move(pri, &grp->fqg_bitmaps[dst_state],
            &grp->fqg_bitmaps[src_state]);
        pktsched_bit_clr(pri, &grp->fqg_bitmaps[src_state]);
    }
}

/*
 * The pacemaker is scheduled only when no packet can be dequeued from AQM
 * due to pacing. It doorbells the driver once the current time reaches
 * next_tx_time. For now this applies only to L4S traffic.
 */
static void
fq_if_schedule_pacemaker(fq_if_t *fqs, uint64_t now, uint64_t next_tx_time)
{
    uint64_t deadline = 0;
    if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
        return;
    }
    ASSERT(next_tx_time != FQ_INVALID_TX_TS);
    ASSERT(fqs->fqs_pacemaker_tcall != NULL);
    ASSERT(now < next_tx_time);

    DTRACE_SKYWALK2(pacemaker__schedule, struct ifnet *, fqs->fqs_ifq->ifcq_ifp,
        uint64_t, next_tx_time - now);
    KDBG(AQM_KTRACE_TX_PACEMAKER, fqs->fqs_ifq->ifcq_ifp->if_index, now,
        next_tx_time, next_tx_time - now);

    clock_interval_to_deadline((uint32_t)(next_tx_time - now), 1, &deadline);
    thread_call_enter_delayed(fqs->fqs_pacemaker_tcall, deadline);
}

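/*
 * A sketch of the scheduling pass below, as inferred from the bitmap
 * manipulation in this file: each group keeps one bitmap per scheduler
 * state, one bit per priority class. Classes in ER are serviced first,
 * highest-priority bit first; when ER and EB are both empty, IB (classes
 * that went busy or exhausted their DRR budget) is promoted to EB, and
 * classes are then promoted from EB to ER one at a time. IR parks classes
 * whose flows are all paced so the loop does not spin on them; they are
 * moved back to ER before the pass returns.
 */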
static int
fq_if_dequeue_classq_multi_common(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
    uint32_t total_pktcnt = 0, total_bytecnt = 0;
    classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
    classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
    classq_pkt_t tmp = CLASSQ_PKT_INITIALIZER(tmp);
    fq_if_append_pkt_t append_pkt;
    flowq_dqlist_t fq_dqlist_head;
    fq_if_classq_t *fq_cl;
    fq_grp_tailq_t *grp_list, tmp_grp_list;
    fq_if_group_t *fq_grp = NULL;
    fq_if_t *fqs;
    uint64_t now, next_tx_time = FQ_INVALID_TX_TS;
    int pri = 0, svc_pri = 0;
    bool all_paced = true;

    IFCQ_LOCK_ASSERT_HELD(ifq);

    fqs = (fq_if_t *)ifq->ifcq_disc;
    STAILQ_INIT(&fq_dqlist_head);

    switch (fqs->fqs_ptype) {
    case QP_MBUF:
        append_pkt = fq_if_append_mbuf;
        break;

#if SKYWALK
    case QP_PACKET:
        append_pkt = fq_if_append_pkt;
        break;
#endif /* SKYWALK */

    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    now = fq_codel_get_time();
    if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
        svc_pri = fq_if_service_to_priority(fqs, svc);
    } else {
        VERIFY(svc == MBUF_SC_UNSPEC);
    }

    if (fq_if_is_grp_combined(fqs, grp_idx)) {
        grp_list = &fqs->fqs_combined_grp_list;
        VERIFY(!TAILQ_EMPTY(grp_list));
    } else {
        grp_list = &tmp_grp_list;
        fq_grp = fq_if_find_grp(fqs, grp_idx);
        TAILQ_INIT(grp_list);
        TAILQ_INSERT_TAIL(grp_list, fq_grp, fqg_grp_link);
    }

    for (;;) {
        uint32_t pktcnt = 0, bytecnt = 0;
        classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
        classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
        bool fq_cl_all_paced = false;
        uint64_t fq_cl_next_tx_time = FQ_INVALID_TX_TS;

        if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_ER) &&
            fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
            fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_EB, FQ_IF_IB);
            fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IB);
            if (fqs->grp_bitmaps_zeros(grp_list, svc_pri, FQ_IF_EB)) {
                if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
                    /*
                     * Move fq_cls in IR back to ER, so that they will be
                     * inspected with priority the next time the driver
                     * dequeues
                     */
                    fqs->grp_bitmaps_cpy(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
                    fqs->grp_bitmaps_clr(grp_list, svc_pri, FQ_IF_IR);
                }
                break;
            }
        }
        pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_ER, &fq_grp);
        if (pri == 0) {
            /*
             * There are no ER flows; move the highest-priority one
             * from EB, if there are any in that category
             */
            pri = fqs->grp_bitmaps_ffs(grp_list, svc_pri, FQ_IF_EB, &fq_grp);
            VERIFY(pri > 0);
            VERIFY(fq_grp != NULL);
            pktsched_bit_clr((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_EB]);
            pktsched_bit_set((pri - 1), &fq_grp->fqg_bitmaps[FQ_IF_ER]);
        }
        VERIFY(fq_grp != NULL);
        pri--; /* index starts at 0 */
        fq_cl = &fq_grp->fqg_classq[pri];

        if (fq_cl->fcl_budget <= 0) {
            /* Update the budget */
            fq_cl->fcl_budget += (min(fq_cl->fcl_drr_max,
                fq_cl->fcl_stat.fcl_flows_cnt) *
                fq_cl->fcl_quantum);
            if (fq_cl->fcl_budget <= 0) {
                goto state_change;
            }
        }
        fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
            (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
            &bytecnt, &fq_dqlist_head, true, now, &fq_cl_all_paced,
            &fq_cl_next_tx_time);
        if (head.cp_mbuf != NULL) {
            ASSERT(STAILQ_EMPTY(&fq_dqlist_head));
            if (first.cp_mbuf == NULL) {
                first = head;
            } else {
                ASSERT(last.cp_mbuf != NULL);
                append_pkt(&last, &head);
            }
            last = tail;
            append_pkt(&last, &tmp);
        }
        if (fq_cl_all_paced && fq_cl_next_tx_time < next_tx_time) {
            fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
            next_tx_time = fq_cl_next_tx_time;
        }
        fq_cl->fcl_budget -= bytecnt;
        total_pktcnt += pktcnt;
        total_bytecnt += bytecnt;

        /*
         * If the class has exceeded the budget but still has data
         * to send, move it to IB
         */
state_change:
        VERIFY(fq_grp != NULL);
        all_paced &= fq_cl_all_paced;
        if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
            if (fq_cl->fcl_budget <= 0) {
                pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
                pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
            } else if (fq_cl_all_paced) {
                if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
                    /*
                     * If a fq_cl still has budget but only paced queues,
                     * park it in IR so that we will not keep looping
                     * over it
                     */
                    pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IR]);
                    pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
                }
            }
        } else {
            pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
            VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
                fq_grp->fqg_bitmaps[FQ_IF_EB] |
                fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
            fq_cl->fcl_budget = 0;
        }
        if (total_pktcnt >= maxpktcnt || total_bytecnt >= maxbytecnt) {
            if (ifclassq_enable_pacing && ifclassq_enable_l4s) {
                /*
                 * Move fq_cls in IR back to ER, so that they will be
                 * inspected with priority the next time the driver
                 * dequeues
                 */
                fqs->grp_bitmaps_move(grp_list, svc_pri, FQ_IF_ER, FQ_IF_IR);
            }
            break;
        }
    }

    if (!fq_if_is_grp_combined(fqs, grp_idx)) {
        TAILQ_REMOVE(grp_list, fq_grp, fqg_grp_link);
        VERIFY(TAILQ_EMPTY(grp_list));
    }

    fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last,
        fqs->fqs_ptype);

    if (__probable(first_packet != NULL)) {
        *first_packet = first;
    }
    if (last_packet != NULL) {
        *last_packet = last;
    }
    if (retpktcnt != NULL) {
        *retpktcnt = total_pktcnt;
    }
    if (retbytecnt != NULL) {
        *retbytecnt = total_bytecnt;
    }
    if (next_tx_time != FQ_INVALID_TX_TS) {
        ASSERT(next_tx_time > now);
        fq_if_schedule_pacemaker(fqs, now, next_tx_time);
    }

    IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
    fq_if_purge_empty_flow_list(fqs, now, false);
    return 0;
}

int
fq_if_dequeue_classq_multi(struct ifclassq *ifq, u_int32_t maxpktcnt,
    u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt,
    u_int32_t *retbytecnt, uint8_t grp_idx)
{
    return fq_if_dequeue_classq_multi_common(ifq, MBUF_SC_UNSPEC, maxpktcnt, maxbytecnt,
        first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
}

int
fq_if_dequeue_sc_classq_multi(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
    fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

    if (fq_if_is_grp_combined(fqs, grp_idx)) {
        return fq_if_dequeue_classq_multi_common(ifq, svc, maxpktcnt, maxbytecnt,
            first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
    } else {
        /*
         * Take a shortcut here: when only a single service class is
         * being dequeued, there is no need to run the scheduler.
         */
        return fq_if_dequeue_sc_classq_multi_separate(ifq, svc, maxpktcnt, maxbytecnt,
            first_packet, last_packet, retpktcnt, retbytecnt, grp_idx);
    }
}

static int
fq_if_dequeue_sc_classq_multi_separate(struct ifclassq *ifq, mbuf_svc_class_t svc,
    u_int32_t maxpktcnt, u_int32_t maxbytecnt, classq_pkt_t *first_packet,
    classq_pkt_t *last_packet, u_int32_t *retpktcnt, u_int32_t *retbytecnt,
    uint8_t grp_idx)
{
    fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;
    uint8_t pri;
    u_int32_t total_pktcnt = 0, total_bytecnt = 0;
    fq_if_classq_t *fq_cl;
    classq_pkt_t first = CLASSQ_PKT_INITIALIZER(first);
    classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
    fq_if_append_pkt_t append_pkt;
    flowq_dqlist_t fq_dqlist_head;
    fq_if_group_t *fq_grp;
    uint64_t now;

    switch (fqs->fqs_ptype) {
    case QP_MBUF:
        append_pkt = fq_if_append_mbuf;
        break;

#if SKYWALK
    case QP_PACKET:
        append_pkt = fq_if_append_pkt;
        break;
#endif /* SKYWALK */

    default:
        VERIFY(0);
        /* NOTREACHED */
        __builtin_unreachable();
    }

    STAILQ_INIT(&fq_dqlist_head);
    now = fq_codel_get_time();

    pri = fq_if_service_to_priority(fqs, svc);
    fq_grp = fq_if_find_grp(fqs, grp_idx);
    fq_cl = &fq_grp->fqg_classq[pri];

    /*
     * Now we have the queue for a particular service class. We need
     * to dequeue as many packets as needed, first from the new flows
     * and then from the old flows.
     */
    while (total_pktcnt < maxpktcnt && total_bytecnt < maxbytecnt &&
        fq_cl->fcl_stat.fcl_pkt_cnt > 0) {
        classq_pkt_t head = CLASSQ_PKT_INITIALIZER(head);
        classq_pkt_t tail = CLASSQ_PKT_INITIALIZER(tail);
        u_int32_t pktcnt = 0, bytecnt = 0;
        bool all_paced = false;
        uint64_t next_tx_time = FQ_INVALID_TX_TS;

        fq_if_dequeue(fqs, fq_cl, (maxpktcnt - total_pktcnt),
            (maxbytecnt - total_bytecnt), &head, &tail, &pktcnt,
            &bytecnt, &fq_dqlist_head, false, now, &all_paced, &next_tx_time);
        if (head.cp_mbuf != NULL) {
            if (first.cp_mbuf == NULL) {
                first = head;
            } else {
                ASSERT(last.cp_mbuf != NULL);
                append_pkt(&last, &head);
            }
            last = tail;
        }
        total_pktcnt += pktcnt;
        total_bytecnt += bytecnt;

        if (next_tx_time != FQ_INVALID_TX_TS) {
            ASSERT(next_tx_time > now);
            fq_cl->fcl_stat.fcl_fcl_pacemaker_needed++;
            fq_if_schedule_pacemaker(fqs, now, next_tx_time);
            break;
        }
    }

    /*
     * Mark the classq as IB if it's not idle, so that we can start
     * without re-initializing the bitmaps when it's switched to
     * combined mode.
     */
    if (!FQ_IF_CLASSQ_IDLE(fq_cl)) {
        pktsched_bit_set(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
        pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_ER]);
        pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_EB]);
    } else {
        pktsched_bit_clr(pri, &fq_grp->fqg_bitmaps[FQ_IF_IB]);
        VERIFY(((fq_grp->fqg_bitmaps[FQ_IF_ER] |
            fq_grp->fqg_bitmaps[FQ_IF_EB] |
            fq_grp->fqg_bitmaps[FQ_IF_IB]) & (1 << pri)) == 0);
    }

    fq_dqlist_get_packet_list(&fq_dqlist_head, &first, &last, fqs->fqs_ptype);

    if (__probable(first_packet != NULL)) {
        *first_packet = first;
    }
    if (last_packet != NULL) {
        *last_packet = last;
    }
    if (retpktcnt != NULL) {
        *retpktcnt = total_pktcnt;
    }
    if (retbytecnt != NULL) {
        *retbytecnt = total_bytecnt;
    }

    IFCQ_XMIT_ADD(ifq, total_pktcnt, total_bytecnt);
    fq_if_purge_empty_flow_list(fqs, now, false);
    return 0;
}

static void
fq_if_purge_flow(fq_if_t *fqs, fq_t *fq, uint32_t *pktsp,
    uint32_t *bytesp, uint64_t now)
{
    fq_if_classq_t *fq_cl;
    u_int32_t pkts, bytes;
    pktsched_pkt_t pkt;
    fq_if_group_t *grp;

    fq_cl = &FQ_CLASSQ(fq);
    grp = FQ_GROUP(fq);
    pkts = bytes = 0;
    _PKTSCHED_PKT_INIT(&pkt);
    for (;;) {
        fq_getq_flow(fqs, fq, &pkt, now);
        if (pkt.pktsched_pkt_mbuf == NULL) {
            VERIFY(pkt.pktsched_ptype == QP_INVALID);
            break;
        }
        pkts++;
        bytes += pktsched_get_pkt_len(&pkt);
        pktsched_free_pkt(&pkt);
    }
    KDBG(AQM_KTRACE_STATS_FLOW_DEQUEUE, fq->fq_flowhash,
        AQM_KTRACE_FQ_GRP_SC_IDX(fq), fq->fq_bytes, fq->fq_min_qdelay);

    IFCQ_DROP_ADD(fqs->fqs_ifq, pkts, bytes);

    /* move through the flow queue states */
    VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_EMPTY_FLOW)));
    if (fq->fq_flags & FQF_NEW_FLOW) {
        fq_if_empty_new_flow(fq, fq_cl);
    }
    if (fq->fq_flags & FQF_OLD_FLOW) {
        fq_if_empty_old_flow(fqs, fq_cl, fq, now);
    }
    if (fq->fq_flags & FQF_EMPTY_FLOW) {
        fq_if_purge_empty_flow(fqs, fq);
        fq = NULL;
    }

    if (FQ_IF_CLASSQ_IDLE(fq_cl)) {
        int i;
        for (i = FQ_IF_ER; i < FQ_IF_MAX_STATE; i++) {
            pktsched_bit_clr(fq_cl->fcl_pri, &grp->fqg_bitmaps[i]);
        }
    }

    if (pktsp != NULL) {
        *pktsp = pkts;
    }
    if (bytesp != NULL) {
        *bytesp = bytes;
    }
}

static void
fq_if_purge_classq(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
    fq_t *fq, *tfq;
    uint64_t now;

    now = fq_codel_get_time();
    /*
     * Take each flow from new/old flow list and flush mbufs
     * in that flow
     */
    STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
        fq_if_purge_flow(fqs, fq, NULL, NULL, now);
    }
    STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
        fq_if_purge_flow(fqs, fq, NULL, NULL, now);
    }
    VERIFY(STAILQ_EMPTY(&fq_cl->fcl_new_flows));
    VERIFY(STAILQ_EMPTY(&fq_cl->fcl_old_flows));

    STAILQ_INIT(&fq_cl->fcl_new_flows);
    STAILQ_INIT(&fq_cl->fcl_old_flows);
    fq_cl->fcl_budget = 0;
}

static void
fq_if_purge(fq_if_t *fqs)
{
    uint64_t now;
    fq_if_group_t *grp;
    int i;

    IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
    for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
        if (fqs->fqs_classq_groups[grp_idx] == NULL) {
            continue;
        }

        grp = fq_if_find_grp(fqs, grp_idx);
        fq_if_purge_grp(fqs, grp);
    }

    now = fq_codel_get_time();
    fq_if_purge_empty_flow_list(fqs, now, true);

    VERIFY(STAILQ_EMPTY(&fqs->fqs_fclist));
    VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));

    fqs->fqs_large_flow = NULL;
    for (i = 0; i < FQ_IF_HASH_TABLE_SIZE; i++) {
        VERIFY(SLIST_EMPTY(&fqs->fqs_flows[i]));
    }

    IFCQ_LEN(fqs->fqs_ifq) = 0;
    IFCQ_BYTES(fqs->fqs_ifq) = 0;
}

static void
fq_if_purge_sc(fq_if_t *fqs, cqrq_purge_sc_t *req)
{
    fq_t *fq;
    uint64_t now;
    fq_if_group_t *grp;

    IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
    req->packets = req->bytes = 0;
    VERIFY(req->flow != 0);

    now = fq_codel_get_time();

    for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
        if (fqs->fqs_classq_groups[grp_idx] == NULL) {
            continue;
        }
        uint32_t bytes = 0, pkts = 0;

        grp = fq_if_find_grp(fqs, grp_idx);
        /*
         * Packet and traffic type are needed only if we want
         * to create a flow queue.
         */
        fq = fq_if_hash_pkt(fqs, grp, req->flow, req->sc, 0, false, FQ_TFC_C);
        if (fq != NULL) {
            fq_if_purge_flow(fqs, fq, &pkts, &bytes, now);
            req->bytes += bytes;
            req->packets += pkts;
        }
    }
}

static uint16_t
fq_if_calc_quantum(struct ifnet *ifp)
{
    uint16_t quantum;

    switch (ifp->if_family) {
    case IFNET_FAMILY_ETHERNET:
        VERIFY((ifp->if_mtu + ETHER_HDR_LEN) <= UINT16_MAX);
        quantum = (uint16_t)ifp->if_mtu + ETHER_HDR_LEN;
        break;

    case IFNET_FAMILY_CELLULAR:
    case IFNET_FAMILY_IPSEC:
    case IFNET_FAMILY_UTUN:
        VERIFY(ifp->if_mtu <= UINT16_MAX);
        quantum = (uint16_t)ifp->if_mtu;
        break;

    default:
        quantum = FQ_CODEL_DEFAULT_QUANTUM;
        break;
    }

    if ((ifp->if_hwassist & IFNET_TSOF) != 0) {
        VERIFY(ifp->if_tso_v4_mtu <= UINT16_MAX);
        VERIFY(ifp->if_tso_v6_mtu <= UINT16_MAX);
        quantum = (uint16_t)MAX(ifp->if_tso_v4_mtu, ifp->if_tso_v6_mtu);
        quantum = (quantum != 0) ? quantum : IF_MAXMTU;
    }

    quantum = MAX(FQ_CODEL_DEFAULT_QUANTUM, quantum);
#if DEBUG || DEVELOPMENT
    quantum = (fq_codel_quantum != 0) ? fq_codel_quantum : quantum;
#endif /* DEBUG || DEVELOPMENT */
    VERIFY(quantum != 0);
    return quantum;
}

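/*
 * Worked example for fq_if_calc_quantum() above (values are illustrative):
 * a plain Ethernet interface with if_mtu = 1500 gets a quantum of
 * 1500 + 14 (ETHER_HDR_LEN) = 1514 bytes; cellular/ipsec/utun use the MTU
 * as-is; with TSO enabled the quantum grows to the larger of the v4/v6
 * TSO MTUs so one DRR round can cover a full TSO burst. The result is
 * never below FQ_CODEL_DEFAULT_QUANTUM (1500).
 */
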
static void
fq_if_mtu_update(fq_if_t *fqs)
{
#define _FQ_CLASSQ_UPDATE_QUANTUM(_grp, _s, _q) \
    (_grp)->fqg_classq[FQ_IF_ ## _s ## _INDEX].fcl_quantum = \
    FQ_CODEL_QUANTUM_ ## _s(_q)

    uint32_t quantum;
    fq_if_group_t *grp;

    quantum = fq_if_calc_quantum(fqs->fqs_ifq->ifcq_ifp);

    for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
        if (fqs->fqs_classq_groups[grp_idx] == NULL) {
            continue;
        }

        grp = fq_if_find_grp(fqs, grp_idx);

        if ((fqs->fqs_flags & FQS_DRIVER_MANAGED) != 0) {
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
        } else {
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, BK_SYS, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, BK, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, BE, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, RD, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, OAM, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, AV, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, RV, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, VI, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, VO, quantum);
            _FQ_CLASSQ_UPDATE_QUANTUM(grp, CTL, quantum);
        }
    }
#undef _FQ_CLASSQ_UPDATE_QUANTUM
}

static void
fq_if_event(fq_if_t *fqs, cqev_t ev)
{
    IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);

    switch (ev) {
    case CLASSQ_EV_LINK_UP:
    case CLASSQ_EV_LINK_DOWN:
        fq_if_purge(fqs);
        break;
    case CLASSQ_EV_LINK_MTU:
        fq_if_mtu_update(fqs);
        break;
    default:
        break;
    }
}

static void
fq_if_classq_suspend(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
    fq_if_purge_classq(fqs, fq_cl);
    fqs->fqs_throttle = 1;
    fq_cl->fcl_stat.fcl_throttle_on++;
    KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_START,
        fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
}

static void
fq_if_classq_resume(fq_if_t *fqs, fq_if_classq_t *fq_cl)
{
    VERIFY(FQ_IF_CLASSQ_IDLE(fq_cl));
    fqs->fqs_throttle = 0;
    fq_cl->fcl_stat.fcl_throttle_off++;
    KDBG(AQM_KTRACE_AON_THROTTLE | DBG_FUNC_END,
        fqs->fqs_ifq->ifcq_ifp->if_index, 0, 0, 0);
}

static int
fq_if_throttle(fq_if_t *fqs, cqrq_throttle_t *tr)
{
    struct ifclassq *ifq = fqs->fqs_ifq;
    uint8_t index;
    fq_if_group_t *grp;

#if !MACH_ASSERT
#pragma unused(ifq)
#endif
    IFCQ_LOCK_ASSERT_HELD(ifq);

    if (!tr->set) {
        tr->level = fqs->fqs_throttle;
        return 0;
    }

    if (tr->level == fqs->fqs_throttle) {
        return EALREADY;
    }

    /* Throttling is allowed on the BK_SYS class only */
    index = fq_if_service_to_priority(fqs, MBUF_SC_BK_SYS);

    for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
        if (fqs->fqs_classq_groups[grp_idx] == NULL) {
            continue;
        }
        grp = fq_if_find_grp(fqs, grp_idx);
        switch (tr->level) {
        case IFNET_THROTTLE_OFF:
            fq_if_classq_resume(fqs, &grp->fqg_classq[index]);
            break;
        case IFNET_THROTTLE_OPPORTUNISTIC:
            fq_if_classq_suspend(fqs, &grp->fqg_classq[index]);
            break;
        default:
            break;
        }
    }
    return 0;
}

static inline boolean_t
fq_if_is_fq_cl_paced(fq_if_classq_t *fq_cl, uint64_t now)
{
    if ((fq_cl->fcl_flags & FCL_PACED) != 0 && fq_cl->fcl_next_tx_time > now) {
        return true;
    }

    fq_cl->fcl_flags &= ~FCL_PACED;
    fq_cl->fcl_next_tx_time = 0;
    return false;
}

static void
fq_if_grp_stat_sc(fq_if_t *fqs, fq_if_group_t *grp, cqrq_stat_sc_t *stat, uint64_t now)
{
    uint8_t pri;
    fq_if_classq_t *fq_cl;

    ASSERT(stat != NULL);
    pri = fq_if_service_to_priority(fqs, stat->sc);

    fq_cl = &grp->fqg_classq[pri];
    stat->packets = (uint32_t)fq_cl->fcl_stat.fcl_pkt_cnt;
    stat->bytes = (uint32_t)fq_cl->fcl_stat.fcl_byte_cnt;

    if (ifclassq_enable_pacing && ifclassq_enable_l4s &&
        fq_if_is_fq_cl_paced(fq_cl, now)) {
        stat->packets = 0;
        stat->bytes = 0;
    }
}

static boolean_t
fq_if_is_grp_all_paced(fq_if_group_t *grp)
{
    fq_if_classq_t *fq_cl;
    uint64_t now;

    if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
        return false;
    }

    now = fq_codel_get_time();
    for (uint8_t fq_cl_idx = 0; fq_cl_idx < FQ_IF_MAX_CLASSES; fq_cl_idx++) {
        fq_cl = &grp->fqg_classq[fq_cl_idx];
        if (fq_cl == NULL || FQ_IF_CLASSQ_IDLE(fq_cl)) {
            continue;
        }
        if (!fq_if_is_fq_cl_paced(fq_cl, now)) {
            return false;
        }
    }

    return true;
}

boolean_t
fq_if_is_all_paced(struct ifclassq *ifq)
{
    fq_if_group_t *grp;
    fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

    IFCQ_LOCK_ASSERT_HELD(ifq);

    if (!ifclassq_enable_pacing || !ifclassq_enable_l4s) {
        return false;
    }

    for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
        grp = fqs->fqs_classq_groups[grp_idx];
        if (grp == NULL || FQG_BYTES(grp) == 0) {
            continue;
        }

        if (!fq_if_is_grp_all_paced(grp)) {
            return false;
        }
    }

    return true;
}

void
fq_if_stat_sc(fq_if_t *fqs, cqrq_stat_sc_t *stat)
{
    cqrq_stat_sc_t grp_sc_stat;
    fq_if_group_t *grp;
    uint64_t now = fq_codel_get_time();

    if (stat == NULL) {
        return;
    }
    grp_sc_stat.sc = stat->sc;
    stat->packets = 0;
    stat->bytes = 0;

    if (stat->grp_idx == IF_CLASSQ_ALL_GRPS) {
        if (stat->sc == MBUF_SC_UNSPEC) {
            if (!fq_if_is_all_paced(fqs->fqs_ifq)) {
                stat->packets = IFCQ_LEN(fqs->fqs_ifq);
                stat->bytes = IFCQ_BYTES(fqs->fqs_ifq);
            }
        } else {
            for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
                grp = fqs->fqs_classq_groups[grp_idx];
                if (grp == NULL) {
                    continue;
                }

                fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
                stat->packets += grp_sc_stat.packets;
                stat->bytes += grp_sc_stat.bytes;
            }
        }
        return;
    }

    if (stat->sc == MBUF_SC_UNSPEC) {
        if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
            TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
                if (fq_if_is_grp_all_paced(grp)) {
                    continue;
                }
                stat->packets += FQG_LEN(grp);
                stat->bytes += FQG_BYTES(grp);
            }
        } else {
            grp = fq_if_find_grp(fqs, stat->grp_idx);
            if (!fq_if_is_grp_all_paced(grp)) {
                stat->packets = FQG_LEN(grp);
                stat->bytes = FQG_BYTES(grp);
            }
        }
    } else {
        if (fq_if_is_grp_combined(fqs, stat->grp_idx)) {
            TAILQ_FOREACH(grp, &fqs->fqs_combined_grp_list, fqg_grp_link) {
                if (fq_if_is_grp_all_paced(grp)) {
                    continue;
                }
                fq_if_grp_stat_sc(fqs, grp, &grp_sc_stat, now);
                stat->packets += grp_sc_stat.packets;
                stat->bytes += grp_sc_stat.bytes;
            }
        } else {
            grp = fq_if_find_grp(fqs, stat->grp_idx);
            fq_if_grp_stat_sc(fqs, grp, stat, now);
        }
    }
}

int
fq_if_request_classq(struct ifclassq *ifq, cqrq_t rq, void *arg)
{
    int err = 0;
    fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

    IFCQ_LOCK_ASSERT_HELD(ifq);

    /*
     * These are usually slow operations, convert the lock ahead of time
     */
    IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
    switch (rq) {
    case CLASSQRQ_PURGE:
        fq_if_purge(fqs);
        break;
    case CLASSQRQ_PURGE_SC:
        fq_if_purge_sc(fqs, (cqrq_purge_sc_t *)arg);
        break;
    case CLASSQRQ_EVENT:
        fq_if_event(fqs, (cqev_t)arg);
        break;
    case CLASSQRQ_THROTTLE:
        fq_if_throttle(fqs, (cqrq_throttle_t *)arg);
        break;
    case CLASSQRQ_STAT_SC:
        fq_if_stat_sc(fqs, (cqrq_stat_sc_t *)arg);
        break;
    }
    return err;
}

int
fq_if_setup_ifclassq(struct ifclassq *ifq, u_int32_t flags,
    classq_pkt_type_t ptype)
{
    fq_if_t *fqs = NULL;
    int err = 0;

    IFCQ_LOCK_ASSERT_HELD(ifq);
    VERIFY(ifq->ifcq_disc == NULL);
    VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);

    fqs = fq_if_alloc(ifq, ptype);
    if (fqs == NULL) {
        return ENOMEM;
    }
    if (flags & PKTSCHEDF_QALG_DRIVER_MANAGED) {
        fqs->fqs_flags |= FQS_DRIVER_MANAGED;
        fqs->fqs_bm_ops = &fq_if_grps_sc_bitmap_ops;
    } else {
        fqs->fqs_bm_ops = &fq_if_grps_bitmap_ops;
    }

    err = ifclassq_attach(ifq, PKTSCHEDT_FQ_CODEL, fqs);
    if (err != 0) {
        os_log_error(OS_LOG_DEFAULT, "%s: error from ifclassq_attach, "
            "failed to attach fq_if: %d\n", __func__, err);
        fq_if_destroy(fqs);
        return err;
    }

    /*
     * Always create one group. If qset 0 is added later,
     * this group will be updated.
     */
    err = fq_if_create_grp(ifq, 0, IF_CLASSQ_DEF);
    if (err != 0) {
        os_log_error(OS_LOG_DEFAULT, "%s: error from fq_if_create_grp, "
            "failed to create a fq group: %d\n", __func__, err);
        fq_if_destroy(fqs);
    }

    return err;
}

fq_t *
fq_if_hash_pkt(fq_if_t *fqs, fq_if_group_t *fq_grp, u_int32_t flowid,
    mbuf_svc_class_t svc_class, u_int64_t now, bool create,
    fq_tfc_type_t tfc_type)
{
    fq_t *fq = NULL;
    flowq_list_t *fq_list;
    fq_if_classq_t *fq_cl;
    u_int8_t fqs_hash_id;
    u_int8_t scidx;

    scidx = fq_if_service_to_priority(fqs, svc_class);

    fqs_hash_id = FQ_IF_FLOW_HASH_ID(flowid);

    fq_list = &fqs->fqs_flows[fqs_hash_id];

    SLIST_FOREACH(fq, fq_list, fq_hashlink) {
        if (fq->fq_flowhash == flowid &&
            fq->fq_sc_index == scidx &&
            fq->fq_tfc_type == tfc_type &&
            fq->fq_group == fq_grp) {
            break;
        }
    }
    if (fq == NULL && create) {
        /* If the flow is not already on the list, allocate it */
        IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
        fq = fq_alloc(fqs->fqs_ptype);
        if (fq != NULL) {
            fq->fq_flowhash = flowid;
            fq->fq_sc_index = scidx;
            fq->fq_group = fq_grp;
            fq->fq_tfc_type = tfc_type;
            fq_cl = &FQ_CLASSQ(fq);
            fq->fq_flags = (FQF_FLOWCTL_CAPABLE | FQF_FRESH_FLOW);
            fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
            fq->fq_next_tx_time = FQ_INVALID_TX_TS;
            SLIST_INSERT_HEAD(fq_list, fq, fq_hashlink);
            fq_cl->fcl_stat.fcl_flows_cnt++;
            KDBG(AQM_KTRACE_STATS_FLOW_ALLOC,
                fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
                AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
        }
    } else if ((fq != NULL) && (fq->fq_flags & FQF_EMPTY_FLOW)) {
        fq_if_reuse_empty_flow(fqs, fq, now);
    }

    /*
     * If getq time is not set because this is the first packet or after
     * idle time, set it now so that we can detect a stall.
     */
    if (fq != NULL && fq->fq_getqtime == 0) {
        fq->fq_getqtime = now;
    }

    return fq;
}

void
fq_if_destroy_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq)
{
    u_int8_t hash_id;

    ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) == 0);
    hash_id = FQ_IF_FLOW_HASH_ID(fq->fq_flowhash);
    SLIST_REMOVE(&fqs->fqs_flows[hash_id], fq, flowq,
        fq_hashlink);
    IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
    if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
        fq_if_flow_feedback(fqs, fq, fq_cl);
    }
    KDBG(AQM_KTRACE_STATS_FLOW_DESTROY,
        fqs->fqs_ifq->ifcq_ifp->if_index, fq->fq_flowhash,
        AQM_KTRACE_FQ_GRP_SC_IDX(fq), 0);
    fq_destroy(fq, fqs->fqs_ptype);
}

inline boolean_t
fq_if_at_drop_limit(fq_if_t *fqs)
{
    return (IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit) ?
           TRUE : FALSE;
}

inline boolean_t
fq_if_almost_at_drop_limit(fq_if_t *fqs)
{
    /*
     * Whether we are above 90% of the queue limit. This is used to tell
     * if we can stop flow controlling the largest flow.
     */
    return IFCQ_LEN(fqs->fqs_ifq) >= fqs->fqs_pkt_droplimit * 9 / 10;
}

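/*
 * Empty-flow handling, in brief (summarizing the helpers below): a flow
 * queue that drains is not freed right away. It is moved from the new/old
 * flow lists onto fqs_empty_list with a purge deadline of
 * now + fq_empty_purge_delay, so a flow that comes back quickly can be
 * reused without a fresh allocation. The list is trimmed on dequeue, at
 * most FQ_EMPTY_PURGE_MAX entries per pass, and drained completely only
 * on a full purge.
 */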
static inline void
fq_if_reuse_empty_flow(fq_if_t *fqs, fq_t *fq, uint64_t now)
{
    ASSERT(fq->fq_flags & FQF_EMPTY_FLOW);
    TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
    STAILQ_NEXT(fq, fq_actlink) = NULL;
    fq->fq_flags &= ~FQF_FLOW_STATE_MASK;
    fq->fq_empty_purge_time = 0;
    fq->fq_getqtime = 0;
    fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq);
    fqs->fqs_empty_list_cnt--;
    fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq);
    fq_cl->fcl_stat.fcl_flows_cnt++;
}

inline void
fq_if_move_to_empty_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
    uint64_t now)
{
    ASSERT(fq->fq_flags & ~(FQF_NEW_FLOW | FQF_OLD_FLOW | FQF_FLOWCTL_ON));
    fq->fq_empty_purge_time = now + fq_empty_purge_delay;
    TAILQ_INSERT_TAIL(&fqs->fqs_empty_list, fq, fq_empty_link);
    fq->fq_flags |= FQF_EMPTY_FLOW;
    FQ_CLEAR_OVERWHELMING(fq);
    fqs->fqs_empty_list_cnt++;
    /*
     * fcl_flows_cnt is used in budget determination for the class;
     * an empty flow shouldn't contribute to the budget.
     */
    fq_cl->fcl_stat.fcl_flows_cnt--;
}
1843
1844static void
1845fq_if_purge_empty_flow(fq_if_t *fqs, fq_t *fq)
1846{
1847 fq_if_classq_t *fq_cl;
1848 fq_cl = &FQ_CLASSQ(fq);
1849
1850 ASSERT((fq->fq_flags & FQF_EMPTY_FLOW) != 0);
1851 TAILQ_REMOVE(&fqs->fqs_empty_list, fq, fq_empty_link);
1852 fq->fq_flags &= ~FQF_EMPTY_FLOW;
1853 fqs->fqs_empty_list_cnt--;
1854 /* Remove from the hash list and free the flow queue */
1855 fq_if_destroy_flow(fqs, fq_cl, fq);
1856}
1857
1858static void
1859fq_if_purge_empty_flow_list(fq_if_t *fqs, uint64_t now, bool purge_all)
1860{
1861 fq_t *fq, *tmp;
1862 int i = 0;
1863
1864 if (fqs->fqs_empty_list_cnt == 0) {
1865 ASSERT(TAILQ_EMPTY(&fqs->fqs_empty_list));
1866 return;
1867 }
1868
1869 TAILQ_FOREACH_SAFE(fq, &fqs->fqs_empty_list, fq_empty_link, tmp) {
1870 if (!purge_all && ((now < fq->fq_empty_purge_time) ||
1871 (i++ == FQ_EMPTY_PURGE_MAX))) {
1872 break;
1873 }
1874 fq_if_purge_empty_flow(fqs, fq);
1875 }
1876
1877 if (__improbable(purge_all)) {
1878 VERIFY(fqs->fqs_empty_list_cnt == 0);
1879 VERIFY(TAILQ_EMPTY(&fqs->fqs_empty_list));
1880 }
1881}
1882
1883static void
1884fq_if_empty_old_flow(fq_if_t *fqs, fq_if_classq_t *fq_cl, fq_t *fq,
1885 uint64_t now)
1886{
1887 /*
1888 * Remove the flow queue from the old flows list.
1889 */
1890 STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq, flowq, fq_actlink);
1891 fq->fq_flags &= ~FQF_OLD_FLOW;
1892 fq_cl->fcl_stat.fcl_oldflows_cnt--;
1893 VERIFY(fq->fq_bytes == 0);
1894
1895 /* release any flow control */
1896 if (__improbable(fq->fq_flags & FQF_FLOWCTL_ON)) {
1897 fq_if_flow_feedback(fqs, fq, fq_cl);
1898 }
1899
1900 /* move the flow queue to empty flows list */
1901 fq_if_move_to_empty_flow(fqs, fq_cl, fq, now);
1902}
1903
static void
fq_if_empty_new_flow(fq_t *fq, fq_if_classq_t *fq_cl)
{
	/* Move to the end of old queue list */
	STAILQ_REMOVE(&fq_cl->fcl_new_flows, fq,
	    flowq, fq_actlink);
	fq->fq_flags &= ~FQF_NEW_FLOW;
	fq_cl->fcl_stat.fcl_newflows_cnt--;

	STAILQ_INSERT_TAIL(&fq_cl->fcl_old_flows, fq, fq_actlink);
	fq->fq_flags |= FQF_OLD_FLOW;
	fq_cl->fcl_stat.fcl_oldflows_cnt++;
}

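/*
 * Overflow handling: drop one packet from the flow that is currently
 * buffering the most bytes (fqs_large_flow, maintained by
 * fq_if_is_flow_heavy() below), so that the main contributor to the
 * backlog pays for the overflow rather than an innocent flow.
 */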
inline void
fq_if_drop_packet(fq_if_t *fqs, uint64_t now)
{
	fq_t *fq = fqs->fqs_large_flow;
	fq_if_classq_t *fq_cl;
	pktsched_pkt_t pkt;
	volatile uint32_t *pkt_flags;
	uint64_t *pkt_timestamp;

	if (fq == NULL) {
		return;
	}
	/* queue can not be empty on the largest flow */
	VERIFY(!fq_empty(fq, fqs->fqs_ptype));

	fq_cl = &FQ_CLASSQ(fq);
	_PKTSCHED_PKT_INIT(&pkt);
	fq_getq_flow_internal(fqs, fq, &pkt);
	ASSERT(pkt.pktsched_ptype != QP_INVALID);

	pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL,
	    NULL, NULL, NULL);

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	*pkt_timestamp = 0;
	switch (pkt.pktsched_ptype) {
	case QP_MBUF:
		*pkt_flags &= ~PKTF_PRIV_GUARDED;
		break;
#if SKYWALK
	case QP_PACKET:
		/* sanity check */
		ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0);
		break;
#endif /* SKYWALK */
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	if (fq_empty(fq, fqs->fqs_ptype)) {
		fqs->fqs_large_flow = NULL;
		if (fq->fq_flags & FQF_OLD_FLOW) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else {
			VERIFY(fq->fq_flags & FQF_NEW_FLOW);
			fq_if_empty_new_flow(fq, fq_cl);
		}
	}
	IFCQ_DROP_ADD(fqs->fqs_ifq, 1, pktsched_get_pkt_len(&pkt));

	pktsched_free_pkt(&pkt);
	fq_cl->fcl_stat.fcl_drop_overflow++;
}

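/*
 * Track the largest flow on the interface. A flow is a candidate once it
 * buffers at least FQ_IF_LARGE_FLOW_BYTE_LIMIT bytes; among candidates,
 * the one with the most queued bytes is cached in fqs_large_flow for
 * fq_if_drop_packet() to target. The cache is invalidated as soon as the
 * current holder falls back under the limit.
 */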
inline void
fq_if_is_flow_heavy(fq_if_t *fqs, fq_t *fq)
{
	fq_t *prev_fq;

	if (fqs->fqs_large_flow != NULL &&
	    fqs->fqs_large_flow->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
		fqs->fqs_large_flow = NULL;
	}

	if (fq == NULL || fq->fq_bytes < FQ_IF_LARGE_FLOW_BYTE_LIMIT) {
		return;
	}

	prev_fq = fqs->fqs_large_flow;
	if (prev_fq == NULL) {
		if (!fq_empty(fq, fqs->fqs_ptype)) {
			fqs->fqs_large_flow = fq;
		}
		return;
	} else if (fq->fq_bytes > prev_fq->fq_bytes) {
		fqs->fqs_large_flow = fq;
	}
}

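/*
 * Put the packet's flow under flow-control advisory: allocate a
 * flowadv_fcentry and queue it on fqs_fclist, unless the (flowsrc,
 * flowhash) pair is already being flow controlled. Returns TRUE if the
 * flow is (or already was) on the flow-control list.
 */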
boolean_t
fq_if_add_fcentry(fq_if_t *fqs, pktsched_pkt_t *pkt, uint8_t flowsrc,
    fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce;

#if DEBUG || DEVELOPMENT
	if (__improbable(ifclassq_flow_control_adv == 0)) {
		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
		return TRUE;
	}
#endif /* DEBUG || DEVELOPMENT */

	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if ((uint8_t)fce->fce_flowsrc_type == flowsrc &&
		    fce->fce_flowid == fq->fq_flowhash) {
			/* Already on flowcontrol list */
			return TRUE;
		}
	}
	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
	if (fce != NULL) {
		/* XXX Add number of bytes in the queue */
		STAILQ_INSERT_TAIL(&fqs->fqs_fclist, fce, fce_link);
		fq_cl->fcl_stat.fcl_flow_control++;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_control,
		    fq->fq_sc_index, fce->fce_flowsrc_type, fq->fq_flowhash,
		    if_name(fqs->fqs_ifq->ifcq_ifp), fq->fq_bytes);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_START,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	return (fce != NULL) ? TRUE : FALSE;
}

static void
fq_if_remove_fcentry(fq_if_t *fqs, struct flowadv_fcentry *fce)
{
	STAILQ_REMOVE(&fqs->fqs_fclist, fce, flowadv_fcentry, fce_link);
	STAILQ_NEXT(fce, fce_link) = NULL;
	flowadv_add_entry(fce);
}

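/*
 * Lift flow control for a flow: find its entry on fqs_fclist, mark it
 * with a feedback event, and pass it to the flow-advisory machinery so
 * that the stack can resume sending. FQF_FLOWCTL_ON is cleared even if
 * no entry is found.
 */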
void
fq_if_flow_feedback(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl)
{
	struct flowadv_fcentry *fce = NULL;

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	STAILQ_FOREACH(fce, &fqs->fqs_fclist, fce_link) {
		if (fce->fce_flowid == fq->fq_flowhash) {
			break;
		}
	}
	if (fce != NULL) {
		fq_cl->fcl_stat.fcl_flow_feedback++;
		fce->fce_event_type = FCE_EVENT_TYPE_FLOW_CONTROL_FEEDBACK;
		os_log(OS_LOG_DEFAULT, "%s: num: %d, scidx: %d, flowsrc: %d, "
		    "flow: 0x%x, iface: %s grp: %hhu, B:%u\n", __func__,
		    fq_cl->fcl_stat.fcl_flow_feedback, fq->fq_sc_index,
		    fce->fce_flowsrc_type, fce->fce_flowid,
		    if_name(fqs->fqs_ifq->ifcq_ifp), FQ_GROUP(fq)->fqg_index,
		    fq->fq_bytes);
		fq_if_remove_fcentry(fqs, fce);
		KDBG(AQM_KTRACE_STATS_FLOW_CTL | DBG_FUNC_END,
		    fq->fq_flowhash, AQM_KTRACE_FQ_GRP_SC_IDX(fq),
		    fq->fq_bytes, fq->fq_min_qdelay);
	}
	fq->fq_flags &= ~FQF_FLOWCTL_ON;
}

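/*
 * Report Congestion Experienced marks upstream: allocate a standalone
 * advisory entry carrying the CE count and the number of packets seen
 * since the last report, and deliver it via flowadv_add_entry().
 */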
boolean_t
fq_if_report_ce(fq_if_t *fqs, pktsched_pkt_t *pkt, uint32_t ce_cnt,
    uint32_t pkt_cnt)
{
	struct flowadv_fcentry *fce;

#if DEBUG || DEVELOPMENT
	if (__improbable(ifclassq_flow_control_adv == 0)) {
		os_log(OS_LOG_DEFAULT, "%s: skipped flow control", __func__);
		return TRUE;
	}
#endif /* DEBUG || DEVELOPMENT */

	IFCQ_CONVERT_LOCK(fqs->fqs_ifq);
	fce = pktsched_alloc_fcentry(pkt, fqs->fqs_ifq->ifcq_ifp, M_WAITOK);
	if (fce != NULL) {
		fce->fce_event_type = FCE_EVENT_TYPE_CONGESTION_EXPERIENCED;
		fce->fce_ce_cnt = ce_cnt;
		fce->fce_pkts_since_last_report = pkt_cnt;

		flowadv_add_entry(fce);
	}
	return (fce != NULL) ? TRUE : FALSE;
}

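/*
 * Dequeue up to pktlimit packets / bytelimit bytes from one class,
 * implementing the two-pass DRR scheduler of RFC 8290: new flows are
 * serviced first, then old flows, with each flow's fq_deficit refilled
 * by fcl_quantum whenever it is exhausted. Flows whose earliest transmit
 * time lies in the future (pacing) are skipped; if every flow is paced,
 * the class is marked FCL_PACED and the earliest next transmit time is
 * reported through *next_tx_time so the caller can reschedule. When
 * fq_dqlist is non-NULL, packets are collected per flow on that list
 * instead of being chained directly onto top/bottom.
 */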
void
fq_if_dequeue(fq_if_t *fqs, fq_if_classq_t *fq_cl, uint32_t pktlimit,
    int64_t bytelimit, classq_pkt_t *top, classq_pkt_t *bottom,
    uint32_t *retpktcnt, uint32_t *retbytecnt, flowq_dqlist_t *fq_dqlist,
    bool budget_restricted, uint64_t now, bool *fq_cl_paced,
    uint64_t *next_tx_time)
{
	fq_t *fq = NULL, *tfq = NULL;
	flowq_stailq_t temp_stailq;
	uint32_t pktcnt, bytecnt;
	boolean_t qempty, limit_reached = FALSE;
	bool all_paced = true;
	classq_pkt_t last = CLASSQ_PKT_INITIALIZER(last);
	fq_getq_flow_t fq_getq_flow_fn;
	classq_pkt_t *head, *tail;
	uint64_t fq_cl_tx_time = FQ_INVALID_TX_TS;

	switch (fqs->fqs_ptype) {
	case QP_MBUF:
		fq_getq_flow_fn = fq_getq_flow_mbuf;
		break;

#if SKYWALK
	case QP_PACKET:
		fq_getq_flow_fn = fq_getq_flow_kpkt;
		break;
#endif /* SKYWALK */

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/*
	 * maximum byte limit should not be greater than the budget for
	 * this class
	 */
	if (bytelimit > fq_cl->fcl_budget && budget_restricted) {
		bytelimit = fq_cl->fcl_budget;
	}

	VERIFY(pktlimit > 0 && bytelimit > 0 && top != NULL);
	pktcnt = bytecnt = 0;
	STAILQ_INIT(&temp_stailq);

2144
2145 STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_new_flows, fq_actlink, tfq) {
2146 ASSERT((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
2147 FQF_NEW_FLOW);
2148 uint64_t fq_tx_time;
2149 if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
2150 ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
2151 if (fq_tx_time < fq_cl_tx_time) {
2152 fq_cl_tx_time = fq_tx_time;
2153 }
2154 continue;
2155 }
2156 all_paced = false;
2157
2158 if (fq_dqlist != NULL) {
2159 if (!fq->fq_in_dqlist) {
2160 fq_dqlist_add(fq_dqlist_head: fq_dqlist, fq);
2161 }
2162 head = &fq->fq_dq_head;
2163 tail = &fq->fq_dq_tail;
2164 } else {
2165 ASSERT(!fq->fq_in_dqlist);
2166 head = top;
2167 tail = &last;
2168 }
2169
2170 limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
2171 pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);
2172
2173 /*
2174 * From RFC 8290:
2175 * if that queue has a negative number of credits (i.e., it has already
2176 * dequeued at least a quantum of bytes), it is given an additional
2177 * quantum of credits, the queue is put onto _the end of_ the list of
2178 * old queues, and the routine selects the next queue and starts again.
2179 */
2180 if (fq->fq_deficit <= 0 || qempty) {
2181 fq->fq_deficit += fq_cl->fcl_quantum;
2182 fq_if_empty_new_flow(fq, fq_cl);
2183 }
2184 //TODO: add credit when it's now paced? so that the fq is trated the same as empty
2185
2186 if (!fq_tx_time_ready(fqs, fq, now, ready_time: &fq_tx_time)) {
2187 ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
2188 if (fq_tx_time < fq_cl_tx_time) {
2189 fq_cl_tx_time = fq_tx_time;
2190 }
2191 }
2192
2193 if (limit_reached) {
2194 goto done;
2195 }
2196 }

	STAILQ_FOREACH_SAFE(fq, &fq_cl->fcl_old_flows, fq_actlink, tfq) {
		VERIFY((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) ==
		    FQF_OLD_FLOW);
		bool destroy = true;
		uint64_t fq_tx_time;

		if (__improbable(!fq_tx_time_ready(fqs, fq, now, &fq_tx_time))) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
			continue;
		}
		all_paced = false;

		if (fq_dqlist != NULL) {
			if (!fq->fq_in_dqlist) {
				fq_dqlist_add(fq_dqlist, fq);
			}
			head = &fq->fq_dq_head;
			tail = &fq->fq_dq_tail;
			destroy = false;
		} else {
			ASSERT(!fq->fq_in_dqlist);
			head = top;
			tail = &last;
		}

		limit_reached = fq_getq_flow_fn(fqs, fq_cl, fq, bytelimit,
		    pktlimit, head, tail, &bytecnt, &pktcnt, &qempty, now);

		if (!fq_tx_time_ready(fqs, fq, now, &fq_tx_time)) {
			ASSERT(fq_tx_time != FQ_INVALID_TX_TS);
			if (fq_tx_time < fq_cl_tx_time) {
				fq_cl_tx_time = fq_tx_time;
			}
		}

		if (qempty) {
			fq_if_empty_old_flow(fqs, fq_cl, fq, now);
		} else if (fq->fq_deficit <= 0) {
			STAILQ_REMOVE(&fq_cl->fcl_old_flows, fq,
			    flowq, fq_actlink);
			/*
			 * Move to the end of the old queues list. We do not
			 * need to update the flow count since this flow
			 * will be added to the tail again
			 */
			STAILQ_INSERT_TAIL(&temp_stailq, fq, fq_actlink);
			fq->fq_deficit += fq_cl->fcl_quantum;
		}
		if (limit_reached) {
			break;
		}
	}

done:
	if (all_paced) {
		fq_cl->fcl_flags |= FCL_PACED;
		fq_cl->fcl_next_tx_time = fq_cl_tx_time;
	}
	if (!STAILQ_EMPTY(&fq_cl->fcl_old_flows)) {
		STAILQ_CONCAT(&fq_cl->fcl_old_flows, &temp_stailq);
	} else if (!STAILQ_EMPTY(&temp_stailq)) {
		fq_cl->fcl_old_flows = temp_stailq;
	}
	if (last.cp_mbuf != NULL) {
		VERIFY(top->cp_mbuf != NULL);
		if (bottom != NULL) {
			*bottom = last;
		}
	}
	if (retpktcnt != NULL) {
		*retpktcnt = pktcnt;
	}
	if (retbytecnt != NULL) {
		*retbytecnt = bytecnt;
	}
	if (fq_cl_paced != NULL) {
		*fq_cl_paced = all_paced;
	}
	if (next_tx_time != NULL) {
		*next_tx_time = fq_cl_tx_time;
	}
}

void
fq_if_teardown_ifclassq(struct ifclassq *ifq)
{
	fq_if_t *fqs = (fq_if_t *)ifq->ifcq_disc;

	IFCQ_LOCK_ASSERT_HELD(ifq);
	VERIFY(fqs != NULL && ifq->ifcq_type == PKTSCHEDT_FQ_CODEL);
	fq_if_destroy(fqs);
	ifq->ifcq_disc = NULL;
	ifclassq_detach(ifq);
}

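/*
 * Snapshot one flow's state (queued bytes, minimum queueing delay, flow
 * hash, and state flags) into the fq_codel_flowstats record exported by
 * fq_if_getqstats_ifclassq() below.
 */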
static void
fq_export_flowstats(fq_if_t *fqs, fq_t *fq,
    struct fq_codel_flowstats *flowstat)
{
	bzero(flowstat, sizeof(*flowstat));
	flowstat->fqst_min_qdelay = (uint32_t)fq->fq_min_qdelay;
	flowstat->fqst_bytes = fq->fq_bytes;
	flowstat->fqst_flowhash = fq->fq_flowhash;
	if (fq->fq_flags & FQF_NEW_FLOW) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_NEW_FLOW;
	}
	if (fq->fq_flags & FQF_OLD_FLOW) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_OLD_FLOW;
	}
	if (fq->fq_flags & FQF_DELAY_HIGH) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_DELAY_HIGH;
	}
	if (fq->fq_flags & FQF_FLOWCTL_ON) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_FLOWCTL_ON;
	}
	if (fqs->fqs_large_flow == fq) {
		flowstat->fqst_flags |= FQ_FLOWSTATS_LARGE_FLOW;
	}
}

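/*
 * Export scheduler statistics for one (group, class) pair. Per-flow
 * records are capped at FQ_IF_MAX_FLOWSTATS; new flows are reported
 * first, but they stop consuming records at the halfway mark once old
 * flows would otherwise be crowded out.
 */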
int
fq_if_getqstats_ifclassq(struct ifclassq *ifq, uint8_t gid, u_int32_t qid,
    struct if_ifclassq_stats *ifqs)
{
	struct fq_codel_classstats *fcls;
	fq_if_classq_t *fq_cl;
	fq_if_t *fqs;
	fq_t *fq = NULL;
	fq_if_group_t *grp;
	u_int32_t i, flowstat_cnt;

	if (qid >= FQ_IF_MAX_CLASSES || gid >= FQ_IF_MAX_GROUPS) {
		return EINVAL;
	}

	fqs = (fq_if_t *)ifq->ifcq_disc;
	if (fqs->fqs_classq_groups[gid] == NULL) {
		return ENXIO;
	}

	fcls = &ifqs->ifqs_fq_codel_stats;

	fq_cl = &FQS_CLASSQ(fqs, gid, qid);
	grp = fq_if_find_grp(fqs, gid);

	fcls->fcls_pri = fq_cl->fcl_pri;
	fcls->fcls_service_class = fq_cl->fcl_service_class;
	fcls->fcls_quantum = fq_cl->fcl_quantum;
	fcls->fcls_drr_max = fq_cl->fcl_drr_max;
	fcls->fcls_budget = fq_cl->fcl_budget;
	fcls->fcls_l4s_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_L4S];
	fcls->fcls_target_qdelay = grp->fqg_target_qdelays[FQ_TFC_C];
	fcls->fcls_update_interval = grp->fqg_update_intervals[FQ_TFC_C];
	fcls->fcls_flow_control = fq_cl->fcl_stat.fcl_flow_control;
	fcls->fcls_flow_feedback = fq_cl->fcl_stat.fcl_flow_feedback;
	fcls->fcls_dequeue_stall = fq_cl->fcl_stat.fcl_dequeue_stall;
	fcls->fcls_drop_overflow = fq_cl->fcl_stat.fcl_drop_overflow;
	fcls->fcls_drop_early = fq_cl->fcl_stat.fcl_drop_early;
	fcls->fcls_drop_memfailure = fq_cl->fcl_stat.fcl_drop_memfailure;
	fcls->fcls_flows_cnt = fq_cl->fcl_stat.fcl_flows_cnt;
	fcls->fcls_newflows_cnt = fq_cl->fcl_stat.fcl_newflows_cnt;
	fcls->fcls_oldflows_cnt = fq_cl->fcl_stat.fcl_oldflows_cnt;
	fcls->fcls_pkt_cnt = fq_cl->fcl_stat.fcl_pkt_cnt;
	fcls->fcls_flow_control_fail = fq_cl->fcl_stat.fcl_flow_control_fail;
	fcls->fcls_dequeue = fq_cl->fcl_stat.fcl_dequeue;
	fcls->fcls_dequeue_bytes = fq_cl->fcl_stat.fcl_dequeue_bytes;
	fcls->fcls_byte_cnt = fq_cl->fcl_stat.fcl_byte_cnt;
	fcls->fcls_throttle_on = fq_cl->fcl_stat.fcl_throttle_on;
	fcls->fcls_throttle_off = fq_cl->fcl_stat.fcl_throttle_off;
	fcls->fcls_throttle_drops = fq_cl->fcl_stat.fcl_throttle_drops;
	fcls->fcls_dup_rexmts = fq_cl->fcl_stat.fcl_dup_rexmts;
	fcls->fcls_pkts_compressible = fq_cl->fcl_stat.fcl_pkts_compressible;
	fcls->fcls_pkts_compressed = fq_cl->fcl_stat.fcl_pkts_compressed;
	fcls->fcls_min_qdelay = fq_cl->fcl_stat.fcl_min_qdelay;
	fcls->fcls_max_qdelay = fq_cl->fcl_stat.fcl_max_qdelay;
	fcls->fcls_avg_qdelay = fq_cl->fcl_stat.fcl_avg_qdelay;
	fcls->fcls_overwhelming = fq_cl->fcl_stat.fcl_overwhelming;
	fcls->fcls_ce_marked = fq_cl->fcl_stat.fcl_ce_marked;
	fcls->fcls_ce_reported = fq_cl->fcl_stat.fcl_ce_reported;
	fcls->fcls_ce_mark_failures = fq_cl->fcl_stat.fcl_ce_mark_failures;
	fcls->fcls_l4s_pkts = fq_cl->fcl_stat.fcl_l4s_pkts;
	fcls->fcls_ignore_tx_time = fq_cl->fcl_stat.fcl_ignore_tx_time;
	fcls->fcls_paced_pkts = fq_cl->fcl_stat.fcl_paced_pkts;
	fcls->fcls_fcl_pacing_needed = fq_cl->fcl_stat.fcl_fcl_pacemaker_needed;

	/* Gather per flow stats */
	flowstat_cnt = min((fcls->fcls_newflows_cnt +
	    fcls->fcls_oldflows_cnt), FQ_IF_MAX_FLOWSTATS);
	i = 0;
	STAILQ_FOREACH(fq, &fq_cl->fcl_new_flows, fq_actlink) {
		if (i >= fcls->fcls_newflows_cnt || i >= flowstat_cnt) {
			break;
		}

		/* leave space for a few old flows */
		if ((flowstat_cnt - i) < fcls->fcls_oldflows_cnt &&
		    i >= (FQ_IF_MAX_FLOWSTATS >> 1)) {
			break;
		}
		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
		i++;
	}
	STAILQ_FOREACH(fq, &fq_cl->fcl_old_flows, fq_actlink) {
		if (i >= flowstat_cnt) {
			break;
		}
		fq_export_flowstats(fqs, fq, &fcls->fcls_flowstats[i]);
		i++;
	}
	VERIFY(i <= flowstat_cnt);
	fcls->fcls_flowstats_cnt = i;
	return 0;
}

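/*
 * Create (or, for group 0, update) a classq group and initialize one
 * DRR class per service class. Each class's quantum is derived from the
 * interface quantum, and the group's target queueing delays and update
 * intervals are computed for both classic and L4S traffic.
 */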
int
fq_if_create_grp(struct ifclassq *ifcq, uint8_t grp_idx, uint8_t flags)
{
#define _FQ_CLASSQ_INIT(_grp, _s, _q) \
	fq_if_classq_init(_grp, FQ_IF_ ## _s ##_INDEX, \
	    FQ_CODEL_QUANTUM_ ## _s(_q), FQ_CODEL_DRR_MAX(_s), \
	    MBUF_SC_ ## _s );
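/*
 * For illustration, _FQ_CLASSQ_INIT(grp, BE, quantum) expands to:
 *
 *	fq_if_classq_init(grp, FQ_IF_BE_INDEX,
 *	    FQ_CODEL_QUANTUM_BE(quantum), FQ_CODEL_DRR_MAX(BE),
 *	    MBUF_SC_BE);
 *
 * i.e. each service class gets its own priority index, scaled quantum,
 * DRR maximum, and mbuf service class.
 */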

	fq_if_group_t *grp;
	fq_if_t *fqs;
	uint32_t quantum, calc_flags = IF_CLASSQ_DEF;
	struct ifnet *ifp = ifcq->ifcq_ifp;

	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	fqs = (fq_if_t *)ifcq->ifcq_disc;

	if (grp_idx == 0 && fqs->fqs_classq_groups[grp_idx] != NULL) {
		grp = fqs->fqs_classq_groups[grp_idx];
		goto update;
	}

	if (fqs->fqs_classq_groups[grp_idx] != NULL) {
		return EINVAL;
	}

	grp = zalloc_flags(fq_if_grp_zone, Z_WAITOK | Z_ZERO);
	if (grp == NULL) {
		return ENOMEM;
	}

	fqs->fqs_classq_groups[grp_idx] = grp;
	grp->fqg_index = grp_idx;

	quantum = fq_if_calc_quantum(ifp);
	if (fqs->fqs_flags & FQS_DRIVER_MANAGED) {
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
	} else {
		/* SIG shares same INDEX with VI */
		_CASSERT(SCIDX_SIG == SCIDX_VI);
		_CASSERT(FQ_IF_SIG_INDEX == FQ_IF_VI_INDEX);

		_FQ_CLASSQ_INIT(grp, BK_SYS, quantum);
		_FQ_CLASSQ_INIT(grp, BK, quantum);
		_FQ_CLASSQ_INIT(grp, BE, quantum);
		_FQ_CLASSQ_INIT(grp, RD, quantum);
		_FQ_CLASSQ_INIT(grp, OAM, quantum);
		_FQ_CLASSQ_INIT(grp, AV, quantum);
		_FQ_CLASSQ_INIT(grp, RV, quantum);
		_FQ_CLASSQ_INIT(grp, VI, quantum);
		_FQ_CLASSQ_INIT(grp, VO, quantum);
		_FQ_CLASSQ_INIT(grp, CTL, quantum);
	}

update:
	if (flags & IF_DEFAULT_GRP) {
		fq_if_set_grp_combined(ifcq, grp_idx);
		grp->fqg_flags |= FQ_IF_DEFAULT_GRP;
	} else {
		fq_if_set_grp_separated(ifcq, grp_idx);
		grp->fqg_flags &= ~FQ_IF_DEFAULT_GRP;
	}

	calc_flags |= (flags & IF_CLASSQ_LOW_LATENCY);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_target_qdelay(ifp, &grp->fqg_target_qdelays[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_C],
	    calc_flags);
	ifclassq_calc_update_interval(&grp->fqg_update_intervals[FQ_TFC_L4S],
	    calc_flags | IF_CLASSQ_L4S);

	return 0;
#undef _FQ_CLASSQ_INIT
}

fq_if_group_t *
fq_if_find_grp(fq_if_t *fqs, uint8_t grp_idx)
{
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);
	VERIFY(grp_idx < FQ_IF_MAX_GROUPS);

	grp = fqs->fqs_classq_groups[grp_idx];
	VERIFY(grp != NULL);

	return grp;
}

static void
fq_if_purge_grp(fq_if_t *fqs, fq_if_group_t *grp)
{
	for (uint8_t i = 0; i < FQ_IF_MAX_CLASSES; i++) {
		fq_if_purge_classq(fqs, &grp->fqg_classq[i]);
	}

	bzero(&grp->fqg_bitmaps, sizeof(grp->fqg_bitmaps));
	grp->fqg_len = 0;
	grp->fqg_bytes = 0;
	fq_if_set_grp_separated(fqs->fqs_ifq, grp->fqg_index);
}

void
fq_if_destroy_grps(fq_if_t *fqs)
{
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(fqs->fqs_ifq);

	for (uint8_t grp_idx = 0; grp_idx < FQ_IF_MAX_GROUPS; grp_idx++) {
		if (fqs->fqs_classq_groups[grp_idx] == NULL) {
			continue;
		}

		grp = fq_if_find_grp(fqs, grp_idx);
		fq_if_purge_grp(fqs, grp);
		zfree(fq_if_grp_zone, grp);
		fqs->fqs_classq_groups[grp_idx] = NULL;
	}
}

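/*
 * A group is "combined" when its bit is set in fqs_combined_grp_bitmap
 * and it is linked on fqs_combined_grp_list, making it part of the set
 * of groups that are serviced together; a "separated" group is scheduled
 * on its own.
 */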
static inline boolean_t
fq_if_is_grp_combined(fq_if_t *fqs, uint8_t grp_idx)
{
	return pktsched_bit_tst(grp_idx, &fqs->fqs_combined_grp_bitmap);
}

void
fq_if_set_grp_combined(struct ifclassq *ifcq, uint8_t grp_idx)
{
	fq_if_t *fqs;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(ifcq);

	fqs = (fq_if_t *)ifcq->ifcq_disc;
	grp = fq_if_find_grp(fqs, grp_idx);

	if (fq_if_is_grp_combined(fqs, grp_idx)) {
		return;
	}

	/*
	 * We keep the current fq_deficit and fcl_budget when combining a
	 * group. That might disrupt the AQM, but only for a moment.
	 */
	pktsched_bit_set(grp_idx, &fqs->fqs_combined_grp_bitmap);
	TAILQ_INSERT_TAIL(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
}

void
fq_if_set_grp_separated(struct ifclassq *ifcq, uint8_t grp_idx)
{
	fq_if_t *fqs;
	fq_if_group_t *grp;

	IFCQ_LOCK_ASSERT_HELD(ifcq);

	fqs = (fq_if_t *)ifcq->ifcq_disc;
	grp = fq_if_find_grp(fqs, grp_idx);

	if (!fq_if_is_grp_combined(fqs, grp_idx)) {
		return;
	}

	pktsched_bit_clr(grp_idx, &fqs->fqs_combined_grp_bitmap);
	TAILQ_REMOVE(&fqs->fqs_combined_grp_list, grp, fqg_grp_link);
}