1 | /* |
2 | * Copyright (c) 2016-2021 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
/*
 * The migration of a flow queue between the different states is summarized
 * in the state diagram below (RFC 8290).
 *
 *  +-----------------+                 +------------------+
 *  |                 |      Empty      |                  |
 *  |      Empty      |<----------------+       Old        +----+
 *  |                 |                 |                  |    |
 *  +-------+---------+                 +------------------+    |
 *          |                               ^     ^             |Credits
 *          |Arrival                        |     |             |Exhausted
 *          v                               |     |             |
 *  +-----------------+                     |     |             |
 *  |                 |     Empty or        |     |             |
 *  |       New       +---------------------+     +-------------+
 *  |                 |  Credits Exhausted
 *  +-----------------+
 *
 * In this implementation of FQ-CODEL, a flow queue is a dynamically
 * allocated object. An active flow queue cycles through the above state
 * transitions very often. To avoid the cost of frequent flow queue object
 * allocation and free, this implementation retains the flow queue object in
 * the [Empty] state on an empty flow queue list, keeping an active reference
 * to it in the flow queue hash table. Flow queue objects on the empty flow
 * queue list have an associated age and are purged accordingly.
 */
55 | |
56 | #include <sys/cdefs.h> |
57 | #include <sys/param.h> |
58 | #include <sys/mbuf.h> |
59 | #include <sys/socket.h> |
60 | #include <sys/sockio.h> |
61 | #include <sys/systm.h> |
62 | #include <sys/syslog.h> |
63 | #include <sys/proc.h> |
64 | #include <sys/errno.h> |
65 | #include <sys/kernel.h> |
66 | #include <sys/kauth.h> |
67 | #include <sys/sdt.h> |
68 | #include <kern/zalloc.h> |
69 | #include <netinet/in.h> |
70 | |
71 | #include <net/classq/classq.h> |
72 | #include <net/classq/if_classq.h> |
73 | #include <net/pktsched/pktsched.h> |
74 | #include <net/pktsched/pktsched_fq_codel.h> |
75 | #include <net/classq/classq_fq_codel.h> |
76 | |
77 | #include <netinet/tcp_var.h> |
78 | |
79 | #define FQ_ZONE_MAX (32 * 1024) /* across all interfaces */ |
80 | |
81 | #define DTYPE_NODROP 0 /* no drop */ |
82 | #define DTYPE_FORCED 1 /* a "forced" drop */ |
83 | #define DTYPE_EARLY 2 /* an "unforced" (early) drop */ |
84 | |
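/*
 * Tunables for packet compression, L4S CE marking/reporting and packet
 * pacing. On DEBUG/DEVELOPMENT kernels they are exposed as sysctls under
 * net.classq.flow_q below.
 */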
85 | static uint32_t pkt_compressor = 1; |
86 | static uint64_t l4s_ce_threshold = 0; /* in usec */ |
87 | static uint32_t l4s_local_ce_report = 0; |
88 | static uint64_t pkt_pacing_leeway = 0; /* in usec */ |
89 | static uint64_t max_pkt_pacing_interval = 3 * NSEC_PER_SEC; |
90 | static uint64_t l4s_min_delay_threshold = 20 * NSEC_PER_MSEC; /* 20 ms */ |
91 | #if (DEBUG || DEVELOPMENT) |
SYSCTL_NODE(_net_classq, OID_AUTO, flow_q, CTLFLAG_RW | CTLFLAG_LOCKED,
    0, "FQ-CODEL parameters");

SYSCTL_UINT(_net_classq_flow_q, OID_AUTO, pkt_compressor,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pkt_compressor, 0, "enable pkt compression");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, l4s_ce_threshold,
    CTLFLAG_RW | CTLFLAG_LOCKED, &l4s_ce_threshold,
    "L4S CE threshold");

SYSCTL_UINT(_net_classq_flow_q, OID_AUTO, l4s_local_ce_report,
    CTLFLAG_RW | CTLFLAG_LOCKED, &l4s_local_ce_report, 0,
    "enable L4S local CE report");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, pkt_pacing_leeway,
    CTLFLAG_RW | CTLFLAG_LOCKED, &pkt_pacing_leeway, "packet pacing leeway");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, max_pkt_pacing_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &max_pkt_pacing_interval, "max packet pacing interval");

SYSCTL_QUAD(_net_classq_flow_q, OID_AUTO, l4s_min_delay_threshold,
    CTLFLAG_RW | CTLFLAG_LOCKED, &l4s_min_delay_threshold, "l4s min delay threshold");
114 | #endif /* (DEBUG || DEVELOPMENT) */ |
115 | |
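/*
 * Compile-time checks that the AQM ktrace code values used in this module
 * match their expected encodings.
 */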
116 | void |
117 | fq_codel_init(void) |
118 | { |
119 | _CASSERT(AQM_KTRACE_AON_FLOW_HIGH_DELAY == 0x8300004); |
120 | _CASSERT(AQM_KTRACE_AON_THROTTLE == 0x8300008); |
121 | _CASSERT(AQM_KTRACE_AON_FLOW_OVERWHELMING == 0x830000c); |
122 | _CASSERT(AQM_KTRACE_AON_FLOW_DQ_STALL == 0x8300010); |
123 | |
124 | _CASSERT(AQM_KTRACE_STATS_FLOW_ENQUEUE == 0x8310004); |
125 | _CASSERT(AQM_KTRACE_STATS_FLOW_DEQUEUE == 0x8310008); |
126 | _CASSERT(AQM_KTRACE_STATS_FLOW_CTL == 0x831000c); |
127 | _CASSERT(AQM_KTRACE_STATS_FLOW_ALLOC == 0x8310010); |
128 | _CASSERT(AQM_KTRACE_STATS_FLOW_DESTROY == 0x8310014); |
129 | _CASSERT(AQM_KTRACE_STATS_FLOW_REPORT_CE == 0x8310018); |
130 | _CASSERT(AQM_KTRACE_STATS_GET_QLEN == 0x831001c); |
131 | _CASSERT(AQM_KTRACE_TX_NOT_READY == 0x8310020); |
132 | _CASSERT(AQM_KTRACE_TX_PACEMAKER == 0x8310024); |
133 | } |
134 | |
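/*
 * Allocate and initialize a flow queue object for the given packet type
 * (mbuf or Skywalk packet). The caller is responsible for linking the
 * object into the scheduler's flow queue hash table.
 */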
135 | fq_t * |
136 | fq_alloc(classq_pkt_type_t ptype) |
137 | { |
138 | fq_t *fq = NULL; |
139 | |
140 | fq = kalloc_type(fq_t, Z_WAITOK_ZERO); |
141 | if (ptype == QP_MBUF) { |
142 | MBUFQ_INIT(&fq->fq_mbufq); |
143 | } |
144 | #if SKYWALK |
145 | else { |
146 | VERIFY(ptype == QP_PACKET); |
147 | KPKTQ_INIT(&fq->fq_kpktq); |
148 | } |
149 | #endif /* SKYWALK */ |
150 | CLASSQ_PKT_INIT(&fq->fq_dq_head); |
151 | CLASSQ_PKT_INIT(&fq->fq_dq_tail); |
152 | fq->fq_in_dqlist = false; |
153 | |
154 | return fq; |
155 | } |
156 | |
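/*
 * Free a flow queue object. The queue must be empty, off the dequeue list,
 * detached from the new/old/empty flow lists and have no bytes accounted to
 * it, as asserted below.
 */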
157 | void |
158 | fq_destroy(fq_t *fq, classq_pkt_type_t ptype) |
159 | { |
160 | VERIFY(!fq->fq_in_dqlist); |
161 | VERIFY(fq_empty(fq, ptype)); |
162 | VERIFY(!(fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW | |
163 | FQF_EMPTY_FLOW))); |
164 | VERIFY(fq->fq_bytes == 0); |
165 | kfree_type(fq_t, fq); |
166 | } |
167 | |
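/*
 * Detect a dequeue stall: if a sufficiently backlogged flow has not been
 * serviced for more than an update interval, mark the flow as having
 * persistently high delay and record the event.
 */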
168 | static inline void |
169 | fq_detect_dequeue_stall(fq_if_t *fqs, fq_t *flowq, fq_if_classq_t *fq_cl, |
170 | u_int64_t *now) |
171 | { |
172 | u_int64_t maxgetqtime, update_interval; |
173 | if (FQ_IS_DELAY_HIGH(flowq) || flowq->fq_getqtime == 0 || |
174 | fq_empty(flowq, fqs->fqs_ptype) || |
175 | flowq->fq_bytes < FQ_MIN_FC_THRESHOLD_BYTES) { |
176 | return; |
177 | } |
178 | |
179 | update_interval = FQ_UPDATE_INTERVAL(flowq); |
180 | maxgetqtime = flowq->fq_getqtime + update_interval; |
181 | if ((*now) > maxgetqtime) { |
		/*
		 * There was no dequeue for an update interval's worth of
		 * time, which means the queue is stalled.
		 */
186 | FQ_SET_DELAY_HIGH(flowq); |
187 | fq_cl->fcl_stat.fcl_dequeue_stall++; |
		os_log_error(OS_LOG_DEFAULT, "%s:num: %d, "
		    "scidx: %d, flow: 0x%x, iface: %s grp: %hhu", __func__,
		    fq_cl->fcl_stat.fcl_dequeue_stall, flowq->fq_sc_index,
		    flowq->fq_flowhash, if_name(fqs->fqs_ifq->ifcq_ifp),
		    FQ_GROUP(flowq)->fqg_index);
193 | KDBG(AQM_KTRACE_AON_FLOW_DQ_STALL, flowq->fq_flowhash, |
194 | AQM_KTRACE_FQ_GRP_SC_IDX(flowq), flowq->fq_bytes, |
195 | (*now) - flowq->fq_getqtime); |
196 | } |
197 | } |
198 | |
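/*
 * Drop a single packet from the head of the flow queue and charge the drop
 * to the interface classq. Does nothing if the queue is empty.
 */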
199 | void |
200 | fq_head_drop(fq_if_t *fqs, fq_t *fq) |
201 | { |
202 | pktsched_pkt_t pkt; |
203 | volatile uint32_t *pkt_flags; |
204 | uint64_t *pkt_timestamp; |
205 | struct ifclassq *ifq = fqs->fqs_ifq; |
206 | |
207 | _PKTSCHED_PKT_INIT(&pkt); |
208 | fq_getq_flow_internal(fqs, fq, &pkt); |
209 | if (pkt.pktsched_pkt_mbuf == NULL) { |
210 | return; |
211 | } |
212 | |
213 | pktsched_get_pkt_vars(&pkt, &pkt_flags, &pkt_timestamp, NULL, NULL, |
214 | NULL, NULL, NULL); |
215 | |
216 | *pkt_timestamp = 0; |
217 | switch (pkt.pktsched_ptype) { |
218 | case QP_MBUF: |
219 | *pkt_flags &= ~PKTF_PRIV_GUARDED; |
220 | break; |
221 | #if SKYWALK |
222 | case QP_PACKET: |
223 | /* sanity check */ |
224 | ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0); |
225 | break; |
226 | #endif /* SKYWALK */ |
227 | default: |
228 | VERIFY(0); |
229 | /* NOTREACHED */ |
230 | __builtin_unreachable(); |
231 | } |
232 | |
233 | IFCQ_DROP_ADD(ifq, 1, pktsched_get_pkt_len(&pkt)); |
234 | IFCQ_CONVERT_LOCK(ifq); |
235 | pktsched_free_pkt(&pkt); |
236 | } |
237 | |
238 | |
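/*
 * Opportunistic packet compression: if the packet being enqueued carries a
 * comp_gencnt matching that of the packet at the tail of the flow queue,
 * the tail packet is removed and freed so that the new packet effectively
 * replaces it, inheriting the old packet's timestamp. Returns
 * CLASSQEQ_COMPRESSED when a replacement took place, 0 otherwise.
 */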
239 | static int |
240 | fq_compressor(fq_if_t *fqs, fq_t *fq, fq_if_classq_t *fq_cl, |
241 | pktsched_pkt_t *pkt) |
242 | { |
243 | classq_pkt_type_t ptype = fqs->fqs_ptype; |
244 | uint32_t comp_gencnt = 0; |
245 | uint64_t *pkt_timestamp; |
246 | uint64_t old_timestamp = 0; |
247 | uint32_t old_pktlen = 0; |
248 | struct ifclassq *ifq = fqs->fqs_ifq; |
249 | |
250 | if (__improbable(pkt_compressor == 0)) { |
251 | return 0; |
252 | } |
253 | |
254 | pktsched_get_pkt_vars(pkt, NULL, &pkt_timestamp, NULL, NULL, NULL, |
255 | &comp_gencnt, NULL); |
256 | |
257 | if (comp_gencnt == 0) { |
258 | return 0; |
259 | } |
260 | |
261 | fq_cl->fcl_stat.fcl_pkts_compressible++; |
262 | |
263 | if (fq_empty(fq, fqs->fqs_ptype)) { |
264 | return 0; |
265 | } |
266 | |
267 | if (ptype == QP_MBUF) { |
268 | struct mbuf *m = MBUFQ_LAST(&fq->fq_mbufq); |
269 | |
270 | if (comp_gencnt != m->m_pkthdr.comp_gencnt) { |
271 | return 0; |
272 | } |
273 | |
		/* If we got this far, we should merge/replace the segment */
275 | MBUFQ_REMOVE(&fq->fq_mbufq, m); |
276 | old_pktlen = m_pktlen(m); |
277 | old_timestamp = m->m_pkthdr.pkt_timestamp; |
278 | |
279 | IFCQ_CONVERT_LOCK(fqs->fqs_ifq); |
280 | m_freem(m); |
281 | } |
282 | #if SKYWALK |
283 | else { |
284 | struct __kern_packet *kpkt = KPKTQ_LAST(&fq->fq_kpktq); |
285 | |
286 | if (comp_gencnt != kpkt->pkt_comp_gencnt) { |
287 | return 0; |
288 | } |
289 | |
		/* If we got this far, we should merge/replace the segment */
291 | KPKTQ_REMOVE(&fq->fq_kpktq, kpkt); |
292 | old_pktlen = kpkt->pkt_length; |
293 | old_timestamp = kpkt->pkt_timestamp; |
294 | |
295 | IFCQ_CONVERT_LOCK(fqs->fqs_ifq); |
296 | pp_free_packet(*(struct kern_pbufpool **)(uintptr_t)& |
297 | (((struct __kern_quantum *)kpkt)->qum_pp), |
298 | (uint64_t)kpkt); |
299 | } |
300 | #endif /* SKYWALK */ |
301 | |
302 | fq->fq_bytes -= old_pktlen; |
303 | fq_cl->fcl_stat.fcl_byte_cnt -= old_pktlen; |
304 | fq_cl->fcl_stat.fcl_pkt_cnt--; |
305 | IFCQ_DEC_LEN(ifq); |
306 | IFCQ_DEC_BYTES(ifq, old_pktlen); |
307 | |
308 | FQ_GRP_DEC_LEN(fq); |
309 | FQ_GRP_DEC_BYTES(fq, old_pktlen); |
310 | |
311 | *pkt_timestamp = old_timestamp; |
312 | |
313 | return CLASSQEQ_COMPRESSED; |
314 | } |
315 | |
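/*
 * Enqueue a packet or a chain of packets onto its flow queue. This is the
 * enqueue-side AQM path: it detects dequeue stalls, issues flow-control
 * advisories, head-drops when the drop limit is reached, optionally
 * compresses the packet, and activates the flow on the new-flows list.
 * Returns a CLASSQEQ_* code describing the outcome.
 */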
316 | int |
317 | fq_addq(fq_if_t *fqs, fq_if_group_t *fq_grp, pktsched_pkt_t *pkt, |
318 | fq_if_classq_t *fq_cl) |
319 | { |
320 | int droptype = DTYPE_NODROP, fc_adv = 0, ret = CLASSQEQ_SUCCESS; |
321 | u_int64_t now; |
322 | fq_t *fq = NULL; |
323 | uint64_t *pkt_timestamp; |
324 | volatile uint32_t *pkt_flags; |
325 | uint32_t pkt_flowid, cnt; |
326 | uint8_t pkt_proto, pkt_flowsrc; |
327 | fq_tfc_type_t tfc_type = FQ_TFC_C; |
328 | |
329 | cnt = pkt->pktsched_pcnt; |
330 | pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, &pkt_flowid, |
331 | &pkt_flowsrc, &pkt_proto, NULL, NULL); |
332 | |
333 | /* |
334 | * XXX Not walking the chain to set this flag on every packet. |
335 | * This flag is only used for debugging. Nothing is affected if it's |
336 | * not set. |
337 | */ |
338 | switch (pkt->pktsched_ptype) { |
339 | case QP_MBUF: |
340 | /* See comments in <rdar://problem/14040693> */ |
341 | VERIFY(!(*pkt_flags & PKTF_PRIV_GUARDED)); |
342 | *pkt_flags |= PKTF_PRIV_GUARDED; |
343 | break; |
344 | #if SKYWALK |
345 | case QP_PACKET: |
346 | /* sanity check */ |
347 | ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0); |
348 | break; |
349 | #endif /* SKYWALK */ |
350 | default: |
351 | VERIFY(0); |
352 | /* NOTREACHED */ |
353 | __builtin_unreachable(); |
354 | } |
355 | |
356 | if (ifclassq_enable_l4s) { |
357 | tfc_type = pktsched_is_pkt_l4s(pkt) ? FQ_TFC_L4S : FQ_TFC_C; |
358 | } |
359 | |
360 | /* |
361 | * Timestamps for every packet must be set prior to entering this path. |
362 | */ |
363 | now = *pkt_timestamp; |
364 | ASSERT(now > 0); |
365 | |
366 | /* find the flowq for this packet */ |
367 | fq = fq_if_hash_pkt(fqs, fq_grp, pkt_flowid, pktsched_get_pkt_svc(pkt), |
368 | now, true, tfc_type); |
369 | if (__improbable(fq == NULL)) { |
370 | DTRACE_IP1(memfail__drop, fq_if_t *, fqs); |
371 | /* drop the packet if we could not allocate a flow queue */ |
372 | fq_cl->fcl_stat.fcl_drop_memfailure += cnt; |
373 | return CLASSQEQ_DROP; |
374 | } |
375 | VERIFY(fq->fq_group == fq_grp); |
376 | VERIFY(fqs->fqs_ptype == pkt->pktsched_ptype); |
377 | |
378 | KDBG(AQM_KTRACE_STATS_FLOW_ENQUEUE, fq->fq_flowhash, |
379 | AQM_KTRACE_FQ_GRP_SC_IDX(fq), |
380 | fq->fq_bytes, pktsched_get_pkt_len(pkt)); |
381 | |
	fq_detect_dequeue_stall(fqs, fq, fq_cl, &now);
383 | |
384 | /* |
385 | * Skip the dropping part if it's L4S. Flow control or ECN marking decision |
386 | * will be made at dequeue time. |
387 | */ |
388 | if (ifclassq_enable_l4s && tfc_type == FQ_TFC_L4S) { |
389 | fq_cl->fcl_stat.fcl_l4s_pkts++; |
390 | droptype = DTYPE_NODROP; |
391 | } |
392 | |
393 | if (__improbable(FQ_IS_DELAY_HIGH(fq) || FQ_IS_OVERWHELMING(fq))) { |
394 | if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) && |
395 | (*pkt_flags & PKTF_FLOW_ADV)) { |
396 | fc_adv = 1; |
397 | /* |
398 | * If the flow is suspended or it is not |
399 | * TCP/QUIC, drop the chain. |
400 | */ |
401 | if ((pkt_proto != IPPROTO_TCP) && |
402 | (pkt_proto != IPPROTO_QUIC)) { |
403 | droptype = DTYPE_EARLY; |
404 | fq_cl->fcl_stat.fcl_drop_early += cnt; |
405 | IFCQ_DROP_ADD(fqs->fqs_ifq, cnt, pktsched_get_pkt_len(pkt)); |
406 | } |
407 | DTRACE_IP6(flow__adv, fq_if_t *, fqs, |
408 | fq_if_classq_t *, fq_cl, fq_t *, fq, |
409 | int, droptype, pktsched_pkt_t *, pkt, |
410 | uint32_t, cnt); |
411 | } else { |
412 | /* |
413 | * Need to drop packets to make room for the new |
414 | * ones. Try to drop from the head of the queue |
415 | * instead of the latest packets. |
416 | */ |
417 | if (!fq_empty(fq, fqs->fqs_ptype)) { |
418 | uint32_t i; |
419 | |
420 | for (i = 0; i < cnt; i++) { |
421 | fq_head_drop(fqs, fq); |
422 | } |
423 | droptype = DTYPE_NODROP; |
424 | } else { |
425 | droptype = DTYPE_EARLY; |
426 | } |
427 | fq_cl->fcl_stat.fcl_drop_early += cnt; |
428 | |
429 | DTRACE_IP6(no__flow__adv, fq_if_t *, fqs, |
430 | fq_if_classq_t *, fq_cl, fq_t *, fq, |
431 | int, droptype, pktsched_pkt_t *, pkt, |
432 | uint32_t, cnt); |
433 | } |
434 | } |
435 | |
436 | /* Set the return code correctly */ |
437 | if (__improbable(fc_adv == 1 && droptype != DTYPE_FORCED)) { |
438 | if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) { |
439 | fq->fq_flags |= FQF_FLOWCTL_ON; |
440 | /* deliver flow control advisory error */ |
441 | if (droptype == DTYPE_NODROP) { |
442 | ret = CLASSQEQ_SUCCESS_FC; |
443 | } else { |
444 | /* dropped due to flow control */ |
445 | ret = CLASSQEQ_DROP_FC; |
446 | } |
447 | } else { |
448 | /* |
449 | * if we could not flow control the flow, it is |
450 | * better to drop |
451 | */ |
452 | droptype = DTYPE_FORCED; |
453 | ret = CLASSQEQ_DROP_FC; |
454 | fq_cl->fcl_stat.fcl_flow_control_fail++; |
455 | } |
456 | DTRACE_IP3(fc__ret, fq_if_t *, fqs, int, droptype, int, ret); |
457 | } |
458 | |
	/*
	 * If the queue length hits the queue limit, drop the same number of
	 * packets from the front of the flow with the maximum number of
	 * bytes queued. This penalizes heavy and unresponsive flows and
	 * avoids a tail drop.
	 */
465 | if (__improbable(droptype == DTYPE_NODROP && |
466 | fq_if_at_drop_limit(fqs))) { |
467 | uint32_t i; |
468 | |
469 | if (fqs->fqs_large_flow == fq) { |
470 | /* |
471 | * Drop from the head of the current fq. Since a |
472 | * new packet will be added to the tail, it is ok |
473 | * to leave fq in place. |
474 | */ |
475 | DTRACE_IP5(large__flow, fq_if_t *, fqs, |
476 | fq_if_classq_t *, fq_cl, fq_t *, fq, |
477 | pktsched_pkt_t *, pkt, uint32_t, cnt); |
478 | |
479 | for (i = 0; i < cnt; i++) { |
480 | fq_head_drop(fqs, fq); |
481 | } |
482 | fq_cl->fcl_stat.fcl_drop_overflow += cnt; |
483 | |
			/*
			 * TCP and QUIC will react to the loss of the head-dropped
			 * packets and adjust their send rate.
			 */
488 | if ((fq->fq_flags & FQF_FLOWCTL_CAPABLE) && |
489 | (*pkt_flags & PKTF_FLOW_ADV) && |
490 | (pkt_proto != IPPROTO_TCP) && |
491 | (pkt_proto != IPPROTO_QUIC)) { |
492 | if (fq_if_add_fcentry(fqs, pkt, pkt_flowsrc, fq, fq_cl)) { |
493 | fq->fq_flags |= FQF_FLOWCTL_ON; |
494 | FQ_SET_OVERWHELMING(fq); |
495 | fq_cl->fcl_stat.fcl_overwhelming++; |
496 | /* deliver flow control advisory error */ |
497 | ret = CLASSQEQ_SUCCESS_FC; |
498 | } |
499 | } |
500 | } else { |
501 | if (fqs->fqs_large_flow == NULL) { |
502 | droptype = DTYPE_FORCED; |
503 | fq_cl->fcl_stat.fcl_drop_overflow += cnt; |
504 | ret = CLASSQEQ_DROP; |
505 | |
506 | DTRACE_IP5(no__large__flow, fq_if_t *, fqs, |
507 | fq_if_classq_t *, fq_cl, fq_t *, fq, |
508 | pktsched_pkt_t *, pkt, uint32_t, cnt); |
509 | |
510 | /* |
511 | * if this fq was freshly created and there |
512 | * is nothing to enqueue, move it to empty list |
513 | */ |
514 | if (fq_empty(fq, fqs->fqs_ptype) && |
515 | !(fq->fq_flags & (FQF_NEW_FLOW | |
516 | FQF_OLD_FLOW))) { |
517 | fq_if_move_to_empty_flow(fqs, fq_cl, |
518 | fq, now); |
519 | fq = NULL; |
520 | } |
521 | } else { |
522 | DTRACE_IP5(different__large__flow, |
523 | fq_if_t *, fqs, fq_if_classq_t *, fq_cl, |
524 | fq_t *, fq, pktsched_pkt_t *, pkt, |
525 | uint32_t, cnt); |
526 | |
527 | for (i = 0; i < cnt; i++) { |
528 | fq_if_drop_packet(fqs, now); |
529 | } |
530 | } |
531 | } |
532 | } |
533 | |
534 | fq_cl->fcl_flags &= ~FCL_PACED; |
535 | |
536 | if (__probable(droptype == DTYPE_NODROP)) { |
537 | uint32_t chain_len = pktsched_get_pkt_len(pkt); |
538 | int ret_compress = 0; |
539 | |
540 | /* |
541 | * We do not compress if we are enqueuing a chain. |
542 | * Traversing the chain to look for acks would defeat the |
543 | * purpose of batch enqueueing. |
544 | */ |
545 | if (cnt == 1) { |
546 | ret_compress = fq_compressor(fqs, fq, fq_cl, pkt); |
547 | if (ret_compress == CLASSQEQ_COMPRESSED) { |
548 | fq_cl->fcl_stat.fcl_pkts_compressed++; |
549 | } |
550 | } |
551 | DTRACE_IP5(fq_enqueue, fq_if_t *, fqs, fq_if_classq_t *, fq_cl, |
552 | fq_t *, fq, pktsched_pkt_t *, pkt, uint32_t, cnt); |
553 | fq_enqueue(fq, pkt->pktsched_pkt, pkt->pktsched_tail, cnt, |
554 | pkt->pktsched_ptype); |
555 | |
556 | fq->fq_bytes += chain_len; |
557 | fq_cl->fcl_stat.fcl_byte_cnt += chain_len; |
558 | fq_cl->fcl_stat.fcl_pkt_cnt += cnt; |
559 | |
560 | /* |
561 | * check if this queue will qualify to be the next |
562 | * victim queue |
563 | */ |
564 | fq_if_is_flow_heavy(fqs, fq); |
565 | } else { |
566 | DTRACE_IP3(fq_drop, fq_if_t *, fqs, int, droptype, int, ret); |
567 | return (ret != CLASSQEQ_SUCCESS) ? ret : CLASSQEQ_DROP; |
568 | } |
569 | |
570 | /* |
571 | * If the queue is not currently active, add it to the end of new |
572 | * flows list for that service class. |
573 | */ |
574 | if ((fq->fq_flags & (FQF_NEW_FLOW | FQF_OLD_FLOW)) == 0) { |
575 | VERIFY(STAILQ_NEXT(fq, fq_actlink) == NULL); |
576 | STAILQ_INSERT_TAIL(&fq_cl->fcl_new_flows, fq, fq_actlink); |
577 | fq->fq_flags |= FQF_NEW_FLOW; |
578 | |
579 | fq_cl->fcl_stat.fcl_newflows_cnt++; |
580 | |
581 | fq->fq_deficit = fq_cl->fcl_quantum; |
582 | } |
583 | return ret; |
584 | } |
585 | |
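/*
 * Dequeue a single packet from the flow queue and update the byte and
 * packet accounting for the flow, its class and the interface. The
 * delay-based AQM logic is handled by fq_getq_flow().
 */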
586 | void |
587 | fq_getq_flow_internal(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt) |
588 | { |
589 | classq_pkt_t p = CLASSQ_PKT_INITIALIZER(p); |
590 | uint32_t plen; |
591 | fq_if_classq_t *fq_cl; |
592 | struct ifclassq *ifq = fqs->fqs_ifq; |
593 | |
594 | fq_dequeue(fq, &p, fqs->fqs_ptype); |
595 | if (p.cp_ptype == QP_INVALID) { |
596 | VERIFY(p.cp_mbuf == NULL); |
597 | return; |
598 | } |
599 | |
600 | fq->fq_next_tx_time = FQ_INVALID_TX_TS; |
601 | |
602 | pktsched_pkt_encap(pkt, &p); |
603 | plen = pktsched_get_pkt_len(pkt); |
604 | |
605 | VERIFY(fq->fq_bytes >= plen); |
606 | fq->fq_bytes -= plen; |
607 | |
608 | fq_cl = &FQ_CLASSQ(fq); |
609 | fq_cl->fcl_stat.fcl_byte_cnt -= plen; |
610 | fq_cl->fcl_stat.fcl_pkt_cnt--; |
611 | fq_cl->fcl_flags &= ~FCL_PACED; |
612 | |
613 | IFCQ_DEC_LEN(ifq); |
614 | IFCQ_DEC_BYTES(ifq, plen); |
615 | |
616 | FQ_GRP_DEC_LEN(fq); |
617 | FQ_GRP_DEC_BYTES(fq, plen); |
618 | |
619 | /* Reset getqtime so that we don't count idle times */ |
620 | if (fq_empty(fq, fqs->fqs_ptype)) { |
621 | fq->fq_getqtime = 0; |
622 | } |
623 | } |
624 | |
625 | /* |
626 | * fq_get_next_tx_time returns FQ_INVALID_TX_TS when there is no tx time in fq |
627 | */ |
628 | static uint64_t |
629 | fq_get_next_tx_time(fq_if_t *fqs, fq_t *fq) |
630 | { |
631 | uint64_t tx_time = FQ_INVALID_TX_TS; |
632 | |
633 | /* |
634 | * Check the cached value in fq |
635 | */ |
636 | if (fq->fq_next_tx_time != FQ_INVALID_TX_TS) { |
637 | return fq->fq_next_tx_time; |
638 | } |
639 | |
640 | switch (fqs->fqs_ptype) { |
641 | case QP_MBUF: { |
642 | struct mbuf *m; |
643 | if ((m = MBUFQ_FIRST(&fq->fq_mbufq)) != NULL) { |
644 | struct m_tag *tag; |
645 | tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID, |
646 | KERNEL_TAG_TYPE_AQM); |
647 | if (tag != NULL) { |
648 | tx_time = *(uint64_t *)tag->m_tag_data; |
649 | } |
650 | } |
651 | break; |
652 | } |
653 | case QP_PACKET: { |
654 | struct __kern_packet *p = KPKTQ_FIRST(&fq->fq_kpktq); |
655 | if (__probable(p != NULL && (p->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0)) { |
656 | tx_time = p->pkt_com_opt->__po_pkt_tx_time; |
657 | } |
658 | break; |
659 | } |
660 | default: |
661 | VERIFY(0); |
662 | /* NOTREACHED */ |
663 | __builtin_unreachable(); |
664 | } |
665 | |
	/*
	 * Cache the tx time in fq. The cache will be cleared after a dequeue
	 * or drop from the fq.
	 */
670 | fq->fq_next_tx_time = tx_time; |
671 | |
672 | return tx_time; |
673 | } |
674 | |
/*
 * fq_tx_time_ready returns TRUE when the fq is empty, so that it does not
 * interfere with the caller's logic for handling empty flows.
 */
679 | boolean_t |
680 | fq_tx_time_ready(fq_if_t *fqs, fq_t *fq, uint64_t now, uint64_t *ready_time) |
681 | { |
682 | uint64_t pkt_tx_time; |
683 | fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq); |
684 | |
685 | if (!ifclassq_enable_pacing || !ifclassq_enable_l4s || fq->fq_tfc_type != FQ_TFC_L4S) { |
686 | return TRUE; |
687 | } |
688 | |
689 | pkt_tx_time = fq_get_next_tx_time(fqs, fq); |
690 | if (ready_time != NULL) { |
691 | *ready_time = pkt_tx_time; |
692 | } |
693 | |
694 | if (pkt_tx_time <= now + pkt_pacing_leeway || |
695 | pkt_tx_time == FQ_INVALID_TX_TS) { |
696 | return TRUE; |
697 | } |
698 | |
699 | /* |
700 | * Ignore the tx time if it's scheduled too far in the future |
701 | */ |
702 | if (pkt_tx_time > max_pkt_pacing_interval + now) { |
703 | fq_cl->fcl_stat.fcl_ignore_tx_time++; |
704 | return TRUE; |
705 | } |
706 | |
707 | ASSERT(pkt_tx_time != FQ_INVALID_TX_TS); |
708 | KDBG(AQM_KTRACE_TX_NOT_READY, fq->fq_flowhash, |
709 | AQM_KTRACE_FQ_GRP_SC_IDX(fq), now, pkt_tx_time); |
710 | return FALSE; |
711 | } |
712 | |
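/*
 * Dequeue a single packet from the flow queue and run the delay-based AQM
 * logic: update per-class queueing delay statistics, perform L4S CE marking
 * or reporting when the delay exceeds the CE threshold, track the minimum
 * delay over the update interval, and release flow control once the
 * delay-high and overwhelmed conditions have cleared.
 */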
713 | void |
714 | fq_getq_flow(fq_if_t *fqs, fq_t *fq, pktsched_pkt_t *pkt, uint64_t now) |
715 | { |
716 | fq_if_classq_t *fq_cl = &FQ_CLASSQ(fq); |
717 | int64_t qdelay = 0; |
718 | volatile uint32_t *pkt_flags; |
719 | uint64_t *pkt_timestamp, pkt_tx_time = 0, pacing_delay = 0; |
720 | uint64_t fq_min_delay_threshold = FQ_TARGET_DELAY(fq); |
721 | uint8_t pkt_flowsrc; |
722 | boolean_t l4s_pkt; |
723 | |
724 | fq_getq_flow_internal(fqs, fq, pkt); |
725 | if (pkt->pktsched_ptype == QP_INVALID) { |
726 | VERIFY(pkt->pktsched_pkt_mbuf == NULL); |
727 | return; |
728 | } |
729 | |
730 | pktsched_get_pkt_vars(pkt, &pkt_flags, &pkt_timestamp, NULL, &pkt_flowsrc, |
731 | NULL, NULL, &pkt_tx_time); |
732 | l4s_pkt = pktsched_is_pkt_l4s(pkt); |
733 | if (ifclassq_enable_pacing && ifclassq_enable_l4s) { |
734 | if (pkt_tx_time > *pkt_timestamp) { |
735 | pacing_delay = pkt_tx_time - *pkt_timestamp; |
736 | fq_cl->fcl_stat.fcl_paced_pkts++; |
737 | DTRACE_SKYWALK3(aqm__pacing__delta, uint64_t, now - pkt_tx_time, |
738 | fq_if_t *, fqs, fq_t *, fq); |
739 | } |
740 | #if (DEVELOPMENT || DEBUG) |
741 | else { |
742 | DTRACE_SKYWALK5(aqm__miss__pacing__delay, uint64_t, *pkt_timestamp, |
743 | uint64_t, pkt_tx_time, uint64_t, now, fq_if_t *, |
744 | fqs, fq_t *, fq); |
745 | } |
746 | #endif // (DEVELOPMENT || DEBUG) |
747 | } |
748 | |
749 | /* this will compute qdelay in nanoseconds */ |
750 | if (now > *pkt_timestamp) { |
751 | qdelay = now - *pkt_timestamp; |
752 | } |
753 | |
754 | /* Update min/max/avg qdelay for the respective class */ |
755 | if (fq_cl->fcl_stat.fcl_min_qdelay == 0 || |
756 | (qdelay > 0 && (u_int64_t)qdelay < fq_cl->fcl_stat.fcl_min_qdelay)) { |
757 | fq_cl->fcl_stat.fcl_min_qdelay = qdelay; |
758 | } |
759 | |
760 | if (fq_cl->fcl_stat.fcl_max_qdelay == 0 || |
761 | (qdelay > 0 && (u_int64_t)qdelay > fq_cl->fcl_stat.fcl_max_qdelay)) { |
762 | fq_cl->fcl_stat.fcl_max_qdelay = qdelay; |
763 | } |
764 | |
765 | uint64_t num_dequeues = fq_cl->fcl_stat.fcl_dequeue; |
766 | |
767 | if (num_dequeues == 0) { |
768 | fq_cl->fcl_stat.fcl_avg_qdelay = qdelay; |
769 | } else if (qdelay > 0) { |
770 | uint64_t res = 0; |
771 | if (os_add_overflow(num_dequeues, 1, &res)) { |
772 | /* Reset the dequeue num and dequeue bytes */ |
773 | fq_cl->fcl_stat.fcl_dequeue = num_dequeues = 0; |
774 | fq_cl->fcl_stat.fcl_dequeue_bytes = 0; |
775 | fq_cl->fcl_stat.fcl_avg_qdelay = qdelay; |
			os_log_info(OS_LOG_DEFAULT, "%s: dequeue num overflow, "
			    "flow: 0x%x, iface: %s", __func__, fq->fq_flowhash,
			    if_name(fqs->fqs_ifq->ifcq_ifp));
779 | } else { |
780 | uint64_t product = 0; |
781 | if (os_mul_overflow(fq_cl->fcl_stat.fcl_avg_qdelay, |
782 | num_dequeues, &product) || os_add_overflow(product, qdelay, &res)) { |
783 | fq_cl->fcl_stat.fcl_avg_qdelay = qdelay; |
784 | } else { |
785 | fq_cl->fcl_stat.fcl_avg_qdelay = res / |
786 | (num_dequeues + 1); |
787 | } |
788 | } |
789 | } |
790 | |
791 | fq->fq_pkts_since_last_report++; |
792 | if (ifclassq_enable_l4s && l4s_pkt) { |
		/*
		 * A safeguard to make sure that L4S does not build up a huge
		 * queue if we encounter unexpected problems (e.g., ACKs not
		 * arriving in a timely manner due to congestion on the
		 * reverse path).
		 */
798 | fq_min_delay_threshold = l4s_min_delay_threshold; |
799 | |
800 | if ((l4s_ce_threshold != 0 && qdelay > l4s_ce_threshold + pacing_delay) || |
801 | (l4s_ce_threshold == 0 && qdelay > FQ_TARGET_DELAY(fq) + pacing_delay)) { |
802 | DTRACE_SKYWALK4(aqm__mark__ce, uint64_t, qdelay, uint64_t, pacing_delay, |
803 | fq_if_t *, fqs, fq_t *, fq); |
804 | KDBG(AQM_KTRACE_STATS_FLOW_REPORT_CE, fq->fq_flowhash, |
805 | AQM_KTRACE_FQ_GRP_SC_IDX(fq), qdelay, pacing_delay); |
			/*
			 * The packet buffer that pktsched_mark_ecn writes to can be
			 * pageable. Since it is not safe to write to pageable memory
			 * while preemption is disabled, convert the spin lock into
			 * a mutex.
			 */
811 | IFCQ_CONVERT_LOCK(fqs->fqs_ifq); |
812 | if (__improbable(l4s_local_ce_report != 0) && |
813 | (*pkt_flags & PKTF_FLOW_ADV) != 0 && |
814 | fq_if_report_ce(fqs, pkt, 1, fq->fq_pkts_since_last_report)) { |
815 | fq->fq_pkts_since_last_report = 0; |
816 | fq_cl->fcl_stat.fcl_ce_reported++; |
817 | } else if (pktsched_mark_ecn(pkt) == 0) { |
818 | fq_cl->fcl_stat.fcl_ce_marked++; |
819 | } else { |
820 | fq_cl->fcl_stat.fcl_ce_mark_failures++; |
821 | } |
822 | } |
823 | } |
824 | |
825 | ASSERT(pacing_delay <= INT64_MAX); |
826 | qdelay = MAX(0, qdelay - (int64_t)pacing_delay); |
827 | if (fq->fq_min_qdelay == 0 || |
828 | (u_int64_t)qdelay < fq->fq_min_qdelay) { |
829 | fq->fq_min_qdelay = qdelay; |
830 | } |
831 | |
832 | if (now >= fq->fq_updatetime) { |
833 | if (fq->fq_min_qdelay > fq_min_delay_threshold) { |
834 | if (!FQ_IS_DELAY_HIGH(fq)) { |
835 | FQ_SET_DELAY_HIGH(fq); |
				os_log_error(OS_LOG_DEFAULT,
				    "%s: scidx: %d, %llu, flow: 0x%x, "
				    "iface: %s, grp: %hhu\n", __func__, fq->fq_sc_index,
				    fq->fq_min_qdelay, fq->fq_flowhash,
				    if_name(fqs->fqs_ifq->ifcq_ifp),
				    FQ_GROUP(fq)->fqg_index);
842 | } |
843 | } else { |
844 | FQ_CLEAR_DELAY_HIGH(fq); |
845 | } |
846 | /* Reset measured queue delay and update time */ |
847 | fq->fq_updatetime = now + FQ_UPDATE_INTERVAL(fq); |
848 | fq->fq_min_qdelay = 0; |
849 | } |
850 | |
851 | if (fqs->fqs_large_flow != fq || !fq_if_almost_at_drop_limit(fqs)) { |
852 | FQ_CLEAR_OVERWHELMING(fq); |
853 | } |
854 | if (!FQ_IS_DELAY_HIGH(fq) || fq_empty(fq, fqs->fqs_ptype)) { |
855 | FQ_CLEAR_DELAY_HIGH(fq); |
856 | } |
857 | |
858 | if ((fq->fq_flags & FQF_FLOWCTL_ON) && |
859 | !FQ_IS_DELAY_HIGH(fq) && !FQ_IS_OVERWHELMING(fq)) { |
860 | fq_if_flow_feedback(fqs, fq, fq_cl); |
861 | } |
862 | |
863 | if (fq_empty(fq, fqs->fqs_ptype)) { |
864 | /* Reset getqtime so that we don't count idle times */ |
865 | fq->fq_getqtime = 0; |
866 | } else { |
867 | fq->fq_getqtime = now; |
868 | } |
869 | fq_if_is_flow_heavy(fqs, fq); |
870 | |
871 | *pkt_timestamp = 0; |
872 | switch (pkt->pktsched_ptype) { |
873 | case QP_MBUF: |
874 | *pkt_flags &= ~PKTF_PRIV_GUARDED; |
875 | break; |
876 | #if SKYWALK |
877 | case QP_PACKET: |
878 | /* sanity check */ |
879 | ASSERT((*pkt_flags & ~PKT_F_COMMON_MASK) == 0); |
880 | break; |
881 | #endif /* SKYWALK */ |
882 | default: |
883 | VERIFY(0); |
884 | /* NOTREACHED */ |
885 | __builtin_unreachable(); |
886 | } |
887 | } |
888 | |