1/*
2 * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <sys/cdefs.h>
30
31#include <sys/param.h>
32#include <sys/malloc.h>
33#include <sys/mbuf.h>
34#include <sys/systm.h>
35#include <sys/kernel.h>
36#include <sys/errno.h>
37#include <sys/mcache.h>
38#include <sys/sysctl.h>
39
40#include <dev/random/randomdev.h>
41#include <net/if.h>
42#include <net/if_var.h>
43#include <net/if_dl.h>
44#include <net/if_types.h>
45#include <net/net_osdep.h>
46#include <net/pktsched/pktsched.h>
47#include <net/pktsched/pktsched_fq_codel.h>
48#include <net/pktsched/pktsched_netem.h>
49
50#define _IP_VHL
51#include <netinet/ip.h>
52#include <netinet/ip6.h>
53
54#include <pexpert/pexpert.h>
55
56#if SKYWALK
57#include <skywalk/os_skywalk_private.h>
58#endif /* SKYWALK */
59
60u_int32_t machclk_freq = 0;
61u_int64_t machclk_per_sec = 0;
62u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */
63
64static void init_machclk(void);
65
66SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched");
67
68SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED,
69 &pktsched_verbose, 0, "Packet scheduler verbosity level");
70
71void
72pktsched_init(void)
73{
74 init_machclk();
75 if (machclk_freq == 0) {
76 panic("%s: no CPU clock available!", __func__);
77 /* NOTREACHED */
78 }
79 pktsched_fq_init();
80}
81
82static void
83init_machclk(void)
84{
85 /*
86 * Initialize machclk_freq using the timerbase frequency
87 * value from device specific info.
88 */
89 machclk_freq = (uint32_t)gPEClockFrequencyInfo.timebase_frequency_hz;
90
91 clock_interval_to_absolutetime_interval(interval: 1, NSEC_PER_SEC,
92 result: &machclk_per_sec);
93}
94
95u_int64_t
96pktsched_abs_to_nsecs(u_int64_t abstime)
97{
98 u_int64_t nsecs;
99
100 absolutetime_to_nanoseconds(abstime, result: &nsecs);
101 return nsecs;
102}
103
104u_int64_t
105pktsched_nsecs_to_abstime(u_int64_t nsecs)
106{
107 u_int64_t abstime;
108
109 nanoseconds_to_absolutetime(nanoseconds: nsecs, result: &abstime);
110 return abstime;
111}
112
113int
114pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags,
115 classq_pkt_type_t ptype)
116{
117 int error = 0;
118 u_int32_t rflags;
119
120 IFCQ_LOCK_ASSERT_HELD(ifq);
121
122 VERIFY(machclk_freq != 0);
123
124 /* Nothing to do unless the scheduler type changes */
125 if (ifq->ifcq_type == scheduler) {
126 return 0;
127 }
128
129 /*
130 * Remember the flags that need to be restored upon success, as
131 * they may be cleared when we tear down existing scheduler.
132 */
133 rflags = (ifq->ifcq_flags & IFCQF_ENABLED);
134
135 if (ifq->ifcq_type != PKTSCHEDT_NONE) {
136 pktsched_teardown(ifq);
137
138 /* Teardown should have succeeded */
139 VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE);
140 VERIFY(ifq->ifcq_disc == NULL);
141 }
142
143 error = fq_if_setup_ifclassq(ifq, flags: sflags, ptype);
144 if (error == 0) {
145 ifq->ifcq_flags |= rflags;
146 }
147
148 return error;
149}
150
151void
152pktsched_teardown(struct ifclassq *ifq)
153{
154 IFCQ_LOCK_ASSERT_HELD(ifq);
155 if_qflush(ifq->ifcq_ifp, ifq, true);
156 VERIFY(IFCQ_IS_EMPTY(ifq));
157 ifq->ifcq_flags &= ~IFCQF_ENABLED;
158 if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
159 /* Could be PKTSCHEDT_NONE */
160 fq_if_teardown_ifclassq(ifq);
161 }
162 return;
163}
164
165int
166pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid,
167 struct if_ifclassq_stats *ifqs)
168{
169 int error = 0;
170
171 IFCQ_LOCK_ASSERT_HELD(ifq);
172
173 if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) {
174 /* Could be PKTSCHEDT_NONE */
175 error = fq_if_getqstats_ifclassq(ifq, gid: (uint8_t)gid, qid, ifqs);
176 }
177
178 return error;
179}
180
181void
182pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt)
183{
184 pkt->pktsched_pkt = *cpkt;
185 pkt->pktsched_tail = *cpkt;
186 pkt->pktsched_pcnt = 1;
187
188 switch (cpkt->cp_ptype) {
189 case QP_MBUF:
190 pkt->pktsched_plen =
191 (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf);
192 break;
193
194#if SKYWALK
195 case QP_PACKET:
196 pkt->pktsched_plen = pkt->pktsched_pkt_kpkt->pkt_length;
197 break;
198#endif /* SKYWALK */
199
200 default:
201 VERIFY(0);
202 /* NOTREACHED */
203 __builtin_unreachable();
204 }
205}
206
207void
208pktsched_pkt_encap_chain(pktsched_pkt_t *pkt, classq_pkt_t *cpkt,
209 classq_pkt_t *tail, uint32_t cnt, uint32_t bytes)
210{
211 pkt->pktsched_pkt = *cpkt;
212 pkt->pktsched_tail = *tail;
213 pkt->pktsched_pcnt = cnt;
214 pkt->pktsched_plen = bytes;
215
216 switch (cpkt->cp_ptype) {
217 case QP_MBUF:
218 break;
219
220#if SKYWALK
221 case QP_PACKET:
222 break;
223#endif /* SKYWALK */
224
225 default:
226 VERIFY(0);
227 /* NOTREACHED */
228 __builtin_unreachable();
229 }
230}
231
232int
233pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2)
234{
235 struct mbuf *m1, *m2;
236#if SKYWALK
237 struct __kern_packet *p1;
238 kern_packet_t ph2;
239 int err;
240#endif /* SKYWALK */
241
242 ASSERT(pkt1 != NULL);
243 ASSERT(pkt1->pktsched_pkt_mbuf != NULL);
244 ASSERT(pkt1->pktsched_pcnt == 1);
245
246 /* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */
247 ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf ==
248 pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 &&
249 pkt2->pktsched_pkt_mbuf == NULL));
250
251 switch (pkt1->pktsched_ptype) {
252 case QP_MBUF:
253 m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf;
254 m2 = m_dup(m: m1, M_NOWAIT);
255 if (__improbable(m2 == NULL)) {
256 return ENOBUFS;
257 }
258 pkt2->pktsched_pkt_mbuf = m2;
259 break;
260
261#if SKYWALK
262 case QP_PACKET:
263 p1 = (struct __kern_packet *)pkt1->pktsched_pkt_kpkt;
264 err = kern_packet_clone_nosleep(SK_PTR_ENCODE(p1,
265 METADATA_TYPE(p1), METADATA_SUBTYPE(p1)), &ph2,
266 KPKT_COPY_HEAVY);
267 if (__improbable(err != 0)) {
268 return err;
269 }
270 ASSERT(ph2 != 0);
271 VERIFY(kern_packet_finalize(ph2) == 0);
272 pkt2->pktsched_pkt_kpkt = SK_PTR_ADDR_KPKT(ph2);
273 break;
274#endif /* SKYWALK */
275
276 default:
277 VERIFY(0);
278 /* NOTREACHED */
279 __builtin_unreachable();
280 }
281
282 pkt2->pktsched_plen = pkt1->pktsched_plen;
283 pkt2->pktsched_ptype = pkt1->pktsched_ptype;
284 pkt2->pktsched_tail = pkt2->pktsched_pkt;
285 pkt2->pktsched_pcnt = 1;
286 return 0;
287}
288
289void
290pktsched_corrupt_packet(pktsched_pkt_t *pkt)
291{
292 struct mbuf *m = NULL;
293 uint8_t *data = NULL;
294 uint32_t data_len = 0;
295 uint32_t rand32, rand_off, rand_bit;
296#if SKYWALK
297 struct __kern_packet *p = NULL;
298#endif /* SKYWALK */
299
300 switch (pkt->pktsched_ptype) {
301 case QP_MBUF:
302 m = pkt->pktsched_pkt_mbuf;
303 data = mtod(m, uint8_t *);
304 data_len = m->m_pkthdr.len;
305 break;
306#if SKYWALK
307 case QP_PACKET:
308 p = pkt->pktsched_pkt_kpkt;
309 if (p->pkt_pflags & PKT_F_MBUF_DATA) {
310 m = p->pkt_mbuf;
311 data = mtod(m, uint8_t *);
312 data_len = m->m_pkthdr.len;
313 } else {
314 MD_BUFLET_ADDR_DLEN(p, data, data_len);
315 }
316 break;
317#endif /* SKYWALK */
318
319 default:
320 /* NOTREACHED */
321 VERIFY(0);
322 __builtin_unreachable();
323 }
324
325 read_frandom(buffer: &rand32, numBytes: sizeof(rand32));
326 rand_bit = rand32 & 0x8;
327 rand_off = (rand32 >> 3) % data_len;
328 data[rand_off] ^= 1 << rand_bit;
329}
330
331void
332pktsched_free_pkt(pktsched_pkt_t *pkt)
333{
334 uint32_t cnt = pkt->pktsched_pcnt;
335 ASSERT(cnt != 0);
336
337 switch (pkt->pktsched_ptype) {
338 case QP_MBUF: {
339 struct mbuf *m;
340
341 m = pkt->pktsched_pkt_mbuf;
342 if (cnt == 1) {
343 VERIFY(m->m_nextpkt == NULL);
344 } else {
345 VERIFY(m->m_nextpkt != NULL);
346 }
347 m_freem_list(m);
348 break;
349 }
350#if SKYWALK
351 case QP_PACKET: {
352 struct __kern_packet *kpkt;
353 int pcnt = 0;
354
355 kpkt = pkt->pktsched_pkt_kpkt;
356 if (cnt == 1) {
357 VERIFY(kpkt->pkt_nextpkt == NULL);
358 } else {
359 VERIFY(kpkt->pkt_nextpkt != NULL);
360 }
361 pp_free_packet_chain(kpkt, &pcnt);
362 VERIFY(cnt == (uint32_t)pcnt);
363 break;
364 }
365#endif /* SKYWALK */
366
367 default:
368 VERIFY(0);
369 /* NOTREACHED */
370 __builtin_unreachable();
371 }
372 pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt);
373 pkt->pktsched_tail = CLASSQ_PKT_INITIALIZER(pkt->pktsched_tail);
374 pkt->pktsched_plen = 0;
375 pkt->pktsched_pcnt = 0;
376}
377
378mbuf_svc_class_t
379pktsched_get_pkt_svc(pktsched_pkt_t *pkt)
380{
381 mbuf_svc_class_t svc = MBUF_SC_UNSPEC;
382
383 switch (pkt->pktsched_ptype) {
384 case QP_MBUF:
385 svc = m_get_service_class(pkt->pktsched_pkt_mbuf);
386 break;
387
388#if SKYWALK
389 case QP_PACKET:
390 svc = pkt->pktsched_pkt_kpkt->pkt_svc_class;
391 break;
392#endif /* SKYWALK */
393
394 default:
395 VERIFY(0);
396 /* NOTREACHED */
397 __builtin_unreachable();
398 }
399
400 return svc;
401}
402
403void
404pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags,
405 uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto,
406 uint32_t *comp_gencnt, uint64_t *pkt_tx_time)
407{
408 switch (pkt->pktsched_ptype) {
409 case QP_MBUF: {
410 struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
411
412 if (flags != NULL) {
413 *flags = &pkth->pkt_flags;
414 }
415 if (timestamp != NULL) {
416 *timestamp = &pkth->pkt_timestamp;
417 }
418 if (flowid != NULL) {
419 *flowid = pkth->pkt_flowid;
420 }
421 if (flowsrc != NULL) {
422 *flowsrc = pkth->pkt_flowsrc;
423 }
424 if (proto != NULL) {
425 /*
426 * rdar://100524205 - We want to use the pkt_ext_flags
427 * to denote QUIC packets, but AQM is already written in
428 * such a way where IPPROTO_QUIC is used to denote QUIC
429 * packets.
430 */
431 if (pkth->pkt_ext_flags & PKTF_EXT_QUIC) {
432 *proto = IPPROTO_QUIC;
433 } else {
434 *proto = pkth->pkt_proto;
435 }
436 }
437 if (comp_gencnt != NULL) {
438 *comp_gencnt = pkth->comp_gencnt;
439 }
440 if (pkt_tx_time != NULL) {
441 struct m_tag *tag;
442 tag = m_tag_locate(pkt->pktsched_pkt_mbuf, KERNEL_MODULE_TAG_ID,
443 KERNEL_TAG_TYPE_AQM);
444 if (__improbable(tag != NULL)) {
445 *pkt_tx_time = *(uint64_t *)tag->m_tag_data;
446 } else {
447 *pkt_tx_time = 0;
448 }
449 }
450
451 break;
452 }
453
454#if SKYWALK
455 case QP_PACKET: {
456 struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
457
458 if (flags != NULL) {
459 /* use lower-32 bit for common flags */
460 *flags = &kp->pkt_pflags32;
461 }
462 if (timestamp != NULL) {
463 *timestamp = &kp->pkt_timestamp;
464 }
465 if (flowid != NULL) {
466 *flowid = kp->pkt_flow_token;
467 }
468 if (flowsrc != NULL) {
469 *flowsrc = (uint8_t)kp->pkt_flowsrc_type;
470 }
471 if (proto != NULL) {
472 *proto = kp->pkt_transport_protocol;
473 }
474 if (comp_gencnt != NULL) {
475 *comp_gencnt = kp->pkt_comp_gencnt;
476 }
477 if (pkt_tx_time != NULL && (kp->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0) {
478 *pkt_tx_time = kp->pkt_com_opt->__po_pkt_tx_time;
479 }
480
481 break;
482 }
483#endif /* SKYWALK */
484
485 default:
486 VERIFY(0);
487 /* NOTREACHED */
488 __builtin_unreachable();
489 }
490}
491
492struct flowadv_fcentry *
493pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how)
494{
495#pragma unused(ifp)
496 struct flowadv_fcentry *fce = NULL;
497
498 switch (pkt->pktsched_ptype) {
499 case QP_MBUF: {
500 struct mbuf *m = pkt->pktsched_pkt_mbuf;
501
502 fce = flowadv_alloc_entry(how);
503 if (fce == NULL) {
504 break;
505 }
506
507 _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
508 sizeof(fce->fce_flowid));
509
510 fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc;
511 fce->fce_flowid = m->m_pkthdr.pkt_flowid;
512#if SKYWALK
513 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
514 sizeof(fce->fce_flowsrc_token));
515 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
516 sizeof(fce->fce_flowsrc_fidx));
517
518 if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) {
519 fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx;
520 fce->fce_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid;
521 fce->fce_ifp = ifp;
522 }
523#endif /* SKYWALK */
524 break;
525 }
526
527#if SKYWALK
528 case QP_PACKET: {
529 struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
530
531 fce = flowadv_alloc_entry(how);
532 if (fce == NULL) {
533 break;
534 }
535
536 _CASSERT(sizeof(fce->fce_flowid) ==
537 sizeof(kp->pkt_flow_token));
538 _CASSERT(sizeof(fce->fce_flowsrc_fidx) ==
539 sizeof(kp->pkt_flowsrc_fidx));
540 _CASSERT(sizeof(fce->fce_flowsrc_token) ==
541 sizeof(kp->pkt_flowsrc_token));
542
543 ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV);
544 fce->fce_flowsrc_type = kp->pkt_flowsrc_type;
545 fce->fce_flowid = kp->pkt_flow_token;
546 fce->fce_flowsrc_fidx = kp->pkt_flowsrc_fidx;
547 fce->fce_flowsrc_token = kp->pkt_flowsrc_token;
548 fce->fce_ifp = ifp;
549 break;
550 }
551#endif /* SKYWALK */
552
553 default:
554 VERIFY(0);
555 /* NOTREACHED */
556 __builtin_unreachable();
557 }
558
559 return fce;
560}
561
562uint32_t *
563pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags)
564{
565 uint32_t *hashp = NULL;
566
567 switch (pkt->pktsched_ptype) {
568 case QP_MBUF: {
569 struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
570
571 _CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t));
572 _CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t));
573 *sfb_flags = &pkth->pkt_mpriv_flags;
574 hashp = &pkth->pkt_mpriv_hash;
575 break;
576 }
577
578#if SKYWALK
579 case QP_PACKET: {
580 struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
581
582 _CASSERT(sizeof(kp->pkt_classq_hash) == sizeof(uint32_t));
583 _CASSERT(sizeof(kp->pkt_classq_flags) == sizeof(uint32_t));
584 *sfb_flags = &kp->pkt_classq_flags;
585 hashp = &kp->pkt_classq_hash;
586 break;
587 }
588#endif /* SKYWALK */
589
590 default:
591 VERIFY(0);
592 /* NOTREACHED */
593 __builtin_unreachable();
594 }
595
596 return hashp;
597}
598
599static int
600pktsched_mbuf_mark_ecn(struct mbuf* m)
601{
602 struct mbuf *m0;
603 void *hdr;
604 int af;
605 uint8_t ipv;
606
607 hdr = m->m_pkthdr.pkt_hdr;
608 /* verify that hdr is within the mbuf data */
609 for (m0 = m; m0 != NULL; m0 = m0->m_next) {
610 if (((caddr_t)hdr >= m_mtod_current(m: m0)) &&
611 ((caddr_t)hdr < m_mtod_current(m: m0) + m0->m_len)) {
612 break;
613 }
614 }
615 if (m0 == NULL) {
616 return EINVAL;
617 }
618 ipv = IP_VHL_V(*(uint8_t *)hdr);
619 if (ipv == 4) {
620 af = AF_INET;
621 } else if (ipv == 6) {
622 af = AF_INET6;
623 } else {
624 af = AF_UNSPEC;
625 }
626
627 switch (af) {
628 case AF_INET: {
629 struct ip *ip = hdr;
630 uint8_t otos;
631 int sum;
632
633 if (((uintptr_t)ip + sizeof(*ip)) >
634 ((uintptr_t)mbuf_datastart(mbuf: m0) + mbuf_maxlen(mbuf: m0))) {
635 return EINVAL; /* out of bounds */
636 }
637 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
638 return EINVAL; /* not-ECT */
639 }
640 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
641 return 0; /* already marked */
642 }
643 /*
644 * ecn-capable but not marked,
645 * mark CE and update checksum
646 */
647 otos = ip->ip_tos;
648 ip->ip_tos |= IPTOS_ECN_CE;
649 /*
650 * update checksum (from RFC1624) only if hw
651 * checksum is not supported.
652 * HC' = ~(~HC + ~m + m')
653 */
654 if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IP) == 0) {
655 sum = ~ntohs(ip->ip_sum) & 0xffff;
656 sum += (~otos & 0xffff) + ip->ip_tos;
657 sum = (sum >> 16) + (sum & 0xffff);
658 sum += (sum >> 16); /* add carry */
659 ip->ip_sum = htons(~sum & 0xffff);
660 }
661 return 0;
662 }
663 case AF_INET6: {
664 struct ip6_hdr *ip6 = hdr;
665 u_int32_t flowlabel;
666
667 if (((uintptr_t)ip6 + sizeof(*ip6)) >
668 ((uintptr_t)mbuf_datastart(mbuf: m0) + mbuf_maxlen(mbuf: m0))) {
669 return EINVAL; /* out of bounds */
670 }
671 flowlabel = ntohl(ip6->ip6_flow);
672 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
673 (IPTOS_ECN_NOTECT << 20)) {
674 return EINVAL; /* not-ECT */
675 }
676 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
677 (IPTOS_ECN_CE << 20)) {
678 return 0; /* already marked */
679 }
680 /*
681 * ecn-capable but not marked, mark CE
682 */
683 flowlabel |= (IPTOS_ECN_CE << 20);
684 ip6->ip6_flow = htonl(flowlabel);
685 return 0;
686 }
687 default:
688 return EPROTONOSUPPORT;
689 }
690}
691
692static int
693pktsched_kpkt_mark_ecn(struct __kern_packet *kpkt)
694{
695 uint8_t ipv = 0, *l3_hdr;
696
697 if ((kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0) {
698 ipv = kpkt->pkt_flow_ip_ver;
699 l3_hdr = (uint8_t *)kpkt->pkt_flow_ip_hdr;
700 } else {
701 uint8_t *pkt_buf;
702 uint32_t bdlen, bdlim, bdoff;
703 MD_BUFLET_ADDR_ABS_DLEN(kpkt, pkt_buf, bdlen, bdlim, bdoff);
704
705 /* takes care of both IPv4 and IPv6 */
706 l3_hdr = pkt_buf + kpkt->pkt_headroom + kpkt->pkt_l2_len;
707 ipv = IP_VHL_V(*(uint8_t *)l3_hdr);
708 if (ipv == 4) {
709 ipv = IPVERSION;
710 } else if (ipv == 6) {
711 ipv = IPV6_VERSION;
712 } else {
713 ipv = 0;
714 }
715 }
716
717 switch (ipv) {
718 case IPVERSION: {
719 uint8_t otos;
720 int sum;
721
722 struct ip *ip = (struct ip *)(void *)l3_hdr;
723 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
724 return EINVAL; /* not-ECT */
725 }
726 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
727 return 0; /* already marked */
728 }
729 /*
730 * ecn-capable but not marked,
731 * mark CE and update checksum
732 */
733 otos = ip->ip_tos;
734 ip->ip_tos |= IPTOS_ECN_CE;
735
736 sum = ~ntohs(ip->ip_sum) & 0xffff;
737 sum += (~otos & 0xffff) + ip->ip_tos;
738 sum = (sum >> 16) + (sum & 0xffff);
739 sum += (sum >> 16); /* add carry */
740 ip->ip_sum = htons(~sum & 0xffff);
741
742 return 0;
743 }
744 case IPV6_VERSION: {
745 struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;
746 u_int32_t flowlabel;
747 flowlabel = ntohl(ip6->ip6_flow);
748 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
749 (IPTOS_ECN_NOTECT << 20)) {
750 return EINVAL; /* not-ECT */
751 }
752 if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
753 (IPTOS_ECN_CE << 20)) {
754 return 0; /* already marked */
755 }
756 /*
757 * ecn-capable but not marked, mark CE
758 */
759 flowlabel |= (IPTOS_ECN_CE << 20);
760 ip6->ip6_flow = htonl(flowlabel);
761
762 return 0;
763 }
764 default:
765 return EPROTONOSUPPORT;
766 }
767}
768
769int
770pktsched_mark_ecn(pktsched_pkt_t *pkt)
771{
772 switch (pkt->pktsched_ptype) {
773 case QP_MBUF:
774 return pktsched_mbuf_mark_ecn(m: pkt->pktsched_pkt_mbuf);
775 case QP_PACKET:
776 return pktsched_kpkt_mark_ecn(kpkt: pkt->pktsched_pkt_kpkt);
777 default:
778 VERIFY(0);
779 /* NOTREACHED */
780 __builtin_unreachable();
781 }
782}
783
784boolean_t
785pktsched_is_pkt_l4s(pktsched_pkt_t *pkt)
786{
787 switch (pkt->pktsched_ptype) {
788 case QP_MBUF: {
789 struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr);
790 return (pkth->pkt_ext_flags & PKTF_EXT_L4S) != 0;
791 }
792 case QP_PACKET: {
793 struct __kern_packet *kp = pkt->pktsched_pkt_kpkt;
794 return (kp->pkt_pflags & PKT_F_L4S) != 0;
795 }
796
797 default:
798 VERIFY(0);
799 /* NOTREACHED */
800 __builtin_unreachable();
801 }
802 return FALSE;
803}
804
805struct aqm_tag_container {
806 struct m_tag aqm_m_tag;
807 uint64_t aqm_tag;
808};
809
810static struct m_tag *
811m_tag_kalloc_aqm(u_int32_t id, u_int16_t type, uint16_t len, int wait)
812{
813 struct aqm_tag_container *tag_container;
814 struct m_tag *tag = NULL;
815
816 assert3u(id, ==, KERNEL_MODULE_TAG_ID);
817 assert3u(type, ==, KERNEL_TAG_TYPE_AQM);
818 assert3u(len, ==, sizeof(uint64_t));
819
820 if (len != sizeof(uint64_t)) {
821 return NULL;
822 }
823
824 tag_container = kalloc_type(struct aqm_tag_container, wait | M_ZERO);
825 if (tag_container != NULL) {
826 tag = &tag_container->aqm_m_tag;
827
828 assert3p(tag, ==, tag_container);
829
830 M_TAG_INIT(tag, id, type, len, &tag_container->aqm_tag, NULL);
831 }
832
833 return tag;
834}
835
836static void
837m_tag_kfree_aqm(struct m_tag *tag)
838{
839 struct aqm_tag_container *tag_container = (struct aqm_tag_container *)tag;
840
841 assert3u(tag->m_tag_len, ==, sizeof(uint64_t));
842
843 kfree_type(struct aqm_tag_container, tag_container);
844}
845
846void
847pktsched_register_m_tag(void)
848{
849 int error;
850
851 error = m_register_internal_tag_type(type: KERNEL_TAG_TYPE_AQM, len: sizeof(uint64_t),
852 alloc_func: m_tag_kalloc_aqm, free_func: m_tag_kfree_aqm);
853
854 assert3u(error, ==, 0);
855}
856