1 | /* |
2 | * Copyright (c) 2011-2021 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <sys/cdefs.h> |
30 | |
31 | #include <sys/param.h> |
32 | #include <sys/malloc.h> |
33 | #include <sys/mbuf.h> |
34 | #include <sys/systm.h> |
35 | #include <sys/kernel.h> |
36 | #include <sys/errno.h> |
37 | #include <sys/mcache.h> |
38 | #include <sys/sysctl.h> |
39 | |
40 | #include <dev/random/randomdev.h> |
41 | #include <net/if.h> |
42 | #include <net/if_var.h> |
43 | #include <net/if_dl.h> |
44 | #include <net/if_types.h> |
45 | #include <net/net_osdep.h> |
46 | #include <net/pktsched/pktsched.h> |
47 | #include <net/pktsched/pktsched_fq_codel.h> |
48 | #include <net/pktsched/pktsched_netem.h> |
49 | |
50 | #define _IP_VHL |
51 | #include <netinet/ip.h> |
52 | #include <netinet/ip6.h> |
53 | |
54 | #include <pexpert/pexpert.h> |
55 | |
56 | #if SKYWALK |
57 | #include <skywalk/os_skywalk_private.h> |
58 | #endif /* SKYWALK */ |
59 | |
60 | u_int32_t machclk_freq = 0; |
61 | u_int64_t machclk_per_sec = 0; |
62 | u_int32_t pktsched_verbose = 0; /* more noise if greater than 1 */ |
63 | |
64 | static void init_machclk(void); |
65 | |
66 | SYSCTL_NODE(_net, OID_AUTO, pktsched, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "pktsched" ); |
67 | |
68 | SYSCTL_UINT(_net_pktsched, OID_AUTO, verbose, CTLFLAG_RW | CTLFLAG_LOCKED, |
69 | &pktsched_verbose, 0, "Packet scheduler verbosity level" ); |
70 | |
71 | void |
72 | pktsched_init(void) |
73 | { |
74 | init_machclk(); |
75 | if (machclk_freq == 0) { |
76 | panic("%s: no CPU clock available!" , __func__); |
77 | /* NOTREACHED */ |
78 | } |
79 | pktsched_fq_init(); |
80 | } |
81 | |
82 | static void |
83 | init_machclk(void) |
84 | { |
85 | /* |
86 | * Initialize machclk_freq using the timerbase frequency |
87 | * value from device specific info. |
88 | */ |
89 | machclk_freq = (uint32_t)gPEClockFrequencyInfo.timebase_frequency_hz; |
90 | |
91 | clock_interval_to_absolutetime_interval(interval: 1, NSEC_PER_SEC, |
92 | result: &machclk_per_sec); |
93 | } |
94 | |
/*
 * Convert an absolute-time value to nanoseconds.
 */
u_int64_t
pktsched_abs_to_nsecs(u_int64_t abstime)
{
	u_int64_t ns;

	absolutetime_to_nanoseconds(abstime, &ns);

	return ns;
}
103 | |
/*
 * Convert a nanosecond count to absolute-time units.
 */
u_int64_t
pktsched_nsecs_to_abstime(u_int64_t nsecs)
{
	u_int64_t abs;

	nanoseconds_to_absolutetime(nsecs, &abs);

	return abs;
}
112 | |
113 | int |
114 | pktsched_setup(struct ifclassq *ifq, u_int32_t scheduler, u_int32_t sflags, |
115 | classq_pkt_type_t ptype) |
116 | { |
117 | int error = 0; |
118 | u_int32_t rflags; |
119 | |
120 | IFCQ_LOCK_ASSERT_HELD(ifq); |
121 | |
122 | VERIFY(machclk_freq != 0); |
123 | |
124 | /* Nothing to do unless the scheduler type changes */ |
125 | if (ifq->ifcq_type == scheduler) { |
126 | return 0; |
127 | } |
128 | |
129 | /* |
130 | * Remember the flags that need to be restored upon success, as |
131 | * they may be cleared when we tear down existing scheduler. |
132 | */ |
133 | rflags = (ifq->ifcq_flags & IFCQF_ENABLED); |
134 | |
135 | if (ifq->ifcq_type != PKTSCHEDT_NONE) { |
136 | pktsched_teardown(ifq); |
137 | |
138 | /* Teardown should have succeeded */ |
139 | VERIFY(ifq->ifcq_type == PKTSCHEDT_NONE); |
140 | VERIFY(ifq->ifcq_disc == NULL); |
141 | } |
142 | |
143 | error = fq_if_setup_ifclassq(ifq, flags: sflags, ptype); |
144 | if (error == 0) { |
145 | ifq->ifcq_flags |= rflags; |
146 | } |
147 | |
148 | return error; |
149 | } |
150 | |
151 | void |
152 | pktsched_teardown(struct ifclassq *ifq) |
153 | { |
154 | IFCQ_LOCK_ASSERT_HELD(ifq); |
155 | if_qflush(ifq->ifcq_ifp, ifq, true); |
156 | VERIFY(IFCQ_IS_EMPTY(ifq)); |
157 | ifq->ifcq_flags &= ~IFCQF_ENABLED; |
158 | if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) { |
159 | /* Could be PKTSCHEDT_NONE */ |
160 | fq_if_teardown_ifclassq(ifq); |
161 | } |
162 | return; |
163 | } |
164 | |
165 | int |
166 | pktsched_getqstats(struct ifclassq *ifq, u_int32_t gid, u_int32_t qid, |
167 | struct if_ifclassq_stats *ifqs) |
168 | { |
169 | int error = 0; |
170 | |
171 | IFCQ_LOCK_ASSERT_HELD(ifq); |
172 | |
173 | if (ifq->ifcq_type == PKTSCHEDT_FQ_CODEL) { |
174 | /* Could be PKTSCHEDT_NONE */ |
175 | error = fq_if_getqstats_ifclassq(ifq, gid: (uint8_t)gid, qid, ifqs); |
176 | } |
177 | |
178 | return error; |
179 | } |
180 | |
181 | void |
182 | pktsched_pkt_encap(pktsched_pkt_t *pkt, classq_pkt_t *cpkt) |
183 | { |
184 | pkt->pktsched_pkt = *cpkt; |
185 | pkt->pktsched_tail = *cpkt; |
186 | pkt->pktsched_pcnt = 1; |
187 | |
188 | switch (cpkt->cp_ptype) { |
189 | case QP_MBUF: |
190 | pkt->pktsched_plen = |
191 | (uint32_t)m_pktlen(pkt->pktsched_pkt_mbuf); |
192 | break; |
193 | |
194 | #if SKYWALK |
195 | case QP_PACKET: |
196 | pkt->pktsched_plen = pkt->pktsched_pkt_kpkt->pkt_length; |
197 | break; |
198 | #endif /* SKYWALK */ |
199 | |
200 | default: |
201 | VERIFY(0); |
202 | /* NOTREACHED */ |
203 | __builtin_unreachable(); |
204 | } |
205 | } |
206 | |
207 | void |
208 | pktsched_pkt_encap_chain(pktsched_pkt_t *pkt, classq_pkt_t *cpkt, |
209 | classq_pkt_t *tail, uint32_t cnt, uint32_t bytes) |
210 | { |
211 | pkt->pktsched_pkt = *cpkt; |
212 | pkt->pktsched_tail = *tail; |
213 | pkt->pktsched_pcnt = cnt; |
214 | pkt->pktsched_plen = bytes; |
215 | |
216 | switch (cpkt->cp_ptype) { |
217 | case QP_MBUF: |
218 | break; |
219 | |
220 | #if SKYWALK |
221 | case QP_PACKET: |
222 | break; |
223 | #endif /* SKYWALK */ |
224 | |
225 | default: |
226 | VERIFY(0); |
227 | /* NOTREACHED */ |
228 | __builtin_unreachable(); |
229 | } |
230 | } |
231 | |
232 | int |
233 | pktsched_clone_pkt(pktsched_pkt_t *pkt1, pktsched_pkt_t *pkt2) |
234 | { |
235 | struct mbuf *m1, *m2; |
236 | #if SKYWALK |
237 | struct __kern_packet *p1; |
238 | kern_packet_t ph2; |
239 | int err; |
240 | #endif /* SKYWALK */ |
241 | |
242 | ASSERT(pkt1 != NULL); |
243 | ASSERT(pkt1->pktsched_pkt_mbuf != NULL); |
244 | ASSERT(pkt1->pktsched_pcnt == 1); |
245 | |
246 | /* allow in place clone, but make sure pkt2->pktsched_pkt won't leak */ |
247 | ASSERT((pkt1 == pkt2 && pkt1->pktsched_pkt_mbuf == |
248 | pkt2->pktsched_pkt_mbuf) || (pkt1 != pkt2 && |
249 | pkt2->pktsched_pkt_mbuf == NULL)); |
250 | |
251 | switch (pkt1->pktsched_ptype) { |
252 | case QP_MBUF: |
253 | m1 = (struct mbuf *)pkt1->pktsched_pkt_mbuf; |
254 | m2 = m_dup(m: m1, M_NOWAIT); |
255 | if (__improbable(m2 == NULL)) { |
256 | return ENOBUFS; |
257 | } |
258 | pkt2->pktsched_pkt_mbuf = m2; |
259 | break; |
260 | |
261 | #if SKYWALK |
262 | case QP_PACKET: |
263 | p1 = (struct __kern_packet *)pkt1->pktsched_pkt_kpkt; |
264 | err = kern_packet_clone_nosleep(SK_PTR_ENCODE(p1, |
265 | METADATA_TYPE(p1), METADATA_SUBTYPE(p1)), &ph2, |
266 | KPKT_COPY_HEAVY); |
267 | if (__improbable(err != 0)) { |
268 | return err; |
269 | } |
270 | ASSERT(ph2 != 0); |
271 | VERIFY(kern_packet_finalize(ph2) == 0); |
272 | pkt2->pktsched_pkt_kpkt = SK_PTR_ADDR_KPKT(ph2); |
273 | break; |
274 | #endif /* SKYWALK */ |
275 | |
276 | default: |
277 | VERIFY(0); |
278 | /* NOTREACHED */ |
279 | __builtin_unreachable(); |
280 | } |
281 | |
282 | pkt2->pktsched_plen = pkt1->pktsched_plen; |
283 | pkt2->pktsched_ptype = pkt1->pktsched_ptype; |
284 | pkt2->pktsched_tail = pkt2->pktsched_pkt; |
285 | pkt2->pktsched_pcnt = 1; |
286 | return 0; |
287 | } |
288 | |
289 | void |
290 | pktsched_corrupt_packet(pktsched_pkt_t *pkt) |
291 | { |
292 | struct mbuf *m = NULL; |
293 | uint8_t *data = NULL; |
294 | uint32_t data_len = 0; |
295 | uint32_t rand32, rand_off, rand_bit; |
296 | #if SKYWALK |
297 | struct __kern_packet *p = NULL; |
298 | #endif /* SKYWALK */ |
299 | |
300 | switch (pkt->pktsched_ptype) { |
301 | case QP_MBUF: |
302 | m = pkt->pktsched_pkt_mbuf; |
303 | data = mtod(m, uint8_t *); |
304 | data_len = m->m_pkthdr.len; |
305 | break; |
306 | #if SKYWALK |
307 | case QP_PACKET: |
308 | p = pkt->pktsched_pkt_kpkt; |
309 | if (p->pkt_pflags & PKT_F_MBUF_DATA) { |
310 | m = p->pkt_mbuf; |
311 | data = mtod(m, uint8_t *); |
312 | data_len = m->m_pkthdr.len; |
313 | } else { |
314 | MD_BUFLET_ADDR_DLEN(p, data, data_len); |
315 | } |
316 | break; |
317 | #endif /* SKYWALK */ |
318 | |
319 | default: |
320 | /* NOTREACHED */ |
321 | VERIFY(0); |
322 | __builtin_unreachable(); |
323 | } |
324 | |
325 | read_frandom(buffer: &rand32, numBytes: sizeof(rand32)); |
326 | rand_bit = rand32 & 0x8; |
327 | rand_off = (rand32 >> 3) % data_len; |
328 | data[rand_off] ^= 1 << rand_bit; |
329 | } |
330 | |
331 | void |
332 | pktsched_free_pkt(pktsched_pkt_t *pkt) |
333 | { |
334 | uint32_t cnt = pkt->pktsched_pcnt; |
335 | ASSERT(cnt != 0); |
336 | |
337 | switch (pkt->pktsched_ptype) { |
338 | case QP_MBUF: { |
339 | struct mbuf *m; |
340 | |
341 | m = pkt->pktsched_pkt_mbuf; |
342 | if (cnt == 1) { |
343 | VERIFY(m->m_nextpkt == NULL); |
344 | } else { |
345 | VERIFY(m->m_nextpkt != NULL); |
346 | } |
347 | m_freem_list(m); |
348 | break; |
349 | } |
350 | #if SKYWALK |
351 | case QP_PACKET: { |
352 | struct __kern_packet *kpkt; |
353 | int pcnt = 0; |
354 | |
355 | kpkt = pkt->pktsched_pkt_kpkt; |
356 | if (cnt == 1) { |
357 | VERIFY(kpkt->pkt_nextpkt == NULL); |
358 | } else { |
359 | VERIFY(kpkt->pkt_nextpkt != NULL); |
360 | } |
361 | pp_free_packet_chain(kpkt, &pcnt); |
362 | VERIFY(cnt == (uint32_t)pcnt); |
363 | break; |
364 | } |
365 | #endif /* SKYWALK */ |
366 | |
367 | default: |
368 | VERIFY(0); |
369 | /* NOTREACHED */ |
370 | __builtin_unreachable(); |
371 | } |
372 | pkt->pktsched_pkt = CLASSQ_PKT_INITIALIZER(pkt->pktsched_pkt); |
373 | pkt->pktsched_tail = CLASSQ_PKT_INITIALIZER(pkt->pktsched_tail); |
374 | pkt->pktsched_plen = 0; |
375 | pkt->pktsched_pcnt = 0; |
376 | } |
377 | |
378 | mbuf_svc_class_t |
379 | pktsched_get_pkt_svc(pktsched_pkt_t *pkt) |
380 | { |
381 | mbuf_svc_class_t svc = MBUF_SC_UNSPEC; |
382 | |
383 | switch (pkt->pktsched_ptype) { |
384 | case QP_MBUF: |
385 | svc = m_get_service_class(pkt->pktsched_pkt_mbuf); |
386 | break; |
387 | |
388 | #if SKYWALK |
389 | case QP_PACKET: |
390 | svc = pkt->pktsched_pkt_kpkt->pkt_svc_class; |
391 | break; |
392 | #endif /* SKYWALK */ |
393 | |
394 | default: |
395 | VERIFY(0); |
396 | /* NOTREACHED */ |
397 | __builtin_unreachable(); |
398 | } |
399 | |
400 | return svc; |
401 | } |
402 | |
403 | void |
404 | pktsched_get_pkt_vars(pktsched_pkt_t *pkt, volatile uint32_t **flags, |
405 | uint64_t **timestamp, uint32_t *flowid, uint8_t *flowsrc, uint8_t *proto, |
406 | uint32_t *comp_gencnt, uint64_t *pkt_tx_time) |
407 | { |
408 | switch (pkt->pktsched_ptype) { |
409 | case QP_MBUF: { |
410 | struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr); |
411 | |
412 | if (flags != NULL) { |
413 | *flags = &pkth->pkt_flags; |
414 | } |
415 | if (timestamp != NULL) { |
416 | *timestamp = &pkth->pkt_timestamp; |
417 | } |
418 | if (flowid != NULL) { |
419 | *flowid = pkth->pkt_flowid; |
420 | } |
421 | if (flowsrc != NULL) { |
422 | *flowsrc = pkth->pkt_flowsrc; |
423 | } |
424 | if (proto != NULL) { |
425 | /* |
426 | * rdar://100524205 - We want to use the pkt_ext_flags |
427 | * to denote QUIC packets, but AQM is already written in |
428 | * such a way where IPPROTO_QUIC is used to denote QUIC |
429 | * packets. |
430 | */ |
431 | if (pkth->pkt_ext_flags & PKTF_EXT_QUIC) { |
432 | *proto = IPPROTO_QUIC; |
433 | } else { |
434 | *proto = pkth->pkt_proto; |
435 | } |
436 | } |
437 | if (comp_gencnt != NULL) { |
438 | *comp_gencnt = pkth->comp_gencnt; |
439 | } |
440 | if (pkt_tx_time != NULL) { |
441 | struct m_tag *tag; |
442 | tag = m_tag_locate(pkt->pktsched_pkt_mbuf, KERNEL_MODULE_TAG_ID, |
443 | KERNEL_TAG_TYPE_AQM); |
444 | if (__improbable(tag != NULL)) { |
445 | *pkt_tx_time = *(uint64_t *)tag->m_tag_data; |
446 | } else { |
447 | *pkt_tx_time = 0; |
448 | } |
449 | } |
450 | |
451 | break; |
452 | } |
453 | |
454 | #if SKYWALK |
455 | case QP_PACKET: { |
456 | struct __kern_packet *kp = pkt->pktsched_pkt_kpkt; |
457 | |
458 | if (flags != NULL) { |
459 | /* use lower-32 bit for common flags */ |
460 | *flags = &kp->pkt_pflags32; |
461 | } |
462 | if (timestamp != NULL) { |
463 | *timestamp = &kp->pkt_timestamp; |
464 | } |
465 | if (flowid != NULL) { |
466 | *flowid = kp->pkt_flow_token; |
467 | } |
468 | if (flowsrc != NULL) { |
469 | *flowsrc = (uint8_t)kp->pkt_flowsrc_type; |
470 | } |
471 | if (proto != NULL) { |
472 | *proto = kp->pkt_transport_protocol; |
473 | } |
474 | if (comp_gencnt != NULL) { |
475 | *comp_gencnt = kp->pkt_comp_gencnt; |
476 | } |
477 | if (pkt_tx_time != NULL && (kp->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0) { |
478 | *pkt_tx_time = kp->pkt_com_opt->__po_pkt_tx_time; |
479 | } |
480 | |
481 | break; |
482 | } |
483 | #endif /* SKYWALK */ |
484 | |
485 | default: |
486 | VERIFY(0); |
487 | /* NOTREACHED */ |
488 | __builtin_unreachable(); |
489 | } |
490 | } |
491 | |
492 | struct flowadv_fcentry * |
493 | pktsched_alloc_fcentry(pktsched_pkt_t *pkt, struct ifnet *ifp, int how) |
494 | { |
495 | #pragma unused(ifp) |
496 | struct flowadv_fcentry *fce = NULL; |
497 | |
498 | switch (pkt->pktsched_ptype) { |
499 | case QP_MBUF: { |
500 | struct mbuf *m = pkt->pktsched_pkt_mbuf; |
501 | |
502 | fce = flowadv_alloc_entry(how); |
503 | if (fce == NULL) { |
504 | break; |
505 | } |
506 | |
507 | _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) == |
508 | sizeof(fce->fce_flowid)); |
509 | |
510 | fce->fce_flowsrc_type = m->m_pkthdr.pkt_flowsrc; |
511 | fce->fce_flowid = m->m_pkthdr.pkt_flowid; |
512 | #if SKYWALK |
513 | _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == |
514 | sizeof(fce->fce_flowsrc_token)); |
515 | _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == |
516 | sizeof(fce->fce_flowsrc_fidx)); |
517 | |
518 | if (fce->fce_flowsrc_type == FLOWSRC_CHANNEL) { |
519 | fce->fce_flowsrc_fidx = m->m_pkthdr.pkt_mpriv_fidx; |
520 | fce->fce_flowsrc_token = m->m_pkthdr.pkt_mpriv_srcid; |
521 | fce->fce_ifp = ifp; |
522 | } |
523 | #endif /* SKYWALK */ |
524 | break; |
525 | } |
526 | |
527 | #if SKYWALK |
528 | case QP_PACKET: { |
529 | struct __kern_packet *kp = pkt->pktsched_pkt_kpkt; |
530 | |
531 | fce = flowadv_alloc_entry(how); |
532 | if (fce == NULL) { |
533 | break; |
534 | } |
535 | |
536 | _CASSERT(sizeof(fce->fce_flowid) == |
537 | sizeof(kp->pkt_flow_token)); |
538 | _CASSERT(sizeof(fce->fce_flowsrc_fidx) == |
539 | sizeof(kp->pkt_flowsrc_fidx)); |
540 | _CASSERT(sizeof(fce->fce_flowsrc_token) == |
541 | sizeof(kp->pkt_flowsrc_token)); |
542 | |
543 | ASSERT(kp->pkt_pflags & PKT_F_FLOW_ADV); |
544 | fce->fce_flowsrc_type = kp->pkt_flowsrc_type; |
545 | fce->fce_flowid = kp->pkt_flow_token; |
546 | fce->fce_flowsrc_fidx = kp->pkt_flowsrc_fidx; |
547 | fce->fce_flowsrc_token = kp->pkt_flowsrc_token; |
548 | fce->fce_ifp = ifp; |
549 | break; |
550 | } |
551 | #endif /* SKYWALK */ |
552 | |
553 | default: |
554 | VERIFY(0); |
555 | /* NOTREACHED */ |
556 | __builtin_unreachable(); |
557 | } |
558 | |
559 | return fce; |
560 | } |
561 | |
562 | uint32_t * |
563 | pktsched_get_pkt_sfb_vars(pktsched_pkt_t *pkt, uint32_t **sfb_flags) |
564 | { |
565 | uint32_t *hashp = NULL; |
566 | |
567 | switch (pkt->pktsched_ptype) { |
568 | case QP_MBUF: { |
569 | struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr); |
570 | |
571 | _CASSERT(sizeof(pkth->pkt_mpriv_hash) == sizeof(uint32_t)); |
572 | _CASSERT(sizeof(pkth->pkt_mpriv_flags) == sizeof(uint32_t)); |
573 | *sfb_flags = &pkth->pkt_mpriv_flags; |
574 | hashp = &pkth->pkt_mpriv_hash; |
575 | break; |
576 | } |
577 | |
578 | #if SKYWALK |
579 | case QP_PACKET: { |
580 | struct __kern_packet *kp = pkt->pktsched_pkt_kpkt; |
581 | |
582 | _CASSERT(sizeof(kp->pkt_classq_hash) == sizeof(uint32_t)); |
583 | _CASSERT(sizeof(kp->pkt_classq_flags) == sizeof(uint32_t)); |
584 | *sfb_flags = &kp->pkt_classq_flags; |
585 | hashp = &kp->pkt_classq_hash; |
586 | break; |
587 | } |
588 | #endif /* SKYWALK */ |
589 | |
590 | default: |
591 | VERIFY(0); |
592 | /* NOTREACHED */ |
593 | __builtin_unreachable(); |
594 | } |
595 | |
596 | return hashp; |
597 | } |
598 | |
599 | static int |
600 | pktsched_mbuf_mark_ecn(struct mbuf* m) |
601 | { |
602 | struct mbuf *m0; |
603 | void *hdr; |
604 | int af; |
605 | uint8_t ipv; |
606 | |
607 | hdr = m->m_pkthdr.pkt_hdr; |
608 | /* verify that hdr is within the mbuf data */ |
609 | for (m0 = m; m0 != NULL; m0 = m0->m_next) { |
610 | if (((caddr_t)hdr >= m_mtod_current(m: m0)) && |
611 | ((caddr_t)hdr < m_mtod_current(m: m0) + m0->m_len)) { |
612 | break; |
613 | } |
614 | } |
615 | if (m0 == NULL) { |
616 | return EINVAL; |
617 | } |
618 | ipv = IP_VHL_V(*(uint8_t *)hdr); |
619 | if (ipv == 4) { |
620 | af = AF_INET; |
621 | } else if (ipv == 6) { |
622 | af = AF_INET6; |
623 | } else { |
624 | af = AF_UNSPEC; |
625 | } |
626 | |
627 | switch (af) { |
628 | case AF_INET: { |
629 | struct ip *ip = hdr; |
630 | uint8_t otos; |
631 | int sum; |
632 | |
633 | if (((uintptr_t)ip + sizeof(*ip)) > |
634 | ((uintptr_t)mbuf_datastart(mbuf: m0) + mbuf_maxlen(mbuf: m0))) { |
635 | return EINVAL; /* out of bounds */ |
636 | } |
637 | if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) { |
638 | return EINVAL; /* not-ECT */ |
639 | } |
640 | if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) { |
641 | return 0; /* already marked */ |
642 | } |
643 | /* |
644 | * ecn-capable but not marked, |
645 | * mark CE and update checksum |
646 | */ |
647 | otos = ip->ip_tos; |
648 | ip->ip_tos |= IPTOS_ECN_CE; |
649 | /* |
650 | * update checksum (from RFC1624) only if hw |
651 | * checksum is not supported. |
652 | * HC' = ~(~HC + ~m + m') |
653 | */ |
654 | if ((m->m_pkthdr.csum_flags & CSUM_DELAY_IP) == 0) { |
655 | sum = ~ntohs(ip->ip_sum) & 0xffff; |
656 | sum += (~otos & 0xffff) + ip->ip_tos; |
657 | sum = (sum >> 16) + (sum & 0xffff); |
658 | sum += (sum >> 16); /* add carry */ |
659 | ip->ip_sum = htons(~sum & 0xffff); |
660 | } |
661 | return 0; |
662 | } |
663 | case AF_INET6: { |
664 | struct ip6_hdr *ip6 = hdr; |
665 | u_int32_t flowlabel; |
666 | |
667 | if (((uintptr_t)ip6 + sizeof(*ip6)) > |
668 | ((uintptr_t)mbuf_datastart(mbuf: m0) + mbuf_maxlen(mbuf: m0))) { |
669 | return EINVAL; /* out of bounds */ |
670 | } |
671 | flowlabel = ntohl(ip6->ip6_flow); |
672 | if ((flowlabel & (IPTOS_ECN_MASK << 20)) == |
673 | (IPTOS_ECN_NOTECT << 20)) { |
674 | return EINVAL; /* not-ECT */ |
675 | } |
676 | if ((flowlabel & (IPTOS_ECN_MASK << 20)) == |
677 | (IPTOS_ECN_CE << 20)) { |
678 | return 0; /* already marked */ |
679 | } |
680 | /* |
681 | * ecn-capable but not marked, mark CE |
682 | */ |
683 | flowlabel |= (IPTOS_ECN_CE << 20); |
684 | ip6->ip6_flow = htonl(flowlabel); |
685 | return 0; |
686 | } |
687 | default: |
688 | return EPROTONOSUPPORT; |
689 | } |
690 | } |
691 | |
#if SKYWALK
/*
 * Set the ECN Congestion Experienced codepoint in the IP header of a
 * Skywalk kernel packet.
 *
 * When the packet has been flow-classified, the IP version and header
 * pointer recorded in the packet are used.  Otherwise the header is
 * located at headroom + L2 length from the start of the first buflet and
 * the version is read from its first byte.
 *
 * Only built when SKYWALK is enabled, since the packet types and macros
 * used here come from the Skywalk headers.
 *
 * NOTE(review): unlike the mbuf variant, the unclassified path does not
 * bounds-check the computed header against the buflet limit - confirm
 * whether callers guarantee this.
 *
 * Returns:
 *   0               - CE is now set (or was already set)
 *   EINVAL          - the packet is not ECN-capable
 *   EPROTONOSUPPORT - the header is neither IPv4 nor IPv6
 */
static int
pktsched_kpkt_mark_ecn(struct __kern_packet *kpkt)
{
	uint8_t ipv = 0, *l3_hdr;

	if ((kpkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0) {
		ipv = kpkt->pkt_flow_ip_ver;
		l3_hdr = (uint8_t *)kpkt->pkt_flow_ip_hdr;
	} else {
		uint8_t *pkt_buf;
		uint32_t bdlen, bdlim, bdoff;
		MD_BUFLET_ADDR_ABS_DLEN(kpkt, pkt_buf, bdlen, bdlim, bdoff);

		/* takes care of both IPv4 and IPv6 */
		l3_hdr = pkt_buf + kpkt->pkt_headroom + kpkt->pkt_l2_len;
		ipv = IP_VHL_V(*(uint8_t *)l3_hdr);
		/* normalize to the constants used by the classified path */
		if (ipv == 4) {
			ipv = IPVERSION;
		} else if (ipv == 6) {
			ipv = IPV6_VERSION;
		} else {
			ipv = 0;
		}
	}

	switch (ipv) {
	case IPVERSION: {
		uint8_t otos;
		int sum;

		struct ip *ip = (struct ip *)(void *)l3_hdr;
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) {
			return EINVAL;  /* not-ECT */
		}
		if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
			return 0;       /* already marked */
		}
		/*
		 * ecn-capable but not marked,
		 * mark CE and update checksum
		 */
		otos = ip->ip_tos;
		ip->ip_tos |= IPTOS_ECN_CE;

		/* incremental checksum update: HC' = ~(~HC + ~m + m') */
		sum = ~ntohs(ip->ip_sum) & 0xffff;
		sum += (~otos & 0xffff) + ip->ip_tos;
		sum = (sum >> 16) + (sum & 0xffff);
		sum += (sum >> 16);  /* add carry */
		ip->ip_sum = htons(~sum & 0xffff);

		return 0;
	}
	case IPV6_VERSION: {
		struct ip6_hdr *ip6 = (struct ip6_hdr *)l3_hdr;
		u_int32_t flowlabel;
		flowlabel = ntohl(ip6->ip6_flow);
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_NOTECT << 20)) {
			return EINVAL;  /* not-ECT */
		}
		if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
		    (IPTOS_ECN_CE << 20)) {
			return 0;       /* already marked */
		}
		/*
		 * ecn-capable but not marked, mark CE
		 */
		flowlabel |= (IPTOS_ECN_CE << 20);
		ip6->ip6_flow = htonl(flowlabel);

		return 0;
	}
	default:
		return EPROTONOSUPPORT;
	}
}
#endif /* SKYWALK */
768 | |
769 | int |
770 | pktsched_mark_ecn(pktsched_pkt_t *pkt) |
771 | { |
772 | switch (pkt->pktsched_ptype) { |
773 | case QP_MBUF: |
774 | return pktsched_mbuf_mark_ecn(m: pkt->pktsched_pkt_mbuf); |
775 | case QP_PACKET: |
776 | return pktsched_kpkt_mark_ecn(kpkt: pkt->pktsched_pkt_kpkt); |
777 | default: |
778 | VERIFY(0); |
779 | /* NOTREACHED */ |
780 | __builtin_unreachable(); |
781 | } |
782 | } |
783 | |
784 | boolean_t |
785 | pktsched_is_pkt_l4s(pktsched_pkt_t *pkt) |
786 | { |
787 | switch (pkt->pktsched_ptype) { |
788 | case QP_MBUF: { |
789 | struct pkthdr *pkth = &(pkt->pktsched_pkt_mbuf->m_pkthdr); |
790 | return (pkth->pkt_ext_flags & PKTF_EXT_L4S) != 0; |
791 | } |
792 | case QP_PACKET: { |
793 | struct __kern_packet *kp = pkt->pktsched_pkt_kpkt; |
794 | return (kp->pkt_pflags & PKT_F_L4S) != 0; |
795 | } |
796 | |
797 | default: |
798 | VERIFY(0); |
799 | /* NOTREACHED */ |
800 | __builtin_unreachable(); |
801 | } |
802 | return FALSE; |
803 | } |
804 | |
805 | struct aqm_tag_container { |
806 | struct m_tag aqm_m_tag; |
807 | uint64_t aqm_tag; |
808 | }; |
809 | |
810 | static struct m_tag * |
811 | m_tag_kalloc_aqm(u_int32_t id, u_int16_t type, uint16_t len, int wait) |
812 | { |
813 | struct aqm_tag_container *tag_container; |
814 | struct m_tag *tag = NULL; |
815 | |
816 | assert3u(id, ==, KERNEL_MODULE_TAG_ID); |
817 | assert3u(type, ==, KERNEL_TAG_TYPE_AQM); |
818 | assert3u(len, ==, sizeof(uint64_t)); |
819 | |
820 | if (len != sizeof(uint64_t)) { |
821 | return NULL; |
822 | } |
823 | |
824 | tag_container = kalloc_type(struct aqm_tag_container, wait | M_ZERO); |
825 | if (tag_container != NULL) { |
826 | tag = &tag_container->aqm_m_tag; |
827 | |
828 | assert3p(tag, ==, tag_container); |
829 | |
830 | M_TAG_INIT(tag, id, type, len, &tag_container->aqm_tag, NULL); |
831 | } |
832 | |
833 | return tag; |
834 | } |
835 | |
836 | static void |
837 | m_tag_kfree_aqm(struct m_tag *tag) |
838 | { |
839 | struct aqm_tag_container *tag_container = (struct aqm_tag_container *)tag; |
840 | |
841 | assert3u(tag->m_tag_len, ==, sizeof(uint64_t)); |
842 | |
843 | kfree_type(struct aqm_tag_container, tag_container); |
844 | } |
845 | |
846 | void |
847 | pktsched_register_m_tag(void) |
848 | { |
849 | int error; |
850 | |
851 | error = m_register_internal_tag_type(type: KERNEL_TAG_TYPE_AQM, len: sizeof(uint64_t), |
852 | alloc_func: m_tag_kalloc_aqm, free_func: m_tag_kfree_aqm); |
853 | |
854 | assert3u(error, ==, 0); |
855 | } |
856 | |