1/*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <skywalk/os_skywalk_private.h>
30#include <kern/sched_prim.h>
31#include <sys/sdt.h>
32
33static void kr_update_user_stats(struct __kern_channel_ring *,
34 uint32_t, uint32_t);
35static void kr_externalize_metadata_internal(struct __kern_channel_ring *,
36 const uint32_t, struct __kern_quantum *, struct proc *);
37
38#define KR_TRANSFER_DECAY 2 /* ilog2 of EWMA decay rate (4) */
39static uint32_t kr_transfer_decay = 0;
40
41#define KR_ACCUMULATE_INTERVAL 2 /* 2 seconds */
42static uint32_t kr_accumulate_interval = KR_ACCUMULATE_INTERVAL;
43
44#if (DEVELOPMENT || DEBUG)
45#define KR_STAT_ENABLE 1
46#else /* !(DEVELOPMENT || DEBUG) */
47#define KR_STAT_ENABLE 0
48#endif /* !(DEVELOPMENT || DEBUG) */
49/* Enable/Disable ring stats collection */
50uint32_t kr_stat_enable = KR_STAT_ENABLE;
51
52#if (DEVELOPMENT || DEBUG)
53SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_transfer_decay,
54 CTLFLAG_RW | CTLFLAG_LOCKED, &kr_transfer_decay,
55 0, "ilog2 of EWMA decay rate of ring transfers");
56
57SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_accumulate_interval,
58 CTLFLAG_RW | CTLFLAG_LOCKED, &kr_accumulate_interval,
59 KR_ACCUMULATE_INTERVAL, "accumulation interval for ring stats");
60
61uint32_t kr_disable_panic_on_sync_err = 0;
62SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_panic_on_sync_err,
63 CTLFLAG_RW | CTLFLAG_LOCKED, &kr_disable_panic_on_sync_err,
64 0, "disable panic on sync error");
65#endif /* (DEVELOPMENT || DEBUG) */
66
67SYSCTL_UINT(_kern_skywalk, OID_AUTO, ring_stat_enable,
68 CTLFLAG_RW | CTLFLAG_LOCKED, &kr_stat_enable,
69 0, "enable/disable stats collection for ring");
70
71#define KR_EWMA(old, new, decay) do { \
72 u_int64_t _avg; \
73 if (__probable((_avg = (old)) > 0)) \
74 _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
75 else \
76 _avg = (new); \
77 (old) = _avg; \
78} while (0)
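
/*
 * Worked example (illustrative only, not part of the build): with the
 * default decay of KR_TRANSFER_DECAY (2), KR_EWMA computes
 * avg = (3*avg + new) / 4, seeding the average with the first sample.
 *
 *	uint64_t avg = 0;
 *	uint64_t samples[] = { 100, 100, 100, 500 };
 *	for (int i = 0; i < 4; i++) {
 *		KR_EWMA(avg, samples[i], KR_TRANSFER_DECAY);
 *	}
 *	// avg progresses 100 -> 100 -> 100 -> 200, i.e. (3*100 + 500) / 4
 */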
79
80#define _BUF_DLIM(_buf, _pp) (BUFLET_HAS_LARGE_BUF(_buf) ? \
81 PP_BUF_SIZE_LARGE(_pp) : PP_BUF_SIZE_DEF(_pp))
82
83void
84kr_init_to_mhints(struct __kern_channel_ring *kring, uint32_t nslots)
85{
86 uint32_t tail;
87
88 tail = nslots - 1;
89
90 kring->ckr_transfer_decay = KR_TRANSFER_DECAY;
91 kring->ckr_num_slots = nslots;
92 *(slot_idx_t *)(uintptr_t)&kring->ckr_lim = (nslots - 1);
93 kring->ckr_rhead = kring->ckr_khead = 0;
94 /* IMPORTANT: Always keep one slot empty */
95 kring->ckr_rtail = kring->ckr_ktail =
96 ((kring->ckr_tx == NR_TX) || (kring->ckr_tx == NR_F) ? tail : 0);
97}
98
99/*
100 * Try to obtain exclusive right to issue the *sync() or state change
101 * operations on the ring. The right is obtained and must be later
102 * relinquished via kr_exit() if and only if kr_enter() returns 0.
103 *
104 * In all cases the caller will typically skip the ring, possibly collecting
105 * errors along the way.
106 *
107 * If the calling context does not allow sleeping, the caller must pass
108 * FALSE in can_sleep; EBUSY may be returned if the right is held by
109 * another thread. Otherwise, the caller may block until the right is
110 * released by the previous holder.
111 */
112int
113kr_enter(struct __kern_channel_ring *kr, boolean_t can_sleep)
114{
115 lck_spin_lock(&kr->ckr_slock);
116 if (kr->ckr_owner == current_thread()) {
117 ASSERT(kr->ckr_busy != 0);
118 kr->ckr_busy++;
119 goto done;
120 }
121 if (!can_sleep) {
122 if (kr->ckr_busy != 0) {
123 lck_spin_unlock(&kr->ckr_slock);
124 return EBUSY;
125 }
126 } else {
127 while (kr->ckr_busy != 0) {
128 kr->ckr_want++;
129 (void) assert_wait(&kr->ckr_busy, THREAD_UNINT);
130 lck_spin_unlock(&kr->ckr_slock);
131 (void) thread_block(THREAD_CONTINUE_NULL);
132 SK_DF(SK_VERB_LOCKS, "waited for kr \"%s\" "
133 "(0x%llx) busy=%u", kr->ckr_name,
134 SK_KVA(kr), kr->ckr_busy);
135 lck_spin_lock(&kr->ckr_slock);
136 }
137 }
138 LCK_SPIN_ASSERT(&kr->ckr_slock, LCK_ASSERT_OWNED);
139 ASSERT(kr->ckr_busy == 0);
140 kr->ckr_busy++;
141 kr->ckr_owner = current_thread();
142done:
143 lck_spin_unlock(&kr->ckr_slock);
144
145 SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right acquired",
146 kr->ckr_name, SK_KVA(kr));
147
148 return 0;
149}
150
151void
152kr_exit(struct __kern_channel_ring *kr)
153{
154 uint32_t want = 0;
155
156 lck_spin_lock(&kr->ckr_slock);
157 ASSERT(kr->ckr_busy != 0);
158 ASSERT(kr->ckr_owner == current_thread());
159 if (--kr->ckr_busy == 0) {
160 kr->ckr_owner = NULL;
161
162 /*
163 * we're done with the kring;
164 * notify anyone that has lost the race
165 */
166 if ((want = kr->ckr_want) != 0) {
167 kr->ckr_want = 0;
168 wakeup((void *)&kr->ckr_busy);
169 lck_spin_unlock(&kr->ckr_slock);
170 } else {
171 lck_spin_unlock(&kr->ckr_slock);
172 }
173 } else {
174 lck_spin_unlock(&kr->ckr_slock);
175 }
176
177 SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) right released (%u waiters)",
178 kr->ckr_name, SK_KVA(kr), want);
179}
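
/*
 * Usage sketch (hypothetical caller, not part of the build): the sync
 * right is taken with kr_enter() and must be dropped with kr_exit() only
 * when kr_enter() returned 0.  In a non-sleeping context, EBUSY simply
 * means "skip this ring".
 *
 *	static int
 *	example_try_ring_op(struct __kern_channel_ring *kr)
 *	{
 *		int err = kr_enter(kr, FALSE);	// don't block on a busy ring
 *		if (err != 0) {
 *			return err;		// EBUSY: another thread holds it
 *		}
 *		// ... issue *sync() or a state change on kr ...
 *		kr_exit(kr);
 *		return 0;
 *	}
 */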
180
181
182void
183kr_start(struct __kern_channel_ring *kr)
184{
185 lck_spin_lock(&kr->ckr_slock);
186 ASSERT(kr->ckr_busy != 0);
187 ASSERT(kr->ckr_state == KR_STOPPED || kr->ckr_state == KR_LOCKED);
188 /* now clear the state */
189 kr->ckr_state = KR_READY;
190 lck_spin_unlock(&kr->ckr_slock);
191
192 kr_exit(kr);
193
194 SK_DF(SK_VERB_LOCKS, "kr \"%s\" (0x%llx) is started",
195 kr->ckr_name, SK_KVA(kr));
196}
197
198/*
199 * Put the kring in the 'stopped' state: either KR_STOPPED or KR_LOCKED.
200 * Also marks the ring as busy, which would require a call to kr_start()
201 * at a later point.
202 */
203void
204kr_stop(struct __kern_channel_ring *kr, uint32_t state)
205{
206 uint32_t s;
207
208 ASSERT(state == KR_STOPPED || state == KR_LOCKED);
209
210 s = kr_enter(kr, TRUE);
211 ASSERT(s == 0);
212
213 lck_spin_lock(&kr->ckr_slock);
214 ASSERT(kr->ckr_busy != 0);
215 /* now set the state */
216 kr->ckr_state = state;
217 lck_spin_unlock(&kr->ckr_slock);
218
219 SK_DF(SK_VERB_LOCKS,
220 "kr \"%s\" (0x%llx) krflags 0x%b is now stopped s=%u",
221 kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS, state);
222}
223
224static void
225kr_update_user_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
226 uint32_t byte_count)
227{
228 uint64_t now;
229 uint32_t transfer_decay = (kr_transfer_decay != 0) ?
230 kr_transfer_decay : kring->ckr_transfer_decay;
231 channel_ring_user_stats_t stats = &kring->ckr_usr_stats;
232
233 now = net_uptime();
234 kring->ckr_sync_time = now;
235
236 if (kr_stat_enable == 0) {
237 return;
238 }
239
240 stats->crsu_number_of_syncs++;
241 stats->crsu_total_bytes_transferred += byte_count;
242 stats->crsu_total_slots_transferred += slot_count;
243
244 if (slot_count > stats->crsu_max_slots_transferred) {
245 stats->crsu_max_slots_transferred = slot_count;
246 }
247
248 if (stats->crsu_min_slots_transferred == 0 ||
249 slot_count < stats->crsu_min_slots_transferred) {
250 stats->crsu_min_slots_transferred = slot_count;
251 }
252
253 if (__probable(kring->ckr_user_accumulate_start != 0)) {
254 if ((now - kring->ckr_user_accumulate_start) >=
255 kr_accumulate_interval) {
256 uint64_t bps;
257 uint64_t sps;
258 uint64_t sps_ma;
259
260 /* bytes per sync */
261 bps = kring->ckr_user_accumulated_bytes /
262 kring->ckr_user_accumulated_syncs;
263 KR_EWMA(stats->crsu_bytes_per_sync_ma,
264 bps, transfer_decay);
265 stats->crsu_bytes_per_sync = bps;
266
267 /* slots per sync */
268 sps = kring->ckr_user_accumulated_slots /
269 kring->ckr_user_accumulated_syncs;
270 sps_ma = stats->crsu_slots_per_sync_ma;
271 KR_EWMA(sps_ma, sps, transfer_decay);
272 stats->crsu_slots_per_sync_ma = (uint32_t)sps_ma;
273 stats->crsu_slots_per_sync = (uint32_t)sps;
274
275 /* start over */
276 kring->ckr_user_accumulate_start = now;
277 kring->ckr_user_accumulated_bytes = 0;
278 kring->ckr_user_accumulated_slots = 0;
279 kring->ckr_user_accumulated_syncs = 0;
280
281 stats->crsu_min_slots_transferred = 0;
282 stats->crsu_max_slots_transferred = 0;
283 }
284 } else {
285 kring->ckr_user_accumulate_start = now;
286 }
287
288 kring->ckr_user_accumulated_bytes += byte_count;
289 kring->ckr_user_accumulated_slots += slot_count;
290 kring->ckr_user_accumulated_syncs++;
291}
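
/*
 * Illustrative example (numbers are hypothetical): with
 * kr_accumulate_interval = 2, the per-sync averages are folded into the
 * moving averages only once at least two seconds have elapsed since
 * ckr_user_accumulate_start.  For instance, 10 syncs moving 4000 bytes
 * across 20 slots in one window yield bps = 400 bytes/sync and
 * sps = 2 slots/sync; those values are fed to KR_EWMA before the window
 * counters are reset.
 */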
292
293/* caller is responsible for ensuring thread safety */
294void
295kr_update_stats(struct __kern_channel_ring *kring, uint32_t slot_count,
296 uint32_t byte_count)
297{
298 uint64_t now;
299 uint64_t diff_secs;
300 channel_ring_stats_t stats = &kring->ckr_stats;
301 uint32_t transfer_decay = (kr_transfer_decay != 0) ?
302 kr_transfer_decay : kring->ckr_transfer_decay;
303
304 if (kr_stat_enable == 0) {
305 return;
306 }
307
308 if (__improbable(slot_count == 0)) {
309 return;
310 }
311
312 stats->crs_number_of_transfers++;
313 stats->crs_total_bytes_transferred += byte_count;
314 stats->crs_total_slots_transferred += slot_count;
315 if (slot_count > stats->crs_max_slots_transferred) {
316 stats->crs_max_slots_transferred = slot_count;
317 }
318 if (stats->crs_min_slots_transferred == 0 ||
319 slot_count < stats->crs_min_slots_transferred) {
320 stats->crs_min_slots_transferred = slot_count;
321 }
322
323 now = net_uptime();
324 if (__probable(kring->ckr_accumulate_start != 0)) {
325 diff_secs = now - kring->ckr_accumulate_start;
326 if (diff_secs >= kr_accumulate_interval) {
327 uint64_t bps;
328 uint64_t sps;
329 uint64_t sps_ma;
330
331 /* bytes per second */
332 bps = kring->ckr_accumulated_bytes / diff_secs;
333 KR_EWMA(stats->crs_bytes_per_second_ma,
334 bps, transfer_decay);
335 stats->crs_bytes_per_second = bps;
336
337 /* slots per second */
338 sps = kring->ckr_accumulated_slots / diff_secs;
339 sps_ma = stats->crs_slots_per_second_ma;
340 KR_EWMA(sps_ma, sps, transfer_decay);
341 stats->crs_slots_per_second_ma = (uint32_t)sps_ma;
342 stats->crs_slots_per_second = (uint32_t)sps;
343
344 /* start over */
345 kring->ckr_accumulate_start = now;
346 kring->ckr_accumulated_bytes = 0;
347 kring->ckr_accumulated_slots = 0;
348
349 stats->crs_min_slots_transferred = 0;
350 stats->crs_max_slots_transferred = 0;
351 }
352 } else {
353 kring->ckr_accumulate_start = now;
354 }
355 kring->ckr_accumulated_bytes += byte_count;
356 kring->ckr_accumulated_slots += slot_count;
357}
358
359/* True if no space in the tx ring; only valid after kr_txsync_prologue */
360boolean_t
361kr_txempty(struct __kern_channel_ring *kring)
362{
363 return kring->ckr_rhead == kring->ckr_ktail;
364}
365
366#if SK_LOG
367/*
368 * Error logging routine called when txsync/rxsync detects an error.
369 * Expected to be called before killing the process with skywalk_kill_process()
370 *
371 * This routine is only called by the upper half of the kernel.
372 * It only reads khead (which is changed only by the upper half, too)
373 * and ktail (which may be changed by the lower half, but only on
374 * a tx ring and only to increase it, so any error will be recovered
375 * on the next call). For the above, we don't strictly need to call
376 * it under lock.
377 */
378void
379kr_log_bad_ring(struct __kern_channel_ring *kring)
380{
381 struct __user_channel_ring *ring = kring->ckr_ring;
382 const slot_idx_t lim = kring->ckr_lim;
383 slot_idx_t i;
384 int errors = 0;
385
386 // XXX KASSERT nm_kr_tryget
387 SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b", kring->ckr_name,
388 SK_KVA(kring), kring->ckr_flags, CKRF_BITS);
389 // XXX probably wrong to trust userspace
390
391 if (ring->ring_head > lim) {
392 errors++;
393 }
394 if (ring->ring_tail > lim) {
395 errors++;
396 }
397 for (i = 0; i <= lim; i++) {
398 struct __kern_slot_desc *ksd = KR_KSD(kring, i);
399 struct __kern_quantum *kqum = ksd->sd_qum;
400 obj_idx_t idx;
401 uint32_t len;
402
403 if (!KSD_VALID_METADATA(ksd)) {
404 continue;
405 }
406
407 idx = METADATA_IDX(kqum);
408 len = kqum->qum_len;
409 if (len > kring->ckr_max_pkt_len) {
410 SK_RDERR(5, "bad len at slot %u idx %u len %u",
411 i, idx, len);
412 }
413 }
414
415 if (errors != 0) {
416 SK_ERR("total %d errors", errors);
417 SK_ERR("kr \"%s\" (0x%llx) krflags 0x%b crash, "
418 "head %u -> %u tail %u -> %u", kring->ckr_name,
419 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, ring->ring_head,
420 kring->ckr_rhead,
421 ring->ring_tail, kring->ckr_ktail);
422 }
423}
424#endif /* SK_LOG */
425
426uint32_t
427kr_reclaim(struct __kern_channel_ring *kr)
428{
429 int r = 0;
430
431 VERIFY(sk_is_sync_protected());
432
433 /*
434 * This is a no-op for TX ring, since the TX reclaim logic is only
435 * known to the nexus itself. There, the nexus's TX sync code would
436 * figure out the number of slots that has been "transmitted", and
437 * advance the slot pointer accordingly. This routine would then be
438 * called as a way to advise the system of such condition.
439 *
440 * For RX ring, this will reclaim user-released slots, and it is
441 * to be called by the provider's RX sync routine prior to its
442 * processing new slots (into the RX ring).
443 *
444 * It is therefore advised that this routine be called at the start
445 * of the RX sync callback, as well as at the end of the TX sync
446 * callback; the latter is useful in case we decide to implement
447 * more logic in the future.
448 */
449 if ((kr->ckr_tx == NR_RX) || (kr->ckr_tx == NR_EV)) {
450 /* # of reclaimed slots */
451 r = kr->ckr_rhead - kr->ckr_khead;
452 if (r < 0) {
453 r += kr->ckr_num_slots;
454 }
455
456 kr->ckr_khead = kr->ckr_rhead;
457 /* ensure global visibility */
458 os_atomic_thread_fence(seq_cst);
459 }
460
461 return (slot_idx_t)r;
462}
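
/*
 * Usage sketch (hypothetical provider callback, not part of the build):
 * per the comment above, an RX sync routine would reclaim user-released
 * slots first and then fill new ones toward ckr_ktail.
 *
 *	static int
 *	example_rxsync(struct __kern_channel_ring *kr, struct proc *p,
 *	    uint32_t flags)
 *	{
 *		uint32_t reclaimed = kr_reclaim(kr);
 *		// ... attach newly received packets starting at kr->ckr_ktail,
 *		//     then advance kr->ckr_ktail accordingly ...
 *		return 0;
 *	}
 */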
463
464/*
465 * Nexus-specific kr_txsync_prologue() callback.
466 */
467int
468kr_txprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
469 const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
470 struct proc *p)
471{
472 struct kern_pbufpool *pp = kring->ckr_pp;
473 const uint32_t maxfrags = pp->pp_max_frags;
474 slot_idx_t slot_idx = kring->ckr_rhead;
475
476 ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
477
478 while (slot_idx != head) {
479 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
480 struct __kern_quantum *kqum = ksd->sd_qum;
481 int err;
482
483 if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
484 METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
485 SK_ERR("qum index mismatch");
486 *err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
487 return -1;
488 }
489
490 /* Internalize */
491 err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
492 if (__improbable(err != 0)) {
493 SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
494 "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
495 sk_proc_name_address(p), sk_proc_pid(p),
496 kring->ckr_name, SK_KVA(kring), slot_idx, err,
497 kring->ckr_khead, kring->ckr_ktail,
498 kring->ckr_rhead, kring->ckr_rtail,
499 kring->ckr_ring->ring_head,
500 kring->ckr_ring->ring_tail);
501 *err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
502 return -1;
503 }
504
505 *byte_count += kqum->qum_len;
506 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
507 }
508
509 return 0;
510}
511
512/*
513 * Nexus-specific kr_txsync_prologue() callback - user packet pool variant.
514 */
515int
516kr_txprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
517 const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
518 struct proc *p)
519{
520 struct kern_pbufpool *pp = kring->ckr_pp;
521 const uint32_t maxfrags = pp->pp_max_frags;
522 slot_idx_t slot_idx = kring->ckr_rhead;
523 struct __kern_quantum *kqum = NULL;
524 bool free_pkt = false;
525 int err = 0;
526
527 ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
528
529 PP_LOCK(pp);
530 while (slot_idx != head) {
531 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
532 struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
533
534 /*
535 * The channel is operating in user packet pool mode;
536 * check if the packet is in the allocated list.
537 */
538 kqum = pp_remove_upp_locked(pp, usd->sd_md_idx, &err);
539 if (__improbable(err != 0)) {
540 if (kqum != NULL) {
541 SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
542 "kqum %p, bad buflet chain",
543 sk_proc_name_address(p), sk_proc_pid(p),
544 kring->ckr_name, SK_KVA(kring), slot_idx,
545 SK_KVA(kqum));
546 *err_reason =
547 SKYWALK_KILL_REASON_BAD_BUFLET_CHAIN;
548 goto done;
549 }
550
551 SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u "
552 " unallocated packet %u kh %u kt %u | "
553 "rh %u rt %u | h %u t %u",
554 sk_proc_name_address(p), sk_proc_pid(p),
555 kring->ckr_name, SK_KVA(kring), slot_idx,
556 usd->sd_md_idx, kring->ckr_khead, kring->ckr_ktail,
557 kring->ckr_rhead, kring->ckr_rtail,
558 kring->ckr_ring->ring_head,
559 kring->ckr_ring->ring_tail);
560 *err_reason = SKYWALK_KILL_REASON_UNALLOCATED_PKT;
561 goto done;
562 }
563
564 if (__improbable(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) &&
565 METADATA_IDX(kqum) != METADATA_IDX(kqum->qum_user))) {
566 SK_ERR("qum index mismatch");
567 *err_reason = SKYWALK_KILL_REASON_QUM_IDX_MISMATCH;
568 err = ERANGE;
569 free_pkt = true;
570 goto done;
571 }
572
573 /* Internalize */
574 err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
575 if (__improbable(err != 0)) {
576 SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u dropped "
577 "(err %d) kh %u kt %u | rh %u rt %u | h %u t %u",
578 sk_proc_name_address(p), sk_proc_pid(p),
579 kring->ckr_name, SK_KVA(kring), slot_idx, err,
580 kring->ckr_khead, kring->ckr_ktail,
581 kring->ckr_rhead, kring->ckr_rtail,
582 kring->ckr_ring->ring_head,
583 kring->ckr_ring->ring_tail);
584 *err_reason = SKYWALK_KILL_REASON_INTERNALIZE_FAILED;
585 free_pkt = true;
586 goto done;
587 }
588
589 /*
590 * Attach packet to slot, detach mapping from alloc ring slot.
591 */
592 kqum->qum_ksd = NULL;
593 USD_RESET(usd);
594 KR_SLOT_ATTACH_METADATA(kring, ksd, kqum);
595
596 *byte_count += kqum->qum_len;
597 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
598 }
599
600done:
601 PP_UNLOCK(pp);
602 if (__improbable(err != 0) && free_pkt) {
603 ASSERT(kqum != NULL);
604 kqum->qum_ksd = NULL;
605 pp_free_packet(pp, (uint64_t)kqum);
606 }
607 return err;
608}
609
610#define NM_FAIL_ON(t, reason) if (__improbable(t)) { SK_ERR("fail " #t); \
611 err_reason = reason; goto error; }
612/*
613 * Validate parameters in the TX/FREE ring/kring.
614 *
615 * ckr_rhead, ckr_rtail=ktail are stored from previous round.
616 * khead is the next packet to send to the ring.
617 *
618 * We want
619 * khead <= *ckr_rhead <= head <= tail = *ckr_rtail <= ktail
620 *
621 * ckr_khead, ckr_rhead, ckr_rtail and ckr_ktail are reliable
622 */
623#define _KR_TXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh, _krt) do {\
624 slot_idx_t _n = (_kring)->ckr_num_slots; \
625 /* kernel sanity checks */ \
626 NM_FAIL_ON((_kh) >= _n || kring->ckr_rhead >= _n || (_krt) >= _n || \
627 (_kt) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY); \
628 /* user basic sanity checks */ \
629 NM_FAIL_ON((_rh) >= _n, SKYWALK_KILL_REASON_BASIC_SANITY); \
630 /* \
631 * user sanity checks. We only use 'cur', \
632 * A, B, ... are possible positions for cur: \
633 * \
634 * 0 A cur B tail C n-1 \
635 * 0 D tail E cur F n-1 \
636 * \
637 * B, F, D are valid. A, C, E are wrong \
638 */ \
639 if ((_krt) >= kring->ckr_rhead) { \
640 /* want ckr_rhead <= head <= ckr_rtail */ \
641 NM_FAIL_ON((_rh) < kring->ckr_rhead || (_rh) > (_krt), \
642 SKYWALK_KILL_REASON_HEAD_OOB); \
643 } else { /* here ckr_rtail < ckr_rhead */ \
644 /* we need head outside ckr_rtail .. ckr_rhead */ \
645 NM_FAIL_ON((_rh) > (_krt) && (_rh) < kring->ckr_rhead, \
646 SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED); \
647 } \
648 NM_FAIL_ON(ring->ring_tail != (_krt), \
649 SKYWALK_KILL_REASON_TAIL_MISMATCH); \
650} while (0)
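
/*
 * Worked example (illustrative): with 8 slots, ckr_rhead = 6 and
 * ckr_rtail = 2 (the ring has wrapped), the legal user-supplied head
 * values are 6, 7, 0, 1 and 2; anything strictly between ckr_rtail and
 * ckr_rhead trips the HEAD_OOB_WRAPPED check above:
 *
 *	// _krt = 2, ckr_rhead = 6
 *	// _rh = 7: (7 > 2) && (7 < 6) is false -> accepted
 *	// _rh = 4: (4 > 2) && (4 < 6) is true  -> rejected
 */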
651
652/*
653 * Validate parameters in the ring/kring on entry for *_txsync().
654 * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
655 * in case of error, in order to force a reinit.
656 */
657slot_idx_t
658kr_txsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
659 struct proc *p)
660{
661 struct __user_channel_ring *ring = kring->ckr_ring;
662 slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
663 slot_idx_t head;
664 uint32_t byte_count = 0;
665 uint64_t err_reason = 0;
666 int slot_count;
667
668 VERIFY(sk_is_sync_protected());
669 /* assert that this routine is only called for user facing rings */
670 ASSERT(!KR_KERNEL_ONLY(kring));
671 ASSERT(kring->ckr_usds != NULL);
672
673 /* read these once and use local copies */
674 head = ring->ring_head;
675 ckr_khead = kring->ckr_khead;
676 ckr_ktail = kring->ckr_ktail;
677 os_atomic_thread_fence(seq_cst);
678 ckr_rtail = kring->ckr_rtail;
679
680 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
681 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
682 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
683 kring->ckr_rhead, ckr_rtail,
684 ring->ring_head, ring->ring_tail);
685
686 _KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);
687
688 /* # of new tx slots */
689 slot_count = head - kring->ckr_rhead;
690 if (slot_count < 0) {
691 slot_count += kring->ckr_num_slots;
692 }
693
694 /*
695 * Invoke nexus-specific TX prologue callback, set in na_kr_create().
696 */
697 if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
698 kring, head, &byte_count, &err_reason, p) != 0)) {
699 goto error;
700 }
701
702 /* update the user's view of slots & bytes transferred */
703 kr_update_user_stats(kring, slot_count, byte_count);
704
705 /* update the kernel view of ring */
706 kring->ckr_rhead = head;
707
708 /* save for kr_txsync_finalize(); only khead is needed */
709 kring->ckr_khead_pre = ckr_khead;
710
711 return head;
712
713error:
714 SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
715 "rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
716 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
717 CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
718 ckr_rtail, head, ring->ring_tail);
719
720 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_TX_SYNC);
721
722 return kring->ckr_num_slots;
723}
724
725/*
726 * Validate parameters in the ring/kring on entry for *_free_sync().
727 * Returns ring->ring_head if ok, or something >= kring->ckr_num_slots
728 * in case of error, in order to force a reinit.
729 */
730slot_idx_t
731kr_free_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
732{
733 struct __user_channel_ring *ring = kring->ckr_ring;
734 slot_idx_t ckr_khead, ckr_ktail, ckr_rtail;
735 slot_idx_t head;
736 uint64_t err_reason = 0;
737
738 VERIFY(sk_is_sync_protected());
739 /* read these once and use local copies */
740 head = ring->ring_head;
741 ckr_khead = kring->ckr_khead;
742 ckr_ktail = kring->ckr_ktail;
743 os_atomic_thread_fence(seq_cst);
744 ckr_rtail = kring->ckr_rtail;
745
746 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
747 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
748 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
749 kring->ckr_rhead, ckr_rtail, ring->ring_head, ring->ring_tail);
750
751 _KR_TXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head, ckr_rtail);
752
753 /* update the kernel view of ring */
754 kring->ckr_rhead = head;
755 return head;
756
757error:
758 SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
759 "rh %u rt %u | h %u t %u |", sk_proc_name_address(p),
760 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
761 CKRF_BITS, ckr_khead, ckr_ktail, kring->ckr_rhead,
762 ckr_rtail, head, ring->ring_tail);
763
764 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_FREE_SYNC);
765 return kring->ckr_num_slots;
766}
767
768/*
769 * Nexus-specific kr_rxsync_prologue() callback.
770 */
771int
772kr_rxprologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
773 const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
774 struct proc *p)
775{
776#pragma unused(ch, p)
777 slot_idx_t slot_idx = kring->ckr_rhead;
778 uint32_t nfree = 0;
779
780 ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
781
782 /*
783 * Iterating through the slots just read by user-space;
784 * ckr_rhead -> ring_head
785 */
786 while (slot_idx != head) {
787 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
788 struct __kern_quantum *kqum = ksd->sd_qum;
789
790 ASSERT(KSD_VALID_METADATA(ksd));
791 /* # of new bytes transferred */
792 *byte_count += kqum->qum_len;
793
794 /* detach and free the packet */
795 (void) KR_SLOT_DETACH_METADATA(kring, ksd);
796 ASSERT(nfree < kring->ckr_num_slots);
797 kring->ckr_scratch[nfree++] = (uint64_t)kqum;
798
799 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
800 }
801
802 if (nfree > 0) {
803 pp_free_packet_batch(kring->ckr_pp,
804 &kring->ckr_scratch[0], nfree);
805 }
806
807 /*
808 * Update userspace channel statistics of # readable bytes;
809 * subtract the byte counts of slots just given back to the kernel.
810 */
811 if (kring->ckr_ready_bytes < *byte_count) {
812 SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
813 "(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
814 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
815 SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
816 kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
817 kring->ckr_rtail, kring->ckr_ring->ring_head,
818 kring->ckr_ring->ring_tail);
819 *err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
820 return -1;
821 }
822 kring->ckr_ready_bytes -= *byte_count;
823
824 return 0;
825}
826
827/*
828 * Nexus-specific kr_rxsync_prologue() callback - no detach variant.
829 */
830int
831kr_rxprologue_nodetach(struct kern_channel *ch,
832 struct __kern_channel_ring *kring, const slot_idx_t head,
833 uint32_t *byte_count, uint64_t *err_reason, struct proc *p)
834{
835#pragma unused(ch, p)
836 slot_idx_t slot_idx = kring->ckr_rhead;
837
838 ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
839
840 /*
841 * Iterating through the slots just read by user-space;
842 * ckr_rhead -> ring_head
843 */
844 while (slot_idx != head) {
845 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
846 struct __kern_quantum *kqum = ksd->sd_qum;
847
848 ASSERT(KSD_VALID_METADATA(ksd));
849 /* # of new bytes transferred */
850 *byte_count += kqum->qum_len;
851 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
852 }
853
854 /*
855 * Update userspace channel statistics of # readable bytes;
856 * subtract the byte counts of slots just given back to the kernel.
857 */
858 if (kring->ckr_ready_bytes < *byte_count) {
859 SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
860 "(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
861 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
862 SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
863 kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
864 kring->ckr_rtail, kring->ckr_ring->ring_head,
865 kring->ckr_ring->ring_tail);
866 *err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
867#if (DEVELOPMENT || DEBUG)
868 if (kr_disable_panic_on_sync_err == 0) {
869 panic("kr(0x%llx), inconsistent, head %u, ready %llu, "
870 "cnt %u", SK_KVA(kring), head,
871 kring->ckr_ready_bytes, *byte_count);
872 /* NOTREACHED */
873 __builtin_unreachable();
874 }
875#else /* (DEVELOPMENT || DEBUG) */
876 return -1;
877#endif /* !(DEVELOPMENT || DEBUG) */
878 }
879 kring->ckr_ready_bytes -= *byte_count;
880
881 return 0;
882}
883
884/*
885 * Nexus-specific kr_rxsync_prologue() callback - user packet pool variant.
886 */
887int
888kr_rxprologue_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
889 const slot_idx_t head, uint32_t *byte_count, uint64_t *err_reason,
890 struct proc *p)
891{
892#pragma unused(ch, p)
893 slot_idx_t slot_idx = kring->ckr_rhead;
894
895 ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
896
897 /*
898 * Iterating through the slots just read by user-space;
899 * ckr_rhead -> ring_head
900 */
901 while (slot_idx != head) {
902 struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
903
904 /*
905 * This is a user facing ring opting in for the user packet
906 * pool mode, so ensure that the user has detached packet
907 * from slot.
908 */
909 ASSERT(!KSD_VALID_METADATA(KR_KSD(kring, slot_idx)));
910 if (SD_VALID_METADATA(usd)) {
911 SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
912 "detached md %u kh %u kt %u | rh %u rt %u |"
913 " h %u t %u", sk_proc_name_address(p),
914 sk_proc_pid(p), kring->ckr_name,
915 SK_KVA(kring), slot_idx, usd->sd_md_idx,
916 kring->ckr_khead, kring->ckr_ktail,
917 kring->ckr_rhead, kring->ckr_rtail,
918 kring->ckr_ring->ring_head,
919 kring->ckr_ring->ring_tail);
920 *err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
921 return -1;
922 }
923 *byte_count += usd->sd_len;
924
925 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
926 }
927
928 /*
929 * update userspace channel statistics of # readable bytes;
930 * subtract the byte counts of slots just given back to the kernel
931 */
932 if (kring->ckr_ready_bytes < *byte_count) {
933 SK_ERR("%s(%d) kr \"%s\" (0x%llx) inconsistent ready bytes "
934 "(%u < %u) kh %u kt %u | rh %u rt %u | h %u t %u",
935 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
936 SK_KVA(kring), kring->ckr_ready_bytes, *byte_count,
937 kring->ckr_khead, kring->ckr_ktail, kring->ckr_rhead,
938 kring->ckr_rtail, kring->ckr_ring->ring_head,
939 kring->ckr_ring->ring_tail);
940 *err_reason = SKYWALK_KILL_REASON_INCONSISTENT_READY_BYTES;
941 return -1;
942 }
943 kring->ckr_ready_bytes -= *byte_count;
944
945 return 0;
946}
947
948/*
949 * Validate parameters in the RX/ALLOC/EVENT ring/kring.
950 * For a valid configuration,
951 * khead <= head <= tail <= ktail
952 *
953 * We only consider head.
954 * khead and ktail are reliable.
955 */
956#define _KR_RXRING_VALIDATE(_kring, _ring, _kh, _kt, _rh) do { \
957 slot_idx_t _n = (_kring)->ckr_num_slots; \
958 /* kernel sanity checks */ \
959 NM_FAIL_ON((_kh) >= _n || (_kt) >= _n, \
960 SKYWALK_KILL_REASON_BASIC_SANITY); \
961 /* user sanity checks */ \
962 if ((_kt) >= (_kh)) { \
963 /* want khead <= head <= ktail */ \
964 NM_FAIL_ON((_rh) < (_kh) || (_rh) > (_kt), \
965 SKYWALK_KILL_REASON_HEAD_OOB); \
966 } else { \
967 /* we need head outside ktail..khead */ \
968 NM_FAIL_ON((_rh) < (_kh) && (_rh) > (_kt), \
969 SKYWALK_KILL_REASON_HEAD_OOB_WRAPPED); \
970 } \
971 NM_FAIL_ON((_ring)->ring_tail != (_kring)->ckr_rtail, \
972 SKYWALK_KILL_REASON_TAIL_MISMATCH); \
973} while (0)
974
975/*
976 * Validate parameters in the ring/kring on entry for *_rxsync().
977 * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
978 * in order to force a reinit.
979 */
980slot_idx_t
981kr_rxsync_prologue(struct kern_channel *ch, struct __kern_channel_ring *kring,
982 struct proc *p)
983{
984#pragma unused(ch)
985 struct __user_channel_ring *ring = kring->ckr_ring;
986 slot_idx_t ckr_khead, ckr_ktail;
987 slot_idx_t head;
988 uint32_t byte_count = 0;
989 uint64_t err_reason = 0;
990 int slot_count;
991
992 VERIFY(sk_is_sync_protected());
993 /* assert that this routine is only called for user facing rings */
994 ASSERT(!KR_KERNEL_ONLY(kring));
995 ASSERT(kring->ckr_usds != NULL);
996
997 /* read these once and use local copies */
998 ckr_khead = kring->ckr_khead;
999 ckr_ktail = kring->ckr_ktail;
1000
1001 SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1002 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1003 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1004 kring->ckr_rhead, kring->ckr_rtail,
1005 ring->ring_head, ring->ring_tail);
1006 /*
1007 * Before storing the new values, we should check they do not
1008 * move backwards. However:
1009 * - head is not an issue because the previous value is khead;
1010 * - cur could in principle go back, however it does not matter
1011 * because we are processing a brand new rxsync()
1012 */
1013 head = ring->ring_head; /* read only once */
1014
1015 _KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1016
1017 /* # of reclaimed slots */
1018 slot_count = head - kring->ckr_rhead;
1019 if (slot_count < 0) {
1020 slot_count += kring->ckr_num_slots;
1021 }
1022
1023 /*
1024 * Invoke nexus-specific RX prologue callback, which may detach
1025 * and free any consumed packets. Configured in na_kr_create().
1026 */
1027 if (kring->ckr_prologue != NULL && (kring->ckr_prologue(ch,
1028 kring, head, &byte_count, &err_reason, p) != 0)) {
1029 goto error;
1030 }
1031 /* update the user's view of slots & bytes transferred */
1032 kr_update_user_stats(kring, slot_count, byte_count);
1033
1034 /* update the kernel view of ring */
1035 kring->ckr_rhead = head;
1036 return head;
1037
1038error:
1039 SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
1040 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1041 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1042 CKRF_BITS, ckr_khead, ckr_ktail,
1043 kring->ckr_rhead, kring->ckr_rtail,
1044 ring->ring_head, ring->ring_tail);
1045
1046 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_RX_SYNC);
1047 return kring->ckr_num_slots;
1048}
1049
1050/*
1051 * Validate parameters on the ring/kring on entry for *_alloc_sync().
1052 * Returns ring->ring_head if ok, kring->ckr_num_slots on error,
1053 * in order to force a reinit.
1054 */
1055slot_idx_t
1056kr_alloc_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
1057{
1058 struct __user_channel_ring *ring = kring->ckr_ring;
1059 slot_idx_t ckr_khead, ckr_ktail;
1060 slot_idx_t head;
1061 uint64_t err_reason = 0;
1062
1063 VERIFY(sk_is_sync_protected());
1064
1065 /* read these once and use local copies */
1066 ckr_khead = kring->ckr_khead;
1067 ckr_ktail = kring->ckr_ktail;
1068 head = ring->ring_head;
1069
1070 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1071 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1072 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1073 kring->ckr_rhead, kring->ckr_rtail,
1074 head, ring->ring_tail);
1075 /*
1076 * Before storing the new values, we should check they do not
1077 * move backwards. However, head is not an issue because the
1078 * previous value is khead;
1079 */
1080 _KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1081
1082 /* update the kernel view of ring */
1083 kring->ckr_rhead = head;
1084 return head;
1085
1086error:
1087 SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
1088 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1089 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1090 CKRF_BITS, ckr_khead, ckr_ktail,
1091 kring->ckr_rhead, kring->ckr_rtail,
1092 ring->ring_head, ring->ring_tail);
1093
1094 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_ALLOC_SYNC);
1095 return kring->ckr_num_slots;
1096}
1097
1098/*
1099 * Nexus-specific kr_txsync_finalize() callback.
1100 */
1101void
1102kr_txfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1103 const slot_idx_t head, struct proc *p)
1104{
1105#pragma unused(ch)
1106 struct kern_pbufpool *pp = kring->ckr_pp;
1107 slot_idx_t slot_idx;
1108 uint32_t ph_cnt, i = 0;
1109 int32_t ph_needed;
1110 int err;
1111
1112 ASSERT(!(KRNA(kring)->na_flags & NAF_USER_PKT_POOL));
1113
1114 /* use khead value from pre-sync time */
1115 slot_idx = kring->ckr_khead_pre;
1116
1117 ph_needed = head - slot_idx;
1118 if (ph_needed < 0) {
1119 ph_needed += kring->ckr_num_slots;
1120 }
1121 if (ph_needed == 0) {
1122 return;
1123 }
1124
1125 ph_cnt = (uint32_t)ph_needed;
1126 err = kern_pbufpool_alloc_batch(pp, 1, kring->ckr_scratch, &ph_cnt);
1127 VERIFY(err == 0 && ph_cnt == (uint32_t)ph_needed);
1128
1129 /* recycle the transferred packets */
1130 while (slot_idx != head) {
1131 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1132 kern_packet_t ph;
1133
1134 if (KSD_VALID_METADATA(ksd)) {
1135 goto next_slot;
1136 }
1137
1138 ph = kring->ckr_scratch[i];
1139 ASSERT(ph != 0);
1140 kring->ckr_scratch[i] = 0;
1141 ++i;
1142
1143 /*
1144 * Since this packet is freshly allocated and we need
1145 * to have the flag set for the attach to succeed,
1146 * just set it here rather than calling
1147 * __packet_finalize().
1148 */
1149 SK_PTR_ADDR_KQUM(ph)->qum_qflags |= QUM_F_FINALIZED;
1150
1151 KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));
1152
1153 kr_externalize_metadata_internal(kring, pp->pp_max_frags,
1154 SK_PTR_ADDR_KQUM(ph), p);
1155next_slot:
1156 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1157 }
1158
1159 if (i != ph_cnt) {
1160 kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
1161 ph_cnt - i);
1162 }
1163}
1164
1165/*
1166 * Nexus-specific kr_txsync_finalize() callback - user packet pool variant.
1167 */
1168void
1169kr_txfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
1170 const slot_idx_t head, struct proc *p)
1171{
1172#pragma unused(ch, p)
1173 slot_idx_t slot_idx;
1174 uint32_t nfree = 0;
1175
1176 ASSERT(KRNA(kring)->na_flags & NAF_USER_PKT_POOL);
1177
1178 /* use khead value from pre-sync time */
1179 slot_idx = kring->ckr_khead_pre;
1180
1181 /* recycle the transferred packets */
1182 while (slot_idx != head) {
1183 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1184
1185 if (KSD_VALID_METADATA(ksd)) {
1186 /* detach and free the packet */
1187 struct __kern_quantum *kqum = ksd->sd_qum;
1188 (void) KR_SLOT_DETACH_METADATA(kring, ksd);
1189 ASSERT(nfree < kring->ckr_num_slots);
1190 kring->ckr_scratch[nfree++] = (uint64_t)kqum;
1191 }
1192
1193 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1194 }
1195
1196 if (__probable(nfree > 0)) {
1197 pp_free_packet_batch(kring->ckr_pp,
1198 &kring->ckr_scratch[0], nfree);
1199 }
1200}
1201
1202/*
1203 * Update kring and ring at the end of txsync.
1204 */
1205void
1206kr_txsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1207 struct proc *p)
1208{
1209 slot_idx_t ckr_khead, ckr_ktail;
1210 uint32_t slot_size;
1211 int32_t slot_diff;
1212
1213 VERIFY(sk_is_sync_protected());
1214 /* assert that this routine is only called for user facing rings */
1215 ASSERT(!KR_KERNEL_ONLY(kring));
1216
1217 /* read these once and use local copies */
1218 ckr_khead = kring->ckr_khead;
1219 ckr_ktail = kring->ckr_ktail;
1220
1221 /*
1222 * update userspace-facing channel statistics (# writable bytes/slots)
1223 *
1224 * Since the ring might be dynamically allocated, we can't rely on the
1225 * tail pointer to calculate free TX space (the tail might be sitting
1226 * at the edge of allocated ring space but be able to be pushed over
1227 * into unallocated ring space).
1228 *
1229 * Instead, calculate free TX space by looking at what slots are
1230 * available to the kernel for TX, and subtracting that from the total
1231 * number of possible slots. This is effectively what userspace can
1232 * write to.
1233 */
1234 slot_size = PP_BUF_SIZE_DEF(kring->ckr_pp);
1235 slot_diff = kring->ckr_rhead - ckr_khead;
1236 if (slot_diff < 0) {
1237 slot_diff += kring->ckr_num_slots;
1238 }
1239 slot_diff = kring->ckr_lim - slot_diff;
1240 kring->ckr_ready_slots = slot_diff;
1241 kring->ckr_ready_bytes = slot_diff * slot_size;
1242
1243 /*
1244 * Invoke nexus-specific TX finalize callback, which may recycle any
1245 * transferred packets and/or externalize new ones. Some nexus types don't
1246 * have any callback set. Configured in na_kr_create().
1247 */
1248 if (kring->ckr_finalize != NULL) {
1249 kring->ckr_finalize(ch, kring, ckr_khead, p);
1250 }
1251
1252 /* update ring tail/khead to what the kernel knows */
1253 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1254 kring->ckr_rtail = ckr_ktail;
1255 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1256
1257 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "%s(%d) kr \"%s\", kh %u kt %u | "
1258 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1259 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1260 kring->ckr_rhead, kring->ckr_rtail,
1261 kring->ckr_ring->ring_head,
1262 kring->ckr_ring->ring_tail);
1263}
1264
1265/*
1266 * Nexus-specific kr_rxsync_finalize() callback.
1267 */
1268void
1269kr_rxfinalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1270 const slot_idx_t tail, struct proc *p)
1271{
1272#pragma unused(ch)
1273 const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
1274 slot_idx_t slot_idx = kring->ckr_rtail;
1275 uint32_t byte_count = 0;
1276
1277 while (slot_idx != tail) {
1278 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1279 struct __kern_quantum *kqum = ksd->sd_qum;
1280
1281 /*
1282 * nexus provider should never leave an empty slot on rx ring.
1283 */
1284 VERIFY(kqum != NULL);
1285 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1286 ASSERT(!(KR_USD(kring, slot_idx)->sd_flags & ~SD_FLAGS_USER));
1287
1288 byte_count += kqum->qum_len;
1289 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1290 }
1291
1292 kring->ckr_ready_bytes += byte_count;
1293
1294 /* just recalculate slot count using pointer arithmetic */
1295 int32_t slot_diff = tail - kring->ckr_rhead;
1296 if (slot_diff < 0) {
1297 slot_diff += kring->ckr_num_slots;
1298 }
1299 kring->ckr_ready_slots = slot_diff;
1300
1301#if CONFIG_NEXUS_NETIF
1302 /*
1303 * If this is a channel opened directly to the netif nexus, provide
1304 * it feedback on the number of packets and bytes consumed. This
1305 * will drive the receive mitigation strategy.
1306 */
1307 if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
1308 slot_diff != 0 && byte_count != 0) {
1309 kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
1310 }
1311#endif /* CONFIG_NEXUS_NETIF */
1312}
1313
1314/*
1315 * Nexus-specific kr_rxsync_finalize() callback - user packet pool variant.
1316 */
1317void
1318kr_rxfinalize_upp(struct kern_channel *ch, struct __kern_channel_ring *kring,
1319 const slot_idx_t tail, struct proc *p)
1320{
1321 const uint32_t maxfrags = kring->ckr_pp->pp_max_frags;
1322 slot_idx_t slot_idx = kring->ckr_rtail;
1323 struct kern_pbufpool *pp = kring->ckr_pp;
1324 uint32_t byte_count = 0;
1325
1326 PP_LOCK(pp);
1327 while (slot_idx != tail) {
1328 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1329 struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
1330 struct __kern_quantum *kqum = ksd->sd_qum;
1331
1332 /*
1333 * nexus provider should never leave an empty slot on rx ring.
1334 */
1335 VERIFY(kqum != NULL);
1336 /*
1337 * The channel is operating in packet allocator
1338 * mode, so add packet to the allocated list.
1339 */
1340 pp_insert_upp_locked(pp, kqum, ch->ch_pid);
1341
1342 KSD_DETACH_METADATA(ksd);
1343 /* record the length so kr_rxsync_prologue can compute ckr_ready_bytes */
1344 USD_SET_LENGTH(usd, (uint16_t)kqum->qum_len);
1345
1346 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1347 ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
1348
1349 byte_count += kqum->qum_len;
1350 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1351 }
1352 PP_UNLOCK(pp);
1353
1354 kring->ckr_ready_bytes += byte_count;
1355
1356 /* just recalculate slot count using pointer arithmetic */
1357 int32_t slot_diff = tail - kring->ckr_rhead;
1358 if (slot_diff < 0) {
1359 slot_diff += kring->ckr_num_slots;
1360 }
1361 kring->ckr_ready_slots = slot_diff;
1362
1363#if CONFIG_NEXUS_NETIF
1364 /*
1365 * If this is a channel opened directly to the netif nexus, provide
1366 * it feedback on the number of packets and bytes consumed. This
1367 * will drive the receive mitigation strategy.
1368 */
1369 if (__improbable(kring->ckr_netif_mit_stats != NULL) &&
1370 slot_diff != 0 && byte_count != 0) {
1371 kring->ckr_netif_mit_stats(kring, slot_diff, byte_count);
1372 }
1373#endif /* CONFIG_NEXUS_NETIF */
1374}
1375
1376/*
1377 * Update kring and ring at the end of rxsync
1378 */
1379void
1380kr_rxsync_finalize(struct kern_channel *ch, struct __kern_channel_ring *kring,
1381 struct proc *p)
1382{
1383#pragma unused(ch, p)
1384 slot_idx_t ckr_khead, ckr_ktail;
1385
1386 VERIFY(sk_is_sync_protected());
1387 /* assert that this routine is only called for user facing rings */
1388 ASSERT(!KR_KERNEL_ONLY(kring));
1389 ASSERT(kring->ckr_usds != NULL);
1390
1391 /* read these once and use local copies */
1392 ckr_khead = kring->ckr_khead;
1393 ckr_ktail = kring->ckr_ktail;
1394
1395 /*
1396 * Invoke nexus-specific RX finalize callback; set in na_kr_create().
1397 */
1398 if (kring->ckr_finalize != NULL) {
1399 kring->ckr_finalize(ch, kring, ckr_ktail, p);
1400 }
1401
1402 /* update ring tail/khead to what the kernel knows */
1403 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1404 kring->ckr_rtail = ckr_ktail;
1405 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1406
1407 SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1408 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1409 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1410 kring->ckr_rhead, kring->ckr_rtail,
1411 kring->ckr_ring->ring_head,
1412 kring->ckr_ring->ring_tail);
1413}
1414
1415void
1416kr_alloc_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
1417{
1418#pragma unused(p)
1419 slot_idx_t ckr_khead, ckr_ktail;
1420
1421 VERIFY(sk_is_sync_protected());
1422 /* read these once and use local copies */
1423 ckr_khead = kring->ckr_khead;
1424 ckr_ktail = kring->ckr_ktail;
1425
1426 /* update ring tail/khead to what the kernel knows */
1427 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1428 kring->ckr_rtail = ckr_ktail;
1429 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1430 *(uint32_t *)(uintptr_t)&kring->ckr_ring->ring_alloc_ws =
1431 kring->ckr_alloc_ws;
1432
1433 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1434 "rh %u rt %u | h %u t %u | ws %u",
1435 sk_proc_name_address(p),
1436 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1437 kring->ckr_rhead, kring->ckr_rtail,
1438 kring->ckr_ring->ring_head,
1439 kring->ckr_ring->ring_tail, kring->ckr_alloc_ws);
1440}
1441
1442void
1443kr_free_sync_finalize(struct __kern_channel_ring *kring, struct proc *p)
1444{
1445#pragma unused(p)
1446 slot_idx_t ckr_khead, ckr_ktail;
1447
1448 VERIFY(sk_is_sync_protected());
1449 /* read these once and use local copies */
1450 ckr_khead = kring->ckr_khead;
1451 ckr_ktail = kring->ckr_ktail;
1452
1453 /* update ring tail/khead to what the kernel knows */
1454 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1455 kring->ckr_rtail = ckr_ktail;
1456 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1457
1458 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1459 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1460 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1461 kring->ckr_rhead, kring->ckr_rtail,
1462 kring->ckr_ring->ring_head,
1463 kring->ckr_ring->ring_tail);
1464}
1465
1466slot_idx_t
1467kr_event_sync_prologue(struct __kern_channel_ring *kring, struct proc *p)
1468{
1469 struct __user_channel_ring *ring = kring->ckr_ring;
1470 slot_idx_t ckr_khead, ckr_ktail;
1471 slot_idx_t head, slot_idx;
1472 uint64_t err_reason = 0;
1473
1474 ASSERT(kring->ckr_tx == NR_EV);
1475 VERIFY(sk_is_sync_protected());
1476
1477 /* read these once and use local copies */
1478 ckr_khead = kring->ckr_khead;
1479 ckr_ktail = kring->ckr_ktail;
1480 head = ring->ring_head;
1481
1482 SK_DF(SK_VERB_SYNC, "%s(%d) kr \"%s\", kh %u kt %u | "
1483 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1484 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1485 kring->ckr_rhead, kring->ckr_rtail,
1486 head, ring->ring_tail);
1487 /*
1488 * Before storing the new values, we should check they do not
1489 * move backwards. However, head is not an issue because the
1490 * previous value is khead;
1491 */
1492 _KR_RXRING_VALIDATE(kring, ring, ckr_khead, ckr_ktail, head);
1493
1494 /*
1495 * Iterating through the slots just read by user-space;
1496 * ckr_rhead -> ring_head
1497 */
1498 slot_idx = kring->ckr_rhead;
1499 while (slot_idx != head) {
1500 struct __kern_slot_desc *ksd = KR_KSD(kring, slot_idx);
1501 struct __user_slot_desc *usd = KR_USD(kring, slot_idx);
1502 /*
1503 * ensure that the user has detached packet from slot.
1504 */
1505 VERIFY(!KSD_VALID_METADATA(ksd));
1506 if (__improbable(SD_VALID_METADATA(usd))) {
1507 SK_ERR("%s(%d) kr \"%s\" (0x%llx) slot %u not "
1508 "detached md %u kh %u kt %u | rh %u rt %u |"
1509 " h %u t %u", sk_proc_name_address(p),
1510 sk_proc_pid(p), kring->ckr_name,
1511 SK_KVA(kring), slot_idx, usd->sd_md_idx,
1512 ckr_khead, ckr_ktail, kring->ckr_rhead,
1513 kring->ckr_rtail, ring->ring_head,
1514 ring->ring_tail);
1515 err_reason = SKYWALK_KILL_REASON_SLOT_NOT_DETACHED;
1516 goto error;
1517 }
1518 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1519 }
1520
1521 /* update the kernel view of ring */
1522 kring->ckr_rhead = head;
1523 return head;
1524
1525error:
1526 SK_ERR("%s(%d) kr \"%s\" (0x%llx) krflags 0x%b error: kh %u kt %u | "
1527 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1528 sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1529 CKRF_BITS, ckr_khead, ckr_ktail,
1530 kring->ckr_rhead, kring->ckr_rtail,
1531 ring->ring_head, ring->ring_tail);
1532
1533 skywalk_kill_process(p, err_reason | SKYWALK_KILL_REASON_EVENT_SYNC);
1534 return kring->ckr_num_slots;
1535}
1536
1537void
1538kr_event_sync_finalize(struct kern_channel *ch,
1539 struct __kern_channel_ring *kring, struct proc *p)
1540{
1541#pragma unused(ch)
1542 struct kern_pbufpool *pp = kring->ckr_pp;
1543 const uint32_t maxfrags = pp->pp_max_frags;
1544 slot_idx_t ckr_khead, ckr_ktail, ckr_rhead;
1545 struct __kern_slot_desc *ksd;
1546 struct __user_slot_desc *usd;
1547 struct __kern_quantum *kqum;
1548
1549 VERIFY(sk_is_sync_protected());
1550 /* assert that this routine is only called for user facing rings */
1551 ASSERT(!KR_KERNEL_ONLY(kring));
1552 ASSERT(kring->ckr_usds != NULL);
1553 ASSERT(kring->ckr_tx == NR_EV);
1554
1555 /* read these once and use local copies */
1556 ckr_khead = kring->ckr_khead;
1557 ckr_ktail = kring->ckr_ktail;
1558 ckr_rhead = kring->ckr_rhead;
1559
1560 slot_idx_t slot_idx = kring->ckr_rtail;
1561 PP_LOCK(pp);
1562 while (slot_idx != ckr_ktail) {
1563 ksd = KR_KSD(kring, slot_idx);
1564 usd = KR_USD(kring, slot_idx);
1565 kqum = ksd->sd_qum;
1566
1567 /*
1568 * Add packet to the allocated list of user packet pool.
1569 */
1570 pp_insert_upp_locked(pp, kqum, ch->ch_pid);
1571
1572 KSD_DETACH_METADATA(ksd);
1573 kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
1574 ASSERT((usd->sd_flags & ~SD_FLAGS_USER) == 0);
1575 slot_idx = SLOT_NEXT(slot_idx, kring->ckr_lim);
1576 }
1577 PP_UNLOCK(pp);
1578
1579 /* just recalculate slot count using pointer arithmetic */
1580 int32_t slot_diff = ckr_ktail - ckr_rhead;
1581 if (slot_diff < 0) {
1582 slot_diff += kring->ckr_num_slots;
1583 }
1584 kring->ckr_ready_slots = slot_diff;
1585
1586 /* update ring tail/khead to what the kernel knows */
1587 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_tail =
1588 kring->ckr_rtail = ckr_ktail;
1589 *(slot_idx_t *)(uintptr_t)&kring->ckr_ring->ring_khead = ckr_khead;
1590
1591 SK_DF(SK_VERB_SYNC | SK_VERB_RX, "%s(%d) kr \"%s\", kh %u kt %u | "
1592 "rh %u rt %u | h %u t %u", sk_proc_name_address(p),
1593 sk_proc_pid(p), kring->ckr_name, ckr_khead, ckr_ktail,
1594 kring->ckr_rhead, kring->ckr_rtail,
1595 kring->ckr_ring->ring_head,
1596 kring->ckr_ring->ring_tail);
1597}
1598#undef NM_FAIL_ON
1599
1600void
1601kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
1602 slot_idx_t index)
1603{
1604 const slot_idx_t lim = kring->ckr_lim;
1605 slot_idx_t next_index = SLOT_NEXT(index, lim);
1606
1607 kring->ckr_khead = next_index;
1608 /* reclaim */
1609 kring->ckr_ktail = index;
1610}
1611
1612/*
1613 * *************************************************************************
1614 * Checks on packet header offsets in kr_internalize_metadata
1615 * *************************************************************************
1616 *
1617 * +----------+------------------------------+----------------------------+
1618 * | | NEXUS_META_SUBTYPE_RAW | NEXUS_META_SUBTYPE_PAYLOAD |
1619 * |----------+------------------------------+----------------------------+
1620 * | buflet | (bdoff + len) <= dlim | (bdoff + len) <= dlim |
1621 * |----------+------------------------------+----------------------------+
1622 * | headroom | hr == bdoff && hr < bdlim | hr == 0 && bdoff == 0 |
1623 * |----------+------------------------------+----------------------------+
1624 * | l2_len | hr + l2_len < bdlim | l2_len == 0 |
1625 * |----------+------------------------------+----------------------------+
1626 */
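
/*
 * Illustrative sketch (hypothetical helper, not part of the build): the
 * NEXUS_META_SUBTYPE_RAW column of the table above, expressed as a single
 * predicate over the first buflet of a packet.
 *
 *	static boolean_t
 *	example_raw_offsets_ok(uint8_t hr, uint8_t l2_len, uint32_t bdoff,
 *	    uint32_t len, uint32_t bdlim, uint32_t dlim)
 *	{
 *		return ((bdoff + len) <= dlim &&	// buflet
 *		    hr == bdoff && hr < bdlim &&	// headroom
 *		    (hr + l2_len) < bdlim);		// l2_len
 *	}
 */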
1627int
1628kr_internalize_metadata(struct kern_channel *ch,
1629 struct __kern_channel_ring *kring, const uint32_t maxfrags,
1630 struct __kern_quantum *kqum, struct proc *p)
1631{
1632#pragma unused(kring, maxfrags, p)
1633 struct __user_buflet *ubuf, *pubuf; /* user buflet */
1634 struct __kern_buflet *kbuf, *pkbuf; /* kernel buflet */
1635 struct __user_quantum *uqum; /* user source */
1636 struct __user_packet *upkt;
1637 struct __kern_packet *kpkt;
1638 const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
1639 const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
1640 uint32_t len = 0, bdoff, bdlim;
1641 uint16_t bcnt = 0, bmax, i;
1642 boolean_t dropped;
1643 int err = 0;
1644
1645 /*
1646 * Verify that the quantum/packet belongs to the same pp as
1647 * the one used by the adapter, i.e. the packet must have
1648 * been allocated from the same pp and attached to the kring.
1649 */
1650 ASSERT(kqum->qum_pp == kring->ckr_pp);
1651
1652 _CASSERT(sizeof(uqum->qum_com) == sizeof(kqum->qum_com));
1653 _CASSERT(sizeof(upkt->pkt_com) == sizeof(kpkt->pkt_com));
1654 uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
1655 ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
1656 upkt = SK_PTR_ADDR_UPKT(uqum);
1657 kpkt = SK_PTR_ADDR_KPKT(kqum);
1658
1659 DTRACE_SKYWALK3(internalize, struct __kern_channel_ring *, kring,
1660 struct __kern_packet *, kpkt, struct __user_packet *, upkt);
1661 SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx uqum 0x%llx -> kqum 0x%llx",
1662 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
1663 SK_KVA(uqum), SK_KVA(kqum));
1664
1665 /* check if it's dropped before we internalize it */
1666 dropped = ((uqum->qum_qflags & QUM_F_DROPPED) != 0);
1667
1668 /*
1669 * Internalize common quantum metadata.
1670 *
1671 * For packet metadata, we trust the kernel copy for the buflet
1672 * count and limit; any mismatch on the user copy will cause
1673 * us to drop this packet.
1674 */
1675 _QUM_INTERNALIZE(uqum, kqum);
1676
1677 /* if marked as dropped, don't bother going further */
1678 if (__improbable(dropped)) {
1679 SK_ERR("%s(%d) kring 0x%llx dropped",
1680 sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring));
1681 err = ERANGE;
1682 goto done;
1683 }
1684
	switch (md_type) {
	case NEXUS_META_TYPE_PACKET:
		/*
		 * Internalize common packet metadata.
		 */
		_PKT_INTERNALIZE(upkt, kpkt);

		switch (md_subtype) {
		case NEXUS_META_SUBTYPE_PAYLOAD:
			/* sanitize link layer fields for payload mode */
			kpkt->pkt_link_flags = 0;
			break;
		default:
			break;
		}

		if (__probable(ch != NULL)) {
			_UUID_COPY(kpkt->pkt_flowsrc_id,
			    ch->ch_info->cinfo_ch_id);
		}

		bcnt = upkt->pkt_bufs_cnt;
		bmax = kpkt->pkt_bufs_max;
		ASSERT(bmax == maxfrags);
		if (__improbable((bcnt == 0) || (bcnt > bmax) ||
		    (upkt->pkt_bufs_max != bmax))) {
			SK_ERR("%s(%d) kring 0x%llx bad bufcnt %d, %d, %d",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    SK_KVA(kring), bcnt, bmax, upkt->pkt_bufs_max);
			err = ERANGE;
			goto done;
		}
		break;

	case NEXUS_META_TYPE_QUANTUM:
		ASSERT(maxfrags == 1);
		bcnt = bmax = 1;
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(bcnt != 0);
	ubuf = pubuf = NULL;
	kbuf = pkbuf = NULL;

	/*
	 * Validate and internalize buflets.
	 */
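	/*
	 * Walk the kernel buflet chain with PKT_GET_NEXT_BUFLET and pair
	 * each kernel buflet with its user counterpart (the first buflet
	 * is embedded in the quantum, the rest are external buflets).
	 */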
	for (i = 0; i < bcnt; i++) {
		_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
		_CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
		_CASSERT(offsetof(struct __kern_quantum, qum_com) == 0);
		PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
		ASSERT(kbuf != NULL);
		if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
			ubuf = __DECONST(struct __user_buflet *,
			    ((struct __kern_buflet_ext *)kbuf)->kbe_buf_user);
		} else {
			ASSERT(i == 0);
			ubuf = __DECONST(struct __user_buflet *,
			    &uqum->qum_buf[0]);
		}
		ASSERT(ubuf != NULL);
		ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
		ASSERT(kbuf->buf_dlim == _BUF_DLIM(kbuf, kqum->qum_pp));
		ASSERT(kbuf->buf_addr != 0);
		/*
		 * For now, user-facing pool does not support shared
		 * buffer, since otherwise the ubuf and kbuf buffer
		 * indices would not match. Assert this is the case.
		 */
		ASSERT(kbuf->buf_addr == (mach_vm_address_t)kbuf->buf_objaddr);

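		/*
		 * Only the data length and offset are taken from the user
		 * buflet; the buffer address, index and limit always come
		 * from the kernel copy.
		 */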
		kbuf->buf_dlen = ubuf->buf_dlen;
		kbuf->buf_doff = ubuf->buf_doff;

		/*
		 * Verify that the kernel and user buflets refer to the same
		 * object index, and that the buflet data offset and length
		 * are within range.
		 */
		if (__improbable(!BUF_IN_RANGE(kbuf) ||
		    ubuf->buf_idx != kbuf->buf_idx)) {
			kbuf->buf_dlen = kbuf->buf_doff = 0;
			SK_ERR("%s(%d) kring 0x%llx bad bufidx 0x%x, 0x%x",
			    sk_proc_name_address(p), sk_proc_pid(p),
			    SK_KVA(kring), kbuf->buf_idx, ubuf->buf_idx);
			err = ERANGE;
			goto done;
		}

		/* save data offset from the first buflet */
		if (pkbuf == NULL) {
			bdoff = kbuf->buf_doff;
		}

		/* all good to go */
		len += kbuf->buf_dlen;
		pubuf = ubuf;
		pkbuf = kbuf;
	}

	_CASSERT(offsetof(struct __kern_packet, pkt_length) ==
	    offsetof(struct __kern_packet, pkt_qum.qum_len));
	if (__improbable(kpkt->pkt_length != len)) {
		SK_ERR("%s(%d) kring 0x%llx bad pktlen %d, %d",
		    sk_proc_name_address(p), sk_proc_pid(p),
		    SK_KVA(kring), kpkt->pkt_length, len);
		err = ERANGE;
		goto done;
	}

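	/*
	 * With the aggregate length verified, validate the link-layer
	 * fields against the per-subtype rules summarized in the table
	 * above this function.
	 */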
	if ((err == 0) && (md_type == NEXUS_META_TYPE_PACKET)) {
		bdlim = PP_BUF_SIZE_DEF(kqum->qum_pp);
		switch (md_subtype) {
		case NEXUS_META_SUBTYPE_RAW:
			/*
			 * For a raw packet from user space we need to
			 * validate that headroom is sane and is in the
			 * first buflet.
			 */
			if (__improbable(kpkt->pkt_headroom != bdoff)) {
				SK_ERR("%s(%d) kring 0x%llx bad headroom %d, %d",
				    sk_proc_name_address(p), sk_proc_pid(p),
				    SK_KVA(kring), kpkt->pkt_headroom, bdoff);
				err = ERANGE;
				goto done;
			}
			if (__improbable(kpkt->pkt_headroom +
			    kpkt->pkt_l2_len >= bdlim)) {
				SK_ERR("%s(%d) kring 0x%llx bad headroom l2len %d, %d",
				    sk_proc_name_address(p), sk_proc_pid(p),
				    SK_KVA(kring), kpkt->pkt_l2_len, bdlim);
				err = ERANGE;
				goto done;
			}
			break;
		case NEXUS_META_SUBTYPE_PAYLOAD:
			/*
			 * For a payload packet from user space we need
			 * to validate that payload starts from 0 and L2
			 * length is 0.
			 */
			if (__improbable((kpkt->pkt_headroom != 0) ||
			    (kpkt->pkt_l2_len != 0))) {
				SK_ERR("%s(%d) kring 0x%llx bad headroom "
				    "payload subtype %d headroom %d l2len %d",
				    sk_proc_name_address(p), sk_proc_pid(p),
				    SK_KVA(kring), SK_PTR_SUBTYPE(kpkt),
				    kpkt->pkt_headroom, kpkt->pkt_l2_len);
				err = ERANGE;
				goto done;
			}
			break;
		default:
			VERIFY(0);
			/* NOTREACHED */
			__builtin_unreachable();
		}

		/* validate checksum offload properties */
		if (__probable(PACKET_HAS_PARTIAL_CHECKSUM(kpkt))) {
			uint16_t start = kpkt->pkt_csum_tx_start_off;
			uint16_t stuff = kpkt->pkt_csum_tx_stuff_off;
			if (__improbable(start > stuff ||
			    start > kpkt->pkt_length ||
			    (stuff + sizeof(uint16_t)) > kpkt->pkt_length)) {
				SK_ERR("%s(%d) flags 0x%x start %u stuff %u "
				    "len %u", sk_proc_name_address(p),
				    sk_proc_pid(p), kpkt->pkt_csum_flags,
				    start, stuff, kpkt->pkt_length);
				err = ERANGE;
				goto done;
			}
		} else {
			kpkt->pkt_csum_tx_start_off = 0;
			kpkt->pkt_csum_tx_stuff_off = 0;
		}
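		/*
		 * Commit the validated buflet count to the kernel packet;
		 * the field is declared const, hence the deconst write.
		 */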
		*__DECONST(uint16_t *, &kpkt->pkt_bufs_cnt) = bcnt;
	}

done:
	if (__probable(err == 0)) {
		kqum->qum_len = len;
		kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_FINALIZED);
	} else {
		kqum->qum_len = 0;
		kqum->qum_qflags |= (QUM_F_INTERNALIZED | QUM_F_DROPPED);
	}
	return err;
}
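
/*
 * Illustrative sketch only (not part of the Skywalk sources): a TX-side
 * caller would typically internalize every user-visible quantum before
 * handing it to the nexus, skipping any quantum that fails validation.
 * The iteration helper named below is hypothetical; only
 * kr_internalize_metadata() and its flag semantics come from this file.
 *
 *	struct __kern_quantum *kqum;
 *	int err;
 *
 *	while ((kqum = hypothetical_next_tx_kqum(kring)) != NULL) {
 *		err = kr_internalize_metadata(ch, kring, maxfrags, kqum, p);
 *		if (__improbable(err != 0)) {
 *			continue;
 *		}
 *	}
 *
 * On failure the quantum is left marked QUM_F_DROPPED with qum_len 0; on
 * success it is QUM_F_INTERNALIZED | QUM_F_FINALIZED and safe to pass on.
 */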

__attribute__((always_inline))
static inline void
kr_externalize_metadata_internal(struct __kern_channel_ring *kring,
    const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
{
#pragma unused(kring, maxfrags, p)
	struct __kern_buflet *kbuf, *pkbuf;     /* kernel buflet */
	struct __user_buflet *ubuf, *pubuf;     /* user buflet */
	struct __user_quantum *uqum;            /* user destination */
	struct __user_packet *upkt;
	struct __kern_packet *kpkt;
	const nexus_meta_type_t md_type = METADATA_TYPE(kqum);
	const nexus_meta_subtype_t md_subtype = METADATA_SUBTYPE(kqum);
	uint32_t len = 0;
	uint16_t bcnt = 0, bmax, i;

	/*
	 * Verify that the quantum/packet belongs to the same pp as
	 * the one used by the adapter, i.e. the packet must have
	 * been allocated from the same pp and attached to the kring.
	 */
	ASSERT(kqum->qum_pp == kring->ckr_pp);
	ASSERT(kqum->qum_qflags & (QUM_F_FINALIZED | QUM_F_INTERNALIZED));

	_CASSERT(sizeof(kpkt->pkt_com) == sizeof(upkt->pkt_com));
	_CASSERT(sizeof(kqum->qum_com) == sizeof(uqum->qum_com));
	uqum = __DECONST(struct __user_quantum *, kqum->qum_user);
	ASSERT(!(kqum->qum_qflags & QUM_F_KERNEL_ONLY) && uqum != NULL);
	upkt = SK_PTR_ADDR_UPKT(uqum);
	kpkt = SK_PTR_ADDR_KPKT(kqum);

	DTRACE_SKYWALK3(externalize, struct __kern_channel_ring *, kring,
	    struct __kern_packet *, kpkt, struct __user_packet *, upkt);
	SK_DF(SK_VERB_MEM, "%s(%d) kring 0x%llx kqum 0x%llx -> uqum 0x%llx",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(kring),
	    SK_KVA(kqum), SK_KVA(uqum));

	/*
	 * Externalize common quantum metadata.
	 */
	_QUM_EXTERNALIZE(kqum, uqum);

	switch (md_type) {
	case NEXUS_META_TYPE_PACKET: {
		bcnt = kpkt->pkt_bufs_cnt;
		bmax = kpkt->pkt_bufs_max;
		ASSERT(bmax == maxfrags);
		ASSERT(bcnt <= bmax);
		/*
		 * Externalize common packet metadata.
		 */
		_PKT_EXTERNALIZE(kpkt, upkt);

		/* sanitize buflet count and limit (deconst) */
		_CASSERT(sizeof(upkt->pkt_bufs_max) == sizeof(uint16_t));
		_CASSERT(sizeof(upkt->pkt_bufs_cnt) == sizeof(uint16_t));
		*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_max = bmax;
		*(uint16_t *)(uintptr_t)&upkt->pkt_bufs_cnt = bcnt;

		switch (md_subtype) {
		case NEXUS_META_SUBTYPE_PAYLOAD:
			/* sanitize link layer fields for payload mode */
			upkt->pkt_headroom = 0;
			upkt->pkt_link_flags = 0;
			break;
		default:
			break;
		}
		break;
	}

	case NEXUS_META_TYPE_QUANTUM:
		ASSERT(maxfrags == 1);
		bcnt = bmax = 1;
		break;

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	ASSERT(bcnt != 0);
	/*
	 * Special handling to externalize an empty packet buflet: if the
	 * embedded kernel buflet has no buffer attached, initialize the
	 * corresponding user buflet accordingly.
	 */
	kbuf = &kpkt->pkt_qum.qum_buf[0];
	if (kbuf->buf_addr == 0) {
		ubuf = __DECONST(struct __user_buflet *,
		    &kpkt->pkt_qum.qum_user->qum_buf[0]);
		UBUF_INIT(kbuf, ubuf);
	}

	kbuf = pkbuf = NULL;
	ubuf = pubuf = NULL;
	/*
	 * Externalize buflets.
	 */
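	/*
	 * As in the internalize path, walk the buflet chain and refresh
	 * the user-visible view of each buflet from its kernel counterpart
	 * via KBUF_EXTERNALIZE, accumulating the total data length.
	 */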
	for (i = 0; i < bcnt; i++) {
		_CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
		PKT_GET_NEXT_BUFLET(kpkt, bcnt, pkbuf, kbuf);
		ASSERT(kbuf != NULL);

		if (kbuf->buf_flag & BUFLET_FLAG_EXTERNAL) {
			ubuf = __DECONST(struct __user_buflet *,
			    ((struct __kern_buflet_ext *)kbuf)->kbe_buf_user);
		} else {
			ASSERT(i == 0);
			ubuf = __DECONST(struct __user_buflet *,
			    &kpkt->pkt_qum.qum_user->qum_buf[0]);
		}

		ASSERT(ubuf != NULL);
		ASSERT((kbuf != pkbuf) && (ubuf != pubuf));
		ASSERT(BUF_IN_RANGE(kbuf));
		KBUF_EXTERNALIZE(kbuf, ubuf, kqum->qum_pp);

		/* all good to go */
		len += kbuf->buf_dlen;
		pkbuf = kbuf;
		pubuf = ubuf;
	}

	uqum->qum_len = len;
	uqum->qum_qflags |= QUM_F_FINALIZED;

	/*
	 * XXX: adi@apple.com -- do this during reclaim instead?
	 */
	kqum->qum_qflags &= ~QUM_F_INTERNALIZED;
}

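/*
 * Out-of-line wrapper around the always-inline worker above, providing
 * the externally visible entry point for callers outside this file.
 */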
void
kr_externalize_metadata(struct __kern_channel_ring *kring,
    const uint32_t maxfrags, struct __kern_quantum *kqum, struct proc *p)
{
	kr_externalize_metadata_internal(kring, maxfrags, kqum, p);
}