/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/kdebug.h>
#include <skywalk/os_skywalk_private.h>
#include <net/ntstat.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/upipe/nx_user_pipe.h>

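/*
 * Ring-state predicates used by kern_channel_get_next_slot(): a TX ring has
 * no more slots to hand out once the candidate index catches up with
 * ckr_rhead, and an RX ring is full once advancing the candidate index
 * (modulo ckr_lim) would collide with ckr_khead.
 */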
#define KRING_EMPTY_TX(_kring, _index) \
	((_kring)->ckr_rhead == (_index))

#define KRING_FULL_RX(_kring, _index) \
	((_kring)->ckr_khead == SLOT_NEXT((_index), (_kring)->ckr_lim))

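/*
 * kern_channel_notify() posts a notification on the given ring by invoking
 * its ckr_na_notify callback; rings that are in drop mode report ENXIO
 * instead.  kern_channel_reclaim() simply forwards to kr_reclaim() to give
 * completed slots back to the ring.
 */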
errno_t
kern_channel_notify(const kern_channel_ring_t kring, uint32_t flags)
{
#pragma unused(flags)
	if (__improbable(KR_DROP(kring))) {
		return ENXIO;
	}

	return kring->ckr_na_notify(kring, kernproc, 0);
}

uint32_t
kern_channel_reclaim(const kern_channel_ring_t kring)
{
	return kr_reclaim(kring);
}

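/*
 * Available-slot accounting.  For TX, the writable span runs from the given
 * index up to ckr_rhead (wrapping around ckr_num_slots).  For RX, the busy
 * span runs from ckr_khead up to the given index, and the available count is
 * ckr_lim minus that busy span.  kern_channel_available_slot_count() picks
 * the appropriate starting index (ckr_khead for TX, ckr_ktail for RX).
 */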
static inline uint32_t
_kern_channel_available_slot_count_tx(const kern_channel_ring_t kring,
    slot_idx_t index)
{
	ASSERT(kring->ckr_tx == NR_TX);

	if (kring->ckr_rhead < index) {
		return kring->ckr_num_slots + kring->ckr_rhead - index;
	}

	return kring->ckr_rhead - index;
}

static inline uint32_t
_kern_channel_available_slot_count_rx(const kern_channel_ring_t kring,
    slot_idx_t index)
{
	uint32_t busy;
	slot_idx_t lim = kring->ckr_lim;

	ASSERT(kring->ckr_tx == NR_RX);

	if (index < kring->ckr_khead) {
		busy = kring->ckr_num_slots + index - kring->ckr_khead;
	} else {
		busy = index - kring->ckr_khead;
	}

	ASSERT(lim >= busy);
	return lim - busy;
}

uint32_t
kern_channel_available_slot_count(const kern_channel_ring_t kring)
{
	if (kring->ckr_tx == NR_TX) {
		return _kern_channel_available_slot_count_tx(kring,
		    kring->ckr_khead);
	} else {
		return _kern_channel_available_slot_count_rx(kring,
		    kring->ckr_ktail);
	}
}

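/*
 * kern_channel_get_next_slot() returns the slot following slot0 (or the
 * first available slot when slot0 is NULL), and NULL once the TX ring runs
 * empty or the RX ring fills up.  It must be called under the sync
 * protection that sk_is_sync_protected() verifies, i.e. from a sync context.
 * A minimal sketch of the usual walk, assuming a hypothetical helper
 * fill_one_slot() that attaches a packet to the slot and reports whether it
 * did so:
 *
 *	kern_channel_slot_t last = NULL;
 *	kern_channel_slot_t slot = kern_channel_get_next_slot(kring, NULL, NULL);
 *	while (slot != NULL && fill_one_slot(kring, slot)) {
 *		last = slot;
 *		slot = kern_channel_get_next_slot(kring, slot, NULL);
 *	}
 *	if (last != NULL) {
 *		kern_channel_advance_slot(kring, last);
 *	}
 */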
kern_channel_slot_t
kern_channel_get_next_slot(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot0, struct kern_slot_prop *prop)
{
	kern_channel_slot_t slot;
	slot_idx_t slot_idx;

	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());

	if (__improbable(slot0 == NULL)) {
		if (kring->ckr_tx == NR_TX) {
			slot_idx = kring->ckr_khead;
		} else {
			slot_idx = kring->ckr_ktail;
		}
	} else {
		slot_idx = SLOT_NEXT(KR_SLOT_INDEX(kring, slot0),
		    kring->ckr_lim);
	}

	ASSERT(slot_idx < kring->ckr_num_slots);

	if (kring->ckr_tx == NR_TX) {
		if (__improbable(KRING_EMPTY_TX(kring, slot_idx))) {
			SK_DF(SK_VERB_SYNC | SK_VERB_TX,
			    "EMPTY_TX: na \"%s\" kr \"%s\" "
			    "i %u (kc %u kt %u kl %u | rh %u rt %u)",
			    KRNA(kring)->na_name,
			    kring->ckr_name, slot_idx, kring->ckr_khead,
			    kring->ckr_ktail, kring->ckr_klease,
			    kring->ckr_rhead, kring->ckr_rtail);
			slot = NULL;
		} else {
			slot = &kring->ckr_ksds[slot_idx];
		}
	} else {
		if (__improbable(KRING_FULL_RX(kring, slot_idx))) {
			SK_DF(SK_VERB_SYNC | SK_VERB_RX,
			    "FULL_RX: na \"%s\" kr \"%s\" "
			    "i %u (kc %u kt %u kl %u | rh %u rt %u)",
			    KRNA(kring)->na_name,
			    kring->ckr_name, slot_idx, kring->ckr_khead,
			    kring->ckr_ktail, kring->ckr_klease,
			    kring->ckr_rhead, kring->ckr_rtail);
			slot = NULL;
		} else {
			slot = &kring->ckr_ksds[slot_idx];
		}
	}

	if (prop != NULL) {
		bzero(prop, sizeof(*prop));
	}

	return slot;
}

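/*
 * Advancing a slot commits the caller's progress on the ring.  On TX the
 * index is handed to kr_txkring_reclaim_and_refill() to reclaim completed
 * slots and refill the ring; on RX (and event rings) ckr_ktail is moved just
 * past the given slot so the new data becomes visible to the channel.
 */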
static inline void
_kern_channel_advance_slot_tx(const kern_channel_ring_t kring, slot_idx_t index)
{
	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());
	kr_txkring_reclaim_and_refill(kring, index);
}

static inline void
_kern_channel_advance_slot_rx(const kern_channel_ring_t kring, slot_idx_t index)
{
	ASSERT(kring->ckr_tx == NR_RX || kring->ckr_tx == NR_EV);
	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());

	kring->ckr_ktail = SLOT_NEXT(index, kring->ckr_lim);
}

void
kern_channel_advance_slot(const kern_channel_ring_t kring,
    kern_channel_slot_t slot)
{
	slot_idx_t index = KR_SLOT_INDEX(kring, slot);
	ASSERT(index < kring->ckr_num_slots);

	if (kring->ckr_tx == NR_TX) {
		_kern_channel_advance_slot_tx(kring, index);
	} else {
		_kern_channel_advance_slot_rx(kring, index);
	}
}

void *
kern_channel_get_context(const kern_channel_t ch)
{
	return ch->ch_ctx;
}

void *
kern_channel_ring_get_context(const kern_channel_ring_t kring)
{
	return kring->ckr_ctx;
}

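/*
 * kern_channel_ring_get_container() hands back the ring's scratch array
 * (ckr_scratch); the count, when requested, is the ring's slot count as
 * reported by na_get_nslots().  Like the slot accessors above, it may only
 * be called under sync protection.
 */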
errno_t
kern_channel_ring_get_container(const kern_channel_ring_t kring,
    kern_packet_t **array, uint32_t *count)
{
	/* Ensure this is only done by the thread doing a sync syscall */
	VERIFY(sk_is_sync_protected());

	if (array == NULL) {
		return EINVAL;
	}

	*array = kring->ckr_scratch;
	if (count != NULL) {
		*count = na_get_nslots(kring->ckr_na, kring->ckr_tx);
	}

	return 0;
}

void *
kern_channel_slot_get_context(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot)
{
	slot_idx_t i = KR_SLOT_INDEX(kring, slot);
	void *slot_ctx = NULL;

	if (kring->ckr_slot_ctxs != NULL) {
		slot_ctx = (void *)(kring->ckr_slot_ctxs[i].slot_ctx_arg);
	}

	return slot_ctx;
}

void
kern_channel_increment_ring_stats(kern_channel_ring_t kring,
    struct kern_channel_ring_stat_increment *stats)
{
	kr_update_stats(kring, stats->kcrsi_slots_transferred,
	    stats->kcrsi_bytes_transferred);
}

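/*
 * kern_channel_increment_ring_net_stats() folds the same per-ring counters
 * into the interface's statistics (output counters for TX rings, input
 * counters otherwise), pokes the data threshold notification if one is
 * armed, and then updates the ring stats as above.
 */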
void
kern_channel_increment_ring_net_stats(kern_channel_ring_t kring,
    struct ifnet *ifp, struct kern_channel_ring_stat_increment *stats)
{
	if (kring->ckr_tx == NR_TX) {
		os_atomic_add(&ifp->if_data.ifi_opackets, stats->kcrsi_slots_transferred, relaxed);
		os_atomic_add(&ifp->if_data.ifi_obytes, stats->kcrsi_bytes_transferred, relaxed);
	} else {
		os_atomic_add(&ifp->if_data.ifi_ipackets, stats->kcrsi_slots_transferred, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, stats->kcrsi_bytes_transferred, relaxed);
	}

	if (ifp->if_data_threshold != 0) {
		ifnet_notify_data_threshold(ifp);
	}

	kr_update_stats(kring, stats->kcrsi_slots_transferred,
	    stats->kcrsi_bytes_transferred);
}

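/*
 * Slot/packet binding.  kern_channel_slot_get_packet() returns the packet
 * currently tagged on a slot descriptor (or 0 if none is attached or it has
 * been marked dropped), while the attach/detach variants bind or unbind a
 * packet's kernel quantum to the slot.  DEVELOPMENT/DEBUG builds re-derive
 * the descriptor from the ring to catch callers passing a bogus slot.
 */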
kern_packet_t
kern_channel_slot_get_packet(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot)
{
#if (DEVELOPMENT || DEBUG)
	/* catch invalid slot */
	slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
#else
#pragma unused(kring)
	struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
#endif /* (DEVELOPMENT || DEBUG) */
	struct __kern_quantum *kqum = ksd->sd_qum;

	if (__improbable(kqum == NULL ||
	    (kqum->qum_qflags & QUM_F_DROPPED) != 0)) {
		return 0;
	}

	return SD_GET_TAGGED_METADATA(ksd);
}

errno_t
kern_channel_slot_attach_packet(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot, kern_packet_t ph)
{
#if (DEVELOPMENT || DEBUG)
	/* catch invalid slot */
	slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
#else
#pragma unused(kring)
	struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
#endif /* (DEVELOPMENT || DEBUG) */

	return KR_SLOT_ATTACH_METADATA(kring, ksd, SK_PTR_ADDR_KQUM(ph));
}

errno_t
kern_channel_slot_detach_packet(const kern_channel_ring_t kring,
    const kern_channel_slot_t slot, kern_packet_t ph)
{
#pragma unused(ph)
#if (DEVELOPMENT || DEBUG)
	/* catch invalid slot */
	slot_idx_t idx = KR_SLOT_INDEX(kring, slot);
	struct __kern_slot_desc *ksd = KR_KSD(kring, idx);
#else
	struct __kern_slot_desc *ksd = SLOT_DESC_KSD(slot);
#endif /* (DEVELOPMENT || DEBUG) */

	ASSERT(SK_PTR_ADDR_KQUM(ph) ==
	    SK_PTR_ADDR_KQUM(SD_GET_TAGGED_METADATA(ksd)));
	(void) KR_SLOT_DETACH_METADATA(kring, ksd);

	return 0;
}

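/*
 * TX refill.  kern_channel_tx_refill_common() drives a TX sync on a netif
 * device ring on behalf of a channel opened directly to the netif: it
 * validates the limits and interface state, bails out quietly if another
 * dequeue already owns the ring (kr_enter fails), and otherwise issues
 * ckr_na_sync under sync protection.  The exported wrappers below route the
 * request to netif_ring_tx_refill() instead when the adapter is owned by a
 * flowswitch, and differ only in whether kr_enter() may block.
 */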
static errno_t
kern_channel_tx_refill_common(const kern_channel_ring_t hw_kring,
    uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
    boolean_t *pkts_pending, boolean_t canblock)
{
#pragma unused(tx_doorbell_ctxt)
	struct nexus_adapter *hwna;
	struct ifnet *ifp;
	sk_protect_t protect;
	errno_t rc = 0;
	errno_t sync_err = 0;

	KDBG((SK_KTRACE_CHANNEL_TX_REFILL | DBG_FUNC_START), SK_KVA(hw_kring));

	VERIFY(hw_kring != NULL);
	hwna = KRNA(hw_kring);
	ifp = hwna->na_ifp;

	ASSERT(hwna->na_type == NA_NETIF_DEV);
	ASSERT(hw_kring->ckr_tx == NR_TX);
	*pkts_pending = FALSE;

	if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
		SK_ERR("invalid limits plim %d, blim %d",
		    pkt_limit, byte_limit);
		rc = EINVAL;
		goto out;
	}

	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	/*
	 * if the ring is busy, it means another dequeue is in
	 * progress, so ignore this request and return success.
	 */
	if (kr_enter(hw_kring, canblock) != 0) {
		rc = 0;
		goto out;
	}

	if (__improbable(KR_DROP(hw_kring) ||
	    !NA_IS_ACTIVE(hw_kring->ckr_na))) {
		kr_exit(hw_kring);
		SK_ERR("hw-kr 0x%llx stopped", SK_KVA(hw_kring));
		rc = ENXIO;
		goto out;
	}

	/*
	 * Unlikely to get here, unless a channel is opened by
	 * a user process directly to the netif. Issue a TX sync
	 * on the netif device TX ring.
	 */
	protect = sk_sync_protect();
	sync_err = hw_kring->ckr_na_sync(hw_kring, kernproc,
	    NA_SYNCF_NETIF);
	sk_sync_unprotect(protect);
	kr_exit(hw_kring);

	if (rc == 0) {
		rc = sync_err;
	}

out:
	KDBG((SK_KTRACE_CHANNEL_TX_REFILL | DBG_FUNC_END), SK_KVA(hw_kring),
	    rc, 0, 0);

	return rc;
}

errno_t
kern_channel_tx_refill(const kern_channel_ring_t hw_kring,
    uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
    boolean_t *pkts_pending)
{
	if (NA_OWNED_BY_FSW(hw_kring->ckr_na)) {
		return netif_ring_tx_refill(hw_kring, pkt_limit,
		    byte_limit, tx_doorbell_ctxt, pkts_pending, FALSE);
	} else {
		return kern_channel_tx_refill_common(hw_kring, pkt_limit,
		    byte_limit, tx_doorbell_ctxt, pkts_pending, FALSE);
	}
}

errno_t
kern_channel_tx_refill_canblock(const kern_channel_ring_t hw_kring,
    uint32_t pkt_limit, uint32_t byte_limit, boolean_t tx_doorbell_ctxt,
    boolean_t *pkts_pending)
{
	if (NA_OWNED_BY_FSW(hw_kring->ckr_na)) {
		return netif_ring_tx_refill(hw_kring, pkt_limit,
		    byte_limit, tx_doorbell_ctxt, pkts_pending, TRUE);
	} else {
		return kern_channel_tx_refill_common(hw_kring, pkt_limit,
		    byte_limit, tx_doorbell_ctxt, pkts_pending, TRUE);
	}
}

errno_t
kern_channel_get_service_class(const kern_channel_ring_t kring,
    kern_packet_svc_class_t *svc)
{
	if ((KRNA(kring)->na_type != NA_NETIF_DEV) ||
	    (kring->ckr_tx == NR_RX) || (kring->ckr_svc == KPKT_SC_UNSPEC)) {
		return ENOTSUP;
	}
	*svc = kring->ckr_svc;
	return 0;
}

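/*
 * Flow advisory clear.  Given a flow-control entry, locate the channel on
 * the flowswitch that owns the matching channel token and, if the flow
 * advisory entry can be cleared, wake the channel up by posting a flow
 * advisory kevent on its first TX ring.  Everything runs under SK_LOCK so
 * the channel list and adapters stay stable while we walk them.
 */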
void
kern_channel_flowadv_clear(struct flowadv_fcentry *fce)
{
	const flowadv_token_t ch_token = fce->fce_flowsrc_token;
	const flowadv_token_t flow_token = fce->fce_flowid;
	const flowadv_idx_t flow_fidx = fce->fce_flowsrc_fidx;
	struct ifnet *ifp = fce->fce_ifp;
	struct nexus_adapter *hwna;
	struct kern_nexus *fsw_nx;
	struct kern_channel *ch = NULL;
	struct nx_flowswitch *fsw;

	_CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token));

	SK_LOCK();
	if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) {
		goto done;
	}

	hwna = &ifp->if_na->nifna_up;
	VERIFY((hwna->na_type == NA_NETIF_DEV) ||
	    (hwna->na_type == NA_NETIF_COMPAT_DEV));

	if (!NA_IS_ACTIVE(hwna) || (fsw = fsw_ifp_to_fsw(ifp)) == NULL) {
		goto done;
	}

	fsw_nx = fsw->fsw_nx;
	VERIFY(fsw_nx != NULL);

	/* find the channel */
	STAILQ_FOREACH(ch, &fsw_nx->nx_ch_head, ch_link) {
		if (ch_token == ch->ch_info->cinfo_ch_token) {
			break;
		}
	}

	if (ch != NULL) {
		if (ch->ch_na != NULL &&
		    na_flowadv_clear(ch, flow_fidx, flow_token)) {
			/* trigger flow advisory kevent */
			na_flowadv_event(
			    &ch->ch_na->na_tx_rings[ch->ch_first[NR_TX]]);
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) notified of flow update",
			    ch->ch_name, ch->ch_pid);
		} else if (ch->ch_na == NULL) {
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) is closing (flow update ignored)",
			    ch->ch_name, ch->ch_pid);
		}
	} else {
		SK_ERR("channel token 0x%x fidx %u on %s not found",
		    ch_token, flow_fidx, ifp->if_xname);
	}
done:
	SK_UNLOCK();
}

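/*
 * Same lookup as kern_channel_flowadv_clear(), but instead of clearing the
 * advisory entry this reports a congestion-experienced (CE) event count,
 * along with the total packet count, against the flow's advisory entry.
 */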
void
kern_channel_flowadv_report_ce_event(struct flowadv_fcentry *fce,
    uint32_t ce_cnt, uint32_t total_pkt_cnt)
{
	const flowadv_token_t ch_token = fce->fce_flowsrc_token;
	const flowadv_token_t flow_token = fce->fce_flowid;
	const flowadv_idx_t flow_fidx = fce->fce_flowsrc_fidx;
	struct ifnet *ifp = fce->fce_ifp;
	struct nexus_adapter *hwna;
	struct kern_nexus *fsw_nx;
	struct kern_channel *ch = NULL;
	struct nx_flowswitch *fsw;

	_CASSERT(sizeof(ch->ch_info->cinfo_ch_token) == sizeof(ch_token));

	SK_LOCK();
	if (ifnet_is_attached(ifp, 0) == 0 || ifp->if_na == NULL) {
		goto done;
	}

	hwna = &ifp->if_na->nifna_up;
	VERIFY((hwna->na_type == NA_NETIF_DEV) ||
	    (hwna->na_type == NA_NETIF_COMPAT_DEV));

	if (!NA_IS_ACTIVE(hwna) || (fsw = fsw_ifp_to_fsw(ifp)) == NULL) {
		goto done;
	}

	fsw_nx = fsw->fsw_nx;
	VERIFY(fsw_nx != NULL);

	/* find the channel */
	STAILQ_FOREACH(ch, &fsw_nx->nx_ch_head, ch_link) {
		if (ch_token == ch->ch_info->cinfo_ch_token) {
			break;
		}
	}

	if (ch != NULL) {
		if (ch->ch_na != NULL &&
		    na_flowadv_report_ce_event(ch, flow_fidx, flow_token,
		    ce_cnt, total_pkt_cnt)) {
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) notified of flow update",
			    ch->ch_name, ch->ch_pid);
		} else if (ch->ch_na == NULL) {
			SK_DF(SK_VERB_FLOW_ADVISORY,
			    "%s(%d) is closing (flow update ignored)",
			    ch->ch_name, ch->ch_pid);
		}
	} else {
		SK_ERR("channel token 0x%x fidx %u on %s not found",
		    ch_token, flow_fidx, ifp->if_xname);
	}
done:
	SK_UNLOCK();
}


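/*
 * kern_channel_memstatus() is invoked for a user channel when the owning
 * process changes memorystatus state: unless the adapter is already
 * draining, it takes the channel lock and asks na_drain() to purge cached
 * objects, serialized against channel syscalls.
 */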
void
kern_channel_memstatus(struct proc *p, uint32_t status,
    struct kern_channel *ch)
{
#pragma unused(p, status)
	SK_LOCK_ASSERT_NOTHELD();

	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(proc_pid(p) == ch->ch_pid);
	/*
	 * If we're already draining, then bail. Otherwise, check it
	 * again via na_drain() with the channel lock held.
	 */
	if (ch->ch_na->na_flags & NAF_DRAINING) {
		return;
	}

	SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b status %s",
	    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch),
	    ch->ch_flags, CHANF_BITS, sk_memstatus2str(status));

	/* serialize accesses against channel syscalls */
	lck_mtx_lock(&ch->ch_lock);
	na_drain(ch->ch_na, TRUE);      /* purge caches */
	lck_mtx_unlock(&ch->ch_lock);
}

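/*
 * A channel is eligible for defunct only if it was opened with
 * CHMODE_DEFUNCT_OK; user-pipe channels additionally require that the
 * parent adapter allows defunct (NAF_DEFUNCT_OK).
 */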
static bool
_kern_channel_defunct_eligible(struct kern_channel *ch)
{
	struct nexus_upipe_adapter *pna;

	if ((ch->ch_info->cinfo_ch_mode & CHMODE_DEFUNCT_OK) == 0) {
		return false;
	}
	if (ch->ch_na->na_type != NA_USER_PIPE) {
		return true;
	}
	pna = (struct nexus_upipe_adapter *)ch->ch_na;
	if ((pna->pna_parent->na_flags & NAF_DEFUNCT_OK) == 0) {
		return false;
	}
	return true;
}

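/*
 * kern_channel_defunct() is called when the owning process is suspended or
 * terminated.  Eligible channels are marked CHANF_DEFUNCT, deactivated, and
 * have their arena memory redirected before the nexus domain provider is
 * asked to defunct (and, if needed, finalize and tear down) the mappings;
 * ineligible channels merely get their caches pruned via na_drain().
 */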
void
kern_channel_defunct(struct proc *p, struct kern_channel *ch)
{
#pragma unused(p)
	uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;

	SK_LOCK_ASSERT_NOTHELD();

	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(proc_pid(p) == ch->ch_pid);
	/*
	 * If the channel is eligible for defunct, mark it as such.
	 * Otherwise, set the draining flag which tells the reaper
	 * thread to purge any cached objects associated with it.
	 * That draining flag will be cleared then, which allows the
	 * channel to cache objects again once the process is resumed.
	 */
	if (_kern_channel_defunct_eligible(ch)) {
		struct kern_nexus *nx = ch->ch_nexus;
		struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);
		boolean_t need_defunct;
		int err;

		/*
		 * This may be called often, so check first (without lock) if
		 * the trapdoor flag CHANF_DEFUNCT has been set and bail if so,
		 * for performance reasons. This check is repeated below with
		 * the channel lock held.
		 */
		if (ch->ch_flags & CHANF_DEFUNCT) {
			return;
		}

		SK_DF(SK_VERB_CHANNEL, "%s(%d) ch 0x%llx flags 0x%b",
		    sk_proc_name_address(p), sk_proc_pid(p), SK_KVA(ch),
		    ch->ch_flags, CHANF_BITS);

		/* serialize accesses against channel syscalls */
		lck_mtx_lock(&ch->ch_lock);

		/*
		 * If opportunistic defunct is in effect, skip the rest of
		 * the defunct work based on two cases:
		 *
		 *   a) if the channel isn't using user packet pool; or
		 *   b) if the channel is using user packet pool and we
		 *      detect that there are outstanding allocations.
		 *
		 * Note that for case (a) above we essentially treat the
		 * channel as ineligible for defunct, and although it may
		 * be idle we'd leave the memory mapping intact. This
		 * should not be a concern as the majority of channels are
		 * on flowswitches where user packet pool is mandatory.
		 *
		 * If skipping, mark the channel with CHANF_DEFUNCT_SKIP
		 * and increment the stats (for flowswitch only).
		 */
		if (sk_opp_defunct && (!(ch_mode & CHMODE_USER_PACKET_POOL) ||
		    !pp_isempty_upp(ch->ch_pp))) {
			if (ch->ch_na->na_type == NA_FLOWSWITCH_VP) {
				struct nx_flowswitch *fsw =
				    VPNA(ch->ch_na)->vpna_fsw;
				STATS_INC(&fsw->fsw_stats,
				    FSW_STATS_CHAN_DEFUNCT_SKIP);
			}
			os_atomic_or(&ch->ch_flags, CHANF_DEFUNCT_SKIP,
			    relaxed);
			/* skip defunct */
			lck_mtx_unlock(&ch->ch_lock);
			return;
		}
		os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed);

		/*
		 * Proceed with the rest of the defunct work.
		 */
		if (os_atomic_or_orig(&ch->ch_flags, CHANF_DEFUNCT, relaxed) &
		    CHANF_DEFUNCT) {
			/* already defunct; nothing to do */
			lck_mtx_unlock(&ch->ch_lock);
			return;
		}

		/* mark this channel as inactive */
		ch_deactivate(ch);

		/*
		 * Redirect memory regions for the map; upon success, instruct
		 * the nexus to finalize the defunct and teardown the respective
		 * memory regions. It's crucial that the redirection happens
		 * first before freeing the objects, since the page protection
		 * flags get inherited only from unfreed segments. Freed ones
		 * will cause VM_PROT_NONE to be used for the segment span, to
		 * catch use-after-free cases. For unfreed objects, doing so
		 * may cause an exception when the process is later resumed
		 * and touches an address within the span; hence the ordering.
		 */
		if ((err = skmem_arena_mredirect(ch->ch_na->na_arena,
		    &ch->ch_mmap, p, &need_defunct)) == 0 && need_defunct) {
			/*
			 * Let the domain provider handle the initial tasks of
			 * the defunct that are specific to this channel. It
			 * may safely free objects as the redirection is done.
			 */
			nxdom_prov->nxdom_prov_dom->nxdom_defunct(nxdom_prov,
			    nx, ch, p);
			/*
			 * Let the domain provider complete the defunct;
			 * do this after dropping the channel lock, as
			 * the nexus may end up acquiring other locks
			 * that would otherwise violate lock ordering.
			 * The channel refcnt is still held by virtue
			 * of the caller holding the process's file
			 * table lock.
			 */
			lck_mtx_unlock(&ch->ch_lock);
			nxdom_prov->nxdom_prov_dom->nxdom_defunct_finalize(
			    nxdom_prov, nx, ch, FALSE);
		} else if (err == 0) {
			/*
			 * Let the domain provider handle the initial tasks of
			 * the defunct that are specific to this channel. It
			 * may safely free objects as the redirection is done.
			 */
			nxdom_prov->nxdom_prov_dom->nxdom_defunct(nxdom_prov,
			    nx, ch, p);
			lck_mtx_unlock(&ch->ch_lock);
		} else {
			/* already redirected; nothing to do */
			lck_mtx_unlock(&ch->ch_lock);
		}
	} else {
		lck_mtx_lock(&ch->ch_lock);
		na_drain(ch->ch_na, FALSE);     /* prune caches */
		lck_mtx_unlock(&ch->ch_lock);
	}
}