1/*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
31 * All rights reserved.
32 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
33 *
34 * Redistribution and use in source and binary forms, with or without
35 * modification, are permitted provided that the following conditions
36 * are met:
37 * 1. Redistributions of source code must retain the above copyright
38 * notice, this list of conditions and the following disclaimer.
39 * 2. Redistributions in binary form must reproduce the above copyright
40 * notice, this list of conditions and the following disclaimer in the
41 * documentation and/or other materials provided with the distribution.
42 *
43 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
44 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
45 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
46 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
47 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
48 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
49 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
50 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
51 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
52 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
53 * SUCH DAMAGE.
54 */
55
56#include <sys/eventvar.h>
57#include <sys/kdebug.h>
58#include <sys/sdt.h>
59#include <skywalk/os_skywalk_private.h>
60#include <skywalk/nexus/netif/nx_netif.h>
61
62#define KEV_EVTID(code) BSDDBG_CODE(DBG_BSD_KEVENT, (code))
63
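/*
 * Snapshot of the amount of "ready" TX and RX data found by ch_event();
 * depending on the channel's configured threshold unit these counts are in
 * bytes or in slots (see the per-ring tallies in ch_event() and the
 * low-watermark checks in filt_chprocess()).
 */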
64struct ch_event_result {
65 uint32_t tx_data;
66 uint32_t rx_data;
67};
68
69static LCK_GRP_DECLARE(channel_lock_group, "sk_ch_lock");
70static LCK_GRP_DECLARE(channel_kn_lock_group, "sk_ch_kn_lock");
71LCK_ATTR_DECLARE(channel_lock_attr, 0, 0);
72
73static void csi_selrecord(struct ch_selinfo *, struct proc *, void *);
74static void csi_selwakeup(struct ch_selinfo *, boolean_t, boolean_t, uint32_t);
75static inline void csi_selwakeup_delayed(struct ch_selinfo *);
76static inline void csi_selwakeup_common(struct ch_selinfo *, boolean_t,
77 boolean_t, boolean_t, uint32_t);
78static boolean_t csi_tcall_start(struct ch_selinfo *);
79static void csi_tcall(thread_call_param_t, thread_call_param_t);
80static uint64_t csi_tcall_update_interval(struct ch_selinfo *);
81
82static void ch_redzone_init(void);
83static void ch_close_common(struct kern_channel *, boolean_t, boolean_t);
84static struct kern_channel *ch_find(struct kern_nexus *, nexus_port_t,
85 ring_id_t);
86static int ch_ev_thresh_validate(struct kern_nexus *, enum txrx,
87 struct ch_ev_thresh *);
88static struct kern_channel *ch_connect(struct kern_nexus *, struct chreq *,
89 struct kern_channel *, struct nxbind *, struct proc *, int, int *);
90static void ch_disconnect(struct kern_channel *);
91static int ch_set_lowat_thresh(struct kern_channel *, enum txrx,
92 struct sockopt *);
93static int ch_get_lowat_thresh(struct kern_channel *, enum txrx,
94 struct sockopt *);
95static struct kern_channel *ch_alloc(zalloc_flags_t);
96static void ch_free(struct kern_channel *);
97static int ch_configure_interface_advisory_event(struct kern_channel *ch,
98 struct sockopt *sopt);
99
100static int filt_chrwattach(struct knote *, struct kevent_qos_s *kev);
101static void filt_chrwdetach(struct knote *, boolean_t);
102static void filt_chrdetach(struct knote *);
103static void filt_chwdetach(struct knote *);
104static int filt_chrw(struct knote *, long, int);
105static int filt_chread(struct knote *, long);
106static int filt_chwrite(struct knote *, long);
107
108static int filt_chtouch(struct knote *, struct kevent_qos_s *, int);
109static int filt_chrtouch(struct knote *, struct kevent_qos_s *);
110static int filt_chwtouch(struct knote *, struct kevent_qos_s *);
111static int filt_chprocess(struct knote *, struct kevent_qos_s *, int);
112static int filt_chrprocess(struct knote *, struct kevent_qos_s *);
113static int filt_chwprocess(struct knote *, struct kevent_qos_s *);
114static int filt_che_attach(struct knote *, struct kevent_qos_s *kev);
115static void filt_che_detach(struct knote *);
116static int filt_che_event(struct knote *, long);
117static int filt_che_touch(struct knote *, struct kevent_qos_s *);
118static int filt_che_process(struct knote *, struct kevent_qos_s *);
119static int filt_chan_extended_common(struct knote *, long);
120
121static int ch_event(struct kern_channel *ch, int events,
122 void *wql, struct proc *p, struct ch_event_result *,
123 const boolean_t is_kevent, int *errno, const boolean_t);
124
125const struct filterops skywalk_channel_rfiltops = {
126 .f_isfd = 1,
127 .f_attach = filt_chrwattach,
128 .f_detach = filt_chrdetach,
129 .f_event = filt_chread,
130 .f_touch = filt_chrtouch,
131 .f_process = filt_chrprocess,
132};
133
134const struct filterops skywalk_channel_wfiltops = {
135 .f_isfd = 1,
136 .f_attach = filt_chrwattach,
137 .f_detach = filt_chwdetach,
138 .f_event = filt_chwrite,
139 .f_touch = filt_chwtouch,
140 .f_process = filt_chwprocess,
141};
142
143const struct filterops skywalk_channel_efiltops = {
144 .f_isfd = 1,
145 .f_attach = filt_che_attach,
146 .f_detach = filt_che_detach,
147 .f_event = filt_che_event,
148 .f_touch = filt_che_touch,
149 .f_process = filt_che_process,
150};
151
152/* mitigation intervals in ns */
153#define CH_MIT_IVAL_MIN NSEC_PER_USEC
154
155static uint64_t ch_mit_ival = CH_MIT_IVAL_DEFAULT;
156
157#if (DEVELOPMENT || DEBUG)
158SYSCTL_NODE(_kern_skywalk, OID_AUTO, channel,
159 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk channel parameters");
160SYSCTL_QUAD(_kern_skywalk_channel, OID_AUTO, mit_ival,
161 CTLFLAG_RW | CTLFLAG_LOCKED, &ch_mit_ival, "");
162#endif /* !DEVELOPMENT && !DEBUG */
163
164static SKMEM_TYPE_DEFINE(ch_zone, struct kern_channel);
165
166static SKMEM_TYPE_DEFINE(ch_info_zone, struct ch_info);
167
168static int __ch_inited = 0;
169
/*
 * Global cookie to hold the random number used for verifying
 * user metadata red zone violations.
 */
uint64_t __ch_umd_redzone_cookie = 0;
175
176#define SKMEM_TAG_CH_KEY "com.apple.skywalk.channel.key"
177SKMEM_TAG_DEFINE(skmem_tag_ch_key, SKMEM_TAG_CH_KEY);
178
179static void
180ch_redzone_init(void)
181{
182 _CASSERT(sizeof(__ch_umd_redzone_cookie) ==
183 sizeof(((struct __metadata_preamble *)0)->mdp_redzone));
184 _CASSERT(METADATA_PREAMBLE_SZ == sizeof(struct __metadata_preamble));
185 _CASSERT(sizeof(struct __slot_desc) == 8);
186
        /* Initialize the random user red zone cookie value */
        do {
                read_random(&__ch_umd_redzone_cookie,
                    sizeof(__ch_umd_redzone_cookie));
        } while (__ch_umd_redzone_cookie == 0);
192
193 SK_D("__ch_umd_redzone_cookie: 0x%llx", __ch_umd_redzone_cookie);
194}
195
196int
197channel_init(void)
198{
199 int error = 0;
200
201 SK_LOCK_ASSERT_HELD();
202 ASSERT(!__ch_inited);
203
204 _CASSERT(offsetof(struct __user_packet, pkt_qum) == 0);
205 _CASSERT(offsetof(struct __kern_packet, pkt_qum) == 0);
206
207 ch_redzone_init();
208
209 __ch_inited = 1;
210
211 return error;
212}
213
214void
215channel_fini(void)
216{
217 SK_LOCK_ASSERT_HELD();
218
219 if (__ch_inited) {
220 __ch_umd_redzone_cookie = 0;
221 __ch_inited = 0;
222 }
223}
224
225void
226csi_init(struct ch_selinfo *csi, boolean_t mitigation, uint64_t mit_ival)
227{
228 csi->csi_flags = 0;
229 csi->csi_pending = 0;
230 if (mitigation) {
231 csi->csi_interval = mit_ival;
232 csi->csi_eff_interval = ch_mit_ival; /* global override */
233 os_atomic_or(&csi->csi_flags, CSI_MITIGATION, relaxed);
                csi->csi_tcall = thread_call_allocate_with_options(csi_tcall,
                    csi, THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
236 /* this must not fail */
237 VERIFY(csi->csi_tcall != NULL);
238 } else {
239 csi->csi_interval = 0;
240 csi->csi_eff_interval = 0;
241 csi->csi_tcall = NULL;
242 }
        lck_mtx_init(&csi->csi_lock, &channel_kn_lock_group,
            &channel_lock_attr);
        klist_init(&csi->csi_si.si_note);
245}
246
247void
248csi_destroy(struct ch_selinfo *csi)
249{
250 /* check if not already destroyed, else do it now */
251 if ((os_atomic_or_orig(&csi->csi_flags, CSI_DESTROYED, relaxed) &
252 CSI_DESTROYED) == 0) {
253 CSI_LOCK(csi);
254 /* must have been set by above atomic op */
255 VERIFY(csi->csi_flags & CSI_DESTROYED);
256 if (csi->csi_flags & CSI_MITIGATION) {
257 thread_call_t tcall = csi->csi_tcall;
258 VERIFY(tcall != NULL);
259 CSI_UNLOCK(csi);
260
                        (void) thread_call_cancel_wait(tcall);
                        if (!thread_call_free(tcall)) {
                                boolean_t freed;
                                (void) thread_call_cancel_wait(tcall);
                                freed = thread_call_free(tcall);
266 VERIFY(freed);
267 }
268
269 CSI_LOCK(csi);
270 csi->csi_tcall = NULL;
271 os_atomic_andnot(&csi->csi_flags, CSI_MITIGATION,
272 relaxed);
273 }
274 csi->csi_pending = 0;
275 CSI_UNLOCK(csi);
276
277 selthreadclear(&csi->csi_si);
278 /* now we don't need the mutex anymore */
                lck_mtx_destroy(&csi->csi_lock, &channel_kn_lock_group);
280 }
281}
282
283/*
284 * Called only for select(2).
285 */
286__attribute__((always_inline))
287static inline void
288csi_selrecord(struct ch_selinfo *csi, struct proc *p, void *wql)
289{
290 struct selinfo *si = &csi->csi_si;
291
292 CSI_LOCK_ASSERT_HELD(csi);
        selrecord(p, si, wql);
294}
295
296void
297csi_selrecord_one(struct __kern_channel_ring *kring, struct proc *p, void *wql)
298{
299 struct ch_selinfo *csi = &kring->ckr_si;
300
301 CSI_LOCK(csi);
302 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
303 "si 0x%llx si_flags 0x%x", (kring->ckr_tx == NR_TX) ? "W" : "R",
304 KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
305 SK_KVA(kring), SK_KVA(&csi->csi_si), csi->csi_si.si_flags);
306
307 csi_selrecord(csi, p, wql);
308 CSI_UNLOCK(csi);
309}
310
311void
312csi_selrecord_all(struct nexus_adapter *na, enum txrx t, struct proc *p,
313 void *wql)
314{
315 struct ch_selinfo *csi = &na->na_si[t];
316
317 CSI_LOCK(csi);
318 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx si_flags 0x%x",
319 (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
320 SK_KVA(&csi->csi_si), csi->csi_si.si_flags);
321
322 csi_selrecord(csi, p, wql);
323 CSI_UNLOCK(csi);
324}
325
326/*
327 * Called from na_post_event().
328 */
329__attribute__((always_inline))
330static inline void
331csi_selwakeup(struct ch_selinfo *csi, boolean_t within_kevent,
332 boolean_t selwake, uint32_t hint)
333{
334 struct selinfo *si = &csi->csi_si;
335
336 CSI_LOCK_ASSERT_HELD(csi);
337 csi->csi_pending = 0;
338 if (selwake) {
339 selwakeup(si);
340 }
341 if ((csi->csi_flags & CSI_KNOTE) && !within_kevent) {
342 KNOTE(&si->si_note, hint);
343 }
344}
345
346__attribute__((always_inline))
347static inline void
348csi_selwakeup_delayed(struct ch_selinfo *csi)
349{
350 CSI_LOCK_ASSERT_HELD(csi);
351 ASSERT(csi->csi_flags & CSI_MITIGATION);
352 ASSERT(csi->csi_tcall != NULL);
353
        if (thread_call_isactive(csi->csi_tcall)) {
                csi->csi_pending++;
        } else if (!csi_tcall_start(csi)) {
                csi_selwakeup(csi, FALSE, FALSE, 0);
        }
359}
360
361__attribute__((always_inline))
362static inline void
363csi_selwakeup_common(struct ch_selinfo *csi, boolean_t nodelay,
364 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
365{
366 CSI_LOCK_ASSERT_HELD(csi);
367
368 if (nodelay || within_kevent || !selwake || hint != 0 ||
369 !(csi->csi_flags & CSI_MITIGATION)) {
370 csi_selwakeup(csi, within_kevent, selwake, hint);
371 } else {
372 csi_selwakeup_delayed(csi);
373 }
374}
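
/*
 * To summarize the wakeup path above: a wakeup is deferred to the thread
 * call (csi_selwakeup_delayed) only when mitigation is configured on the
 * selinfo and the caller did not ask for an immediate wakeup, is not running
 * from the kevent path, has no filter hint to deliver, and is doing a
 * selwakeup; every other combination is delivered synchronously via
 * csi_selwakeup().
 */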
375
376void
377csi_selwakeup_one(struct __kern_channel_ring *kring, boolean_t nodelay,
378 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
379{
380 struct ch_selinfo *csi = &kring->ckr_si;
381
382 CSI_LOCK(csi);
383 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) kr %s (0x%llx) "
384 "si 0x%llx si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
385 (kring->ckr_tx == NR_TX) ? "W" : "R", KRNA(kring)->na_name,
386 SK_KVA(KRNA(kring)), kring->ckr_name, SK_KVA(kring),
387 SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
388 within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);
389
390 csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
391 CSI_UNLOCK(csi);
392}
393
394void
395csi_selwakeup_all(struct nexus_adapter *na, enum txrx t, boolean_t nodelay,
396 boolean_t within_kevent, boolean_t selwake, uint32_t hint)
397{
398 struct ch_selinfo *csi = &na->na_si[t];
399
400 CSI_LOCK(csi);
401 SK_DF(SK_VERB_EVENTS, "[%s] na \"%s\" (0x%llx) si 0x%llx "
402 "si_flags 0x%x nodelay %u kev %u sel %u hint 0x%b",
403 (t == NR_TX) ? "W" : "R", na->na_name, SK_KVA(na),
404 SK_KVA(&csi->csi_si), csi->csi_si.si_flags, nodelay,
405 within_kevent, selwake, hint, CHAN_FILT_HINT_BITS);
406
407 switch (t) {
408 case NR_RX:
409 if (!(na->na_flags & NAF_RX_MITIGATION)) {
410 nodelay = TRUE;
411 }
412 break;
413
414 case NR_TX:
415 if (!(na->na_flags & NAF_TX_MITIGATION)) {
416 nodelay = TRUE;
417 }
418 break;
419
420 default:
421 nodelay = TRUE;
422 break;
423 }
424 csi_selwakeup_common(csi, nodelay, within_kevent, selwake, hint);
425 CSI_UNLOCK(csi);
426}
427
428static boolean_t
429csi_tcall_start(struct ch_selinfo *csi)
430{
431 uint64_t now, ival, deadline;
432
433 CSI_LOCK_ASSERT_HELD(csi);
434 ASSERT(csi->csi_flags & CSI_MITIGATION);
435 ASSERT(csi->csi_tcall != NULL);
436
437 /* pick up latest value */
438 ival = csi_tcall_update_interval(csi);
439
440 /* if no mitigation, pass notification up now */
441 if (__improbable(ival == 0)) {
442 return FALSE;
443 }
444
445 deadline = now = mach_absolute_time();
        clock_deadline_for_periodic_event(ival, now, &deadline);
        (void) thread_call_enter_delayed(csi->csi_tcall, deadline);
448
449 return TRUE;
450}
451
452static void
453csi_tcall(thread_call_param_t arg0, thread_call_param_t arg1)
454{
455#pragma unused(arg1)
456 struct ch_selinfo *csi = arg0;
457
458 CSI_LOCK(csi);
        csi_selwakeup(csi, FALSE, FALSE, 0);
460 CSI_UNLOCK(csi);
461
462 CSI_LOCK(csi);
463 if (__improbable((csi->csi_flags & CSI_DESTROYED) == 0 &&
464 csi->csi_pending != 0 && !csi_tcall_start(csi))) {
                csi_selwakeup(csi, FALSE, FALSE, 0);
466 }
467 CSI_UNLOCK(csi);
468}
469
470__attribute__((always_inline))
471static inline uint64_t
472csi_tcall_update_interval(struct ch_selinfo *csi)
473{
474 uint64_t i = ch_mit_ival;
475
476 /* if global override was adjusted, update local copies */
477 if (__improbable(csi->csi_eff_interval != i)) {
478 ASSERT(csi->csi_flags & CSI_MITIGATION);
479 csi->csi_interval = csi->csi_eff_interval =
480 ((i == 0) ? 0 : MAX(i, CH_MIT_IVAL_MIN));
481 }
482
483 return csi->csi_interval;
484}
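
/*
 * The effective interval tracks the ch_mit_ival global (exposed as the
 * kern.skywalk.channel.mit_ival sysctl on DEVELOPMENT/DEBUG kernels): a
 * non-zero value is clamped up to CH_MIT_IVAL_MIN, so e.g. writing 200
 * yields an effective interval of NSEC_PER_USEC (1000), while writing 0
 * disables the delayed path and makes csi_tcall_start() fall back to an
 * immediate wakeup.
 */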
485
486/* return EV_EOF if the channel is defunct */
487static inline boolean_t
488ch_filt_check_defunct(struct kern_channel *ch, struct knote *kn)
489{
490 if (__improbable((ch->ch_flags & CHANF_DEFUNCT) != 0)) {
491 if (kn) {
492 kn->kn_flags |= EV_EOF;
493 }
494 return TRUE;
495 }
496 return FALSE;
497}
498
499static void
500filt_chrwdetach(struct knote *kn, boolean_t write)
501{
502 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
503 struct ch_selinfo *csi;
504 struct selinfo *si;
505
506 lck_mtx_lock(lck: &ch->ch_lock);
507 csi = ch->ch_si[write ? NR_TX : NR_RX];
508 si = &csi->csi_si;
509
510 CSI_LOCK(csi);
511 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s) "
512 "si_flags 0x%x", ch->ch_na->na_name, SK_KVA(ch->ch_na),
513 SK_KVA(ch), SK_KVA(kn), (kn->kn_flags & EV_POLL) ? "poll," : "",
514 write ? "write" : "read", si->si_flags);
515
516 if (KNOTE_DETACH(&si->si_note, kn)) {
517 os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
518 }
519
520 CSI_UNLOCK(csi);
521 lck_mtx_unlock(lck: &ch->ch_lock);
522}
523
524static void
525filt_chrdetach(struct knote *kn)
526{
527 ASSERT(kn->kn_filter == EVFILT_READ);
528 filt_chrwdetach(kn, FALSE);
529}
530
531static void
532filt_chwdetach(struct knote *kn)
533{
534 ASSERT(kn->kn_filter == EVFILT_WRITE);
535 filt_chrwdetach(kn, TRUE);
536}
537
/*
 * Callback for notifications generated externally.
 * This always marks the knote as activated, so it always
 * returns 1.
 */
543static int
544filt_chrw(struct knote *kn, long hint, int events)
545{
546#if SK_LOG
547 struct kern_channel *ch = knote_kn_hook_get_raw(kn);
548#else
549#pragma unused(kn)
550#pragma unused(hint)
551#pragma unused(events)
552#endif
553 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx "
554 "kn 0x%llx (%s%s) hint 0x%x", ch->ch_na->na_name,
555 SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
556 (kn->kn_flags & EV_POLL) ? "poll," : "",
557 (events == POLLOUT) ? "write" : "read",
558 (uint32_t)hint);
559
560 /* assume we are ready */
561 return 1;
562}
563
564static int
565filt_chread(struct knote *kn, long hint)
566{
567 ASSERT(kn->kn_filter == EVFILT_READ);
568 /* There is no hint for read/write event */
569 if (hint != 0) {
570 return 0;
571 }
572 return filt_chrw(kn, hint, POLLIN);
573}
574
575static int
576filt_chwrite(struct knote *kn, long hint)
577{
578 ASSERT(kn->kn_filter == EVFILT_WRITE);
579 /* There is no hint for read/write event */
580 if (hint != 0) {
581 return 0;
582 }
583 return filt_chrw(kn, hint, POLLOUT);
584}
585
586static int
587filt_chtouch(struct knote *kn, struct kevent_qos_s *kev, int events)
588{
589#pragma unused(kev)
590 struct kern_channel *ch = knote_kn_hook_get_raw(kn);
591 int ev = kn->kn_filter;
592 enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
593 int event_error = 0;
594 int revents;
595
596 /* save off the new input fflags and data */
597 kn->kn_sfflags = kev->fflags;
598 kn->kn_sdata = kev->data;
599
600 lck_mtx_lock(lck: &ch->ch_lock);
601 if (__improbable(ch_filt_check_defunct(ch, kn))) {
602 lck_mtx_unlock(lck: &ch->ch_lock);
603 return 1;
604 }
605
606 /* if a note-specific low watermark is given, validate it */
607 if (kn->kn_sfflags & NOTE_LOWAT) {
608 struct ch_ev_thresh note_thresh = {
609 .cet_unit = (dir == NR_TX) ?
610 ch->ch_info->cinfo_tx_lowat.cet_unit :
611 ch->ch_info->cinfo_rx_lowat.cet_unit,
612 .cet_value = (uint32_t)kn->kn_sdata
613 };
614 if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
615 &note_thresh) != 0) {
616 SK_ERR("invalid NOTE_LOWAT threshold %u",
617 note_thresh.cet_value);
618 knote_set_error(kn, EINVAL);
619 lck_mtx_unlock(lck: &ch->ch_lock);
620 return 1;
621 }
622 }
623
624 /* capture new state just so we can return it */
625 revents = ch_event(ch, events, NULL, p: knote_get_kq(kn)->kq_p, NULL, TRUE,
626 errno: &event_error, FALSE);
627 lck_mtx_unlock(lck: &ch->ch_lock);
628
629 if (revents & POLLERR) {
630 ASSERT(event_error != 0);
631 /*
632 * Setting a knote error here will confuse libdispatch, so we
633 * use EV_EOF instead.
634 */
635 kn->kn_flags |= EV_EOF;
636 return 1;
637 } else {
638 return (events & revents) != 0;
639 }
640}
641
642static int
643filt_chrtouch(struct knote *kn, struct kevent_qos_s *kev)
644{
645 ASSERT(kn->kn_filter == EVFILT_READ);
646
647 if (kev->flags & EV_ENABLE) {
648 KDBG_DEBUG(KEV_EVTID(BSD_KEVENT_KNOTE_ENABLE),
649 kn->kn_udata, kn->kn_status | (kn->kn_id << 32),
650 kn->kn_filtid, VM_KERNEL_UNSLIDE_OR_PERM(
651 ((struct kern_channel *)knote_kn_hook_get_raw(kn))->ch_na));
652 }
653
654 return filt_chtouch(kn, kev, POLLIN);
655}
656
657static int
658filt_chwtouch(struct knote *kn, struct kevent_qos_s *kev)
659{
660 ASSERT(kn->kn_filter == EVFILT_WRITE);
661 return filt_chtouch(kn, kev, POLLOUT);
662}
663
664
665/*
666 * Called from kevent. We call ch_event(POLL[IN|OUT]) and
667 * return 0/1 accordingly.
668 */
669static int
670filt_chprocess(struct knote *kn, struct kevent_qos_s *kev, int events)
671{
672 struct kern_channel *ch = knote_kn_hook_get_raw(kn);
673 struct ch_event_result result;
674 uint32_t lowat;
675 int trigger_event = 1;
676 int revents;
677 int event_error;
678 int64_t data;
679
680 lck_mtx_lock(lck: &ch->ch_lock);
681 if (__improbable(ch_filt_check_defunct(ch, kn))) {
682 knote_fill_kevent(kn, kev, data: 0);
683 lck_mtx_unlock(lck: &ch->ch_lock);
684 return 1;
685 }
686
687 revents = ch_event(ch, events, NULL, p: knote_get_kq(kn)->kq_p, &result,
688 TRUE, errno: &event_error, FALSE);
689
690 if (revents & POLLERR) {
691 ASSERT(event_error != 0);
692 lck_mtx_unlock(lck: &ch->ch_lock);
693 /*
694 * Setting a knote error here will confuse libdispatch, so we
695 * use EV_EOF instead.
696 */
697 kn->kn_flags |= EV_EOF;
698 knote_fill_kevent_with_sdata(kn, kev);
699 return 1;
700 }
701
702 trigger_event = (events & revents) != 0;
703
704 if (events == POLLOUT) {
705 lowat = ch->ch_info->cinfo_tx_lowat.cet_value;
706 if ((kn->kn_sfflags & NOTE_LOWAT) &&
707 kn->kn_sdata > lowat) {
708 lowat = (uint32_t)kn->kn_sdata;
709 }
710
711 data = result.tx_data;
712
713 if (result.tx_data < lowat) {
714 trigger_event = 0;
715 }
716 } else {
717 lowat = ch->ch_info->cinfo_rx_lowat.cet_value;
718 if ((kn->kn_sfflags & NOTE_LOWAT) &&
719 kn->kn_sdata > lowat) {
720 lowat = (uint32_t)kn->kn_sdata;
721 }
722
723 data = result.rx_data;
724
725 if (result.rx_data < lowat) {
726 trigger_event = 0;
727 }
728 }
729
730 if (trigger_event) {
731 knote_fill_kevent(kn, kev, data);
732 }
733
734 lck_mtx_unlock(lck: &ch->ch_lock);
735
736 return trigger_event;
737}
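
/*
 * Worked example of the low-watermark logic above, assuming a hypothetical
 * channel whose TX threshold unit is slots and whose default
 * cinfo_tx_lowat.cet_value is 1: registering the EVFILT_WRITE knote with
 * NOTE_LOWAT and data = 4, e.g.
 *
 *	EV_SET(&kev, ch_fd, EVFILT_WRITE, EV_ADD, NOTE_LOWAT, 4, NULL);
 *
 * raises the effective watermark to 4, so the knote fires only once
 * ch_event() reports at least 4 ready TX slots; kev.data then carries that
 * ready count.
 */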
738
739static int
740filt_chrprocess(struct knote *kn, struct kevent_qos_s *kev)
741{
742 ASSERT(kn->kn_filter == EVFILT_READ);
743 return filt_chprocess(kn, kev, POLLIN);
744}
745
746static int
747filt_chwprocess(struct knote *kn, struct kevent_qos_s *kev)
748{
749 ASSERT(kn->kn_filter == EVFILT_WRITE);
750 return filt_chprocess(kn, kev, POLLOUT);
751}
752
753static int
754filt_chrwattach(struct knote *kn, __unused struct kevent_qos_s *kev)
755{
756 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
757 struct nexus_adapter *na;
758 struct ch_selinfo *csi;
759 int ev = kn->kn_filter;
760 enum txrx dir = (ev == EVFILT_WRITE) ? NR_TX : NR_RX;
761 int revents;
762 int events;
763 int event_error = 0;
764
765 ASSERT((kn->kn_filter == EVFILT_READ) ||
766 (kn->kn_filter == EVFILT_WRITE));
767
768 /* ch_kqfilter() should have acquired the lock */
769 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
770
771 na = ch->ch_na;
772 /* if a note-specific low watermark is given, validate it */
773 if (kn->kn_sfflags & NOTE_LOWAT) {
774 struct ch_ev_thresh note_thresh = {
775 .cet_unit = (dir == NR_TX) ?
776 ch->ch_info->cinfo_tx_lowat.cet_unit :
777 ch->ch_info->cinfo_rx_lowat.cet_unit,
778 .cet_value = (uint32_t)kn->kn_sdata
779 };
780 if (ch_ev_thresh_validate(ch->ch_na->na_nx, dir,
781 &note_thresh) != 0) {
782 SK_ERR("invalid NOTE_LOWAT threshold %u",
783 note_thresh.cet_value);
784 knote_set_error(kn, EINVAL);
785 return 0;
786 }
787 }
788
789 /* the si is indicated in the channel */
790 csi = ch->ch_si[dir];
791 CSI_LOCK(csi);
792
793 if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
794 os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
795 }
796
797 CSI_UNLOCK(csi);
798
799 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s%s)",
800 na->na_name, SK_KVA(na), SK_KVA(ch), SK_KVA(kn),
801 (kn->kn_flags & EV_POLL) ? "poll," : "",
802 (ev == EVFILT_WRITE) ? "write" : "read");
803
804 /* capture current state */
805 events = (ev == EVFILT_WRITE) ? POLLOUT : POLLIN;
806
807 if (__improbable(ch_filt_check_defunct(ch, kn))) {
808 revents = events;
809 } else {
810 /* filt_chprocess() will fill in the kn_sdata field */
811 revents = ch_event(ch, events, NULL, p: knote_get_kq(kn)->kq_p,
812 NULL, TRUE, errno: &event_error, FALSE);
813 }
814
815 if (revents & POLLERR) {
816 ASSERT(event_error != 0);
817 kn->kn_flags |= EV_EOF;
818 return 1;
819 } else {
820 return (events & revents) != 0;
821 }
822}
823
824static int
825filt_chan_extended_common(struct knote *kn, long ev_hint)
826{
        /*
         * This function is not always called with the same set of locks held,
         * hence it is only allowed to manipulate kn_fflags, using atomics.
         *
         * The f_event / f_process functions may run concurrently.
         */
833 uint32_t add_fflags = 0;
834
835 if ((ev_hint & CHAN_FILT_HINT_FLOW_ADV_UPD) != 0) {
836 add_fflags |= NOTE_FLOW_ADV_UPDATE;
837 }
838 if ((ev_hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
839 add_fflags |= NOTE_CHANNEL_EVENT;
840 }
841 if ((ev_hint & CHAN_FILT_HINT_IF_ADV_UPD) != 0) {
842 add_fflags |= NOTE_IF_ADV_UPD;
843 }
844 if (add_fflags) {
845 /* Reset any events that are not requested on this knote */
846 add_fflags &= (kn->kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK);
847 os_atomic_or(&kn->kn_fflags, add_fflags, relaxed);
848 return add_fflags != 0;
849 }
850 return os_atomic_load(&kn->kn_fflags, relaxed) != 0;
851}
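
/*
 * Note that the CHAN_FILT_HINT_* values are defined to be numerically
 * identical to their NOTE_* counterparts (see the _CASSERTs in
 * filt_che_attach()), which is what allows the hint word to be or'ed into
 * kn_fflags directly after masking it with the events the knote actually
 * subscribed to (kn_sfflags & EVFILT_NW_CHANNEL_ALL_MASK).
 */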
852
853static inline void
854che_process_channel_event(struct kern_channel *ch, struct knote *kn,
855 uint32_t fflags, long *hint)
856{
857 int revents, event_error = 0;
858
859 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
860 *hint &= ~CHAN_FILT_HINT_CHANNEL_EVENT;
861
862 if (((ch->ch_flags & CHANF_EVENT_RING) != 0) &&
863 ((fflags & NOTE_CHANNEL_EVENT) != 0)) {
864 /* capture new state to return */
865 revents = ch_event(ch, POLLIN, NULL, p: knote_get_kq(kn)->kq_p,
866 NULL, TRUE, errno: &event_error, TRUE);
867 if (revents & POLLERR) {
868 ASSERT(event_error != 0);
869 /*
870 * Setting a knote error here will confuse libdispatch,
871 * so we use EV_EOF instead.
872 */
873 kn->kn_flags |= EV_EOF;
874 } else if ((revents & POLLIN) != 0) {
875 *hint |= CHAN_FILT_HINT_CHANNEL_EVENT;
876 }
877 }
        /*
         * If the sync operation on the event ring didn't find any events,
         * indicate that the channel event is not active.
         */
882 if ((*hint & CHAN_FILT_HINT_CHANNEL_EVENT) == 0) {
883 /*
884 * Avoid a costly atomic when the bit is already cleared.
885 */
886 uint32_t knfflags = os_atomic_load(&kn->kn_fflags, relaxed);
887 if (knfflags & CHAN_FILT_HINT_CHANNEL_EVENT) {
888 os_atomic_andnot(&kn->kn_fflags,
889 CHAN_FILT_HINT_CHANNEL_EVENT, relaxed);
890 }
891 }
892}
893
894static int
895filt_che_attach(struct knote *kn, __unused struct kevent_qos_s *kev)
896{
897 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
898 struct ch_selinfo *csi;
899 long hint = 0;
900
901 _CASSERT(CHAN_FILT_HINT_FLOW_ADV_UPD == NOTE_FLOW_ADV_UPDATE);
902 _CASSERT(CHAN_FILT_HINT_CHANNEL_EVENT == NOTE_CHANNEL_EVENT);
903 _CASSERT(CHAN_FILT_HINT_IF_ADV_UPD == NOTE_IF_ADV_UPD);
904
905 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
906
907 /* ch_kqfilter() should have acquired the lock */
908 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
909
910 csi = ch->ch_si[NR_TX];
911 CSI_LOCK(csi);
912 if (KNOTE_ATTACH(&csi->csi_si.si_note, kn)) {
913 os_atomic_or(&csi->csi_flags, CSI_KNOTE, relaxed);
914 }
915 CSI_UNLOCK(csi);
916
917 if (__improbable(ch_filt_check_defunct(ch, kn))) {
918 return 1;
919 }
920 if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
921 os_atomic_or(&ch->ch_na->na_flags, NAF_CHANNEL_EVENT_ATTACHED, relaxed);
922 }
923 che_process_channel_event(ch, kn, fflags: kn->kn_sfflags, hint: &hint);
924 if ((kn->kn_sfflags & NOTE_FLOW_ADV_UPDATE) != 0) {
925 /* on registration force an event */
926 hint |= CHAN_FILT_HINT_FLOW_ADV_UPD;
927 }
928 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
929 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
930 "EVFILT_NW_CHANNEL");
931 return filt_chan_extended_common(kn, ev_hint: hint);
932}
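
/*
 * For illustration only -- a hedged sketch of how a user process might
 * subscribe to the extended channel filter (again assuming "ch_fd" is the
 * channel descriptor):
 *
 *	EV_SET(&kev, ch_fd, EVFILT_NW_CHANNEL, EV_ADD | EV_ENABLE,
 *	    NOTE_FLOW_ADV_UPDATE | NOTE_CHANNEL_EVENT, 0, NULL);
 *
 * Only the events named in fflags here are ever reported back; on delivery,
 * kev.fflags indicates which of them fired, and the filter behaves as if
 * EV_CLEAR were set (see filt_che_process()).
 */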
933
934static void
935filt_che_detach(struct knote *kn)
936{
937 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
938 struct ch_selinfo *csi;
939
940 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
941
942 lck_mtx_lock(lck: &ch->ch_lock);
943 if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
944 os_atomic_andnot(&ch->ch_na->na_flags,
945 NAF_CHANNEL_EVENT_ATTACHED, relaxed);
946 }
947 csi = ch->ch_si[NR_TX];
948 CSI_LOCK(csi);
949 if (KNOTE_DETACH(&csi->csi_si.si_note, kn)) {
950 os_atomic_andnot(&csi->csi_flags, CSI_KNOTE, relaxed);
951 }
952 CSI_UNLOCK(csi);
953 lck_mtx_unlock(lck: &ch->ch_lock);
954
955 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx kn 0x%llx (%s)",
956 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), SK_KVA(kn),
957 "EVFILT_NW_CHANNEL");
958}
959
960static int
961filt_che_event(struct knote *kn, long hint)
962{
963 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
964
965 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
966 if (hint == 0) {
967 return 0;
968 }
969 if (__improbable(ch_filt_check_defunct(ch, NULL))) {
970 return 1;
971 }
972 if ((hint & CHAN_FILT_HINT_CHANNEL_EVENT) != 0) {
973 VERIFY((ch->ch_flags & CHANF_EVENT_RING) != 0);
974 }
975 SK_DF(SK_VERB_EVENTS, "na \"%s\" (0x%llx) ch 0x%llx hint 0x%b)",
976 ch->ch_na->na_name, SK_KVA(ch->ch_na), SK_KVA(ch), hint,
977 CHAN_FILT_HINT_BITS);
978 return filt_chan_extended_common(kn, ev_hint: hint);
979}
980
981static int
982filt_che_touch(struct knote *kn, struct kevent_qos_s *kev)
983{
984 int ret;
985 long hint = 0;
986 struct kern_channel *ch = (struct kern_channel *)knote_kn_hook_get_raw(kn);
987
988 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
989 /* save off the new input fflags and data */
990 kn->kn_sfflags = kev->fflags;
991 kn->kn_sdata = kev->data;
992
993 lck_mtx_lock(lck: &ch->ch_lock);
994 if (__improbable(ch_filt_check_defunct(ch, kn))) {
995 ret = 1;
996 goto done;
997 }
998 if ((kn->kn_sfflags & NOTE_CHANNEL_EVENT) != 0) {
999 if (kev->flags & EV_ENABLE) {
1000 os_atomic_or(&ch->ch_na->na_flags,
1001 NAF_CHANNEL_EVENT_ATTACHED, relaxed);
1002 } else if (kev->flags & EV_DISABLE) {
1003 os_atomic_andnot(&ch->ch_na->na_flags,
1004 NAF_CHANNEL_EVENT_ATTACHED, relaxed);
1005 }
1006 }
1007 che_process_channel_event(ch, kn, fflags: kn->kn_sfflags, hint: &hint);
1008 ret = filt_chan_extended_common(kn, ev_hint: hint);
1009done:
1010 lck_mtx_unlock(lck: &ch->ch_lock);
1011 return ret;
1012}
1013
1014static int
1015filt_che_process(struct knote *kn, struct kevent_qos_s *kev)
1016{
1017 int ret;
1018 long hint = 0;
1019 struct kern_channel *ch = knote_kn_hook_get_raw(kn);
1020
1021 ASSERT(kn->kn_filter == EVFILT_NW_CHANNEL);
1022 lck_mtx_lock(lck: &ch->ch_lock);
1023 if (__improbable(ch_filt_check_defunct(ch, kn))) {
1024 ret = 1;
1025 goto done;
1026 }
1027 che_process_channel_event(ch, kn, fflags: kn->kn_sfflags, hint: &hint);
1028 ret = filt_chan_extended_common(kn, ev_hint: hint);
1029done:
1030 lck_mtx_unlock(lck: &ch->ch_lock);
1031 if (ret != 0) {
1032 /*
1033 * This filter historically behaves like EV_CLEAR,
1034 * even when EV_CLEAR wasn't set.
1035 */
1036 knote_fill_kevent(kn, kev, data: 0);
1037 kn->kn_fflags = 0;
1038 }
1039 return ret;
1040}
1041
1042int
1043ch_kqfilter(struct kern_channel *ch, struct knote *kn,
1044 struct kevent_qos_s *kev)
1045{
1046 int result;
1047
1048 lck_mtx_lock(lck: &ch->ch_lock);
1049 VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1050
1051 if (__improbable(ch->ch_na == NULL || !NA_IS_ACTIVE(ch->ch_na) ||
1052 na_reject_channel(ch, ch->ch_na))) {
1053 SK_ERR("%s(%d): channel is non-permissive, flags 0x%b", ch->ch_name,
1054 ch->ch_pid, ch->ch_flags, CHANF_BITS);
1055 knote_set_error(kn, ENXIO);
1056 lck_mtx_unlock(lck: &ch->ch_lock);
1057 return 0;
1058 }
1059
1060 switch (kn->kn_filter) {
1061 case EVFILT_READ:
1062 kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_R;
1063 break;
1064
1065 case EVFILT_WRITE:
1066 kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_W;
1067 break;
1068
1069 case EVFILT_NW_CHANNEL:
1070 kn->kn_filtid = EVFILTID_SKYWALK_CHANNEL_E;
1071 break;
1072
1073 default:
1074 lck_mtx_unlock(lck: &ch->ch_lock);
1075 SK_ERR("%s(%d): bad filter request %d", ch->ch_name,
1076 ch->ch_pid, kn->kn_filter);
1077 knote_set_error(kn, EINVAL);
1078 return 0;
1079 }
1080
1081 knote_kn_hook_set_raw(kn, kn_hook: ch);
1082 /* call the appropriate sub-filter attach with the channel lock held */
1083 result = knote_fops(kn)->f_attach(kn, kev);
1084 lck_mtx_unlock(lck: &ch->ch_lock);
1085 return result;
1086}
1087
1088boolean_t
1089ch_is_multiplex(struct kern_channel *ch, enum txrx t)
1090{
1091 return ch->ch_na != NULL && (ch->ch_last[t] - ch->ch_first[t] > 1);
1092}
1093
1094int
1095ch_select(struct kern_channel *ch, int events, void *wql, struct proc *p)
1096{
1097 int revents;
1098 int event_error = 0;
1099
1100 lck_mtx_lock(lck: &ch->ch_lock);
1101 revents = ch_event(ch, events, wql, p, NULL, FALSE, errno: &event_error,
1102 FALSE);
1103 lck_mtx_unlock(lck: &ch->ch_lock);
1104
1105 ASSERT((revents & POLLERR) == 0 || event_error != 0);
1106
1107 return revents;
1108}
1109
1110#if SK_LOG
1111/* Hoisted out of line to reduce kernel stack footprint */
1112SK_LOG_ATTRIBUTE
1113static void
1114ch_event_log(const char *prefix, const struct kern_channel *ch,
1115 struct proc *p, const struct nexus_adapter *na,
1116 int events, int revents)
1117{
1118 SK_DF(SK_VERB_EVENTS, "%s: na \"%s\" (0x%llx) ch 0x%llx %s(%d) "
1119 "th 0x%llx ev 0x%x rev 0x%x", prefix, na->na_name, SK_KVA(na),
1120 SK_KVA(ch), sk_proc_name_address(p), sk_proc_pid(p),
1121 SK_KVA(current_thread()), events, revents);
1122}
1123#endif /* SK_LOG */
1124
/*
 * select(2), poll(2) and kevent(2) handlers for channels.
 *
 * Can be called for one or more rings.  Returns the event mask
 * corresponding to ready events.  If there are no ready events, do
 * a selrecord on either the individual selinfo or on the global one.
 * Device-dependent parts (locking and sync of tx/rx rings)
 * are done through callbacks.
 */
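/*
 * Roughly, for each ring the handler below runs the three-stage sync
 * machinery: kr_{tx,rx}sync_prologue() validates the slots the user moved,
 * ckr_na_sync() invokes the nexus/driver sync callback, and
 * kr_{tx,rx}sync_finalize() publishes the results back to the user ring.
 * Any prologue or sync failure is reported as POLLERR with *errno set to
 * EFAULT or EIO respectively.
 */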
1134static int
1135ch_event(struct kern_channel *ch, int events, void *wql,
1136 struct proc *p, struct ch_event_result *result,
1137 const boolean_t is_kevent, int *errno, const boolean_t is_ch_event)
1138{
1139 struct nexus_adapter *na;
1140 struct __kern_channel_ring *kring;
1141 uint32_t i, check_all_tx, check_all_rx, want[NR_TXRX], revents = 0;
1142 uint32_t ready_tx_data = 0, ready_rx_data = 0;
1143 sk_protect_t protect = NULL;
1144
1145#define want_tx want[NR_TX]
1146#define want_rx want[NR_RX]
1147 /*
1148 * In order to avoid nested locks, we need to "double check"
1149 * txsync and rxsync if we decide to do a selrecord().
1150 * retry_tx (and retry_rx, later) prevent looping forever.
1151 */
1152 boolean_t retry_tx = TRUE, retry_rx = TRUE;
1153 int found, error = 0;
1154 int s;
1155
1156 net_update_uptime();
1157
1158 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1159 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
1160
1161 *errno = 0;
1162
1163 if (__improbable((ch->ch_flags & CHANF_DEFUNCT) ||
1164 ch->ch_schema == NULL)) {
1165 SK_ERR("%s(%d): channel is defunct or no longer bound",
1166 ch->ch_name, ch->ch_pid);
1167 revents = POLLERR;
1168 *errno = ENXIO;
1169 goto done;
1170 }
1171
1172 /* clear CHANF_DEFUNCT_SKIP if it was set during defunct last time */
1173 if (__improbable(ch->ch_flags & CHANF_DEFUNCT_SKIP)) {
1174 os_atomic_andnot(&ch->ch_flags, CHANF_DEFUNCT_SKIP, relaxed);
1175 }
1176
1177 na = ch->ch_na;
1178 if (__improbable(na == NULL ||
1179 !NA_IS_ACTIVE(na) || na_reject_channel(ch, na))) {
1180 SK_ERR("%s(%d): channel is non-permissive",
1181 ch->ch_name, ch->ch_pid);
1182 revents = POLLERR;
1183 *errno = ENXIO;
1184 goto done;
1185 }
1186
1187 /* mark thread with sync-in-progress flag */
1188 protect = sk_sync_protect();
1189
1190 /* update our work timestamp */
1191 na->na_work_ts = _net_uptime;
1192
1193 /* and make this channel eligible for draining again */
1194 if (na->na_flags & NAF_DRAINING) {
1195 os_atomic_andnot(&na->na_flags, NAF_DRAINING, relaxed);
1196 }
1197
1198#if SK_LOG
1199 if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1200 ch_event_log("enter", ch, p, na, events, revents);
1201 }
1202#endif
1203 if (is_ch_event) {
1204 goto process_channel_event;
1205 }
1206
1207 want_tx = (events & (POLLOUT | POLLWRNORM));
1208 want_rx = (events & (POLLIN | POLLRDNORM));
1209
        /*
         * check_all_{tx|rx} are set if the channel has more than one ring
         * AND the file descriptor is bound to all of them.  If so, we sleep
         * on the "global" selinfo; otherwise we sleep on the individual
         * selinfo.  The interrupt routine in the driver wakes one or the
         * other (or both) depending on which clients are active.
         *
         * rxsync() is only called if we run out of buffers on a POLLIN.
         * txsync() is called if we run out of buffers on a POLLOUT.
         */
1220 check_all_tx = ch_is_multiplex(ch, t: NR_TX);
1221 check_all_rx = ch_is_multiplex(ch, t: NR_RX);
1222
1223 /*
1224 * If want_tx is still set, we must issue txsync calls
1225 * (on all rings, to avoid that the tx rings stall).
1226 * XXX should also check head != khead on the tx rings.
1227 */
1228 if (want_tx) {
1229 ring_id_t first_tx = ch->ch_first[NR_TX];
1230 ring_id_t last_tx = ch->ch_last[NR_TX];
1231
1232 channel_threshold_unit_t tx_unit =
1233 ch->ch_info->cinfo_tx_lowat.cet_unit;
1234
                /*
                 * The first round checks if anyone is ready; if not, we do
                 * a selrecord and a second round to handle races.
                 * want_tx goes to 0 if any space is found, and is
                 * used to skip rings with no pending transmissions.
                 */
1241flush_tx:
1242 for (i = first_tx, ready_tx_data = 0; i < last_tx; i++) {
1243 kring = &na->na_tx_rings[i];
1244 if (!want_tx &&
1245 kring->ckr_ring->ring_head == kring->ckr_khead) {
1246 continue;
1247 }
1248
1249 /* only one thread does txsync */
1250 s = kr_enter(kring, TRUE);
1251 ASSERT(s == 0);
1252
1253 error = 0;
1254 DTRACE_SKYWALK2(pretxprologue, struct kern_channel *,
1255 ch, struct __kern_channel_ring *, kring);
1256 if (kr_txsync_prologue(ch, kring, p) >=
1257 kring->ckr_num_slots) {
1258 kr_log_bad_ring(kring);
1259 revents |= POLLERR;
1260 error = EFAULT;
1261 if (*errno == 0) {
1262 *errno = EFAULT;
1263 }
1264 } else {
1265 if (kring->ckr_na_sync(kring, p, 0)) {
1266 revents |= POLLERR;
1267 error = EIO;
1268 if (*errno == 0) {
1269 *errno = EIO;
1270 }
1271 } else {
1272 kr_txsync_finalize(ch, kring, p);
1273 }
1274 }
1275 DTRACE_SKYWALK3(posttxfinalize, struct kern_channel *,
1276 ch, struct __kern_channel_ring *, kring, int,
1277 error);
1278
1279 /*
1280 * If we found new slots, notify potential listeners on
1281 * the same ring. Since we just did a txsync, look at
1282 * the copies of cur,tail in the kring.
1283 */
1284 found = kring->ckr_rhead != kring->ckr_rtail;
1285 kr_exit(kring);
1286 if (found) { /* notify other listeners */
1287 revents |= want_tx;
1288 want_tx = 0;
1289 (void) kring->ckr_na_notify(kring, p,
1290 (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1291 }
1292
1293 /*
1294 * Add this ring's free data to our running
1295 * tally for userspace.
1296 */
1297 if (result != NULL) {
1298 switch (tx_unit) {
1299 case CHANNEL_THRESHOLD_UNIT_BYTES:
1300 ready_tx_data += kring->ckr_ready_bytes;
1301 break;
1302 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1303 ready_tx_data += kring->ckr_ready_slots;
1304 break;
1305 }
1306 }
1307 }
1308 if (want_tx && retry_tx && !is_kevent) {
1309 if (check_all_tx) {
1310 csi_selrecord_all(na, t: NR_TX, p, wql);
1311 } else {
1312 csi_selrecord_one(kring: &na->na_tx_rings[first_tx],
1313 p, wql);
1314 }
1315 retry_tx = FALSE;
1316 goto flush_tx;
1317 }
1318 }
1319
        /*
         * If want_rx is still set, scan the receive rings.
         * Do it on all rings, because otherwise we starve.
         */
1324 if (want_rx) {
1325 ring_id_t first_rx = ch->ch_first[NR_RX];
1326 ring_id_t last_rx = ch->ch_last[NR_RX];
1327 channel_threshold_unit_t rx_unit =
1328 ch->ch_info->cinfo_rx_lowat.cet_unit;
1329
1330 /* two rounds here for race avoidance */
1331do_retry_rx:
1332 for (i = first_rx, ready_rx_data = 0; i < last_rx; i++) {
1333 kring = &na->na_rx_rings[i];
1334
1335 /* only one thread does rxsync */
1336 s = kr_enter(kring, TRUE);
1337 ASSERT(s == 0);
1338
1339 error = 0;
1340 DTRACE_SKYWALK2(prerxprologue, struct kern_channel *,
1341 ch, struct __kern_channel_ring *, kring);
1342 if (kr_rxsync_prologue(ch, kring, p) >=
1343 kring->ckr_num_slots) {
1344 kr_log_bad_ring(kring);
1345 revents |= POLLERR;
1346 error = EFAULT;
1347 if (*errno == 0) {
1348 *errno = EFAULT;
1349 }
1350 } else {
1351 /* now we can use kring->rhead, rtail */
1352 if (kring->ckr_na_sync(kring, p, 0)) {
1353 revents |= POLLERR;
1354 error = EIO;
1355 if (*errno == 0) {
1356 *errno = EIO;
1357 }
1358 } else {
1359 kr_rxsync_finalize(ch, kring, p);
1360 }
1361 }
1362
1363 DTRACE_SKYWALK3(postrxfinalize, struct kern_channel *,
1364 ch, struct __kern_channel_ring *, kring, int,
1365 error);
1366
1367 found = kring->ckr_rhead != kring->ckr_rtail;
1368 kr_exit(kring);
1369 if (found) {
1370 revents |= want_rx;
1371 retry_rx = FALSE;
1372 (void) kring->ckr_na_notify(kring, p,
1373 (is_kevent ? NA_NOTEF_IN_KEVENT : 0));
1374 }
1375
1376 /*
1377 * Add this ring's readable data to our running
1378 * tally for userspace.
1379 */
1380 if (result != NULL) {
1381 switch (rx_unit) {
1382 case CHANNEL_THRESHOLD_UNIT_BYTES:
1383 ready_rx_data += kring->ckr_ready_bytes;
1384 break;
1385 case CHANNEL_THRESHOLD_UNIT_SLOTS:
1386 ready_rx_data += kring->ckr_ready_slots;
1387 break;
1388 }
1389 }
1390 }
1391
1392 if (retry_rx && !is_kevent) {
1393 if (check_all_rx) {
1394 csi_selrecord_all(na, t: NR_RX, p, wql);
1395 } else {
1396 csi_selrecord_one(kring: &na->na_rx_rings[first_rx],
1397 p, wql);
1398 }
1399 }
1400 if (retry_rx) {
1401 retry_rx = FALSE;
1402 goto do_retry_rx;
1403 }
1404 }
1405
1406 if (result != NULL) {
1407 result->tx_data = ready_tx_data;
1408 result->rx_data = ready_rx_data;
1409 }
1410 goto skip_channel_event;
1411
1412process_channel_event:
        /*
         * Perform a sync operation on the event ring to make the channel
         * events enqueued in the ring visible to user space.
         */
1417
1418 /* select() and poll() not supported for event ring */
1419 ASSERT(is_kevent);
1420 VERIFY((ch->ch_last[NR_EV] - ch->ch_first[NR_EV]) == 1);
1421 kring = &na->na_event_rings[ch->ch_first[NR_EV]];
1422
1423 /* only one thread does the sync */
1424 s = kr_enter(kring, TRUE);
1425 ASSERT(s == 0);
1426 if (kr_event_sync_prologue(kring, p) >= kring->ckr_num_slots) {
1427 kr_log_bad_ring(kring);
1428 revents |= POLLERR;
1429 if (*errno == 0) {
1430 *errno = EFAULT;
1431 }
1432 } else {
1433 if (kring->ckr_na_sync(kring, p, 0)) {
1434 revents |= POLLERR;
1435 if (*errno == 0) {
1436 *errno = EIO;
1437 }
1438 } else {
1439 kr_event_sync_finalize(ch, kring, p);
1440 }
1441 }
1442 found = (kring->ckr_rhead != kring->ckr_rtail);
1443 kr_exit(kring);
1444 if (found) {
1445 revents |= (events & POLLIN);
1446 }
1447
1448skip_channel_event:
1449#if SK_LOG
1450 if (__improbable((sk_verbose & SK_VERB_EVENTS) != 0)) {
1451 ch_event_log("exit", ch, p, na, events, revents);
1452 }
1453#endif /* SK_LOG */
1454
1455 /* unmark thread with sync-in-progress flag */
1456 sk_sync_unprotect(protect);
1457
1458done:
1459 ASSERT(!sk_is_sync_protected());
1460
1461 return revents;
1462#undef want_tx
1463#undef want_rx
1464}
1465
1466static struct kern_channel *
1467ch_find(struct kern_nexus *nx, nexus_port_t port, ring_id_t ring_id)
1468{
1469 struct kern_channel *ch;
1470
1471 SK_LOCK_ASSERT_HELD();
1472
1473 STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1474 struct ch_info *cinfo = ch->ch_info;
1475
1476 /* see comments in ch_open() */
1477 if (cinfo->cinfo_nx_port != port) {
1478 continue;
1479 } else if (cinfo->cinfo_ch_mode & CHMODE_MONITOR) {
1480 continue;
1481 } else if (cinfo->cinfo_ch_ring_id != CHANNEL_RING_ID_ANY &&
1482 ring_id != cinfo->cinfo_ch_ring_id &&
1483 ring_id != CHANNEL_RING_ID_ANY) {
1484 continue;
1485 }
1486
1487 /* found a match */
1488 break;
1489 }
1490
1491 if (ch != NULL) {
1492 ch_retain_locked(ch);
1493 }
1494
1495 return ch;
1496}
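
/*
 * Note that ch_find() skips monitor channels and returns the owning channel
 * for the {port, ring_id} tuple with an extra reference held; callers are
 * expected to drop it via ch_release_locked() once done (as ch_open() does
 * for ch0).
 */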
1497
1498#if SK_LOG
1499/* Hoisted out of line to reduce kernel stack footprint */
1500SK_LOG_ATTRIBUTE
1501static void
1502ch_open_log1(const uuid_t p_uuid, struct proc *p, nexus_port_t port)
1503{
1504 uuid_string_t uuidstr;
1505
1506 SK_D("%s(%d) uniqueid %llu exec_uuid %s port %u",
1507 sk_proc_name_address(p), sk_proc_pid(p), proc_uniqueid(p),
1508 sk_uuid_unparse(p_uuid, uuidstr), port);
1509}
1510
1511SK_LOG_ATTRIBUTE
1512static void
1513ch_open_log2(struct proc *p, nexus_port_t port, ring_id_t ring,
1514 uint32_t mode, const char *mode_bits, int err)
1515{
1516 SK_D("%s(%d) port %u ring %d mode 0x%b err %d",
1517 sk_proc_name_address(p), sk_proc_pid(p), port, (int)ring,
1518 mode, mode_bits, err);
1519}
1520#endif /* SK_LOG */
1521
1522struct kern_channel *
1523ch_open(struct ch_init *init, struct proc *p, int fd, int *err)
1524{
1525 uint32_t mode = init->ci_ch_mode;
1526 nexus_port_t port = init->ci_nx_port;
1527 ring_id_t ring = init->ci_ch_ring_id;
1528 struct kern_channel *ch = NULL, *ch0 = NULL;
1529 struct nxbind *nxb = NULL;
1530 struct kern_nexus *nx;
1531 struct chreq chr;
1532 uuid_t p_uuid;
1533 kauth_cred_t cred;
1534
1535 cred = kauth_cred_get();
1536 ASSERT(!uuid_is_null(init->ci_nx_uuid));
1537 proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1538 *err = 0;
1539
1540 /* make sure we don't allow userland to set kernel-only flags */
1541 mode &= CHMODE_MASK;
1542
1543 SK_LOCK();
1544
1545 nx = nx_find(init->ci_nx_uuid, TRUE);
1546 if (nx == NULL) {
1547 *err = ENOENT;
1548 goto done;
1549 }
1550
1551 /* port (zero-based) must be within the domain's range */
1552 if (port >= NXDOM_MAX(NX_DOM(nx), ports)) {
1553 *err = EDOM;
1554 goto done;
1555 }
1556 VERIFY(port != NEXUS_PORT_ANY);
1557
1558 if (mode & CHMODE_LOW_LATENCY) {
1559 if ((*err = skywalk_priv_check_cred(p, cred,
1560 PRIV_SKYWALK_LOW_LATENCY_CHANNEL)) != 0) {
1561 goto done;
1562 }
1563 }
1564
1565 /* "no copy" is valid only when at least one tx/rx mon flag is set */
1566 if (!(mode & CHMODE_MONITOR) && (mode & CHMODE_MONITOR_NO_COPY)) {
1567 mode &= ~CHMODE_MONITOR_NO_COPY;
1568 }
1569
1570 if (mode & CHMODE_MONITOR) {
1571 if ((*err = skywalk_priv_check_cred(p, cred,
1572 PRIV_SKYWALK_OBSERVE_ALL)) != 0) {
1573 goto done;
1574 }
1575 /* Don't allow non-root processes to monitor channels. */
1576 if (kauth_cred_issuser(cred: cred) == 0) {
1577 *err = EPERM;
1578 goto done;
1579 }
1580 }
1581
        /*
         * Check with the nexus to see if the port is bound; if so, prepare
         * our nxbind structure that we'll need to pass down to the nexus
         * for it to compare.  If the caller provides a key, we take it over
         * and will free it ourselves (as part of freeing the nxbind.)
         *
         * If this is a monitor channel, skip this altogether since the check
         * for the PRIV_SKYWALK_OBSERVE_ALL privilege has been done above.
         */
1591 if (!(mode & CHMODE_MONITOR) && !NX_ANONYMOUS_PROV(nx)) {
1592 void *key = (void *)(init->ci_key);
1593
1594#if SK_LOG
1595 if (__improbable(sk_verbose != 0)) {
1596 ch_open_log1(p_uuid, p, port);
1597 }
1598#endif /* SK_LOG */
1599
1600 nxb = nxb_alloc(Z_WAITOK);
1601 nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
1602 nxb->nxb_uniqueid = proc_uniqueid(p);
1603 nxb->nxb_pid = proc_pid(p);
1604 nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
1605 uuid_copy(dst: nxb->nxb_exec_uuid, src: p_uuid);
1606 if (key != NULL) {
1607 nxb->nxb_flags |= NXBF_MATCH_KEY;
1608 nxb->nxb_key_len = init->ci_key_len;
1609 nxb->nxb_key = key;
1610 init->ci_key = USER_ADDR_NULL; /* take over */
1611 }
1612 }
1613
1614 /*
1615 * There can only be one owner of {port,ring_id} tuple. Once
1616 * owned, this can be made available among multiple monitors.
1617 * CHANNEL_RING_ID_ANY (-1) ring_id gives exclusive rights over
1618 * all rings. Further attempts to own any or all of the rings
1619 * will be declined.
1620 *
1621 * Multiple monitors are allowed to exist. If a channel has been
1622 * bound to CHANNEL_RING_ID_ANY, any or all of its rings can be
1623 * monitored. If an owning channel has been bound to an individual
1624 * ring, only that ring can be monitored, either by specifying the
1625 * equivalent ring_id or CHANNEL_RING_ID_ANY at monitor open time.
1626 *
1627 * For example, assuming a 2-rings setup for port 'p':
1628 *
1629 * owner{p,-1}
1630 * will allow:
1631 * monitor{p,-1}, monitor{p,0}, monitor{p,1}
1632 * will not allow:
1633 * owner{p,-1}, owner{p,0}, owner{p,1}
1634 *
1635 * owner{p,0}
1636 * will allow:
1637 * owner{p,1}, monitor{p,-1}, monitor{p,0}
1638 * will not allow:
1639 * owner{p,-1}, owner{p,0}, monitor{p,1}
1640 */
1641 if ((ch0 = ch_find(nx, port, ring_id: ring)) != NULL) {
1642 SK_D("found ch0 0x%llx", SK_KVA(ch0));
1643 /*
1644 * Unless this is a monitor channel, allow only at
1645 * most one owner of the {port,ring_id} tuple.
1646 */
1647 if (!(mode & CHMODE_MONITOR)) {
1648#if SK_LOG
1649 uuid_string_t uuidstr;
1650 char *na_name = (ch0->ch_na != NULL) ?
1651 ch0->ch_na->na_name : "";
1652
1653 SK_DSC(p, "ch %s flags (0x%x) exists on port %d on "
1654 "nx %s, owner %s(%d)", na_name, ch0->ch_flags, port,
1655 sk_uuid_unparse(nx->nx_uuid, uuidstr),
1656 ch0->ch_name, ch0->ch_pid);
1657#endif /* SK_LOG */
1658 *err = EBUSY;
1659 goto done;
1660 }
1661 } else if (mode & CHMODE_MONITOR) {
1662 *err = ENXIO;
1663 goto done;
1664 }
1665
1666 bzero(s: &chr, n: sizeof(chr));
1667 chr.cr_tx_lowat = init->ci_tx_lowat;
1668 chr.cr_rx_lowat = init->ci_rx_lowat;
1669 chr.cr_port = port;
1670 chr.cr_mode = mode;
1671 chr.cr_ring_id = ring;
1672
1673 /* upon success, returns a channel with reference held */
1674 ch = ch_connect(nx, &chr, ch0, nxb, p, fd, err);
1675
1676done:
1677
1678#if SK_LOG
1679 if (__improbable(sk_verbose != 0)) {
1680 ch_open_log2(p, port, ring, mode, CHMODE_BITS, *err);
1681 }
1682#endif /* SK_LOG */
1683
1684 if (ch0 != NULL) {
1685 (void) ch_release_locked(ch0);
1686 }
1687
1688 if (nx != NULL) {
1689 (void) nx_release_locked(nx);
1690 }
1691
1692 if (nxb != NULL) {
1693 nxb_free(nxb);
1694 }
1695
1696 SK_UNLOCK();
1697
1698 return ch;
1699}
1700
1701struct kern_channel *
1702ch_open_special(struct kern_nexus *nx, struct chreq *chr, boolean_t nonxref,
1703 int *err)
1704{
1705 struct kern_channel *ch = NULL;
1706
1707 SK_LOCK_ASSERT_HELD();
1708 *err = 0;
1709
1710 ASSERT((chr->cr_mode & CHMODE_USER_PACKET_POOL) == 0);
1711 ASSERT((chr->cr_mode & CHMODE_EVENT_RING) == 0);
1712 ASSERT((chr->cr_mode & CHMODE_LOW_LATENCY) == 0);
1713 ASSERT(!uuid_is_null(chr->cr_spec_uuid));
1714 chr->cr_mode |= CHMODE_KERNEL;
1715 if (nonxref) {
1716 chr->cr_mode |= CHMODE_NO_NXREF;
1717 } else {
1718 chr->cr_mode &= ~CHMODE_NO_NXREF;
1719 }
1720
1721 /* upon success, returns a channel with reference held */
1722 ch = ch_connect(nx, chr, NULL, NULL, kernproc, -1, err);
1723 if (ch != NULL) {
1724 /*
1725 * nonxref channels don't hold any reference to the nexus,
1726 * since otherwise we'll never be able to close them when
1727 * the last regular channel of the nexus is closed, as part
1728 * of the nexus's destructor operation. Release the nonxref
1729 * channel reference now, but make sure the nexus has at
1730 * least 3 refs: global list, provider list and the nonxref
1731 * channel itself, before doing that.
1732 */
1733 if (nonxref) {
1734 ASSERT(ch->ch_flags & (CHANF_KERNEL | CHANF_NONXREF));
1735 ASSERT(nx->nx_refcnt > 3);
1736 (void) nx_release_locked(nx);
1737 }
1738 }
1739
1740#if SK_LOG
1741 uuid_string_t uuidstr;
1742 SK_D("nx 0x%llx (%s:\"%s\":%d:%d) spec_uuid \"%s\" mode 0x%b err %d",
1743 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, (ch != NULL ?
1744 ch->ch_na->na_name : ""), (int)chr->cr_port, (int)chr->cr_ring_id,
1745 sk_uuid_unparse(chr->cr_spec_uuid, uuidstr), chr->cr_mode,
1746 CHMODE_BITS, *err);
1747#endif /* SK_LOG */
1748
1749 return ch;
1750}
1751
1752static void
1753ch_close_common(struct kern_channel *ch, boolean_t locked, boolean_t special)
1754{
1755#pragma unused(special)
1756#if SK_LOG
1757 uuid_string_t uuidstr;
1758 const char *na_name = (ch->ch_na != NULL) ?
1759 ch->ch_na->na_name : "";
1760 const char *nxdom_name = (ch->ch_nexus != NULL) ?
1761 NX_DOM(ch->ch_nexus)->nxdom_name : "";
1762 const char *nxdom_prov_name = (ch->ch_nexus != NULL) ?
1763 NX_DOM_PROV(ch->ch_nexus)->nxdom_prov_name : "";
1764
1765 SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
1766 SK_KVA(ch), nxdom_name, nxdom_prov_name, na_name,
1767 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1768 SK_D(" UUID: %s", sk_uuid_unparse(ch->ch_info->cinfo_ch_id,
1769 uuidstr));
1770 SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS);
1771#endif /* SK_LOG */
1772 struct kern_nexus *nx = ch->ch_nexus;
1773
1774 if (!locked) {
1775 SK_LOCK();
1776 }
1777
1778 SK_LOCK_ASSERT_HELD();
1779 /*
1780 * If the channel is participating in the interface advisory
1781 * notification, remove it from the nexus.
1782 * CHANF_IF_ADV is set and cleared only when nx_ch_if_adv_lock
1783 * is held in exclusive mode.
1784 */
1785 lck_rw_lock_exclusive(lck: &nx->nx_ch_if_adv_lock);
1786 if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
1787 STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch,
1788 kern_channel, ch_link_if_adv);
1789 os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
1790 if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
1791 nx_netif_config_interface_advisory(nx, false);
1792 }
1793 lck_rw_done(lck: &nx->nx_ch_if_adv_lock);
1794 lck_mtx_lock(lck: &ch->ch_lock);
1795 (void) ch_release_locked(ch);
1796 } else {
1797 lck_rw_done(lck: &nx->nx_ch_if_adv_lock);
1798 lck_mtx_lock(lck: &ch->ch_lock);
1799 }
1800 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1801 /*
1802 * Mark the channel as closing to prevent further setopt requests;
1803 * this flag is set once here and never gets cleared.
1804 */
1805 ASSERT(!(ch->ch_flags & CHANF_CLOSING));
1806 os_atomic_or(&ch->ch_flags, CHANF_CLOSING, relaxed);
1807
1808 if (special) {
1809 VERIFY(ch->ch_flags & CHANF_KERNEL);
1810 } else {
1811 VERIFY(!(ch->ch_flags & CHANF_KERNEL));
1812 }
1813
1814 ch->ch_fd = -1;
1815
1816 /* may be called as part of failure cleanup, so check */
1817 if (ch->ch_flags & CHANF_ATTACHED) {
1818 boolean_t nonxref = !!(ch->ch_flags & CHANF_NONXREF);
1819
1820 /* caller must hold an extra ref */
1821 ASSERT(ch->ch_refcnt > 1);
1822
1823 /* disconnect from nexus */
1824 ch_disconnect(ch);
1825
1826 /*
1827 * If this was the last regular channel and the nexus
1828 * has been closed, detach it and finish up the job.
1829 * If this was a nonxref channel, there is nothing
1830 * left to do; see comments in ch_open_special().
1831 */
1832 if (!nonxref) {
1833 STAILQ_REMOVE(&nx->nx_ch_head, ch,
1834 kern_channel, ch_link);
1835 nx->nx_ch_count--;
1836 if (STAILQ_EMPTY(&nx->nx_ch_head) &&
1837 (nx->nx_flags & NXF_CLOSED)) {
1838 ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
1839 nx_detach(nx);
1840 }
1841 (void) nx_release_locked(nx);
1842 } else {
1843 ASSERT(ch->ch_flags & CHANF_KERNEL);
1844 STAILQ_REMOVE(&nx->nx_ch_nonxref_head, ch,
1845 kern_channel, ch_link);
1846 }
1847
1848 os_atomic_andnot(&ch->ch_flags, CHANF_ATTACHED, relaxed);
1849 ch->ch_nexus = NULL;
1850
1851 (void) ch_release_locked(ch); /* for the list */
1852 }
1853
1854 lck_mtx_unlock(lck: &ch->ch_lock);
1855 if (!locked) {
1856 SK_UNLOCK();
1857 }
1858}
1859
1860void
1861ch_close(struct kern_channel *ch, boolean_t locked)
1862{
1863 ch_close_common(ch, locked, FALSE);
1864}
1865
1866void
1867ch_close_special(struct kern_channel *ch)
1868{
1869 ch_close_common(ch, TRUE, TRUE);
1870}

static int
ch_ev_thresh_validate(struct kern_nexus *nx, enum txrx t,
    struct ch_ev_thresh *cet)
{
	struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
	uint32_t bmin, bmax, smin, smax;
	int err = 0;

	if (cet->cet_unit != CHANNEL_THRESHOLD_UNIT_BYTES &&
	    cet->cet_unit != CHANNEL_THRESHOLD_UNIT_SLOTS) {
		err = EINVAL;
		goto done;
	}

	smin = 1;	/* minimum 1 slot */
	bmin = 1;	/* minimum 1 byte */

	if (t == NR_TX) {
		ASSERT(nxp->nxp_tx_slots > 0);
		smax = (nxp->nxp_tx_slots - 1);
	} else {
		ASSERT(nxp->nxp_rx_slots > 0);
		smax = (nxp->nxp_rx_slots - 1);
	}
	bmax = (smax * nxp->nxp_buf_size);

	switch (cet->cet_unit) {
	case CHANNEL_THRESHOLD_UNIT_BYTES:
		if (cet->cet_value < bmin) {
			cet->cet_value = bmin;
		} else if (cet->cet_value > bmax) {
			cet->cet_value = bmax;
		}
		break;

	case CHANNEL_THRESHOLD_UNIT_SLOTS:
		if (cet->cet_value < smin) {
			cet->cet_value = smin;
		} else if (cet->cet_value > smax) {
			cet->cet_value = smax;
		}
		break;
	}

done:
	return err;
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
ch_connect_log1(const struct kern_nexus *nx, const struct ch_info *cinfo,
    const struct chreq *chr, const struct kern_channel *ch,
    const struct kern_nexus_domain_provider *nxdom_prov,
    struct proc *p)
{
	struct __user_channel_schema *ch_schema = ch->ch_schema;
	uuid_string_t uuidstr;
	unsigned int n;
	ring_id_t i, j;

	ASSERT(ch_schema != NULL || (ch->ch_flags & CHANF_KERNEL));
	if (ch_schema != NULL) {
		SK_D("channel_schema at 0x%llx", SK_KVA(ch_schema));
		SK_D(" kern_name: \"%s\"", ch_schema->csm_kern_name);
		SK_D(" kern_uuid: %s",
		    sk_uuid_unparse(ch_schema->csm_kern_uuid, uuidstr));
		SK_D(" flags: 0x%b", ch_schema->csm_flags, CSM_BITS);
		SK_D(" tx_rings: %u [%u,%u]", ch_schema->csm_tx_rings,
		    cinfo->cinfo_first_tx_ring, cinfo->cinfo_last_tx_ring);
		SK_D(" rx_rings: %u [%u,%u]", ch_schema->csm_rx_rings,
		    cinfo->cinfo_first_rx_ring, cinfo->cinfo_last_rx_ring);

		j = ch->ch_last[NR_TX];
		for (n = 0, i = ch->ch_first[NR_TX]; i < j; n++, i++) {
			SK_D(" tx_ring_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n].ring_off);
			SK_D(" tx_sd_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n].sd_off);
		}
		j = n;
		for (n = 0, i = ch->ch_first[NR_RX];
		    i < ch->ch_last[NR_RX]; n++, i++) {
			SK_D(" rx_ring_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n + j].ring_off);
			SK_D(" rx_sd_%u_off: 0x%llx", i,
			    (uint64_t)ch_schema->csm_ring_ofs[n + j].sd_off);
		}
		SK_D(" md_type: %u", ch_schema->csm_md_type);
		SK_D(" md_subtype: %u", ch_schema->csm_md_subtype);
		SK_D(" stats_ofs: 0x%llx", ch_schema->csm_stats_ofs);
		SK_D(" stats_type: %u", ch_schema->csm_stats_type);
		SK_D(" flowadv_ofs: 0x%llx", ch_schema->csm_flowadv_ofs);
		SK_D(" flowadv_max: %u", ch_schema->csm_flowadv_max);
		SK_D(" nexusadv_ofs: 0x%llx", ch_schema->csm_nexusadv_ofs);
	}

	SK_D("ch 0x%llx (%s:%s:\"%s\":%u:%d)",
	    SK_KVA(ch), nxdom_prov->nxdom_prov_dom->nxdom_name,
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    cinfo->cinfo_nx_port, (int)cinfo->cinfo_ch_ring_id);
	SK_D(" ch UUID: %s", sk_uuid_unparse(cinfo->cinfo_ch_id, uuidstr));
	SK_D(" nx UUID: %s", sk_uuid_unparse(nx->nx_uuid, uuidstr));
	SK_D(" flags: 0x%b", ch->ch_flags, CHANF_BITS);
	SK_D(" task: 0x%llx %s(%d)", SK_KVA(ch->ch_mmap.ami_maptask),
	    sk_proc_name_address(p), sk_proc_pid(p));
	SK_D(" txlowat: %u (%s)", cinfo->cinfo_tx_lowat.cet_value,
	    ((cinfo->cinfo_tx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
	    "bytes" : "slots"));
	SK_D(" rxlowat: %u (%s)", cinfo->cinfo_rx_lowat.cet_value,
	    ((cinfo->cinfo_rx_lowat.cet_unit == CHANNEL_THRESHOLD_UNIT_BYTES) ?
	    "bytes" : "slots"));
	SK_D(" mmapref: 0x%llx", SK_KVA(ch->ch_mmap.ami_mapref));
	SK_D(" mapaddr: 0x%llx", (uint64_t)cinfo->cinfo_mem_base);
	SK_D(" mapsize: 0x%llx (%llu KB)",
	    (uint64_t)cinfo->cinfo_mem_map_size,
	    (uint64_t)cinfo->cinfo_mem_map_size >> 10);
	SK_D(" memsize: 0x%llx (%llu KB)",
	    (uint64_t)chr->cr_memsize, (uint64_t)chr->cr_memsize >> 10);
	SK_D(" offset: 0x%llx", (uint64_t)cinfo->cinfo_schema_offset);
}

SK_LOG_ATTRIBUTE
static void
ch_connect_log2(const struct kern_nexus *nx, int err)
{
	uuid_string_t nx_uuidstr;

	SK_ERR("Error connecting to nexus UUID %s: %d",
	    sk_uuid_unparse(nx->nx_uuid, nx_uuidstr), err);
}
#endif /* SK_LOG */

static struct kern_channel *
ch_connect(struct kern_nexus *nx, struct chreq *chr, struct kern_channel *ch0,
    struct nxbind *nxb, struct proc *p, int fd, int *err)
{
	struct kern_nexus_domain_provider *nxdom_prov;
	struct kern_channel *ch = NULL;
	struct ch_info *cinfo = NULL;
	uint32_t ch_mode = chr->cr_mode;
	boolean_t config = FALSE;
	struct nxdom *nxdom;
	boolean_t reserved_port = FALSE;

	ASSERT(!(ch_mode & CHMODE_KERNEL) || p == kernproc);
	ASSERT(chr->cr_port != NEXUS_PORT_ANY || (ch_mode & CHMODE_KERNEL));
	SK_LOCK_ASSERT_HELD();

	/* validate thresholds before we proceed any further */
	if ((*err = ch_ev_thresh_validate(nx, NR_TX, &chr->cr_tx_lowat)) != 0 ||
	    (*err = ch_ev_thresh_validate(nx, NR_RX, &chr->cr_rx_lowat)) != 0) {
		goto done;
	}

	if (!(ch_mode & CHMODE_KERNEL) && !NX_USER_CHANNEL_PROV(nx)) {
		*err = ENOTSUP;
		goto done;
	}

	ch = ch_alloc(Z_WAITOK);

	lck_mtx_lock(&ch->ch_lock);

	uuid_generate_random(ch->ch_info->cinfo_ch_id);
	ch->ch_fd = fd;
	ch->ch_pid = proc_pid(p);
	(void) snprintf(ch->ch_name, sizeof(ch->ch_name), "%s",
	    proc_name_address(p));

	nxdom_prov = NX_DOM_PROV(nx);
	nxdom = NX_DOM(nx);

	if (ch_mode & (CHMODE_KERNEL | CHMODE_NO_NXREF)) {
		/*
		 * CHANF_KERNEL implies a channel opened by a kernel
		 * subsystem, and is triggered by the CHMODE_KERNEL
		 * flag, which is only ever set by ch_open_special().
		 *
		 * CHANF_NONXREF can optionally be set based on the
		 * CHMODE_NO_NXREF request flag.  It too must only be
		 * set by ch_open_special(), hence we verify.
		 */
		ASSERT(p == kernproc);
		ASSERT(ch_mode & CHMODE_KERNEL);
		os_atomic_or(&ch->ch_flags, CHANF_KERNEL, relaxed);
		if (ch_mode & CHMODE_NO_NXREF) {
			os_atomic_or(&ch->ch_flags, CHANF_NONXREF, relaxed);
		}

		config = (ch_mode & CHMODE_CONFIG) != 0;
		if (chr->cr_port == NEXUS_PORT_ANY) {
			if (nxdom->nxdom_find_port == NULL) {
				*err = ENOTSUP;
				goto done;
			}

			/*
			 * This is an ephemeral port request; find one for
			 * the client, asking for the reserved port range
			 * if this is a configuration request (CHMODE_CONFIG).
			 */
			if ((*err = nxdom->nxdom_find_port(nx,
			    config, &chr->cr_port)) != 0) {
				goto done;
			}
		}
	}

	if (skywalk_check_platform_binary(p)) {
		os_atomic_or(&ch->ch_flags, CHANF_PLATFORM, relaxed);
	}

	ASSERT(chr->cr_port != NEXUS_PORT_ANY);

	reserved_port = (nxdom->nxdom_port_is_reserved != NULL &&
	    (*nxdom->nxdom_port_is_reserved)(nx, chr->cr_port));
	if (!config && reserved_port) {
		*err = EDOM;
		goto done;
	}

	SK_D("%s(%d) %snexus port %u requested", sk_proc_name_address(p),
	    sk_proc_pid(p), reserved_port ? "[reserved] " : "", chr->cr_port);

	if ((*err = nxdom_prov->nxdom_prov_dom->nxdom_connect(nxdom_prov,
	    nx, ch, chr, ch0, nxb, p)) != 0) {
		goto done;
	}

	cinfo = ch->ch_info;
	uuid_copy(cinfo->cinfo_nx_uuid, nx->nx_uuid);
	/* for easy access to immutables */
	bcopy((void *)nx->nx_prov->nxprov_params,
	    (void *)&cinfo->cinfo_nxprov_params, sizeof(struct nxprov_params));
	cinfo->cinfo_ch_mode = ch_mode;
	cinfo->cinfo_ch_ring_id = chr->cr_ring_id;
	cinfo->cinfo_nx_port = chr->cr_port;
	cinfo->cinfo_mem_base = ch->ch_mmap.ami_mapaddr;
	cinfo->cinfo_mem_map_size = ch->ch_mmap.ami_mapsize;
	cinfo->cinfo_schema_offset = chr->cr_memoffset;
	cinfo->cinfo_num_bufs =
	    PP_BUF_REGION_DEF(skmem_arena_nexus(ch->ch_na->na_arena)->arn_rx_pp)->skr_params.srp_c_obj_cnt;
	/*
	 * ch_last is really the number of rings, but we need to return
	 * the actual zero-based ring ID to the client.  Make sure that
	 * is the case here and adjust last_{tx,rx}_ring accordingly.
	 */
	ASSERT((ch->ch_last[NR_TX] > 0) ||
	    (ch->ch_na->na_type == NA_NETIF_COMPAT_DEV));
	ASSERT((ch->ch_last[NR_RX] > 0) ||
	    (ch->ch_na->na_type == NA_NETIF_COMPAT_HOST));
	cinfo->cinfo_first_tx_ring = ch->ch_first[NR_TX];
	cinfo->cinfo_last_tx_ring = ch->ch_last[NR_TX] - 1;
	cinfo->cinfo_first_rx_ring = ch->ch_first[NR_RX];
	cinfo->cinfo_last_rx_ring = ch->ch_last[NR_RX] - 1;
	cinfo->cinfo_tx_lowat = chr->cr_tx_lowat;
	cinfo->cinfo_rx_lowat = chr->cr_rx_lowat;

	if (ch_mode & CHMODE_NO_NXREF) {
		ASSERT(ch_mode & CHMODE_KERNEL);
		STAILQ_INSERT_TAIL(&nx->nx_ch_nonxref_head, ch, ch_link);
	} else {
		STAILQ_INSERT_TAIL(&nx->nx_ch_head, ch, ch_link);
		nx->nx_ch_count++;
	}
	os_atomic_or(&ch->ch_flags, CHANF_ATTACHED, relaxed);
	ch->ch_nexus = nx;
	nx_retain_locked(nx);	/* hold a ref on the nexus */

	ch_retain_locked(ch);	/* one for being in the list */
	ch_retain_locked(ch);	/* one for the caller */

	/*
	 * Now that we've successfully created the nexus adapter, inform the
	 * nexus provider about the rings and the slots within each ring.
	 * This is a no-op for internal nexus providers.
	 */
	if ((*err = nxprov_advise_connect(nx, ch, p)) != 0) {
		lck_mtx_unlock(&ch->ch_lock);

		/* gracefully close this fully-formed channel */
		if (ch->ch_flags & CHANF_KERNEL) {
			ch_close_special(ch);
		} else {
			ch_close(ch, TRUE);
		}
		(void) ch_release_locked(ch);
		ch = NULL;
		goto done;
	}

	ASSERT(ch->ch_schema == NULL ||
	    (ch->ch_schema->csm_flags & CSM_ACTIVE));

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		ch_connect_log1(nx, cinfo, chr, ch, nxdom_prov, p);
	}
#endif /* SK_LOG */

done:
	if (ch != NULL) {
		lck_mtx_unlock(&ch->ch_lock);
	}
	if (*err != 0) {
#if SK_LOG
		if (__improbable(sk_verbose != 0)) {
			ch_connect_log2(nx, *err);
		}
#endif /* SK_LOG */
		if (ch != NULL) {
			ch_free(ch);
			ch = NULL;
		}
	}
	return ch;
}

static void
ch_disconnect(struct kern_channel *ch)
{
	struct kern_nexus *nx = ch->ch_nexus;
	struct kern_nexus_domain_provider *nxdom_prov = NX_DOM_PROV(nx);

	SK_LOCK_ASSERT_HELD();
	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	/*
	 * Inform the nexus provider that the channel has been quiesced
	 * and disconnected from the nexus port.  This is a no-op for
	 * internal nexus providers.
	 */
	nxprov_advise_disconnect(nx, ch);

	/* Finally, let the domain provider tear down the instance */
	nxdom_prov->nxdom_prov_dom->nxdom_disconnect(nxdom_prov, nx, ch);
}

void
ch_deactivate(struct kern_channel *ch)
{
	/*
	 * This is a trapdoor flag; once CSM_ACTIVE is cleared,
	 * it will never be set again.  Doing this will cause
	 * os_channel_is_defunct() to indicate that the channel
	 * is defunct and is no longer usable (thus should be
	 * immediately closed).
	 */
	if (ch->ch_schema != NULL &&
	    (ch->ch_schema->csm_flags & CSM_ACTIVE)) {
		os_atomic_andnot(__DECONST(uint32_t *, &ch->ch_schema->csm_flags),
		    CSM_ACTIVE, relaxed);
		/* make this globally visible */
		os_atomic_thread_fence(seq_cst);
	}
}

int
ch_set_opt(struct kern_channel *ch, struct sockopt *sopt)
{
#pragma unused(ch)
	int err = 0;

	if (sopt->sopt_dir != SOPT_SET) {
		sopt->sopt_dir = SOPT_SET;
	}

	switch (sopt->sopt_name) {
	case CHOPT_TX_LOWAT_THRESH:
		err = ch_set_lowat_thresh(ch, NR_TX, sopt);
		break;

	case CHOPT_RX_LOWAT_THRESH:
		err = ch_set_lowat_thresh(ch, NR_RX, sopt);
		break;

	case CHOPT_IF_ADV_CONF:
		err = ch_configure_interface_advisory_event(ch, sopt);
		break;

	default:
		err = ENOPROTOOPT;
		break;
	}

	return err;
}

int
ch_get_opt(struct kern_channel *ch, struct sockopt *sopt)
{
#pragma unused(ch)
	int err = 0;

	if (sopt->sopt_dir != SOPT_GET) {
		sopt->sopt_dir = SOPT_GET;
	}

	switch (sopt->sopt_name) {
	case CHOPT_TX_LOWAT_THRESH:
		err = ch_get_lowat_thresh(ch, NR_TX, sopt);
		break;

	case CHOPT_RX_LOWAT_THRESH:
		err = ch_get_lowat_thresh(ch, NR_RX, sopt);
		break;

	default:
		err = ENOPROTOOPT;
		break;
	}

	return err;
}

static int
ch_configure_interface_advisory_event(struct kern_channel *ch,
    struct sockopt *sopt)
{
	int err = 0;
	boolean_t enable = 0;
	struct kern_nexus *nx = ch->ch_nexus;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	SK_LOCK_ASSERT_NOTHELD();

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}
	if (nx->nx_adv.nxv_adv == NULL) {
		return ENOTSUP;
	}
	err = sooptcopyin(sopt, &enable, sizeof(enable), sizeof(enable));
	if (err != 0) {
		return err;
	}

	/*
	 * Drop ch_lock to acquire sk_lock and nx_ch_if_adv_lock due to the
	 * lock ordering requirement; check whether the channel is closing
	 * once ch_lock is reacquired, and bail if so.
	 */
	lck_mtx_unlock(&ch->ch_lock);
	SK_LOCK();
	lck_rw_lock_exclusive(&nx->nx_ch_if_adv_lock);
	lck_mtx_lock(&ch->ch_lock);
	if (ch->ch_flags & CHANF_CLOSING) {
		err = ENXIO;
		goto done;
	}

	/*
	 * If interface advisory reporting is being enabled on the channel,
	 * add the channel to the list of channels eligible for interface
	 * advisory updates on the nexus.  If it is being disabled, remove
	 * the channel from that list.
	 */
	if (enable) {
		if ((ch->ch_flags & CHANF_IF_ADV) != 0) {
			ASSERT(err == 0);
			goto done;
		}
		bool enable_adv = STAILQ_EMPTY(&nx->nx_ch_if_adv_head);
		os_atomic_or(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		STAILQ_INSERT_TAIL(&nx->nx_ch_if_adv_head, ch, ch_link_if_adv);
		if (enable_adv) {
			nx_netif_config_interface_advisory(nx, true);
		}
		ch_retain_locked(ch);	/* for being in the IF ADV list */
	} else {
		if ((ch->ch_flags & CHANF_IF_ADV) == 0) {
			ASSERT(err == 0);
			goto done;
		}
		STAILQ_REMOVE(&nx->nx_ch_if_adv_head, ch, kern_channel,
		    ch_link_if_adv);
		os_atomic_andnot(&ch->ch_flags, CHANF_IF_ADV, relaxed);
		if (STAILQ_EMPTY(&nx->nx_ch_if_adv_head)) {
			nx_netif_config_interface_advisory(nx, false);
		}
		(void) ch_release_locked(ch);
	}

done:
	lck_mtx_unlock(&ch->ch_lock);
	lck_rw_done(&nx->nx_ch_if_adv_lock);
	SK_UNLOCK();
	lck_mtx_lock(&ch->ch_lock);

	return err;
}

static int
ch_set_lowat_thresh(struct kern_channel *ch, enum txrx t,
    struct sockopt *sopt)
{
	struct ch_ev_thresh cet, *ocet;
	int err = 0;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	bzero(&cet, sizeof(cet));
	err = sooptcopyin(sopt, &cet, sizeof(cet), sizeof(cet));
	if (err == 0) {
		err = ch_ev_thresh_validate(ch->ch_nexus, t, &cet);
		if (err == 0) {
			if (t == NR_TX) {
				ocet = &ch->ch_info->cinfo_tx_lowat;
			} else {
				ocet = &ch->ch_info->cinfo_rx_lowat;
			}

			/* if there is no change, we're done */
			if (ocet->cet_unit == cet.cet_unit &&
			    ocet->cet_value == cet.cet_value) {
				return 0;
			}

			*ocet = cet;

			for_rx_tx(t) {
				ring_id_t qfirst = ch->ch_first[t];
				ring_id_t qlast = ch->ch_last[t];
				uint32_t i;

				for (i = qfirst; i < qlast; i++) {
					struct __kern_channel_ring *kring =
					    &NAKR(ch->ch_na, t)[i];

					(void) kring->ckr_na_notify(kring,
					    sopt->sopt_p, 0);
				}
			}

			(void) sooptcopyout(sopt, &cet, sizeof(cet));
		}
	}

	return err;
}

static int
ch_get_lowat_thresh(struct kern_channel *ch, enum txrx t,
    struct sockopt *sopt)
{
	struct ch_ev_thresh cet;

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);

	if (sopt->sopt_val == USER_ADDR_NULL) {
		return EINVAL;
	}

	if (t == NR_TX) {
		cet = ch->ch_info->cinfo_tx_lowat;
	} else {
		cet = ch->ch_info->cinfo_rx_lowat;
	}

	return sooptcopyout(sopt, &cet, sizeof(cet));
}

static struct kern_channel *
ch_alloc(zalloc_flags_t how)
{
	struct kern_channel *ch;

	ch = zalloc_flags(ch_zone, how | Z_ZERO);
	if (ch) {
		lck_mtx_init(&ch->ch_lock, &channel_lock_group,
		    &channel_lock_attr);
		ch->ch_info = zalloc_flags(ch_info_zone, how | Z_ZERO);
	}
	return ch;
}

static void
ch_free(struct kern_channel *ch)
{
	ASSERT(ch->ch_refcnt == 0);
	ASSERT(ch->ch_pp == NULL);
	ASSERT(!(ch->ch_flags & (CHANF_ATTACHED | CHANF_EXT_CONNECTED |
	    CHANF_EXT_PRECONNECT | CHANF_IF_ADV)));
	lck_mtx_destroy(&ch->ch_lock, &channel_lock_group);
	SK_DF(SK_VERB_MEM, "ch 0x%llx FREE", SK_KVA(ch));
	ASSERT(ch->ch_info != NULL);
	zfree(ch_info_zone, ch->ch_info);
	ch->ch_info = NULL;
	zfree(ch_zone, ch);
}

void
ch_retain_locked(struct kern_channel *ch)
{
	SK_LOCK_ASSERT_HELD();

	ch->ch_refcnt++;
	VERIFY(ch->ch_refcnt != 0);
}

void
ch_retain(struct kern_channel *ch)
{
	SK_LOCK();
	ch_retain_locked(ch);
	SK_UNLOCK();
}

int
ch_release_locked(struct kern_channel *ch)
{
	int oldref = ch->ch_refcnt;

	SK_LOCK_ASSERT_HELD();

	VERIFY(ch->ch_refcnt != 0);
	if (--ch->ch_refcnt == 0) {
		ch_free(ch);
	}

	return oldref == 1;
}

int
ch_release(struct kern_channel *ch)
{
	int lastref;

	SK_LOCK();
	lastref = ch_release_locked(ch);
	SK_UNLOCK();

	return lastref;
}

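/*
 * ch_dtor() is the channel's destructor callback: it is invoked once the
 * channel's owner is finally done with it (typically on the last close of
 * the owning descriptor for a user channel), closes the channel under
 * sk_lock, and drops the reference that ch_connect() took on behalf of
 * the caller.
 */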
void
ch_dtor(void *arg)
{
	struct kern_channel *ch = arg;

	SK_LOCK();
	ch_close(ch, TRUE);
	(void) ch_release_locked(ch);
	SK_UNLOCK();
}