1/*
2 * Copyright (c) 2015-2022 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <skywalk/os_skywalk_private.h>
30#include <skywalk/nexus/netif/nx_netif.h>
31#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
32#include <sys/sdt.h>
33
34static uint32_t disable_nxctl_check = 0;
35#if (DEVELOPMENT || DEBUG)
36SYSCTL_UINT(_kern_skywalk, OID_AUTO, disable_nxctl_check,
37 CTLFLAG_RW | CTLFLAG_LOCKED, &disable_nxctl_check, 0, "");
38#endif
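
/*
 * Usage sketch (assuming the conventional sysctl naming derived from the
 * _kern_skywalk parent node; not something this file spells out): on
 * DEVELOPMENT/DEBUG kernels the knob above surfaces as
 * "kern.skywalk.disable_nxctl_check" and can be set from user space,
 * e.g. sysctl kern.skywalk.disable_nxctl_check=1, to relax the nxctl
 * ownership checks in nxctl_nexus_bind() and nxctl_nexus_config() below.
 */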
39
40LCK_GRP_DECLARE(nexus_lock_group, "sk_nx_lock");
41LCK_GRP_DECLARE(nexus_mbq_lock_group, "sk_nx_mbq_lock");
42LCK_GRP_DECLARE(nexus_pktq_lock_group, "sk_nx_pktq_lock");
43LCK_ATTR_DECLARE(nexus_lock_attr, 0, 0);
44
45static STAILQ_HEAD(, nxctl) nxctl_head =
46 STAILQ_HEAD_INITIALIZER(nxctl_head);
47static STAILQ_HEAD(, kern_nexus_provider) nxprov_head =
48 STAILQ_HEAD_INITIALIZER(nxprov_head);
49
50static int nx_cmp(const struct kern_nexus *, const struct kern_nexus *);
51RB_HEAD(kern_nexus_tree, kern_nexus);
52RB_PROTOTYPE_SC(static, kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
53RB_GENERATE(kern_nexus_tree, kern_nexus, nx_link, nx_cmp);
54static struct kern_nexus_tree nx_head;
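
/*
 * All nexus instances are indexed in nx_head, an RB tree keyed by
 * nx_uuid (see nx_cmp()).  A minimal lookup sketch, mirroring what
 * nx_find() and nx_destroy() do below, uses a stack-allocated key:
 *
 *	struct kern_nexus find;
 *	struct kern_nexus *nx;
 *
 *	uuid_copy(find.nx_uuid, nx_uuid);
 *	nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
 *
 * SK_LOCK must be held across the lookup, and the caller takes its own
 * reference on the result (nx_retain_locked()) before using it.
 */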
55
56static int nxctl_get_nexus_prov_list(struct nxctl *, struct sockopt *);
57static int nxctl_get_nexus_prov_entry(struct nxctl *, struct sockopt *);
58static int nxctl_get_nexus_list(struct nxctl *, struct sockopt *);
59static int nxctl_nexus_bind(struct nxctl *, struct sockopt *);
60static int nxctl_nexus_unbind(struct nxctl *, struct sockopt *);
61static int nxctl_nexus_config(struct nxctl *, struct sockopt *);
62static int nxctl_get_channel_list(struct nxctl *, struct sockopt *);
63static void nxctl_retain_locked(struct nxctl *);
64static int nxctl_release_locked(struct nxctl *);
65static void nxctl_init(struct nxctl *, struct proc *, struct fileproc *);
66static struct nxctl *nxctl_alloc(struct proc *, struct fileproc *, zalloc_flags_t);
67static void nxctl_free(struct nxctl *);
68
69static struct kern_nexus_provider *nxprov_create_common(struct nxctl *,
70 struct kern_nexus_domain_provider *, struct nxprov_reg *,
71 const struct kern_nexus_provider_init *init, int *);
72static void nxprov_detach(struct kern_nexus_provider *, boolean_t);
73static void nxprov_retain_locked(struct kern_nexus_provider *);
74static int nxprov_release_locked(struct kern_nexus_provider *);
75static struct kern_nexus_provider *nxprov_alloc(
76 struct kern_nexus_domain_provider *, zalloc_flags_t);
77static void nxprov_free(struct kern_nexus_provider *);
78
79static int nx_init_rings(struct kern_nexus *, struct kern_channel *);
80static void nx_fini_rings(struct kern_nexus *, struct kern_channel *);
81static int nx_init_slots(struct kern_nexus *, struct __kern_channel_ring *);
82static void nx_fini_slots(struct kern_nexus *, struct __kern_channel_ring *);
83static struct kern_nexus *nx_alloc(zalloc_flags_t);
84static void nx_free(struct kern_nexus *);
85
86static SKMEM_TYPE_DEFINE(nxctl_zone, struct nxctl);
87
88static SKMEM_TYPE_DEFINE(nxbind_zone, struct nxbind);
89
90static SKMEM_TYPE_DEFINE(nxprov_zone, struct kern_nexus_provider);
91
92static SKMEM_TYPE_DEFINE(nxprov_params_zone, struct nxprov_params);
93
94static SKMEM_TYPE_DEFINE(nx_zone, struct kern_nexus);
95
96static int __nx_inited = 0;
97
98#define SKMEM_TAG_NX_KEY "com.apple.skywalk.nexus.key"
99SKMEM_TAG_DEFINE(skmem_tag_nx_key, SKMEM_TAG_NX_KEY);
100
101#define SKMEM_TAG_NX_MIB "com.apple.skywalk.nexus.mib"
102static SKMEM_TAG_DEFINE(skmem_tag_nx_mib, SKMEM_TAG_NX_MIB);
103
104#define SKMEM_TAG_NX_PORT "com.apple.skywalk.nexus.port"
105SKMEM_TAG_DEFINE(skmem_tag_nx_port, SKMEM_TAG_NX_PORT);
106
107#define SKMEM_TAG_NX_PORT_INFO "com.apple.skywalk.nexus.port.info"
108SKMEM_TAG_DEFINE(skmem_tag_nx_port_info, SKMEM_TAG_NX_PORT_INFO);
109
/*
 * Special nexus controller handles for Skywalk internal use. Unlike all
 * other nexus controller handles, which are created by userland or kernel
 * clients, these never get closed or freed. They are also not part of
 * the global nxctl_head list.
 */
116static struct nxctl _kernnxctl;
117static struct nxctl _usernxctl;
118struct nexus_controller kernnxctl = { .ncd_nxctl = &_kernnxctl };
119struct nexus_controller usernxctl = { .ncd_nxctl = &_usernxctl };
120
121int
122nexus_init(void)
123{
124 SK_LOCK_ASSERT_HELD();
125 ASSERT(!__nx_inited);
126
127 RB_INIT(&nx_head);
128
129 na_init();
130
131 /* attach system built-in domains and domain providers */
132 nxdom_attach_all();
133
	/*
	 * Initialize the private kernel and shared user nexus controller
	 * handles.
	 *
	 * The shared kernel controller is used internally for creating nexus
	 * providers and nexus instances from within the Skywalk code
	 * (e.g. netif_compat).
	 *
	 * The shared user controller is used by userspace clients
	 * (e.g. libnetcore) that need to configure nexus instances they own
	 * indirectly (e.g. flow entries set up via NECP); the nexus then
	 * performs its permission checks based on other information
	 * (e.g. PID, UUID) and bypasses the nxctl check, since this nxctl
	 * carries no credentials.
	 */
147 nxctl_init(&_kernnxctl, kernproc, NULL);
148 nxctl_retain_locked(&_kernnxctl); /* one for us */
149 nxctl_init(&_usernxctl, kernproc, NULL);
150 nxctl_retain_locked(&_usernxctl); /* one for us */
151 nxctl_traffic_rule_init();
152
153 __nx_inited = 1;
154
155 return 0;
156}
157
158void
159nexus_fini(void)
160{
161 SK_LOCK_ASSERT_HELD();
162
163 if (__nx_inited) {
164 nxctl_traffic_rule_fini();
165 nxctl_release_locked(&_kernnxctl);
166 nxctl_release_locked(&_usernxctl);
167
168 /* tell all domains they're going away */
169 nxdom_detach_all();
170
171 ASSERT(RB_EMPTY(&nx_head));
172
173 na_fini();
174
175 __nx_inited = 0;
176 }
177}
178
179struct nxctl *
180nxctl_create(struct proc *p, struct fileproc *fp, const uuid_t nxctl_uuid,
181 int *err)
182{
183 struct nxctl *nxctl = NULL;
184
185 ASSERT(!uuid_is_null(nxctl_uuid));
186
187 /* privilege checks would be done when performing nxctl operations */
188
189 SK_LOCK();
190
191 nxctl = nxctl_alloc(p, fp, Z_WAITOK);
192
193 STAILQ_INSERT_TAIL(&nxctl_head, nxctl, nxctl_link);
194 nxctl->nxctl_flags |= NEXUSCTLF_ATTACHED;
195 uuid_copy(dst: nxctl->nxctl_uuid, src: nxctl_uuid);
196
197 nxctl_retain_locked(nxctl); /* one for being in the list */
198 nxctl_retain_locked(nxctl); /* one for the caller */
199
200#if SK_LOG
201 uuid_string_t uuidstr;
202 SK_D("nxctl 0x%llx UUID %s", SK_KVA(nxctl),
203 sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr));
204#endif /* SK_LOG */
205
206 SK_UNLOCK();
207
208 if (*err != 0) {
209 nxctl_free(nxctl);
210 nxctl = NULL;
211 }
212 return nxctl;
213}
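
/*
 * Reference ownership sketch for nxctl_create(), per the two retains
 * above: one reference is held by the nxctl_head list and one is
 * returned to the caller.  A typical caller therefore pairs the create
 * with a close plus release, roughly:
 *
 *	struct nxctl *nxctl = nxctl_create(p, fp, nxctl_uuid, &err);
 *	...
 *	nxctl_close(nxctl);             // drops the list reference
 *	(void) nxctl_release(nxctl);    // drops the caller's reference
 *
 * which is what nxctl_dtor() does further below.
 */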
214
215void
216nxctl_close(struct nxctl *nxctl)
217{
218 struct kern_nexus_provider *nxprov = NULL, *tnxprov;
219
220 lck_mtx_lock(lck: &nxctl->nxctl_lock);
221 SK_LOCK();
222
223 ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL));
224
225#if SK_LOG
226 uuid_string_t uuidstr;
227 SK_D("nxctl 0x%llx UUID %s flags 0x%b", SK_KVA(nxctl),
228 sk_uuid_unparse(nxctl->nxctl_uuid, uuidstr),
229 nxctl->nxctl_flags, NEXUSCTLF_BITS);
230#endif /* SK_LOG */
231
232 if (!(nxctl->nxctl_flags & NEXUSCTLF_NOFDREF)) {
233 nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
234 nxctl->nxctl_fp = NULL;
235 }
236
237 /* may be called as part of failure cleanup, so check */
238 if (nxctl->nxctl_flags & NEXUSCTLF_ATTACHED) {
239 /* caller must hold an extra ref */
240 ASSERT(nxctl->nxctl_refcnt > 1);
241 (void) nxctl_release_locked(nxctl);
242
243 STAILQ_REMOVE(&nxctl_head, nxctl, nxctl, nxctl_link);
244 nxctl->nxctl_flags &= ~NEXUSCTLF_ATTACHED;
245 }
246
247repeat:
248 STAILQ_FOREACH_SAFE(nxprov, &nxprov_head, nxprov_link, tnxprov) {
		/*
		 * Close only those providers owned by this control
		 * instance. Note that if we close a provider, we need to
		 * restart this search, as the list might have been changed
		 * by another thread. That's possible since SK_UNLOCK() may
		 * be called as a result of calling nxprov_close().
		 */
257 if (!(nxprov->nxprov_flags & NXPROVF_CLOSED) &&
258 nxprov->nxprov_ctl == nxctl) {
259 nxprov_retain_locked(nxprov);
260 (void) nxprov_close(nxprov, TRUE);
261 (void) nxprov_release_locked(nxprov);
262 goto repeat;
263 }
264 }
265
266 SK_UNLOCK();
267 lck_mtx_unlock(lck: &nxctl->nxctl_lock);
268 nxctl_traffic_rule_clean(nxctl);
269}
270
271int
272nxctl_set_opt(struct nxctl *nxctl, struct sockopt *sopt)
273{
274#pragma unused(nxctl)
275 int err = 0;
276
277 NXCTL_LOCK_ASSERT_HELD(nxctl);
278
279 if (sopt->sopt_dir != SOPT_SET) {
280 sopt->sopt_dir = SOPT_SET;
281 }
282
283 switch (sopt->sopt_name) {
284 case NXOPT_NEXUS_BIND:
285 err = nxctl_nexus_bind(nxctl, sopt);
286 break;
287
288 case NXOPT_NEXUS_UNBIND:
289 err = nxctl_nexus_unbind(nxctl, sopt);
290 break;
291
292 case NXOPT_NEXUS_CONFIG:
293 err = nxctl_nexus_config(nxctl, sopt);
294 break;
295
296 default:
297 err = ENOPROTOOPT;
298 break;
299 }
300
301 return err;
302}
303
304int
305nxctl_get_opt(struct nxctl *nxctl, struct sockopt *sopt)
306{
307#pragma unused(nxctl)
308 int err = 0;
309
310 NXCTL_LOCK_ASSERT_HELD(nxctl);
311
312 if (sopt->sopt_dir != SOPT_GET) {
313 sopt->sopt_dir = SOPT_GET;
314 }
315
316 switch (sopt->sopt_name) {
317 case NXOPT_NEXUS_PROV_LIST:
318 err = nxctl_get_nexus_prov_list(nxctl, sopt);
319 break;
320
321 case NXOPT_NEXUS_PROV_ENTRY:
322 err = nxctl_get_nexus_prov_entry(nxctl, sopt);
323 break;
324
325 case NXOPT_NEXUS_LIST:
326 err = nxctl_get_nexus_list(nxctl, sopt);
327 break;
328
329 case NXOPT_CHANNEL_LIST:
330 err = nxctl_get_channel_list(nxctl, sopt);
331 break;
332
333 default:
334 err = ENOPROTOOPT;
335 break;
336 }
337
338 return err;
339}
340
/* Upper bound on nrl_num_regs (# of entries) we'd return to user space */
342#define MAX_NUM_REG_ENTRIES 256
343
344/* Hoisted out of line to reduce kernel stack footprint */
345SK_NO_INLINE_ATTRIBUTE
346static int
347nxctl_get_nexus_prov_list(struct nxctl *nxctl, struct sockopt *sopt)
348{
349 user_addr_t tmp_ptr = USER_ADDR_NULL;
350 struct nxprov_reg_ent *pnre, *nres = NULL;
351 struct nxprov_list_req nrlr;
352 struct kern_nexus_provider *nxprov = NULL;
353 uint32_t nregs = 0, ncregs = 0;
354 int err = 0, observeall;
355 size_t nres_sz;
356
357 NXCTL_LOCK_ASSERT_HELD(nxctl);
358
359 ASSERT(sopt->sopt_p != NULL);
360 if (sopt->sopt_val == USER_ADDR_NULL) {
361 return EINVAL;
362 }
363
364 err = sooptcopyin(sopt, &nrlr, len: sizeof(nrlr), minlen: sizeof(nrlr));
365 if (err != 0) {
366 return err;
367 }
368
369 if ((size_t)nrlr.nrl_num_regs > MAX_NUM_REG_ENTRIES) {
370 nrlr.nrl_num_regs = MAX_NUM_REG_ENTRIES;
371 }
372
	/*
	 * If the caller specified a buffer, copy the Nexus provider
	 * entries out to the caller. We only copy out as many entries
	 * as the caller asked for, but we always tell the caller how
	 * big the buffer really needs to be.
	 */
379 tmp_ptr = nrlr.nrl_regs;
380 if (tmp_ptr != USER_ADDR_NULL && nrlr.nrl_num_regs > 0) {
381 nres_sz = (size_t)nrlr.nrl_num_regs * sizeof(*nres);
382 nres = sk_alloc_data(nres_sz, Z_WAITOK, skmem_tag_sysctl_buf);
383 if (__improbable(nres == NULL)) {
384 return ENOBUFS;
385 }
386 }
387
388 observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
389 PRIV_SKYWALK_OBSERVE_ALL) == 0);
390
391 SK_LOCK();
392 /*
393 * Count number of providers. If buffer space exists and
394 * remains, copy out provider entries.
395 */
396 nregs = nrlr.nrl_num_regs;
397 pnre = nres;
398
399 STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
400 /*
401 * Return only entries that are visible to the caller,
402 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
403 */
404 if (nxprov->nxprov_ctl != nxctl && !observeall) {
405 continue;
406 }
407
408 if (nres != NULL && nregs > 0) {
409 uuid_copy(dst: pnre->npre_prov_uuid, src: nxprov->nxprov_uuid);
410 bcopy(src: nxprov->nxprov_params, dst: &pnre->npre_prov_params,
411 n: sizeof(struct nxprov_params));
412 --nregs;
413 ++pnre;
414 ++ncregs;
415 }
416 }
417 SK_UNLOCK();
418
419 if (ncregs == 0) {
420 err = ENOENT;
421 }
422
423 if (nres != NULL) {
424 if (err == 0 && tmp_ptr != USER_ADDR_NULL) {
425 if (sopt->sopt_p != kernproc) {
426 err = copyout(nres, tmp_ptr,
427 ncregs * sizeof(*nres));
428 } else {
429 bcopy(src: nres, CAST_DOWN(caddr_t, tmp_ptr),
430 n: ncregs * sizeof(*nres));
431 }
432 }
433 sk_free_data(nres, nres_sz);
434 nres = NULL;
435 }
436
437 if (err == 0) {
438 nrlr.nrl_num_regs = ncregs;
439 err = sooptcopyout(sopt, data: &nrlr, len: sizeof(nrlr));
440 }
441
442 return err;
443}
444
445/* Hoisted out of line to reduce kernel stack footprint */
446SK_NO_INLINE_ATTRIBUTE
447static int
448nxctl_get_nexus_prov_entry(struct nxctl *nxctl, struct sockopt *sopt)
449{
450 struct nxprov_reg_ent nre;
451 struct kern_nexus_provider *nxprov = NULL;
452 int err = 0;
453
454 NXCTL_LOCK_ASSERT_HELD(nxctl);
455
456 ASSERT(sopt->sopt_p != NULL);
457 if (sopt->sopt_val == USER_ADDR_NULL) {
458 return EINVAL;
459 }
460
461 bzero(s: &nre, n: sizeof(nre));
462 err = sooptcopyin(sopt, &nre, len: sizeof(nre), minlen: sizeof(nre));
463 if (err != 0) {
464 return err;
465 }
466
467 if (uuid_is_null(uu: nre.npre_prov_uuid)) {
468 return EINVAL;
469 }
470
471 SK_LOCK();
472 STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
473 if (uuid_compare(uu1: nxprov->nxprov_uuid,
474 uu2: nre.npre_prov_uuid) == 0) {
475 /*
476 * Return only entries that are visible to the caller,
477 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
478 */
479 if (nxprov->nxprov_ctl != nxctl) {
480 if (skywalk_priv_check_cred(sopt->sopt_p,
481 nxctl->nxctl_cred,
482 PRIV_SKYWALK_OBSERVE_ALL) != 0) {
483 nxprov = NULL;
484 break;
485 }
486 }
487
488 bcopy(src: nxprov->nxprov_params, dst: &nre.npre_prov_params,
489 n: sizeof(struct nxprov_params));
490 break;
491 }
492 }
493 SK_UNLOCK();
494
495 if (nxprov != NULL) {
496 err = sooptcopyout(sopt, data: &nre, len: sizeof(nre));
497 } else {
498 err = ENOENT;
499 }
500
501 return err;
502}
503
/* Upper bound on nl_num_nx_uuids (# of UUIDs) we'd return to user space */
505#define MAX_NUM_NX_UUIDS 4096
506
507/* Hoisted out of line to reduce kernel stack footprint */
508SK_NO_INLINE_ATTRIBUTE
509static int
510nxctl_get_nexus_list(struct nxctl *nxctl, struct sockopt *sopt)
511{
512 user_addr_t tmp_ptr = USER_ADDR_NULL;
513 uint32_t nuuids = 0, ncuuids = 0;
514 uuid_t *puuid, *uuids = NULL;
515 size_t uuids_sz;
516 struct nx_list_req nlr;
517 struct kern_nexus_provider *nxprov = NULL;
518 struct kern_nexus *nx = NULL;
519 int err = 0, observeall;
520
521 NXCTL_LOCK_ASSERT_HELD(nxctl);
522
523 ASSERT(sopt->sopt_p != NULL);
524 if (sopt->sopt_val == USER_ADDR_NULL) {
525 return EINVAL;
526 }
527
528 err = sooptcopyin(sopt, &nlr, len: sizeof(nlr), minlen: sizeof(nlr));
529 if (err != 0) {
530 return err;
531 }
532
533 if (uuid_is_null(uu: nlr.nl_prov_uuid)) {
534 return EINVAL;
535 } else if ((size_t)nlr.nl_num_nx_uuids > MAX_NUM_NX_UUIDS) {
536 nlr.nl_num_nx_uuids = MAX_NUM_NX_UUIDS;
537 }
538
	/*
	 * If the caller specified a buffer, copy the Nexus UUIDs out to
	 * the caller. We only copy out as many UUIDs as the caller asked
	 * for, but we always tell the caller how big the buffer really
	 * needs to be.
	 */
545 tmp_ptr = nlr.nl_nx_uuids;
546 if (tmp_ptr != USER_ADDR_NULL && nlr.nl_num_nx_uuids > 0) {
547 uuids_sz = (size_t)nlr.nl_num_nx_uuids * sizeof(uuid_t);
548 uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
549 if (__improbable(uuids == NULL)) {
550 return ENOBUFS;
551 }
552 }
553
554 observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
555 PRIV_SKYWALK_OBSERVE_ALL) == 0);
556
557 SK_LOCK();
558 STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
559 /*
560 * Return only entries that are visible to the caller,
561 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
562 */
563 if (nxprov->nxprov_ctl != nxctl && !observeall) {
564 continue;
565 }
566
567 if (uuid_compare(uu1: nxprov->nxprov_uuid, uu2: nlr.nl_prov_uuid) == 0) {
568 break;
569 }
570 }
571
572 if (nxprov != NULL) {
		/*
		 * Count the number of nexus instances. If buffer space
		 * exists and remains, copy out the Nexus UUIDs.
		 */
577 nuuids = nlr.nl_num_nx_uuids;
578 puuid = uuids;
579
580 STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
581 ++ncuuids;
582 if (uuids != NULL && nuuids > 0) {
583 uuid_copy(dst: *puuid, src: nx->nx_uuid);
584 --nuuids;
585 ++puuid;
586 }
587 }
588 } else {
589 err = ENOENT;
590 }
591 SK_UNLOCK();
592
593 if (uuids != NULL) {
594 if (err == 0 && nxprov != NULL && tmp_ptr != USER_ADDR_NULL) {
595 uintptr_t cnt_uuid;
596
597 /* Note: Pointer arithmetic */
598 cnt_uuid = (uintptr_t)(puuid - uuids);
599 if (cnt_uuid > 0) {
600 if (sopt->sopt_p != kernproc) {
601 err = copyout(uuids, tmp_ptr,
602 cnt_uuid * sizeof(uuid_t));
603 } else {
604 bcopy(src: uuids,
605 CAST_DOWN(caddr_t, tmp_ptr),
606 n: cnt_uuid * sizeof(uuid_t));
607 }
608 }
609 }
610 sk_free_data(uuids, uuids_sz);
611 uuids = NULL;
612 }
613
614 if (err == 0) {
615 nlr.nl_num_nx_uuids = ncuuids;
616 err = sooptcopyout(sopt, data: &nlr, len: sizeof(nlr));
617 }
618
619 return err;
620}
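
/*
 * A sketch of the two-call sizing pattern this getter supports (field
 * names from struct nx_list_req; how the option reaches nxctl_get_opt()
 * from user space is outside the scope of this file):
 *
 *	1. Issue NXOPT_NEXUS_LIST with nl_nx_uuids == USER_ADDR_NULL; on
 *	   success, nl_num_nx_uuids is updated with the number of nexus
 *	   instances under the given provider.
 *	2. Allocate nl_num_nx_uuids * sizeof(uuid_t), point nl_nx_uuids at
 *	   the buffer, and issue the option again to fetch the UUIDs
 *	   (capped at MAX_NUM_NX_UUIDS per call).
 *
 * nxctl_get_channel_list() below follows the same pattern.
 */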
621
622/* Hoisted out of line to reduce kernel stack footprint */
623SK_NO_INLINE_ATTRIBUTE
624static int
625nxctl_nexus_bind(struct nxctl *nxctl, struct sockopt *sopt)
626{
627 boolean_t m_pid, m_exec_uuid, m_key;
628 struct nx_bind_req nbr;
629 struct proc *p = PROC_NULL;
630 struct nxbind *nxb = NULL;
631 uint64_t p_uniqueid = -1;
632 pid_t p_pid = -1;
633 struct kern_nexus *nx = NULL;
634#if SK_LOG
635 uuid_string_t exec_uuidstr;
636#endif /* SK_LOG */
637 uuid_t p_uuid;
638 void *key = NULL;
639 int err = 0;
640
641 NXCTL_LOCK_ASSERT_HELD(nxctl);
642
643 if (sopt->sopt_val == USER_ADDR_NULL) {
644 return EINVAL;
645 }
646
647 uuid_clear(uu: p_uuid);
648 bzero(s: &nbr, n: sizeof(nbr));
649 err = sooptcopyin(sopt, &nbr, len: sizeof(nbr), minlen: sizeof(nbr));
650 if (err != 0) {
651 return err;
652 }
653
654 if (uuid_is_null(uu: nbr.nb_nx_uuid)) {
655 err = EINVAL;
656 goto done_unlocked;
657 }
658
659 nbr.nb_flags &= NBR_MATCH_MASK;
660 if (nbr.nb_flags == 0) {
661 /* must choose one of the match criteria */
662 err = EINVAL;
663 goto done_unlocked;
664 }
665 m_pid = !!(nbr.nb_flags & NBR_MATCH_PID);
666 m_exec_uuid = !!(nbr.nb_flags & NBR_MATCH_EXEC_UUID);
667 m_key = !!(nbr.nb_flags & NBR_MATCH_KEY);
668
669 if (m_pid || m_exec_uuid) {
670 /*
671 * Validate process ID. A valid PID is needed when we're
672 * asked to match by PID, or if asked to match by executable
673 * UUID with a NULL nb_exec_uuid supplied. The latter is
674 * to support the case when a userland Nexus provider isn't
675 * able to acquire its client's executable UUID, but is
676 * able to identify it via PID.
677 */
678 if ((m_pid || uuid_is_null(uu: nbr.nb_exec_uuid)) &&
679 (p = proc_find(pid: nbr.nb_pid)) == PROC_NULL) {
680 err = ESRCH;
681 goto done_unlocked;
682 }
683 /* exclude kernel from the match criteria */
684 if (p == kernproc) {
685 err = EACCES;
686 goto done_unlocked;
687 } else if (p != PROC_NULL) {
688 proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
689 p_uniqueid = proc_uniqueid(p);
690 p_pid = proc_pid(p);
691 } else {
692 uuid_copy(dst: p_uuid, src: nbr.nb_exec_uuid);
693 }
694 }
695
696 if (m_key) {
697 if (nbr.nb_key_len == 0 || nbr.nb_key_len > NEXUS_MAX_KEY_LEN ||
698 nbr.nb_key == USER_ADDR_NULL) {
699 err = EINVAL;
700 goto done_unlocked;
701 }
702
703 key = sk_alloc_data(nbr.nb_key_len, Z_WAITOK, skmem_tag_nx_key);
704 if (__improbable(key == NULL)) {
705 err = ENOMEM;
706 goto done_unlocked;
707 }
708
709 if (sopt->sopt_p != kernproc) {
710 err = copyin(nbr.nb_key, key, nbr.nb_key_len);
711 if (err != 0) {
712 goto done_unlocked;
713 }
714 } else {
715 bcopy(src: (void *)nbr.nb_key, dst: key, n: nbr.nb_key_len);
716 }
717 }
718
719 SK_LOCK();
720 nx = nx_find(nbr.nb_nx_uuid, TRUE);
721 if (nx == NULL || (disable_nxctl_check == 0 &&
722 nx->nx_prov->nxprov_ctl != nxctl &&
723 nxctl != &_kernnxctl)) { /* make exception for kernnxctl */
724 err = ENOENT;
725 goto done;
726 }
727
728 /* bind isn't applicable on anonymous nexus provider */
729 if (NX_ANONYMOUS_PROV(nx)) {
730 err = ENXIO;
731 goto done;
732 }
733
734 /* port must be within the domain's range */
735 if (nbr.nb_port != NEXUS_PORT_ANY &&
736 nbr.nb_port >= NXDOM_MAX(NX_DOM(nx), ports)) {
737 err = EDOM;
738 goto done;
739 } else if (nbr.nb_port == NEXUS_PORT_ANY) {
740 /* for now, this is allowed only for kernel clients */
741 if (sopt->sopt_p != kernproc) {
742 err = EPERM;
743 goto done;
744 }
745 }
746
747 nxb = nxb_alloc(Z_WAITOK);
748
749 if (m_pid) {
750 nxb->nxb_flags |= NXBF_MATCH_UNIQUEID;
751 nxb->nxb_uniqueid = p_uniqueid;
752 nxb->nxb_pid = p_pid;
753 }
754 if (m_exec_uuid) {
755 nxb->nxb_flags |= NXBF_MATCH_EXEC_UUID;
756 ASSERT(!uuid_is_null(p_uuid));
757 uuid_copy(dst: nxb->nxb_exec_uuid, src: p_uuid);
758 }
759 if (m_key) {
760 nxb->nxb_flags |= NXBF_MATCH_KEY;
761 ASSERT(key != NULL);
762 nxb->nxb_key = key;
763 key = NULL; /* let nxb_free() free it */
764 ASSERT(nbr.nb_key_len != 0 &&
765 nbr.nb_key_len <= NEXUS_MAX_KEY_LEN);
766 nxb->nxb_key_len = nbr.nb_key_len;
767 }
768
	/*
	 * Bind the creds to the nexus port. If the client doesn't have a
	 * port, find one, claim it, and associate the creds with it. Upon
	 * success, the nexus may move the nxbind contents (including the
	 * key) into its own nxbind instance; in that case, nxb_free()
	 * below will not free the key within.
	 */
776 err = NX_DOM(nx)->nxdom_bind_port(nx, &nbr.nb_port, nxb, NULL);
777 if (err != 0) {
778 goto done;
779 }
780
781 ASSERT(nbr.nb_port != NEXUS_PORT_ANY);
782 (void) sooptcopyout(sopt, data: &nbr, len: sizeof(nbr));
783
784 SK_D("nexus 0x%llx nxb 0x%llx port %u flags 0x%b pid %d "
785 "(uniqueid %llu) exec_uuid %s key 0x%llx key_len %u",
786 SK_KVA(nx), SK_KVA(nxb), nbr.nb_port, nxb->nxb_flags,
787 NXBF_BITS, nxb->nxb_pid, nxb->nxb_uniqueid,
788 sk_uuid_unparse(nxb->nxb_exec_uuid, exec_uuidstr),
789 (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0,
790 nxb->nxb_key_len);
791
792done:
793 if (nx != NULL) {
794 (void) nx_release_locked(nx);
795 nx = NULL;
796 }
797 SK_UNLOCK();
798
799done_unlocked:
800 ASSERT(nx == NULL);
801
802 if (nxb != NULL) {
803 nxb_free(nxb);
804 nxb = NULL;
805 }
806 if (key != NULL) {
807 sk_free_data(key, nbr.nb_key_len);
808 key = NULL;
809 }
810 if (p != PROC_NULL) {
811 proc_rele(p);
812 }
813
814 return err;
815}
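
/*
 * A minimal sketch of a bind request that matches on PID only (field
 * names from struct nx_bind_req as used above; how the request reaches
 * nxctl_set_opt() is not shown here):
 *
 *	struct nx_bind_req nbr;
 *
 *	bzero(&nbr, sizeof(nbr));
 *	uuid_copy(nbr.nb_nx_uuid, nx_uuid);  // target nexus instance
 *	nbr.nb_port = NEXUS_PORT_ANY;        // kernel callers only
 *	nbr.nb_flags = NBR_MATCH_PID;
 *	nbr.nb_pid = pid;
 *
 * On success the chosen port is copied back out in nb_port.  Matching
 * on a key would additionally set NBR_MATCH_KEY along with nb_key and
 * nb_key_len (at most NEXUS_MAX_KEY_LEN bytes).
 */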
816
817/* Hoisted out of line to reduce kernel stack footprint */
818SK_NO_INLINE_ATTRIBUTE
819static int
820nxctl_nexus_unbind(struct nxctl *nxctl, struct sockopt *sopt)
821{
822 struct nx_unbind_req nur;
823 struct kern_nexus *nx = NULL;
824 int err = 0;
825
826 NXCTL_LOCK_ASSERT_HELD(nxctl);
827
828 if (sopt->sopt_val == USER_ADDR_NULL) {
829 return EINVAL;
830 }
831
832 bzero(s: &nur, n: sizeof(nur));
833 err = sooptcopyin(sopt, &nur, len: sizeof(nur), minlen: sizeof(nur));
834 if (err != 0) {
835 return err;
836 }
837
838 if (uuid_is_null(uu: nur.nu_nx_uuid)) {
839 return EINVAL;
840 }
841
842 SK_LOCK();
843 nx = nx_find(nur.nu_nx_uuid, TRUE);
844 if (nx == NULL || (nx->nx_prov->nxprov_ctl != nxctl &&
845 nxctl != &_kernnxctl)) { /* make exception for kernnxctl */
846 err = ENOENT;
847 goto done;
848 }
849
850 /* unbind isn't applicable on anonymous nexus provider */
851 if (NX_ANONYMOUS_PROV(nx)) {
852 err = ENXIO;
853 goto done;
854 }
855
856 if (nur.nu_port == NEXUS_PORT_ANY) {
857 err = EINVAL;
858 goto done;
859 }
860
861 err = NX_DOM(nx)->nxdom_unbind_port(nx, nur.nu_port);
862
863done:
864 if (nx != NULL) {
865 (void) nx_release_locked(nx);
866 nx = NULL;
867 }
868 SK_UNLOCK();
869
870 return err;
871}
872
873/* Hoisted out of line to reduce kernel stack footprint */
874SK_NO_INLINE_ATTRIBUTE
875static int
876nxctl_nexus_config(struct nxctl *nxctl, struct sockopt *sopt)
877{
878 struct kern_nexus *nx = NULL;
879 struct nx_cfg_req ncr;
880 int err = 0;
881
882 NXCTL_LOCK_ASSERT_HELD(nxctl);
883
884 if (sopt->sopt_val == USER_ADDR_NULL) {
885 return EINVAL;
886 }
887
888 bzero(s: &ncr, n: sizeof(ncr));
889 err = sooptcopyin(sopt, &ncr, len: sizeof(ncr), minlen: sizeof(ncr));
890 if (err != 0) {
891 return err;
892 }
893
894 if (uuid_is_null(uu: ncr.nc_nx_uuid)) {
895 return EINVAL;
896 }
897
898 SK_LOCK();
899 nx = nx_find(ncr.nc_nx_uuid, TRUE);
900 if (nx == NULL || (disable_nxctl_check == 0 &&
901 nx->nx_prov->nxprov_ctl != nxctl &&
902 nxctl != &_kernnxctl && /* allow kernel/shared user nxctl */
903 nxctl != &_usernxctl)) {
904 err = ENOENT;
905 goto done;
906 }
907
908 if (NX_DOM_PROV(nx)->nxdom_prov_config != NULL) {
909 err = NX_DOM_PROV(nx)->nxdom_prov_config(NX_DOM_PROV(nx),
910 nx, &ncr, sopt->sopt_dir, sopt->sopt_p, nxctl->nxctl_cred);
911 } else {
912 err = EPERM;
913 }
914
915 if (err == 0) {
916 (void) sooptcopyout(sopt, data: &ncr, len: sizeof(ncr));
917 }
918done:
919 if (nx != NULL) {
920 (void) nx_release_locked(nx);
921 nx = NULL;
922 }
923 SK_UNLOCK();
924
925 return err;
926}
927
928struct nxbind *
929nxb_alloc(zalloc_flags_t how)
930{
931 struct nxbind *nxb = zalloc_flags(nxbind_zone, how | Z_ZERO);
932
933 if (nxb) {
934 SK_DF(SK_VERB_MEM, "nxb 0x%llx ALLOC", SK_KVA(nxb));
935 }
936 return nxb;
937}
938
939void
940nxb_free(struct nxbind *nxb)
941{
942 SK_DF(SK_VERB_MEM, "nxb 0x%llx key 0x%llx FREE", SK_KVA(nxb),
943 (nxb->nxb_key != NULL) ? SK_KVA(nxb->nxb_key) : 0);
944
945 if (nxb->nxb_key != NULL) {
946 sk_free_data(nxb->nxb_key, nxb->nxb_key_len);
947 nxb->nxb_key = NULL;
948 }
949 zfree(nxbind_zone, nxb);
950}
951
/*
 * nxb0 is assumed to hold the authoritative binding; compare nxb1
 * against it.
 */
955boolean_t
956nxb_is_equal(struct nxbind *nxb0, struct nxbind *nxb1)
957{
958 ASSERT(nxb0 != NULL && nxb1 != NULL);
959 ASSERT(nxb0 != nxb1);
960
961 /* we always compare using uniqueid and not pid */
962 if ((nxb0->nxb_flags & NXBF_MATCH_UNIQUEID) &&
963 nxb1->nxb_uniqueid != nxb0->nxb_uniqueid) {
964 return FALSE;
965 }
966
967 if ((nxb0->nxb_flags & NXBF_MATCH_EXEC_UUID) &&
968 uuid_compare(uu1: nxb1->nxb_exec_uuid, uu2: nxb0->nxb_exec_uuid) != 0) {
969 return FALSE;
970 }
971
972 ASSERT(!(nxb0->nxb_flags & NXBF_MATCH_KEY) ||
973 (nxb0->nxb_key_len != 0 && nxb0->nxb_key != NULL));
974
975 if ((nxb0->nxb_flags & NXBF_MATCH_KEY) &&
976 (nxb0->nxb_key_len != nxb1->nxb_key_len ||
977 nxb1->nxb_key == NULL || timingsafe_bcmp(b1: nxb1->nxb_key, b2: nxb0->nxb_key,
978 n: nxb1->nxb_key_len) != 0)) {
979 return FALSE;
980 }
981
982 return TRUE;
983}
984
985void
986nxb_move(struct nxbind *snxb, struct nxbind *dnxb)
987{
988 ASSERT(!(snxb->nxb_flags & NXBF_MATCH_KEY) ||
989 (snxb->nxb_key_len != 0 && snxb->nxb_key != NULL));
990
991 /* in case the destination has a key attached, free it first */
992 if (dnxb->nxb_key != NULL) {
993 sk_free_data(dnxb->nxb_key, dnxb->nxb_key_len);
994 dnxb->nxb_key = NULL;
995 }
996
997 /* move everything from src to dst, and then wipe out src */
998 bcopy(src: snxb, dst: dnxb, n: sizeof(*dnxb));
999 bzero(s: snxb, n: sizeof(*snxb));
1000}
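
/*
 * Ownership note: nxb_move() hands the key pointer to the destination
 * and wipes the source, so only the destination may later be passed to
 * nxb_free() with the key attached.  A sketch of the expected usage:
 *
 *	nxb_move(snxb, dnxb);   // dnxb now owns snxb's key, if any
 *	nxb_free(snxb);         // safe: snxb no longer references the key
 *
 * This is presumably what lets nxdom_bind_port() implementations take
 * over a caller-supplied nxbind without duplicating the key (see the
 * comment in nxctl_nexus_bind() above).
 */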
1001
/* Upper bound on cl_num_ch_uuids (# of UUIDs) we'd return to user space */
1003#define MAX_NUM_CH_UUIDS 4096
1004
1005/* Hoisted out of line to reduce kernel stack footprint */
1006SK_NO_INLINE_ATTRIBUTE
1007static int
1008nxctl_get_channel_list(struct nxctl *nxctl, struct sockopt *sopt)
1009{
1010 user_addr_t tmp_ptr = USER_ADDR_NULL;
1011 uint32_t nuuids = 0, ncuuids = 0;
1012 uuid_t *puuid, *uuids = NULL;
1013 size_t uuids_sz;
1014 struct ch_list_req clr;
1015 struct kern_channel *ch = NULL;
1016 struct kern_nexus *nx = NULL;
1017 struct kern_nexus find;
1018 int err = 0, observeall;
1019
1020 NXCTL_LOCK_ASSERT_HELD(nxctl);
1021
1022 ASSERT(sopt->sopt_p != NULL);
1023 if (sopt->sopt_val == USER_ADDR_NULL) {
1024 return EINVAL;
1025 }
1026
1027 err = sooptcopyin(sopt, &clr, len: sizeof(clr), minlen: sizeof(clr));
1028 if (err != 0) {
1029 return err;
1030 }
1031
1032 if (uuid_is_null(uu: clr.cl_nx_uuid)) {
1033 return EINVAL;
1034 } else if ((size_t)clr.cl_num_ch_uuids > MAX_NUM_CH_UUIDS) {
1035 clr.cl_num_ch_uuids = MAX_NUM_CH_UUIDS;
1036 }
1037
	/*
	 * If the caller specified a buffer, copy the Channel UUIDs out to
	 * the caller. We only copy out as many UUIDs as the caller asked
	 * for, but we always tell the caller how big the buffer really
	 * needs to be.
	 */
1044 tmp_ptr = clr.cl_ch_uuids;
1045 if (tmp_ptr != USER_ADDR_NULL && clr.cl_num_ch_uuids > 0) {
1046 uuids_sz = (size_t)clr.cl_num_ch_uuids * sizeof(uuid_t);
1047 uuids = sk_alloc_data(uuids_sz, Z_WAITOK, skmem_tag_sysctl_buf);
1048 if (uuids == NULL) {
1049 return ENOBUFS;
1050 }
1051 }
1052
1053 observeall = (skywalk_priv_check_cred(sopt->sopt_p, nxctl->nxctl_cred,
1054 PRIV_SKYWALK_OBSERVE_ALL) == 0);
1055
1056 SK_LOCK();
1057 uuid_copy(dst: find.nx_uuid, src: clr.cl_nx_uuid);
1058 nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
1059 if (nx != NULL && NX_PROV(nx)->nxprov_ctl != nxctl && !observeall) {
1060 /*
1061 * Return only entries that are visible to the caller,
1062 * unless it has PRIV_SKYWALK_OBSERVE_ALL.
1063 */
1064 nx = NULL;
1065 }
1066 if (nx != NULL) {
1067 /*
1068 * Count number of Channels. If buffer space exists
1069 * and remains, copy out the Channel UUIDs.
1070 */
1071 nuuids = clr.cl_num_ch_uuids;
1072 puuid = uuids;
1073
1074 STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
1075 ++ncuuids;
1076 if (uuids != NULL && nuuids > 0) {
1077 uuid_copy(dst: *puuid, src: ch->ch_info->cinfo_ch_id);
1078 --nuuids;
1079 ++puuid;
1080 }
1081 }
1082 } else {
1083 err = ENOENT;
1084 }
1085 SK_UNLOCK();
1086
1087 if (uuids != NULL) {
1088 if (err == 0 && nx != NULL && tmp_ptr != USER_ADDR_NULL) {
1089 uintptr_t cnt_uuid;
1090
1091 /* Note: Pointer arithmetic */
1092 cnt_uuid = (uintptr_t)(puuid - uuids);
1093 ASSERT(cnt_uuid > 0);
1094
1095 if (sopt->sopt_p != kernproc) {
1096 err = copyout(uuids, tmp_ptr,
1097 cnt_uuid * sizeof(uuid_t));
1098 } else {
1099 bcopy(src: uuids, CAST_DOWN(caddr_t, tmp_ptr),
1100 n: cnt_uuid * sizeof(uuid_t));
1101 }
1102 }
1103 sk_free_data(uuids, uuids_sz);
1104 uuids = NULL;
1105 }
1106
1107 if (err == 0) {
1108 clr.cl_num_ch_uuids = ncuuids;
1109 err = sooptcopyout(sopt, data: &clr, len: sizeof(clr));
1110 }
1111
1112 return err;
1113}
1114
1115static void
1116nxctl_init(struct nxctl *nxctl, struct proc *p, struct fileproc *fp)
1117{
1118 uuid_t p_uuid;
1119
1120 bzero(s: nxctl, n: sizeof(*nxctl));
1121
1122 proc_getexecutableuuid(p, p_uuid, sizeof(p_uuid));
1123
1124 lck_mtx_init(lck: &nxctl->nxctl_lock, grp: &nexus_lock_group, attr: &nexus_lock_attr);
1125 uuid_copy(dst: nxctl->nxctl_proc_uuid, src: p_uuid);
1126 nxctl->nxctl_proc_uniqueid = proc_uniqueid(p);
1127 nxctl->nxctl_cred = kauth_cred_proc_ref(procp: p);
1128 nxctl->nxctl_fp = fp;
1129 if (nxctl == &_kernnxctl) {
1130 ASSERT(p == kernproc);
1131 nxctl->nxctl_flags |= NEXUSCTLF_KERNEL;
1132 }
1133 if (nxctl == &_usernxctl) {
1134 ASSERT(p == kernproc);
1135 nxctl->nxctl_cred = NULL;
1136 }
1137 if (fp == NULL) {
1138 nxctl->nxctl_flags |= NEXUSCTLF_NOFDREF;
1139 }
1140}
1141
1142static struct nxctl *
1143nxctl_alloc(struct proc *p, struct fileproc *fp, zalloc_flags_t how)
1144{
1145 struct nxctl *nxctl = zalloc_flags(nxctl_zone, how);
1146
1147 if (nxctl != NULL) {
1148 nxctl_init(nxctl, p, fp);
1149 }
1150 return nxctl;
1151}
1152
1153static void
1154nxctl_free(struct nxctl *nxctl)
1155{
1156 ASSERT(nxctl->nxctl_refcnt == 0);
1157 ASSERT(!(nxctl->nxctl_flags & NEXUSCTLF_ATTACHED));
1158 kauth_cred_unref(&nxctl->nxctl_cred);
1159 lck_mtx_destroy(lck: &nxctl->nxctl_lock, grp: &nexus_lock_group);
1160 SK_D("nxctl 0x%llx FREE", SK_KVA(nxctl));
1161 if (!(nxctl->nxctl_flags & NEXUSCTLF_KERNEL)) {
1162 zfree(nxctl_zone, nxctl);
1163 }
1164}
1165
1166static void
1167nxctl_retain_locked(struct nxctl *nxctl)
1168{
1169 SK_LOCK_ASSERT_HELD();
1170
1171 nxctl->nxctl_refcnt++;
1172 ASSERT(nxctl->nxctl_refcnt != 0);
1173}
1174
1175void
1176nxctl_retain(struct nxctl *nxctl)
1177{
1178 SK_LOCK();
1179 nxctl_retain_locked(nxctl);
1180 SK_UNLOCK();
1181}
1182
1183static int
1184nxctl_release_locked(struct nxctl *nxctl)
1185{
1186 int oldref = nxctl->nxctl_refcnt;
1187
1188 SK_LOCK_ASSERT_HELD();
1189
1190 ASSERT(nxctl->nxctl_refcnt != 0);
1191 if (--nxctl->nxctl_refcnt == 0) {
1192 nxctl_free(nxctl);
1193 }
1194
1195 return oldref == 1;
1196}
1197
1198int
1199nxctl_release(struct nxctl *nxctl)
1200{
1201 int lastref;
1202
1203 SK_LOCK();
1204 lastref = nxctl_release_locked(nxctl);
1205 SK_UNLOCK();
1206
1207 return lastref;
1208}
1209
1210void
1211nxctl_dtor(void *arg)
1212{
1213 struct nxctl *nxctl = arg;
1214
1215 nxctl_close(nxctl);
1216 SK_LOCK();
1217 (void) nxctl_release_locked(nxctl);
1218 SK_UNLOCK();
1219}
1220
1221int
1222nxprov_advise_connect(struct kern_nexus *nx, struct kern_channel *ch,
1223 struct proc *p)
1224{
1225 struct kern_nexus_provider *nxprov = NX_PROV(nx);
1226 int err = 0;
1227
1228 ASSERT(!(ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)));
1229 ASSERT(ch->ch_ctx == NULL);
1230
1231 SK_LOCK_ASSERT_HELD();
1232 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1233
1234 /* monitor channels aren't externally visible/usable, so ignore */
1235 if ((ch->ch_info->cinfo_ch_mode & CHMODE_MONITOR) ||
1236 (ch->ch_flags & CHANF_EXT_SKIP) ||
1237 (nxprov->nxprov_ext.nxpi_pre_connect == NULL ||
1238 nxprov->nxprov_ext.nxpi_connected == NULL)) {
1239 return 0;
1240 }
1241
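	/*
	 * Take a reference on the channel and drop both SK_LOCK and the
	 * channel lock before calling out to the external provider, so
	 * that the nxpi_pre_connect/nxpi_connected callbacks never run
	 * with SK_LOCK held; only the channel lock is reacquired for the
	 * calls below.  Both locks are restored in the "done" path before
	 * nxprov_advise_disconnect() may be invoked for error cleanup.
	 */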
1242 ch_retain_locked(ch);
1243 lck_mtx_unlock(lck: &ch->ch_lock);
1244 SK_UNLOCK();
1245 lck_mtx_lock(lck: &ch->ch_lock);
1246
1247 err = nxprov->nxprov_ext.nxpi_pre_connect(nxprov, p, nx,
1248 ch->ch_info->cinfo_nx_port, ch, &ch->ch_ctx);
1249 if (err != 0) {
1250 SK_D("ch 0x%llx flags %b nx 0x%llx pre_connect "
1251 "error %d", SK_KVA(ch), ch->ch_flags,
1252 CHANF_BITS, SK_KVA(nx), err);
1253 ch->ch_ctx = NULL;
1254 goto done;
1255 }
1256 /*
1257 * Upon ring/slot init failure, this is cleared
1258 * by nxprov_advise_disconnect() below.
1259 */
1260 os_atomic_or(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
1261 if (NXPROV_LLINK(nxprov)) {
1262 err = nx_netif_llink_ext_init_default_queues(nx);
1263 } else {
1264 err = nx_init_rings(nx, ch);
1265 }
1266 if (err != 0) {
1267 goto done;
1268 }
1269 ASSERT(err == 0);
1270 ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT |
1271 CHANF_EXT_CONNECTED)) == CHANF_EXT_PRECONNECT);
1272
1273 err = nxprov->nxprov_ext.nxpi_connected(nxprov, nx, ch);
1274 if (err != 0) {
1275 SK_D("ch 0x%llx flags %b nx 0x%llx connected error %d",
1276 SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx), err);
1277 goto done;
1278 }
1279 os_atomic_or(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
1280 SK_D("ch 0x%llx flags %b nx 0x%llx connected",
1281 SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));
1282
1283
1284done:
1285 lck_mtx_unlock(lck: &ch->ch_lock);
1286 SK_LOCK();
1287 lck_mtx_lock(lck: &ch->ch_lock);
1288 if ((err != 0) &&
1289 (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT))) {
1290 nxprov_advise_disconnect(nx, ch);
1291 }
1292 /* caller is expected to hold one, in addition to ourselves */
1293 VERIFY(ch->ch_refcnt >= 2);
1294 ch_release_locked(ch);
1295
1296 return err;
1297}
1298
1299void
1300nxprov_advise_disconnect(struct kern_nexus *nx, struct kern_channel *ch)
1301{
1302 struct kern_nexus_provider *nxprov = NX_PROV(nx);
1303
1304 SK_LOCK_ASSERT_HELD();
1305 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1306
1307 /* check as we might be called in the error handling path */
1308 if (ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)) {
1309 ch_retain_locked(ch);
1310 lck_mtx_unlock(lck: &ch->ch_lock);
1311 SK_UNLOCK();
1312 lck_mtx_lock(lck: &ch->ch_lock);
1313
1314 ASSERT(!(ch->ch_flags & CHANF_EXT_SKIP));
1315 if (ch->ch_flags & CHANF_EXT_CONNECTED) {
1316 nxprov->nxprov_ext.nxpi_pre_disconnect(nxprov, nx, ch);
1317 os_atomic_andnot(&ch->ch_flags, CHANF_EXT_CONNECTED, relaxed);
1318 }
1319
1320 /*
1321 * Inform the external domain provider that the rings
1322 * and slots for this channel are no longer valid.
1323 */
1324 if (NXPROV_LLINK(nxprov)) {
1325 nx_netif_llink_ext_fini_default_queues(nx);
1326 } else {
1327 nx_fini_rings(nx, ch);
1328 }
1329
1330 ASSERT(ch->ch_flags & CHANF_EXT_PRECONNECT);
1331 nxprov->nxprov_ext.nxpi_disconnected(nxprov, nx, ch);
1332 os_atomic_andnot(&ch->ch_flags, CHANF_EXT_PRECONNECT, relaxed);
1333
1334 SK_D("ch 0x%llx flags %b nx 0x%llx disconnected",
1335 SK_KVA(ch), ch->ch_flags, CHANF_BITS, SK_KVA(nx));
1336
1337 /* We're done with this channel */
1338 ch->ch_ctx = NULL;
1339
1340 lck_mtx_unlock(lck: &ch->ch_lock);
1341 SK_LOCK();
1342 lck_mtx_lock(lck: &ch->ch_lock);
1343 /* caller is expected to hold one, in addition to ourselves */
1344 VERIFY(ch->ch_refcnt >= 2);
1345 ch_release_locked(ch);
1346 }
1347 ASSERT(!(ch->ch_flags & (CHANF_EXT_CONNECTED | CHANF_EXT_PRECONNECT)));
1348 ASSERT(ch->ch_ctx == NULL);
1349}
1350
1351static struct kern_nexus_provider *
1352nxprov_create_common(struct nxctl *nxctl,
1353 struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
1354 const struct kern_nexus_provider_init *init, int *err)
1355{
1356 struct skmem_region_params srp[SKMEM_REGIONS];
1357 struct kern_nexus_provider *nxprov = NULL;
1358 struct nxprov_params nxp;
1359 uint32_t override = 0;
1360 uint32_t pp_region_config_flags;
1361 int i;
1362
1363 _CASSERT(sizeof(*init) == sizeof(nxprov->nxprov_ext));
1364 _CASSERT(sizeof(*init) >=
1365 sizeof(struct kern_nexus_netif_provider_init));
1366
1367 SK_LOCK_ASSERT_HELD();
1368 ASSERT(nxctl != NULL && reg != NULL && nxdom_prov != NULL);
1369
1370 pp_region_config_flags = PP_REGION_CONFIG_MD_MAGAZINE_ENABLE |
1371 PP_REGION_CONFIG_BUF_IODIR_BIDIR;
1372 /*
1373 * Special handling for external nexus providers; similar
1374 * logic to what's done in kern_pbufpool_create().
1375 */
1376 if (init != NULL) {
1377 if (init->nxpi_flags & NXPIF_MONOLITHIC) {
1378 pp_region_config_flags |=
1379 PP_REGION_CONFIG_BUF_MONOLITHIC;
1380 }
1381
1382 if (init->nxpi_flags & NXPIF_INHIBIT_CACHE) {
1383 pp_region_config_flags |=
1384 PP_REGION_CONFIG_BUF_NOCACHE;
1385 }
1386 }
1387
1388 /*
1389 * For network devices, set the packet metadata memory as persistent
1390 * so that it is wired at segment creation. This allows us to access
1391 * it with preemption disabled, as well as for rdar://problem/46511741.
1392 */
1393 if (nxdom_prov->nxdom_prov_dom->nxdom_type == NEXUS_TYPE_NET_IF) {
1394 pp_region_config_flags |= PP_REGION_CONFIG_MD_PERSISTENT;
1395 }
1396
1397 /* process and validate provider parameters */
1398 if ((*err = nxdom_prov_validate_params(nxdom_prov, reg,
1399 &nxp, srp, override, pp_region_config_flags)) != 0) {
1400 goto done;
1401 }
1402
1403 nxprov = nxprov_alloc(nxdom_prov, Z_WAITOK);
1404 ASSERT(nxprov->nxprov_dom_prov == nxdom_prov);
1405
1406 STAILQ_INIT(&nxprov->nxprov_nx_head);
1407 STAILQ_INSERT_TAIL(&nxprov_head, nxprov, nxprov_link);
1408 nxprov->nxprov_flags |= NXPROVF_ATTACHED;
1409 nxprov->nxprov_ctl = nxctl;
1410 uuid_generate_random(out: nxprov->nxprov_uuid);
1411 bcopy(src: &nxp, dst: nxprov->nxprov_params, n: sizeof(struct nxprov_params));
1412
1413 if (init != NULL) {
1414 if (init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF) {
1415 ASSERT(NXPROV_LLINK(nxprov));
1416 bcopy(src: init, dst: &nxprov->nxprov_netif_ext,
1417 n: sizeof(nxprov->nxprov_netif_ext));
1418 } else {
1419 ASSERT(!NXPROV_LLINK(nxprov));
1420 ASSERT(init->nxpi_version ==
1421 KERN_NEXUS_PROVIDER_CURRENT_VERSION);
1422 bcopy(src: init, dst: &nxprov->nxprov_ext, n: sizeof(*init));
1423 }
1424 nxprov->nxprov_flags |= NXPROVF_EXTERNAL;
1425 }
1426
1427 /* store validated region parameters to the provider */
1428 for (i = 0; i < SKMEM_REGIONS; i++) {
1429 nxprov->nxprov_region_params[i] = srp[i];
1430 }
1431
1432 if (nxprov->nxprov_flags & NXPROVF_EXTERNAL) {
1433 uint32_t nxpi_flags = nxprov->nxprov_ext.nxpi_flags;
1434
1435 if (nxpi_flags & NXPIF_VIRTUAL_DEVICE) {
1436 nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
1437 }
1438 } else if (nxdom_prov->nxdom_prov_dom->nxdom_type !=
1439 NEXUS_TYPE_NET_IF) {
1440 /*
1441 * Treat non-netif built-in nexus providers as those
1442 * meant for inter-process communications, i.e. there
1443 * is no actual networking hardware involved.
1444 */
1445 nxprov->nxprov_flags |= NXPROVF_VIRTUAL_DEVICE;
1446 }
1447
1448 nxprov_retain_locked(nxprov); /* one for being in the list */
1449 nxprov_retain_locked(nxprov); /* one for the caller */
1450
1451#if SK_LOG
1452 uuid_string_t uuidstr;
1453 SK_D("nxprov 0x%llx UUID %s", SK_KVA(nxprov),
1454 sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr));
1455#endif /* SK_LOG */
1456
1457done:
1458 return nxprov;
1459}
1460
1461struct kern_nexus_provider *
1462nxprov_create(struct proc *p, struct nxctl *nxctl, struct nxprov_reg *reg,
1463 int *err)
1464{
1465 struct nxprov_params *nxp = &reg->nxpreg_params;
1466 struct kern_nexus_domain_provider *nxdom_prov = NULL;
1467 struct kern_nexus_provider *nxprov = NULL;
1468
1469 NXCTL_LOCK_ASSERT_HELD(nxctl);
1470
1471 ASSERT(nxctl->nxctl_cred != proc_ucred_unsafe(kernproc));
1472 *err = 0;
1473
1474 switch (nxp->nxp_type) {
1475 case NEXUS_TYPE_USER_PIPE: /* only for userland */
1476 *err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1477 PRIV_SKYWALK_REGISTER_USER_PIPE);
1478 break;
1479
1480 case NEXUS_TYPE_FLOW_SWITCH: /* allowed for userland */
1481 *err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1482 PRIV_SKYWALK_REGISTER_FLOW_SWITCH);
1483 break;
1484
1485 case NEXUS_TYPE_NET_IF: /* allowed for userland */
1486 *err = skywalk_priv_check_cred(p, nxctl->nxctl_cred,
1487 PRIV_SKYWALK_REGISTER_NET_IF);
1488 break;
1489
1490 case NEXUS_TYPE_KERNEL_PIPE: /* only for kernel */
1491 case NEXUS_TYPE_MONITOR: /* invalid */
1492 default:
1493 *err = EINVAL;
1494 goto done;
1495 }
1496
1497 if (*err != 0) {
1498 goto done;
1499 }
1500
1501 ASSERT(nxp->nxp_type < NEXUS_TYPE_MAX);
1502 if ((nxdom_prov = nxdom_prov_default[nxp->nxp_type]) == NULL) {
1503 *err = ENXIO;
1504 goto done;
1505 }
1506
1507#if CONFIG_NEXUS_NETIF
1508 /* make sure netif_compat is the default here */
1509 ASSERT(nxp->nxp_type != NEXUS_TYPE_NET_IF ||
1510 strcmp(nxdom_prov->nxdom_prov_name,
1511 NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
1512#endif /* CONFIG_NEXUS_NETIF */
1513
1514 SK_LOCK();
1515 /* callee holds a reference for our caller upon success */
1516 nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, NULL, err);
1517 SK_UNLOCK();
1518done:
1519 return nxprov;
1520}
1521
1522struct kern_nexus_provider *
1523nxprov_create_kern(struct nxctl *nxctl,
1524 struct kern_nexus_domain_provider *nxdom_prov, struct nxprov_reg *reg,
1525 const struct kern_nexus_provider_init *init, int *err)
1526{
1527 struct nxprov_params *nxp = &reg->nxpreg_params;
1528 struct kern_nexus_provider *nxprov = NULL;
1529
1530 NXCTL_LOCK_ASSERT_HELD(nxctl);
1531 SK_LOCK_ASSERT_HELD();
1532
1533 ASSERT(nxctl->nxctl_cred == proc_ucred_unsafe(kernproc));
1534 ASSERT(nxp->nxp_type == nxdom_prov->nxdom_prov_dom->nxdom_type);
1535 ASSERT(init == NULL ||
1536 init->nxpi_version == KERN_NEXUS_PROVIDER_CURRENT_VERSION ||
1537 init->nxpi_version == KERN_NEXUS_PROVIDER_VERSION_NETIF);
1538
1539 *err = 0;
1540
1541 switch (nxp->nxp_type) {
1542 case NEXUS_TYPE_NET_IF:
1543 break;
1544 case NEXUS_TYPE_KERNEL_PIPE:
1545 if (init == NULL) {
1546 *err = EINVAL;
1547 goto done;
1548 }
1549 break;
1550 case NEXUS_TYPE_FLOW_SWITCH:
1551 if (init != NULL) {
1552 *err = EINVAL;
1553 goto done;
1554 }
1555 break;
1556
1557 case NEXUS_TYPE_USER_PIPE: /* only for userland */
1558 case NEXUS_TYPE_MONITOR: /* invalid */
1559 default:
1560 *err = EINVAL;
1561 goto done;
1562 }
1563
1564 /* callee holds a reference for our caller upon success */
1565 nxprov = nxprov_create_common(nxctl, nxdom_prov, reg, init, err);
1566
1567done:
1568 return nxprov;
1569}
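
/*
 * Sketch of the init argument expected for a kernel pipe provider; the
 * callback names are illustrative placeholders, while the field and
 * constant names come from the checks above:
 *
 *	struct kern_nexus_provider_init init = {
 *		.nxpi_version = KERN_NEXUS_PROVIDER_CURRENT_VERSION,
 *		.nxpi_flags = NXPIF_VIRTUAL_DEVICE,
 *		.nxpi_pre_connect = my_pre_connect,   // placeholder
 *		.nxpi_connected = my_connected,       // placeholder
 *		...
 *	};
 *
 * As enforced above: a kernel pipe provider must pass a non-NULL init,
 * a flowswitch provider must pass NULL, and netif accepts either form
 * (KERN_NEXUS_PROVIDER_VERSION_NETIF is used by logical-link netif
 * providers; see nxprov_create_common()).
 */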
1570
1571int
1572nxprov_destroy(struct nxctl *nxctl, const uuid_t nxprov_uuid)
1573{
1574 struct kern_nexus_provider *nxprov = NULL;
1575 int err = 0;
1576
1577 NXCTL_LOCK_ASSERT_HELD(nxctl);
1578
1579 SK_LOCK();
1580
1581 STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1582 if (nxctl == nxprov->nxprov_ctl &&
1583 uuid_compare(uu1: nxprov_uuid, uu2: nxprov->nxprov_uuid) == 0) {
1584 nxprov_retain_locked(nxprov);
1585 break;
1586 }
1587 }
1588
1589 if (nxprov == NULL) {
1590 err = ENOENT;
1591 } else {
1592 err = nxprov_close(nxprov, TRUE);
1593 }
1594
1595 if (nxprov != NULL) {
1596 (void) nxprov_release_locked(nxprov);
1597 }
1598
1599 SK_UNLOCK();
1600
1601 return err;
1602}
1603
1604int
1605nxprov_close(struct kern_nexus_provider *nxprov, boolean_t locked)
1606{
1607 int err = 0;
1608
1609 if (!locked) {
1610 SK_LOCK();
1611 }
1612
1613 SK_LOCK_ASSERT_HELD();
1614
1615#if SK_LOG
1616 uuid_string_t uuidstr;
1617 SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
1618 sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
1619 nxprov->nxprov_flags, NXPROVF_BITS);
1620#endif /* SK_LOG */
1621
1622 if (nxprov->nxprov_flags & NXPROVF_CLOSED) {
1623 err = EALREADY;
1624 } else {
1625 struct kern_nexus *nx, *tnx;
1626
1627 nxprov->nxprov_ctl = NULL;
1628
1629 STAILQ_FOREACH_SAFE(nx, &nxprov->nxprov_nx_head,
1630 nx_prov_link, tnx) {
1631 nx_retain_locked(nx);
1632 (void) nx_close(nx, TRUE);
1633 (void) nx_release_locked(nx);
1634 }
1635
1636 if (STAILQ_EMPTY(&nxprov->nxprov_nx_head)) {
			/* no nexus was created on this provider, so detach now */
1638 nxprov_detach(nxprov, TRUE);
1639 } else {
1640 /* detach when last nexus is destroyed */
1641 ASSERT(nxprov->nxprov_refcnt > 1);
1642 nxprov->nxprov_flags |= NXPROVF_CLOSED;
1643 }
1644 }
1645
1646 if (!locked) {
1647 SK_UNLOCK();
1648 }
1649
1650 return err;
1651}
1652
1653static void
1654nxprov_detach(struct kern_nexus_provider *nxprov, boolean_t locked)
1655{
1656 if (!locked) {
1657 SK_LOCK();
1658 }
1659
1660 SK_LOCK_ASSERT_HELD();
1661
1662#if SK_LOG
1663 uuid_string_t uuidstr;
1664 SK_D("nxprov 0x%llx UUID %s flags 0x%b", SK_KVA(nxprov),
1665 sk_uuid_unparse(nxprov->nxprov_uuid, uuidstr),
1666 nxprov->nxprov_flags, NXPROVF_BITS);
1667#endif /* SK_LOG */
1668
1669 ASSERT(nxprov->nxprov_flags & NXPROVF_ATTACHED);
1670 STAILQ_REMOVE(&nxprov_head, nxprov, kern_nexus_provider, nxprov_link);
1671 nxprov->nxprov_flags &= ~NXPROVF_ATTACHED;
1672
1673 /* caller must hold an extra ref */
1674 ASSERT(nxprov->nxprov_refcnt > 1);
1675 (void) nxprov_release_locked(nxprov);
1676
1677 if (!locked) {
1678 SK_UNLOCK();
1679 }
1680}
1681
1682static struct kern_nexus_provider *
1683nxprov_alloc(struct kern_nexus_domain_provider *nxdom_prov, zalloc_flags_t how)
1684{
1685 struct kern_nexus_provider *nxprov;
1686 struct nxprov_params *nxp;
1687
1688 ASSERT(nxdom_prov != NULL);
1689
1690 nxp = nxprov_params_alloc(how);
1691 if (nxp == NULL) {
1692 SK_ERR("Failed to allocate nxprov_params");
1693 return NULL;
1694 }
1695
1696 nxprov = zalloc_flags(nxprov_zone, how | Z_ZERO);
1697 if (nxprov == NULL) {
1698 SK_ERR("Failed to allocate nxprov");
1699 nxprov_params_free(nxp);
1700 return NULL;
1701 }
1702
1703 nxprov->nxprov_dom_prov = nxdom_prov;
1704 nxprov->nxprov_params = nxp;
1705 /* hold a reference for nxprov */
1706 nxdom_prov_retain_locked(nxdom_prov);
1707
1708 return nxprov;
1709}
1710
1711static void
1712nxprov_free(struct kern_nexus_provider *nxprov)
1713{
1714 struct kern_nexus_domain_provider *nxdom_prov =
1715 nxprov->nxprov_dom_prov;
1716
1717 SK_LOCK_ASSERT_HELD();
1718
1719 ASSERT(nxdom_prov != NULL);
1720 (void) nxdom_prov_release_locked(nxdom_prov);
1721 nxprov->nxprov_dom_prov = NULL;
1722 ASSERT(nxprov->nxprov_params != NULL);
1723 nxprov_params_free(nxprov->nxprov_params);
1724 nxprov->nxprov_params = NULL;
1725 ASSERT(!(nxprov->nxprov_flags & NXPROVF_ATTACHED));
1726 SK_DF(SK_VERB_MEM, "nxprov 0x%llx FREE", SK_KVA(nxprov));
1727 zfree(nxprov_zone, nxprov);
1728}
1729
1730static void
1731nxprov_retain_locked(struct kern_nexus_provider *nxprov)
1732{
1733 SK_LOCK_ASSERT_HELD();
1734
1735 nxprov->nxprov_refcnt++;
1736 ASSERT(nxprov->nxprov_refcnt != 0);
1737}
1738
1739void
1740nxprov_retain(struct kern_nexus_provider *nxprov)
1741{
1742 SK_LOCK();
1743 nxprov_retain_locked(nxprov);
1744 SK_UNLOCK();
1745}
1746
1747static int
1748nxprov_release_locked(struct kern_nexus_provider *nxprov)
1749{
1750 int oldref = nxprov->nxprov_refcnt;
1751
1752 SK_LOCK_ASSERT_HELD();
1753
1754 ASSERT(nxprov->nxprov_refcnt != 0);
1755 if (--nxprov->nxprov_refcnt == 0) {
1756 nxprov_free(nxprov);
1757 }
1758
1759 return oldref == 1;
1760}
1761
1762int
1763nxprov_release(struct kern_nexus_provider *nxprov)
1764{
1765 int lastref;
1766
1767 SK_LOCK();
1768 lastref = nxprov_release_locked(nxprov);
1769 SK_UNLOCK();
1770
1771 return lastref;
1772}
1773
1774struct nxprov_params *
1775nxprov_params_alloc(zalloc_flags_t how)
1776{
1777 return zalloc_flags(nxprov_params_zone, how | Z_ZERO);
1778}
1779
1780void
1781nxprov_params_free(struct nxprov_params *nxp)
1782{
1783 SK_DF(SK_VERB_MEM, "nxp 0x%llx FREE", SK_KVA(nxp));
1784 zfree(nxprov_params_zone, nxp);
1785}
1786
1787static int
1788nx_check_pp(struct kern_nexus_provider *nxprov, struct kern_pbufpool *pp)
1789{
1790 struct kern_nexus_domain_provider *nxdom_prov = nxprov->nxprov_dom_prov;
1791
1792 if ((pp->pp_flags & (PPF_EXTERNAL | PPF_CLOSED)) != PPF_EXTERNAL) {
1793 SK_ERR("Rejecting \"%s\" built-in pp", pp->pp_name);
1794 return ENOTSUP;
1795 }
1796
1797 /*
1798 * Require that the nexus domain metadata type and the
1799 * metadata type of the caller-provided pbufpool match.
1800 */
1801 if (nxdom_prov->nxdom_prov_dom->nxdom_md_type !=
1802 pp->pp_md_type ||
1803 nxdom_prov->nxdom_prov_dom->nxdom_md_subtype !=
1804 pp->pp_md_subtype) {
1805 SK_ERR("Mismatch in metadata type/subtype "
1806 "(%u/%u != %u/%u)", pp->pp_md_type,
1807 nxdom_prov->nxdom_prov_dom->nxdom_md_type,
1808 pp->pp_md_subtype,
1809 nxdom_prov->nxdom_prov_dom->nxdom_md_subtype);
1810 return EINVAL;
1811 }
1812
	/*
	 * Require that the nexus provider memory configuration matches
	 * the caller-provided one with respect to monolithic mode: the
	 * flag must either be set on both or on neither. If one is set
	 * and the other isn't, bail.
	 */
1819 if (!!(PP_BUF_REGION_DEF(pp)->skr_mode & SKR_MODE_MONOLITHIC) ^
1820 !!(nxprov->nxprov_ext.nxpi_flags & NXPIF_MONOLITHIC)) {
1821 SK_ERR("Memory config mismatch: monolithic mode");
1822 return EINVAL;
1823 }
1824
1825 return 0;
1826}
1827
1828struct kern_nexus *
1829nx_create(struct nxctl *nxctl, const uuid_t nxprov_uuid,
1830 const nexus_type_t dom_type, const void *nx_ctx,
1831 nexus_ctx_release_fn_t nx_ctx_release, struct kern_pbufpool *tx_pp,
1832 struct kern_pbufpool *rx_pp, int *err)
1833{
1834 struct kern_nexus_domain_provider *nxdom_prov;
1835 struct kern_nexus_provider *nxprov = NULL;
1836 struct kern_nexus *nx = NULL;
1837#if SK_LOG
1838 uuid_string_t uuidstr;
1839#endif /* SK_LOG */
1840
1841 NXCTL_LOCK_ASSERT_HELD(nxctl);
1842
1843 ASSERT(dom_type < NEXUS_TYPE_MAX);
1844 ASSERT(!uuid_is_null(nxprov_uuid));
1845 *err = 0;
1846
1847 SK_LOCK();
1848
1849 STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
1850 if (nxctl == nxprov->nxprov_ctl &&
1851 uuid_compare(uu1: nxprov_uuid, uu2: nxprov->nxprov_uuid) == 0) {
1852 break;
1853 }
1854 }
1855
1856 if (nxprov == NULL || (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
1857 SK_ERR("Provider not found or has been closed");
1858 *err = ENOENT;
1859 goto done;
1860 }
1861
1862 nxdom_prov = nxprov->nxprov_dom_prov;
1863 if (dom_type != NEXUS_TYPE_UNDEFINED &&
1864 (nxdom_prov->nxdom_prov_dom->nxdom_type != dom_type)) {
1865 SK_ERR("Mismatch in domain type (0x%u != 0x%u)",
1866 dom_type, nxdom_prov->nxdom_prov_dom->nxdom_type);
1867 nxdom_prov = NULL;
1868 nxprov = NULL;
1869 *err = ENODEV;
1870 goto done;
1871 }
1872
1873 if ((dom_type == NEXUS_TYPE_NET_IF) && NXPROV_LLINK(nxprov) &&
1874 (!tx_pp || !rx_pp)) {
1875#if SK_LOG
1876 SK_ERR("TX/RX packet pool is required for netif logical link "
1877 "nexus provider UUID: %s",
1878 sk_uuid_unparse(nxprov_uuid, uuidstr));
1879#endif /* SK_LOG */
1880 nxdom_prov = NULL;
1881 nxprov = NULL;
1882 *err = EINVAL;
1883 goto done;
1884 }
1885
1886 if ((tx_pp != NULL && (*err = nx_check_pp(nxprov, pp: tx_pp)) != 0) ||
1887 (rx_pp != NULL && (*err = nx_check_pp(nxprov, pp: rx_pp)) != 0)) {
1888 goto done;
1889 }
1890
1891 nx = nx_alloc(Z_WAITOK);
1892
1893 STAILQ_INIT(&nx->nx_ch_head);
1894 STAILQ_INIT(&nx->nx_ch_nonxref_head);
1895 lck_rw_init(lck: &nx->nx_ch_if_adv_lock, grp: &nexus_lock_group,
1896 attr: &nexus_lock_attr);
1897 STAILQ_INIT(&nx->nx_ch_if_adv_head);
1898 uuid_generate_random(out: nx->nx_uuid);
1899 nx->nx_prov = nxprov;
1900 nx->nx_ctx = (void *)(uintptr_t)nx_ctx;
1901 nx->nx_ctx_release = nx_ctx_release;
1902 nx->nx_id = nxdom_prov->nxdom_prov_gencnt++;
1903
1904 if (tx_pp != NULL) {
1905 nx->nx_tx_pp = tx_pp;
1906 pp_retain(tx_pp); /* released by nx_free */
1907 }
1908
1909 if (rx_pp != NULL) {
1910 nx->nx_rx_pp = rx_pp;
1911 pp_retain(rx_pp); /* released by nx_free */
1912 }
1913
1914 /* this nexus is alive; tell the nexus constructor to set it up */
1915 if (nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor != NULL) {
1916 *err = nxprov->nxprov_dom_prov->nxdom_prov_nx_ctor(nx);
1917 if (*err != 0) {
1918 nx->nx_prov = NULL;
1919 goto done;
1920 }
1921 }
1922
1923 nxprov_retain_locked(nxprov); /* hold a ref on the nexus reg */
1924
1925 STAILQ_INSERT_TAIL(&nxprov->nxprov_nx_head, nx, nx_prov_link);
1926 nxprov->nxprov_nx_count++;
1927 RB_INSERT(kern_nexus_tree, &nx_head, nx);
1928 os_atomic_or(&nx->nx_flags, NXF_ATTACHED, relaxed);
1929
1930 nx_retain_locked(nx); /* one for the provider list */
1931 nx_retain_locked(nx); /* one for the global list */
1932 nx_retain_locked(nx); /* one for the caller */
1933
1934#if SK_LOG
1935 SK_D("nexus 0x%llx (%s:%s) UUID %s", SK_KVA(nx),
1936 nxdom_prov->nxdom_prov_dom->nxdom_name,
1937 nxdom_prov->nxdom_prov_name, sk_uuid_unparse(nx->nx_uuid, uuidstr));
1938#endif /* SK_LOG */
1939done:
1940 SK_UNLOCK();
1941
1942 if (*err != 0) {
1943 if (nx != NULL) {
1944 nx_free(nx);
1945 nx = NULL;
1946 }
1947 }
1948 return nx;
1949}
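
/*
 * Reference ownership sketch for nx_create(), per the three retains
 * above: one reference for the provider's nexus list, one for the
 * global RB tree, and one returned to the caller.  Roughly:
 *
 *	nx = nx_create(nxctl, prov_uuid, NEXUS_TYPE_UNDEFINED, ctx,
 *	    ctx_release, tx_pp, rx_pp, &err);
 *	...
 *	(void) nx_close(nx, FALSE);   // or nx_destroy() via the nxctl
 *	(void) nx_release(nx);        // drop the caller's reference
 *
 * nx_release() is assumed here to be the unlocked counterpart of
 * nx_release_locked(), following the nxctl/nxprov pattern in this file.
 */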
1950
1951int
1952nx_destroy(struct nxctl *nxctl, const uuid_t nx_uuid)
1953{
1954 struct kern_nexus *nx = NULL;
1955 struct kern_nexus find;
1956 int err = 0;
1957
1958 NXCTL_LOCK_ASSERT_HELD(nxctl);
1959
1960 SK_LOCK();
1961
1962 uuid_copy(dst: find.nx_uuid, src: nx_uuid);
1963 nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
1964 if (nx != NULL && nxctl != NX_PROV(nx)->nxprov_ctl) {
1965 nx = NULL;
1966 }
1967
1968 if (nx != NULL) {
1969 nx_retain_locked(nx);
1970 }
1971
1972 if (nx == NULL) {
1973 err = ENOENT;
1974 } else {
1975 err = nx_close(nx, TRUE);
1976 (void) nx_release_locked(nx);
1977 }
1978
1979 SK_UNLOCK();
1980
1981 return err;
1982}
1983
1984static inline int
1985nx_cmp(const struct kern_nexus *a, const struct kern_nexus *b)
1986{
1987 return uuid_compare(uu1: a->nx_uuid, uu2: b->nx_uuid);
1988}
1989
1990struct kern_nexus *
1991nx_find(const uuid_t nx_uuid, boolean_t locked)
1992{
1993 struct kern_nexus *nx = NULL;
1994 struct kern_nexus find;
1995
1996 if (!locked) {
1997 SK_LOCK();
1998 }
1999
2000 SK_LOCK_ASSERT_HELD();
2001
2002 uuid_copy(dst: find.nx_uuid, src: nx_uuid);
2003 nx = RB_FIND(kern_nexus_tree, &nx_head, &find);
2004 if (nx != NULL && (nx->nx_flags & NXF_CLOSED)) {
2005 nx = NULL;
2006 }
2007
2008 /* return reference to caller */
2009 if (nx != NULL) {
2010 nx_retain_locked(nx);
2011 }
2012
2013 if (!locked) {
2014 SK_UNLOCK();
2015 }
2016
2017 return nx;
2018}
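
/*
 * nx_find() returns a retained reference (or NULL), so each successful
 * lookup must be balanced with a release, as the bind/unbind/config
 * handlers above do:
 *
 *	SK_LOCK();
 *	nx = nx_find(nx_uuid, TRUE);
 *	if (nx != NULL) {
 *		...
 *		(void) nx_release_locked(nx);
 *	}
 *	SK_UNLOCK();
 */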
2019
2020int
2021nx_close(struct kern_nexus *nx, boolean_t locked)
2022{
2023 int err = 0;
2024
2025 if (!locked) {
2026 SK_LOCK();
2027 }
2028
2029 SK_LOCK_ASSERT_HELD();
2030
2031
2032 if (nx->nx_flags & NXF_CLOSED) {
2033 err = EALREADY;
2034 } else {
2035#if SK_LOG
2036 uuid_string_t uuidstr;
2037 SK_D("nexus 0x%llx (%s:%s) UUID %s flags 0x%b", SK_KVA(nx),
2038 NX_DOM(nx)->nxdom_name, NX_DOM_PROV(nx)->nxdom_prov_name,
2039 sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags,
2040 NXF_BITS);
2041#endif /* SK_LOG */
2042
2043 if (STAILQ_EMPTY(&nx->nx_ch_head)) {
2044 /* no regular channels open to it, so detach now */
2045 nx_detach(nx);
2046 } else {
2047 /* detach when the last channel closes */
2048 ASSERT(nx->nx_refcnt > 3);
2049 os_atomic_or(&nx->nx_flags, NXF_CLOSED, relaxed);
2050 }
2051 }
2052
2053 if (!locked) {
2054 SK_UNLOCK();
2055 }
2056
2057 return err;
2058}
2059
2060void
2061nx_stop(struct kern_nexus *nx)
2062{
2063 struct kern_nexus_provider *nxprov = nx->nx_prov;
2064
2065 SK_LOCK_ASSERT_HELD();
2066
2067 /* send a stop message */
2068 if (nxprov->nxprov_dom_prov->nxdom_prov_nx_stop != NULL) {
2069 nxprov->nxprov_dom_prov->nxdom_prov_nx_stop(nx);
2070 }
2071}
2072
2073void
2074nx_detach(struct kern_nexus *nx)
2075{
2076 struct kern_nexus_provider *nxprov = nx->nx_prov;
2077
2078 SK_LOCK_ASSERT_HELD();
2079
2080#if SK_LOG
2081 uuid_string_t uuidstr;
2082 SK_D("nexus 0x%llx UUID %s flags 0x%b", SK_KVA(nx),
2083 sk_uuid_unparse(nx->nx_uuid, uuidstr), nx->nx_flags, NXF_BITS);
2084#endif /* SK_LOG */
2085
2086 /* Caller must hold extra refs, on top of the two in reg/global lists */
2087 ASSERT(nx->nx_refcnt >= 3);
2088 ASSERT(nx->nx_flags & NXF_ATTACHED);
2089
2090 /* this nexus is done; let the nexus destructor do final cleanups */
2091 if (nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor != NULL) {
2092 nxprov->nxprov_dom_prov->nxdom_prov_nx_dtor(nx);
2093 }
2094
2095 ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
2096 ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));
2097
2098 STAILQ_REMOVE(&nxprov->nxprov_nx_head, nx, kern_nexus, nx_prov_link);
2099 nxprov->nxprov_nx_count--;
2100 RB_REMOVE(kern_nexus_tree, &nx_head, nx);
2101 os_atomic_andnot(&nx->nx_flags, NXF_ATTACHED, relaxed);
2102 nx->nx_prov = NULL;
2103 if (nx->nx_ctx_release != NULL) {
2104 nx->nx_ctx_release(nx->nx_ctx);
2105 }
2106 nx->nx_ctx = NULL;
2107
2108 (void) nx_release_locked(nx); /* one for the reg list */
2109 (void) nx_release_locked(nx); /* one for the global list */
2110
2111 /*
2112 * If this was the last nexus and the provider has been closed,
2113 * detach the provider and finish up the postponed job.
2114 */
2115 if (STAILQ_EMPTY(&nxprov->nxprov_nx_head) &&
2116 (nxprov->nxprov_flags & NXPROVF_CLOSED)) {
2117 nxprov_detach(nxprov, TRUE);
2118 }
2119 (void) nxprov_release_locked(nxprov);
2120}
2121
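/*
 * Allocate the shared nexus advisory region and object, and initialize
 * the advisory metadata header.  Only the flowswitch and netif advisory
 * types are supported; the type-specific advisory structure immediately
 * follows the metadata header within the same object.
 */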
2122int
2123nx_advisory_alloc(struct kern_nexus *nx, const char *name,
2124 struct skmem_region_params *srp_nexusadv, nexus_advisory_type_t type)
2125{
2126 struct __kern_nexus_adv_metadata *adv_md;
2127
2128 _CASSERT(sizeof(struct __kern_nexus_adv_metadata) == sizeof(uint64_t));
2129 _CASSERT((sizeof(struct sk_nexusadv) +
2130 sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
2131 _CASSERT((sizeof(struct netif_nexus_advisory) +
2132 sizeof(struct __kern_nexus_adv_metadata)) <= NX_NEXUSADV_MAX_SZ);
2133 ASSERT(nx->nx_adv.nxv_reg == NULL);
2134 ASSERT(nx->nx_adv.nxv_adv == NULL);
2135 ASSERT(type == NEXUS_ADVISORY_TYPE_FLOWSWITCH ||
2136 type == NEXUS_ADVISORY_TYPE_NETIF);
2137
2138 if ((nx->nx_adv.nxv_reg = skmem_region_create(name, srp_nexusadv,
2139 NULL, NULL, NULL)) == NULL) {
2140 return ENOMEM;
2141 }
2142
2143 nx->nx_adv.nxv_adv = skmem_region_alloc(nx->nx_adv.nxv_reg, NULL,
2144 NULL, NULL, (SKMEM_NOSLEEP | SKMEM_PANIC));
2145 adv_md = nx->nx_adv.nxv_adv;
2146 adv_md->knam_version = NX_ADVISORY_MD_CURRENT_VERSION;
2147 adv_md->knam_type = type;
2148 adv_md->__reserved = 0;
2149 nx->nx_adv.nxv_adv_type = type;
2150 nx->nx_adv.flowswitch_nxv_adv = (void *)(adv_md + 1);
2151 if (type == NEXUS_ADVISORY_TYPE_FLOWSWITCH) {
2152 nx->nx_adv.flowswitch_nxv_adv->nxadv_ver =
2153 NX_FLOWSWITCH_ADVISORY_CURRENT_VERSION;
2154 } else {
2155 nx->nx_adv.netif_nxv_adv->nna_version =
2156 NX_NETIF_ADVISORY_CURRENT_VERSION;
2157 }
2158 return 0;
2159}
2160
2161void
2162nx_advisory_free(struct kern_nexus *nx)
2163{
2164 if (nx->nx_adv.nxv_reg != NULL) {
2165 ASSERT(nx->nx_adv.nxv_adv != NULL);
2166 skmem_region_free(nx->nx_adv.nxv_reg,
2167 nx->nx_adv.nxv_adv, NULL);
2168 nx->nx_adv.nxv_adv = NULL;
2169 nx->nx_adv.nxv_adv_type = NEXUS_ADVISORY_TYPE_INVALID;
2170 nx->nx_adv.flowswitch_nxv_adv = NULL;
2171 skmem_region_release(nx->nx_adv.nxv_reg);
2172 nx->nx_adv.nxv_reg = NULL;
2173 }
2174
2175 ASSERT(nx->nx_adv.nxv_reg == NULL);
2176 ASSERT(nx->nx_adv.nxv_adv == NULL);
2177 ASSERT(nx->nx_adv.nxv_adv_type == NEXUS_ADVISORY_TYPE_INVALID);
2178 ASSERT(nx->nx_adv.flowswitch_nxv_adv == NULL);
2179}
2180
2181static struct kern_nexus *
2182nx_alloc(zalloc_flags_t how)
2183{
2184 SK_LOCK_ASSERT_HELD();
2185
2186 return zalloc_flags(nx_zone, how | Z_ZERO);
2187}
2188
2189static void
2190nx_free(struct kern_nexus *nx)
2191{
2192 ASSERT(!(nx->nx_flags & NXF_ATTACHED) && nx->nx_prov == NULL);
2193 ASSERT(STAILQ_EMPTY(&nx->nx_ch_head));
2194 ASSERT(STAILQ_EMPTY(&nx->nx_ch_nonxref_head));
2195
2196 nx_port_free_all(nx);
2197
2198 if (nx->nx_tx_pp != NULL) {
2199 pp_release(nx->nx_tx_pp);
2200 nx->nx_tx_pp = NULL;
2201 }
2202 if (nx->nx_rx_pp != NULL) {
2203 pp_release(nx->nx_rx_pp);
2204 nx->nx_rx_pp = NULL;
2205 }
2206
2207 ASSERT(STAILQ_EMPTY(&nx->nx_ch_if_adv_head));
2208 lck_rw_destroy(&nx->nx_ch_if_adv_lock, &nexus_lock_group);
2209
2210 SK_DF(SK_VERB_MEM, "nexus 0x%llx FREE", SK_KVA(nx));
2211 zfree(nx_zone, nx);
2212}
2213
2214void
2215nx_retain_locked(struct kern_nexus *nx)
2216{
2217 SK_LOCK_ASSERT_HELD();
2218
2219 nx->nx_refcnt++;
2220 VERIFY(nx->nx_refcnt > 0);
2221}
2222
2223void
2224nx_retain(struct kern_nexus *nx)
2225{
2226 SK_LOCK();
2227 nx_retain_locked(nx);
2228 SK_UNLOCK();
2229}
2230
2231int
2232nx_release_locked(struct kern_nexus *nx)
2233{
2234 int oldref = nx->nx_refcnt;
2235
2236 SK_LOCK_ASSERT_HELD();
2237
2238 VERIFY(nx->nx_refcnt > 0);
2239 if (--nx->nx_refcnt == 0) {
2240 nx_free(nx);
2241 }
2242
2243 return oldref == 1;
2244}
2245
2246int
2247nx_release(struct kern_nexus *nx)
2248{
2249 int lastref;
2250
2251 SK_LOCK_ASSERT_NOTHELD();
2252
2253 SK_LOCK();
2254 lastref = nx_release_locked(nx);
2255 SK_UNLOCK();
2256
2257 return lastref;
2258}
2259
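/*
 * Invoke the provider's ring and slot constructors for each non-host
 * ring of the channel being connected.  On failure, rings that were
 * already initialized are torn back down via nx_fini_rings().  For
 * every ring whose slots were initialized, a "noidle" retain is placed
 * on the kernel slot descriptor region so that it survives until the
 * matching slot_fini() callbacks have run.
 */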
2260static int
2261nx_init_rings(struct kern_nexus *nx, struct kern_channel *ch)
2262{
2263 struct kern_nexus_provider *nxprov = NX_PROV(nx);
2264 struct nexus_adapter *na = ch->ch_na;
2265 boolean_t undo = FALSE;
2266 int ksd_retains = 0;
2267 enum txrx t;
2268 int err = 0;
2269
2270 ASSERT((ch->ch_flags & (CHANF_EXT_PRECONNECT | CHANF_EXT_CONNECTED)) ==
2271 CHANF_EXT_PRECONNECT);
2272
2273 if (nxprov->nxprov_ext.nxpi_ring_init == NULL) {
2274 return 0;
2275 }
2276
2277 for_rx_tx(t) {
2278 uint32_t i;
2279
2280 for (i = 0; i < na_get_nrings(na, t); i++) {
2281 struct __kern_channel_ring *kring = &NAKR(na, t)[i];
2282
2283 /* skip host rings */
2284 if (kring->ckr_flags & CKRF_HOST) {
2285 continue;
2286 }
2287
2288 if ((err = nxprov->nxprov_ext.nxpi_ring_init(
2289 nxprov, nx, ch, kring, (kring->ckr_tx == NR_TX),
2290 &kring->ckr_ctx)) != 0) {
2291 SK_D("ch 0x%llx flags %b nx 0x%llx kr \"%s\" "
2292 "(0x%llx) krflags %b ring_init error %d",
2293 SK_KVA(ch), ch->ch_flags, CHANF_BITS,
2294 SK_KVA(nx), kring->ckr_name, SK_KVA(kring),
2295 kring->ckr_flags, CKRF_BITS, err);
2296 kring->ckr_ctx = NULL;
2297 undo = TRUE;
2298 break;
2299 }
2300 kring->ckr_flags |= CKRF_EXT_RING_INITED;
2301
2302 if ((err = nx_init_slots(nx, kring)) != 0) {
2303 undo = TRUE;
2304 break;
2305 }
2306
2307 if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
2308 ++ksd_retains;
2309 }
2310 }
2311 if (undo) {
2312 break;
2313 }
2314 }
2315
2316 /*
2317 * Note: retain the KSD region even in case of error, since we may
2318 * have set the CKRF_EXT_SLOTS_INITED flag for some of the rings;
2319 * nx_fini_rings() will release it based on that flag.
2320 */
2321 if (ksd_retains != 0) {
2322 /*
2323 * Mark the kernel slot descriptor region as busy; this
2324 * prevents it from being torn-down at channel defunct
2325 * time, as we need to invoke the slot_fini() callback
2326 * for each slot and we need the descriptors until then.
2327 */
2328 skmem_arena_nexus_sd_set_noidle(skmem_arena_nexus(na->na_arena),
2329 ksd_retains);
2330 }
2331
2332 if (err != 0) {
2333 ASSERT(undo);
2334 nx_fini_rings(nx, ch);
2335 }
2336
2337 return err;
2338}
2339
2340static void
2341nx_fini_rings(struct kern_nexus *nx, struct kern_channel *ch)
2342{
2343 struct kern_nexus_provider *nxprov = NX_PROV(nx);
2344 struct nexus_adapter *na = ch->ch_na;
2345 int ksd_releases = 0;
2346 enum txrx t;
2347
2348 for_rx_tx(t) {
2349 uint32_t i;
2350
2351 for (i = 0; i < na_get_nrings(na, t); i++) {
2352 struct __kern_channel_ring *kring = &NAKR(na, t)[i];
2353
2354 if (!(kring->ckr_flags & CKRF_EXT_RING_INITED)) {
2355 continue;
2356 }
2357
2358 ASSERT(!(kring->ckr_flags & CKRF_HOST));
2359 ASSERT(nxprov->nxprov_ext.nxpi_ring_fini != NULL);
2360 nxprov->nxprov_ext.nxpi_ring_fini(nxprov, nx, kring);
2361 kring->ckr_flags &= ~CKRF_EXT_RING_INITED;
2362
2363 if (kring->ckr_flags & CKRF_EXT_SLOTS_INITED) {
2364 ++ksd_releases;
2365 }
2366
2367 /*
2368 * Undo the work done in nx_init_slots() and inform
2369 * the external domain provider, if applicable, that
2370 * the slots for this ring are no longer valid.
2371 */
2372 nx_fini_slots(nx, kring);
2373 kring->ckr_ctx = NULL;
2374 }
2375 }
2376
2377 if (ksd_releases != 0) {
2378 /*
2379 * Now that we've finished invoking the slot_fini()
2380 * callbacks, release the busy retain counts held
2381 * earlier in nx_init_rings(). This will allow the
2382 * kernel slot descriptor region to be torn down.
2383 */
2384 skmem_arena_nexus_sd_set_noidle(
2385 skmem_arena_nexus(na->na_arena), -ksd_releases);
2386 }
2387}
2388
2389static int
2390nx_init_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
2391{
2392 struct kern_nexus_provider *nxprov = NX_PROV(nx);
2393 struct __slot_desc *slot = kring->ckr_ksds;
2394 int err = 0;
2395 uint32_t i;
2396
2397 /*
2398 * If the slot init callback was not provided, or if the
2399 * kring was not created to hold any slot contexts, don't
2400 * go any further.
2401 */
2402 if (nxprov->nxprov_ext.nxpi_slot_init == NULL ||
2403 kring->ckr_slot_ctxs == NULL) {
2404 return 0;
2405 }
2406
2407 ASSERT(kring->ckr_slot_ctxs_set == 0);
2408 ASSERT(slot != NULL);
2409
2410 for (i = 0; i < kring->ckr_num_slots; i++) {
2411 struct kern_slot_prop *slot_ctx_prop = NULL;
2412 void *slot_ctx_arg = NULL;
2413
2414 ASSERT(&slot[i] <= kring->ckr_ksds_last);
2415 if ((err = nxprov->nxprov_ext.nxpi_slot_init(nxprov, nx, kring,
2416 &slot[i], i, &slot_ctx_prop, &slot_ctx_arg)) != 0) {
2417 SK_D("nx 0x%llx kr \"%s\" (0x%llx) krflags %b slot %u "
2418 "slot_init error %d", SK_KVA(nx), kring->ckr_name,
2419 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, i, err);
2420 break;
2421 }
2422 /* we don't want this to be used by client, so verify here */
2423 ASSERT(slot_ctx_prop == NULL);
2424 kring->ckr_slot_ctxs[i].slot_ctx_arg =
2425 (mach_vm_address_t)slot_ctx_arg;
2426 kring->ckr_slot_ctxs_set++;
2427 }
2428
2429 if (err != 0) {
2430 nx_fini_slots(nx, kring);
2431 } else {
2432 kring->ckr_flags |= CKRF_EXT_SLOTS_INITED;
2433 }
2434
2435 return err;
2436}
2437
2438static void
2439nx_fini_slots(struct kern_nexus *nx, struct __kern_channel_ring *kring)
2440{
2441 struct kern_nexus_provider *nxprov = NX_PROV(nx);
2442 struct __slot_desc *slot = kring->ckr_ksds;
2443 uint32_t i;
2444
2445 ASSERT(!(kring->ckr_flags & CKRF_EXT_SLOTS_INITED) ||
2446 nxprov->nxprov_ext.nxpi_slot_fini != NULL);
2447 ASSERT(slot != NULL || !(kring->ckr_flags & CKRF_EXT_SLOTS_INITED));
2448
2449 for (i = 0; i < kring->ckr_slot_ctxs_set; i++) {
2450 ASSERT(slot != NULL && &slot[i] <= kring->ckr_ksds_last);
2451 if (nxprov->nxprov_ext.nxpi_slot_fini != NULL) {
2452 nxprov->nxprov_ext.nxpi_slot_fini(nxprov, nx,
2453 kring, &slot[i], i);
2454 }
2455 if (kring->ckr_slot_ctxs != NULL) {
2456 kring->ckr_slot_ctxs[i].slot_ctx_arg = 0;
2457 }
2458 }
2459 kring->ckr_slot_ctxs_set = 0;
2460
2461 /* We're done with this kring */
2462 kring->ckr_flags &= ~CKRF_EXT_SLOTS_INITED;
2463}
2464
2465
2466/* 64-bit mask with range */
2467#define BMASK64(_beg, _end) \
2468 ((NX_PORT_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
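/*
 * BMASK64(_beg, _end) yields a 64-bit mask with bits _beg.._end set,
 * inclusive; e.g. BMASK64(3, 5) == 0x38.  This relies on
 * NX_PORT_CHUNK_FREE being the all-bits-set "every port free" bitmap
 * value used for fresh chunks below.
 */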
2469
2470int
2471nx_port_find(struct kern_nexus *nx, nexus_port_t first,
2472 nexus_port_t last, nexus_port_t *nx_port)
2473{
2474 int err = 0;
2475
2476 ASSERT(first < last);
2477 *nx_port = NEXUS_PORT_ANY;
2478
2479 if (nx->nx_num_ports == 0 || (first + 1) >= nx->nx_num_ports) {
2480 /*
2481 * Left edge of the range is beyond the current map;
2482 * let nx_port_alloc() handle the growing later.
2483 */
2484 *nx_port = first;
2485 } else {
2486 nexus_port_size_t fc = (first / NX_PORT_CHUNK);
2487 nexus_port_size_t lc = (MIN(last, nx->nx_num_ports) / NX_PORT_CHUNK);
2488 nexus_port_size_t lim = (nx->nx_num_ports / NX_PORT_CHUNK);
2489 nexus_port_size_t i, j;
2490 bitmap_t *bmap;
2491
2492 /*
2493 * The right edge of the range is either within or
2494 * beyond the current map; scan thru the current
2495 * map and find the first available port.
2496 */
2497 for (i = fc; i <= lc; i++) {
2498 bitmap_t mask;
2499 nexus_port_size_t beg = 0, end = 63;
2500
2501 if (i == fc) {
2502 beg = (first % NX_PORT_CHUNK);
2503 }
2504 if (i == (last / NX_PORT_CHUNK)) {
2505 end = (last % NX_PORT_CHUNK);
2506 }
2507
2508 if (i < lim) {
2509 bmap = &nx->nx_ports_bmap[i];
2510 mask = BMASK64(beg, end);
2511
2512 j = (nexus_port_size_t)ffsll((*bmap) & mask);
2513 if (j == 0) {
2514 continue;
2515 }
2516
2517 --j;
2518 *nx_port = (i * NX_PORT_CHUNK) + j;
2519 }
2520 break;
2521 }
2522
2523 /*
2524 * If the requested range is within the current map and we
2525 * couldn't find a port, return an error. Otherwise, return
2526 * the next port index to trigger growing later.
2527 */
2528 if (*nx_port == NEXUS_PORT_ANY) {
2529 if (lc == (last / NX_PORT_CHUNK)) {
2530 err = EBUSY;
2531 SK_ERR("port unavail in [%u, %u)", first, last);
2532 } else {
2533 *nx_port = nx->nx_num_ports;
2534 }
2535 }
2536 }
2537
2538 SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d (err %d)", SK_KVA(nx),
2539 (int)*nx_port, err);
2540
2541 return err;
2542}
2543
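/*
 * Grow the port array and the free-port bitmap by `grow' ports (a
 * multiple of NX_PORT_CHUNK), up to the domain's port maximum.  Newly
 * added bitmap chunks start out with all bits set, i.e. all ports free.
 */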
2544static int
2545nx_port_grow(struct kern_nexus *nx, nexus_port_size_t grow)
2546{
2547 ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
2548 nexus_port_t dom_port_max = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
2549 struct nx_port_info *ports;
2550 size_t limit;
2551 nexus_port_size_t i, num_ports, old_num_ports;
2552 bitmap_t *bmap;
2553
2554 ASSERT(grow > 0 && (grow % NX_PORT_CHUNK) == 0);
2555 ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2556 _CASSERT((sizeof(*bmap) * 8) == NX_PORT_CHUNK);
2557 ASSERT(powerof2(dom_port_max));
2558 ASSERT(dom_port_max % NX_PORT_CHUNK == 0);
2559
2560 old_num_ports = nx->nx_num_ports;
2561 num_ports = nx->nx_num_ports + grow;
2562 limit = P2ROUNDUP(dom_port_max, NX_PORT_CHUNK);
2563 if (num_ports > limit) {
2564 SK_ERR("can't grow, total %u grow %u (new %u > dom_max %u)",
2565 nx->nx_num_ports, grow, num_ports, limit);
2566 return EDOM;
2567 }
2568
2569 if ((bmap = sk_realloc_data(nx->nx_ports_bmap,
2570 (old_num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
2571 (num_ports / NX_PORT_CHUNK) * sizeof(*bmap),
2572 Z_WAITOK, skmem_tag_nx_port)) == NULL) {
2573 SK_ERR("bmap alloc failed, num_port %u", num_ports);
2574 return ENOMEM;
2575 }
2576 nx->nx_ports_bmap = bmap;
2577
2578 if ((ports = sk_realloc_type_array(struct nx_port_info, old_num_ports,
2579 num_ports, nx->nx_ports, Z_WAITOK, skmem_tag_nx_port)) == NULL) {
2580 /* can't free bmap here, otherwise nexus won't work */
2581 SK_ERR("nx_ports alloc failed, num_port %u", num_ports);
2582 return ENOMEM;
2583 }
2584
2585 /* initialize the additional new ports */
2586 bzero(&ports[nx->nx_num_ports], (grow * sizeof(*ports)));
2587 nx->nx_ports = ports;
2588
2589 /* initialize new bitmaps (set all bits) */
2590 for (i = (nx->nx_num_ports / NX_PORT_CHUNK);
2591 i < (num_ports / NX_PORT_CHUNK); i++) {
2592 bmap[i] = NX_PORT_CHUNK_FREE;
2593 }
2594
2595 nx->nx_num_ports = num_ports;
2596
2597 SK_DF(SK_VERB_NXPORT, "!!! nx 0x%llx ports %u/%u, %u ports added",
2598 SK_KVA(nx), nx->nx_active_ports, nx->nx_num_ports, grow);
2599
2600 return 0;
2601}
2602
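/*
 * Allocate (or look up) a nexus port.  The port map is grown on demand
 * if nx_port lies beyond it.  A free port can only be claimed from
 * userland if the nexus is anonymous; named nexuses require a prior
 * binding whose credentials must match.  If the port is already
 * occupied by an adapter and the caller passes *na == NULL, a retained
 * reference to that adapter is returned instead.
 */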
2603int
2604nx_port_alloc(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb,
2605 struct nexus_adapter **na, struct proc *p)
2606{
2607 struct nx_port_info *npi = NULL;
2608 struct nxbind *nxb0;
2609 size_t g;
2610 uint32_t i, j;
2611 bitmap_t *bmap;
2612 bool refonly = false;
2613 int err = 0;
2614
2615 ASSERT(nx_port != NEXUS_PORT_ANY);
2616 ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2617
2618 /* port is zero-based, so adjust here */
2619 if ((nx_port + 1) > nx->nx_num_ports) {
2620 g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2621 VERIFY(g <= NEXUS_PORT_MAX);
2622 if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2623 goto done;
2624 }
2625 }
2626 ASSERT(err == 0);
2627 ASSERT(nx_port < nx->nx_num_ports);
2628 npi = &nx->nx_ports[nx_port];
2629 nxb0 = npi->npi_nxb;
2630 i = nx_port / NX_PORT_CHUNK;
2631 j = nx_port % NX_PORT_CHUNK;
2632 bmap = &nx->nx_ports_bmap[i];
2633
2634 if (bit_test(*bmap, j)) {
2635 /* port is not (yet) bound or allocated */
2636 ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2637 if (p != kernproc && !NX_ANONYMOUS_PROV(nx)) {
2638 /*
2639 * If the port allocation is requested by userland
2640 * and the nexus is non-anonymous, then fail the
2641 * request.
2642 */
2643 err = EACCES;
2644 SK_ERR("user proc alloc on named nexus needs binding");
2645 } else if (na != NULL && *na != NULL) {
2646 /*
2647 * Otherwise claim it (clear bit) if the caller
2648 * supplied an adapter for this port; else, it
2649 * is just an existential check and so there's
2650 * no action needed at this point (we'll skip
2651 * the init below since vpna is NULL).
2652 */
2653 bit_clear(*bmap, j);
2654 }
2655 } else {
2656 /* if port is bound, check if credentials match */
2657 if (nxb0 != NULL && p != kernproc && !NX_ANONYMOUS_PROV(nx) &&
2658 (nxb == NULL || !nxb_is_equal(nxb0, nxb))) {
2659 SK_ERR("nexus binding mismatch");
2660 err = EACCES;
2661 } else {
2662 /*
2663 * If port is already occupied by an adapter,
2664 * see if the client is requesting a reference
2665 * to it; if so, return the adapter. Otherwise,
2666 * if unoccupied and vpna is non-NULL, associate
2667 * it with this nexus port via the below init.
2668 */
2669 if (NPI_NA(npi) != NULL) {
2670 if (na != NULL && *na == NULL) {
2671 *na = NPI_NA(npi);
2672 na_retain_locked(*na);
2673 /* skip the init below */
2674 refonly = true;
2675 } else {
2676 /*
2677 * If the client supplied an adapter
2678 * (regardless of its value) for a
2679 * nexus port that's already occupied,
2680 * then we fail the request.
2681 */
2682 SK_ERR("nexus adapted exits");
2683 err = EEXIST;
2684 }
2685 }
2686 }
2687 }
2688
2689done:
2690 /* initialize the nexus port and the adapter occupying it */
2691 if (err == 0 && na != NULL && *na != NULL && !refonly) {
2692 ASSERT(nx_port < nx->nx_num_ports);
2693 ASSERT(npi->npi_nah == 0);
2694 ASSERT(nx->nx_active_ports < nx->nx_num_ports);
2695 ASSERT(!bit_test(nx->nx_ports_bmap[nx_port / NX_PORT_CHUNK],
2696 (nx_port % NX_PORT_CHUNK)));
2697
2698 nx->nx_active_ports++;
2699 npi->npi_nah = NPI_NA_ENCODE(*na, NEXUS_PORT_STATE_WORKING);
2700 (*na)->na_nx_port = nx_port;
2701 }
2702
2703 SK_DF(SK_VERB_NXPORT, "nx 0x%llx nx_port %d, ports %u/%u (err %d)",
2704 SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports,
2705 err);
2706
2707 return err;
2708}
2709
2710void
2711nx_port_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
2712{
2713 struct nx_port_info *npi = &nx->nx_ports[nx_port];
2714
2715 npi->npi_nah = NPI_NA_ENCODE(npi->npi_nah,
2716 NEXUS_PORT_STATE_DEFUNCT);
2717}
2718
2719void
2720nx_port_free(struct kern_nexus *nx, nexus_port_t nx_port)
2721{
2722 struct nx_port_info *npi = NULL;
2723 bitmap_t *bmap;
2724 uint32_t i, j;
2725
2726 ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2727 ASSERT(nx_port != NEXUS_PORT_ANY && nx_port < nx->nx_num_ports);
2728 ASSERT(nx->nx_active_ports != 0);
2729
2730 i = nx_port / NX_PORT_CHUNK;
2731 j = nx_port % NX_PORT_CHUNK;
2732 bmap = &nx->nx_ports_bmap[i];
2733 ASSERT(!bit_test(*bmap, j));
2734
2735 npi = &nx->nx_ports[nx_port];
2736 npi->npi_nah = 0;
2737 if (npi->npi_nxb == NULL) {
2738 /* it's vacant, release it (set bit) */
2739 bit_set(*bmap, j);
2740 }
2741
2742 nx->nx_active_ports--;
2743
2744 //XXX wshen0123@apple.com --- try to shrink bitmap & nx_ports ???
2745
2746 SK_DF(SK_VERB_NXPORT, "--- nx 0x%llx nx_port %d, ports %u/%u",
2747 SK_KVA(nx), (int)nx_port, nx->nx_active_ports, nx->nx_num_ports);
2748}
2749
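/*
 * Bind a nexus port to the supplied credentials (and optional info
 * blob), growing the port map if needed.  The binding is moved into a
 * freshly allocated nxbind and the port is claimed in the bitmap.
 * Fails with EEXIST if the port is already bound or occupied.
 */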
2750int
2751nx_port_bind_info(struct kern_nexus *nx, nexus_port_t nx_port,
2752 struct nxbind *nxb0, void *info)
2753{
2754 struct nx_port_info *npi = NULL;
2755 size_t g;
2756 uint32_t i, j;
2757 bitmap_t *bmap;
2758 int err = 0;
2759
2760 ASSERT(nx_port != NEXUS_PORT_ANY);
2761 ASSERT(nx_port < NXDOM_MAX(NX_DOM(nx), ports));
2762 ASSERT((nx->nx_num_ports % NX_PORT_CHUNK) == 0);
2763 ASSERT(nxb0 != NULL);
2764
2765 if ((nx_port) + 1 > nx->nx_num_ports) {
2766 g = P2ROUNDUP((nx_port + 1) - nx->nx_num_ports, NX_PORT_CHUNK);
2767 VERIFY(g <= NEXUS_PORT_MAX);
2768 if ((err = nx_port_grow(nx, (nexus_port_size_t)g)) != 0) {
2769 goto done;
2770 }
2771 }
2772 ASSERT(err == 0);
2773
2774 npi = &nx->nx_ports[nx_port];
2775 i = nx_port / NX_PORT_CHUNK;
2776 j = nx_port % NX_PORT_CHUNK;
2777 bmap = &nx->nx_ports_bmap[i];
2778 if (bit_test(*bmap, j)) {
2779 /* port is not (yet) bound or allocated */
2780 ASSERT(npi->npi_nah == 0 && npi->npi_nxb == NULL);
2781
2783 struct nxbind *nxb = nxb_alloc(Z_WAITOK);
2784 nxb_move(nxb0, nxb);
2785 npi->npi_nxb = nxb;
2786 npi->npi_info = info;
2787 /* claim it (clear bit) */
2788 bit_clear(*bmap, j);
2789 ASSERT(err == 0);
2790 } else {
2791 /* port is already taken */
2792 ASSERT(NPI_NA(npi) != NULL || npi->npi_nxb != NULL);
2793 err = EEXIST;
2794 }
2795done:
2796
2797 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2798 "+++ nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2799 (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2800
2801 return err;
2802}
2803
2804int
2805nx_port_bind(struct kern_nexus *nx, nexus_port_t nx_port, struct nxbind *nxb0)
2806{
2807 return nx_port_bind_info(nx, nx_port, nxb0, NULL);
2808}
2809
2810static int
2811nx_port_info_size(void *info, size_t *sz)
2812{
2813 struct nx_port_info_header *hdr = info;
2814
2815 switch (hdr->ih_type) {
2816 case NX_PORT_INFO_TYPE_NETIF:
2817 break;
2818 default:
2819 return EINVAL;
2820 }
2821 *sz = hdr->ih_size;
2822 return 0;
2823}
2824
2825int
2826nx_port_unbind(struct kern_nexus *nx, nexus_port_t nx_port)
2827{
2828 struct nx_port_info *npi = NULL;
2829 struct nxbind *nxb;
2830 uint32_t i, j;
2831 bitmap_t *bmap;
2832 int err = 0;
2833
2834 ASSERT(nx_port != NEXUS_PORT_ANY);
2835
2836 if (nx_port >= nx->nx_num_ports) {
2837 err = EDOM;
2838 goto done;
2839 }
2840
2841 npi = &nx->nx_ports[nx_port];
2842 i = nx_port / NX_PORT_CHUNK;
2843 j = nx_port % NX_PORT_CHUNK;
2844 bmap = &nx->nx_ports_bmap[i];
2845
2846 if ((nxb = npi->npi_nxb) == NULL) {
2847 /* must be either free or allocated */
2848 ASSERT(NPI_NA(npi) == NULL ||
2849 (!bit_test(*bmap, j) && nx->nx_active_ports > 0));
2850 err = ENOENT;
2851 } else {
2852 nxb_free(nxb);
2853 npi->npi_nxb = NULL;
2854 if (npi->npi_info != NULL) {
2855 size_t sz;
2856
2857 VERIFY(nx_port_info_size(npi->npi_info, &sz) == 0);
2858 sk_free_data(npi->npi_info, sz);
2859 npi->npi_info = NULL;
2860 }
2861 ASSERT(!bit_test(*bmap, j));
2862 if (NPI_NA(npi) == NULL) {
2863 /* it's vacant, release it (set bit) */
2864 bit_set(*bmap, j);
2865 }
2866 }
2867
2868done:
2869 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NXPORT,
2870 "--- nx 0x%llx nx_port %d, ports %u/%u (err %d)", SK_KVA(nx),
2871 (int)nx_port, nx->nx_active_ports, nx->nx_num_ports, err);
2872
2873 return err;
2874}
2875
2876struct nexus_adapter *
2877nx_port_get_na(struct kern_nexus *nx, nexus_port_t nx_port)
2878{
2879 if (nx->nx_ports != NULL && nx->nx_num_ports > nx_port) {
2880 return NPI_NA(&nx->nx_ports[nx_port]);
2881 } else {
2882 return NULL;
2883 }
2884}
2885
2886int
2887nx_port_get_info(struct kern_nexus *nx, nexus_port_t port,
2888 nx_port_info_type_t type, void *info, uint32_t len)
2889{
2890 struct nx_port_info *npi;
2891 struct nx_port_info_header *hdr;
2892
2893 if (nx->nx_ports == NULL || port >= nx->nx_num_ports) {
2894 return ENXIO;
2895 }
2896 npi = &nx->nx_ports[port];
2897 hdr = npi->npi_info;
2898 if (hdr == NULL) {
2899 return ENOENT;
2900 }
2901
2902 if (hdr->ih_type != type) {
2903 return EINVAL;
2904 }
2905
2906 bcopy(npi->npi_info, info, len);
2907 return 0;
2908}
2909
2910bool
2911nx_port_is_valid(struct kern_nexus *nx, nexus_port_t nx_port)
2912{
2913 return nx_port < nx->nx_num_ports;
2914}
2915
2916bool
2917nx_port_is_defunct(struct kern_nexus *nx, nexus_port_t nx_port)
2918{
2919 ASSERT(nx_port_is_valid(nx, nx_port));
2920
2921 return NPI_IS_DEFUNCT(&nx->nx_ports[nx_port]);
2922}
2923
2924void
2925nx_port_free_all(struct kern_nexus *nx)
2926{
2927 uint32_t num_ports;
2928
2929 /* uncrustify doesn't handle C blocks properly */
2930 /* BEGIN IGNORE CODESTYLE */
2931 nx_port_foreach(nx, ^(nexus_port_t p) {
2932 struct nxbind *nxb;
2933 void *info;
2934 nxb = nx->nx_ports[p].npi_nxb;
2935 info = nx->nx_ports[p].npi_info;
2936 if (nxb != NULL) {
2937 nxb_free(nxb);
2938 nx->nx_ports[p].npi_nxb = NULL;
2939 }
2940 if (info != NULL) {
2941 size_t sz;
2942
2943 VERIFY(nx_port_info_size(info, &sz) == 0);
2944 skn_free_data(info, info, sz);
2945 nx->nx_ports[p].npi_info = NULL;
2946 }
2947 });
2948 /* END IGNORE CODESTYLE */
2949
2950 num_ports = nx->nx_num_ports;
2951 nx->nx_num_ports = 0;
2952 nx->nx_active_ports = 0;
2953 skn_free_data(ports_bmap,
2954 nx->nx_ports_bmap, (num_ports / NX_PORT_CHUNK) * sizeof(bitmap_t));
2955 nx->nx_ports_bmap = NULL;
2956 sk_free_type_array(struct nx_port_info, num_ports, nx->nx_ports);
2957 nx->nx_ports = NULL;
2958}
2959
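/*
 * Invoke the handler block on every port currently claimed in the
 * bitmap (a cleared bit means the port is bound or has an adapter);
 * chunks that are entirely free are skipped.
 */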
2960void
2961nx_port_foreach(struct kern_nexus *nx,
2962 void (^port_handle)(nexus_port_t nx_port))
2963{
2964 for (nexus_port_size_t i = 0; i < (nx->nx_num_ports / NX_PORT_CHUNK); i++) {
2965 bitmap_t bmap = nx->nx_ports_bmap[i];
2966
2967 if (bmap == NX_PORT_CHUNK_FREE) {
2968 continue;
2969 }
2970
2971 for (nexus_port_size_t j = 0; j < NX_PORT_CHUNK; j++) {
2972 if (bit_test(bmap, j)) {
2973 continue;
2974 }
2975 port_handle((i * NX_PORT_CHUNK) + j);
2976 }
2977 }
2978}
2979
2980/*
2981 * sysctl interfaces
2982 */
2983static int nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS;
2984static int nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS;
2985static int nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS;
2986
2987SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_provider_list,
2988 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
2989 0, 0, nexus_provider_list_sysctl, "S,nexus_provider_info_t", "");
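/*
 * The provider list can be read from userland (root only) via sysctl.
 * A minimal sketch, assuming the node is exported as
 * "kern.skywalk.nexus_provider_list"; it returns a packed array of
 * variable-length nexus_provider_info_t records:
 *
 *	size_t len = 0;
 *	if (sysctlbyname("kern.skywalk.nexus_provider_list",
 *	    NULL, &len, NULL, 0) == 0 && len != 0) {
 *		void *buf = malloc(len);
 *		if (buf != NULL && sysctlbyname(
 *		    "kern.skywalk.nexus_provider_list",
 *		    buf, &len, NULL, 0) == 0) {
 *			-- walk NEXUS_PROVIDER_INFO_SIZE()-sized records --
 *		}
 *		free(buf);
 *	}
 */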
2990
2991SYSCTL_PROC(_kern_skywalk, OID_AUTO, nexus_channel_list,
2992 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
2993 0, 0, nexus_channel_list_sysctl, "S,nexus_channel_entry_t", "");
2994
2995SYSCTL_PROC(_kern_skywalk, OID_AUTO, llink_list,
2996 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
2997 0, NXMIB_LLINK_LIST, nexus_mib_get_sysctl, "S,nx_llink_info",
2998 "A list of logical links");
2999
3000SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow,
3001 CTLTYPE_STRUCT | CTLFLAG_RW | CTLFLAG_LOCKED | CTLFLAG_ANYBODY | CTLFLAG_KERN,
3002 0, NXMIB_FLOW, nexus_mib_get_sysctl, "S,sk_stats_flow",
3003 "Nexus inet flows with stats collected in kernel");
3004
3005SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_owner,
3006 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3007 0, NXMIB_FLOW_OWNER, nexus_mib_get_sysctl, "S,sk_stats_flow_owner",
3008 "Nexus flow owners");
3009
3010SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_route,
3011 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3012 0, NXMIB_FLOW_ROUTE, nexus_mib_get_sysctl, "S,sk_stats_flow_route",
3013 "Nexus flow routes");
3014
3015SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, net_if,
3016 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3017 0, NXMIB_NETIF_STATS, nexus_mib_get_sysctl, "S,sk_stats_net_if",
3018 "Nexus netif statistics collected in kernel");
3019
3020SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_switch,
3021 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3022 0, NXMIB_FSW_STATS, nexus_mib_get_sysctl, "S,sk_stats_flow_switch",
3023 "Nexus flowswitch statistics collected in kernel");
3024
3025SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, userstack,
3026 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3027 0, NXMIB_USERSTACK_STATS, nexus_mib_get_sysctl, "S,sk_stats_userstack",
3028 "Nexus userstack statistics counter");
3029
3030SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, flow_adv,
3031 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3032 0, NXMIB_FLOW_ADV, nexus_mib_get_sysctl, "S,sk_stats_flow_adv",
3033 "Nexus flow advisory dump");
3034
3035SYSCTL_PROC(_kern_skywalk_stats, OID_AUTO, netif_queue,
3036 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
3037 0, NXMIB_NETIF_QUEUE_STATS, nexus_mib_get_sysctl, "S,netif_qstats_info",
3038 "A list of netif queue stats entries");
3039
3040/*
3041 * Provider list sysctl
3042 */
3043static void
3044nexus_provider_info_populate(struct kern_nexus_provider *nxprov,
3045 nexus_provider_info_t info)
3046{
3047 struct kern_nexus *nx;
3048 uuid_t *uuids;
3049
3050 SK_LOCK_ASSERT_HELD();
3051
3052 /* provider UUID + params */
3053 uuid_copy(info->npi_prov_uuid, nxprov->nxprov_uuid);
3054 bcopy(nxprov->nxprov_params, &info->npi_prov_params,
3055 sizeof(struct nxprov_params));
3056 info->npi_instance_uuids_count = nxprov->nxprov_nx_count;
3057
3058 /* instance UUID list */
3059 uuids = info->npi_instance_uuids;
3060 STAILQ_FOREACH(nx, &nxprov->nxprov_nx_head, nx_prov_link) {
3061 uuid_copy(*uuids, nx->nx_uuid);
3062 uuids++;
3063 }
3064}
3065
3066static int
3067nexus_provider_list_sysctl SYSCTL_HANDLER_ARGS
3068{
3069#pragma unused(arg1, arg2, oidp)
3070 size_t actual_space;
3071 caddr_t buffer = NULL;
3072 size_t buffer_space;
3073 size_t allocated_space;
3074 int out_error;
3075 int error = 0;
3076 struct kern_nexus_provider *nxprov;
3077 caddr_t scan;
3078
3079 if (!kauth_cred_issuser(kauth_cred_get())) {
3080 return EPERM;
3081 }
3082
3083 net_update_uptime();
3084 buffer_space = req->oldlen;
3085 if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3086 if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3087 buffer_space = SK_SYSCTL_ALLOC_MAX;
3088 }
3089 allocated_space = buffer_space;
3090 buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3091 if (__improbable(buffer == NULL)) {
3092 return ENOBUFS;
3093 }
3094 } else if (req->oldptr == USER_ADDR_NULL) {
3095 buffer_space = 0;
3096 }
3097 actual_space = 0;
3098 scan = buffer;
3099 SK_LOCK();
3100 STAILQ_FOREACH(nxprov, &nxprov_head, nxprov_link) {
3101 size_t info_size;
3102
3103 info_size
3104 = NEXUS_PROVIDER_INFO_SIZE(nxprov->nxprov_nx_count);
3105 if (scan != NULL) {
3106 if (buffer_space < info_size) {
3107 /* supplied buffer too small, stop copying */
3108 error = ENOMEM;
3109 break;
3110 }
3111 nexus_provider_info_populate(nxprov, (void *)scan);
3112 scan += info_size;
3113 buffer_space -= info_size;
3114 }
3115 actual_space += info_size;
3116 }
3117 SK_UNLOCK();
3118
3119 out_error = SYSCTL_OUT(req, buffer, actual_space);
3120 if (out_error != 0) {
3121 error = out_error;
3122 }
3123
3124 if (buffer != NULL) {
3125 sk_free_data(buffer, allocated_space);
3126 }
3127
3128 return error;
3129}
3130
3131/*
3132 * Channel list sysctl
3133 */
3134static uint32_t
3135channel_ring_count(struct kern_channel *ch, enum txrx which)
3136{
3137 return ch->ch_last[which] - ch->ch_first[which];
3138}
3139
3140static void
3141populate_ring_entries(struct __kern_channel_ring *kring,
3142 ring_id_t first, ring_id_t last, nexus_channel_ring_entry_t entries)
3143{
3144 ring_id_t i;
3145 nexus_channel_ring_entry_t scan;
3146 struct __kern_channel_ring *ring;
3147
3148 scan = entries;
3149 for (i = first; i < last; i++, scan++) {
3150 ring = &kring[i];
3151
3152 DTRACE_SKYWALK1(populate__ring, struct __kern_channel_ring *,
3153 ring);
3154 if (kr_stat_enable == 0) {
3155 bzero(&scan->ncre_stats, sizeof(scan->ncre_stats));
3156 bzero(&scan->ncre_user_stats,
3157 sizeof(scan->ncre_user_stats));
3158 } else {
3159 scan->ncre_stats = ring->ckr_stats;
3160 scan->ncre_user_stats = ring->ckr_usr_stats;
3161 }
3162 scan->ncre_error_stats = ring->ckr_err_stats;
3163 scan->ncre_ring_id = i;
3164 }
3165}
3166
3167/* combine/convert ch_mode/ch_flags into nexus_channel_entry flags */
3168static uint32_t
3169nexus_channel_get_flags(uint32_t ch_mode, uint32_t ch_flags)
3170{
3171 uint32_t flags = 0;
3172
3173 flags |= (ch_mode & CHMODE_MONITOR_TX) ? SCHF_MONITOR_TX : 0;
3174 flags |= (ch_mode & CHMODE_MONITOR_RX) ? SCHF_MONITOR_RX : 0;
3175 flags |= (ch_mode & CHMODE_MONITOR_NO_COPY) ? SCHF_MONITOR_NO_COPY : 0;
3176 flags |= (ch_mode & CHMODE_USER_PACKET_POOL) ? SCHF_USER_PACKET_POOL : 0;
3177 flags |= (ch_mode & CHMODE_DEFUNCT_OK) ? SCHF_DEFUNCT_OK : 0;
3178 flags |= (ch_mode & CHMODE_FILTER) ? SCHF_FILTER : 0;
3179 flags |= (ch_mode & CHMODE_EVENT_RING) ? SCHF_EVENT_RING : 0;
3180 flags |= (ch_mode & CHMODE_EXCLUSIVE) ? SCHF_EXCLUSIVE : 0;
3181 flags |= (ch_flags & CHANF_IF_ADV) ? SCHF_IF_ADV : 0;
3182 flags |= (ch_flags & CHANF_DEFUNCT_SKIP) ? SCHF_DEFUNCT_SKIP : 0;
3183 flags |= (ch_flags & CHANF_CLOSING) ? SCHF_CLOSING : 0;
3184 flags |= (ch_flags & CHANF_DEFUNCT) ? SCHF_DEFUNCT : 0;
3185 flags |= (ch_mode & CHMODE_LOW_LATENCY) ? SCHF_LOW_LATENCY : 0;
3186
3187 return flags;
3188}
3189
3190SK_NO_INLINE_ATTRIBUTE
3191static void
3192nexus_channel_entry_populate(struct kern_channel *ch,
3193 nexus_channel_entry_t entry)
3194{
3195 uint32_t ch_mode = ch->ch_info->cinfo_ch_mode;
3196 uint32_t ch_flags = ch->ch_flags;
3197 ring_id_t rx_first = ch->ch_first[NR_RX];
3198 ring_id_t rx_last = ch->ch_last[NR_RX];
3199 ring_id_t tx_last = ch->ch_last[NR_TX];
3200 ring_id_t tx_first = ch->ch_first[NR_TX];
3201
3202 uuid_copy(entry->nce_uuid, ch->ch_info->cinfo_ch_id);
3203 entry->nce_flags = nexus_channel_get_flags(ch_mode, ch_flags);
3204 entry->nce_port = ch->ch_info->cinfo_nx_port;
3205 entry->nce_pid = ch->ch_pid;
3206 entry->nce_fd = ch->ch_fd;
3207 entry->nce_tx_rings = tx_last - tx_first;
3208 entry->nce_rx_rings = rx_last - rx_first;
3209 populate_ring_entries(ch->ch_na->na_tx_rings, tx_first, tx_last,
3210 entry->nce_ring_entries);
3211 populate_ring_entries(ch->ch_na->na_rx_rings, rx_first, rx_last,
3212 entry->nce_ring_entries + entry->nce_tx_rings);
3213}
3214
3215SK_NO_INLINE_ATTRIBUTE
3216static size_t
3217nexus_channel_info_populate(struct kern_nexus *nx,
3218 nexus_channel_info_t info, size_t buffer_size)
3219{
3220 struct kern_channel *ch = NULL;
3221 size_t info_size;
3222 caddr_t scan = NULL;
3223
3224 SK_LOCK_ASSERT_HELD();
3225
3226 info_size = sizeof(*info);
3227
3228 /* channel list */
3229 if (info != NULL) {
3230 if (buffer_size < info_size) {
3231 return info_size;
3232 }
3233
3234 /* instance UUID */
3235 uuid_copy(info->nci_instance_uuid, nx->nx_uuid);
3236 info->nci_channel_entries_count = nx->nx_ch_count;
3237 scan = (caddr_t)info->nci_channel_entries;
3238 }
3239 STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
3240 size_t entry_size;
3241 uint32_t ring_count;
3242
3243 ring_count = channel_ring_count(ch, NR_TX) +
3244 channel_ring_count(ch, NR_RX);
3245 entry_size = NEXUS_CHANNEL_ENTRY_SIZE(ring_count);
3246 info_size += entry_size;
3247 if (scan != NULL) {
3248 if (buffer_size < info_size) {
3249 return info_size;
3250 }
3251
3252 nexus_channel_entry_populate(ch, (void *)scan);
3253 scan += entry_size;
3254 }
3255 }
3256 return info_size;
3257}
3258
3259static int
3260nexus_channel_list_sysctl SYSCTL_HANDLER_ARGS
3261{
3262#pragma unused(arg1, arg2, oidp)
3263 size_t actual_space;
3264 caddr_t buffer = NULL;
3265 size_t buffer_space;
3266 size_t allocated_space;
3267 int out_error;
3268 struct kern_nexus *nx;
3269 int error = 0;
3270 caddr_t scan;
3271
3272 if (!kauth_cred_issuser(kauth_cred_get())) {
3273 return EPERM;
3274 }
3275
3276 net_update_uptime();
3277 buffer_space = req->oldlen;
3278 if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3279 if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3280 buffer_space = SK_SYSCTL_ALLOC_MAX;
3281 }
3282 allocated_space = buffer_space;
3283 buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3284 if (__improbable(buffer == NULL)) {
3285 return ENOBUFS;
3286 }
3287 } else if (req->oldptr == USER_ADDR_NULL) {
3288 buffer_space = 0;
3289 }
3290 actual_space = 0;
3291 scan = buffer;
3292 SK_LOCK();
3293 RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3294 size_t info_size;
3295
3296 info_size = nexus_channel_info_populate(nx, (void *)scan,
3297 buffer_space);
3298 if (scan != NULL) {
3299 if (buffer_space < info_size) {
3300 /* supplied buffer too small, stop copying */
3301 error = ENOMEM;
3302 break;
3303 }
3304 scan += info_size;
3305 buffer_space -= info_size;
3306 }
3307 actual_space += info_size;
3308 }
3309 SK_UNLOCK();
3310
3311 if (actual_space != 0) {
3312 out_error = SYSCTL_OUT(req, buffer, actual_space);
3313 if (out_error != 0) {
3314 error = out_error;
3315 }
3316 }
3317 if (buffer != NULL) {
3318 sk_free_data(buffer, allocated_space);
3319 }
3320
3321 return error;
3322}
3323
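/*
 * Shared handler for the skywalk MIB sysctl nodes above; oid_arg2
 * selects the NXMIB type.  A struct nexus_mib_filter may optionally be
 * passed in via newp.  Userstack stats are root-only, and non-root
 * flow stats requests must supply a tuple filter.  Each nexus domain
 * provider contributes its records via nxdom_prov_nx_mib_get().
 */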
3324static int
3325nexus_mib_get_sysctl SYSCTL_HANDLER_ARGS
3326{
3327#pragma unused(arg1, arg2)
3328 struct proc *p = req->p;
3329 struct nexus_mib_filter filter;
3330 int error = 0;
3331 size_t actual_space;
3332 caddr_t buffer = NULL;
3333 size_t buffer_space;
3334 size_t allocated_space;
3335 int out_error;
3336 struct kern_nexus *nx;
3337 caddr_t scan;
3338
3339 /* Restrict protocol stats access to root user only (like netstat). */
3340 if (oidp->oid_arg2 == NXMIB_USERSTACK_STATS &&
3341 !kauth_cred_issuser(kauth_cred_get())) {
3342 SK_ERR("mib request rejected, EPERM");
3343 return EPERM;
3344 }
3345
3346 if (req->newptr == USER_ADDR_NULL) {
3347 /*
3348 * For flow stats requests, non-root users need to provide a
3349 * 5-tuple. Otherwise, we do not grant access.
3350 */
3351 if (oidp->oid_arg2 == NXMIB_FLOW &&
3352 !kauth_cred_issuser(kauth_cred_get())) {
3353 SK_ERR("mib request rejected: tuple not provided");
3354 return EPERM;
3355 }
3356 /* use subcommand for multiple nodes */
3357 filter.nmf_type = oidp->oid_arg2;
3358 filter.nmf_bitmap = 0x0;
3359 } else if (req->newlen != sizeof(struct nexus_mib_filter)) {
3360 SK_ERR("mis-matching newlen");
3361 return EINVAL;
3362 } else {
3363 error = SYSCTL_IN(req, &filter, sizeof(struct nexus_mib_filter));
3364 if (error != 0) {
3365 SK_ERR("SYSCTL_IN err %d", error);
3366 return error;
3367 }
3368 if (filter.nmf_type != oidp->oid_arg2) {
3369 SK_ERR("mis-matching nmf_type");
3370 return EINVAL;
3371 }
3372 /*
3373 * For flow stats requests, non-root users need to set the nexus
3374 * mib filter to NXMIB_FILTER_INFO_TUPLE. Otherwise, we do not
3375 * grant access. This ensures that fsw_mib_get_flow looks for a
3376 * flow entry that matches the given tuple of the non-root user.
3377 */
3378 if (filter.nmf_type == NXMIB_FLOW &&
3379 (filter.nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) == 0 &&
3380 !kauth_cred_issuser(kauth_cred_get())) {
3381 SK_ERR("mib request rejected: tuple filter not set");
3382 return EPERM;
3383 }
3384 }
3385
3386 net_update_uptime();
3387 buffer_space = req->oldlen;
3388 if (req->oldptr != USER_ADDR_NULL && buffer_space != 0) {
3389 if (buffer_space > SK_SYSCTL_ALLOC_MAX) {
3390 buffer_space = SK_SYSCTL_ALLOC_MAX;
3391 }
3392 allocated_space = buffer_space;
3393 buffer = sk_alloc_data(allocated_space, Z_WAITOK, skmem_tag_sysctl_buf);
3394 if (__improbable(buffer == NULL)) {
3395 return ENOBUFS;
3396 }
3397 } else if (req->oldptr == USER_ADDR_NULL) {
3398 buffer_space = 0;
3399 }
3400 actual_space = 0;
3401 scan = buffer;
3402
3403 SK_LOCK();
3404 RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3405 if (NX_DOM_PROV(nx)->nxdom_prov_nx_mib_get == NULL) {
3406 continue;
3407 }
3408
3409 size_t size;
3410 struct kern_nexus_domain_provider *nx_dp = NX_DOM_PROV(nx);
3411
3412 size = nx_dp->nxdom_prov_nx_mib_get(nx, &filter, scan,
3413 buffer_space, p);
3414
3415 if (scan != NULL) {
3416 if (buffer_space < size) {
3417 /* supplied buffer too small, stop copying */
3418 error = ENOMEM;
3419 break;
3420 }
3421 scan += size;
3422 buffer_space -= size;
3423 }
3424 actual_space += size;
3425 }
3426 SK_UNLOCK();
3427
3428 if (actual_space != 0) {
3429 out_error = SYSCTL_OUT(req, buffer, actual_space);
3430 if (out_error != 0) {
3431 error = out_error;
3432 }
3433 }
3434 if (buffer != NULL) {
3435 sk_free_data(buffer, allocated_space);
3436 }
3437
3438 return error;
3439}
3440
3441void
3442kern_nexus_walktree(kern_nexus_walktree_f_t *f, void *arg0,
3443 boolean_t is_sk_locked)
3444{
3445 struct kern_nexus *nx = NULL;
3446
3447 if (!is_sk_locked) {
3448 SK_LOCK();
3449 } else {
3450 SK_LOCK_ASSERT_HELD();
3451 }
3452
3453 RB_FOREACH(nx, kern_nexus_tree, &nx_head) {
3454 (*f)(nx, arg0);
3455 }
3456
3457 if (!is_sk_locked) {
3458 SK_UNLOCK();
3459 }
3460}
3461
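/*
 * Return packet buffer pool memory info for a netif nexus identified
 * by UUID.  Fails with ENOENT if the nexus (or both pools) cannot be
 * found, and ENOTSUP if the nexus is not of type NEXUS_TYPE_NET_IF.
 */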
3462errno_t
3463kern_nexus_get_pbufpool_info(const uuid_t nx_uuid,
3464 struct kern_pbufpool_memory_info *rx_pool_info,
3465 struct kern_pbufpool_memory_info *tx_pool_info)
3466{
3467 struct kern_pbufpool *tpp, *rpp;
3468 struct kern_nexus *nx;
3469 errno_t err = 0;
3470
3471 nx = nx_find(nx_uuid, FALSE);
3472 if (nx == NULL) {
3473 err = ENOENT;
3474 goto done;
3475 }
3476
3477 if (nx->nx_prov->nxprov_params->nxp_type != NEXUS_TYPE_NET_IF) {
3478 err = ENOTSUP;
3479 goto done;
3480 }
3481
3482 err = nx_netif_prov_nx_mem_info(nx, &tpp, &rpp);
3483 if (err != 0) {
3484 goto done;
3485 }
3486
3487 if ((tpp == NULL) && (rpp == NULL)) {
3488 err = ENOENT;
3489 goto done;
3490 }
3491
3492 if (tx_pool_info != NULL) {
3493 bzero(tx_pool_info, sizeof(*tx_pool_info));
3494 }
3495 if (rx_pool_info != NULL) {
3496 bzero(rx_pool_info, sizeof(*rx_pool_info));
3497 }
3498
3499 if ((tx_pool_info != NULL) && (tpp != NULL)) {
3500 err = kern_pbufpool_get_memory_info(tpp, tx_pool_info);
3501 if (err != 0) {
3502 goto done;
3503 }
3504 }
3505
3506 if ((rx_pool_info != NULL) && (rpp != NULL)) {
3507 err = kern_pbufpool_get_memory_info(rpp, rx_pool_info);
3508 }
3509
3510done:
3511 if (nx != NULL) {
3512 (void) nx_release(nx);
3513 nx = NULL;
3514 }
3515 return err;
3516}
3517
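/*
 * Post a CHAN_FILT_HINT_IF_ADV_UPD event to every channel that has
 * registered for interface advisories on this nexus.  If the advisory
 * lock cannot be taken without blocking, the update is dropped and
 * accounted for in the netif or flowswitch stats.
 */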
3518void
3519nx_interface_advisory_notify(struct kern_nexus *nx)
3520{
3521 struct kern_channel *ch;
3522 struct netif_stats *nifs;
3523 struct fsw_stats *fsw_stats;
3524 nexus_type_t nxdom_type = NX_DOM(nx)->nxdom_type;
3525
3526 if (nxdom_type == NEXUS_TYPE_NET_IF) {
3527 nifs = &NX_NETIF_PRIVATE(nx)->nif_stats;
3528 } else if (nxdom_type == NEXUS_TYPE_FLOW_SWITCH) {
3529 fsw_stats = &NX_FSW_PRIVATE(nx)->fsw_stats;
3530 } else {
3531 VERIFY(0);
3532 __builtin_unreachable();
3533 }
3534 if (!lck_rw_try_lock_shared(&nx->nx_ch_if_adv_lock)) {
3535 if (nxdom_type == NEXUS_TYPE_NET_IF) {
3536 STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_DROP);
3537 } else {
3538 STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_DROP);
3539 }
3540 return;
3541 }
3542 /*
3543 * if the channel is in "nx_ch_if_adv_head" list, then we can
3544 * safely assume that the channel is not closed yet.
3545 * In ch_close_common(), the channel is removed from the
3546 * "nx_ch_if_adv_head" list holding the "nx_ch_if_adv_lock" in
3547 * exclusive mode, prior to closing the channel.
3548 */
3549 STAILQ_FOREACH(ch, &nx->nx_ch_if_adv_head, ch_link_if_adv) {
3550 struct nexus_adapter *na = ch->ch_na;
3551
3552 ASSERT(na != NULL);
3553 na_post_event(&na->na_tx_rings[ch->ch_first[NR_TX]],
3554 TRUE, FALSE, FALSE, CHAN_FILT_HINT_IF_ADV_UPD);
3555 if (nxdom_type == NEXUS_TYPE_NET_IF) {
3556 STATS_INC(nifs, NETIF_STATS_IF_ADV_UPD_SENT);
3557 } else {
3558 STATS_INC(fsw_stats, FSW_STATS_IF_ADV_UPD_SENT);
3559 }
3560 }
3561 lck_rw_done(&nx->nx_ch_if_adv_lock);
3562}
3563