/*
 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */


/*
 * This module implements the flow switch for Skywalk
 *
 * --- FLOW SWITCH ---
 *
 * For each switch, a lock protects deletion of ports. When configuring
 * or deleting a port, the lock is acquired in exclusive mode (after
 * holding SK_LOCK). When forwarding, the lock is acquired in shared
 * mode (without SK_LOCK). The lock is held throughout the entire
 * forwarding cycle, during which the thread may incur a page fault.
 * Hence it is important that sleepable shared locks are used.
 *
 * On the rx ring, the per-port lock is grabbed initially to reserve
 * a number of slots in the ring, then the lock is released, packets are
 * copied from source to destination, and then the lock is acquired again
 * and the receive ring is updated. (A similar scheme is used on the tx
 * ring for NIC and host stack ports attached to the switch.)
 *
 * When a netif is attached to a flowswitch, two kernel channels are opened:
 * the device and host channels. The device channel provides the device
 * datapath. The host channel is not used in the datapath; it is there
 * only to provide some callbacks for activating the hostna (e.g.
 * intercepting host packets).
 */
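
/*
 * A minimal sketch (deliberately not compiled) of the rx-ring cycle
 * described above. rx_ring_lock()/rx_ring_unlock(), reserve_rx_slots(),
 * copy_packets() and commit_rx_ring() are hypothetical stand-ins for the
 * actual per-port lock and datapath routines; only the locking shape is
 * the point here.
 */
#if 0
static void
fsw_rx_cycle_sketch(struct __kern_channel_ring *ring)
{
	uint32_t nslots;

	rx_ring_lock(ring);
	nslots = reserve_rx_slots(ring); /* reserve slots under the per-port lock */
	rx_ring_unlock(ring);

	/* copy packets from source to destination without the lock held */
	copy_packets(ring, nslots);

	rx_ring_lock(ring);
	commit_rx_ring(ring, nslots);   /* re-acquire and publish the update */
	rx_ring_unlock(ring);
}
#endif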

#include <net/bpf.h>
#include <netinet/tcp_seq.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/upipe/nx_user_pipe.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/nexus_var.h>
#include <sys/protosw.h>
#include <sys/domain.h>

SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, flowswitch,
    CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk FlowSwitch");

static void nx_fsw_dom_init(struct nxdom *);
static void nx_fsw_dom_terminate(struct nxdom *);
static void nx_fsw_dom_fini(struct nxdom *);
static int nx_fsw_dom_find_port(struct kern_nexus *, boolean_t, nexus_port_t *);
static int nx_fsw_dom_bind_port(struct kern_nexus *, nexus_port_t *,
    struct nxbind *, void *);
static int nx_fsw_dom_unbind_port(struct kern_nexus *, nexus_port_t);
static int nx_fsw_dom_connect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct chreq *,
    struct kern_channel *, struct nxbind *, struct proc *);
static void nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *);
static void nx_fsw_dom_defunct(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, struct proc *);
static void nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct kern_channel *, boolean_t);

static int nx_fsw_prov_init(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *,
    const struct nxprov_params *, struct nxprov_adjusted_params *);
static int nx_fsw_prov_params(struct kern_nexus_domain_provider *,
    const uint32_t, const struct nxprov_params *, struct nxprov_params *,
    struct skmem_region_params[SKMEM_REGIONS], uint32_t);
static int nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nexus_adapter *);
static int nx_fsw_prov_config(struct kern_nexus_domain_provider *,
    struct kern_nexus *, struct nx_cfg_req *, int, struct proc *,
    kauth_cred_t);
static void nx_fsw_prov_fini(struct kern_nexus_domain_provider *);
static int nx_fsw_prov_nx_ctor(struct kern_nexus *);
static void nx_fsw_prov_nx_dtor(struct kern_nexus *);
static size_t nx_fsw_prov_mib_get(struct kern_nexus *nx,
    struct nexus_mib_filter *, void *, size_t, struct proc *);

struct nxdom nx_flowswitch_dom_s = {
	.nxdom_prov_head =
	    STAILQ_HEAD_INITIALIZER(nx_flowswitch_dom_s.nxdom_prov_head),
	.nxdom_type = NEXUS_TYPE_FLOW_SWITCH,
	.nxdom_md_type = NEXUS_META_TYPE_PACKET,
	.nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
	.nxdom_name = "flowswitch",
	.nxdom_ports = {
		.nb_def = NX_FSW_VP_MAX,
		.nb_min = NX_FSW_VP_MIN,
		.nb_max = NX_FSW_VP_MAX,
	},
	.nxdom_tx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_FSW_MAXRINGS,
	},
	.nxdom_rx_rings = {
		.nb_def = 1,
		.nb_min = 1,
		.nb_max = NX_FSW_MAXRINGS,
	},
	.nxdom_tx_slots = {
		.nb_def = NX_FSW_TXRINGSIZE,
		.nb_min = NX_FSW_MINSLOTS,
		.nb_max = NX_FSW_MAXSLOTS,
	},
	.nxdom_rx_slots = {
		.nb_def = NX_FSW_RXRINGSIZE,
		.nb_min = NX_FSW_MINSLOTS,
		.nb_max = NX_FSW_MAXSLOTS,
	},
	.nxdom_buf_size = {
		.nb_def = NX_FSW_BUFSIZE,
		.nb_min = NX_FSW_MINBUFSIZE,
		.nb_max = NX_FSW_MAXBUFSIZE,
	},
	.nxdom_large_buf_size = {
		.nb_def = NX_FSW_DEF_LARGE_BUFSIZE,
		.nb_min = NX_FSW_MIN_LARGE_BUFSIZE,
		.nb_max = NX_FSW_MAX_LARGE_BUFSIZE,
	},
	.nxdom_meta_size = {
		.nb_def = NX_FSW_UMD_SIZE,
		.nb_min = NX_FSW_UMD_SIZE,
		.nb_max = NX_METADATA_USR_MAX_SZ,
	},
	.nxdom_stats_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_STATS_MAX_SZ,
	},
	.nxdom_pipes = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_UPIPE_MAXPIPES,
	},
	.nxdom_flowadv_max = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_FLOWADV_MAX,
	},
	.nxdom_nexusadv_size = {
		.nb_def = 0,
		.nb_min = 0,
		.nb_max = NX_NEXUSADV_MAX_SZ,
	},
	.nxdom_capabilities = {
		.nb_def = NXPCAP_USER_CHANNEL,
		.nb_min = 0,
		.nb_max = (NXPCAP_CHECKSUM_PARTIAL | NXPCAP_USER_PACKET_POOL |
	    NXPCAP_USER_CHANNEL),
	},
	.nxdom_qmap = {
		.nb_def = NEXUS_QMAP_TYPE_INVALID,
		.nb_min = NEXUS_QMAP_TYPE_INVALID,
		.nb_max = NEXUS_QMAP_TYPE_INVALID,
	},
	.nxdom_max_frags = {
		.nb_def = NX_PBUF_FRAGS_DEFAULT,
		.nb_min = NX_PBUF_FRAGS_MIN,
		.nb_max = NX_PBUF_FRAGS_MAX,
	},
	.nxdom_init = nx_fsw_dom_init,
	.nxdom_terminate = nx_fsw_dom_terminate,
	.nxdom_fini = nx_fsw_dom_fini,
	.nxdom_connect = nx_fsw_dom_connect,
	.nxdom_find_port = nx_fsw_dom_find_port,
	.nxdom_port_is_reserved = nx_fsw_dom_port_is_reserved,
	.nxdom_bind_port = nx_fsw_dom_bind_port,
	.nxdom_unbind_port = nx_fsw_dom_unbind_port,
	.nxdom_disconnect = nx_fsw_dom_disconnect,
	.nxdom_defunct = nx_fsw_dom_defunct,
	.nxdom_defunct_finalize = nx_fsw_dom_defunct_finalize,
};

struct kern_nexus_domain_provider nx_fsw_prov_s = {
	.nxdom_prov_name = NEXUS_PROVIDER_FLOW_SWITCH,
	.nxdom_prov_flags = NXDOMPROVF_DEFAULT,
	.nxdom_prov_cb = {
		.dp_cb_init = nx_fsw_prov_init,
		.dp_cb_fini = nx_fsw_prov_fini,
		.dp_cb_params = nx_fsw_prov_params,
		.dp_cb_mem_new = nx_fsw_prov_mem_new,
		.dp_cb_config = nx_fsw_prov_config,
		.dp_cb_nx_ctor = nx_fsw_prov_nx_ctor,
		.dp_cb_nx_dtor = nx_fsw_prov_nx_dtor,
		.dp_cb_nx_mem_info = NULL,      /* not supported */
		.dp_cb_nx_mib_get = nx_fsw_prov_mib_get,
		.dp_cb_nx_stop = NULL,
	},
};


static void
nx_fsw_dom_init(struct nxdom *nxdom)
{
	SK_LOCK_ASSERT_HELD();
	ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));

	/* Generic initialization */
	fsw_init();
	fsw_dp_init();

	(void) nxdom_prov_add(nxdom, &nx_fsw_prov_s);
}

static void
nx_fsw_dom_terminate(struct nxdom *nxdom)
{
	struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;

	SK_LOCK_ASSERT_HELD();

	STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
	    nxdom_prov_link, tnxdp) {
		(void) nxdom_prov_del(nxdom_prov);
	}

	fsw_dp_uninit();

	/* Generic uninitialization */
	fsw_uninit();
}

static void
nx_fsw_dom_fini(struct nxdom *nxdom)
{
#pragma unused(nxdom)
}

static int
nx_fsw_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
	return 0;
}

static int
nx_fsw_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
    const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
{
#pragma unused(nxdom_prov, nxp)
	_CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_RXRINGSIZE);
	_CASSERT(NX_FSW_AFRINGSIZE <= NX_FSW_TXRINGSIZE);

	*(adj->adj_md_subtype) = NEXUS_META_SUBTYPE_PAYLOAD;
	*(adj->adj_stats_size) = sizeof(struct __nx_stats_fsw);
	VERIFY(sk_max_flows > 0 && sk_max_flows <= NX_FLOWADV_MAX);
	*(adj->adj_flowadv_max) = sk_max_flows;
	*(adj->adj_nexusadv_size) = sizeof(struct sk_nexusadv);
	*(adj->adj_caps) |= NXPCAP_USER_PACKET_POOL;
	if (sk_cksum_tx != 0) {
		*(adj->adj_caps) |= NXPCAP_CHECKSUM_PARTIAL;
	}
	*(adj->adj_alloc_rings) = *(adj->adj_free_rings) =
	    ((nxp->nxp_max_frags > 1) && (sk_channel_buflet_alloc != 0)) ?
	    2 : 1;
	*(adj->adj_alloc_slots) = *(adj->adj_free_slots) =
	    NX_FSW_AFRINGSIZE;

	if (!SKMEM_MEM_CONSTRAINED_DEVICE() &&
	    (*(adj->adj_buf_region_segment_size) < NX_FSW_BUF_SEG_SIZE)) {
		*(adj->adj_buf_region_segment_size) = NX_FSW_BUF_SEG_SIZE;
	}

	if (*(adj->adj_max_frags) > 1) {
		uint32_t fsw_maxbufs = SKMEM_MEM_CONSTRAINED_DEVICE() ?
		    NX_FSW_MAXBUFFERS_MEM_CONSTRAINED : NX_FSW_MAXBUFFERS;
		uint32_t magazine_max_objs;

		*(adj->adj_max_buffers) = (sk_fsw_max_bufs != 0) ?
		    sk_fsw_max_bufs : fsw_maxbufs;

		/*
		 * Given that packet objects are the ones cached, use the
		 * metadata size to determine the extra amount of objects
		 * at the magazine layer.
		 */
		magazine_max_objs = skmem_cache_magazine_max(
			NX_METADATA_PACKET_SZ(*(adj->adj_max_frags)) +
			METADATA_PREAMBLE_SZ);

		/*
		 * Adjust the max buffers to account for the increase
		 * associated with per-CPU caching.
		 */
		if (skmem_allow_magazines() &&
		    magazine_max_objs < *(adj->adj_max_buffers)) {
			*(adj->adj_max_buffers) -= magazine_max_objs;
		}
	}
	if (SKMEM_MEM_CONSTRAINED_DEVICE() || (fsw_use_dual_sized_pool == 0) ||
	    (*(adj->adj_max_frags) <= 1)) {
		*(adj->adj_large_buf_size) = 0;
	}
	return 0;
}
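
/*
 * Worked example for the magazine adjustment above (hypothetical
 * numbers): if the resolved buffer cap is 8192 and
 * skmem_cache_magazine_max() reports that 512 packet objects may sit in
 * per-CPU magazines for this metadata size, adj_max_buffers becomes
 * 8192 - 512 = 7680, so per-CPU caching cannot inflate the pool beyond
 * the intended cap.
 */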

static int
nx_fsw_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
    const uint32_t req, const struct nxprov_params *nxp0,
    struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
    uint32_t pp_region_config_flags)
{
	struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;

	/* USD regions need to be writable to support user packet pool */
	srp[SKMEM_REGION_TXAUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;
	srp[SKMEM_REGION_RXFUSD].srp_cflags &= ~SKMEM_REGION_CR_UREADONLY;

	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
	    nxdom, nxdom, nxdom, pp_region_config_flags,
	    nx_fsw_prov_params_adjust);
}

static void
fsw_vp_region_params_setup(struct nexus_adapter *na,
    struct skmem_region_params *srp0, struct skmem_region_params *srp)
{
	int i;
	uint32_t totalrings, nslots, afslots, evslots, lbaslots;

	/* copy default flowswitch parameters initialized in nxprov_params_adjust() */
	for (i = 0; i < SKMEM_REGIONS; i++) {
		srp[i] = srp0[i];
	}
	/* customize parameters that could vary across NAs */
	totalrings = na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_RX) +
	    na_get_nrings(na, NR_A) + na_get_nrings(na, NR_F) +
	    na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);

	srp[SKMEM_REGION_SCHEMA].srp_r_obj_size =
	    (uint32_t)CHANNEL_SCHEMA_SIZE(totalrings);
	srp[SKMEM_REGION_SCHEMA].srp_r_obj_cnt = totalrings;
	skmem_region_params_config(&srp[SKMEM_REGION_SCHEMA]);

	srp[SKMEM_REGION_RING].srp_r_obj_size =
	    sizeof(struct __user_channel_ring);
	srp[SKMEM_REGION_RING].srp_r_obj_cnt = totalrings;
	skmem_region_params_config(&srp[SKMEM_REGION_RING]);

	nslots = na_get_nslots(na, NR_TX);
	afslots = na_get_nslots(na, NR_A);
	evslots = na_get_nslots(na, NR_EV);
	lbaslots = na_get_nslots(na, NR_LBA);
	srp[SKMEM_REGION_TXAKSD].srp_r_obj_size =
	    MAX(MAX(MAX(nslots, afslots), evslots), lbaslots) * SLOT_DESC_SZ;
	srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt =
	    na_get_nrings(na, NR_TX) + na_get_nrings(na, NR_A) +
	    na_get_nrings(na, NR_EV) + na_get_nrings(na, NR_LBA);
	skmem_region_params_config(&srp[SKMEM_REGION_TXAKSD]);

	/* USD and KSD objects share the same size and count */
	srp[SKMEM_REGION_TXAUSD].srp_r_obj_size =
	    srp[SKMEM_REGION_TXAKSD].srp_r_obj_size;
	srp[SKMEM_REGION_TXAUSD].srp_r_obj_cnt =
	    srp[SKMEM_REGION_TXAKSD].srp_r_obj_cnt;
	skmem_region_params_config(&srp[SKMEM_REGION_TXAUSD]);
}

static int
nx_fsw_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nexus_adapter *na)
{
#pragma unused(nxdom_prov)
	int err = 0;
	struct skmem_region_params *srp0 = NX_PROV(nx)->nxprov_region_params;
	struct skmem_region_params srp[SKMEM_REGIONS];

	SK_DF(SK_VERB_FSW,
	    "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
	    NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
	    SK_KVA(na));

	ASSERT(na->na_type == NA_FLOWSWITCH_VP);
	ASSERT(na->na_arena == NULL);
	ASSERT((na->na_flags & NAF_USER_PKT_POOL) != 0);

	fsw_vp_region_params_setup(na, srp0, srp);
	/*
	 * Ports in the flow switch are isolated from one another;
	 * use NULL for the packet buffer pool references to indicate
	 * this, since otherwise we'd be sharing the same pp for the
	 * entire switch (maybe for a future, special use case?)
	 *
	 * This means that clients calling kern_nexus_get_pbufpool()
	 * will get NULL, but this is fine based on the current design
	 * of providing port isolation, and also since we don't expose
	 * the flow switch to external kernel clients.
	 */
	na->na_arena = skmem_arena_create_for_nexus(na, srp, NULL, NULL, FALSE,
	    !NX_USER_CHANNEL_PROV(nx), &nx->nx_adv, &err);
	ASSERT(na->na_arena != NULL || err != 0);
	return err;
}
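
/*
 * Because the arena above is created with NULL packet-buffer-pool
 * references, a kernel client probing the pool must tolerate NULL.
 * A minimal sketch (not compiled), assuming kern_nexus_get_pbufpool()
 * has the (nexus, &tx_pool, &rx_pool) out-parameter form:
 */
#if 0
static boolean_t
fsw_nx_has_shared_pp_sketch(struct kern_nexus *nx)
{
	struct kern_pbufpool *tpp = NULL, *rpp = NULL;

	kern_nexus_get_pbufpool(nx, &tpp, &rpp);
	/* flowswitch ports are isolated, so both pools come back NULL */
	return (tpp != NULL) || (rpp != NULL);
}
#endif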

static int
nx_fsw_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
    struct proc *p, kauth_cred_t cred)
{
#pragma unused(nxdom_prov)
	struct sockopt sopt;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	if (ncr->nc_req == USER_ADDR_NULL) {
		err = EINVAL;
		goto done;
	}

	/* to make life easier for handling copies */
	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = sopt_dir;
	sopt.sopt_val = ncr->nc_req;
	sopt.sopt_valsize = ncr->nc_req_len;
	sopt.sopt_p = p;

	/* avoid _MALLOCing at the cost of this ugly switch block */
	switch (ncr->nc_cmd) {
	case NXCFG_CMD_ATTACH:
	case NXCFG_CMD_DETACH: {
		/* proceed only if the client possesses flow switch entitlement */
		if (cred == NULL || (err = skywalk_priv_check_cred(p, cred,
		    PRIV_SKYWALK_REGISTER_FLOW_SWITCH)) != 0) {
			SK_ERR("missing nxctl credential");
			err = EPERM;
			goto done;
		}

		struct nx_spec_req nsr;
		bzero(&nsr, sizeof(nsr));
		err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
		if (err != 0) {
			goto done;
		}

		/*
		 * Null-terminate in case this has an interface name;
		 * the union is already large enough for uuid_t.
		 */
		nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
		if (p != kernproc) {
			nsr.nsr_flags &= NXSPECREQ_MASK;
		}

		err = fsw_ctl(nx, ncr->nc_cmd, p, &nsr);
		if (err != 0) {
			goto done;
		}

		err = sooptcopyout(&sopt, &nsr, sizeof(nsr));
		break;
	}

	case NXCFG_CMD_FLOW_ADD:
	case NXCFG_CMD_FLOW_DEL: {
		/* need to have owner nxctl or kernnxctl */
		if (cred == NULL) {
			SK_ERR("missing nxctl credential");
			err = EPERM;
			goto done;
		}
	} /* fall through */
	case NXCFG_CMD_FLOW_CONFIG: {
		/* checks flow PID ownership instead of nxctl credential */
		struct nx_flow_req nfr;
		bzero(&nfr, sizeof(nfr));
		err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
		if (err != 0) {
			goto done;
		}

		err = fsw_ctl(nx, ncr->nc_cmd, p, &nfr);
		if (err != 0) {
			goto done;
		}

		err = sooptcopyout(&sopt, &nfr, sizeof(nfr));
		break;
	}

	case NXCFG_CMD_NETEM: {
		struct if_netem_params inp;

		bzero(&inp, sizeof(inp));
		err = sooptcopyin(&sopt, &inp, sizeof(inp), sizeof(inp));
		if (err != 0) {
			goto done;
		}
		err = fsw_ctl(nx, ncr->nc_cmd, p, &inp);
		if (err != 0) {
			goto done;
		}
		break;
	}

	default:
		err = EINVAL;
		goto done;
	}

done:
	SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nexus 0x%llx (%s) cmd %d (err %d)", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
	return err;
}
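
/*
 * Every command above follows the same copy-in/act/copy-out convention
 * through the emulated sockopt. A minimal sketch (not compiled) of that
 * convention for a hypothetical fixed-size request type struct my_req:
 */
#if 0
static int
fsw_cfg_copy_sketch(struct nx_cfg_req *ncr, int dir, struct proc *p)
{
	struct sockopt sopt;
	struct my_req req;      /* hypothetical request structure */
	int err;

	bzero(&sopt, sizeof(sopt));
	sopt.sopt_dir = dir;
	sopt.sopt_val = ncr->nc_req;            /* user address of the request */
	sopt.sopt_valsize = ncr->nc_req_len;
	sopt.sopt_p = p;

	bzero(&req, sizeof(req));
	err = sooptcopyin(&sopt, &req, sizeof(req), sizeof(req));
	if (err == 0) {
		/* ... act on req, then reflect the result back out ... */
		err = sooptcopyout(&sopt, &req, sizeof(req));
	}
	return err;
}
#endif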

static void
nx_fsw_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
{
#pragma unused(nxdom_prov)
	SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
}

static int
nx_fsw_prov_nx_ctor(struct kern_nexus *nx)
{
	struct nx_flowswitch *fsw;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nx->nx_arg == NULL);

	SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);

	fsw = fsw_alloc(Z_WAITOK);
	nx->nx_arg = fsw;
	fsw->fsw_nx = nx;
	fsw->fsw_tx_rings = NX_PROV(nx)->nxprov_params->nxp_tx_rings;
	fsw->fsw_rx_rings = NX_PROV(nx)->nxprov_params->nxp_rx_rings;

	FSW_WLOCK(fsw);

	fsw_dp_ctor(fsw);

	FSW_WUNLOCK(fsw);

	SK_D("create new fsw 0x%llx for nexus 0x%llx",
	    SK_KVA(NX_FSW_PRIVATE(nx)), SK_KVA(nx));

	return 0;
}

static void
nx_fsw_prov_nx_dtor(struct kern_nexus *nx)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err;

	SK_LOCK_ASSERT_HELD();

	SK_D("nexus 0x%llx (%s) fsw 0x%llx", SK_KVA(nx),
	    NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(fsw));

	err = fsw_ctl_detach(nx, current_proc(), NULL);
	ASSERT(err == 0);       /* this cannot fail */
	ASSERT(fsw->fsw_dev_ch == NULL);
	ASSERT(fsw->fsw_host_ch == NULL);

	SK_DF(SK_VERB_FSW, "marking fsw 0x%llx as free", SK_KVA(fsw));
	fsw_free(fsw);
	nx->nx_arg = NULL;
}

static size_t
nx_fsw_prov_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
    void *out, size_t len, struct proc *p)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);

	/* this check doesn't require holding fsw_lock */
	if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
	    (uuid_compare(filter->nmf_nx_uuid, fsw->fsw_nx->nx_uuid)) != 0) {
		return 0;
	}

	/* intercept NXMIB_FSW_STATS here since it's for the flowswitch */
	FSW_RLOCK(fsw);
	len = fsw_mib_get(fsw, filter, out, len, p);
	FSW_UNLOCK(fsw);

	return len;
}

boolean_t
nx_fsw_dom_port_is_reserved(struct kern_nexus *nx, nexus_port_t nx_port)
{
#pragma unused(nx)
	return nx_port < NEXUS_PORT_FLOW_SWITCH_CLIENT;
}

static int
nx_fsw_dom_find_port(struct kern_nexus *nx, boolean_t rsvd,
    nexus_port_t *nx_port)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	nexus_port_t first, last, port;
	int error;

	ASSERT(nx_port != NULL);

	port = *nx_port;
	ASSERT(port == NEXUS_PORT_ANY);

	if (rsvd) {
		first = 0;
		last = NEXUS_PORT_FLOW_SWITCH_CLIENT;
	} else {
		first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
		ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
		last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	}
	ASSERT(first <= last);

	FSW_WLOCK(fsw);
	if (__improbable(first == last)) {
		error = ENOSPC;
	} else {
		error = nx_port_find(nx, first, last - 1, &port);
		ASSERT(error != 0 || (port >= first && port < last));
	}
	FSW_WUNLOCK(fsw);

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nx 0x%llx \"%s\" %snx_port %d [%u,%u] (err %d)", SK_KVA(nx),
	    nx->nx_prov->nxprov_params->nxp_name, (rsvd ? "[reserved] " : ""),
	    (int)port, first, (last - 1), error);

	if (error == 0) {
		*nx_port = port;
	}

	return error;
}

static int
nx_fsw_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
    struct nxbind *nxb, void *info)
{
#pragma unused(info)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	nexus_port_t first, last, port;
	int error;

	ASSERT(nx_port != NULL);
	ASSERT(nxb != NULL);

	port = *nx_port;

	/* can't bind reserved ports to client credentials */
	if (nx_fsw_dom_port_is_reserved(nx, port)) {
		return EDOM;
	}

	/*
	 * Allow clients to bind to regular ports (non-reserved);
	 * reserved ports aren't subject to bind/unbind, since
	 * they are used for internal purposes.
	 */
	first = NEXUS_PORT_FLOW_SWITCH_CLIENT;
	ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
	last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
	ASSERT(first <= last);

	FSW_WLOCK(fsw);
	if (__improbable(first == last)) {
		error = ENOSPC;
	} else if (port != NEXUS_PORT_ANY) {
		error = nx_port_bind(nx, port, nxb);
	} else {
		error = nx_port_find(nx, first, last - 1, &port);
		ASSERT(error != 0 || (port >= first && port < last));
		if (error == 0) {
			error = nx_port_bind(nx, port, nxb);
		}
	}
	FSW_WUNLOCK(fsw);

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nx 0x%llx \"%s\" nx_port %d [%u,%u] (err %d)", SK_KVA(nx),
	    nx->nx_prov->nxprov_params->nxp_name, (int)port,
	    first, (last - 1), error);

	ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
	if (error == 0) {
		*nx_port = port;
	}

	return error;
}

static int
nx_fsw_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
{
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int error;

	FSW_WLOCK(fsw);
	error = nx_port_unbind(nx, nx_port);
	FSW_WUNLOCK(fsw);

	SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
	    "nx 0x%llx \"%s\" nx_port %d (err %d)", SK_KVA(nx),
	    nx->nx_prov->nxprov_params->nxp_name, (int)nx_port, error);

	return error;
}

static int
nx_fsw_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
    struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
{
#pragma unused(nxdom_prov)
	nexus_port_t port = chr->cr_port;
	int err = 0;

	SK_LOCK_ASSERT_HELD();

	ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
	    nxdom_prov->nxdom_prov_dom->nxdom_type &&
	    nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
	ASSERT(!(ch->ch_flags & CHANF_HOST));
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));

	if (port != NEXUS_PORT_ANY && port >= NXDOM_MAX(NX_DOM(nx), ports)) {
		err = EDOM;
		goto done;
	}

	chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_FLOW_SWITCH;
	ASSERT(port != NEXUS_PORT_ANY);
	(void) snprintf(chr->cr_name, sizeof(chr->cr_name),
	    "%s_%llu:%u", NX_FSW_NAME, nx->nx_id, port);
	chr->cr_ring_set = RING_SET_DEFAULT;
	err = na_connect(nx, ch, chr, ch0, nxb, p);

done:
	return err;
}

static void
nx_fsw_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch)
{
#pragma unused(nxdom_prov)
	SK_LOCK_ASSERT_HELD();

	SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
	    SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);

	if (ch->ch_flags & CHANF_KERNEL) {
		na_disconnect_spec(nx, ch);
	} else {
		na_disconnect(nx, ch);
	}
}

static void
nx_fsw_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
{
#pragma unused(nxdom_prov)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);

	LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);

	/*
	 * Hold the flowswitch lock as writer; this prevents all data path
	 * accesses to the flowswitch, and allows us to mark the rings with
	 * CKRF_DEFUNCT. Unlike some other nexus types, the flowswitch
	 * doesn't utilize kr_{enter,exit} for serialization, at present.
	 */
	FSW_WLOCK(fsw);
	na_ch_rings_defunct(ch, p);
	FSW_WUNLOCK(fsw);
}

static void
nx_fsw_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
    struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
{
#pragma unused(nxdom_prov)
	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	int err = 0;

	if (!locked) {
		SK_LOCK_ASSERT_NOTHELD();
		SK_LOCK();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
	} else {
		SK_LOCK_ASSERT_HELD();
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
	}

	ASSERT(!(ch->ch_flags & CHANF_KERNEL));
	ASSERT(ch->ch_na->na_type == NA_FLOWSWITCH_VP);
	ASSERT(VPNA(ch->ch_na)->vpna_nx_port == ch->ch_info->cinfo_nx_port);

	err = fsw_port_na_defunct(fsw, VPNA(ch->ch_na));

	if (err == 0) {
		na_defunct(nx, ch, ch->ch_na, locked);
	}

	SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d) err %d",
	    ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
	    nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
	    ch->ch_info->cinfo_nx_port,
	    (int)ch->ch_info->cinfo_ch_ring_id, err);

	if (!locked) {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
		SK_UNLOCK();
	} else {
		LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
		SK_LOCK_ASSERT_HELD();
	}
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
nx_fsw_na_find_log(const struct chreq *chr, boolean_t create)
{
	uuid_string_t uuidstr;

	SK_D("name \"%s\" spec_uuid \"%s\" nx_port %d mode 0x%b pipe_id %u "
	    "ring_id %d ring_set %u ep_type %u:%u create %u%s",
	    chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
	    (int)chr->cr_port, chr->cr_mode, CHMODE_BITS, chr->cr_pipe_id,
	    (int)chr->cr_ring_id, chr->cr_ring_set, chr->cr_real_endpoint,
	    chr->cr_endpoint, create, (strncmp(chr->cr_name, NX_FSW_NAME,
	    sizeof(NX_FSW_NAME) - 1) != 0) ? " (skipped)" : "");
}
#endif /* SK_LOG */

/*
 * Try to get a reference to a nexus adapter attached to a flow switch.
 * If the adapter is found (or is created), this function returns 0, a
 * non-NULL pointer is returned in *na, and the caller holds a
 * reference to the adapter.
 * If an adapter is not found, no reference is grabbed and the
 * function returns an error code, or 0 if there is just a flow switch
 * prefix mismatch. Therefore the caller holds a reference exactly when
 * (*na != NULL && return == 0).
 */
int
nx_fsw_na_find(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p,
    struct nexus_adapter **na, boolean_t create)
{
#pragma unused(ch)
	struct nexus_vp_adapter *vpna = NULL;
	char *cr_name = chr->cr_name;
	struct nx_flowswitch *fsw;
	int error = 0;

	SK_LOCK_ASSERT_HELD();
	*na = NULL;     /* default return value */

#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		nx_fsw_na_find_log(chr, create);
	}
#endif /* SK_LOG */

	/* first, check whether this is a flow switch port */
	if (strncmp(cr_name, NX_FSW_NAME, sizeof(NX_FSW_NAME) - 1) != 0) {
		return 0;       /* no error, but no flow switch prefix */
	}
	ASSERT(nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH);
	fsw = NX_FSW_PRIVATE(nx);
	ASSERT(fsw != NULL);

	if (!create) {
		return ENXIO;
	}

	/*
	 * The flowswitch VP is only attachable from a user channel, so none
	 * of these flags should be set.
	 */
	ASSERT((chr->cr_mode & (CHMODE_KERNEL | CHMODE_CONFIG)) == 0);
	error = fsw_attach_vp(nx, ch, chr, nxb, p, &vpna);
	ASSERT(vpna == NULL || error == 0);

	if (error == 0) {
		/* use the reference held by fsw_attach_vp above */
		*na = &vpna->vpna_up;
		SK_DF(SK_VERB_FSW,
		    "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" nx_port %d",
		    (*na)->na_name, SK_KVA(*na), (*na)->na_refcount,
		    cr_name, (int)vpna->vpna_nx_port);
	}

	return error;
}
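
/*
 * A minimal sketch (not compiled) of the caller contract documented
 * above: a reference is held only when both the return value is 0 and
 * *na is non-NULL; na_release_locked() is assumed here to be the
 * matching release routine for the reference handed back.
 */
#if 0
static void
fsw_na_find_caller_sketch(struct kern_nexus *nx, struct kern_channel *ch,
    struct chreq *chr, struct nxbind *nxb, struct proc *p)
{
	struct nexus_adapter *na = NULL;

	if (nx_fsw_na_find(nx, ch, chr, nxb, p, &na, TRUE) == 0 &&
	    na != NULL) {
		/* ... use the adapter ... */
		na_release_locked(na);  /* drop the reference we were given */
	}
	/* a 0 return with na == NULL just means the name prefix didn't match */
}
#endif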

int
nx_fsw_netagent_add(struct kern_nexus *nx)
{
	return fsw_netagent_add_remove(nx, TRUE);
}

int
nx_fsw_netagent_remove(struct kern_nexus *nx)
{
	return fsw_netagent_add_remove(nx, FALSE);
}

void
nx_fsw_netagent_update(struct kern_nexus *nx)
{
	fsw_netagent_update(nx);
}
974