1/*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53#include <pexpert/pexpert.h> /* for PE_parse_boot_argn */
54#include <skywalk/os_skywalk_private.h>
55#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
56#include <skywalk/nexus/flowswitch/fsw_var.h>
57#include <skywalk/nexus/netif/nx_netif.h>
58#include <skywalk/nexus/netif/nx_netif_compat.h>
59
60#include <net/bpf.h>
61#include <net/if.h>
62#include <net/pktsched/pktsched_netem.h>
63#include <sys/eventhandler.h>
64
65#if (DEVELOPMENT || DEBUG)
66SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, chain_enqueue,
67 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_chain_enqueue, 0, "");
68#endif /* !DEVELOPMENT && !DEBUG */
69
70/*
71 * Configures the flowswitch to utilize user packet pool with
72 * dual sized buffers.
73 * A non-zero value enables the support.
74 */
75#if defined(XNU_TARGET_OS_IOS) || defined(XNU_TARGET_OS_OSX)
76uint32_t fsw_use_dual_sized_pool = 1;
77#else
78uint32_t fsw_use_dual_sized_pool = 0;
79#endif
80
81uint32_t fsw_chain_enqueue = 1;
82static int __nx_fsw_inited = 0;
83static eventhandler_tag __nx_fsw_ifnet_eventhandler_tag = NULL;
84static eventhandler_tag __nx_fsw_protoctl_eventhandler_tag = NULL;
85
86static SKMEM_TYPE_DEFINE(nx_fsw_zone, struct nx_flowswitch);
87
88static SKMEM_TYPE_DEFINE(nx_fsw_stats_zone, struct __nx_stats_fsw);
89
90#define SKMEM_TAG_FSW_PORTS "com.apple.skywalk.fsw.ports"
91SKMEM_TAG_DEFINE(skmem_tag_fsw_ports, SKMEM_TAG_FSW_PORTS);
92
93#define SKMEM_TAG_FSW_FOB_HASH "com.apple.skywalk.fsw.fsw.fob.hash"
94SKMEM_TAG_DEFINE(skmem_tag_fsw_fob_hash, SKMEM_TAG_FSW_FOB_HASH);
95
96#define SKMEM_TAG_FSW_FRB_HASH "com.apple.skywalk.fsw.fsw.frb.hash"
97SKMEM_TAG_DEFINE(skmem_tag_fsw_frb_hash, SKMEM_TAG_FSW_FRB_HASH);
98
99#define SKMEM_TAG_FSW_FRIB_HASH "com.apple.skywalk.fsw.fsw.frib.hash"
100SKMEM_TAG_DEFINE(skmem_tag_fsw_frib_hash, SKMEM_TAG_FSW_FRIB_HASH);
101
102#define SKMEM_TAG_FSW_FRAG_MGR "com.apple.skywalk.fsw.fsw.frag.mgr"
103SKMEM_TAG_DEFINE(skmem_tag_fsw_frag_mgr, SKMEM_TAG_FSW_FRAG_MGR);
104
105/* 64-bit mask with range */
106#define BMASK64(_beg, _end) \
107 ((NX_FSW_CHUNK_FREE >> (63 - (_end))) & ~((1ULL << (_beg)) - 1))
108
109static int fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
110 boolean_t purge);
111
112int
113fsw_attach_vp(struct kern_nexus *nx, struct kern_channel *ch,
114 struct chreq *chr, struct nxbind *nxb, struct proc *p,
115 struct nexus_vp_adapter **vpna)
116{
117#pragma unused(ch)
118 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
119 SK_LOG_VAR(char *cr_name = chr->cr_name);
120 int err = 0;
121
122 SK_LOCK_ASSERT_HELD();
123 ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
124 *vpna = NULL;
125
126 /* if there's an existing adapter on the nexus port then use it */
127 FSW_WLOCK(fsw);
128 err = fsw_port_alloc(fsw, nxb, vpna, nx_port: chr->cr_port, p, FALSE, FALSE);
129 FSW_WUNLOCK(fsw);
130
131 if (err != 0) {
132 ASSERT(*vpna == NULL);
133 goto out;
134 } else if (*vpna != NULL) {
135 /*
136 * Use the existing adapter on that port; fsw_port_alloc()
137 * callback has retained a reference count on the adapter.
138 */
139 goto out;
140 }
141 ASSERT(*vpna == NULL);
142
143 /* create a virtual port; callee holds vpna ref */
144 err = fsw_vp_na_create(nx, chr, p, ret: vpna);
145 if (err != 0) {
146 SK_ERR("vpna create failed (err %d)", err);
147 goto out;
148 }
149
150 FSW_WLOCK(fsw);
151 err = fsw_port_alloc(fsw, nxb, vpna, nx_port: (*vpna)->vpna_nx_port, p, FALSE, FALSE);
152 FSW_WUNLOCK(fsw);
153
154out:
155 if ((*vpna) != NULL) {
156 SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
157 "vpna \"%s\" (0x%llx) refs %u to fsw \"%s\" "
158 "nx_port %d (err %d)", (*vpna)->vpna_up.na_name,
159 SK_KVA(&(*vpna)->vpna_up), (*vpna)->vpna_up.na_refcount,
160 cr_name, (int)(*vpna)->vpna_nx_port, err);
161
162 if (err != 0) {
163 na_release_locked(na: &(*vpna)->vpna_up);
164 *vpna = NULL;
165 }
166 }
167
168 return err;
169}
170
171static int
172fsw_nx_check(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
173{
174#pragma unused(fsw)
175 nexus_type_t hw_nxdom_type = NX_DOM(hw_nx)->nxdom_type;
176
177 if (hw_nxdom_type != NEXUS_TYPE_NET_IF) {
178 return EINVAL;
179 }
180
181 /* it's a netif below */
182 return 0;
183}
184
185static int
186fsw_ctl_flow_add(struct nx_flowswitch *fsw, struct proc *p,
187 struct nx_flow_req *req)
188{
189 struct flow_owner *fo;
190 int error = 0;
191
192 ASSERT(p != PROC_NULL);
193
194 if (p != kernproc) {
195 /* special port shouldn't be bound via this method */
196 if (req->nfr_nx_port < FSW_VP_USER_MIN) {
197 return EINVAL;
198 }
199 req->nfr_flags |= (NXFLOWREQF_TRACK | NXFLOWREQF_FLOWADV);
200 } else {
201 /* no flow track or advisory support for bsd flow */
202 ASSERT((req->nfr_flags & NXFLOWREQF_TRACK) == 0);
203 ASSERT((req->nfr_flags & NXFLOWREQF_FLOWADV) == 0);
204 ASSERT((req->nfr_flags & NXFLOWREQF_LOW_LATENCY) == 0);
205 }
206
207 /* init kernel only fields */
208 if (p != kernproc) {
209 nx_flow_req_internalize(req);
210 }
211 req->nfr_pid = proc_pid(p);
212 if (req->nfr_epid == -1) {
213 req->nfr_epid = proc_pid(p);
214 }
215
216 if (req->nfr_flow_demux_count > MAX_FLOW_DEMUX_PATTERN) {
217 SK_ERR("invalid flow demux count %u", req->nfr_flow_demux_count);
218 return EINVAL;
219 }
220
221 fo = fsw_flow_add(fsw, req0: req, error: &error);
222 ASSERT(fo != NULL || error != 0);
223
224 if (error == 0) {
225 // user space don't need this flow stats
226 flow_stats_release(fs: req->nfr_flow_stats);
227 }
228 if (p != kernproc) {
229 nx_flow_req_externalize(req);
230 }
231
232 return error;
233}
234
235static int
236fsw_ctl_flow_del(struct nx_flowswitch *fsw, struct proc *p,
237 struct nx_flow_req *req)
238{
239 int err;
240
241 nx_flow_req_internalize(req);
242 req->nfr_pid = proc_pid(p);
243 err = fsw_flow_del(fsw, req, TRUE, NULL);
244
245 nx_flow_req_externalize(req);
246 return err;
247}
248
249static int
250fsw_ctl_flow_config(struct nx_flowswitch *fsw, struct proc *p,
251 struct nx_flow_req *req)
252{
253 int err;
254
255 nx_flow_req_internalize(req);
256 req->nfr_pid = proc_pid(p);
257 err = fsw_flow_config(fsw, req);
258
259 nx_flow_req_externalize(req);
260 return err;
261}
262
263#if (DEVELOPMENT || DEBUG)
264static int
265fsw_rps_threads_sysctl SYSCTL_HANDLER_ARGS
266{
267#pragma unused(oidp, arg2)
268 struct nx_flowswitch *fsw = arg1;
269 uint32_t nthreads;
270 int changed;
271 int error;
272
273 error = sysctl_io_number(req, fsw->fsw_rps_nthreads,
274 sizeof(fsw->fsw_rps_nthreads), &nthreads, &changed);
275 if (error == 0 && changed != 0) {
276 error = fsw_rps_set_nthreads(fsw, nthreads);
277 }
278 return error;
279}
280#endif /* !DEVELOPMENT && !DEBUG */
281
282void
283fsw_get_tso_capabilities(struct ifnet *ifp, uint32_t *tso_v4_mtu, uint32_t *tso_v6_mtu)
284{
285#pragma unused(ifp)
286 *tso_v4_mtu = 0;
287 *tso_v6_mtu = 0;
288
289#ifdef XNU_TARGET_OS_OSX
290 struct nx_flowswitch *fsw;
291
292 fsw = fsw_ifp_to_fsw(ifp);
293 if (fsw == NULL) {
294 return;
295 }
296 switch (fsw->fsw_tso_mode) {
297 case FSW_TSO_MODE_HW: {
298 ASSERT(ifp->if_tso_v4_mtu != 0 || ifp->if_tso_v6_mtu != 0);
299 *tso_v4_mtu = ifp->if_tso_v4_mtu;
300 *tso_v6_mtu = ifp->if_tso_v6_mtu;
301 break;
302 }
303 case FSW_TSO_MODE_SW: {
304 ASSERT(fsw->fsw_tso_sw_mtu != 0);
305 *tso_v4_mtu = fsw->fsw_tso_sw_mtu;
306 *tso_v6_mtu = fsw->fsw_tso_sw_mtu;
307 break;
308 }
309 default:
310 break;
311 }
312#endif /* XNU_TARGET_OS_OSX */
313}
314
315static void
316fsw_tso_setup(struct nx_flowswitch *fsw)
317{
318 fsw->fsw_tso_mode = FSW_TSO_MODE_NONE;
319#ifdef XNU_TARGET_OS_OSX
320 struct ifnet *ifp = fsw->fsw_ifp;
321 if (!SKYWALK_CAPABLE(ifp) || !SKYWALK_NATIVE(ifp)) {
322 DTRACE_SKYWALK2(tso__no__support, struct nx_flowswitch *, fsw,
323 ifnet_t, ifp);
324 return;
325 }
326 struct nx_netif *nif = NA(ifp)->nifna_netif;
327 uint32_t large_buf_size = NX_PROV_PARAMS(fsw->fsw_nx)->nxp_large_buf_size;
328
329 if (large_buf_size == 0) {
330 DTRACE_SKYWALK2(no__large__buf, struct nx_flowswitch *, fsw,
331 ifnet_t, ifp);
332 return;
333 }
334 /*
335 * Unlike _dlil_adjust_large_buf_size_for_tso(), we check the nif_hwassist
336 * flags here for the original flags because nx_netif_host_adjust_if_capabilities()
337 * has already been called.
338 */
339 if (((nif->nif_hwassist & IFNET_TSO_IPV4) != 0 && ifp->if_tso_v4_mtu != 0) ||
340 ((nif->nif_hwassist & IFNET_TSO_IPV6) != 0 && ifp->if_tso_v6_mtu != 0)) {
341 ASSERT(large_buf_size <= ifp->if_tso_v4_mtu ||
342 large_buf_size <= ifp->if_tso_v6_mtu);
343 fsw->fsw_tso_mode = FSW_TSO_MODE_HW;
344 } else {
345 if (sk_fsw_gso_mtu != 0 && large_buf_size >= sk_fsw_gso_mtu) {
346 fsw->fsw_tso_mode = FSW_TSO_MODE_SW;
347 fsw->fsw_tso_sw_mtu = sk_fsw_gso_mtu;
348 }
349 }
350 DTRACE_SKYWALK3(tso__mode, struct nx_flowswitch *, fsw,
351 fsw_tso_mode_t, fsw->fsw_tso_mode, uint32_t, large_buf_size);
352#endif /* XNU_TARGET_OS_OSX */
353}
354
355static int
356fsw_setup_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
357{
358 int error = 0;
359 struct ifnet *ifp = hwna->na_ifp;
360 struct kern_pbufpool *pp = skmem_arena_nexus(ar: hwna->na_arena)->arn_rx_pp;
361 size_t f_limit = pp->pp_kmd_region->skr_c_obj_cnt / 2;
362
363 ASSERT((hwna->na_type == NA_NETIF_HOST) ||
364 (hwna->na_type == NA_NETIF_COMPAT_HOST));
365
366 SK_LOCK_ASSERT_HELD();
367
368 /*
369 * XXX: we don't support non TXSTART interface.
370 * There are assumptions in fsw_port_flush_enqueue_dst() about
371 * single threaded write to destination rings.
372 */
373 if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
374 SK_ERR("non TXSTART interface not supported ifp(0x%llx)",
375 SK_KVA(ifp));
376 return ENOTSUP;
377 }
378
379 FSW_WLOCK(fsw);
380
381 ASSERT(fsw->fsw_ifp == NULL);
382 ASSERT(fsw->fsw_nifna == NULL);
383 ASSERT(fsw->fsw_resolve == NULL);
384 ASSERT(fsw->fsw_frame == NULL);
385 ASSERT(fsw->fsw_demux == NULL);
386 ASSERT(fsw->fsw_pkt_copy_from_pkt == NULL);
387 ASSERT(fsw->fsw_pkt_copy_from_mbuf == NULL);
388 ASSERT(fsw->fsw_pkt_copy_to_mbuf == NULL);
389
390 fsw->fsw_ipfm = fsw_ip_frag_mgr_create(fsw, ifp, f_limit);
391 if (fsw->fsw_ipfm == NULL) {
392 FSW_WUNLOCK(fsw);
393 return ENOMEM;
394 }
395
396 switch (ifp->if_family) {
397 case IFNET_FAMILY_ETHERNET:
398 error = fsw_ethernet_setup(fsw, ifp);
399 fsw->fsw_ifp_dlt = DLT_EN10MB;
400 break;
401
402 case IFNET_FAMILY_CELLULAR:
403 error = fsw_cellular_setup(fsw, ifp);
404 fsw->fsw_ifp_dlt = DLT_RAW;
405 break;
406
407 default:
408 if (ifp->if_family == IFNET_FAMILY_IPSEC ||
409 ifp->if_family == IFNET_FAMILY_UTUN) {
410 error = fsw_ip_setup(fsw, ifp);
411 fsw->fsw_ifp_dlt = DLT_RAW;
412 break;
413 }
414 error = ENOTSUP;
415 break;
416 }
417
418 if (error != 0) {
419 FSW_WUNLOCK(fsw);
420 return error;
421 }
422
423 ASSERT(fsw->fsw_resolve != NULL);
424
425 if (NX_PROV(fsw->fsw_nx)->nxprov_region_params[SKMEM_REGION_KMD].
426 srp_max_frags > 1 || pp->pp_max_frags > 1) {
427 fsw->fsw_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
428 fsw->fsw_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
429 fsw->fsw_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
430 } else {
431 fsw->fsw_pkt_copy_from_pkt = pkt_copy_from_pkt;
432 fsw->fsw_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
433 fsw->fsw_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
434 }
435
436 /*
437 * Since it is possible for fsw to refer to the ifp after all
438 * underlying hwnas are freed (see fsw_teardown_ifp()), we need
439 * an extra reference to the ifp here.
440 *
441 * We also cache the netif adapter of the interface, as it's
442 * needed for each packet enqueued to the classq. There is no
443 * need to retain a refcnt for the same reason as above.
444 *
445 * We hold the busy lock across these, just in case an interface
446 * detach and reattach happens, as fsw_flow_bind() relies on the
447 * same lock as well before making its checks.
448 */
449 lck_mtx_lock(lck: &fsw->fsw_detach_barrier_lock);
450
451 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
452 fsw->fsw_ifp = ifp;
453 fsw->fsw_nifna = &ifp->if_na->nifna_up;
454 ifp->if_na->nifna_netif->nif_fsw = fsw;
455 ifp->if_na->nifna_netif->nif_fsw_nxadv =
456 fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
457 (void) strlcpy(dst: fsw->fsw_flow_mgr->fm_name,
458 if_name(ifp), IFNAMSIZ);
459
460 fsw_classq_setup(fsw, hostna: hwna);
461 fsw->fsw_classq_enabled = TRUE;
462 fsw->fsw_src_lla_gencnt = 0;
463 fsw_tso_setup(fsw);
464
465 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
466 (void) snprintf(fsw->fsw_reap_name, count: sizeof(fsw->fsw_reap_name),
467 FSW_REAP_THREADNAME, ifp->if_xname, "");
468 thread_set_thread_name(th: fsw->fsw_reap_thread, name: fsw->fsw_reap_name);
469
470 error = fsw_netagent_register(fsw, ifp);
471 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
472 "fsw_netagent_register %s (family %u) (err %d)",
473 if_name(ifp), ifp->if_family, error);
474
475 /*
476 * Clear NXF_REJECT to allow new channels to be opened
477 * to this nexus, in case this is an interface reattach.
478 * Otherwise this flag should already be cleared.
479 */
480 if (error == 0) {
481 os_atomic_andnot(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);
482 }
483
484 lck_mtx_unlock(lck: &fsw->fsw_detach_barrier_lock);
485
486 /*
487 * Wake up the reaper thread.
488 */
489 if (error == 0) {
490 fsw_reap_sched(fsw);
491 }
492
493 /* init skoid */
494 skoid_create(skoid: &fsw->fsw_skoid,
495 SKOID_SNODE(_kern_skywalk_flowswitch), if_name(ifp),
496 CTLFLAG_RW);
497
498#if (DEVELOPMENT || DEBUG)
499 if (SKYWALK_NATIVE(fsw->fsw_ifp)) {
500 skoid_add_handler(&fsw->fsw_skoid, "rps_nthreads", CTLFLAG_RW,
501 fsw_rps_threads_sysctl, fsw, 0);
502 }
503#endif /* !DEVELOPMENT && !DEBUG */
504
505 FSW_WUNLOCK(fsw);
506
507 return error;
508}
509
510static void
511fsw_teardown_ifp(struct nx_flowswitch *fsw, struct nexus_adapter *hwna)
512{
513 struct ifnet *ifp;
514
515 SK_LOCK_ASSERT_HELD();
516
517 FSW_WLOCK_ASSERT_HELD(fsw);
518 ifp = fsw->fsw_ifp;
519 ASSERT(ifp != NULL);
520 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
521
522 fsw_netagent_unregister(fsw, ifp);
523
524 if (fsw->fsw_ipfm != NULL) {
525 fsw_ip_frag_mgr_destroy(mgr: fsw->fsw_ipfm);
526 }
527
528 skoid_destroy(skoid: &fsw->fsw_skoid);
529
530 SK_DF(SK_VERB_FSW, "%sdetached from %s (family %u)",
531 ((fsw->fsw_agent_session != NULL) ? "netagent" : ""),
532 if_name(ifp), ifp->if_family);
533
534 if (hwna != NULL) {
535 fsw_classq_teardown(fsw, hostna: hwna);
536 }
537
538 /*
539 * Set NXF_REJECT on the nexus, which would cause existing adapters
540 * to be marked similarly; channels associated with them would then
541 * cease to function.
542 */
543 os_atomic_or(&fsw->fsw_nx->nx_flags, NXF_REJECT, relaxed);
544
545 /* see notes on fsw_na_attach() about I/O refcnt */
546 if (ifp->if_na != NULL) {
547 ifp->if_na->nifna_netif->nif_fsw = NULL;
548 ifp->if_na->nifna_netif->nif_fsw_nxadv = NULL;
549 os_atomic_thread_fence(seq_cst);
550 }
551
552 fsw->fsw_ifp = NULL;
553 fsw->fsw_nifna = NULL;
554 fsw->fsw_resolve = NULL;
555 fsw->fsw_frame = NULL;
556 fsw->fsw_frame_headroom = 0;
557 fsw->fsw_demux = NULL;
558 fsw->fsw_classq_enabled = FALSE;
559 fsw->fsw_pkt_copy_from_pkt = NULL;
560 fsw->fsw_pkt_copy_from_mbuf = NULL;
561 fsw->fsw_pkt_copy_to_mbuf = NULL;
562
563 if (ifp->if_input_netem != NULL) {
564 netem_destroy(ne: ifp->if_input_netem);
565 ifp->if_input_netem = NULL;
566 }
567
568 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
569 (void) snprintf(fsw->fsw_reap_name, count: sizeof(fsw->fsw_reap_name),
570 FSW_REAP_THREADNAME, if_name(ifp), "_detached");
571 thread_set_thread_name(th: fsw->fsw_reap_thread, name: fsw->fsw_reap_name);
572}
573
574static int
575fsw_host_setup(struct nx_flowswitch *fsw)
576{
577 struct nexus_adapter *hwna;
578 struct ifnet *ifp;
579
580 SK_LOCK_ASSERT_HELD();
581
582 hwna = fsw->fsw_host_ch->ch_na;
583 ASSERT(hwna != NULL);
584
585
586 /* the netif below must have an ifnet attached (dev/host port) */
587 if ((ifp = hwna->na_ifp) == NULL) {
588 return ENXIO;
589 }
590
591 /*
592 * XXX: we don't support multiple rx rings yet.
593 * There are assumptions in fsw_port_flush_enqueue_dst() about
594 * single threaded write to destination rings.
595 */
596 if (SKYWALK_NATIVE(ifp) && (hwna->na_num_rx_rings > 1)) {
597 SK_ERR("ifp(0x%llx): multiple rx rings(%d) not supported",
598 SK_KVA(ifp), hwna->na_num_rx_rings);
599 return ENOTSUP;
600 }
601
602 lck_mtx_lock(lck: &fsw->fsw_detach_barrier_lock);
603 if ((fsw->fsw_detach_flags & FSW_DETACHF_DETACHING) != 0) {
604 lck_mtx_unlock(lck: &fsw->fsw_detach_barrier_lock);
605 return EBUSY;
606 }
607 fsw->fsw_detach_flags = 0;
608 lck_mtx_unlock(lck: &fsw->fsw_detach_barrier_lock);
609
610 int error = fsw_setup_ifp(fsw, hwna);
611 ASSERT(error != 0 || fsw->fsw_ifp != NULL);
612 if (error != 0) {
613 return error;
614 }
615
616 /* update the interface index */
617 ASSERT(NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex == 0);
618 NX_PROV(fsw->fsw_nx)->nxprov_params->nxp_ifindex = ifp->if_index;
619 return 0;
620}
621
622static int
623fsw_host_teardown(struct nx_flowswitch *fsw)
624{
625 struct nexus_adapter *hwna = fsw->fsw_host_ch->ch_na;
626
627 SK_LOCK_ASSERT_HELD();
628 return fsw_detach(fsw, hwna, FALSE);
629}
630
631#if SK_LOG
632/* Hoisted out of line to reduce kernel stack footprint */
633SK_LOG_ATTRIBUTE
634static void
635fsw_ctl_attach_log(const struct nx_spec_req *nsr,
636 const struct kern_nexus *nx, int err)
637{
638 uuid_string_t uuidstr, ifuuidstr;
639 const char *nustr;
640
641 if (nsr->nsr_flags & NXSPECREQ_UUID) {
642 nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
643 } else if (nsr->nsr_flags & NXSPECREQ_IFP) {
644 (void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
645 SK_KVA(nsr->nsr_ifp));
646 nustr = uuidstr;
647 } else {
648 nustr = nsr->nsr_name;
649 }
650
651 SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
652 "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
653 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
654 sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
655}
656#endif /* SK_LOG */
657
658SK_NO_INLINE_ATTRIBUTE
659static void
660fsw_netif_set_callbacks_common(struct nx_flowswitch *fsw, boolean_t set)
661{
662 struct nexus_adapter *hwna = fsw->fsw_dev_ch->ch_na;
663
664 ASSERT(hwna->na_type == NA_NETIF_DEV ||
665 hwna->na_type == NA_NETIF_COMPAT_DEV);
666
667 if (set) {
668 netif_hwna_set_mode(hwna, NETIF_MODE_FSW, fsw_devna_rx);
669 } else {
670 netif_hwna_clear_mode(hwna);
671 }
672}
673
674SK_NO_INLINE_ATTRIBUTE
675static void
676fsw_netif_set_callbacks(struct nx_flowswitch *fsw)
677{
678 fsw_netif_set_callbacks_common(fsw, TRUE);
679}
680
681SK_NO_INLINE_ATTRIBUTE
682static void
683fsw_netif_clear_callbacks(struct nx_flowswitch *fsw)
684{
685 fsw_netif_set_callbacks_common(fsw, FALSE);
686}
687
688SK_NO_INLINE_ATTRIBUTE
689static void
690fsw_dp_start(struct nx_flowswitch *fsw)
691{
692 ASSERT(fsw->fsw_dev_ch != NULL);
693 ASSERT(fsw->fsw_host_ch != NULL);
694
695 fsw_netif_set_callbacks(fsw);
696 na_start_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
697 na_start_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
698}
699
700SK_NO_INLINE_ATTRIBUTE
701static int
702fsw_dp_stop(struct nx_flowswitch *fsw, struct ifnet **ifpp)
703{
704 struct ifnet *ifp;
705
706 FSW_WLOCK(fsw);
707 if ((fsw->fsw_state_flags & FSW_STATEF_QUIESCED) != 0) {
708 FSW_WUNLOCK(fsw);
709 return EALREADY;
710 }
711 fsw->fsw_state_flags |= FSW_STATEF_QUIESCED;
712 FSW_WUNLOCK(fsw);
713
714 /*
715 * For regular kernel-attached interfaces, quiescing is handled by
716 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
717 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
718 * are constructed on the fly and can also be torn down on the fly.
719 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
720 * can be detached while the interface is still attached.
721 */
722 if ((ifp = fsw->fsw_ifp) != NULL &&
723 ifnet_datamov_suspend_if_needed(ifp)) {
724 SK_UNLOCK();
725 ifnet_datamov_drain(ifp);
726 /* Reference will be released by caller */
727 *ifpp = ifp;
728 SK_LOCK();
729 }
730 ASSERT(fsw->fsw_dev_ch != NULL);
731 ASSERT(fsw->fsw_host_ch != NULL);
732 na_stop_spec(fsw->fsw_host_ch->ch_nexus, fsw->fsw_host_ch);
733 na_stop_spec(fsw->fsw_dev_ch->ch_nexus, fsw->fsw_dev_ch);
734 fsw_netif_clear_callbacks(fsw);
735 return 0;
736}
737
738SK_NO_INLINE_ATTRIBUTE
739static int
740fsw_netif_port_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx,
741 boolean_t host)
742{
743 struct chreq chr;
744 struct kern_channel *ch;
745 int err;
746
747 bzero(s: &chr, n: sizeof(chr));
748 uuid_copy(dst: chr.cr_spec_uuid, src: hw_nx->nx_uuid);
749 chr.cr_ring_id = CHANNEL_RING_ID_ANY;
750 chr.cr_port = host ? NEXUS_PORT_NET_IF_HOST : NEXUS_PORT_NET_IF_DEV;
751 chr.cr_mode |= CHMODE_CONFIG | (host ? CHMODE_HOST : 0);
752
753 err = 0;
754 ch = ch_open_special(hw_nx, &chr, FALSE, &err);
755 if (ch == NULL) {
756 SK_ERR("ch_open_special(%s) failed: %d",
757 host ? "host" : "dev", err);
758 return err;
759 }
760 if (host) {
761 fsw->fsw_host_ch = ch;
762 } else {
763 fsw->fsw_dev_ch = ch;
764 }
765 return 0;
766}
767
768SK_NO_INLINE_ATTRIBUTE
769static int
770fsw_netif_port_teardown(struct nx_flowswitch *fsw, boolean_t host)
771{
772 struct kern_channel *ch;
773
774 ch = host ? fsw->fsw_host_ch : fsw->fsw_dev_ch;
775 if (ch == NULL) {
776 return EINVAL;
777 }
778 if (host) {
779 fsw->fsw_host_ch = NULL;
780 } else {
781 fsw->fsw_dev_ch = NULL;
782 }
783 ch_close_special(ch);
784 (void) ch_release_locked(ch);
785 return 0;
786}
787
788SK_NO_INLINE_ATTRIBUTE
789static int
790fsw_devna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
791{
792 return fsw_netif_port_setup(fsw, hw_nx, FALSE);
793}
794
795SK_NO_INLINE_ATTRIBUTE
796static int
797fsw_hostna_setup(struct nx_flowswitch *fsw, struct kern_nexus *hw_nx)
798{
799 return fsw_netif_port_setup(fsw, hw_nx, TRUE);
800}
801
802SK_NO_INLINE_ATTRIBUTE
803static int
804fsw_devna_teardown(struct nx_flowswitch *fsw)
805{
806 return fsw_netif_port_teardown(fsw, FALSE);
807}
808
809SK_NO_INLINE_ATTRIBUTE
810static int
811fsw_hostna_teardown(struct nx_flowswitch *fsw)
812{
813 return fsw_netif_port_teardown(fsw, TRUE);
814}
815
816/* Process NXCFG_CMD_ATTACH */
817SK_NO_INLINE_ATTRIBUTE
818static int
819fsw_ctl_attach(struct kern_nexus *nx, struct proc *p, struct nx_spec_req *nsr)
820{
821#pragma unused(p)
822 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
823 struct kern_nexus *hw_nx = NULL;
824 int err = 0;
825
826 SK_LOCK_ASSERT_HELD();
827
828 /*
829 * The flowswitch only accepts UUID as an identifier, since it
830 * represents the UUID of the kernel object we are trying to
831 * attach to this flowswitch.
832 */
833 if ((nsr->nsr_flags & (NXSPECREQ_UUID | NXSPECREQ_IFP)) !=
834 NXSPECREQ_UUID || uuid_is_null(uu: nsr->nsr_uuid)) {
835 err = EINVAL;
836 goto done;
837 }
838
839 if (fsw->fsw_dev_ch != NULL) {
840 ASSERT(fsw->fsw_host_ch != NULL);
841 err = EEXIST;
842 goto done;
843 }
844
845 hw_nx = nx_find(nsr->nsr_uuid, TRUE);
846 if (hw_nx == NULL) {
847 err = ENOENT;
848 goto done;
849 } else if (hw_nx == nx) {
850 err = EINVAL;
851 goto done;
852 }
853
854 /* preflight check to see if the nexus is attachable to us */
855 err = fsw_nx_check(fsw, hw_nx);
856 if (err != 0) {
857 goto done;
858 }
859
860 err = fsw_devna_setup(fsw, hw_nx);
861 if (err != 0) {
862 goto done;
863 }
864
865 err = fsw_hostna_setup(fsw, hw_nx);
866 if (err != 0) {
867 (void) fsw_devna_teardown(fsw);
868 goto done;
869 }
870
871 err = fsw_host_setup(fsw);
872 if (err != 0) {
873 (void) fsw_hostna_teardown(fsw);
874 (void) fsw_devna_teardown(fsw);
875 goto done;
876 }
877
878 fsw_dp_start(fsw);
879
880 /* return the devna UUID */
881 uuid_copy(dst: nsr->nsr_if_uuid, src: fsw->fsw_dev_ch->ch_na->na_uuid);
882 ASSERT(!uuid_is_null(nsr->nsr_if_uuid));
883done:
884#if SK_LOG
885 if (__improbable(sk_verbose != 0)) {
886 fsw_ctl_attach_log(nsr, nx, err);
887 }
888#endif /* SK_LOG */
889
890 if (hw_nx != NULL) {
891 nx_release_locked(hw_nx);
892 }
893
894 return err;
895}
896
897SK_NO_INLINE_ATTRIBUTE
898static void
899fsw_cleanup(struct nx_flowswitch *fsw)
900{
901 int err;
902 struct ifnet *ifp = NULL;
903
904 if (fsw->fsw_dev_ch == NULL) {
905 ASSERT(fsw->fsw_host_ch == NULL);
906 return;
907 }
908 err = fsw_dp_stop(fsw, ifpp: &ifp);
909 if (err != 0) {
910 return;
911 }
912 err = fsw_host_teardown(fsw);
913 VERIFY(err == 0);
914
915 err = fsw_hostna_teardown(fsw);
916 VERIFY(err == 0);
917
918 err = fsw_devna_teardown(fsw);
919 VERIFY(err == 0);
920
921 if (ifp != NULL) {
922 ifnet_datamov_resume(ifp);
923 }
924}
925
926int
927fsw_ctl_detach(struct kern_nexus *nx, struct proc *p,
928 struct nx_spec_req *nsr)
929{
930#pragma unused(p)
931 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
932 int err = 0;
933
934 SK_LOCK_ASSERT_HELD();
935
936 /*
937 * nsr is NULL when we're called from the destructor, and it
938 * implies that we'll detach everything that is attached.
939 */
940 if (nsr == NULL) {
941 fsw_cleanup(fsw);
942 ASSERT(fsw->fsw_dev_ch == NULL);
943 ASSERT(fsw->fsw_host_ch == NULL);
944 goto done;
945 }
946
947 if (uuid_is_null(uu: nsr->nsr_if_uuid)) {
948 err = EINVAL;
949 goto done;
950 } else if (fsw->fsw_dev_ch == NULL || fsw->fsw_host_ch == NULL) {
951 err = ENXIO;
952 goto done;
953 }
954
955 /* check if the devna uuid is correct */
956 if (uuid_compare(uu1: nsr->nsr_if_uuid,
957 uu2: fsw->fsw_dev_ch->ch_na->na_uuid) != 0) {
958 err = ESRCH;
959 goto done;
960 }
961 fsw_cleanup(fsw);
962
963done:
964#if SK_LOG
965 if (nsr != NULL) {
966 uuid_string_t ifuuidstr;
967 SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
968 "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
969 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
970 sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
971 nsr->nsr_flags, err);
972 } else {
973 SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
974 "nexus 0x%llx (%s) ANY err %d", SK_KVA(nx),
975 NX_DOM_PROV(nx)->nxdom_prov_name, err);
976 }
977#endif /* SK_LOG */
978
979 return err;
980}
981
982static int
983fsw_netem_config(struct nx_flowswitch *fsw, void *data)
984{
985 struct ifnet *ifp = fsw->fsw_ifp;
986 struct if_netem_params *params = data;
987 int ret;
988
989 if (ifp == NULL) {
990 return ENODEV;
991 }
992
993 SK_LOCK_ASSERT_HELD();
994#define fsw_INPUT_NETEM_THREADNAME "if_input_netem_%s@fsw"
995#define fsw_INPUT_NETEM_THREADNAME_LEN 32
996 char netem_name[fsw_INPUT_NETEM_THREADNAME_LEN];
997 (void) snprintf(netem_name, count: sizeof(netem_name),
998 fsw_INPUT_NETEM_THREADNAME, if_name(ifp));
999 ret = netem_config(ne: &ifp->if_input_netem, name: netem_name, ifp, p: params, output_handle: fsw,
1000 output_func: fsw_dev_input_netem_dequeue, FSW_VP_DEV_BATCH_MAX);
1001
1002 return ret;
1003}
1004
1005int
1006fsw_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, struct proc *p,
1007 void *data)
1008{
1009 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
1010 struct nx_spec_req *nsr = data;
1011 struct nx_flow_req *req = data;
1012 boolean_t need_check;
1013 int error = 0;
1014
1015 switch (nc_cmd) {
1016 case NXCFG_CMD_FLOW_ADD:
1017 case NXCFG_CMD_FLOW_DEL:
1018 if (uuid_is_null(uu: req->nfr_flow_uuid)) {
1019 error = EINVAL;
1020 goto done;
1021 }
1022 if (p != kernproc) {
1023 req->nfr_flags &= NXFLOWREQF_MASK;
1024 }
1025 req->nfr_flowadv_idx = FLOWADV_IDX_NONE;
1026
1027 if (nc_cmd == NXCFG_CMD_FLOW_DEL) {
1028 break;
1029 }
1030
1031 need_check = FALSE;
1032 if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
1033 need_check = TRUE;
1034 } else if (!uuid_is_null(uu: req->nfr_euuid)) {
1035 uuid_t uuid;
1036
1037 /* get the UUID of the issuing process */
1038 proc_getexecutableuuid(p, uuid, sizeof(uuid));
1039
1040 /*
1041 * If this is not issued by a process for its own
1042 * executable UUID and if the process does not have
1043 * the necessary privilege, reject the request.
1044 * The logic is similar to so_set_effective_uuid().
1045 */
1046 if (uuid_compare(uu1: req->nfr_euuid, uu2: uuid) != 0) {
1047 need_check = TRUE;
1048 }
1049 }
1050 if (need_check) {
1051 kauth_cred_t cred = kauth_cred_proc_ref(procp: p);
1052 error = priv_check_cred(cred,
1053 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, flags: 0);
1054 kauth_cred_unref(&cred);
1055 if (error != 0) {
1056 goto done;
1057 }
1058 }
1059 break;
1060
1061 default:
1062 break;
1063 }
1064
1065 switch (nc_cmd) {
1066 case NXCFG_CMD_ATTACH:
1067 error = fsw_ctl_attach(nx, p, nsr);
1068 break;
1069
1070 case NXCFG_CMD_DETACH:
1071 error = fsw_ctl_detach(nx, p, nsr);
1072 break;
1073
1074 case NXCFG_CMD_FLOW_ADD: /* struct nx_flow_req */
1075 error = fsw_ctl_flow_add(fsw, p, req: data);
1076 break;
1077
1078 case NXCFG_CMD_FLOW_DEL: /* struct nx_flow_req */
1079 error = fsw_ctl_flow_del(fsw, p, req: data);
1080 break;
1081
1082 case NXCFG_CMD_FLOW_CONFIG:
1083 error = fsw_ctl_flow_config(fsw, p, req: data);
1084 break;
1085
1086 case NXCFG_CMD_NETEM: /* struct if_netem_params */
1087 error = fsw_netem_config(fsw, data);
1088 break;
1089
1090 default:
1091 SK_ERR("invalid cmd %u", nc_cmd);
1092 error = EINVAL;
1093 break;
1094 }
1095
1096done:
1097 return error;
1098}
1099
1100struct nx_flowswitch *
1101fsw_ifp_to_fsw(struct ifnet *ifp)
1102{
1103 struct nx_flowswitch *fsw = NULL;
1104
1105 if (ifp->if_na != NULL) {
1106 fsw = ifp->if_na->nifna_netif->nif_fsw;
1107 }
1108 return fsw;
1109}
1110
1111static void
1112fsw_ifnet_event_callback(struct eventhandler_entry_arg ee_arg __unused,
1113 struct ifnet *ifp, struct sockaddr *ip_addr __unused,
1114 intf_event_code_t intf_ev_code)
1115{
1116 struct nx_flowswitch *fsw = NULL;
1117
1118 if (ifp->if_na == NULL) {
1119 return;
1120 }
1121
1122 SK_LOCK();
1123 fsw = fsw_ifp_to_fsw(ifp);
1124 if (fsw != NULL) {
1125 switch (intf_ev_code) {
1126 case INTF_EVENT_CODE_LLADDR_UPDATE:
1127 if ((fsw->fsw_ifp == NULL) ||
1128 (fsw->fsw_ifp_dlt != DLT_EN10MB)) {
1129 break;
1130 }
1131
1132 VERIFY(fsw->fsw_ifp == ifp);
1133 SK_DF(SK_VERB_FSW, "MAC address change detected for %s",
1134 if_name(fsw->fsw_ifp));
1135 (void) ifnet_lladdr_copy_bytes(interface: ifp, lladdr: fsw->fsw_ether_shost,
1136 ETHER_ADDR_LEN);
1137 os_atomic_inc(&fsw->fsw_src_lla_gencnt, relaxed);
1138 break;
1139
1140 case INTF_EVENT_CODE_LOW_POWER_UPDATE:
1141 if (fsw->fsw_ifp == NULL) {
1142 break;
1143 }
1144
1145 VERIFY(fsw->fsw_ifp == ifp);
1146
1147 if (ifp->if_xflags & IFXF_LOW_POWER) {
1148 SK_DF(SK_VERB_FSW,
1149 "Low power mode updated for %s",
1150 if_name(fsw->fsw_ifp));
1151
1152 fsw_reap_sched(fsw);
1153 }
1154 break;
1155
1156 default:
1157 break;
1158 }
1159 }
1160 SK_UNLOCK();
1161}
1162
1163static void
1164fsw_protoctl_event_callback(struct eventhandler_entry_arg ee_arg,
1165 struct ifnet *ifp, struct sockaddr *p_laddr, struct sockaddr *p_raddr,
1166 uint16_t lport, uint16_t rport, uint8_t proto, uint32_t protoctl_event_code,
1167 struct protoctl_ev_val *p_val)
1168{
1169#pragma unused(ee_arg)
1170 struct nx_flowswitch *fsw = NULL;
1171 struct flow_entry *fe = NULL;
1172 boolean_t netagent_update_flow = FALSE;
1173 uuid_t fe_uuid;
1174
1175 if (proto != IPPROTO_TCP && proto != IPPROTO_UDP) {
1176 return;
1177 }
1178
1179 /*
1180 * XXX Right now only handle the event if we have enough
1181 * information to match the entire flow.
1182 */
1183 if (lport == 0 || rport == 0 || p_laddr == NULL || p_raddr == NULL) {
1184 return;
1185 }
1186
1187 SK_LOCK();
1188 fsw = fsw_ifp_to_fsw(ifp);
1189 if (fsw == NULL) {
1190 goto out;
1191 }
1192
1193 if (!fsw_detach_barrier_add(fsw)) {
1194 fsw = NULL;
1195 SK_ERR("netagent detached");
1196 goto out;
1197 }
1198
1199 struct flow_key fk __sk_aligned(16);
1200 FLOW_KEY_CLEAR(&fk);
1201 fk.fk_proto = proto;
1202 if (p_laddr->sa_family == AF_INET) {
1203 fk.fk_ipver = IPVERSION;
1204 fk.fk_src4 = SIN(p_laddr)->sin_addr;
1205 fk.fk_dst4 = SIN(p_raddr)->sin_addr;
1206 } else {
1207 fk.fk_ipver = IPV6_VERSION;
1208 fk.fk_src6 = SIN6(p_laddr)->sin6_addr;
1209 /*
1210 * rdar://107435899 The scope ID for destination address needs
1211 * to be cleared out before looking up the flow entry for this
1212 * 5-tuple, because addresses in flow entries do not contain the
1213 * scope ID.
1214 */
1215 struct in6_addr *in6;
1216
1217 fk.fk_dst6 = SIN6(p_raddr)->sin6_addr;
1218 in6 = &fk.fk_dst6;
1219 if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
1220 in6->s6_addr16[1] = 0;
1221 }
1222 }
1223 fk.fk_sport = lport;
1224 fk.fk_dport = rport;
1225 fk.fk_mask = FKMASK_5TUPLE;
1226
1227 fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
1228 if (__improbable(fe == NULL)) {
1229 goto out;
1230 }
1231
1232 uuid_copy(dst: fe_uuid, src: fe->fe_uuid);
1233 /*
1234 * If the protocol notification is for TCP, make sure
1235 * protocol event received is for bytes in the flight.
1236 * XXX Redirect events are not delivered as protocol events
1237 * but as better route events.
1238 * Also redirect events do not indicate loss of the packet.
1239 */
1240 if (proto != IPPROTO_TCP) {
1241 p_val->tcp_seq_number = 0;
1242 }
1243
1244 netagent_update_flow = TRUE;
1245
1246out:
1247 SK_UNLOCK();
1248
1249 if (netagent_update_flow) {
1250 int error = 0;
1251#if SK_LOG
1252 char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
1253 SK_DF(SK_VERB_FLOW, "Update flow entry \"%s\" for protocol "
1254 "event %d with value %d and tcp sequence number %d",
1255 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)),
1256 protoctl_event_code, p_val->val, p_val->tcp_seq_number);
1257#endif /* SK_LOG */
1258 if ((error = netagent_update_flow_protoctl_event(
1259 session: fsw->fsw_agent_session, client_id: fe_uuid, protoctl_event_code,
1260 protoctl_event_val: p_val->val, protoctl_event_tcp_seq_number: p_val->tcp_seq_number)) != 0) {
1261#if SK_LOG
1262 SK_DF(SK_VERB_FLOW, "Error: %d. Could not update "
1263 "flow entry \"%s\" for protocol event %d with "
1264 "value %d and tcp sequence number %d", error,
1265 dbgbuf, protoctl_event_code, p_val->val,
1266 p_val->tcp_seq_number);
1267#endif /* SK_LOG */
1268 }
1269 }
1270
1271 if (fe != NULL) {
1272 flow_entry_release(pfe: &fe);
1273 }
1274
1275 if (fsw != NULL) {
1276 fsw_detach_barrier_remove(fsw);
1277 }
1278}
1279
1280int
1281fsw_netagent_add_remove(struct kern_nexus *nx, boolean_t add)
1282{
1283 struct nx_flowswitch *fsw = NULL;
1284 int error = 0;
1285
1286 SK_LOCK_ASSERT_HELD();
1287 VERIFY(nx != NULL);
1288 VERIFY(NX_PROV(nx) != NULL);
1289 VERIFY(NX_DOM_PROV(nx) != NULL);
1290
1291 if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
1292 error = EINVAL;
1293 goto out;
1294 }
1295
1296 fsw = NX_FSW_PRIVATE(nx);
1297 VERIFY(fsw != NULL);
1298 FSW_WLOCK(fsw);
1299
1300 if (fsw->fsw_agent_session == NULL) {
1301 error = ENXIO;
1302 goto out;
1303 }
1304
1305 ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
1306
1307 if (add) {
1308 if (FSW_NETAGENT_ADDED(fsw)) {
1309 /* agent already added */
1310 error = EEXIST;
1311 } else if (fsw->fsw_ifp->if_bridge != NULL) {
1312 /* see rdar://107076453 */
1313 SK_ERR("%s is bridged, not adding netagent",
1314 if_name(fsw->fsw_ifp));
1315 error = EBUSY;
1316 } else {
1317 fsw->fsw_state_flags |= FSW_STATEF_NETAGENT_ADDED;
1318 if (if_is_fsw_netagent_enabled()) {
1319 fsw->fsw_state_flags
1320 |= FSW_STATEF_NETAGENT_ENABLED;
1321 }
1322 if_add_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
1323 SK_D("flowswitch netagent added for interface %s",
1324 if_name(fsw->fsw_ifp));
1325 }
1326 } else {
1327 if (!FSW_NETAGENT_ADDED(fsw)) {
1328 /* agent has not been added */
1329 error = ENOENT;
1330 } else {
1331 fsw->fsw_state_flags &= ~(FSW_STATEF_NETAGENT_ADDED |
1332 FSW_STATEF_NETAGENT_ENABLED);
1333 if_delete_netagent(fsw->fsw_ifp, fsw->fsw_agent_uuid);
1334 SK_D("flowswitch netagent removed for interface %s",
1335 if_name(fsw->fsw_ifp));
1336 }
1337 }
1338out:
1339 if (fsw != NULL) {
1340 FSW_UNLOCK(fsw);
1341 }
1342 return error;
1343}
1344
1345void
1346fsw_netagent_update(struct kern_nexus *nx)
1347{
1348 struct nx_flowswitch *fsw = NULL;
1349
1350 SK_LOCK_ASSERT_HELD();
1351 VERIFY(nx != NULL);
1352 VERIFY(NX_PROV(nx) != NULL);
1353 VERIFY(NX_DOM_PROV(nx) != NULL);
1354
1355 if (NX_DOM(nx)->nxdom_type != NEXUS_TYPE_FLOW_SWITCH) {
1356 goto out;
1357 }
1358 fsw = NX_FSW_PRIVATE(nx);
1359 VERIFY(fsw != NULL);
1360 FSW_WLOCK(fsw);
1361 if (fsw->fsw_agent_session == NULL) {
1362 goto out;
1363 }
1364 ASSERT(!uuid_is_null(fsw->fsw_agent_uuid));
1365 uint32_t flags = netagent_get_flags(uuid: fsw->fsw_agent_uuid);
1366 const bool ip_agent = ifnet_needs_fsw_ip_netagent(ifp: fsw->fsw_ifp);
1367 const bool transport_agent = ifnet_needs_fsw_transport_netagent(ifp: fsw->fsw_ifp);
1368 if (ip_agent || transport_agent) {
1369 flags |= NETAGENT_FLAG_NEXUS_LISTENER;
1370 } else {
1371 flags &= ~NETAGENT_FLAG_NEXUS_LISTENER;
1372 }
1373 if (transport_agent) {
1374 flags |= NETAGENT_FLAG_NEXUS_PROVIDER;
1375 } else {
1376 flags &= ~NETAGENT_FLAG_NEXUS_PROVIDER;
1377 }
1378 if (ip_agent) {
1379 flags |= NETAGENT_FLAG_CUSTOM_IP_NEXUS;
1380 } else {
1381 flags &= ~NETAGENT_FLAG_CUSTOM_IP_NEXUS;
1382 }
1383 if (netagent_set_flags(uuid: fsw->fsw_agent_uuid, flags) == 0) {
1384 SK_D("flowswitch netagent updated for interface %s",
1385 if_name(fsw->fsw_ifp));
1386 }
1387out:
1388 if (fsw != NULL) {
1389 FSW_UNLOCK(fsw);
1390 }
1391}
1392
1393static int
1394fsw_port_ctor(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
1395 const struct nxbind *nxb)
1396{
1397#pragma unused(nxb)
1398 int err = 0;
1399
1400 SK_LOCK_ASSERT_HELD();
1401 ASSERT(nxb == NULL || !(nxb->nxb_flags & NXBF_MATCH_UNIQUEID) ||
1402 vpna->vpna_pid == nxb->nxb_pid);
1403
1404 /*
1405 * Reject regular channel open requests unless there is
1406 * something attached to the host port of the flowswitch.
1407 */
1408 if (vpna->vpna_nx_port >= FSW_VP_USER_MIN) {
1409 struct nexus_adapter *na = &vpna->vpna_up;
1410 struct ifnet *ifp = fsw->fsw_ifp;
1411
1412 if (ifp == NULL) {
1413 err = ENXIO;
1414 goto done;
1415 }
1416
1417 /* if adapter supports mitigation, set default value */
1418 if (na->na_flags & (NAF_TX_MITIGATION | NAF_RX_MITIGATION)) {
1419 if (IFNET_IS_WIFI(ifp)) {
1420 na->na_ch_mit_ival = CH_MIT_IVAL_WIFI;
1421 } else if (IFNET_IS_CELLULAR(ifp)) {
1422 na->na_ch_mit_ival = CH_MIT_IVAL_CELLULAR;
1423 } else if (IFNET_IS_ETHERNET(ifp)) {
1424 na->na_ch_mit_ival = CH_MIT_IVAL_ETHERNET;
1425 } else {
1426 na->na_ch_mit_ival = CH_MIT_IVAL_DEFAULT;
1427 }
1428 }
1429 }
1430
1431done:
1432 SK_DF(err ? SK_VERB_ERROR : SK_VERB_FSW,
1433 "fsw 0x%llx nx_port %d vpna_pid %d vpna_pid_bound %u mit_ival %llu "
1434 "(err %d)", SK_KVA(fsw), (int)vpna->vpna_nx_port, vpna->vpna_pid,
1435 vpna->vpna_pid_bound, vpna->vpna_up.na_ch_mit_ival, err);
1436
1437 return err;
1438}
1439
1440static bool
1441fsw_port_dtor(struct nx_flowswitch *fsw, const struct nexus_vp_adapter *vpna)
1442{
1443 struct flow_mgr *fm = fsw->fsw_flow_mgr;
1444 nexus_port_t nx_port = vpna->vpna_nx_port;
1445 uint32_t purge_cnt;
1446
1447 ASSERT(fsw == vpna->vpna_fsw);
1448 ASSERT(nx_port != NEXUS_PORT_ANY);
1449
1450 /*
1451 * If this nexus port was bound to a PID, we just need to look at a
1452 * single bucket and iterate from there. Note that in any case, we
1453 * can't just search for a single flow_owner based on the PID itself,
1454 * since a given process may be opening multiple channels to the
1455 * flowswitch; hence we search for the ones matching this nexus port.
1456 *
1457 * Close any open flows on the port and remove the flow owner and
1458 * nexus port binding.
1459 */
1460 purge_cnt = flow_owner_detach_nexus_port(fm, vpna->vpna_pid_bound,
1461 vpna->vpna_pid, nx_port, FALSE);
1462
1463 SK_DF(SK_VERB_FSW,
1464 "fsw 0x%llx nx_port %d pid %d pid_bound %u defunct %u "
1465 "purged %u", SK_KVA(fsw), (int)nx_port,
1466 vpna->vpna_pid, vpna->vpna_pid_bound, vpna->vpna_defunct,
1467 purge_cnt);
1468
1469 return purge_cnt != 0;
1470}
1471
1472/*
1473 * Flowswitch nexus port allocator.
1474 *
1475 * A nexus port is represented by a bit in the port bitmap; its state is
1476 * either free or allocated. A free state implies that the port has no
1477 * nxbind AND no nexus adapter association. An allocated state means that
1478 * either it has a nxbind OR a nexus adapter assocation. This routine
1479 * manages the nexus adapter association with a nexus port; nxbind is
1480 * handled separately via nx_fsw_port_bind().
1481 *
1482 * The caller of this routine may optionally pass in a NULL nexus adapter.
1483 * In such a case (*vpna is NULL), this routine checks to see if the port
1484 * has already been associated with an adapter, and returns a reference to
1485 * that adapter. No action is taken on a port that doesn't have an adapter
1486 * associated. Otherwise (*vpna is non-NULL), this routine associates that
1487 * adapter with a port that's not already associated with one; the reference
1488 * to the adapter is untouched here, as the caller is expected to handle it.
1489 *
1490 * The flowswitch code invokes this routine each time it is requested to
1491 * find an adapter via nx_fsw_na_find(). The counterpart of this routine,
1492 * nx_fsw_port_free(), is only executed ONCE by the adapter's destructor.
1493 * This allows for multiple channels to be opened to a nexus port, each
1494 * time holding a reference to that same nexus adapter. The releasing of
1495 * the nexus port only happens when the last channel closes.
1496 */
1497static int
1498fsw_port_alloc__(struct nx_flowswitch *fsw, struct nxbind *nxb,
1499 struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p)
1500{
1501 struct kern_nexus *nx = fsw->fsw_nx;
1502 boolean_t refonly = FALSE;
1503 int error = 0;
1504
1505 FSW_WLOCK_ASSERT_HELD(fsw);
1506
1507 error = nx_port_alloc(nx, nx_port, nxb, (struct nexus_adapter **)vpna, p);
1508 if (error == 0 && *vpna != NULL && !refonly) {
1509 /* initialize the nexus port and the adapter occupying it */
1510 (*vpna)->vpna_fsw = fsw;
1511 (*vpna)->vpna_nx_port = nx_port;
1512 (*vpna)->vpna_pid = proc_pid(p);
1513 if (nxb != NULL && (nxb->nxb_flags & NXBF_MATCH_UNIQUEID)) {
1514 ASSERT((*vpna)->vpna_pid == nxb->nxb_pid);
1515 (*vpna)->vpna_pid_bound = TRUE;
1516 } else {
1517 (*vpna)->vpna_pid_bound = FALSE;
1518 }
1519
1520 error = fsw_port_ctor(fsw, vpna: *vpna, nxb);
1521 if (error != 0) {
1522 fsw_port_free(fsw, vpna: (*vpna),
1523 nx_port: (*vpna)->vpna_nx_port, FALSE);
1524 }
1525 }
1526
1527#if SK_LOG
1528 if (*vpna != NULL) {
1529 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
1530 "+++ vpna \"%s\" (0x%llx) <-> fsw 0x%llx "
1531 "%sport %d refonly %u (err %d)",
1532 (*vpna)->vpna_up.na_name, SK_KVA(*vpna), SK_KVA(fsw),
1533 nx_fsw_dom_port_is_reserved(nx, nx_port) ?
1534 "[reserved] " : "", (int)nx_port, refonly, error);
1535 } else {
1536 SK_DF(error ? SK_VERB_ERROR : SK_VERB_FSW,
1537 "+++ fsw 0x%llx nx_port %d refonly %u "
1538 "(err %d)", SK_KVA(fsw), (int)nx_port, refonly, error);
1539 }
1540#endif /* SK_LOG */
1541
1542 return error;
1543}
1544
1545int
1546fsw_port_alloc(struct nx_flowswitch *fsw, struct nxbind *nxb,
1547 struct nexus_vp_adapter **vpna, nexus_port_t nx_port, struct proc *p,
1548 boolean_t ifattach, boolean_t host)
1549{
1550 int err = 0;
1551
1552 FSW_WLOCK_ASSERT_HELD(fsw);
1553
1554 if (ifattach) {
1555 /* override port to either NX_FSW_{HOST,DEV} */
1556 nx_port = (host ? FSW_VP_HOST : FSW_VP_DEV);
1557 /* allocate reserved port for ifattach */
1558 err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1559 } else if (host) {
1560 /* host is valid only for ifattach */
1561 err = EINVAL;
1562 } else {
1563 /* nexus port otherwise (reserve dev and host for ifattach) */
1564 err = fsw_port_alloc__(fsw, nxb, vpna, nx_port, p);
1565 }
1566
1567 return err;
1568}
1569
1570/*
1571 * Remove nexus port association from a nexus adapter. This call is
1572 * the opposite of fsw_port_alloc(), except that it is called only
1573 * at nx_fsw_vp_na_dtor() destructor time. See above notes
1574 * on fsw_port_alloc().
1575 */
1576void
1577fsw_port_free(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna,
1578 nexus_port_t nx_port, boolean_t defunct)
1579{
1580 struct kern_nexus *nx = fsw->fsw_nx;
1581
1582 FSW_WLOCK_ASSERT_HELD(fsw);
1583 ASSERT(vpna->vpna_fsw == fsw);
1584
1585 if (defunct) {
1586 vpna->vpna_defunct = TRUE;
1587 nx_port_defunct(nx, nx_port);
1588 }
1589
1590 bool destroyed = fsw_port_dtor(fsw, vpna);
1591 if (destroyed) {
1592 /*
1593 * If the extension's destructor no longer needs to be
1594 * bound to any channel client, release the binding.
1595 */
1596 nx_port_unbind(nx, nx_port);
1597 }
1598
1599 /*
1600 * If this is a defunct, then stop here as the port is still
1601 * occupied by the channel. We'll come here again later when
1602 * the actual close happens.
1603 */
1604 if (defunct) {
1605 return;
1606 }
1607
1608 SK_DF(SK_VERB_FSW, "--- vpna \"%s\" (0x%llx) -!- fsw 0x%llx "
1609 "nx_port %d defunct %u", vpna->vpna_up.na_name, SK_KVA(vpna),
1610 SK_KVA(fsw), (int)nx_port, vpna->vpna_defunct);
1611
1612 nx_port_free(nx, nx_port);
1613 vpna->vpna_fsw = NULL;
1614 vpna->vpna_nx_port = NEXUS_PORT_ANY;
1615 vpna->vpna_pid_bound = FALSE;
1616 vpna->vpna_pid = -1;
1617 vpna->vpna_defunct = FALSE;
1618}
1619
1620int
1621fsw_port_na_activate(struct nx_flowswitch *fsw,
1622 struct nexus_vp_adapter *vpna, na_activate_mode_t mode)
1623{
1624 struct flow_mgr *fm = fsw->fsw_flow_mgr;
1625 uint32_t fo_cnt = 0;
1626
1627 SK_LOCK_ASSERT_HELD();
1628
1629 /* The following code relies on the static value asserted below */
1630 _CASSERT(FSW_VP_DEV == 0);
1631 _CASSERT(FSW_VP_HOST == 1);
1632
1633 ASSERT(NA_IS_ACTIVE(&vpna->vpna_up));
1634 ASSERT(vpna->vpna_nx_port != NEXUS_PORT_ANY);
1635
1636 switch (mode) {
1637 case NA_ACTIVATE_MODE_ON:
1638 break;
1639
1640 case NA_ACTIVATE_MODE_DEFUNCT:
1641 break;
1642
1643 case NA_ACTIVATE_MODE_OFF:
1644 break;
1645
1646 default:
1647 VERIFY(0);
1648 /* NOTREACHED */
1649 __builtin_unreachable();
1650 }
1651
1652 /* nothing further to do for special ports */
1653 if (vpna->vpna_nx_port < FSW_VP_USER_MIN) {
1654 goto done;
1655 }
1656
1657 /* activate any flow owner related resources (e.g. flowadv), if any */
1658 fo_cnt = flow_owner_activate_nexus_port(fm, vpna->vpna_pid_bound,
1659 vpna->vpna_pid, vpna->vpna_nx_port, &vpna->vpna_up, mode);
1660
1661done:
1662 SK_DF(SK_VERB_FSW,
1663 "fsw 0x%llx %s nx_port %d vpna_pid %d vpna_pid_bound %u fo_cnt %u",
1664 SK_KVA(fsw), na_activate_mode2str(mode), (int)vpna->vpna_nx_port,
1665 vpna->vpna_pid, vpna->vpna_pid_bound, fo_cnt);
1666
1667 return 0;
1668}
1669
1670int
1671fsw_port_na_defunct(struct nx_flowswitch *fsw, struct nexus_vp_adapter *vpna)
1672{
1673 int err = 0;
1674
1675 SK_LOCK_ASSERT_HELD();
1676 ASSERT(vpna->vpna_nx_port >= FSW_VP_USER_MIN);
1677
1678 /*
1679 * During defunct, we want to purge all flows associated to this
1680 * port and the flow owner as well. This is accomplished as part
1681 * of calling the port's destructor. However, we still want to
1682 * occupy the nexus port since there's a channel open to it.
1683 */
1684 FSW_WLOCK(fsw);
1685 if (!vpna->vpna_defunct) {
1686 fsw_port_free(fsw, vpna, nx_port: vpna->vpna_nx_port, TRUE);
1687 } else {
1688 err = EALREADY;
1689 }
1690 FSW_WUNLOCK(fsw);
1691
1692 return err;
1693}
1694
1695static size_t
1696fsw_mib_get_flow(struct nx_flowswitch *fsw,
1697 struct nexus_mib_filter *filter, void *out, size_t len)
1698{
1699 struct flow_mgr *fm = fsw->fsw_flow_mgr;
1700 size_t sf_size = sizeof(struct sk_stats_flow);
1701 __block size_t actual_space = 0;
1702 __block struct sk_stats_flow *sf = out;
1703 struct flow_entry *fe;
1704
1705 FSW_LOCK_ASSERT_HELD(fsw);
1706
1707 if (filter->nmf_bitmap & NXMIB_FILTER_FLOW_ID) {
1708 fe = flow_mgr_get_fe_by_uuid_rlock(fm, filter->nmf_flow_id);
1709 if (fe != NULL) {
1710 if (out != NULL && len >= sf_size) {
1711 flow_entry_stats_get(fe, sf);
1712 }
1713
1714 flow_entry_release(pfe: &fe);
1715 return sf_size;
1716 }
1717 return 0;
1718 } else if (filter->nmf_bitmap & NXMIB_FILTER_INFO_TUPLE) {
1719 struct info_tuple *itpl = &filter->nmf_info_tuple;
1720 struct flow_key fk;
1721 bzero(s: &fk, n: sizeof(fk));
1722 if (itpl->itpl_local_sa.sa_family == AF_INET &&
1723 itpl->itpl_remote_sa.sa_family == AF_INET) {
1724 fk.fk_mask = FKMASK_5TUPLE;
1725 fk.fk_ipver = IPVERSION;
1726 fk.fk_proto = itpl->itpl_proto;
1727 fk.fk_src4 = itpl->itpl_local_sin.sin_addr;
1728 fk.fk_dst4 = itpl->itpl_remote_sin.sin_addr;
1729 fk.fk_sport = itpl->itpl_local_sin.sin_port;
1730 fk.fk_dport = itpl->itpl_remote_sin.sin_port;
1731 } else if (itpl->itpl_local_sa.sa_family == AF_INET6 &&
1732 itpl->itpl_remote_sa.sa_family == AF_INET6) {
1733 fk.fk_mask = FKMASK_5TUPLE;
1734 fk.fk_ipver = IPV6_VERSION;
1735 fk.fk_proto = itpl->itpl_proto;
1736 fk.fk_src6 = itpl->itpl_local_sin6.sin6_addr;
1737 fk.fk_dst6 = itpl->itpl_remote_sin6.sin6_addr;
1738 fk.fk_sport = itpl->itpl_local_sin6.sin6_port;
1739 fk.fk_dport = itpl->itpl_remote_sin6.sin6_port;
1740 } else {
1741 SK_ERR("invalid info tuple: local af %d remote af %d",
1742 itpl->itpl_local_sa.sa_family,
1743 itpl->itpl_remote_sa.sa_family);
1744 return 0;
1745 }
1746
1747 fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &fk);
1748 if (fe != NULL) {
1749 if (out != NULL && len >= sf_size) {
1750 flow_entry_stats_get(fe, sf);
1751 }
1752 flow_entry_release(pfe: &fe);
1753 return sf_size;
1754 }
1755 return 0;
1756 }
1757
1758 flow_mgr_foreach_flow(fm: fsw->fsw_flow_mgr, flow_handler: ^(struct flow_entry *_fe) {
1759 actual_space += sf_size;
1760
1761 if (out == NULL || actual_space > len) {
1762 return;
1763 }
1764
1765 flow_entry_stats_get(_fe, sf);
1766 sf++;
1767 });
1768
1769 /*
1770 * Also return the ones in deferred free list.
1771 */
1772 lck_mtx_lock(lck: &fsw->fsw_linger_lock);
1773 TAILQ_FOREACH(fe, &fsw->fsw_linger_head, fe_linger_link) {
1774 actual_space += sf_size;
1775 if (out == NULL || actual_space > len) {
1776 continue;
1777 }
1778
1779 flow_entry_stats_get(fe, sf);
1780 sf++;
1781 }
1782 lck_mtx_unlock(lck: &fsw->fsw_linger_lock);
1783
1784 return actual_space;
1785}
1786
1787static size_t
1788fsw_mib_get_flow_adv(struct nx_flowswitch *fsw,
1789 struct nexus_mib_filter *filter, void *out, size_t len)
1790{
1791#pragma unused(filter)
1792 uint32_t fae_idx;
1793 size_t actual_space = 0;
1794 struct kern_channel *ch = NULL;
1795 struct sk_stats_flow_adv *sfa = NULL;
1796 struct sk_stats_flow_adv_ent *sfae = NULL;
1797 struct __flowadv_entry *fae = NULL;
1798 size_t sfa_size = sizeof(struct sk_stats_flow_adv);
1799 size_t sfae_size = sizeof(struct sk_stats_flow_adv_ent);
1800 uint32_t max_flowadv =
1801 fsw->fsw_nx->nx_prov->nxprov_params->nxp_flowadv_max;
1802
1803 SK_LOCK_ASSERT_HELD();
1804
1805 sfa = out;
1806 /* copyout flow advisory table (allocated entries only) */
1807 STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
1808 struct skmem_arena *ar;
1809 struct skmem_arena_nexus *arn;
1810 struct nexus_adapter *na;
1811
1812 /* ch_lock isn't needed here since sk_lock is held */
1813 if ((ch->ch_flags & CHANF_CLOSING) ||
1814 (na = ch->ch_na) == NULL) {
1815 /* channel is closing */
1816 continue;
1817 }
1818
1819 ar = na->na_arena;
1820 arn = skmem_arena_nexus(ar);
1821
1822 AR_LOCK(ar);
1823 if (arn->arn_flowadv_obj == NULL) {
1824 ASSERT(ar->ar_flags & ARF_DEFUNCT);
1825 AR_UNLOCK(ar);
1826 continue;
1827 }
1828 actual_space += sfa_size;
1829 /* fill out flowadv_table info */
1830 if (out != NULL && actual_space <= len) {
1831 uuid_copy(dst: sfa->sfa_nx_uuid, src: fsw->fsw_nx->nx_uuid);
1832 (void) strlcpy(dst: sfa->sfa_if_name,
1833 src: fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
1834 sfa->sfa_owner_pid = ch->ch_pid;
1835 sfa->sfa_entries_count = 0;
1836 }
1837
1838 /* fill out flowadv_entries */
1839 sfae = &sfa->sfa_entries[0];
1840 for (fae_idx = 0; fae_idx < max_flowadv; fae_idx++) {
1841 fae = &arn->arn_flowadv_obj[fae_idx];
1842 if (!uuid_is_null(uu: fae->fae_id)) {
1843 actual_space += sfae_size;
1844 if (out == NULL || actual_space > len) {
1845 continue;
1846 }
1847
1848 /* fill out entry */
1849 uuid_copy(dst: sfae->sfae_flow_id, src: fae->fae_id);
1850 sfae->sfae_flags = fae->fae_flags;
1851 sfae++;
1852 sfa->sfa_entries_count++;
1853 }
1854 }
1855 sfa = (struct sk_stats_flow_adv *)
1856 ((uintptr_t)out + actual_space);
1857 AR_UNLOCK(ar);
1858 }
1859
1860 return actual_space;
1861}
1862
1863static inline void
1864fsw_fo2sfo(struct nx_flowswitch *fsw, struct flow_owner *fo,
1865 struct sk_stats_flow_owner *sfo)
1866{
1867 struct flow_mgr *fm = fsw->fsw_flow_mgr;
1868
1869 uuid_copy(dst: sfo->sfo_nx_uuid, src: fsw->fsw_nx->nx_uuid);
1870 (void) strlcpy(dst: sfo->sfo_if_name, src: fsw->fsw_flow_mgr->fm_name,
1871 IFNAMSIZ);
1872 sfo->sfo_bucket_idx = flow_mgr_get_fob_idx(fm, FO_BUCKET(fo));
1873
1874 (void) snprintf(sfo->sfo_name, count: sizeof(sfo->sfo_name), "%s",
1875 fo->fo_name);
1876 sfo->sfo_pid = fo->fo_pid;
1877 sfo->sfo_nx_port = fo->fo_nx_port;
1878 sfo->sfo_nx_port_pid_bound = fo->fo_nx_port_pid_bound;
1879 sfo->sfo_nx_port_destroyed = fo->fo_nx_port_destroyed;
1880}
1881
1882static size_t
1883fsw_mib_get_flow_owner(struct nx_flowswitch *fsw,
1884 struct nexus_mib_filter *filter, void *out, size_t len)
1885{
1886#pragma unused(filter)
1887 uint32_t i;
1888 size_t actual_space = 0;
1889 struct flow_mgr *fm = fsw->fsw_flow_mgr;
1890 struct sk_stats_flow_owner *sfo = out;
1891 size_t sfo_size = sizeof(struct sk_stats_flow_owner);
1892 struct flow_owner *fo;
1893
1894 FSW_LOCK_ASSERT_HELD(fsw);
1895
1896 /*
1897 * Ideally we'd like to hide the bucket level details from flow library
1898 * user, but there is no simple way to iterate flow_owner with
1899 * buckets/RB_TREE nested. So keep it as is.
1900 */
1901 for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
1902 struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, idx: i);
1903 FOB_LOCK(fob);
1904 RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
1905 actual_space += sfo_size;
1906 if (out == NULL || actual_space > len) {
1907 continue;
1908 }
1909
1910 fsw_fo2sfo(fsw, fo, sfo);
1911 sfo++;
1912 }
1913 FOB_UNLOCK(fob);
1914 }
1915
1916 return actual_space;
1917}

static inline void
fsw_fr2sfr(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct sk_stats_flow_route *sfr, boolean_t ll_scrub)
{
	uuid_copy(sfr->sfr_nx_uuid, fsw->fsw_nx->nx_uuid);
	uuid_copy(sfr->sfr_uuid, fr->fr_uuid);
	(void) strlcpy(sfr->sfr_if_name, fsw->fsw_flow_mgr->fm_name,
	    IFNAMSIZ);

	sfr->sfr_bucket_idx = fr->fr_frb->frb_idx;
	sfr->sfr_id_bucket_idx = fr->fr_frib->frib_idx;

	if (fr->fr_flags & FLOWRTF_ATTACHED) {
		sfr->sfr_flags |= SFLOWRTF_ATTACHED;
	}
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		sfr->sfr_flags |= SFLOWRTF_ONLINK;
	}
	if (fr->fr_flags & FLOWRTF_GATEWAY) {
		sfr->sfr_flags |= SFLOWRTF_GATEWAY;
	}
	if (fr->fr_flags & FLOWRTF_RESOLVED) {
		sfr->sfr_flags |= SFLOWRTF_RESOLVED;
	}
	if (fr->fr_flags & FLOWRTF_HAS_LLINFO) {
		sfr->sfr_flags |= SFLOWRTF_HAS_LLINFO;
	}
	if (fr->fr_flags & FLOWRTF_DELETED) {
		sfr->sfr_flags |= SFLOWRTF_DELETED;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_MCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_MCAST;
	}
	if (fr->fr_flags & FLOWRTF_DST_LL_BCAST) {
		sfr->sfr_flags |= SFLOWRTF_DST_LL_BCAST;
	}

	lck_spin_lock(&fr->fr_reflock);
	ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
	sfr->sfr_usecnt = fr->fr_usecnt - FLOW_ROUTE_MINREF;
	if (fr->fr_expire != 0) {
		sfr->sfr_expire = (int64_t)(fr->fr_expire - net_uptime());
	} else {
		sfr->sfr_expire = 0;
	}
	lck_spin_unlock(&fr->fr_reflock);

	sfr->sfr_laddr = fr->fr_laddr;
	sfr->sfr_faddr = fr->fr_faddr;
	sfr->sfr_gaddr = fr->fr_gaddr;

	if (ll_scrub) {
		static const uint8_t unspec[ETHER_ADDR_LEN] = {[0] = 2 };
		bcopy(&unspec, &sfr->sfr_ether_dhost, ETHER_ADDR_LEN);
	} else {
		bcopy(&fr->fr_eth.ether_dhost, &sfr->sfr_ether_dhost,
		    ETHER_ADDR_LEN);
	}
}

#if CONFIG_MACF
extern int dlil_lladdr_ckreq;
#endif /* CONFIG_MACF */

static size_t
fsw_mib_get_flow_route(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len, struct proc *p)
{
#pragma unused(filter)
	uint32_t i;
	size_t actual_space = 0;
	struct flow_mgr *fm = fsw->fsw_flow_mgr;
	struct sk_stats_flow_route *sfr = out;
	size_t sfr_size = sizeof(struct sk_stats_flow_route);
	struct flow_route *fr;
	boolean_t ll_scrub;

	FSW_LOCK_ASSERT_HELD(fsw);

	/*
	 * To get the link-layer info, the caller must have the following
	 * in their sandbox profile (or not be sandboxed at all), else we
	 * scrub it clean just like dlil_ifaddr_bytes() does:
	 *
	 * (allow system-info (info-type "net.link.addr"))
	 *
	 * If scrubbed, we return 02:00:00:00:00:00.
	 */
#if CONFIG_MACF
	ll_scrub = (dlil_lladdr_ckreq &&
	    skywalk_mac_system_check_proc_cred(p, "net.link.addr") != 0);
#else /* !CONFIG_MACF */
	ll_scrub = FALSE;
#endif /* !CONFIG_MACF */

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		FRB_RLOCK(frb);
		RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
			actual_space += sfr_size;
			if (out == NULL || actual_space > len) {
				continue;
			}

			fsw_fr2sfr(fsw, fr, sfr, ll_scrub);
			sfr++;
		}
		FRB_UNLOCK(frb);
	}

	return actual_space;
}

static inline void
fsw_nxs2nus(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
    pid_t pid, struct __nx_stats_fsw *nxs, struct sk_stats_userstack *sus)
{
	uuid_copy(sus->sus_nx_uuid, fsw->fsw_nx->nx_uuid);
	(void) strlcpy(sus->sus_if_name, fsw->fsw_flow_mgr->fm_name,
	    IFNAMSIZ);
	sus->sus_owner_pid = pid;

	if (filter->nmf_type & NXMIB_IP_STATS) {
		sus->sus_ip = nxs->nxs_ipstat;
	}

	if (filter->nmf_type & NXMIB_IP6_STATS) {
		sus->sus_ip6 = nxs->nxs_ip6stat;
	}

	if (filter->nmf_type & NXMIB_TCP_STATS) {
		sus->sus_tcp = nxs->nxs_tcpstat;
	}

	if (filter->nmf_type & NXMIB_UDP_STATS) {
		sus->sus_udp = nxs->nxs_udpstat;
	}

	if (filter->nmf_type & NXMIB_QUIC_STATS) {
		sus->sus_quic = nxs->nxs_quicstat;
	}
}

static size_t
fsw_mib_get_userstack_stats(struct nx_flowswitch *fsw,
    struct nexus_mib_filter *filter, void *out, size_t len)
{
	size_t actual_space = 0;
	struct kern_channel *ch;
	struct __nx_stats_fsw *nxs;
	struct sk_stats_userstack *sus = out;
	size_t sus_size = sizeof(struct sk_stats_userstack);

	SK_LOCK_ASSERT_HELD();

	/* copyout saved stats from closed ports */
	if (((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
	    (filter->nmf_pid == 0)) ||
	    !(filter->nmf_bitmap & NXMIB_FILTER_PID)) {
		actual_space += sus_size;
		if (out != NULL && actual_space <= len) {
			nxs = fsw->fsw_closed_na_stats;
			fsw_nxs2nus(fsw, filter, 0, nxs, sus);
			sus++;
		}
	}

	/*
	 * XXX Currently a process opens only one channel to the nexus, so
	 * we don't do per-process aggregation of inet stats here; doing so
	 * would require a fair amount of additional code.
	 */
	/* copyout per process stats */
	STAILQ_FOREACH(ch, &fsw->fsw_nx->nx_ch_head, ch_link) {
		struct skmem_arena *ar;
		struct nexus_adapter *na;

		/* ch_lock isn't needed here since sk_lock is held */
		if ((ch->ch_flags & CHANF_CLOSING) ||
		    (na = ch->ch_na) == NULL) {
			/* channel is closing */
			continue;
		}

		if ((filter->nmf_bitmap & NXMIB_FILTER_PID) &&
		    filter->nmf_pid != ch->ch_pid) {
			continue;
		}

		ar = na->na_arena;

		AR_LOCK(ar);
		nxs = skmem_arena_nexus(ar)->arn_stats_obj;
		if (nxs == NULL) {
			ASSERT(ar->ar_flags & ARF_DEFUNCT);
			AR_UNLOCK(ar);
			continue;
		}

		actual_space += sus_size;
		if (out == NULL || actual_space > len) {
			AR_UNLOCK(ar);
			continue;
		}

		fsw_nxs2nus(fsw, filter, ch->ch_pid, nxs, sus);
		sus++;
		AR_UNLOCK(ar);
	}

	return actual_space;
}

static size_t
fsw_mib_get_stats(struct nx_flowswitch *fsw, void *out, size_t len)
{
	struct sk_stats_flow_switch *sfs = out;
	size_t actual_space = sizeof(struct sk_stats_flow_switch);

	if (out != NULL && actual_space <= len) {
		uuid_copy(sfs->sfs_nx_uuid, fsw->fsw_nx->nx_uuid);
		(void) strlcpy(sfs->sfs_if_name,
		    fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
		sfs->sfs_fsws = fsw->fsw_stats;
	}

	return actual_space;
}

size_t
fsw_mib_get(struct nx_flowswitch *fsw, struct nexus_mib_filter *filter,
    void *out, size_t len, struct proc *p)
{
	size_t ret;

	switch (filter->nmf_type) {
	case NXMIB_FSW_STATS:
		ret = fsw_mib_get_stats(fsw, out, len);
		break;
	case NXMIB_FLOW:
		ret = fsw_mib_get_flow(fsw, filter, out, len);
		break;
	case NXMIB_FLOW_OWNER:
		ret = fsw_mib_get_flow_owner(fsw, filter, out, len);
		break;
	case NXMIB_FLOW_ROUTE:
		ret = fsw_mib_get_flow_route(fsw, filter, out, len, p);
		break;
	case NXMIB_TCP_STATS:
	case NXMIB_UDP_STATS:
	case NXMIB_IP_STATS:
	case NXMIB_IP6_STATS:
	case NXMIB_USERSTACK_STATS:
		ret = fsw_mib_get_userstack_stats(fsw, filter, out, len);
		break;
	case NXMIB_FLOW_ADV:
		ret = fsw_mib_get_flow_adv(fsw, filter, out, len);
		break;
	default:
		ret = 0;
		break;
	}

	return ret;
}
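
/*
 * Note on the calling convention implemented by the getters above: each
 * routine always returns the size in bytes that a complete report would
 * occupy, and copies out only those records that fit entirely within
 * `len' when `out' is non-NULL.  A caller can therefore do a sizing pass
 * followed by a fill pass.  Minimal sketch only; the buffer management
 * and locking details here are illustrative rather than how the MIB
 * caller necessarily does it, and the locks required by the getters'
 * asserts are assumed to be held:
 *
 *	size_t need = fsw_mib_get(fsw, filter, NULL, 0, p);
 *	if (need != 0) {
 *		void *buf = kalloc_data(need, Z_WAITOK | Z_ZERO);
 *		if (buf != NULL) {
 *			size_t used = fsw_mib_get(fsw, filter, buf, need, p);
 *			// consume MIN(used, need) bytes of the report
 *			kfree_data(buf, need);
 *		}
 *	}
 */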

void
fsw_fold_stats(struct nx_flowswitch *fsw,
    void *data, nexus_stats_type_t type)
{
	ASSERT(data != NULL);
	FSW_LOCK_ASSERT_HELD(fsw);

	switch (type) {
	case NEXUS_STATS_TYPE_FSW:
	{
		struct __nx_stats_fsw *d, *s;
		d = fsw->fsw_closed_na_stats;
		s = data;
		ip_stats_fold(&d->nxs_ipstat, &s->nxs_ipstat);
		ip6_stats_fold(&d->nxs_ip6stat, &s->nxs_ip6stat);
		tcp_stats_fold(&d->nxs_tcpstat, &s->nxs_tcpstat);
		udp_stats_fold(&d->nxs_udpstat, &s->nxs_udpstat);
		quic_stats_fold(&d->nxs_quicstat, &s->nxs_quicstat);
		break;
	}
	case NEXUS_STATS_TYPE_CHAN_ERRORS:
	{
		struct __nx_stats_channel_errors *s = data;
		fsw_vp_channel_error_stats_fold(&fsw->fsw_stats, s);
		break;
	}
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}
}

boolean_t
fsw_detach_barrier_add(struct nx_flowswitch *fsw)
{
	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
	if (__improbable(fsw->fsw_detach_flags != 0 ||
	    fsw->fsw_ifp == NULL || fsw->fsw_agent_session == NULL)) {
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return FALSE;
	}
	fsw->fsw_detach_barriers++;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);

	return TRUE;
}

void
fsw_detach_barrier_remove(struct nx_flowswitch *fsw)
{
	lck_mtx_lock_spin(&fsw->fsw_detach_barrier_lock);
	ASSERT((fsw->fsw_detach_flags & FSW_DETACHF_DETACHED) == 0);
	ASSERT(fsw->fsw_detach_barriers != 0);
	fsw->fsw_detach_barriers--;
	/* if there's a thread waiting to detach the interface, let it know */
	if (__improbable((fsw->fsw_detach_waiters > 0) &&
	    (fsw->fsw_detach_barriers == 0))) {
		fsw->fsw_detach_waiters = 0;
		wakeup(&fsw->fsw_detach_waiters);
	}
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
}
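
/*
 * Usage sketch for the detach barrier above (illustrative only; the
 * hypothetical do_work_with_ifp() stands in for any code that needs
 * fsw->fsw_ifp or the netagent session to stay attached while it runs):
 *
 *	if (!fsw_detach_barrier_add(fsw)) {
 *		return ENXIO;	// interface is detaching or already gone
 *	}
 *	do_work_with_ifp(fsw->fsw_ifp);
 *	fsw_detach_barrier_remove(fsw);
 *
 * fsw_detach() below waits for fsw_detach_barriers to drain before it
 * tears the interface down, so the attachment remains valid in between.
 */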

/*
 * Generic resolver for non-Ethernet interfaces.
 */
int
fsw_generic_resolve(struct nx_flowswitch *fsw, struct flow_route *fr,
    struct __kern_packet *pkt)
{
#pragma unused(pkt)
#if SK_LOG
	char dst_s[MAX_IPv6_STR_LEN];
#endif /* SK_LOG */
	struct ifnet *ifp = fsw->fsw_ifp;
	struct rtentry *tgt_rt = NULL;
	int err = 0;

	ASSERT(fr != NULL);
	ASSERT(ifp != NULL);

	FR_LOCK(fr);
	/*
	 * If the destination is on-link, we use the final destination
	 * address as the target.  If it's off-link, we use the gateway
	 * address instead.  Point tgt_rt to the destination or gateway
	 * route accordingly.
	 */
	if (fr->fr_flags & FLOWRTF_ONLINK) {
		tgt_rt = fr->fr_rt_dst;
	} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
		tgt_rt = fr->fr_rt_gw;
	}

	/*
	 * Perform another routing table lookup if necessary.
	 */
	if (tgt_rt == NULL || !(tgt_rt->rt_flags & RTF_UP) ||
	    fr->fr_want_configure) {
		if (fr->fr_want_configure == 0) {
			os_atomic_inc(&fr->fr_want_configure, relaxed);
		}
		err = flow_route_configure(fr, ifp, NULL);
		if (err != 0) {
			SK_ERR("failed to configure route to %s on %s (err %d)",
			    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
			    sizeof(dst_s)), ifp->if_xname, err);
			goto done;
		}

		/* refresh pointers */
		if (fr->fr_flags & FLOWRTF_ONLINK) {
			tgt_rt = fr->fr_rt_dst;
		} else if (fr->fr_flags & FLOWRTF_GATEWAY) {
			tgt_rt = fr->fr_rt_gw;
		}
	}

	if (__improbable(!(fr->fr_flags & (FLOWRTF_ONLINK | FLOWRTF_GATEWAY)))) {
		err = EHOSTUNREACH;
		SK_ERR("invalid route for %s on %s (err %d)",
		    sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
		    sizeof(dst_s)), ifp->if_xname, err);
		goto done;
	}

	ASSERT(tgt_rt != NULL);

done:
	if (__probable(err == 0)) {
		/*
		 * There's no actual resolution taking place here, so just
		 * mark it with FLOWRTF_RESOLVED for consistency.
		 */
		os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		os_atomic_store(&fr->fr_want_probe, 0, release);
	} else {
		os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
		flow_route_cleanup(fr);
	}
	FR_UNLOCK(fr);

	return err;
}

static void
fsw_read_boot_args(void)
{
	(void) PE_parse_boot_argn("fsw_use_dual_sized_pool",
	    &fsw_use_dual_sized_pool, sizeof(fsw_use_dual_sized_pool));
}
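
/*
 * Illustration only: like other PE_parse_boot_argn() consumers, the
 * default value of fsw_use_dual_sized_pool can be overridden from the
 * boot-args.  On platforms where boot-args are writable that would look
 * something like:
 *
 *	nvram boot-args="fsw_use_dual_sized_pool=0"
 *
 * followed by a reboot; whether the override is permitted, and the exact
 * mechanism, vary by platform and security configuration.
 */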

void
fsw_init(void)
{
	_CASSERT(NX_FSW_CHUNK_FREE == (uint64_t)-1);
	_CASSERT(PKT_MAX_PROTO_HEADER_SIZE <= NX_FSW_MINBUFSIZE);

	if (!__nx_fsw_inited) {
		fsw_read_boot_args();
		/*
		 * Register callbacks for interface & protocol events.
		 * Use a dummy arg for the callback cookie.
		 */
		__nx_fsw_ifnet_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&ifnet_evhdlr_ctxt,
		    ifnet_event, fsw_ifnet_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_ifnet_eventhandler_tag != NULL);

		__nx_fsw_protoctl_eventhandler_tag =
		    EVENTHANDLER_REGISTER(&protoctl_evhdlr_ctxt,
		    protoctl_event, fsw_protoctl_event_callback,
		    eventhandler_entry_dummy_arg, EVENTHANDLER_PRI_ANY);
		VERIFY(__nx_fsw_protoctl_eventhandler_tag != NULL);
		__nx_fsw_inited = 1;
	}
}

void
fsw_uninit(void)
{
	if (__nx_fsw_inited) {
		EVENTHANDLER_DEREGISTER(&ifnet_evhdlr_ctxt, ifnet_event,
		    __nx_fsw_ifnet_eventhandler_tag);
		EVENTHANDLER_DEREGISTER(&protoctl_evhdlr_ctxt, protoctl_event,
		    __nx_fsw_protoctl_eventhandler_tag);

		__nx_fsw_inited = 0;
	}
}

struct nx_flowswitch *
fsw_alloc(zalloc_flags_t how)
{
	struct nx_flowswitch *fsw;
	struct __nx_stats_fsw *nsfw;

	SK_LOCK_ASSERT_HELD();

	nsfw = zalloc_flags(nx_fsw_stats_zone, how | Z_ZERO);
	if (nsfw == NULL) {
		return NULL;
	}

	fsw = zalloc_flags(nx_fsw_zone, how | Z_ZERO);
	if (fsw == NULL) {
		zfree(nx_fsw_stats_zone, nsfw);
		return NULL;
	}

	FSW_RWINIT(fsw);
	fsw->fsw_dev_ch = NULL;
	fsw->fsw_host_ch = NULL;
	fsw->fsw_closed_na_stats = nsfw;

	SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));

	return fsw;
}
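
/*
 * Lifecycle sketch (illustrative): fsw_alloc() and fsw_free() bracket the
 * life of an instance, and both expect sk_lock to be held by the caller,
 * as their asserts show.  Everything between the two calls (attaching to
 * a nexus, data path setup, and so on) is elided here, and the error
 * handling shown is only one plausible caller-side choice:
 *
 *	SK_LOCK();
 *	struct nx_flowswitch *fsw = fsw_alloc(Z_WAITOK);
 *	if (fsw == NULL) {
 *		SK_UNLOCK();
 *		return ENOMEM;
 *	}
 *	// ... attach to a nexus, set up the data path, use the instance ...
 *	fsw_free(fsw);
 *	SK_UNLOCK();
 */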

static int
fsw_detach(struct nx_flowswitch *fsw, struct nexus_adapter *hwna,
    boolean_t purge)
{
	struct kern_nexus_provider *nx_prov = fsw->fsw_nx->nx_prov;
	boolean_t do_dtor = FALSE;

	SK_LOCK_ASSERT_HELD();

	/*
	 * Return an error if the host port detach is in progress or has
	 * already completed.  In the flowswitch free case (i.e. purge is
	 * TRUE) we have to clean up everything, so block if needed.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	if (!purge && fsw->fsw_detach_flags != 0) {
		SK_ERR("fsw detaching");
		lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
		return EBUSY;
	}
	VERIFY(purge || fsw->fsw_detach_flags == 0);
	/*
	 * Mark the flowswitch as detaching and release sk_lock while
	 * waiting for other threads to exit.  Maintain lock/unlock
	 * ordering between the two locks.
	 */
	fsw->fsw_detach_flags |= FSW_DETACHF_DETACHING;
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_UNLOCK();

	/*
	 * Wait until all threads needing access to the flowswitch
	 * netagent get out, and mark this as detached to prevent
	 * further access requests from being admitted.
	 */
	lck_mtx_lock(&fsw->fsw_detach_barrier_lock);
	while (fsw->fsw_detach_barriers != 0) {
		fsw->fsw_detach_waiters++;
		(void) msleep(&fsw->fsw_detach_waiters,
		    &fsw->fsw_detach_barrier_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
	}
	VERIFY(fsw->fsw_detach_barriers == 0);
	VERIFY(fsw->fsw_detach_flags != 0);
	fsw->fsw_detach_flags &= ~FSW_DETACHF_DETACHING;
	/*
	 * If both the NA detach thread and the flowswitch free thread were
	 * waiting, the thread that wins the race is responsible for doing
	 * the dtor work.
	 */
	if (fsw->fsw_detach_flags == 0) {
		fsw->fsw_detach_flags |= FSW_DETACHF_DETACHED;
		do_dtor = TRUE;
	}
	VERIFY(fsw->fsw_detach_flags == FSW_DETACHF_DETACHED);
	lck_mtx_unlock(&fsw->fsw_detach_barrier_lock);
	SK_LOCK();

	FSW_WLOCK(fsw);
	if (do_dtor) {
		if (fsw->fsw_ifp != NULL) {
			fsw_teardown_ifp(fsw, hwna);
			ASSERT(fsw->fsw_ifp == NULL);
			ASSERT(fsw->fsw_nifna == NULL);
		}
		bzero(fsw->fsw_slla, sizeof(fsw->fsw_slla));
		nx_prov->nxprov_params->nxp_ifindex = 0;
		/* free any flow entries in the deferred list */
		fsw_linger_purge(fsw);
	}
	/*
	 * If we are destroying the instance, release the lock to let all
	 * outstanding agent threads enter, then wait until all of them
	 * have exited the critical section before continuing.
	 */
	if (purge) {
		FSW_UNLOCK(fsw);
		flow_mgr_terminate(fsw->fsw_flow_mgr);
		FSW_WLOCK(fsw);
	}
	FSW_WUNLOCK(fsw);
	return 0;
}

void
fsw_free(struct nx_flowswitch *fsw)
{
	int err;

	SK_LOCK_ASSERT_HELD();
	ASSERT(fsw != NULL);

	err = fsw_detach(fsw, NULL, TRUE);
	VERIFY(err == 0);

	fsw_dp_dtor(fsw);

	ASSERT(fsw->fsw_dev_ch == NULL);
	ASSERT(fsw->fsw_host_ch == NULL);
	ASSERT(fsw->fsw_closed_na_stats != NULL);
	zfree(nx_fsw_stats_zone, fsw->fsw_closed_na_stats);
	fsw->fsw_closed_na_stats = NULL;
	FSW_RWDESTROY(fsw);

	SK_DF(SK_VERB_MEM, "fsw 0x%llx FREE", SK_KVA(fsw));
	zfree(nx_fsw_zone, fsw);
}