1/*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * The netif nexus domain has two domain providers: native and compat, with
31 * the latter being the default provider of this domain. The compat provider
32 * has special handlers for NXCFG_CMD_ATTACH and NXCFG_CMD_DETACH, etc.
33 *
 * A netif nexus instance can be in native or compat mode; in either case,
 * it is associated with two instances of a nexus_adapter structure, and allows
 * at most two channels to be opened to the nexus. The two adapters correspond
 * to the host and device ports, respectively.
38 *
39 * By itself, a netif nexus isn't associated with a network interface. The
40 * association happens by attaching a network interface to the nexus instance.
41 * A channel can only be successfully opened to a netif nexus after it has an
42 * interface attached to it.
43 *
44 * During an attach, the interface is marked as Skywalk-capable, and its ifnet
45 * structure refers to the attached netif nexus adapter via its if_na field.
 * The adapter, in turn, holds a reference to the interface via its na_ifp
 * field. Note that attaching to a netif_compat nexus does not alter the
 * input/output data path, nor does it remove any of the interface's hardware
 * offload flags; it merely associates the interface and the netif nexus.
50 *
51 * During a detach, the above references are dropped and the fields are cleared;
52 * the interface is also marked as non-Skywalk-capable. This detach can happen
53 * explicitly via a command down the nexus, or implicitly when the nexus goes
54 * away (assuming there's no channel opened to it.)
55 *
 * A userland channel can be opened to a netif nexus via the usual ch_open()
 * path, assuming the nexus provider is set up to allow access for the userland
 * process (either by binding the nexus port to the process, e.g. by PID, or by
 * creating the nexus in anonymous mode.)
60 *
61 * Alternatively, a kernel channel can also be opened to it by some kernel
62 * subsystem, via ch_open_special(), e.g. by the flowswitch. Kernel channels
63 * don't have any task mapping created, and the flag CHANF_KERNEL is used to
64 * indicate that.
65 *
66 * Opening a channel to the host port of a native or compat netif causes the
67 * ifnet output path to be redirected to nx_netif_host_transmit(). We also,
68 * at present, disable any hardware offload features.
69 *
70 * Opening a channel to the device port of a compat netif causes the ifnet
71 * input path to be redirected to nx_netif_compat_receive(). This is specific
72 * to the compat variant, as the native variant's RX path already goes to
73 * the native netif.
74 *
75 * During channel close, we restore the original I/O callbacks, as well as the
76 * interface's offload flags.
77 */
78
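/*
 * Illustrative sketch of the attach-time invariants described above. This is
 * exposition only and is not compiled; the real work is done by
 * nx_netif_attach() / nx_netif_compat_attach() further below, and the names
 * ifp/nx here simply stand for the interface and nexus being attached:
 *
 *	// after a successful NXCFG_CMD_ATTACH:
 *	//  - the ifnet points back at the netif adapter via if_na
 *	//  - the dev adapter holds an I/O reference on the ifnet via na_ifp
 *	//  - the interface is marked Skywalk-capable
 *	ASSERT(ifp->if_na != NULL);
 *	ASSERT(nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV)->na_ifp == ifp);
 *
 * Detach (NXCFG_CMD_DETACH, or the nexus destructor) undoes the above,
 * provided no channel is currently opened to the nexus.
 */
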
79#include <skywalk/os_skywalk_private.h>
80#include <skywalk/nexus/netif/nx_netif.h>
81#include <skywalk/nexus/upipe/nx_user_pipe.h>
82#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
83#include <sys/kdebug.h>
84#include <sys/sdt.h>
85#include <os/refcnt.h>
86#include <libkern/OSDebug.h>
87
88#define NX_NETIF_MAXRINGS NX_MAX_NUM_RING_PAIR
89#define NX_NETIF_MINSLOTS 2 /* XXX same as above */
90#define NX_NETIF_MAXSLOTS NX_MAX_NUM_SLOT_PER_RING /* max # of slots */
91#define NX_NETIF_TXRINGSIZE 512 /* default TX ring size */
92#define NX_NETIF_RXRINGSIZE 1024 /* default RX ring size */
93#define NX_NETIF_BUFSIZE (2 * 1024) /* default buffer size */
94#define NX_NETIF_MINBUFSIZE (128) /* min buffer size */
95#define NX_NETIF_MAXBUFSIZE (32 * 1024) /* max buffer size */
96
97/*
98 * TODO: adi@apple.com -- minimum buflets for now; we will need to
99 * have a way to adjust this based on the underlying interface's
100 * parameters, e.g. jumbo MTU, large segment offload, etc.
101 */
102#define NX_NETIF_UMD_SIZE _USER_PACKET_SIZE(BUFLETS_MIN)
103#define NX_NETIF_KMD_SIZE _KERN_PACKET_SIZE(BUFLETS_MIN)
104
105/*
106 * minimum stack space required for IOSkywalkFamily and Driver execution.
107 */
108#if XNU_TARGET_OS_OSX
109#define NX_NETIF_MIN_DRIVER_STACK_SIZE (kernel_stack_size >> 1)
110#else /* !XNU_TARGET_OS_OSX */
111#define NX_NETIF_MIN_DRIVER_STACK_SIZE (kernel_stack_size >> 2)
112#endif /* XNU_TARGET_OS_OSX */
113
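/*
 * Hypothetical illustration of how such a threshold is typically consumed;
 * the actual stack-space enforcement lives in the callers that hand control
 * to IOSkywalkFamily/driver code, not here:
 *
 *	if (OSKernelStackRemaining() < NX_NETIF_MIN_DRIVER_STACK_SIZE) {
 *		// too little stack left; defer the call to a context
 *		// (e.g. a thread call) that starts with a fresh stack
 *	}
 */
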
114static void nx_netif_dom_init(struct nxdom *);
115static void nx_netif_dom_terminate(struct nxdom *);
116static void nx_netif_dom_fini(struct nxdom *);
117static int nx_netif_prov_params_adjust(
118 const struct kern_nexus_domain_provider *, const struct nxprov_params *,
119 struct nxprov_adjusted_params *);
120
121static int nx_netif_dom_bind_port(struct kern_nexus *, nexus_port_t *,
122 struct nxbind *, void *);
123static int nx_netif_dom_unbind_port(struct kern_nexus *, nexus_port_t);
124static int nx_netif_dom_connect(struct kern_nexus_domain_provider *,
125 struct kern_nexus *, struct kern_channel *, struct chreq *,
126 struct kern_channel *, struct nxbind *, struct proc *);
127static void nx_netif_dom_disconnect(struct kern_nexus_domain_provider *,
128 struct kern_nexus *, struct kern_channel *);
129static void nx_netif_dom_defunct(struct kern_nexus_domain_provider *,
130 struct kern_nexus *, struct kern_channel *, struct proc *);
131static void nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *,
132 struct kern_nexus *, struct kern_channel *, boolean_t);
133
134static void nx_netif_doorbell(struct ifnet *);
135static int nx_netif_na_txsync(struct __kern_channel_ring *, struct proc *,
136 uint32_t);
137static int nx_netif_na_rxsync(struct __kern_channel_ring *, struct proc *,
138 uint32_t);
139static void nx_netif_na_dtor(struct nexus_adapter *na);
140static int nx_netif_na_notify_tx(struct __kern_channel_ring *, struct proc *,
141 uint32_t);
142static int nx_netif_na_notify_rx(struct __kern_channel_ring *, struct proc *,
143 uint32_t);
144static int nx_netif_na_activate(struct nexus_adapter *, na_activate_mode_t);
145
146static int nx_netif_ctl(struct kern_nexus *, nxcfg_cmd_t, void *,
147 struct proc *);
148static int nx_netif_ctl_attach(struct kern_nexus *, struct nx_spec_req *,
149 struct proc *);
150static int nx_netif_ctl_detach(struct kern_nexus *, struct nx_spec_req *);
151static int nx_netif_attach(struct kern_nexus *, struct ifnet *);
152static void nx_netif_flags_init(struct nx_netif *);
153static void nx_netif_flags_fini(struct nx_netif *);
154static void nx_netif_callbacks_init(struct nx_netif *);
155static void nx_netif_callbacks_fini(struct nx_netif *);
156static void nx_netif_capabilities_fini(struct nx_netif *);
157static errno_t nx_netif_interface_advisory_notify(void *,
158 const struct ifnet_interface_advisory *);
159
160struct nxdom nx_netif_dom_s = {
161 .nxdom_prov_head =
162 STAILQ_HEAD_INITIALIZER(nx_netif_dom_s.nxdom_prov_head),
163 .nxdom_type = NEXUS_TYPE_NET_IF,
164 .nxdom_md_type = NEXUS_META_TYPE_PACKET,
165 .nxdom_md_subtype = NEXUS_META_SUBTYPE_RAW,
166 .nxdom_name = "netif",
167 .nxdom_ports = {
168 .nb_def = 2,
169 .nb_min = 2,
170 .nb_max = NX_NETIF_MAXPORTS,
171 },
172 .nxdom_tx_rings = {
173 .nb_def = 1,
174 .nb_min = 1,
175 .nb_max = NX_NETIF_MAXRINGS,
176 },
177 .nxdom_rx_rings = {
178 .nb_def = 1,
179 .nb_min = 1,
180 .nb_max = NX_NETIF_MAXRINGS,
181 },
182 .nxdom_tx_slots = {
183 .nb_def = NX_NETIF_TXRINGSIZE,
184 .nb_min = NX_NETIF_MINSLOTS,
185 .nb_max = NX_NETIF_MAXSLOTS,
186 },
187 .nxdom_rx_slots = {
188 .nb_def = NX_NETIF_RXRINGSIZE,
189 .nb_min = NX_NETIF_MINSLOTS,
190 .nb_max = NX_NETIF_MAXSLOTS,
191 },
192 .nxdom_buf_size = {
193 .nb_def = NX_NETIF_BUFSIZE,
194 .nb_min = NX_NETIF_MINBUFSIZE,
195 .nb_max = NX_NETIF_MAXBUFSIZE,
196 },
197 .nxdom_large_buf_size = {
198 .nb_def = 0,
199 .nb_min = 0,
200 .nb_max = 0,
201 },
202 .nxdom_meta_size = {
203 .nb_def = NX_NETIF_UMD_SIZE,
204 .nb_min = NX_NETIF_UMD_SIZE,
205 .nb_max = NX_METADATA_USR_MAX_SZ,
206 },
207 .nxdom_stats_size = {
208 .nb_def = 0,
209 .nb_min = 0,
210 .nb_max = NX_STATS_MAX_SZ,
211 },
212 .nxdom_pipes = {
213 .nb_def = 0,
214 .nb_min = 0,
215 .nb_max = NX_UPIPE_MAXPIPES,
216 },
217 .nxdom_flowadv_max = {
218 .nb_def = 0,
219 .nb_min = 0,
220 .nb_max = NX_FLOWADV_MAX,
221 },
222 .nxdom_nexusadv_size = {
223 .nb_def = 0,
224 .nb_min = 0,
225 .nb_max = NX_NEXUSADV_MAX_SZ,
226 },
227 .nxdom_capabilities = {
228 .nb_def = NXPCAP_USER_CHANNEL,
229 .nb_min = 0,
230 .nb_max = NXPCAP_USER_CHANNEL,
231 },
232 .nxdom_qmap = {
233 .nb_def = NEXUS_QMAP_TYPE_DEFAULT,
234 .nb_min = NEXUS_QMAP_TYPE_DEFAULT,
235 .nb_max = NEXUS_QMAP_TYPE_WMM,
236 },
237 .nxdom_max_frags = {
238 .nb_def = NX_PBUF_FRAGS_DEFAULT,
239 .nb_min = NX_PBUF_FRAGS_MIN,
240 .nb_max = NX_PBUF_FRAGS_MAX,
241 },
242 .nxdom_init = nx_netif_dom_init,
243 .nxdom_terminate = nx_netif_dom_terminate,
244 .nxdom_fini = nx_netif_dom_fini,
245 .nxdom_find_port = NULL,
246 .nxdom_port_is_reserved = NULL,
247 .nxdom_bind_port = nx_netif_dom_bind_port,
248 .nxdom_unbind_port = nx_netif_dom_unbind_port,
249 .nxdom_connect = nx_netif_dom_connect,
250 .nxdom_disconnect = nx_netif_dom_disconnect,
251 .nxdom_defunct = nx_netif_dom_defunct,
252 .nxdom_defunct_finalize = nx_netif_dom_defunct_finalize,
253};
254
255struct kern_nexus_domain_provider nx_netif_prov_s = {
256 .nxdom_prov_name = NEXUS_PROVIDER_NET_IF,
257 /*
258 * Don't install this as the default domain provider, i.e.
259 * NXDOMPROVF_DEFAULT flag not set; we want netif_compat
260 * provider to be the one handling userland-issued requests
261 * coming down thru nxprov_create() instead.
262 */
263 .nxdom_prov_flags = 0,
264 .nxdom_prov_cb = {
265 .dp_cb_init = nx_netif_prov_init,
266 .dp_cb_fini = nx_netif_prov_fini,
267 .dp_cb_params = nx_netif_prov_params,
268 .dp_cb_mem_new = nx_netif_prov_mem_new,
269 .dp_cb_config = nx_netif_prov_config,
270 .dp_cb_nx_ctor = nx_netif_prov_nx_ctor,
271 .dp_cb_nx_dtor = nx_netif_prov_nx_dtor,
272 .dp_cb_nx_mem_info = nx_netif_prov_nx_mem_info,
273 .dp_cb_nx_mib_get = nx_netif_prov_nx_mib_get,
274 .dp_cb_nx_stop = nx_netif_prov_nx_stop,
275 },
276};
277
278struct nexus_ifnet_ops na_netif_ops = {
279 .ni_finalize = na_netif_finalize,
280 .ni_reap = nx_netif_reap,
281 .ni_dequeue = nx_netif_native_tx_dequeue,
282 .ni_get_len = nx_netif_native_tx_get_len,
283};
284
285#define NX_NETIF_DOORBELL_MAX_DEQUEUE 64
286uint32_t nx_netif_doorbell_max_dequeue = NX_NETIF_DOORBELL_MAX_DEQUEUE;
287
288#define NQ_TRANSFER_DECAY 2 /* ilog2 of EWMA decay rate (4) */
289static uint32_t nq_transfer_decay = NQ_TRANSFER_DECAY;
290
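/*
 * nq_transfer_decay is the ilog2 of the EWMA weight, so the moving average
 * can be maintained with shifts only. A minimal sketch of the update this
 * implies (illustrative; the actual accounting lives in the netif queue
 * stats code):
 *
 *	avg += (sample - avg) >> nq_transfer_decay;
 */
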
291#define NQ_ACCUMULATE_INTERVAL 2 /* 2 seconds */
292static uint32_t nq_accumulate_interval = NQ_ACCUMULATE_INTERVAL;
293
294static uint32_t nq_stat_enable = 0;
295
296SYSCTL_EXTENSIBLE_NODE(_kern_skywalk, OID_AUTO, netif,
297 CTLFLAG_RW | CTLFLAG_LOCKED, 0, "Skywalk network interface");
298#if (DEVELOPMENT || DEBUG)
299SYSCTL_STRING(_kern_skywalk_netif, OID_AUTO, sk_ll_prefix,
300 CTLFLAG_RW | CTLFLAG_LOCKED, sk_ll_prefix, sizeof(sk_ll_prefix),
301 "ifname prefix for enabling low latency support");
302static uint32_t nx_netif_force_ifnet_start = 0;
303SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, force_ifnet_start,
304 CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_force_ifnet_start, 0,
305 "always use ifnet starter thread");
306SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, doorbell_max_dequeue,
307 CTLFLAG_RW | CTLFLAG_LOCKED, &nx_netif_doorbell_max_dequeue,
308 NX_NETIF_DOORBELL_MAX_DEQUEUE,
309 "max packets to dequeue in doorbell context");
310SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_transfer_decay,
311 CTLFLAG_RW | CTLFLAG_LOCKED, &nq_transfer_decay,
312 NQ_TRANSFER_DECAY, "ilog2 of EWMA decay rate of netif queue transfers");
313SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_accumulate_interval,
314 CTLFLAG_RW | CTLFLAG_LOCKED, &nq_accumulate_interval,
315 NQ_ACCUMULATE_INTERVAL, "accumulation interval for netif queue stats");
316#endif /* !DEVELOPMENT && !DEBUG */
317
318SYSCTL_UINT(_kern_skywalk_netif, OID_AUTO, netif_queue_stat_enable,
319 CTLFLAG_RW | CTLFLAG_LOCKED, &nq_stat_enable,
320 0, "enable/disable stats collection for netif queue");
321
322static SKMEM_TYPE_DEFINE(na_netif_zone, struct nexus_netif_adapter);
323
324static SKMEM_TYPE_DEFINE(nx_netif_zone, struct nx_netif);
325
326#define SKMEM_TAG_NETIF_MIT "com.apple.skywalk.netif.mit"
327static SKMEM_TAG_DEFINE(skmem_tag_netif_mit, SKMEM_TAG_NETIF_MIT);
328
329#define SKMEM_TAG_NETIF_FILTER "com.apple.skywalk.netif.filter"
330SKMEM_TAG_DEFINE(skmem_tag_netif_filter, SKMEM_TAG_NETIF_FILTER);
331
332#define SKMEM_TAG_NETIF_FLOW "com.apple.skywalk.netif.flow"
333SKMEM_TAG_DEFINE(skmem_tag_netif_flow, SKMEM_TAG_NETIF_FLOW);
334
335#define SKMEM_TAG_NETIF_AGENT_FLOW "com.apple.skywalk.netif.agent_flow"
336SKMEM_TAG_DEFINE(skmem_tag_netif_agent_flow, SKMEM_TAG_NETIF_AGENT_FLOW);
337
338#define SKMEM_TAG_NETIF_LLINK "com.apple.skywalk.netif.llink"
339SKMEM_TAG_DEFINE(skmem_tag_netif_llink, SKMEM_TAG_NETIF_LLINK);
340
341#define SKMEM_TAG_NETIF_QSET "com.apple.skywalk.netif.qset"
342SKMEM_TAG_DEFINE(skmem_tag_netif_qset, SKMEM_TAG_NETIF_QSET);
343
344#define SKMEM_TAG_NETIF_LLINK_INFO "com.apple.skywalk.netif.llink_info"
345SKMEM_TAG_DEFINE(skmem_tag_netif_llink_info, SKMEM_TAG_NETIF_LLINK_INFO);
346
347/* use this for any temporary allocations */
348#define SKMEM_TAG_NETIF_TEMP "com.apple.skywalk.netif.temp"
349static SKMEM_TAG_DEFINE(skmem_tag_netif_temp, SKMEM_TAG_NETIF_TEMP);
350
351static void
352nx_netif_dom_init(struct nxdom *nxdom)
353{
354 SK_LOCK_ASSERT_HELD();
355 ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
356
357 _CASSERT(NEXUS_PORT_NET_IF_DEV == 0);
358 _CASSERT(NEXUS_PORT_NET_IF_HOST == 1);
359 _CASSERT(NEXUS_PORT_NET_IF_CLIENT == 2);
360 _CASSERT(SK_NETIF_MIT_FORCE_OFF < SK_NETIF_MIT_FORCE_SIMPLE);
361 _CASSERT(SK_NETIF_MIT_FORCE_SIMPLE < SK_NETIF_MIT_FORCE_ADVANCED);
362 _CASSERT(SK_NETIF_MIT_FORCE_ADVANCED < SK_NETIF_MIT_AUTO);
363 _CASSERT(SK_NETIF_MIT_AUTO == SK_NETIF_MIT_MAX);
364
365 (void) nxdom_prov_add(nxdom, &nx_netif_prov_s);
366
367 nx_netif_compat_init(nxdom);
368
369 ASSERT(nxdom_prov_default[nxdom->nxdom_type] != NULL &&
370 strcmp(nxdom_prov_default[nxdom->nxdom_type]->nxdom_prov_name,
371 NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
372
373 netif_gso_init();
374}
375
376static void
377nx_netif_dom_terminate(struct nxdom *nxdom)
378{
379 struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
380
381 SK_LOCK_ASSERT_HELD();
382
383 netif_gso_fini();
384 nx_netif_compat_fini();
385
386 STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
387 nxdom_prov_link, tnxdp) {
388 (void) nxdom_prov_del(nxdom_prov);
389 }
390}
391
392static void
393nx_netif_dom_fini(struct nxdom *nxdom)
394{
395#pragma unused(nxdom)
396}
397
398int
399nx_netif_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
400{
401#pragma unused(nxdom_prov)
402 SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
403 return 0;
404}
405
406static int
407nx_netif_na_notify_drop(struct __kern_channel_ring *kring, struct proc *p,
408 uint32_t flags)
409{
410#pragma unused(kring, p, flags)
411 return ENXIO;
412}
413
414int
415nx_netif_prov_nx_stop(struct kern_nexus *nx)
416{
417 uint32_t r;
418 struct nexus_adapter *na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
419 struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
420
421 SK_LOCK_ASSERT_HELD();
422 ASSERT(nx != NULL);
423
424 /* place all rings in drop mode */
425 na_kr_drop(na, TRUE);
426
427 /* ensure global visibility */
428 os_atomic_thread_fence(seq_cst);
429
430 /* reset all TX notify callbacks */
 for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
432 while (!os_atomic_cmpxchg((void * volatile *)&na->na_tx_rings[r].ckr_na_notify,
433 ptrauth_nop_cast(void *, na->na_tx_rings[r].ckr_na_notify),
434 ptrauth_nop_cast(void *, &nx_netif_na_notify_drop), acq_rel)) {
435 ;
436 }
437 os_atomic_thread_fence(seq_cst);
438 if (nifna->nifna_tx_mit != NULL) {
439 nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
440 }
441 }
442 if (nifna->nifna_tx_mit != NULL) {
443 skn_free_type_array(tx, struct nx_netif_mit,
444 na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
445 nifna->nifna_tx_mit = NULL;
446 }
447
448 /* reset all RX notify callbacks */
 for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
450 while (!os_atomic_cmpxchg((void * volatile *)&na->na_rx_rings[r].ckr_na_notify,
451 ptrauth_nop_cast(void *, na->na_rx_rings[r].ckr_na_notify),
452 ptrauth_nop_cast(void *, &nx_netif_na_notify_drop), acq_rel)) {
453 ;
454 }
455 os_atomic_thread_fence(seq_cst);
456 if (nifna->nifna_rx_mit != NULL) {
457 nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
458 }
459 }
460 if (nifna->nifna_rx_mit != NULL) {
461 skn_free_type_array(rx, struct nx_netif_mit,
462 na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
463 nifna->nifna_rx_mit = NULL;
464 }
465 return 0;
466}
467
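/*
 * A note on the notify-callback reset above: each ring's ckr_na_notify is
 * swung over to nx_netif_na_notify_drop() with a compare-and-swap loop so
 * that a concurrent notify caller sees either the old callback or the drop
 * callback, never a partially updated pointer. The same pattern on a
 * generic pointer slot looks roughly like this (sketch only, "slot" and
 * "drop_callback" are placeholders, not part of this file):
 *
 *	while (!os_atomic_cmpxchg((void * volatile *)slot,
 *	    ptrauth_nop_cast(void *, *slot),
 *	    ptrauth_nop_cast(void *, &drop_callback), acq_rel)) {
 *		;
 *	}
 */
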
468static inline void
469nx_netif_compat_adjust_ring_size(struct nxprov_adjusted_params *adj,
470 ifnet_t ifp)
471{
472 if (IFNET_IS_CELLULAR(ifp) && (ifp->if_unit != 0)) {
473 *(adj->adj_rx_slots) = sk_netif_compat_aux_cell_rx_ring_sz;
474 *(adj->adj_tx_slots) = sk_netif_compat_aux_cell_tx_ring_sz;
475 } else if (IFNET_IS_WIFI(ifp)) {
476 if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
477 ifp->if_name[2] == '\0') {
478 /* Wi-Fi Access Point */
479 *(adj->adj_rx_slots) = sk_netif_compat_wap_rx_ring_sz;
480 *(adj->adj_tx_slots) = sk_netif_compat_wap_tx_ring_sz;
481 } else if (ifp->if_eflags & IFEF_AWDL) {
482 /* AWDL */
483 *(adj->adj_rx_slots) = sk_netif_compat_awdl_rx_ring_sz;
484 *(adj->adj_tx_slots) = sk_netif_compat_awdl_tx_ring_sz;
485 } else {
486 /* Wi-Fi infrastructure */
487 *(adj->adj_rx_slots) = sk_netif_compat_wif_rx_ring_sz;
488 *(adj->adj_tx_slots) = sk_netif_compat_wif_tx_ring_sz;
489 }
490 } else if (IFNET_IS_ETHERNET(ifp)) {
491#if !XNU_TARGET_OS_OSX
492 /*
493 * On non-macOS platforms, treat all compat Ethernet
494 * interfaces as USB Ethernet with reduced ring sizes.
495 */
496 *(adj->adj_rx_slots) = sk_netif_compat_usb_eth_rx_ring_sz;
497 *(adj->adj_tx_slots) = sk_netif_compat_usb_eth_tx_ring_sz;
498#else /* XNU_TARGET_OS_OSX */
499 if (ifp->if_subfamily == IFNET_SUBFAMILY_USB) {
500 *(adj->adj_rx_slots) =
501 sk_netif_compat_usb_eth_rx_ring_sz;
502 *(adj->adj_tx_slots) =
503 sk_netif_compat_usb_eth_tx_ring_sz;
504 }
505#endif /* XNU_TARGET_OS_OSX */
506 }
507}
508
509static int
510nx_netif_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
511 const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
512{
513 /*
514 * for netif compat adjust the following parameters for memory
515 * optimization:
516 * - change the size of buffer object to 128 bytes.
517 * - don't allocate rx ring for host port and tx ring for dev port.
518 * - for cellular interfaces other than pdp_ip0 reduce the ring size.
519 * Assumption here is that pdp_ip0 is always used as the data
520 * interface.
521 * - reduce the ring size for AWDL interface.
522 * - reduce the ring size for USB ethernet interface.
523 */
 if (strcmp(nxdom_prov->nxdom_prov_name,
525 NEXUS_PROVIDER_NET_IF_COMPAT) == 0) {
526 /*
527 * Leave the parameters default if userspace access may be
528 * needed. We can't use skywalk_direct_allowed() here because
529 * the drivers have not attached yet.
530 */
531 if (skywalk_netif_direct_enabled()) {
532 goto done;
533 }
534
535 *(adj->adj_buf_size) = NETIF_COMPAT_BUF_SIZE;
536 *(adj->adj_tx_rings) = 1;
537 if (IF_INDEX_IN_RANGE(nxp->nxp_ifindex)) {
538 ifnet_t ifp;
539 ifnet_head_lock_shared();
540 ifp = ifindex2ifnet[nxp->nxp_ifindex];
541 ifnet_head_done();
542 VERIFY(ifp != NULL);
543 nx_netif_compat_adjust_ring_size(adj, ifp);
544 }
545 } else { /* netif native */
546 if (nxp->nxp_flags & NXPF_NETIF_LLINK) {
547 *(adj->adj_tx_slots) = NX_NETIF_MINSLOTS;
548 *(adj->adj_rx_slots) = NX_NETIF_MINSLOTS;
549 }
550 /*
551 * Add another extra ring for host port. Note that if the
552 * nexus isn't configured to use the same pbufpool for all of
553 * its ports, we'd end up allocating extra here.
554 * Not a big deal since that case isn't the default.
555 */
556 *(adj->adj_tx_rings) += 1;
557 *(adj->adj_rx_rings) += 1;
558
559 if ((*(adj->adj_buf_size) < PKT_MAX_PROTO_HEADER_SIZE)) {
560 SK_ERR("buf size too small, min (%d)",
561 PKT_MAX_PROTO_HEADER_SIZE);
562 return EINVAL;
563 }
564 _CASSERT(sizeof(struct __kern_netif_intf_advisory) ==
565 NX_INTF_ADV_SIZE);
566 *(adj->adj_nexusadv_size) = sizeof(struct netif_nexus_advisory);
567 }
568done:
569 return 0;
570}
571
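/*
 * Worked example of the adjustments above, using the domain defaults
 * (nb_def of 1 ring each way):
 *
 *	compat, no userspace access:  tx_rings -> 1, buf_size ->
 *	    NETIF_COMPAT_BUF_SIZE (128 bytes per the comment above), plus
 *	    per-interface ring-size tuning in
 *	    nx_netif_compat_adjust_ring_size()
 *	native:                       tx_rings 1 -> 2, rx_rings 1 -> 2
 *	    (the extra pair is for the host port); llink mode also clamps
 *	    the slot counts to NX_NETIF_MINSLOTS
 */
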
572int
573nx_netif_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
574 const uint32_t req, const struct nxprov_params *nxp0,
575 struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
576 uint32_t pp_region_config_flags)
577{
578 struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
579
580 return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
581 nxdom, nxdom, nxdom, pp_region_config_flags,
 nx_netif_prov_params_adjust);
583}
584
585int
586nx_netif_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
587 struct kern_nexus *nx, struct nexus_adapter *na)
588{
589#pragma unused(nxdom_prov)
590 int err = 0;
591 boolean_t pp_truncated_buf = FALSE;
592 boolean_t allow_direct;
593 boolean_t kernel_only;
594
595 SK_DF(SK_VERB_NETIF,
596 "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
597 NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
598 SK_KVA(na));
599
600 ASSERT(na->na_arena == NULL);
601 if ((na->na_type == NA_NETIF_COMPAT_DEV) ||
602 (na->na_type == NA_NETIF_COMPAT_HOST)) {
603 pp_truncated_buf = TRUE;
604 }
605 /*
606 * We do this check to determine whether to create the extra
607 * regions needed for userspace access. This is per interface.
608 * NX_USER_CHANNEL_PROV() is systemwide so it can't be used.
609 */
610 allow_direct = skywalk_netif_direct_allowed(na->na_name);
611
612 /*
613 * Both ports (host and dev) share the same packet buffer pool;
614 * the first time a port gets opened will allocate the pp that
615 * gets stored in the nexus, which will then be used by any
616 * subsequent opens.
617 */
618 kernel_only = !allow_direct || !NX_USER_CHANNEL_PROV(nx);
619 na->na_arena = skmem_arena_create_for_nexus(na,
620 NX_PROV(nx)->nxprov_region_params, &nx->nx_tx_pp,
621 &nx->nx_rx_pp, pp_truncated_buf, kernel_only, &nx->nx_adv, &err);
622 ASSERT(na->na_arena != NULL || err != 0);
623 ASSERT(nx->nx_tx_pp == NULL || (nx->nx_tx_pp->pp_md_type ==
624 NX_DOM(nx)->nxdom_md_type && nx->nx_tx_pp->pp_md_subtype ==
625 NX_DOM(nx)->nxdom_md_subtype));
626
627 return err;
628}
629
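/*
 * Both netif ports end up sharing the packet buffer pools hanging off the
 * nexus, as noted above; a rough picture of the resulting ownership
 * (illustrative only):
 *
 *	dev  adapter -- na_arena --+
 *	                           +--> nx->nx_tx_pp / nx->nx_rx_pp
 *	host adapter -- na_arena --+
 *
 * The kernel_only flag above decides whether the extra regions needed for
 * userspace access are created at all (cf. the allow_direct check).
 */
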
630SK_NO_INLINE_ATTRIBUTE
631static int
632nx_netif_get_llink_info(struct sockopt *sopt, struct kern_nexus *nx)
633{
634 struct nx_llink_info_req *nlir = NULL;
635 struct nx_netif *nif;
636 struct netif_llink *llink;
637 uint16_t llink_cnt;
638 size_t len, user_len;
639 int err, i;
640
641 nif = NX_NETIF_PRIVATE(nx);
642 if (!NETIF_LLINK_ENABLED(nif)) {
643 SK_ERR("llink mode not enabled");
644 return ENOTSUP;
645 }
 lck_rw_lock_shared(&nif->nif_llink_lock);
647 llink_cnt = nif->nif_llink_cnt;
648 if (llink_cnt == 0) {
649 SK_ERR("zero llink cnt");
650 err = ENXIO;
651 goto done;
652 }
653 len = sizeof(*nlir) + (sizeof(struct nx_llink_info) * llink_cnt);
654 /* preserve sopt_valsize because it gets overwritten by copyin */
655 user_len = sopt->sopt_valsize;
656 if (user_len < len) {
657 SK_ERR("buffer too small");
658 err = ENOBUFS;
659 goto done;
660 }
661 nlir = sk_alloc_data(len, Z_WAITOK, skmem_tag_netif_llink_info);
662 if (nlir == NULL) {
663 SK_ERR("failed to allocate nlir");
664 err = ENOMEM;
665 goto done;
666 }
 err = sooptcopyin(sopt, nlir, sizeof(*nlir), sizeof(*nlir));
668 if (err != 0) {
669 SK_ERR("copyin failed: %d", err);
670 goto done;
671 }
672 if (nlir->nlir_version != NETIF_LLINK_INFO_VERSION) {
673 SK_ERR("nlir version mismatch: %d != %d",
674 nlir->nlir_version, NETIF_LLINK_INFO_VERSION);
675 err = ENOTSUP;
676 goto done;
677 }
678 nlir->nlir_llink_cnt = llink_cnt;
679 i = 0;
680 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
681 struct nx_llink_info *nli;
682 struct netif_qset *qset;
683 uint16_t qset_cnt;
684 int j;
685
686 nli = &nlir->nlir_llink[i];
687 nli->nli_link_id = llink->nll_link_id;
688 nli->nli_link_id_internal = llink->nll_link_id_internal;
689 nli->nli_state = llink->nll_state;
690 nli->nli_flags = llink->nll_flags;
691
692 qset_cnt = llink->nll_qset_cnt;
693 ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
694 nli->nli_qset_cnt = qset_cnt;
695
696 j = 0;
697 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
698 struct nx_qset_info *nqi;
699
700 nqi = &nli->nli_qset[j];
701 nqi->nqi_id = qset->nqs_id;
702 nqi->nqi_flags = qset->nqs_flags;
703 nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
704 nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
705 j++;
706 }
707 ASSERT(j == qset_cnt);
708 i++;
709 }
710 ASSERT(i == llink_cnt);
711 sopt->sopt_valsize = user_len;
 err = sooptcopyout(sopt, nlir, len);
713 if (err != 0) {
714 SK_ERR("sooptcopyout failed: %d", err);
715 }
716done:
 lck_rw_unlock_shared(&nif->nif_llink_lock);
718 if (nlir != NULL) {
719 sk_free_data(nlir, len);
720 }
721 return err;
722}
723
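/*
 * For reference, a caller issuing NXCFG_CMD_GET_LLINK_INFO has to present a
 * buffer sized for the request header plus one nx_llink_info entry per
 * logical link, mirroring the length check above. A hedged userspace-side
 * sketch (llink_cnt would have to be learned out of band, or the call
 * retried on ENOBUFS):
 *
 *	size_t len = sizeof(struct nx_llink_info_req) +
 *	    llink_cnt * sizeof(struct nx_llink_info);
 *	struct nx_llink_info_req *nlir = calloc(1, len);
 *	nlir->nlir_version = NETIF_LLINK_INFO_VERSION;
 *	// ... hand (nlir, len) to the nexus config request ...
 */
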
724int
725nx_netif_prov_config(struct kern_nexus_domain_provider *nxdom_prov,
726 struct kern_nexus *nx, struct nx_cfg_req *ncr, int sopt_dir,
727 struct proc *p, kauth_cred_t cred)
728{
729#pragma unused(nxdom_prov)
730 struct sockopt sopt;
731 int err = 0;
732
733 SK_LOCK_ASSERT_HELD();
734
735 /* proceed only if the client possesses netif entitlement */
736 if ((err = skywalk_priv_check_cred(p, cred,
737 PRIV_SKYWALK_REGISTER_NET_IF)) != 0) {
738 goto done;
739 }
740
741 if (ncr->nc_req == USER_ADDR_NULL) {
742 err = EINVAL;
743 goto done;
744 }
745
746 /* to make life easier for handling copies */
 bzero(&sopt, sizeof(sopt));
748 sopt.sopt_dir = sopt_dir;
749 sopt.sopt_val = ncr->nc_req;
750 sopt.sopt_valsize = ncr->nc_req_len;
751 sopt.sopt_p = p;
752
753 switch (ncr->nc_cmd) {
754 case NXCFG_CMD_ATTACH:
755 case NXCFG_CMD_DETACH: {
756 struct nx_spec_req nsr;
757
 bzero(&nsr, sizeof(nsr));
 err = sooptcopyin(&sopt, &nsr, sizeof(nsr), sizeof(nsr));
760 if (err != 0) {
761 goto done;
762 }
763
764 /*
765 * Null-terminate in case this has an interface name;
766 * the union is already large enough for uuid_t.
767 */
768 nsr.nsr_name[sizeof(nsr.nsr_name) - 1] = '\0';
769 if (p != kernproc) {
770 nsr.nsr_flags &= NXSPECREQ_MASK;
771 }
772
773 err = nx_netif_ctl(nx, ncr->nc_cmd, &nsr, p);
774 if (err != 0) {
775 goto done;
776 }
777
778 /* XXX: adi@apple.com -- can this copyout fail? */
 (void) sooptcopyout(&sopt, &nsr, sizeof(nsr));
780 break;
781 }
782 case NXCFG_CMD_FLOW_ADD:
783 case NXCFG_CMD_FLOW_DEL: {
784 _CASSERT(offsetof(struct nx_flow_req, _nfr_kernel_field_end) ==
785 offsetof(struct nx_flow_req, _nfr_common_field_end));
786 struct nx_flow_req nfr;
787
 bzero(&nfr, sizeof(nfr));
 err = sooptcopyin(&sopt, &nfr, sizeof(nfr), sizeof(nfr));
790 if (err != 0) {
791 goto done;
792 }
793
794 err = nx_netif_ctl(nx, ncr->nc_cmd, &nfr, p);
795 if (err != 0) {
796 goto done;
797 }
798
799 /* XXX: adi@apple.com -- can this copyout fail? */
 (void) sooptcopyout(&sopt, &nfr, sizeof(nfr));
801 break;
802 }
803 case NXCFG_CMD_GET_LLINK_INFO: {
 err = nx_netif_get_llink_info(&sopt, nx);
805 break;
806 }
807 default:
808 err = EINVAL;
809 goto done;
810 }
811done:
812 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
813 "nexus 0x%llx (%s) cmd %d err %d", SK_KVA(nx),
814 NX_DOM_PROV(nx)->nxdom_prov_name, ncr->nc_cmd, err);
815 return err;
816}
817
818void
819nx_netif_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
820{
821#pragma unused(nxdom_prov)
822 SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
823}
824
825int
826nx_netif_prov_nx_ctor(struct kern_nexus *nx)
827{
828 struct nx_netif *n;
829 char name[64];
830 int error;
831
832 SK_LOCK_ASSERT_HELD();
833 ASSERT(nx->nx_arg == NULL);
834
835 SK_D("nexus 0x%llx (%s)", SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name);
836
837 nx->nx_arg = nx_netif_alloc(Z_WAITOK);
838 n = NX_NETIF_PRIVATE(nx);
839 if (NX_USER_CHANNEL_PROV(nx) &&
840 NX_PROV(nx)->nxprov_params->nxp_nexusadv_size != 0) {
 (void) snprintf(name, sizeof(name), "netif_%llu", nx->nx_id);
842 error = nx_advisory_alloc(nx, name,
843 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
844 NEXUS_ADVISORY_TYPE_NETIF);
845 if (error != 0) {
846 nx_netif_free(n);
847 return error;
848 }
849 }
850 n->nif_nx = nx;
851 SK_D("create new netif 0x%llx for nexus 0x%llx",
852 SK_KVA(NX_NETIF_PRIVATE(nx)), SK_KVA(nx));
853 return 0;
854}
855
856void
857nx_netif_prov_nx_dtor(struct kern_nexus *nx)
858{
859 struct nx_netif *n = NX_NETIF_PRIVATE(nx);
860
861 SK_LOCK_ASSERT_HELD();
862
863 SK_D("nexus 0x%llx (%s) netif 0x%llx", SK_KVA(nx),
864 NX_DOM_PROV(nx)->nxdom_prov_name, SK_KVA(n));
865
866 /*
867 * XXX
868 * detach should be done separately to be symmetrical with attach.
869 */
870 nx_advisory_free(nx);
871 if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) != NULL) {
872 /* we're called by nx_detach(), so this cannot fail */
873 int err = nx_netif_ctl_detach(nx, NULL);
874 VERIFY(err == 0);
875 }
876 if (n->nif_dev_nxb != NULL) {
877 nxb_free(n->nif_dev_nxb);
878 n->nif_dev_nxb = NULL;
879 }
880 if (n->nif_host_nxb != NULL) {
881 nxb_free(n->nif_host_nxb);
882 n->nif_host_nxb = NULL;
883 }
884 SK_DF(SK_VERB_NETIF, "marking netif 0x%llx as free", SK_KVA(n));
885 nx_netif_free(n);
886 nx->nx_arg = NULL;
887}
888
889int
890nx_netif_prov_nx_mem_info(struct kern_nexus *nx, struct kern_pbufpool **tpp,
891 struct kern_pbufpool **rpp)
892{
893 ASSERT(nx->nx_tx_pp != NULL);
894 ASSERT(nx->nx_rx_pp != NULL);
895
896 if (tpp != NULL) {
897 *tpp = nx->nx_tx_pp;
898 }
899 if (rpp != NULL) {
900 *rpp = nx->nx_rx_pp;
901 }
902
903 return 0;
904}
905
906static size_t
907__netif_mib_get_stats(struct kern_nexus *nx, void *out, size_t len)
908{
909 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
910 struct ifnet *ifp = nif->nif_ifp;
911 struct sk_stats_net_if *sns = out;
912 size_t actual_space = sizeof(struct sk_stats_net_if);
913
914 if (out != NULL && actual_space <= len) {
 uuid_copy(sns->sns_nx_uuid, nx->nx_uuid);
 if (ifp != NULL) {
 (void) strlcpy(sns->sns_if_name, if_name(ifp), IFNAMSIZ);
918 }
919 sns->sns_nifs = nif->nif_stats;
920 }
921
922 return actual_space;
923}
924
925static size_t
926__netif_mib_get_llinks(struct kern_nexus *nx, void *out, size_t len)
927{
928 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
929 struct nx_llink_info *nli_list = out;
930 size_t actual_space = 0;
931 if (NETIF_LLINK_ENABLED(nif)) {
 lck_rw_lock_shared(&nif->nif_llink_lock);
933 actual_space += nif->nif_llink_cnt * sizeof(struct nx_llink_info);
934
935 if (out != NULL && actual_space <= len) {
936 struct netif_llink *llink;
937 int i = 0;
938 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
939 struct nx_llink_info *nli;
940 struct netif_qset *qset;
941 uint16_t qset_cnt;
942 int j;
943
944 nli = &nli_list[i];
 uuid_copy(nli->nli_netif_uuid, nx->nx_uuid);
946 nli->nli_link_id = llink->nll_link_id;
947 nli->nli_link_id_internal = llink->nll_link_id_internal;
948 nli->nli_state = llink->nll_state;
949 nli->nli_flags = llink->nll_flags;
950
951 qset_cnt = llink->nll_qset_cnt;
952 ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
953 nli->nli_qset_cnt = qset_cnt;
954
955 j = 0;
956 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
957 struct nx_qset_info *nqi;
958
959 nqi = &nli->nli_qset[j];
960 nqi->nqi_id = qset->nqs_id;
961 nqi->nqi_flags = qset->nqs_flags;
962 nqi->nqi_num_rx_queues = qset->nqs_num_rx_queues;
963 nqi->nqi_num_tx_queues = qset->nqs_num_tx_queues;
964 j++;
965 }
966 ASSERT(j == qset_cnt);
967 i++;
968 }
969 ASSERT(i == nif->nif_llink_cnt);
970 }
 lck_rw_unlock_shared(&nif->nif_llink_lock);
972 }
973
974 return actual_space;
975}
976
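/*
 * Like the other MIB getters here, this runs in up to two passes: the first
 * walk only computes the space required, so a caller passing out == NULL
 * (or too small a buffer) learns the length it needs; only when the buffer
 * is large enough are the per-queue records actually filled in.
 */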
977static size_t
978__netif_mib_get_queue_stats(struct kern_nexus *nx, void *out, size_t len)
979{
980 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
981 uint8_t *itr = out;
982 size_t actual_space = 0;
983 if (!NETIF_LLINK_ENABLED(nif)) {
984 return actual_space;
985 }
986
 lck_rw_lock_shared(&nif->nif_llink_lock);
988 struct netif_llink *llink;
989 struct netif_qset *qset;
990 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
991 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
992 actual_space += sizeof(struct netif_qstats_info) *
993 (qset->nqs_num_rx_queues + qset->nqs_num_tx_queues);
994 }
995 }
996 if (out == NULL || actual_space > len) {
 lck_rw_unlock_shared(&nif->nif_llink_lock);
998 return actual_space;
999 }
1000
1001 llink = NULL;
1002 qset = NULL;
1003 uint16_t i = 0, j = 0;
1004 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
1005 uint16_t qset_cnt;
1006 j = 0;
1007 qset_cnt = llink->nll_qset_cnt;
1008 ASSERT(qset_cnt <= NETIF_LLINK_MAX_QSETS);
1009 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
1010 int queue_cnt = qset->nqs_num_rx_queues +
1011 qset->nqs_num_tx_queues;
1012 for (uint16_t k = 0; k < queue_cnt; k++) {
1013 struct netif_qstats_info *nqi =
1014 (struct netif_qstats_info *)(void *)itr;
1015 struct netif_queue *nq = &qset->nqs_driver_queues[k];
1016 nqi->nqi_qset_id = qset->nqs_id;
1017 nqi->nqi_queue_idx = k;
1018 if (KPKT_VALID_SVC(nq->nq_svc)) {
1019 nqi->nqi_svc = (packet_svc_class_t)nq->nq_svc;
1020 }
1021 if (nq->nq_flags & NETIF_QUEUE_IS_RX) {
1022 nqi->nqi_queue_flag = NQI_QUEUE_FLAG_IS_RX;
1023 }
1024
1025 struct netif_qstats *nq_out = &nqi->nqi_stats;
1026 struct netif_qstats *nq_src = &nq->nq_stats;
 memcpy(nq_out, nq_src, sizeof(struct netif_qstats));
1028
1029 itr += sizeof(struct netif_qstats_info);
1030 }
1031 j++;
1032 }
1033 ASSERT(j == qset_cnt);
1034 i++;
1035 }
1036 ASSERT(i == nif->nif_llink_cnt);
1037
 lck_rw_unlock_shared(&nif->nif_llink_lock);
1039 return actual_space;
1040}
1041
1042size_t
1043nx_netif_prov_nx_mib_get(struct kern_nexus *nx, struct nexus_mib_filter *filter,
1044 void *out, size_t len, struct proc *p)
1045{
1046#pragma unused(p)
1047 size_t ret;
1048
1049 if ((filter->nmf_bitmap & NXMIB_FILTER_NX_UUID) &&
 (uuid_compare(filter->nmf_nx_uuid, nx->nx_uuid)) != 0) {
1051 return 0;
1052 }
1053
1054 switch (filter->nmf_type) {
1055 case NXMIB_NETIF_STATS:
1056 ret = __netif_mib_get_stats(nx, out, len);
1057 break;
1058 case NXMIB_LLINK_LIST:
1059 ret = __netif_mib_get_llinks(nx, out, len);
1060 break;
1061 case NXMIB_NETIF_QUEUE_STATS:
1062 ret = __netif_mib_get_queue_stats(nx, out, len);
1063 break;
1064 default:
1065 ret = 0;
1066 break;
1067 }
1068 return ret;
1069}
1070
1071static int
1072nx_netif_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
1073 struct nxbind *nxb, void *info)
1074{
1075 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1076 nexus_port_t first, last, port;
1077 int error;
1078
1079 ASSERT(nx_port != NULL);
1080 ASSERT(nxb != NULL);
1081
1082 port = *nx_port;
1083
1084 /*
1085 * If port is:
1086 * != NEXUS_PORT_ANY: attempt to bind to the specified port
1087 * == NEXUS_PORT_ANY: find an available port, bind to it, and
1088 * return back the assigned port.
1089 */
1090 first = NEXUS_PORT_NET_IF_CLIENT;
1091 ASSERT(NXDOM_MAX(NX_DOM(nx), ports) <= NEXUS_PORT_MAX);
1092 last = (nexus_port_size_t)NXDOM_MAX(NX_DOM(nx), ports);
1093 ASSERT(first <= last);
1094
1095 NETIF_WLOCK(nif);
1096
1097 if (__improbable(first == last)) {
1098 error = ENOMEM;
1099 } else if (port != NEXUS_PORT_ANY) {
1100 error = nx_port_bind_info(nx, port, nxb, info);
1101 SK_DF(SK_VERB_NETIF, "port %d, bind err %d", port, error);
1102 } else {
1103 error = nx_port_find(nx, first, last - 1, &port);
1104 ASSERT(error != 0 || (port >= first && port < last));
1105 if (error == 0) {
1106 error = nx_port_bind_info(nx, port, nxb, info);
1107 SK_DF(SK_VERB_NETIF, "found port %d, bind err %d",
1108 port, error);
1109 }
1110 }
1111 NETIF_WUNLOCK(nif);
1112
1113 ASSERT(*nx_port == NEXUS_PORT_ANY || *nx_port == port);
1114 if (error == 0) {
1115 *nx_port = port;
1116 }
1117
1118 SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
1119 "+++ netif 0x%llx nx_port %d, total %u active %u (err %d)",
1120 SK_KVA(nif), (int)*nx_port, NX_NETIF_MAXPORTS,
1121 nx->nx_active_ports, error);
1122
1123 return error;
1124}
1125
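/*
 * The two bind flavors handled above, from the caller's perspective
 * (illustrative sketch only):
 *
 *	nexus_port_t port = NEXUS_PORT_ANY;	// "pick a free client port"
 *	error = nx_netif_dom_bind_port(nx, &port, nxb, NULL);
 *	// on success, NEXUS_PORT_NET_IF_CLIENT <= port < NXDOM_MAX(dom, ports)
 *
 *	port = 3;				// "bind exactly this port"
 *	error = nx_netif_dom_bind_port(nx, &port, nxb, NULL);
 */
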
1126static int
1127nx_netif_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
1128{
1129 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1130 int error = 0;
1131
1132 ASSERT(nx_port != NEXUS_PORT_ANY);
1133
1134 NETIF_WLOCK(nif);
1135 error = nx_port_unbind(nx, nx_port);
1136 NETIF_WUNLOCK(nif);
1137
1138 return error;
1139}
1140
1141static int
1142nx_netif_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
1143 struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
1144 struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
1145{
1146#pragma unused(nxdom_prov)
1147 int err = 0;
1148
1149 SK_LOCK_ASSERT_HELD();
1150
1151 ASSERT(NX_DOM_PROV(nx) == nxdom_prov);
1152 ASSERT(nx->nx_prov->nxprov_params->nxp_type ==
1153 nxdom_prov->nxdom_prov_dom->nxdom_type &&
1154 nx->nx_prov->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
1155 ASSERT(!(ch->ch_flags & CHANF_HOST));
1156
1157 switch (chr->cr_port) {
1158 case NEXUS_PORT_NET_IF_DEV:
1159 if (chr->cr_mode & CHMODE_HOST) {
1160 err = EINVAL;
1161 goto done;
1162 }
1163 break;
1164
1165 case NEXUS_PORT_NET_IF_HOST:
1166 if (!(chr->cr_mode & CHMODE_HOST)) {
1167 if (ch->ch_flags & CHANF_KERNEL) {
1168 err = EINVAL;
1169 goto done;
1170 }
1171 chr->cr_mode |= CHMODE_HOST;
1172 }
1173 /*
1174 * This channel is exclusively opened to the host
1175 * rings; don't notify the external provider.
1176 */
1177 os_atomic_or(&ch->ch_flags, CHANF_HOST | CHANF_EXT_SKIP, relaxed);
1178 break;
1179
1180 default:
1181 /*
1182 * This channel is shared between netif and user process;
1183 * don't notify the external provider.
1184 */
1185 os_atomic_or(&ch->ch_flags, CHANF_EXT_SKIP, relaxed);
1186 break;
1187 }
1188
1189 chr->cr_ring_set = RING_SET_DEFAULT;
1190 chr->cr_real_endpoint = chr->cr_endpoint = CH_ENDPOINT_NET_IF;
 (void) snprintf(chr->cr_name, sizeof(chr->cr_name), "netif:%llu:%.*s",
1192 nx->nx_id, (int)nx->nx_prov->nxprov_params->nxp_namelen,
1193 nx->nx_prov->nxprov_params->nxp_name);
1194
1195 if (ch->ch_flags & CHANF_KERNEL) {
1196 err = na_connect_spec(nx, ch, chr, p);
1197 } else {
1198 err = na_connect(nx, ch, chr, ch0, nxb, p);
1199 }
1200
1201 if (err == 0) {
1202 /*
1203 * Mark the kernel slot descriptor region as busy; this
1204 * prevents it from being torn-down at channel defunct
1205 * time, as the (external) nexus owner may be calling
1206 * KPIs that require accessing the slots.
1207 */
1208 skmem_arena_nexus_sd_set_noidle(
 skmem_arena_nexus(ch->ch_na->na_arena), 1);
1210 }
1211
1212done:
1213 return err;
1214}
1215
1216static void
1217nx_netif_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
1218 struct kern_nexus *nx, struct kern_channel *ch)
1219{
1220#pragma unused(nxdom_prov)
1221 SK_LOCK_ASSERT_HELD();
1222
1223 SK_D("channel 0x%llx -!- nexus 0x%llx (%s:\"%s\":%u:%d)", SK_KVA(ch),
1224 SK_KVA(nx), nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1225 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1226
1227 /*
1228 * Release busy assertion held earlier in nx_netif_dom_connect();
1229 * this allows for the final arena teardown to succeed.
1230 */
1231 skmem_arena_nexus_sd_set_noidle(
 skmem_arena_nexus(ch->ch_na->na_arena), -1);
1233
1234 if (ch->ch_flags & CHANF_KERNEL) {
1235 na_disconnect_spec(nx, ch);
1236 } else {
1237 na_disconnect(nx, ch);
1238 }
1239}
1240
1241static void
1242nx_netif_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
1243 struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
1244{
1245#pragma unused(nxdom_prov, nx)
1246 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1247 ASSERT(!(ch->ch_flags & CHANF_KERNEL));
1248 ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
1249 ch->ch_na->na_type == NA_NETIF_HOST ||
1250 ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
1251 ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
1252 ch->ch_na->na_type == NA_NETIF_VP);
1253
1254 na_ch_rings_defunct(ch, p);
1255}
1256
1257static void
1258nx_netif_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
1259 struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
1260{
1261#pragma unused(nxdom_prov)
1262 struct ifnet *ifp;
1263
1264 if (!locked) {
1265 SK_LOCK_ASSERT_NOTHELD();
1266 SK_LOCK();
1267 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
1268 } else {
1269 SK_LOCK_ASSERT_HELD();
1270 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1271 }
1272
1273 ASSERT(ch->ch_na->na_type == NA_NETIF_DEV ||
1274 ch->ch_na->na_type == NA_NETIF_HOST ||
1275 ch->ch_na->na_type == NA_NETIF_COMPAT_DEV ||
1276 ch->ch_na->na_type == NA_NETIF_COMPAT_HOST ||
1277 ch->ch_na->na_type == NA_NETIF_VP);
1278
1279 na_defunct(nx, ch, ch->ch_na, locked);
1280 ifp = ch->ch_na->na_ifp;
1281 if (ch->ch_na->na_type == NA_NETIF_VP && ifp != NULL &&
1282 ifnet_is_low_latency(ifp)) {
1283 /*
1284 * We release the VPNA's ifp here instead of waiting for the
1285 * application to close the channel to trigger the release.
1286 */
1287 DTRACE_SKYWALK2(release__vpna__ifp, struct nexus_adapter *,
1288 ch->ch_na, struct ifnet *, ifp);
1289 ifnet_decr_iorefcnt(ifp);
1290 ch->ch_na->na_ifp = NULL;
1291 }
1292 SK_D("%s(%d): ch 0x%llx -/- nx 0x%llx (%s:\"%s\":%u:%d)",
1293 ch->ch_name, ch->ch_pid, SK_KVA(ch), SK_KVA(nx),
1294 nxdom_prov->nxdom_prov_name, ch->ch_na->na_name,
1295 ch->ch_info->cinfo_nx_port, (int)ch->ch_info->cinfo_ch_ring_id);
1296
1297 if (!locked) {
1298 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
1299 SK_UNLOCK();
1300 } else {
1301 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_OWNED);
1302 SK_LOCK_ASSERT_HELD();
1303 }
1304}
1305
1306struct nexus_netif_adapter *
1307na_netif_alloc(zalloc_flags_t how)
1308{
1309 _CASSERT(offsetof(struct nexus_netif_adapter, nifna_up) == 0);
1310
1311 return zalloc_flags(na_netif_zone, how | Z_ZERO);
1312}
1313
1314void
1315na_netif_free(struct nexus_adapter *na)
1316{
1317 struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
1318
1319 SK_LOCK_ASSERT_HELD();
1320 SK_DF(SK_VERB_MEM, "nifna 0x%llx FREE", SK_KVA(nifna));
1321
1322 ASSERT(na->na_refcount == 0);
1323 ASSERT(nifna->nifna_tx_mit == NULL);
1324 ASSERT(nifna->nifna_rx_mit == NULL);
 bzero(nifna, sizeof(*nifna));
1326
1327 zfree(na_netif_zone, nifna);
1328}
1329
1330/* Process NXCFG_CMD_ATTACH */
1331SK_NO_INLINE_ATTRIBUTE
1332static int
1333nx_netif_ctl_attach(struct kern_nexus *nx, struct nx_spec_req *nsr,
1334 struct proc *p)
1335{
1336 struct nx_netif *n = NX_NETIF_PRIVATE(nx);
1337 struct ifnet *ifp = NULL;
1338 boolean_t compat;
1339 int err = 0;
1340
1341 SK_LOCK_ASSERT_HELD();
1342
1343 ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
1344 compat = (strcmp(NX_DOM_PROV(nx)->nxdom_prov_name,
1345 NEXUS_PROVIDER_NET_IF_COMPAT) == 0);
1346
 uuid_clear(nsr->nsr_if_uuid);
1348 /*
1349 * The netif accepts either an interface name or a pointer to
1350 * an ifnet, but never a UUID.
1351 */
1352 if (nsr->nsr_flags & NXSPECREQ_UUID) {
1353 err = EINVAL;
1354 goto done;
1355 }
1356 if (nsr->nsr_flags & NXSPECREQ_IFP) {
1357 if (p != kernproc || (ifp = nsr->nsr_ifp) == NULL) {
1358 err = EINVAL;
1359 goto done;
1360 }
1361 } else if ((ifp = ifunit_ref(nsr->nsr_name)) == NULL) {
1362 err = ENXIO;
1363 goto done;
1364 }
1365
1366 if ((compat && SKYWALK_NATIVE(ifp)) ||
1367 (!compat && !SKYWALK_NATIVE(ifp))) {
1368 /* native driver for netif; non-native for netif_compat */
1369 err = ENODEV;
 } else if (ifp->if_na != NULL || !uuid_is_null(n->nif_uuid)) {
1371 err = EBUSY;
1372 } else {
1373 ASSERT(uuid_is_null(n->nif_uuid));
1374 /*
1375 * Upon success, callee will hold its own ifnet iorefcnt
1376 * as well as a retain count on the nexus adapter.
1377 */
1378 if (compat) {
1379 err = nx_netif_compat_attach(nx, ifp);
1380 } else {
1381 err = nx_netif_attach(nx, ifp);
1382 }
1383
1384 if (err == 0) {
1385 /* return the adapter UUID */
 uuid_generate_random(n->nif_uuid);
 uuid_copy(nsr->nsr_if_uuid, n->nif_uuid);
1388#if (DEVELOPMENT || DEBUG)
1389 skoid_create(&n->nif_skoid,
1390 SKOID_SNODE(_kern_skywalk_netif), if_name(ifp),
1391 CTLFLAG_RW);
1392#endif /* !DEVELOPMENT && !DEBUG */
1393 }
1394 }
1395done:
1396 /* drop I/O refcnt from ifunit_ref() */
1397 if (ifp != NULL && !(nsr->nsr_flags & NXSPECREQ_IFP)) {
1398 ifnet_decr_iorefcnt(ifp);
1399 }
1400
1401#if SK_LOG
1402 uuid_string_t uuidstr, ifuuidstr;
1403 const char *nustr;
1404 if (nsr->nsr_flags & NXSPECREQ_UUID) {
1405 nustr = sk_uuid_unparse(nsr->nsr_uuid, uuidstr);
1406 } else if (nsr->nsr_flags & NXSPECREQ_IFP) {
1407 (void) snprintf((char *)uuidstr, sizeof(uuidstr), "0x%llx",
1408 SK_KVA(nsr->nsr_ifp));
1409 nustr = uuidstr;
1410 } else {
1411 nustr = nsr->nsr_name;
1412 }
1413 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
1414 "nexus 0x%llx (%s) name/uuid \"%s\" if_uuid %s flags 0x%x err %d",
1415 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name, nustr,
1416 sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr), nsr->nsr_flags, err);
1417#endif /* SK_LOG */
1418
1419 return err;
1420}
1421
1422SK_NO_INLINE_ATTRIBUTE
1423static int
1424nx_netif_clean(struct nx_netif *nif, boolean_t quiesce_needed)
1425{
1426 struct kern_nexus *nx = nif->nif_nx;
1427 struct ifnet *ifp;
1428 boolean_t suspended = FALSE;
1429
1430 ifp = nif->nif_ifp;
1431 if (ifp == NULL) {
1432 return EALREADY;
1433 }
1434 /*
1435 * For regular kernel-attached interfaces, quiescing is handled by
1436 * the ifnet detach thread, which calls dlil_quiesce_and_detach_nexuses().
1437 * For interfaces created by skywalk test cases, flowswitch/netif nexuses
1438 * are constructed on the fly and can also be torn down on the fly.
1439 * dlil_quiesce_and_detach_nexuses() won't help here because any nexus
1440 * can be detached while the interface is still attached.
1441 */
1442 if (quiesce_needed && ifnet_datamov_suspend_if_needed(ifp)) {
1443 SK_UNLOCK();
1444 suspended = TRUE;
1445 ifnet_datamov_drain(ifp);
1446 SK_LOCK();
1447 }
1448 nx_netif_callbacks_fini(nif);
1449 nx_netif_agent_fini(nif);
1450 nx_netif_capabilities_fini(nif);
1451 nx_netif_flow_fini(nif);
1452 nx_netif_filter_fini(nif);
1453 nx_netif_llink_fini(nif);
1454 nx_netif_flags_fini(nif);
1455
 uuid_clear(nif->nif_uuid);
 /* nx_netif_{compat_}attach() held both references */
 na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV));
 na_release_locked(nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST));
1460 nx_port_free(nx, NEXUS_PORT_NET_IF_DEV);
1461 nx_port_free(nx, NEXUS_PORT_NET_IF_HOST);
1462
1463 ifp->if_na_ops = NULL;
1464 ifp->if_na = NULL;
1465 nif->nif_ifp = NULL;
1466 nif->nif_netif_nxadv = NULL;
1467 SKYWALK_CLEAR_CAPABLE(ifp);
1468 if (suspended) {
1469 ifnet_datamov_resume(ifp);
1470 }
1471
1472#if (DEVELOPMENT || DEBUG)
1473 skoid_destroy(&nif->nif_skoid);
1474#endif /* !DEVELOPMENT && !DEBUG */
1475 return 0;
1476}
1477
1478/* process NXCFG_CMD_DETACH */
1479SK_NO_INLINE_ATTRIBUTE
1480static int
1481nx_netif_ctl_detach(struct kern_nexus *nx, struct nx_spec_req *nsr)
1482{
1483 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1484 int err = 0;
1485
1486 SK_LOCK_ASSERT_HELD();
1487
1488 /*
1489 * nsr is NULL when we're called from the destructor, and it
1490 * implies that we'll detach whatever that is attached.
1491 */
 if (nsr != NULL && uuid_is_null(nsr->nsr_if_uuid)) {
 err = EINVAL;
 } else if (nsr != NULL && uuid_compare(nsr->nsr_if_uuid,
 nif->nif_uuid) != 0) {
1496 err = ESRCH;
1497 } else if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
1498 /* nx_netif_ctl_attach() not yet done or already detached */
1499 err = ENXIO;
1500 } else if (nx->nx_ch_count != 0) {
1501 /*
1502 * There's at least a channel opened; we can't
1503 * yank the interface from underneath the nexus
1504 * since our dlil input/output handler may be
1505 * running now. Bail out and come back here
1506 * again when the nexus detaches.
1507 */
1508 err = EBUSY;
1509 } else {
1510 err = nx_netif_clean(nif, TRUE);
1511 }
1512
1513#if SK_LOG
1514 if (nsr != NULL) {
1515 uuid_string_t ifuuidstr;
1516 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
1517 "nexus 0x%llx (%s) if_uuid %s flags 0x%x err %d",
1518 SK_KVA(nx), NX_DOM_PROV(nx)->nxdom_prov_name,
1519 sk_uuid_unparse(nsr->nsr_if_uuid, ifuuidstr),
1520 nsr->nsr_flags, err);
1521 } else {
1522 SK_DF(err ? SK_VERB_ERROR : SK_VERB_NETIF,
1523 "nexus 0x%llx (%s) err %d", SK_KVA(nx),
1524 NX_DOM_PROV(nx)->nxdom_prov_name, err);
1525 }
1526#endif /* SK_LOG */
1527
1528 return err;
1529}
1530
1531/*
1532 * XXX
1533 * These checks are copied from fsw.c
1534 * There are no tests exercising this code. Do we still need this?
1535 */
1536SK_NO_INLINE_ATTRIBUTE
1537static int
1538nx_netif_ctl_flow_check(struct nx_netif *nif, nxcfg_cmd_t cmd,
1539 struct proc *p, struct nx_flow_req *req)
1540{
1541#pragma unused(nif)
1542 boolean_t need_check;
1543 int error;
1544
 if (uuid_is_null(req->nfr_flow_uuid)) {
1546 return EINVAL;
1547 }
1548 req->nfr_flags &= NXFLOWREQF_MASK;
1549 req->nfr_flowadv_idx = FLOWADV_IDX_NONE;
1550
1551 if (cmd == NXCFG_CMD_FLOW_DEL) {
1552 return 0;
1553 }
1554 need_check = FALSE;
1555 if (req->nfr_epid != -1 && proc_pid(p) != req->nfr_epid) {
1556 need_check = TRUE;
 } else if (!uuid_is_null(req->nfr_euuid)) {
1558 uuid_t uuid;
1559
1560 /* get the UUID of the issuing process */
1561 proc_getexecutableuuid(p, uuid, sizeof(uuid));
1562
1563 /*
1564 * If this is not issued by a process for its own
1565 * executable UUID and if the process does not have
1566 * the necessary privilege, reject the request.
1567 * The logic is similar to so_set_effective_uuid().
1568 */
 if (uuid_compare(req->nfr_euuid, uuid) != 0) {
1570 need_check = TRUE;
1571 }
1572 }
1573 if (need_check) {
 kauth_cred_t cred = kauth_cred_proc_ref(p);
 error = priv_check_cred(cred,
 PRIV_NET_PRIVILEGED_SOCKET_DELEGATE, 0);
1577 kauth_cred_unref(&cred);
1578 if (error != 0) {
1579 return error;
1580 }
1581 }
1582 return 0;
1583}
1584
1585SK_NO_INLINE_ATTRIBUTE
1586static int
1587nx_netif_ctl_flow_add(struct nx_netif *nif, struct proc *p,
1588 struct nx_flow_req *req)
1589{
1590 int err;
1591
1592 ASSERT(p != PROC_NULL);
 err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_ADD, p, req);
1594 if (err != 0) {
1595 return err;
1596 }
1597
1598 /* init kernel only fields */
1599 nx_flow_req_internalize(req);
1600 req->nfr_context = NULL;
1601 req->nfr_flow_stats = NULL;
1602 req->nfr_port_reservation = NULL;
1603 req->nfr_pid = proc_pid(p);
1604
1605 err = nx_netif_netagent_flow_add(nif, req);
1606 nx_flow_req_externalize(req);
1607 return err;
1608}
1609
1610SK_NO_INLINE_ATTRIBUTE
1611static int
1612nx_netif_ctl_flow_del(struct nx_netif *nif, struct proc *p,
1613 struct nx_flow_req *req)
1614{
1615 int err;
1616
 err = nx_netif_ctl_flow_check(nif, NXCFG_CMD_FLOW_DEL, p, req);
1618 if (err != 0) {
1619 return err;
1620 }
1621
1622 nx_flow_req_internalize(req);
1623 req->nfr_pid = proc_pid(p);
1624
1625 err = nx_netif_netagent_flow_del(nif, req);
1626 nx_flow_req_externalize(req);
1627 return err;
1628}
1629
1630SK_NO_INLINE_ATTRIBUTE
1631static int
1632nx_netif_ctl(struct kern_nexus *nx, nxcfg_cmd_t nc_cmd, void *data,
1633 struct proc *p)
1634{
1635 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
1636 struct nx_spec_req *nsr = data;
1637 struct nx_flow_req *nfr = data;
1638 int error = 0;
1639
1640 SK_LOCK_ASSERT_HELD();
1641
1642 switch (nc_cmd) {
1643 case NXCFG_CMD_ATTACH:
1644 error = nx_netif_ctl_attach(nx, nsr, p);
1645 break;
1646
1647 case NXCFG_CMD_DETACH:
1648 error = nx_netif_ctl_detach(nx, nsr);
1649 break;
1650
1651 case NXCFG_CMD_FLOW_ADD:
 error = nx_netif_ctl_flow_add(nif, p, nfr);
1653 break;
1654
1655 case NXCFG_CMD_FLOW_DEL:
 error = nx_netif_ctl_flow_del(nif, p, nfr);
1657 break;
1658
1659 default:
1660 SK_ERR("invalid cmd %u", nc_cmd);
1661 error = EINVAL;
1662 break;
1663 }
1664 return error;
1665}
1666
1667static void
1668nx_netif_llink_notify(struct kern_nexus *nx, struct netif_llink *llink,
1669 uint32_t flags)
1670{
1671#pragma unused(flags)
1672 struct netif_qset *qset;
1673
1674 SLIST_FOREACH(qset, &llink->nll_qset_list, nqs_list) {
 (void) nx_tx_qset_notify(nx, qset->nqs_ctx);
1676 }
1677}
1678
1679static void
1680nx_netif_llink_notify_all(struct kern_nexus *nx, uint32_t flags)
1681{
1682 struct nx_netif *nif;
1683 struct netif_llink *llink;
1684
1685 nif = NX_NETIF_PRIVATE(nx);
1686
 lck_rw_lock_shared(&nif->nif_llink_lock);
1688 STAILQ_FOREACH(llink, &nif->nif_llink_list, nll_link) {
1689 nx_netif_llink_notify(nx, llink, flags);
1690 }
 lck_rw_unlock_shared(&nif->nif_llink_lock);
1692}
1693
1694/*
1695 * if_start() callback for native Skywalk interfaces, registered
1696 * at ifnet_allocate_extended() time, and invoked by the ifnet
1697 * starter thread.
1698 */
1699static void
1700nx_netif_doorbell_internal(struct ifnet *ifp, uint32_t flags)
1701{
1702 if (__improbable(ifp->if_na == NULL)) {
1703 return;
1704 }
1705
1706 /*
1707 * Do this only if the nexus adapter is active, i.e. a channel
1708 * has been opened to it by the module above (flowswitch, etc.)
1709 */
1710 struct nexus_adapter *hwna = &NA(ifp)->nifna_up;
1711 if (__probable(NA_IS_ACTIVE(hwna))) {
1712 struct kern_nexus *nx = hwna->na_nx;
1713
1714 /* update our work timestamp */
1715 hwna->na_work_ts = _net_uptime;
1716
1717 if (NX_LLINK_PROV(nx)) {
1718 nx_netif_llink_notify_all(nx, flags);
1719 } else {
1720 struct __kern_channel_ring *kring;
1721
1722 /* for doorbell purposes, use TX ring 0 */
1723 kring = &hwna->na_tx_rings[0];
1724
1725 /* Issue a synchronous TX doorbell on the netif device ring */
1726 kring->ckr_na_sync(kring, PROC_NULL,
1727 (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_IFSTART));
1728 }
1729 } else {
1730 struct netif_stats *nifs =
1731 &NX_NETIF_PRIVATE(hwna->na_nx)->nif_stats;
1732 STATS_INC(nifs, NETIF_STATS_DROP_NA_INACTIVE);
1733 }
1734}
1735
1736static void
1737nx_netif_doorbell(struct ifnet *ifp)
1738{
1739 nx_netif_doorbell_internal(ifp, NETIF_XMIT_FLAG_HOST);
1740}
1741
1742/*
1743 * TX sync callback, called from nx_netif_doorbell() where we'd expect to
1744 * perform synchronous TX doorbell to the driver, by invoking the driver's
1745 * doorbell callback directly in the same thread context. It is also called
1746 * when the layer above performs a TX sync operation, where we might need
1747 * to do an asynchronous doorbell instead, by simply calling ifnet_start().
1748 */
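/*
 * In brief, the flags checked below mean:
 *   NA_SYNCF_NETIF           reclaim completed TX slots via nx_sync_tx()
 *   NA_SYNCF_SYNC_ONLY       sync/reclaim only, never ring the doorbell
 *                            (also implied for user-facing rings)
 *   NA_SYNCF_NETIF_DOORBELL  ring the driver doorbell
 *   NA_SYNCF_NETIF_IFSTART   doorbell issued from the ifnet start path, so
 *                            call the driver directly (async if
 *                            NA_SYNCF_NETIF_ASYNC); otherwise fall back to
 *                            ifnet_start()
 */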
1749static int
1750nx_netif_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
1751 uint32_t flags)
1752{
1753#pragma unused(p)
1754 struct ifnet *ifp = KRNA(kring)->na_ifp;
1755 boolean_t sync_only;
1756 int ret = 0;
1757
1758 ASSERT(ifp != NULL);
1759
1760 SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
1761 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
1762 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1763 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
1764 flags);
1765
1766 if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
1767 SK_ERR("kr 0x%llx ifp %s (0x%llx), interface not attached",
1768 SK_KVA(kring), if_name(ifp), SK_KVA(ifp));
1769 return ENXIO;
1770 }
1771
1772 if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
1773 SK_DF(SK_VERB_SYNC | SK_VERB_TX, "kr 0x%llx ifp %s (0x%llx), "
1774 "flow control ON", SK_KVA(kring), if_name(ifp),
1775 SK_KVA(ifp));
1776 return ENXIO;
1777 }
1778
1779 /* update our work timestamp */
1780 KRNA(kring)->na_work_ts = _net_uptime;
1781
1782 sync_only = ((flags & NA_SYNCF_SYNC_ONLY) != 0) ||
1783 !KR_KERNEL_ONLY(kring);
1784 /* regular sync (reclaim) */
1785 if ((flags & NA_SYNCF_NETIF) != 0 || __improbable(sync_only)) {
1786 ret = nx_sync_tx(kring, (flags & NA_SYNCF_FORCE_RECLAIM) ||
1787 kring->ckr_pending_intr != 0);
1788 kring->ckr_pending_intr = 0;
1789
1790 /* direct user channels do not need to use the doorbell */
1791 if (__improbable(sync_only)) {
1792 return ret;
1793 }
1794 }
1795
1796 /*
1797 * Doorbell call. We issue the doorbell explicitly if the flag is
1798 * set, or implicitly if we're opened directly by a user channel.
1799 * It is synchronous or asynchronous depending on the context.
1800 */
1801 if (__probable((flags & NA_SYNCF_NETIF_DOORBELL) != 0)) {
1802 if ((flags & NA_SYNCF_NETIF_IFSTART) != 0) {
1803 ASSERT(!(flags & NA_SYNCF_NETIF_IFSTART) ||
1804 !(flags & NA_SYNCF_NETIF_ASYNC));
1805 nx_tx_doorbell(kring, (flags & NA_SYNCF_NETIF_ASYNC));
1806 } else {
1807 ifnet_start(ifp);
1808 }
1809 }
1810
1811 return ret;
1812}
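/*
 * Illustrative sketch (not part of the build): the reclaim/doorbell decision
 * in nx_netif_na_txsync() above, reduced to a standalone helper so the flag
 * interactions are explicit. The helper name txsync_action_model() and its
 * out-parameters are hypothetical; they are not part of the Skywalk API.
 */
#if 0
static void
txsync_action_model(boolean_t netif_sync, boolean_t sync_only,
    boolean_t doorbell, boolean_t ifstart, boolean_t *do_reclaim,
    boolean_t *driver_doorbell, boolean_t *kick_ifnet_start)
{
	/* nx_sync_tx() runs for netif-initiated syncs and for direct user channels */
	*do_reclaim = (netif_sync || sync_only);
	if (sync_only) {
		/* direct user channels stop after the reclaim; no doorbell */
		*driver_doorbell = *kick_ifnet_start = FALSE;
		return;
	}
	/* in ifnet-start context, ring the driver doorbell directly (nx_tx_doorbell) */
	*driver_doorbell = (doorbell && ifstart);
	/* otherwise defer to the ifnet starter thread (ifnet_start) */
	*kick_ifnet_start = (doorbell && !ifstart);
}
#endif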
1813
1814static int
1815nx_netif_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1816 uint32_t flags)
1817{
1818#pragma unused(p)
1819 int ret;
1820
1821 SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
1822 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0x%x",
1823 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1824 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
1825 flags);
1826
1827 ASSERT(kring->ckr_rhead <= kring->ckr_lim);
1828
1829 /* update our work timestamp */
1830 KRNA(kring)->na_work_ts = _net_uptime;
1831
1832 ret = nx_sync_rx(kring, (flags & NA_SYNCF_FORCE_READ) ||
1833 kring->ckr_pending_intr != 0);
1834 kring->ckr_pending_intr = 0;
1835
1836 return ret;
1837}
1838
1839static void
1840nx_netif_na_dtor(struct nexus_adapter *na)
1841{
1842 struct ifnet *ifp;
1843 struct nexus_netif_adapter *nifna = NIFNA(na);
1844
1845 SK_LOCK_ASSERT_HELD();
1846 ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST);
1847
1848 SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));
1849
1850 /*
1851 * If the finalizer callback hasn't been called for whatever
1852 * reason, pick up the embryonic ifnet stored in na_private.
1853 * Otherwise, release the I/O refcnt of a non-NULL na_ifp.
1854 */
1855 if ((ifp = na->na_ifp) == NULL) {
1856 ifp = na->na_private;
1857 na->na_private = NULL;
1858 } else {
1859 ifnet_decr_iorefcnt(ifp);
1860 na->na_ifp = NULL;
1861 }
1862
1863 if (nifna->nifna_netif != NULL) {
1864 nx_netif_release(nifna->nifna_netif);
1865 nifna->nifna_netif = NULL;
1866 }
1867 ASSERT(SKYWALK_NATIVE(ifp));
1868}
1869
1870/*
1871 * Dispatch rx/tx interrupts to the channel rings.
1872 *
1873 * The 'notify' routine depends on what the ring is attached to.
1874 * - for a channel file descriptor, do an event wakeup on the individual
1875 * waitqueue, plus one on the global one if needed (see na_notify)
1876 * - for a device port connected to a FlowSwitch, call the proper
1877 * forwarding routine; see nx_fsw_tx_hwna_notify()
1878 * or nx_fsw_rx_hwna_notify().
1879 */
1880int
1881nx_netif_common_intr(struct __kern_channel_ring *kring, struct proc *p,
1882 uint32_t flags, uint32_t *work_done)
1883{
1884 struct netif_stats *nifs =
1885 &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
1886 int (*notify)(struct __kern_channel_ring *kring,
1887 struct proc *, uint32_t flags);
1888 int ret;
1889
1890 KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_START), SK_KVA(kring));
1891
1892 SK_DF(SK_VERB_NETIF | SK_VERB_INTR |
1893 ((kring->ckr_tx == NR_RX) ? SK_VERB_RX : SK_VERB_TX),
1894 "na \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b",
1895 KRNA(kring)->na_name, SK_KVA(KRNA(kring)), kring->ckr_name,
1896 SK_KVA(kring), kring->ckr_flags, CKRF_BITS);
1897
1898 /* update our work timestamp */
1899 KRNA(kring)->na_work_ts = _net_uptime;
1900
1901 kring->ckr_pending_intr++;
1902 if (work_done != NULL) {
1903 *work_done = 1; /* do not fire again */
1904 }
1905 /*
1906 * We can't call ckr_na_notify here since we may already be
1907 * intercepting it; otherwise we'd end up calling ourselves recursively.
1908 * Use the original na_notify callback saved during na_activate, or, in
1909 * the case where the module above us is the flowswitch, the notify
1910 * routine that it has installed in place of our original one.
1911 */
1912 if (__probable(!KR_DROP(kring) &&
1913 (notify = kring->ckr_netif_notify) != NULL)) {
1914 ret = notify(kring, p, flags);
1915 } else {
1916 /*
1917 * If the ring is in drop mode, pretend as if it's busy.
1918 * This allows the mitigation thread to pause for a while
1919 * before attempting again.
1920 */
1921 ret = EBUSY;
1922 }
1923 if (__improbable(ret != 0)) {
1924 switch (kring->ckr_tx) {
1925 case NR_RX:
1926 if (ret == EBUSY) {
1927 STATS_INC(nifs, NETIF_STATS_RX_IRQ_BUSY);
1928 } else if (ret == EAGAIN) {
1929 STATS_INC(nifs, NETIF_STATS_RX_IRQ_AGAIN);
1930 } else {
1931 STATS_INC(nifs, NETIF_STATS_RX_IRQ_ERR);
1932 }
1933 break;
1934
1935 case NR_TX:
1936 if (ret == EBUSY) {
1937 STATS_INC(nifs, NETIF_STATS_TX_IRQ_BUSY);
1938 } else if (ret == EAGAIN) {
1939 STATS_INC(nifs, NETIF_STATS_TX_IRQ_AGAIN);
1940 } else {
1941 STATS_INC(nifs, NETIF_STATS_TX_IRQ_ERR);
1942 }
1943 break;
1944
1945 default:
1946 break;
1947 }
1948 }
1949
1950 KDBG((SK_KTRACE_NETIF_COMMON_INTR | DBG_FUNC_END), SK_KVA(kring), ret);
1951
1952 return ret;
1953}
1954
1955static int
1956nx_netif_na_notify_tx(struct __kern_channel_ring *kring, struct proc *p,
1957 uint32_t flags)
1958{
1959 return nx_netif_mit_tx_intr(kring, p, flags, NULL);
1960}
1961
1962static int
1963nx_netif_na_notify_rx(struct __kern_channel_ring *kring, struct proc *p,
1964 uint32_t flags)
1965{
1966 int ret;
1967
1968 /*
1969 * In the event the mitigation thread is disabled, protect
1970 * against recursion by detecting if we're already in the
1971 * context of an RX notify. IOSkywalkFamily may invoke the
1972 * notify callback as part of its RX sync callback.
1973 */
1974 if (__probable(!sk_is_rx_notify_protected())) {
1975 sk_protect_t protect;
1976 uint32_t work_done;
1977
1978 protect = sk_rx_notify_protect();
1979 ret = nx_netif_mit_rx_intr(kring, p, flags, &work_done);
1980 sk_sync_unprotect(protect);
1981 } else {
1982 ret = EAGAIN;
1983 }
1984
1985 return ret;
1986}
1987
1988static int
1989nx_netif_na_notify_rx_redirect(struct __kern_channel_ring *kring, struct proc *p,
1990 uint32_t flags)
1991{
1992 struct netif_stats *nifs =
1993 &NX_NETIF_PRIVATE(KRNA(kring)->na_nx)->nif_stats;
1994 uint32_t work_done;
1995
1996 ASSERT(kring->ckr_tx == NR_RX);
1997 STATS_INC(nifs, NETIF_STATS_RX_IRQ);
1998 return nx_netif_common_intr(kring, p, flags, &work_done);
1999}
2000
2001void
2002nx_netif_mit_config(struct nexus_netif_adapter *nifna,
2003 boolean_t *tx_mit, boolean_t *tx_mit_simple,
2004 boolean_t *rx_mit, boolean_t *rx_mit_simple)
2005{
2006 struct nx_netif *nif = nifna->nifna_netif;
2007
2008 /*
2009 * TX mitigation is disabled by default, but can be
2010 * overridden via "sk_netif_tx_mit=N" boot-arg, where
2011 * N is one of SK_NETIF_MIT_FORCE_* values.
2012 */
2013 *tx_mit = *tx_mit_simple = FALSE;
2014 switch (sk_netif_tx_mit) {
2015 case SK_NETIF_MIT_FORCE_SIMPLE:
2016 *tx_mit_simple = TRUE;
2017 OS_FALLTHROUGH;
2018 case SK_NETIF_MIT_FORCE_ADVANCED:
2019 *tx_mit = TRUE;
2020 break;
2021 case SK_NETIF_MIT_FORCE_OFF:
2022 case SK_NETIF_MIT_AUTO:
2023 ASSERT(*tx_mit == FALSE);
2024 break;
2025 default:
2026 VERIFY(0);
2027 /* NOTREACHED */
2028 __builtin_unreachable();
2029 }
2030
2031 /*
2032 * RX mitigation is enabled by default only for BSD-style
2033 * virtual network interfaces, but can be overridden
2034 * via "sk_netif_rx_mit=N" boot-arg, where N is one of
2035 * SK_NETIF_MIT_FORCE_* values.
2036 */
2037 *rx_mit = *rx_mit_simple = FALSE;
2038 switch (sk_netif_rx_mit) {
2039 case SK_NETIF_MIT_FORCE_OFF:
2040 ASSERT(*rx_mit == FALSE);
2041 break;
2042 case SK_NETIF_MIT_FORCE_SIMPLE:
2043 *rx_mit_simple = TRUE;
2044 OS_FALLTHROUGH;
2045 case SK_NETIF_MIT_FORCE_ADVANCED:
2046 *rx_mit = TRUE;
2047 break;
2048 case SK_NETIF_MIT_AUTO:
2049 *rx_mit_simple = TRUE;
2050 /*
2051 * Enable RX mitigation thread only for BSD-style virtual (and
2052 * regular) interfaces, since otherwise we may run out of stack
2053 * when subjected to IPsec processing, etc.
2054 */
2055 *rx_mit = (NX_PROV(nifna->nifna_up.na_nx)->nxprov_flags &
2056 NXPROVF_VIRTUAL_DEVICE) && !NETIF_IS_LOW_LATENCY(nif);
2057 break;
2058 default:
2059 VERIFY(0);
2060 /* NOTREACHED */
2061 __builtin_unreachable();
2062 }
2063}
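/*
 * For reference, a summary of the mapping implemented above (hedged; the
 * SK_NETIF_MIT_* boot-arg definitions remain authoritative):
 *
 *   boot-arg value        tx_mit        tx_mit_simple
 *   FORCE_OFF, AUTO       FALSE         FALSE
 *   FORCE_SIMPLE          TRUE          TRUE
 *   FORCE_ADVANCED        TRUE          FALSE
 *
 *   boot-arg value        rx_mit                           rx_mit_simple
 *   FORCE_OFF             FALSE                            FALSE
 *   FORCE_SIMPLE          TRUE                             TRUE
 *   FORCE_ADVANCED        TRUE                             FALSE
 *   AUTO                  virtual device && !low-latency   TRUE
 */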
2064
2065static int
2066nx_netif_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
2067{
2068 struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na;
2069 boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple;
2070 struct nx_netif *nif = nifna->nifna_netif;
2071 struct ifnet *ifp = na->na_ifp;
2072 int error = 0;
2073 uint32_t r;
2074
2075 ASSERT(na->na_type == NA_NETIF_DEV);
2076 ASSERT(!(na->na_flags & NAF_HOST_ONLY));
2077
2078 SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s [%s]", na->na_name,
2079 SK_KVA(na), ifp->if_xname, na_activate_mode2str(mode));
2080
2081 switch (mode) {
2082 case NA_ACTIVATE_MODE_ON:
2083 ASSERT(SKYWALK_CAPABLE(ifp));
2084
2085 nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple,
2086 &rx_mit, &rx_mit_simple);
2087
2088 /*
2089 * Init the mitigation support on all the dev TX rings.
2090 */
2091 if (tx_mit) {
2092 nifna->nifna_tx_mit =
2093 skn_alloc_type_array(tx_on, struct nx_netif_mit,
2094 na_get_nrings(na, NR_TX), Z_WAITOK,
2095 skmem_tag_netif_mit);
2096 if (nifna->nifna_tx_mit == NULL) {
2097 SK_ERR("TX mitigation allocation failed");
2098 error = ENOMEM;
2099 goto out;
2100 }
2101 } else {
2102 ASSERT(nifna->nifna_tx_mit == NULL);
2103 }
2104
2105 /*
2106 * Init the mitigation support on all the dev RX rings.
2107 */
2108 if (rx_mit) {
2109 nifna->nifna_rx_mit =
2110 skn_alloc_type_array(rx_on, struct nx_netif_mit,
2111 na_get_nrings(na, NR_RX), Z_WAITOK,
2112 skmem_tag_netif_mit);
2113 if (nifna->nifna_rx_mit == NULL) {
2114 SK_ERR("RX mitigation allocation failed");
2115 if (nifna->nifna_tx_mit != NULL) {
2116 skn_free_type_array(rx_fail,
2117 struct nx_netif_mit,
2118 na_get_nrings(na, NR_TX),
2119 nifna->nifna_tx_mit);
2120 nifna->nifna_tx_mit = NULL;
2121 }
2122 error = ENOMEM;
2123 goto out;
2124 }
2125 } else {
2126 ASSERT(nifna->nifna_rx_mit == NULL);
2127 }
2128
2129 /* intercept na_notify callback on the TX rings */
2130 for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
2131 na->na_tx_rings[r].ckr_netif_notify =
2132 na->na_tx_rings[r].ckr_na_notify;
2133 na->na_tx_rings[r].ckr_na_notify =
2134 nx_netif_na_notify_tx;
2135 if (nifna->nifna_tx_mit != NULL) {
2136 nx_netif_mit_init(nif, ifp,
2137 &nifna->nifna_tx_mit[r],
2138 &na->na_tx_rings[r], tx_mit_simple);
2139 }
2140 }
2141
2142 /* intercept na_notify callback on the RX rings */
2143 for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
2144 na->na_rx_rings[r].ckr_netif_notify =
2145 na->na_rx_rings[r].ckr_na_notify;
2146 na->na_rx_rings[r].ckr_na_notify = IFNET_IS_REDIRECT(ifp) ?
2147 nx_netif_na_notify_rx_redirect : nx_netif_na_notify_rx;
2148 if (nifna->nifna_rx_mit != NULL) {
2149 nx_netif_mit_init(nif, ifp,
2150 &nifna->nifna_rx_mit[r],
2151 &na->na_rx_rings[r], rx_mit_simple);
2152 }
2153 }
2154 nx_netif_filter_enable(nif);
2155 nx_netif_flow_enable(nif);
2156 os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);
2157
2158 /* steer all start requests to netif; this must not fail */
2159 lck_mtx_lock(&ifp->if_start_lock);
2160 error = ifnet_set_start_handler(ifp, nx_netif_doorbell);
2161 VERIFY(error == 0);
2162 lck_mtx_unlock(&ifp->if_start_lock);
2163 break;
2164
2165 case NA_ACTIVATE_MODE_DEFUNCT:
2166 ASSERT(SKYWALK_CAPABLE(ifp));
2167 break;
2168
2169 case NA_ACTIVATE_MODE_OFF:
2170 /*
2171 * Note that here we cannot assert SKYWALK_CAPABLE()
2172 * as we're called in the destructor path.
2173 */
2174 os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
2175 nx_netif_flow_disable(nif);
2176 nx_netif_filter_disable(nif);
2177
2178 /*
2179 * Here we may block while holding sk_lock, but because
2180 * we've cleared NAF_ACTIVE above, kern_channel_tx_refill()
2181 * should immediately return. A better approach would be
2182 * to drop sk_lock and add a monitor for this routine.
2183 */
2184 lck_mtx_lock(&ifp->if_start_lock);
2185 while (ifp->if_start_active != 0) {
2186 ++ifp->if_start_waiters;
2187 (void) msleep(&ifp->if_start_waiters,
2188 &ifp->if_start_lock, (PZERO - 1),
2189 na->na_name, NULL);
2190 }
2191 /* steer all start requests to default handler */
2192 ifnet_reset_start_handler(ifp);
2193 lck_mtx_unlock(&ifp->if_start_lock);
2194
2195 /* reset all TX notify callbacks */
2196 for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
2197 na->na_tx_rings[r].ckr_na_notify =
2198 na->na_tx_rings[r].ckr_netif_notify;
2199 na->na_tx_rings[r].ckr_netif_notify = NULL;
2200 if (nifna->nifna_tx_mit != NULL) {
2201 na->na_tx_rings[r].ckr_netif_mit_stats = NULL;
2202 nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]);
2203 }
2204 }
2205
2206 if (nifna->nifna_tx_mit != NULL) {
2207 skn_free_type_array(tx_off, struct nx_netif_mit,
2208 na_get_nrings(na, NR_TX), nifna->nifna_tx_mit);
2209 nifna->nifna_tx_mit = NULL;
2210 }
2211
2212 /* reset all RX notify callbacks */
2213 for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
2214 na->na_rx_rings[r].ckr_na_notify =
2215 na->na_rx_rings[r].ckr_netif_notify;
2216 na->na_rx_rings[r].ckr_netif_notify = NULL;
2217 if (nifna->nifna_rx_mit != NULL) {
2218 na->na_rx_rings[r].ckr_netif_mit_stats = NULL;
2219 nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]);
2220 }
2221 }
2222 if (nifna->nifna_rx_mit != NULL) {
2223 skn_free_type_array(rx_off, struct nx_netif_mit,
2224 na_get_nrings(na, NR_RX), nifna->nifna_rx_mit);
2225 nifna->nifna_rx_mit = NULL;
2226 }
2227 break;
2228
2229 default:
2230 VERIFY(0);
2231 /* NOTREACHED */
2232 __builtin_unreachable();
2233 }
2234out:
2235 return error;
2236}
2237
2238SK_NO_INLINE_ATTRIBUTE
2239static int
2240nx_netif_attach(struct kern_nexus *nx, struct ifnet *ifp)
2241__attribute__((optnone))
2242{
2243 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
2244 struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params;
2245 struct nexus_netif_adapter *devnifna = NULL;
2246 struct nexus_netif_adapter *hostnifna = NULL;
2247 struct nexus_adapter *devna = NULL;
2248 struct nexus_adapter *hostna = NULL;
2249 boolean_t embryonic = FALSE;
2250 int retval = 0;
2251 uint32_t na_flags;
2252
2253 SK_LOCK_ASSERT_HELD();
2254 ASSERT(SKYWALK_NATIVE(ifp));
2255 ASSERT(!SKYWALK_CAPABLE(ifp));
2256 ASSERT(ifp->if_na == NULL);
2257 ASSERT(ifp->if_na_ops == NULL);
2258
2259 devnifna = na_netif_alloc(Z_WAITOK);
2260 hostnifna = na_netif_alloc(Z_WAITOK);
2261
2262 /*
2263 * We can be called for two different interface states:
2264 *
2265 * Fully attached: get an io ref count; upon success, this
2266 * holds a reference to the ifnet for the ifp pointer stored
2267 * in 'na_ifp' down below for both adapters.
2268 *
2269 * Embryonic: temporarily hold the ifnet in na_private, which
2270 * upon a successful ifnet_attach(), will be moved over to
2271 * the 'na_ifp' with an io ref count held.
2272 *
2273 * The ifnet in 'na_ifp' will be released by na_release_locked().
2274 */
2275 if (!ifnet_is_attached(ifp, 1)) {
2276 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
2277 ifp = NULL;
2278 retval = ENXIO;
2279 goto err;
2280 }
2281 embryonic = TRUE;
2282 }
2283
2284 /* initialize the device netif adapter */
2285 devnifna->nifna_netif = nif;
2286 nx_netif_retain(nif);
2287 devna = &devnifna->nifna_up;
2288 devna->na_type = NA_NETIF_DEV;
2289 devna->na_free = na_netif_free;
2290 (void) strncpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name) - 1);
2291 devna->na_name[sizeof(devna->na_name) - 1] = '\0';
2292 uuid_generate_random(devna->na_uuid);
2293 if (embryonic) {
2294 /*
2295 * We will move this over to na_ifp once
2296 * the interface is fully attached.
2297 */
2298 devna->na_private = ifp;
2299 ASSERT(devna->na_ifp == NULL);
2300 } else {
2301 ASSERT(devna->na_private == NULL);
2302 /* use I/O refcnt from ifnet_is_attached() */
2303 devna->na_ifp = ifp;
2304 }
2305 devna->na_activate = nx_netif_na_activate;
2306 devna->na_txsync = nx_netif_na_txsync;
2307 devna->na_rxsync = nx_netif_na_rxsync;
2308 devna->na_dtor = nx_netif_na_dtor;
2309 devna->na_krings_create = nx_netif_dev_krings_create;
2310 devna->na_krings_delete = nx_netif_dev_krings_delete;
2311 devna->na_special = nx_netif_na_special;
2312
2313 na_flags = NAF_NATIVE;
2314 if (NX_PROV(nx)->nxprov_flags & NXPROVF_VIRTUAL_DEVICE) {
2315 na_flags |= NAF_VIRTUAL_DEVICE;
2316 }
2317 if (NX_LLINK_PROV(nx)) {
2318 /*
2319 * while operating in logical link mode, we don't need to
2320 * create backing memory regions for the rings as they are
2321 * not used.
2322 */
2323 na_flags |= NAF_MEM_NO_INIT;
2324 }
2325 os_atomic_or(&devna->na_flags, na_flags, relaxed);
2326 *(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type =
2327 NEXUS_STATS_TYPE_INVALID;
2328
2329 na_set_nrings(devna, NR_TX, nxp->nxp_tx_rings);
2330 na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
2331 na_set_nslots(devna, NR_TX, nxp->nxp_tx_slots);
2332 na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
2333 /*
2334 * Verify upper bounds; the parameters must have already been
2335 * validated by nxdom_prov_params() by the time we get here.
2336 */
2337 ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max);
2338 ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max);
2339 ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max);
2340 ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max);
2341
2342 na_attach_common(devna, nx, &nx_netif_prov_s);
2343
2344 if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
2345 nx, devna)) != 0) {
2346 ASSERT(devna->na_arena == NULL);
2347 goto err;
2348 }
2349 ASSERT(devna->na_arena != NULL);
2350
2351 *(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max;
2352 ASSERT(devna->na_flowadv_max == 0 ||
2353 skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL);
2354
2355 /* setup packet copy routines */
2356 if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
2357 nif->nif_pkt_copy_from_mbuf = pkt_copy_multi_buflet_from_mbuf;
2358 nif->nif_pkt_copy_to_mbuf = pkt_copy_multi_buflet_to_mbuf;
2359 nif->nif_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
2360 } else {
2361 nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf;
2362 nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf;
2363 nif->nif_pkt_copy_from_pkt = pkt_copy_from_pkt;
2364 }
2365
2366 /* initialize the host netif adapter */
2367 hostnifna->nifna_netif = nif;
2368 nx_netif_retain(nif);
2369 hostna = &hostnifna->nifna_up;
2370 (void) snprintf(hostna->na_name, sizeof(hostna->na_name),
2371 "%s^", devna->na_name);
2372 uuid_generate_random(hostna->na_uuid);
2373 if (embryonic) {
2374 /*
2375 * We will move this over to na_ifp once
2376 * the interface is fully attached.
2377 */
2378 hostna->na_private = ifp;
2379 ASSERT(hostna->na_ifp == NULL);
2380 } else {
2381 ASSERT(hostna->na_private == NULL);
2382 hostna->na_ifp = devna->na_ifp;
2383 ifnet_incr_iorefcnt(hostna->na_ifp);
2384 }
2385 hostna->na_type = NA_NETIF_HOST;
2386 hostna->na_free = na_netif_free;
2387 hostna->na_activate = nx_netif_host_na_activate;
2388 hostna->na_txsync = nx_netif_host_na_txsync;
2389 hostna->na_rxsync = nx_netif_host_na_rxsync;
2390 hostna->na_dtor = nx_netif_na_dtor;
2391 hostna->na_krings_create = nx_netif_host_krings_create;
2392 hostna->na_krings_delete = nx_netif_host_krings_delete;
2393 hostna->na_special = nx_netif_host_na_special;
2394
2395 na_flags = NAF_HOST_ONLY | NAF_NATIVE;
2396 if (NX_LLINK_PROV(nx)) {
2397 /*
2398 * while operating in logical link mode, we don't need to
2399 * create backing memory regions for the rings as they are
2400 * not used.
2401 */
2402 na_flags |= NAF_MEM_NO_INIT;
2403 }
2404 os_atomic_or(&hostna->na_flags, na_flags, relaxed);
2405 *(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type =
2406 NEXUS_STATS_TYPE_INVALID;
2407
2408 na_set_nrings(hostna, NR_TX, 1);
2409 na_set_nrings(hostna, NR_RX, 1);
2410 na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
2411 na_set_nslots(hostna, NR_RX, nxp->nxp_rx_slots);
2412
2413 na_attach_common(hostna, nx, &nx_netif_prov_s);
2414
2415 if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx),
2416 nx, hostna)) != 0) {
2417 ASSERT(hostna->na_arena == NULL);
2418 goto err;
2419 }
2420 ASSERT(hostna->na_arena != NULL);
2421
2422 *(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max;
2423 ASSERT(hostna->na_flowadv_max == 0 ||
2424 skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL);
2425
2426 /* adjust the classq packet drop limit */
2427 if (embryonic) {
2428 uint32_t drop_lim;
2429 struct kern_pbufpool_memory_info pp_info;
2430
2431 retval = kern_pbufpool_get_memory_info(nx->nx_tx_pp, &pp_info);
2432 VERIFY(retval == 0);
2433
2434 /* set the drop limit as 80% of size of packet pool */
2435 drop_lim = (pp_info.kpm_packets * 4) / 5;
2436 VERIFY(drop_lim != 0);
2437 IFCQ_PKT_DROP_LIMIT(ifp->if_snd) = drop_lim;
2438 }
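/*
 * Worked example (illustrative): with a hypothetical pool of 2048
 * packets, drop_lim = (2048 * 4) / 5 = 1638, i.e. the classq may
 * hold at most roughly 80% of the pool before dropping.
 */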
2439
2440 /* these will be undone by destructor */
2441 ifp->if_na_ops = &na_netif_ops;
2442 ifp->if_na = devnifna;
2443 na_retain_locked(devna);
2444 na_retain_locked(hostna);
2445
2446 SKYWALK_SET_CAPABLE(ifp);
2447
2448 NETIF_WLOCK(nif);
2449 nif->nif_ifp = ifp;
2450 nif->nif_netif_nxadv = nx->nx_adv.netif_nxv_adv;
2451 retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna,
2452 kernproc);
2453 ASSERT(retval == 0);
2454 retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna,
2455 kernproc);
2456 ASSERT(retval == 0);
2457 NETIF_WUNLOCK(nif);
2458
2459#if SK_LOG
2460 uuid_string_t uuidstr;
2461 SK_DF(SK_VERB_NETIF, "devna: \"%s\"", devna->na_name);
2462 SK_DF(SK_VERB_NETIF, " UUID: %s",
2463 sk_uuid_unparse(devna->na_uuid, uuidstr));
2464 SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")",
2465 SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name,
2466 NX_DOM_PROV(devna->na_nx)->nxdom_prov_name);
2467 SK_DF(SK_VERB_NETIF, " flags: 0x%b", devna->na_flags, NAF_BITS);
2468 SK_DF(SK_VERB_NETIF, " flowadv_max: %u", devna->na_flowadv_max);
2469 SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u",
2470 na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX));
2471 SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u",
2472 na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX));
2473#if CONFIG_NEXUS_USER_PIPE
2474 SK_DF(SK_VERB_NETIF, " next_pipe: %u", devna->na_next_pipe);
2475 SK_DF(SK_VERB_NETIF, " max_pipes: %u", devna->na_max_pipes);
2476#endif /* CONFIG_NEXUS_USER_PIPE */
2477 SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]",
2478 SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
2479 SK_DF(SK_VERB_NETIF, "hostna: \"%s\"", hostna->na_name);
2480 SK_DF(SK_VERB_NETIF, " UUID: %s",
2481 sk_uuid_unparse(hostna->na_uuid, uuidstr));
2482 SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")",
2483 SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name,
2484 NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name);
2485 SK_DF(SK_VERB_NETIF, " flags: 0x%b",
2486 hostna->na_flags, NAF_BITS);
2487 SK_DF(SK_VERB_NETIF, " flowadv_max: %u", hostna->na_flowadv_max);
2488 SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u",
2489 na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX));
2490 SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u",
2491 na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX));
2492#if CONFIG_NEXUS_USER_PIPE
2493 SK_DF(SK_VERB_NETIF, " next_pipe: %u", hostna->na_next_pipe);
2494 SK_DF(SK_VERB_NETIF, " max_pipes: %u", hostna->na_max_pipes);
2495#endif /* CONFIG_NEXUS_USER_PIPE */
2496 SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]",
2497 SK_KVA(ifp), ifp->if_xname, ifp->if_refio);
2498#endif /* SK_LOG */
2499
2500err:
2501 if (retval != 0) {
2502 if (ifp != NULL) {
2503 if (!embryonic) {
2504 ifnet_decr_iorefcnt(ifp);
2505 }
2506 ifp = NULL;
2507 }
2508 if (devna != NULL) {
2509 if (devna->na_arena != NULL) {
2510 skmem_arena_release(devna->na_arena);
2511 devna->na_arena = NULL;
2512 }
2513 if (devna->na_ifp != NULL) {
2514 ifnet_decr_iorefcnt(devna->na_ifp);
2515 devna->na_ifp = NULL;
2516 }
2517 devna->na_private = NULL;
2518 }
2519 if (hostna != NULL) {
2520 if (hostna->na_arena != NULL) {
2521 skmem_arena_release(hostna->na_arena);
2522 hostna->na_arena = NULL;
2523 }
2524 if (hostna->na_ifp != NULL) {
2525 ifnet_decr_iorefcnt(hostna->na_ifp);
2526 hostna->na_ifp = NULL;
2527 }
2528 hostna->na_private = NULL;
2529 }
2530 if (devnifna != NULL) {
2531 if (devnifna->nifna_netif != NULL) {
2532 nx_netif_release(devnifna->nifna_netif);
2533 devnifna->nifna_netif = NULL;
2534 }
2535 na_netif_free((struct nexus_adapter *)devnifna);
2536 }
2537 if (hostnifna != NULL) {
2538 if (hostnifna->nifna_netif != NULL) {
2539 nx_netif_release(hostnifna->nifna_netif);
2540 hostnifna->nifna_netif = NULL;
2541 }
2542 na_netif_free((struct nexus_adapter *)hostnifna);
2543 }
2544 }
2545 return retval;
2546}
2547
2548/*
2549 * Any per-netif state that can be discovered at attach time should be
2550 * initialized here.
2551 */
2552static void
2553nx_netif_flags_init(struct nx_netif *nif)
2554{
2555 ifnet_t ifp = nif->nif_ifp;
2556 struct kern_nexus *nx = nif->nif_nx;
2557 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2558
2559 switch (devna->na_type) {
2560 case NA_NETIF_DEV:
2561 if (strcmp(ifp->if_name, sk_ll_prefix) == 0) {
2562 nif->nif_flags |= NETIF_FLAG_LOW_LATENCY;
2563 if_set_xflags(ifp, IFXF_LOW_LATENCY);
2564 }
2565 break;
2566 case NA_NETIF_COMPAT_DEV:
2567 nif->nif_flags |= NETIF_FLAG_COMPAT;
2568 break;
2569 default:
2570 break;
2571 }
2572}
2573
2574/*
2575 * This is also supposed to check for any inconsistent state at detach time.
2576 */
2577static void
2578nx_netif_flags_fini(struct nx_netif *nif)
2579{
2580 ifnet_t ifp = nif->nif_ifp;
2581
2582 if (ifp != NULL) {
2583 if_clear_xflags(ifp, IFXF_LOW_LATENCY);
2584 }
2585 nif->nif_flags = 0;
2586}
2587
2588SK_NO_INLINE_ATTRIBUTE
2589static void
2590nx_netif_callbacks_init(struct nx_netif *nif)
2591{
2592 ifnet_t ifp = nif->nif_ifp;
2593
2594 /*
2595 * XXX
2596 * This function is meant to be called by na_netif_finalize(), which is
2597 * called by ifnet_attach() while holding if_lock exclusively.
2598 */
2599 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
2600 if (ifnet_is_low_latency(ifp)) {
2601 ifnet_set_detach_notify_locked(ifp,
2602 nx_netif_llw_detach_notify, ifp->if_na);
2603 }
2604}
2605
2606SK_NO_INLINE_ATTRIBUTE
2607static void
2608nx_netif_callbacks_fini(struct nx_netif *nif)
2609{
2610 ifnet_t ifp = nif->nif_ifp;
2611
2612 if (ifnet_is_low_latency(ifp)) {
2613 ifnet_set_detach_notify(ifp, NULL, NULL);
2614 }
2615}
2616
2617static void
2618configure_capab_interface_advisory(struct nx_netif *nif,
2619 nxprov_capab_config_fn_t capab_fn)
2620{
2621 struct kern_nexus_capab_interface_advisory capab;
2622 struct kern_nexus *nx = nif->nif_nx;
2623 uint32_t capab_len;
2624 int error;
2625
2626 /* check/configure interface advisory notifications */
2627 if ((nif->nif_ifp->if_eflags & IFEF_ADV_REPORT) == 0) {
2628 return;
2629 }
2630 bzero(&capab, sizeof(capab));
2631 capab.kncia_version =
2632 KERN_NEXUS_CAPAB_INTERFACE_ADVISORY_VERSION_1;
2633 *__DECONST(kern_nexus_capab_interface_advisory_notify_fn_t *,
2634 &(capab.kncia_notify)) = nx_netif_interface_advisory_notify;
2635 *__DECONST(void **, &(capab.kncia_kern_context)) = nx;
2636 capab_len = sizeof(capab);
2637 error = capab_fn(NX_PROV(nx), nx,
2638 KERN_NEXUS_CAPAB_INTERFACE_ADVISORY, &capab, &capab_len);
2639 if (error != 0) {
2640 DTRACE_SKYWALK2(interface__advisory__capab__error,
2641 struct nx_netif *, nif, int, error);
2642 return;
2643 }
2644 VERIFY(capab.kncia_config != NULL);
2645 VERIFY(capab.kncia_provider_context != NULL);
2646 nif->nif_intf_adv_config = capab.kncia_config;
2647 nif->nif_intf_adv_prov_ctx = capab.kncia_provider_context;
2648 nif->nif_extended_capabilities |= NETIF_CAPAB_INTERFACE_ADVISORY;
2649}
2650
2651static void
2652unconfigure_capab_interface_advisory(struct nx_netif *nif)
2653{
2654 if ((nif->nif_extended_capabilities & NETIF_CAPAB_INTERFACE_ADVISORY) == 0) {
2655 return;
2656 }
2657 nif->nif_intf_adv_config = NULL;
2658 nif->nif_intf_adv_prov_ctx = NULL;
2659 nif->nif_extended_capabilities &= ~NETIF_CAPAB_INTERFACE_ADVISORY;
2660}
2661
2662static void
2663configure_capab_qset_extensions(struct nx_netif *nif,
2664 nxprov_capab_config_fn_t capab_fn)
2665{
2666 struct kern_nexus_capab_qset_extensions capab;
2667 struct kern_nexus *nx = nif->nif_nx;
2668 uint32_t capab_len;
2669 int error;
2670
2671 if (!NX_LLINK_PROV(nx)) {
2672 DTRACE_SKYWALK1(not__llink__prov, struct nx_netif *, nif);
2673 return;
2674 }
2675 bzero(&capab, sizeof(capab));
2676 capab.cqe_version = KERN_NEXUS_CAPAB_QSET_EXTENSIONS_VERSION_1;
2677 capab_len = sizeof(capab);
2678 error = capab_fn(NX_PROV(nx), nx,
2679 KERN_NEXUS_CAPAB_QSET_EXTENSIONS, &capab, &capab_len);
2680 if (error != 0) {
2681 DTRACE_SKYWALK2(qset__extensions__capab__error,
2682 struct nx_netif *, nif, int, error);
2683 return;
2684 }
2685 VERIFY(capab.cqe_notify_steering_info != NULL);
2686 VERIFY(capab.cqe_prov_ctx != NULL);
2687 nif->nif_qset_extensions.qe_notify_steering_info =
2688 capab.cqe_notify_steering_info;
2689 nif->nif_qset_extensions.qe_prov_ctx = capab.cqe_prov_ctx;
2690 nif->nif_extended_capabilities |= NETIF_CAPAB_QSET_EXTENSIONS;
2691}
2692
2693static void
2694unconfigure_capab_qset_extensions(struct nx_netif *nif)
2695{
2696 if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2697 return;
2698 }
2699 bzero(&nif->nif_qset_extensions, sizeof(nif->nif_qset_extensions));
2700 nif->nif_extended_capabilities &= ~NETIF_CAPAB_QSET_EXTENSIONS;
2701}
2702
2703int
2704nx_netif_notify_steering_info(struct nx_netif *nif, struct netif_qset *qset,
2705 struct ifnet_traffic_descriptor_common *td, bool add)
2706{
2707 struct netif_qset_extensions *qset_ext;
2708 int err;
2709
2710 if ((nif->nif_extended_capabilities & NETIF_CAPAB_QSET_EXTENSIONS) == 0) {
2711 return ENOTSUP;
2712 }
2713 qset_ext = &nif->nif_qset_extensions;
2714 VERIFY(qset_ext->qe_prov_ctx != NULL);
2715 VERIFY(qset_ext->qe_notify_steering_info != NULL);
2716 err = qset_ext->qe_notify_steering_info(qset_ext->qe_prov_ctx,
2717 qset->nqs_ctx, td, add);
2718 return err;
2719}
2720
2721static void
2722nx_netif_capabilities_init(struct nx_netif *nif)
2723{
2724 struct kern_nexus *nx = nif->nif_nx;
2725 nxprov_capab_config_fn_t capab_fn;
2726
2727 if ((NX_PROV(nx)->nxprov_netif_ext.nxnpi_version) ==
2728 KERN_NEXUS_PROVIDER_VERSION_NETIF) {
2729 capab_fn = NX_PROV(nx)->nxprov_netif_ext.nxnpi_config_capab;
2730 ASSERT(capab_fn != NULL);
2731 } else {
2732 capab_fn = NX_PROV(nx)->nxprov_ext.nxpi_config_capab;
2733 }
2734 if (capab_fn == NULL) {
2735 return;
2736 }
2737 configure_capab_interface_advisory(nif, capab_fn);
2738 configure_capab_qset_extensions(nif, capab_fn);
2739}
2740
2741static void
2742nx_netif_capabilities_fini(struct nx_netif *nif)
2743{
2744 unconfigure_capab_interface_advisory(nif);
2745 unconfigure_capab_qset_extensions(nif);
2746}
2747
2748static void
2749nx_netif_verify_tso_config(struct nx_netif *nif)
2750{
2751 ifnet_t ifp = nif->nif_ifp;
2752 uint32_t tso_v4_mtu = 0;
2753 uint32_t tso_v6_mtu = 0;
2754
2755 /*
2756 * Compat interfaces always use 128-byte buffers on the device packet
2757 * pool side (for holding headers for classification), so there is no
2758 * need to check the size here.
2759 */
2760 if (!SKYWALK_NATIVE(ifp)) {
2761 return;
2762 }
2763
2764 if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
2765 tso_v4_mtu = ifp->if_tso_v4_mtu;
2766 }
2767 if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
2768 tso_v6_mtu = ifp->if_tso_v6_mtu;
2769 }
2770 VERIFY(PP_BUF_SIZE_DEF(nif->nif_nx->nx_tx_pp) >=
2771 max(tso_v4_mtu, tso_v6_mtu));
2772}
2773
2774void
2775na_netif_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp)
2776{
2777 struct nx_netif *nif = nifna->nifna_netif;
2778 struct kern_nexus *nx = nif->nif_nx;
2779 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2780 struct nexus_adapter *hostna = nx_port_get_na(nx,
2781 NEXUS_PORT_NET_IF_HOST);
2782
2783 ASSERT(devna != NULL);
2784 ASSERT(hostna != NULL);
2785
2786 if (!ifnet_is_attached(ifp, 1)) {
2787 VERIFY(0);
2788 /* NOTREACHED */
2789 __builtin_unreachable();
2790 }
2791
2792 ASSERT(devna->na_private == ifp);
2793 ASSERT(devna->na_ifp == NULL);
2794 /* use I/O refcnt held by ifnet_is_attached() above */
2795 devna->na_ifp = devna->na_private;
2796 devna->na_private = NULL;
2797
2798 ASSERT(hostna->na_private == ifp);
2799 ASSERT(hostna->na_ifp == NULL);
2800 hostna->na_ifp = hostna->na_private;
2801 hostna->na_private = NULL;
2802 ifnet_incr_iorefcnt(hostna->na_ifp);
2803
2804 nx_netif_flags_init(nif);
2805 nx_netif_llink_init(nif);
2806 nx_netif_filter_init(nif);
2807 nx_netif_flow_init(nif);
2808 nx_netif_capabilities_init(nif);
2809 nx_netif_agent_init(nif);
2810 (void) nxctl_inet_traffic_rule_get_count(ifp->if_xname,
2811 &ifp->if_traffic_rule_count);
2812 nx_netif_verify_tso_config(nif);
2813 nx_netif_callbacks_init(nif);
2814}
2815
2816void
2817nx_netif_reap(struct nexus_netif_adapter *nifna, struct ifnet *ifp,
2818 uint32_t thres, boolean_t low)
2819{
2820#pragma unused(ifp)
2821 struct nx_netif *nif = nifna->nifna_netif;
2822 struct kern_nexus *nx = nif->nif_nx;
2823 struct nexus_adapter *devna = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
2824 uint64_t now = _net_uptime;
2825 boolean_t purge;
2826
2827 ASSERT(thres != 0);
2828
2829 if (devna->na_work_ts == 0) {
2830 return;
2831 }
2832
2833 /*
2834 * Purge if it has been inactive for some time (twice the drain
2835 * threshold), and clear the work timestamp to temporarily skip this
2836 * adapter until it's active again. Purging cached objects can be
2837 * expensive since we'd need to allocate and construct them again,
2838 * so we do it only when necessary.
2839 */
2840 if (low || (now - devna->na_work_ts) >= (thres << 1)) {
2841 devna->na_work_ts = 0;
2842 purge = TRUE;
2843 } else {
2844 purge = FALSE;
2845 }
2846
2847 SK_DF(SK_VERB_NETIF, "%s: %s na %s", ifp->if_xname,
2848 (purge ? "purging" : "pruning"), devna->na_name);
2849
2850 /*
2851 * Device and host adapters share the same packet buffer pool,
2852 * so just reap the arena belonging to the device instance.
2853 */
2854 skmem_arena_reap(devna->na_arena, purge);
2855}
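/*
 * Illustrative sketch (not part of the build): the purge-vs-prune decision
 * used in nx_netif_reap() above, isolated as a standalone predicate. The
 * helper name reap_should_purge() is hypothetical; 'thres' and the
 * timestamps are in the same unit as _net_uptime.
 */
#if 0
static boolean_t
reap_should_purge(uint64_t now, uint64_t work_ts, uint32_t thres,
    boolean_t low_memory)
{
	/* purge under memory pressure, or after >= 2 * thres of inactivity */
	return low_memory || (now - work_ts) >= ((uint64_t)thres << 1);
}
#endif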
2856
2857/*
2858 * The purpose of this callback is to forcibly remove resources held by VPNAs
2859 * in the event of an interface detach. Without this callback an application can
2860 * prevent the detach from completing indefinitely. Note that this is only needed
2861 * for low-latency VPNAs. Userspace does get notified about interface detach events
2862 * for other NA types (custom ether and filter) and will do the necessary cleanup.
2863 * The cleanup is done in two phases:
2864 * 1) VPNAs channels are defuncted. This releases the resources held by VPNAs and
2865 * causes the device channel to be closed. All ifnet references held by VPNAs
2866 * are also released.
2867 * 2) This cleans up the netif nexus and releases the two remaining ifnet
2868 * references held by the device and host ports (nx_netif_clean()).
2869 */
2870void
2871nx_netif_llw_detach_notify(void *arg)
2872{
2873 struct nexus_netif_adapter *nifna = arg;
2874 struct nx_netif *nif = nifna->nifna_netif;
2875 struct kern_nexus *nx = nif->nif_nx;
2876 struct kern_channel **ch_list = NULL;
2877 struct kern_channel *ch;
2878 int err, i, all_ch_cnt = 0, vp_ch_cnt = 0;
2879 struct proc *p;
2880
2881 ASSERT(NETIF_IS_LOW_LATENCY(nif));
2882 /*
2883 * kern_channel_defunct() requires sk_lock to be not held. We
2884 * will first find the list of channels we want to defunct and
2885 * then call kern_channel_defunct() on each of them. The number
2886 * of channels cannot increase after sk_lock is released since
2887 * this interface is being detached.
2888 */
2889 SK_LOCK();
2890 all_ch_cnt = nx->nx_ch_count;
2891 if (all_ch_cnt == 0) {
2892 DTRACE_SKYWALK1(no__channel, struct kern_nexus *, nx);
2893 SK_UNLOCK();
2894 return;
2895 }
2896 ch_list = sk_alloc_type_array(struct kern_channel *, all_ch_cnt,
2897 Z_WAITOK | Z_NOFAIL, skmem_tag_netif_temp);
2898
2899 STAILQ_FOREACH(ch, &nx->nx_ch_head, ch_link) {
2900 struct nexus_adapter *na = ch->ch_na;
2901
2902 if (na != NULL && na->na_type == NA_NETIF_VP) {
2903 ASSERT(vp_ch_cnt < all_ch_cnt);
2904
2905 /* retain channel to prevent it from being freed */
2906 ch_retain_locked(ch);
2907 ch_list[vp_ch_cnt] = ch;
2908 DTRACE_SKYWALK3(vp__ch__found, struct kern_nexus *, nx,
2909 struct kern_channel *, ch, struct nexus_adapter *, na);
2910 vp_ch_cnt++;
2911 }
2912 }
2913 if (vp_ch_cnt == 0) {
2914 DTRACE_SKYWALK1(vp__ch__not__found, struct kern_nexus *, nx);
2915 sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);
2916 SK_UNLOCK();
2917 return;
2918 }
2919 /* prevents the netif from being freed */
2920 nx_netif_retain(nif);
2921 SK_UNLOCK();
2922
2923 for (i = 0; i < vp_ch_cnt; i++) {
2924 ch = ch_list[i];
2925 p = proc_find(ch->ch_pid);
2926 if (p == NULL) {
2927 SK_ERR("ch 0x%llx pid %d not found", SK_KVA(ch), ch->ch_pid);
2928 DTRACE_SKYWALK3(ch__pid__not__found, struct kern_nexus *, nx,
2929 struct kern_channel *, ch, pid_t, ch->ch_pid);
2930 ch_release(ch);
2931 continue;
2932 }
2933 /*
2934 * It is possible for the channel to be closed before defunct gets
2935 * called. We need to get the fd lock here to ensure that the check
2936 * for the closed state and the calling of channel defunct are done
2937 * atomically.
2938 */
2939 proc_fdlock(p);
2940 if ((ch->ch_flags & CHANF_ATTACHED) != 0) {
2941 kern_channel_defunct(p, ch);
2942 }
2943 proc_fdunlock(p);
2944 proc_rele(p);
2945 ch_release(ch);
2946 }
2947 sk_free_type_array(struct kern_channel *, all_ch_cnt, ch_list);
2948
2949 SK_LOCK();
2950 /*
2951 * Quiescing is not needed because:
2952 * The defuncting above ensures that no more tx syncs could enter.
2953 * The driver layer ensures that ifnet_detach() (this path) does not get
2954 * called until RX upcalls have returned.
2955 *
2956 * Before sk_lock is reacquired above, userspace could close its channels
2957 * and cause the nexus's destructor to be called. This is fine because we
2958 * have retained the nif so it can't disappear.
2959 */
2960 err = nx_netif_clean(nif, FALSE);
2961 if (err != 0) {
2962 SK_ERR("netif clean failed: err %d", err);
2963 DTRACE_SKYWALK2(nif__clean__failed, struct nx_netif *, nif, int, err);
2964 }
2965 nx_netif_release(nif);
2966 SK_UNLOCK();
2967}
2968
2969void
2970nx_netif_copy_stats(struct nexus_netif_adapter *nifna,
2971 struct if_netif_stats *if_ns)
2972{
2973 struct nx_netif_mit *mit;
2974 struct mit_cfg_tbl *mit_cfg;
2975
2976 if ((mit = nifna->nifna_rx_mit) == NULL) {
2977 return;
2978 }
2979
2980 if ((mit->mit_flags & NETIF_MITF_INITIALIZED) == 0) {
2981 return;
2982 }
2983
2984 if_ns->ifn_rx_mit_interval = mit->mit_interval;
2985 if_ns->ifn_rx_mit_mode = mit->mit_mode;
2986 if_ns->ifn_rx_mit_packets_avg = mit->mit_packets_avg;
2987 if_ns->ifn_rx_mit_packets_min = mit->mit_packets_min;
2988 if_ns->ifn_rx_mit_packets_max = mit->mit_packets_max;
2989 if_ns->ifn_rx_mit_bytes_avg = mit->mit_bytes_avg;
2990 if_ns->ifn_rx_mit_bytes_min = mit->mit_bytes_min;
2991 if_ns->ifn_rx_mit_bytes_max = mit->mit_bytes_max;
2992 if_ns->ifn_rx_mit_cfg_idx = mit->mit_cfg_idx;
2993
2994 VERIFY(if_ns->ifn_rx_mit_cfg_idx < mit->mit_cfg_idx_max);
2995 mit_cfg = &mit->mit_tbl[if_ns->ifn_rx_mit_cfg_idx];
2996 if_ns->ifn_rx_mit_cfg_packets_lowat = mit_cfg->cfg_plowat;
2997 if_ns->ifn_rx_mit_cfg_packets_hiwat = mit_cfg->cfg_phiwat;
2998 if_ns->ifn_rx_mit_cfg_bytes_lowat = mit_cfg->cfg_blowat;
2999 if_ns->ifn_rx_mit_cfg_bytes_hiwat = mit_cfg->cfg_bhiwat;
3000 if_ns->ifn_rx_mit_cfg_interval = mit_cfg->cfg_ival;
3001}
3002
3003int
3004nx_netif_na_special(struct nexus_adapter *na, struct kern_channel *ch,
3005 struct chreq *chr, nxspec_cmd_t spec_cmd)
3006{
3007 ASSERT(na->na_type == NA_NETIF_DEV ||
3008 na->na_type == NA_NETIF_COMPAT_DEV);
3009 return nx_netif_na_special_common(na, ch, chr, spec_cmd);
3010}
3011
3012int
3013nx_netif_na_special_common(struct nexus_adapter *na, struct kern_channel *ch,
3014 struct chreq *chr, nxspec_cmd_t spec_cmd)
3015{
3016 int error = 0;
3017
3018 ASSERT(na->na_type == NA_NETIF_DEV || na->na_type == NA_NETIF_HOST ||
3019 na->na_type == NA_NETIF_COMPAT_DEV ||
3020 na->na_type == NA_NETIF_COMPAT_HOST);
3021 SK_LOCK_ASSERT_HELD();
3022
3023 switch (spec_cmd) {
3024 case NXSPEC_CMD_CONNECT:
3025 /*
3026 * A netif adapter isn't created exclusively for the kernel.
3027 * We mark (and clear) the NAF_KERNEL_ONLY flag upon a successful
3028 * na_special() connect and disconnect.
3029 */
3030 if (NA_KERNEL_ONLY(na)) {
3031 error = EBUSY;
3032 goto done;
3033 }
3034 ASSERT(!(na->na_flags & NAF_SPEC_INIT));
3035
3036 os_atomic_or(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
3037 error = na_bind_channel(na, ch, chr);
3038 if (error != 0) {
3039 os_atomic_andnot(&na->na_flags, NAF_KERNEL_ONLY, relaxed);
3040 goto done;
3041 }
3042 os_atomic_or(&na->na_flags, NAF_SPEC_INIT, relaxed);
3043 break;
3044
3045 case NXSPEC_CMD_DISCONNECT:
3046 ASSERT(NA_KERNEL_ONLY(na));
3047 ASSERT(na->na_channels > 0);
3048 ASSERT(na->na_flags & NAF_SPEC_INIT);
3049 na_unbind_channel(ch);
3050 os_atomic_andnot(&na->na_flags, (NAF_SPEC_INIT | NAF_KERNEL_ONLY), relaxed);
3051 break;
3052
3053 case NXSPEC_CMD_START:
3054 na_kr_drop(na, FALSE);
3055 break;
3056
3057 case NXSPEC_CMD_STOP:
3058 na_kr_drop(na, TRUE);
3059 LCK_MTX_ASSERT(&ch->ch_lock, LCK_MTX_ASSERT_NOTOWNED);
3060 lck_mtx_lock(&ch->ch_lock);
3061 nxprov_advise_disconnect(na->na_nx, ch);
3062 lck_mtx_unlock(&ch->ch_lock);
3063 break;
3064
3065 default:
3066 error = EINVAL;
3067 break;
3068 }
3069
3070done:
3071 SK_DF(error ? SK_VERB_ERROR : SK_VERB_NETIF,
3072 "ch 0x%llx from na \"%s\" (0x%llx) naflags %b nx 0x%llx "
3073 "spec_cmd %u (err %d)", SK_KVA(ch), na->na_name, SK_KVA(na),
3074 na->na_flags, NAF_BITS, SK_KVA(ch->ch_nexus), spec_cmd, error);
3075
3076 return error;
3077}
3078
3079/*
3080 * Get a skywalk netif adapter for the port.
3081 */
3082int
3083nx_netif_na_find(struct kern_nexus *nx, struct kern_channel *ch,
3084 struct chreq *chr, struct nxbind *nxb, struct proc *p,
3085 struct nexus_adapter **nap, boolean_t create)
3086{
3087#pragma unused(ch)
3088 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
3089 boolean_t anon = NX_ANONYMOUS_PROV(nx);
3090 ch_endpoint_t ep = chr->cr_endpoint;
3091 nexus_port_t nx_port = chr->cr_port;
3092 struct nexus_adapter *na = NULL;
3093 struct ifnet *ifp;
3094 int err = 0;
3095
3096 SK_LOCK_ASSERT_HELD();
3097 *nap = NULL; /* default */
3098
3099#if SK_LOG
3100 uuid_string_t uuidstr;
3101 SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u "
3102 "ring_id %d ring_set %u ep_type %u:%u create %u%s",
3103 chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
3104 (int)chr->cr_port, chr->cr_mode, CHMODE_BITS,
3105 chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set,
3106 chr->cr_real_endpoint, chr->cr_endpoint, create,
3107 (ep != CH_ENDPOINT_NET_IF) ? " (skipped)" : "");
3108#endif /* SK_LOG */
3109
3110 if (!create || ep != CH_ENDPOINT_NET_IF) {
3111 err = ENODEV;
3112 goto done;
3113 }
3114
3115 ASSERT(NX_DOM(nx)->nxdom_type == NEXUS_TYPE_NET_IF);
3116 if (nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV) == NULL) {
3117 err = ENXIO;
3118 goto done;
3119 }
3120 ifp = nif->nif_ifp;
3121 if (!(SKYWALK_CAPABLE(ifp))) {
3122 SK_ERR("interface %s is no longer usable", if_name(ifp));
3123 err = ENOTSUP;
3124 goto done;
3125 }
3126
3127 if (chr->cr_mode & CHMODE_LOW_LATENCY) {
3128 SK_ERR("low latency is not supported for netif channel");
3129 err = ENOTSUP;
3130 goto done;
3131 }
3132
3133 switch (nx_port) {
3134 case NEXUS_PORT_NET_IF_DEV:
3135 /*
3136 * We have to reject a direct user open that's not explicitly
3137 * allowed, because netif nexuses do not have user memory
3138 * regions by default.
3139 */
3140 if (p != kernproc &&
3141 (!skywalk_netif_direct_allowed(ifp->if_xname) ||
3142 (kauth_cred_issuser(kauth_cred_get()) == 0 &&
3143 (anon || nif->nif_dev_nxb == NULL || nxb == NULL ||
3144 !nxb_is_equal(nif->nif_dev_nxb, nxb))))) {
3145 DTRACE_SKYWALK2(direct__not__allowed, struct ifnet *,
3146 ifp, struct chreq *, chr);
3147 err = ENOTSUP;
3148 goto done;
3149 }
3150 if (chr->cr_mode & CHMODE_EVENT_RING) {
3151 SK_ERR("event ring is not supported for netif dev port channel");
3152 err = ENOTSUP;
3153 goto done;
3154 }
3155 na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_DEV);
3156 break;
3157
3158 case NEXUS_PORT_NET_IF_HOST:
3159 if (p != kernproc) {
3160 err = ENOTSUP;
3161 goto done;
3162 }
3163 if (chr->cr_mode & CHMODE_EVENT_RING) {
3164 SK_ERR("event ring is not supported for netif host port channel");
3165 err = ENOTSUP;
3166 goto done;
3167 }
3168 na = nx_port_get_na(nx, NEXUS_PORT_NET_IF_HOST);
3169 break;
3170
3171 default:
3172 ASSERT(!(chr->cr_mode & CHMODE_CONFIG));
3173
3174 NETIF_WLOCK(nif);
3175 err = nx_port_alloc(nx, nx_port, nxb, &na, p);
3176 if (err != 0) {
3177 NETIF_WUNLOCK(nif);
3178 goto done;
3179 }
3180
3181 if (na == NULL) {
3182 if (chr->cr_mode & CHMODE_FILTER) {
3183 err = netif_filter_na_create(nx, chr, &na);
3184 } else {
3185 err = netif_vp_na_create(nx, chr, &na);
3186 }
3187 if (err != 0) {
3188 NETIF_WUNLOCK(nif);
3189 goto done;
3190 }
3191 err = nx_port_alloc(nx, nx_port, nxb, &na, p);
3192 if (err != 0) {
3193 NETIF_WUNLOCK(nif);
3194 goto done;
3195 }
3196 }
3197 NETIF_WUNLOCK(nif);
3198
3199 break;
3200 }
3201
3202 ASSERT(err == 0);
3203 ASSERT(na != NULL);
3204
3205#if CONFIG_NEXUS_USER_PIPE
3206 if (NA_OWNED_BY_ANY(na) || na->na_next_pipe > 0) {
3207#else /* !CONFIG_NEXUS_USER_PIPE */
3208 if (NA_OWNED_BY_ANY(na)) {
3209#endif /* !CONFIG_NEXUS_USER_PIPE */
3210 err = EBUSY;
3211 na = NULL;
3212 goto done;
3213 }
3214
3215 *nap = na;
3216 na_retain_locked(na);
3217
3218done:
3219 ASSERT(err != 0 || na != NULL);
3220 if (err) {
3221 SK_ERR("na not found, err(%d)", err);
3222 } else {
3223 SK_DF(SK_VERB_NETIF, "found na 0x%llx", SK_KVA(na));
3224 }
3225 return err;
3226}
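/*
 * For reference, a hedged summary of the port-to-adapter mapping handled
 * above (the switch statement remains authoritative):
 *
 *   NEXUS_PORT_NET_IF_DEV    dev adapter; kernel callers, or a direct user
 *                            open only when explicitly allowed and bound
 *   NEXUS_PORT_NET_IF_HOST   host adapter; kernel callers only
 *   any other port           filter NA (CHMODE_FILTER) or VP NA, created on
 *                            demand and bound via nx_port_alloc()
 */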
3227
3228/* na_krings_create callback for all netif device adapters */
3229int
3230nx_netif_dev_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
3231{
3232 int ret;
3233
3234 ASSERT(na->na_type == NA_NETIF_DEV ||
3235 na->na_type == NA_NETIF_COMPAT_DEV);
3236 /*
3237 * Allocate context structures for native netif only, for
3238 * IOSkywalkFamily to store its object references.
3239 */
3240 ret = na_rings_mem_setup(na, (na->na_flags & NAF_NATIVE), ch);
3241
3242 /*
3243 * We mark CKRF_DROP for kernel-only rings (kernel channel
3244 * opened by the flowswitch, etc.) to prevent packets from
3245 * going through until after the client of the kernel channel
3246 * has fully plumbed things on its side. For userland-facing
3247 * rings (regular channel opened to netif), this is not
3248 * required, so we don't mark CKRF_DROP there.
3249 */
3250 if (ret == 0 && NA_KERNEL_ONLY(na)) {
3251 na_kr_drop(na, TRUE);
3252 }
3253
3254 return ret;
3255}
3256
3257/* call with SK_LOCK held */
3258void
3259nx_netif_dev_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
3260 boolean_t defunct)
3261{
3262 ASSERT(na->na_type == NA_NETIF_DEV ||
3263 na->na_type == NA_NETIF_COMPAT_DEV);
3264
3265 /* see comments in nx_netif_dev_krings_create() */
3266 if (NA_KERNEL_ONLY(na)) {
3267 na_kr_drop(na, TRUE);
3268 }
3269
3270 na_rings_mem_teardown(na, ch, defunct);
3271}
3272
3273struct nx_netif *
3274nx_netif_alloc(zalloc_flags_t how)
3275{
3276 struct nx_netif *n;
3277
3278 SK_LOCK_ASSERT_HELD();
3279
3280 n = zalloc_flags(nx_netif_zone, how | Z_ZERO);
3281 if (n == NULL) {
3282 return NULL;
3283 }
3284
3285 NETIF_RWINIT(n);
3286 os_ref_init(&n->nif_refcnt, NULL);
3287 SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
3288
3289 return n;
3290}
3291
3292static void
3293nx_netif_destroy(struct nx_netif *n)
3294{
3295 ASSERT(n->nif_dev_nxb == NULL);
3296 ASSERT(n->nif_host_nxb == NULL);
3297 ASSERT(os_ref_get_count(&n->nif_refcnt) == 0);
3298 nx_netif_llink_config_free(n);
3299 SK_DF(SK_VERB_MEM, "netif 0x%llx", SK_KVA(n));
3300 NETIF_RWDESTROY(n);
3301 zfree(nx_netif_zone, n);
3302}
3303
3304void
3305nx_netif_release(struct nx_netif *n)
3306{
3307 SK_LOCK_ASSERT_HELD();
3308
3309 SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3310 os_ref_get_count(&n->nif_refcnt));
3311 if (os_ref_release(&n->nif_refcnt) == 0) {
3312 nx_netif_destroy(n);
3313 }
3314}
3315
3316void
3317nx_netif_retain(struct nx_netif *n)
3318{
3319 SK_LOCK_ASSERT_HELD();
3320
3321 /* retaining an object with a zero refcount is not allowed */
3322 ASSERT(os_ref_get_count(&n->nif_refcnt) >= 1);
3323 os_ref_retain(&n->nif_refcnt);
3324 SK_DF(SK_VERB_MEM, "netif 0x%llx, refcnt %d", SK_KVA(n),
3325 os_ref_get_count(&n->nif_refcnt));
3326}
3327
3328void
3329nx_netif_free(struct nx_netif *n)
3330{
3331 nx_netif_release(n);
3332}
3333
3334static int
3335nx_netif_interface_advisory_report(struct kern_nexus *nx,
3336 const struct ifnet_interface_advisory *advisory)
3337{
3338 struct kern_nexus *notify_nx;
3339 struct __kern_netif_intf_advisory *intf_adv;
3340 struct nx_netif *nif = NX_NETIF_PRIVATE(nx);
3341 ifnet_t difp = nif->nif_ifp, parent = NULL;
3342
3343 /* If we are a delegate, notify the parent instead */
3344 if (ifnet_get_delegate_parent(difp, &parent) == 0) {
3345 nif = parent->if_na->nifna_netif;
3346 }
3347 if (nif->nif_fsw_nxadv != NULL) {
3348 ASSERT(nif->nif_fsw != NULL);
3349 intf_adv = &nif->nif_fsw_nxadv->_nxadv_intf_adv;
3350 notify_nx = nif->nif_fsw->fsw_nx;
3351 } else {
3352 intf_adv = &nif->nif_netif_nxadv->__kern_intf_adv;
3353 notify_nx = nif->nif_nx;
3354 }
3355 /*
3356 * copy the advisory report in shared memory
3357 */
3358 intf_adv->cksum = os_cpu_copy_in_cksum(advisory, &intf_adv->adv,
3359 sizeof(*advisory), 0);
3360 STATS_INC(&nif->nif_stats, NETIF_STATS_IF_ADV_UPD_RECV);
3361 /*
3362 * notify user channels on advisory report availability
3363 */
3364 nx_interface_advisory_notify(notify_nx);
3365 if (parent != NULL) {
3366 ifnet_release_delegate_parent(difp);
3367 }
3368 return 0;
3369}
3370
3371static errno_t
3372nx_netif_interface_advisory_notify(void *kern_ctx,
3373 const struct ifnet_interface_advisory *advisory)
3374{
3375 _CASSERT(offsetof(struct ifnet_interface_advisory, version) ==
3376 offsetof(struct ifnet_interface_advisory, header.version));
3377 _CASSERT(offsetof(struct ifnet_interface_advisory, direction) ==
3378 offsetof(struct ifnet_interface_advisory, header.direction));
3379 _CASSERT(offsetof(struct ifnet_interface_advisory, _reserved) ==
3380 offsetof(struct ifnet_interface_advisory, header.interface_type));
3381
3382 if (__improbable(kern_ctx == NULL || advisory == NULL)) {
3383 return EINVAL;
3384 }
3385 if (__improbable((advisory->header.version <
3386 IF_INTERFACE_ADVISORY_VERSION_MIN) ||
3387 (advisory->header.version > IF_INTERFACE_ADVISORY_VERSION_MAX))) {
3388 SK_ERR("Invalid advisory version %d", advisory->header.version);
3389 return EINVAL;
3390 }
3391 if (__improbable((advisory->header.direction !=
3392 IF_INTERFACE_ADVISORY_DIRECTION_TX) &&
3393 (advisory->header.direction !=
3394 IF_INTERFACE_ADVISORY_DIRECTION_RX))) {
3395 SK_ERR("Invalid advisory direction %d",
3396 advisory->header.direction);
3397 return EINVAL;
3398 }
3399 if (__improbable(((advisory->header.interface_type <
3400 IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MIN) ||
3401 (advisory->header.interface_type >
3402 IF_INTERFACE_ADVISORY_INTERFACE_TYPE_MAX)) &&
3403 (advisory->header.version >= IF_INTERFACE_ADVISORY_VERSION_2))) {
3404 SK_ERR("Invalid advisory interface type %d",
3405 advisory->header.interface_type);
3406 return EINVAL;
3407 }
3408 return nx_netif_interface_advisory_report(kern_ctx, advisory);
3409}
3410
3411void
3412nx_netif_config_interface_advisory(struct kern_nexus *nx, bool enable)
3413{
3414 struct kern_nexus *nx_netif;
3415 struct nx_netif *nif;
3416
3417 if (NX_REJECT_ACT(nx) || (nx->nx_flags & NXF_CLOSED) != 0) {
3418 return;
3419 }
3420 if (NX_PROV(nx)->nxprov_params->nxp_type == NEXUS_TYPE_FLOW_SWITCH) {
3421 struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
3422 nx_netif = fsw->fsw_nifna->na_nx;
3423 } else {
3424 nx_netif = nx;
3425 }
3426 ASSERT(NX_PROV(nx_netif)->nxprov_params->nxp_type == NEXUS_TYPE_NET_IF);
3427 nif = NX_NETIF_PRIVATE(nx_netif);
3428 if (nif->nif_intf_adv_config != NULL) {
3429 nif->nif_intf_adv_config(nif->nif_intf_adv_prov_ctx, enable);
3430 }
3431}
3432
3433/*
3434 * This function has no use anymore since we are now passing truncated packets
3435 * to filters. We keep this logic just in case we need to prevent certain
3436 * packets from being passed to filters.
3437 */
3438static boolean_t
3439packet_is_filterable(struct nexus_netif_adapter *nifna,
3440 struct __kern_packet *pkt)
3441{
3442#pragma unused (nifna, pkt)
3443 return TRUE;
3444}
3445
3446/*
3447 * This function is only meant for supporting the RX path because the TX path
3448 * will not send packets > MTU size due to the disabling of TSO when filters
3449 * are enabled.
3450 */
3451static void
3452get_filterable_packets(struct nexus_netif_adapter *nifna,
3453 struct __kern_packet *pkt_chain, struct __kern_packet **fpkt_chain,
3454 struct __kern_packet **passthrough_chain)
3455{
3456 struct nx_netif *nif = nifna->nifna_netif;
3457 struct netif_stats *nifs = &nif->nif_stats;
3458 struct __kern_packet *pkt = pkt_chain, *next, *fpkt;
3459 struct __kern_packet *fpkt_head = NULL, *passthrough_head = NULL;
3460 struct __kern_packet **fpkt_tailp = &fpkt_head;
3461 struct __kern_packet **passthrough_tailp = &passthrough_head;
3462 int fcnt = 0, pcnt = 0, dcnt = 0;
3463
3464 while (pkt != NULL) {
3465 next = pkt->pkt_nextpkt;
3466 pkt->pkt_nextpkt = NULL;
3467
3468 if (!packet_is_filterable(nifna, pkt)) {
3469 pcnt++;
3470 *passthrough_tailp = pkt;
3471 passthrough_tailp = &pkt->pkt_nextpkt;
3472 pkt = next;
3473 continue;
3474 }
3475 fpkt = nx_netif_pkt_to_filter_pkt(nifna, pkt, NETIF_CONVERT_RX);
3476 if (fpkt != NULL) {
3477 fcnt++;
3478 *fpkt_tailp = fpkt;
3479 fpkt_tailp = &fpkt->pkt_nextpkt;
3480 } else {
3481 dcnt++;
3482 }
3483 pkt = next;
3484 }
3485 *fpkt_chain = fpkt_head;
3486 *passthrough_chain = passthrough_head;
3487
3488 /*
3489 * No need to increment drop stats because that's already
3490 * done in nx_netif_pkt_to_filter_pkt.
3491 */
3492 STATS_ADD(nifs, NETIF_STATS_FILTER_RX_NOT_FILTERABLE, pcnt);
3493 DTRACE_SKYWALK6(filterable, struct nexus_netif_adapter *, nifna,
3494 int, fcnt, int, pcnt, int, dcnt, struct __kern_packet *,
3495 fpkt_head, struct __kern_packet *, passthrough_head);
3496}
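/*
 * Illustrative sketch (not part of the build): the chain-splitting pattern
 * used by get_filterable_packets() above -- walk a singly linked chain once
 * and append each node to one of two output chains through tail pointers,
 * preserving the original order in both. The node type and predicate here
 * are hypothetical stand-ins for __kern_packet and packet_is_filterable().
 */
#if 0
struct node { struct node *next; int key; };

static void
split_chain(struct node *head, int (*pred)(const struct node *),
    struct node **match_head, struct node **rest_head)
{
	struct node **match_tailp = match_head;
	struct node **rest_tailp = rest_head;
	struct node *n, *next;

	*match_head = *rest_head = NULL;
	for (n = head; n != NULL; n = next) {
		next = n->next;
		n->next = NULL;
		if (pred(n)) {
			*match_tailp = n;       /* append to the match chain */
			match_tailp = &n->next;
		} else {
			*rest_tailp = n;        /* append to the passthrough chain */
			rest_tailp = &n->next;
		}
	}
}
#endif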
3497
3498/*
3499 * This is only used by ring-based notify functions for now.
3500 * When a qset-based notify becomes available, this function can be used
3501 * unmodified.
3502 */
3503void
3504netif_receive(struct nexus_netif_adapter *nifna,
3505 struct __kern_packet *pkt_chain, struct nexus_pkt_stats *stats)
3506{
3507 struct nx_netif *nif = nifna->nifna_netif;
3508 struct nexus_adapter *na = &nifna->nifna_up;
3509 struct netif_stats *nifs = &nif->nif_stats;
3510 int err, dropcnt, dropstat = -1;
3511
3512 /* update our work timestamp */
3513 na->na_work_ts = _net_uptime;
3514
3515 if (nif->nif_filter_cnt > 0) {
3516 struct __kern_packet *fpkt_chain = NULL;
3517 struct __kern_packet *passthrough_chain = NULL;
3518
3519 get_filterable_packets(nifna, pkt_chain, &fpkt_chain,
3520 &passthrough_chain);
3521 if (fpkt_chain != NULL) {
3522 (void) nx_netif_filter_inject(nifna, NULL, fpkt_chain,
3523 NETIF_FILTER_RX | NETIF_FILTER_SOURCE);
3524 }
3525 if (passthrough_chain != NULL) {
3526 pkt_chain = passthrough_chain;
3527 } else {
3528 return;
3529 }
3530 } else if (nx_netif_filter_default_drop != 0) {
3531 DTRACE_SKYWALK2(rx__default__drop, struct nx_netif *, nif,
3532 struct __kern_packet *, pkt_chain);
3533 dropstat = NETIF_STATS_FILTER_DROP_DEFAULT;
3534 goto drop;
3535 }
3536 if (nif->nif_flow_cnt > 0) {
3537 struct __kern_packet *remain = NULL;
3538
3539 err = nx_netif_demux(nifna, pkt_chain, &remain,
3540 NETIF_FLOW_SOURCE);
3541 if (remain == NULL) {
3542 return;
3543 }
3544 pkt_chain = remain;
3545 }
3546 if (na->na_rx != NULL) {
3547 na->na_rx(na, pkt_chain, stats);
3548 } else {
3549 DTRACE_SKYWALK2(no__rx__cb, struct nx_netif *, nif,
3550 struct __kern_packet *, pkt_chain);
3551 dropstat = NETIF_STATS_DROP_NO_RX_CB;
3552 goto drop;
3553 }
3554 return;
3555drop:
3556 dropcnt = 0;
3557 nx_netif_free_packet_chain(pkt_chain, &dropcnt);
3558 if (dropstat != -1) {
3559 STATS_ADD(nifs, dropstat, dropcnt);
3560 }
3561 STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
3562}
3563
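/*
 * Token-bucket rate limiter for the RX ring. A rate of 0 disables rate
 * limiting. The bucket is refilled with elapsed_usecs * rate / USEC_PER_SEC
 * tokens (capped at the bucket depth, which equals the configured rate),
 * and each packet consumes pkt_length * 8 tokens, i.e. the rate is
 * expressed in bits per second. Returns a possibly shortened "end" index
 * so that the caller only consumes slots for which tokens were available;
 * *rate_limited is set when the walk stopped early.
 */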
static slot_idx_t
netif_rate_limit(struct __kern_channel_ring *r, uint64_t rate,
    slot_idx_t begin, slot_idx_t end, boolean_t *rate_limited)
{
	uint64_t elapsed;
	uint64_t now;
	struct __kern_packet *pkt;
	clock_sec_t sec;
	clock_usec_t usec;
	slot_idx_t i;

	if (__probable(rate == 0)) {
		return end;
	}

	/* initialize the token bucket if we haven't done so yet */
	if (__improbable(r->ckr_tbr_token == CKR_TBR_TOKEN_INVALID)) {
		r->ckr_tbr_token = rate;
		r->ckr_tbr_depth = rate;
		r->ckr_tbr_last = mach_absolute_time();
	} else {
		now = mach_absolute_time();
		elapsed = now - r->ckr_tbr_last;
		absolutetime_to_microtime(elapsed, &sec, &usec);
		r->ckr_tbr_token +=
		    ((sec * USEC_PER_SEC + usec) * rate / USEC_PER_SEC);
		if (__improbable(r->ckr_tbr_token > r->ckr_tbr_depth)) {
			r->ckr_tbr_token = r->ckr_tbr_depth;
		}
		r->ckr_tbr_last = now;
	}

	*rate_limited = FALSE;
	for (i = begin; i != end; i = SLOT_NEXT(i, r->ckr_lim)) {
		pkt = KR_KSD(r, i)->sd_pkt;
		if (__improbable(pkt == NULL)) {
			continue;
		}
		if (__improbable(r->ckr_tbr_token <= 0)) {
			end = i;
			*rate_limited = TRUE;
			break;
		}
		r->ckr_tbr_token -= pkt->pkt_length * 8;
	}

	SK_DF(SK_VERB_FSW | SK_VERB_RX, "ckr %p %s rate limited at %d",
	    r, r->ckr_name, i);

	return end;
}

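/*
 * Detach the packets attached to the RX ring slots in [ckr_rhead, end),
 * link them into a single chain, then advance ckr_rhead to "end" and sync
 * ckr_rtail with ckr_ktail. The returned chain is handed to netif_receive()
 * by the notify functions below.
 */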
SK_NO_INLINE_ATTRIBUTE
static struct __kern_packet *
consume_pkts(struct __kern_channel_ring *ring, slot_idx_t end)
{
	struct __kern_packet *pkt_chain = NULL, **tailp = &pkt_chain;
	slot_idx_t idx = ring->ckr_rhead;

	while (idx != end) {
		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);
		struct __kern_packet *pkt = ksd->sd_pkt;

		ASSERT(pkt->pkt_nextpkt == NULL);
		KR_SLOT_DETACH_METADATA(ring, ksd);
		*tailp = pkt;
		tailp = &pkt->pkt_nextpkt;
		idx = SLOT_NEXT(idx, ring->ckr_lim);
	}
	ring->ckr_rhead = end;
	ring->ckr_rtail = ring->ckr_ktail;
	return pkt_chain;
}

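/*
 * Default RX notify path: sync the device ring, apply the optional input
 * rate limit, detach the newly arrived packets from the ring slots and
 * feed the resulting chain to netif_receive(), reporting to the
 * mitigation statistics callback if one is installed.
 */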
int
netif_rx_notify_default(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
	struct nexus_adapter *hwna;
	struct nexus_netif_adapter *nifna;
	struct nx_netif *nif;
	struct __kern_packet *pkt_chain;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;
	slot_idx_t ktail;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_START),
	    SK_KVA(ring));

	ASSERT(ring->ckr_tx == NR_RX);
	ASSERT(!NA_KERNEL_ONLY(KRNA(ring)) || KR_KERNEL_ONLY(ring));

	err = kr_enter(ring, ((flags & NA_NOTEF_CAN_SLEEP) != 0));
	if (err != 0) {
		/* not a serious error, so no need to be chatty here */
		SK_DF(SK_VERB_FSW,
		    "hwna \"%s\" (0x%llx) kr \"%s\" (0x%llx) krflags 0x%b "
		    "(%d)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)),
		    ring->ckr_name, SK_KVA(ring), ring->ckr_flags,
		    CKRF_BITS, err);
		goto out;
	}
	if (__improbable(KR_DROP(ring))) {
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	hwna = KRNA(ring);
	nifna = NIFNA(hwna);
	nif = nifna->nifna_netif;
	if (__improbable(hwna->na_ifp == NULL)) {
		kr_exit(ring);
		err = ENODEV;
		goto out;
	}
	protect = sk_sync_protect();
	err = ring->ckr_na_sync(ring, p, 0);
	if (err != 0 && err != EAGAIN) {
		goto put_out;
	}

	/* read the tail pointer once */
	ktail = ring->ckr_ktail;
	if (__improbable(ring->ckr_khead == ktail)) {
		SK_DF(SK_VERB_FSW | SK_VERB_NOTIFY | SK_VERB_RX,
		    "how strange, interrupt with no packets on hwna "
		    "\"%s\" (0x%llx)", KRNA(ring)->na_name, SK_KVA(KRNA(ring)));
		goto put_out;
	}
	ktail = netif_rate_limit(ring, nif->nif_input_rate, ring->ckr_rhead,
	    ktail, &ring->ckr_rate_limited);

	pkt_chain = consume_pkts(ring, ktail);
	if (pkt_chain != NULL) {
		netif_receive(nifna, pkt_chain, &stats);

		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}

put_out:
	sk_sync_unprotect(protect);
	kr_exit(ring);

out:
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_DEFAULT | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}

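/*
 * Fast RX notify path: instead of walking the device ring slots, pull
 * already-formed packet chains from the driver via nx_rx_sync_packets()
 * into ckr_scratch and pass each chain to netif_receive() directly.
 */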
int
netif_rx_notify_fast(struct __kern_channel_ring *ring, struct proc *p,
    uint32_t flags)
{
#pragma unused(p, flags)
	sk_protect_t protect;
	struct nexus_adapter *hwna;
	struct nexus_pkt_stats stats = {};
	uint32_t i, count;
	int err = 0;

	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_START),
	    SK_KVA(ring));

	/*
	 * XXX
	 * sk_sync_protect() is not needed here because we are not using
	 * the dev ring. Unfortunately, many of the macros used by the fsw
	 * still require it.
	 */
	protect = sk_sync_protect();
	hwna = KRNA(ring);
	count = na_get_nslots(hwna, NR_RX);
	err = nx_rx_sync_packets(ring, ring->ckr_scratch, &count);
	if (__improbable(err != 0)) {
		SK_ERR("nx_rx_sync_packets failed: %d", err);
		DTRACE_SKYWALK2(rx__sync__packets__failed,
		    struct __kern_channel_ring *, ring, int, err);
		goto out;
	}
	DTRACE_SKYWALK1(chain__count, uint32_t, count);
	for (i = 0; i < count; i++) {
		struct __kern_packet *pkt_chain;

		pkt_chain = SK_PTR_ADDR_KPKT(ring->ckr_scratch[i]);
		ASSERT(pkt_chain != NULL);
		netif_receive(NIFNA(KRNA(ring)), pkt_chain, &stats);

		if (ring->ckr_netif_mit_stats != NULL &&
		    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
			ring->ckr_netif_mit_stats(ring, stats.nps_pkts,
			    stats.nps_bytes);
		}
	}
out:
	sk_sync_unprotect(protect);
	KDBG((SK_KTRACE_NETIF_RX_NOTIFY_FAST | DBG_FUNC_END),
	    SK_KVA(ring), err);
	return err;
}


/*
 * Return the RX notify callback to use for the given netif mode.
 */
static channel_ring_notify_t
netif_hwna_get_notify(struct __kern_channel_ring *ring, netif_mode_t mode)
{
	channel_ring_notify_t notify = NULL;
	boolean_t has_sync_pkts = (sk_rx_sync_packets != 0 &&
	    nx_has_rx_sync_packets(ring));

	if (mode == NETIF_MODE_FSW) {
		notify = (has_sync_pkts ? netif_rx_notify_fast :
		    netif_rx_notify_default);
	} else if (mode == NETIF_MODE_LLW) {
		notify = (has_sync_pkts ? netif_llw_rx_notify_fast :
		    netif_llw_rx_notify_default);
	}
	return notify;
}


static uint32_t
netif_mode_to_flag(netif_mode_t mode)
{
	uint32_t flag = 0;

	if (mode == NETIF_MODE_FSW) {
		flag = NAF_MODE_FSW;
	} else if (mode == NETIF_MODE_LLW) {
		flag = NAF_MODE_LLW;
	}
	return flag;
}

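/*
 * Configure the hardware NA to operate in the given mode: install
 * (set == TRUE) or restore (set == FALSE) the per-ring RX notify
 * callbacks, and set or clear the corresponding NAF_MODE_* flag along
 * with the adapter's na_rx callback.
 */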
static void
netif_hwna_config_mode(struct nexus_adapter *hwna, netif_mode_t mode,
    void (*rx)(struct nexus_adapter *, struct __kern_packet *,
    struct nexus_pkt_stats *), boolean_t set)
{
	uint32_t i;
	uint32_t flag;

	ASSERT(hwna->na_type == NA_NETIF_DEV ||
	    hwna->na_type == NA_NETIF_COMPAT_DEV);

	for (i = 0; i < na_get_nrings(hwna, NR_RX); i++) {
		struct __kern_channel_ring *kr = &NAKR(hwna, NR_RX)[i];
		channel_ring_notify_t notify = netif_hwna_get_notify(kr, mode);

		if (set) {
			kr->ckr_save_notify = kr->ckr_netif_notify;
			kr->ckr_netif_notify = notify;
		} else {
			kr->ckr_netif_notify = kr->ckr_save_notify;
			kr->ckr_save_notify = NULL;
		}
	}
	if (set) {
		hwna->na_rx = rx;
		flag = netif_mode_to_flag(mode);
		os_atomic_or(&hwna->na_flags, flag, relaxed);
	} else {
		hwna->na_rx = NULL;
		os_atomic_andnot(&hwna->na_flags, (NAF_MODE_FSW | NAF_MODE_LLW), relaxed);
	}
}

void
netif_hwna_set_mode(struct nexus_adapter *hwna, netif_mode_t mode,
    void (*rx)(struct nexus_adapter *, struct __kern_packet *,
    struct nexus_pkt_stats *))
{
	return netif_hwna_config_mode(hwna, mode, rx, TRUE);
}

void
netif_hwna_clear_mode(struct nexus_adapter *hwna)
{
	return netif_hwna_config_mode(hwna, NETIF_MODE_NONE, NULL, FALSE);
}

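/*
 * Deliver a packet chain that a filter has re-injected back into the
 * regular RX path via the adapter's na_rx callback; the chain is dropped
 * if the adapter is no longer owned by the flowswitch or the ring is in
 * drop mode.
 */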
static void
netif_inject_rx(struct nexus_adapter *na, struct __kern_packet *pkt_chain)
{
	struct nexus_netif_adapter *nifna = NIFNA(na);
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct __kern_channel_ring *r;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;
	boolean_t ring_drop = FALSE;
	int err, dropcnt;

	if (!NA_OWNED_BY_FSW(na)) {
		DTRACE_SKYWALK1(fsw__disabled, struct nexus_adapter *, na);
		goto fail;
	}
	ASSERT(na->na_rx != NULL);

	/*
	 * XXX
	 * This function is called when a filter injects a packet back to the
	 * regular RX path. We can assume the ring is 0 for now because RSS
	 * is not supported. This needs to be revisited when we add support for
	 * RSS.
	 */
	r = &na->na_rx_rings[0];
	ASSERT(r->ckr_tx == NR_RX);
	err = kr_enter(r, TRUE);
	VERIFY(err == 0);

	if (__improbable(KR_DROP(r))) {
		kr_exit(r);
		DTRACE_SKYWALK2(ring__drop, struct nexus_adapter *, na,
		    struct __kern_channel_ring *, r);
		ring_drop = TRUE;
		goto fail;
	}
	protect = sk_sync_protect();
	na->na_rx(na, pkt_chain, &stats);

	if (r->ckr_netif_mit_stats != NULL &&
	    stats.nps_pkts != 0 && stats.nps_bytes != 0) {
		r->ckr_netif_mit_stats(r, stats.nps_pkts, stats.nps_bytes);
	}
	sk_sync_unprotect(protect);

	kr_exit(r);
	return;

fail:
	dropcnt = 0;
	nx_netif_free_packet_chain(pkt_chain, &dropcnt);
	if (ring_drop) {
		STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE, dropcnt);
	}
	STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
}

/*
 * This is called when an inbound packet has traversed all filters.
 */
errno_t
nx_netif_filter_rx_cb(struct nexus_netif_adapter *nifna,
    struct __kern_packet *fpkt_chain, uint32_t flags)
{
#pragma unused (flags)
	struct nx_netif *nif = nifna->nifna_netif;
	struct netif_stats *nifs = &nif->nif_stats;
	struct nexus_adapter *na = &nifna->nifna_up;
	struct __kern_packet *pkt_chain;
	int err;

	pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
	    fpkt_chain, NETIF_CONVERT_RX);
	if (pkt_chain == NULL) {
		return ENOMEM;
	}
	if (nif->nif_flow_cnt > 0) {
		struct __kern_packet *remain = NULL;

		err = nx_netif_demux(nifna, pkt_chain, &remain,
		    NETIF_FLOW_INJECT);
		if (remain == NULL) {
			return err;
		}
		pkt_chain = remain;
	}
	if (na->na_rx != NULL) {
		netif_inject_rx(na, pkt_chain);
	} else {
		int dropcnt = 0;
		nx_netif_free_packet_chain(pkt_chain, &dropcnt);
		STATS_ADD(nifs,
		    NETIF_STATS_FILTER_DROP_NO_RX_CB, dropcnt);
		STATS_ADD(nifs, NETIF_STATS_DROP, dropcnt);
	}
	return 0;
}

/*
 * This is called when an outbound packet has traversed all filters.
 */
errno_t
nx_netif_filter_tx_cb(struct nexus_netif_adapter *nifna,
    struct __kern_packet *fpkt_chain, uint32_t flags)
{
#pragma unused (flags)
	struct nx_netif *nif = nifna->nifna_netif;
	struct nexus_adapter *na = &nifna->nifna_up;
	int err;

	if (NETIF_IS_COMPAT(nif)) {
		struct mbuf *m_chain;
		mbuf_svc_class_t sc;

		m_chain = nx_netif_filter_pkt_to_mbuf_chain(nifna,
		    fpkt_chain, NETIF_CONVERT_TX);
		if (m_chain == NULL) {
			return ENOMEM;
		}
		/*
		 * All packets in the chain have the same service class.
		 * If the sc is missing or invalid, a valid value will be
		 * returned.
		 */
		sc = mbuf_get_service_class(m_chain);
		err = nx_netif_filter_tx_processed_mbuf_enqueue(nifna,
		    sc, m_chain);
	} else {
		struct __kern_packet *pkt_chain;
		kern_packet_svc_class_t sc;

		pkt_chain = nx_netif_filter_pkt_to_pkt_chain(nifna,
		    fpkt_chain, NETIF_CONVERT_TX);
		if (pkt_chain == NULL) {
			return ENOMEM;
		}
		/*
		 * All packets in the chain have the same service class.
		 * If the sc is missing or invalid, a valid value will be
		 * returned.
		 */
		sc = kern_packet_get_service_class(SK_PKT2PH(pkt_chain));
		err = nx_netif_filter_tx_processed_pkt_enqueue(nifna,
		    sc, pkt_chain);
	}
	/* Tell driver to resume dequeuing */
	ifnet_start(na->na_ifp);
	return err;
}

void
nx_netif_vp_region_params_adjust(struct nexus_adapter *na,
    struct skmem_region_params *srp)
{
#pragma unused(na, srp)
	return;
}

/* Returns true if the starter thread is utilized. */
static bool
netif_use_starter_thread(struct ifnet *ifp, uint32_t flags)
{
#if (DEVELOPMENT || DEBUG)
	if (__improbable(nx_netif_force_ifnet_start != 0)) {
		ifnet_start(ifp);
		return true;
	}
#endif /* DEVELOPMENT || DEBUG */
	/*
	 * Use the starter thread under the following conditions:
	 * - interface is not skywalk native
	 * - interface attached to virtual driver (ipsec, utun)
	 * - TBR is enabled
	 * - delayed start mechanism is in use
	 * - remaining stack space on the thread is not enough for driver
	 * - caller is in rx workloop context
	 * - caller is from the flowswitch path doing ARP resolving
	 * - caller requires the use of starter thread (stack usage)
	 * - caller requires starter thread for pacing
	 */
	if (!SKYWALK_NATIVE(ifp) || NA(ifp) == NULL ||
	    !NA_IS_ACTIVE(&NA(ifp)->nifna_up) ||
	    ((NA(ifp)->nifna_up.na_flags & NAF_VIRTUAL_DEVICE) != 0) ||
	    IFCQ_TBR_IS_ENABLED(ifp->if_snd) ||
	    (ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
	    (flags & NETIF_XMIT_FLAG_PACING) != 0 ||
	    sk_is_rx_notify_protected() ||
	    sk_is_async_transmit_protected() ||
	    (sk_is_sync_protected() && (flags & NETIF_XMIT_FLAG_HOST) != 0)) {
		DTRACE_SKYWALK2(use__starter__thread, struct ifnet *, ifp,
		    uint32_t, flags);
		ifnet_start(ifp);
		return true;
	}
	lck_mtx_lock_spin(&ifp->if_start_lock);
	/* interface is flow controlled */
	if (__improbable(ifp->if_start_flags & IFSF_FLOW_CONTROLLED)) {
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	/* if the starter thread is active, utilize it */
	if (ifp->if_start_active) {
		ifp->if_start_req++;
		lck_mtx_unlock(&ifp->if_start_lock);
		return true;
	}
	lck_mtx_unlock(&ifp->if_start_lock);
	/* check remaining stack space */
	if (OSKernelStackRemaining() < NX_NETIF_MIN_DRIVER_STACK_SIZE) {
		ifnet_start(ifp);
		return true;
	}
	return false;
}

void
netif_transmit(struct ifnet *ifp, uint32_t flags)
{
	if (netif_use_starter_thread(ifp, flags)) {
		return;
	}
	nx_netif_doorbell_internal(ifp, flags);
}

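/*
 * Return the ifclassq to dequeue from: the default qset's ifclassq when
 * logical links are enabled, otherwise the interface's if_snd.
 */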
static struct ifclassq *
netif_get_default_ifcq(struct nexus_adapter *hwna)
{
	struct nx_netif *nif;
	struct ifclassq *ifcq;

	nif = NX_NETIF_PRIVATE(hwna->na_nx);
	if (NETIF_LLINK_ENABLED(nif)) {
		struct netif_qset *qset;

		/*
		 * Use the default ifcq for now.
		 * In the future this could be chosen by the caller.
		 */
		qset = nx_netif_get_default_qset_noref(nif);
		ASSERT(qset != NULL);
		ifcq = qset->nqs_ifcq;
	} else {
		ifcq = nif->nif_ifp->if_snd;
	}
	return ifcq;
}

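/*
 * Dequeue up to pkt_limit packets / byte_limit bytes from AQM for the
 * given adapter. For driver-managed scheduling models the dequeue is
 * restricted to the requested service class. Also reports, via
 * *pkts_pending, whether packets remain queued after the dequeue.
 */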
static errno_t
netif_deq_packets(struct nexus_adapter *hwna, struct ifclassq *ifcq,
    uint32_t pkt_limit, uint32_t byte_limit, struct __kern_packet **head,
    boolean_t *pkts_pending, kern_packet_svc_class_t sc,
    uint32_t *pkt_cnt, uint32_t *bytes, uint8_t qset_idx)
{
	classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
	struct ifnet *ifp = hwna->na_ifp;
	uint32_t pkts_cnt;
	uint32_t bytes_cnt;
	errno_t rc;

	ASSERT(ifp != NULL);
	ASSERT(ifp->if_output_sched_model < IFNET_SCHED_MODEL_MAX);
	ASSERT((pkt_limit != 0) && (byte_limit != 0));

	if (ifcq == NULL) {
		ifcq = netif_get_default_ifcq(hwna);
	}
	if (ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED) {
		rc = ifclassq_dequeue_sc(ifcq, (mbuf_svc_class_t)sc,
		    pkt_limit, byte_limit, &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
	} else {
		rc = ifclassq_dequeue(ifcq, pkt_limit, byte_limit,
		    &pkt_head, NULL, pkt_cnt, bytes, qset_idx);
	}
	ASSERT((rc == 0) || (rc == EAGAIN));
	ASSERT((pkt_head.cp_ptype == QP_PACKET) || (pkt_head.cp_kpkt == NULL));

	ifclassq_get_len(ifcq, (mbuf_svc_class_t)sc, qset_idx,
	    &pkts_cnt, &bytes_cnt);
	*pkts_pending = pkts_cnt > 0;

	*head = pkt_head.cp_kpkt;
	return rc;
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
netif_no_ring_space_log(const struct nexus_adapter *na,
    const kern_channel_ring_t ring)
{
	SK_DF(SK_VERB_SYNC | SK_VERB_TX,
	    "no ring space: na \"%s\" [%u] "
	    "\"%s\"(kh %u kt %u kl %u | rh %u rt %u)",
	    na->na_name, ring->ckr_ring_id,
	    ring->ckr_name, ring->ckr_khead,
	    ring->ckr_ktail, ring->ckr_klease,
	    ring->ckr_rhead, ring->ckr_rtail);
}
#endif /* SK_LOG */

/*
 * netif refill function for rings
 */
errno_t
netif_ring_tx_refill(const kern_channel_ring_t ring, uint32_t pkt_limit,
    uint32_t byte_limit, boolean_t tx_doorbell_ctxt, boolean_t *pkts_pending,
    boolean_t canblock)
{
	struct nexus_adapter *hwna;
	struct ifnet *ifp;
	struct __kern_packet *head = NULL;
	sk_protect_t protect;
	errno_t rc = 0;
	errno_t sync_err = 0;
	uint32_t npkts = 0, consumed = 0;
	uint32_t flags;
	slot_idx_t idx, ktail;
	int ring_space = 0;

	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_START), SK_KVA(ring));

	VERIFY(ring != NULL);
	hwna = KRNA(ring);
	ifp = hwna->na_ifp;

	ASSERT(hwna->na_type == NA_NETIF_DEV);
	ASSERT(ring->ckr_tx == NR_TX);
	*pkts_pending = FALSE;

	if (__improbable(pkt_limit == 0 || byte_limit == 0)) {
		SK_ERR("invalid limits plim %d, blim %d",
		    pkt_limit, byte_limit);
		rc = EINVAL;
		goto out;
	}

	if (__improbable(!IF_FULLY_ATTACHED(ifp))) {
		SK_ERR("hwna 0x%llx ifp %s (0x%llx), interface not attached",
		    SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	if (__improbable((ifp->if_start_flags & IFSF_FLOW_CONTROLLED) != 0)) {
		SK_DF(SK_VERB_SYNC | SK_VERB_TX, "hwna 0x%llx ifp %s (0x%llx), "
		    "flow control ON", SK_KVA(hwna), if_name(ifp), SK_KVA(ifp));
		rc = ENXIO;
		goto out;
	}

	/*
	 * If the ring is busy, another dequeue is already in progress,
	 * so ignore this request and return success.
	 */
	if (kr_enter(ring, canblock) != 0) {
		rc = 0;
		goto out;
	}
	/* mark thread with sync-in-progress flag */
	protect = sk_sync_protect();

	if (__improbable(KR_DROP(ring) ||
	    !NA_IS_ACTIVE(ring->ckr_na))) {
		SK_ERR("hw-kr 0x%llx stopped", SK_KVA(ring));
		rc = ENXIO;
		goto done;
	}

	idx = ring->ckr_rhead;
	ktail = ring->ckr_ktail;
	/* calculate available space on the tx ring */
	ring_space = ktail - idx;
	if (ring_space < 0) {
		ring_space += ring->ckr_num_slots;
	}
	if (ring_space == 0) {
		struct ifclassq *ifcq;

		/* no space in ring, driver should retry */
#if SK_LOG
		if (__improbable((sk_verbose &
		    (SK_VERB_SYNC | SK_VERB_TX)) != 0)) {
			netif_no_ring_space_log(hwna, ring);
		}
#endif /* SK_LOG */
		ifcq = netif_get_default_ifcq(hwna);
		if (IFCQ_LEN(ifcq) != 0) {
			*pkts_pending = TRUE;
		}
		/*
		 * We ran out of space in the ring, most probably because
		 * the driver is slow to drain its TX queue. We want
		 * another doorbell to be generated as soon as the TX
		 * notify completion happens; mark this through the
		 * ckr_pending_doorbell counter. Do this regardless of
		 * whether there's any pending packet.
		 */
		ring->ckr_pending_doorbell++;
		rc = EAGAIN;
		goto sync_ring;
	}

	if ((uint32_t)ring_space < pkt_limit) {
		pkt_limit = ring_space;
	}

	if (tx_doorbell_ctxt &&
	    ((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0)) {
		pkt_limit = MIN(pkt_limit,
		    nx_netif_doorbell_max_dequeue);
	}

	rc = netif_deq_packets(hwna, NULL, pkt_limit, byte_limit,
	    &head, pkts_pending, ring->ckr_svc, NULL, NULL, 0);

	/*
	 * There's room in the ring; if we haven't dequeued everything,
	 * mark ckr_pending_doorbell so that the next TX notify issues
	 * a TX doorbell; otherwise, clear it. The next packet that
	 * gets enqueued will trigger a doorbell again.
	 */
	if (*pkts_pending) {
		ring->ckr_pending_doorbell++;
	} else if (ring->ckr_pending_doorbell != 0) {
		ring->ckr_pending_doorbell = 0;
	}

	if (rc != 0) {
		/*
		 * This is expected occasionally, as the IOSkywalkFamily
		 * errs on the side of caution and performs an extra
		 * dequeue when multiple doorbells are pending. There is
		 * nothing to dequeue; do a sync if there are slots to
		 * reclaim, otherwise just return.
		 */
		SK_DF(SK_VERB_SYNC | SK_VERB_TX,
		    "nothing to dequeue, err %d", rc);

		if ((uint32_t)ring_space == ring->ckr_lim) {
			goto done;
		} else {
			goto sync_ring;
		}
	}
	/* move the dequeued packets to the tx ring */
	while (head != NULL && idx != ktail) {
		ASSERT(npkts <= pkt_limit);
		struct __kern_packet *pkt = head;
		KR_SLOT_ATTACH_METADATA(ring, KR_KSD(ring, idx),
		    (struct __kern_quantum *)pkt);
		npkts++;
		if (__improbable(pkt->pkt_trace_id != 0)) {
			KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_END, pkt->pkt_trace_id);
			KDBG(SK_KTRACE_PKT_TX_DRV | DBG_FUNC_START, pkt->pkt_trace_id);
		}
		idx = SLOT_NEXT(idx, ring->ckr_lim);
		head = pkt->pkt_nextpkt;
		pkt->pkt_nextpkt = NULL;
	}

	/*
	 * We checked for ring space earlier, so the ring should have enough
	 * space for the entire chain.
	 */
	ASSERT(head == NULL);
	ring->ckr_rhead = idx;

sync_ring:
	flags = NA_SYNCF_NETIF;
	if (ring->ckr_pending_doorbell != 0) {
		flags |= (NA_SYNCF_NETIF_DOORBELL | NA_SYNCF_NETIF_ASYNC);
	}

	ring->ckr_khead_pre = ring->ckr_khead;
	sync_err = ring->ckr_na_sync(ring, kernproc, flags);
	if (sync_err != 0 && sync_err != EAGAIN) {
		SK_ERR("unexpected sync err %d", sync_err);
		if (rc == 0) {
			rc = sync_err;
		}
		goto done;
	}
	/*
	 * Verify that the driver has detached packets from the consumed slots.
	 */
	idx = ring->ckr_khead_pre;
	consumed = 0;
	while (idx != ring->ckr_khead) {
		struct __kern_slot_desc *ksd = KR_KSD(ring, idx);

		consumed++;
		VERIFY(!KSD_VALID_METADATA(ksd));
		idx = SLOT_NEXT(idx, ring->ckr_lim);
	}
	ring->ckr_khead_pre = ring->ckr_khead;

done:
	sk_sync_unprotect(protect);
	kr_exit(ring);
out:
	KDBG((SK_KTRACE_NETIF_RING_TX_REFILL | DBG_FUNC_END),
	    SK_KVA(ring), rc, 0, npkts);

	return rc;
}

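/*
 * Exponentially weighted moving average with a power-of-two decay:
 * avg = avg * (1 - 2^-decay) + new * 2^-decay, computed with shifts as
 * ((avg << decay) - avg + new) >> decay. A zero average is seeded
 * directly with the new sample.
 */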
#define NQ_EWMA(old, new, decay) do {                                   \
	u_int64_t _avg;                                                 \
	if (__probable((_avg = (old)) > 0))                             \
	        _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
	else                                                            \
	        _avg = (new);                                           \
	(old) = _avg;                                                   \
} while (0)

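/*
 * Update the interface packet/byte counters for the queue's direction
 * and, when queue stats collection is enabled (nq_stat_enable), fold the
 * transfer into the per-queue accumulators; once nq_accumulate_interval
 * seconds have elapsed, derive bytes/packets per second and their moving
 * averages, then restart the accumulation window.
 */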
static void
kern_netif_increment_queue_stats(kern_netif_queue_t queue,
    uint32_t pkt_count, uint32_t byte_count)
{
	struct netif_llink *llink = queue->nq_qset->nqs_llink;
	struct ifnet *ifp = llink->nll_nif->nif_ifp;
	if ((queue->nq_flags & NETIF_QUEUE_IS_RX) == 0) {
		os_atomic_add(&ifp->if_data.ifi_opackets, pkt_count, relaxed);
		os_atomic_add(&ifp->if_data.ifi_obytes, byte_count, relaxed);
	} else {
		os_atomic_add(&ifp->if_data.ifi_ipackets, pkt_count, relaxed);
		os_atomic_add(&ifp->if_data.ifi_ibytes, byte_count, relaxed);
	}

	if (ifp->if_data_threshold != 0) {
		ifnet_notify_data_threshold(ifp);
	}

	uint64_t now;
	uint64_t diff_secs;
	struct netif_qstats *stats = &queue->nq_stats;

	if (nq_stat_enable == 0) {
		return;
	}

	if (__improbable(pkt_count == 0)) {
		return;
	}

	stats->nq_num_xfers++;
	stats->nq_total_bytes += byte_count;
	stats->nq_total_pkts += pkt_count;
	if (pkt_count > stats->nq_max_pkts) {
		stats->nq_max_pkts = pkt_count;
	}
	if (stats->nq_min_pkts == 0 ||
	    pkt_count < stats->nq_min_pkts) {
		stats->nq_min_pkts = pkt_count;
	}

	now = net_uptime();
	if (__probable(queue->nq_accumulate_start != 0)) {
		diff_secs = now - queue->nq_accumulate_start;
		if (diff_secs >= nq_accumulate_interval) {
			uint64_t bps;
			uint64_t pps;
			uint64_t pps_ma;

			/* bytes per second */
			bps = queue->nq_accumulated_bytes / diff_secs;
			NQ_EWMA(stats->nq_bytes_ps_ma,
			    bps, nq_transfer_decay);
			stats->nq_bytes_ps = bps;

			/* pkts per second */
			pps = queue->nq_accumulated_pkts / diff_secs;
			pps_ma = stats->nq_pkts_ps_ma;
			NQ_EWMA(pps_ma, pps, nq_transfer_decay);
			stats->nq_pkts_ps_ma = (uint32_t)pps_ma;
			stats->nq_pkts_ps = (uint32_t)pps;

			/* start over */
			queue->nq_accumulate_start = now;
			queue->nq_accumulated_bytes = 0;
			queue->nq_accumulated_pkts = 0;

			stats->nq_min_pkts = 0;
			stats->nq_max_pkts = 0;
		}
	} else {
		queue->nq_accumulate_start = now;
	}
	queue->nq_accumulated_bytes += byte_count;
	queue->nq_accumulated_pkts += pkt_count;
}

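/*
 * Enqueue an RX packet chain on the netif queue; when the FLUSH flag is
 * set, drain the queue and push the accumulated chain up through
 * netif_receive() under sync protection, crediting the queue stats with
 * what was delivered. Chains arriving after the logical link has been
 * destroyed are dropped.
 */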
void
kern_netif_queue_rx_enqueue(kern_netif_queue_t queue, kern_packet_t ph_chain,
    uint32_t count, uint32_t flags)
{
#pragma unused (count)
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct __kern_packet *pkt_chain = SK_PTR_ADDR_KPKT(ph_chain);
	bool flush = ((flags & KERN_NETIF_QUEUE_RX_ENQUEUE_FLAG_FLUSH) != 0);
	struct pktq *pktq = &q->nq_pktq;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_pkt_stats stats;
	sk_protect_t protect;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) != 0);
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		int drop_cnt = 0;

		pp_free_packet_chain(pkt_chain, &drop_cnt);
		STATS_ADD(nifs, NETIF_STATS_LLINK_RX_DROP_BAD_STATE, drop_cnt);
		return;
	}
	KPKTQ_ENQUEUE_LIST(pktq, pkt_chain);
	if (flush) {
		pkt_chain = KPKTQ_FIRST(pktq);
		KPKTQ_INIT(pktq);

		protect = sk_sync_protect();
		netif_receive(NA(llink->nll_nif->nif_ifp), pkt_chain, &stats);
		sk_sync_unprotect(protect);
		kern_netif_increment_queue_stats(queue, (uint32_t)stats.nps_pkts,
		    (uint32_t)stats.nps_bytes);
	}
}

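/*
 * Dequeue a TX packet chain from AQM for this netif queue, honoring the
 * caller's packet and byte limits (further clamped in TX-notify context
 * for non-virtual devices), and return it to the driver along with an
 * indication of whether more packets remain queued.
 */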
errno_t
kern_netif_queue_tx_dequeue(kern_netif_queue_t queue, uint32_t pkt_limit,
    uint32_t byte_limit, boolean_t *pending, kern_packet_t *ph_chain)
{
	struct netif_queue *q = queue;
	struct netif_llink *llink = q->nq_qset->nqs_llink;
	struct netif_stats *nifs = &llink->nll_nif->nif_stats;
	struct nexus_adapter *hwna;
	struct __kern_packet *pkt_chain = NULL;
	uint32_t bytes = 0, pkt_cnt = 0;
	errno_t rc;

	ASSERT((q->nq_flags & NETIF_QUEUE_IS_RX) == 0);
	if (llink->nll_state == NETIF_LLINK_STATE_DESTROYED) {
		STATS_INC(nifs, NETIF_STATS_LLINK_AQM_DEQ_BAD_STATE);
		return ENXIO;
	}
	hwna = &NA(llink->nll_nif->nif_ifp)->nifna_up;

	if (((hwna->na_flags & NAF_VIRTUAL_DEVICE) == 0) &&
	    sk_is_tx_notify_protected()) {
		pkt_limit = MIN(pkt_limit, nx_netif_doorbell_max_dequeue);
	}
	rc = netif_deq_packets(hwna, q->nq_qset->nqs_ifcq, pkt_limit,
	    byte_limit, &pkt_chain, pending, q->nq_svc, &pkt_cnt, &bytes,
	    q->nq_qset->nqs_idx);

	if (pkt_cnt > 0) {
		kern_netif_increment_queue_stats(queue, pkt_cnt, bytes);
	}
	if (pkt_chain != NULL) {
		*ph_chain = SK_PKT2PH(pkt_chain);
	}
	return rc;
}

errno_t
kern_netif_qset_tx_queue_len(kern_netif_qset_t qset, uint32_t svc,
    uint32_t *pkts_cnt, uint32_t *bytes_cnt)
{
	VERIFY(qset != NULL);
	VERIFY(pkts_cnt != NULL);
	VERIFY(bytes_cnt != NULL);

	return ifclassq_get_len(qset->nqs_ifcq, svc, qset->nqs_idx, pkts_cnt,
	           bytes_cnt);
}

void
kern_netif_set_qset_combined(kern_netif_qset_t qset)
{
	VERIFY(qset != NULL);
	VERIFY(qset->nqs_ifcq != NULL);

	ifclassq_set_grp_combined(qset->nqs_ifcq, qset->nqs_idx);
}

void
kern_netif_set_qset_separate(kern_netif_qset_t qset)
{
	VERIFY(qset != NULL);
	VERIFY(qset->nqs_ifcq != NULL);

	ifclassq_set_grp_separated(qset->nqs_ifcq, qset->nqs_idx);
}

errno_t
kern_nexus_netif_llink_add(struct kern_nexus *nx,
    struct kern_nexus_netif_llink_init *llink_init)
{
	errno_t err;
	struct nx_netif *nif;
	struct netif_llink *llink;
	struct netif_stats *nifs;

	VERIFY(nx != NULL);
	VERIFY(llink_init != NULL);
	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);

	nif = NX_NETIF_PRIVATE(nx);
	nifs = &nif->nif_stats;

	err = nx_netif_validate_llink_config(llink_init, false);
	if (err != 0) {
		SK_ERR("Invalid llink init params");
		STATS_INC(nifs, NETIF_STATS_LLINK_ADD_BAD_PARAMS);
		return err;
	}

	err = nx_netif_llink_add(nif, llink_init, &llink);
	return err;
}

errno_t
kern_nexus_netif_llink_remove(struct kern_nexus *nx,
    kern_nexus_netif_llink_id_t llink_id)
{
	struct nx_netif *nif;

	VERIFY(nx != NULL);
	VERIFY((nx->nx_flags & NXF_ATTACHED) != 0);

	nif = NX_NETIF_PRIVATE(nx);
	return nx_netif_llink_remove(nif, llink_id);
}

errno_t
kern_netif_queue_get_service_class(kern_netif_queue_t queue,
    kern_packet_svc_class_t *svc)
{
	*svc = queue->nq_svc;
	return 0;
}