1 | /* |
2 | * Copyright (c) 2015-2022 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | /* |
30 | * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. |
31 | * |
32 | * Redistribution and use in source and binary forms, with or without |
33 | * modification, are permitted provided that the following conditions |
34 | * are met: |
35 | * 1. Redistributions of source code must retain the above copyright |
36 | * notice, this list of conditions and the following disclaimer. |
37 | * 2. Redistributions in binary form must reproduce the above copyright |
38 | * notice, this list of conditions and the following disclaimer in the |
39 | * documentation and/or other materials provided with the distribution. |
40 | * |
41 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
51 | * SUCH DAMAGE. |
52 | */ |
53 | |
54 | #include <skywalk/os_skywalk_private.h> |
55 | #include <skywalk/nexus/netif/nx_netif.h> |
56 | #include <skywalk/nexus/flowswitch/nx_flowswitch.h> |
57 | #include <mach/thread_act.h> |
58 | #include <kern/thread.h> |
59 | #include <kern/sched_prim.h> |
60 | |
61 | static void na_netif_compat_finalize(struct nexus_netif_adapter *, |
62 | struct ifnet *); |
63 | static errno_t nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head, |
64 | struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, |
65 | boolean_t poll, struct thread *tp); |
66 | static int nx_netif_compat_catch_rx(struct nexus_netif_compat_adapter *na, |
67 | boolean_t enable); |
68 | static int nx_netif_compat_xmit_frame(struct nexus_adapter *, struct mbuf *, |
69 | struct __kern_packet *); |
70 | |
71 | static int nx_netif_compat_na_notify_tx(struct __kern_channel_ring *, |
72 | struct proc *, uint32_t); |
73 | static int nx_netif_compat_na_notify_rx(struct __kern_channel_ring *, |
74 | struct proc *, uint32_t); |
75 | static int nx_netif_compat_na_activate(struct nexus_adapter *, |
76 | na_activate_mode_t); |
77 | static int nx_netif_compat_na_txsync(struct __kern_channel_ring *, |
78 | struct proc *, uint32_t); |
79 | static int nx_netif_compat_na_rxsync(struct __kern_channel_ring *, |
80 | struct proc *, uint32_t); |
81 | static void nx_netif_compat_na_dtor(struct nexus_adapter *na); |
82 | |
83 | static void nx_netif_compat_tx_intr(struct ifnet *, enum txrx, uint32_t, |
84 | uint32_t *); |
85 | static inline struct mbuf *nx_netif_compat_ring_alloc(int, int, uint16_t); |
86 | static inline void nx_netif_compat_ring_free(struct mbuf *m); |
87 | static void nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg); |
88 | |
89 | static uint32_t nx_netif_compat_tx_clean(struct netif_stats *nifs, |
90 | struct __kern_channel_ring *kring); |
91 | static void nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring, |
92 | slot_idx_t khead); |
93 | |
94 | static struct nexus_netif_compat_adapter *na_netif_compat_alloc(zalloc_flags_t); |
95 | static void na_netif_compat_free(struct nexus_adapter *); |
96 | #if DEBUG || DEVELOPMENT |
97 | static struct mbuf *nx_netif_rx_split(struct mbuf *, uint32_t); |
98 | #endif /* DEBUG || DEVELOPMENT */ |
99 | |
100 | #define MBUF_TXQ(m) ((m)->m_pkthdr.pkt_flowid) |
101 | #define MBUF_RXQ(m) ((m)->m_pkthdr.pkt_flowid) |
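/*
 * In the compat path the mbuf's flow ID field doubles as the
 * transmit/receive queue index, as captured by the two macros above.
 */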
102 | |
103 | #define NMB_PROPF_TX_NOTIFY 0x1 /* generate transmit event */ |
104 | #define NMB_FLAGS_MASK 0x0000ffff |
105 | #define NMB_INDEX_MASK 0xffff0000 |
106 | #define NMB_GET_FLAGS(p) (((uint32_t)(p) & NMB_FLAGS_MASK)) |
107 | #define NMB_SET_FLAGS(p, f) (((uint32_t)(p) & ~NMB_FLAGS_MASK) | (f)) |
108 | #define NMB_GET_INDEX(p) (((uint32_t)(p) & NMB_INDEX_MASK) >> 16) |
109 | #define NMB_SET_INDEX(p, i) (((uint32_t)(p) & ~NMB_INDEX_MASK) | (i << 16)) |
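/*
 * The cluster property word packs the NMB flags into its low 16 bits
 * and the tx pool slot index into its high 16 bits.  For example,
 * NMB_SET_INDEX(NMB_SET_FLAGS(0, NMB_PROPF_TX_NOTIFY), 5) yields
 * 0x00050001, i.e. slot index 5 with a transmit event requested.
 */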
110 | |
111 | static SKMEM_TYPE_DEFINE(na_netif_compat_zone, struct nexus_netif_compat_adapter); |
112 | |
113 | static int netif_tx_event_mode = 0; |
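/*
 * Mode 0 (the default) makes nx_netif_compat_set_tx_event() request the
 * TX_NOTIFY event on the first pending slot; any non-zero mode requests
 * it on a slot in the middle of the pending range instead.
 */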
114 | |
115 | #if (DEVELOPMENT || DEBUG) |
SYSCTL_EXTENSIBLE_NODE(_kern_skywalk_netif, OID_AUTO, compat,
CTLFLAG_RW | CTLFLAG_LOCKED,
0, "Skywalk netif Nexus legacy compatibility support");
SYSCTL_INT(_kern_skywalk_netif_compat, OID_AUTO, tx_event_mode,
CTLFLAG_RW | CTLFLAG_LOCKED, &netif_tx_event_mode, 0, "");
static uint32_t netif_rx_split = 0;
SYSCTL_UINT(_kern_skywalk_netif_compat, OID_AUTO, rx_split,
CTLFLAG_RW | CTLFLAG_LOCKED, &netif_rx_split, 0, "");
#endif /* DEVELOPMENT || DEBUG */
125 | |
126 | struct kern_nexus_domain_provider nx_netif_compat_prov_s = { |
127 | .nxdom_prov_name = NEXUS_PROVIDER_NET_IF_COMPAT, |
128 | .nxdom_prov_flags = NXDOMPROVF_DEFAULT, |
129 | .nxdom_prov_cb = { |
130 | .dp_cb_init = nx_netif_prov_init, |
131 | .dp_cb_fini = nx_netif_prov_fini, |
132 | .dp_cb_params = nx_netif_prov_params, |
133 | /* |
134 | * We must be using the native netif handlers below, |
135 | * since we act as the default domain provider; see |
136 | * kern_nexus_register_domain_provider(). |
137 | */ |
138 | .dp_cb_mem_new = nx_netif_prov_mem_new, |
139 | .dp_cb_config = nx_netif_prov_config, |
140 | .dp_cb_nx_ctor = nx_netif_prov_nx_ctor, |
141 | .dp_cb_nx_dtor = nx_netif_prov_nx_dtor, |
142 | .dp_cb_nx_mem_info = nx_netif_prov_nx_mem_info, |
143 | .dp_cb_nx_mib_get = nx_netif_prov_nx_mib_get, |
144 | .dp_cb_nx_stop = nx_netif_prov_nx_stop, |
145 | }, |
146 | }; |
147 | |
148 | struct nexus_ifnet_ops na_netif_compat_ops = { |
149 | .ni_finalize = na_netif_compat_finalize, |
150 | .ni_reap = nx_netif_reap, |
151 | .ni_dequeue = nx_netif_compat_tx_dequeue, |
152 | .ni_get_len = nx_netif_compat_tx_get_len, |
153 | }; |
154 | |
155 | #define SKMEM_TAG_NETIF_COMPAT_MIT "com.apple.skywalk.netif.compat.mit" |
156 | static SKMEM_TAG_DEFINE(skmem_tag_netif_compat_mit, SKMEM_TAG_NETIF_COMPAT_MIT); |
157 | |
158 | #define SKMEM_TAG_NETIF_COMPAT_POOL "com.apple.skywalk.netif.compat.pool" |
159 | static SKMEM_TAG_DEFINE(skmem_tag_netif_compat_pool, SKMEM_TAG_NETIF_COMPAT_POOL); |
160 | |
161 | void |
162 | nx_netif_compat_init(struct nxdom *nxdom) |
163 | { |
164 | _CASSERT(NETIF_COMPAT_MAX_MBUF_DATA_COPY <= NETIF_COMPAT_BUF_SIZE); |
165 | |
166 | /* |
167 | * We want nxprov_create() coming from userland to use the |
168 | * netif_compat domain provider, so install it as default. |
169 | * This is verified by the caller. |
170 | */ |
171 | (void) nxdom_prov_add(nxdom, &nx_netif_compat_prov_s); |
172 | } |
173 | |
174 | void |
175 | nx_netif_compat_fini(void) |
176 | { |
177 | (void) nxdom_prov_del(&nx_netif_compat_prov_s); |
178 | } |
179 | |
180 | static struct nexus_netif_compat_adapter * |
181 | na_netif_compat_alloc(zalloc_flags_t how) |
182 | { |
183 | struct nexus_netif_compat_adapter *nca; |
184 | |
185 | _CASSERT(offsetof(struct nexus_netif_compat_adapter, nca_up) == 0); |
186 | |
187 | nca = zalloc_flags(na_netif_compat_zone, how | Z_ZERO); |
188 | if (nca) { |
SK_DF(SK_VERB_MEM, "nca %p ALLOC", SK_KVA(nca));
190 | } |
191 | return nca; |
192 | } |
193 | |
194 | static void |
195 | na_netif_compat_free(struct nexus_adapter *na) |
196 | { |
197 | struct nexus_netif_compat_adapter *nca = |
198 | (struct nexus_netif_compat_adapter *)na; |
199 | |
200 | SK_LOCK_ASSERT_HELD(); |
201 | ASSERT(na->na_refcount == 0); |
202 | |
SK_DF(SK_VERB_MEM, "nca [dev+host] %p FREE", SK_KVA(nca));
bzero(nca, sizeof(*nca));
205 | zfree(na_netif_compat_zone, nca); |
206 | } |
207 | |
208 | /* |
209 | * Callback invoked when the device driver frees an mbuf used |
210 | * by skywalk to transmit a packet. This usually happens when |
211 | * the NIC notifies the driver that transmission is completed. |
212 | */ |
213 | static void |
214 | nx_netif_compat_ringcb(caddr_t cl, uint32_t size, caddr_t arg) |
215 | { |
216 | #pragma unused(cl, size) |
217 | struct mbuf *m = (void *)arg; |
218 | struct ifnet *ifp = NULL; |
219 | struct netif_stats *nifs = NULL; |
220 | uintptr_t data; /* not used */ |
221 | uint32_t txq; |
222 | errno_t err; |
223 | |
err = mbuf_get_tx_compl_data(m, (uintptr_t *)&ifp, &data);
225 | ASSERT(err == 0); |
226 | |
227 | nifs = &NX_NETIF_PRIVATE(NA(ifp)->nifna_up.na_nx)->nif_stats; |
228 | txq = MBUF_TXQ(m); |
229 | |
230 | for (;;) { |
231 | uint32_t p = 0, i, f; |
232 | |
(void) mbuf_cluster_get_prop(m, &p);
234 | f = NMB_GET_FLAGS(p); |
235 | i = NMB_GET_INDEX(p); |
236 | |
SK_DF(SK_VERB_NETIF, "%s m 0x%llx txq %u i %u f 0x%x",
238 | if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f); |
239 | |
240 | if (f & NMB_PROPF_TX_NOTIFY) { |
241 | uint32_t pn; |
242 | |
243 | f &= ~NMB_PROPF_TX_NOTIFY; |
244 | pn = NMB_SET_FLAGS(p, f); |
245 | |
err = mbuf_cluster_set_prop(m, p, pn);
247 | if (err != 0) { |
248 | if (err == EBUSY) { /* try again */ |
249 | continue; |
250 | } |
251 | /* TODO: adi@apple.com -- what to do? */ |
SK_ERR("Failed to clear TX_NOTIFY "
"m 0x%llx i %u err %d", SK_KVA(m), i, err);
254 | } else { |
255 | nx_netif_compat_tx_intr(ifp, NR_TX, txq, NULL); |
SK_DF(SK_VERB_NETIF | SK_VERB_INTR | SK_VERB_TX,
"%s TX irq m 0x%llx txq %u i %u f 0x%x",
258 | if_name(ifp), SK_KVA(m), MBUF_TXQ(m), i, f); |
259 | STATS_INC(nifs, NETIF_STATS_TX_IRQ); |
260 | } |
261 | } |
262 | break; |
263 | } |
264 | } |
265 | |
266 | /* Hoisted out of line to reduce kernel stack footprint */ |
267 | SK_NO_INLINE_ATTRIBUTE |
268 | static struct mbuf * |
269 | nx_netif_compat_ring_alloc(int how, int len, uint16_t idx) |
270 | { |
271 | struct mbuf *m = NULL; |
272 | size_t size = len; |
273 | uint32_t i; |
274 | |
if (mbuf_ring_cluster_alloc(how, MBUF_TYPE_HEADER, &m,
nx_netif_compat_ringcb, &size) != 0) {
277 | return NULL; |
278 | } |
279 | |
280 | for (;;) { |
281 | uint32_t p = 0, pn; |
282 | int err; |
283 | |
(void) mbuf_cluster_get_prop(m, &p);
285 | pn = NMB_SET_FLAGS(p, 0); |
286 | pn = NMB_SET_INDEX(pn, idx); |
287 | |
err = mbuf_cluster_set_prop(m, p, pn);
289 | if (err != 0) { |
290 | if (err == EBUSY) { /* try again */ |
291 | continue; |
292 | } |
SK_ERR("Failed to initialize properties m 0x%llx "
"err %d", SK_KVA(m), err);
295 | m_freem(m); |
296 | return NULL; |
297 | } |
(void) mbuf_cluster_get_prop(m, &p);
299 | i = NMB_GET_INDEX(p); |
300 | ASSERT(i == idx); |
301 | break; |
302 | } |
303 | |
SK_DF(SK_VERB_MEM, "alloc m 0x%llx size %u i %u",
305 | SK_KVA(m), (uint32_t)size, i); |
306 | |
307 | return m; |
308 | } |
309 | |
310 | /* Hoisted out of line to reduce kernel stack footprint */ |
311 | SK_NO_INLINE_ATTRIBUTE |
312 | static void |
313 | nx_netif_compat_ring_free(struct mbuf *m) |
314 | { |
315 | if (m == NULL) { |
316 | return; |
317 | } |
318 | |
319 | for (;;) { |
320 | uint32_t p = 0; |
321 | int err; |
322 | |
(void) mbuf_cluster_get_prop(m, &p);
err = mbuf_cluster_set_prop(m, p, 0);
325 | if (err != 0) { |
326 | if (err == EBUSY) { /* try again */ |
327 | continue; |
328 | } |
329 | /* TODO: adi@apple.com -- what to do? */ |
SK_ERR("Failed to clear properties m 0x%llx err %d",
331 | SK_KVA(m), err); |
332 | } |
333 | break; |
334 | } |
335 | m_freem(m); |
336 | } |
337 | |
338 | static void |
339 | nx_netif_compat_tx_intr(struct ifnet *ifp, enum txrx t, uint32_t q, |
340 | uint32_t *work_done) |
341 | { |
342 | struct nexus_adapter *na = &NA(ifp)->nifna_up; |
343 | |
344 | if (__improbable(!NA_IS_ACTIVE(na) || q >= na_get_nrings(na, t))) { |
345 | if (q >= na_get_nrings(na, t)) { |
SK_ERR("na \"%s\" (0x%llx) invalid q %u >= %u",
347 | na->na_name, SK_KVA(na), q, na_get_nrings(na, t)); |
348 | } |
349 | } else { |
350 | (void) nx_netif_mit_tx_intr((NAKR(na, t) + q), kernproc, |
351 | 0, work_done); |
352 | } |
353 | } |
354 | |
355 | static int |
356 | nx_netif_compat_na_notify_tx(struct __kern_channel_ring *kring, |
357 | struct proc *p, uint32_t flags) |
358 | { |
359 | /* |
360 | * This should never get executed, as nothing should be invoking |
361 | * the TX ring notify callback. The compat adapter directly |
362 | * calls nx_netif_compat_tx_intr() for TX completion from within |
363 | * nx_netif_compat_ringcb(). |
364 | * |
365 | * If we ever get here, use the original na_notify callback |
366 | * saved during na_activate(). |
367 | */ |
368 | return kring->ckr_netif_notify(kring, p, flags); |
369 | } |
370 | |
371 | static int |
372 | nx_netif_compat_na_notify_rx(struct __kern_channel_ring *kring, |
373 | struct proc *p, uint32_t flags) |
374 | { |
375 | /* |
376 | * This should never get executed, as nothing should be invoking |
377 | * the RX ring notify callback. The compat adapter directly |
378 | * calls nx_netif_mit_rx_intr() for RX completion from within |
379 | * nx_netif_compat_receive(). |
380 | * |
381 | * If we ever get here, use the original na_notify callback |
382 | * saved during na_activate(). |
383 | */ |
384 | return kring->ckr_netif_notify(kring, p, flags); |
385 | } |
386 | |
387 | /* Enable/disable skywalk mode for a compat network interface. */ |
388 | static int |
389 | nx_netif_compat_na_activate(struct nexus_adapter *na, na_activate_mode_t mode) |
390 | { |
391 | struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na; |
392 | boolean_t tx_mit, rx_mit, tx_mit_simple, rx_mit_simple, rxpoll; |
393 | uint32_t limit = (uint32_t)sk_netif_compat_rx_mbq_limit; |
394 | struct nx_netif *nif = nifna->nifna_netif; |
395 | struct nexus_netif_compat_adapter *nca; |
396 | ifnet_t ifp = na->na_ifp; |
397 | uint32_t i, r; |
398 | int error; |
399 | |
400 | ASSERT(na->na_type == NA_NETIF_COMPAT_DEV); |
401 | ASSERT(!(na->na_flags & NAF_HOST_ONLY)); |
402 | |
SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx) %s", na->na_name,
404 | SK_KVA(na), na_activate_mode2str(mode)); |
405 | |
406 | nca = (struct nexus_netif_compat_adapter *)nifna; |
407 | |
408 | switch (mode) { |
409 | case NA_ACTIVATE_MODE_ON: |
410 | ASSERT(SKYWALK_CAPABLE(na->na_ifp)); |
411 | |
412 | nx_netif_mit_config(nifna, &tx_mit, &tx_mit_simple, |
413 | &rx_mit, &rx_mit_simple); |
414 | |
415 | /* |
416 | * Init the mitigation support on all the dev TX rings. |
417 | */ |
if (na_get_nrings(na, NR_TX) != 0 && tx_mit) {
419 | nifna->nifna_tx_mit = |
420 | skn_alloc_type_array(tx_on, struct nx_netif_mit, |
421 | na_get_nrings(na, NR_TX), Z_WAITOK, |
422 | skmem_tag_netif_compat_mit); |
423 | if (nifna->nifna_tx_mit == NULL) { |
SK_ERR("TX mitigation allocation failed");
425 | error = ENOMEM; |
426 | goto out; |
427 | } |
428 | } else { |
429 | ASSERT(nifna->nifna_tx_mit == NULL); |
430 | } |
431 | |
432 | /* |
433 | * Init either poller or mitigation support on all the |
434 | * dev RX rings; they're mutually exclusive and poller |
435 | * takes precedence. |
436 | */ |
437 | rxpoll = (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)); |
438 | if (rxpoll) { |
439 | int err; |
440 | __unused kern_return_t kret; |
441 | thread_precedence_policy_data_t info; |
442 | |
443 | ASSERT((ifp->if_xflags & IFXF_LEGACY) == 0); |
444 | ASSERT(ifp->if_input_poll != NULL); |
445 | ASSERT(ifp->if_input_ctl != NULL); |
446 | if ((err = |
kernel_thread_start(netif_rxpoll_compat_thread_func,
ifp, &ifp->if_poll_thread)) != KERN_SUCCESS) {
panic_plain("%s: ifp=%p couldn't get a poll "
" thread; err=%d", __func__, ifp, err);
451 | /* NOTREACHED */ |
452 | __builtin_unreachable(); |
453 | } |
454 | VERIFY(ifp->if_poll_thread != NULL); |
455 | |
456 | /* wait until thread is ready */ |
lck_mtx_lock(&ifp->if_poll_lock);
while (!(ifp->if_poll_flags & IF_POLLF_READY)) {
(void) assert_wait(&ifp->if_poll_flags,
THREAD_UNINT);
lck_mtx_unlock(&ifp->if_poll_lock);
(void) thread_block(THREAD_CONTINUE_NULL);
lck_mtx_lock(&ifp->if_poll_lock);
}
lck_mtx_unlock(&ifp->if_poll_lock);

bzero(&info, sizeof(info));
info.importance = 1;
kret = thread_policy_set(ifp->if_poll_thread,
THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
THREAD_PRECEDENCE_POLICY_COUNT);
472 | ASSERT(kret == KERN_SUCCESS); |
473 | limit = if_rcvq_maxlen; |
474 | (void) netif_rxpoll_set_params(ifp, NULL, FALSE); |
475 | ASSERT(nifna->nifna_rx_mit == NULL); |
476 | } else if (rx_mit) { |
477 | nifna->nifna_rx_mit = |
478 | skn_alloc_type_array(rx_on, struct nx_netif_mit, |
479 | na_get_nrings(na, NR_RX), Z_WAITOK, |
480 | skmem_tag_netif_compat_mit); |
481 | if (nifna->nifna_rx_mit == NULL) { |
SK_ERR("RX mitigation allocation failed");
483 | if (nifna->nifna_tx_mit != NULL) { |
484 | skn_free_type_array(rx_fail, |
485 | struct nx_netif_mit, |
486 | na_get_nrings(na, NR_TX), |
487 | nifna->nifna_tx_mit); |
488 | nifna->nifna_tx_mit = NULL; |
489 | } |
490 | error = ENOMEM; |
491 | goto out; |
492 | } |
493 | } |
494 | |
495 | /* intercept na_notify callback on the TX rings */ |
for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
497 | na->na_tx_rings[r].ckr_netif_notify = |
498 | na->na_tx_rings[r].ckr_na_notify; |
499 | na->na_tx_rings[r].ckr_na_notify = |
500 | nx_netif_compat_na_notify_tx; |
501 | if (nifna->nifna_tx_mit != NULL) { |
502 | nx_netif_mit_init(nif, na->na_ifp, |
503 | &nifna->nifna_tx_mit[r], |
504 | &na->na_tx_rings[r], tx_mit_simple); |
505 | } |
506 | } |
507 | |
508 | /* intercept na_notify callback on the RX rings */ |
for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
510 | na->na_rx_rings[r].ckr_netif_notify = |
511 | na->na_rx_rings[r].ckr_na_notify; |
512 | na->na_rx_rings[r].ckr_na_notify = |
513 | nx_netif_compat_na_notify_rx; |
514 | if (nifna->nifna_rx_mit != NULL) { |
515 | nx_netif_mit_init(nif, na->na_ifp, |
516 | &nifna->nifna_rx_mit[r], |
517 | &na->na_rx_rings[r], rx_mit_simple); |
518 | } |
519 | } |
520 | /* |
521 | * Initialize the rx queue, as nx_netif_compat_receive() can |
522 | * be called as soon as nx_netif_compat_catch_rx() returns. |
523 | */ |
for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
struct __kern_channel_ring *kr = &na->na_rx_rings[r];

nx_mbq_safe_init(kr, &kr->ckr_rx_queue, limit,
&nexus_mbq_lock_group, &nexus_lock_attr);
SK_DF(SK_VERB_NETIF,
"na \"%s\" (0x%llx) initialized kr \"%s\" "
"(0x%llx) krflags 0x%b", na->na_name, SK_KVA(na),
532 | kr->ckr_name, SK_KVA(kr), kr->ckr_flags, CKRF_BITS); |
533 | } |
534 | |
535 | /* |
536 | * Prepare packet buffers for the tx rings; don't preallocate |
537 | * the mbufs here, leave this to nx_netif_compat_na_txsync(). |
538 | */ |
for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
540 | na->na_tx_rings[r].ckr_tx_pool = NULL; |
541 | } |
542 | |
for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
544 | na->na_tx_rings[r].ckr_tx_pool = |
545 | skn_alloc_type_array(tx_pool_on, struct mbuf *, |
546 | na_get_nslots(na, NR_TX), Z_WAITOK, |
547 | skmem_tag_netif_compat_pool); |
548 | if (na->na_tx_rings[r].ckr_tx_pool == NULL) { |
SK_ERR("ckr_tx_pool allocation failed");
550 | error = ENOMEM; |
551 | goto free_tx_pools; |
552 | } |
553 | } |
554 | |
555 | /* Prepare to intercept incoming traffic. */ |
error = nx_netif_compat_catch_rx(nca, TRUE);
557 | if (error != 0) { |
SK_ERR("RX intercept failed (%d)", error);
559 | goto uncatch; |
560 | } |
561 | nx_netif_filter_enable(nifna->nifna_netif); |
562 | nx_netif_flow_enable(nifna->nifna_netif); |
563 | os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed); |
564 | break; |
565 | |
566 | case NA_ACTIVATE_MODE_DEFUNCT: |
567 | ASSERT(SKYWALK_CAPABLE(na->na_ifp)); |
568 | break; |
569 | |
570 | case NA_ACTIVATE_MODE_OFF: |
571 | /* |
572 | * Note that here we cannot assert SKYWALK_CAPABLE() |
573 | * as we're called in the destructor path. |
574 | */ |
575 | os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed); |
576 | nx_netif_flow_disable(nifna->nifna_netif); |
577 | nx_netif_filter_disable(nifna->nifna_netif); |
578 | |
579 | /* |
580 | * Signal the poller thread to terminate itself, and |
581 | * wait for it to exit. |
582 | */ |
583 | if (ifp->if_poll_thread != THREAD_NULL) { |
584 | ASSERT(net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)); |
585 | ASSERT((ifp->if_xflags & IFXF_LEGACY) == 0); |
lck_mtx_lock_spin(&ifp->if_poll_lock);
ifp->if_poll_flags |= IF_POLLF_TERMINATING;
wakeup_one((caddr_t)&ifp->if_poll_thread);
lck_mtx_unlock(&ifp->if_poll_lock);
590 | |
591 | /* wait for poller thread to terminate */ |
lck_mtx_lock(&ifp->if_poll_lock);
while (ifp->if_poll_thread != THREAD_NULL) {
SK_DF(SK_VERB_NETIF_POLL,
"%s: waiting for poller thread to terminate",
if_name(ifp));
(void) msleep(&ifp->if_poll_thread,
&ifp->if_poll_lock, (PZERO - 1),
"netif_poll_thread_exit", NULL);
}
lck_mtx_unlock(&ifp->if_poll_lock);
SK_DF(SK_VERB_NETIF_POLL,
"%s: poller thread termination complete",
604 | if_name(ifp)); |
605 | } |
606 | |
607 | /* Do not intercept packets on the rx path. */ |
(void) nx_netif_compat_catch_rx(nca, FALSE);
609 | |
610 | /* Free the mbufs going to the channel rings */ |
for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
nx_mbq_safe_purge(&na->na_rx_rings[r].ckr_rx_queue);
nx_mbq_safe_destroy(&na->na_rx_rings[r].ckr_rx_queue);
614 | } |
615 | |
616 | /* reset all TX notify callbacks */ |
for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
618 | na->na_tx_rings[r].ckr_na_notify = |
619 | na->na_tx_rings[r].ckr_netif_notify; |
620 | na->na_tx_rings[r].ckr_netif_notify = NULL; |
621 | if (nifna->nifna_tx_mit != NULL) { |
622 | na->na_tx_rings[r].ckr_netif_mit_stats = NULL; |
623 | nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]); |
624 | } |
625 | } |
626 | |
627 | if (nifna->nifna_tx_mit != NULL) { |
628 | skn_free_type_array(tx_off, struct nx_netif_mit, |
629 | na_get_nrings(na, NR_TX), nifna->nifna_tx_mit); |
630 | nifna->nifna_tx_mit = NULL; |
631 | } |
632 | |
633 | /* reset all RX notify callbacks */ |
for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
635 | na->na_rx_rings[r].ckr_na_notify = |
636 | na->na_rx_rings[r].ckr_netif_notify; |
637 | na->na_rx_rings[r].ckr_netif_notify = NULL; |
638 | if (nifna->nifna_rx_mit != NULL) { |
639 | na->na_rx_rings[r].ckr_netif_mit_stats = NULL; |
640 | nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]); |
641 | } |
642 | } |
643 | if (nifna->nifna_rx_mit != NULL) { |
644 | skn_free_type_array(rx_off, struct nx_netif_mit, |
645 | na_get_nrings(na, NR_RX), nifna->nifna_rx_mit); |
646 | nifna->nifna_rx_mit = NULL; |
647 | } |
648 | |
for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
for (i = 0; i < na_get_nslots(na, NR_TX); i++) {
nx_netif_compat_ring_free(na->
652 | na_tx_rings[r].ckr_tx_pool[i]); |
653 | na->na_tx_rings[r].ckr_tx_pool[i] = NULL; |
654 | } |
655 | skn_free_type_array(tx_pool_off, |
656 | struct mbuf *, na_get_nslots(na, NR_TX), |
657 | na->na_tx_rings[r].ckr_tx_pool); |
658 | } |
659 | break; |
660 | |
661 | default: |
662 | VERIFY(0); |
663 | /* NOTREACHED */ |
664 | __builtin_unreachable(); |
665 | } |
666 | |
667 | return 0; |
668 | |
669 | uncatch: |
(void) nx_netif_compat_catch_rx(nca, FALSE);
671 | |
672 | free_tx_pools: |
for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
674 | if (na->na_tx_rings[r].ckr_tx_pool == NULL) { |
675 | continue; |
676 | } |
for (i = 0; i < na_get_nslots(na, NR_TX); i++) {
nx_netif_compat_ring_free(
na->na_tx_rings[r].ckr_tx_pool[i]);
680 | na->na_tx_rings[r].ckr_tx_pool[i] = NULL; |
681 | } |
682 | skn_free_type_array(tx_pool, struct mbuf *, |
683 | na_get_nslots(na, NR_TX), na->na_tx_rings[r].ckr_tx_pool); |
684 | na->na_tx_rings[r].ckr_tx_pool = NULL; |
685 | } |
686 | if (nifna->nifna_tx_mit != NULL) { |
for (r = 0; r < na_get_nrings(na, NR_TX); r++) {
688 | nx_netif_mit_cleanup(&nifna->nifna_tx_mit[r]); |
689 | } |
690 | skn_free_type_array(tx, struct nx_netif_mit, |
691 | na_get_nrings(na, NR_TX), nifna->nifna_tx_mit); |
692 | nifna->nifna_tx_mit = NULL; |
693 | } |
694 | if (nifna->nifna_rx_mit != NULL) { |
for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
696 | nx_netif_mit_cleanup(&nifna->nifna_rx_mit[r]); |
697 | } |
698 | skn_free_type_array(rx, struct nx_netif_mit, |
699 | na_get_nrings(na, NR_RX), nifna->nifna_rx_mit); |
700 | nifna->nifna_rx_mit = NULL; |
701 | } |
for (r = 0; r < na_get_nrings(na, NR_RX); r++) {
nx_mbq_safe_destroy(&na->na_rx_rings[r].ckr_rx_queue);
704 | } |
705 | out: |
706 | |
707 | return error; |
708 | } |
709 | |
710 | /* |
711 | * Record completed transmissions and update ktail. |
712 | * |
713 | * The oldest tx buffer not yet completed is at ckr_ktail + 1, |
714 | * ckr_khead is the first unsent buffer. |
715 | */ |
716 | /* Hoisted out of line to reduce kernel stack footprint */ |
717 | SK_NO_INLINE_ATTRIBUTE |
718 | static uint32_t |
719 | nx_netif_compat_tx_clean(struct netif_stats *nifs, |
720 | struct __kern_channel_ring *kring) |
721 | { |
722 | const slot_idx_t lim = kring->ckr_lim; |
slot_idx_t nm_i = SLOT_NEXT(kring->ckr_ktail, lim);
724 | slot_idx_t khead = kring->ckr_khead; |
725 | uint32_t n = 0; |
726 | struct mbuf **ckr_tx_pool = kring->ckr_tx_pool; |
727 | |
728 | while (nm_i != khead) { /* buffers not completed */ |
729 | struct mbuf *m = ckr_tx_pool[nm_i]; |
730 | |
731 | if (__improbable(m == NULL)) { |
732 | /* this is done, try to replenish the entry */ |
733 | VERIFY(nm_i <= UINT16_MAX); |
734 | ckr_tx_pool[nm_i] = m = |
735 | nx_netif_compat_ring_alloc(M_WAITOK, |
kring->ckr_max_pkt_len, (uint16_t)nm_i);
737 | if (__improbable(m == NULL)) { |
738 | STATS_INC(nifs, NETIF_STATS_DROP_NOMEM_MBUF); |
739 | STATS_INC(nifs, NETIF_STATS_DROP); |
SK_DF(SK_VERB_MEM,
"mbuf allocation failed (slot %u)", nm_i);
742 | /* XXX how do we proceed ? break ? */ |
743 | return -ENOMEM; |
744 | } |
} else if (mbuf_ring_cluster_is_active(m)) {
746 | break; /* This mbuf is still busy */ |
747 | } |
748 | n++; |
nm_i = SLOT_NEXT(nm_i, lim);
750 | } |
kring->ckr_ktail = SLOT_PREV(nm_i, lim);
752 | |
SK_RDF(SK_VERB_NETIF, 10, "kr \"%s\" (0x%llx) tx completed [%u] -> "
"kh %u kt %u | rh %u rt %u", kring->ckr_name, SK_KVA(kring),
755 | n, kring->ckr_khead, kring->ckr_ktail, |
756 | kring->ckr_rhead, kring->ckr_rtail); |
757 | |
758 | return n; |
759 | } |
760 | |
761 | /* Hoisted out of line to reduce kernel stack footprint */ |
762 | SK_NO_INLINE_ATTRIBUTE |
763 | static void |
764 | nx_netif_compat_set_tx_event(struct __kern_channel_ring *kring, |
765 | slot_idx_t khead) |
766 | { |
767 | const slot_idx_t lim = kring->ckr_lim; |
slot_idx_t ntc = SLOT_NEXT(kring->ckr_ktail, lim); /* next to clean */
769 | struct mbuf *m; |
770 | slot_idx_t e; |
771 | |
772 | if (ntc == khead) { |
773 | return; /* all buffers are free */ |
774 | } |
775 | /* |
* We have pending packets in the driver between ckr_ktail+1 and
777 | * ckr_khead, and we have to choose one of these slots to generate |
778 | * a TX notification. There is a race, but this is only called |
779 | * within TX sync which does a double check. |
780 | */ |
781 | if (__probable(netif_tx_event_mode == 0)) { |
782 | /* |
783 | * Choose the first pending slot, to be safe against drivers |
784 | * reordering mbuf transmissions. |
785 | */ |
786 | e = ntc; |
787 | } else { |
788 | /* |
789 | * Choose a slot in the middle, so that we don't risk ending |
* up in a situation where the client continuously wakes up,
* fills one or a few TX slots and goes to sleep again.
792 | */ |
793 | slot_idx_t n = lim + 1; |
794 | |
795 | if (khead >= ntc) { |
796 | e = (khead + ntc) >> 1; |
797 | } else { /* wrap around */ |
798 | e = (khead + n + ntc) >> 1; |
799 | if (e >= n) { |
800 | e -= n; |
801 | } |
802 | } |
803 | |
804 | if (__improbable(e >= n)) { |
SK_ERR("This cannot happen");
806 | e = 0; |
807 | } |
808 | } |
809 | m = kring->ckr_tx_pool[e]; |
810 | |
811 | for (;;) { |
812 | uint32_t p = 0, pn, i, f; |
813 | int err; |
814 | |
(void) mbuf_cluster_get_prop(m, &p);
816 | f = NMB_GET_FLAGS(p); |
817 | i = NMB_GET_INDEX(p); |
818 | |
819 | if (f & NMB_PROPF_TX_NOTIFY) { |
820 | /* |
821 | * This can happen if there is already an event |
822 | * on the ring slot 'e': There is nothing to do. |
823 | */ |
SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX,
"TX_NOTIFY already set at %u m 0x%llx kc %u ntc %u",
826 | e, SK_KVA(m), khead, ntc); |
827 | return; |
828 | } |
829 | |
830 | f |= NMB_PROPF_TX_NOTIFY; |
831 | pn = NMB_SET_FLAGS(p, f); |
832 | |
err = mbuf_cluster_set_prop(m, p, pn);
834 | if (err != 0) { |
835 | if (err == EBUSY) { /* try again */ |
836 | continue; |
837 | } |
838 | /* TODO: adi@apple.com -- what to do? */ |
SK_ERR("Failed to set TX_NOTIFY at %u m 0x%llx kh %u "
"ntc %u, err %d", e, SK_KVA(m), khead, ntc, err);
841 | } else { |
SK_DF(SK_VERB_NETIF | SK_VERB_NOTIFY | SK_VERB_TX,
"Request TX_NOTIFY at %u m 0x%llx kh %u ntc %u",
844 | e, SK_KVA(m), khead, ntc); |
845 | } |
846 | break; |
847 | } |
848 | } |
849 | |
850 | #if SK_LOG |
851 | /* Hoisted out of line to reduce kernel stack footprint */ |
852 | SK_LOG_ATTRIBUTE |
853 | static void |
854 | nx_netif_compat_na_txsync_log(struct __kern_channel_ring *kring, |
855 | struct proc *p, uint32_t flags, slot_idx_t nm_i) |
856 | { |
SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_TX,
"%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0x%x "
"nm_i %u, kh %u kt %u | rh %u rt %u",
860 | sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name, |
861 | SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id, |
862 | flags, nm_i, kring->ckr_khead, kring->ckr_ktail, |
863 | kring->ckr_rhead, kring->ckr_rtail); |
864 | } |
865 | #endif /* SK_LOG */ |
866 | |
867 | /* |
868 | * nx_netif_compat_na_txsync() transforms packets into mbufs and passes |
869 | * them to the device driver. |
870 | */ |
871 | static int |
872 | nx_netif_compat_na_txsync(struct __kern_channel_ring *kring, struct proc *p, |
873 | uint32_t flags) |
874 | { |
875 | #pragma unused(p) |
876 | struct nexus_adapter *na = KRNA(kring); |
877 | struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats; |
878 | slot_idx_t nm_i; /* index into the channel ring */ // j |
879 | const slot_idx_t head = kring->ckr_rhead; |
880 | uint32_t slot_count = 0; |
881 | uint32_t byte_count = 0; |
882 | |
883 | STATS_INC(nifs, NETIF_STATS_TX_SYNC); |
884 | |
885 | /* update our work timestamp */ |
886 | na->na_work_ts = _net_uptime; |
887 | |
888 | /* |
889 | * First part: process new packets to send. |
890 | */ |
891 | nm_i = kring->ckr_khead; |
892 | if (nm_i != head) { /* we have new packets to send */ |
893 | while (nm_i != head) { |
894 | struct __kern_slot_desc *sd = KR_KSD(kring, nm_i); |
895 | |
896 | /* device-specific */ |
897 | struct mbuf *m; |
898 | int tx_ret; |
899 | /* |
900 | * Take a mbuf from the tx pool (replenishing the pool |
901 | * entry if necessary) and copy in the user packet. |
902 | */ |
903 | VERIFY(nm_i <= UINT16_MAX); |
904 | m = kring->ckr_tx_pool[nm_i]; |
905 | if (__improbable(m == NULL)) { |
906 | kring->ckr_tx_pool[nm_i] = m = |
907 | nx_netif_compat_ring_alloc(M_WAITOK, |
kring->ckr_max_pkt_len, (uint16_t)nm_i);
909 | if (__improbable(m == NULL)) { |
910 | STATS_INC(nifs, NETIF_STATS_DROP); |
911 | STATS_INC(nifs, |
912 | NETIF_STATS_DROP_NOMEM_MBUF); |
SK_DF(SK_VERB_MEM,
"%s(%d) kr \"%s\" (0x%llx) "
"krflags 0x%b ckr_tx_pool[%u] "
"allocation failed",
917 | sk_proc_name_address(p), |
918 | sk_proc_pid(p), kring->ckr_name, |
919 | SK_KVA(kring), kring->ckr_flags, |
920 | CKRF_BITS, nm_i); |
921 | /* |
922 | * Here we could schedule a timer |
923 | * which retries to replenish after |
924 | * a while, and notifies the client |
925 | * when it manages to replenish some |
* slot. In any case we break early
927 | * to avoid crashes. |
928 | */ |
929 | break; |
930 | } |
931 | STATS_INC(nifs, NETIF_STATS_TX_REPL); |
932 | } |
933 | |
934 | byte_count += sd->sd_pkt->pkt_length; |
935 | slot_count++; |
936 | |
937 | /* |
938 | * We should ask notifications when CS_REPORT is set, |
939 | * or roughly every half ring. To optimize this, |
940 | * we set a notification event when the client runs |
941 | * out of TX ring space, or when transmission fails. |
942 | * In the latter case we also break early. |
943 | */ |
944 | tx_ret = nx_netif_compat_xmit_frame(na, m, sd->sd_pkt); |
945 | if (__improbable(tx_ret)) { |
SK_RD(5, "start_xmit failed: err %d "
"[nm_i %u, h %u, kt %u]",
948 | tx_ret, nm_i, head, kring->ckr_ktail); |
949 | /* |
950 | * No room for this mbuf in the device driver. |
951 | * Request a notification FOR A PREVIOUS MBUF, |
952 | * then call nx_netif_compat_tx_clean(kring) to |
953 | * do the double check and see if we can free |
954 | * more buffers. If there is space continue, |
955 | * else break; NOTE: the double check is |
956 | * necessary if the problem occurs in the |
957 | * txsync call after selrecord(). Also, we |
958 | * need some way to tell the caller that not |
959 | * all buffers were queued onto the device |
960 | * (this was not a problem with native skywalk |
961 | * driver where space is preallocated). The |
962 | * bridge has a similar problem and we solve |
963 | * it there by dropping the excess packets. |
964 | */ |
nx_netif_compat_set_tx_event(kring, nm_i);
966 | if (nx_netif_compat_tx_clean(nifs, kring)) { |
967 | /* space now available */ |
968 | continue; |
969 | } else { |
970 | break; |
971 | } |
972 | } |
nm_i = SLOT_NEXT(nm_i, kring->ckr_lim);
974 | STATS_INC(nifs, NETIF_STATS_TX_PACKETS); |
975 | } |
976 | |
977 | /* |
978 | * Update khead to the next slot to transmit; Here nm_i |
* is not necessarily head, we could break early.
980 | */ |
981 | kring->ckr_khead = nm_i; |
982 | |
983 | kr_update_stats(kring, slot_count, byte_count); |
984 | } |
985 | |
986 | /* |
987 | * Second, reclaim completed buffers |
988 | */ |
989 | if ((flags & NA_SYNCF_FORCE_RECLAIM) || kr_txempty(kring)) { |
990 | /* |
991 | * No more available slots? Set a notification event on a |
992 | * channel slot that will be cleaned in the future. No |
993 | * doublecheck is performed, since nx_netif_compat_na_txsync() |
994 | * will be called twice by ch_event(). |
995 | */ |
nx_netif_compat_set_tx_event(kring, nm_i);
997 | } |
998 | kring->ckr_pending_intr = 0; |
999 | |
1000 | #if SK_LOG |
1001 | if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) { |
1002 | nx_netif_compat_na_txsync_log(kring, p, flags, nm_i); |
1003 | } |
1004 | #endif /* SK_LOG */ |
1005 | |
1006 | (void) nx_netif_compat_tx_clean(nifs, kring); |
1007 | |
1008 | return 0; |
1009 | } |
1010 | |
1011 | #if SK_LOG |
1012 | /* Hoisted out of line to reduce kernel stack footprint */ |
1013 | SK_LOG_ATTRIBUTE |
1014 | static void |
1015 | nx_netif_compat_receive_log1(const struct __kern_channel_ring *kring, |
1016 | struct nx_mbq *q) |
1017 | { |
SK_RD(10, "kr \"%s\" (0x%llx) krflags 0x%b FULL "
"(qlen %u qsize %llu), kc %u kt %u", kring->ckr_name,
1020 | SK_KVA(kring), kring->ckr_flags, CKRF_BITS, nx_mbq_len(q), |
1021 | nx_mbq_size(q), kring->ckr_khead, kring->ckr_ktail); |
1022 | } |
1023 | |
1024 | /* Hoisted out of line to reduce kernel stack footprint */ |
1025 | SK_LOG_ATTRIBUTE |
1026 | static void |
1027 | nx_netif_compat_receive_log2(const struct __kern_channel_ring *kring, |
1028 | struct nx_mbq *q, const struct ifnet_stat_increment_param *s) |
1029 | { |
SK_RDF(SK_VERB_RX, 10, "kr \"%s\" (0x%llx) krflags 0x%b OK, "
"added %u packets %u bytes, now qlen %u qsize %llu",
1032 | kring->ckr_name, SK_KVA(kring), kring->ckr_flags, CKRF_BITS, |
1033 | s->packets_in, s->bytes_in, nx_mbq_len(q), nx_mbq_size(q)); |
1034 | } |
1035 | #endif /* SK_LOG */ |
1036 | |
1037 | /* |
1038 | * This is the default RX path for the compat netif nexus. Packets |
1039 | * are enqueued and later extracted by nx_netif_compat_na_rxsync(). |
1040 | */ |
1041 | /* TODO: adi@apple.com -- implement chaining */ |
1042 | static errno_t |
1043 | nx_netif_compat_receive(struct ifnet *ifp, struct mbuf *m_head, |
1044 | struct mbuf *m_tail, const struct ifnet_stat_increment_param *s, |
1045 | boolean_t poll, struct thread *tp) |
1046 | { |
1047 | #pragma unused(tp) |
1048 | boolean_t ifp_rxpoll = ((ifp->if_eflags & IFEF_RXPOLL) && net_rxpoll); |
1049 | struct nexus_adapter *na = &NA(ifp)->nifna_up; |
1050 | struct __kern_channel_ring *kring; |
1051 | struct netif_stats *nifs; |
1052 | uint32_t r, work_done; |
1053 | unsigned int qlimit; |
1054 | struct nx_mbq *q; |
1055 | errno_t err = 0; |
1056 | |
1057 | /* update our work timestamp */ |
1058 | na->na_work_ts = _net_uptime; |
1059 | |
1060 | if (__improbable(m_head == NULL)) { |
1061 | ASSERT(m_tail == NULL); |
1062 | ASSERT(poll); |
1063 | ASSERT(s->bytes_in == 0); |
1064 | ASSERT(s->packets_in == 0); |
1065 | } |
1066 | |
1067 | /* BEGIN CSTYLED */ |
1068 | /* |
1069 | * TODO: adi@apple.com -- this needs to be revisited once we |
1070 | * have a clear definition of how multiple RX rings are mapped |
1071 | * to flows; this would involve the hardware/driver doing some |
1072 | * kind of classification and RSS-like demuxing. |
1073 | * |
1074 | * When we enable that, we'll need to consider sifting thru the |
1075 | * mbuf chain we get from the caller, and enqueue them across |
1076 | * per-ring temporary mbuf queue (along with marking the ring |
1077 | * indicating pending packets.) During second stage processing, |
1078 | * we'll issue nx_netif_mit_rx_intr() on each marked ring to |
1079 | * dispatch the packets upstream. |
1080 | * |
1081 | * r = MBUF_RXQ(m); |
1082 | * |
1083 | * if (r >= na->na_num_rx_rings) |
1084 | * r = r % na->na_num_rx_rings; |
1085 | * |
1086 | * kring = &na->na_rx_rings[r]; |
1087 | * q = &kring->ckr_rx_queue; |
1088 | * |
1089 | * For now, target only the first RX ring (ring 0). |
1090 | */ |
1091 | /* END CSTYLED */ |
1092 | r = 0; /* receive ring number */ |
1093 | kring = &na->na_rx_rings[r]; |
1094 | |
1095 | ASSERT(na->na_type == NA_NETIF_COMPAT_DEV); |
1096 | nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats; |
1097 | |
1098 | if (__improbable((!NA_IS_ACTIVE(na)) || KR_DROP(kring))) { |
1099 | /* BEGIN CSTYLED */ |
1100 | /* |
1101 | * If we deal with multiple rings, change above to: |
1102 | * |
1103 | * if (!NA_IS_ACTIVE(na) || r >= na_get_nrings(na, NR_RX))) |
1104 | * |
1105 | * then here do: |
1106 | * |
1107 | * if (r >= na_get_nrings(na, NR_RX)) { |
1108 | * SK_ERR("na \"%s\" (0x%llx) invalid r %u >= %u", |
1109 | * na->na_name, SK_KVA(na), r, |
1110 | * na_get_nrings(na, NR_RX)); |
1111 | * } |
1112 | */ |
1113 | /* END CSTYLED */ |
1114 | m_freem_list(m_head); |
1115 | if (!NA_IS_ACTIVE(na)) { |
1116 | STATS_ADD(nifs, NETIF_STATS_DROP_NA_INACTIVE, |
1117 | s->packets_in); |
1118 | } else if (KR_DROP(kring)) { |
1119 | STATS_ADD(nifs, NETIF_STATS_DROP_KRDROP_MODE, |
1120 | s->packets_in); |
1121 | } |
1122 | STATS_ADD(nifs, NETIF_STATS_DROP, s->packets_in); |
1123 | err = ENXIO; |
1124 | goto done; |
1125 | } |
1126 | if (__improbable(m_head == NULL)) { |
1127 | goto send_packets; |
1128 | } |
1129 | |
1130 | q = &kring->ckr_rx_queue; |
1131 | nx_mbq_lock_spin(q); |
1132 | qlimit = nx_mbq_limit(q); |
1133 | if (ifp_rxpoll) { |
1134 | /* |
1135 | * qlimit of the receive queue is much smaller when the |
* interface is in opportunistic polling mode. In this case
* when the interface is operating in interrupt mode,
* a sudden burst of input packets can cause the receive queue
* to quickly build up due to scheduling latency in waking up
1140 | * the poller thread. To avoid drops here due to this latency |
1141 | * we provide a leeway on the qlimit. |
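* The shift below allows for up to 32x the configured limit.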
1142 | */ |
1143 | qlimit <<= 5; |
1144 | } |
1145 | if (__improbable(nx_mbq_len(q) > qlimit)) { |
1146 | #if SK_LOG |
1147 | if (__improbable(sk_verbose != 0)) { |
1148 | nx_netif_compat_receive_log1(kring, q); |
1149 | } |
1150 | #endif /* SK_LOG */ |
1151 | nx_mbq_unlock(q); |
1152 | m_freem_list(m_head); |
1153 | STATS_ADD(nifs, NETIF_STATS_DROP_RXQ_OVFL, s->packets_in); |
1154 | STATS_ADD(nifs, NETIF_STATS_DROP, s->packets_in); |
1155 | goto send_packets; |
1156 | } |
nx_mbq_enq_multi(q, m_head, m_tail, s->packets_in, s->bytes_in);
1158 | |
1159 | #if SK_LOG |
1160 | if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) { |
1161 | nx_netif_compat_receive_log2(kring, q, s); |
1162 | } |
1163 | #endif /* SK_LOG */ |
1164 | |
1165 | nx_mbq_unlock(q); |
1166 | |
(void) ifnet_stat_increment_in(ifp, s->packets_in, s->bytes_in,
s->errors_in);
1169 | |
1170 | if (poll) { |
1171 | /* update incremental poll stats */ |
1172 | PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in); |
1173 | } |
1174 | |
1175 | send_packets: |
1176 | /* |
* if the interface supports opportunistic input polling, then the
1178 | * input packet processing is performed in context of the poller thread. |
1179 | */ |
1180 | if (!poll && ifp_rxpoll) { |
1181 | /* wakeup the poller thread */ |
1182 | ifnet_poll(ifp); |
1183 | } else { |
1184 | /* |
1185 | * wakeup the mitigation thread if needed to perform input |
1186 | * packet processing. |
* if the interface supports opportunistic input polling, then
1188 | * mitigation thread is not created and the input packet |
1189 | * processing happens in context of the poller thread. |
1190 | */ |
err = nx_netif_mit_rx_intr((NAKR(na, NR_RX) + r), kernproc, 0,
1192 | &work_done); |
1193 | } |
1194 | done: |
1195 | return err; |
1196 | } |
1197 | |
1198 | #if SK_LOG |
1199 | /* Hoisted out of line to reduce kernel stack footprint */ |
1200 | SK_LOG_ATTRIBUTE |
1201 | static void |
1202 | nx_netif_compat_na_rxsync_log(const struct __kern_channel_ring *kring, |
1203 | struct proc *p, uint32_t flags, slot_idx_t nm_i) |
1204 | { |
SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
"%s(%d) kr \"%s\" (0x%llx) krflags 0x%b "
"ring %u flags 0x%x nm_i %u kt %u", sk_proc_name_address(p),
1208 | sk_proc_pid(p), kring->ckr_name, SK_KVA(kring), kring->ckr_flags, |
1209 | CKRF_BITS, kring->ckr_ring_id, flags, nm_i, kring->ckr_ktail); |
1210 | } |
1211 | #endif /* SK_LOG */ |
1212 | |
1213 | #if DEBUG || DEVELOPMENT |
1214 | /* |
1215 | * Split an mbuf chain at offset "split", such that the first mbuf |
1216 | * is a zero-length M_PKTHDR, followed by the rest of the mbufs. |
1217 | * Typically, the "split" value is equal to the size of the link |
1218 | * layer header, e.g. Ethernet header. |
1219 | */ |
1220 | static struct mbuf * |
1221 | nx_netif_rx_split(struct mbuf *m0, uint32_t split) |
1222 | { |
1223 | struct mbuf *m = m0; |
1224 | |
1225 | if (split == 0) { |
1226 | split = MHLEN; |
1227 | M_PREPEND(m, split, M_DONTWAIT, 0); |
1228 | } else { |
1229 | m->m_data -= split; |
1230 | m->m_len += split; |
1231 | m_pktlen(m) += split; |
1232 | |
1233 | ASSERT((uintptr_t)m->m_data >= (uintptr_t)mbuf_datastart(m)); |
1234 | ASSERT((uintptr_t)m->m_data < ((uintptr_t)mbuf_datastart(m) + |
1235 | mbuf_maxlen(m))); |
1236 | } |
1237 | if (m != NULL) { |
1238 | struct mbuf *n = m_split(m, split, M_DONTWAIT); |
1239 | if (n == NULL) { |
1240 | m_freem(m); |
1241 | return NULL; |
1242 | } |
1243 | m0 = m; |
1244 | ASSERT((uint32_t)m->m_len == split); |
1245 | m->m_data += split; |
1246 | m->m_len -= split; |
1247 | while (m->m_next != NULL) { |
1248 | m = m->m_next; |
1249 | } |
1250 | m->m_next = n; |
1251 | m = m0; |
1252 | m_pktlen(m) = m_length2(m, NULL); |
1253 | } |
1254 | |
1255 | return m; |
1256 | } |
1257 | #endif /* DEBUG || DEVELOPMENT */ |
1258 | |
1259 | /* |
1260 | * nx_netif_compat_na_rxsync() extracts mbufs from the queue filled by |
1261 | * nx_netif_compat_receive() and puts their content in the channel |
1262 | * receive ring. |
1263 | * |
1264 | * Accesses to kring are serialized via kring->ckr_rx_queue lock, because |
* the rx handler is asynchronous.
1266 | */ |
1267 | static int |
1268 | nx_netif_compat_na_rxsync(struct __kern_channel_ring *kring, struct proc *p, |
1269 | uint32_t flags) |
1270 | { |
1271 | #pragma unused(p) |
1272 | struct nexus_adapter *na = KRNA(kring); |
1273 | struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na; |
1274 | struct nx_netif *nif = nifna->nifna_netif; |
1275 | slot_idx_t nm_i; /* index into the channel ring */ |
1276 | struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats; |
1277 | uint32_t npkts = 0; |
1278 | uint32_t byte_count = 0; |
1279 | const slot_idx_t lim = kring->ckr_lim; |
1280 | const slot_idx_t head = kring->ckr_rhead; |
1281 | boolean_t force_update = ((flags & NA_SYNCF_FORCE_READ) || |
1282 | kring->ckr_pending_intr != 0); |
1283 | struct mbuf *m; |
1284 | uint32_t n; |
1285 | uint32_t avail; /* in slots */ |
1286 | int err, mlen; |
1287 | boolean_t attach_mbuf = FALSE; |
1288 | struct nx_mbq *q, tmpq; |
1289 | struct kern_pbufpool *pp = kring->ckr_pp; |
1290 | uint32_t ph_cnt, i = 0; |
1291 | |
1292 | ASSERT(pp->pp_max_frags == 1); |
1293 | ASSERT(head <= lim); |
1294 | |
1295 | /* |
1296 | * First part: skip past packets that userspace has released. |
1297 | * This can possibly make room for the second part. |
1298 | * equivalent to kr_reclaim() |
1299 | */ |
1300 | if (kring->ckr_khead != head) { |
1301 | kring->ckr_khead = head; |
1302 | /* ensure global visibility */ |
1303 | os_atomic_thread_fence(seq_cst); |
1304 | } |
1305 | |
1306 | STATS_INC(nifs, NETIF_STATS_RX_SYNC); |
1307 | |
1308 | /* |
1309 | * Second part: import newly received packets. |
1310 | */ |
1311 | if (!force_update) { |
1312 | return 0; |
1313 | } |
1314 | |
1315 | /* update our work timestamp */ |
1316 | na->na_work_ts = _net_uptime; |
1317 | |
1318 | /* first empty slot in the receive ring */ |
1319 | nm_i = kring->ckr_ktail; |
1320 | |
1321 | /* |
1322 | * Compute the available space (in bytes) in this ring. |
1323 | * The first slot that is not considered in is the one |
1324 | * before ckr_khead. |
1325 | */ |
avail = kr_available_slots_rxring(kring);
1327 | if (__improbable(avail == 0)) { |
1328 | return 0; |
1329 | } |
1330 | |
1331 | if (NA_KERNEL_ONLY(na)) { |
1332 | ASSERT(na->na_ifp != NULL && |
1333 | fsw_ifp_to_fsw(na->na_ifp) != NULL); |
1334 | /* |
1335 | * We are not supporting attachment to bridge flowswitch |
1336 | * for now, until we support PKT_F_MBUF_DATA packets |
1337 | * in bridge flowswitch. |
1338 | */ |
1339 | attach_mbuf = TRUE; |
1340 | } |
1341 | |
1342 | /* |
1343 | * Quickly move all of ckr_rx_queue to a temporary queue to dequeue |
1344 | * from. For each mbuf, attach or copy it to the packet attached |
1345 | * to the slot. Release the lock while we're doing that, to allow |
1346 | * for the input thread to enqueue. |
1347 | */ |
1348 | q = &kring->ckr_rx_queue; |
nx_mbq_init(&tmpq, NX_MBQ_NO_LIMIT);
1350 | nx_mbq_lock_spin(q); |
1351 | nx_mbq_concat(&tmpq, q); |
1352 | nx_mbq_unlock(q); |
1353 | |
1354 | if (__improbable(nx_mbq_len(&tmpq) == 0)) { |
1355 | return 0; |
1356 | } |
1357 | |
1358 | ph_cnt = MIN(avail, nx_mbq_len(&tmpq)); |
err = kern_pbufpool_alloc_batch_nosleep(pp, 1, kring->ckr_scratch,
&ph_cnt);
1361 | if (err == ENOMEM) { |
SK_DF(SK_VERB_MEM, "%s(%p) failed to alloc %d pkts for kr "
"0x%llu", sk_proc_name_address(p), sk_proc_pid(p), ph_cnt,
1364 | SK_KVA(kring)); |
1365 | goto done; |
1366 | } |
1367 | ASSERT(ph_cnt != 0); |
1368 | |
1369 | for (n = 0; (n < ph_cnt) && |
((m = nx_mbq_deq(&tmpq)) != NULL); n++) {
1371 | struct __kern_slot_desc *ksd = KR_KSD(kring, nm_i); |
1372 | struct __kern_packet *pkt; |
1373 | kern_packet_t ph; |
1374 | uint8_t hlen; |
1375 | uint16_t tag; |
1376 | char *h; |
1377 | |
1378 | ASSERT(m->m_flags & M_PKTHDR); |
1379 | mlen = m_pktlen(m); |
1380 | h = m->m_pkthdr.pkt_hdr; |
1381 | if (__improbable(mlen == 0 || h == NULL || |
1382 | h < (char *)mbuf_datastart(m) || h > (char *)m->m_data)) { |
1383 | STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); |
SK_RD(5, "kr \"%s\" (0x%llx) m 0x%llx len %d "
"bad pkt_hdr", kring->ckr_name,
1386 | SK_KVA(kring), SK_KVA(m), mlen); |
1387 | m_freem(m); |
1388 | m = NULL; |
1389 | continue; |
1390 | } |
1391 | |
1392 | hlen = (uint8_t)(m->m_data - (uintptr_t)h); |
1393 | mlen += hlen; |
1394 | |
1395 | #if DEBUG || DEVELOPMENT |
1396 | if (__improbable(netif_rx_split != 0)) { |
1397 | /* callee frees mbuf upon failure */ |
1398 | if ((m = nx_netif_rx_split(m, hlen)) == NULL) { |
1399 | continue; |
1400 | } |
1401 | |
1402 | ASSERT((uintptr_t)m->m_data >= |
1403 | (uintptr_t)mbuf_datastart(m)); |
1404 | ASSERT((uintptr_t)m->m_data < |
1405 | ((uintptr_t)mbuf_datastart(m) + |
1406 | mbuf_maxlen(m))); |
1407 | } |
1408 | #endif /* DEBUG || DEVELOPMENT */ |
1409 | |
1410 | ph = kring->ckr_scratch[i]; |
1411 | ASSERT(ph != 0); |
1412 | kring->ckr_scratch[i] = 0; |
1413 | pkt = SK_PTR_ADDR_KPKT(ph); |
1414 | ++i; |
1415 | |
1416 | /* |
1417 | * Wind back the data pointer to include any frame headers |
1418 | * as part of the copy below. The header length is then |
1419 | * stored in the corresponding metadata area of the buffer. |
1420 | */ |
1421 | m->m_data -= hlen; |
1422 | m->m_len += hlen; |
1423 | m->m_pkthdr.len += hlen; |
1424 | ASSERT(mlen == m->m_pkthdr.len); |
1425 | |
1426 | pkt->pkt_link_flags = 0; |
1427 | if (m->m_flags & M_HASFCS) { |
1428 | pkt->pkt_link_flags |= PKT_LINKF_ETHFCS; |
1429 | } |
if (mbuf_get_vlan_tag(m, &tag) == 0) {
1431 | (void) kern_packet_set_vlan_tag(SK_PKT2PH(pkt), tag, |
1432 | FALSE); |
1433 | } |
SK_DF(SK_VERB_NETIF | SK_VERB_SYNC | SK_VERB_RX,
"kr \"%s\" (0x%llx) m 0x%llx idx %u slot_len %d",
1436 | kring->ckr_name, SK_KVA(kring), SK_KVA(m), nm_i, mlen); |
1437 | |
1438 | if (__probable(attach_mbuf)) { |
1439 | STATS_INC(nifs, NETIF_STATS_RX_COPY_ATTACH); |
err = __packet_initialize_with_mbuf(pkt, m, 0, hlen);
1441 | VERIFY(err == 0); |
1442 | } else if (__probable(mlen <= (int)PP_BUF_SIZE_DEF(pp))) { |
1443 | STATS_INC(nifs, NETIF_STATS_RX_COPY_DIRECT); |
1444 | /* |
1445 | * We're sending this up to a user channel opened |
1446 | * directly to the netif; copy everything. |
1447 | */ |
err = __packet_set_headroom(ph, 0);
1449 | VERIFY(err == 0); |
err = __packet_set_link_header_length(ph, hlen);
1451 | VERIFY(err == 0); |
1452 | nif->nif_pkt_copy_from_mbuf(NR_RX, ph, 0, m, 0, |
1453 | mlen, FALSE, 0); |
1454 | /* finalize and attach the packet */ |
1455 | err = __packet_finalize(ph); |
1456 | VERIFY(err == 0); |
1457 | m_freem(m); |
1458 | m = NULL; |
1459 | } else { |
1460 | STATS_INC(nifs, NETIF_STATS_DROP_BADLEN); |
1461 | STATS_INC(nifs, NETIF_STATS_DROP); |
1462 | m_freem(m); |
1463 | m = NULL; |
kern_pbufpool_free(pp, ph);
1465 | ph = 0; |
1466 | pkt = NULL; |
1467 | continue; |
1468 | } |
1469 | |
1470 | err = KR_SLOT_ATTACH_METADATA(kring, ksd, |
(struct __kern_quantum *)pkt);
1472 | ASSERT(err == 0); |
1473 | |
1474 | byte_count += mlen; |
1475 | ++npkts; |
1476 | ASSERT(npkts < kring->ckr_num_slots); |
nm_i = SLOT_NEXT(nm_i, lim);
1478 | } |
1479 | |
1480 | if (__improbable(i < ph_cnt)) { |
kern_pbufpool_free_batch(pp, &kring->ckr_scratch[i],
(ph_cnt - i));
1483 | } |
1484 | |
1485 | ASSERT(npkts <= ph_cnt); |
kr_update_stats(kring, npkts, byte_count);
1487 | |
1488 | if (npkts != 0) { |
1489 | kring->ckr_ktail = nm_i; |
1490 | STATS_ADD(nifs, NETIF_STATS_RX_PACKETS, npkts); |
1491 | } |
1492 | kring->ckr_pending_intr = 0; |
1493 | |
1494 | #if SK_LOG |
1495 | if (__improbable((sk_verbose & SK_VERB_NETIF) != 0)) { |
1496 | nx_netif_compat_na_rxsync_log(kring, p, flags, nm_i); |
1497 | } |
1498 | #endif /* SK_LOG */ |
1499 | |
1500 | done: |
1501 | /* |
1502 | * If we didn't process all packets in temporary queue, |
1503 | * move them back to the head of ckr_rx_queue. |
1504 | */ |
1505 | if (!nx_mbq_empty(&tmpq)) { |
1506 | nx_mbq_lock_spin(q); |
1507 | nx_mbq_concat(&tmpq, q); |
1508 | ASSERT(nx_mbq_empty(q)); |
1509 | nx_mbq_concat(q, &tmpq); |
1510 | nx_mbq_unlock(q); |
1511 | } |
1512 | ASSERT(nx_mbq_empty(&tmpq)); |
1513 | |
1514 | return 0; |
1515 | } |
1516 | |
1517 | static void |
1518 | nx_netif_compat_na_dtor(struct nexus_adapter *na) |
1519 | { |
1520 | struct ifnet *ifp; |
1521 | struct nexus_netif_compat_adapter *nca = |
1522 | (struct nexus_netif_compat_adapter *)na; |
1523 | |
1524 | SK_LOCK_ASSERT_HELD(); |
1525 | |
SK_DF(SK_VERB_NETIF, "na \"%s\" (0x%llx)", na->na_name, SK_KVA(na));
1527 | |
1528 | /* |
1529 | * If the finalizer callback hasn't been called for whatever |
1530 | * reasons, pick up the embryonic ifnet stored in na_private. |
1531 | * Otherwise, release the I/O refcnt of a non-NULL na_ifp. |
1532 | */ |
1533 | if ((ifp = na->na_ifp) == NULL) { |
1534 | ifp = na->na_private; |
1535 | na->na_private = NULL; |
1536 | } else { |
1537 | ifnet_decr_iorefcnt(ifp); |
1538 | na->na_ifp = NULL; |
1539 | } |
1540 | |
1541 | if (nca->nca_up.nifna_netif != NULL) { |
1542 | nx_netif_release(nca->nca_up.nifna_netif); |
1543 | nca->nca_up.nifna_netif = NULL; |
1544 | } |
1545 | ASSERT(!SKYWALK_NATIVE(ifp)); |
1546 | } |
1547 | |
1548 | /* |
1549 | * nx_netif_compat_attach() makes it possible to use skywalk on |
1550 | * a device without native skywalk support. |
1551 | * This is less performant than native support but potentially |
1552 | * faster than raw sockets or similar schemes. |
1553 | */ |
1554 | int |
1555 | nx_netif_compat_attach(struct kern_nexus *nx, struct ifnet *ifp) |
1556 | { |
1557 | struct nx_netif *nif = NX_NETIF_PRIVATE(nx); |
1558 | struct nxprov_params *nxp = NX_PROV(nx)->nxprov_params; |
1559 | struct nexus_netif_compat_adapter *devnca = NULL; |
1560 | struct nexus_netif_compat_adapter *hostnca = NULL; |
1561 | struct nexus_adapter *devna = NULL; |
1562 | struct nexus_adapter *hostna = NULL; |
1563 | boolean_t embryonic = FALSE; |
1564 | uint32_t tx_rings, tx_slots; |
1565 | int retval = 0; |
1566 | |
1567 | SK_LOCK_ASSERT_HELD(); |
1568 | ASSERT(!SKYWALK_NATIVE(ifp)); |
1569 | ASSERT(!SKYWALK_CAPABLE(ifp)); |
1570 | ASSERT(ifp->if_na == NULL); |
1571 | ASSERT(ifp->if_na_ops == NULL); |
1572 | |
devnca = na_netif_compat_alloc(Z_WAITOK);
hostnca = na_netif_compat_alloc(Z_WAITOK);
1575 | |
1576 | /* |
1577 | * We can be called for two different interface states: |
1578 | * |
1579 | * Fully attached: get an io ref count; upon success, this |
1580 | * holds a reference to the ifnet for the ifp pointer stored |
1581 | * in 'na_ifp' down below for both adapters. |
1582 | * |
* Embryonic: temporarily hold the ifnet in na_private, which
1584 | * upon a successful ifnet_attach(), will be moved over to |
1585 | * the 'na_ifp' with an io ref count held. |
1586 | * |
1587 | * The ifnet in 'na_ifp' will be released by na_release_locked(). |
1588 | */ |
if (!ifnet_is_attached(ifp, 1)) {
1590 | if (!(ifp->if_refflags & IFRF_EMBRYONIC)) { |
1591 | ifp = NULL; |
1592 | retval = ENXIO; |
1593 | goto err; |
1594 | } |
1595 | embryonic = TRUE; |
1596 | } |
1597 | |
1598 | /* initialize the (compat) device netif adapter */ |
1599 | devnca->nca_up.nifna_netif = nif; |
1600 | nx_netif_retain(nif); |
1601 | devna = &devnca->nca_up.nifna_up; |
1602 | (void) strncpy(devna->na_name, ifp->if_xname, sizeof(devna->na_name) - 1); |
1603 | devna->na_name[sizeof(devna->na_name) - 1] = '\0'; |
uuid_generate_random(devna->na_uuid);
1605 | if (embryonic) { |
1606 | /* |
1607 | * We will move this over to na_ifp once |
1608 | * the interface is fully attached. |
1609 | */ |
1610 | devna->na_private = ifp; |
1611 | ASSERT(devna->na_ifp == NULL); |
1612 | } else { |
1613 | ASSERT(devna->na_private == NULL); |
1614 | /* use I/O refcnt from ifnet_is_attached() */ |
1615 | devna->na_ifp = ifp; |
1616 | } |
1617 | |
1618 | devna->na_type = NA_NETIF_COMPAT_DEV; |
1619 | devna->na_free = na_netif_compat_free; |
1620 | devna->na_activate = nx_netif_compat_na_activate; |
1621 | devna->na_txsync = nx_netif_compat_na_txsync; |
1622 | devna->na_rxsync = nx_netif_compat_na_rxsync; |
1623 | devna->na_dtor = nx_netif_compat_na_dtor; |
1624 | devna->na_krings_create = nx_netif_dev_krings_create; |
1625 | devna->na_krings_delete = nx_netif_dev_krings_delete; |
1626 | devna->na_special = nx_netif_na_special; |
1627 | |
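/* na_stats_type is read-only after setup; cast away const for this one-time init */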
1628 | *(nexus_stats_type_t *)(uintptr_t)&devna->na_stats_type = |
1629 | NEXUS_STATS_TYPE_INVALID; |
1630 | |
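/* expose dev TX rings/slots only if direct netif access is allowed for this interface */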
1631 | if (skywalk_netif_direct_allowed(ifp->if_xname)) { |
1632 | tx_rings = nxp->nxp_tx_rings; |
1633 | tx_slots = nxp->nxp_tx_slots; |
1634 | } else { |
1635 | tx_rings = 0; |
1636 | tx_slots = 0; |
1637 | } |
na_set_nrings(devna, NR_TX, tx_rings);
na_set_nrings(devna, NR_RX, nxp->nxp_rx_rings);
na_set_nslots(devna, NR_TX, tx_slots);
na_set_nslots(devna, NR_RX, nxp->nxp_rx_slots);
1642 | /* |
1643 | * Verify upper bounds; the parameters must have already been |
1644 | * validated by nxdom_prov_params() by the time we get here. |
1645 | */ |
1646 | ASSERT(na_get_nrings(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_rings.nb_max); |
1647 | ASSERT(na_get_nrings(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_rings.nb_max); |
1648 | ASSERT(na_get_nslots(devna, NR_TX) <= NX_DOM(nx)->nxdom_tx_slots.nb_max); |
1649 | ASSERT(na_get_nslots(devna, NR_RX) <= NX_DOM(nx)->nxdom_rx_slots.nb_max); |
1650 | |
1651 | na_attach_common(devna, nx, &nx_netif_compat_prov_s); |
1652 | |
1653 | if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx), |
1654 | nx, devna)) != 0) { |
1655 | ASSERT(devna->na_arena == NULL); |
1656 | /* we've transferred the refcnt to na_ifp above */ |
1657 | ifp = NULL; |
1658 | goto err; |
1659 | } |
1660 | ASSERT(devna->na_arena != NULL); |
1661 | |
1662 | *(uint32_t *)(uintptr_t)&devna->na_flowadv_max = nxp->nxp_flowadv_max; |
1663 | ASSERT(devna->na_flowadv_max == 0 || |
1664 | skmem_arena_nexus(devna->na_arena)->arn_flowadv_obj != NULL); |
1665 | |
1666 | /* setup packet copy routines */ |
if (skmem_arena_nexus(devna->na_arena)->arn_rx_pp->pp_max_frags > 1) {
1668 | nif->nif_pkt_copy_from_mbuf = |
1669 | pkt_copy_multi_buflet_from_mbuf; |
1670 | nif->nif_pkt_copy_to_mbuf = |
1671 | pkt_copy_multi_buflet_to_mbuf; |
1672 | } else { |
1673 | nif->nif_pkt_copy_from_mbuf = pkt_copy_from_mbuf; |
1674 | nif->nif_pkt_copy_to_mbuf = pkt_copy_to_mbuf; |
1675 | } |
1676 | |
1677 | /* initialize the host netif adapter */ |
1678 | hostnca->nca_up.nifna_netif = nif; |
1679 | nx_netif_retain(nif); |
1680 | hostna = &hostnca->nca_up.nifna_up; |
(void) snprintf(hostna->na_name, sizeof(hostna->na_name),
"%s^", devna->na_name);
uuid_generate_random(hostna->na_uuid);
1684 | if (embryonic) { |
1685 | /* |
1686 | * We will move this over to na_ifp once |
1687 | * the interface is fully attached. |
1688 | */ |
1689 | hostna->na_private = ifp; |
1690 | ASSERT(hostna->na_ifp == NULL); |
1691 | } else { |
1692 | ASSERT(hostna->na_private == NULL); |
1693 | hostna->na_ifp = devna->na_ifp; |
1694 | ifnet_incr_iorefcnt(hostna->na_ifp); |
1695 | } |
1696 | hostna->na_type = NA_NETIF_COMPAT_HOST; |
1697 | hostna->na_free = na_netif_compat_free; |
1698 | hostna->na_activate = nx_netif_host_na_activate; |
1699 | hostna->na_txsync = nx_netif_host_na_txsync; |
1700 | hostna->na_rxsync = nx_netif_host_na_rxsync; |
1701 | hostna->na_dtor = nx_netif_compat_na_dtor; |
1702 | hostna->na_krings_create = nx_netif_host_krings_create; |
1703 | hostna->na_krings_delete = nx_netif_host_krings_delete; |
1704 | hostna->na_special = nx_netif_host_na_special; |
1705 | |
1706 | os_atomic_or(&hostna->na_flags, NAF_HOST_ONLY, relaxed); |
1707 | *(nexus_stats_type_t *)(uintptr_t)&hostna->na_stats_type = |
1708 | NEXUS_STATS_TYPE_INVALID; |
1709 | |
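/* the host adapter uses a single TX ring and no RX rings */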
na_set_nrings(hostna, NR_TX, 1);
na_set_nrings(hostna, NR_RX, 0);
na_set_nslots(hostna, NR_TX, nxp->nxp_tx_slots);
na_set_nslots(hostna, NR_RX, 0);
1714 | |
1715 | na_attach_common(hostna, nx, &nx_netif_prov_s); |
1716 | |
1717 | if ((retval = NX_DOM_PROV(nx)->nxdom_prov_mem_new(NX_DOM_PROV(nx), |
1718 | nx, hostna)) != 0) { |
1719 | ASSERT(hostna->na_arena == NULL); |
1720 | /* we've transferred the refcnt to na_ifp above */ |
1721 | ifp = NULL; |
1722 | goto err; |
1723 | } |
1724 | ASSERT(hostna->na_arena != NULL); |
1725 | |
1726 | *(uint32_t *)(uintptr_t)&hostna->na_flowadv_max = nxp->nxp_flowadv_max; |
1727 | ASSERT(hostna->na_flowadv_max == 0 || |
1728 | skmem_arena_nexus(hostna->na_arena)->arn_flowadv_obj != NULL); |
1729 | |
1730 | /* these will be undone by destructor */ |
1731 | ifp->if_na_ops = &na_netif_compat_ops; |
1732 | ifp->if_na = &devnca->nca_up; |
na_retain_locked(devna);
na_retain_locked(hostna);
1735 | |
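/* mark the interface as skywalk-capable (compat mode) */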
1736 | SKYWALK_SET_CAPABLE(ifp); |
1737 | |
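/* bind the dev and host adapters to their reserved nexus ports */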
1738 | NETIF_WLOCK(nif); |
1739 | nif->nif_ifp = ifp; |
1740 | retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_DEV, NULL, &devna, kernproc); |
1741 | ASSERT(retval == 0); |
1742 | retval = nx_port_alloc(nx, NEXUS_PORT_NET_IF_HOST, NULL, &hostna, kernproc); |
1743 | ASSERT(retval == 0); |
1744 | NETIF_WUNLOCK(nif); |
1745 | |
1746 | #if SK_LOG |
1747 | uuid_string_t uuidstr; |
1748 | SK_DF(SK_VERB_NETIF, "na_name: \"%s\"" , devna->na_name); |
1749 | SK_DF(SK_VERB_NETIF, " UUID: %s" , |
1750 | sk_uuid_unparse(devna->na_uuid, uuidstr)); |
1751 | SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")" , |
1752 | SK_KVA(devna->na_nx), NX_DOM(devna->na_nx)->nxdom_name, |
1753 | NX_DOM_PROV(devna->na_nx)->nxdom_prov_name); |
1754 | SK_DF(SK_VERB_NETIF, " flags: 0x%b" , devna->na_flags, NAF_BITS); |
1755 | SK_DF(SK_VERB_NETIF, " flowadv_max: %u" , devna->na_flowadv_max); |
1756 | SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u" , |
1757 | na_get_nrings(devna, NR_TX), na_get_nrings(devna, NR_RX)); |
1758 | SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u" , |
1759 | na_get_nslots(devna, NR_TX), na_get_nslots(devna, NR_RX)); |
1760 | #if CONFIG_NEXUS_USER_PIPE |
1761 | SK_DF(SK_VERB_NETIF, " next_pipe: %u" , devna->na_next_pipe); |
1762 | SK_DF(SK_VERB_NETIF, " max_pipes: %u" , devna->na_max_pipes); |
1763 | #endif /* CONFIG_NEXUS_USER_PIPE */ |
1764 | SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]" , |
1765 | SK_KVA(ifp), ifp->if_xname, ifp->if_refio); |
1766 | SK_DF(SK_VERB_NETIF, "hostna: \"%s\"" , hostna->na_name); |
1767 | SK_DF(SK_VERB_NETIF, " UUID: %s" , |
1768 | sk_uuid_unparse(hostna->na_uuid, uuidstr)); |
1769 | SK_DF(SK_VERB_NETIF, " nx: 0x%llx (\"%s\":\"%s\")" , |
1770 | SK_KVA(hostna->na_nx), NX_DOM(hostna->na_nx)->nxdom_name, |
1771 | NX_DOM_PROV(hostna->na_nx)->nxdom_prov_name); |
1772 | SK_DF(SK_VERB_NETIF, " flags: 0x%b" , |
1773 | hostna->na_flags, NAF_BITS); |
1774 | SK_DF(SK_VERB_NETIF, " flowadv_max: %u" , hostna->na_flowadv_max); |
1775 | SK_DF(SK_VERB_NETIF, " rings: tx %u rx %u" , |
1776 | na_get_nrings(hostna, NR_TX), na_get_nrings(hostna, NR_RX)); |
1777 | SK_DF(SK_VERB_NETIF, " slots: tx %u rx %u" , |
1778 | na_get_nslots(hostna, NR_TX), na_get_nslots(hostna, NR_RX)); |
1779 | #if CONFIG_NEXUS_USER_PIPE |
1780 | SK_DF(SK_VERB_NETIF, " next_pipe: %u" , hostna->na_next_pipe); |
1781 | SK_DF(SK_VERB_NETIF, " max_pipes: %u" , hostna->na_max_pipes); |
1782 | #endif /* CONFIG_NEXUS_USER_PIPE */ |
1783 | SK_DF(SK_VERB_NETIF, " ifp: 0x%llx %s [ioref %u]" , SK_KVA(ifp), |
1784 | ifp->if_xname, ifp->if_refio); |
1785 | #endif /* SK_LOG */ |
1786 | |
1787 | err: |
1788 | if (retval != 0) { |
1789 | ASSERT(ifp == NULL); |
1790 | if (devna != NULL) { |
1791 | if (devna->na_arena != NULL) { |
1792 | skmem_arena_release(devna->na_arena); |
1793 | devna->na_arena = NULL; |
1794 | } |
1795 | if (devna->na_ifp != NULL) { |
1796 | ifnet_decr_iorefcnt(devna->na_ifp); |
1797 | devna->na_ifp = NULL; |
1798 | } |
1799 | devna->na_private = NULL; |
1800 | } |
1801 | if (hostna != NULL) { |
1802 | if (hostna->na_arena != NULL) { |
1803 | skmem_arena_release(hostna->na_arena); |
1804 | hostna->na_arena = NULL; |
1805 | } |
1806 | if (hostna->na_ifp != NULL) { |
1807 | ifnet_decr_iorefcnt(hostna->na_ifp); |
1808 | hostna->na_ifp = NULL; |
1809 | } |
1810 | hostna->na_private = NULL; |
1811 | } |
1812 | if (devnca != NULL) { |
1813 | if (devnca->nca_up.nifna_netif != NULL) { |
1814 | nx_netif_release(devnca->nca_up.nifna_netif); |
1815 | devnca->nca_up.nifna_netif = NULL; |
1816 | } |
na_netif_compat_free((struct nexus_adapter *)devnca);
1818 | } |
1819 | if (hostnca != NULL) { |
1820 | if (hostnca->nca_up.nifna_netif != NULL) { |
1821 | nx_netif_release(hostnca->nca_up.nifna_netif); |
1822 | hostnca->nca_up.nifna_netif = NULL; |
1823 | } |
na_netif_compat_free((struct nexus_adapter *)hostnca);
1825 | } |
1826 | } |
1827 | return retval; |
1828 | } |
1829 | |
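/*
* Finalizer for the compat adapters; invoked once the ifnet is fully
* attached, handing off to the common na_netif_finalize() which moves
* the embryonic ifnet from na_private over to na_ifp.
*/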
1830 | static void |
1831 | na_netif_compat_finalize(struct nexus_netif_adapter *nifna, struct ifnet *ifp) |
1832 | { |
1833 | na_netif_finalize(nifna, ifp); |
1834 | } |
1835 | |
1836 | /* |
* Intercept the rx routine in the standard device driver.
* The second argument is non-zero to intercept, 0 to restore the
* original input handler.
1839 | */ |
1840 | static int |
1841 | nx_netif_compat_catch_rx(struct nexus_netif_compat_adapter *nca, |
1842 | boolean_t enable) |
1843 | { |
1844 | struct ifnet *ifp = nca->nca_up.nifna_up.na_ifp; |
1845 | int err = 0; |
1846 | |
1847 | ASSERT(!(nca->nca_up.nifna_up.na_flags & NAF_HOST_ONLY)); |
1848 | |
1849 | if (enable) { |
err = dlil_set_input_handler(ifp, nx_netif_compat_receive);
1851 | } else { |
1852 | dlil_reset_input_handler(ifp); |
1853 | } |
1854 | return err; |
1855 | } |
1856 | |
1857 | /* |
1858 | * Transmit routine used by nx_netif_compat_na_txsync(). Returns 0 on success |
1859 | * and non-zero on error (which may be packet drops or other errors). |
* 'pkt' identifies the channel packet to transmit; 'm' is the
* (preallocated) mbuf to use for the transmission.
1862 | * |
1863 | * We should add a reference to the mbuf so the m_freem() at the end |
1864 | * of the transmission does not consume resources. |
1865 | * |
1866 | * On FreeBSD, and on multiqueue cards, we can force the queue using |
1867 | * if (M_HASHTYPE_GET(m) != M_HASHTYPE_NONE) |
1868 | * i = m->m_pkthdr.flowid % adapter->num_queues; |
1869 | * else |
1870 | * i = curcpu % adapter->num_queues; |
1871 | * |
1872 | */ |
1873 | static int |
1874 | nx_netif_compat_xmit_frame(struct nexus_adapter *na, struct mbuf *m, |
1875 | struct __kern_packet *pkt) |
1876 | { |
1877 | struct nexus_netif_adapter *nifna = (struct nexus_netif_adapter *)na; |
1878 | struct nx_netif *nif = nifna->nifna_netif; |
1879 | struct netif_stats *nifs = &NX_NETIF_PRIVATE(na->na_nx)->nif_stats; |
1880 | struct ifnet *ifp = na->na_ifp; |
1881 | kern_packet_t ph = SK_PTR_ENCODE(pkt, METADATA_TYPE(pkt), |
1882 | METADATA_SUBTYPE(pkt)); |
1883 | uint32_t len; |
1884 | int ret = 0; |
1885 | |
if ((ret = mbuf_ring_cluster_activate(m)) != 0) {
panic("Failed to activate mbuf ring cluster 0x%llx (%d)",
SK_KVA(m), ret);
1889 | /* NOTREACHED */ |
1890 | __builtin_unreachable(); |
1891 | } |
1892 | |
1893 | len = pkt->pkt_length; |
1894 | |
1895 | /* |
1896 | * The mbuf should be a cluster from our special pool, |
1897 | * so we do not need to do an m_copyback but just copy. |
1898 | */ |
1899 | if (m->m_ext.ext_size < len) { |
1900 | SK_RD(5, "size %u < len %u" , m->m_ext.ext_size, len); |
1901 | len = m->m_ext.ext_size; |
1902 | } |
1903 | |
1904 | STATS_INC(nifs, NETIF_STATS_TX_COPY_MBUF); |
1905 | if (PACKET_HAS_PARTIAL_CHECKSUM(pkt)) { |
1906 | STATS_INC(nifs, NETIF_STATS_TX_COPY_SUM); |
1907 | } |
1908 | |
1909 | nif->nif_pkt_copy_to_mbuf(NR_TX, ph, pkt->pkt_headroom, m, 0, len, |
1910 | PACKET_HAS_PARTIAL_CHECKSUM(pkt), pkt->pkt_csum_tx_start_off); |
1911 | |
1912 | /* used for tx notification */ |
ret = mbuf_set_tx_compl_data(m, (uintptr_t)ifp, (uintptr_t)NULL);
1914 | ASSERT(ret == 0); |
1915 | |
1916 | ret = dlil_output_handler(ifp, m); |
1917 | return ret; |
1918 | } |
1919 | |