1/*
2 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * Copyright (C) 2014 Giuseppe Lettieri. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
/*
 * $FreeBSD$
 *
 * Monitors
 *
 * netmap monitors can be used to monitor the network traffic of another
 * adapter, as long as the monitored adapter is working in netmap mode.
 *
 * Monitors offer to userspace the same interface as any other netmap port,
 * with as many pairs of netmap rings as the monitored adapter.
 * However, only the rx rings are actually used. Each monitor rx ring receives
 * the traffic transiting on both the corresponding tx and rx rings of the
 * monitored adapter. During registration, the user can choose whether to
 * intercept tx only, rx only, or both tx and rx traffic.
 *
 * If the monitor is not able to cope with the stream of frames, excess traffic
 * will be dropped.
 *
 * If the monitored adapter leaves netmap mode, the monitor has to be restarted.
 *
 * Monitors can be either zero-copy or copy-based.
 *
 * Copy monitors see the frames before they are consumed:
 *
 *  - For tx traffic, this is when the application sends them, before they are
 *    passed down to the adapter.
 *
 *  - For rx traffic, this is when they are received by the adapter, before
 *    they are sent up to the application, if any (note that, if no
 *    application is reading from a monitored ring, the ring will eventually
 *    fill up and traffic will stop).
 *
 * Zero-copy monitors only see the frames after they have been consumed:
 *
 *  - For tx traffic, this is after the slots containing the frames have been
 *    marked as free. Note that this may happen with a considerable delay after
 *    frame transmission, since freeing of slots is often done lazily.
 *
 *  - For rx traffic, this is after the consumer on the monitored adapter
 *    has released them. In most cases, the consumer is a userspace
 *    application which may have modified the frame contents.
 *
 * Several copy monitors may be active on any ring. Zero-copy monitors,
 * instead, need exclusive access to each of the monitored rings. This may
 * change in the future, if we implement zero-copy monitor chaining.
 *
 */
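
/*
 * Illustrative sketch (not part of the implementation): how a channel
 * request would ask for the monitor modes described above.  The cr_mode
 * flag names below do appear in this file; everything else (the callers
 * and the surrounding setup) is hypothetical and only meant as an example.
 *
 *	struct chreq chr;
 *	...
 *	chr.cr_mode |= CHMODE_MONITOR_TX | CHMODE_MONITOR_RX; // tap both directions
 *	chr.cr_mode |= CHMODE_MONITOR_NO_COPY;                // ask for a zero-copy monitor
 *
 * nx_monitor_na_find() below recognizes CHMODE_MONITOR in cr_mode and
 * builds the monitor adapter on top of the (already active) monitored
 * adapter.
 */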
101
102#include <skywalk/os_skywalk_private.h>
103#include <skywalk/nexus/monitor/nx_monitor.h>
104
105static int nx_mon_na_txsync(struct __kern_channel_ring *, struct proc *,
106 uint32_t);
107static int nx_mon_na_rxsync(struct __kern_channel_ring *, struct proc *,
108 uint32_t);
109static int nx_mon_na_krings_create(struct nexus_adapter *,
110 struct kern_channel *);
111static void nx_mon_na_krings_delete(struct nexus_adapter *,
112 struct kern_channel *, boolean_t);
113static uint32_t nx_mon_txrx2chmode(enum txrx);
114static int nx_mon_kr_alloc(struct __kern_channel_ring *, uint32_t);
115static void nx_mon_kr_dealloc(struct __kern_channel_ring *);
116static int nx_mon_na_krings_locks(struct nexus_adapter *,
117 uint32_t[NR_TXRX], uint32_t[NR_TXRX]);
118static void nx_mon_na_krings_unlock(struct nexus_adapter *,
119 const uint32_t[NR_TXRX], const uint32_t[NR_TXRX]);
static int nx_mon_enable(struct nexus_adapter *, boolean_t);
121static void nx_mon_disable(struct nexus_adapter *);
122static int nx_mon_add(struct __kern_channel_ring *,
123 struct __kern_channel_ring *, boolean_t);
124static void nx_mon_del(struct __kern_channel_ring *,
125 struct __kern_channel_ring *, boolean_t);
126static int nx_mon_na_activate_common(struct nexus_adapter *,
127 na_activate_mode_t, boolean_t);
128static pkt_copy_from_pkt_t nx_mon_quantum_copy_64x;
129
130static int nx_mon_zcopy_parent_sync(struct __kern_channel_ring *,
131 struct proc *, uint32_t, enum txrx);
132static int nx_mon_zcopy_na_activate(struct nexus_adapter *, na_activate_mode_t);
133static void nx_mon_zcopy_na_dtor(struct nexus_adapter *);
134
135static void nx_mon_parent_sync(struct __kern_channel_ring *, struct proc *,
136 slot_idx_t, int);
137static int nx_mon_na_activate(struct nexus_adapter *, na_activate_mode_t);
138static void nx_mon_na_dtor(struct nexus_adapter *);
139
/*
 * Monitors work by replacing the nm_sync() and possibly the
 * nm_notify() callbacks in the monitored rings.
 */
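
/*
 * A minimal sketch of that interception, in terms of the ckr_* fields used
 * below; "monitor_wrapper" is a hypothetical name, the real replacements
 * are nx_mon_parent_txsync(), nx_mon_parent_rxsync() and the zcopy
 * variants installed by nx_mon_add():
 *
 *	// first monitor attaches: save and replace the sync callback
 *	kring->ckr_mon_sync = kring->ckr_na_sync;
 *	kring->ckr_na_sync = monitor_wrapper;
 *
 *	static int
 *	monitor_wrapper(struct __kern_channel_ring *kring, struct proc *p,
 *	    uint32_t flags)
 *	{
 *		// forward/copy the newly produced slots to each monitor
 *		// rx ring, then run the saved callback so the monitored
 *		// adapter keeps working as before
 *		return kring->ckr_mon_sync(kring, p, NA_SYNCF_MONITOR | flags);
 *	}
 */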
144static int nx_mon_zcopy_parent_txsync(struct __kern_channel_ring *,
145 struct proc *, uint32_t);
146static int nx_mon_zcopy_parent_rxsync(struct __kern_channel_ring *,
147 struct proc *, uint32_t);
148static int nx_mon_parent_txsync(struct __kern_channel_ring *,
149 struct proc *, uint32_t);
150static int nx_mon_parent_rxsync(struct __kern_channel_ring *,
151 struct proc *, uint32_t);
152static int nx_mon_parent_notify(struct __kern_channel_ring *,
153 struct proc *, uint32_t);
154
155static void nx_mon_dom_init(struct nxdom *);
156static void nx_mon_dom_terminate(struct nxdom *);
157static void nx_mon_dom_fini(struct nxdom *);
158static int nx_mon_dom_bind_port(struct kern_nexus *, nexus_port_t *,
159 struct nxbind *, void *);
160static int nx_mon_dom_unbind_port(struct kern_nexus *, nexus_port_t);
161static int nx_mon_dom_connect(struct kern_nexus_domain_provider *,
162 struct kern_nexus *, struct kern_channel *, struct chreq *,
163 struct kern_channel *, struct nxbind *, struct proc *);
164static void nx_mon_dom_disconnect(struct kern_nexus_domain_provider *,
165 struct kern_nexus *, struct kern_channel *);
166static void nx_mon_dom_defunct(struct kern_nexus_domain_provider *,
167 struct kern_nexus *, struct kern_channel *, struct proc *);
168static void nx_mon_dom_defunct_finalize(struct kern_nexus_domain_provider *,
169 struct kern_nexus *, struct kern_channel *, boolean_t);
170
171static int nx_mon_prov_init(struct kern_nexus_domain_provider *);
172static int nx_mon_prov_params_adjust(const struct kern_nexus_domain_provider *,
173 const struct nxprov_params *, struct nxprov_adjusted_params *);
174static int nx_mon_prov_params(struct kern_nexus_domain_provider *,
175 const uint32_t, const struct nxprov_params *, struct nxprov_params *,
176 struct skmem_region_params[SKMEM_REGIONS], uint32_t);
177static int nx_mon_prov_mem_new(struct kern_nexus_domain_provider *,
178 struct kern_nexus *, struct nexus_adapter *);
179static void nx_mon_prov_fini(struct kern_nexus_domain_provider *);
180
181static struct nexus_monitor_adapter *na_mon_alloc(zalloc_flags_t);
182static void na_mon_free(struct nexus_adapter *);
183
184struct nxdom nx_monitor_dom_s = {
185 .nxdom_prov_head =
186 STAILQ_HEAD_INITIALIZER(nx_monitor_dom_s.nxdom_prov_head),
187 .nxdom_type = NEXUS_TYPE_MONITOR,
188 .nxdom_md_type = NEXUS_META_TYPE_QUANTUM,
189 .nxdom_md_subtype = NEXUS_META_SUBTYPE_PAYLOAD,
190 .nxdom_name = "monitor",
191 /*
192 * The following values don't really matter much, as a monitor
193 * isn't usable on its own; we just define them as non-zeroes.
194 */
195 .nxdom_ports = {
196 .nb_def = 1,
197 .nb_min = 1,
198 .nb_max = 1,
199 },
200 .nxdom_tx_rings = {
201 .nb_def = 1,
202 .nb_min = 1,
203 .nb_max = 1,
204 },
205 .nxdom_rx_rings = {
206 .nb_def = 1,
207 .nb_min = 1,
208 .nb_max = 1,
209 },
210 .nxdom_tx_slots = {
211 .nb_def = 1,
212 .nb_min = 1,
213 .nb_max = 1,
214 },
215 .nxdom_rx_slots = {
216 .nb_def = 1,
217 .nb_min = 1,
218 .nb_max = 1,
219 },
220 .nxdom_buf_size = {
221 .nb_def = 64,
222 .nb_min = 64,
223 .nb_max = 64,
224 },
225 .nxdom_large_buf_size = {
226 .nb_def = 0,
227 .nb_min = 0,
228 .nb_max = 0,
229 },
230 .nxdom_meta_size = {
231 .nb_def = NX_METADATA_OBJ_MIN_SZ,
232 .nb_min = NX_METADATA_OBJ_MIN_SZ,
233 .nb_max = NX_METADATA_USR_MAX_SZ,
234 },
235 .nxdom_stats_size = {
236 .nb_def = 0,
237 .nb_min = 0,
238 .nb_max = NX_STATS_MAX_SZ,
239 },
240 .nxdom_pipes = {
241 .nb_def = 0,
242 .nb_min = 0,
243 .nb_max = 0,
244 },
245 .nxdom_flowadv_max = {
246 .nb_def = 0,
247 .nb_min = 0,
248 .nb_max = NX_FLOWADV_MAX,
249 },
250 .nxdom_nexusadv_size = {
251 .nb_def = 0,
252 .nb_min = 0,
253 .nb_max = NX_NEXUSADV_MAX_SZ,
254 },
255 .nxdom_capabilities = {
256 .nb_def = NXPCAP_USER_CHANNEL,
257 .nb_min = NXPCAP_USER_CHANNEL,
258 .nb_max = NXPCAP_USER_CHANNEL,
259 },
260 .nxdom_qmap = {
261 .nb_def = NEXUS_QMAP_TYPE_INVALID,
262 .nb_min = NEXUS_QMAP_TYPE_INVALID,
263 .nb_max = NEXUS_QMAP_TYPE_INVALID,
264 },
265 .nxdom_max_frags = {
266 .nb_def = NX_PBUF_FRAGS_DEFAULT,
267 .nb_min = NX_PBUF_FRAGS_MIN,
268 .nb_max = NX_PBUF_FRAGS_DEFAULT,
269 },
270 .nxdom_init = nx_mon_dom_init,
271 .nxdom_terminate = nx_mon_dom_terminate,
272 .nxdom_fini = nx_mon_dom_fini,
273 .nxdom_find_port = NULL,
274 .nxdom_port_is_reserved = NULL,
275 .nxdom_bind_port = nx_mon_dom_bind_port,
276 .nxdom_unbind_port = nx_mon_dom_unbind_port,
277 .nxdom_connect = nx_mon_dom_connect,
278 .nxdom_disconnect = nx_mon_dom_disconnect,
279 .nxdom_defunct = nx_mon_dom_defunct,
280 .nxdom_defunct_finalize = nx_mon_dom_defunct_finalize,
281};
282
283static struct kern_nexus_domain_provider nx_monitor_prov_s = {
284 .nxdom_prov_name = NEXUS_PROVIDER_MONITOR,
285 .nxdom_prov_flags = NXDOMPROVF_DEFAULT,
286 .nxdom_prov_cb = {
287 .dp_cb_init = nx_mon_prov_init,
288 .dp_cb_fini = nx_mon_prov_fini,
289 .dp_cb_params = nx_mon_prov_params,
290 .dp_cb_mem_new = nx_mon_prov_mem_new,
291 .dp_cb_config = NULL,
292 .dp_cb_nx_ctor = NULL,
293 .dp_cb_nx_dtor = NULL,
294 .dp_cb_nx_mem_info = NULL, /* not supported */
295 .dp_cb_nx_mib_get = NULL,
296 },
297};
298
299static SKMEM_TYPE_DEFINE(na_mon_zone, struct nexus_monitor_adapter);
300
301#define SKMEM_TAG_MONITORS "com.apple.skywalk.monitors"
302static SKMEM_TAG_DEFINE(skmem_tag_monitors, SKMEM_TAG_MONITORS);
303
304static void
305nx_mon_dom_init(struct nxdom *nxdom)
306{
307 SK_LOCK_ASSERT_HELD();
308 ASSERT(!(nxdom->nxdom_flags & NEXUSDOMF_INITIALIZED));
309
310 (void) nxdom_prov_add(nxdom, &nx_monitor_prov_s);
311}
312
313static void
314nx_mon_dom_terminate(struct nxdom *nxdom)
315{
316 struct kern_nexus_domain_provider *nxdom_prov, *tnxdp;
317
318 STAILQ_FOREACH_SAFE(nxdom_prov, &nxdom->nxdom_prov_head,
319 nxdom_prov_link, tnxdp) {
320 (void) nxdom_prov_del(nxdom_prov);
321 }
322}
323
324static void
325nx_mon_dom_fini(struct nxdom *nxdom)
326{
327#pragma unused(nxdom)
328}
329
330__attribute__((noreturn))
331static int
332nx_mon_dom_bind_port(struct kern_nexus *nx, nexus_port_t *nx_port,
333 struct nxbind *nxb, void *info)
334{
335#pragma unused(nx, nx_port, nxb, info)
336 VERIFY(0);
337 /* NOTREACHED */
338 __builtin_unreachable();
339}
340
341__attribute__((noreturn))
342static int
343nx_mon_dom_unbind_port(struct kern_nexus *nx, nexus_port_t nx_port)
344{
345#pragma unused(nx, nx_port)
346 VERIFY(0);
347 /* NOTREACHED */
348 __builtin_unreachable();
349}
350
351__attribute__((noreturn))
352static int
353nx_mon_dom_connect(struct kern_nexus_domain_provider *nxdom_prov,
354 struct kern_nexus *nx, struct kern_channel *ch, struct chreq *chr,
355 struct kern_channel *ch0, struct nxbind *nxb, struct proc *p)
356{
357#pragma unused(nxdom_prov, nx, ch, chr, ch0, nxb, p)
358 VERIFY(0);
359 /* NOTREACHED */
360 __builtin_unreachable();
361}
362
363__attribute__((noreturn))
364static void
365nx_mon_dom_disconnect(struct kern_nexus_domain_provider *nxdom_prov,
366 struct kern_nexus *nx, struct kern_channel *ch)
367{
368#pragma unused(nxdom_prov, nx, ch)
369 VERIFY(0);
370 /* NOTREACHED */
371 __builtin_unreachable();
372}
373
374static void
375nx_mon_dom_defunct(struct kern_nexus_domain_provider *nxdom_prov,
376 struct kern_nexus *nx, struct kern_channel *ch, struct proc *p)
377{
378#pragma unused(nxdom_prov, nx, ch, p)
379}
380
381static void
382nx_mon_dom_defunct_finalize(struct kern_nexus_domain_provider *nxdom_prov,
383 struct kern_nexus *nx, struct kern_channel *ch, boolean_t locked)
384{
385#pragma unused(nxdom_prov, nx, ch, locked)
386}
387
388static int
389nx_mon_prov_init(struct kern_nexus_domain_provider *nxdom_prov)
390{
391#pragma unused(nxdom_prov)
392 SK_D("initializing %s", nxdom_prov->nxdom_prov_name);
393 return 0;
394}
395
396static int
397nx_mon_prov_params_adjust(const struct kern_nexus_domain_provider *nxdom_prov,
398 const struct nxprov_params *nxp, struct nxprov_adjusted_params *adj)
399{
400#pragma unused(nxdom_prov, nxp, adj)
401
402 return 0;
403}
404
405static int
406nx_mon_prov_params(struct kern_nexus_domain_provider *nxdom_prov,
407 const uint32_t req, const struct nxprov_params *nxp0,
408 struct nxprov_params *nxp, struct skmem_region_params srp[SKMEM_REGIONS],
409 uint32_t pp_region_config_flags)
410{
411 struct nxdom *nxdom = nxdom_prov->nxdom_prov_dom;
412
	return nxprov_params_adjust(nxdom_prov, req, nxp0, nxp, srp,
	    nxdom, nxdom, nxdom, pp_region_config_flags,
	    nx_mon_prov_params_adjust);
416}
417
418static int
419nx_mon_prov_mem_new(struct kern_nexus_domain_provider *nxdom_prov,
420 struct kern_nexus *nx, struct nexus_adapter *na)
421{
422#pragma unused(nxdom_prov)
423 int err = 0;
424
425 SK_DF(SK_VERB_MONITOR,
426 "nx 0x%llx (\"%s\":\"%s\") na \"%s\" (0x%llx)", SK_KVA(nx),
427 NX_DOM(nx)->nxdom_name, nxdom_prov->nxdom_prov_name, na->na_name,
428 SK_KVA(na));
429
430 ASSERT(na->na_arena == NULL);
431 ASSERT(NX_USER_CHANNEL_PROV(nx));
432 /*
433 * The underlying nexus adapter uses the same memory allocator
434 * as the monitored adapter; don't store the pp in the nexus.
435 *
436 * This means that clients calling kern_nexus_get_pbufpool()
437 * will get NULL, but this is fine since we don't expose the
438 * monitor to external kernel clients.
439 */
440 na->na_arena = skmem_arena_create_for_nexus(na,
441 NX_PROV(nx)->nxprov_region_params, NULL, NULL, FALSE,
442 FALSE, NULL, &err);
443 ASSERT(na->na_arena != NULL || err != 0);
444
445 return err;
446}
447
448static void
449nx_mon_prov_fini(struct kern_nexus_domain_provider *nxdom_prov)
450{
451#pragma unused(nxdom_prov)
452 SK_D("destroying %s", nxdom_prov->nxdom_prov_name);
453}
454
455static struct nexus_monitor_adapter *
456na_mon_alloc(zalloc_flags_t how)
457{
458 struct nexus_monitor_adapter *mna;
459
460 _CASSERT(offsetof(struct nexus_monitor_adapter, mna_up) == 0);
461
462 mna = zalloc_flags(na_mon_zone, how | Z_ZERO);
463 if (mna) {
464 mna->mna_up.na_type = NA_MONITOR;
465 mna->mna_up.na_free = na_mon_free;
466 }
467 return mna;
468}
469
470static void
471na_mon_free(struct nexus_adapter *na)
472{
473 struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na;
474
475 ASSERT(mna->mna_up.na_refcount == 0);
476 SK_DF(SK_VERB_MEM, "mna 0x%llx FREE", SK_KVA(mna));
	bzero(mna, sizeof(*mna));
478 zfree(na_mon_zone, mna);
479}
480
/*
 * Functions common to both kinds of monitors.
 */
484
/*
 * nm_sync callback for the monitor's own tx rings.
 * Transmitting on a monitor makes no sense, so this always returns an error.
 */
489static int
490nx_mon_na_txsync(struct __kern_channel_ring *kring, struct proc *p,
491 uint32_t flags)
492{
493#pragma unused(kring, p, flags)
494 SK_DF(SK_VERB_MONITOR | SK_VERB_SYNC | SK_VERB_TX,
495 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
496 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
497 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
498 flags);
499 return EIO;
500}
501
/*
 * nm_sync callback for the monitor's own rx rings.
 * Note that the lock in nx_mon_zcopy_parent_sync only protects
 * writers among themselves. Synchronization between writers
 * (i.e., nx_mon_zcopy_parent_txsync and nx_mon_zcopy_parent_rxsync)
 * and readers (i.e., this routine) relies on memory barriers.
 */
509static int
510nx_mon_na_rxsync(struct __kern_channel_ring *kring, struct proc *p,
511 uint32_t flags)
512{
513#pragma unused(p, flags)
514 SK_DF(SK_VERB_MONITOR | SK_VERB_SYNC | SK_VERB_RX,
515 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b ring %u flags 0%x",
516 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
517 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, kring->ckr_ring_id,
518 flags);
519 kring->ckr_khead = kring->ckr_rhead;
520 os_atomic_thread_fence(seq_cst);
521 return 0;
522}
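
/*
 * The barrier protocol referred to above, spelled out as a sketch (this is
 * how the writers in this file behave; the layout here is only meant to
 * make the pairing explicit):
 *
 *	writer (nx_mon_parent_sync / nx_mon_zcopy_parent_sync, under KR_LOCK):
 *		... fill or swap slots into the monitor rx ring ...
 *		os_atomic_thread_fence(seq_cst);
 *		mkring->ckr_ktail = i;		// publish slots after their contents
 *
 *	reader (nx_mon_na_rxsync above):
 *		kring->ckr_khead = kring->ckr_rhead;	// release consumed slots
 *		os_atomic_thread_fence(seq_cst);
 *
 * The writer makes the slot contents visible before advancing ckr_ktail;
 * the reader makes the freed slots visible before the next writer pass
 * computes the busy/free count from ckr_khead and ckr_ktail.
 */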
523
/*
 * na_krings_create callback for monitors.
 * We could use the default netmap_hw_krings_zmon, but
 * we don't need the nx_mbq.
 */
529static int
530nx_mon_na_krings_create(struct nexus_adapter *na, struct kern_channel *ch)
531{
532 ASSERT(na->na_type == NA_MONITOR);
533 return na_rings_mem_setup(na, FALSE, ch);
534}
535
536/* na_krings_delete callback for monitors */
537static void
538nx_mon_na_krings_delete(struct nexus_adapter *na, struct kern_channel *ch,
539 boolean_t defunct)
540{
541 ASSERT(na->na_type == NA_MONITOR);
542 na_rings_mem_teardown(na, ch, defunct);
543}
544
545__attribute__((always_inline))
546static inline uint32_t
547nx_mon_txrx2chmode(enum txrx t)
548{
549 return t == NR_RX ? CHMODE_MONITOR_RX : CHMODE_MONITOR_TX;
550}
551
552/* allocate the monitors array in the monitored kring */
553static int
554nx_mon_kr_alloc(struct __kern_channel_ring *kring, uint32_t n)
555{
556 struct __kern_channel_ring **nm;
557
558 if (n <= kring->ckr_max_monitors) {
		/* we already have at least as many entries as requested */
560 return 0;
561 }
562
563 nm = sk_realloc_type_array(struct __kern_channel_ring *,
564 kring->ckr_max_monitors, n, kring->ckr_monitors,
565 Z_WAITOK, skmem_tag_monitors);
566 if (nm == NULL) {
567 return ENOMEM;
568 }
569
570 kring->ckr_monitors = nm;
571 kring->ckr_max_monitors = n;
572
573 return 0;
574}
575
/* deallocate the monitors array in the monitored kring */
577static void
578nx_mon_kr_dealloc(struct __kern_channel_ring *kring)
579{
580 if (kring->ckr_monitors != NULL) {
581 if (kring->ckr_n_monitors > 0) {
582 SK_ERR("freeing not empty monitor array for \"%s\" "
583 "(%u dangling monitors)!", kring->ckr_name,
584 kring->ckr_n_monitors);
585 }
586 sk_free_type_array(struct __kern_channel_ring *,
587 kring->ckr_max_monitors, kring->ckr_monitors);
588 kring->ckr_monitors = NULL;
589 kring->ckr_max_monitors = 0;
590 kring->ckr_n_monitors = 0;
591 }
592}
593
594static int
595nx_mon_na_krings_locks(struct nexus_adapter *na,
596 uint32_t qfirst[NR_TXRX], uint32_t qlast[NR_TXRX])
597{
598 struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na;
599 struct nexus_adapter *pna = mna->mna_pna;
600 enum txrx t;
601 int err = 0;
602
603 for_rx_tx(t) {
604 uint32_t i;
605
606 if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) {
607 continue;
608 }
609
610 qfirst[t] = qlast[t] = mna->mna_first[t];
611
612 /* synchronize with concurrently running nm_sync()s */
613 for (i = mna->mna_first[t]; i < mna->mna_last[t]; i++) {
614 struct __kern_channel_ring *kring;
615
616 /* the parent adapter's kring */
			kring = &NAKR(pna, t)[i];
			kr_stop(kring, KR_LOCKED);
619 qlast[t] = i + 1;
620 }
621 if (err != 0) {
622 break;
623 }
624 }
625
626 return err;
627}
628
629static void
630nx_mon_na_krings_unlock(struct nexus_adapter *na,
631 const uint32_t qfirst[NR_TXRX], const uint32_t qlast[NR_TXRX])
632{
633 struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na;
634 struct nexus_adapter *pna = mna->mna_pna;
635 enum txrx t;
636
637 for_rx_tx(t) {
638 uint32_t i;
639
640 if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) {
641 continue;
642 }
643
644 /* synchronize with concurrently running nm_sync()s */
645 for (i = qfirst[t]; i < qlast[t]; i++) {
646 struct __kern_channel_ring *kring;
647
648 /* the parent adapter's kring */
			kring = &NAKR(pna, t)[i];
650 kr_start(kring);
651 }
652 }
653}
654
655static int
656nx_mon_enable(struct nexus_adapter *na, boolean_t zcopy)
657{
658 struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na;
659 struct nexus_adapter *pna = mna->mna_pna;
	struct skmem_arena_nexus *na_arena = skmem_arena_nexus(pna->na_arena);
661 uint32_t qfirst[NR_TXRX], qlast[NR_TXRX];
662 enum txrx t;
663 int err = 0;
664 uint32_t i;
665
666 ASSERT(!(na->na_flags & NAF_ACTIVE));
667
	bzero(&qfirst, sizeof(qfirst));
	bzero(&qlast, sizeof(qlast));
670
	/*
	 * Acquire the target kring(s).  q{first,last} are filled in with
	 * the rings that have actually been acquired, so that if the
	 * acquisition fails partway we can still release (unlock) any
	 * previously-acquired rings below.
	 */
677 if ((err = nx_mon_na_krings_locks(na, qfirst, qlast)) != 0) {
678 goto unlock;
679 }
680
681 ASSERT(na_arena->arn_rx_pp == na_arena->arn_tx_pp);
682 if (na_arena->arn_rx_pp->pp_max_frags > 1) {
683 VERIFY(na_arena->arn_rx_pp->pp_md_type == NEXUS_META_TYPE_PACKET);
684 mna->mna_pkt_copy_from_pkt = pkt_copy_multi_buflet_from_pkt;
685 } else {
686 if (na_arena->arn_rx_pp->pp_md_type == NEXUS_META_TYPE_PACKET) {
687 mna->mna_pkt_copy_from_pkt = pkt_copy_from_pkt;
688 } else {
689 mna->mna_pkt_copy_from_pkt = nx_mon_quantum_copy_64x;
690 }
691 }
692
693 for_rx_tx(t) {
694 if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) {
695 continue;
696 }
697
698 for (i = qfirst[t]; i < qlast[t]; i++) {
699 struct __kern_channel_ring *kring, *mkring;
700
701 /* the parent adapter's kring */
			kring = &NAKR(pna, t)[i];
703 mkring = &na->na_rx_rings[i];
704 err = nx_mon_add(mkring, kring, zcopy);
705 if (err != 0) {
706 break;
707 }
708 }
709 if (err != 0) {
710 break;
711 }
712 }
713
714 if (err == 0) {
715 os_atomic_or(&na->na_flags, NAF_ACTIVE, relaxed);
716 goto unlock;
717 }
718
719 for_rx_tx(t) {
720 if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) {
721 continue;
722 }
723
724 for (i = qfirst[t]; i < qlast[t]; i++) {
725 struct __kern_channel_ring *kring, *mkring;
726
727 /* the parent adapter's kring */
			kring = &NAKR(pna, t)[i];
729 mkring = &na->na_rx_rings[i];
730 nx_mon_del(mkring, kring, FALSE);
731 }
732 }
733 ASSERT(!(na->na_flags & NAF_ACTIVE));
734
735unlock:
736 nx_mon_na_krings_unlock(na, qfirst, qlast);
737
738 SK_DF(err ? SK_VERB_ERROR : SK_VERB_MONITOR,
739 "%s (0x%llx): mode 0x%x txrings[%u,%u], rxrings[%u,%u] err %d",
740 na->na_name, SK_KVA(na), mna->mna_mode, qfirst[NR_TX], qlast[NR_TX],
741 qfirst[NR_RX], qlast[NR_RX], err);
742
743 return err;
744}
745
746static void
747nx_mon_disable(struct nexus_adapter *na)
748{
749 struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na;
750 struct nexus_adapter *pna = mna->mna_pna;
751 uint32_t qfirst[NR_TXRX], qlast[NR_TXRX];
752 enum txrx t;
753 int err;
754 uint32_t i;
755
756 ASSERT(na->na_flags & NAF_ACTIVE);
757
	bzero(&qfirst, sizeof(qfirst));
	bzero(&qlast, sizeof(qlast));
760
761 /* blocking kring(s) acquisition; must not fail */
762 err = nx_mon_na_krings_locks(na, qfirst, qlast);
763 ASSERT(err == 0);
764 mna->mna_pkt_copy_from_pkt = NULL;
765 for_rx_tx(t) {
766 if (!(mna->mna_mode & nx_mon_txrx2chmode(t))) {
767 continue;
768 }
769
770 for (i = qfirst[t]; i < qlast[t]; i++) {
771 struct __kern_channel_ring *kring, *mkring;
772
			kring = &NAKR(pna, t)[i];
774 mkring = &na->na_rx_rings[i];
775 nx_mon_del(mkring, kring, FALSE);
776 }
777 }
778 os_atomic_andnot(&na->na_flags, NAF_ACTIVE, relaxed);
779
780 nx_mon_na_krings_unlock(na, qfirst, qlast);
781}
782
783/*
784 * Add the monitor mkring to the list of monitors of kring.
785 * If this is the first monitor, intercept the callbacks
786 */
787static int
788nx_mon_add(struct __kern_channel_ring *mkring,
789 struct __kern_channel_ring *kring, boolean_t zcopy)
790{
791 int error;
792
793 /* make sure the monitor array exists and is big enough */
	error = nx_mon_kr_alloc(kring, kring->ckr_n_monitors + 1);
795 if (error != 0) {
796 return error;
797 }
798
799 kring->ckr_monitors[kring->ckr_n_monitors] = mkring;
800 mkring->ckr_mon_pos = kring->ckr_n_monitors;
801 kring->ckr_n_monitors++;
802 if (kring->ckr_n_monitors == 1) {
803 /* this is the first monitor, intercept callbacks */
804 SK_DF(SK_VERB_MONITOR,
805 "mkr \"%s\" (0x%llx) krflags 0x%b intercept callbacks "
806 "on kr \"%s\" (0x%llx) krflags 0x%b", mkring->ckr_name,
807 SK_KVA(mkring), mkring->ckr_flags, CKRF_BITS,
808 kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
809 CKRF_BITS);
810 kring->ckr_mon_sync = kring->ckr_na_sync;
811 /*
812 * zcopy monitors do not override nm_notify(), but
813 * we save the original one regardless, so that
814 * nx_mon_del() does not need to know the
815 * monitor type
816 */
817 kring->ckr_mon_notify = kring->ckr_na_notify;
818 if (kring->ckr_tx == NR_TX) {
819 kring->ckr_na_sync =
820 (zcopy ? nx_mon_zcopy_parent_txsync :
821 nx_mon_parent_txsync);
822 } else {
823 kring->ckr_na_sync =
824 (zcopy ? nx_mon_zcopy_parent_rxsync :
825 nx_mon_parent_rxsync);
826 if (!zcopy) {
827 /* also intercept notify */
828 kring->ckr_na_notify = nx_mon_parent_notify;
829 kring->ckr_mon_tail = kring->ckr_ktail;
830 }
831 }
832 } else {
833 SK_DF(SK_VERB_MONITOR,
834 "mkr \"%s\" (0x%llx) krflags 0x%b already intercept "
835 "callbacks on kr \"%s\" (0x%llx) krflags 0x%b, "
836 "%u monitors", mkring->ckr_name, SK_KVA(mkring),
837 mkring->ckr_flags, CKRF_BITS, kring->ckr_name,
838 SK_KVA(kring), kring->ckr_flags, CKRF_BITS,
839 kring->ckr_n_monitors);
840 }
841 return 0;
842}
843
844/*
845 * Remove the monitor mkring from the list of monitors of kring.
846 * If this is the last monitor, restore the original callbacks
847 */
848static void
849nx_mon_del(struct __kern_channel_ring *mkring,
850 struct __kern_channel_ring *kring, boolean_t all)
851{
852 ASSERT(kring->ckr_n_monitors != 0);
853 if (all) {
854 kring->ckr_n_monitors = 0;
855 } else {
856 kring->ckr_n_monitors--;
857 if (mkring->ckr_mon_pos != kring->ckr_n_monitors) {
858 kring->ckr_monitors[mkring->ckr_mon_pos] =
859 kring->ckr_monitors[kring->ckr_n_monitors];
860 kring->ckr_monitors[mkring->ckr_mon_pos]->ckr_mon_pos =
861 mkring->ckr_mon_pos;
862 }
863 kring->ckr_monitors[kring->ckr_n_monitors] = NULL;
864 }
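	/*
	 * Worked example of the swap-with-last removal above: with monitors
	 * [A, B, C] (ckr_n_monitors == 3) and B being removed, C is moved
	 * into B's slot and its ckr_mon_pos updated, leaving [A, C] with
	 * ckr_n_monitors == 2; no other entries need to shift.
	 */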
865 if (kring->ckr_n_monitors == 0) {
866 /*
867 * This was the last monitor, restore callbacks
868 * and delete monitor array.
869 */
870 SK_DF(SK_VERB_MONITOR,
871 "restoring sync callback on kr \"%s\" (0x%llx) "
872 "krflags 0x%b", kring->ckr_name, SK_KVA(kring),
873 kring->ckr_flags, CKRF_BITS);
874 kring->ckr_na_sync = kring->ckr_mon_sync;
875 kring->ckr_mon_sync = NULL;
876 if (kring->ckr_tx == NR_RX) {
877 SK_DF(SK_VERB_MONITOR,
878 "restoring notify callback on kr \"%s\" (0x%llx) "
879 "krflags 0x%b", kring->ckr_name, SK_KVA(kring),
880 kring->ckr_flags, CKRF_BITS);
881 kring->ckr_na_notify = kring->ckr_mon_notify;
882 kring->ckr_mon_notify = NULL;
883 }
884 nx_mon_kr_dealloc(kring);
885 } else {
886 SK_DF(SK_VERB_MONITOR,
887 "NOT restoring callbacks on kr \"%s\" (0x%llx) "
888 "krflags 0x%b, %u monitors left", kring->ckr_name,
889 SK_KVA(kring), kring->ckr_flags, CKRF_BITS,
890 kring->ckr_n_monitors);
891 }
892}
893
894/*
895 * This is called when the monitored adapter leaves skywalk mode (see
896 * na_unbind_channel). We need to notify the monitors that the monitored
897 * rings are gone. We do this by setting their mna->mna_pna to NULL.
898 * Note that the rings must be stopped when this happens, so no monitor
899 * ring callback can be active.
900 */
901void
902nx_mon_stop(struct nexus_adapter *na)
903{
904 enum txrx t;
905
906 SK_LOCK_ASSERT_HELD();
907
908 /* skip if this adapter has no allocated rings */
909 if (na->na_tx_rings == NULL) {
910 return;
911 }
912
913 na_disable_all_rings(na);
914
915 for_rx_tx(t) {
916 uint32_t i;
917
918 for (i = 0; i < na_get_nrings(na, t); i++) {
919 struct __kern_channel_ring *kring = &NAKR(na, t)[i];
920 uint32_t j;
921
922 for (j = 0; j < kring->ckr_n_monitors; j++) {
923 struct __kern_channel_ring *mkring =
924 kring->ckr_monitors[j];
925 struct nexus_monitor_adapter *mna =
926 (struct nexus_monitor_adapter *)
927 KRNA(mkring);
928
929 /* forget about this adapter */
930 if (mna->mna_pna != NULL) {
931 ASSERT(na == mna->mna_pna);
					(void) na_release_locked(mna->mna_pna);
933 mna->mna_pna = NULL;
934 }
935 }
936
937 /*
938 * Remove all monitors and restore callbacks;
939 * this is important for nexus adapters that
940 * are linked to one another, e.g. pipe, since
941 * the callback changes on one adapter affects
942 * its peer during sync times.
943 */
944 if (kring->ckr_n_monitors > 0) {
945 nx_mon_del(NULL, kring, TRUE);
946 }
947
948 ASSERT(kring->ckr_monitors == NULL);
949 ASSERT(kring->ckr_max_monitors == 0);
950 ASSERT(kring->ckr_n_monitors == 0);
951 }
952 }
953
954 na_enable_all_rings(na);
955}
956
/*
 * Common function for the na_activate() callbacks of both kinds of
 * monitors.
 */
961static int
962nx_mon_na_activate_common(struct nexus_adapter *na, na_activate_mode_t mode,
963 boolean_t zcopy)
964{
965 struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na;
966 struct nexus_adapter *pna = mna->mna_pna;
967 int err = 0;
968
969 ASSERT(na->na_type == NA_MONITOR);
970
971 SK_DF(SK_VERB_MONITOR, "na \"%s\" (0x%llx) %s zcopy %u", na->na_name,
972 SK_KVA(na), na_activate_mode2str(mode), zcopy);
973
974 switch (mode) {
975 case NA_ACTIVATE_MODE_ON:
976 if (pna == NULL) {
977 /* parent left skywalk mode, fatal */
978 SK_ERR("%s: internal error", na->na_name);
979 err = ENXIO;
980 } else {
981 err = nx_mon_enable(na, zcopy);
982 }
983 break;
984
985 case NA_ACTIVATE_MODE_DEFUNCT:
986 break;
987
988 case NA_ACTIVATE_MODE_OFF:
989 if (pna == NULL) {
990 SK_DF(SK_VERB_MONITOR, "%s: parent left skywalk mode, "
991 "nothing to restore", na->na_name);
992 } else {
993 nx_mon_disable(na);
994 }
995 break;
996
997 default:
998 VERIFY(0);
999 /* NOTREACHED */
1000 __builtin_unreachable();
1001 }
1002
1003 return err;
1004}
1005
1006/*
1007 * Functions specific for zero-copy monitors.
1008 */
1009
1010/*
1011 * Common function for both zero-copy tx and rx nm_sync()
1012 * callbacks
1013 */
1014static int
1015nx_mon_zcopy_parent_sync(struct __kern_channel_ring *kring, struct proc *p,
1016 uint32_t flags, enum txrx tx)
1017{
1018 struct __kern_channel_ring *mkring = kring->ckr_monitors[0];
1019 int rel_slots, free_slots, busy, sent = 0;
1020 slot_idx_t beg, end, i;
1021 const slot_idx_t lim = kring->ckr_lim;
1022 const slot_idx_t mlim;
1023 int error = 0;
1024
1025 if (mkring == NULL) {
1026 SK_RD(5, "NULL monitor on kr \"%s\" (0x%llx) krflags 0x%b",
1027 kring->ckr_name, SK_KVA(kring), kring->ckr_flags,
1028 CKRF_BITS);
1029 return 0;
1030 }
1031
1032 ASSERT(!KR_KERNEL_ONLY(kring));
1033 ASSERT(!KR_KERNEL_ONLY(mkring));
1034
1035 /* deconst */
1036 *(slot_idx_t *)(uintptr_t)&mlim = mkring->ckr_lim;
1037
	/* get the released slots (rel_slots) */
1039 if (tx == NR_TX) {
1040 beg = kring->ckr_ktail;
1041 error = kring->ckr_mon_sync(kring, p, NA_SYNCF_MONITOR | flags);
1042 if (error) {
1043 return error;
1044 }
1045 end = kring->ckr_ktail;
1046 } else { /* NR_RX */
1047 beg = kring->ckr_khead;
1048 end = kring->ckr_rhead;
1049 }
1050
1051 rel_slots = end - beg;
1052 if (rel_slots < 0) {
1053 rel_slots += kring->ckr_num_slots;
1054 }
1055
1056 if (!rel_slots) {
		/*
		 * No released slots, but we still need
		 * to call rxsync if this is an rx ring.
		 */
1061 goto out_rxsync;
1062 }
1063
	/*
	 * We need to lock the monitor receive ring, since it
	 * is the target of both tx and rx traffic from the monitored
	 * adapter.
	 */
1069 KR_LOCK(mkring);
1070 /* get the free slots available on the monitor ring */
1071 i = mkring->ckr_ktail;
1072 busy = i - mkring->ckr_khead;
1073 if (busy < 0) {
1074 busy += mkring->ckr_num_slots;
1075 }
1076 free_slots = mlim - busy;
1077
1078 if (!free_slots) {
1079 goto out;
1080 }
1081
1082 /* swap min(free_slots, rel_slots) slots */
1083 if (free_slots < rel_slots) {
1084 beg += (rel_slots - free_slots);
1085 if (beg >= kring->ckr_num_slots) {
1086 beg -= kring->ckr_num_slots;
1087 }
1088 rel_slots = free_slots;
1089 }
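	/*
	 * Worked example (hypothetical numbers): with ckr_num_slots == 8,
	 * beg == 6 and end == 2, the released count wraps around:
	 * rel_slots = 2 - 6 + 8 = 4.  If the monitor ring only has
	 * free_slots == 3, beg is advanced by 1, so the oldest released
	 * slot is skipped (dropped) and only the newest 3 are swapped over,
	 * matching the "excess traffic will be dropped" policy described in
	 * the header comment.
	 */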
1090
1091 sent = rel_slots;
1092 for (; rel_slots; rel_slots--) {
1093 /*
1094 * Swap the slots.
1095 *
1096 * XXX: adi@apple.com -- this bypasses the slot attach/detach
1097 * interface, and needs to be changed when monitor adopts the
1098 * packet APIs. SD_SWAP() will perform a block copy of the
1099 * swap, and will readjust the kernel slot descriptor's sd_user
1100 * accordingly.
1101 */
1102 SD_SWAP(KR_KSD(mkring, i), KR_USD(mkring, i),
1103 KR_KSD(kring, beg), KR_USD(kring, beg));
1104
1105 SK_RD(5, "beg %u buf_idx %u", beg,
1106 METADATA_IDX(KR_KSD(kring, beg)->sd_qum));
1107
		beg = SLOT_NEXT(beg, lim);
		i = SLOT_NEXT(i, mlim);
1110 }
1111 os_atomic_thread_fence(seq_cst);
1112 mkring->ckr_ktail = i;
1113
1114out:
1115 KR_UNLOCK(mkring);
1116
1117 if (sent) {
1118 /* notify the new frames to the monitor */
1119 (void) mkring->ckr_na_notify(mkring, p, 0);
1120 }
1121
1122out_rxsync:
1123 if (tx == NR_RX) {
1124 error = kring->ckr_mon_sync(kring, p, NA_SYNCF_MONITOR | flags);
1125 }
1126
1127 return error;
1128}
1129
1130/*
1131 * Callback used to replace the ckr_na_sync callback in the monitored tx rings.
1132 */
1133static int
1134nx_mon_zcopy_parent_txsync(struct __kern_channel_ring *kring, struct proc *p,
1135 uint32_t flags)
1136{
1137 SK_DF(SK_VERB_MONITOR,
1138 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b flags 0x%x",
1139 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1140 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, flags);
	return nx_mon_zcopy_parent_sync(kring, p, flags, NR_TX);
1142}
1143
1144/* callback used to replace the nm_sync callback in the monitored rx rings */
1145static int
1146nx_mon_zcopy_parent_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1147 uint32_t flags)
1148{
1149 SK_DF(SK_VERB_MONITOR,
1150 "%s(%d) kr \"%s\" (0x%llx) krflags 0x%b flags 0x%x",
1151 sk_proc_name_address(p), sk_proc_pid(p), kring->ckr_name,
1152 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, flags);
	return nx_mon_zcopy_parent_sync(kring, p, flags, NR_RX);
1154}
1155
1156static int
1157nx_mon_zcopy_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
1158{
1159 return nx_mon_na_activate_common(na, mode, TRUE /* zcopy */);
1160}
1161
1162/* na_dtor callback for monitors */
1163static void
1164nx_mon_zcopy_na_dtor(struct nexus_adapter *na)
1165{
1166 struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na;
1167 struct nexus_adapter *pna = mna->mna_pna;
1168
1169 SK_LOCK_ASSERT_HELD();
1170 ASSERT(na->na_type == NA_MONITOR);
1171
1172 if (pna != NULL) {
		(void) na_release_locked(pna);
1174 mna->mna_pna = NULL;
1175 }
1176}
1177
1178/*
1179 * Functions specific for copy monitors.
1180 */
1181
1182static void
1183nx_mon_parent_sync(struct __kern_channel_ring *kring, struct proc *p,
1184 slot_idx_t first_new, int new_slots)
1185{
1186 nexus_meta_type_t md_type = KRNA(kring)->na_md_type;
1187 uint32_t j;
1188
1189 for (j = 0; j < kring->ckr_n_monitors; j++) {
1190 struct __kern_channel_ring *mkring = kring->ckr_monitors[j];
1191 slot_idx_t i, mlim, beg;
1192 int free_slots, busy, sent = 0, m;
1193 const slot_idx_t lim = kring->ckr_lim;
1194 struct nexus_adapter *dst_na = KRNA(mkring);
1195 struct nexus_monitor_adapter *mna =
1196 (struct nexus_monitor_adapter *)dst_na;
1197 uint32_t max_len = mkring->ckr_pp->pp_max_frags *
1198 PP_BUF_SIZE_DEF(mkring->ckr_pp);
1199
1200 /*
1201 * src and dst adapters must share the same nexus;
1202 * this test is done in nx_monitor_na_find(). This
1203 * covers both buffer and metadata sizes.
1204 */
1205
1206 mlim = mkring->ckr_lim;
1207
1208 /*
1209 * We need to lock the monitor receive ring, since it
		 * is the target of both tx and rx traffic from the
1211 * monitored adapter.
1212 */
1213 KR_LOCK(mkring);
1214 /* get the free slots available on the monitor ring */
1215 i = mkring->ckr_ktail;
1216 busy = i - mkring->ckr_khead;
1217 if (busy < 0) {
1218 busy += mkring->ckr_num_slots;
1219 }
1220 free_slots = mlim - busy;
1221
1222 if (!free_slots) {
1223 goto out;
1224 }
1225
1226 /* copy min(free_slots, new_slots) slots */
1227 m = new_slots;
1228 beg = first_new;
1229 if (free_slots < m) {
1230 beg += (m - free_slots);
1231 if (beg >= kring->ckr_num_slots) {
1232 beg -= kring->ckr_num_slots;
1233 }
1234 m = free_slots;
1235 }
1236
1237 ASSERT(KRNA(mkring)->na_md_type == md_type);
1238
1239 for (; m; m--) {
1240 struct __kern_slot_desc *src_sd = KR_KSD(kring, beg);
1241 struct __kern_slot_desc *dst_sd = KR_KSD(mkring, i);
1242 struct __kern_packet *spkt, *dpkt;
1243 kern_packet_t sph, dph;
1244 uint32_t copy_len;
1245
1246 if (!KSD_VALID_METADATA(src_sd)) {
1247 goto skip;
1248 }
1249
			/* retrieve packet handles from the slot */
1251 spkt = src_sd->sd_pkt;
1252 sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
1253 METADATA_SUBTYPE(spkt));
1254 dpkt = dst_sd->sd_pkt;
1255 dph = SK_PTR_ENCODE(dpkt, METADATA_TYPE(dpkt),
1256 METADATA_SUBTYPE(dpkt));
1257
1258 ASSERT(METADATA_TYPE(spkt) == METADATA_TYPE(dpkt));
1259
1260 ASSERT(spkt->pkt_qum.qum_len <= (UINT32_MAX - 63));
1261 copy_len = spkt->pkt_qum.qum_len;
1262
1263 /* round to a multiple of 64 */
1264 copy_len = (copy_len + 63) & ~63;
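			/*
			 * e.g. a 65-byte frame is copied as 128 bytes.  This
			 * also keeps the length a multiple of 64, which
			 * nx_mon_quantum_copy_64x() below VERIFYs when it is
			 * the selected copy routine.
			 */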
1265
1266 if (__improbable(copy_len > max_len)) {
1267 SK_RD(5, "kr \"%s\" -> mkr \"%s\": "
1268 "truncating %u to %u",
1269 kring->ckr_name, mkring->ckr_name,
1270 (uint32_t)copy_len, max_len);
1271 copy_len = max_len;
1272 }
1273
1274 /* copy buffers */
1275 mna->mna_pkt_copy_from_pkt(kring->ckr_tx, dph, 0, sph,
1276 0, copy_len, FALSE, 0, 0, FALSE);
1277
1278 /* copy the associated meta data */
1279 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
1280 if (md_type == NEXUS_META_TYPE_PACKET) {
1281 _PKT_COPY(spkt, dpkt);
1282 ASSERT(dpkt->pkt_mbuf == NULL);
1283 }
1284
1285 ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
1286 PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
1287
1288 sent++;
			i = SLOT_NEXT(i, mlim);
skip:
			beg = SLOT_NEXT(beg, lim);
1292 }
1293 os_atomic_thread_fence(seq_cst);
1294 mkring->ckr_ktail = i;
1295out:
1296 KR_UNLOCK(mkring);
1297
1298 if (sent) {
1299 /* notify the new frames to the monitor */
1300 (void) mkring->ckr_na_notify(mkring, p, 0);
1301 }
1302 }
1303}
1304
1305/* callback used to replace the nm_sync callback in the monitored tx rings */
1306static int
1307nx_mon_parent_txsync(struct __kern_channel_ring *kring, struct proc *p,
1308 uint32_t flags)
1309{
1310 slot_idx_t first_new;
1311 int new_slots;
1312 nexus_type_t nx_type =
1313 kring->ckr_na->na_nxdom_prov->nxdom_prov_dom->nxdom_type;
1314
	/*
	 * For a user pipe nexus, txsync can also be initiated from the RX
	 * process context, hence the user pipe tx ring should be accessed
	 * holding ckr_qlock.
	 */
1320 if (nx_type == NEXUS_TYPE_USER_PIPE) {
1321 KR_LOCK(kring);
1322 }
1323
1324 /* get the new slots */
1325 first_new = kring->ckr_khead;
1326 new_slots = kring->ckr_rhead - first_new;
1327 if (new_slots < 0) {
1328 new_slots += kring->ckr_num_slots;
1329 }
1330 if (new_slots) {
1331 nx_mon_parent_sync(kring, p, first_new, new_slots);
1332 }
1333
1334 if (nx_type == NEXUS_TYPE_USER_PIPE) {
1335 KR_UNLOCK(kring);
1336 }
1337
1338 return kring->ckr_mon_sync(kring, p, NA_SYNCF_MONITOR | flags);
1339}
1340
1341/* callback used to replace the nm_sync callback in the monitored rx rings */
1342static int
1343nx_mon_parent_rxsync(struct __kern_channel_ring *kring, struct proc *p,
1344 uint32_t flags)
1345{
1346 slot_idx_t first_new;
1347 int new_slots, error;
1348
1349 /* get the new slots */
1350 error = kring->ckr_mon_sync(kring, p, NA_SYNCF_MONITOR | flags);
1351 if (error) {
1352 return error;
1353 }
1354 first_new = kring->ckr_mon_tail;
1355 new_slots = kring->ckr_ktail - first_new;
1356 if (new_slots < 0) {
1357 new_slots += kring->ckr_num_slots;
1358 }
1359 if (new_slots) {
1360 nx_mon_parent_sync(kring, p, first_new, new_slots);
1361 }
1362 kring->ckr_mon_tail = kring->ckr_ktail;
1363 return 0;
1364}
1365
1366/*
1367 * Callback used to replace the nm_notify() callback in the monitored rx rings
1368 */
1369static int
1370nx_mon_parent_notify(struct __kern_channel_ring *kring, struct proc *p,
1371 uint32_t flags)
1372{
1373 int err = 0;
1374 sk_protect_t protect = NULL;
1375
1376 SK_DF(SK_VERB_MONITOR | SK_VERB_NOTIFY |
1377 ((kring->ckr_tx == NR_TX) ? SK_VERB_TX : SK_VERB_RX),
1378 "kr \"%s\" (0x%llx) krflags 0x%b flags 0x%x", kring->ckr_name,
1379 SK_KVA(kring), kring->ckr_flags, CKRF_BITS, flags);
	/*
	 * The ?xsync callbacks have tryget called by their callers,
	 * but here we have to call it ourselves. If we can't
	 * acquire the exclusive sync right, skip the sync.
	 */
1385 if ((err = kr_enter(kring, FALSE)) == 0) {
1386 protect = sk_sync_protect();
1387 nx_mon_parent_rxsync(kring, p, NA_SYNCF_FORCE_READ);
1388 sk_sync_unprotect(protect);
1389 kr_exit(kring);
1390 }
1391 /* in all cases (even error), we must invoke notify */
1392 kring->ckr_mon_notify(kring, p, (NA_NOTEF_MONITOR | flags));
1393 return err;
1394}
1395
1396static int
1397nx_mon_na_activate(struct nexus_adapter *na, na_activate_mode_t mode)
1398{
1399 return nx_mon_na_activate_common(na, mode, FALSE /* no zcopy */);
1400}
1401
1402static void
1403nx_mon_na_dtor(struct nexus_adapter *na)
1404{
1405 struct nexus_monitor_adapter *mna = (struct nexus_monitor_adapter *)na;
1406 struct nexus_adapter *pna = mna->mna_pna;
1407
1408 SK_LOCK_ASSERT_HELD();
1409 ASSERT(na->na_type == NA_MONITOR);
1410
1411 if (pna != NULL) {
		(void) na_release_locked(pna);
1413 mna->mna_pna = NULL;
1414 }
1415}
1416
1417/* check if chr is a request for a monitor adapter that we can satisfy */
1418int
1419nx_monitor_na_find(struct kern_nexus *nx, struct kern_channel *ch,
1420 struct chreq *chr, struct kern_channel *ch0, struct nxbind *nxb,
1421 struct proc *p, struct nexus_adapter **na, boolean_t create)
1422{
1423#pragma unused(ch)
1424 boolean_t zcopy = !!(chr->cr_mode & CHMODE_MONITOR_NO_COPY);
1425 struct nexus_adapter *pna = NULL; /* parent adapter */
1426 struct nexus_monitor_adapter *mna = NULL;
1427 char monsuff[10] = "";
1428 struct chreq pchr;
1429 uint32_t i;
1430 int error;
1431 enum txrx t;
1432
1433 SK_LOCK_ASSERT_HELD();
1434 *na = NULL;
1435
1436#if SK_LOG
1437 uuid_string_t uuidstr;
1438 SK_D("name \"%s\" spec_uuid \"%s\" port %d mode 0x%b pipe_id %u "
1439 "ring_id %d ring_set %u ep_type %u:%u ch0 0x%llx create %u%s",
1440 chr->cr_name, sk_uuid_unparse(chr->cr_spec_uuid, uuidstr),
1441 (int)chr->cr_port, chr->cr_mode, CHMODE_BITS,
1442 chr->cr_pipe_id, (int)chr->cr_ring_id, chr->cr_ring_set,
1443 chr->cr_real_endpoint, chr->cr_endpoint, SK_KVA(ch0), create,
1444 !(chr->cr_mode & CHMODE_MONITOR) ? " (skipped)" : "");
1445#endif /* SK_LOG */
1446
1447 if (!(chr->cr_mode & CHMODE_MONITOR)) {
1448 return 0;
1449 }
1450
1451 /* XXX: Don't allow user packet pool mode in monitor for now */
1452 if (chr->cr_mode & CHMODE_USER_PACKET_POOL) {
1453 SK_ERR("User Packet pool mode not supported for monitor");
1454 return ENOTSUP;
1455 }
1456
	mna = na_mon_alloc(Z_WAITOK);
1458
1459 ASSERT(mna->mna_up.na_type == NA_MONITOR);
1460 ASSERT(mna->mna_up.na_free == na_mon_free);
1461
1462 /* override the ring set since we're monitoring */
1463 chr->cr_ring_set = RING_SET_ALL;
1464
1465 if (ch0 != NULL) {
1466 /*
1467 * We've been given the owning channel from ch_open();
1468 * use this as shortcut since otherwise we'd have to
1469 * find it ourselves.
1470 */
1471#if (DEBUG || DEVELOPMENT)
1472 ASSERT(!(ch0->ch_info->cinfo_ch_mode & CHMODE_MONITOR));
1473 ASSERT(ch0->ch_info->cinfo_nx_port == chr->cr_port);
1474#endif /* DEBUG || DEVELOPMENT */
1475 pna = ch0->ch_na;
		na_retain_locked(pna);
1477 } else {
1478 /*
1479 * First, try to find the adapter that we want to monitor
1480 * We use the same chr, after we have turned off the monitor
1481 * flags. In this way we can potentially monitor everything
1482 * skywalk understands, except other monitors.
1483 */
		memcpy(&pchr, chr, sizeof(pchr));
1485 pchr.cr_mode &= ~CHMODE_MONITOR;
1486 error = na_find(ch, nx, &pchr, ch0, nxb, p, &pna, create);
1487 if (error != 0) {
1488 SK_ERR("parent lookup failed: %d", error);
1489 return error;
1490 }
1491 }
1492 ASSERT(pna != NULL);
1493 SK_DF(SK_VERB_MONITOR,
1494 "found parent: \"%s\" (0x%llx)", pna->na_name, SK_KVA(pna));
1495
1496 if (!NA_IS_ACTIVE(pna)) {
1497 /* parent not in skywalk mode */
1498 /*
1499 * XXX we can wait for the parent to enter skywalk mode,
1500 * by intercepting its na_activate() callback (2014-03-16)
1501 */
1502 SK_ERR("parent \"%s\" (0x%llx) not in skywalk mode",
1503 pna->na_name, SK_KVA(pna));
1504 error = ENXIO;
1505 goto put_out;
1506 } else if (zcopy && NA_KERNEL_ONLY(pna)) {
1507 /*
1508 * Zero-copy mode requires the parent adapter to be
1509 * created in a non-kernel-only mode.
1510 */
1511 SK_ERR("parent \"%s\" (0x%llx) is in kernel-only mode",
1512 pna->na_name, SK_KVA(pna));
1513 error = ENODEV;
1514 goto put_out;
1515 }
1516
1517 /* grab all the rings we need in the parent */
1518 mna->mna_pna = pna;
1519 error = na_interp_ringid(pna, chr->cr_ring_id, chr->cr_ring_set,
1520 mna->mna_first, mna->mna_last);
1521 if (error != 0) {
1522 SK_ERR("ring_mode %u ring_id %d error %d", chr->cr_ring_set,
1523 (int)chr->cr_ring_id, error);
1524 goto put_out;
1525 }
1526 if (mna->mna_last[NR_TX] - mna->mna_first[NR_TX] == 1) {
		(void) snprintf(monsuff, 10, "-%u", mna->mna_first[NR_TX]);
1528 }
	(void) snprintf(mna->mna_up.na_name, sizeof(mna->mna_up.na_name),
1530 "%s%s/%s%s%s", pna->na_name, monsuff, zcopy ? "z" : "",
1531 (chr->cr_mode & CHMODE_MONITOR_TX) ? "r" : "",
1532 (chr->cr_mode & CHMODE_MONITOR_RX) ? "t" : "");
	uuid_generate_random(mna->mna_up.na_uuid);
1534
1535 /* these don't apply to the monitor adapter */
1536 *(nexus_stats_type_t *)(uintptr_t)&mna->mna_up.na_stats_type =
1537 NEXUS_STATS_TYPE_INVALID;
1538 *(uint32_t *)(uintptr_t)&mna->mna_up.na_flowadv_max = 0;
1539
1540 if (zcopy) {
1541 /*
1542 * Zero copy monitors need exclusive access
1543 * to the monitored rings.
1544 */
1545 for_rx_tx(t) {
1546 if (!(chr->cr_mode & nx_mon_txrx2chmode(t))) {
1547 continue;
1548 }
1549 for (i = mna->mna_first[t];
1550 i < mna->mna_last[t]; i++) {
1551 struct __kern_channel_ring *kring =
				    &NAKR(pna, t)[i];
1553 if (kring->ckr_n_monitors > 0) {
1554 error = EBUSY;
1555 SK_ERR("kr \"%s\" already monitored "
1556 "by \"%s\"", kring->ckr_name,
1557 kring->ckr_monitors[0]->ckr_name);
1558 goto put_out;
1559 }
1560 }
1561 }
1562 mna->mna_up.na_activate = nx_mon_zcopy_na_activate;
1563 mna->mna_up.na_dtor = nx_mon_zcopy_na_dtor;
1564 /*
1565 * To have zero copy, we need to use the same memory allocator
1566 * as the monitored port.
1567 */
1568 mna->mna_up.na_arena = pna->na_arena;
1569 skmem_arena_retain((&mna->mna_up)->na_arena);
1570 os_atomic_or(&mna->mna_up.na_flags, NAF_MEM_LOANED, relaxed);
1571 } else {
1572 /* normal monitors are incompatible with zero copy ones */
1573 for_rx_tx(t) {
1574 if (!(chr->cr_mode & nx_mon_txrx2chmode(t))) {
1575 continue;
1576 }
1577 for (i = mna->mna_first[t];
1578 i < mna->mna_last[t]; i++) {
1579 struct __kern_channel_ring *kring =
				    &NAKR(pna, t)[i];
1581 if (kring->ckr_n_monitors > 0 &&
1582 KRNA(kring->ckr_monitors[0])->
1583 na_activate == nx_mon_zcopy_na_activate) {
1584 error = EBUSY;
1585 SK_ERR("kr \"%s\" is busy (zcopy)",
1586 kring->ckr_name);
1587 goto put_out;
1588 }
1589 }
1590 }
1591 mna->mna_up.na_activate = nx_mon_na_activate;
1592 mna->mna_up.na_dtor = nx_mon_na_dtor;
1593 /*
1594 * allocate a new (private) allocator instance using the
1595 * parent nexus configuration.
1596 */
1597 if ((error = nx_monitor_prov_s.nxdom_prov_mem_new(
1598 NX_DOM_PROV(nx), nx, &mna->mna_up)) != 0) {
1599 ASSERT(mna->mna_up.na_arena == NULL);
1600 goto put_out;
1601 }
1602 ASSERT(mna->mna_up.na_arena != NULL);
1603 mna->mna_up.na_rxsync = nx_mon_na_rxsync;
1604 }
1605 *(nexus_meta_type_t *)(uintptr_t)&mna->mna_up.na_md_type =
1606 pna->na_md_type;
1607 *(nexus_meta_subtype_t *)(uintptr_t)&mna->mna_up.na_md_subtype =
1608 pna->na_md_subtype;
1609
1610 /* a do-nothing txsync: monitors cannot be used to inject packets */
1611 mna->mna_up.na_txsync = nx_mon_na_txsync;
1612 mna->mna_up.na_rxsync = nx_mon_na_rxsync;
1613 mna->mna_up.na_krings_create = nx_mon_na_krings_create;
1614 mna->mna_up.na_krings_delete = nx_mon_na_krings_delete;
1615
	/*
	 * We set the number of our na_rx_rings to be
	 * max(na_num_tx_rings, na_num_rx_rings) of the parent.
	 */
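	/*
	 * For example (hypothetical counts): a parent with 4 tx and 2 rx
	 * rings yields a monitor with 4 rx rings, so parent tx ring i and
	 * parent rx ring i (when present) both feed monitor rx ring i.
	 */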
	na_set_nrings(&mna->mna_up, NR_TX, na_get_nrings(pna, NR_TX));
	na_set_nrings(&mna->mna_up, NR_RX, na_get_nrings(pna, NR_RX));
	if (na_get_nrings(pna, NR_TX) > na_get_nrings(pna, NR_RX)) {
		na_set_nrings(&mna->mna_up, NR_RX, na_get_nrings(pna, NR_TX));
	}
	na_set_nslots(&mna->mna_up, NR_TX, na_get_nslots(pna, NR_TX));
	na_set_nslots(&mna->mna_up, NR_RX, na_get_nslots(pna, NR_RX));
1627
1628 na_attach_common(&mna->mna_up, nx, &nx_monitor_prov_s);
1629
1630 /* remember the traffic directions we have to monitor */
1631 mna->mna_mode = (chr->cr_mode & CHMODE_MONITOR);
1632
1633 /* keep the reference to the parent */
1634 *na = &mna->mna_up;
	na_retain_locked(*na);
1636
1637 /* sanity check: monitor and monitored adapters must share the nexus */
1638 ASSERT((*na)->na_nx == pna->na_nx);
1639
1640#if SK_LOG
1641 SK_DF(SK_VERB_MONITOR, "created monitor adapter 0x%llx", SK_KVA(mna));
1642 SK_DF(SK_VERB_MONITOR, "na_name: \"%s\"", mna->mna_up.na_name);
1643 SK_DF(SK_VERB_MONITOR, " UUID: %s",
1644 sk_uuid_unparse(mna->mna_up.na_uuid, uuidstr));
1645 SK_DF(SK_VERB_MONITOR, " nx: 0x%llx (\"%s\":\"%s\")",
1646 SK_KVA(mna->mna_up.na_nx), NX_DOM(mna->mna_up.na_nx)->nxdom_name,
1647 NX_DOM_PROV(mna->mna_up.na_nx)->nxdom_prov_name);
1648 SK_DF(SK_VERB_MONITOR, " flags: 0x%b",
1649 mna->mna_up.na_flags, NAF_BITS);
1650 SK_DF(SK_VERB_MONITOR, " rings: tx %u rx %u",
1651 na_get_nrings(&mna->mna_up, NR_TX),
1652 na_get_nrings(&mna->mna_up, NR_RX));
1653 SK_DF(SK_VERB_MONITOR, " slots: tx %u rx %u",
1654 na_get_nslots(&mna->mna_up, NR_TX),
1655 na_get_nslots(&mna->mna_up, NR_RX));
1656#if CONFIG_NEXUS_USER_PIPE
1657 SK_DF(SK_VERB_MONITOR, " next_pipe: %u", mna->mna_up.na_next_pipe);
1658 SK_DF(SK_VERB_MONITOR, " max_pipes: %u", mna->mna_up.na_max_pipes);
1659#endif /* CONFIG_NEXUS_USER_PIPE */
1660 SK_DF(SK_VERB_MONITOR, " mna_tx_rings: [%u,%u)", mna->mna_first[NR_TX],
1661 mna->mna_last[NR_TX]);
1662 SK_DF(SK_VERB_MONITOR, " mna_rx_rings: [%u,%u)", mna->mna_first[NR_RX],
1663 mna->mna_last[NR_RX]);
1664 SK_DF(SK_VERB_MONITOR, " mna_mode: %u", mna->mna_mode);
1665#endif /* SK_LOG */
1666
1667 return 0;
1668
1669put_out:
1670 if (pna != NULL) {
		(void) na_release_locked(pna);
1672 pna = NULL;
1673 }
1674 NA_FREE(&mna->mna_up);
1675 return error;
1676}
1677
1678static void
1679nx_mon_quantum_copy_64x(const enum txrx t, kern_packet_t dph,
1680 const uint16_t doff, kern_packet_t sph, const uint16_t soff,
1681 const uint32_t len, const boolean_t unused_arg1,
1682 const uint16_t unused_arg2, const uint16_t unused_arg3,
1683 const boolean_t unused_arg4)
1684{
1685 /* for function prototype parity with pkt_copy_from_pkt_t */
1686#pragma unused(unused_arg1, unused_arg2, unused_arg3, unused_arg4)
1687#pragma unused(t, doff, soff)
1688 struct __kern_quantum *dqum = SK_PTR_ADDR_KQUM(dph);
1689 struct __kern_quantum *squm = SK_PTR_ADDR_KQUM(sph);
1690 uint8_t *sbuf, *dbuf;
1691
1692 ASSERT(METADATA_TYPE(squm) == NEXUS_META_TYPE_QUANTUM);
1693 ASSERT(METADATA_TYPE(squm) == METADATA_TYPE(dqum));
1694 VERIFY(IS_P2ALIGNED(len, 64));
1695
1696 MD_BUFLET_ADDR(squm, sbuf);
1697 MD_BUFLET_ADDR(dqum, dbuf);
1698 VERIFY(IS_P2ALIGNED(dbuf, sizeof(uint64_t)));
1699
1700 if (__probable(IS_P2ALIGNED(sbuf, sizeof(uint64_t)))) {
		sk_copy64_64x((uint64_t *)(void *)sbuf,
		    (uint64_t *)(void *)dbuf, len);
	} else {
		bcopy(sbuf, dbuf, len);
1705 }
1706 /*
1707 * This copy routine only copies to/from a buflet, so the length
1708 * is guaranteed be <= the size of a buflet.
1709 */
1710 VERIFY(len <= UINT16_MAX);
1711 METADATA_SET_LEN(dqum, (uint16_t)len, 0);
1712}
1713