/*
 * Copyright (c) 2015-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Copyright (C) 2012-2014 Matteo Landi, Luigi Rizzo, Giuseppe Lettieri.
 * All rights reserved.
 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef _SKYWALK_CHANNEL_CHANNELVAR_H_
#define _SKYWALK_CHANNEL_CHANNELVAR_H_

#ifdef BSD_KERNEL_PRIVATE
#include <skywalk/core/skywalk_var.h>
#include <skywalk/os_channel_private.h>
#include <skywalk/nexus/nexus_mbq.h>
#include <skywalk/nexus/nexus_pktq.h>
#include <skywalk/mem/skmem_region_var.h>
#include <skywalk/mem/skmem_arena_var.h>

struct ch_selinfo {
	decl_lck_mtx_data(, csi_lock);
	struct selinfo csi_si;
	uint32_t csi_flags;
	uint32_t csi_pending;
	uint64_t csi_eff_interval;
	uint64_t csi_interval;
	thread_call_t csi_tcall;
};

/* values for csi_flags */
#define CSI_KNOTE	0x1		/* kernel note attached */
#define CSI_MITIGATION	0x10		/* has mitigation */
#define CSI_DESTROYED	(1U << 31)	/* has been destroyed */

#define CSI_LOCK(_csi) \
	lck_mtx_lock(&(_csi)->csi_lock)
#define CSI_LOCK_ASSERT_HELD(_csi) \
	LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_OWNED)
#define CSI_LOCK_ASSERT_NOTHELD(_csi) \
	LCK_MTX_ASSERT(&(_csi)->csi_lock, LCK_MTX_ASSERT_NOTOWNED)
#define CSI_UNLOCK(_csi) \
	lck_mtx_unlock(&(_csi)->csi_lock)
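
/*
 * Illustrative usage sketch (not lifted from the implementation): ch_selinfo
 * state such as csi_flags is expected to be read-modify-written with the
 * csi_lock held, e.g.:
 *
 *	CSI_LOCK(csi);
 *	csi->csi_flags |= CSI_MITIGATION;
 *	CSI_UNLOCK(csi);
 */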

/* mitigation intervals in ns */
#define CH_MIT_IVAL_DEFAULT	(0)
#define CH_MIT_IVAL_WIFI	CH_MIT_IVAL_DEFAULT
#define CH_MIT_IVAL_CELLULAR	CH_MIT_IVAL_DEFAULT
#define CH_MIT_IVAL_ETHERNET	CH_MIT_IVAL_DEFAULT

/*
 * Kernel version of __user_slot_desc.
 *
 * Keep slot descriptor as minimal as possible.
 * TODO: wshen0123@apple.com -- Should we make use of RX/TX
 * preparation/writeback descriptors (in a union)?
 */
struct __kern_slot_desc {
	union {
		struct __kern_quantum *sd_qum;
		struct __kern_packet *sd_pkt;
		struct __kern_buflet *sd_buf;
		void *sd_md;	/* metadata address */
	};

#ifndef __LP64__
	uint32_t _sd_pad[1];
#endif /* !__LP64__ */
};

/* _sd_{user,kern} are at the same offset in the preamble */
#define SLOT_DESC_KSD(_sdp) \
	__unsafe_forge_single(struct __kern_slot_desc *, \
	((struct __kern_slot_desc *)((uintptr_t)&(_sdp)->_sd_private)))

/*
 * Optional, per-slot context information. An array of these structures
 * is allocated per nexus_adapter, and the slots of each real kring
 * correspond to entries in it. The 'arg' value is retrieved from the
 * nexus provider via its slot_init callback, and is subsequently
 * retrievable via calls to kern_channel_slot_get_context().
 */
struct slot_ctx {
	mach_vm_address_t slot_ctx_arg; /* per-slot context */
};
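
/*
 * Illustrative sketch (assumes a kring "kr" that has CKRF_SLOT_CONTEXT set
 * and a populated ckr_slot_ctxs array, and a slot descriptor pointer "slot";
 * all of these are defined further below in this header): the per-slot
 * context argument can be recovered by indexing the array with the slot
 * index, along the lines of:
 *
 *	slot_idx_t idx = KR_SLOT_INDEX(kr, slot);
 *	mach_vm_address_t arg = kr->ckr_slot_ctxs[idx].slot_ctx_arg;
 */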

extern lck_attr_t channel_lock_attr;
extern uint64_t __ch_umd_redzone_cookie;
extern uint32_t kr_stat_enable;

struct kern_nexus;
enum na_sync_mode;

struct kern_channel {
	decl_lck_mtx_data(, ch_lock);
	struct nexus_adapter *ch_na;
	struct kern_nexus *ch_nexus;
	struct ch_info *ch_info;
	struct kern_pbufpool *ch_pp;

	uint32_t ch_refcnt;
	volatile uint32_t ch_flags; /* CHANF_* flags */

	/* range of tx/rx/allocator/event rings to scan */
	ring_id_t ch_first[NR_ALL];
	ring_id_t ch_last[NR_ALL];

	struct __user_channel_schema *ch_schema;

	/*
	 * Pointers to the selinfo to be used for selrecord.
	 * Either the local or the global one depending on the
	 * number of rings.
	 */
	struct ch_selinfo *ch_si[NR_ALL];

	STAILQ_ENTRY(kern_channel) ch_link;
	STAILQ_ENTRY(kern_channel) ch_link_if_adv;
	void *ch_ctx;
	mach_vm_offset_t ch_schema_offset;
	struct skmem_arena_mmap_info ch_mmap;
	int ch_fd;		/* might be -1 if no fd */
	pid_t ch_pid;		/* process ID */
	char ch_name[32];	/* process name */
};

/* valid values for ch_flags */
#define CHANF_ATTACHED		0x1	/* attached and connected to nexus */
#define CHANF_PLATFORM		0x2	/* platform binary process */
#define CHANF_KERNEL		0x4	/* kernel only; has no task map */
#define CHANF_RXONLY		0x8	/* receive only, no transmit */
#define CHANF_USER_PACKET_POOL	0x10	/* userspace using packet pool */
#define CHANF_EXCLUSIVE		0x20	/* exclusive bind to ring(s) */
#define CHANF_NONXREF		0x40	/* has no nexus reference */
#define CHANF_HOST		0x80	/* opened to host (kernel) stack */
#define CHANF_EXT_SKIP		0x100	/* don't notify external provider */
#define CHANF_EXT_PRECONNECT	0x200	/* successful nxpi_pre_connect() */
#define CHANF_EXT_CONNECTED	0x400	/* successful nxpi_connected() */
#define CHANF_EVENT_RING	0x1000	/* channel has event rings */
#define CHANF_IF_ADV		0x2000	/* interface advisory is active */
#define CHANF_DEFUNCT_SKIP	0x4000	/* defunct skipped due to active use */
#define CHANF_CLOSING		(1U << 30) /* channel is being closed */
#define CHANF_DEFUNCT		(1U << 31) /* channel is now defunct */

#define CHANF_BITS \
	"\020\01ATTACHED\02PLATFORM\03KERNEL\04RXONLY\05USER_PKT_POOL" \
	"\06EXCLUSIVE\07NONXREF\010HOST\011EXT_SKIP\012EXT_PRECONNECT" \
	"\013EXT_CONNECTED\015EVENT\016ADVISORY" \
	"\017DEFUNCT_SKIP\037CLOSING\040DEFUNCT"

/* valid values for ch_kevhints */
#define CHAN_FILT_HINT_FLOW_ADV_UPD	0x1	/* flow advisory update */
#define CHAN_FILT_HINT_CHANNEL_EVENT	0x2	/* channel event */
#define CHAN_FILT_HINT_IF_ADV_UPD	0x4	/* interface advisory update */

#define CHAN_FILT_HINT_BITS	"\020\01FLOW_ADV\02CHANNEL_EVENT\03IF_ADV"

typedef enum {
	RING_SET_ALL = 0,		/* all rings */
	RING_SET_DEFAULT = RING_SET_ALL,
} ring_set_t;

typedef enum {
	CH_ENDPOINT_NULL = 0,
	CH_ENDPOINT_USER_PIPE_MASTER,
	CH_ENDPOINT_USER_PIPE_SLAVE,
	CH_ENDPOINT_KERNEL_PIPE,
	CH_ENDPOINT_NET_IF,
	CH_ENDPOINT_FLOW_SWITCH,
} ch_endpoint_t;

#define CHREQ_NAMELEN	64

struct chreq {
	char cr_name[CHREQ_NAMELEN];	/* in */
	uuid_t cr_spec_uuid;		/* in */
	struct ch_ev_thresh cr_tx_lowat; /* in */
	struct ch_ev_thresh cr_rx_lowat; /* in */
	nexus_port_t cr_port;		/* in/out */
	uint32_t cr_mode;		/* in */
	uint32_t cr_pipe_id;		/* in */
	ring_id_t cr_ring_id;		/* in */
	ring_set_t cr_ring_set;		/* out */
	ch_endpoint_t cr_real_endpoint;	/* out */
	ch_endpoint_t cr_endpoint;	/* out */
	mach_vm_size_t cr_memsize;	/* out */
	mach_vm_offset_t cr_memoffset;	/* out */
};

/*
 * Private, kernel view of a ring. Keeps track of the status of
 * a ring across system calls.
 *
 *	ckr_khead	Index of the next buffer to refill. It corresponds
 *			to ring_head at the time the system call returns.
 *
 *	ckr_ktail	Index of the first buffer owned by the kernel.
 *
 *			On RX, ckr_khead to ckr_ktail are receive buffers that
 *			are not yet released. ckr_khead is advanced following
 *			ring_head, ckr_ktail is advanced on incoming packets.
 *
 *			On TX, slots from ckr_khead to ckr_rhead have been
 *			filled by the sender but not yet sent to the
 *			destination; ckr_rhead to ckr_ktail are available for
 *			new transmissions, and ckr_ktail to ckr_khead-1 are
 *			pending transmissions.
 *
 * Here is the layout for the RX and TX rings.
 *
 *             RX RING                              TX RING
 *
 *       +-----------------+                  +-----------------+
 *       |                 |                  |                 |
 *       |XXX free slot XXX|                  |XXX free slot XXX|
 *       +-----------------+                  +-----------------+
 * head->| owned by user   |<-khead           | not sent to nic |<-khead
 *       |                 |                  | yet             |
 *       |                 |                  |                 |
 *       +-----------------+                  +     ------      +
 * tail->|                 |<-ktail           |                 |<-klease
 *       | (being          |   ...            |                 |   ...
 *       |  prepared)      |   ...            |                 |   ...
 *       +-----------------+   ...            |                 |   ...
 *       |                 |<-klease          +-----------------+
 *       |                 |            tail->|                 |<-ktail
 *       |                 |                  |                 |
 *       |                 |                  |                 |
 *       |                 |                  |                 |
 *       +-----------------+                  +-----------------+
 *
 * The head/tail (user view) and khead/ktail (kernel view)
 * are used in the normal operation of the adapter.
 *
 * For flow switch nexus:
 *
 * The following fields are used to implement lock-free copy of packets
 * from input to output ports in the flow switch:
 *
 *	ckr_klease	Buffer after the last one being copied.
 *			A writer in nx_fsw_vp_flush() reserves N buffers
 *			from ckr_klease, advances it, then does the
 *			copy outside the lock.
 *
 *			In RX rings (used for flow switch ports):
 *				ckr_ktail <= ckr_klease < ckr_khead+N-1
 *
 *			In TX rings (used for NIC or host stack ports):
 *				ckr_khead <= ckr_klease < ckr_ktail
 *
 *	ckr_leases	Array of ckr_num_slots entries where writers can
 *			report completion of their block. CKR_NOSLOT (~0)
 *			indicates that the writer has not finished yet.
 *
 *	ckr_lease_idx	Index of the next free slot in ckr_leases, to be
 *			assigned.
 *
 * The kring is manipulated by txsync/rxsync and the generic kring functions.
 *
 * Concurrent rxsync or txsync on the same ring are prevented by
 * na_kr_(try)get(), which in turn uses ckr_busy. This is all we need
 * for NIC rings, and for TX rings attached to the host stack.
 *
 * RX rings attached to the host stack use an nx_mbq (ckr_rx_queue) on both
 * nx_netif_rxsync_from_host() and nx_netif_compat_transmit(). The nx_mbq is
 * protected by its internal lock.
 *
 * RX rings attached to the flow switch are accessed by both senders
 * and receiver. They are protected through the q_lock on the RX ring.
 *
 * When a ring is the output of a switch port (RX ring for a flow switch
 * port, TX ring for the host stack or NIC), slots are reserved in blocks
 * through ckr_klease, which points to the next unused slot.
 *
 * On an RX ring, ckr_klease is always after ckr_ktail, and completions cause
 * ckr_ktail to advance. On a TX ring, ckr_klease is always between ckr_khead
 * and ckr_ktail, and completions cause ckr_khead to advance.
 *
 * nx_fsw_vp_na_kr_space() returns the maximum number of slots that
 * can be assigned.
 *
 * nx_fsw_vp_na_kr_lease() reserves the required number of buffers,
 * advances ckr_klease and also returns an entry in a circular
 * array where completions should be reported.
 *
 * For netif nexus:
 *
 * The indexes in the NIC and channel rings are offset by ckr_hwofs slots.
 * This is so that, on a reset, buffers owned by userspace are not modified
 * by the kernel. In particular:
 *
 * RX rings: the next empty buffer (ckr_ktail + ckr_hwofs) coincides with
 *	the next empty buffer as known by the hardware ("next to check").
 * TX rings: ckr_khead + ckr_hwofs coincides with "next to send".
 */
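
/*
 * A minimal sketch of the modular index arithmetic implied above (assuming
 * a kring "kr"): on an RX ring, the number of busy (not yet released) slots
 * between ckr_khead and ckr_ktail is the difference of the two indices,
 * corrected for wrap-around modulo ckr_num_slots; this mirrors what
 * kr_available_slots_rxring() further below computes:
 *
 *	int busy = (int)(kr->ckr_ktail - kr->ckr_khead);
 *	if (busy < 0)
 *		busy += kr->ckr_num_slots;
 */
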
typedef int (*channel_ring_notify_t)(struct __kern_channel_ring *,
    struct proc *, uint32_t);

struct __kern_channel_ring {
	struct __user_channel_ring *ckr_ring;

	uint32_t ckr_flags;		/* CKRF_* flags */
	slot_idx_t ckr_num_slots;	/* # of slots */
	uint32_t ckr_max_pkt_len;	/* max pp pkt size */
	uint32_t ckr_largest;		/* largest packet seen */
	const slot_idx_t ckr_lim;	/* ckr_num_slots - 1 */
	enum txrx ckr_tx;		/* kind of ring (tx/rx/alloc/free) */

	volatile slot_idx_t ckr_khead;
	volatile slot_idx_t ckr_ktail;
	/*
	 * value of ckr_khead recorded at TX prologue (pre-sync)
	 */
	volatile slot_idx_t ckr_khead_pre;
	/*
	 * Copies of values in user rings, so we do not need to look
	 * at the ring (which could be modified). These are set in the
	 * *sync_prologue()/finalize() routines.
	 */
	volatile slot_idx_t ckr_rhead;
	volatile slot_idx_t ckr_rtail;

	/* EWMA decay rate */
	uint32_t ckr_transfer_decay;

	uint64_t ckr_ready_bytes;
	uint64_t ckr_ready_slots;

	/*
	 * While ckr_state is set, no new [tr]xsync operations can be
	 * started on this kring. This is used by na_disable_all_rings()
	 * to find a synchronization point where critical data structures
	 * pointed to by the kring can be added or removed.
	 */
	decl_lck_spin_data(, ckr_slock);
	struct thread *ckr_owner;	/* busy owner */
	uint32_t ckr_busy;		/* prevent kring modifications */
	uint32_t ckr_want;		/* # of threads that lost the race */
	uint32_t ckr_state;		/* KR_* states */

	/* current working set for the allocator ring */
	volatile uint32_t ckr_alloc_ws;

	struct nexus_adapter *ckr_na;	/* adapter this kring belongs to */
	struct kern_pbufpool *ckr_pp;	/* adapter's packet buffer pool */

	/*
	 * Array of __slot_desc, each entry representing slot-specific data,
	 * e.g. the index to the metadata, etc. There is exactly one
	 * descriptor for each slot in the ring. Note that the size of the
	 * array may be greater than the number of slots for this ring, and
	 * so we constrain the range to [ckr_ksds, ckr_ksds_last] during
	 * validations.
	 */
	struct __slot_desc *__unsafe_indexable ckr_usds; /* slot desc array (user) */
	struct __slot_desc *__unsafe_indexable ckr_ksds; /* slot desc array (kernel) */
	struct __slot_desc *__single ckr_ksds_last;	/* cache last ksd */
	struct skmem_cache *ckr_ksds_cache;	/* owning skmem_cache for ksd */

	uint32_t ckr_ring_id;		/* ring ID */

	boolean_t ckr_rate_limited;	/* ring is rate limited */

	/*
	 * Array of packet handles for as many slots as there are in the
	 * ring; this is useful for storing an array of kern_packet_t to
	 * be used when invoking the packet APIs. Only safe to be used
	 * in the context of a sync as we're single-threaded then.
	 * The memory is owned by the nexus adapter.
	 */
	uint64_t *__unsafe_indexable ckr_scratch;

	/*
	 * [tx]sync callback for this kring. The default na_kring_create
	 * callback (na_kr_create) sets the ckr_na_sync callback of each
	 * tx(rx) kring to the corresponding na_txsync(na_rxsync) taken
	 * from the nexus_adapter.
	 *
	 * Overrides: the above configuration is not changed by
	 * any of the na_kring_create callbacks.
	 */
	int (*ckr_na_sync)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);
	int (*volatile ckr_na_notify)(struct __kern_channel_ring *,
	    struct proc *, uint32_t);

	int (*ckr_prologue)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t,
	    uint32_t *, uint64_t *, struct proc *);
	void (*ckr_finalize)(struct kern_channel *,
	    struct __kern_channel_ring *, const slot_idx_t, struct proc *);

	/* time of last channel sync (updated at sync prologue time) */
	uint64_t ckr_sync_time;

#if CONFIG_NEXUS_FLOWSWITCH
	/* The following fields are for flow switch support */
	int (*ckr_save_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	uint32_t *ckr_leases;
#define CKR_NOSLOT ((uint32_t)~0)	/* used in ckr_*lease* */
	slot_idx_t ckr_klease;
	slot_idx_t ckr_lease_idx;
#endif /* CONFIG_NEXUS_FLOWSWITCH */

	kern_packet_svc_class_t ckr_svc;

	/*
	 * (Optional) array of slot contexts for as many slots as there
	 * are in the ring; the memory is owned by the nexus adapter.
	 */
	uint32_t ckr_slot_ctxs_set;	/* number of valid/set contexts */
	struct slot_ctx *__unsafe_indexable ckr_slot_ctxs; /* (optional) array of slot contexts */

	void *ckr_ctx;			/* ring context */

	struct ch_selinfo ckr_si;	/* per-ring wait queue */

#if CONFIG_NEXUS_NETIF
	/*
	 * netif adapters intercept ckr_na_notify in order to
	 * mitigate IRQ events; the actual notification is done
	 * by invoking the original notify callback routine
	 * saved at na_activate() time.
	 */
	int (*ckr_netif_notify)(struct __kern_channel_ring *kring,
	    struct proc *, uint32_t flags);
	void (*ckr_netif_mit_stats)(struct __kern_channel_ring *kring,
	    uint64_t, uint64_t);
	struct nx_netif_mit *ckr_mit;

	volatile uint32_t ckr_pending_intr;
	volatile uint32_t ckr_pending_doorbell;

	/*
	 * Support for adapters without native Skywalk support.
	 * On tx rings we preallocate an array of tx buffers
	 * (same size as the channel ring), on rx rings we
	 * store incoming mbufs in a queue that is drained
	 * during rxsync.
	 */
	struct mbuf **ckr_tx_pool;
	struct nx_mbq ckr_rx_queue;	/* intercepted rx mbufs */
#endif /* CONFIG_NEXUS_NETIF */

#if CONFIG_NEXUS_USER_PIPE
	/* if this is a pipe ring, pointer to the other end */
	struct __kern_channel_ring *ckr_pipe;
	/* pointer to hidden rings (see nx_user_pipe.c for details) */
	struct __user_channel_ring *ckr_save_ring;
#endif /* CONFIG_NEXUS_USER_PIPE */

	/*
	 * Protects kring in the event of multiple writers;
	 * only used by flow switch and monitor.
	 */
	decl_lck_mtx_data(, ckr_qlock);

#if CONFIG_NEXUS_MONITOR
	/* array of krings that are monitoring this kring */
	struct __kern_channel_ring **ckr_monitors;
	uint32_t ckr_max_monitors; /* current size of the monitors array */
	uint32_t ckr_n_monitors; /* next unused entry in the monitor array */
	/*
	 * Monitors work by intercepting the sync and notify callbacks of the
	 * monitored krings. This is implemented by replacing the pointers
	 * above and saving the previous ones in mon_* pointers below
	 */
	int (*ckr_mon_sync)(struct __kern_channel_ring *kring, struct proc *,
	    uint32_t flags);
	int (*ckr_mon_notify)(struct __kern_channel_ring *kring, struct proc *,
	    uint32_t flags);

	uint32_t ckr_mon_tail;	/* last seen slot on rx */
	/* index of this ring in the monitored ring array */
	uint32_t ckr_mon_pos;
#endif /* CONFIG_NEXUS_MONITOR */

	uint32_t ckr_users;	/* existing bindings for this ring */

	/* ring flush rate limit */
	int64_t ckr_tbr_token;
	int64_t ckr_tbr_depth;
	uint64_t ckr_tbr_last;
#define CKR_TBR_TOKEN_INVALID	INT64_MAX

	/* stats capturing errors */
	channel_ring_error_stats ckr_err_stats
	    __attribute__((aligned(sizeof(uint64_t))));

	/* stats capturing actual data movement (nexus provider's view) */
	channel_ring_stats ckr_stats
	    __attribute__((aligned(sizeof(uint64_t))));
	uint64_t ckr_accumulated_bytes;
	uint64_t ckr_accumulated_slots;
	uint64_t ckr_accumulate_start;	/* in seconds */

	/* stats capturing user activities per sync (user's view) */
	channel_ring_user_stats ckr_usr_stats
	    __attribute__((aligned(sizeof(uint64_t))));
	uint64_t ckr_user_accumulated_bytes;
	uint64_t ckr_user_accumulated_slots;
	uint64_t ckr_user_accumulated_syncs;
	uint64_t ckr_user_accumulate_start;	/* in seconds */

	lck_grp_t *ckr_qlock_group;
	lck_grp_t *ckr_slock_group;

	char ckr_name[64];	/* diagnostic */
} __attribute__((__aligned__(CHANNEL_CACHE_ALIGN_MAX)));

#define KR_LOCK(_kr) \
	lck_mtx_lock(&(_kr)->ckr_qlock)
#define KR_LOCK_SPIN(_kr) \
	lck_mtx_lock_spin(&(_kr)->ckr_qlock)
#define KR_LOCK_TRY(_kr) \
	lck_mtx_try_lock(&(_kr)->ckr_qlock)
#define KR_LOCK_ASSERT_HELD(_kr) \
	LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_OWNED)
#define KR_LOCK_ASSERT_NOTHELD(_kr) \
	LCK_MTX_ASSERT(&(_kr)->ckr_qlock, LCK_MTX_ASSERT_NOTOWNED)
#define KR_UNLOCK(_kr) \
	lck_mtx_unlock(&(_kr)->ckr_qlock)

/* valid values for ckr_flags */
#define CKRF_EXCLUSIVE		0x1	/* exclusive binding */
#define CKRF_DROP		0x2	/* drop all mode */
#define CKRF_HOST		0x4	/* host ring */
#define CKRF_MEM_RING_INITED	0x8	/* na_kr_setup() succeeded */
#define CKRF_MEM_SD_INITED	0x10	/* na_kr_setup() succeeded */
#define CKRF_EXT_RING_INITED	0x20	/* nxpi_ring_init() succeeded */
#define CKRF_EXT_SLOTS_INITED	0x40	/* nxpi_slot_init() succeeded */
#define CKRF_SLOT_CONTEXT	0x80	/* ckr_slot_ctxs is valid */
#define CKRF_MITIGATION		0x100	/* supports event mitigation */
#define CKRF_DEFUNCT		0x200	/* no longer in service */
#define CKRF_KERNEL_ONLY	(1U << 31) /* not usable by userland */

#define CKRF_BITS \
	"\020\01EXCLUSIVE\02DROP\03HOST\04MEM_RING_INITED" \
	"\05MEM_SD_INITED\06EXT_RING_INITED\07EXT_SLOTS_INITED" \
	"\010SLOT_CONTEXT\011MITIGATION\012DEFUNCT\040KERNEL_ONLY"

#define KRNA(_kr) \
	((__DECONST(struct __kern_channel_ring *, _kr))->ckr_na)

#define KR_KERNEL_ONLY(_kr) \
	(((_kr)->ckr_flags & CKRF_KERNEL_ONLY) != 0)
#define KR_DROP(_kr) \
	(((_kr)->ckr_flags & (CKRF_DROP|CKRF_DEFUNCT)) != 0)

/* valid values for ckr_state */
enum {
	KR_READY = 0,
	KR_STOPPED,	/* unbounded stop */
	KR_LOCKED,	/* bounded, brief stop for mutual exclusion */
};

#define KR_KSD(_kring, _slot_idx) \
	(SLOT_DESC_KSD(&(_kring)->ckr_ksds[_slot_idx]))

#define KR_USD(_kring, _slot_idx) \
	(SLOT_DESC_USD(&(_kring)->ckr_usds[_slot_idx]))

__attribute__((always_inline))
static inline slot_idx_t
KR_SLOT_INDEX(const struct __kern_channel_ring *kr,
    const struct __slot_desc *slot)
{
	ASSERT(slot >= kr->ckr_ksds && slot <= kr->ckr_ksds_last);
	return (slot_idx_t)(slot - kr->ckr_ksds);
}

/* Helper macros for slot descriptors, decoupled for KSD/USD. */

#define KSD_VALID_METADATA(_ksd) \
	((_ksd)->sd_md != NULL)

#define KSD_INIT(_ksd) do { \
	(_ksd)->sd_md = NULL; \
} while (0)

#define KSD_ATTACH_METADATA(_ksd, _md_addr) do { \
	ASSERT((_ksd) != NULL); \
	ASSERT((_ksd)->sd_md == NULL); \
	(_ksd)->sd_md = (_md_addr); \
} while (0)

#define KSD_DETACH_METADATA(_ksd) do { \
	ASSERT((_ksd) != NULL); \
	ASSERT((_ksd)->sd_md != NULL); \
	(_ksd)->sd_md = NULL; \
} while (0)

#define KSD_RESET(_ksd) KSD_INIT(_ksd)

#define USD_INIT(_usd) do { \
	(_usd)->sd_md_idx = OBJ_IDX_NONE; \
	(_usd)->sd_flags = 0; \
	(_usd)->sd_len = 0; \
} while (0)

#define USD_ATTACH_METADATA(_usd, _md_idx) do { \
	ASSERT((_usd) != NULL); \
	ASSERT((_usd)->sd_md_idx == OBJ_IDX_NONE); \
	ASSERT(((_usd)->sd_flags & SD_IDX_VALID) == 0); \
	(_usd)->sd_md_idx = (_md_idx); \
	(_usd)->sd_flags |= SD_IDX_VALID; \
	/* mask off non-user flags */ \
	(_usd)->sd_flags &= SD_FLAGS_USER; \
} while (0)

#define USD_DETACH_METADATA(_usd) do { \
	ASSERT((_usd) != NULL); \
	(_usd)->sd_md_idx = OBJ_IDX_NONE; \
	/* mask off non-user flags */ \
	(_usd)->sd_flags &= SD_FLAGS_USER; \
	(_usd)->sd_flags &= ~SD_IDX_VALID; \
} while (0)

#define USD_RESET(_usd) USD_INIT(_usd)

#define USD_SET_LENGTH(_usd, _md_len) do { \
	ASSERT((_usd) != NULL); \
	(_usd)->sd_len = _md_len; \
} while (0)

#define _USD_COPY(_src, _dst) do { \
	_CASSERT(sizeof (struct __user_slot_desc) == 8); \
	sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
} while (0)

#define _USD_SWAP(_usd1, _usd2) do { \
	struct __user_slot_desc _tusd \
	    __attribute((aligned(sizeof (uint64_t)))); \
	_USD_COPY(_usd1, &_tusd); \
	_USD_COPY(_usd2, _usd1); \
	_USD_COPY(&_tusd, _usd2); \
} while (0)

#define _KSD_COPY(_src, _dst) do { \
	_CASSERT(sizeof (struct __kern_slot_desc) == 8); \
	sk_copy64_8((uint64_t *)(void *)_src, (uint64_t *)(void *)_dst); \
} while (0)

#define _KSD_SWAP(_ksd1, _ksd2) do { \
	struct __kern_slot_desc _tksd \
	    __attribute((aligned(sizeof (uint64_t)))); \
	_KSD_COPY(_ksd1, &_tksd); \
	_KSD_COPY(_ksd2, _ksd1); \
	_KSD_COPY(&_tksd, _ksd2); \
} while (0)

#define SD_SWAP(_ksd1, _usd1, _ksd2, _usd2) do { \
	_USD_SWAP(_usd1, _usd2); \
	_KSD_SWAP(_ksd1, _ksd2); \
	/* swap packet attachment */ \
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd1)->sd_qum->qum_ksd = \
	    (_ksd1); \
	*(struct __kern_slot_desc **)(uintptr_t)&(_ksd2)->sd_qum->qum_ksd = \
	    (_ksd2); \
} while (0)
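
/*
 * Sketch of a typical attach sequence (the values "kqum", "md_idx" and
 * "md_len" are hypothetical placeholders, not taken from the implementation):
 * the kernel and user descriptors of a slot are updated in tandem, the
 * former with the metadata address and the latter with the object index and
 * length visible to userspace:
 *
 *	KSD_ATTACH_METADATA(ksd, kqum);
 *	USD_ATTACH_METADATA(usd, md_idx);
 *	USD_SET_LENGTH(usd, md_len);
 */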

#define _MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim) do { \
	struct __kern_quantum *_q = SK_PTR_ADDR_KQUM(_md); \
	switch (METADATA_TYPE(_q)) { \
	case NEXUS_META_TYPE_PACKET: { \
		struct __kern_packet *_p = \
		    (struct __kern_packet *)(void *)(_md); \
		struct __kern_buflet *_kbft; \
		PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \
		(_addr) = __DECONST(void *, _kbft->buf_addr); \
		(_objaddr) = _kbft->buf_objaddr; \
		(_doff) = _kbft->buf_doff; \
		(_dlen) = _kbft->buf_dlen; \
		(_dlim) = _kbft->buf_dlim; \
		break; \
	} \
	default: \
		(_addr) = __DECONST(void *, _q->qum_buf[0].buf_addr); \
		(_objaddr) = _q->qum_buf[0].buf_objaddr; \
		(_doff) = _q->qum_buf[0].buf_doff; \
		(_dlen) = _q->qum_buf[0].buf_dlen; \
		(_dlim) = _q->qum_buf[0].buf_dlim; \
		break; \
	} \
	ASSERT((_addr) != NULL); \
	ASSERT((_objaddr) != NULL); \
} while (0)

#define _MD_BUFLET_ADDR_PKT(_md, _addr) do { \
	ASSERT(METADATA_TYPE(SK_PTR_ADDR_KQUM(_md)) == \
	    NEXUS_META_TYPE_PACKET); \
	struct __kern_packet *_p = (struct __kern_packet *)(void *)(_md); \
	struct __kern_buflet *_kbft; \
	PKT_GET_FIRST_BUFLET(_p, _p->pkt_bufs_cnt, _kbft); \
	(_addr) = __DECONST(void *, _kbft->buf_addr); \
	ASSERT((_addr) != NULL); \
} while (0)


/*
 * Return the data offset adjusted virtual address of a buffer associated
 * with the metadata; for metadata with multiple buflets, this is the
 * first buffer's address.
 */
#define MD_BUFLET_ADDR(_md, _val) do { \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr; \
	uint32_t _doff, _dlen, _dlim; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	/* skip past buflet data offset */ \
	(_val) = (void *)((uint8_t *)_addr + _doff); \
} while (0)

/*
 * Return the absolute virtual address of a buffer associated with the
 * metadata; for metadata with multiple buflets, this is the first
 * buffer's address.
 */
#define MD_BUFLET_ADDR_ABS(_md, _val) do { \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr; \
	uint32_t _doff, _dlen, _dlim; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	(_val) = (void *)_addr; \
} while (0)

/* similar to MD_BUFLET_ADDR_ABS() but optimized only for packets */
#define MD_BUFLET_ADDR_ABS_PKT(_md, _val) do { \
	void *__unsafe_indexable _addr; \
	_MD_BUFLET_ADDR_PKT(_md, _addr); \
	(_val) = (void *)_addr; \
} while (0)


#define MD_BUFLET_ADDR_ABS_DLEN(_md, _val, _dlen, _dlim, _doff) do { \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	(_val) = (void *)_addr; \
} while (0)


/*
 * Return the buffer's object address associated with the metadata; for
 * metadata with multiple buflets, this is the first buffer's object address.
 */
#define MD_BUFLET_OBJADDR(_md, _val) do { \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr; \
	uint32_t _doff, _dlen, _dlim; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	(_val) = (void *)_objaddr; \
} while (0)

/*
 * Return the data offset adjusted virtual address of a buffer associated
 * with the metadata; for metadata with multiple buflets, this is the
 * first buffer's address and data length.
 */
#define MD_BUFLET_ADDR_DLEN(_md, _val, _dlen) do { \
	void *__unsafe_indexable _addr, *__unsafe_indexable _objaddr; \
	uint32_t _doff, _dlim; \
	_MD_BUFLET_ADDROFF(_md, _addr, _objaddr, _doff, _dlen, _dlim); \
	/* skip past buflet data offset */ \
	(_val) = (void *)((uint8_t *)_addr + _doff); \
} while (0)
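
/*
 * Usage sketch (assuming "md" is a hypothetical metadata handle of the kind
 * accepted by the macros above): fetch the data-offset-adjusted buffer
 * address together with the current data length in one shot:
 *
 *	void *baddr;
 *	uint32_t dlen;
 *	MD_BUFLET_ADDR_DLEN(md, baddr, dlen);
 */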

/* kr_available_slots(): return available space for enqueue into kring */
__attribute__((always_inline))
static inline uint32_t
kr_available_slots(struct __kern_channel_ring *kr)
{
	int busy;
	uint32_t space;

	busy = (int)(kr->ckr_klease - kr->ckr_khead);
	if (busy < 0) {
		busy += kr->ckr_num_slots;
	}
	space = kr->ckr_lim - (uint32_t)busy;

	return space;
}

/* kr_available_slots_rxring(): available space for enqueue into an RX kring */
__attribute__((always_inline))
static inline uint32_t
kr_available_slots_rxring(struct __kern_channel_ring *rxkring)
{
	int busy;
	uint32_t space;

	/* # of rx busy (unclaimed) slots */
	busy = (int)(rxkring->ckr_ktail - rxkring->ckr_khead);
	if (busy < 0) {
		busy += rxkring->ckr_num_slots;
	}

	/* # of rx avail free slots (subtract busy from max) */
	space = rxkring->ckr_lim - (uint32_t)busy;
	return space;
}
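
/*
 * Example (sketch only, with a hypothetical count "n"): a writer that
 * intends to enqueue "n" packets into an RX kring could first verify that
 * enough free slots are available, e.g.:
 *
 *	if (kr_available_slots_rxring(rxkring) < n)
 *		... drop or defer ...
 */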

extern kern_allocation_name_t skmem_tag_ch_key;

#if (DEVELOPMENT || DEBUG)
SYSCTL_DECL(_kern_skywalk_channel);
#endif /* !DEVELOPMENT && !DEBUG */

__BEGIN_DECLS
extern int channel_init(void);
extern void channel_fini(void);

extern struct kern_channel *ch_open(struct ch_init *, struct proc *,
    int, int *);
extern struct kern_channel *ch_open_special(struct kern_nexus *,
    struct chreq *, boolean_t, int *);
extern void ch_close(struct kern_channel *, boolean_t);
extern void ch_close_special(struct kern_channel *);
extern int ch_kqfilter(struct kern_channel *, struct knote *,
    struct kevent_qos_s *kev);
extern boolean_t ch_is_multiplex(struct kern_channel *, enum txrx);
extern int ch_select(struct kern_channel *, int, void *, struct proc *);
extern int ch_get_opt(struct kern_channel *, struct sockopt *);
extern int ch_set_opt(struct kern_channel *, struct sockopt *);
extern void ch_deactivate(struct kern_channel *);
extern void ch_retain(struct kern_channel *);
extern void ch_retain_locked(struct kern_channel *);
extern int ch_release(struct kern_channel *);
extern int ch_release_locked(struct kern_channel *);
extern void ch_dtor(void *);

extern void csi_init(struct ch_selinfo *, boolean_t, uint64_t);
extern void csi_destroy(struct ch_selinfo *);
extern void csi_selrecord_one(struct __kern_channel_ring *, struct proc *,
    void *);
extern void csi_selrecord_all(struct nexus_adapter *, enum txrx, struct proc *,
    void *);
extern void csi_selwakeup_one(struct __kern_channel_ring *, boolean_t,
    boolean_t, boolean_t, uint32_t);
extern void csi_selwakeup_all(struct nexus_adapter *, enum txrx, boolean_t,
    boolean_t, boolean_t, uint32_t);

extern void kr_init_to_mhints(struct __kern_channel_ring *, uint32_t);
extern int kr_enter(struct __kern_channel_ring *, boolean_t);
extern void kr_exit(struct __kern_channel_ring *);
extern void kr_start(struct __kern_channel_ring *);
extern void kr_stop(struct __kern_channel_ring *kr, uint32_t state);
extern void kr_update_stats(struct __kern_channel_ring *kring,
    uint32_t slot_count, uint32_t byte_count);
extern boolean_t kr_txempty(struct __kern_channel_ring *kring);
extern uint32_t kr_reclaim(struct __kern_channel_ring *kr);

extern slot_idx_t kr_txsync_prologue(struct kern_channel *,
    struct __kern_channel_ring *, struct proc *);
extern int kr_txprologue(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);
extern int kr_txprologue_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);

extern void kr_txsync_finalize(struct kern_channel *,
    struct __kern_channel_ring *, struct proc *);
extern void kr_txfinalize(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
extern void kr_txfinalize_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);

extern slot_idx_t kr_rxsync_prologue(struct kern_channel *ch,
    struct __kern_channel_ring *kring, struct proc *p);
extern int kr_rxprologue(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);
extern int kr_rxprologue_nodetach(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);
extern int kr_rxprologue_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, uint32_t *, uint64_t *,
    struct proc *);

extern void kr_rxsync_finalize(struct kern_channel *ch,
    struct __kern_channel_ring *kring, struct proc *p);
extern void kr_rxfinalize(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);
extern void kr_rxfinalize_upp(struct kern_channel *,
    struct __kern_channel_ring *, const slot_idx_t, struct proc *p);

extern void kr_txkring_reclaim_and_refill(struct __kern_channel_ring *kring,
    slot_idx_t index);
extern slot_idx_t kr_alloc_sync_prologue(struct __kern_channel_ring *kring,
    struct proc *p);
extern slot_idx_t kr_free_sync_prologue(struct __kern_channel_ring *kring,
    struct proc *p);
extern void kr_alloc_sync_finalize(struct __kern_channel_ring *kring,
    struct proc *p);
extern void kr_free_sync_finalize(struct __kern_channel_ring *kring,
    struct proc *p);
extern int kr_internalize_metadata(struct kern_channel *,
    struct __kern_channel_ring *, const uint32_t, struct __kern_quantum *,
    struct proc *);
extern void kr_externalize_metadata(struct __kern_channel_ring *,
    const uint32_t, struct __kern_quantum *, struct proc *);
extern slot_idx_t kr_event_sync_prologue(struct __kern_channel_ring *kring,
    struct proc *p);
extern void kr_event_sync_finalize(struct kern_channel *ch,
    struct __kern_channel_ring *kring, struct proc *p);

#if SK_LOG
extern void kr_log_bad_ring(struct __kern_channel_ring *);
#else
#define kr_log_bad_ring(_kr) do { ((void)0); } while (0)
#endif /* SK_LOG */
__END_DECLS
#endif /* BSD_KERNEL_PRIVATE */
#endif /* !_SKYWALK_CHANNEL_CHANNELVAR_H_ */
