/*
 * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include "kpi_interface.h"
#include <stddef.h>
#include <ptrauth.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/domain.h>
#include <sys/user.h>
#include <sys/random.h>
#include <sys/socketvar.h>
#include <net/if_dl.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_var.h>
#include <net/dlil.h>
#include <net/if_arp.h>
#include <net/iptap.h>
#include <net/pktap.h>
#include <net/nwk_wq.h>
#include <sys/kern_event.h>
#include <sys/kdebug.h>
#include <sys/mcache.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <kern/assert.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/locks.h>
#include <kern/zalloc.h>

#include <net/kpi_protocol.h>
#include <net/if_types.h>
#include <net/if_ipsec.h>
#include <net/if_llreach.h>
#include <net/if_utun.h>
#include <net/kpi_interfacefilter.h>
#include <net/classq/classq.h>
#include <net/classq/classq_sfb.h>
#include <net/flowhash.h>
#include <net/ntstat.h>
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
#include <skywalk/lib/net_filter_event.h>
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
#include <net/net_api_stats.h>
#include <net/if_ports_used.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#if INET
#include <netinet/in_var.h>
#include <netinet/igmp_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/if_ether.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp_var.h>
#endif /* INET */

#include <net/nat464_utils.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/mld6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <net/pf_pbuf.h>
#include <libkern/OSAtomic.h>
#include <libkern/tree.h>

#include <dev/random/randomdev.h>
#include <machine/machine_routines.h>

#include <mach/thread_act.h>
#include <mach/sdt.h>

#if CONFIG_MACF
#include <sys/kauth.h>
#include <security/mac_framework.h>
#include <net/ethernet.h>
#include <net/firewire.h>
#endif

#if PF
#include <net/pfvar.h>
#endif /* PF */
#include <net/pktsched/pktsched.h>
#include <net/pktsched/pktsched_netem.h>

#if NECP
#include <net/necp.h>
#endif /* NECP */

#if SKYWALK
#include <skywalk/packet/packet_queue.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#endif /* SKYWALK */

#include <net/sockaddr_utils.h>

#include <os/log.h>

#define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
#define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
#define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
#define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
#define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))

#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
#define MAX_LINKADDR 4 /* LONGWORDS */

#if 1
#define DLIL_PRINTF printf
#else
#define DLIL_PRINTF kprintf
#endif

#define IF_DATA_REQUIRE_ALIGNED_64(f) \
	_CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))

#define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
	_CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))
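
/*
 * Illustrative use of the alignment checks above (a sketch; the actual
 * assertions are made in this file's initialization path): either line
 * fails to compile if the named 64-bit counter is not 64-bit aligned.
 *
 *	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
 *	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
 */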

enum {
	kProtoKPI_v1 = 1,
	kProtoKPI_v2 = 2
};

uint64_t if_creation_generation_count = 0;

/*
 * The list of if_proto structures in if_proto_hash[] is protected by
 * the ifnet lock. The rest of the fields are initialized at protocol
 * attach time and never change; thus no lock is required as long as
 * a reference to the if_proto is held, via if_proto_ref().
 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;
	u_int32_t refcount;
	u_int32_t detached;
	struct ifnet *ifp;
	protocol_family_t protocol_family;
	int proto_kpi;
	union {
		struct {
			proto_media_input input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v1;
		struct {
			proto_media_input_v2 input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v2;
	} kpi;
};
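
/*
 * A minimal usage sketch of the refcount discipline described above
 * (illustrative only; find_attached_proto() is assumed to return with
 * a reference held, matching its declaration later in this file):
 *
 *	ifnet_lock_shared(ifp);
 *	struct if_proto *proto = find_attached_proto(ifp, PF_INET);
 *	ifnet_lock_done(ifp);
 *	if (proto != NULL) {
 *		// fields set at attach time are safe to read here
 *		if_proto_free(proto);	// drop the reference
 *	}
 */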

SLIST_HEAD(proto_hash_entry, if_proto);

#define DLIL_SDLDATALEN \
	(DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))

/*
 * In the common case, the LL address is stored in the
 * `dl_if_lladdr' member of the `dlil_ifnet'. This is sufficient
 * for LL addresses that do not exceed the `DLIL_SDLMAXLEN' constant.
 */
struct dl_if_lladdr_std {
	struct ifaddr ifa;
	u_int8_t addr_sdl_bytes[DLIL_SDLMAXLEN];
	u_int8_t mask_sdl_bytes[DLIL_SDLMAXLEN];
};

/*
 * However, in some rare cases we encounter LL addresses which
 * do not fit within the `DLIL_SDLMAXLEN' limit. In such cases
 * we allocate the storage in the permanent arena, using this memory layout.
 */
struct dl_if_lladdr_xtra_space {
	struct ifaddr ifa;
	u_int8_t addr_sdl_bytes[SOCK_MAXADDRLEN];
	u_int8_t mask_sdl_bytes[SOCK_MAXADDRLEN];
};
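
/*
 * A hedged sketch of how the two layouts above are selected (the real
 * logic lives in dlil_alloc_lladdr(), declared later in this file):
 *
 *	if (sdl_len <= DLIL_SDLMAXLEN) {
 *		// common case: use the dl_if_lladdr_std storage embedded
 *		// in the dlil_ifnet itself
 *	} else {
 *		// rare case: allocate dl_if_lladdr_xtra_space, whose
 *		// SOCK_MAXADDRLEN-sized arrays can hold any sockaddr_dl
 *	}
 */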

struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */
	u_int32_t dl_if_flags; /* flags (below) */
	u_int32_t dl_if_refcnt; /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void *dl_if_uniqueid; /* unique interface id */
	size_t dl_if_uniqueid_len; /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct dl_if_lladdr_std dl_if_lladdr; /* link-level address storage */
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;
	u_int8_t dl_if_unused;
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach; /* attach PC stacktrace */
	ctrace_t dl_if_detach; /* detach PC stacktrace */
};

/* Values for dl_if_flags (private to DLIL) */
#define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
#define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */
#define DLIF_DEBUG 0x4 /* has debugging info */

#define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */

/* For gdb */
__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;

struct dlil_ifnet_dbg {
	struct dlil_ifnet dldbg_dlif; /* dlil_ifnet */
	u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */
	u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};

#define DLIL_TO_IFP(s) (&s->dl_if)
#define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)

struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;
	u_int32_t filt_skip;
	u_int32_t filt_flags;
	ifnet_t filt_ifp;
	const char *filt_name;
	void *filt_cookie;
	protocol_family_t filt_protocol;
	iff_input_func filt_input;
	iff_output_func filt_output;
	iff_event_func filt_event;
	iff_ioctl_func filt_ioctl;
	iff_detached_func filt_detached;
};

/* Mbuf queue used for freeing the excessive mbufs */
typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;

struct proto_input_entry;

static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;

static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);

static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");

LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
    &dlil_lck_attributes);
static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
    &dlil_lck_attributes);

#if DEBUG
static unsigned int ifnet_debug = 1; /* debugging (enabled) */
#else
static unsigned int ifnet_debug; /* debugging (disabled) */
#endif /* !DEBUG */
static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
static struct zone *dlif_zone; /* zone for dlil_ifnet */
#define DLIF_ZONE_NAME "ifnet" /* zone name */

static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);

static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);

static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
#define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */

static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
#define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */

static u_int32_t net_rtref;

static struct dlil_main_threading_info dlil_main_input_thread_info;
__private_extern__ struct dlil_threading_info *dlil_main_input_thread =
    (struct dlil_threading_info *)&dlil_main_input_thread_info;

static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
static void dlil_if_trace(struct dlil_ifnet *, int);
static void if_proto_ref(struct if_proto *);
static void if_proto_free(struct if_proto *);
static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
    u_int32_t list_count);
static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
static void if_flt_monitor_busy(struct ifnet *);
static void if_flt_monitor_unbusy(struct ifnet *);
static void if_flt_monitor_enter(struct ifnet *);
static void if_flt_monitor_leave(struct ifnet *);
static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
    char **, protocol_family_t);
static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
    protocol_family_t);
static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
    const struct sockaddr_dl *);
static int ifnet_lookup(struct ifnet *);
static void if_purgeaddrs(struct ifnet *);

static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
    struct mbuf *, char *);
static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
    struct mbuf *);
static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
    mbuf_t *, const struct sockaddr *, void *, char *, char *);
static void ifproto_media_event(struct ifnet *, protocol_family_t,
    const struct kev_msg *);
static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
    unsigned long, void *);
static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
    struct sockaddr_dl *, size_t);
static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
    const struct sockaddr_dl *, const struct sockaddr *,
    const struct sockaddr_dl *, const struct sockaddr *);

static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp);
static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
    struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
    protocol_family_t *);
static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
    const struct ifnet_demux_desc *, u_int32_t);
static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
#if !XNU_TARGET_OS_OSX
static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
    const struct sockaddr *, const char *, const char *,
    u_int32_t *, u_int32_t *);
#else /* XNU_TARGET_OS_OSX */
static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
    const struct sockaddr *, const char *, const char *);
#endif /* XNU_TARGET_OS_OSX */
static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
    const struct sockaddr *, const char *, const char *,
    u_int32_t *, u_int32_t *);
static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
static void ifp_if_free(struct ifnet *);
static void ifp_if_event(struct ifnet *, const struct kev_msg *);
static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);

static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
    dlil_freeq_t *, struct ifnet_stat_increment_param *);

static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
    struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
    boolean_t, struct thread *);
static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
    struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
    boolean_t, struct thread *);

static void dlil_main_input_thread_func(void *, wait_result_t);
static void dlil_main_input_thread_cont(void *, wait_result_t);

static void dlil_input_thread_func(void *, wait_result_t);
static void dlil_input_thread_cont(void *, wait_result_t);

static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);

static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
    thread_continue_t *);
static void dlil_terminate_input_thread(struct dlil_threading_info *);
static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
    struct dlil_threading_info *, struct ifnet *, boolean_t);
static boolean_t dlil_input_stats_sync(struct ifnet *,
    struct dlil_threading_info *);
static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
    u_int32_t, ifnet_model_t, boolean_t);
static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
    const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
static int dlil_is_clat_needed(protocol_family_t, mbuf_t);
static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
#if DEBUG || DEVELOPMENT
static void dlil_verify_sum16(void);
#endif /* DEBUG || DEVELOPMENT */
static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
    protocol_family_t);
static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
    protocol_family_t);

static void dlil_incr_pending_thread_count(void);
static void dlil_decr_pending_thread_count(void);

static void ifnet_detacher_thread_func(void *, wait_result_t);
static void ifnet_detacher_thread_cont(void *, wait_result_t);
static void ifnet_detach_final(struct ifnet *);
static void ifnet_detaching_enqueue(struct ifnet *);
static struct ifnet *ifnet_detaching_dequeue(void);

static void ifnet_start_thread_func(void *, wait_result_t);
static void ifnet_start_thread_cont(void *, wait_result_t);

static void ifnet_poll_thread_func(void *, wait_result_t);
static void ifnet_poll_thread_cont(void *, wait_result_t);

static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
    classq_pkt_t *, boolean_t, boolean_t *);

static void ifp_src_route_copyout(struct ifnet *, struct route *);
static void ifp_src_route_copyin(struct ifnet *, struct route *);
static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);

static errno_t if_mcasts_update_async(struct ifnet *);

static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
static int sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS;
static int sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS;
static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;

struct chain_len_stats tx_chain_len_stats;
static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;

#if TEST_INPUT_THREAD_TERMINATION
static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
#endif /* TEST_INPUT_THREAD_TERMINATION */

/* The following are protected by dlil_ifnet_lock */
static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
static u_int32_t ifnet_detaching_cnt;
static boolean_t ifnet_detaching_embryonic;
static void *ifnet_delayed_run; /* wait channel for detaching thread */

static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
    &dlil_lck_attributes);

static uint32_t ifnet_flowhash_seed;

struct ifnet_flowhash_key {
	char ifk_name[IFNAMSIZ];
	uint32_t ifk_unit;
	uint32_t ifk_flags;
	uint32_t ifk_eflags;
	uint32_t ifk_capabilities;
	uint32_t ifk_capenable;
	uint32_t ifk_output_sched_model;
	uint32_t ifk_rand1;
	uint32_t ifk_rand2;
};
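
/*
 * A hedged sketch of how this key is consumed (the real logic lives in
 * ifnet_calc_flowhash(), declared below): the ifk_* fields are filled
 * from the ifnet and the whole key is hashed with the lazily
 * initialized seed:
 *
 *	struct ifnet_flowhash_key fh;
 *	bzero(&fh, sizeof (fh));
 *	bcopy(ifp->if_name, fh.ifk_name, sizeof (fh.ifk_name));
 *	... fill in the remaining ifk_* fields from ifp ...
 *	hash = net_flowhash(&fh, sizeof (fh), ifnet_flowhash_seed);
 */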

/* Flow control entry per interface */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;
	u_int32_t ifce_flowhash;
	struct ifnet *ifce_ifp;
};

static uint32_t ifnet_calc_flowhash(struct ifnet *);
static int ifce_cmp(const struct ifnet_fc_entry *,
    const struct ifnet_fc_entry *);
static int ifnet_fc_add(struct ifnet *);
static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
static void ifnet_fc_entry_free(struct ifnet_fc_entry *);

/* protected by ifnet_fc_lock */
RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);

static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);

extern void bpfdetach(struct ifnet *);
extern void proto_input_run(void);

extern uint32_t udp_count_opportunistic(unsigned int ifindex,
    u_int32_t flags);
extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
    u_int32_t flags);

__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);

#if CONFIG_MACF
#if !XNU_TARGET_OS_OSX
int dlil_lladdr_ckreq = 1;
#else /* XNU_TARGET_OS_OSX */
int dlil_lladdr_ckreq = 0;
#endif /* XNU_TARGET_OS_OSX */
#endif /* CONFIG_MACF */

#if DEBUG
int dlil_verbose = 1;
#else
int dlil_verbose = 0;
#endif /* DEBUG */
#if IFNET_INPUT_SANITY_CHK
/* sanity checking of input packet lists received */
static u_int32_t dlil_input_sanity_check = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
/* rate limit debug messages */
struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };

SYSCTL_DECL(_net_link_generic_system);

SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");

#define IF_SNDQ_MINLEN 32
u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
    sysctl_sndq_maxlen, "I", "Default transmit queue max length");

#define IF_RCVQ_MINLEN 32
#define IF_RCVQ_MAXLEN 256
u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
    sysctl_rcvq_maxlen, "I", "Default receive queue max length");

/*
 * Protect against possible memory starvation that may happen
 * when the driver is pushing data faster than the AP can process.
 *
 * If at any point during DLIL input phase any of the input queues
 * exceeds the burst limit, DLIL will start to trim the queue,
 * by returning mbufs in the input queue to the cache from which
 * the mbufs were originally allocated, starting from the oldest
 * mbuf and continuing until the new limit (see below) is reached.
 *
 * In order to avoid a steplocked equilibrium, the trimming
 * will continue PAST the burst limit, until the corresponding
 * input queue is reduced to `if_rcvq_trim_pct' %.
 *
 * For example, if the input queue limit is 1024 packets,
 * and the trim percentage (`if_rcvq_trim_pct') is 80 %,
 * the trimming will continue until the queue contains 819 packets
 * (1024 * 80 / 100 == 819).
 *
 * Setting the burst limit too low can hurt the throughput,
 * while setting the burst limit too high can defeat the purpose.
 */
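
/*
 * In code form, the post-trim target from the example above is simply
 * (a sketch; the actual trimming is performed by
 * dlil_trim_overcomitted_queue_locked(), declared earlier):
 *
 *	uint32_t limit = 1024;
 *	uint32_t target = (limit * if_rcvq_trim_pct) / 100;	// 819
 */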
#define IF_RCVQ_BURST_LIMIT_MIN 1024
#define IF_RCVQ_BURST_LIMIT_DEFAULT 8192
#define IF_RCVQ_BURST_LIMIT_MAX 32768
uint32_t if_rcvq_burst_limit = IF_RCVQ_BURST_LIMIT_DEFAULT;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_burst_limit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_burst_limit, IF_RCVQ_BURST_LIMIT_DEFAULT,
    sysctl_rcvq_burst_limit, "I", "Upper memory limit for inbound data");

#define IF_RCVQ_TRIM_PCT_MIN 20
#define IF_RCVQ_TRIM_PCT_DEFAULT 80
#define IF_RCVQ_TRIM_PCT_MAX 100
uint32_t if_rcvq_trim_pct = IF_RCVQ_TRIM_PCT_DEFAULT;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_trim_pct,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_trim_pct, IF_RCVQ_TRIM_PCT_DEFAULT,
    sysctl_rcvq_trim_pct, "I",
    "Percentage (0 - 100) of the queue limit to keep after detecting an overflow burst");

#define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */
u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
    "ilog2 of EWMA decay rate of avg inbound packets");

#define IF_RXPOLL_MODE_HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10 ms */
#define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
    IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
    "Q", "input poll mode freeze time");

#define IF_RXPOLL_SAMPLETIME_MIN (1ULL * 1000 * 1000) /* 1 ms */
#define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */
static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
    IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
    "Q", "input poll sampling time");

static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
    CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
    IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
    "Q", "input poll interval (time)");

#define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */
u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
    IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");

#define IF_RXPOLL_WLOWAT 10
static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
    IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
    "I", "input poll wakeup low watermark");

#define IF_RXPOLL_WHIWAT 100
static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
    IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
    "I", "input poll wakeup high watermark");

static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
    "max packets per poll call");

u_int32_t if_rxpoll = 1;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
    sysctl_rxpoll, "I", "enable opportunistic input polling");

#if TEST_INPUT_THREAD_TERMINATION
static u_int32_t if_input_thread_termination_spin = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &if_input_thread_termination_spin, 0,
    sysctl_input_thread_termination_spin,
    "I", "input thread termination spin limit");
#endif /* TEST_INPUT_THREAD_TERMINATION */

static u_int32_t cur_dlil_input_threads = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
    CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
    "Current number of DLIL input threads");

#if IFNET_INPUT_SANITY_CHK
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
    "Turn on sanity checking in DLIL input");
#endif /* IFNET_INPUT_SANITY_CHK */

static u_int32_t if_flowadv = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
    "enable flow-advisory mechanism");

static u_int32_t if_delaybased_queue = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
    CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
    "enable delay based dynamic queue sizing");

static uint64_t hwcksum_in_invalidated = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");

uint32_t hwcksum_dbg = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
    "enable hardware cksum debugging");

u_int32_t ifnet_start_delayed = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
    "number of times start was delayed");

u_int32_t ifnet_delay_start_disabled = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
    "non-zero to disable the delayed start mechanism");

static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}

#define HWCKSUM_DBG_PARTIAL_FORCED 0x1 /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ 0x2 /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED 0x10 /* forced finalize */
#define HWCKSUM_DBG_MASK \
	(HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ | \
	HWCKSUM_DBG_FINALIZE_FORCED)

static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

static uint32_t threshold_notify = 1; /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2; /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1; /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);

void
ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
{
	/*
	 * Update the filter count and the route_generation ID to let
	 * TCP know it should reevaluate whether or not to do TSO.
	 */
	if (filter_enable) {
		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
	} else {
		VERIFY(ifp->if_flt_no_tso_count != 0);
		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
	}
	routegenid_update();
}
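
/*
 * A sketch of the intended pairing (assumed from the counting logic
 * above): a filter that is incompatible with TSO bumps the count on
 * attach and drops it on detach, so TSO is reconsidered only once the
 * last such filter is gone:
 *
 *	ifnet_filter_update_tso(ifp, TRUE);	// filter attached
 *	...
 *	ifnet_filter_update_tso(ifp, FALSE);	// filter detached
 */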

#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;

#if (DEVELOPMENT || DEBUG)
static int
if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
	    &new_value, &changed);
	if (error) {
		return error;
	}
	if (changed) {
		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
			return ENOTSUP;
		}
		if_attach_nx = new_value;
	}
	return 0;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */

static int
if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
	    sizeof(if_enable_fsw_transport_netagent),
	    &new_value, &changed);
	if (error == 0 && changed != 0) {
		if (new_value != 0 && new_value != 1) {
			/* only allow 0 or 1 */
			error = EINVAL;
		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
			/* netagent can be enabled/disabled */
			if_enable_fsw_transport_netagent = new_value;
			if (new_value == 0) {
				kern_nexus_deregister_netagents();
			} else {
				kern_nexus_register_netagents();
			}
		} else {
			/* netagent can't be enabled */
			error = ENOTSUP;
		}
	}
	return error;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>

boolean_t
ifnet_nx_noauto(ifnet_t ifp)
{
	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
}

boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}

boolean_t
ifnet_is_low_latency(ifnet_t ifp)
{
	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
}

boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}

boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
{
	if (if_is_fsw_transport_netagent_enabled()) {
		/* check if netagent has been manually enabled for ipsec/utun */
		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
			return ipsec_interface_needs_netagent(ifp);
		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
			return utun_interface_needs_netagent(ifp);
		}

		/* check ifnet no auto nexus override */
		if (ifnet_nx_noauto(ifp)) {
			return FALSE;
		}

		/* check global if_attach_nx configuration */
		switch (ifp->if_family) {
		case IFNET_FAMILY_CELLULAR:
		case IFNET_FAMILY_ETHERNET:
			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
				return TRUE;
			}
			break;
		default:
			break;
		}
	}
	return FALSE;
}

boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
{
#pragma unused(ifp)
	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
		return TRUE;
	}
	return FALSE;
}

boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)
{
#pragma unused(ifp)
	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
}

static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t err;

	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}

static boolean_t
dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
    uuid_t device)
{
	boolean_t detached = FALSE;
	nexus_controller_t controller = kern_nexus_shared_controller();
	int err;

	if (dlil_detach_nexus_instance(controller, func_str, instance,
	    device)) {
		detached = TRUE;
	}
	if (provider != NULL && !uuid_is_null(provider)) {
		detached = TRUE;
		err = kern_nexus_controller_deregister_provider(controller,
		    provider);
		if (err != 0) {
			DLIL_PRINTF("%s deregister_provider %d\n",
			    func_str, err);
		}
	}
	return detached;
}

static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t dom_prov;
	errno_t err;
	nexus_name_t provider_name;
	const char *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
failed:
	return err;
}

static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create failed for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}

static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
		goto failed;
	}
	switch (ifp->if_type) {
	case IFT_CELLULAR:
	case IFT_ETHER:
		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
			/* don't auto-attach */
			goto failed;
		}
		break;
	default:
		/* don't auto-attach */
		goto failed;
	}
	return dlil_attach_netif_nexus_common(ifp, netif_nx);

failed:
	return FALSE;
}

static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)
{
	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
}

__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}

static inline int
dlil_siocgifdevmtu(struct ifnet *ifp, struct ifdevmtu *ifdm_p)
{
	struct ifreq ifr;
	int error;

	bzero(&ifr, sizeof(ifr));
	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
	if (error == 0) {
		*ifdm_p = ifr.ifr_devmtu;
	}
	return error;
}

static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
#ifdef XNU_TARGET_OS_OSX
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
#else
#pragma unused(ifp, large_buf_size)
#endif /* XNU_TARGET_OS_OSX */
}

static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
    bool *use_multi_buflet, uint32_t *large_buf_size)
{
	struct kern_pbufpool_memory_info rx_pp_info;
	struct kern_pbufpool_memory_info tx_pp_info;
	uint32_t if_max_mtu = 0;
	uint32_t drv_buf_size;
	struct ifdevmtu ifdm;
	int err;

	/*
	 * To perform intra-stack RX aggregation, the flowswitch needs to
	 * use multi-buflet packets.
	 */
	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();

	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
	/*
	 * An IP over Thunderbolt interface can deliver the largest IP
	 * packet, but the driver advertises the MAX MTU as only 9K.
	 */
	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
		if_max_mtu = IP_MAXPACKET;
		goto skip_mtu_ioctl;
	}

	/* determine max mtu */
	bzero(&ifdm, sizeof(ifdm));
	err = dlil_siocgifdevmtu(ifp, &ifdm);
	if (__improbable(err != 0)) {
		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
		    __func__, if_name(ifp));
		/* use default flowswitch buffer size */
		if_max_mtu = NX_FSW_BUFSIZE;
	} else {
		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
		    ifdm.ifdm_max, ifdm.ifdm_current);
		/* rdar://problem/44589731 */
		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
	}

skip_mtu_ioctl:
	if (if_max_mtu == 0) {
		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
		    __func__, if_name(ifp));
		return EINVAL;
	}
	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
		DLIL_PRINTF("%s: interface (%s) has MAX MTU (%u) > flowswitch "
		    "max bufsize(%d)\n", __func__,
		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
		return EINVAL;
	}

	/*
	 * For a skywalk native driver, consult the driver packet pool as well.
	 */
	if (dlil_is_native_netif_nexus(ifp)) {
		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
		    &tx_pp_info);
		if (err != 0) {
			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
			    __func__, if_name(ifp));
			return ENXIO;
		}
		drv_buf_size = tx_pp_info.kpm_bufsize *
		    tx_pp_info.kpm_max_frags;
		if (if_max_mtu > drv_buf_size) {
			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
			    if_name(ifp), rx_pp_info.kpm_bufsize,
			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
			    tx_pp_info.kpm_max_frags, if_max_mtu);
			return EINVAL;
		}
	} else {
		drv_buf_size = if_max_mtu;
	}

	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
		*use_multi_buflet = true;
		/* default flowswitch buffer size */
		*buf_size = NX_FSW_BUFSIZE;
		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
	} else {
		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
	}
	_dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
	ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
	if (*buf_size >= *large_buf_size) {
		*large_buf_size = 0;
	}
	return 0;
}

static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint32_t buf_size = 0;
	uint32_t large_buf_size = 0;
	bool multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create failed for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}

static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t attached;
	if_nexus_flowswitch nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interfaces using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	if (!uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance)) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}

__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}

__attribute__((noinline))
static void
dlil_netif_detach_notify(ifnet_t ifp)
{
	ifnet_detach_notify_cb_t notify = NULL;
	void *arg = NULL;

	ifnet_get_detach_notify(ifp, &notify, &arg);
	if (notify == NULL) {
		DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
		return;
	}
	(*notify)(arg);
}

__attribute__((noinline))
static void
dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
{
	if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
	if_nexus_netif *nx_netif = &ifp->if_nx_netif;

	ifnet_datamov_suspend_and_drain(ifp);
	if (!uuid_is_null(nx_fsw->if_fsw_device)) {
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
		dlil_detach_flowswitch_nexus(nx_fsw);
		bzero(nx_fsw, sizeof(*nx_fsw));
	} else {
		ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
		ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
		DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
	}

	if (!uuid_is_null(nx_netif->if_nif_attach)) {
		ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
		dlil_detach_netif_nexus(nx_netif);
		bzero(nx_netif, sizeof(*nx_netif));
	} else {
		ASSERT(uuid_is_null(nx_netif->if_nif_provider));
		ASSERT(uuid_is_null(nx_netif->if_nif_instance));
		DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
	}
	ifnet_datamov_resume(ifp);
}
1603
boolean_t
ifnet_add_netagent(ifnet_t ifp)
{
	int error;

	error = kern_nexus_interface_add_netagent(ifp);
	os_log(OS_LOG_DEFAULT,
	    "kern_nexus_interface_add_netagent(%s) returned %d",
	    ifp->if_xname, error);
	return error == 0;
}

boolean_t
ifnet_remove_netagent(ifnet_t ifp)
{
	int error;

	error = kern_nexus_interface_remove_netagent(ifp);
	os_log(OS_LOG_DEFAULT,
	    "kern_nexus_interface_remove_netagent(%s) returned %d",
	    ifp->if_xname, error);
	return error == 0;
}

boolean_t
ifnet_attach_flowswitch_nexus(ifnet_t ifp)
{
	if (!IF_FULLY_ATTACHED(ifp)) {
		return FALSE;
	}
	return dlil_attach_flowswitch_nexus(ifp);
}

boolean_t
ifnet_detach_flowswitch_nexus(ifnet_t ifp)
{
	if_nexus_flowswitch nexus_fsw;

	ifnet_lock_exclusive(ifp);
	nexus_fsw = ifp->if_nx_flowswitch;
	bzero(&ifp->if_nx_flowswitch, sizeof(ifp->if_nx_flowswitch));
	ifnet_lock_done(ifp);
	return dlil_detach_nexus(__func__, nexus_fsw.if_fsw_provider,
	    nexus_fsw.if_fsw_instance, nexus_fsw.if_fsw_device);
}

boolean_t
ifnet_attach_netif_nexus(ifnet_t ifp)
{
	boolean_t nexus_attached;
	if_nexus_netif nexus_netif;

	if (!IF_FULLY_ATTACHED(ifp)) {
		return FALSE;
	}
	nexus_attached = dlil_attach_netif_nexus_common(ifp, &nexus_netif);
	if (nexus_attached) {
		ifnet_lock_exclusive(ifp);
		ifp->if_nx_netif = nexus_netif;
		ifnet_lock_done(ifp);
	}
	return nexus_attached;
}

boolean_t
ifnet_detach_netif_nexus(ifnet_t ifp)
{
	if_nexus_netif nexus_netif;

	ifnet_lock_exclusive(ifp);
	nexus_netif = ifp->if_nx_netif;
	bzero(&ifp->if_nx_netif, sizeof(ifp->if_nx_netif));
	ifnet_lock_done(ifp);

	return dlil_detach_nexus(__func__, nexus_netif.if_nif_provider,
	    nexus_netif.if_nif_instance, nexus_netif.if_nif_attach);
}

void
ifnet_attach_native_flowswitch(ifnet_t ifp)
{
	if (!dlil_is_native_netif_nexus(ifp)) {
		/* not a native netif */
		return;
	}
	ifnet_attach_flowswitch_nexus(ifp);
}

int
ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	while (ifp->if_fsw_rx_cb_ref > 0) {
		DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
		(void) msleep(&ifp->if_fsw_rx_cb_ref, &ifp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
	}
	ifp->if_fsw_rx_cb = cb;
	ifp->if_fsw_rx_cb_arg = arg;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}

int
ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
{
	/*
	 * Check without the lock first to avoid an unnecessary lock
	 * acquisition on interfaces not used by a redirect interface;
	 * the check is repeated under the lock before use.
	 */
	if (ifp->if_fsw_rx_cb == NULL) {
		return ENOENT;
	}
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (ifp->if_fsw_rx_cb == NULL) {
		lck_mtx_unlock(&ifp->if_delegate_lock);
		return ENOENT;
	}
	*cbp = ifp->if_fsw_rx_cb;
	*argp = ifp->if_fsw_rx_cb_arg;
	ifp->if_fsw_rx_cb_ref++;
	lck_mtx_unlock(&ifp->if_delegate_lock);
	return 0;
}

void
ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
{
	lck_mtx_lock(&ifp->if_delegate_lock);
	if (--ifp->if_fsw_rx_cb_ref == 0) {
		wakeup(&ifp->if_fsw_rx_cb_ref);
	}
	lck_mtx_unlock(&ifp->if_delegate_lock);
}
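
/*
 * Usage sketch (illustrative, not part of the original flow): callers
 * of ifnet_get_flowswitch_rx_callback() hold a reference that blocks
 * ifnet_set_flowswitch_rx_callback() until released, e.g.:
 *
 *	ifnet_fsw_rx_cb_t cb;
 *	void *arg;
 *	if (ifnet_get_flowswitch_rx_callback(ifp, &cb, &arg) == 0) {
 *		... invoke cb with arg ...
 *		ifnet_release_flowswitch_rx_callback(ifp);
 *	}
 */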

int
ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	while (difp->if_delegate_parent_ref > 0) {
		DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
		(void) msleep(&difp->if_delegate_parent_ref, &difp->if_delegate_lock,
		    (PZERO + 1), __FUNCTION__, NULL);
		DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
	}
	difp->if_delegate_parent = parent;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}

int
ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (difp->if_delegate_parent == NULL) {
		lck_mtx_unlock(&difp->if_delegate_lock);
		return ENOENT;
	}
	*parentp = difp->if_delegate_parent;
	difp->if_delegate_parent_ref++;
	lck_mtx_unlock(&difp->if_delegate_lock);
	return 0;
}

void
ifnet_release_delegate_parent(ifnet_t difp)
{
	lck_mtx_lock(&difp->if_delegate_lock);
	if (--difp->if_delegate_parent_ref == 0) {
		wakeup(&difp->if_delegate_parent_ref);
	}
	lck_mtx_unlock(&difp->if_delegate_lock);
}

__attribute__((noinline))
void
ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	ifp->if_detach_notify = notify;
	ifp->if_detach_notify_arg = arg;
}

__attribute__((noinline))
void
ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
	*notifyp = ifp->if_detach_notify;
	*argp = ifp->if_detach_notify_arg;
}

__attribute__((noinline))
void
ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
{
	ifnet_lock_exclusive(ifp);
	ifnet_set_detach_notify_locked(ifp, notify, arg);
	ifnet_lock_done(ifp);
}

__attribute__((noinline))
void
ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
{
	ifnet_lock_exclusive(ifp);
	ifnet_get_detach_notify_locked(ifp, notifyp, argp);
	ifnet_lock_done(ifp);
}
#endif /* SKYWALK */

#define DLIL_INPUT_CHECK(m, ifp) {					\
	struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m);			\
	if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) ||	\
	    !(mbuf_flags(m) & MBUF_PKTHDR)) {				\
		panic_plain("%s: invalid mbuf %p\n", __func__, m);	\
		/* NOTREACHED */					\
	}								\
}

#define DLIL_EWMA(old, new, decay) do {					\
	u_int32_t _avg;							\
	if ((_avg = (old)) > 0)						\
		_avg = (((_avg << (decay)) - _avg) + (new)) >> (decay);	\
	else								\
		_avg = (new);						\
	(old) = _avg;							\
} while (0)
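
/*
 * Illustrative expansion of DLIL_EWMA: with a nonzero prior average,
 * the update computed above is
 *
 *	avg' = ((avg << decay) - avg + new) >> decay
 *	     = avg * (2^decay - 1) / 2^decay + new / 2^decay
 *
 * e.g. with decay = 2, avg = 100 and new = 20:
 *	avg' = (400 - 100 + 20) >> 2 = 80
 * i.e. a weight of 3/4 on the history and 1/4 on the new sample.
 */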

#define MBPS	(1ULL * 1000 * 1000)
#define GBPS	(MBPS * 1000)

struct rxpoll_time_tbl {
	u_int64_t speed;	/* downlink speed */
	u_int32_t plowat;	/* packets low watermark */
	u_int32_t phiwat;	/* packets high watermark */
	u_int32_t blowat;	/* bytes low watermark */
	u_int32_t bhiwat;	/* bytes high watermark */
};

static struct rxpoll_time_tbl rxpoll_tbl[] = {
	{ .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
	{ .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
	{ .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
};
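
/*
 * Sketch of how a lookup over this table might work (assumed from its
 * layout; the actual consumer lives elsewhere in this file): scan the
 * rows in increasing speed order and take the first row whose speed
 * covers the link rate, falling back to the last non-sentinel row.
 *
 *	static const struct rxpoll_time_tbl *
 *	rxpoll_tbl_lookup(u_int64_t rate)	// hypothetical helper
 *	{
 *		int i;
 *		for (i = 0; rxpoll_tbl[i + 1].speed != 0 &&
 *		    rate > rxpoll_tbl[i].speed; i++)
 *			;
 *		return &rxpoll_tbl[i];
 *	}
 */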

static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
    &dlil_lck_attributes);
static uint32_t dlil_pending_thread_cnt = 0;

static void
dlil_incr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	dlil_pending_thread_cnt++;
	lck_mtx_unlock(&dlil_thread_sync_lock);
}

static void
dlil_decr_pending_thread_count(void)
{
	LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
	lck_mtx_lock(&dlil_thread_sync_lock);
	VERIFY(dlil_pending_thread_cnt > 0);
	dlil_pending_thread_cnt--;
	if (dlil_pending_thread_cnt == 0) {
		wakeup(&dlil_pending_thread_cnt);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
}

int
proto_hash_value(u_int32_t protocol_family)
{
	/*
	 * dlil_proto_unplumb_all() depends on the mapping between
	 * the hash bucket index and the protocol family defined
	 * here; future changes must be applied there as well.
	 */
	switch (protocol_family) {
	case PF_INET:
		return 0;
	case PF_INET6:
		return 1;
	case PF_VLAN:
		return 2;
	case PF_UNSPEC:
	default:
		return 3;
	}
}
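
/*
 * For illustration, the resulting bucket assignment is:
 *
 *	PF_INET  -> 0		PF_VLAN    -> 2
 *	PF_INET6 -> 1		all others -> 3
 *
 * which implies PROTO_HASH_SLOTS is at least 4.
 */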

/*
 * Caller must already be holding ifnet lock.
 */
static struct if_proto *
find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
{
	struct if_proto *proto = NULL;
	u_int32_t i = proto_hash_value(protocol_family);

	ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);

	if (ifp->if_proto_hash != NULL) {
		proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
	}

	while (proto != NULL && proto->protocol_family != protocol_family) {
		proto = SLIST_NEXT(proto, next_hash);
	}

	if (proto != NULL) {
		if_proto_ref(proto);
	}

	return proto;
}
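
/*
 * Typical caller pattern (illustrative): the returned proto carries a
 * reference taken via if_proto_ref() that the caller must drop.
 *
 *	ifnet_lock_shared(ifp);
 *	proto = find_attached_proto(ifp, PF_INET);
 *	ifnet_lock_done(ifp);
 *	if (proto != NULL) {
 *		... use proto ...
 *		if_proto_free(proto);	// drops the reference
 *	}
 */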

static void
if_proto_ref(struct if_proto *proto)
{
	os_atomic_inc(&proto->refcount, relaxed);
}

extern void if_rtproto_del(struct ifnet *ifp, int protocol);

static void
if_proto_free(struct if_proto *proto)
{
	u_int32_t oldval;
	struct ifnet *ifp = proto->ifp;
	u_int32_t proto_family = proto->protocol_family;
	struct kev_dl_proto_data ev_pr_data;

	oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
	if (oldval > 1) {
		return;
	}

	if (proto->proto_kpi == kProtoKPI_v1) {
		if (proto->kpi.v1.detached) {
			proto->kpi.v1.detached(ifp, proto->protocol_family);
		}
	}
	if (proto->proto_kpi == kProtoKPI_v2) {
		if (proto->kpi.v2.detached) {
			proto->kpi.v2.detached(ifp, proto->protocol_family);
		}
	}

	/*
	 * Clean up routes that may still be in the routing table for that
	 * interface/protocol pair.
	 */
	if_rtproto_del(ifp, proto_family);

	ifnet_lock_shared(ifp);

	/* No more references on this; the protocol must have been detached */
	VERIFY(proto->detached);

	/*
	 * The reserved field carries the number of protocols still
	 * attached (subject to change).
	 */
	ev_pr_data.proto_family = proto_family;
	ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, 0);

	ifnet_lock_done(ifp);

	dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
	    (struct net_event_data *)&ev_pr_data,
	    sizeof(struct kev_dl_proto_data), FALSE);

	if (ev_pr_data.proto_remaining_count == 0) {
		/*
		 * The protocol count has gone to zero, mark the interface down.
		 * This used to be done by configd.KernelEventMonitor, but that
		 * is inherently prone to races (rdar://problem/30810208).
		 */
		(void) ifnet_set_flags(ifp, 0, IFF_UP);
		(void) ifnet_ioctl(ifp, 0, SIOCSIFFLAGS, NULL);
		dlil_post_sifflags_msg(ifp);
	}

	zfree(dlif_proto_zone, proto);
}

__private_extern__ void
ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
{
#if !MACH_ASSERT
#pragma unused(ifp)
#endif
	unsigned int type = 0;
	int ass = 1;

	switch (what) {
	case IFNET_LCK_ASSERT_EXCLUSIVE:
		type = LCK_RW_ASSERT_EXCLUSIVE;
		break;

	case IFNET_LCK_ASSERT_SHARED:
		type = LCK_RW_ASSERT_SHARED;
		break;

	case IFNET_LCK_ASSERT_OWNED:
		type = LCK_RW_ASSERT_HELD;
		break;

	case IFNET_LCK_ASSERT_NOTOWNED:
		/* nothing to do here for RW lock; bypass assert */
		ass = 0;
		break;

	default:
		panic("bad ifnet assert type: %d", what);
		/* NOTREACHED */
	}
	if (ass) {
		LCK_RW_ASSERT(&ifp->if_lock, type);
	}
}

__private_extern__ void
ifnet_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_lock);
}

__private_extern__ void
ifnet_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_lock);
}

__private_extern__ void
ifnet_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_lock);
}

#if INET
__private_extern__ void
if_inetdata_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inetdata_lock);
}

__private_extern__ void
if_inetdata_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inetdata_lock);
}

__private_extern__ void
if_inetdata_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inetdata_lock);
}
#endif /* INET */

__private_extern__ void
if_inet6data_lock_shared(struct ifnet *ifp)
{
	lck_rw_lock_shared(&ifp->if_inet6data_lock);
}

__private_extern__ void
if_inet6data_lock_exclusive(struct ifnet *ifp)
{
	lck_rw_lock_exclusive(&ifp->if_inet6data_lock);
}

__private_extern__ void
if_inet6data_lock_done(struct ifnet *ifp)
{
	lck_rw_done(&ifp->if_inet6data_lock);
}

__private_extern__ void
ifnet_head_lock_shared(void)
{
	lck_rw_lock_shared(&ifnet_head_lock);
}

__private_extern__ void
ifnet_head_lock_exclusive(void)
{
	lck_rw_lock_exclusive(&ifnet_head_lock);
}

__private_extern__ void
ifnet_head_done(void)
{
	lck_rw_done(&ifnet_head_lock);
}

__private_extern__ void
ifnet_head_assert_exclusive(void)
{
	LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
}

2113/*
2114 * dlil_ifp_protolist
2115 * - get the list of protocols attached to the interface, or just the number
2116 * of attached protocols
2117 * - if the number returned is greater than 'list_count', truncation occurred
2118 *
2119 * Note:
2120 * - caller must already be holding ifnet lock.
2121 */
2122static u_int32_t
2123dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
2124 u_int32_t list_count)
2125{
2126 u_int32_t count = 0;
2127 int i;
2128
2129 ifnet_lock_assert(ifp, what: IFNET_LCK_ASSERT_OWNED);
2130
2131 if (ifp->if_proto_hash == NULL) {
2132 goto done;
2133 }
2134
2135 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
2136 struct if_proto *proto;
2137 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
2138 if (list != NULL && count < list_count) {
2139 list[count] = proto->protocol_family;
2140 }
2141 count++;
2142 }
2143 }
2144done:
2145 return count;
2146}
2147
2148__private_extern__ u_int32_t
2149if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
2150{
2151 ifnet_lock_shared(ifp);
2152 count = dlil_ifp_protolist(ifp, list: protolist, list_count: count);
2153 ifnet_lock_done(ifp);
2154 return count;
2155}
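
/*
 * Illustrative two-call pattern for if_get_protolist(): size first,
 * then fill (the kalloc_data allocation shown here is an assumption,
 * chosen to pair with the kfree_data_addr in if_free_protolist below).
 * Note the second call may report more entries than were stored if
 * protocols attached in between.
 *
 *	u_int32_t n = if_get_protolist(ifp, NULL, 0);
 *	u_int32_t *list = kalloc_data(n * sizeof(*list), Z_WAITOK);
 *	n = if_get_protolist(ifp, list, n);
 *	...
 *	if_free_protolist(list);
 */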

__private_extern__ void
if_free_protolist(u_int32_t *list)
{
	kfree_data_addr(list);
}

__private_extern__ int
dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
    u_int32_t event_code, struct net_event_data *event_data,
    u_int32_t event_data_len, boolean_t suppress_generation)
{
	struct net_event_data ev_data;
	struct kev_msg ev_msg;

	bzero(&ev_msg, sizeof(ev_msg));
	bzero(&ev_data, sizeof(ev_data));
	/*
	 * A net event always starts with a net_event_data structure,
	 * but the caller can either generate a simple net event or
	 * provide a longer event structure to post.
	 */
	ev_msg.vendor_code = KEV_VENDOR_APPLE;
	ev_msg.kev_class = KEV_NETWORK_CLASS;
	ev_msg.kev_subclass = event_subclass;
	ev_msg.event_code = event_code;

	if (event_data == NULL) {
		event_data = &ev_data;
		event_data_len = sizeof(struct net_event_data);
	}

	strlcpy(&event_data->if_name[0], ifp->if_name, IFNAMSIZ);
	event_data->if_family = ifp->if_family;
	event_data->if_unit = (u_int32_t)ifp->if_unit;

	ev_msg.dv[0].data_length = event_data_len;
	ev_msg.dv[0].data_ptr = event_data;
	ev_msg.dv[1].data_length = 0;

	bool update_generation = true;
	if (event_subclass == KEV_DL_SUBCLASS) {
		/*
		 * Don't update the interface generation for frequent
		 * link-quality and state changes.
		 */
		switch (event_code) {
		case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
		case KEV_DL_RRC_STATE_CHANGED:
		case KEV_DL_PRIMARY_ELECTED:
			update_generation = false;
			break;
		default:
			break;
		}
	}

	/*
	 * Some events that would normally update the generation count may
	 * want to suppress it. One example is node presence/absence, where
	 * we still issue a kernel event for the invocation but want to
	 * avoid the expensive generation update, which triggers NECP
	 * client updates.
	 */
	if (suppress_generation) {
		update_generation = false;
	}

	return dlil_event_internal(ifp, &ev_msg, update_generation);
}
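
/*
 * Minimal usage sketch: posting a bare event with no payload beyond the
 * implied net_event_data header (the NULL/0 arguments), e.g.
 *
 *	(void) dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_ON,
 *	    NULL, 0, FALSE);
 *
 * dlil_post_msg() then fills in if_name/if_family/if_unit itself.
 */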

__private_extern__ int
dlil_alloc_local_stats(struct ifnet *ifp)
{
	int ret = EINVAL;
	void *buf, *base, **pbuf;

	if (ifp == NULL) {
		goto end;
	}

	if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
		/* allocate tcpstat_local structure */
		buf = zalloc_flags(dlif_tcpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
		    ((intptr_t)buf + dlif_tcpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_tcp_stat = base;
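
		/*
		 * Resulting buffer layout (illustrative; "pad" may be
		 * empty when buf is already aligned):
		 *
		 *	buf              base = P2ROUNDUP(buf + 8, 8)
		 *	|                |
		 *	v                v
		 *	+-----+----------+-----------------------+
		 *	| pad | saved    | 64-bit aligned stats  |
		 *	|     | buf ptr  | structure             |
		 *	+-----+----------+-----------------------+
		 *	      ^ pbuf = base - sizeof(void *)
		 *
		 * The free path below reads the original zone pointer
		 * back through pbuf.
		 */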

		/* allocate udpstat_local structure */
		buf = zalloc_flags(dlif_udpstat_zone,
		    Z_WAITOK | Z_ZERO | Z_NOFAIL);

		/* Get the 64-bit aligned base address for this object */
		base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
		    sizeof(u_int64_t));
		VERIFY(((intptr_t)base + dlif_udpstat_size) <=
		    ((intptr_t)buf + dlif_udpstat_bufsize));

		/*
		 * Wind back a pointer size from the aligned base and
		 * save the original address so we can free it later.
		 */
		pbuf = (void **)((intptr_t)base - sizeof(void *));
		*pbuf = buf;
		ifp->if_udp_stat = base;

		VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
		    IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));

		ret = 0;
	}

	if (ifp->if_ipv4_stat == NULL) {
		ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}

	if (ifp->if_ipv6_stat == NULL) {
		ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
	}
end:
	if (ifp != NULL && ret != 0) {
		if (ifp->if_tcp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
			zfree(dlif_tcpstat_zone, *pbuf);
			ifp->if_tcp_stat = NULL;
		}
		if (ifp->if_udp_stat != NULL) {
			pbuf = (void **)
			    ((intptr_t)ifp->if_udp_stat - sizeof(void *));
			zfree(dlif_udpstat_zone, *pbuf);
			ifp->if_udp_stat = NULL;
		}
		/* The kfree_type macro sets the passed pointer to NULL */
		if (ifp->if_ipv4_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
		}
		if (ifp->if_ipv6_stat != NULL) {
			kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
		}
	}

	return ret;
}

static void
dlil_reset_rxpoll_params(ifnet_t ifp)
{
	ASSERT(ifp != NULL);
	ifnet_set_poll_cycle(ifp, NULL);
	ifp->if_poll_update = 0;
	ifp->if_poll_flags = 0;
	ifp->if_poll_req = 0;
	ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
	bzero(&ifp->if_poll_tstats, sizeof(ifp->if_poll_tstats));
	bzero(&ifp->if_poll_pstats, sizeof(ifp->if_poll_pstats));
	bzero(&ifp->if_poll_sstats, sizeof(ifp->if_poll_sstats));
	net_timerclear(&ifp->if_poll_mode_holdtime);
	net_timerclear(&ifp->if_poll_mode_lasttime);
	net_timerclear(&ifp->if_poll_sample_holdtime);
	net_timerclear(&ifp->if_poll_sample_lasttime);
	net_timerclear(&ifp->if_poll_dbg_lasttime);
}

static int
dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
    thread_continue_t *thfunc)
{
	boolean_t dlil_rxpoll_input;
	thread_continue_t func = NULL;
	u_int32_t limit;
	int error = 0;

	dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
	    (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));

	/* default strategy utilizes the DLIL worker thread */
	inp->dlth_strategy = dlil_input_async;

	/* NULL ifp indicates the main input thread, called at dlil_init time */
	if (ifp == NULL) {
		/*
		 * Main input thread only.
		 */
		func = dlil_main_input_thread_func;
		VERIFY(inp == dlil_main_input_thread);
		(void) strlcat(inp->dlth_name,
		    "main_input", DLIL_THREADNAME_LEN);
	} else if (dlil_rxpoll_input) {
		/*
		 * Legacy (non-netif) hybrid polling.
		 */
		func = dlil_rxpoll_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input_poll", if_name(ifp));
	} else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
		/*
		 * Asynchronous strategy.
		 */
		func = dlil_input_thread_func;
		VERIFY(inp != dlil_main_input_thread);
		(void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
		    "%s_input", if_name(ifp));
	} else {
		/*
		 * Synchronous strategy if there's a netif below and
		 * the device isn't capable of hybrid polling.
		 */
		ASSERT(func == NULL);
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		VERIFY(inp != dlil_main_input_thread);
		ASSERT(!inp->dlth_affinity);
		inp->dlth_strategy = dlil_input_sync;
	}
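
	/*
	 * Summary of the selection above (illustrative):
	 *
	 *	ifp == NULL                          -> main input thread
	 *	IFEF_RXPOLL && IFXF_LEGACY && rxpoll -> legacy hybrid polling
	 *	net_async || IFXF_LEGACY             -> per-ifnet async thread
	 *	otherwise (native netif)             -> synchronous, no thread
	 *
	 * In the last case func stays NULL, and ENODEV is returned below
	 * to signal that no dedicated input thread was started.
	 */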
	VERIFY(inp->dlth_thread == THREAD_NULL);

	/* let caller know */
	if (thfunc != NULL) {
		*thfunc = func;
	}

	inp->dlth_lock_grp = lck_grp_alloc_init(inp->dlth_name, LCK_GRP_ATTR_NULL);
	lck_mtx_init(&inp->dlth_lock, inp->dlth_lock_grp, &dlil_lck_attributes);

	inp->dlth_ifp = ifp; /* NULL for main input thread */

	/*
	 * For interfaces that support opportunistic polling, set the
	 * low and high watermarks for outstanding inbound packets/bytes.
	 * Also define freeze times for transitioning between modes
	 * and updating the average.
	 */
	if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
		limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
		if (ifp->if_xflags & IFXF_LEGACY) {
			(void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
		}
	} else {
		/*
		 * For interfaces that don't support opportunistic
		 * polling, set the burst limit to prevent memory exhaustion.
		 * The values of `if_rcvq_burst_limit' are safeguarded
		 * on customer builds by `sysctl_rcvq_burst_limit'.
		 */
		limit = if_rcvq_burst_limit;
	}

	_qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
	if (inp == dlil_main_input_thread) {
		struct dlil_main_threading_info *inpm =
		    (struct dlil_main_threading_info *)inp;
		_qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
	}

	if (func == NULL) {
		ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
		ASSERT(error == 0);
		error = ENODEV;
		goto done;
	}

	error = kernel_thread_start(func, inp, &inp->dlth_thread);
	if (error == KERN_SUCCESS) {
		thread_precedence_policy_data_t info;
		__unused kern_return_t kret;

		bzero(&info, sizeof(info));
		info.importance = 0;
		kret = thread_policy_set(inp->dlth_thread,
		    THREAD_PRECEDENCE_POLICY, (thread_policy_t)&info,
		    THREAD_PRECEDENCE_POLICY_COUNT);
		ASSERT(kret == KERN_SUCCESS);
		/*
		 * We create an affinity set so that the matching workloop
		 * thread or the starter thread (for loopback) can be
		 * scheduled on the same processor set as the input thread.
		 */
		if (net_affinity) {
			struct thread *tp = inp->dlth_thread;
			u_int32_t tag;
			/*
			 * Randomize to reduce the probability
			 * of affinity tag namespace collision.
			 */
			read_frandom(&tag, sizeof(tag));
			if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
				thread_reference(tp);
				inp->dlth_affinity_tag = tag;
				inp->dlth_affinity = TRUE;
			}
		}
	} else if (inp == dlil_main_input_thread) {
		panic_plain("%s: couldn't create main input thread", __func__);
		/* NOTREACHED */
	} else {
		panic_plain("%s: couldn't create %s input thread", __func__,
		    if_name(ifp));
		/* NOTREACHED */
	}
	OSAddAtomic(1, &cur_dlil_input_threads);

done:
	return error;
}

#if TEST_INPUT_THREAD_TERMINATION
static int
sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
{
#pragma unused(arg1, arg2)
	uint32_t i;
	int err;

	i = if_input_thread_termination_spin;

	err = sysctl_handle_int(oidp, &i, 0, req);
	if (err != 0 || req->newptr == USER_ADDR_NULL) {
		return err;
	}

	if (net_rxpoll == 0) {
		return ENXIO;
	}

	if_input_thread_termination_spin = i;
	return err;
}
#endif /* TEST_INPUT_THREAD_TERMINATION */

static void
dlil_clean_threading_info(struct dlil_threading_info *inp)
{
	lck_mtx_destroy(&inp->dlth_lock, inp->dlth_lock_grp);
	lck_grp_free(inp->dlth_lock_grp);
	inp->dlth_lock_grp = NULL;

	inp->dlth_flags = 0;
	inp->dlth_wtot = 0;
	bzero(inp->dlth_name, sizeof(inp->dlth_name));
	inp->dlth_ifp = NULL;
	VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
	qlimit(&inp->dlth_pkts) = 0;
	bzero(&inp->dlth_stats, sizeof(inp->dlth_stats));

	VERIFY(!inp->dlth_affinity);
	inp->dlth_thread = THREAD_NULL;
	inp->dlth_strategy = NULL;
	VERIFY(inp->dlth_driver_thread == THREAD_NULL);
	VERIFY(inp->dlth_poller_thread == THREAD_NULL);
	VERIFY(inp->dlth_affinity_tag == 0);
#if IFNET_INPUT_SANITY_CHK
	inp->dlth_pkts_cnt = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
}

static void
dlil_terminate_input_thread(struct dlil_threading_info *inp)
{
	struct ifnet *ifp = inp->dlth_ifp;
	classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);

	VERIFY(current_thread() == inp->dlth_thread);
	VERIFY(inp != dlil_main_input_thread);

	OSAddAtomic(-1, &cur_dlil_input_threads);

#if TEST_INPUT_THREAD_TERMINATION
	{ /* do something useless that won't get optimized away */
		uint32_t v = 1;
		for (uint32_t i = 0;
		    i < if_input_thread_termination_spin;
		    i++) {
			v = (i + 1) * v;
		}
		DLIL_PRINTF("the value is %d\n", v);
	}
#endif /* TEST_INPUT_THREAD_TERMINATION */

	lck_mtx_lock_spin(&inp->dlth_lock);
	_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
	VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
	inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
	wakeup_one((caddr_t)&inp->dlth_flags);
	lck_mtx_unlock(&inp->dlth_lock);

	/* free up pending packets */
	if (pkt.cp_mbuf != NULL) {
		mbuf_freem_list(pkt.cp_mbuf);
	}

	/* for the extra refcnt from kernel_thread_start() */
	thread_deallocate(current_thread());

	if (dlil_verbose) {
		DLIL_PRINTF("%s: input thread terminated\n",
		    if_name(ifp));
	}

	/* this is the end */
	thread_terminate(current_thread());
	/* NOTREACHED */
}

static kern_return_t
dlil_affinity_set(struct thread *tp, u_int32_t tag)
{
	thread_affinity_policy_data_t policy;

	bzero(&policy, sizeof(policy));
	policy.affinity_tag = tag;
	return thread_policy_set(tp, THREAD_AFFINITY_POLICY,
	    (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
}

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
static void
dlil_filter_event(struct eventhandler_entry_arg arg __unused,
    enum net_filter_event_subsystems state)
{
	bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
	if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
		if_enable_fsw_transport_netagent = 1;
	} else {
		if_enable_fsw_transport_netagent = 0;
	}
	if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
		kern_nexus_update_netagents();
	} else if (!if_enable_fsw_transport_netagent) {
		necp_update_all_clients();
	}
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

void
dlil_init(void)
{
	thread_t thread = THREAD_NULL;

	/*
	 * The following fields must be 64-bit aligned for atomic operations.
	 */
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
	IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);

	/*
	 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
	 */
	_CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
	_CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
	_CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
	_CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
	_CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
	_CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
	_CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
	_CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
	_CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
	_CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
	_CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);

	/*
	 * ... as well as the mbuf checksum flags counterparts.
	 */
	_CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
	_CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
	_CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
	_CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
	_CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
	_CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
	_CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
	_CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
	_CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
	_CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
	_CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);

	/*
	 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
	 */
	_CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
	_CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);

	_CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
	_CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
	_CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
	_CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);

	_CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
	_CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
	_CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);

	_CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
	_CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
	_CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
	_CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
	_CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
	_CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
	_CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
	_CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
	_CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
	_CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
	_CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
	_CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
	_CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
	_CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
	_CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
	_CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
	_CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
	_CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);

	_CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
	_CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
	_CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
	_CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
	_CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
	_CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
	_CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
	_CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
	_CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
	_CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
	_CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);

	_CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
	_CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);

	PE_parse_boot_argn("net_affinity", &net_affinity,
	    sizeof(net_affinity));

	PE_parse_boot_argn("net_rxpoll", &net_rxpoll, sizeof(net_rxpoll));

	PE_parse_boot_argn("net_rtref", &net_rtref, sizeof(net_rtref));

	PE_parse_boot_argn("net_async", &net_async, sizeof(net_async));

	PE_parse_boot_argn("ifnet_debug", &ifnet_debug, sizeof(ifnet_debug));

	VERIFY(dlil_pending_thread_cnt == 0);
#if SKYWALK
	boolean_t pe_enable_fsw_transport_netagent = FALSE;
	boolean_t pe_disable_fsw_transport_netagent = FALSE;
	boolean_t enable_fsw_netagent =
	    (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
	    ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0));

	/*
	 * Check the device tree to see if Skywalk netagent has been explicitly
	 * enabled or disabled. This can be overridden via if_attach_nx below.
	 * Note that the property is a 0-length key, so checking for its
	 * presence alone is enough (no need to check the actual value of
	 * the retrieved variable.)
	 */
	pe_enable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_enable",
	    &pe_enable_fsw_transport_netagent,
	    sizeof(pe_enable_fsw_transport_netagent));
	pe_disable_fsw_transport_netagent =
	    PE_get_default("kern.skywalk_netagent_disable",
	    &pe_disable_fsw_transport_netagent,
	    sizeof(pe_disable_fsw_transport_netagent));

	/*
	 * These two are mutually exclusive, i.e. they both can be absent,
	 * but only one can be present at a time, and so we assert to make
	 * sure it is correct.
	 */
	VERIFY((!pe_enable_fsw_transport_netagent &&
	    !pe_disable_fsw_transport_netagent) ||
	    (pe_enable_fsw_transport_netagent ^
	    pe_disable_fsw_transport_netagent));

	if (pe_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is enabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_ENABLED;
	} else if (pe_disable_fsw_transport_netagent) {
		kprintf("SK: netagent is disabled via an override for "
		    "this platform\n");
		if_attach_nx = SKYWALK_NETWORKING_DISABLED;
	} else {
		kprintf("SK: netagent is %s by default for this platform\n",
		    (enable_fsw_netagent ? "enabled" : "disabled"));
		if_attach_nx = IF_ATTACH_NX_DEFAULT;
	}

	/*
	 * Now see if there's a boot-arg override.
	 */
	(void) PE_parse_boot_argn("if_attach_nx", &if_attach_nx,
	    sizeof(if_attach_nx));
	if_enable_fsw_transport_netagent =
	    ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

	if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);

	if (pe_disable_fsw_transport_netagent &&
	    if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-enabled\n");
	} else if (!pe_disable_fsw_transport_netagent &&
	    !if_enable_fsw_transport_netagent) {
		kprintf("SK: netagent is force-disabled\n");
	}
#ifdef XNU_TARGET_OS_OSX
	if (if_enable_fsw_transport_netagent) {
		net_filter_event_register(dlil_filter_event);
	}
#endif /* XNU_TARGET_OS_OSX */

#if (DEVELOPMENT || DEBUG)
	(void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
	    &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
#endif /* (DEVELOPMENT || DEBUG) */

#endif /* SKYWALK */
	dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
	    sizeof(struct dlil_ifnet_dbg);
	/* Enforce 64-bit alignment for dlil_ifnet structure */
	dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
	dlif_zone = zone_create(DLIF_ZONE_NAME, dlif_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_tcpstat_size = sizeof(struct tcpstat_local);
	/* Enforce 64-bit alignment for tcpstat_local structure */
	dlif_tcpstat_bufsize =
	    dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_tcpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
	dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
	    dlif_tcpstat_bufsize, ZC_ZFREE_CLEARMEM);

	dlif_udpstat_size = sizeof(struct udpstat_local);
	/* Enforce 64-bit alignment for udpstat_local structure */
	dlif_udpstat_bufsize =
	    dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
	dlif_udpstat_bufsize = (uint32_t)
	    P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
	dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
	    dlif_udpstat_bufsize, ZC_ZFREE_CLEARMEM);

	eventhandler_lists_ctxt_init(&ifnet_evhdlr_ctxt);

	TAILQ_INIT(&dlil_ifnet_head);
	TAILQ_INIT(&ifnet_head);
	TAILQ_INIT(&ifnet_detaching_head);
	TAILQ_INIT(&ifnet_ordered_head);

	/* Initialize interface address subsystem */
	ifa_init();

#if PF
	/* Initialize the packet filter */
	pfinit();
#endif /* PF */

	/* Initialize queue algorithms */
	classq_init();

	/* Initialize packet schedulers */
	pktsched_init();

	/* Initialize flow advisory subsystem */
	flowadv_init();

	/* Initialize the pktap virtual interface */
	pktap_init();

	/* Initialize the service class to dscp map */
	net_qos_map_init();

	/* Initialize the interface low power mode event handler */
	if_low_power_evhdlr_init();

	/* Initialize the interface offload port list subsystem */
	if_ports_used_init();

#if DEBUG || DEVELOPMENT
	/* Run self-tests */
	dlil_verify_sum16();
#endif /* DEBUG || DEVELOPMENT */

	/*
	 * Create and start up the main DLIL input thread and the interface
	 * detacher threads once everything is initialized.
	 */
	dlil_incr_pending_thread_count();
	(void) dlil_create_input_thread(NULL, dlil_main_input_thread, NULL);

	/*
	 * Create the ifnet detacher thread.
	 * When an interface gets detached, part of the detach processing
	 * is delayed: the interface is added to the delayed detach list
	 * and this thread is woken up to call ifnet_detach_final
	 * on these interfaces.
	 */
	dlil_incr_pending_thread_count();
	if (kernel_thread_start(ifnet_detacher_thread_func,
	    NULL, &thread) != KERN_SUCCESS) {
		panic_plain("%s: couldn't create detacher thread", __func__);
		/* NOTREACHED */
	}
	thread_deallocate(thread);

	/*
	 * Wait for the dlil kernel threads created above to get
	 * scheduled and run at least once before we proceed.
	 */
	lck_mtx_lock(&dlil_thread_sync_lock);
	while (dlil_pending_thread_cnt != 0) {
		DLIL_PRINTF("%s: waiting for all the created dlil kernel "
		    "threads to get scheduled at least once.\n", __func__);
		(void) msleep(&dlil_pending_thread_cnt, &dlil_thread_sync_lock,
		    (PZERO - 1), __func__, NULL);
		LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
	}
	lck_mtx_unlock(&dlil_thread_sync_lock);
	DLIL_PRINTF("%s: all the created dlil kernel threads have been "
	    "scheduled at least once. Proceeding.\n", __func__);
}

static void
if_flt_monitor_busy(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	++ifp->if_flt_busy;
	VERIFY(ifp->if_flt_busy != 0);
}

static void
if_flt_monitor_unbusy(struct ifnet *ifp)
{
	if_flt_monitor_leave(ifp);
}

static void
if_flt_monitor_enter(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	while (ifp->if_flt_busy) {
		++ifp->if_flt_waiters;
		(void) msleep(&ifp->if_flt_head, &ifp->if_flt_lock,
		    (PZERO - 1), "if_flt_monitor", NULL);
	}
	if_flt_monitor_busy(ifp);
}

static void
if_flt_monitor_leave(struct ifnet *ifp)
{
	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);

	VERIFY(ifp->if_flt_busy != 0);
	--ifp->if_flt_busy;

	if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
		ifp->if_flt_waiters = 0;
		wakeup(&ifp->if_flt_head);
	}
}
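
/*
 * The busy/waiter pair above forms a small monitor: enter() sleeps
 * while another thread holds the monitor busy, and leave() wakes all
 * waiters once the busy count drops to zero. A typical critical
 * section, as used by the filter attach/detach paths below, is:
 *
 *	lck_mtx_lock(&ifp->if_flt_lock);
 *	if_flt_monitor_enter(ifp);
 *	... mutate ifp->if_flt_head ...
 *	if_flt_monitor_leave(ifp);
 *	lck_mtx_unlock(&ifp->if_flt_lock);
 */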

__private_extern__ int
dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
    interface_filter_t *filter_ref, u_int32_t flags)
{
	int retval = 0;
	struct ifnet_filter *filter = NULL;

	ifnet_head_lock_shared();

	/* Check that the interface is in the global list */
	if (!ifnet_lookup(ifp)) {
		retval = ENXIO;
		goto done;
	}
	if (!ifnet_is_attached(ifp, 1)) {
		os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
		    __func__, if_name(ifp));
		retval = ENXIO;
		goto done;
	}

	filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);

	/* refcnt held above during lookup */
	filter->filt_flags = flags;
	filter->filt_ifp = ifp;
	filter->filt_cookie = if_filter->iff_cookie;
	filter->filt_name = if_filter->iff_name;
	filter->filt_protocol = if_filter->iff_protocol;
	/*
	 * Do not install filter callbacks for the internal coproc
	 * interface or for management interfaces.
	 */
	if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
		filter->filt_input = if_filter->iff_input;
		filter->filt_output = if_filter->iff_output;
		filter->filt_event = if_filter->iff_event;
		filter->filt_ioctl = if_filter->iff_ioctl;
	}
	filter->filt_detached = if_filter->iff_detached;

	lck_mtx_lock(&ifp->if_flt_lock);
	if_flt_monitor_enter(ifp);

	LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
	TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);

	*filter_ref = filter;

	/*
	 * Bump the filter count and the route_generation ID to let TCP
	 * know it shouldn't do TSO on this connection.
	 */
	if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
		ifnet_filter_update_tso(ifp, TRUE);
	}
	OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_count);
	INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		OSIncrementAtomic64(&net_api_stats.nas_iflt_attach_os_count);
		INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
	} else {
		OSAddAtomic(1, &ifp->if_flt_non_os_count);
	}
	if_flt_monitor_leave(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	if (dlil_verbose) {
		DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
		    if_filter->iff_name);
	}
	ifnet_decr_iorefcnt(ifp);

done:
	ifnet_head_done();
	if (retval != 0 && ifp != NULL) {
		DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
		    if_name(ifp), if_filter->iff_name, retval);
	}
	if (retval != 0 && filter != NULL) {
		zfree(dlif_filt_zone, filter);
	}

	return retval;
}

static int
dlil_detach_filter_internal(interface_filter_t filter, int detached)
{
	int retval = 0;

	if (detached == 0) {
		ifnet_t ifp = NULL;

		ifnet_head_lock_shared();
		TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
			interface_filter_t entry = NULL;

			lck_mtx_lock(&ifp->if_flt_lock);
			TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
				if (entry != filter || entry->filt_skip) {
					continue;
				}
				/*
				 * We've found a match; since it's possible
				 * that the thread gets blocked in the monitor,
				 * we do the lock dance. The interface should
				 * not be detached since we still have a use
				 * count held during filter attach.
				 */
				entry->filt_skip = 1; /* skip input/output */
				lck_mtx_unlock(&ifp->if_flt_lock);
				ifnet_head_done();

				lck_mtx_lock(&ifp->if_flt_lock);
				if_flt_monitor_enter(ifp);
				LCK_MTX_ASSERT(&ifp->if_flt_lock,
				    LCK_MTX_ASSERT_OWNED);

				/* Remove the filter from the list */
				TAILQ_REMOVE(&ifp->if_flt_head, filter,
				    filt_next);

				if (dlil_verbose) {
					DLIL_PRINTF("%s: %s filter detached\n",
					    if_name(ifp), filter->filt_name);
				}
				if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
					VERIFY(ifp->if_flt_non_os_count != 0);
					OSAddAtomic(-1, &ifp->if_flt_non_os_count);
				}
				/*
				 * Decrease the filter count and bump the
				 * route_generation ID to let TCP know it
				 * should reevaluate doing TSO or not.
				 */
				if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
					ifnet_filter_update_tso(ifp, FALSE);
				}
				if_flt_monitor_leave(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
				goto destroy;
			}
			lck_mtx_unlock(&ifp->if_flt_lock);
		}
		ifnet_head_done();

		/* filter parameter is not a valid filter ref */
		retval = EINVAL;
		goto done;
	} else {
		struct ifnet *ifp = filter->filt_ifp;
		/*
		 * Here we are called from ifnet_detach_final(); the
		 * caller had emptied if_flt_head and we're doing an
		 * implicit filter detach because the interface is
		 * about to go away. Make sure to adjust the counters
		 * in this case. We don't need the protection of the
		 * filter monitor since we're called as part of the
		 * final detach in the context of the detacher thread.
		 */
		if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
			VERIFY(ifp->if_flt_non_os_count != 0);
			OSAddAtomic(-1, &ifp->if_flt_non_os_count);
		}
		/*
		 * Decrease the filter count and bump the route_generation
		 * ID to let TCP know it should reevaluate doing TSO or not.
		 */
		if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
			ifnet_filter_update_tso(ifp, FALSE);
		}
	}

	if (dlil_verbose) {
		DLIL_PRINTF("%s filter detached\n", filter->filt_name);
	}

destroy:

	/* Call the detached function if there is one */
	if (filter->filt_detached) {
		filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
	}

	VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
	if (filter->filt_flags & DLIL_IFF_INTERNAL) {
		VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
	}
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
	net_filter_event_mark(NET_FILTER_EVENT_INTERFACE,
	    net_check_compatible_if_filter(NULL));
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

	/* Free the filter */
	zfree(dlif_filt_zone, filter);
	filter = NULL;
done:
	if (retval != 0 && filter != NULL) {
		DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
		    filter->filt_name, retval);
	}

	return retval;
}

__private_extern__ void
dlil_detach_filter(interface_filter_t filter)
{
	if (filter == NULL) {
		return;
	}
	dlil_detach_filter_internal(filter, 0);
}

__private_extern__ boolean_t
dlil_has_ip_filter(void)
{
	boolean_t has_filter = ((net_api_stats.nas_ipf_add_count -
	    net_api_stats.nas_ipf_add_os_count) > 0);

	VERIFY(net_api_stats.nas_ipf_add_count >=
	    net_api_stats.nas_ipf_add_os_count);

	DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
	return has_filter;
}

__private_extern__ boolean_t
dlil_has_if_filter(struct ifnet *ifp)
{
	boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
	DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
	return has_filter;
}

static inline void
dlil_input_wakeup(struct dlil_threading_info *inp)
{
	LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);

	inp->dlth_flags |= DLIL_INPUT_WAITING;
	if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
		inp->dlth_wtot++;
		wakeup_one((caddr_t)&inp->dlth_flags);
	}
}
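
/*
 * Flag protocol used above: DLIL_INPUT_WAITING records that work has
 * arrived; the wakeup is only issued when the thread is not already
 * RUNNING, since a running thread re-checks the flags before blocking
 * again. dlth_wtot thus counts wakeups delivered to an idle thread.
 */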

__attribute__((noreturn))
static void
dlil_main_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	struct dlil_threading_info *inp = v;

	VERIFY(inp == dlil_main_input_thread);
	VERIFY(inp->dlth_ifp == NULL);
	VERIFY(current_thread() == inp->dlth_thread);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_main_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}

3232/*
3233 * Main input thread:
3234 *
3235 * a) handles all inbound packets for lo0
3236 * b) handles all inbound packets for interfaces with no dedicated
3237 * input thread (e.g. anything but Ethernet/PDP or those that support
3238 * opportunistic polling.)
3239 * c) protocol registrations
3240 * d) packet injections
3241 */
3242__attribute__((noreturn))
3243static void
3244dlil_main_input_thread_cont(void *v, wait_result_t wres)
3245{
3246 struct dlil_main_threading_info *inpm = v;
3247 struct dlil_threading_info *inp = v;
3248
3249 /* main input thread is uninterruptible */
3250 VERIFY(wres != THREAD_INTERRUPTED);
3251 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3252 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
3253 DLIL_INPUT_RUNNING)));
3254 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3255
3256 while (1) {
3257 struct mbuf *m = NULL, *m_loop = NULL;
3258 u_int32_t m_cnt, m_cnt_loop;
3259 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3260 boolean_t proto_req;
3261 boolean_t embryonic;
3262
3263 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3264
3265 if (__improbable(embryonic =
3266 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3267 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3268 }
3269
3270 proto_req = (inp->dlth_flags &
3271 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
3272
3273 /* Packets for non-dedicated interfaces other than lo0 */
3274 m_cnt = qlen(&inp->dlth_pkts);
3275 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3276 m = pkt.cp_mbuf;
3277
3278 /* Packets exclusive to lo0 */
3279 m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
3280 _getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
3281 m_loop = pkt.cp_mbuf;
3282
3283 inp->dlth_wtot = 0;
3284
3285 lck_mtx_unlock(lck: &inp->dlth_lock);
3286
3287 if (__improbable(embryonic)) {
3288 dlil_decr_pending_thread_count();
3289 }
3290
3291 /*
3292 * NOTE warning %%% attention !!!!
3293 * We should think about putting some thread starvation
3294 * safeguards if we deal with long chains of packets.
3295 */
3296 if (__probable(m_loop != NULL)) {
3297 dlil_input_packet_list_extended(lo_ifp, m_loop,
3298 m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
3299 }
3300
3301 if (__probable(m != NULL)) {
3302 dlil_input_packet_list_extended(NULL, m,
3303 m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
3304 }
3305
3306 if (__improbable(proto_req)) {
3307 proto_input_run();
3308 }
3309
3310 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3311 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3312 /* main input thread cannot be terminated */
3313 VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
3314 if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
3315 break;
3316 }
3317 }
3318
3319 inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3320 (void) assert_wait(event: &inp->dlth_flags, THREAD_UNINT);
3321 lck_mtx_unlock(lck: &inp->dlth_lock);
3322 (void) thread_block_parameter(continuation: dlil_main_input_thread_cont, parameter: inp);
3323
3324 VERIFY(0); /* we should never get here */
3325 /* NOTREACHED */
3326 __builtin_unreachable();
3327}
3328
3329/*
3330 * Input thread for interfaces with legacy input model.
3331 */
3332__attribute__((noreturn))
3333static void
3334dlil_input_thread_func(void *v, wait_result_t w)
3335{
3336#pragma unused(w)
3337 char thread_name[MAXTHREADNAMESIZE];
3338 struct dlil_threading_info *inp = v;
3339 struct ifnet *ifp = inp->dlth_ifp;
3340
3341 VERIFY(inp != dlil_main_input_thread);
3342 VERIFY(ifp != NULL);
3343 VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
3344 !(ifp->if_xflags & IFXF_LEGACY));
3345 VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
3346 !(ifp->if_xflags & IFXF_LEGACY));
3347 VERIFY(current_thread() == inp->dlth_thread);
3348
3349 /* construct the name for this thread, and then apply it */
3350 bzero(s: thread_name, n: sizeof(thread_name));
3351 (void) snprintf(thread_name, count: sizeof(thread_name),
3352 "dlil_input_%s", ifp->if_xname);
3353 thread_set_thread_name(th: inp->dlth_thread, name: thread_name);
3354
3355 lck_mtx_lock(lck: &inp->dlth_lock);
3356 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3357 (void) assert_wait(event: &inp->dlth_flags, THREAD_UNINT);
3358 inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3359 /* wake up once to get out of embryonic state */
3360 dlil_input_wakeup(inp);
3361 lck_mtx_unlock(lck: &inp->dlth_lock);
3362 (void) thread_block_parameter(continuation: dlil_input_thread_cont, parameter: inp);
3363 /* NOTREACHED */
3364 __builtin_unreachable();
3365}
3366
3367__attribute__((noreturn))
3368static void
3369dlil_input_thread_cont(void *v, wait_result_t wres)
3370{
3371 struct dlil_threading_info *inp = v;
3372 struct ifnet *ifp = inp->dlth_ifp;
3373
3374 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3375 if (__improbable(wres == THREAD_INTERRUPTED ||
3376 (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3377 goto terminate;
3378 }
3379
3380 VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3381 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3382
3383 while (1) {
3384 struct mbuf *m = NULL;
3385 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3386 boolean_t notify = FALSE;
3387 boolean_t embryonic;
3388 u_int32_t m_cnt;
3389
3390 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3391
3392 if (__improbable(embryonic =
3393 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3394 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3395 }
3396
3397 /*
3398 * Protocol registration and injection must always use
3399 * the main input thread; in theory the latter can utilize
3400 * the corresponding input thread where the packet arrived
3401 * on, but that requires our knowing the interface in advance
3402 * (and the benefits might not worth the trouble.)
3403 */
3404 VERIFY(!(inp->dlth_flags &
3405 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3406
3407 /* Packets for this interface */
3408 m_cnt = qlen(&inp->dlth_pkts);
3409 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3410 m = pkt.cp_mbuf;
3411
3412 inp->dlth_wtot = 0;
3413
3414#if SKYWALK
3415 /*
3416 * If this interface is attached to a netif nexus,
3417 * the stats are already incremented there; otherwise
3418 * do it here.
3419 */
3420 if (!(ifp->if_capabilities & IFCAP_SKYWALK))
3421#endif /* SKYWALK */
3422 notify = dlil_input_stats_sync(ifp, inp);
3423
3424 lck_mtx_unlock(lck: &inp->dlth_lock);
3425
3426 if (__improbable(embryonic)) {
3427 ifnet_decr_pending_thread_count(ifp);
3428 }
3429
3430 if (__improbable(notify)) {
3431 ifnet_notify_data_threshold(ifp);
3432 }
3433
3434 /*
3435 * NOTE warning %%% attention !!!!
3436 * We should think about putting some thread starvation
3437 * safeguards if we deal with long chains of packets.
3438 */
3439 if (__probable(m != NULL)) {
3440 dlil_input_packet_list_extended(NULL, m,
3441 m_cnt, ifp->if_poll_mode);
3442 }
3443
3444 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3445 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
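		/*
		 * Break out only if nothing other than RUNNING and a
		 * possible TERMINATE request is pending; termination
		 * itself is handled after the loop.
		 */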
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_input_thread_cont, inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}

/*
 * Input thread for interfaces with opportunistic polling input model.
 */
__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
{
#pragma unused(w)
	char thread_name[MAXTHREADNAMESIZE];
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;

	VERIFY(inp != dlil_main_input_thread);
	VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
	    (ifp->if_xflags & IFXF_LEGACY));
	VERIFY(current_thread() == inp->dlth_thread);

	/* construct the name for this thread, and then apply it */
	bzero(thread_name, sizeof(thread_name));
	(void) snprintf(thread_name, sizeof(thread_name),
	    "dlil_input_poll_%s", ifp->if_xname);
	thread_set_thread_name(inp->dlth_thread, thread_name);

	lck_mtx_lock(&inp->dlth_lock);
	VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
	(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
	inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
	/* wake up once to get out of embryonic state */
	dlil_input_wakeup(inp);
	lck_mtx_unlock(&inp->dlth_lock);
	(void) thread_block_parameter(dlil_rxpoll_input_thread_cont, inp);
	/* NOTREACHED */
	__builtin_unreachable();
}

__attribute__((noreturn))
static void
dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
{
	struct dlil_threading_info *inp = v;
	struct ifnet *ifp = inp->dlth_ifp;
	struct timespec ts;

	lck_mtx_lock_spin(&inp->dlth_lock);
	if (__improbable(wres == THREAD_INTERRUPTED ||
	    (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
		goto terminate;
	}

	VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
	inp->dlth_flags |= DLIL_INPUT_RUNNING;

	while (1) {
		struct mbuf *m = NULL;
		uint32_t m_cnt, poll_req = 0;
		uint64_t m_size = 0;
		ifnet_model_t mode;
		struct timespec now, delta;
		classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
		boolean_t notify;
		boolean_t embryonic;
		uint64_t ival;

		inp->dlth_flags &= ~DLIL_INPUT_WAITING;

		if (__improbable(embryonic =
		    (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
			inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
			goto skip;
		}

		if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
			ival = IF_RXPOLL_INTERVALTIME_MIN;
		}

		/* Link parameters changed? */
		if (ifp->if_poll_update != 0) {
			ifp->if_poll_update = 0;
			(void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
		}

		/* Current operating mode */
		mode = ifp->if_poll_mode;

		/*
		 * Protocol registration and injection must always use
		 * the main input thread; in theory injection could use
		 * the input thread of the interface the packet arrived
		 * on, but that requires knowing the interface in advance
		 * (and the benefit might not be worth the trouble).
		 */
		VERIFY(!(inp->dlth_flags &
		    (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));

		/* Total count of all packets */
		m_cnt = qlen(&inp->dlth_pkts);

		/* Total bytes of all packets */
		m_size = qsize(&inp->dlth_pkts);

		/* Packets for this interface */
		_getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
		m = pkt.cp_mbuf;
		VERIFY(m != NULL || m_cnt == 0);

		nanouptime(&now);
		if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
			ifp->if_poll_sample_lasttime = now;
		}

		net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
		if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
			u_int32_t ptot, btot;

			/* Accumulate statistics for current sampling */
			PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);

			if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
				goto skip;
			}

			ifp->if_poll_sample_lasttime = now;

			/* Calculate min/max of inbound bytes */
			btot = (u_int32_t)ifp->if_poll_sstats.bytes;
			if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
				ifp->if_rxpoll_bmin = btot;
			}
			if (btot > ifp->if_rxpoll_bmax) {
				ifp->if_rxpoll_bmax = btot;
			}

			/* Calculate EWMA of inbound bytes */
			DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);

			/* Calculate min/max of inbound packets */
			ptot = (u_int32_t)ifp->if_poll_sstats.packets;
			if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
				ifp->if_rxpoll_pmin = ptot;
			}
			if (ptot > ifp->if_rxpoll_pmax) {
				ifp->if_rxpoll_pmax = ptot;
			}

			/* Calculate EWMA of inbound packets */
			DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
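			/*
			 * Note: assuming the usual definition of the
			 * DLIL_EWMA() macro, each update above computes
			 * an exponentially weighted moving average with
			 * a power-of-two weight:
			 *
			 *	avg += (sample - avg) >> if_rxpoll_decay
			 *
			 * e.g. with if_rxpoll_decay == 2, avg == 100 and
			 * sample == 180, the new avg is 100 + 80/4 == 120.
			 */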

			/* Reset sampling statistics */
			PKTCNTR_CLEAR(&ifp->if_poll_sstats);

			/* Calculate EWMA of wakeup requests */
			DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
			    if_rxpoll_decay);
			inp->dlth_wtot = 0;

			if (dlil_verbose) {
				if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
					ifp->if_poll_dbg_lasttime = now;
				}
				net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
				if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
					ifp->if_poll_dbg_lasttime = now;
					DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
					    "limits [%d/%d], wreq avg %d "
					    "limits [%d/%d], bytes avg %d "
					    "limits [%d/%d]\n", if_name(ifp),
					    (ifp->if_poll_mode ==
					    IFNET_MODEL_INPUT_POLL_ON) ?
					    "ON" : "OFF", ifp->if_rxpoll_pavg,
					    ifp->if_rxpoll_pmax,
					    ifp->if_rxpoll_plowat,
					    ifp->if_rxpoll_phiwat,
					    ifp->if_rxpoll_wavg,
					    ifp->if_rxpoll_wlowat,
					    ifp->if_rxpoll_whiwat,
					    ifp->if_rxpoll_bavg,
					    ifp->if_rxpoll_blowat,
					    ifp->if_rxpoll_bhiwat);
				}
			}

			/* Perform mode transition, if necessary */
			if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
				ifp->if_poll_mode_lasttime = now;
			}

			net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
			if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
				goto skip;
			}

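			/*
			 * Hysteresis: switch to polling only when both
			 * the packet and the byte (or wakeup) EWMAs sit
			 * at or above their high watermarks, and back to
			 * interrupt mode only when the packet and byte
			 * EWMAs drop to or below their low watermarks.
			 * The holdtime check above keeps the mode from
			 * flapping between samples.
			 */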
			if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
			    ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
				mode = IFNET_MODEL_INPUT_POLL_OFF;
			} else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
			    (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
			    ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
			    ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
				mode = IFNET_MODEL_INPUT_POLL_ON;
			}

			if (mode != ifp->if_poll_mode) {
				ifp->if_poll_mode = mode;
				ifp->if_poll_mode_lasttime = now;
				poll_req++;
			}
		}
skip:
		notify = dlil_input_stats_sync(ifp, inp);

		lck_mtx_unlock(&inp->dlth_lock);

		if (__improbable(embryonic)) {
			ifnet_decr_pending_thread_count(ifp);
		}

		if (__improbable(notify)) {
			ifnet_notify_data_threshold(ifp);
		}

		/*
		 * If there's a mode change and the interface is still
		 * attached, perform a downcall to the driver for the
		 * new mode.  Also hold an I/O refcnt on the interface
		 * to prevent it from being detached (released below).
		 */
		if (poll_req != 0 && ifnet_is_attached(ifp, 1)) {
			struct ifnet_model_params p = {
				.model = mode, .reserved = { 0 }
			};
			errno_t err;

			if (dlil_verbose) {
				DLIL_PRINTF("%s: polling is now %s, "
				    "pkts avg %d max %d limits [%d/%d], "
				    "wreq avg %d limits [%d/%d], "
				    "bytes avg %d limits [%d/%d]\n",
				    if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", ifp->if_rxpoll_pavg,
				    ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
				    ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
				    ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
				    ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
				    ifp->if_rxpoll_bhiwat);
			}

			if ((err = ((*ifp->if_input_ctl)(ifp,
			    IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
				DLIL_PRINTF("%s: error setting polling mode "
				    "to %s (%d)\n", if_name(ifp),
				    (mode == IFNET_MODEL_INPUT_POLL_ON) ?
				    "ON" : "OFF", err);
			}

			switch (mode) {
			case IFNET_MODEL_INPUT_POLL_OFF:
				ifnet_set_poll_cycle(ifp, NULL);
				ifp->if_rxpoll_offreq++;
				if (err != 0) {
					ifp->if_rxpoll_offerr++;
				}
				break;

			case IFNET_MODEL_INPUT_POLL_ON:
				net_nsectimer(&ival, &ts);
				ifnet_set_poll_cycle(ifp, &ts);
				ifnet_poll(ifp);
				ifp->if_rxpoll_onreq++;
				if (err != 0) {
					ifp->if_rxpoll_onerr++;
				}
				break;

			default:
				VERIFY(0);
				/* NOTREACHED */
			}

			/* Release the I/O refcnt */
			ifnet_decr_iorefcnt(ifp);
		}

		/*
		 * NOTE: consider adding thread-starvation safeguards
		 * here if we ever deal with long chains of packets.
		 */
		if (__probable(m != NULL)) {
			dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
		}

		lck_mtx_lock_spin(&inp->dlth_lock);
		VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
		if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
		    DLIL_INPUT_TERMINATE))) {
			break;
		}
	}

	inp->dlth_flags &= ~DLIL_INPUT_RUNNING;

	if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
terminate:
		lck_mtx_unlock(&inp->dlth_lock);
		dlil_terminate_input_thread(inp);
		/* NOTREACHED */
	} else {
		(void) assert_wait(&inp->dlth_flags, THREAD_UNINT);
		lck_mtx_unlock(&inp->dlth_lock);
		(void) thread_block_parameter(dlil_rxpoll_input_thread_cont,
		    inp);
		/* NOTREACHED */
	}

	VERIFY(0);      /* we should never get here */
	/* NOTREACHED */
	__builtin_unreachable();
}

errno_t
dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
{
	if (p != NULL) {
		if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
		    (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
			return EINVAL;
		}
		if (p->packets_lowat != 0 &&    /* hiwat must be non-zero */
		    p->packets_lowat >= p->packets_hiwat) {
			return EINVAL;
		}
		if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
		    (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
			return EINVAL;
		}
		if (p->bytes_lowat != 0 &&      /* hiwat must be non-zero */
		    p->bytes_lowat >= p->bytes_hiwat) {
			return EINVAL;
		}
		if (p->interval_time != 0 &&
		    p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
			p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
		}
	}
	return 0;
}
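
/*
 * Example (a sketch, not taken from any driver): a parameter set that
 * passes the validation above.  Watermarks must be supplied in pairs,
 * with lowat strictly below hiwat; a zero interval_time means "use the
 * default", and a non-zero one is clamped up to the minimum:
 *
 *	struct ifnet_poll_params p = {
 *		.packets_lowat = 8,
 *		.packets_hiwat = 64,
 *		.bytes_lowat = 2 * 1024,
 *		.bytes_hiwat = 64 * 1024,
 *		.interval_time = 0,
 *	};
 *	VERIFY(dlil_rxpoll_validate_params(&p) == 0);
 */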

void
dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
{
	u_int64_t sample_holdtime, inbw;

	if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
		sample_holdtime = 0;    /* polling is disabled */
		ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
		    ifp->if_rxpoll_blowat = 0;
		ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
		    ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
		ifp->if_rxpoll_plim = 0;
		ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
	} else {
		u_int32_t plowat, phiwat, blowat, bhiwat, plim;
		u_int64_t ival;
		unsigned int n, i;

		for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
			if (inbw < rxpoll_tbl[i].speed) {
				break;
			}
			n = i;
		}
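		/*
		 * The loop above picks the last rxpoll_tbl entry whose
		 * link speed does not exceed the current input link
		 * rate, so the auto-tuned watermarks below scale with
		 * the negotiated speed of the interface.
		 */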
		/* auto-tune if caller didn't specify a value */
		plowat = ((p == NULL || p->packets_lowat == 0) ?
		    rxpoll_tbl[n].plowat : p->packets_lowat);
		phiwat = ((p == NULL || p->packets_hiwat == 0) ?
		    rxpoll_tbl[n].phiwat : p->packets_hiwat);
		blowat = ((p == NULL || p->bytes_lowat == 0) ?
		    rxpoll_tbl[n].blowat : p->bytes_lowat);
		bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
		    rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
		plim = ((p == NULL || p->packets_limit == 0 ||
		    if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
		ival = ((p == NULL || p->interval_time == 0 ||
		    if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
		    if_rxpoll_interval_time : p->interval_time);

		VERIFY(plowat != 0 && phiwat != 0);
		VERIFY(blowat != 0 && bhiwat != 0);
		VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);

		sample_holdtime = if_rxpoll_sample_holdtime;
		ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
		ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
		ifp->if_rxpoll_plowat = plowat;
		ifp->if_rxpoll_phiwat = phiwat;
		ifp->if_rxpoll_blowat = blowat;
		ifp->if_rxpoll_bhiwat = bhiwat;
		ifp->if_rxpoll_plim = plim;
		ifp->if_rxpoll_ival = ival;
	}

	net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
	net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);

	if (dlil_verbose) {
		DLIL_PRINTF("%