/*
 * Copyright (c) 1999-2024 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
 * support for mandatory and extensible security protections. This notice
 * is included in support of clause 2.2 (b) of the Apple Public License,
 * Version 2.0.
 */
#include "kpi_interface.h"
#include <stddef.h>
#include <ptrauth.h>

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/domain.h>
#include <sys/user.h>
#include <sys/random.h>
#include <sys/socketvar.h>
#include <net/if_dl.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_var.h>
#include <net/dlil.h>
#include <net/if_arp.h>
#include <net/iptap.h>
#include <net/pktap.h>
#include <net/nwk_wq.h>
#include <sys/kern_event.h>
#include <sys/kdebug.h>
#include <sys/mcache.h>
#include <sys/syslog.h>
#include <sys/protosw.h>
#include <sys/priv.h>

#include <kern/assert.h>
#include <kern/task.h>
#include <kern/thread.h>
#include <kern/sched_prim.h>
#include <kern/locks.h>
#include <kern/zalloc.h>

#include <net/kpi_protocol.h>
#include <net/if_types.h>
#include <net/if_ipsec.h>
#include <net/if_llreach.h>
#include <net/if_utun.h>
#include <net/kpi_interfacefilter.h>
#include <net/classq/classq.h>
#include <net/classq/classq_sfb.h>
#include <net/flowhash.h>
#include <net/ntstat.h>
#if SKYWALK && defined(XNU_TARGET_OS_OSX)
#include <skywalk/lib/net_filter_event.h>
#endif /* SKYWALK && XNU_TARGET_OS_OSX */
#include <net/net_api_stats.h>
#include <net/if_ports_used.h>
#include <net/if_vlan_var.h>
#include <netinet/in.h>
#if INET
#include <netinet/in_var.h>
#include <netinet/igmp_var.h>
#include <netinet/ip_var.h>
#include <netinet/tcp.h>
#include <netinet/tcp_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/if_ether.h>
#include <netinet/in_pcb.h>
#include <netinet/in_tclass.h>
#include <netinet/ip.h>
#include <netinet/ip_icmp.h>
#include <netinet/icmp_var.h>
#endif /* INET */

#include <net/nat464_utils.h>
#include <netinet6/in6_var.h>
#include <netinet6/nd6.h>
#include <netinet6/mld6_var.h>
#include <netinet6/scope6_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <net/pf_pbuf.h>
#include <libkern/OSAtomic.h>
#include <libkern/tree.h>

#include <dev/random/randomdev.h>
#include <machine/machine_routines.h>

#include <mach/thread_act.h>
#include <mach/sdt.h>

#if CONFIG_MACF
#include <sys/kauth.h>
#include <security/mac_framework.h>
#include <net/ethernet.h>
#include <net/firewire.h>
#endif

#if PF
#include <net/pfvar.h>
#endif /* PF */
#include <net/pktsched/pktsched.h>
#include <net/pktsched/pktsched_netem.h>

#if NECP
#include <net/necp.h>
#endif /* NECP */

#if SKYWALK
#include <skywalk/packet/packet_queue.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#endif /* SKYWALK */

#include <net/sockaddr_utils.h>

#include <os/log.h>

#define DBG_LAYER_BEG DLILDBG_CODE(DBG_DLIL_STATIC, 0)
#define DBG_LAYER_END DLILDBG_CODE(DBG_DLIL_STATIC, 2)
#define DBG_FNC_DLIL_INPUT DLILDBG_CODE(DBG_DLIL_STATIC, (1 << 8))
#define DBG_FNC_DLIL_OUTPUT DLILDBG_CODE(DBG_DLIL_STATIC, (2 << 8))
#define DBG_FNC_DLIL_IFOUT DLILDBG_CODE(DBG_DLIL_STATIC, (3 << 8))

#define MAX_FRAME_TYPE_SIZE 4 /* LONGWORDS */
#define MAX_LINKADDR 4 /* LONGWORDS */

#if 1
#define DLIL_PRINTF printf
#else
#define DLIL_PRINTF kprintf
#endif

#define IF_DATA_REQUIRE_ALIGNED_64(f) \
    _CASSERT(!(offsetof(struct if_data_internal, f) % sizeof (u_int64_t)))

#define IFNET_IF_DATA_REQUIRE_ALIGNED_64(f) \
    _CASSERT(!(offsetof(struct ifnet, if_data.f) % sizeof (u_int64_t)))

enum {
	kProtoKPI_v1 = 1,
	kProtoKPI_v2 = 2
};

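/* Generation counter incremented for each interface creation. */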
uint64_t if_creation_generation_count = 0;

/*
 * List of if_proto structures in if_proto_hash[] is protected by
 * the ifnet lock. The rest of the fields are initialized at protocol
 * attach time and never change, thus no lock required as long as
 * a reference to it is valid, via if_proto_ref().
 */
struct if_proto {
	SLIST_ENTRY(if_proto) next_hash;
	u_int32_t refcount;
	u_int32_t detached;
	struct ifnet *ifp;
	protocol_family_t protocol_family;
	int proto_kpi;
	union {
		struct {
			proto_media_input input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v1;
		struct {
			proto_media_input_v2 input;
			proto_media_preout pre_output;
			proto_media_event event;
			proto_media_ioctl ioctl;
			proto_media_detached detached;
			proto_media_resolve_multi resolve_multi;
			proto_media_send_arp send_arp;
		} v2;
	} kpi;
};

SLIST_HEAD(proto_hash_entry, if_proto);

#define DLIL_SDLDATALEN \
    (DLIL_SDLMAXLEN - offsetof(struct sockaddr_dl, sdl_data[0]))

/*
 * In the common case, the LL address is stored in the
 * `dl_if_lladdr' member of the `dlil_ifnet'. This is sufficient
 * for LL addresses that do not exceed the `DLIL_SDLMAXLEN' constant.
 */
struct dl_if_lladdr_std {
	struct ifaddr ifa;
	u_int8_t addr_sdl_bytes[DLIL_SDLMAXLEN];
	u_int8_t mask_sdl_bytes[DLIL_SDLMAXLEN];
};

/*
 * However, in some rare cases we encounter LL addresses which
 * would not fit in the `DLIL_SDLMAXLEN' limitation. In such cases
 * we allocate the storage in the permanent arena, using this memory layout.
 */
struct dl_if_lladdr_xtra_space {
	struct ifaddr ifa;
	u_int8_t addr_sdl_bytes[SOCK_MAXADDRLEN];
	u_int8_t mask_sdl_bytes[SOCK_MAXADDRLEN];
};

struct dlil_ifnet {
	struct ifnet dl_if; /* public ifnet */
	/*
	 * DLIL private fields, protected by dl_if_lock
	 */
	decl_lck_mtx_data(, dl_if_lock);
	TAILQ_ENTRY(dlil_ifnet) dl_if_link; /* dlil_ifnet link */
	u_int32_t dl_if_flags; /* flags (below) */
	u_int32_t dl_if_refcnt; /* refcnt */
	void (*dl_if_trace)(struct dlil_ifnet *, int); /* ref trace callback */
	void *dl_if_uniqueid; /* unique interface id */
	size_t dl_if_uniqueid_len; /* length of the unique id */
	char dl_if_namestorage[IFNAMSIZ]; /* interface name storage */
	char dl_if_xnamestorage[IFXNAMSIZ]; /* external name storage */
	struct dl_if_lladdr_std dl_if_lladdr; /* link-level address storage */
	u_int8_t dl_if_descstorage[IF_DESCSIZE]; /* desc storage */
	u_int8_t dl_if_permanent_ether[ETHER_ADDR_LEN]; /* permanent address */
	u_int8_t dl_if_permanent_ether_is_set;
	u_int8_t dl_if_unused;
	struct dlil_threading_info dl_if_inpstorage; /* input thread storage */
	ctrace_t dl_if_attach; /* attach PC stacktrace */
	ctrace_t dl_if_detach; /* detach PC stacktrace */
};

/* Values for dl_if_flags (private to DLIL) */
#define DLIF_INUSE 0x1 /* DLIL ifnet recycler, ifnet in use */
#define DLIF_REUSE 0x2 /* DLIL ifnet recycles, ifnet is not new */
#define DLIF_DEBUG 0x4 /* has debugging info */

#define IF_REF_TRACE_HIST_SIZE 8 /* size of ref trace history */

/* For gdb */
__private_extern__ unsigned int if_ref_trace_hist_size = IF_REF_TRACE_HIST_SIZE;

struct dlil_ifnet_dbg {
	struct dlil_ifnet dldbg_dlif; /* dlil_ifnet */
	u_int16_t dldbg_if_refhold_cnt; /* # ifnet references */
	u_int16_t dldbg_if_refrele_cnt; /* # ifnet releases */
	/*
	 * Circular lists of ifnet_{reference,release} callers.
	 */
	ctrace_t dldbg_if_refhold[IF_REF_TRACE_HIST_SIZE];
	ctrace_t dldbg_if_refrele[IF_REF_TRACE_HIST_SIZE];
};

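/*
 * Convert between the DLIL-private dlil_ifnet and its embedded public ifnet;
 * the cast in IFP_TO_DLIL() is valid because dl_if is the first member.
 */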
#define DLIL_TO_IFP(s) (&s->dl_if)
#define IFP_TO_DLIL(s) ((struct dlil_ifnet *)s)

struct ifnet_filter {
	TAILQ_ENTRY(ifnet_filter) filt_next;
	u_int32_t filt_skip;
	u_int32_t filt_flags;
	ifnet_t filt_ifp;
	const char *filt_name;
	void *filt_cookie;
	protocol_family_t filt_protocol;
	iff_input_func filt_input;
	iff_output_func filt_output;
	iff_event_func filt_event;
	iff_ioctl_func filt_ioctl;
	iff_detached_func filt_detached;
};

/* Mbuf queue used for freeing the excessive mbufs */
typedef MBUFQ_HEAD(dlil_freeq) dlil_freeq_t;

struct proto_input_entry;

static TAILQ_HEAD(, dlil_ifnet) dlil_ifnet_head;

static LCK_ATTR_DECLARE(dlil_lck_attributes, 0, 0);

static LCK_GRP_DECLARE(dlil_lock_group, "DLIL internal locks");
LCK_GRP_DECLARE(ifnet_lock_group, "ifnet locks");
static LCK_GRP_DECLARE(ifnet_head_lock_group, "ifnet head lock");
static LCK_GRP_DECLARE(ifnet_snd_lock_group, "ifnet snd locks");
static LCK_GRP_DECLARE(ifnet_rcv_lock_group, "ifnet rcv locks");

LCK_ATTR_DECLARE(ifnet_lock_attr, 0, 0);
static LCK_RW_DECLARE_ATTR(ifnet_head_lock, &ifnet_head_lock_group,
    &dlil_lck_attributes);
static LCK_MTX_DECLARE_ATTR(dlil_ifnet_lock, &dlil_lock_group,
    &dlil_lck_attributes);

#if DEBUG
static unsigned int ifnet_debug = 1; /* debugging (enabled) */
#else
static unsigned int ifnet_debug; /* debugging (disabled) */
#endif /* !DEBUG */
static unsigned int dlif_size; /* size of dlil_ifnet to allocate */
static unsigned int dlif_bufsize; /* size of dlif_size + headroom */
static struct zone *dlif_zone; /* zone for dlil_ifnet */
#define DLIF_ZONE_NAME "ifnet" /* zone name */

static KALLOC_TYPE_DEFINE(dlif_filt_zone, struct ifnet_filter, NET_KT_DEFAULT);

static KALLOC_TYPE_DEFINE(dlif_proto_zone, struct if_proto, NET_KT_DEFAULT);

static unsigned int dlif_tcpstat_size; /* size of tcpstat_local to allocate */
static unsigned int dlif_tcpstat_bufsize; /* size of dlif_tcpstat_size + headroom */
static struct zone *dlif_tcpstat_zone; /* zone for tcpstat_local */
#define DLIF_TCPSTAT_ZONE_NAME "ifnet_tcpstat" /* zone name */

static unsigned int dlif_udpstat_size; /* size of udpstat_local to allocate */
static unsigned int dlif_udpstat_bufsize; /* size of dlif_udpstat_size + headroom */
static struct zone *dlif_udpstat_zone; /* zone for udpstat_local */
#define DLIF_UDPSTAT_ZONE_NAME "ifnet_udpstat" /* zone name */

static u_int32_t net_rtref;

static struct dlil_main_threading_info dlil_main_input_thread_info;
__private_extern__ struct dlil_threading_info *dlil_main_input_thread =
    (struct dlil_threading_info *)&dlil_main_input_thread_info;

static int dlil_event_internal(struct ifnet *ifp, struct kev_msg *msg, bool update_generation);
static int dlil_detach_filter_internal(interface_filter_t filter, int detached);
static void dlil_if_trace(struct dlil_ifnet *, int);
static void if_proto_ref(struct if_proto *);
static void if_proto_free(struct if_proto *);
static struct if_proto *find_attached_proto(struct ifnet *, u_int32_t);
static u_int32_t dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
    u_int32_t list_count);
static void _dlil_if_release(ifnet_t ifp, bool clear_in_use);
static void if_flt_monitor_busy(struct ifnet *);
static void if_flt_monitor_unbusy(struct ifnet *);
static void if_flt_monitor_enter(struct ifnet *);
static void if_flt_monitor_leave(struct ifnet *);
static int dlil_interface_filters_input(struct ifnet *, struct mbuf **,
    char **, protocol_family_t);
static int dlil_interface_filters_output(struct ifnet *, struct mbuf **,
    protocol_family_t);
static struct ifaddr *dlil_alloc_lladdr(struct ifnet *,
    const struct sockaddr_dl *);
static int ifnet_lookup(struct ifnet *);
static void if_purgeaddrs(struct ifnet *);

static errno_t ifproto_media_input_v1(struct ifnet *, protocol_family_t,
    struct mbuf *, char *);
static errno_t ifproto_media_input_v2(struct ifnet *, protocol_family_t,
    struct mbuf *);
static errno_t ifproto_media_preout(struct ifnet *, protocol_family_t,
    mbuf_t *, const struct sockaddr *, void *, char *, char *);
static void ifproto_media_event(struct ifnet *, protocol_family_t,
    const struct kev_msg *);
static errno_t ifproto_media_ioctl(struct ifnet *, protocol_family_t,
    unsigned long, void *);
static errno_t ifproto_media_resolve_multi(ifnet_t, const struct sockaddr *,
    struct sockaddr_dl *, size_t);
static errno_t ifproto_media_send_arp(struct ifnet *, u_short,
    const struct sockaddr_dl *, const struct sockaddr *,
    const struct sockaddr_dl *, const struct sockaddr *);

static errno_t ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
    struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
    boolean_t poll, struct thread *tp);
static void ifp_if_input_poll(struct ifnet *, u_int32_t, u_int32_t,
    struct mbuf **, struct mbuf **, u_int32_t *, u_int32_t *);
static errno_t ifp_if_ctl(struct ifnet *, ifnet_ctl_cmd_t, u_int32_t, void *);
static errno_t ifp_if_demux(struct ifnet *, struct mbuf *, char *,
    protocol_family_t *);
static errno_t ifp_if_add_proto(struct ifnet *, protocol_family_t,
    const struct ifnet_demux_desc *, u_int32_t);
static errno_t ifp_if_del_proto(struct ifnet *, protocol_family_t);
static errno_t ifp_if_check_multi(struct ifnet *, const struct sockaddr *);
#if !XNU_TARGET_OS_OSX
static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
    const struct sockaddr *, const char *, const char *,
    u_int32_t *, u_int32_t *);
#else /* XNU_TARGET_OS_OSX */
static errno_t ifp_if_framer(struct ifnet *, struct mbuf **,
    const struct sockaddr *, const char *, const char *);
#endif /* XNU_TARGET_OS_OSX */
static errno_t ifp_if_framer_extended(struct ifnet *, struct mbuf **,
    const struct sockaddr *, const char *, const char *,
    u_int32_t *, u_int32_t *);
static errno_t ifp_if_set_bpf_tap(struct ifnet *, bpf_tap_mode, bpf_packet_func);
static void ifp_if_free(struct ifnet *);
static void ifp_if_event(struct ifnet *, const struct kev_msg *);
static __inline void ifp_inc_traffic_class_in(struct ifnet *, struct mbuf *);
static __inline void ifp_inc_traffic_class_out(struct ifnet *, struct mbuf *);

static uint32_t dlil_trim_overcomitted_queue_locked(class_queue_t *,
    dlil_freeq_t *, struct ifnet_stat_increment_param *);

static errno_t dlil_input_async(struct dlil_threading_info *, struct ifnet *,
    struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
    boolean_t, struct thread *);
static errno_t dlil_input_sync(struct dlil_threading_info *, struct ifnet *,
    struct mbuf *, struct mbuf *, const struct ifnet_stat_increment_param *,
    boolean_t, struct thread *);

static void dlil_main_input_thread_func(void *, wait_result_t);
static void dlil_main_input_thread_cont(void *, wait_result_t);

static void dlil_input_thread_func(void *, wait_result_t);
static void dlil_input_thread_cont(void *, wait_result_t);

static void dlil_rxpoll_input_thread_func(void *, wait_result_t);
static void dlil_rxpoll_input_thread_cont(void *, wait_result_t);

static int dlil_create_input_thread(ifnet_t, struct dlil_threading_info *,
    thread_continue_t *);
static void dlil_terminate_input_thread(struct dlil_threading_info *);
static void dlil_input_stats_add(const struct ifnet_stat_increment_param *,
    struct dlil_threading_info *, struct ifnet *, boolean_t);
static boolean_t dlil_input_stats_sync(struct ifnet *,
    struct dlil_threading_info *);
static void dlil_input_packet_list_common(struct ifnet *, struct mbuf *,
    u_int32_t, ifnet_model_t, boolean_t);
static errno_t ifnet_input_common(struct ifnet *, struct mbuf *, struct mbuf *,
    const struct ifnet_stat_increment_param *, boolean_t, boolean_t);
static int dlil_is_clat_needed(protocol_family_t, mbuf_t);
static errno_t dlil_clat46(ifnet_t, protocol_family_t *, mbuf_t *);
static errno_t dlil_clat64(ifnet_t, protocol_family_t *, mbuf_t *);
#if DEBUG || DEVELOPMENT
static void dlil_verify_sum16(void);
#endif /* DEBUG || DEVELOPMENT */
static void dlil_output_cksum_dbg(struct ifnet *, struct mbuf *, uint32_t,
    protocol_family_t);
static void dlil_input_cksum_dbg(struct ifnet *, struct mbuf *, char *,
    protocol_family_t);

static void dlil_incr_pending_thread_count(void);
static void dlil_decr_pending_thread_count(void);

static void ifnet_detacher_thread_func(void *, wait_result_t);
static void ifnet_detacher_thread_cont(void *, wait_result_t);
static void ifnet_detach_final(struct ifnet *);
static void ifnet_detaching_enqueue(struct ifnet *);
static struct ifnet *ifnet_detaching_dequeue(void);

static void ifnet_start_thread_func(void *, wait_result_t);
static void ifnet_start_thread_cont(void *, wait_result_t);

static void ifnet_poll_thread_func(void *, wait_result_t);
static void ifnet_poll_thread_cont(void *, wait_result_t);

static errno_t ifnet_enqueue_common(struct ifnet *, struct ifclassq *,
    classq_pkt_t *, boolean_t, boolean_t *);

static void ifp_src_route_copyout(struct ifnet *, struct route *);
static void ifp_src_route_copyin(struct ifnet *, struct route *);
static void ifp_src_route6_copyout(struct ifnet *, struct route_in6 *);
static void ifp_src_route6_copyin(struct ifnet *, struct route_in6 *);

static errno_t if_mcasts_update_async(struct ifnet *);

static int sysctl_rxpoll SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS;
static int sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS;
static int sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS;
static int sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS;
static int sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS;
static int sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS;
static int sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS;
static int sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS;
static int sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS;

struct chain_len_stats tx_chain_len_stats;
static int sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS;

#if TEST_INPUT_THREAD_TERMINATION
static int sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS;
#endif /* TEST_INPUT_THREAD_TERMINATION */

/* The following are protected by dlil_ifnet_lock */
static TAILQ_HEAD(, ifnet) ifnet_detaching_head;
static u_int32_t ifnet_detaching_cnt;
static boolean_t ifnet_detaching_embryonic;
static void *ifnet_delayed_run; /* wait channel for detaching thread */

static LCK_MTX_DECLARE_ATTR(ifnet_fc_lock, &dlil_lock_group,
    &dlil_lck_attributes);

static uint32_t ifnet_flowhash_seed;

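/*
 * Key material fed to the per-interface flow hash; see ifnet_calc_flowhash(),
 * which mixes this key with ifnet_flowhash_seed.
 */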
struct ifnet_flowhash_key {
	char ifk_name[IFNAMSIZ];
	uint32_t ifk_unit;
	uint32_t ifk_flags;
	uint32_t ifk_eflags;
	uint32_t ifk_capabilities;
	uint32_t ifk_capenable;
	uint32_t ifk_output_sched_model;
	uint32_t ifk_rand1;
	uint32_t ifk_rand2;
};

/* Flow control entry per interface */
struct ifnet_fc_entry {
	RB_ENTRY(ifnet_fc_entry) ifce_entry;
	u_int32_t ifce_flowhash;
	struct ifnet *ifce_ifp;
};

static uint32_t ifnet_calc_flowhash(struct ifnet *);
static int ifce_cmp(const struct ifnet_fc_entry *,
    const struct ifnet_fc_entry *);
static int ifnet_fc_add(struct ifnet *);
static struct ifnet_fc_entry *ifnet_fc_get(u_int32_t);
static void ifnet_fc_entry_free(struct ifnet_fc_entry *);

/* protected by ifnet_fc_lock */
RB_HEAD(ifnet_fc_tree, ifnet_fc_entry) ifnet_fc_tree;
RB_PROTOTYPE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);
RB_GENERATE(ifnet_fc_tree, ifnet_fc_entry, ifce_entry, ifce_cmp);

static KALLOC_TYPE_DEFINE(ifnet_fc_zone, struct ifnet_fc_entry, NET_KT_DEFAULT);

extern void bpfdetach(struct ifnet *);
extern void proto_input_run(void);

extern uint32_t udp_count_opportunistic(unsigned int ifindex,
    u_int32_t flags);
extern uint32_t tcp_count_opportunistic(unsigned int ifindex,
    u_int32_t flags);

__private_extern__ void link_rtrequest(int, struct rtentry *, struct sockaddr *);

#if CONFIG_MACF
#if !XNU_TARGET_OS_OSX
int dlil_lladdr_ckreq = 1;
#else /* XNU_TARGET_OS_OSX */
int dlil_lladdr_ckreq = 0;
#endif /* XNU_TARGET_OS_OSX */
#endif /* CONFIG_MACF */

#if DEBUG
int dlil_verbose = 1;
#else
int dlil_verbose = 0;
#endif /* DEBUG */
#if IFNET_INPUT_SANITY_CHK
/* sanity checking of input packet lists received */
static u_int32_t dlil_input_sanity_check = 0;
#endif /* IFNET_INPUT_SANITY_CHK */
/* rate limit debug messages */
struct timespec dlil_dbgrate = { .tv_sec = 1, .tv_nsec = 0 };

SYSCTL_DECL(_net_link_generic_system);

SYSCTL_INT(_net_link_generic_system, OID_AUTO, dlil_verbose,
    CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_verbose, 0, "Log DLIL error messages");

#define IF_SNDQ_MINLEN 32
u_int32_t if_sndq_maxlen = IFQ_MAXLEN;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, sndq_maxlen,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sndq_maxlen, IFQ_MAXLEN,
    sysctl_sndq_maxlen, "I", "Default transmit queue max length");

#define IF_RCVQ_MINLEN 32
#define IF_RCVQ_MAXLEN 256
u_int32_t if_rcvq_maxlen = IF_RCVQ_MAXLEN;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_maxlen,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_maxlen, IFQ_MAXLEN,
    sysctl_rcvq_maxlen, "I", "Default receive queue max length");

/*
 * Protect against possible memory starvation that may happen
 * when the driver is pushing data faster than the AP can process.
 *
 * If at any point during DLIL input phase any of the input queues
 * exceeds the burst limit, DLIL will start to trim the queue,
 * by returning mbufs in the input queue to the cache from which
 * the mbufs were originally allocated, starting from the oldest
 * mbuf and continuing until the new limit (see below) is reached.
 *
 * In order to avoid a lockstep equilibrium, the trimming
 * will continue PAST the burst limit, until the corresponding
 * input queue is reduced to `if_rcvq_trim_pct' %.
 *
 * For example, if the input queue limit is 1024 packets,
 * and the trim percentage (`if_rcvq_trim_pct') is 80 %,
 * the trimming will continue until the queue contains 819 packets
 * (1024 * 80 / 100 == 819).
 *
 * Setting the burst limit too low can hurt the throughput,
 * while setting the burst limit too high can defeat the purpose.
 */
#define IF_RCVQ_BURST_LIMIT_MIN 1024
#define IF_RCVQ_BURST_LIMIT_DEFAULT 8192
#define IF_RCVQ_BURST_LIMIT_MAX 32768
uint32_t if_rcvq_burst_limit = IF_RCVQ_BURST_LIMIT_DEFAULT;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_burst_limit,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_burst_limit, IF_RCVQ_BURST_LIMIT_DEFAULT,
    sysctl_rcvq_burst_limit, "I", "Upper memory limit for inbound data");

#define IF_RCVQ_TRIM_PCT_MIN 20
#define IF_RCVQ_TRIM_PCT_DEFAULT 80
#define IF_RCVQ_TRIM_PCT_MAX 100
uint32_t if_rcvq_trim_pct = IF_RCVQ_TRIM_PCT_DEFAULT;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rcvq_trim_pct,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rcvq_trim_pct, IF_RCVQ_TRIM_PCT_DEFAULT,
    sysctl_rcvq_trim_pct, "I",
    "Percentage (0 - 100) of the queue limit to keep after detecting an overflow burst");

636
637#define IF_RXPOLL_DECAY 2 /* ilog2 of EWMA decay rate (4) */
638u_int32_t if_rxpoll_decay = IF_RXPOLL_DECAY;
639SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_decay,
640 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_decay, IF_RXPOLL_DECAY,
641 "ilog2 of EWMA decay rate of avg inbound packets");
642
643#define IF_RXPOLL_MODE_HOLDTIME_MIN (10ULL * 1000 * 1000) /* 10 ms */
644#define IF_RXPOLL_MODE_HOLDTIME (1000ULL * 1000 * 1000) /* 1 sec */
645static u_int64_t if_rxpoll_mode_holdtime = IF_RXPOLL_MODE_HOLDTIME;
646SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_freeze_time,
647 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_mode_holdtime,
648 IF_RXPOLL_MODE_HOLDTIME, sysctl_rxpoll_mode_holdtime,
649 "Q", "input poll mode freeze time");
650
651#define IF_RXPOLL_SAMPLETIME_MIN (1ULL * 1000 * 1000) /* 1 ms */
652#define IF_RXPOLL_SAMPLETIME (10ULL * 1000 * 1000) /* 10 ms */
653static u_int64_t if_rxpoll_sample_holdtime = IF_RXPOLL_SAMPLETIME;
654SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_sample_time,
655 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_sample_holdtime,
656 IF_RXPOLL_SAMPLETIME, sysctl_rxpoll_sample_holdtime,
657 "Q", "input poll sampling time");
658
659static u_int64_t if_rxpoll_interval_time = IF_RXPOLL_INTERVALTIME;
660SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_interval_time,
661 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_time,
662 IF_RXPOLL_INTERVALTIME, sysctl_rxpoll_interval_time,
663 "Q", "input poll interval (time)");
664
665#define IF_RXPOLL_INTERVAL_PKTS 0 /* 0 (disabled) */
666u_int32_t if_rxpoll_interval_pkts = IF_RXPOLL_INTERVAL_PKTS;
667SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_interval_pkts,
668 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_interval_pkts,
669 IF_RXPOLL_INTERVAL_PKTS, "input poll interval (packets)");
670
671#define IF_RXPOLL_WLOWAT 10
672static u_int32_t if_sysctl_rxpoll_wlowat = IF_RXPOLL_WLOWAT;
673SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_lowat,
674 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_wlowat,
675 IF_RXPOLL_WLOWAT, sysctl_rxpoll_wlowat,
676 "I", "input poll wakeup low watermark");
677
678#define IF_RXPOLL_WHIWAT 100
679static u_int32_t if_sysctl_rxpoll_whiwat = IF_RXPOLL_WHIWAT;
680SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll_wakeups_hiwat,
681 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_sysctl_rxpoll_whiwat,
682 IF_RXPOLL_WHIWAT, sysctl_rxpoll_whiwat,
683 "I", "input poll wakeup high watermark");
684
685static u_int32_t if_rxpoll_max = 0; /* 0 (automatic) */
686SYSCTL_UINT(_net_link_generic_system, OID_AUTO, rxpoll_max,
687 CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll_max, 0,
688 "max packets per poll call");
689
690u_int32_t if_rxpoll = 1;
691SYSCTL_PROC(_net_link_generic_system, OID_AUTO, rxpoll,
692 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &if_rxpoll, 0,
693 sysctl_rxpoll, "I", "enable opportunistic input polling");
694
695#if TEST_INPUT_THREAD_TERMINATION
696static u_int32_t if_input_thread_termination_spin = 0;
697SYSCTL_PROC(_net_link_generic_system, OID_AUTO, input_thread_termination_spin,
698 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
699 &if_input_thread_termination_spin, 0,
700 sysctl_input_thread_termination_spin,
701 "I", "input thread termination spin limit");
702#endif /* TEST_INPUT_THREAD_TERMINATION */
703
704static u_int32_t cur_dlil_input_threads = 0;
705SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_threads,
706 CTLFLAG_RD | CTLFLAG_LOCKED, &cur_dlil_input_threads, 0,
707 "Current number of DLIL input threads");
708
709#if IFNET_INPUT_SANITY_CHK
710SYSCTL_UINT(_net_link_generic_system, OID_AUTO, dlil_input_sanity_check,
711 CTLFLAG_RW | CTLFLAG_LOCKED, &dlil_input_sanity_check, 0,
712 "Turn on sanity checking in DLIL input");
713#endif /* IFNET_INPUT_SANITY_CHK */
714
715static u_int32_t if_flowadv = 1;
716SYSCTL_UINT(_net_link_generic_system, OID_AUTO, flow_advisory,
717 CTLFLAG_RW | CTLFLAG_LOCKED, &if_flowadv, 1,
718 "enable flow-advisory mechanism");
719
720static u_int32_t if_delaybased_queue = 1;
721SYSCTL_UINT(_net_link_generic_system, OID_AUTO, delaybased_queue,
722 CTLFLAG_RW | CTLFLAG_LOCKED, &if_delaybased_queue, 1,
723 "enable delay based dynamic queue sizing");
724
725static uint64_t hwcksum_in_invalidated = 0;
726SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
727 hwcksum_in_invalidated, CTLFLAG_RD | CTLFLAG_LOCKED,
728 &hwcksum_in_invalidated, "inbound packets with invalidated hardware cksum");
729
730uint32_t hwcksum_dbg = 0;
731SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_dbg,
732 CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg, 0,
733 "enable hardware cksum debugging");
734
735u_int32_t ifnet_start_delayed = 0;
736SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delayed,
737 CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_start_delayed, 0,
738 "number of times start was delayed");
739
u_int32_t ifnet_delay_start_disabled = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, start_delay_disabled,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ifnet_delay_start_disabled, 0,
    "number of times delayed start was disabled");

static inline void
ifnet_delay_start_disabled_increment(void)
{
	OSIncrementAtomic(&ifnet_delay_start_disabled);
}

#define HWCKSUM_DBG_PARTIAL_FORCED 0x1 /* forced partial checksum */
#define HWCKSUM_DBG_PARTIAL_RXOFF_ADJ 0x2 /* adjust start offset */
#define HWCKSUM_DBG_FINALIZE_FORCED 0x10 /* forced finalize */
#define HWCKSUM_DBG_MASK \
    (HWCKSUM_DBG_PARTIAL_FORCED | HWCKSUM_DBG_PARTIAL_RXOFF_ADJ | \
    HWCKSUM_DBG_FINALIZE_FORCED)

static uint32_t hwcksum_dbg_mode = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_mode,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_mode,
    0, sysctl_hwcksum_dbg_mode, "I", "hardware cksum debugging mode");

static uint64_t hwcksum_dbg_partial_forced = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced, "packets forced using partial cksum");

static uint64_t hwcksum_dbg_partial_forced_bytes = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_forced_bytes, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_forced_bytes, "bytes forced using partial cksum");

static uint32_t hwcksum_dbg_partial_rxoff_forced = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_partial_rxoff_forced, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    &hwcksum_dbg_partial_rxoff_forced, 0,
    sysctl_hwcksum_dbg_partial_rxoff_forced, "I",
    "forced partial cksum rx offset");

static uint32_t hwcksum_dbg_partial_rxoff_adj = 0;
SYSCTL_PROC(_net_link_generic_system, OID_AUTO, hwcksum_dbg_partial_rxoff_adj,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_dbg_partial_rxoff_adj,
    0, sysctl_hwcksum_dbg_partial_rxoff_adj, "I",
    "adjusted partial cksum rx offset");

static uint64_t hwcksum_dbg_verified = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_verified, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_verified, "packets verified for having good checksum");

static uint64_t hwcksum_dbg_bad_cksum = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_cksum, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_cksum, "packets with bad hardware calculated checksum");

static uint64_t hwcksum_dbg_bad_rxoff = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_bad_rxoff, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_bad_rxoff, "packets with invalid rxoff");

static uint64_t hwcksum_dbg_adjusted = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_adjusted, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_adjusted, "packets with rxoff adjusted");

static uint64_t hwcksum_dbg_finalized_hdr = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_hdr, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_hdr, "finalized headers");

static uint64_t hwcksum_dbg_finalized_data = 0;
SYSCTL_QUAD(_net_link_generic_system, OID_AUTO,
    hwcksum_dbg_finalized_data, CTLFLAG_RD | CTLFLAG_LOCKED,
    &hwcksum_dbg_finalized_data, "finalized payloads");

uint32_t hwcksum_tx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_tx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_tx, 0,
    "enable transmit hardware checksum offload");

uint32_t hwcksum_rx = 1;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, hwcksum_rx,
    CTLFLAG_RW | CTLFLAG_LOCKED, &hwcksum_rx, 0,
    "enable receive hardware checksum offload");

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, tx_chain_len_stats,
    CTLFLAG_RD | CTLFLAG_LOCKED, 0, 9,
    sysctl_tx_chain_len_stats, "S", "");

uint32_t tx_chain_len_count = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, tx_chain_len_count,
    CTLFLAG_RW | CTLFLAG_LOCKED, &tx_chain_len_count, 0, "");

static uint32_t threshold_notify = 1; /* enable/disable */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_notify,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_notify, 0, "");

static uint32_t threshold_interval = 2; /* in seconds */
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, threshold_interval,
    CTLFLAG_RW | CTLFLAG_LOCKED, &threshold_interval, 0, "");

#if (DEVELOPMENT || DEBUG)
static int sysctl_get_kao_frames SYSCTL_HANDLER_ARGS;
SYSCTL_NODE(_net_link_generic_system, OID_AUTO, get_kao_frames,
    CTLFLAG_RD | CTLFLAG_LOCKED, sysctl_get_kao_frames, "");
#endif /* DEVELOPMENT || DEBUG */

struct net_api_stats net_api_stats;
SYSCTL_STRUCT(_net, OID_AUTO, api_stats, CTLFLAG_RD | CTLFLAG_LOCKED,
    &net_api_stats, net_api_stats, "");

uint32_t net_wake_pkt_debug = 0;
SYSCTL_UINT(_net_link_generic_system, OID_AUTO, wake_pkt_debug,
    CTLFLAG_RW | CTLFLAG_LOCKED, &net_wake_pkt_debug, 0, "");

static void log_hexdump(void *data, size_t len);

unsigned int net_rxpoll = 1;
unsigned int net_affinity = 1;
unsigned int net_async = 1; /* 0: synchronous, 1: asynchronous */

static kern_return_t dlil_affinity_set(struct thread *, u_int32_t);

extern u_int32_t inject_buckets;

/* DLIL data threshold thread call */
static void dlil_dt_tcall_fn(thread_call_param_t, thread_call_param_t);

void
ifnet_filter_update_tso(struct ifnet *ifp, boolean_t filter_enable)
{
	/*
	 * Update the filter count and route_generation ID to let TCP
	 * know it should reevaluate doing TSO or not.
	 */
	if (filter_enable) {
		OSAddAtomic(1, &ifp->if_flt_no_tso_count);
	} else {
		VERIFY(ifp->if_flt_no_tso_count != 0);
		OSAddAtomic(-1, &ifp->if_flt_no_tso_count);
	}
	routegenid_update();
}

#if SKYWALK

#if defined(XNU_TARGET_OS_OSX)
static bool net_check_compatible_if_filter(struct ifnet *ifp);
#endif /* XNU_TARGET_OS_OSX */

/* if_attach_nx flags defined in os_skywalk_private.h */
static unsigned int if_attach_nx = IF_ATTACH_NX_DEFAULT;
unsigned int if_enable_fsw_ip_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
unsigned int if_enable_fsw_transport_netagent =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);

unsigned int if_netif_all =
    ((IF_ATTACH_NX_DEFAULT & IF_ATTACH_NX_NETIF_ALL) != 0);

/* Configure flowswitch to use max mtu sized buffer */
static bool fsw_use_max_mtu_buffer = false;

#if (DEVELOPMENT || DEBUG)
static int
if_attach_nx_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error = sysctl_io_number(req, if_attach_nx, sizeof(if_attach_nx),
	    &new_value, &changed);
	if (error) {
		return error;
	}
	if (changed) {
		if ((new_value & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) !=
		    (if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT)) {
			return ENOTSUP;
		}
		if_attach_nx = new_value;
	}
	return 0;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, if_attach_nx,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_attach_nx_sysctl, "IU", "attach nexus");

#endif /* DEVELOPMENT || DEBUG */

static int
if_enable_fsw_transport_netagent_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg1, arg2)
	unsigned int new_value;
	int changed;
	int error;

	error = sysctl_io_number(req, if_enable_fsw_transport_netagent,
	    sizeof(if_enable_fsw_transport_netagent),
	    &new_value, &changed);
	if (error == 0 && changed != 0) {
		if (new_value != 0 && new_value != 1) {
			/* only allow 0 or 1 */
			error = EINVAL;
		} else if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
			/* netagent can be enabled/disabled */
			if_enable_fsw_transport_netagent = new_value;
			if (new_value == 0) {
				kern_nexus_deregister_netagents();
			} else {
				kern_nexus_register_netagents();
			}
		} else {
			/* netagent can't be enabled */
			error = ENOTSUP;
		}
	}
	return error;
}

SYSCTL_PROC(_net_link_generic_system, OID_AUTO, enable_netagent,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
    0, 0, &if_enable_fsw_transport_netagent_sysctl, "IU",
    "enable flowswitch netagent");

static void dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw);

#include <skywalk/os_skywalk_private.h>

boolean_t
ifnet_nx_noauto(ifnet_t ifp)
{
	return (ifp->if_xflags & IFXF_NX_NOAUTO) != 0;
}

boolean_t
ifnet_nx_noauto_flowswitch(ifnet_t ifp)
{
	return ifnet_is_low_latency(ifp);
}

boolean_t
ifnet_is_low_latency(ifnet_t ifp)
{
	return (ifp->if_xflags & IFXF_LOW_LATENCY) != 0;
}

boolean_t
ifnet_needs_compat(ifnet_t ifp)
{
	if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
		return FALSE;
	}
#if !XNU_TARGET_OS_OSX
	/*
	 * To conserve memory, we plumb in the compat layer selectively; this
	 * can be overridden via if_attach_nx flag IF_ATTACH_NX_NETIF_ALL.
	 * In particular, we check for Wi-Fi Access Point.
	 */
	if (IFNET_IS_WIFI(ifp)) {
		/* Wi-Fi Access Point */
		if (ifp->if_name[0] == 'a' && ifp->if_name[1] == 'p' &&
		    ifp->if_name[2] == '\0') {
			return if_netif_all;
		}
	}
#else /* XNU_TARGET_OS_OSX */
#pragma unused(ifp)
#endif /* XNU_TARGET_OS_OSX */
	return TRUE;
}

boolean_t
ifnet_needs_fsw_transport_netagent(ifnet_t ifp)
{
	if (if_is_fsw_transport_netagent_enabled()) {
		/* check if netagent has been manually enabled for ipsec/utun */
		if (ifp->if_family == IFNET_FAMILY_IPSEC) {
			return ipsec_interface_needs_netagent(ifp);
		} else if (ifp->if_family == IFNET_FAMILY_UTUN) {
			return utun_interface_needs_netagent(ifp);
		}

		/* check ifnet no auto nexus override */
		if (ifnet_nx_noauto(ifp)) {
			return FALSE;
		}

		/* check global if_attach_nx configuration */
		switch (ifp->if_family) {
		case IFNET_FAMILY_CELLULAR:
		case IFNET_FAMILY_ETHERNET:
			if ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) {
				return TRUE;
			}
			break;
		default:
			break;
		}
	}
	return FALSE;
}

boolean_t
ifnet_needs_fsw_ip_netagent(ifnet_t ifp)
{
#pragma unused(ifp)
	if ((if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0) {
		return TRUE;
	}
	return FALSE;
}

boolean_t
ifnet_needs_netif_netagent(ifnet_t ifp)
{
#pragma unused(ifp)
	return (if_attach_nx & IF_ATTACH_NX_NETIF_NETAGENT) != 0;
}

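/*
 * Detach a single nexus instance: detach the device port first (when one is
 * present), then free the provider instance. Returns TRUE if an instance was
 * present, FALSE if there was nothing to detach.
 */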
static boolean_t
dlil_detach_nexus_instance(nexus_controller_t controller,
    const char *func_str, uuid_t instance, uuid_t device)
{
	errno_t err;

	if (instance == NULL || uuid_is_null(instance)) {
		return FALSE;
	}

	/* followed by the device port */
	if (device != NULL && !uuid_is_null(device)) {
		err = kern_nexus_ifdetach(controller, instance, device);
		if (err != 0) {
			DLIL_PRINTF("%s kern_nexus_ifdetach device failed %d\n",
			    func_str, err);
		}
	}
	err = kern_nexus_controller_free_provider_instance(controller,
	    instance);
	if (err != 0) {
		DLIL_PRINTF("%s free_provider_instance failed %d\n",
		    func_str, err);
	}
	return TRUE;
}

static boolean_t
dlil_detach_nexus(const char *func_str, uuid_t provider, uuid_t instance,
    uuid_t device)
{
	boolean_t detached = FALSE;
	nexus_controller_t controller = kern_nexus_shared_controller();
	int err;

	if (dlil_detach_nexus_instance(controller, func_str, instance,
	    device)) {
		detached = TRUE;
	}
	if (provider != NULL && !uuid_is_null(provider)) {
		detached = TRUE;
		err = kern_nexus_controller_deregister_provider(controller,
		    provider);
		if (err != 0) {
			DLIL_PRINTF("%s deregister_provider %d\n",
			    func_str, err);
		}
	}
	return detached;
}

static errno_t
dlil_create_provider_and_instance(nexus_controller_t controller,
    nexus_type_t type, ifnet_t ifp, uuid_t *provider, uuid_t *instance,
    nexus_attr_t attr)
{
	uuid_t dom_prov;
	errno_t err;
	nexus_name_t provider_name;
	const char *type_name =
	    (type == NEXUS_TYPE_NET_IF) ? "netif" : "flowswitch";
	struct kern_nexus_init init;

	err = kern_nexus_get_default_domain_provider(type, &dom_prov);
	if (err != 0) {
		DLIL_PRINTF("%s can't get %s provider, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}

	snprintf((char *)provider_name, sizeof(provider_name),
	    "com.apple.%s.%s", type_name, if_name(ifp));
	err = kern_nexus_controller_register_provider(controller,
	    dom_prov,
	    provider_name,
	    NULL,
	    0,
	    attr,
	    provider);
	if (err != 0) {
		DLIL_PRINTF("%s register %s provider failed, error %d\n",
		    __func__, type_name, err);
		goto failed;
	}
	bzero(&init, sizeof(init));
	init.nxi_version = KERN_NEXUS_CURRENT_VERSION;
	err = kern_nexus_controller_alloc_provider_instance(controller,
	    *provider,
	    NULL, NULL,
	    instance, &init);
	if (err != 0) {
		DLIL_PRINTF("%s alloc_provider_instance %s failed, %d\n",
		    __func__, type_name, err);
		kern_nexus_controller_deregister_provider(controller,
		    *provider);
		goto failed;
	}
failed:
	return err;
}

static boolean_t
dlil_attach_netif_nexus_common(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err;

	if ((ifp->if_capabilities & IFCAP_SKYWALK) != 0) {
		/* it's already attached */
		if (dlil_verbose) {
			DLIL_PRINTF("%s: %s already has nexus attached\n",
			    __func__, if_name(ifp));
			/* already attached */
		}
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_IFINDEX, ifp->if_index);
	VERIFY(err == 0);

	controller = kern_nexus_shared_controller();

	/* create the netif provider and instance */
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_NET_IF, ifp, &netif_nx->if_nif_provider,
	    &netif_nx->if_nif_instance, attr);
	if (err != 0) {
		goto failed;
	}
	err = kern_nexus_ifattach(controller, netif_nx->if_nif_instance,
	    ifp, NULL, FALSE, &netif_nx->if_nif_attach);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach %d\n",
		    __func__, err);
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, netif_nx->if_nif_provider,
		    netif_nx->if_nif_instance, NULL);
		goto failed;
	}
	return TRUE;

failed:
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}

static boolean_t
dlil_attach_netif_compat_nexus(ifnet_t ifp, if_nexus_netif_t netif_nx)
{
	if (ifnet_nx_noauto(ifp) || IFNET_IS_INTCOPROC(ifp) ||
	    IFNET_IS_MANAGEMENT(ifp) || IFNET_IS_VMNET(ifp)) {
		goto failed;
	}
	switch (ifp->if_type) {
	case IFT_CELLULAR:
	case IFT_ETHER:
		if ((if_attach_nx & IF_ATTACH_NX_NETIF_COMPAT) == 0) {
			/* don't auto-attach */
			goto failed;
		}
		break;
	default:
		/* don't auto-attach */
		goto failed;
	}
	return dlil_attach_netif_nexus_common(ifp, netif_nx);

failed:
	return FALSE;
}

static boolean_t
dlil_is_native_netif_nexus(ifnet_t ifp)
{
	return (ifp->if_eflags & IFEF_SKYWALK_NATIVE) && ifp->if_na != NULL;
}

__attribute__((noinline))
static void
dlil_detach_netif_nexus(if_nexus_netif_t nexus_netif)
{
	dlil_detach_nexus(__func__, nexus_netif->if_nif_provider,
	    nexus_netif->if_nif_instance, nexus_netif->if_nif_attach);
}

static inline int
dlil_siocgifdevmtu(struct ifnet * ifp, struct ifdevmtu * ifdm_p)
{
	struct ifreq ifr;
	int error;

	bzero(&ifr, sizeof(ifr));
	error = ifnet_ioctl(ifp, 0, SIOCGIFDEVMTU, &ifr);
	if (error == 0) {
		*ifdm_p = ifr.ifr_devmtu;
	}
	return error;
}

static inline void
_dlil_adjust_large_buf_size_for_tso(ifnet_t ifp, uint32_t *large_buf_size)
{
#ifdef XNU_TARGET_OS_OSX
	uint32_t tso_v4_mtu = 0;
	uint32_t tso_v6_mtu = 0;

	if (!dlil_is_native_netif_nexus(ifp)) {
		return;
	}
	/*
	 * Note that we are reading the real hwassist flags set by the driver
	 * and not the adjusted ones because nx_netif_host_adjust_if_capabilities()
	 * hasn't been called yet.
	 */
	if ((ifp->if_hwassist & IFNET_TSO_IPV4) != 0) {
		tso_v4_mtu = ifp->if_tso_v4_mtu;
	}
	if ((ifp->if_hwassist & IFNET_TSO_IPV6) != 0) {
		tso_v6_mtu = ifp->if_tso_v6_mtu;
	}
	/*
	 * If the hardware supports TSO, adjust the large buf size to match the
	 * supported TSO MTU size.
	 */
	if (tso_v4_mtu != 0 || tso_v6_mtu != 0) {
		*large_buf_size = MAX(tso_v4_mtu, tso_v6_mtu);
	} else {
		*large_buf_size = MAX(*large_buf_size, sk_fsw_gso_mtu);
	}
	*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, *large_buf_size);
#else
#pragma unused(ifp, large_buf_size)
#endif /* XNU_TARGET_OS_OSX */
}

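/*
 * Determine the flowswitch buffer sizing for the interface: query the device
 * MTU (and, for a Skywalk-native driver, its packet pool) and decide whether
 * multi-buflet packets and a separate large buffer size are needed.
 */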
static inline int
_dlil_get_flowswitch_buffer_size(ifnet_t ifp, uuid_t netif, uint32_t *buf_size,
    bool *use_multi_buflet, uint32_t *large_buf_size)
{
	struct kern_pbufpool_memory_info rx_pp_info;
	struct kern_pbufpool_memory_info tx_pp_info;
	uint32_t if_max_mtu = 0;
	uint32_t drv_buf_size;
	struct ifdevmtu ifdm;
	int err;

	/*
	 * To perform intra-stack RX aggregation flowswitch needs to use
	 * multi-buflet packet.
	 */
	*use_multi_buflet = NX_FSW_TCP_RX_AGG_ENABLED();

	*large_buf_size = *use_multi_buflet ? NX_FSW_DEF_LARGE_BUFSIZE : 0;
	/*
	 * IP over Thunderbolt interface can deliver the largest IP packet,
	 * but the driver advertises the MAX MTU as only 9K.
	 */
	if (IFNET_IS_THUNDERBOLT_IP(ifp)) {
		if_max_mtu = IP_MAXPACKET;
		goto skip_mtu_ioctl;
	}

	/* determine max mtu */
	bzero(&ifdm, sizeof(ifdm));
	err = dlil_siocgifdevmtu(ifp, &ifdm);
	if (__improbable(err != 0)) {
		DLIL_PRINTF("%s: SIOCGIFDEVMTU failed for %s\n",
		    __func__, if_name(ifp));
		/* use default flowswitch buffer size */
		if_max_mtu = NX_FSW_BUFSIZE;
	} else {
		DLIL_PRINTF("%s: %s %d %d\n", __func__, if_name(ifp),
		    ifdm.ifdm_max, ifdm.ifdm_current);
		/* rdar://problem/44589731 */
		if_max_mtu = MAX(ifdm.ifdm_max, ifdm.ifdm_current);
	}

skip_mtu_ioctl:
	if (if_max_mtu == 0) {
		DLIL_PRINTF("%s: can't determine MAX MTU for %s\n",
		    __func__, if_name(ifp));
		return EINVAL;
	}
	if ((if_max_mtu > NX_FSW_MAXBUFSIZE) && fsw_use_max_mtu_buffer) {
		DLIL_PRINTF("%s: interface (%s) has MAX MTU (%u) > flowswitch "
		    "max bufsize(%d)\n", __func__,
		    if_name(ifp), if_max_mtu, NX_FSW_MAXBUFSIZE);
		return EINVAL;
	}

	/*
	 * for skywalk native driver, consult the driver packet pool also.
	 */
	if (dlil_is_native_netif_nexus(ifp)) {
		err = kern_nexus_get_pbufpool_info(netif, &rx_pp_info,
		    &tx_pp_info);
		if (err != 0) {
			DLIL_PRINTF("%s: can't get pbufpool info for %s\n",
			    __func__, if_name(ifp));
			return ENXIO;
		}
		drv_buf_size = tx_pp_info.kpm_bufsize *
		    tx_pp_info.kpm_max_frags;
		if (if_max_mtu > drv_buf_size) {
			DLIL_PRINTF("%s: interface %s packet pool (rx %d * %d, "
			    "tx %d * %d) can't support max mtu(%d)\n", __func__,
			    if_name(ifp), rx_pp_info.kpm_bufsize,
			    rx_pp_info.kpm_max_frags, tx_pp_info.kpm_bufsize,
			    tx_pp_info.kpm_max_frags, if_max_mtu);
			return EINVAL;
		}
	} else {
		drv_buf_size = if_max_mtu;
	}

	if ((drv_buf_size > NX_FSW_BUFSIZE) && (!fsw_use_max_mtu_buffer)) {
		_CASSERT((NX_FSW_BUFSIZE * NX_PBUF_FRAGS_MAX) >= IP_MAXPACKET);
		*use_multi_buflet = true;
		/* default flowswitch buffer size */
		*buf_size = NX_FSW_BUFSIZE;
		*large_buf_size = MIN(NX_FSW_MAX_LARGE_BUFSIZE, drv_buf_size);
	} else {
		*buf_size = MAX(drv_buf_size, NX_FSW_BUFSIZE);
	}
	_dlil_adjust_large_buf_size_for_tso(ifp, large_buf_size);
	ASSERT(*buf_size <= NX_FSW_MAXBUFSIZE);
	if (*buf_size >= *large_buf_size) {
		*large_buf_size = 0;
	}
	return 0;
}

static boolean_t
_dlil_attach_flowswitch_nexus(ifnet_t ifp, if_nexus_flowswitch_t nexus_fsw)
{
	nexus_attr_t attr = NULL;
	nexus_controller_t controller;
	errno_t err = 0;
	uuid_t netif;
	uint32_t buf_size = 0;
	uint32_t large_buf_size = 0;
	bool multi_buflet;

	if (ifnet_nx_noauto(ifp) || ifnet_nx_noauto_flowswitch(ifp) ||
	    IFNET_IS_VMNET(ifp)) {
		goto failed;
	}

	if ((ifp->if_capabilities & IFCAP_SKYWALK) == 0) {
		/* not possible to attach (netif native/compat not plumbed) */
		goto failed;
	}

	if ((if_attach_nx & IF_ATTACH_NX_FLOWSWITCH) == 0) {
		/* don't auto-attach */
		goto failed;
	}

	/* get the netif instance from the ifp */
	err = kern_nexus_get_netif_instance(ifp, netif);
	if (err != 0) {
		DLIL_PRINTF("%s: can't find netif for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = kern_nexus_attr_create(&attr);
	if (err != 0) {
		DLIL_PRINTF("%s: nexus attr create for %s\n", __func__,
		    if_name(ifp));
		goto failed;
	}

	err = _dlil_get_flowswitch_buffer_size(ifp, netif, &buf_size,
	    &multi_buflet, &large_buf_size);
	if (err != 0) {
		goto failed;
	}
	ASSERT((buf_size >= NX_FSW_BUFSIZE) && (buf_size <= NX_FSW_MAXBUFSIZE));
	ASSERT(large_buf_size <= NX_FSW_MAX_LARGE_BUFSIZE);

	/* Configure flowswitch buffer size */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_SLOT_BUF_SIZE, buf_size);
	VERIFY(err == 0);
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_LARGE_BUF_SIZE,
	    large_buf_size);
	VERIFY(err == 0);

	/*
	 * Configure flowswitch to use super-packet (multi-buflet).
	 */
	err = kern_nexus_attr_set(attr, NEXUS_ATTR_MAX_FRAGS,
	    multi_buflet ? NX_PBUF_FRAGS_MAX : 1);
	VERIFY(err == 0);

	/* create the flowswitch provider and instance */
	controller = kern_nexus_shared_controller();
	err = dlil_create_provider_and_instance(controller,
	    NEXUS_TYPE_FLOW_SWITCH, ifp, &nexus_fsw->if_fsw_provider,
	    &nexus_fsw->if_fsw_instance, attr);
	if (err != 0) {
		goto failed;
	}

	/* attach the device port */
	err = kern_nexus_ifattach(controller, nexus_fsw->if_fsw_instance,
	    NULL, netif, FALSE, &nexus_fsw->if_fsw_device);
	if (err != 0) {
		DLIL_PRINTF("%s kern_nexus_ifattach device failed %d %s\n",
		    __func__, err, if_name(ifp));
		/* cleanup provider and instance */
		dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
		    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
		goto failed;
	}
	return TRUE;

failed:
	if (err != 0) {
		DLIL_PRINTF("%s: failed to attach flowswitch to %s, error %d\n",
		    __func__, if_name(ifp), err);
	} else {
		DLIL_PRINTF("%s: not attaching flowswitch to %s\n",
		    __func__, if_name(ifp));
	}
	if (attr != NULL) {
		kern_nexus_attr_destroy(attr);
	}
	return FALSE;
}

static boolean_t
dlil_attach_flowswitch_nexus(ifnet_t ifp)
{
	boolean_t attached;
	if_nexus_flowswitch nexus_fsw;

#if (DEVELOPMENT || DEBUG)
	if (skywalk_netif_direct_allowed(if_name(ifp))) {
		DLIL_PRINTF("skip attaching fsw to %s", if_name(ifp));
		return FALSE;
	}
#endif /* (DEVELOPMENT || DEBUG) */

	/*
	 * flowswitch attachment is not supported for interface using the
	 * legacy model (IFNET_INIT_LEGACY)
	 */
	if ((ifp->if_eflags & IFEF_TXSTART) == 0) {
		DLIL_PRINTF("skip attaching fsw to %s using legacy TX model",
		    if_name(ifp));
		return FALSE;
	}

	if (uuid_is_null(ifp->if_nx_flowswitch.if_fsw_instance) == 0) {
		/* it's already attached */
		return FALSE;
	}
	bzero(&nexus_fsw, sizeof(nexus_fsw));
	attached = _dlil_attach_flowswitch_nexus(ifp, &nexus_fsw);
	if (attached) {
		ifnet_lock_exclusive(ifp);
		if (!IF_FULLY_ATTACHED(ifp)) {
			/* interface is going away */
			attached = FALSE;
		} else {
			ifp->if_nx_flowswitch = nexus_fsw;
		}
		ifnet_lock_done(ifp);
		if (!attached) {
			/* clean up flowswitch nexus */
			dlil_detach_flowswitch_nexus(&nexus_fsw);
		}
	}
	return attached;
}

__attribute__((noinline))
static void
dlil_detach_flowswitch_nexus(if_nexus_flowswitch_t nexus_fsw)
{
	dlil_detach_nexus(__func__, nexus_fsw->if_fsw_provider,
	    nexus_fsw->if_fsw_instance, nexus_fsw->if_fsw_device);
}

__attribute__((noinline))
static void
dlil_netif_detach_notify(ifnet_t ifp)
{
	ifnet_detach_notify_cb_t notify = NULL;
	void *arg = NULL;

	ifnet_get_detach_notify(ifp, &notify, &arg);
	if (notify == NULL) {
		DTRACE_SKYWALK1(no__notify, ifnet_t, ifp);
		return;
	}
	(*notify)(arg);
}

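/*
 * Suspend and drain data movement on the interface, tear down any attached
 * flowswitch and netif nexuses, then resume data movement.
 */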
1572__attribute__((noinline))
1573static void
1574dlil_quiesce_and_detach_nexuses(ifnet_t ifp)
1575{
1576 if_nexus_flowswitch *nx_fsw = &ifp->if_nx_flowswitch;
1577 if_nexus_netif *nx_netif = &ifp->if_nx_netif;
1578
1579 ifnet_datamov_suspend_and_drain(ifp);
1580 if (!uuid_is_null(uu: nx_fsw->if_fsw_device)) {
1581 ASSERT(!uuid_is_null(nx_fsw->if_fsw_provider));
1582 ASSERT(!uuid_is_null(nx_fsw->if_fsw_instance));
1583 dlil_detach_flowswitch_nexus(nexus_fsw: nx_fsw);
1584 bzero(s: nx_fsw, n: sizeof(*nx_fsw));
1585 } else {
1586 ASSERT(uuid_is_null(nx_fsw->if_fsw_provider));
1587 ASSERT(uuid_is_null(nx_fsw->if_fsw_instance));
1588 DTRACE_IP1(fsw__not__attached, ifnet_t, ifp);
1589 }
1590
1591 if (!uuid_is_null(uu: nx_netif->if_nif_attach)) {
1592 ASSERT(!uuid_is_null(nx_netif->if_nif_provider));
1593 ASSERT(!uuid_is_null(nx_netif->if_nif_instance));
1594 dlil_detach_netif_nexus(nexus_netif: nx_netif);
1595 bzero(s: nx_netif, n: sizeof(*nx_netif));
1596 } else {
1597 ASSERT(uuid_is_null(nx_netif->if_nif_provider));
1598 ASSERT(uuid_is_null(nx_netif->if_nif_instance));
1599 DTRACE_IP1(netif__not__attached, ifnet_t, ifp);
1600 }
1601 ifnet_datamov_resume(ifp);
1602}
1603
1604boolean_t
1605ifnet_add_netagent(ifnet_t ifp)
1606{
1607 int error;
1608
1609 error = kern_nexus_interface_add_netagent(ifp);
1610 os_log(OS_LOG_DEFAULT,
1611 "kern_nexus_interface_add_netagent(%s) returned %d",
1612 ifp->if_xname, error);
1613 return error == 0;
1614}
1615
1616boolean_t
1617ifnet_remove_netagent(ifnet_t ifp)
1618{
1619 int error;
1620
1621 error = kern_nexus_interface_remove_netagent(ifp);
1622 os_log(OS_LOG_DEFAULT,
1623 "kern_nexus_interface_remove_netagent(%s) returned %d",
1624 ifp->if_xname, error);
1625 return error == 0;
1626}
1627
1628boolean_t
1629ifnet_attach_flowswitch_nexus(ifnet_t ifp)
1630{
1631 if (!IF_FULLY_ATTACHED(ifp)) {
1632 return FALSE;
1633 }
1634 return dlil_attach_flowswitch_nexus(ifp);
1635}
1636
1637boolean_t
1638ifnet_detach_flowswitch_nexus(ifnet_t ifp)
1639{
1640 if_nexus_flowswitch nexus_fsw;
1641
1642 ifnet_lock_exclusive(ifp);
1643 nexus_fsw = ifp->if_nx_flowswitch;
1644 bzero(s: &ifp->if_nx_flowswitch, n: sizeof(ifp->if_nx_flowswitch));
1645 ifnet_lock_done(ifp);
1646 return dlil_detach_nexus(func_str: __func__, provider: nexus_fsw.if_fsw_provider,
1647 instance: nexus_fsw.if_fsw_instance, device: nexus_fsw.if_fsw_device);
1648}
1649
1650boolean_t
1651ifnet_attach_netif_nexus(ifnet_t ifp)
1652{
1653 boolean_t nexus_attached;
1654 if_nexus_netif nexus_netif;
1655
1656 if (!IF_FULLY_ATTACHED(ifp)) {
1657 return FALSE;
1658 }
1659 nexus_attached = dlil_attach_netif_nexus_common(ifp, netif_nx: &nexus_netif);
1660 if (nexus_attached) {
1661 ifnet_lock_exclusive(ifp);
1662 ifp->if_nx_netif = nexus_netif;
1663 ifnet_lock_done(ifp);
1664 }
1665 return nexus_attached;
1666}
1667
1668boolean_t
1669ifnet_detach_netif_nexus(ifnet_t ifp)
1670{
1671 if_nexus_netif nexus_netif;
1672
1673 ifnet_lock_exclusive(ifp);
1674 nexus_netif = ifp->if_nx_netif;
1675 bzero(s: &ifp->if_nx_netif, n: sizeof(ifp->if_nx_netif));
1676 ifnet_lock_done(ifp);
1677
1678 return dlil_detach_nexus(func_str: __func__, provider: nexus_netif.if_nif_provider,
1679 instance: nexus_netif.if_nif_instance, device: nexus_netif.if_nif_attach);
1680}
1681
1682void
1683ifnet_attach_native_flowswitch(ifnet_t ifp)
1684{
1685 if (!dlil_is_native_netif_nexus(ifp)) {
1686 /* not a native netif */
1687 return;
1688 }
1689 ifnet_attach_flowswitch_nexus(ifp);
1690}
1691
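/*
 * Flowswitch RX callback accessors.  The getter returns the installed
 * callback/argument pair and takes a reference on it; callers are
 * expected to drop that reference with
 * ifnet_release_flowswitch_rx_callback() once the invocation is done.
 * The setter waits (under if_delegate_lock) for all outstanding
 * references to drain before installing a new callback, so a callback
 * is never replaced while still in use.  The delegate parent accessors
 * further below follow the same acquire/release pattern.
 */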
1692int
1693ifnet_set_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t cb, void *arg)
1694{
1695 lck_mtx_lock(lck: &ifp->if_delegate_lock);
1696 while (ifp->if_fsw_rx_cb_ref > 0) {
1697 DTRACE_SKYWALK1(wait__fsw, ifnet_t, ifp);
1698 (void) msleep(chan: &ifp->if_fsw_rx_cb_ref, mtx: &ifp->if_delegate_lock,
1699 pri: (PZERO + 1), wmesg: __FUNCTION__, NULL);
1700 DTRACE_SKYWALK1(wake__fsw, ifnet_t, ifp);
1701 }
1702 ifp->if_fsw_rx_cb = cb;
1703 ifp->if_fsw_rx_cb_arg = arg;
1704 lck_mtx_unlock(lck: &ifp->if_delegate_lock);
1705 return 0;
1706}
1707
1708int
1709ifnet_get_flowswitch_rx_callback(ifnet_t ifp, ifnet_fsw_rx_cb_t *cbp, void **argp)
1710{
1711 /*
1712	 * Avoid the unnecessary lock acquisition for interfaces that are
1713	 * not used by a redirect interface.
1714 */
1715 if (ifp->if_fsw_rx_cb == NULL) {
1716 return ENOENT;
1717 }
1718 lck_mtx_lock(lck: &ifp->if_delegate_lock);
1719 if (ifp->if_fsw_rx_cb == NULL) {
1720 lck_mtx_unlock(lck: &ifp->if_delegate_lock);
1721 return ENOENT;
1722 }
1723 *cbp = ifp->if_fsw_rx_cb;
1724 *argp = ifp->if_fsw_rx_cb_arg;
1725 ifp->if_fsw_rx_cb_ref++;
1726 lck_mtx_unlock(lck: &ifp->if_delegate_lock);
1727 return 0;
1728}
1729
1730void
1731ifnet_release_flowswitch_rx_callback(ifnet_t ifp)
1732{
1733 lck_mtx_lock(lck: &ifp->if_delegate_lock);
1734 if (--ifp->if_fsw_rx_cb_ref == 0) {
1735 wakeup(chan: &ifp->if_fsw_rx_cb_ref);
1736 }
1737 lck_mtx_unlock(lck: &ifp->if_delegate_lock);
1738}
1739
1740int
1741ifnet_set_delegate_parent(ifnet_t difp, ifnet_t parent)
1742{
1743 lck_mtx_lock(lck: &difp->if_delegate_lock);
1744 while (difp->if_delegate_parent_ref > 0) {
1745 DTRACE_SKYWALK1(wait__parent, ifnet_t, difp);
1746 (void) msleep(chan: &difp->if_delegate_parent_ref, mtx: &difp->if_delegate_lock,
1747 pri: (PZERO + 1), wmesg: __FUNCTION__, NULL);
1748 DTRACE_SKYWALK1(wake__parent, ifnet_t, difp);
1749 }
1750 difp->if_delegate_parent = parent;
1751 lck_mtx_unlock(lck: &difp->if_delegate_lock);
1752 return 0;
1753}
1754
1755int
1756ifnet_get_delegate_parent(ifnet_t difp, ifnet_t *parentp)
1757{
1758 lck_mtx_lock(lck: &difp->if_delegate_lock);
1759 if (difp->if_delegate_parent == NULL) {
1760 lck_mtx_unlock(lck: &difp->if_delegate_lock);
1761 return ENOENT;
1762 }
1763 *parentp = difp->if_delegate_parent;
1764 difp->if_delegate_parent_ref++;
1765 lck_mtx_unlock(lck: &difp->if_delegate_lock);
1766 return 0;
1767}
1768
1769void
1770ifnet_release_delegate_parent(ifnet_t difp)
1771{
1772 lck_mtx_lock(lck: &difp->if_delegate_lock);
1773 if (--difp->if_delegate_parent_ref == 0) {
1774 wakeup(chan: &difp->if_delegate_parent_ref);
1775 }
1776 lck_mtx_unlock(lck: &difp->if_delegate_lock);
1777}
1778
1779__attribute__((noinline))
1780void
1781ifnet_set_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
1782{
1783 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
1784 ifp->if_detach_notify = notify;
1785 ifp->if_detach_notify_arg = arg;
1786}
1787
1788__attribute__((noinline))
1789void
1790ifnet_get_detach_notify_locked(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
1791{
1792 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_EXCLUSIVE);
1793 *notifyp = ifp->if_detach_notify;
1794 *argp = ifp->if_detach_notify_arg;
1795}
1796
1797__attribute__((noinline))
1798void
1799ifnet_set_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t notify, void *arg)
1800{
1801 ifnet_lock_exclusive(ifp);
1802 ifnet_set_detach_notify_locked(ifp, notify, arg);
1803 ifnet_lock_done(ifp);
1804}
1805
1806__attribute__((noinline))
1807void
1808ifnet_get_detach_notify(ifnet_t ifp, ifnet_detach_notify_cb_t *notifyp, void **argp)
1809{
1810 ifnet_lock_exclusive(ifp);
1811 ifnet_get_detach_notify_locked(ifp, notifyp, argp);
1812 ifnet_lock_done(ifp);
1813}
1814#endif /* SKYWALK */
1815
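/*
 * Sanity-check an inbound mbuf before processing it: the mbuf must
 * carry a packet header, its receive interface must be set, and that
 * interface must match `ifp' (the rcvif match is waived for lo_ifp,
 * whose queue can carry packets received on other interfaces).  Any
 * violation is fatal.
 */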
1816#define DLIL_INPUT_CHECK(m, ifp) { \
1817 struct ifnet *_rcvif = mbuf_pkthdr_rcvif(m); \
1818 if (_rcvif == NULL || (ifp != lo_ifp && _rcvif != ifp) || \
1819 !(mbuf_flags(m) & MBUF_PKTHDR)) { \
1820 panic_plain("%s: invalid mbuf %p\n", __func__, m); \
1821 /* NOTREACHED */ \
1822 } \
1823}
1824
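/*
 * Exponentially weighted moving average with a power-of-2 decay:
 *
 *	old <- old + (new - old) / 2^decay	(integer arithmetic)
 *
 * computed as ((old << decay) - old + new) >> decay so that the
 * intermediate value never goes negative; a zero `old' is simply
 * seeded with `new'.  For example, old = 100, new = 200, decay = 2
 * yields (400 - 100 + 200) >> 2 = 125, i.e. 100 + (200 - 100) / 4.
 */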
1825#define DLIL_EWMA(old, new, decay) do { \
1826 u_int32_t _avg; \
1827 if ((_avg = (old)) > 0) \
1828 _avg = (((_avg << (decay)) - _avg) + (new)) >> (decay); \
1829 else \
1830 _avg = (new); \
1831 (old) = _avg; \
1832} while (0)
1833
1834#define MBPS (1ULL * 1000 * 1000)
1835#define GBPS (MBPS * 1000)
1836
1837struct rxpoll_time_tbl {
1838 u_int64_t speed; /* downlink speed */
1839 u_int32_t plowat; /* packets low watermark */
1840 u_int32_t phiwat; /* packets high watermark */
1841 u_int32_t blowat; /* bytes low watermark */
1842 u_int32_t bhiwat; /* bytes high watermark */
1843};
1844
1845static struct rxpoll_time_tbl rxpoll_tbl[] = {
1846 { .speed = 10 * MBPS, .plowat = 2, .phiwat = 8, .blowat = (1 * 1024), .bhiwat = (6 * 1024) },
1847 { .speed = 100 * MBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
1848 { .speed = 1 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
1849 { .speed = 10 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
1850 { .speed = 100 * GBPS, .plowat = 10, .phiwat = 40, .blowat = (4 * 1024), .bhiwat = (64 * 1024) },
1851 { .speed = 0, .plowat = 0, .phiwat = 0, .blowat = 0, .bhiwat = 0 }
1852};
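/*
 * The watermarks above are chosen according to the interface's
 * downlink speed whenever the rxpoll parameters are (re)computed;
 * the all-zero row terminates the table.  Note that only the 10 Mbps
 * row currently differs; every faster rate shares the same packet
 * and byte watermarks.
 */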
1853
1854static LCK_MTX_DECLARE_ATTR(dlil_thread_sync_lock, &dlil_lock_group,
1855 &dlil_lck_attributes);
1856static uint32_t dlil_pending_thread_cnt = 0;
1857
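/*
 * dlil_init() bumps this count before creating each dlil kernel
 * thread and then sleeps until the count drains back to zero; every
 * newly created thread calls dlil_decr_pending_thread_count() once it
 * has been scheduled and has left its embryonic state, so dlil_init()
 * proceeds only after each thread has run at least once.
 */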
1858static void
1859dlil_incr_pending_thread_count(void)
1860{
1861 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1862 lck_mtx_lock(lck: &dlil_thread_sync_lock);
1863 dlil_pending_thread_cnt++;
1864 lck_mtx_unlock(lck: &dlil_thread_sync_lock);
1865}
1866
1867static void
1868dlil_decr_pending_thread_count(void)
1869{
1870 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_MTX_ASSERT_NOTOWNED);
1871 lck_mtx_lock(lck: &dlil_thread_sync_lock);
1872 VERIFY(dlil_pending_thread_cnt > 0);
1873 dlil_pending_thread_cnt--;
1874 if (dlil_pending_thread_cnt == 0) {
1875 wakeup(chan: &dlil_pending_thread_cnt);
1876 }
1877 lck_mtx_unlock(lck: &dlil_thread_sync_lock);
1878}
1879
1880int
1881proto_hash_value(u_int32_t protocol_family)
1882{
1883 /*
1884 * dlil_proto_unplumb_all() depends on the mapping between
1885 * the hash bucket index and the protocol family defined
1886 * here; future changes must be applied there as well.
1887 */
1888 switch (protocol_family) {
1889 case PF_INET:
1890 return 0;
1891 case PF_INET6:
1892 return 1;
1893 case PF_VLAN:
1894 return 2;
1895 case PF_UNSPEC:
1896 default:
1897 return 3;
1898 }
1899}
1900
1901/*
1902 * Caller must already be holding ifnet lock.
1903 */
1904static struct if_proto *
1905find_attached_proto(struct ifnet *ifp, u_int32_t protocol_family)
1906{
1907 struct if_proto *proto = NULL;
1908 u_int32_t i = proto_hash_value(protocol_family);
1909
1910 ifnet_lock_assert(ifp, IFNET_LCK_ASSERT_OWNED);
1911
1912 if (ifp->if_proto_hash != NULL) {
1913 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
1914 }
1915
1916 while (proto != NULL && proto->protocol_family != protocol_family) {
1917 proto = SLIST_NEXT(proto, next_hash);
1918 }
1919
1920 if (proto != NULL) {
1921 if_proto_ref(proto);
1922 }
1923
1924 return proto;
1925}
1926
1927static void
1928if_proto_ref(struct if_proto *proto)
1929{
1930 os_atomic_inc(&proto->refcount, relaxed);
1931}
1932
1933extern void if_rtproto_del(struct ifnet *ifp, int protocol);
1934
1935static void
1936if_proto_free(struct if_proto *proto)
1937{
1938 u_int32_t oldval;
1939 struct ifnet *ifp = proto->ifp;
1940 u_int32_t proto_family = proto->protocol_family;
1941 struct kev_dl_proto_data ev_pr_data;
1942
1943 oldval = os_atomic_dec_orig(&proto->refcount, relaxed);
1944 if (oldval > 1) {
1945 return;
1946 }
1947
1948 if (proto->proto_kpi == kProtoKPI_v1) {
1949 if (proto->kpi.v1.detached) {
1950 proto->kpi.v1.detached(ifp, proto->protocol_family);
1951 }
1952 }
1953 if (proto->proto_kpi == kProtoKPI_v2) {
1954 if (proto->kpi.v2.detached) {
1955 proto->kpi.v2.detached(ifp, proto->protocol_family);
1956 }
1957 }
1958
1959 /*
1960	 * Clean up routes that may still be in the routing table for that
1961 * interface/protocol pair.
1962 */
1963 if_rtproto_del(ifp, protocol: proto_family);
1964
1965 ifnet_lock_shared(ifp);
1966
1967	/* No more references on this; the protocol must have been detached */
1968 VERIFY(proto->detached);
1969
1970 /*
1971	 * The reserved field carries the number of protocols still attached
1972 * (subject to change)
1973 */
1974 ev_pr_data.proto_family = proto_family;
1975 ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, list_count: 0);
1976
1977 ifnet_lock_done(ifp);
1978
1979 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_DETACHED,
1980 (struct net_event_data *)&ev_pr_data,
1981 sizeof(struct kev_dl_proto_data), FALSE);
1982
1983 if (ev_pr_data.proto_remaining_count == 0) {
1984 /*
1985 * The protocol count has gone to zero, mark the interface down.
1986 * This used to be done by configd.KernelEventMonitor, but that
1987 * is inherently prone to races (rdar://problem/30810208).
1988 */
1989 (void) ifnet_set_flags(interface: ifp, new_flags: 0, IFF_UP);
1990 (void) ifnet_ioctl(interface: ifp, protocol: 0, SIOCSIFFLAGS, NULL);
1991 dlil_post_sifflags_msg(ifp);
1992 }
1993
1994 zfree(dlif_proto_zone, proto);
1995}
1996
1997__private_extern__ void
1998ifnet_lock_assert(struct ifnet *ifp, ifnet_lock_assert_t what)
1999{
2000#if !MACH_ASSERT
2001#pragma unused(ifp)
2002#endif
2003 unsigned int type = 0;
2004 int ass = 1;
2005
2006 switch (what) {
2007 case IFNET_LCK_ASSERT_EXCLUSIVE:
2008 type = LCK_RW_ASSERT_EXCLUSIVE;
2009 break;
2010
2011 case IFNET_LCK_ASSERT_SHARED:
2012 type = LCK_RW_ASSERT_SHARED;
2013 break;
2014
2015 case IFNET_LCK_ASSERT_OWNED:
2016 type = LCK_RW_ASSERT_HELD;
2017 break;
2018
2019 case IFNET_LCK_ASSERT_NOTOWNED:
2020 /* nothing to do here for RW lock; bypass assert */
2021 ass = 0;
2022 break;
2023
2024 default:
2025 panic("bad ifnet assert type: %d", what);
2026 /* NOTREACHED */
2027 }
2028 if (ass) {
2029 LCK_RW_ASSERT(&ifp->if_lock, type);
2030 }
2031}
2032
2033__private_extern__ void
2034ifnet_lock_shared(struct ifnet *ifp)
2035{
2036 lck_rw_lock_shared(lck: &ifp->if_lock);
2037}
2038
2039__private_extern__ void
2040ifnet_lock_exclusive(struct ifnet *ifp)
2041{
2042 lck_rw_lock_exclusive(lck: &ifp->if_lock);
2043}
2044
2045__private_extern__ void
2046ifnet_lock_done(struct ifnet *ifp)
2047{
2048 lck_rw_done(lck: &ifp->if_lock);
2049}
2050
2051#if INET
2052__private_extern__ void
2053if_inetdata_lock_shared(struct ifnet *ifp)
2054{
2055 lck_rw_lock_shared(lck: &ifp->if_inetdata_lock);
2056}
2057
2058__private_extern__ void
2059if_inetdata_lock_exclusive(struct ifnet *ifp)
2060{
2061 lck_rw_lock_exclusive(lck: &ifp->if_inetdata_lock);
2062}
2063
2064__private_extern__ void
2065if_inetdata_lock_done(struct ifnet *ifp)
2066{
2067 lck_rw_done(lck: &ifp->if_inetdata_lock);
2068}
2069#endif /* INET */
2070
2071__private_extern__ void
2072if_inet6data_lock_shared(struct ifnet *ifp)
2073{
2074 lck_rw_lock_shared(lck: &ifp->if_inet6data_lock);
2075}
2076
2077__private_extern__ void
2078if_inet6data_lock_exclusive(struct ifnet *ifp)
2079{
2080 lck_rw_lock_exclusive(lck: &ifp->if_inet6data_lock);
2081}
2082
2083__private_extern__ void
2084if_inet6data_lock_done(struct ifnet *ifp)
2085{
2086 lck_rw_done(lck: &ifp->if_inet6data_lock);
2087}
2088
2089__private_extern__ void
2090ifnet_head_lock_shared(void)
2091{
2092 lck_rw_lock_shared(lck: &ifnet_head_lock);
2093}
2094
2095__private_extern__ void
2096ifnet_head_lock_exclusive(void)
2097{
2098 lck_rw_lock_exclusive(lck: &ifnet_head_lock);
2099}
2100
2101__private_extern__ void
2102ifnet_head_done(void)
2103{
2104 lck_rw_done(lck: &ifnet_head_lock);
2105}
2106
2107__private_extern__ void
2108ifnet_head_assert_exclusive(void)
2109{
2110 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_EXCLUSIVE);
2111}
2112
2113/*
2114 * dlil_ifp_protolist
2115 * - get the list of protocols attached to the interface, or just the number
2116 * of attached protocols
2117 * - if the number returned is greater than 'list_count', truncation occurred
2118 *
2119 * Note:
2120 * - caller must already be holding ifnet lock.
2121 */
2122static u_int32_t
2123dlil_ifp_protolist(struct ifnet *ifp, protocol_family_t *list,
2124 u_int32_t list_count)
2125{
2126 u_int32_t count = 0;
2127 int i;
2128
2129 ifnet_lock_assert(ifp, what: IFNET_LCK_ASSERT_OWNED);
2130
2131 if (ifp->if_proto_hash == NULL) {
2132 goto done;
2133 }
2134
2135 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
2136 struct if_proto *proto;
2137 SLIST_FOREACH(proto, &ifp->if_proto_hash[i], next_hash) {
2138 if (list != NULL && count < list_count) {
2139 list[count] = proto->protocol_family;
2140 }
2141 count++;
2142 }
2143 }
2144done:
2145 return count;
2146}
2147
2148__private_extern__ u_int32_t
2149if_get_protolist(struct ifnet * ifp, u_int32_t *protolist, u_int32_t count)
2150{
2151 ifnet_lock_shared(ifp);
2152 count = dlil_ifp_protolist(ifp, list: protolist, list_count: count);
2153 ifnet_lock_done(ifp);
2154 return count;
2155}
2156
2157__private_extern__ void
2158if_free_protolist(u_int32_t *list)
2159{
2160 kfree_data_addr(list);
2161}
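
/*
 * A typical caller of the two routines above first sizes the list and
 * then fetches it.  Minimal sketch; the kalloc_data() allocation here
 * is an assumption, chosen only to pair with the kfree_data_addr()
 * performed by if_free_protolist():
 *
 *	u_int32_t count = if_get_protolist(ifp, NULL, 0);
 *	u_int32_t *list = kalloc_data(count * sizeof(*list), Z_WAITOK);
 *	count = if_get_protolist(ifp, list, count);
 *	...
 *	if_free_protolist(list);
 */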
2162
2163__private_extern__ int
2164dlil_post_msg(struct ifnet *ifp, u_int32_t event_subclass,
2165 u_int32_t event_code, struct net_event_data *event_data,
2166 u_int32_t event_data_len, boolean_t suppress_generation)
2167{
2168 struct net_event_data ev_data;
2169 struct kev_msg ev_msg;
2170
2171 bzero(s: &ev_msg, n: sizeof(ev_msg));
2172 bzero(s: &ev_data, n: sizeof(ev_data));
2173 /*
2174	 * A net event always starts with a net_event_data structure,
2175	 * but the caller can either generate a simple net event or
2176	 * provide a longer event structure to post.
2177 */
2178 ev_msg.vendor_code = KEV_VENDOR_APPLE;
2179 ev_msg.kev_class = KEV_NETWORK_CLASS;
2180 ev_msg.kev_subclass = event_subclass;
2181 ev_msg.event_code = event_code;
2182
2183 if (event_data == NULL) {
2184 event_data = &ev_data;
2185 event_data_len = sizeof(struct net_event_data);
2186 }
2187
2188 strlcpy(dst: &event_data->if_name[0], src: ifp->if_name, IFNAMSIZ);
2189 event_data->if_family = ifp->if_family;
2190 event_data->if_unit = (u_int32_t)ifp->if_unit;
2191
2192 ev_msg.dv[0].data_length = event_data_len;
2193 ev_msg.dv[0].data_ptr = event_data;
2194 ev_msg.dv[1].data_length = 0;
2195
2196 bool update_generation = true;
2197 if (event_subclass == KEV_DL_SUBCLASS) {
2198 /* Don't update interface generation for frequent link quality and state changes */
2199 switch (event_code) {
2200 case KEV_DL_LINK_QUALITY_METRIC_CHANGED:
2201 case KEV_DL_RRC_STATE_CHANGED:
2202 case KEV_DL_PRIMARY_ELECTED:
2203 update_generation = false;
2204 break;
2205 default:
2206 break;
2207 }
2208 }
2209
2210 /*
2211	 * Some events that would normally update the generation count
2212	 * may want to suppress it.
2213	 * One example is node presence/absence, where we still issue
2214	 * the kernel event for the invocation but want to avoid the
2215	 * expensive generation update, which triggers NECP client
2216	 * updates.
2217 */
2218 if (suppress_generation) {
2219 update_generation = false;
2220 }
2221
2222 return dlil_event_internal(ifp, msg: &ev_msg, update_generation);
2223}
2224
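/*
 * The per-interface TCP/UDP stats objects below are carved out of
 * oversized zone elements so that the stats structure itself always
 * sits on a 64-bit boundary, regardless of where the zone element
 * starts:
 *
 *	buf                       base (64-bit aligned)
 *	|                         |
 *	v                         v
 *	+---------+---------------+---------------------------+
 *	| slack   | original buf  | 64-bit aligned stats      |
 *	|         | pointer       | structure                 |
 *	+---------+---------------+---------------------------+
 *
 * The saved pointer in the slot just before `base' is what is handed
 * back to zfree() when the stats are released.
 */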
2225__private_extern__ int
2226dlil_alloc_local_stats(struct ifnet *ifp)
2227{
2228 int ret = EINVAL;
2229 void *buf, *base, **pbuf;
2230
2231 if (ifp == NULL) {
2232 goto end;
2233 }
2234
2235 if (ifp->if_tcp_stat == NULL && ifp->if_udp_stat == NULL) {
2236 /* allocate tcpstat_local structure */
2237 buf = zalloc_flags(dlif_tcpstat_zone,
2238 Z_WAITOK | Z_ZERO | Z_NOFAIL);
2239
2240 /* Get the 64-bit aligned base address for this object */
2241 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
2242 sizeof(u_int64_t));
2243 VERIFY(((intptr_t)base + dlif_tcpstat_size) <=
2244 ((intptr_t)buf + dlif_tcpstat_bufsize));
2245
2246 /*
2247 * Wind back a pointer size from the aligned base and
2248 * save the original address so we can free it later.
2249 */
2250 pbuf = (void **)((intptr_t)base - sizeof(void *));
2251 *pbuf = buf;
2252 ifp->if_tcp_stat = base;
2253
2254 /* allocate udpstat_local structure */
2255 buf = zalloc_flags(dlif_udpstat_zone,
2256 Z_WAITOK | Z_ZERO | Z_NOFAIL);
2257
2258 /* Get the 64-bit aligned base address for this object */
2259 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
2260 sizeof(u_int64_t));
2261 VERIFY(((intptr_t)base + dlif_udpstat_size) <=
2262 ((intptr_t)buf + dlif_udpstat_bufsize));
2263
2264 /*
2265 * Wind back a pointer size from the aligned base and
2266 * save the original address so we can free it later.
2267 */
2268 pbuf = (void **)((intptr_t)base - sizeof(void *));
2269 *pbuf = buf;
2270 ifp->if_udp_stat = base;
2271
2272 VERIFY(IS_P2ALIGNED(ifp->if_tcp_stat, sizeof(u_int64_t)) &&
2273 IS_P2ALIGNED(ifp->if_udp_stat, sizeof(u_int64_t)));
2274
2275 ret = 0;
2276 }
2277
2278 if (ifp->if_ipv4_stat == NULL) {
2279 ifp->if_ipv4_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
2280 }
2281
2282 if (ifp->if_ipv6_stat == NULL) {
2283 ifp->if_ipv6_stat = kalloc_type(struct if_tcp_ecn_stat, Z_WAITOK | Z_ZERO);
2284 }
2285end:
2286 if (ifp != NULL && ret != 0) {
2287 if (ifp->if_tcp_stat != NULL) {
2288 pbuf = (void **)
2289 ((intptr_t)ifp->if_tcp_stat - sizeof(void *));
2290 zfree(dlif_tcpstat_zone, *pbuf);
2291 ifp->if_tcp_stat = NULL;
2292 }
2293 if (ifp->if_udp_stat != NULL) {
2294 pbuf = (void **)
2295 ((intptr_t)ifp->if_udp_stat - sizeof(void *));
2296 zfree(dlif_udpstat_zone, *pbuf);
2297 ifp->if_udp_stat = NULL;
2298 }
2299 /* The macro kfree_type sets the passed pointer to NULL */
2300 if (ifp->if_ipv4_stat != NULL) {
2301 kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv4_stat);
2302 }
2303 if (ifp->if_ipv6_stat != NULL) {
2304 kfree_type(struct if_tcp_ecn_stat, ifp->if_ipv6_stat);
2305 }
2306 }
2307
2308 return ret;
2309}
2310
2311static void
2312dlil_reset_rxpoll_params(ifnet_t ifp)
2313{
2314 ASSERT(ifp != NULL);
2315 ifnet_set_poll_cycle(ifp, NULL);
2316 ifp->if_poll_update = 0;
2317 ifp->if_poll_flags = 0;
2318 ifp->if_poll_req = 0;
2319 ifp->if_poll_mode = IFNET_MODEL_INPUT_POLL_OFF;
2320 bzero(s: &ifp->if_poll_tstats, n: sizeof(ifp->if_poll_tstats));
2321 bzero(s: &ifp->if_poll_pstats, n: sizeof(ifp->if_poll_pstats));
2322 bzero(s: &ifp->if_poll_sstats, n: sizeof(ifp->if_poll_sstats));
2323 net_timerclear(&ifp->if_poll_mode_holdtime);
2324 net_timerclear(&ifp->if_poll_mode_lasttime);
2325 net_timerclear(&ifp->if_poll_sample_holdtime);
2326 net_timerclear(&ifp->if_poll_sample_lasttime);
2327 net_timerclear(&ifp->if_poll_dbg_lasttime);
2328}
2329
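/*
 * Pick the input strategy for an interface and, when one is needed,
 * spawn its dedicated input thread.  The cases below are evaluated
 * in order:
 *
 *	ifp == NULL (dlil_init time)                main input thread
 *	net_rxpoll && IFEF_RXPOLL && IFXF_LEGACY    legacy hybrid-polling thread
 *	net_async || IFXF_LEGACY                    asynchronous per-interface thread
 *	otherwise (native netif, no legacy model)   synchronous strategy; no
 *	                                            thread is created and ENODEV
 *	                                            is returned to the caller
 */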
2330static int
2331dlil_create_input_thread(ifnet_t ifp, struct dlil_threading_info *inp,
2332 thread_continue_t *thfunc)
2333{
2334 boolean_t dlil_rxpoll_input;
2335 thread_continue_t func = NULL;
2336 u_int32_t limit;
2337 int error = 0;
2338
2339 dlil_rxpoll_input = (ifp != NULL && net_rxpoll &&
2340 (ifp->if_eflags & IFEF_RXPOLL) && (ifp->if_xflags & IFXF_LEGACY));
2341
2342 /* default strategy utilizes the DLIL worker thread */
2343 inp->dlth_strategy = dlil_input_async;
2344
2345 /* NULL ifp indicates the main input thread, called at dlil_init time */
2346 if (ifp == NULL) {
2347 /*
2348 * Main input thread only.
2349 */
2350 func = dlil_main_input_thread_func;
2351 VERIFY(inp == dlil_main_input_thread);
2352 (void) strlcat(dst: inp->dlth_name,
2353 src: "main_input", DLIL_THREADNAME_LEN);
2354 } else if (dlil_rxpoll_input) {
2355 /*
2356 * Legacy (non-netif) hybrid polling.
2357 */
2358 func = dlil_rxpoll_input_thread_func;
2359 VERIFY(inp != dlil_main_input_thread);
2360 (void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
2361 "%s_input_poll", if_name(ifp));
2362 } else if (net_async || (ifp->if_xflags & IFXF_LEGACY)) {
2363 /*
2364 * Asynchronous strategy.
2365 */
2366 func = dlil_input_thread_func;
2367 VERIFY(inp != dlil_main_input_thread);
2368 (void) snprintf(inp->dlth_name, DLIL_THREADNAME_LEN,
2369 "%s_input", if_name(ifp));
2370 } else {
2371 /*
2372 * Synchronous strategy if there's a netif below and
2373 * the device isn't capable of hybrid polling.
2374 */
2375 ASSERT(func == NULL);
2376 ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
2377 VERIFY(inp != dlil_main_input_thread);
2378 ASSERT(!inp->dlth_affinity);
2379 inp->dlth_strategy = dlil_input_sync;
2380 }
2381 VERIFY(inp->dlth_thread == THREAD_NULL);
2382
2383 /* let caller know */
2384 if (thfunc != NULL) {
2385 *thfunc = func;
2386 }
2387
2388 inp->dlth_lock_grp = lck_grp_alloc_init(grp_name: inp->dlth_name, LCK_GRP_ATTR_NULL);
2389 lck_mtx_init(lck: &inp->dlth_lock, grp: inp->dlth_lock_grp, attr: &dlil_lck_attributes);
2390
2391 inp->dlth_ifp = ifp; /* NULL for main input thread */
2392
2393 /*
2394 * For interfaces that support opportunistic polling, set the
2395 * low and high watermarks for outstanding inbound packets/bytes.
2396 * Also define freeze times for transitioning between modes
2397 * and updating the average.
2398 */
2399 if (ifp != NULL && net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
2400 limit = MAX(if_rcvq_maxlen, IF_RCVQ_MINLEN);
2401 if (ifp->if_xflags & IFXF_LEGACY) {
2402 (void) dlil_rxpoll_set_params(ifp, NULL, FALSE);
2403 }
2404 } else {
2405 /*
2406 * For interfaces that don't support opportunistic
2407 * polling, set the burst limit to prevent memory exhaustion.
2408 * The values of `if_rcvq_burst_limit' are safeguarded
2409 * on customer builds by `sysctl_rcvq_burst_limit'.
2410 */
2411 limit = if_rcvq_burst_limit;
2412 }
2413
2414 _qinit(&inp->dlth_pkts, Q_DROPTAIL, limit, QP_MBUF);
2415 if (inp == dlil_main_input_thread) {
2416 struct dlil_main_threading_info *inpm =
2417 (struct dlil_main_threading_info *)inp;
2418 _qinit(&inpm->lo_rcvq_pkts, Q_DROPTAIL, limit, QP_MBUF);
2419 }
2420
2421 if (func == NULL) {
2422 ASSERT(!(ifp->if_xflags & IFXF_LEGACY));
2423 ASSERT(error == 0);
2424 error = ENODEV;
2425 goto done;
2426 }
2427
2428 error = kernel_thread_start(continuation: func, parameter: inp, new_thread: &inp->dlth_thread);
2429 if (error == KERN_SUCCESS) {
2430 thread_precedence_policy_data_t info;
2431 __unused kern_return_t kret;
2432
2433 bzero(s: &info, n: sizeof(info));
2434 info.importance = 0;
2435 kret = thread_policy_set(thread: inp->dlth_thread,
2436 THREAD_PRECEDENCE_POLICY, policy_info: (thread_policy_t)&info,
2437 THREAD_PRECEDENCE_POLICY_COUNT);
2438 ASSERT(kret == KERN_SUCCESS);
2439 /*
2440 * We create an affinity set so that the matching workloop
2441 * thread or the starter thread (for loopback) can be
2442 * scheduled on the same processor set as the input thread.
2443 */
2444 if (net_affinity) {
2445 struct thread *tp = inp->dlth_thread;
2446 u_int32_t tag;
2447 /*
2448 * Randomize to reduce the probability
2449 * of affinity tag namespace collision.
2450 */
2451 read_frandom(buffer: &tag, numBytes: sizeof(tag));
2452 if (dlil_affinity_set(tp, tag) == KERN_SUCCESS) {
2453 thread_reference(thread: tp);
2454 inp->dlth_affinity_tag = tag;
2455 inp->dlth_affinity = TRUE;
2456 }
2457 }
2458 } else if (inp == dlil_main_input_thread) {
2459 panic_plain("%s: couldn't create main input thread", __func__);
2460 /* NOTREACHED */
2461 } else {
2462 panic_plain("%s: couldn't create %s input thread", __func__,
2463 if_name(ifp));
2464 /* NOTREACHED */
2465 }
2466 OSAddAtomic(1, &cur_dlil_input_threads);
2467
2468done:
2469 return error;
2470}
2471
2472#if TEST_INPUT_THREAD_TERMINATION
2473static int
2474sysctl_input_thread_termination_spin SYSCTL_HANDLER_ARGS
2475{
2476#pragma unused(arg1, arg2)
2477 uint32_t i;
2478 int err;
2479
2480 i = if_input_thread_termination_spin;
2481
2482 err = sysctl_handle_int(oidp, &i, 0, req);
2483 if (err != 0 || req->newptr == USER_ADDR_NULL) {
2484 return err;
2485 }
2486
2487 if (net_rxpoll == 0) {
2488 return ENXIO;
2489 }
2490
2491 if_input_thread_termination_spin = i;
2492 return err;
2493}
2494#endif /* TEST_INPUT_THREAD_TERMINATION */
2495
2496static void
2497dlil_clean_threading_info(struct dlil_threading_info *inp)
2498{
2499 lck_mtx_destroy(lck: &inp->dlth_lock, grp: inp->dlth_lock_grp);
2500 lck_grp_free(grp: inp->dlth_lock_grp);
2501 inp->dlth_lock_grp = NULL;
2502
2503 inp->dlth_flags = 0;
2504 inp->dlth_wtot = 0;
2505 bzero(s: inp->dlth_name, n: sizeof(inp->dlth_name));
2506 inp->dlth_ifp = NULL;
2507 VERIFY(qhead(&inp->dlth_pkts) == NULL && qempty(&inp->dlth_pkts));
2508 qlimit(&inp->dlth_pkts) = 0;
2509 bzero(s: &inp->dlth_stats, n: sizeof(inp->dlth_stats));
2510
2511 VERIFY(!inp->dlth_affinity);
2512 inp->dlth_thread = THREAD_NULL;
2513 inp->dlth_strategy = NULL;
2514 VERIFY(inp->dlth_driver_thread == THREAD_NULL);
2515 VERIFY(inp->dlth_poller_thread == THREAD_NULL);
2516 VERIFY(inp->dlth_affinity_tag == 0);
2517#if IFNET_INPUT_SANITY_CHK
2518 inp->dlth_pkts_cnt = 0;
2519#endif /* IFNET_INPUT_SANITY_CHK */
2520}
2521
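/*
 * Runs on the per-interface input thread that is being torn down:
 * drains any packets still queued, acknowledges the terminate request
 * by setting DLIL_INPUT_TERMINATE_COMPLETE and waking the waiter,
 * drops the reference taken by kernel_thread_start(), and finally
 * terminates the calling thread.
 */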
2522static void
2523dlil_terminate_input_thread(struct dlil_threading_info *inp)
2524{
2525 struct ifnet *ifp = inp->dlth_ifp;
2526 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
2527
2528 VERIFY(current_thread() == inp->dlth_thread);
2529 VERIFY(inp != dlil_main_input_thread);
2530
2531 OSAddAtomic(-1, &cur_dlil_input_threads);
2532
2533#if TEST_INPUT_THREAD_TERMINATION
2534 { /* do something useless that won't get optimized away */
2535 uint32_t v = 1;
2536 for (uint32_t i = 0;
2537 i < if_input_thread_termination_spin;
2538 i++) {
2539 v = (i + 1) * v;
2540 }
2541 DLIL_PRINTF("the value is %d\n", v);
2542 }
2543#endif /* TEST_INPUT_THREAD_TERMINATION */
2544
2545 lck_mtx_lock_spin(lck: &inp->dlth_lock);
2546 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
2547 VERIFY((inp->dlth_flags & DLIL_INPUT_TERMINATE) != 0);
2548 inp->dlth_flags |= DLIL_INPUT_TERMINATE_COMPLETE;
2549 wakeup_one(chan: (caddr_t)&inp->dlth_flags);
2550 lck_mtx_unlock(lck: &inp->dlth_lock);
2551
2552 /* free up pending packets */
2553 if (pkt.cp_mbuf != NULL) {
2554 mbuf_freem_list(mbuf: pkt.cp_mbuf);
2555 }
2556
2557 /* for the extra refcnt from kernel_thread_start() */
2558 thread_deallocate(thread: current_thread());
2559
2560 if (dlil_verbose) {
2561 DLIL_PRINTF("%s: input thread terminated\n",
2562 if_name(ifp));
2563 }
2564
2565 /* this is the end */
2566 thread_terminate(target_act: current_thread());
2567 /* NOTREACHED */
2568}
2569
2570static kern_return_t
2571dlil_affinity_set(struct thread *tp, u_int32_t tag)
2572{
2573 thread_affinity_policy_data_t policy;
2574
2575 bzero(s: &policy, n: sizeof(policy));
2576 policy.affinity_tag = tag;
2577 return thread_policy_set(thread: tp, THREAD_AFFINITY_POLICY,
2578 policy_info: (thread_policy_t)&policy, THREAD_AFFINITY_POLICY_COUNT);
2579}
2580
2581#if SKYWALK && defined(XNU_TARGET_OS_OSX)
2582static void
2583dlil_filter_event(struct eventhandler_entry_arg arg __unused,
2584 enum net_filter_event_subsystems state)
2585{
2586 bool old_if_enable_fsw_transport_netagent = if_enable_fsw_transport_netagent;
2587 if ((state & ~NET_FILTER_EVENT_PF_PRIVATE_PROXY) == 0) {
2588 if_enable_fsw_transport_netagent = 1;
2589 } else {
2590 if_enable_fsw_transport_netagent = 0;
2591 }
2592 if (old_if_enable_fsw_transport_netagent != if_enable_fsw_transport_netagent) {
2593 kern_nexus_update_netagents();
2594 } else if (!if_enable_fsw_transport_netagent) {
2595 necp_update_all_clients();
2596 }
2597}
2598#endif /* SKYWALK && XNU_TARGET_OS_OSX */
2599
2600void
2601dlil_init(void)
2602{
2603 thread_t thread = THREAD_NULL;
2604
2605 /*
2606 * The following fields must be 64-bit aligned for atomic operations.
2607 */
2608 IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2609 IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2610 IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2611 IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2612 IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2613 IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2614 IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2615 IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2616 IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2617 IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2618 IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2619 IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2620 IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2621 IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2622 IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2623
2624 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ipackets);
2625 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ierrors);
2626 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_opackets);
2627 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_oerrors);
2628 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_collisions);
2629 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_ibytes);
2630 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_obytes);
2631 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_imcasts);
2632 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_omcasts);
2633 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_iqdrops);
2634 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_noproto);
2635 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_alignerrs);
2636 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_dt_bytes);
2637 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fpackets);
2638 IFNET_IF_DATA_REQUIRE_ALIGNED_64(ifi_fbytes);
2639
2640 /*
2641 * These IF_HWASSIST_ flags must be equal to their IFNET_* counterparts.
2642 */
2643 _CASSERT(IF_HWASSIST_CSUM_IP == IFNET_CSUM_IP);
2644 _CASSERT(IF_HWASSIST_CSUM_TCP == IFNET_CSUM_TCP);
2645 _CASSERT(IF_HWASSIST_CSUM_UDP == IFNET_CSUM_UDP);
2646 _CASSERT(IF_HWASSIST_CSUM_IP_FRAGS == IFNET_CSUM_FRAGMENT);
2647 _CASSERT(IF_HWASSIST_CSUM_FRAGMENT == IFNET_IP_FRAGMENT);
2648 _CASSERT(IF_HWASSIST_CSUM_TCPIPV6 == IFNET_CSUM_TCPIPV6);
2649 _CASSERT(IF_HWASSIST_CSUM_UDPIPV6 == IFNET_CSUM_UDPIPV6);
2650 _CASSERT(IF_HWASSIST_CSUM_FRAGMENT_IPV6 == IFNET_IPV6_FRAGMENT);
2651 _CASSERT(IF_HWASSIST_CSUM_PARTIAL == IFNET_CSUM_PARTIAL);
2652 _CASSERT(IF_HWASSIST_CSUM_ZERO_INVERT == IFNET_CSUM_ZERO_INVERT);
2653 _CASSERT(IF_HWASSIST_VLAN_TAGGING == IFNET_VLAN_TAGGING);
2654 _CASSERT(IF_HWASSIST_VLAN_MTU == IFNET_VLAN_MTU);
2655 _CASSERT(IF_HWASSIST_TSO_V4 == IFNET_TSO_IPV4);
2656 _CASSERT(IF_HWASSIST_TSO_V6 == IFNET_TSO_IPV6);
2657
2658 /*
2659 * ... as well as the mbuf checksum flags counterparts.
2660 */
2661 _CASSERT(CSUM_IP == IF_HWASSIST_CSUM_IP);
2662 _CASSERT(CSUM_TCP == IF_HWASSIST_CSUM_TCP);
2663 _CASSERT(CSUM_UDP == IF_HWASSIST_CSUM_UDP);
2664 _CASSERT(CSUM_IP_FRAGS == IF_HWASSIST_CSUM_IP_FRAGS);
2665 _CASSERT(CSUM_FRAGMENT == IF_HWASSIST_CSUM_FRAGMENT);
2666 _CASSERT(CSUM_TCPIPV6 == IF_HWASSIST_CSUM_TCPIPV6);
2667 _CASSERT(CSUM_UDPIPV6 == IF_HWASSIST_CSUM_UDPIPV6);
2668 _CASSERT(CSUM_FRAGMENT_IPV6 == IF_HWASSIST_CSUM_FRAGMENT_IPV6);
2669 _CASSERT(CSUM_PARTIAL == IF_HWASSIST_CSUM_PARTIAL);
2670 _CASSERT(CSUM_ZERO_INVERT == IF_HWASSIST_CSUM_ZERO_INVERT);
2671 _CASSERT(CSUM_VLAN_TAG_VALID == IF_HWASSIST_VLAN_TAGGING);
2672
2673 /*
2674 * Make sure we have at least IF_LLREACH_MAXLEN in the llreach info.
2675 */
2676 _CASSERT(IF_LLREACH_MAXLEN <= IF_LLREACHINFO_ADDRLEN);
2677 _CASSERT(IFNET_LLREACHINFO_ADDRLEN == IF_LLREACHINFO_ADDRLEN);
2678
2679 _CASSERT(IFRLOGF_DLIL == IFNET_LOGF_DLIL);
2680 _CASSERT(IFRLOGF_FAMILY == IFNET_LOGF_FAMILY);
2681 _CASSERT(IFRLOGF_DRIVER == IFNET_LOGF_DRIVER);
2682 _CASSERT(IFRLOGF_FIRMWARE == IFNET_LOGF_FIRMWARE);
2683
2684 _CASSERT(IFRLOGCAT_CONNECTIVITY == IFNET_LOGCAT_CONNECTIVITY);
2685 _CASSERT(IFRLOGCAT_QUALITY == IFNET_LOGCAT_QUALITY);
2686 _CASSERT(IFRLOGCAT_PERFORMANCE == IFNET_LOGCAT_PERFORMANCE);
2687
2688 _CASSERT(IFRTYPE_FAMILY_ANY == IFNET_FAMILY_ANY);
2689 _CASSERT(IFRTYPE_FAMILY_LOOPBACK == IFNET_FAMILY_LOOPBACK);
2690 _CASSERT(IFRTYPE_FAMILY_ETHERNET == IFNET_FAMILY_ETHERNET);
2691 _CASSERT(IFRTYPE_FAMILY_SLIP == IFNET_FAMILY_SLIP);
2692 _CASSERT(IFRTYPE_FAMILY_TUN == IFNET_FAMILY_TUN);
2693 _CASSERT(IFRTYPE_FAMILY_VLAN == IFNET_FAMILY_VLAN);
2694 _CASSERT(IFRTYPE_FAMILY_PPP == IFNET_FAMILY_PPP);
2695 _CASSERT(IFRTYPE_FAMILY_PVC == IFNET_FAMILY_PVC);
2696 _CASSERT(IFRTYPE_FAMILY_DISC == IFNET_FAMILY_DISC);
2697 _CASSERT(IFRTYPE_FAMILY_MDECAP == IFNET_FAMILY_MDECAP);
2698 _CASSERT(IFRTYPE_FAMILY_GIF == IFNET_FAMILY_GIF);
2699 _CASSERT(IFRTYPE_FAMILY_FAITH == IFNET_FAMILY_FAITH);
2700 _CASSERT(IFRTYPE_FAMILY_STF == IFNET_FAMILY_STF);
2701 _CASSERT(IFRTYPE_FAMILY_FIREWIRE == IFNET_FAMILY_FIREWIRE);
2702 _CASSERT(IFRTYPE_FAMILY_BOND == IFNET_FAMILY_BOND);
2703 _CASSERT(IFRTYPE_FAMILY_CELLULAR == IFNET_FAMILY_CELLULAR);
2704 _CASSERT(IFRTYPE_FAMILY_UTUN == IFNET_FAMILY_UTUN);
2705 _CASSERT(IFRTYPE_FAMILY_IPSEC == IFNET_FAMILY_IPSEC);
2706
2707 _CASSERT(IFRTYPE_SUBFAMILY_ANY == IFNET_SUBFAMILY_ANY);
2708 _CASSERT(IFRTYPE_SUBFAMILY_USB == IFNET_SUBFAMILY_USB);
2709 _CASSERT(IFRTYPE_SUBFAMILY_BLUETOOTH == IFNET_SUBFAMILY_BLUETOOTH);
2710 _CASSERT(IFRTYPE_SUBFAMILY_WIFI == IFNET_SUBFAMILY_WIFI);
2711 _CASSERT(IFRTYPE_SUBFAMILY_THUNDERBOLT == IFNET_SUBFAMILY_THUNDERBOLT);
2712 _CASSERT(IFRTYPE_SUBFAMILY_RESERVED == IFNET_SUBFAMILY_RESERVED);
2713 _CASSERT(IFRTYPE_SUBFAMILY_INTCOPROC == IFNET_SUBFAMILY_INTCOPROC);
2714 _CASSERT(IFRTYPE_SUBFAMILY_QUICKRELAY == IFNET_SUBFAMILY_QUICKRELAY);
2715 _CASSERT(IFRTYPE_SUBFAMILY_VMNET == IFNET_SUBFAMILY_VMNET);
2716 _CASSERT(IFRTYPE_SUBFAMILY_SIMCELL == IFNET_SUBFAMILY_SIMCELL);
2717 _CASSERT(IFRTYPE_SUBFAMILY_MANAGEMENT == IFNET_SUBFAMILY_MANAGEMENT);
2718
2719 _CASSERT(DLIL_MODIDLEN == IFNET_MODIDLEN);
2720 _CASSERT(DLIL_MODARGLEN == IFNET_MODARGLEN);
2721
2722 PE_parse_boot_argn(arg_string: "net_affinity", arg_ptr: &net_affinity,
2723 max_arg: sizeof(net_affinity));
2724
2725 PE_parse_boot_argn(arg_string: "net_rxpoll", arg_ptr: &net_rxpoll, max_arg: sizeof(net_rxpoll));
2726
2727 PE_parse_boot_argn(arg_string: "net_rtref", arg_ptr: &net_rtref, max_arg: sizeof(net_rtref));
2728
2729 PE_parse_boot_argn(arg_string: "net_async", arg_ptr: &net_async, max_arg: sizeof(net_async));
2730
2731 PE_parse_boot_argn(arg_string: "ifnet_debug", arg_ptr: &ifnet_debug, max_arg: sizeof(ifnet_debug));
2732
2733 VERIFY(dlil_pending_thread_cnt == 0);
2734#if SKYWALK
2735 boolean_t pe_enable_fsw_transport_netagent = FALSE;
2736 boolean_t pe_disable_fsw_transport_netagent = FALSE;
2737 boolean_t enable_fsw_netagent =
2738 (((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0) ||
2739 (if_attach_nx & IF_ATTACH_NX_FSW_IP_NETAGENT) != 0);
2740
2741 /*
2742 * Check the device tree to see if Skywalk netagent has been explicitly
2743 * enabled or disabled. This can be overridden via if_attach_nx below.
2744 * Note that the property is a 0-length key, and so checking for the
2745 * presence itself is enough (no need to check for the actual value of
2746	 * the retrieved variable).
2747 */
2748 pe_enable_fsw_transport_netagent =
2749 PE_get_default(property_name: "kern.skywalk_netagent_enable",
2750 property_ptr: &pe_enable_fsw_transport_netagent,
2751 max_property: sizeof(pe_enable_fsw_transport_netagent));
2752 pe_disable_fsw_transport_netagent =
2753 PE_get_default(property_name: "kern.skywalk_netagent_disable",
2754 property_ptr: &pe_disable_fsw_transport_netagent,
2755 max_property: sizeof(pe_disable_fsw_transport_netagent));
2756
2757 /*
2758	 * These two are mutually exclusive: both can be absent, but at
2759	 * most one can be present at a time, and we assert to make sure
2760	 * that holds.
2761 */
2762 VERIFY((!pe_enable_fsw_transport_netagent &&
2763 !pe_disable_fsw_transport_netagent) ||
2764 (pe_enable_fsw_transport_netagent ^
2765 pe_disable_fsw_transport_netagent));
2766
2767 if (pe_enable_fsw_transport_netagent) {
2768 kprintf(fmt: "SK: netagent is enabled via an override for "
2769 "this platform\n");
2770 if_attach_nx = SKYWALK_NETWORKING_ENABLED;
2771 } else if (pe_disable_fsw_transport_netagent) {
2772 kprintf(fmt: "SK: netagent is disabled via an override for "
2773 "this platform\n");
2774 if_attach_nx = SKYWALK_NETWORKING_DISABLED;
2775 } else {
2776 kprintf(fmt: "SK: netagent is %s by default for this platform\n",
2777 (enable_fsw_netagent ? "enabled" : "disabled"));
2778 if_attach_nx = IF_ATTACH_NX_DEFAULT;
2779 }
2780
2781 /*
2782 * Now see if there's a boot-arg override.
2783 */
2784 (void) PE_parse_boot_argn(arg_string: "if_attach_nx", arg_ptr: &if_attach_nx,
2785 max_arg: sizeof(if_attach_nx));
2786 if_enable_fsw_transport_netagent =
2787 ((if_attach_nx & IF_ATTACH_NX_FSW_TRANSPORT_NETAGENT) != 0);
2788
2789 if_netif_all = ((if_attach_nx & IF_ATTACH_NX_NETIF_ALL) != 0);
2790
2791 if (pe_disable_fsw_transport_netagent &&
2792 if_enable_fsw_transport_netagent) {
2793 kprintf(fmt: "SK: netagent is force-enabled\n");
2794 } else if (!pe_disable_fsw_transport_netagent &&
2795 !if_enable_fsw_transport_netagent) {
2796 kprintf(fmt: "SK: netagent is force-disabled\n");
2797 }
2798#ifdef XNU_TARGET_OS_OSX
2799 if (if_enable_fsw_transport_netagent) {
2800 net_filter_event_register(callback: dlil_filter_event);
2801 }
2802#endif /* XNU_TARGET_OS_OSX */
2803
2804#if (DEVELOPMENT || DEBUG)
2805 (void) PE_parse_boot_argn("fsw_use_max_mtu_buffer",
2806 &fsw_use_max_mtu_buffer, sizeof(fsw_use_max_mtu_buffer));
2807#endif /* (DEVELOPMENT || DEBUG) */
2808
2809#endif /* SKYWALK */
2810 dlif_size = (ifnet_debug == 0) ? sizeof(struct dlil_ifnet) :
2811 sizeof(struct dlil_ifnet_dbg);
2812 /* Enforce 64-bit alignment for dlil_ifnet structure */
2813 dlif_bufsize = dlif_size + sizeof(void *) + sizeof(u_int64_t);
2814 dlif_bufsize = (uint32_t)P2ROUNDUP(dlif_bufsize, sizeof(u_int64_t));
2815 dlif_zone = zone_create(DLIF_ZONE_NAME, size: dlif_bufsize, flags: ZC_ZFREE_CLEARMEM);
2816
2817 dlif_tcpstat_size = sizeof(struct tcpstat_local);
2818 /* Enforce 64-bit alignment for tcpstat_local structure */
2819 dlif_tcpstat_bufsize =
2820 dlif_tcpstat_size + sizeof(void *) + sizeof(u_int64_t);
2821 dlif_tcpstat_bufsize = (uint32_t)
2822 P2ROUNDUP(dlif_tcpstat_bufsize, sizeof(u_int64_t));
2823 dlif_tcpstat_zone = zone_create(DLIF_TCPSTAT_ZONE_NAME,
2824 size: dlif_tcpstat_bufsize, flags: ZC_ZFREE_CLEARMEM);
2825
2826 dlif_udpstat_size = sizeof(struct udpstat_local);
2827 /* Enforce 64-bit alignment for udpstat_local structure */
2828 dlif_udpstat_bufsize =
2829 dlif_udpstat_size + sizeof(void *) + sizeof(u_int64_t);
2830 dlif_udpstat_bufsize = (uint32_t)
2831 P2ROUNDUP(dlif_udpstat_bufsize, sizeof(u_int64_t));
2832 dlif_udpstat_zone = zone_create(DLIF_UDPSTAT_ZONE_NAME,
2833 size: dlif_udpstat_bufsize, flags: ZC_ZFREE_CLEARMEM);
2834
2835 eventhandler_lists_ctxt_init(evthdlr_lists_ctxt: &ifnet_evhdlr_ctxt);
2836
2837 TAILQ_INIT(&dlil_ifnet_head);
2838 TAILQ_INIT(&ifnet_head);
2839 TAILQ_INIT(&ifnet_detaching_head);
2840 TAILQ_INIT(&ifnet_ordered_head);
2841
2842 /* Initialize interface address subsystem */
2843 ifa_init();
2844
2845#if PF
2846 /* Initialize the packet filter */
2847 pfinit();
2848#endif /* PF */
2849
2850 /* Initialize queue algorithms */
2851 classq_init();
2852
2853 /* Initialize packet schedulers */
2854 pktsched_init();
2855
2856 /* Initialize flow advisory subsystem */
2857 flowadv_init();
2858
2859 /* Initialize the pktap virtual interface */
2860 pktap_init();
2861
2862 /* Initialize the service class to dscp map */
2863 net_qos_map_init();
2864
2865 /* Initialize the interface low power mode event handler */
2866 if_low_power_evhdlr_init();
2867
2868 /* Initialize the interface offload port list subsystem */
2869 if_ports_used_init();
2870
2871#if DEBUG || DEVELOPMENT
2872 /* Run self-tests */
2873 dlil_verify_sum16();
2874#endif /* DEBUG || DEVELOPMENT */
2875
2876 /*
2877 * Create and start up the main DLIL input thread and the interface
2878 * detacher threads once everything is initialized.
2879 */
2880 dlil_incr_pending_thread_count();
2881 (void) dlil_create_input_thread(NULL, inp: dlil_main_input_thread, NULL);
2882
2883 /*
2884 * Create ifnet detacher thread.
2885 * When an interface gets detached, part of the detach processing
2886	 * is delayed. The interface is added to the delayed detach list
2887 * and this thread is woken up to call ifnet_detach_final
2888 * on these interfaces.
2889 */
2890 dlil_incr_pending_thread_count();
2891 if (kernel_thread_start(continuation: ifnet_detacher_thread_func,
2892 NULL, new_thread: &thread) != KERN_SUCCESS) {
2893 panic_plain("%s: couldn't create detacher thread", __func__);
2894 /* NOTREACHED */
2895 }
2896 thread_deallocate(thread);
2897
2898 /*
2899	 * Wait for the dlil kernel threads created above to get
2900	 * scheduled and run at least once before we proceed.
2901 */
2902 lck_mtx_lock(lck: &dlil_thread_sync_lock);
2903 while (dlil_pending_thread_cnt != 0) {
2904		DLIL_PRINTF("%s: Waiting for all the created dlil kernel "
2905 "threads to get scheduled at least once.\n", __func__);
2906 (void) msleep(chan: &dlil_pending_thread_cnt, mtx: &dlil_thread_sync_lock,
2907 pri: (PZERO - 1), wmesg: __func__, NULL);
2908 LCK_MTX_ASSERT(&dlil_thread_sync_lock, LCK_ASSERT_OWNED);
2909 }
2910 lck_mtx_unlock(lck: &dlil_thread_sync_lock);
2911 DLIL_PRINTF("%s: All the created dlil kernel threads have been "
2912 "scheduled at least once. Proceeding.\n", __func__);
2913}
2914
2915static void
2916if_flt_monitor_busy(struct ifnet *ifp)
2917{
2918 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2919
2920 ++ifp->if_flt_busy;
2921 VERIFY(ifp->if_flt_busy != 0);
2922}
2923
2924static void
2925if_flt_monitor_unbusy(struct ifnet *ifp)
2926{
2927 if_flt_monitor_leave(ifp);
2928}
2929
2930static void
2931if_flt_monitor_enter(struct ifnet *ifp)
2932{
2933 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2934
2935 while (ifp->if_flt_busy) {
2936 ++ifp->if_flt_waiters;
2937 (void) msleep(chan: &ifp->if_flt_head, mtx: &ifp->if_flt_lock,
2938 pri: (PZERO - 1), wmesg: "if_flt_monitor", NULL);
2939 }
2940 if_flt_monitor_busy(ifp);
2941}
2942
2943static void
2944if_flt_monitor_leave(struct ifnet *ifp)
2945{
2946 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
2947
2948 VERIFY(ifp->if_flt_busy != 0);
2949 --ifp->if_flt_busy;
2950
2951 if (ifp->if_flt_busy == 0 && ifp->if_flt_waiters > 0) {
2952 ifp->if_flt_waiters = 0;
2953 wakeup(chan: &ifp->if_flt_head);
2954 }
2955}
2956
2957__private_extern__ int
2958dlil_attach_filter(struct ifnet *ifp, const struct iff_filter *if_filter,
2959 interface_filter_t *filter_ref, u_int32_t flags)
2960{
2961 int retval = 0;
2962 struct ifnet_filter *filter = NULL;
2963
2964 ifnet_head_lock_shared();
2965
2966 /* Check that the interface is in the global list */
2967 if (!ifnet_lookup(ifp)) {
2968 retval = ENXIO;
2969 goto done;
2970 }
2971 if (!ifnet_is_attached(ifp, refio: 1)) {
2972 os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
2973 __func__, if_name(ifp));
2974 retval = ENXIO;
2975 goto done;
2976 }
2977
2978 filter = zalloc_flags(dlif_filt_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
2979
2980 /* refcnt held above during lookup */
2981 filter->filt_flags = flags;
2982 filter->filt_ifp = ifp;
2983 filter->filt_cookie = if_filter->iff_cookie;
2984 filter->filt_name = if_filter->iff_name;
2985 filter->filt_protocol = if_filter->iff_protocol;
2986 /*
2987 * Do not install filter callbacks for internal coproc interface
2988 * and for management interfaces
2989 */
2990 if (!IFNET_IS_INTCOPROC(ifp) && !IFNET_IS_MANAGEMENT(ifp)) {
2991 filter->filt_input = if_filter->iff_input;
2992 filter->filt_output = if_filter->iff_output;
2993 filter->filt_event = if_filter->iff_event;
2994 filter->filt_ioctl = if_filter->iff_ioctl;
2995 }
2996 filter->filt_detached = if_filter->iff_detached;
2997
2998 lck_mtx_lock(lck: &ifp->if_flt_lock);
2999 if_flt_monitor_enter(ifp);
3000
3001 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
3002 TAILQ_INSERT_TAIL(&ifp->if_flt_head, filter, filt_next);
3003
3004 *filter_ref = filter;
3005
3006 /*
3007 * Bump filter count and route_generation ID to let TCP
3008 * know it shouldn't do TSO on this connection
3009 */
3010 if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
3011 ifnet_filter_update_tso(ifp, TRUE);
3012 }
3013 OSIncrementAtomic64(address: &net_api_stats.nas_iflt_attach_count);
3014 INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_total);
3015 if (filter->filt_flags & DLIL_IFF_INTERNAL) {
3016 OSIncrementAtomic64(address: &net_api_stats.nas_iflt_attach_os_count);
3017 INC_ATOMIC_INT64_LIM(net_api_stats.nas_iflt_attach_os_total);
3018 } else {
3019 OSAddAtomic(1, &ifp->if_flt_non_os_count);
3020 }
3021 if_flt_monitor_leave(ifp);
3022 lck_mtx_unlock(lck: &ifp->if_flt_lock);
3023
3024#if SKYWALK && defined(XNU_TARGET_OS_OSX)
3025 net_filter_event_mark(subsystem: NET_FILTER_EVENT_INTERFACE,
3026 compatible: net_check_compatible_if_filter(NULL));
3027#endif /* SKYWALK && XNU_TARGET_OS_OSX */
3028
3029 if (dlil_verbose) {
3030 DLIL_PRINTF("%s: %s filter attached\n", if_name(ifp),
3031 if_filter->iff_name);
3032 }
3033 ifnet_decr_iorefcnt(ifp);
3034
3035done:
3036 ifnet_head_done();
3037 if (retval != 0 && ifp != NULL) {
3038 DLIL_PRINTF("%s: failed to attach %s (err=%d)\n",
3039 if_name(ifp), if_filter->iff_name, retval);
3040 }
3041 if (retval != 0 && filter != NULL) {
3042 zfree(dlif_filt_zone, filter);
3043 }
3044
3045 return retval;
3046}
3047
3048static int
3049dlil_detach_filter_internal(interface_filter_t filter, int detached)
3050{
3051 int retval = 0;
3052
3053 if (detached == 0) {
3054 ifnet_t ifp = NULL;
3055
3056 ifnet_head_lock_shared();
3057 TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
3058 interface_filter_t entry = NULL;
3059
3060 lck_mtx_lock(lck: &ifp->if_flt_lock);
3061 TAILQ_FOREACH(entry, &ifp->if_flt_head, filt_next) {
3062 if (entry != filter || entry->filt_skip) {
3063 continue;
3064 }
3065 /*
3066 * We've found a match; since it's possible
3067 * that the thread gets blocked in the monitor,
3068			 * we do the lock dance.  The interface should
3069			 * not be detached since we still hold a use
3070			 * count taken during filter attach.
3071 */
3072 entry->filt_skip = 1; /* skip input/output */
3073 lck_mtx_unlock(lck: &ifp->if_flt_lock);
3074 ifnet_head_done();
3075
3076 lck_mtx_lock(lck: &ifp->if_flt_lock);
3077 if_flt_monitor_enter(ifp);
3078 LCK_MTX_ASSERT(&ifp->if_flt_lock,
3079 LCK_MTX_ASSERT_OWNED);
3080
3081 /* Remove the filter from the list */
3082 TAILQ_REMOVE(&ifp->if_flt_head, filter,
3083 filt_next);
3084
3085 if (dlil_verbose) {
3086 DLIL_PRINTF("%s: %s filter detached\n",
3087 if_name(ifp), filter->filt_name);
3088 }
3089 if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
3090 VERIFY(ifp->if_flt_non_os_count != 0);
3091 OSAddAtomic(-1, &ifp->if_flt_non_os_count);
3092 }
3093 /*
3094 * Decrease filter count and route_generation
3095			 * ID to let TCP know it should reevaluate doing
3096 * TSO or not.
3097 */
3098 if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
3099 ifnet_filter_update_tso(ifp, FALSE);
3100 }
3101 if_flt_monitor_leave(ifp);
3102 lck_mtx_unlock(lck: &ifp->if_flt_lock);
3103 goto destroy;
3104 }
3105 lck_mtx_unlock(lck: &ifp->if_flt_lock);
3106 }
3107 ifnet_head_done();
3108
3109 /* filter parameter is not a valid filter ref */
3110 retval = EINVAL;
3111 goto done;
3112 } else {
3113 struct ifnet *ifp = filter->filt_ifp;
3114 /*
3115 * Here we are called from ifnet_detach_final(); the
3116 * caller had emptied if_flt_head and we're doing an
3117 * implicit filter detach because the interface is
3118 * about to go away. Make sure to adjust the counters
3119 * in this case. We don't need the protection of the
3120 * filter monitor since we're called as part of the
3121 * final detach in the context of the detacher thread.
3122 */
3123 if (!(filter->filt_flags & DLIL_IFF_INTERNAL)) {
3124 VERIFY(ifp->if_flt_non_os_count != 0);
3125 OSAddAtomic(-1, &ifp->if_flt_non_os_count);
3126 }
3127 /*
3128 * Decrease filter count and route_generation
3129		 * ID to let TCP know it should reevaluate doing
3130 * TSO or not.
3131 */
3132 if ((filter->filt_flags & DLIL_IFF_TSO) == 0) {
3133 ifnet_filter_update_tso(ifp, FALSE);
3134 }
3135 }
3136
3137 if (dlil_verbose) {
3138 DLIL_PRINTF("%s filter detached\n", filter->filt_name);
3139 }
3140
3141destroy:
3142
3143 /* Call the detached function if there is one */
3144 if (filter->filt_detached) {
3145 filter->filt_detached(filter->filt_cookie, filter->filt_ifp);
3146 }
3147
3148 VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_count) > 0);
3149 if (filter->filt_flags & DLIL_IFF_INTERNAL) {
3150 VERIFY(OSDecrementAtomic64(&net_api_stats.nas_iflt_attach_os_count) > 0);
3151 }
3152#if SKYWALK && defined(XNU_TARGET_OS_OSX)
3153 net_filter_event_mark(subsystem: NET_FILTER_EVENT_INTERFACE,
3154 compatible: net_check_compatible_if_filter(NULL));
3155#endif /* SKYWALK && XNU_TARGET_OS_OSX */
3156
3157 /* Free the filter */
3158 zfree(dlif_filt_zone, filter);
3159 filter = NULL;
3160done:
3161 if (retval != 0 && filter != NULL) {
3162 DLIL_PRINTF("failed to detach %s filter (err=%d)\n",
3163 filter->filt_name, retval);
3164 }
3165
3166 return retval;
3167}
3168
3169__private_extern__ void
3170dlil_detach_filter(interface_filter_t filter)
3171{
3172 if (filter == NULL) {
3173 return;
3174 }
3175 dlil_detach_filter_internal(filter, detached: 0);
3176}
3177
3178__private_extern__ boolean_t
3179dlil_has_ip_filter(void)
3180{
3181 boolean_t has_filter = ((net_api_stats.nas_ipf_add_count - net_api_stats.nas_ipf_add_os_count) > 0);
3182
3183 VERIFY(net_api_stats.nas_ipf_add_count >= net_api_stats.nas_ipf_add_os_count);
3184
3185 DTRACE_IP1(dlil_has_ip_filter, boolean_t, has_filter);
3186 return has_filter;
3187}
3188
3189__private_extern__ boolean_t
3190dlil_has_if_filter(struct ifnet *ifp)
3191{
3192 boolean_t has_filter = !TAILQ_EMPTY(&ifp->if_flt_head);
3193 DTRACE_IP1(dlil_has_if_filter, boolean_t, has_filter);
3194 return has_filter;
3195}
3196
3197static inline void
3198dlil_input_wakeup(struct dlil_threading_info *inp)
3199{
3200 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3201
3202 inp->dlth_flags |= DLIL_INPUT_WAITING;
3203 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
3204 inp->dlth_wtot++;
3205 wakeup_one(chan: (caddr_t)&inp->dlth_flags);
3206 }
3207}
3208
3209__attribute__((noreturn))
3210static void
3211dlil_main_input_thread_func(void *v, wait_result_t w)
3212{
3213#pragma unused(w)
3214 struct dlil_threading_info *inp = v;
3215
3216 VERIFY(inp == dlil_main_input_thread);
3217 VERIFY(inp->dlth_ifp == NULL);
3218 VERIFY(current_thread() == inp->dlth_thread);
3219
3220 lck_mtx_lock(lck: &inp->dlth_lock);
3221 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3222 (void) assert_wait(event: &inp->dlth_flags, THREAD_UNINT);
3223 inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3224 /* wake up once to get out of embryonic state */
3225 dlil_input_wakeup(inp);
3226 lck_mtx_unlock(lck: &inp->dlth_lock);
3227 (void) thread_block_parameter(continuation: dlil_main_input_thread_cont, parameter: inp);
3228 /* NOTREACHED */
3229 __builtin_unreachable();
3230}
3231
3232/*
3233 * Main input thread:
3234 *
3235 * a) handles all inbound packets for lo0
3236 * b) handles all inbound packets for interfaces with no dedicated
3237 * input thread (e.g. anything but Ethernet/PDP or those that support
3238 *    opportunistic polling).
3239 * c) protocol registrations
3240 * d) packet injections
3241 */
3242__attribute__((noreturn))
3243static void
3244dlil_main_input_thread_cont(void *v, wait_result_t wres)
3245{
3246 struct dlil_main_threading_info *inpm = v;
3247 struct dlil_threading_info *inp = v;
3248
3249 /* main input thread is uninterruptible */
3250 VERIFY(wres != THREAD_INTERRUPTED);
3251 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3252 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_TERMINATE |
3253 DLIL_INPUT_RUNNING)));
3254 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3255
3256 while (1) {
3257 struct mbuf *m = NULL, *m_loop = NULL;
3258 u_int32_t m_cnt, m_cnt_loop;
3259 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3260 boolean_t proto_req;
3261 boolean_t embryonic;
3262
3263 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3264
3265 if (__improbable(embryonic =
3266 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3267 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3268 }
3269
3270 proto_req = (inp->dlth_flags &
3271 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER));
3272
3273 /* Packets for non-dedicated interfaces other than lo0 */
3274 m_cnt = qlen(&inp->dlth_pkts);
3275 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3276 m = pkt.cp_mbuf;
3277
3278 /* Packets exclusive to lo0 */
3279 m_cnt_loop = qlen(&inpm->lo_rcvq_pkts);
3280 _getq_all(&inpm->lo_rcvq_pkts, &pkt, NULL, NULL, NULL);
3281 m_loop = pkt.cp_mbuf;
3282
3283 inp->dlth_wtot = 0;
3284
3285 lck_mtx_unlock(lck: &inp->dlth_lock);
3286
3287 if (__improbable(embryonic)) {
3288 dlil_decr_pending_thread_count();
3289 }
3290
3291 /*
3292 * NOTE warning %%% attention !!!!
3293		 * We should think about adding some thread starvation
3294 * safeguards if we deal with long chains of packets.
3295 */
3296 if (__probable(m_loop != NULL)) {
3297 dlil_input_packet_list_extended(lo_ifp, m_loop,
3298 m_cnt_loop, IFNET_MODEL_INPUT_POLL_OFF);
3299 }
3300
3301 if (__probable(m != NULL)) {
3302 dlil_input_packet_list_extended(NULL, m,
3303 m_cnt, IFNET_MODEL_INPUT_POLL_OFF);
3304 }
3305
3306 if (__improbable(proto_req)) {
3307 proto_input_run();
3308 }
3309
3310 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3311 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3312 /* main input thread cannot be terminated */
3313 VERIFY(!(inp->dlth_flags & DLIL_INPUT_TERMINATE));
3314 if (!(inp->dlth_flags & ~DLIL_INPUT_RUNNING)) {
3315 break;
3316 }
3317 }
3318
3319 inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3320 (void) assert_wait(event: &inp->dlth_flags, THREAD_UNINT);
3321 lck_mtx_unlock(lck: &inp->dlth_lock);
3322 (void) thread_block_parameter(continuation: dlil_main_input_thread_cont, parameter: inp);
3323
3324 VERIFY(0); /* we should never get here */
3325 /* NOTREACHED */
3326 __builtin_unreachable();
3327}
3328
3329/*
3330 * Input thread for interfaces with legacy input model.
3331 */
3332__attribute__((noreturn))
3333static void
3334dlil_input_thread_func(void *v, wait_result_t w)
3335{
3336#pragma unused(w)
3337 char thread_name[MAXTHREADNAMESIZE];
3338 struct dlil_threading_info *inp = v;
3339 struct ifnet *ifp = inp->dlth_ifp;
3340
3341 VERIFY(inp != dlil_main_input_thread);
3342 VERIFY(ifp != NULL);
3343 VERIFY(!(ifp->if_eflags & IFEF_RXPOLL) || !net_rxpoll ||
3344 !(ifp->if_xflags & IFXF_LEGACY));
3345 VERIFY(ifp->if_poll_mode == IFNET_MODEL_INPUT_POLL_OFF ||
3346 !(ifp->if_xflags & IFXF_LEGACY));
3347 VERIFY(current_thread() == inp->dlth_thread);
3348
3349 /* construct the name for this thread, and then apply it */
3350 bzero(s: thread_name, n: sizeof(thread_name));
3351 (void) snprintf(thread_name, count: sizeof(thread_name),
3352 "dlil_input_%s", ifp->if_xname);
3353 thread_set_thread_name(th: inp->dlth_thread, name: thread_name);
3354
3355 lck_mtx_lock(lck: &inp->dlth_lock);
3356 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3357 (void) assert_wait(event: &inp->dlth_flags, THREAD_UNINT);
3358 inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3359 /* wake up once to get out of embryonic state */
3360 dlil_input_wakeup(inp);
3361 lck_mtx_unlock(lck: &inp->dlth_lock);
3362 (void) thread_block_parameter(continuation: dlil_input_thread_cont, parameter: inp);
3363 /* NOTREACHED */
3364 __builtin_unreachable();
3365}
3366
3367__attribute__((noreturn))
3368static void
3369dlil_input_thread_cont(void *v, wait_result_t wres)
3370{
3371 struct dlil_threading_info *inp = v;
3372 struct ifnet *ifp = inp->dlth_ifp;
3373
3374 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3375 if (__improbable(wres == THREAD_INTERRUPTED ||
3376 (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3377 goto terminate;
3378 }
3379
3380 VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3381 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3382
3383 while (1) {
3384 struct mbuf *m = NULL;
3385 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3386 boolean_t notify = FALSE;
3387 boolean_t embryonic;
3388 u_int32_t m_cnt;
3389
3390 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3391
3392 if (__improbable(embryonic =
3393 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3394 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3395 }
3396
3397		/*
3398		 * Protocol registration and injection must always use
3399		 * the main input thread; in theory injection could use the
3400		 * input thread of the interface the packet arrived on, but
3401		 * that requires knowing the interface in advance (and the
3402		 * benefits might not be worth the trouble.)
3403		 */
3404 VERIFY(!(inp->dlth_flags &
3405 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3406
3407 /* Packets for this interface */
3408 m_cnt = qlen(&inp->dlth_pkts);
3409 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3410 m = pkt.cp_mbuf;
3411
3412 inp->dlth_wtot = 0;
3413
3414#if SKYWALK
3415 /*
3416 * If this interface is attached to a netif nexus,
3417 * the stats are already incremented there; otherwise
3418 * do it here.
3419 */
3420 if (!(ifp->if_capabilities & IFCAP_SKYWALK))
3421#endif /* SKYWALK */
3422 notify = dlil_input_stats_sync(ifp, inp);
3423
3424 lck_mtx_unlock(lck: &inp->dlth_lock);
3425
3426 if (__improbable(embryonic)) {
3427 ifnet_decr_pending_thread_count(ifp);
3428 }
3429
3430 if (__improbable(notify)) {
3431 ifnet_notify_data_threshold(ifp);
3432 }
3433
3434		/*
3435		 * NOTE: we should consider adding thread starvation
3436		 * safeguards here for the case where we are handed a
3437		 * very long chain of packets.
3438		 */
3439 if (__probable(m != NULL)) {
3440 dlil_input_packet_list_extended(NULL, m,
3441 m_cnt, ifp->if_poll_mode);
3442 }
3443
3444 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3445 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3446 if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3447 DLIL_INPUT_TERMINATE))) {
3448 break;
3449 }
3450 }
3451
3452 inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3453
3454 if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3455terminate:
3456 lck_mtx_unlock(lck: &inp->dlth_lock);
3457 dlil_terminate_input_thread(inp);
3458 /* NOTREACHED */
3459 } else {
3460 (void) assert_wait(event: &inp->dlth_flags, THREAD_UNINT);
3461 lck_mtx_unlock(lck: &inp->dlth_lock);
3462 (void) thread_block_parameter(continuation: dlil_input_thread_cont, parameter: inp);
3463 /* NOTREACHED */
3464 }
3465
3466 VERIFY(0); /* we should never get here */
3467 /* NOTREACHED */
3468 __builtin_unreachable();
3469}
3470
3471/*
3472 * Input thread for interfaces with opportunistic polling input model.
3473 */
3474__attribute__((noreturn))
3475static void
3476dlil_rxpoll_input_thread_func(void *v, wait_result_t w)
3477{
3478#pragma unused(w)
3479 char thread_name[MAXTHREADNAMESIZE];
3480 struct dlil_threading_info *inp = v;
3481 struct ifnet *ifp = inp->dlth_ifp;
3482
3483 VERIFY(inp != dlil_main_input_thread);
3484 VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_RXPOLL) &&
3485 (ifp->if_xflags & IFXF_LEGACY));
3486 VERIFY(current_thread() == inp->dlth_thread);
3487
3488 /* construct the name for this thread, and then apply it */
3489 bzero(s: thread_name, n: sizeof(thread_name));
3490 (void) snprintf(thread_name, count: sizeof(thread_name),
3491 "dlil_input_poll_%s", ifp->if_xname);
3492 thread_set_thread_name(th: inp->dlth_thread, name: thread_name);
3493
3494 lck_mtx_lock(lck: &inp->dlth_lock);
3495 VERIFY(!(inp->dlth_flags & (DLIL_INPUT_EMBRYONIC | DLIL_INPUT_RUNNING)));
3496 (void) assert_wait(event: &inp->dlth_flags, THREAD_UNINT);
3497 inp->dlth_flags |= DLIL_INPUT_EMBRYONIC;
3498 /* wake up once to get out of embryonic state */
3499 dlil_input_wakeup(inp);
3500 lck_mtx_unlock(lck: &inp->dlth_lock);
3501 (void) thread_block_parameter(continuation: dlil_rxpoll_input_thread_cont, parameter: inp);
3502 /* NOTREACHED */
3503 __builtin_unreachable();
3504}
3505
3506__attribute__((noreturn))
3507static void
3508dlil_rxpoll_input_thread_cont(void *v, wait_result_t wres)
3509{
3510 struct dlil_threading_info *inp = v;
3511 struct ifnet *ifp = inp->dlth_ifp;
3512 struct timespec ts;
3513
3514 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3515 if (__improbable(wres == THREAD_INTERRUPTED ||
3516 (inp->dlth_flags & DLIL_INPUT_TERMINATE))) {
3517 goto terminate;
3518 }
3519
3520 VERIFY(!(inp->dlth_flags & DLIL_INPUT_RUNNING));
3521 inp->dlth_flags |= DLIL_INPUT_RUNNING;
3522
3523 while (1) {
3524 struct mbuf *m = NULL;
3525 uint32_t m_cnt, poll_req = 0;
3526 uint64_t m_size = 0;
3527 ifnet_model_t mode;
3528 struct timespec now, delta;
3529 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
3530 boolean_t notify;
3531 boolean_t embryonic;
3532 uint64_t ival;
3533
3534 inp->dlth_flags &= ~DLIL_INPUT_WAITING;
3535
3536 if (__improbable(embryonic =
3537 (inp->dlth_flags & DLIL_INPUT_EMBRYONIC))) {
3538 inp->dlth_flags &= ~DLIL_INPUT_EMBRYONIC;
3539 goto skip;
3540 }
3541
3542 if ((ival = ifp->if_rxpoll_ival) < IF_RXPOLL_INTERVALTIME_MIN) {
3543 ival = IF_RXPOLL_INTERVALTIME_MIN;
3544 }
3545
3546 /* Link parameters changed? */
3547 if (ifp->if_poll_update != 0) {
3548 ifp->if_poll_update = 0;
3549 (void) dlil_rxpoll_set_params(ifp, NULL, TRUE);
3550 }
3551
3552 /* Current operating mode */
3553 mode = ifp->if_poll_mode;
3554
3555		/*
3556		 * Protocol registration and injection must always use
3557		 * the main input thread; in theory injection could use the
3558		 * input thread of the interface the packet arrived on, but
3559		 * that requires knowing the interface in advance (and the
3560		 * benefits might not be worth the trouble.)
3561		 */
3562 VERIFY(!(inp->dlth_flags &
3563 (DLIL_PROTO_WAITING | DLIL_PROTO_REGISTER)));
3564
3565 /* Total count of all packets */
3566 m_cnt = qlen(&inp->dlth_pkts);
3567
3568 /* Total bytes of all packets */
3569 m_size = qsize(&inp->dlth_pkts);
3570
3571 /* Packets for this interface */
3572 _getq_all(&inp->dlth_pkts, &pkt, NULL, NULL, NULL);
3573 m = pkt.cp_mbuf;
3574 VERIFY(m != NULL || m_cnt == 0);
3575
3576 nanouptime(ts: &now);
3577 if (!net_timerisset(&ifp->if_poll_sample_lasttime)) {
3578 *(&ifp->if_poll_sample_lasttime) = *(&now);
3579 }
3580
3581 net_timersub(&now, &ifp->if_poll_sample_lasttime, &delta);
3582 if (if_rxpoll && net_timerisset(&ifp->if_poll_sample_holdtime)) {
3583 u_int32_t ptot, btot;
3584
3585 /* Accumulate statistics for current sampling */
3586 PKTCNTR_ADD(&ifp->if_poll_sstats, m_cnt, m_size);
3587
3588 if (net_timercmp(&delta, &ifp->if_poll_sample_holdtime, <)) {
3589 goto skip;
3590 }
3591
3592 *(&ifp->if_poll_sample_lasttime) = *(&now);
3593
3594 /* Calculate min/max of inbound bytes */
3595 btot = (u_int32_t)ifp->if_poll_sstats.bytes;
3596 if (ifp->if_rxpoll_bmin == 0 || ifp->if_rxpoll_bmin > btot) {
3597 ifp->if_rxpoll_bmin = btot;
3598 }
3599 if (btot > ifp->if_rxpoll_bmax) {
3600 ifp->if_rxpoll_bmax = btot;
3601 }
3602
3603 /* Calculate EWMA of inbound bytes */
3604 DLIL_EWMA(ifp->if_rxpoll_bavg, btot, if_rxpoll_decay);
3605
3606 /* Calculate min/max of inbound packets */
3607 ptot = (u_int32_t)ifp->if_poll_sstats.packets;
3608 if (ifp->if_rxpoll_pmin == 0 || ifp->if_rxpoll_pmin > ptot) {
3609 ifp->if_rxpoll_pmin = ptot;
3610 }
3611 if (ptot > ifp->if_rxpoll_pmax) {
3612 ifp->if_rxpoll_pmax = ptot;
3613 }
3614
3615 /* Calculate EWMA of inbound packets */
3616 DLIL_EWMA(ifp->if_rxpoll_pavg, ptot, if_rxpoll_decay);
3617
3618 /* Reset sampling statistics */
3619 PKTCNTR_CLEAR(&ifp->if_poll_sstats);
3620
3621 /* Calculate EWMA of wakeup requests */
3622 DLIL_EWMA(ifp->if_rxpoll_wavg, inp->dlth_wtot,
3623 if_rxpoll_decay);
3624 inp->dlth_wtot = 0;
3625
3626 if (dlil_verbose) {
3627 if (!net_timerisset(&ifp->if_poll_dbg_lasttime)) {
3628 *(&ifp->if_poll_dbg_lasttime) = *(&now);
3629 }
3630 net_timersub(&now, &ifp->if_poll_dbg_lasttime, &delta);
3631 if (net_timercmp(&delta, &dlil_dbgrate, >=)) {
3632 *(&ifp->if_poll_dbg_lasttime) = *(&now);
3633 DLIL_PRINTF("%s: [%s] pkts avg %d max %d "
3634 "limits [%d/%d], wreq avg %d "
3635 "limits [%d/%d], bytes avg %d "
3636 "limits [%d/%d]\n", if_name(ifp),
3637 (ifp->if_poll_mode ==
3638 IFNET_MODEL_INPUT_POLL_ON) ?
3639 "ON" : "OFF", ifp->if_rxpoll_pavg,
3640 ifp->if_rxpoll_pmax,
3641 ifp->if_rxpoll_plowat,
3642 ifp->if_rxpoll_phiwat,
3643 ifp->if_rxpoll_wavg,
3644 ifp->if_rxpoll_wlowat,
3645 ifp->if_rxpoll_whiwat,
3646 ifp->if_rxpoll_bavg,
3647 ifp->if_rxpoll_blowat,
3648 ifp->if_rxpoll_bhiwat);
3649 }
3650 }
3651
3652 /* Perform mode transition, if necessary */
3653 if (!net_timerisset(&ifp->if_poll_mode_lasttime)) {
3654 *(&ifp->if_poll_mode_lasttime) = *(&now);
3655 }
3656
3657 net_timersub(&now, &ifp->if_poll_mode_lasttime, &delta);
3658 if (net_timercmp(&delta, &ifp->if_poll_mode_holdtime, <)) {
3659 goto skip;
3660 }
3661
3662 if (ifp->if_rxpoll_pavg <= ifp->if_rxpoll_plowat &&
3663 ifp->if_rxpoll_bavg <= ifp->if_rxpoll_blowat &&
3664 ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_OFF) {
3665 mode = IFNET_MODEL_INPUT_POLL_OFF;
3666 } else if (ifp->if_rxpoll_pavg >= ifp->if_rxpoll_phiwat &&
3667 (ifp->if_rxpoll_bavg >= ifp->if_rxpoll_bhiwat ||
3668 ifp->if_rxpoll_wavg >= ifp->if_rxpoll_whiwat) &&
3669 ifp->if_poll_mode != IFNET_MODEL_INPUT_POLL_ON) {
3670 mode = IFNET_MODEL_INPUT_POLL_ON;
3671 }
3672
3673 if (mode != ifp->if_poll_mode) {
3674 ifp->if_poll_mode = mode;
3675 *(&ifp->if_poll_mode_lasttime) = *(&now);
3676 poll_req++;
3677 }
3678 }
3679skip:
3680 notify = dlil_input_stats_sync(ifp, inp);
3681
3682 lck_mtx_unlock(lck: &inp->dlth_lock);
3683
3684 if (__improbable(embryonic)) {
3685 ifnet_decr_pending_thread_count(ifp);
3686 }
3687
3688 if (__improbable(notify)) {
3689 ifnet_notify_data_threshold(ifp);
3690 }
3691
3692		/*
3693		 * If there's a mode change and the interface is still
3694		 * attached, perform a downcall to the driver for the new
3695		 * mode.  Also hold an IO refcnt on the interface to prevent
3696		 * it from being detached (will be released below.)
3697		 */
3698 if (poll_req != 0 && ifnet_is_attached(ifp, refio: 1)) {
3699 struct ifnet_model_params p = {
3700 .model = mode, .reserved = { 0 }
3701 };
3702 errno_t err;
3703
3704 if (dlil_verbose) {
3705 DLIL_PRINTF("%s: polling is now %s, "
3706 "pkts avg %d max %d limits [%d/%d], "
3707 "wreq avg %d limits [%d/%d], "
3708 "bytes avg %d limits [%d/%d]\n",
3709 if_name(ifp),
3710 (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3711 "ON" : "OFF", ifp->if_rxpoll_pavg,
3712 ifp->if_rxpoll_pmax, ifp->if_rxpoll_plowat,
3713 ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wavg,
3714 ifp->if_rxpoll_wlowat, ifp->if_rxpoll_whiwat,
3715 ifp->if_rxpoll_bavg, ifp->if_rxpoll_blowat,
3716 ifp->if_rxpoll_bhiwat);
3717 }
3718
3719 if ((err = ((*ifp->if_input_ctl)(ifp,
3720 IFNET_CTL_SET_INPUT_MODEL, sizeof(p), &p))) != 0) {
3721 DLIL_PRINTF("%s: error setting polling mode "
3722 "to %s (%d)\n", if_name(ifp),
3723 (mode == IFNET_MODEL_INPUT_POLL_ON) ?
3724 "ON" : "OFF", err);
3725 }
3726
3727 switch (mode) {
3728 case IFNET_MODEL_INPUT_POLL_OFF:
3729 ifnet_set_poll_cycle(ifp, NULL);
3730 ifp->if_rxpoll_offreq++;
3731 if (err != 0) {
3732 ifp->if_rxpoll_offerr++;
3733 }
3734 break;
3735
3736 case IFNET_MODEL_INPUT_POLL_ON:
3737 net_nsectimer(&ival, &ts);
3738 ifnet_set_poll_cycle(ifp, &ts);
3739 ifnet_poll(ifp);
3740 ifp->if_rxpoll_onreq++;
3741 if (err != 0) {
3742 ifp->if_rxpoll_onerr++;
3743 }
3744 break;
3745
3746 default:
3747 VERIFY(0);
3748 /* NOTREACHED */
3749 }
3750
3751 /* Release the IO refcnt */
3752 ifnet_decr_iorefcnt(ifp);
3753 }
3754
3755		/*
3756		 * NOTE: we should consider adding thread starvation
3757		 * safeguards here for the case where we are handed a
3758		 * very long chain of packets.
3759		 */
3760 if (__probable(m != NULL)) {
3761 dlil_input_packet_list_extended(NULL, m, m_cnt, mode);
3762 }
3763
3764 lck_mtx_lock_spin(lck: &inp->dlth_lock);
3765 VERIFY(inp->dlth_flags & DLIL_INPUT_RUNNING);
3766 if (!(inp->dlth_flags & ~(DLIL_INPUT_RUNNING |
3767 DLIL_INPUT_TERMINATE))) {
3768 break;
3769 }
3770 }
3771
3772 inp->dlth_flags &= ~DLIL_INPUT_RUNNING;
3773
3774 if (__improbable(inp->dlth_flags & DLIL_INPUT_TERMINATE)) {
3775terminate:
3776 lck_mtx_unlock(lck: &inp->dlth_lock);
3777 dlil_terminate_input_thread(inp);
3778 /* NOTREACHED */
3779 } else {
3780 (void) assert_wait(event: &inp->dlth_flags, THREAD_UNINT);
3781 lck_mtx_unlock(lck: &inp->dlth_lock);
3782 (void) thread_block_parameter(continuation: dlil_rxpoll_input_thread_cont,
3783 parameter: inp);
3784 /* NOTREACHED */
3785 }
3786
3787 VERIFY(0); /* we should never get here */
3788 /* NOTREACHED */
3789 __builtin_unreachable();
3790}
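/*
 * Illustrative note (added for clarity; the numbers below are made up):
 * the mode transition logic above applies hysteresis to the EWMA-smoothed
 * packet, byte and wakeup-request counters.  With hypothetical watermarks
 * plowat = 10, phiwat = 100, blowat = 8 KB and bhiwat = 64 KB:
 *
 *	pavg <= 10 && bavg <= 8 KB                       -> switch to POLL_OFF
 *	pavg >= 100 && (bavg >= 64 KB || wavg >= whiwat) -> switch to POLL_ON
 *
 * and any transition is further rate-limited by if_poll_mode_holdtime,
 * so a short burst does not make the interface flip-flop between
 * interrupt-driven and polling mode.
 */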
3791
3792errno_t
3793dlil_rxpoll_validate_params(struct ifnet_poll_params *p)
3794{
3795 if (p != NULL) {
3796 if ((p->packets_lowat == 0 && p->packets_hiwat != 0) ||
3797 (p->packets_lowat != 0 && p->packets_hiwat == 0)) {
3798 return EINVAL;
3799 }
3800 if (p->packets_lowat != 0 && /* hiwat must be non-zero */
3801 p->packets_lowat >= p->packets_hiwat) {
3802 return EINVAL;
3803 }
3804 if ((p->bytes_lowat == 0 && p->bytes_hiwat != 0) ||
3805 (p->bytes_lowat != 0 && p->bytes_hiwat == 0)) {
3806 return EINVAL;
3807 }
3808 if (p->bytes_lowat != 0 && /* hiwat must be non-zero */
3809 p->bytes_lowat >= p->bytes_hiwat) {
3810 return EINVAL;
3811 }
3812 if (p->interval_time != 0 &&
3813 p->interval_time < IF_RXPOLL_INTERVALTIME_MIN) {
3814 p->interval_time = IF_RXPOLL_INTERVALTIME_MIN;
3815 }
3816 }
3817 return 0;
3818}
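/*
 * Usage sketch (illustrative only, not part of the build): a caller that
 * supplies its own polling parameters must provide the low/high watermarks
 * in pairs, with the low watermark strictly below the high one; an
 * all-zero pair means "auto-tune".  For example:
 *
 *	struct ifnet_poll_params p = {
 *		.packets_lowat = 8,
 *		.packets_hiwat = 64,
 *		.bytes_lowat   = 2 * 1024,
 *		.bytes_hiwat   = 64 * 1024,
 *		.interval_time = 1000 * 1000,
 *	};
 *
 * would pass the checks above.  Validation never rejects interval_time;
 * a non-zero value below IF_RXPOLL_INTERVALTIME_MIN is silently raised
 * to the minimum.  The numbers are made up for illustration.
 */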
3819
3820void
3821dlil_rxpoll_update_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3822{
3823 u_int64_t sample_holdtime, inbw;
3824
3825 if ((inbw = ifnet_input_linkrate(ifp)) == 0 && p == NULL) {
3826 sample_holdtime = 0; /* polling is disabled */
3827 ifp->if_rxpoll_wlowat = ifp->if_rxpoll_plowat =
3828 ifp->if_rxpoll_blowat = 0;
3829 ifp->if_rxpoll_whiwat = ifp->if_rxpoll_phiwat =
3830 ifp->if_rxpoll_bhiwat = (u_int32_t)-1;
3831 ifp->if_rxpoll_plim = 0;
3832 ifp->if_rxpoll_ival = IF_RXPOLL_INTERVALTIME_MIN;
3833 } else {
3834 u_int32_t plowat, phiwat, blowat, bhiwat, plim;
3835 u_int64_t ival;
3836 unsigned int n, i;
3837
3838 for (n = 0, i = 0; rxpoll_tbl[i].speed != 0; i++) {
3839 if (inbw < rxpoll_tbl[i].speed) {
3840 break;
3841 }
3842 n = i;
3843 }
3844 /* auto-tune if caller didn't specify a value */
3845 plowat = ((p == NULL || p->packets_lowat == 0) ?
3846 rxpoll_tbl[n].plowat : p->packets_lowat);
3847 phiwat = ((p == NULL || p->packets_hiwat == 0) ?
3848 rxpoll_tbl[n].phiwat : p->packets_hiwat);
3849 blowat = ((p == NULL || p->bytes_lowat == 0) ?
3850 rxpoll_tbl[n].blowat : p->bytes_lowat);
3851 bhiwat = ((p == NULL || p->bytes_hiwat == 0) ?
3852 rxpoll_tbl[n].bhiwat : p->bytes_hiwat);
3853 plim = ((p == NULL || p->packets_limit == 0 ||
3854 if_rxpoll_max != 0) ? if_rxpoll_max : p->packets_limit);
3855 ival = ((p == NULL || p->interval_time == 0 ||
3856 if_rxpoll_interval_time != IF_RXPOLL_INTERVALTIME) ?
3857 if_rxpoll_interval_time : p->interval_time);
3858
3859 VERIFY(plowat != 0 && phiwat != 0);
3860 VERIFY(blowat != 0 && bhiwat != 0);
3861 VERIFY(ival >= IF_RXPOLL_INTERVALTIME_MIN);
3862
3863 sample_holdtime = if_rxpoll_sample_holdtime;
3864 ifp->if_rxpoll_wlowat = if_sysctl_rxpoll_wlowat;
3865 ifp->if_rxpoll_whiwat = if_sysctl_rxpoll_whiwat;
3866 ifp->if_rxpoll_plowat = plowat;
3867 ifp->if_rxpoll_phiwat = phiwat;
3868 ifp->if_rxpoll_blowat = blowat;
3869 ifp->if_rxpoll_bhiwat = bhiwat;
3870 ifp->if_rxpoll_plim = plim;
3871 ifp->if_rxpoll_ival = ival;
3872 }
3873
3874 net_nsectimer(&if_rxpoll_mode_holdtime, &ifp->if_poll_mode_holdtime);
3875 net_nsectimer(&sample_holdtime, &ifp->if_poll_sample_holdtime);
3876
3877 if (dlil_verbose) {
3878 DLIL_PRINTF("%s: speed %llu bps, sample per %llu nsec, "
3879 "poll interval %llu nsec, pkts per poll %u, "
3880 "pkt limits [%u/%u], wreq limits [%u/%u], "
3881 "bytes limits [%u/%u]\n", if_name(ifp),
3882 inbw, sample_holdtime, ifp->if_rxpoll_ival,
3883 ifp->if_rxpoll_plim, ifp->if_rxpoll_plowat,
3884 ifp->if_rxpoll_phiwat, ifp->if_rxpoll_wlowat,
3885 ifp->if_rxpoll_whiwat, ifp->if_rxpoll_blowat,
3886 ifp->if_rxpoll_bhiwat);
3887 }
3888}
3889
3890/*
3891 * Must be called on an attached ifnet (caller is expected to check.)
3892 * Caller may pass NULL for poll parameters to indicate "auto-tuning."
3893 */
3894errno_t
3895dlil_rxpoll_set_params(struct ifnet *ifp, struct ifnet_poll_params *p,
3896 boolean_t locked)
3897{
3898 errno_t err;
3899 struct dlil_threading_info *inp;
3900
3901 VERIFY(ifp != NULL);
3902 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3903 return ENXIO;
3904 }
3905 err = dlil_rxpoll_validate_params(p);
3906 if (err != 0) {
3907 return err;
3908 }
3909
3910 if (!locked) {
3911 lck_mtx_lock(lck: &inp->dlth_lock);
3912 }
3913 LCK_MTX_ASSERT(&inp->dlth_lock, LCK_MTX_ASSERT_OWNED);
3914	/*
3915	 * Normally, we'd reset the parameters to the auto-tuned values
3916	 * if the input thread detects a change in link rate.  If the
3917	 * driver provides its own parameters right after the link rate
3918	 * changes, but before the input thread gets to run, we want to
3919	 * make sure to keep the driver's values.  Clearing if_poll_update
3920	 * will achieve that.
3921	 */
3922 if (p != NULL && !locked && ifp->if_poll_update != 0) {
3923 ifp->if_poll_update = 0;
3924 }
3925 dlil_rxpoll_update_params(ifp, p);
3926 if (!locked) {
3927 lck_mtx_unlock(lck: &inp->dlth_lock);
3928 }
3929 return 0;
3930}
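/*
 * Sketch of how a caller might reach this function (the exact external
 * entry point, e.g. the ifnet polling KPI, is an assumption here and not
 * dictated by this file):
 *
 *	struct ifnet_poll_params p;
 *	bzero(&p, sizeof(p));
 *	p.packets_limit = 32;
 *	error = dlil_rxpoll_set_params(ifp, &p, FALSE);
 *
 * Passing p == NULL (or leaving fields at zero) reverts the interface to
 * the auto-tuned values derived from the link rate in
 * dlil_rxpoll_update_params().
 */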
3931
3932/*
3933 * Must be called on an attached ifnet (caller is expected to check.)
3934 */
3935errno_t
3936dlil_rxpoll_get_params(struct ifnet *ifp, struct ifnet_poll_params *p)
3937{
3938 struct dlil_threading_info *inp;
3939
3940 VERIFY(ifp != NULL && p != NULL);
3941 if (!(ifp->if_eflags & IFEF_RXPOLL) || (inp = ifp->if_inp) == NULL) {
3942 return ENXIO;
3943 }
3944
3945 bzero(s: p, n: sizeof(*p));
3946
3947 lck_mtx_lock(lck: &inp->dlth_lock);
3948 p->packets_limit = ifp->if_rxpoll_plim;
3949 p->packets_lowat = ifp->if_rxpoll_plowat;
3950 p->packets_hiwat = ifp->if_rxpoll_phiwat;
3951 p->bytes_lowat = ifp->if_rxpoll_blowat;
3952 p->bytes_hiwat = ifp->if_rxpoll_bhiwat;
3953 p->interval_time = ifp->if_rxpoll_ival;
3954 lck_mtx_unlock(lck: &inp->dlth_lock);
3955
3956 return 0;
3957}
3958
3959errno_t
3960ifnet_input(struct ifnet *ifp, struct mbuf *m_head,
3961 const struct ifnet_stat_increment_param *s)
3962{
3963 return ifnet_input_common(ifp, m_head, NULL, s, FALSE, FALSE);
3964}
3965
3966errno_t
3967ifnet_input_extended(struct ifnet *ifp, struct mbuf *m_head,
3968 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3969{
3970 return ifnet_input_common(ifp, m_head, m_tail, s, TRUE, FALSE);
3971}
3972
3973errno_t
3974ifnet_input_poll(struct ifnet *ifp, struct mbuf *m_head,
3975 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s)
3976{
3977 return ifnet_input_common(ifp, m_head, m_tail, s,
3978 (m_head != NULL), TRUE);
3979}
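/*
 * Illustrative receive-side example (hypothetical driver, not part of
 * this file) showing how a chain of packets linked via their nextpkt
 * pointers would be handed to DLIL.  The extended variant requires an
 * accurate packet count (a mismatch panics below); the byte count may
 * include link-layer headers and is treated as an approximation:
 *
 *	struct ifnet_stat_increment_param s;
 *	bzero(&s, sizeof(s));
 *	s.packets_in = rx_count;
 *	s.bytes_in = rx_bytes;
 *	(void) ifnet_input_extended(ifp, m_head, m_tail, &s);
 *
 * ifnet_input() may be used instead when the driver does not track the
 * tail mbuf or per-chain statistics; the chain is then walked here to
 * compute them.
 */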
3980
3981static errno_t
3982ifnet_input_common(struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
3983 const struct ifnet_stat_increment_param *s, boolean_t ext, boolean_t poll)
3984{
3985 dlil_input_func input_func;
3986 struct ifnet_stat_increment_param _s;
3987 u_int32_t m_cnt = 0, m_size = 0;
3988 struct mbuf *last;
3989 errno_t err = 0;
3990
3991 if ((m_head == NULL && !poll) || (s == NULL && ext)) {
3992 if (m_head != NULL) {
3993 mbuf_freem_list(mbuf: m_head);
3994 }
3995 return EINVAL;
3996 }
3997
3998 VERIFY(m_head != NULL || (s == NULL && m_tail == NULL && !ext && poll));
3999 VERIFY(m_tail == NULL || ext);
4000 VERIFY(s != NULL || !ext);
4001
4002 /*
4003 * Drop the packet(s) if the parameters are invalid, or if the
4004 * interface is no longer attached; else hold an IO refcnt to
4005 * prevent it from being detached (will be released below.)
4006 */
4007 if (ifp == NULL || (ifp != lo_ifp && !ifnet_datamov_begin(ifp))) {
4008 if (m_head != NULL) {
4009 mbuf_freem_list(mbuf: m_head);
4010 }
4011 return EINVAL;
4012 }
4013
4014 input_func = ifp->if_input_dlil;
4015 VERIFY(input_func != NULL);
4016
4017 if (m_tail == NULL) {
4018 last = m_head;
4019 while (m_head != NULL) {
4020#if IFNET_INPUT_SANITY_CHK
4021 if (__improbable(dlil_input_sanity_check != 0)) {
4022 DLIL_INPUT_CHECK(last, ifp);
4023 }
4024#endif /* IFNET_INPUT_SANITY_CHK */
4025 m_cnt++;
4026 m_size += m_length(last);
4027 if (mbuf_nextpkt(mbuf: last) == NULL) {
4028 break;
4029 }
4030 last = mbuf_nextpkt(mbuf: last);
4031 }
4032 m_tail = last;
4033 } else {
4034#if IFNET_INPUT_SANITY_CHK
4035 if (__improbable(dlil_input_sanity_check != 0)) {
4036 last = m_head;
4037 while (1) {
4038 DLIL_INPUT_CHECK(last, ifp);
4039 m_cnt++;
4040 m_size += m_length(last);
4041 if (mbuf_nextpkt(mbuf: last) == NULL) {
4042 break;
4043 }
4044 last = mbuf_nextpkt(mbuf: last);
4045 }
4046 } else {
4047 m_cnt = s->packets_in;
4048 m_size = s->bytes_in;
4049 last = m_tail;
4050 }
4051#else
4052 m_cnt = s->packets_in;
4053 m_size = s->bytes_in;
4054 last = m_tail;
4055#endif /* IFNET_INPUT_SANITY_CHK */
4056 }
4057
4058 if (last != m_tail) {
4059 panic_plain("%s: invalid input packet chain for %s, "
4060 "tail mbuf %p instead of %p\n", __func__, if_name(ifp),
4061 m_tail, last);
4062 }
4063
4064 /*
4065 * Assert packet count only for the extended variant, for backwards
4066 * compatibility, since this came directly from the device driver.
4067 * Relax this assertion for input bytes, as the driver may have
4068 * included the link-layer headers in the computation; hence
4069 * m_size is just an approximation.
4070 */
4071 if (ext && s->packets_in != m_cnt) {
4072 panic_plain("%s: input packet count mismatch for %s, "
4073 "%d instead of %d\n", __func__, if_name(ifp),
4074 s->packets_in, m_cnt);
4075 }
4076
4077 if (s == NULL) {
4078 bzero(s: &_s, n: sizeof(_s));
4079 s = &_s;
4080 } else {
4081 _s = *s;
4082 }
4083 _s.packets_in = m_cnt;
4084 _s.bytes_in = m_size;
4085
4086 err = (*input_func)(ifp, m_head, m_tail, s, poll, current_thread());
4087
4088 if (ifp != lo_ifp) {
4089 /* Release the IO refcnt */
4090 ifnet_datamov_end(ifp);
4091 }
4092
4093 return err;
4094}
4095
4096#if SKYWALK
4097errno_t
4098dlil_set_input_handler(struct ifnet *ifp, dlil_input_func fn)
4099{
4100 return os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
4101 ptrauth_nop_cast(void *, &dlil_input_handler),
4102 ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4103}
4104
4105void
4106dlil_reset_input_handler(struct ifnet *ifp)
4107{
4108 while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_input_dlil,
4109 ptrauth_nop_cast(void *, ifp->if_input_dlil),
4110 ptrauth_nop_cast(void *, &dlil_input_handler), acq_rel)) {
4111 ;
4112 }
4113}
4114
4115errno_t
4116dlil_set_output_handler(struct ifnet *ifp, dlil_output_func fn)
4117{
4118 return os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
4119 ptrauth_nop_cast(void *, &dlil_output_handler),
4120 ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4121}
4122
4123void
4124dlil_reset_output_handler(struct ifnet *ifp)
4125{
4126 while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output_dlil,
4127 ptrauth_nop_cast(void *, ifp->if_output_dlil),
4128 ptrauth_nop_cast(void *, &dlil_output_handler), acq_rel)) {
4129 ;
4130 }
4131}
4132#endif /* SKYWALK */
4133
4134errno_t
4135dlil_output_handler(struct ifnet *ifp, struct mbuf *m)
4136{
4137 return ifp->if_output(ifp, m);
4138}
4139
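/*
 * Default DLIL input handler for an ifnet.  Packets are normally handed
 * to the input strategy of the interface's threading info (which enqueues
 * them for the per-interface or main input thread; the main input thread
 * is used when the interface has no dedicated one).  On DEVELOPMENT/DEBUG
 * kernels, a thread marked NET_THREAD_SYNC_RX is instead serviced
 * synchronously via dlil_input_sync().
 */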
4140errno_t
4141dlil_input_handler(struct ifnet *ifp, struct mbuf *m_head,
4142 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
4143 boolean_t poll, struct thread *tp)
4144{
4145 struct dlil_threading_info *inp = ifp->if_inp;
4146
4147 if (__improbable(inp == NULL)) {
4148 inp = dlil_main_input_thread;
4149 }
4150
4151#if (DEVELOPMENT || DEBUG)
4152 if (__improbable(net_thread_is_marked(NET_THREAD_SYNC_RX))) {
4153 return dlil_input_sync(inp, ifp, m_head, m_tail, s, poll, tp);
4154 } else
4155#endif /* (DEVELOPMENT || DEBUG) */
4156 {
4157 return inp->dlth_strategy(inp, ifp, m_head, m_tail, s, poll, tp);
4158 }
4159}
4160
4161/*
4162 * Detect whether a queue contains a burst that needs to be trimmed.
4163 */
4164#define MBUF_QUEUE_IS_OVERCOMMITTED(q) \
4165 __improbable(MAX(if_rcvq_burst_limit, qlimit(q)) < qlen(q) && \
4166 qtype(q) == QP_MBUF)
4167
4168#define MAX_KNOWN_MBUF_CLASS 8
4169
4170static uint32_t
4171dlil_trim_overcomitted_queue_locked(class_queue_t *input_queue,
4172 dlil_freeq_t *freeq, struct ifnet_stat_increment_param *stat_delta)
4173{
4174 uint32_t overcommitted_qlen; /* Length in packets. */
4175 uint64_t overcommitted_qsize; /* Size in bytes. */
4176 uint32_t target_qlen; /* The desired queue length after trimming. */
4177 uint32_t pkts_to_drop = 0; /* Number of packets to drop. */
4178 uint32_t dropped_pkts = 0; /* Number of packets that were dropped. */
4179 uint32_t dropped_bytes = 0; /* Number of dropped bytes. */
4180 struct mbuf *m = NULL, *m_tmp = NULL;
4181
4182 overcommitted_qlen = qlen(input_queue);
4183 overcommitted_qsize = qsize(input_queue);
4184 target_qlen = (qlimit(input_queue) * if_rcvq_trim_pct) / 100;
4185
4186 if (overcommitted_qlen <= target_qlen) {
4187 /*
4188 * The queue is already within the target limits.
4189 */
4190 dropped_pkts = 0;
4191 goto out;
4192 }
4193
4194 pkts_to_drop = overcommitted_qlen - target_qlen;
4195
4196	/*
4197	 * Remove packets from the head of the queue, starting with the
4198	 * oldest, until the desired number of packets has been
4199	 * dropped.
4200	 */
4201 MBUFQ_FOREACH_SAFE(m, &qmbufq(input_queue), m_tmp) {
4202 if (pkts_to_drop <= dropped_pkts) {
4203 break;
4204 }
4205 MBUFQ_REMOVE(&qmbufq(input_queue), m);
4206 MBUFQ_NEXT(m) = NULL;
4207 MBUFQ_ENQUEUE(freeq, m);
4208
4209 dropped_pkts += 1;
4210 dropped_bytes += m_length(m);
4211 }
4212
4213 /*
4214 * Adjust the length and the estimated size of the queue
4215 * after trimming.
4216 */
4217 VERIFY(overcommitted_qlen == target_qlen + dropped_pkts);
4218 qlen(input_queue) = target_qlen;
4219
4220 /* qsize() is an approximation. */
4221 if (dropped_bytes < qsize(input_queue)) {
4222 qsize(input_queue) -= dropped_bytes;
4223 } else {
4224 qsize(input_queue) = 0;
4225 }
4226
4227 /*
4228 * Adjust the ifnet statistics increments, if needed.
4229 */
4230 stat_delta->dropped += dropped_pkts;
4231 if (dropped_pkts < stat_delta->packets_in) {
4232 stat_delta->packets_in -= dropped_pkts;
4233 } else {
4234 stat_delta->packets_in = 0;
4235 }
4236 if (dropped_bytes < stat_delta->bytes_in) {
4237 stat_delta->bytes_in -= dropped_bytes;
4238 } else {
4239 stat_delta->bytes_in = 0;
4240 }
4241
4242out:
4243 if (dlil_verbose) {
4244		/*
4245		 * The basic information about the drop is logged
4246		 * by the invoking function (dlil_input_{,a}sync).
4247		 * If the `dlil_verbose' flag is set, provide additional
4248		 * information that can be useful for debugging.
4249		 */
4250 DLIL_PRINTF("%s: "
4251 "qlen: %u -> %u, "
4252 "qsize: %llu -> %llu "
4253 "qlimit: %u (sysctl: %u) "
4254 "target_qlen: %u (if_rcvq_trim_pct: %u) pkts_to_drop: %u "
4255 "dropped_pkts: %u dropped_bytes %u\n",
4256 __func__,
4257 overcommitted_qlen, qlen(input_queue),
4258 overcommitted_qsize, qsize(input_queue),
4259 qlimit(input_queue), if_rcvq_burst_limit,
4260 target_qlen, if_rcvq_trim_pct, pkts_to_drop,
4261 dropped_pkts, dropped_bytes);
4262 }
4263
4264 return dropped_pkts;
4265}
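/*
 * Worked example (numbers are illustrative only): with a queue limit of
 * 1000 packets and if_rcvq_trim_pct of 80, the target length after
 * trimming is (1000 * 80) / 100 = 800 packets.  If a burst pushed the
 * queue to 1200 packets, the 400 oldest packets are unlinked onto
 * `freeq' (to be freed by the caller once dlth_lock is dropped) and the
 * caller's stat increments are reduced accordingly.
 */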
4266
4267static errno_t
4268dlil_input_async(struct dlil_threading_info *inp,
4269 struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
4270 const struct ifnet_stat_increment_param *s, boolean_t poll,
4271 struct thread *tp)
4272{
4273 u_int32_t m_cnt = s->packets_in;
4274 u_int32_t m_size = s->bytes_in;
4275 boolean_t notify = FALSE;
4276 struct ifnet_stat_increment_param s_adj = *s;
4277 dlil_freeq_t freeq;
4278 MBUFQ_INIT(&freeq);
4279
4280 /*
4281 * If there is a matching DLIL input thread associated with an
4282 * affinity set, associate this thread with the same set. We
4283 * will only do this once.
4284 */
4285 lck_mtx_lock_spin(lck: &inp->dlth_lock);
4286 if (inp != dlil_main_input_thread && inp->dlth_affinity && tp != NULL &&
4287 ((!poll && inp->dlth_driver_thread == THREAD_NULL) ||
4288 (poll && inp->dlth_poller_thread == THREAD_NULL))) {
4289 u_int32_t tag = inp->dlth_affinity_tag;
4290
4291 if (poll) {
4292 VERIFY(inp->dlth_poller_thread == THREAD_NULL);
4293 inp->dlth_poller_thread = tp;
4294 } else {
4295 VERIFY(inp->dlth_driver_thread == THREAD_NULL);
4296 inp->dlth_driver_thread = tp;
4297 }
4298 lck_mtx_unlock(lck: &inp->dlth_lock);
4299
4300 /* Associate the current thread with the new affinity tag */
4301 (void) dlil_affinity_set(tp, tag);
4302
4303 /*
4304 * Take a reference on the current thread; during detach,
4305 * we will need to refer to it in order to tear down its
4306 * affinity.
4307 */
4308 thread_reference(thread: tp);
4309 lck_mtx_lock_spin(lck: &inp->dlth_lock);
4310 }
4311
4312 VERIFY(m_head != NULL || (m_tail == NULL && m_cnt == 0));
4313
4314 /*
4315 * Because of loopbacked multicast we cannot stuff the ifp in
4316 * the rcvif of the packet header: loopback (lo0) packets use a
4317 * dedicated list so that we can later associate them with lo_ifp
4318 * on their way up the stack. Packets for other interfaces without
4319 * dedicated input threads go to the regular list.
4320 */
4321 if (m_head != NULL) {
4322 classq_pkt_t head, tail;
4323 class_queue_t *input_queue;
4324 CLASSQ_PKT_INIT_MBUF(&head, m_head);
4325 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
4326 if (inp == dlil_main_input_thread && ifp == lo_ifp) {
4327 struct dlil_main_threading_info *inpm =
4328 (struct dlil_main_threading_info *)inp;
4329 input_queue = &inpm->lo_rcvq_pkts;
4330 } else {
4331 input_queue = &inp->dlth_pkts;
4332 }
4333
4334 _addq_multi(input_queue, &head, &tail, m_cnt, m_size);
4335
4336 if (MBUF_QUEUE_IS_OVERCOMMITTED(input_queue)) {
4337 dlil_trim_overcomitted_queue_locked(input_queue, freeq: &freeq, stat_delta: &s_adj);
4338 inp->dlth_trim_pkts_dropped += s_adj.dropped;
4339 inp->dlth_trim_cnt += 1;
4340
4341 os_log_error(OS_LOG_DEFAULT,
4342 "%s %s burst limit %u (sysctl: %u) exceeded. "
4343 "%u packets dropped [%u total in %u events]. new qlen %u ",
4344 __func__, if_name(ifp), qlimit(input_queue), if_rcvq_burst_limit,
4345 s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
4346 qlen(input_queue));
4347 }
4348 }
4349
4350#if IFNET_INPUT_SANITY_CHK
4351	/*
4352	 * Verify that the original stat increment parameter
4353	 * accurately describes the input chain `m_head`.
4354	 * This is not affected by the trimming of the input queue.
4355	 */
4356 if (__improbable(dlil_input_sanity_check != 0)) {
4357 u_int32_t count = 0, size = 0;
4358 struct mbuf *m0;
4359
4360 for (m0 = m_head; m0; m0 = mbuf_nextpkt(mbuf: m0)) {
4361 size += m_length(m0);
4362 count++;
4363 }
4364
4365 if (count != m_cnt) {
4366 panic_plain("%s: invalid total packet count %u "
4367 "(expected %u)\n", if_name(ifp), count, m_cnt);
4368 /* NOTREACHED */
4369 __builtin_unreachable();
4370 } else if (size != m_size) {
4371 panic_plain("%s: invalid total packet size %u "
4372 "(expected %u)\n", if_name(ifp), size, m_size);
4373 /* NOTREACHED */
4374 __builtin_unreachable();
4375 }
4376
4377 inp->dlth_pkts_cnt += m_cnt;
4378 }
4379#endif /* IFNET_INPUT_SANITY_CHK */
4380
4381 /* NOTE: use the adjusted parameter, vs the original one */
4382 dlil_input_stats_add(&s_adj, inp, ifp, poll);
4383 /*
4384 * If we're using the main input thread, synchronize the
4385 * stats now since we have the interface context. All
4386 * other cases involving dedicated input threads will
4387 * have their stats synchronized there.
4388 */
4389 if (inp == dlil_main_input_thread) {
4390 notify = dlil_input_stats_sync(ifp, inp);
4391 }
4392
4393 dlil_input_wakeup(inp);
4394 lck_mtx_unlock(lck: &inp->dlth_lock);
4395
4396	/*
4397	 * Actual freeing of the excess packets must happen
4398	 * after the dlth_lock has been released.
4399	 */
4400 if (!MBUFQ_EMPTY(&freeq)) {
4401 m_freem_list(MBUFQ_FIRST(&freeq));
4402 }
4403
4404 if (notify) {
4405 ifnet_notify_data_threshold(ifp);
4406 }
4407
4408 return 0;
4409}
4410
4411static errno_t
4412dlil_input_sync(struct dlil_threading_info *inp,
4413 struct ifnet *ifp, struct mbuf *m_head, struct mbuf *m_tail,
4414 const struct ifnet_stat_increment_param *s, boolean_t poll,
4415 struct thread *tp)
4416{
4417#pragma unused(tp)
4418 u_int32_t m_cnt = s->packets_in;
4419 u_int32_t m_size = s->bytes_in;
4420 boolean_t notify = FALSE;
4421 classq_pkt_t head, tail;
4422 struct ifnet_stat_increment_param s_adj = *s;
4423 dlil_freeq_t freeq;
4424 MBUFQ_INIT(&freeq);
4425
4426 ASSERT(inp != dlil_main_input_thread);
4427
4428 /* XXX: should we just assert instead? */
4429 if (__improbable(m_head == NULL)) {
4430 return 0;
4431 }
4432
4433 CLASSQ_PKT_INIT_MBUF(&head, m_head);
4434 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
4435
4436 lck_mtx_lock_spin(lck: &inp->dlth_lock);
4437 _addq_multi(&inp->dlth_pkts, &head, &tail, m_cnt, m_size);
4438
4439 if (MBUF_QUEUE_IS_OVERCOMMITTED(&inp->dlth_pkts)) {
4440 dlil_trim_overcomitted_queue_locked(input_queue: &inp->dlth_pkts, freeq: &freeq, stat_delta: &s_adj);
4441 inp->dlth_trim_pkts_dropped += s_adj.dropped;
4442 inp->dlth_trim_cnt += 1;
4443
4444 os_log_error(OS_LOG_DEFAULT,
4445 "%s %s burst limit %u (sysctl: %u) exceeded. "
4446 "%u packets dropped [%u total in %u events]. new qlen %u \n",
4447 __func__, if_name(ifp), qlimit(&inp->dlth_pkts), if_rcvq_burst_limit,
4448 s_adj.dropped, inp->dlth_trim_pkts_dropped, inp->dlth_trim_cnt,
4449 qlen(&inp->dlth_pkts));
4450 }
4451
4452#if IFNET_INPUT_SANITY_CHK
4453 if (__improbable(dlil_input_sanity_check != 0)) {
4454 u_int32_t count = 0, size = 0;
4455 struct mbuf *m0;
4456
4457 for (m0 = m_head; m0; m0 = mbuf_nextpkt(mbuf: m0)) {
4458 size += m_length(m0);
4459 count++;
4460 }
4461
4462 if (count != m_cnt) {
4463 panic_plain("%s: invalid total packet count %u "
4464 "(expected %u)\n", if_name(ifp), count, m_cnt);
4465 /* NOTREACHED */
4466 __builtin_unreachable();
4467 } else if (size != m_size) {
4468 panic_plain("%s: invalid total packet size %u "
4469 "(expected %u)\n", if_name(ifp), size, m_size);
4470 /* NOTREACHED */
4471 __builtin_unreachable();
4472 }
4473
4474 inp->dlth_pkts_cnt += m_cnt;
4475 }
4476#endif /* IFNET_INPUT_SANITY_CHK */
4477
4478 /* NOTE: use the adjusted parameter, vs the original one */
4479 dlil_input_stats_add(&s_adj, inp, ifp, poll);
4480
4481 m_cnt = qlen(&inp->dlth_pkts);
4482 _getq_all(&inp->dlth_pkts, &head, NULL, NULL, NULL);
4483
4484#if SKYWALK
4485 /*
4486 * If this interface is attached to a netif nexus,
4487 * the stats are already incremented there; otherwise
4488 * do it here.
4489 */
4490 if (!(ifp->if_capabilities & IFCAP_SKYWALK))
4491#endif /* SKYWALK */
4492 notify = dlil_input_stats_sync(ifp, inp);
4493
4494 lck_mtx_unlock(lck: &inp->dlth_lock);
4495
4496	/*
4497	 * Actual freeing of the excess packets must happen
4498	 * after the dlth_lock has been released.
4499	 */
4500 if (!MBUFQ_EMPTY(&freeq)) {
4501 m_freem_list(MBUFQ_FIRST(&freeq));
4502 }
4503
4504 if (notify) {
4505 ifnet_notify_data_threshold(ifp);
4506 }
4507
4508	/*
4509	 * NOTE: we should consider adding thread starvation
4510	 * safeguards here for the case where we are handed a
4511	 * very long chain of packets.
4512	 */
4513 if (head.cp_mbuf != NULL) {
4514 dlil_input_packet_list_extended(NULL, head.cp_mbuf,
4515 m_cnt, ifp->if_poll_mode);
4516 }
4517
4518 return 0;
4519}
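/*
 * Note on the two input paths: dlil_input_async() above only enqueues the
 * chain and wakes up the input thread, whereas the synchronous variant
 * drains the queue and runs dlil_input_packet_list_extended() in the
 * caller's context.  The synchronous path is only reachable on
 * DEVELOPMENT/DEBUG kernels for threads marked NET_THREAD_SYNC_RX (see
 * dlil_input_handler).
 */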
4520
4521#if SKYWALK
4522errno_t
4523ifnet_set_output_handler(struct ifnet *ifp, ifnet_output_func fn)
4524{
4525 return os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
4526 ptrauth_nop_cast(void *, ifp->if_save_output),
4527 ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4528}
4529
4530void
4531ifnet_reset_output_handler(struct ifnet *ifp)
4532{
4533 while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_output,
4534 ptrauth_nop_cast(void *, ifp->if_output),
4535 ptrauth_nop_cast(void *, ifp->if_save_output), acq_rel)) {
4536 ;
4537 }
4538}
4539
4540errno_t
4541ifnet_set_start_handler(struct ifnet *ifp, ifnet_start_func fn)
4542{
4543 return os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
4544 ptrauth_nop_cast(void *, ifp->if_save_start),
4545 ptrauth_nop_cast(void *, fn), acq_rel) ? 0 : EBUSY;
4546}
4547
4548void
4549ifnet_reset_start_handler(struct ifnet *ifp)
4550{
4551 while (!os_atomic_cmpxchg((void * volatile *)&ifp->if_start,
4552 ptrauth_nop_cast(void *, ifp->if_start),
4553 ptrauth_nop_cast(void *, ifp->if_save_start), acq_rel)) {
4554 ;
4555 }
4556}
4557#endif /* SKYWALK */
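/*
 * Like the DLIL input/output handler hooks earlier in this file, the
 * set/reset pairs above install a replacement handler with a single
 * compare-and-swap against the saved default (if_save_output /
 * if_save_start), returning EBUSY if some other handler is already in
 * place, and spin on reset until the swap back to the default succeeds
 * even if the current value changes concurrently.  ptrauth_nop_cast()
 * is a no-op cast that lets the (possibly signed) function pointers be
 * passed through the void * compare-and-swap without being re-signed.
 */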
4558
4559static void
4560ifnet_start_common(struct ifnet *ifp, boolean_t resetfc, boolean_t ignore_delay)
4561{
4562 if (!(ifp->if_eflags & IFEF_TXSTART)) {
4563 return;
4564 }
4565 /*
4566 * If the starter thread is inactive, signal it to do work,
4567 * unless the interface is being flow controlled from below,
4568 * e.g. a virtual interface being flow controlled by a real
4569 * network interface beneath it, or it's been disabled via
4570 * a call to ifnet_disable_output().
4571 */
4572 lck_mtx_lock_spin(lck: &ifp->if_start_lock);
4573 if (ignore_delay) {
4574 ifp->if_start_flags |= IFSF_NO_DELAY;
4575 }
4576 if (resetfc) {
4577 ifp->if_start_flags &= ~IFSF_FLOW_CONTROLLED;
4578 } else if (ifp->if_start_flags & IFSF_FLOW_CONTROLLED) {
4579 lck_mtx_unlock(lck: &ifp->if_start_lock);
4580 return;
4581 }
4582 ifp->if_start_req++;
4583 if (!ifp->if_start_active && ifp->if_start_thread != THREAD_NULL &&
4584 (resetfc || !(ifp->if_eflags & IFEF_ENQUEUE_MULTI) ||
4585 IFCQ_LEN(ifp->if_snd) >= ifp->if_start_delay_qlen ||
4586 ifp->if_start_delayed == 0)) {
4587 (void) wakeup_one(chan: (caddr_t)&ifp->if_start_thread);
4588 }
4589 lck_mtx_unlock(lck: &ifp->if_start_lock);
4590}
4591
4592void
4593ifnet_start(struct ifnet *ifp)
4594{
4595 ifnet_start_common(ifp, FALSE, FALSE);
4596}
4597
4598void
4599ifnet_start_ignore_delay(struct ifnet *ifp)
4600{
4601 ifnet_start_common(ifp, FALSE, TRUE);
4602}
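/*
 * Transmit-side usage sketch (hypothetical driver, not part of this
 * file): once a driver using the IFEF_TXSTART model has room to transmit
 * again (for example after a TX-completion interrupt frees descriptors),
 * it kicks the starter thread so that its if_start callback is invoked
 * and more packets are dequeued from if_snd:
 *
 *	if (descriptors_freed > 0) {
 *		ifnet_start(ifp);
 *	}
 *
 * ifnet_start_ignore_delay() behaves the same but also sets
 * IFSF_NO_DELAY, so a pending IFEF_ENQUEUE_MULTI delay does not hold the
 * dequeue back.
 */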
4603
4604__attribute__((noreturn))
4605static void
4606ifnet_start_thread_func(void *v, wait_result_t w)
4607{
4608#pragma unused(w)
4609 struct ifnet *ifp = v;
4610 char thread_name[MAXTHREADNAMESIZE];
4611
4612 /* Construct the name for this thread, and then apply it. */
4613 bzero(s: thread_name, n: sizeof(thread_name));
4614 (void) snprintf(thread_name, count: sizeof(thread_name),
4615 "ifnet_start_%s", ifp->if_xname);
4616#if SKYWALK
4617 /* override name for native Skywalk interface */
4618 if (ifp->if_eflags & IFEF_SKYWALK_NATIVE) {
4619 (void) snprintf(thread_name, count: sizeof(thread_name),
4620 "skywalk_doorbell_%s_tx", ifp->if_xname);
4621 }
4622#endif /* SKYWALK */
4623 ASSERT(ifp->if_start_thread == current_thread());
4624 thread_set_thread_name(th: current_thread(), name: thread_name);
4625
4626	/*
4627	 * Treat the dedicated starter thread for lo0 as equivalent to
4628	 * the driver workloop thread; if net_affinity is enabled for
4629	 * the main input thread, associate this starter thread with it
4630	 * by binding them with the same affinity tag.  This is done
4631	 * only once (as we only have one lo_ifp, which never goes away.)
4632	 */
4633 if (ifp == lo_ifp) {
4634 struct dlil_threading_info *inp = dlil_main_input_thread;
4635 struct thread *tp = current_thread();
4636#if SKYWALK
4637 /* native skywalk loopback not yet implemented */
4638 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
4639#endif /* SKYWALK */
4640
4641 lck_mtx_lock(lck: &inp->dlth_lock);
4642 if (inp->dlth_affinity) {
4643 u_int32_t tag = inp->dlth_affinity_tag;
4644
4645 VERIFY(inp->dlth_driver_thread == THREAD_NULL);
4646 VERIFY(inp->dlth_poller_thread == THREAD_NULL);
4647 inp->dlth_driver_thread = tp;
4648 lck_mtx_unlock(lck: &inp->dlth_lock);
4649
4650 /* Associate this thread with the affinity tag */
4651 (void) dlil_affinity_set(tp, tag);
4652 } else {
4653 lck_mtx_unlock(lck: &inp->dlth_lock);
4654 }
4655 }
4656
4657 lck_mtx_lock(lck: &ifp->if_start_lock);
4658 VERIFY(!ifp->if_start_embryonic && !ifp->if_start_active);
4659 (void) assert_wait(event: &ifp->if_start_thread, THREAD_UNINT);
4660 ifp->if_start_embryonic = 1;
4661 /* wake up once to get out of embryonic state */
4662 ifp->if_start_req++;
4663 (void) wakeup_one(chan: (caddr_t)&ifp->if_start_thread);
4664 lck_mtx_unlock(lck: &ifp->if_start_lock);
4665 (void) thread_block_parameter(continuation: ifnet_start_thread_cont, parameter: ifp);
4666 /* NOTREACHED */
4667 __builtin_unreachable();
4668}
4669
4670__attribute__((noreturn))
4671static void
4672ifnet_start_thread_cont(void *v, wait_result_t wres)
4673{
4674 struct ifnet *ifp = v;
4675 struct ifclassq *ifq = ifp->if_snd;
4676
4677 lck_mtx_lock_spin(lck: &ifp->if_start_lock);
4678 if (__improbable(wres == THREAD_INTERRUPTED ||
4679 (ifp->if_start_flags & IFSF_TERMINATING) != 0)) {
4680 goto terminate;
4681 }
4682
4683 if (__improbable(ifp->if_start_embryonic)) {
4684 ifp->if_start_embryonic = 0;
4685 lck_mtx_unlock(lck: &ifp->if_start_lock);
4686 ifnet_decr_pending_thread_count(ifp);
4687 lck_mtx_lock_spin(lck: &ifp->if_start_lock);
4688 goto skip;
4689 }
4690
4691 ifp->if_start_active = 1;
4692
4693	/*
4694	 * Keep servicing until there are no more requests.
4695	 */
4696 for (;;) {
4697 u_int32_t req = ifp->if_start_req;
4698 if ((ifp->if_start_flags & IFSF_NO_DELAY) == 0 &&
4699 !IFCQ_IS_EMPTY(ifq) &&
4700 (ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
4701 ifp->if_start_delayed == 0 &&
4702 IFCQ_LEN(ifq) < ifp->if_start_delay_qlen &&
4703 (ifp->if_eflags & IFEF_DELAY_START)) {
4704 ifp->if_start_delayed = 1;
4705 ifnet_start_delayed++;
4706 break;
4707 }
4708 ifp->if_start_flags &= ~IFSF_NO_DELAY;
4709 ifp->if_start_delayed = 0;
4710 lck_mtx_unlock(lck: &ifp->if_start_lock);
4711
4712 /*
4713 * If no longer attached, don't call start because ifp
4714 * is being destroyed; else hold an IO refcnt to
4715 * prevent the interface from being detached (will be
4716 * released below.)
4717 */
4718 if (!ifnet_datamov_begin(ifp)) {
4719 lck_mtx_lock_spin(lck: &ifp->if_start_lock);
4720 break;
4721 }
4722
4723 /* invoke the driver's start routine */
4724 ((*ifp->if_start)(ifp));
4725
4726 /*
4727 * Release the io ref count taken above.
4728 */
4729 ifnet_datamov_end(ifp);
4730
4731 lck_mtx_lock_spin(lck: &ifp->if_start_lock);
4732
4733 /*
4734 * If there's no pending request or if the
4735 * interface has been disabled, we're done.
4736 */
4737#define _IFSF_DISABLED (IFSF_FLOW_CONTROLLED | IFSF_TERMINATING)
4738 if (req == ifp->if_start_req ||
4739 (ifp->if_start_flags & _IFSF_DISABLED) != 0) {
4740 break;
4741 }
4742 }
4743skip:
4744 ifp->if_start_req = 0;
4745 ifp->if_start_active = 0;
4746
4747#if SKYWALK
4748	/*
4749	 * Wake up any waiters, e.g. any threads waiting to
4750	 * detach the interface from the flowswitch, etc.
4751	 */
4752 if (ifp->if_start_waiters != 0) {
4753 ifp->if_start_waiters = 0;
4754 wakeup(chan: &ifp->if_start_waiters);
4755 }
4756#endif /* SKYWALK */
4757 if (__probable((ifp->if_start_flags & IFSF_TERMINATING) == 0)) {
4758 uint64_t deadline = TIMEOUT_WAIT_FOREVER;
4759 struct timespec delay_start_ts;
4760 struct timespec *ts = NULL;
4761
4762 if (ts == NULL) {
4763 ts = ((IFCQ_TBR_IS_ENABLED(ifq) && !IFCQ_IS_EMPTY(ifq)) ?
4764 &ifp->if_start_cycle : NULL);
4765 }
4766
4767 if (ts == NULL && ifp->if_start_delayed == 1) {
4768 delay_start_ts.tv_sec = 0;
4769 delay_start_ts.tv_nsec = ifp->if_start_delay_timeout;
4770 ts = &delay_start_ts;
4771 }
4772
4773 if (ts != NULL && ts->tv_sec == 0 && ts->tv_nsec == 0) {
4774 ts = NULL;
4775 }
4776
4777 if (__improbable(ts != NULL)) {
4778 clock_interval_to_deadline(interval: (uint32_t)(ts->tv_nsec +
4779 (ts->tv_sec * NSEC_PER_SEC)), scale_factor: 1, result: &deadline);
4780 }
4781
4782 (void) assert_wait_deadline(event: &ifp->if_start_thread,
4783 THREAD_UNINT, deadline);
4784 lck_mtx_unlock(lck: &ifp->if_start_lock);
4785 (void) thread_block_parameter(continuation: ifnet_start_thread_cont, parameter: ifp);
4786 /* NOTREACHED */
4787 } else {
4788terminate:
4789 /* interface is detached? */
4790 ifnet_set_start_cycle(ifp, NULL);
4791
4792 /* clear if_start_thread to allow termination to continue */
4793 ASSERT(ifp->if_start_thread != THREAD_NULL);
4794 ifp->if_start_thread = THREAD_NULL;
4795 wakeup(chan: (caddr_t)&ifp->if_start_thread);
4796 lck_mtx_unlock(lck: &ifp->if_start_lock);
4797
4798 if (dlil_verbose) {
4799 DLIL_PRINTF("%s: starter thread terminated\n",
4800 if_name(ifp));
4801 }
4802
4803 /* for the extra refcnt from kernel_thread_start() */
4804 thread_deallocate(thread: current_thread());
4805 /* this is the end */
4806 thread_terminate(target_act: current_thread());
4807 /* NOTREACHED */
4808 }
4809
4810 /* must never get here */
4811 VERIFY(0);
4812 /* NOTREACHED */
4813 __builtin_unreachable();
4814}
4815
4816void
4817ifnet_set_start_cycle(struct ifnet *ifp, struct timespec *ts)
4818{
4819 if (ts == NULL) {
4820 bzero(s: &ifp->if_start_cycle, n: sizeof(ifp->if_start_cycle));
4821 } else {
4822 *(&ifp->if_start_cycle) = *ts;
4823 }
4824
4825 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
4826 DLIL_PRINTF("%s: restart interval set to %lu nsec\n",
4827 if_name(ifp), ts->tv_nsec);
4828 }
4829}
4830
4831static inline void
4832ifnet_poll_wakeup(struct ifnet *ifp)
4833{
4834 LCK_MTX_ASSERT(&ifp->if_poll_lock, LCK_MTX_ASSERT_OWNED);
4835
4836 ifp->if_poll_req++;
4837 if (!(ifp->if_poll_flags & IF_POLLF_RUNNING) &&
4838 ifp->if_poll_thread != THREAD_NULL) {
4839 wakeup_one(chan: (caddr_t)&ifp->if_poll_thread);
4840 }
4841}
4842
4843void
4844ifnet_poll(struct ifnet *ifp)
4845{
4846 /*
4847 * If the poller thread is inactive, signal it to do work.
4848 */
4849 lck_mtx_lock_spin(lck: &ifp->if_poll_lock);
4850 ifnet_poll_wakeup(ifp);
4851 lck_mtx_unlock(lck: &ifp->if_poll_lock);
4852}
4853
4854__attribute__((noreturn))
4855static void
4856ifnet_poll_thread_func(void *v, wait_result_t w)
4857{
4858#pragma unused(w)
4859 char thread_name[MAXTHREADNAMESIZE];
4860 struct ifnet *ifp = v;
4861
4862 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
4863 VERIFY(current_thread() == ifp->if_poll_thread);
4864
4865 /* construct the name for this thread, and then apply it */
4866 bzero(s: thread_name, n: sizeof(thread_name));
4867 (void) snprintf(thread_name, count: sizeof(thread_name),
4868 "ifnet_poller_%s", ifp->if_xname);
4869 thread_set_thread_name(th: ifp->if_poll_thread, name: thread_name);
4870
4871 lck_mtx_lock(lck: &ifp->if_poll_lock);
4872 VERIFY(!(ifp->if_poll_flags & (IF_POLLF_EMBRYONIC | IF_POLLF_RUNNING)));
4873 (void) assert_wait(event: &ifp->if_poll_thread, THREAD_UNINT);
4874 ifp->if_poll_flags |= IF_POLLF_EMBRYONIC;
4875 /* wake up once to get out of embryonic state */
4876 ifnet_poll_wakeup(ifp);
4877 lck_mtx_unlock(lck: &ifp->if_poll_lock);
4878 (void) thread_block_parameter(continuation: ifnet_poll_thread_cont, parameter: ifp);
4879 /* NOTREACHED */
4880 __builtin_unreachable();
4881}
4882
4883__attribute__((noreturn))
4884static void
4885ifnet_poll_thread_cont(void *v, wait_result_t wres)
4886{
4887 struct dlil_threading_info *inp;
4888 struct ifnet *ifp = v;
4889 struct ifnet_stat_increment_param s;
4890 struct timespec start_time;
4891
4892 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
4893
4894 bzero(s: &s, n: sizeof(s));
4895 net_timerclear(&start_time);
4896
4897 lck_mtx_lock_spin(lck: &ifp->if_poll_lock);
4898 if (__improbable(wres == THREAD_INTERRUPTED ||
4899 (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0)) {
4900 goto terminate;
4901 }
4902
4903 inp = ifp->if_inp;
4904 VERIFY(inp != NULL);
4905
4906 if (__improbable(ifp->if_poll_flags & IF_POLLF_EMBRYONIC)) {
4907 ifp->if_poll_flags &= ~IF_POLLF_EMBRYONIC;
4908 lck_mtx_unlock(lck: &ifp->if_poll_lock);
4909 ifnet_decr_pending_thread_count(ifp);
4910 lck_mtx_lock_spin(lck: &ifp->if_poll_lock);
4911 goto skip;
4912 }
4913
4914 ifp->if_poll_flags |= IF_POLLF_RUNNING;
4915
4916	/*
4917	 * Keep servicing until there are no more requests.
4918	 */
4919 for (;;) {
4920 struct mbuf *m_head, *m_tail;
4921 u_int32_t m_lim, m_cnt, m_totlen;
4922 u_int16_t req = ifp->if_poll_req;
4923
4924 m_lim = (ifp->if_rxpoll_plim != 0) ? ifp->if_rxpoll_plim :
4925 MAX((qlimit(&inp->dlth_pkts)), (ifp->if_rxpoll_phiwat << 2));
4926 lck_mtx_unlock(lck: &ifp->if_poll_lock);
4927
4928 /*
4929 * If no longer attached, there's nothing to do;
4930 * else hold an IO refcnt to prevent the interface
4931 * from being detached (will be released below.)
4932 */
4933 if (!ifnet_is_attached(ifp, refio: 1)) {
4934 lck_mtx_lock_spin(lck: &ifp->if_poll_lock);
4935 break;
4936 }
4937
4938 if (dlil_verbose > 1) {
4939 DLIL_PRINTF("%s: polling up to %d pkts, "
4940 "pkts avg %d max %d, wreq avg %d, "
4941 "bytes avg %d\n",
4942 if_name(ifp), m_lim,
4943 ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
4944 ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
4945 }
4946
4947 /* invoke the driver's input poll routine */
4948 ((*ifp->if_input_poll)(ifp, 0, m_lim, &m_head, &m_tail,
4949 &m_cnt, &m_totlen));
4950
4951 if (m_head != NULL) {
4952 VERIFY(m_tail != NULL && m_cnt > 0);
4953
4954 if (dlil_verbose > 1) {
4955 DLIL_PRINTF("%s: polled %d pkts, "
4956 "pkts avg %d max %d, wreq avg %d, "
4957 "bytes avg %d\n",
4958 if_name(ifp), m_cnt,
4959 ifp->if_rxpoll_pavg, ifp->if_rxpoll_pmax,
4960 ifp->if_rxpoll_wavg, ifp->if_rxpoll_bavg);
4961 }
4962
4963 /* stats are required for extended variant */
4964 s.packets_in = m_cnt;
4965 s.bytes_in = m_totlen;
4966
4967 (void) ifnet_input_common(ifp, m_head, m_tail,
4968 s: &s, TRUE, TRUE);
4969 } else {
4970 if (dlil_verbose > 1) {
4971 DLIL_PRINTF("%s: no packets, "
4972 "pkts avg %d max %d, wreq avg %d, "
4973 "bytes avg %d\n",
4974 if_name(ifp), ifp->if_rxpoll_pavg,
4975 ifp->if_rxpoll_pmax, ifp->if_rxpoll_wavg,
4976 ifp->if_rxpoll_bavg);
4977 }
4978
4979 (void) ifnet_input_common(ifp, NULL, NULL,
4980 NULL, FALSE, TRUE);
4981 }
4982
4983 /* Release the io ref count */
4984 ifnet_decr_iorefcnt(ifp);
4985
4986 lck_mtx_lock_spin(lck: &ifp->if_poll_lock);
4987
4988 /* if there's no pending request, we're done */
4989 if (req == ifp->if_poll_req ||
4990 (ifp->if_poll_flags & IF_POLLF_TERMINATING) != 0) {
4991 break;
4992 }
4993 }
4994skip:
4995 ifp->if_poll_req = 0;
4996 ifp->if_poll_flags &= ~IF_POLLF_RUNNING;
4997
4998 if (__probable((ifp->if_poll_flags & IF_POLLF_TERMINATING) == 0)) {
4999 uint64_t deadline = TIMEOUT_WAIT_FOREVER;
5000 struct timespec *ts;
5001
5002		/*
5003		 * Wake up N ns from now, else sleep indefinitely (ts = NULL)
5004		 * until ifnet_poll() is called again.
5005		 */
5006 ts = &ifp->if_poll_cycle;
5007 if (ts->tv_sec == 0 && ts->tv_nsec == 0) {
5008 ts = NULL;
5009 }
5010
5011 if (ts != NULL) {
5012 clock_interval_to_deadline(interval: (uint32_t)(ts->tv_nsec +
5013 (ts->tv_sec * NSEC_PER_SEC)), scale_factor: 1, result: &deadline);
5014 }
5015
5016 (void) assert_wait_deadline(event: &ifp->if_poll_thread,
5017 THREAD_UNINT, deadline);
5018 lck_mtx_unlock(lck: &ifp->if_poll_lock);
5019 (void) thread_block_parameter(continuation: ifnet_poll_thread_cont, parameter: ifp);
5020 /* NOTREACHED */
5021 } else {
5022terminate:
5023 /* interface is detached (maybe while asleep)? */
5024 ifnet_set_poll_cycle(ifp, NULL);
5025
5026 /* clear if_poll_thread to allow termination to continue */
5027 ASSERT(ifp->if_poll_thread != THREAD_NULL);
5028 ifp->if_poll_thread = THREAD_NULL;
5029 wakeup(chan: (caddr_t)&ifp->if_poll_thread);
5030 lck_mtx_unlock(lck: &ifp->if_poll_lock);
5031
5032 if (dlil_verbose) {
5033 DLIL_PRINTF("%s: poller thread terminated\n",
5034 if_name(ifp));
5035 }
5036
5037 /* for the extra refcnt from kernel_thread_start() */
5038 thread_deallocate(thread: current_thread());
5039 /* this is the end */
5040 thread_terminate(target_act: current_thread());
5041 /* NOTREACHED */
5042 }
5043
5044 /* must never get here */
5045 VERIFY(0);
5046 /* NOTREACHED */
5047 __builtin_unreachable();
5048}
5049
5050void
5051ifnet_set_poll_cycle(struct ifnet *ifp, struct timespec *ts)
5052{
5053 if (ts == NULL) {
5054 bzero(s: &ifp->if_poll_cycle, n: sizeof(ifp->if_poll_cycle));
5055 } else {
5056 *(&ifp->if_poll_cycle) = *ts;
5057 }
5058
5059 if (ts != NULL && ts->tv_nsec != 0 && dlil_verbose) {
5060 DLIL_PRINTF("%s: poll interval set to %lu nsec\n",
5061 if_name(ifp), ts->tv_nsec);
5062 }
5063}
5064
5065void
5066ifnet_purge(struct ifnet *ifp)
5067{
5068 if (ifp != NULL && (ifp->if_eflags & IFEF_TXSTART)) {
5069 if_qflush_snd(ifp, false);
5070 }
5071}
5072
5073void
5074ifnet_update_sndq(struct ifclassq *ifq, cqev_t ev)
5075{
5076 IFCQ_LOCK_ASSERT_HELD(ifq);
5077
5078 if (!(IFCQ_IS_READY(ifq))) {
5079 return;
5080 }
5081
5082 if (IFCQ_TBR_IS_ENABLED(ifq)) {
5083 struct tb_profile tb = {
5084 .rate = ifq->ifcq_tbr.tbr_rate_raw,
5085 .percent = ifq->ifcq_tbr.tbr_percent, .depth = 0
5086 };
5087 (void) ifclassq_tbr_set(ifq, &tb, FALSE);
5088 }
5089
5090 ifclassq_update(ifq, ev);
5091}
5092
5093void
5094ifnet_update_rcv(struct ifnet *ifp, cqev_t ev)
5095{
5096 switch (ev) {
5097 case CLASSQ_EV_LINK_BANDWIDTH:
5098 if (net_rxpoll && (ifp->if_eflags & IFEF_RXPOLL)) {
5099 ifp->if_poll_update++;
5100 }
5101 break;
5102
5103 default:
5104 break;
5105 }
5106}
5107
5108errno_t
5109ifnet_set_output_sched_model(struct ifnet *ifp, u_int32_t model)
5110{
5111 struct ifclassq *ifq;
5112 u_int32_t omodel;
5113 errno_t err;
5114
5115 if (ifp == NULL || model >= IFNET_SCHED_MODEL_MAX) {
5116 return EINVAL;
5117 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5118 return ENXIO;
5119 }
5120
5121 ifq = ifp->if_snd;
5122 IFCQ_LOCK(ifq);
5123 omodel = ifp->if_output_sched_model;
5124 ifp->if_output_sched_model = model;
5125 if ((err = ifclassq_pktsched_setup(ifq)) != 0) {
5126 ifp->if_output_sched_model = omodel;
5127 }
5128 IFCQ_UNLOCK(ifq);
5129
5130 return err;
5131}
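/*
 * Descriptive note: the scheduling model selects how the send queue is
 * managed, e.g. IFNET_SCHED_MODEL_NORMAL lets the kernel packet scheduler
 * drive dequeues, while IFNET_SCHED_MODEL_DRIVER_MANAGED is intended for
 * drivers that make their own ordering decisions.  Changing the model
 * rebuilds the ifclassq scheduler state, which is why the previous model
 * is restored above if ifclassq_pktsched_setup() fails.
 */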
5132
5133errno_t
5134ifnet_set_sndq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5135{
5136 if (ifp == NULL) {
5137 return EINVAL;
5138 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5139 return ENXIO;
5140 }
5141
5142 ifclassq_set_maxlen(ifp->if_snd, maxqlen);
5143
5144 return 0;
5145}
5146
5147errno_t
5148ifnet_get_sndq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5149{
5150 if (ifp == NULL || maxqlen == NULL) {
5151 return EINVAL;
5152 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5153 return ENXIO;
5154 }
5155
5156 *maxqlen = ifclassq_get_maxlen(ifp->if_snd);
5157
5158 return 0;
5159}
5160
5161errno_t
5162ifnet_get_sndq_len(struct ifnet *ifp, u_int32_t *pkts)
5163{
5164 errno_t err;
5165
5166 if (ifp == NULL || pkts == NULL) {
5167 err = EINVAL;
5168 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5169 err = ENXIO;
5170 } else {
5171 err = ifclassq_get_len(ifp->if_snd, MBUF_SC_UNSPEC,
5172 IF_CLASSQ_ALL_GRPS, pkts, NULL);
5173 }
5174
5175 return err;
5176}
5177
5178errno_t
5179ifnet_get_service_class_sndq_len(struct ifnet *ifp, mbuf_svc_class_t sc,
5180 u_int32_t *pkts, u_int32_t *bytes)
5181{
5182 errno_t err;
5183
5184 if (ifp == NULL || !MBUF_VALID_SC(sc) ||
5185 (pkts == NULL && bytes == NULL)) {
5186 err = EINVAL;
5187 } else if (!(ifp->if_eflags & IFEF_TXSTART)) {
5188 err = ENXIO;
5189 } else {
5190 err = ifclassq_get_len(ifp->if_snd, sc, IF_CLASSQ_ALL_GRPS,
5191 pkts, bytes);
5192 }
5193
5194 return err;
5195}
5196
5197errno_t
5198ifnet_set_rcvq_maxlen(struct ifnet *ifp, u_int32_t maxqlen)
5199{
5200 struct dlil_threading_info *inp;
5201
5202 if (ifp == NULL) {
5203 return EINVAL;
5204 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5205 return ENXIO;
5206 }
5207
5208 if (maxqlen == 0) {
5209 maxqlen = if_rcvq_maxlen;
5210 } else if (maxqlen < IF_RCVQ_MINLEN) {
5211 maxqlen = IF_RCVQ_MINLEN;
5212 }
5213
5214 inp = ifp->if_inp;
	lck_mtx_lock(&inp->dlth_lock);
	qlimit(&inp->dlth_pkts) = maxqlen;
	lck_mtx_unlock(&inp->dlth_lock);
5218
5219 return 0;
5220}
5221
5222errno_t
5223ifnet_get_rcvq_maxlen(struct ifnet *ifp, u_int32_t *maxqlen)
5224{
5225 struct dlil_threading_info *inp;
5226
5227 if (ifp == NULL || maxqlen == NULL) {
5228 return EINVAL;
5229 } else if (!(ifp->if_eflags & IFEF_RXPOLL) || ifp->if_inp == NULL) {
5230 return ENXIO;
5231 }
5232
5233 inp = ifp->if_inp;
	lck_mtx_lock(&inp->dlth_lock);
	*maxqlen = qlimit(&inp->dlth_pkts);
	lck_mtx_unlock(&inp->dlth_lock);
5237 return 0;
5238}
5239
5240void
5241ifnet_enqueue_multi_setup(struct ifnet *ifp, uint16_t delay_qlen,
5242 uint16_t delay_timeout)
5243{
5244 if (delay_qlen > 0 && delay_timeout > 0) {
5245 if_set_eflags(ifp, IFEF_ENQUEUE_MULTI);
5246 ifp->if_start_delay_qlen = MIN(100, delay_qlen);
		ifp->if_start_delay_timeout = min(20000, delay_timeout);
		/* convert timeout to nanoseconds */
		ifp->if_start_delay_timeout *= 1000;
		kprintf("%s: forced IFEF_ENQUEUE_MULTI qlen %u timeout %u\n",
5251 ifp->if_xname, (uint32_t)delay_qlen,
5252 (uint32_t)delay_timeout);
5253 } else {
5254 if_clear_eflags(ifp, IFEF_ENQUEUE_MULTI);
5255 }
5256}
5257
5258/*
 * This function clears the DSCP bits in the IPv4/IPv6 header pointed to by
 * buf.  buf need not be 32-bit aligned, but the caller must ensure that it
 * holds the full header.
5262 */
5263static __attribute__((noinline)) void
5264ifnet_mcast_clear_dscp(uint8_t *buf, uint8_t ip_ver)
5265{
5266 struct ip *ip;
5267 struct ip6_hdr *ip6;
5268 uint8_t lbuf[64] __attribute__((aligned(8)));
5269 uint8_t *p = buf;
5270
5271 if (ip_ver == IPVERSION) {
5272 uint8_t old_tos;
5273 uint32_t sum;
5274
5275 if (__improbable(!IP_HDR_ALIGNED_P(p))) {
5276 DTRACE_IP1(not__aligned__v4, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip));
5278 p = lbuf;
5279 }
5280 ip = (struct ip *)(void *)p;
5281 if (__probable((ip->ip_tos & ~IPTOS_ECN_MASK) == 0)) {
5282 return;
5283 }
5284
5285 DTRACE_IP1(clear__v4, struct ip *, ip);
5286 old_tos = ip->ip_tos;
5287 ip->ip_tos &= IPTOS_ECN_MASK;
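		/*
		 * Incrementally adjust the header checksum for the TOS
		 * change instead of recomputing it over the whole header
		 * (cf. the incremental-update technique of RFC 1624):
		 * add back the old TOS value, subtract the new one, and
		 * fold the carry into the low 16 bits.
		 */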
5288 sum = ip->ip_sum + htons(old_tos) - htons(ip->ip_tos);
5289 sum = (sum >> 16) + (sum & 0xffff);
5290 ip->ip_sum = (uint16_t)(sum & 0xffff);
5291
5292 if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip));
5294 }
5295 } else {
5296 uint32_t flow;
5297 ASSERT(ip_ver == IPV6_VERSION);
5298
5299 if (__improbable(!IP_HDR_ALIGNED_P(p))) {
5300 DTRACE_IP1(not__aligned__v6, uint8_t *, buf);
			bcopy(buf, lbuf, sizeof(struct ip6_hdr));
5302 p = lbuf;
5303 }
5304 ip6 = (struct ip6_hdr *)(void *)p;
5305 flow = ntohl(ip6->ip6_flow);
5306 if (__probable((flow & IP6FLOW_DSCP_MASK) == 0)) {
5307 return;
5308 }
5309
5310 DTRACE_IP1(clear__v6, struct ip6_hdr *, ip6);
5311 ip6->ip6_flow = htonl(flow & ~IP6FLOW_DSCP_MASK);
5312
5313 if (__improbable(p == lbuf)) {
			bcopy(lbuf, buf, sizeof(struct ip6_hdr));
5315 }
5316 }
5317}
5318
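/*
 * Common enqueue path into the interface send queue (classq).  Roughly,
 * this routine: stamps the packet with an enqueue timestamp unless one
 * is already present, updates the interface's foreground/realtime
 * activity timestamps (and the nexus advisory copies when a flowswitch
 * is attached), applies the Wi-Fi multicast DSCP-clearing workaround,
 * runs the delayed-start heuristics for IFEF_ENQUEUE_MULTI, and hands
 * the packet to ifclassq_enqueue() before kicking the driver's start
 * callback as needed.
 */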
5319static inline errno_t
5320ifnet_enqueue_ifclassq(struct ifnet *ifp, struct ifclassq *ifcq,
5321 classq_pkt_t *p, boolean_t flush, boolean_t *pdrop)
5322{
5323#if SKYWALK
5324 volatile struct sk_nexusadv *nxadv = NULL;
5325#endif /* SKYWALK */
5326 volatile uint64_t *fg_ts = NULL;
5327 volatile uint64_t *rt_ts = NULL;
5328 struct timespec now;
5329 u_int64_t now_nsec = 0;
5330 int error = 0;
5331 uint8_t *mcast_buf = NULL;
5332 uint8_t ip_ver;
5333 uint32_t pktlen;
5334
5335 ASSERT(ifp->if_eflags & IFEF_TXSTART);
5336#if SKYWALK
5337 /*
5338 * If attached to flowswitch, grab pointers to the
5339 * timestamp variables in the nexus advisory region.
5340 */
5341 if ((ifp->if_capabilities & IFCAP_SKYWALK) && ifp->if_na != NULL &&
5342 (nxadv = ifp->if_na->nifna_netif->nif_fsw_nxadv) != NULL) {
5343 fg_ts = &nxadv->nxadv_fg_sendts;
5344 rt_ts = &nxadv->nxadv_rt_sendts;
5345 }
5346#endif /* SKYWALK */
5347
5348 /*
5349 * If packet already carries a timestamp, either from dlil_output()
5350 * or from flowswitch, use it here. Otherwise, record timestamp.
5351 * PKTF_TS_VALID is always cleared prior to entering classq, i.e.
5352 * the timestamp value is used internally there.
5353 */
5354 switch (p->cp_ptype) {
5355 case QP_MBUF:
5356#if SKYWALK
5357 /*
		 * Valid only for a non-native (compat) Skywalk interface.
		 * If the data source uses packets, the caller must convert
		 * them to mbufs before calling this routine.
5361 */
5362 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5363#endif /* SKYWALK */
5364 ASSERT(p->cp_mbuf->m_flags & M_PKTHDR);
5365 ASSERT(p->cp_mbuf->m_nextpkt == NULL);
5366
5367 if (!(p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_TS_VALID) ||
5368 p->cp_mbuf->m_pkthdr.pkt_timestamp == 0) {
			nanouptime(&now);
5370 net_timernsec(&now, &now_nsec);
5371 p->cp_mbuf->m_pkthdr.pkt_timestamp = now_nsec;
5372 }
5373 p->cp_mbuf->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
5374 /*
5375 * If the packet service class is not background,
5376 * update the timestamp to indicate recent activity
5377 * on a foreground socket.
5378 */
5379 if ((p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_FLOW_ID) &&
5380 p->cp_mbuf->m_pkthdr.pkt_flowsrc == FLOWSRC_INPCB) {
5381 if (!(p->cp_mbuf->m_pkthdr.pkt_flags &
5382 PKTF_SO_BACKGROUND)) {
5383 ifp->if_fg_sendts = (uint32_t)_net_uptime;
5384 if (fg_ts != NULL) {
5385 *fg_ts = (uint32_t)_net_uptime;
5386 }
5387 }
5388 if (p->cp_mbuf->m_pkthdr.pkt_flags & PKTF_SO_REALTIME) {
5389 ifp->if_rt_sendts = (uint32_t)_net_uptime;
5390 if (rt_ts != NULL) {
5391 *rt_ts = (uint32_t)_net_uptime;
5392 }
5393 }
5394 }
5395 pktlen = m_pktlen(p->cp_mbuf);
5396
5397 /*
5398 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (rdar://9331522).
5400 * As a workaround we clear the DSCP bits but keep service
5401 * class (rdar://51507725).
5402 */
5403 if ((p->cp_mbuf->m_flags & M_MCAST) != 0 &&
5404 IFNET_IS_WIFI_INFRA(ifp)) {
			size_t len = mbuf_len(p->cp_mbuf), hlen;
5406 struct ether_header *eh;
5407 boolean_t pullup = FALSE;
5408 uint16_t etype;
5409
5410 if (__improbable(len < sizeof(struct ether_header))) {
5411 DTRACE_IP1(small__ether, size_t, len);
5412 if ((p->cp_mbuf = m_pullup(p->cp_mbuf,
5413 sizeof(struct ether_header))) == NULL) {
5414 return ENOMEM;
5415 }
5416 }
			eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
5418 etype = ntohs(eh->ether_type);
5419 if (etype == ETHERTYPE_IP) {
5420 hlen = sizeof(struct ether_header) +
5421 sizeof(struct ip);
5422 if (len < hlen) {
5423 DTRACE_IP1(small__v4, size_t, len);
5424 pullup = TRUE;
5425 }
5426 ip_ver = IPVERSION;
5427 } else if (etype == ETHERTYPE_IPV6) {
5428 hlen = sizeof(struct ether_header) +
5429 sizeof(struct ip6_hdr);
5430 if (len < hlen) {
5431 DTRACE_IP1(small__v6, size_t, len);
5432 pullup = TRUE;
5433 }
5434 ip_ver = IPV6_VERSION;
5435 } else {
5436 DTRACE_IP1(invalid__etype, uint16_t, etype);
5437 break;
5438 }
5439 if (pullup) {
5440 if ((p->cp_mbuf = m_pullup(p->cp_mbuf, (int)hlen)) ==
5441 NULL) {
5442 return ENOMEM;
5443 }
5444
				eh = (struct ether_header *)mbuf_data(p->cp_mbuf);
5447 }
5448 mcast_buf = (uint8_t *)(eh + 1);
5449 /*
5450 * ifnet_mcast_clear_dscp() will finish the work below.
5451 * Note that the pullups above ensure that mcast_buf
5452 * points to a full IP header.
5453 */
5454 }
5455 break;
5456
5457#if SKYWALK
5458 case QP_PACKET:
5459 /*
		 * Valid only for a native Skywalk interface.  If the data
		 * source uses mbufs, the caller must convert them to
		 * packets before calling this routine.
5463 */
5464 ASSERT(ifp->if_eflags & IFEF_SKYWALK_NATIVE);
5465 if (!(p->cp_kpkt->pkt_pflags & PKT_F_TS_VALID) ||
5466 p->cp_kpkt->pkt_timestamp == 0) {
			nanouptime(&now);
5468 net_timernsec(&now, &now_nsec);
5469 p->cp_kpkt->pkt_timestamp = now_nsec;
5470 }
5471 p->cp_kpkt->pkt_pflags &= ~PKT_F_TS_VALID;
5472 /*
5473 * If the packet service class is not background,
5474 * update the timestamps on the interface, as well as
5475 * the ones in nexus-wide advisory to indicate recent
5476 * activity on a foreground flow.
5477 */
5478 if (!(p->cp_kpkt->pkt_pflags & PKT_F_BACKGROUND)) {
5479 ifp->if_fg_sendts = (uint32_t)_net_uptime;
5480 if (fg_ts != NULL) {
5481 *fg_ts = (uint32_t)_net_uptime;
5482 }
5483 }
5484 if (p->cp_kpkt->pkt_pflags & PKT_F_REALTIME) {
5485 ifp->if_rt_sendts = (uint32_t)_net_uptime;
5486 if (rt_ts != NULL) {
5487 *rt_ts = (uint32_t)_net_uptime;
5488 }
5489 }
5490 pktlen = p->cp_kpkt->pkt_length;
5491
5492 /*
5493 * Some Wi-Fi AP implementations do not correctly handle
		 * multicast IP packets with DSCP bits set (rdar://9331522).
5495 * As a workaround we clear the DSCP bits but keep service
5496 * class (rdar://51507725).
5497 */
5498 if ((p->cp_kpkt->pkt_link_flags & PKT_LINKF_MCAST) != 0 &&
5499 IFNET_IS_WIFI_INFRA(ifp)) {
5500 uint8_t *baddr;
5501 struct ether_header *eh;
5502 uint16_t etype;
5503
5504 MD_BUFLET_ADDR_ABS(p->cp_kpkt, baddr);
5505 baddr += p->cp_kpkt->pkt_headroom;
5506 if (__improbable(pktlen < sizeof(struct ether_header))) {
5507 DTRACE_IP1(pkt__small__ether, __kern_packet *,
5508 p->cp_kpkt);
5509 break;
5510 }
5511 eh = (struct ether_header *)(void *)baddr;
5512 etype = ntohs(eh->ether_type);
5513 if (etype == ETHERTYPE_IP) {
5514 if (pktlen < sizeof(struct ether_header) +
5515 sizeof(struct ip)) {
5516 DTRACE_IP1(pkt__small__v4, uint32_t,
5517 pktlen);
5518 break;
5519 }
5520 ip_ver = IPVERSION;
5521 } else if (etype == ETHERTYPE_IPV6) {
5522 if (pktlen < sizeof(struct ether_header) +
5523 sizeof(struct ip6_hdr)) {
5524 DTRACE_IP1(pkt__small__v6, uint32_t,
5525 pktlen);
5526 break;
5527 }
5528 ip_ver = IPV6_VERSION;
5529 } else {
5530 DTRACE_IP1(pkt__invalid__etype, uint16_t,
5531 etype);
5532 break;
5533 }
5534 mcast_buf = (uint8_t *)(eh + 1);
5535 /*
5536 * ifnet_mcast_clear_dscp() will finish the work below.
5537 * The checks above verify that the IP header is in the
5538 * first buflet.
5539 */
5540 }
5541 break;
5542#endif /* SKYWALK */
5543
5544 default:
5545 VERIFY(0);
5546 /* NOTREACHED */
5547 __builtin_unreachable();
5548 }
5549
5550 if (mcast_buf != NULL) {
		ifnet_mcast_clear_dscp(mcast_buf, ip_ver);
5552 }
5553
5554 if (ifp->if_eflags & IFEF_ENQUEUE_MULTI) {
5555 if (now_nsec == 0) {
			nanouptime(&now);
5557 net_timernsec(&now, &now_nsec);
5558 }
		/*
		 * If the driver chose to delay the start callback in order
		 * to coalesce multiple packets, use the following heuristics
		 * to make sure the start callback is delayed only when a
		 * bulk data transfer is detected:
		 * 1. The number of packets enqueued in (delay_win * 2) is
		 *    greater than or equal to the delay qlen.
		 * 2. Once enabled, delay_start stays enabled for another 10
		 *    idle windows, to account for variable RTT and bursty
		 *    traffic.
		 * 3. If more than 200ms has elapsed since the last enqueue,
		 *    delaying the start callback is disabled, to account
		 *    for idle time.
		 */
5573 u_int64_t dwin = (ifp->if_start_delay_timeout << 1);
5574 if (ifp->if_start_delay_swin > 0) {
5575 if ((ifp->if_start_delay_swin + dwin) > now_nsec) {
5576 ifp->if_start_delay_cnt++;
5577 } else if ((now_nsec - ifp->if_start_delay_swin)
5578 >= (200 * 1000 * 1000)) {
5579 ifp->if_start_delay_swin = now_nsec;
5580 ifp->if_start_delay_cnt = 1;
5581 ifp->if_start_delay_idle = 0;
5582 if (ifp->if_eflags & IFEF_DELAY_START) {
5583 if_clear_eflags(ifp, IFEF_DELAY_START);
5584 ifnet_delay_start_disabled_increment();
5585 }
5586 } else {
5587 if (ifp->if_start_delay_cnt >=
5588 ifp->if_start_delay_qlen) {
5589 if_set_eflags(ifp, IFEF_DELAY_START);
5590 ifp->if_start_delay_idle = 0;
5591 } else {
5592 if (ifp->if_start_delay_idle >= 10) {
5593 if_clear_eflags(ifp,
5594 IFEF_DELAY_START);
5595 ifnet_delay_start_disabled_increment();
5596 } else {
5597 ifp->if_start_delay_idle++;
5598 }
5599 }
5600 ifp->if_start_delay_swin = now_nsec;
5601 ifp->if_start_delay_cnt = 1;
5602 }
5603 } else {
5604 ifp->if_start_delay_swin = now_nsec;
5605 ifp->if_start_delay_cnt = 1;
5606 ifp->if_start_delay_idle = 0;
5607 if_clear_eflags(ifp, IFEF_DELAY_START);
5608 }
5609 } else {
5610 if_clear_eflags(ifp, IFEF_DELAY_START);
5611 }
5612
5613 /* enqueue the packet (caller consumes object) */
5614 error = ifclassq_enqueue(((ifcq != NULL) ? ifcq : ifp->if_snd), p, p,
5615 1, pktlen, pdrop);
5616
5617 /*
5618 * Tell the driver to start dequeueing; do this even when the queue
5619 * for the packet is suspended (EQSUSPENDED), as the driver could still
5620 * be dequeueing from other unsuspended queues.
5621 */
5622 if (!(ifp->if_eflags & IFEF_ENQUEUE_MULTI) &&
5623 ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED)) {
5624 ifnet_start(ifp);
5625 }
5626
5627 return error;
5628}
5629
5630static inline errno_t
5631ifnet_enqueue_ifclassq_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5632 classq_pkt_t *head, classq_pkt_t *tail, uint32_t cnt, uint32_t bytes,
5633 boolean_t flush, boolean_t *pdrop)
5634{
5635 int error;
5636
5637 /* enqueue the packet (caller consumes object) */
5638 error = ifclassq_enqueue(ifcq != NULL ? ifcq : ifp->if_snd, head, tail,
5639 cnt, bytes, pdrop);
5640
5641 /*
5642 * Tell the driver to start dequeueing; do this even when the queue
5643 * for the packet is suspended (EQSUSPENDED), as the driver could still
5644 * be dequeueing from other unsuspended queues.
5645 */
5646 if ((error == 0 && flush) || error == EQFULL || error == EQSUSPENDED) {
5647 ifnet_start(ifp);
5648 }
5649 return error;
5650}
5651
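/*
 * Callback used by the network emulator (netem): packets that were held
 * back for delay/loss emulation are re-injected into the regular classq
 * enqueue path here, with the start callback kicked only once, on the
 * last packet of the batch.
 */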
5652int
5653ifnet_enqueue_netem(void *handle, pktsched_pkt_t *pkts, uint32_t n_pkts)
5654{
5655 struct ifnet *ifp = handle;
5656 boolean_t pdrop; /* dummy */
5657 uint32_t i;
5658
5659 ASSERT(n_pkts >= 1);
5660 for (i = 0; i < n_pkts - 1; i++) {
		(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
		    FALSE, &pdrop);
	}
	/* flush with the last packet */
	(void) ifnet_enqueue_ifclassq(ifp, NULL, &pkts[i].pktsched_pkt,
	    TRUE, &pdrop);
5667
5668 return 0;
5669}
5670
5671static inline errno_t
5672ifnet_enqueue_common(struct ifnet *ifp, struct ifclassq *ifcq,
5673 classq_pkt_t *pkt, boolean_t flush, boolean_t *pdrop)
5674{
5675 if (ifp->if_output_netem != NULL) {
5676 bool drop;
5677 errno_t error;
		error = netem_enqueue(ifp->if_output_netem, pkt, &drop);
		*pdrop = drop ? TRUE : FALSE;
		return error;
	} else {
		return ifnet_enqueue_ifclassq(ifp, ifcq, pkt, flush, pdrop);
5683 }
5684}
5685
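/*
 * ifnet_enqueue() hands an mbuf -- or a chain linked via m_nextpkt -- to
 * the interface's send queue; the mbufs are consumed regardless of the
 * outcome.  A minimal, purely illustrative caller could look like the
 * sketch below ("my_output" is a hypothetical function, not part of this
 * file):
 *
 *	static errno_t
 *	my_output(ifnet_t ifp, mbuf_t m)
 *	{
 *		// queue-pressure errors (e.g. EQFULL) do not mean the
 *		// chain leaked; it has already been consumed or dropped.
 *		return ifnet_enqueue(ifp, m);
 *	}
 */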
5686errno_t
5687ifnet_enqueue(struct ifnet *ifp, struct mbuf *m)
5688{
5689 uint32_t bytes = m_pktlen(m);
5690 struct mbuf *tail = m;
5691 uint32_t cnt = 1;
5692 boolean_t pdrop;
5693
5694 while (tail->m_nextpkt) {
5695 VERIFY(tail->m_flags & M_PKTHDR);
5696 tail = tail->m_nextpkt;
5697 cnt++;
5698 bytes += m_pktlen(tail);
5699 }
5700
5701 return ifnet_enqueue_mbuf_chain(ifp, m, tail, cnt, bytes, TRUE, &pdrop);
5702}
5703
5704errno_t
5705ifnet_enqueue_mbuf(struct ifnet *ifp, struct mbuf *m, boolean_t flush,
5706 boolean_t *pdrop)
5707{
5708 classq_pkt_t pkt;
5709
5710 if (ifp == NULL || m == NULL || !(m->m_flags & M_PKTHDR) ||
5711 m->m_nextpkt != NULL) {
5712 if (m != NULL) {
5713 m_freem_list(m);
5714 *pdrop = TRUE;
5715 }
5716 return EINVAL;
5717 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5718 !IF_FULLY_ATTACHED(ifp)) {
5719 /* flag tested without lock for performance */
5720 m_freem(m);
5721 *pdrop = TRUE;
5722 return ENXIO;
5723 } else if (!(ifp->if_flags & IFF_UP)) {
5724 m_freem(m);
5725 *pdrop = TRUE;
5726 return ENETDOWN;
5727 }
5728
5729 CLASSQ_PKT_INIT_MBUF(&pkt, m);
	return ifnet_enqueue_common(ifp, NULL, &pkt, flush, pdrop);
5731}
5732
5733errno_t
5734ifnet_enqueue_mbuf_chain(struct ifnet *ifp, struct mbuf *m_head,
5735 struct mbuf *m_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5736 boolean_t *pdrop)
5737{
5738 classq_pkt_t head, tail;
5739
5740 ASSERT(m_head != NULL);
5741 ASSERT((m_head->m_flags & M_PKTHDR) != 0);
5742 ASSERT(m_tail != NULL);
5743 ASSERT((m_tail->m_flags & M_PKTHDR) != 0);
5744 ASSERT(ifp != NULL);
5745 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5746
5747 if (!IF_FULLY_ATTACHED(ifp)) {
5748 /* flag tested without lock for performance */
5749 m_freem_list(m_head);
5750 *pdrop = TRUE;
5751 return ENXIO;
5752 } else if (!(ifp->if_flags & IFF_UP)) {
5753 m_freem_list(m_head);
5754 *pdrop = TRUE;
5755 return ENETDOWN;
5756 }
5757
5758 CLASSQ_PKT_INIT_MBUF(&head, m_head);
5759 CLASSQ_PKT_INIT_MBUF(&tail, m_tail);
	return ifnet_enqueue_ifclassq_chain(ifp, NULL, &head, &tail, cnt, bytes,
	    flush, pdrop);
5762}
5763
5764#if SKYWALK
5765static errno_t
5766ifnet_enqueue_pkt_common(struct ifnet *ifp, struct ifclassq *ifcq,
5767 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5768{
5769 classq_pkt_t pkt;
5770
5771 ASSERT(kpkt == NULL || kpkt->pkt_nextpkt == NULL);
5772
5773 if (__improbable(ifp == NULL || kpkt == NULL)) {
5774 if (kpkt != NULL) {
5775 pp_free_packet(__DECONST(struct kern_pbufpool *,
5776 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5777 *pdrop = TRUE;
5778 }
5779 return EINVAL;
5780 } else if (__improbable(!(ifp->if_eflags & IFEF_TXSTART) ||
5781 !IF_FULLY_ATTACHED(ifp))) {
5782 /* flag tested without lock for performance */
5783 pp_free_packet(__DECONST(struct kern_pbufpool *,
5784 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5785 *pdrop = TRUE;
5786 return ENXIO;
5787 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5788 pp_free_packet(__DECONST(struct kern_pbufpool *,
5789 kpkt->pkt_qum.qum_pp), SK_PTR_ADDR(kpkt));
5790 *pdrop = TRUE;
5791 return ENETDOWN;
5792 }
5793
5794 CLASSQ_PKT_INIT_PACKET(&pkt, kpkt);
	return ifnet_enqueue_common(ifp, ifcq, &pkt, flush, pdrop);
5796}
5797
5798errno_t
5799ifnet_enqueue_pkt(struct ifnet *ifp, struct __kern_packet *kpkt,
5800 boolean_t flush, boolean_t *pdrop)
5801{
5802 return ifnet_enqueue_pkt_common(ifp, NULL, kpkt, flush, pdrop);
5803}
5804
5805errno_t
5806ifnet_enqueue_ifcq_pkt(struct ifnet *ifp, struct ifclassq *ifcq,
5807 struct __kern_packet *kpkt, boolean_t flush, boolean_t *pdrop)
5808{
5809 return ifnet_enqueue_pkt_common(ifp, ifcq, kpkt, flush, pdrop);
5810}
5811
5812static errno_t
5813ifnet_enqueue_pkt_chain_common(struct ifnet *ifp, struct ifclassq *ifcq,
5814 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5815 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5816{
5817 classq_pkt_t head, tail;
5818
5819 ASSERT(k_head != NULL);
5820 ASSERT(k_tail != NULL);
5821 ASSERT(ifp != NULL);
5822 ASSERT((ifp->if_eflags & IFEF_TXSTART) != 0);
5823
5824 if (!IF_FULLY_ATTACHED(ifp)) {
5825 /* flag tested without lock for performance */
5826 pp_free_packet_chain(k_head, NULL);
5827 *pdrop = TRUE;
5828 return ENXIO;
5829 } else if (__improbable(!(ifp->if_flags & IFF_UP))) {
5830 pp_free_packet_chain(k_head, NULL);
5831 *pdrop = TRUE;
5832 return ENETDOWN;
5833 }
5834
5835 CLASSQ_PKT_INIT_PACKET(&head, k_head);
5836 CLASSQ_PKT_INIT_PACKET(&tail, k_tail);
	return ifnet_enqueue_ifclassq_chain(ifp, ifcq, &head, &tail, cnt, bytes,
	    flush, pdrop);
5839}
5840
5841errno_t
5842ifnet_enqueue_pkt_chain(struct ifnet *ifp, struct __kern_packet *k_head,
5843 struct __kern_packet *k_tail, uint32_t cnt, uint32_t bytes, boolean_t flush,
5844 boolean_t *pdrop)
5845{
5846 return ifnet_enqueue_pkt_chain_common(ifp, NULL, k_head, k_tail,
5847 cnt, bytes, flush, pdrop);
5848}
5849
5850errno_t
5851ifnet_enqueue_ifcq_pkt_chain(struct ifnet *ifp, struct ifclassq *ifcq,
5852 struct __kern_packet *k_head, struct __kern_packet *k_tail, uint32_t cnt,
5853 uint32_t bytes, boolean_t flush, boolean_t *pdrop)
5854{
5855 return ifnet_enqueue_pkt_chain_common(ifp, ifcq, k_head, k_tail,
5856 cnt, bytes, flush, pdrop);
5857}
5858#endif /* SKYWALK */
5859
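/*
 * The dequeue KPIs below are meant for drivers that pull packets from
 * the send queue themselves (IFEF_TXSTART with a driver-managed output
 * model) rather than having the stack push packets at them.  A purely
 * illustrative start routine, where "my_drv_tx" is a hypothetical
 * transmit hook, might look like:
 *
 *	static void
 *	my_drv_start(ifnet_t ifp)
 *	{
 *		mbuf_t m = NULL;
 *
 *		while (ifnet_dequeue(ifp, &m) == 0 && m != NULL) {
 *			my_drv_tx(ifp, m);
 *			m = NULL;
 *		}
 *	}
 */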
5860errno_t
5861ifnet_dequeue(struct ifnet *ifp, struct mbuf **mp)
5862{
5863 errno_t rc;
5864 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5865
5866 if (ifp == NULL || mp == NULL) {
5867 return EINVAL;
5868 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5869 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5870 return ENXIO;
5871 }
	if (!ifnet_is_attached(ifp, 1)) {
5873 return ENXIO;
5874 }
5875
5876#if SKYWALK
5877 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5878#endif /* SKYWALK */
5879 rc = ifclassq_dequeue(ifp->if_snd, 1, CLASSQ_DEQUEUE_MAX_BYTE_LIMIT,
5880 &pkt, NULL, NULL, NULL, 0);
5881 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5882 ifnet_decr_iorefcnt(ifp);
5883 *mp = pkt.cp_mbuf;
5884 return rc;
5885}
5886
5887errno_t
5888ifnet_dequeue_service_class(struct ifnet *ifp, mbuf_svc_class_t sc,
5889 struct mbuf **mp)
5890{
5891 errno_t rc;
5892 classq_pkt_t pkt = CLASSQ_PKT_INITIALIZER(pkt);
5893
5894 if (ifp == NULL || mp == NULL || !MBUF_VALID_SC(sc)) {
5895 return EINVAL;
5896 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5897 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5898 return ENXIO;
5899 }
	if (!ifnet_is_attached(ifp, 1)) {
5901 return ENXIO;
5902 }
5903
5904#if SKYWALK
5905 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5906#endif /* SKYWALK */
5907 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, 1,
5908 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt, NULL, NULL, NULL, 0);
5909 VERIFY((pkt.cp_ptype == QP_MBUF) || (pkt.cp_mbuf == NULL));
5910 ifnet_decr_iorefcnt(ifp);
5911 *mp = pkt.cp_mbuf;
5912 return rc;
5913}
5914
5915errno_t
5916ifnet_dequeue_multi(struct ifnet *ifp, u_int32_t pkt_limit,
5917 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5918{
5919 errno_t rc;
5920 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5921 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5922
5923 if (ifp == NULL || head == NULL || pkt_limit < 1) {
5924 return EINVAL;
5925 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5926 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5927 return ENXIO;
5928 }
	if (!ifnet_is_attached(ifp, 1)) {
5930 return ENXIO;
5931 }
5932
5933#if SKYWALK
5934 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5935#endif /* SKYWALK */
5936 rc = ifclassq_dequeue(ifp->if_snd, pkt_limit,
5937 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail, cnt, len, 0);
5938 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5939 ifnet_decr_iorefcnt(ifp);
5940 *head = pkt_head.cp_mbuf;
5941 if (tail != NULL) {
5942 *tail = pkt_tail.cp_mbuf;
5943 }
5944 return rc;
5945}
5946
5947errno_t
5948ifnet_dequeue_multi_bytes(struct ifnet *ifp, u_int32_t byte_limit,
5949 struct mbuf **head, struct mbuf **tail, u_int32_t *cnt, u_int32_t *len)
5950{
5951 errno_t rc;
5952 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5953 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5954
5955 if (ifp == NULL || head == NULL || byte_limit < 1) {
5956 return EINVAL;
5957 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5958 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5959 return ENXIO;
5960 }
	if (!ifnet_is_attached(ifp, 1)) {
5962 return ENXIO;
5963 }
5964
5965#if SKYWALK
5966 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
5967#endif /* SKYWALK */
5968 rc = ifclassq_dequeue(ifp->if_snd, CLASSQ_DEQUEUE_MAX_PKT_LIMIT,
5969 byte_limit, &pkt_head, &pkt_tail, cnt, len, 0);
5970 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
5971 ifnet_decr_iorefcnt(ifp);
5972 *head = pkt_head.cp_mbuf;
5973 if (tail != NULL) {
5974 *tail = pkt_tail.cp_mbuf;
5975 }
5976 return rc;
5977}
5978
5979errno_t
5980ifnet_dequeue_service_class_multi(struct ifnet *ifp, mbuf_svc_class_t sc,
5981 u_int32_t pkt_limit, struct mbuf **head, struct mbuf **tail, u_int32_t *cnt,
5982 u_int32_t *len)
5983{
5984 errno_t rc;
5985 classq_pkt_t pkt_head = CLASSQ_PKT_INITIALIZER(pkt_head);
5986 classq_pkt_t pkt_tail = CLASSQ_PKT_INITIALIZER(pkt_tail);
5987
5988 if (ifp == NULL || head == NULL || pkt_limit < 1 ||
5989 !MBUF_VALID_SC(sc)) {
5990 return EINVAL;
5991 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
5992 ifp->if_output_sched_model >= IFNET_SCHED_MODEL_MAX) {
5993 return ENXIO;
5994 }
	if (!ifnet_is_attached(ifp, 1)) {
5996 return ENXIO;
5997 }
5998
5999#if SKYWALK
6000 ASSERT(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
6001#endif /* SKYWALK */
6002 rc = ifclassq_dequeue_sc(ifp->if_snd, sc, pkt_limit,
6003 CLASSQ_DEQUEUE_MAX_BYTE_LIMIT, &pkt_head, &pkt_tail,
6004 cnt, len, 0);
6005 VERIFY((pkt_head.cp_ptype == QP_MBUF) || (pkt_head.cp_mbuf == NULL));
6006 ifnet_decr_iorefcnt(ifp);
6007 *head = pkt_head.cp_mbuf;
6008 if (tail != NULL) {
6009 *tail = pkt_tail.cp_mbuf;
6010 }
6011 return rc;
6012}
6013
6014#if XNU_TARGET_OS_OSX
6015errno_t
6016ifnet_framer_stub(struct ifnet *ifp, struct mbuf **m,
6017 const struct sockaddr *dest, const char *dest_linkaddr,
6018 const char *frame_type, u_int32_t *pre, u_int32_t *post)
6019{
6020 if (pre != NULL) {
6021 *pre = 0;
6022 }
6023 if (post != NULL) {
6024 *post = 0;
6025 }
6026
6027 return ifp->if_framer_legacy(ifp, m, dest, dest_linkaddr, frame_type);
6028}
6029#endif /* XNU_TARGET_OS_OSX */
6030
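/*
 * Returns TRUE only if the mbuf carries a real VLAN tag (non-zero VLAN
 * ID).  A packet that is merely priority-tagged (VLAN ID 0) has its
 * CSUM_VLAN_TAG_VALID bit cleared here so later stages treat it as
 * untagged.
 */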
6031static boolean_t
6032packet_has_vlan_tag(struct mbuf * m)
6033{
6034 u_int tag = 0;
6035
6036 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_TAG_VALID) != 0) {
6037 tag = EVL_VLANOFTAG(m->m_pkthdr.vlan_tag);
6038 if (tag == 0) {
6039 /* the packet is just priority-tagged, clear the bit */
6040 m->m_pkthdr.csum_flags &= ~CSUM_VLAN_TAG_VALID;
6041 }
6042 }
6043 return tag != 0;
6044}
6045
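/*
 * Walk the interface filter list and give each eligible filter a chance
 * to inspect, modify or swallow the inbound packet.  The filter lock is
 * dropped around each callback; the busy/unbusy monitor calls keep the
 * list from being modified while it is traversed without the lock held.
 */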
6046static int
6047dlil_interface_filters_input(struct ifnet *ifp, struct mbuf **m_p,
6048 char **frame_header_p, protocol_family_t protocol_family)
6049{
6050 boolean_t is_vlan_packet = FALSE;
6051 struct ifnet_filter *filter;
6052 struct mbuf *m = *m_p;
6053
6054 is_vlan_packet = packet_has_vlan_tag(m);
6055
6056 if (TAILQ_EMPTY(&ifp->if_flt_head)) {
6057 return 0;
6058 }
6059
6060 /*
6061 * Pass the inbound packet to the interface filters
6062 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
6064 /* prevent filter list from changing in case we drop the lock */
6065 if_flt_monitor_busy(ifp);
6066 TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
6067 int result;
6068
6069 /* exclude VLAN packets from external filters PR-3586856 */
6070 if (is_vlan_packet &&
6071 (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
6072 continue;
6073 }
6074
6075 if (!filter->filt_skip && filter->filt_input != NULL &&
6076 (filter->filt_protocol == 0 ||
6077 filter->filt_protocol == protocol_family)) {
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = (*filter->filt_input)(filter->filt_cookie,
			    ifp, protocol_family, m_p, frame_header_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
6084 if (result != 0) {
6085 /* we're done with the filter list */
6086 if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
6088 return result;
6089 }
6090 }
6091 }
6092 /* we're done with the filter list */
6093 if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);
6095
6096 /*
6097 * Strip away M_PROTO1 bit prior to sending packet up the stack as
6098 * it is meant to be local to a subsystem -- if_bridge for M_PROTO1
6099 */
6100 if (*m_p != NULL) {
6101 (*m_p)->m_flags &= ~M_PROTO1;
6102 }
6103
6104 return 0;
6105}
6106
6107__attribute__((noinline))
6108static int
6109dlil_interface_filters_output(struct ifnet *ifp, struct mbuf **m_p,
6110 protocol_family_t protocol_family)
6111{
6112 boolean_t is_vlan_packet;
6113 struct ifnet_filter *filter;
6114 struct mbuf *m = *m_p;
6115
6116 if (TAILQ_EMPTY(&ifp->if_flt_head)) {
6117 return 0;
6118 }
6119 is_vlan_packet = packet_has_vlan_tag(m);
6120
6121 /*
6122 * Pass the outbound packet to the interface filters
6123 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
6125 /* prevent filter list from changing in case we drop the lock */
6126 if_flt_monitor_busy(ifp);
6127 TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
6128 int result;
6129
6130 /* exclude VLAN packets from external filters PR-3586856 */
6131 if (is_vlan_packet &&
6132 (filter->filt_flags & DLIL_IFF_INTERNAL) == 0) {
6133 continue;
6134 }
6135
6136 if (!filter->filt_skip && filter->filt_output != NULL &&
6137 (filter->filt_protocol == 0 ||
6138 filter->filt_protocol == protocol_family)) {
			lck_mtx_unlock(&ifp->if_flt_lock);

			result = filter->filt_output(filter->filt_cookie, ifp,
			    protocol_family, m_p);

			lck_mtx_lock_spin(&ifp->if_flt_lock);
6145 if (result != 0) {
6146 /* we're done with the filter list */
6147 if_flt_monitor_unbusy(ifp);
				lck_mtx_unlock(&ifp->if_flt_lock);
6149 return result;
6150 }
6151 }
6152 }
6153 /* we're done with the filter list */
6154 if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);
6156
6157 return 0;
6158}
6159
6160static void
6161dlil_ifproto_input(struct if_proto * ifproto, mbuf_t m)
6162{
6163 int error;
6164
6165 if (ifproto->proto_kpi == kProtoKPI_v1) {
6166 /* Version 1 protocols get one packet at a time */
6167 while (m != NULL) {
6168 char * frame_header;
6169 mbuf_t next_packet;
6170
6171 next_packet = m->m_nextpkt;
6172 m->m_nextpkt = NULL;
6173 frame_header = m->m_pkthdr.pkt_hdr;
6174 m->m_pkthdr.pkt_hdr = NULL;
6175 error = (*ifproto->kpi.v1.input)(ifproto->ifp,
6176 ifproto->protocol_family, m, frame_header);
6177 if (error != 0 && error != EJUSTRETURN) {
6178 m_freem(m);
6179 }
6180 m = next_packet;
6181 }
6182 } else if (ifproto->proto_kpi == kProtoKPI_v2) {
6183 /* Version 2 protocols support packet lists */
6184 error = (*ifproto->kpi.v2.input)(ifproto->ifp,
6185 ifproto->protocol_family, m);
6186 if (error != 0 && error != EJUSTRETURN) {
6187 m_freem_list(m);
6188 }
6189 }
6190}
6191
6192static void
6193dlil_input_stats_add(const struct ifnet_stat_increment_param *s,
6194 struct dlil_threading_info *inp, struct ifnet *ifp, boolean_t poll)
6195{
6196 struct ifnet_stat_increment_param *d = &inp->dlth_stats;
6197
6198 if (s->packets_in != 0) {
6199 d->packets_in += s->packets_in;
6200 }
6201 if (s->bytes_in != 0) {
6202 d->bytes_in += s->bytes_in;
6203 }
6204 if (s->errors_in != 0) {
6205 d->errors_in += s->errors_in;
6206 }
6207
6208 if (s->packets_out != 0) {
6209 d->packets_out += s->packets_out;
6210 }
6211 if (s->bytes_out != 0) {
6212 d->bytes_out += s->bytes_out;
6213 }
6214 if (s->errors_out != 0) {
6215 d->errors_out += s->errors_out;
6216 }
6217
6218 if (s->collisions != 0) {
6219 d->collisions += s->collisions;
6220 }
6221 if (s->dropped != 0) {
6222 d->dropped += s->dropped;
6223 }
6224
6225 if (poll) {
6226 PKTCNTR_ADD(&ifp->if_poll_tstats, s->packets_in, s->bytes_in);
6227 }
6228}
6229
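/*
 * Fold the per-input-thread statistics accumulated by
 * dlil_input_stats_add() into the interface-wide counters.  Returns
 * TRUE if the interface has a data threshold configured, so the caller
 * knows to check it.
 */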
6230static boolean_t
6231dlil_input_stats_sync(struct ifnet *ifp, struct dlil_threading_info *inp)
6232{
6233 struct ifnet_stat_increment_param *s = &inp->dlth_stats;
6234
6235 /*
6236 * Use of atomic operations is unavoidable here because
6237 * these stats may also be incremented elsewhere via KPIs.
6238 */
6239 if (s->packets_in != 0) {
6240 os_atomic_add(&ifp->if_data.ifi_ipackets, s->packets_in, relaxed);
6241 s->packets_in = 0;
6242 }
6243 if (s->bytes_in != 0) {
6244 os_atomic_add(&ifp->if_data.ifi_ibytes, s->bytes_in, relaxed);
6245 s->bytes_in = 0;
6246 }
6247 if (s->errors_in != 0) {
6248 os_atomic_add(&ifp->if_data.ifi_ierrors, s->errors_in, relaxed);
6249 s->errors_in = 0;
6250 }
6251
6252 if (s->packets_out != 0) {
6253 os_atomic_add(&ifp->if_data.ifi_opackets, s->packets_out, relaxed);
6254 s->packets_out = 0;
6255 }
6256 if (s->bytes_out != 0) {
6257 os_atomic_add(&ifp->if_data.ifi_obytes, s->bytes_out, relaxed);
6258 s->bytes_out = 0;
6259 }
6260 if (s->errors_out != 0) {
6261 os_atomic_add(&ifp->if_data.ifi_oerrors, s->errors_out, relaxed);
6262 s->errors_out = 0;
6263 }
6264
6265 if (s->collisions != 0) {
6266 os_atomic_add(&ifp->if_data.ifi_collisions, s->collisions, relaxed);
6267 s->collisions = 0;
6268 }
6269 if (s->dropped != 0) {
6270 os_atomic_add(&ifp->if_data.ifi_iqdrops, s->dropped, relaxed);
6271 s->dropped = 0;
6272 }
6273
6274 /*
6275 * No need for atomic operations as they are modified here
6276 * only from within the DLIL input thread context.
6277 */
6278 if (ifp->if_poll_tstats.packets != 0) {
6279 ifp->if_poll_pstats.ifi_poll_packets += ifp->if_poll_tstats.packets;
6280 ifp->if_poll_tstats.packets = 0;
6281 }
6282 if (ifp->if_poll_tstats.bytes != 0) {
6283 ifp->if_poll_pstats.ifi_poll_bytes += ifp->if_poll_tstats.bytes;
6284 ifp->if_poll_tstats.bytes = 0;
6285 }
6286
6287 return ifp->if_data_threshold != 0;
6288}
6289
6290__private_extern__ void
6291dlil_input_packet_list(struct ifnet *ifp, struct mbuf *m)
6292{
6293 return dlil_input_packet_list_common(ifp, m, 0,
6294 IFNET_MODEL_INPUT_POLL_OFF, FALSE);
6295}
6296
6297__private_extern__ void
6298dlil_input_packet_list_extended(struct ifnet *ifp, struct mbuf *m,
6299 u_int32_t cnt, ifnet_model_t mode)
6300{
6301 return dlil_input_packet_list_common(ifp, m, cnt, mode, TRUE);
6302}
6303
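/*
 * Core inbound packet-list processing.  For each packet this routine
 * (roughly in order): takes an I/O reference on the receiving ifnet,
 * demuxes the frame to a protocol family, performs CLAT translation
 * where applicable, runs the interface filters, and batches consecutive
 * packets belonging to the same protocol so they can be handed up with
 * a single call to dlil_ifproto_input().
 */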
6304static void
6305dlil_input_packet_list_common(struct ifnet *ifp_param, struct mbuf *m,
6306 u_int32_t cnt, ifnet_model_t mode, boolean_t ext)
6307{
6308 int error = 0;
6309 protocol_family_t protocol_family;
6310 mbuf_t next_packet;
6311 ifnet_t ifp = ifp_param;
6312 char *frame_header = NULL;
6313 struct if_proto *last_ifproto = NULL;
6314 mbuf_t pkt_first = NULL;
6315 mbuf_t *pkt_next = NULL;
6316 u_int32_t poll_thresh = 0, poll_ival = 0;
6317 int iorefcnt = 0;
6318
6319 KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6320
6321 if (ext && mode == IFNET_MODEL_INPUT_POLL_ON && cnt > 1 &&
6322 (poll_ival = if_rxpoll_interval_pkts) > 0) {
6323 poll_thresh = cnt;
6324 }
6325
6326 while (m != NULL) {
6327 struct if_proto *ifproto = NULL;
6328 uint32_t pktf_mask; /* pkt flags to preserve */
6329
6330 m_add_crumb(m, PKT_CRUMB_DLIL_INPUT);
6331
6332 if (ifp_param == NULL) {
6333 ifp = m->m_pkthdr.rcvif;
6334 }
6335
6336 if ((ifp->if_eflags & IFEF_RXPOLL) &&
6337 (ifp->if_xflags & IFXF_LEGACY) && poll_thresh != 0 &&
6338 poll_ival > 0 && (--poll_thresh % poll_ival) == 0) {
6339 ifnet_poll(ifp);
6340 }
6341
6342 /* Check if this mbuf looks valid */
6343 MBUF_INPUT_CHECK(m, ifp);
6344
6345 next_packet = m->m_nextpkt;
6346 m->m_nextpkt = NULL;
6347 frame_header = m->m_pkthdr.pkt_hdr;
6348 m->m_pkthdr.pkt_hdr = NULL;
6349
6350 /*
6351 * Get an IO reference count if the interface is not
6352 * loopback (lo0) and it is attached; lo0 never goes
6353 * away, so optimize for that.
6354 */
6355 if (ifp != lo_ifp) {
6356 /* iorefcnt is 0 if it hasn't been taken yet */
6357 if (iorefcnt == 0) {
6358 if (!ifnet_datamov_begin(ifp)) {
6359 m_freem(m);
6360 goto next;
6361 }
6362 }
6363 iorefcnt = 1;
6364 /*
6365 * Preserve the time stamp and skip pktap flags.
6366 */
6367 pktf_mask = PKTF_TS_VALID | PKTF_SKIP_PKTAP;
6368 } else {
6369 /*
6370 * If this arrived on lo0, preserve interface addr
6371 * info to allow for connectivity between loopback
6372 * and local interface addresses.
6373 */
6374 pktf_mask = (PKTF_LOOP | PKTF_IFAINFO);
6375 }
6376 pktf_mask |= PKTF_WAKE_PKT;
6377
6378 /* make sure packet comes in clean */
6379 m_classifier_init(m, pktf_mask);
6380
6381 ifp_inc_traffic_class_in(ifp, m);
6382
6383 /* find which protocol family this packet is for */
6384 ifnet_lock_shared(ifp);
6385 error = (*ifp->if_demux)(ifp, m, frame_header,
6386 &protocol_family);
6387 ifnet_lock_done(ifp);
6388 if (error != 0) {
6389 if (error == EJUSTRETURN) {
6390 goto next;
6391 }
6392 protocol_family = 0;
6393 }
6394
6395#if (DEVELOPMENT || DEBUG)
6396 /*
6397 * For testing we do not care about broadcast and multicast packets as
6398 * they are not as controllable as unicast traffic
6399 */
6400 if (__improbable(ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
6401 if ((protocol_family == PF_INET || protocol_family == PF_INET6) &&
6402 (m->m_flags & (M_BCAST | M_MCAST)) == 0) {
6403 /*
6404 * This is a one-shot command
6405 */
6406 ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
6407 m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
6408 }
6409 }
6410#endif /* (DEVELOPMENT || DEBUG) */
6411 if (__improbable(net_wake_pkt_debug > 0 && (m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT))) {
6412 char buffer[64];
6413 size_t buflen = MIN(mbuf_pkthdr_len(m), sizeof(buffer));
6414
6415 os_log(OS_LOG_DEFAULT, "wake packet from %s len %d",
6416 ifp->if_xname, m_pktlen(m));
			if (mbuf_copydata(m, 0, buflen, buffer) == 0) {
				log_hexdump(buffer, buflen);
6419 }
6420 }
6421
6422 pktap_input(ifp, protocol_family, m, frame_header);
6423
6424 /* Drop v4 packets received on CLAT46 enabled cell interface */
6425 if (protocol_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6426 ifp->if_type == IFT_CELLULAR) {
6427 m_freem(m);
6428 ip6stat.ip6s_clat464_in_v4_drop++;
6429 goto next;
6430 }
6431
6432 /* Translate the packet if it is received on CLAT interface */
6433 if (protocol_family == PF_INET6 && IS_INTF_CLAT46(ifp)
6434 && dlil_is_clat_needed(protocol_family, m)) {
6435 char *data = NULL;
6436 struct ether_header eh;
6437 struct ether_header *ehp = NULL;
6438
6439 if (ifp->if_type == IFT_ETHER) {
6440 ehp = (struct ether_header *)(void *)frame_header;
6441 /* Skip RX Ethernet packets if they are not IPV6 */
6442 if (ntohs(ehp->ether_type) != ETHERTYPE_IPV6) {
6443 goto skip_clat;
6444 }
6445
6446 /* Keep a copy of frame_header for Ethernet packets */
				bcopy(frame_header, (caddr_t)&eh, ETHER_HDR_LEN);
			}
			error = dlil_clat64(ifp, &protocol_family, &m);
			data = (char *)mbuf_data(m);
6451 if (error != 0) {
6452 m_freem(m);
6453 ip6stat.ip6s_clat464_in_drop++;
6454 goto next;
6455 }
6456 /* Native v6 should be No-op */
6457 if (protocol_family != PF_INET) {
6458 goto skip_clat;
6459 }
6460
6461 /* Do this only for translated v4 packets. */
6462 switch (ifp->if_type) {
6463 case IFT_CELLULAR:
6464 frame_header = data;
6465 break;
6466 case IFT_ETHER:
6467 /*
6468 * Drop if the mbuf doesn't have enough
6469 * space for Ethernet header
6470 */
6471 if (M_LEADINGSPACE(m) < ETHER_HDR_LEN) {
6472 m_free(m);
6473 ip6stat.ip6s_clat464_in_drop++;
6474 goto next;
6475 }
6476 /*
6477 * Set the frame_header ETHER_HDR_LEN bytes
				 * preceding the data pointer. Change
6479 * the ether_type too.
6480 */
6481 frame_header = data - ETHER_HDR_LEN;
6482 eh.ether_type = htons(ETHERTYPE_IP);
				bcopy((caddr_t)&eh, frame_header, ETHER_HDR_LEN);
6484 break;
6485 }
6486 }
6487skip_clat:
6488 /*
		 * Match the wake packet against the list of ports that have
		 * been queried by the driver before the device went to sleep.
6491 */
6492 if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
6493 if (protocol_family != PF_INET && protocol_family != PF_INET6) {
				if_ports_used_match_mbuf(ifp, protocol_family, m);
6495 }
6496 }
6497 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK) &&
6498 !(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
6499 dlil_input_cksum_dbg(ifp, m, frame_header,
6500 protocol_family);
6501 }
6502 /*
6503 * For partial checksum offload, we expect the driver to
6504 * set the start offset indicating the start of the span
6505 * that is covered by the hardware-computed checksum;
6506 * adjust this start offset accordingly because the data
6507 * pointer has been advanced beyond the link-layer header.
6508 *
6509 * Virtual lan types (bridge, vlan, bond) can call
6510 * dlil_input_packet_list() with the same packet with the
6511 * checksum flags set. Set a flag indicating that the
6512 * adjustment has already been done.
6513 */
6514 if ((m->m_pkthdr.csum_flags & CSUM_ADJUST_DONE) != 0) {
6515 /* adjustment has already been done */
6516 } else if ((m->m_pkthdr.csum_flags &
6517 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
6518 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
6519 int adj;
6520 if (frame_header == NULL ||
			    frame_header < (char *)mbuf_datastart(m) ||
6522 frame_header > (char *)m->m_data ||
6523 (adj = (int)(m->m_data - (uintptr_t)frame_header)) >
6524 m->m_pkthdr.csum_rx_start) {
6525 m->m_pkthdr.csum_data = 0;
6526 m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
6527 hwcksum_in_invalidated++;
6528 } else {
6529 m->m_pkthdr.csum_rx_start -= adj;
6530 }
6531 /* make sure we don't adjust more than once */
6532 m->m_pkthdr.csum_flags |= CSUM_ADJUST_DONE;
6533 }
6534 if (clat_debug) {
6535 pktap_input(ifp, protocol_family, m, frame_header);
6536 }
6537
6538 if (m->m_flags & (M_BCAST | M_MCAST)) {
6539 os_atomic_inc(&ifp->if_imcasts, relaxed);
6540 }
6541
6542 /* run interface filters */
		error = dlil_interface_filters_input(ifp, &m,
		    &frame_header, protocol_family);
6545 if (error != 0) {
6546 if (error != EJUSTRETURN) {
6547 m_freem(m);
6548 }
6549 goto next;
6550 }
6551 /*
6552 * A VLAN interface receives VLAN-tagged packets by attaching
6553 * its PF_VLAN protocol to a parent interface. When a VLAN
6554 * interface is a member of a bridge, the parent interface
6555 * receives VLAN-tagged M_PROMISC packets. A VLAN-tagged
6556 * M_PROMISC packet must be processed by the VLAN protocol
6557 * so that it can be sent up the stack via
6558 * dlil_input_packet_list(). That allows the bridge interface's
6559 * input filter, attached to the VLAN interface, to process
6560 * the packet.
6561 */
6562 if (protocol_family != PF_VLAN &&
6563 (m->m_flags & M_PROMISC) != 0) {
6564 m_freem(m);
6565 goto next;
6566 }
6567
6568 /* Lookup the protocol attachment to this interface */
6569 if (protocol_family == 0) {
6570 ifproto = NULL;
6571 } else if (last_ifproto != NULL && last_ifproto->ifp == ifp &&
6572 (last_ifproto->protocol_family == protocol_family)) {
6573 VERIFY(ifproto == NULL);
6574 ifproto = last_ifproto;
			if_proto_ref(last_ifproto);
6576 } else {
6577 VERIFY(ifproto == NULL);
6578 ifnet_lock_shared(ifp);
6579 /* callee holds a proto refcnt upon success */
6580 ifproto = find_attached_proto(ifp, protocol_family);
6581 ifnet_lock_done(ifp);
6582 }
6583 if (ifproto == NULL) {
6584 /* no protocol for this packet, discard */
6585 m_freem(m);
6586 goto next;
6587 }
6588 if (ifproto != last_ifproto) {
6589 if (last_ifproto != NULL) {
6590 /* pass up the list for the previous protocol */
				dlil_ifproto_input(last_ifproto, pkt_first);
				pkt_first = NULL;
				if_proto_free(last_ifproto);
6594 }
6595 last_ifproto = ifproto;
6596 if_proto_ref(proto: ifproto);
6597 }
6598 /* extend the list */
6599 m->m_pkthdr.pkt_hdr = frame_header;
6600 if (pkt_first == NULL) {
6601 pkt_first = m;
6602 } else {
6603 *pkt_next = m;
6604 }
6605 pkt_next = &m->m_nextpkt;
6606
6607next:
6608 if (next_packet == NULL && last_ifproto != NULL) {
6609 /* pass up the last list of packets */
			dlil_ifproto_input(last_ifproto, pkt_first);
			if_proto_free(last_ifproto);
6612 last_ifproto = NULL;
6613 }
6614 if (ifproto != NULL) {
			if_proto_free(ifproto);
6616 ifproto = NULL;
6617 }
6618
6619 m = next_packet;
6620
6621 /* update the driver's multicast filter, if needed */
6622 if (ifp->if_updatemcasts > 0 && if_mcasts_update(ifp) == 0) {
6623 ifp->if_updatemcasts = 0;
6624 }
6625 if (iorefcnt == 1) {
6626 /* If the next mbuf is on a different interface, unlock data-mov */
6627 if (!m || (ifp != ifp_param && ifp != m->m_pkthdr.rcvif)) {
6628 ifnet_datamov_end(ifp);
6629 iorefcnt = 0;
6630 }
6631 }
6632 }
6633
6634 KERNEL_DEBUG(DBG_FNC_DLIL_INPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
6635}
6636
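/*
 * Re-program the driver with any suspended link-layer multicast
 * memberships (tracked via if_updatemcasts).  The synchronous variant
 * issues SIOCADDMULTI inline; the asynchronous variant defers the ioctl.
 */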
6637static errno_t
6638if_mcasts_update_common(struct ifnet * ifp, bool sync)
6639{
6640 errno_t err;
6641
6642 if (sync) {
		err = ifnet_ioctl(ifp, 0, SIOCADDMULTI, NULL);
6644 if (err == EAFNOSUPPORT) {
6645 err = 0;
6646 }
6647 } else {
6648 ifnet_ioctl_async(ifp, SIOCADDMULTI);
6649 err = 0;
6650 }
6651 DLIL_PRINTF("%s: %s %d suspended link-layer multicast membership(s) "
6652 "(err=%d)\n", if_name(ifp),
6653 (err == 0 ? "successfully restored" : "failed to restore"),
6654 ifp->if_updatemcasts, err);
6655
6656 /* just return success */
6657 return 0;
6658}
6659
6660static errno_t
6661if_mcasts_update_async(struct ifnet *ifp)
6662{
6663 return if_mcasts_update_common(ifp, false);
6664}
6665
6666errno_t
6667if_mcasts_update(struct ifnet *ifp)
6668{
6669 return if_mcasts_update_common(ifp, true);
6670}
6671
6672/* If ifp is set, we will increment the generation for the interface */
6673int
6674dlil_post_complete_msg(struct ifnet *ifp, struct kev_msg *event)
6675{
6676 if (ifp != NULL) {
6677 ifnet_increment_generation(ifp);
6678 }
6679
6680#if NECP
6681 necp_update_all_clients();
6682#endif /* NECP */
6683
6684 return kev_post_msg(event);
6685}
6686
6687__private_extern__ void
6688dlil_post_sifflags_msg(struct ifnet * ifp)
6689{
6690 struct kev_msg ev_msg;
6691 struct net_event_data ev_data;
6692
	bzero(&ev_data, sizeof(ev_data));
	bzero(&ev_msg, sizeof(ev_msg));
6695 ev_msg.vendor_code = KEV_VENDOR_APPLE;
6696 ev_msg.kev_class = KEV_NETWORK_CLASS;
6697 ev_msg.kev_subclass = KEV_DL_SUBCLASS;
6698 ev_msg.event_code = KEV_DL_SIFFLAGS;
	strlcpy(&ev_data.if_name[0], ifp->if_name, IFNAMSIZ);
6700 ev_data.if_family = ifp->if_family;
6701 ev_data.if_unit = (u_int32_t) ifp->if_unit;
6702 ev_msg.dv[0].data_length = sizeof(struct net_event_data);
6703 ev_msg.dv[0].data_ptr = &ev_data;
6704 ev_msg.dv[1].data_length = 0;
	dlil_post_complete_msg(ifp, &ev_msg);
6706}
6707
6708#define TMP_IF_PROTO_ARR_SIZE 10
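/*
 * Deliver a kernel event to everything attached to the interface: first
 * the interface filters, then every attached protocol (via a temporary,
 * refcounted snapshot of the protocol hash so the ifnet lock need not be
 * held across the callbacks), then the interface's own event handler,
 * before finally posting the event message itself.
 */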
6709static int
6710dlil_event_internal(struct ifnet *ifp, struct kev_msg *event, bool update_generation)
6711{
6712 struct ifnet_filter *filter = NULL;
6713 struct if_proto *proto = NULL;
6714 int if_proto_count = 0;
6715 struct if_proto *tmp_ifproto_stack_arr[TMP_IF_PROTO_ARR_SIZE] = {NULL};
6716 struct if_proto **tmp_ifproto_arr = tmp_ifproto_stack_arr;
6717 int tmp_ifproto_arr_idx = 0;
6718
6719 /*
6720 * Pass the event to the interface filters
6721 */
	lck_mtx_lock_spin(&ifp->if_flt_lock);
6723 /* prevent filter list from changing in case we drop the lock */
6724 if_flt_monitor_busy(ifp);
6725 TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
6726 if (filter->filt_event != NULL) {
			lck_mtx_unlock(&ifp->if_flt_lock);
6728
6729 filter->filt_event(filter->filt_cookie, ifp,
6730 filter->filt_protocol, event);
6731
			lck_mtx_lock_spin(&ifp->if_flt_lock);
6733 }
6734 }
6735 /* we're done with the filter list */
6736 if_flt_monitor_unbusy(ifp);
	lck_mtx_unlock(&ifp->if_flt_lock);
6738
6739 /* Get an io ref count if the interface is attached */
	if (!ifnet_is_attached(ifp, 1)) {
6741 goto done;
6742 }
6743
6744 /*
	 * An embedded tmp_list_entry in if_proto may still get overwritten
	 * by another thread after the ifnet lock is dropped; therefore we
	 * avoid embedded pointers here.
6748 */
6749 ifnet_lock_shared(ifp);
	if_proto_count = dlil_ifp_protolist(ifp, NULL, 0);
6751 if (if_proto_count) {
6752 int i;
6753 VERIFY(ifp->if_proto_hash != NULL);
6754 if (if_proto_count <= TMP_IF_PROTO_ARR_SIZE) {
6755 tmp_ifproto_arr = tmp_ifproto_stack_arr;
6756 } else {
6757 tmp_ifproto_arr = kalloc_type(struct if_proto *,
6758 if_proto_count, Z_WAITOK | Z_ZERO);
6759 if (tmp_ifproto_arr == NULL) {
6760 ifnet_lock_done(ifp);
6761 goto cleanup;
6762 }
6763 }
6764
6765 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
6766 SLIST_FOREACH(proto, &ifp->if_proto_hash[i],
6767 next_hash) {
6768 if_proto_ref(proto);
6769 tmp_ifproto_arr[tmp_ifproto_arr_idx] = proto;
6770 tmp_ifproto_arr_idx++;
6771 }
6772 }
6773 VERIFY(if_proto_count == tmp_ifproto_arr_idx);
6774 }
6775 ifnet_lock_done(ifp);
6776
6777 for (tmp_ifproto_arr_idx = 0; tmp_ifproto_arr_idx < if_proto_count;
6778 tmp_ifproto_arr_idx++) {
6779 proto = tmp_ifproto_arr[tmp_ifproto_arr_idx];
6780 VERIFY(proto != NULL);
6781 proto_media_event eventp =
6782 (proto->proto_kpi == kProtoKPI_v1 ?
6783 proto->kpi.v1.event :
6784 proto->kpi.v2.event);
6785
6786 if (eventp != NULL) {
6787 eventp(ifp, proto->protocol_family,
6788 event);
6789 }
6790 if_proto_free(proto);
6791 }
6792
6793cleanup:
6794 if (tmp_ifproto_arr != tmp_ifproto_stack_arr) {
6795 kfree_type(struct if_proto *, if_proto_count, tmp_ifproto_arr);
6796 }
6797
6798 /* Pass the event to the interface */
6799 if (ifp->if_event != NULL) {
6800 ifp->if_event(ifp, event);
6801 }
6802
6803 /* Release the io ref count */
6804 ifnet_decr_iorefcnt(ifp);
6805done:
	return dlil_post_complete_msg(update_generation ? ifp : NULL, event);
6807}
6808
6809errno_t
6810ifnet_event(ifnet_t ifp, struct kern_event_msg *event)
6811{
6812 struct kev_msg kev_msg;
6813 int result = 0;
6814
6815 if (ifp == NULL || event == NULL) {
6816 return EINVAL;
6817 }
6818
	bzero(&kev_msg, sizeof(kev_msg));
6820 kev_msg.vendor_code = event->vendor_code;
6821 kev_msg.kev_class = event->kev_class;
6822 kev_msg.kev_subclass = event->kev_subclass;
6823 kev_msg.event_code = event->event_code;
6824 kev_msg.dv[0].data_ptr = &event->event_data[0];
6825 kev_msg.dv[0].data_length = event->total_size - KEV_MSG_HEADER_SIZE;
6826 kev_msg.dv[1].data_length = 0;
6827
	result = dlil_event_internal(ifp, &kev_msg, TRUE);
6829
6830 return result;
6831}
6832
6833static void
6834dlil_count_chain_len(mbuf_t m, struct chain_len_stats *cls)
6835{
6836 mbuf_t n = m;
6837 int chainlen = 0;
6838
6839 while (n != NULL) {
6840 chainlen++;
6841 n = n->m_next;
6842 }
6843 switch (chainlen) {
6844 case 0:
6845 break;
6846 case 1:
6847 os_atomic_inc(&cls->cls_one, relaxed);
6848 break;
6849 case 2:
6850 os_atomic_inc(&cls->cls_two, relaxed);
6851 break;
6852 case 3:
6853 os_atomic_inc(&cls->cls_three, relaxed);
6854 break;
6855 case 4:
6856 os_atomic_inc(&cls->cls_four, relaxed);
6857 break;
6858 case 5:
6859 default:
6860 os_atomic_inc(&cls->cls_five_or_more, relaxed);
6861 break;
6862 }
6863}
6864
6865#if CONFIG_DTRACE
6866__attribute__((noinline))
6867static void
6868dlil_output_dtrace(ifnet_t ifp, protocol_family_t proto_family, mbuf_t m)
6869{
6870 if (proto_family == PF_INET) {
6871 struct ip *ip = mtod(m, struct ip *);
6872 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
6873 struct ip *, ip, struct ifnet *, ifp,
6874 struct ip *, ip, struct ip6_hdr *, NULL);
6875 } else if (proto_family == PF_INET6) {
6876 struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
6877 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
6878 struct ip6_hdr *, ip6, struct ifnet *, ifp,
6879 struct ip *, NULL, struct ip6_hdr *, ip6);
6880 }
6881}
6882#endif /* CONFIG_DTRACE */
6883
6884/*
6885 * dlil_output
6886 *
6887 * Caller should have a lock on the protocol domain if the protocol
6888 * doesn't support finer grained locking. In most cases, the lock
6889 * will be held from the socket layer and won't be released until
6890 * we return back to the socket layer.
6891 *
6892 * This does mean that we must take a protocol lock before we take
6893 * an interface lock if we're going to take both. This makes sense
6894 * because a protocol is likely to interact with an ifp while it
6895 * is under the protocol lock.
6896 *
6897 * An advisory code will be returned if adv is not null. This
6898 * can be used to provide feedback about interface queues to the
6899 * application.
6900 */
6901errno_t
6902dlil_output(ifnet_t ifp, protocol_family_t proto_family, mbuf_t packetlist,
6903 void *route, const struct sockaddr *dest, int raw, struct flowadv *adv)
6904{
6905 char *frame_type = NULL;
6906 char *dst_linkaddr = NULL;
6907 int retval = 0;
6908 char frame_type_buffer[MAX_FRAME_TYPE_SIZE * 4];
6909 char dst_linkaddr_buffer[MAX_LINKADDR * 4];
6910 struct if_proto *proto = NULL;
6911 mbuf_t m = NULL;
6912 mbuf_t send_head = NULL;
6913 mbuf_t *send_tail = &send_head;
6914 int iorefcnt = 0;
6915 u_int32_t pre = 0, post = 0;
6916 u_int32_t fpkts = 0, fbytes = 0;
6917 int32_t flen = 0;
6918 struct timespec now;
6919 u_int64_t now_nsec;
6920 boolean_t did_clat46 = FALSE;
6921 protocol_family_t old_proto_family = proto_family;
6922 struct sockaddr_in6 dest6;
6923 struct rtentry *rt = NULL;
6924 u_int16_t m_loop_set = 0;
6925
6926 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
6927
6928 /*
6929 * Get an io refcnt if the interface is attached to prevent ifnet_detach
6930 * from happening while this operation is in progress
6931 */
6932 if (!ifnet_datamov_begin(ifp)) {
6933 retval = ENXIO;
6934 goto cleanup;
6935 }
6936 iorefcnt = 1;
6937
6938 VERIFY(ifp->if_output_dlil != NULL);
6939
6940 /* update the driver's multicast filter, if needed */
6941 if (ifp->if_updatemcasts > 0) {
6942 if_mcasts_update_async(ifp);
6943 ifp->if_updatemcasts = 0;
6944 }
6945
6946 frame_type = frame_type_buffer;
6947 dst_linkaddr = dst_linkaddr_buffer;
6948
6949 if (raw == 0) {
6950 ifnet_lock_shared(ifp);
6951 /* callee holds a proto refcnt upon success */
		proto = find_attached_proto(ifp, proto_family);
6953 if (proto == NULL) {
6954 ifnet_lock_done(ifp);
6955 retval = ENXIO;
6956 goto cleanup;
6957 }
6958 ifnet_lock_done(ifp);
6959 }
6960
6961preout_again:
6962 if (packetlist == NULL) {
6963 goto cleanup;
6964 }
6965
6966 m = packetlist;
6967 packetlist = packetlist->m_nextpkt;
6968 m->m_nextpkt = NULL;
6969
6970 m_add_crumb(m, PKT_CRUMB_DLIL_OUTPUT);
6971
6972 /*
6973 * Perform address family translation for the first
6974 * packet outside the loop in order to perform address
6975 * lookup for the translated proto family.
6976 */
6977 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
6978 (ifp->if_type == IFT_CELLULAR ||
6979 dlil_is_clat_needed(proto_family, m))) {
6980 retval = dlil_clat46(ifp, &proto_family, &m);
6981 /*
6982 * Go to the next packet if translation fails
6983 */
6984 if (retval != 0) {
6985 m_freem(m);
6986 m = NULL;
6987 ip6stat.ip6s_clat464_out_drop++;
6988 /* Make sure that the proto family is PF_INET */
6989 ASSERT(proto_family == PF_INET);
6990 goto preout_again;
6991 }
6992 /*
6993 * Free the old one and make it point to the IPv6 proto structure.
6994 *
6995 * Change proto for the first time we have successfully
6996 * performed address family translation.
6997 */
6998 if (!did_clat46 && proto_family == PF_INET6) {
6999 did_clat46 = TRUE;
7000
7001 if (proto != NULL) {
7002 if_proto_free(proto);
7003 }
7004 ifnet_lock_shared(ifp);
7005 /* callee holds a proto refcnt upon success */
			proto = find_attached_proto(ifp, proto_family);
7007 if (proto == NULL) {
7008 ifnet_lock_done(ifp);
7009 retval = ENXIO;
7010 m_freem(m);
7011 m = NULL;
7012 goto cleanup;
7013 }
7014 ifnet_lock_done(ifp);
7015 if (ifp->if_type == IFT_ETHER) {
7016 /* Update the dest to translated v6 address */
7017 dest6.sin6_len = sizeof(struct sockaddr_in6);
7018 dest6.sin6_family = AF_INET6;
7019 dest6.sin6_addr = (mtod(m, struct ip6_hdr *))->ip6_dst;
7020 dest = SA(&dest6);
7021
7022 /*
7023 * Lookup route to the translated destination
7024 * Free this route ref during cleanup
7025 */
7026 rt = rtalloc1_scoped(SA(&dest6),
7027 0, 0, ifp->if_index);
7028
7029 route = rt;
7030 }
7031 }
7032 }
7033
7034 /*
	 * This path handles a packet chain going to the same destination.
	 * The pre-output routine is used either to trigger resolution of
	 * the next hop or to retrieve the next hop's link-layer address,
	 * e.g. the ether_inet(6)_pre_output routines.
	 *
	 * If the routine returns EJUSTRETURN, the packet has been queued,
	 * and we therefore call preout_again for the following packet in
	 * the chain.
	 *
	 * For errors other than EJUSTRETURN, the current packet is freed
	 * and the rest of the chain (pointed to by packetlist) is freed as
	 * part of cleanup.
	 *
	 * Otherwise, if there is no error, the retrieved information is
	 * used for all the packets in the chain.
7050 */
7051 if (raw == 0) {
7052 proto_media_preout preoutp = (proto->proto_kpi == kProtoKPI_v1 ?
7053 proto->kpi.v1.pre_output : proto->kpi.v2.pre_output);
7054 retval = 0;
7055 if (preoutp != NULL) {
7056 retval = preoutp(ifp, proto_family, &m, dest, route,
7057 frame_type, dst_linkaddr);
7058
7059 if (retval != 0) {
7060 if (retval == EJUSTRETURN) {
7061 goto preout_again;
7062 }
7063 m_freem(m);
7064 m = NULL;
7065 goto cleanup;
7066 }
7067 }
7068 }
7069
7070 nanouptime(ts: &now);
7071 net_timernsec(&now, &now_nsec);
7072
7073 do {
7074 /*
7075 		 * pkt_hdr is set here to point to m_data prior to
7076 		 * calling into the framer. This value of pkt_hdr is
7077 		 * used by the netif GSO logic to retrieve the IP header
7078 		 * for TCP packets offloaded for TSO processing.
7079 */
7080 if ((raw != 0) && (ifp->if_family == IFNET_FAMILY_ETHERNET)) {
7081 uint8_t vlan_encap_len = 0;
7082
7083 if ((m->m_pkthdr.csum_flags & CSUM_VLAN_ENCAP_PRESENT) != 0) {
7084 vlan_encap_len = ETHER_VLAN_ENCAP_LEN;
7085 }
7086 m->m_pkthdr.pkt_hdr = mtod(m, char *) + ETHER_HDR_LEN + vlan_encap_len;
7087 } else {
7088 m->m_pkthdr.pkt_hdr = mtod(m, void *);
7089 }
7090
7091 /*
7092 * Perform address family translation if needed.
7093 * For now we only support stateless 4 to 6 translation
7094 * on the out path.
7095 *
7096 * The routine below translates IP header, updates protocol
7097 * checksum and also translates ICMP.
7098 *
7099 * We skip the first packet as it is already translated and
7100 * the proto family is set to PF_INET6.
7101 */
7102 if (proto_family == PF_INET && IS_INTF_CLAT46(ifp) &&
7103 (ifp->if_type == IFT_CELLULAR ||
7104 dlil_is_clat_needed(proto_family, m))) {
7105 retval = dlil_clat46(ifp, &proto_family, &m);
7106 			/* Go to the next packet if the translation fails */
7107 if (retval != 0) {
7108 m_freem(m);
7109 m = NULL;
7110 ip6stat.ip6s_clat464_out_drop++;
7111 goto next;
7112 }
7113 }
7114
7115#if CONFIG_DTRACE
7116 if (!raw) {
7117 dlil_output_dtrace(ifp, proto_family, m);
7118 }
7119#endif /* CONFIG_DTRACE */
7120
7121 if (raw == 0 && ifp->if_framer != NULL) {
7122 int rcvif_set = 0;
7123
7124 /*
7125 * If this is a broadcast packet that needs to be
7126 * looped back into the system, set the inbound ifp
7127 * to that of the outbound ifp. This will allow
7128 * us to determine that it is a legitimate packet
7129 * for the system. Only set the ifp if it's not
7130 * already set, just to be safe.
7131 */
7132 if ((m->m_flags & (M_BCAST | M_LOOP)) &&
7133 m->m_pkthdr.rcvif == NULL) {
7134 m->m_pkthdr.rcvif = ifp;
7135 rcvif_set = 1;
7136 }
7137 m_loop_set = m->m_flags & M_LOOP;
7138 retval = ifp->if_framer(ifp, &m, dest, dst_linkaddr,
7139 frame_type, &pre, &post);
7140 if (retval != 0) {
7141 if (retval != EJUSTRETURN) {
7142 m_freem(m);
7143 }
7144 goto next;
7145 }
7146
7147 /*
7148 * For partial checksum offload, adjust the start
7149 * and stuff offsets based on the prepended header.
7150 */
7151 if ((m->m_pkthdr.csum_flags &
7152 (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7153 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7154 m->m_pkthdr.csum_tx_stuff += pre;
7155 m->m_pkthdr.csum_tx_start += pre;
7156 }
7157
7158 if (hwcksum_dbg != 0 && !(ifp->if_flags & IFF_LOOPBACK)) {
7159 dlil_output_cksum_dbg(ifp, m, pre,
7160 proto_family);
7161 }
7162
7163 /*
7164 * Clear the ifp if it was set above, and to be
7165 * safe, only if it is still the same as the
7166 * outbound ifp we have in context. If it was
7167 * looped back, then a copy of it was sent to the
7168 * loopback interface with the rcvif set, and we
7169 * are clearing the one that will go down to the
7170 * layer below.
7171 */
7172 if (rcvif_set && m->m_pkthdr.rcvif == ifp) {
7173 m->m_pkthdr.rcvif = NULL;
7174 }
7175 }
7176
7177 /*
7178 * Let interface filters (if any) do their thing ...
7179 */
7180 retval = dlil_interface_filters_output(ifp, m_p: &m, protocol_family: proto_family);
7181 if (retval != 0) {
7182 if (retval != EJUSTRETURN) {
7183 m_freem(m);
7184 }
7185 goto next;
7186 }
7187 /*
7188 		 * Strip the M_PROTO1 bit prior to sending the packet
7189 		 * to the driver, as this bit may be used by the driver.
7190 */
7191 m->m_flags &= ~M_PROTO1;
7192
7193 /*
7194 * If the underlying interface is not capable of handling a
7195 * packet whose data portion spans across physically disjoint
7196 * pages, we need to "normalize" the packet so that we pass
7197 		 * down a chain of mbufs where each mbuf points to a span that
7198 		 * resides within a single system page. If the packet does not
7199 		 * cross a page boundary, the following is a no-op.
7200 */
7201 if (!(ifp->if_hwassist & IFNET_MULTIPAGES)) {
7202 if ((m = m_normalize(m)) == NULL) {
7203 goto next;
7204 }
7205 }
7206
7207 /*
7208 * If this is a TSO packet, make sure the interface still
7209 		 * advertises TSO capability.
7210 */
7211 if (TSO_IPV4_NOTOK(ifp, m) || TSO_IPV6_NOTOK(ifp, m)) {
7212 retval = EMSGSIZE;
7213 m_freem(m);
7214 goto cleanup;
7215 }
7216
7217 ifp_inc_traffic_class_out(ifp, m);
7218
7219#if SKYWALK
7220 /*
7221 		 * For native skywalk devices, packets are normally passed to pktap
7222 		 * after GSO or after the mbuf-to-packet conversion; that only works
7223 		 * for IPv4/IPv6 packets since there is no space in the mbuf to pass
7224 		 * down the proto family, so other packets are tapped here instead.
7225 */
7226 if (dlil_is_native_netif_nexus(ifp)) {
7227 if (raw || m->m_pkthdr.pkt_proto == 0) {
7228 pktap_output(ifp, proto_family, m, pre, post);
7229 m->m_pkthdr.pkt_flags |= PKTF_SKIP_PKTAP;
7230 }
7231 } else {
7232 pktap_output(ifp, proto_family, m, pre, post);
7233 }
7234#else /* SKYWALK */
7235 pktap_output(ifp, proto_family, m, pre, post);
7236#endif /* SKYWALK */
7237
7238 /*
7239 * Count the number of elements in the mbuf chain
7240 */
7241 if (tx_chain_len_count) {
7242 dlil_count_chain_len(m, cls: &tx_chain_len_stats);
7243 }
7244
7245 /*
7246 * Discard partial sum information if this packet originated
7247 * from another interface; the packet would already have the
7248 * final checksum and we shouldn't recompute it.
7249 */
7250 if ((m->m_pkthdr.pkt_flags & PKTF_FORWARDED) &&
7251 (m->m_pkthdr.csum_flags & (CSUM_DATA_VALID | CSUM_PARTIAL)) ==
7252 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
7253 m->m_pkthdr.csum_flags &= ~CSUM_TX_FLAGS;
7254 m->m_pkthdr.csum_data = 0;
7255 }
7256
7257 /*
7258 * Finally, call the driver.
7259 */
7260 if (ifp->if_eflags & (IFEF_SENDLIST | IFEF_ENQUEUE_MULTI)) {
7261 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7262 flen += (m_pktlen(m) - (pre + post));
7263 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7264 }
7265 (void) mbuf_set_timestamp(mbuf: m, ts: now_nsec, TRUE);
7266
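			/*
			 * The driver accepts packet lists; append to the local
			 * chain and defer the actual transmit to a single call
			 * below the loop.
			 */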
7267 *send_tail = m;
7268 send_tail = &m->m_nextpkt;
7269 } else {
7270 /*
7271 * Record timestamp; ifnet_enqueue() will use this info
7272 * rather than redoing the work.
7273 */
7274 nanouptime(ts: &now);
7275 net_timernsec(&now, &now_nsec);
7276 (void) mbuf_set_timestamp(mbuf: m, ts: now_nsec, TRUE);
7277
7278 if (m->m_pkthdr.pkt_flags & PKTF_FORWARDED) {
7279 flen = (m_pktlen(m) - (pre + post));
7280 m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
7281 } else {
7282 flen = 0;
7283 }
7284 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7285 0, 0, 0, 0, 0);
7286 retval = (*ifp->if_output_dlil)(ifp, m);
7287 if (retval == EQFULL || retval == EQSUSPENDED) {
7288 if (adv != NULL && adv->code == FADV_SUCCESS) {
7289 adv->code = (retval == EQFULL ?
7290 FADV_FLOW_CONTROLLED :
7291 FADV_SUSPENDED);
7292 }
7293 retval = 0;
7294 }
7295 if (retval == 0 && flen > 0) {
7296 fbytes += flen;
7297 fpkts++;
7298 }
7299 if (retval != 0 && dlil_verbose) {
7300 DLIL_PRINTF("%s: output error on %s retval = %d\n",
7301 __func__, if_name(ifp),
7302 retval);
7303 }
7304 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END,
7305 0, 0, 0, 0, 0);
7306 }
7307 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7308
7309next:
7310 m = packetlist;
7311 if (m != NULL) {
7312 m->m_flags |= m_loop_set;
7313 packetlist = packetlist->m_nextpkt;
7314 m->m_nextpkt = NULL;
7315 }
7316 /* Reset the proto family to old proto family for CLAT */
7317 if (did_clat46) {
7318 proto_family = old_proto_family;
7319 }
7320 } while (m != NULL);
7321
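	/*
	 * Hand the accumulated chain to the driver: as a single list when the
	 * driver advertises IFEF_SENDLIST, otherwise one packet at a time
	 * followed by a single ifnet_start() kick once something was enqueued.
	 */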
7322 if (send_head != NULL) {
7323 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_START,
7324 0, 0, 0, 0, 0);
7325 if (ifp->if_eflags & IFEF_SENDLIST) {
7326 retval = (*ifp->if_output_dlil)(ifp, send_head);
7327 if (retval == EQFULL || retval == EQSUSPENDED) {
7328 if (adv != NULL) {
7329 adv->code = (retval == EQFULL ?
7330 FADV_FLOW_CONTROLLED :
7331 FADV_SUSPENDED);
7332 }
7333 retval = 0;
7334 }
7335 if (retval == 0 && flen > 0) {
7336 fbytes += flen;
7337 fpkts++;
7338 }
7339 if (retval != 0 && dlil_verbose) {
7340 DLIL_PRINTF("%s: output error on %s retval = %d\n",
7341 __func__, if_name(ifp), retval);
7342 }
7343 } else {
7344 struct mbuf *send_m;
7345 int enq_cnt = 0;
7346 VERIFY(ifp->if_eflags & IFEF_ENQUEUE_MULTI);
7347 while (send_head != NULL) {
7348 send_m = send_head;
7349 send_head = send_m->m_nextpkt;
7350 send_m->m_nextpkt = NULL;
7351 retval = (*ifp->if_output_dlil)(ifp, send_m);
7352 if (retval == EQFULL || retval == EQSUSPENDED) {
7353 if (adv != NULL) {
7354 adv->code = (retval == EQFULL ?
7355 FADV_FLOW_CONTROLLED :
7356 FADV_SUSPENDED);
7357 }
7358 retval = 0;
7359 }
7360 if (retval == 0) {
7361 enq_cnt++;
7362 if (flen > 0) {
7363 fpkts++;
7364 }
7365 }
7366 if (retval != 0 && dlil_verbose) {
7367 DLIL_PRINTF("%s: output error on %s "
7368 "retval = %d\n",
7369 __func__, if_name(ifp), retval);
7370 }
7371 }
7372 if (enq_cnt > 0) {
7373 fbytes += flen;
7374 ifnet_start(ifp);
7375 }
7376 }
7377 KERNEL_DEBUG(DBG_FNC_DLIL_IFOUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7378 }
7379
7380 KERNEL_DEBUG(DBG_FNC_DLIL_OUTPUT | DBG_FUNC_END, 0, 0, 0, 0, 0);
7381
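/*
 * Common exit path: account forwarded bytes/packets, drop the protocol and
 * route references taken above, free any unsent packets, and release the
 * datamov reference if one was taken on entry.
 */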
7382cleanup:
7383 if (fbytes > 0) {
7384 ifp->if_fbytes += fbytes;
7385 }
7386 if (fpkts > 0) {
7387 ifp->if_fpackets += fpkts;
7388 }
7389 if (proto != NULL) {
7390 if_proto_free(proto);
7391 }
7392 if (packetlist) { /* if any packets are left, clean up */
7393 mbuf_freem_list(mbuf: packetlist);
7394 }
7395 if (retval == EJUSTRETURN) {
7396 retval = 0;
7397 }
7398 if (iorefcnt == 1) {
7399 ifnet_datamov_end(ifp);
7400 }
7401 if (rt != NULL) {
7402 rtfree(rt);
7403 rt = NULL;
7404 }
7405
7406 return retval;
7407}
7408
7409/*
7410  * This routine returns 1 when the destination address is not a loopback,
7411  * link-local, multicast or broadcast address, i.e. when CLAT translation is needed.
7412 */
7413static int
7414dlil_is_clat_needed(protocol_family_t proto_family, mbuf_t m)
7415{
7416 int ret = 0;
7417 switch (proto_family) {
7418 case PF_INET: {
7419 struct ip *iph = mtod(m, struct ip *);
7420 if (CLAT46_NEEDED(ntohl(iph->ip_dst.s_addr))) {
7421 ret = 1;
7422 }
7423 break;
7424 }
7425 case PF_INET6: {
7426 struct ip6_hdr *ip6h = mtod(m, struct ip6_hdr *);
7427 if ((size_t)m_pktlen(m) >= sizeof(struct ip6_hdr) &&
7428 CLAT64_NEEDED(&ip6h->ip6_dst)) {
7429 ret = 1;
7430 }
7431 break;
7432 }
7433 }
7434
7435 return ret;
7436}
7437/*
7438  * @brief This routine translates an IPv4 packet to IPv6, updates the
7439  * protocol checksum, and also translates ICMP, including the inner
7440  * header carried in ICMP error messages.
7441  *
7442  * @param ifp Pointer to the interface.
7443  * @param proto_family Pointer to the protocol family. It is updated if the
7444  * function performs the translation successfully.
7445  * @param m Pointer to the mbuf pointer. Needed because this routine can end
7446  * up replacing the mbuf with a different one.
7447 *
7448 * @return 0 on success or else a negative value.
7449 */
7450static errno_t
7451dlil_clat46(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7452{
7453 VERIFY(*proto_family == PF_INET);
7454 VERIFY(IS_INTF_CLAT46(ifp));
7455
7456 pbuf_t pbuf_store, *pbuf = NULL;
7457 struct ip *iph = NULL;
7458 struct in_addr osrc, odst;
7459 uint8_t proto = 0;
7460 struct in6_addr src_storage = {};
7461 struct in6_addr *src = NULL;
7462 struct sockaddr_in6 dstsock = {};
7463 int error = 0;
7464 uint16_t off = 0;
7465 uint16_t tot_len = 0;
7466 uint16_t ip_id_val = 0;
7467 uint16_t ip_frag_off = 0;
7468
7469 boolean_t is_frag = FALSE;
7470 boolean_t is_first_frag = TRUE;
7471 boolean_t is_last_frag = TRUE;
7472
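	/*
	 * Wrap the mbuf in a pbuf so the nat464 routines below can operate on
	 * it in place; the (possibly reallocated) mbuf is recovered from the
	 * pbuf in the cleanup path.
	 */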
7473 pbuf_init_mbuf(&pbuf_store, *m, ifp);
7474 pbuf = &pbuf_store;
7475 iph = pbuf->pb_data;
7476
7477 osrc = iph->ip_src;
7478 odst = iph->ip_dst;
7479 proto = iph->ip_p;
7480 off = (uint16_t)(iph->ip_hl << 2);
7481 ip_id_val = iph->ip_id;
7482 ip_frag_off = ntohs(iph->ip_off) & IP_OFFMASK;
7483
7484 tot_len = ntohs(iph->ip_len);
7485
7486 /*
7487 	 * For packets that are not first fragments
7488 	 * we only need to adjust the checksum.
7489 	 * For 4-to-6 translation, the fragmentation header
7490 	 * gets appended after the protocol translation.
7491 */
7492 if (ntohs(iph->ip_off) & ~(IP_DF | IP_RF)) {
7493 is_frag = TRUE;
7494
7495 /* If the offset is not zero, it is not first frag */
7496 if (ip_frag_off != 0) {
7497 is_first_frag = FALSE;
7498 }
7499
7500 /* If IP_MF is set, then it is not last frag */
7501 if (ntohs(iph->ip_off) & IP_MF) {
7502 is_last_frag = FALSE;
7503 }
7504 }
7505
7506 /*
7507 * Translate IPv4 destination to IPv6 destination by using the
7508 * prefixes learned through prior PLAT discovery.
7509 */
7510 if ((error = nat464_synthesize_ipv6(ifp, &odst, &dstsock.sin6_addr)) != 0) {
7511 ip6stat.ip6s_clat464_out_v6synthfail_drop++;
7512 goto cleanup;
7513 }
7514
7515 dstsock.sin6_len = sizeof(struct sockaddr_in6);
7516 dstsock.sin6_family = AF_INET6;
7517
7518 /*
7519 	 * Retrieve the local IPv6 CLAT46 address reserved for stateless
7520 * translation.
7521 */
7522 src = in6_selectsrc_core(&dstsock, 0, ifp, 0, &src_storage, NULL, &error,
7523 NULL, NULL, TRUE);
7524
7525 if (src == NULL) {
7526 ip6stat.ip6s_clat464_out_nov6addr_drop++;
7527 error = -1;
7528 goto cleanup;
7529 }
7530
7531
7532 /* Translate the IP header part first */
7533 error = (nat464_translate_46(pbuf, off, iph->ip_tos, iph->ip_p,
7534 iph->ip_ttl, src_storage, dstsock.sin6_addr, tot_len) == NT_NAT64) ? 0 : -1;
7535
7536 iph = NULL; /* Invalidate iph as pbuf has been modified */
7537
7538 if (error != 0) {
7539 ip6stat.ip6s_clat464_out_46transfail_drop++;
7540 goto cleanup;
7541 }
7542
7543 /*
7544 * Translate protocol header, update checksum, checksum flags
7545 * and related fields.
7546 */
7547 error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc, (struct nat464_addr *)&odst,
7548 proto, PF_INET, PF_INET6, NT_OUT, !is_first_frag) == NT_NAT64) ? 0 : -1;
7549
7550 if (error != 0) {
7551 ip6stat.ip6s_clat464_out_46proto_transfail_drop++;
7552 goto cleanup;
7553 }
7554
7555 /* Now insert the IPv6 fragment header */
7556 if (is_frag) {
7557 error = nat464_insert_frag46(pbuf, ip_id_val, ip_frag_off, is_last_frag);
7558
7559 if (error != 0) {
7560 ip6stat.ip6s_clat464_out_46frag_transfail_drop++;
7561 goto cleanup;
7562 }
7563 }
7564
7565cleanup:
7566 if (pbuf_is_valid(pbuf)) {
7567 *m = pbuf->pb_mbuf;
7568 pbuf->pb_mbuf = NULL;
7569 pbuf_destroy(pbuf);
7570 } else {
7571 error = -1;
7572 *m = NULL;
7573 ip6stat.ip6s_clat464_out_invalpbuf_drop++;
7574 }
7575
7576 if (error == 0) {
7577 *proto_family = PF_INET6;
7578 ip6stat.ip6s_clat464_out_success++;
7579 }
7580
7581 return error;
7582}
7583
7584/*
7585  * @brief This routine translates an incoming IPv6 packet to IPv4,
7586  * updates the protocol checksum and also translates the ICMPv6 outer
7587  * and inner headers.
7588 *
7589 * @return 0 on success or else a negative value.
7590 */
7591static errno_t
7592dlil_clat64(ifnet_t ifp, protocol_family_t *proto_family, mbuf_t *m)
7593{
7594 VERIFY(*proto_family == PF_INET6);
7595 VERIFY(IS_INTF_CLAT46(ifp));
7596
7597 struct ip6_hdr *ip6h = NULL;
7598 struct in6_addr osrc, odst;
7599 uint8_t proto = 0;
7600 struct in6_ifaddr *ia6_clat_dst = NULL;
7601 struct in_ifaddr *ia4_clat_dst = NULL;
7602 struct in_addr *dst = NULL;
7603 struct in_addr src;
7604 int error = 0;
7605 uint32_t off = 0;
7606 u_int64_t tot_len = 0;
7607 uint8_t tos = 0;
7608 boolean_t is_first_frag = TRUE;
7609
7610 /* Incoming mbuf does not contain valid IP6 header */
7611 if ((size_t)(*m)->m_pkthdr.len < sizeof(struct ip6_hdr) ||
7612 ((size_t)(*m)->m_len < sizeof(struct ip6_hdr) &&
7613 (*m = m_pullup(*m, sizeof(struct ip6_hdr))) == NULL)) {
7614 ip6stat.ip6s_clat464_in_tooshort_drop++;
7615 return -1;
7616 }
7617
7618 ip6h = mtod(*m, struct ip6_hdr *);
7619 /* Validate that mbuf contains IP payload equal to ip6_plen */
7620 if ((size_t)(*m)->m_pkthdr.len < ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr)) {
7621 ip6stat.ip6s_clat464_in_tooshort_drop++;
7622 return -1;
7623 }
7624
7625 osrc = ip6h->ip6_src;
7626 odst = ip6h->ip6_dst;
7627
7628 /*
7629 * Retrieve the local CLAT46 reserved IPv6 address.
7630 * Let the packet pass if we don't find one, as the flag
7631 * may get set before IPv6 configuration has taken place.
7632 */
7633 ia6_clat_dst = in6ifa_ifpwithflag(ifp, IN6_IFF_CLAT46);
7634 if (ia6_clat_dst == NULL) {
7635 goto done;
7636 }
7637
7638 /*
7639 	 * Check if the original destination in the packet is the same as the
7640 	 * reserved CLAT46 IPv6 address.
7641 */
7642 if (IN6_ARE_ADDR_EQUAL(&odst, &ia6_clat_dst->ia_addr.sin6_addr)) {
7643 pbuf_t pbuf_store, *pbuf = NULL;
7644 pbuf_init_mbuf(&pbuf_store, *m, ifp);
7645 pbuf = &pbuf_store;
7646
7647 /*
7648 		 * Retrieve the local CLAT46 IPv4 address reserved for stateless
7649 * translation.
7650 */
7651 ia4_clat_dst = inifa_ifpclatv4(ifp);
7652 if (ia4_clat_dst == NULL) {
7653 ifa_remref(ifa: &ia6_clat_dst->ia_ifa);
7654 ip6stat.ip6s_clat464_in_nov4addr_drop++;
7655 error = -1;
7656 goto cleanup;
7657 }
7658 ifa_remref(ifa: &ia6_clat_dst->ia_ifa);
7659
7660 /* Translate IPv6 src to IPv4 src by removing the NAT64 prefix */
7661 dst = &ia4_clat_dst->ia_addr.sin_addr;
7662 if ((error = nat464_synthesize_ipv4(ifp, &osrc, &src)) != 0) {
7663 ip6stat.ip6s_clat464_in_v4synthfail_drop++;
7664 error = -1;
7665 goto cleanup;
7666 }
7667
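		/*
		 * Capture the IPv6 header fields needed for translation: the
		 * next-header value, the traffic class (bits 20-27 of the flow
		 * word in host byte order) and the total length including the
		 * IPv6 header itself.
		 */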
7668 ip6h = pbuf->pb_data;
7669 off = sizeof(struct ip6_hdr);
7670 proto = ip6h->ip6_nxt;
7671 tos = (ntohl(ip6h->ip6_flow) >> 20) & 0xff;
7672 tot_len = ntohs(ip6h->ip6_plen) + sizeof(struct ip6_hdr);
7673
7674 /*
7675 * Translate the IP header and update the fragmentation
7676 * header if needed
7677 */
7678 error = (nat464_translate_64(pbuf, off, tos, &proto,
7679 ip6h->ip6_hlim, src, *dst, tot_len, &is_first_frag) == NT_NAT64) ?
7680 0 : -1;
7681
7682 ip6h = NULL; /* Invalidate ip6h as pbuf has been changed */
7683
7684 if (error != 0) {
7685 ip6stat.ip6s_clat464_in_64transfail_drop++;
7686 goto cleanup;
7687 }
7688
7689 /*
7690 * Translate protocol header, update checksum, checksum flags
7691 * and related fields.
7692 */
7693 error = (nat464_translate_proto(pbuf, (struct nat464_addr *)&osrc,
7694 (struct nat464_addr *)&odst, proto, PF_INET6, PF_INET,
7695 NT_IN, !is_first_frag) == NT_NAT64) ? 0 : -1;
7696
7697 if (error != 0) {
7698 ip6stat.ip6s_clat464_in_64proto_transfail_drop++;
7699 goto cleanup;
7700 }
7701
7702cleanup:
7703 if (ia4_clat_dst != NULL) {
7704 ifa_remref(ifa: &ia4_clat_dst->ia_ifa);
7705 }
7706
7707 if (pbuf_is_valid(pbuf)) {
7708 *m = pbuf->pb_mbuf;
7709 pbuf->pb_mbuf = NULL;
7710 pbuf_destroy(pbuf);
7711 } else {
7712 error = -1;
7713 ip6stat.ip6s_clat464_in_invalpbuf_drop++;
7714 }
7715
7716 if (error == 0) {
7717 *proto_family = PF_INET;
7718 ip6stat.ip6s_clat464_in_success++;
7719 }
7720 } /* CLAT traffic */
7721
7722done:
7723 return error;
7724}
7725
7726/* The following is used to enqueue work items for ifnet ioctl events */
7727static void ifnet_ioctl_event_callback(struct nwk_wq_entry *);
7728
7729struct ifnet_ioctl_event {
7730 struct ifnet *ifp;
7731 u_long ioctl_code;
7732};
7733
7734struct ifnet_ioctl_event_nwk_wq_entry {
7735 struct nwk_wq_entry nwk_wqe;
7736 struct ifnet_ioctl_event ifnet_ioctl_ev_arg;
7737};
7738
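/*
 * ifnet_ioctl_async() defers an ioctl to the network work queue so that it
 * runs outside the caller's locking context.  Only SIOCADDMULTI and
 * SIOCDELMULTI are supported; the if_mcast_{add,del}_signaled flags
 * coalesce requests so that at most one work item per ioctl code is
 * pending at any time.  The IO reference taken here is released by the
 * callback once ifnet_ioctl() has run.
 */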
7739void
7740ifnet_ioctl_async(struct ifnet *ifp, u_long ioctl_code)
7741{
7742 struct ifnet_ioctl_event_nwk_wq_entry *p_ifnet_ioctl_ev = NULL;
7743 bool compare_expected;
7744
7745 /*
7746 * Get an io ref count if the interface is attached.
7747 * At this point it most likely is. We are taking a reference for
7748 * deferred processing.
7749 */
7750 if (!ifnet_is_attached(ifp, refio: 1)) {
7751 os_log(OS_LOG_DEFAULT, "%s:%d %s Failed for ioctl %lu as interface "
7752 "is not attached",
7753 __func__, __LINE__, if_name(ifp), ioctl_code);
7754 return;
7755 }
7756 switch (ioctl_code) {
7757 case SIOCADDMULTI:
7758 compare_expected = false;
7759 if (!atomic_compare_exchange_strong(&ifp->if_mcast_add_signaled, &compare_expected, true)) {
7760 ifnet_decr_iorefcnt(ifp);
7761 return;
7762 }
7763 break;
7764 case SIOCDELMULTI:
7765 compare_expected = false;
7766 if (!atomic_compare_exchange_strong(&ifp->if_mcast_del_signaled, &compare_expected, true)) {
7767 ifnet_decr_iorefcnt(ifp);
7768 return;
7769 }
7770 break;
7771 default:
7772 os_log(OS_LOG_DEFAULT, "%s:%d %s unknown ioctl %lu",
7773 __func__, __LINE__, if_name(ifp), ioctl_code);
7774 return;
7775 }
7776
7777 p_ifnet_ioctl_ev = kalloc_type(struct ifnet_ioctl_event_nwk_wq_entry,
7778 Z_WAITOK | Z_ZERO | Z_NOFAIL);
7779
7780 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ifp = ifp;
7781 p_ifnet_ioctl_ev->ifnet_ioctl_ev_arg.ioctl_code = ioctl_code;
7782 p_ifnet_ioctl_ev->nwk_wqe.func = ifnet_ioctl_event_callback;
7783 nwk_wq_enqueue(nwk_item: &p_ifnet_ioctl_ev->nwk_wqe);
7784}
7785
7786static void
7787ifnet_ioctl_event_callback(struct nwk_wq_entry *nwk_item)
7788{
7789 struct ifnet_ioctl_event_nwk_wq_entry *p_ev = __container_of(nwk_item,
7790 struct ifnet_ioctl_event_nwk_wq_entry, nwk_wqe);
7791
7792 struct ifnet *ifp = p_ev->ifnet_ioctl_ev_arg.ifp;
7793 u_long ioctl_code = p_ev->ifnet_ioctl_ev_arg.ioctl_code;
7794 int ret = 0;
7795
7796 switch (ioctl_code) {
7797 case SIOCADDMULTI:
7798 atomic_store(&ifp->if_mcast_add_signaled, false);
7799 break;
7800 case SIOCDELMULTI:
7801 atomic_store(&ifp->if_mcast_del_signaled, false);
7802 break;
7803 }
7804 if ((ret = ifnet_ioctl(interface: ifp, protocol: 0, ioctl_code, NULL)) != 0) {
7805 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned %d for ioctl %lu",
7806 __func__, __LINE__, if_name(ifp), ret, ioctl_code);
7807 } else if (dlil_verbose) {
7808 os_log(OS_LOG_DEFAULT, "%s:%d %s ifnet_ioctl returned successfully "
7809 "for ioctl %lu",
7810 __func__, __LINE__, if_name(ifp), ioctl_code);
7811 }
7812 ifnet_decr_iorefcnt(ifp);
7813 kfree_type(struct ifnet_ioctl_event_nwk_wq_entry, p_ev);
7814 return;
7815}
7816
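/*
 * ifnet_ioctl() dispatches an ioctl in a fixed order: interface filters
 * first, then the attached protocol (when proto_fam is non-zero), and
 * finally the interface itself.  A stage's result replaces retval only
 * while no one has handled the ioctl yet (retval == EOPNOTSUPP) or when
 * the stage returns EJUSTRETURN; any error other than EOPNOTSUPP cuts the
 * remaining stages short, and EJUSTRETURN is mapped to 0 on return.
 */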
7817errno_t
7818ifnet_ioctl(ifnet_t ifp, protocol_family_t proto_fam, u_long ioctl_code,
7819 void *ioctl_arg)
7820{
7821 struct ifnet_filter *filter;
7822 int retval = EOPNOTSUPP;
7823 int result = 0;
7824
7825 if (ifp == NULL || ioctl_code == 0) {
7826 return EINVAL;
7827 }
7828
7829 /* Get an io ref count if the interface is attached */
7830 if (!ifnet_is_attached(ifp, refio: 1)) {
7831 return EOPNOTSUPP;
7832 }
7833
7834 /*
7835 * Run the interface filters first.
7836 * We want to run all filters before calling the protocol,
7837 * interface family, or interface.
7838 */
7839 lck_mtx_lock_spin(lck: &ifp->if_flt_lock);
7840 /* prevent filter list from changing in case we drop the lock */
7841 if_flt_monitor_busy(ifp);
7842 TAILQ_FOREACH(filter, &ifp->if_flt_head, filt_next) {
7843 if (filter->filt_ioctl != NULL && (filter->filt_protocol == 0 ||
7844 filter->filt_protocol == proto_fam)) {
7845 lck_mtx_unlock(lck: &ifp->if_flt_lock);
7846
7847 result = filter->filt_ioctl(filter->filt_cookie, ifp,
7848 proto_fam, ioctl_code, ioctl_arg);
7849
7850 lck_mtx_lock_spin(lck: &ifp->if_flt_lock);
7851
7852 /* Only update retval if no one has handled the ioctl */
7853 if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
7854 if (result == ENOTSUP) {
7855 result = EOPNOTSUPP;
7856 }
7857 retval = result;
7858 if (retval != 0 && retval != EOPNOTSUPP) {
7859 /* we're done with the filter list */
7860 if_flt_monitor_unbusy(ifp);
7861 lck_mtx_unlock(lck: &ifp->if_flt_lock);
7862 goto cleanup;
7863 }
7864 }
7865 }
7866 }
7867 /* we're done with the filter list */
7868 if_flt_monitor_unbusy(ifp);
7869 lck_mtx_unlock(lck: &ifp->if_flt_lock);
7870
7871 /* Allow the protocol to handle the ioctl */
7872 if (proto_fam != 0) {
7873 struct if_proto *proto;
7874
7875 /* callee holds a proto refcnt upon success */
7876 ifnet_lock_shared(ifp);
7877 proto = find_attached_proto(ifp, protocol_family: proto_fam);
7878 ifnet_lock_done(ifp);
7879 if (proto != NULL) {
7880 proto_media_ioctl ioctlp =
7881 (proto->proto_kpi == kProtoKPI_v1 ?
7882 proto->kpi.v1.ioctl : proto->kpi.v2.ioctl);
7883 result = EOPNOTSUPP;
7884 if (ioctlp != NULL) {
7885 result = ioctlp(ifp, proto_fam, ioctl_code,
7886 ioctl_arg);
7887 }
7888 if_proto_free(proto);
7889
7890 /* Only update retval if no one has handled the ioctl */
7891 if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
7892 if (result == ENOTSUP) {
7893 result = EOPNOTSUPP;
7894 }
7895 retval = result;
7896 if (retval && retval != EOPNOTSUPP) {
7897 goto cleanup;
7898 }
7899 }
7900 }
7901 }
7902
7903 /* retval is either 0 or EOPNOTSUPP */
7904
7905 /*
7906 * Let the interface handle this ioctl.
7907 	 * If it returns EOPNOTSUPP, ignore that; we may have
7908 	 * already handled this in the protocol or family.
7909 */
7910 if (ifp->if_ioctl) {
7911 result = (*ifp->if_ioctl)(ifp, ioctl_code, ioctl_arg);
7912 }
7913
7914 /* Only update retval if no one has handled the ioctl */
7915 if (retval == EOPNOTSUPP || result == EJUSTRETURN) {
7916 if (result == ENOTSUP) {
7917 result = EOPNOTSUPP;
7918 }
7919 retval = result;
7920 if (retval && retval != EOPNOTSUPP) {
7921 goto cleanup;
7922 }
7923 }
7924
7925cleanup:
7926 if (retval == EJUSTRETURN) {
7927 retval = 0;
7928 }
7929
7930 ifnet_decr_iorefcnt(ifp);
7931
7932 return retval;
7933}
7934
7935__private_extern__ errno_t
7936dlil_set_bpf_tap(ifnet_t ifp, bpf_tap_mode mode, bpf_packet_func callback)
7937{
7938 errno_t error = 0;
7939
7940 if (ifp->if_set_bpf_tap) {
7941 /* Get an io reference on the interface if it is attached */
7942 if (!ifnet_is_attached(ifp, refio: 1)) {
7943 return ENXIO;
7944 }
7945 error = ifp->if_set_bpf_tap(ifp, mode, callback);
7946 ifnet_decr_iorefcnt(ifp);
7947 }
7948 return error;
7949}
7950
7951errno_t
7952dlil_resolve_multi(struct ifnet *ifp, const struct sockaddr *proto_addr,
7953 struct sockaddr *ll_addr, size_t ll_len)
7954{
7955 errno_t result = EOPNOTSUPP;
7956 struct if_proto *proto;
7957 const struct sockaddr *verify;
7958 proto_media_resolve_multi resolvep;
7959
7960 if (!ifnet_is_attached(ifp, refio: 1)) {
7961 return result;
7962 }
7963
7964 bzero(s: ll_addr, n: ll_len);
7965
7966 /* Call the protocol first; callee holds a proto refcnt upon success */
7967 ifnet_lock_shared(ifp);
7968 proto = find_attached_proto(ifp, protocol_family: proto_addr->sa_family);
7969 ifnet_lock_done(ifp);
7970 if (proto != NULL) {
7971 resolvep = (proto->proto_kpi == kProtoKPI_v1 ?
7972 proto->kpi.v1.resolve_multi : proto->kpi.v2.resolve_multi);
7973 if (resolvep != NULL) {
7974 result = resolvep(ifp, proto_addr, SDL(ll_addr), ll_len);
7975 }
7976 if_proto_free(proto);
7977 }
7978
7979 /* Let the interface verify the multicast address */
7980 if ((result == EOPNOTSUPP || result == 0) && ifp->if_check_multi) {
7981 if (result == 0) {
7982 verify = ll_addr;
7983 } else {
7984 verify = proto_addr;
7985 }
7986 result = ifp->if_check_multi(ifp, verify);
7987 }
7988
7989 ifnet_decr_iorefcnt(ifp);
7990 return result;
7991}
7992
7993__private_extern__ errno_t
7994dlil_send_arp_internal(ifnet_t ifp, u_short arpop,
7995 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
7996 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
7997{
7998 struct if_proto *proto;
7999 errno_t result = 0;
8000
8001 if ((ifp->if_flags & IFF_NOARP) != 0) {
8002 result = ENOTSUP;
8003 goto done;
8004 }
8005
8006 /* callee holds a proto refcnt upon success */
8007 ifnet_lock_shared(ifp);
8008 proto = find_attached_proto(ifp, protocol_family: target_proto->sa_family);
8009 ifnet_lock_done(ifp);
8010 if (proto == NULL) {
8011 result = ENOTSUP;
8012 } else {
8013 proto_media_send_arp arpp;
8014 arpp = (proto->proto_kpi == kProtoKPI_v1 ?
8015 proto->kpi.v1.send_arp : proto->kpi.v2.send_arp);
8016 if (arpp == NULL) {
8017 result = ENOTSUP;
8018 } else {
8019 switch (arpop) {
8020 case ARPOP_REQUEST:
8021 arpstat.txrequests++;
8022 if (target_hw != NULL) {
8023 arpstat.txurequests++;
8024 }
8025 break;
8026 case ARPOP_REPLY:
8027 arpstat.txreplies++;
8028 break;
8029 }
8030 result = arpp(ifp, arpop, sender_hw, sender_proto,
8031 target_hw, target_proto);
8032 }
8033 if_proto_free(proto);
8034 }
8035done:
8036 return result;
8037}
8038
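/*
 * Thread "marks" tag the current uthread with network-layer state bits
 * (uu_network_marks).  The opaque token returned by a push encodes, as an
 * offset from a static base address, exactly the bits that the call newly
 * set; the matching pop therefore clears only those bits and leaves marks
 * set by outer callers untouched.  Illustrative usage (the mark bit shown
 * is an example only):
 *
 *	net_thread_marks_t marks = net_thread_marks_push(NET_THREAD_HELD_PF);
 *	...	code that tests the bit via net_thread_is_marked()	...
 *	net_thread_marks_pop(marks);
 */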
8039struct net_thread_marks { };
8040static const struct net_thread_marks net_thread_marks_base = { };
8041
8042__private_extern__ const net_thread_marks_t net_thread_marks_none =
8043 &net_thread_marks_base;
8044
8045__private_extern__ net_thread_marks_t
8046net_thread_marks_push(u_int32_t push)
8047{
8048 static const char *const base = (const void*)&net_thread_marks_base;
8049 u_int32_t pop = 0;
8050
8051 if (push != 0) {
8052 struct uthread *uth = current_uthread();
8053
8054 pop = push & ~uth->uu_network_marks;
8055 if (pop != 0) {
8056 uth->uu_network_marks |= pop;
8057 }
8058 }
8059
8060 return (net_thread_marks_t)&base[pop];
8061}
8062
8063__private_extern__ net_thread_marks_t
8064net_thread_unmarks_push(u_int32_t unpush)
8065{
8066 static const char *const base = (const void*)&net_thread_marks_base;
8067 u_int32_t unpop = 0;
8068
8069 if (unpush != 0) {
8070 struct uthread *uth = current_uthread();
8071
8072 unpop = unpush & uth->uu_network_marks;
8073 if (unpop != 0) {
8074 uth->uu_network_marks &= ~unpop;
8075 }
8076 }
8077
8078 return (net_thread_marks_t)&base[unpop];
8079}
8080
8081__private_extern__ void
8082net_thread_marks_pop(net_thread_marks_t popx)
8083{
8084 static const char *const base = (const void*)&net_thread_marks_base;
8085 const ptrdiff_t pop = (const char *)popx - (const char *)base;
8086
8087 if (pop != 0) {
8088 static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
8089 struct uthread *uth = current_uthread();
8090
8091 VERIFY((pop & ones) == pop);
8092 VERIFY((ptrdiff_t)(uth->uu_network_marks & pop) == pop);
8093 uth->uu_network_marks &= ~pop;
8094 }
8095}
8096
8097__private_extern__ void
8098net_thread_unmarks_pop(net_thread_marks_t unpopx)
8099{
8100 static const char *const base = (const void*)&net_thread_marks_base;
8101 ptrdiff_t unpop = (const char *)unpopx - (const char *)base;
8102
8103 if (unpop != 0) {
8104 static const ptrdiff_t ones = (ptrdiff_t)(u_int32_t)~0U;
8105 struct uthread *uth = current_uthread();
8106
8107 VERIFY((unpop & ones) == unpop);
8108 VERIFY((ptrdiff_t)(uth->uu_network_marks & unpop) == 0);
8109 uth->uu_network_marks |= (u_int32_t)unpop;
8110 }
8111}
8112
8113__private_extern__ u_int32_t
8114net_thread_is_marked(u_int32_t check)
8115{
8116 if (check != 0) {
8117 struct uthread *uth = current_uthread();
8118 return uth->uu_network_marks & check;
8119 } else {
8120 return 0;
8121 }
8122}
8123
8124__private_extern__ u_int32_t
8125net_thread_is_unmarked(u_int32_t check)
8126{
8127 if (check != 0) {
8128 struct uthread *uth = current_uthread();
8129 return ~uth->uu_network_marks & check;
8130 } else {
8131 return 0;
8132 }
8133}
8134
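/*
 * An ARP announcement (gratuitous ARP) carries the same IPv4 address as
 * both sender and target.  dlil_send_arp() uses this check so that such
 * requests are sent only on the originating interface instead of being
 * broadcast across every IFEF_ARPLL interface.
 */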
8135static __inline__ int
8136_is_announcement(const struct sockaddr_in * sender_sin,
8137 const struct sockaddr_in * target_sin)
8138{
8139 if (target_sin == NULL || sender_sin == NULL) {
8140 return FALSE;
8141 }
8142
8143 return sender_sin->sin_addr.s_addr == target_sin->sin_addr.s_addr;
8144}
8145
8146__private_extern__ errno_t
8147dlil_send_arp(ifnet_t ifp, u_short arpop, const struct sockaddr_dl *sender_hw,
8148 const struct sockaddr *sender_proto, const struct sockaddr_dl *target_hw,
8149 const struct sockaddr *target_proto0, u_int32_t rtflags)
8150{
8151 errno_t result = 0;
8152 const struct sockaddr_in * sender_sin;
8153 const struct sockaddr_in * target_sin;
8154 struct sockaddr_inarp target_proto_sinarp;
8155 struct sockaddr *target_proto = __DECONST_SA(target_proto0);
8156
8157 if (target_proto == NULL || sender_proto == NULL) {
8158 return EINVAL;
8159 }
8160
8161 if (sender_proto->sa_family != target_proto->sa_family) {
8162 return EINVAL;
8163 }
8164
8165 /*
8166 * If the target is a (default) router, provide that
8167 * information to the send_arp callback routine.
8168 */
8169 if (rtflags & RTF_ROUTER) {
8170 SOCKADDR_COPY(target_proto, &target_proto_sinarp, sizeof(struct sockaddr_in));
8171 target_proto_sinarp.sin_other |= SIN_ROUTER;
8172 target_proto = SA(&target_proto_sinarp);
8173 }
8174
8175 /*
8176 * If this is an ARP request and the target IP is IPv4LL,
8177 * send the request on all interfaces. The exception is
8178 * an announcement, which must only appear on the specific
8179 * interface.
8180 */
8181 sender_sin = SIN(sender_proto);
8182 target_sin = SIN(target_proto);
8183 if (target_proto->sa_family == AF_INET &&
8184 IN_LINKLOCAL(ntohl(target_sin->sin_addr.s_addr)) &&
8185 ipv4_ll_arp_aware != 0 && arpop == ARPOP_REQUEST &&
8186 !_is_announcement(sender_sin, target_sin)) {
8187 ifnet_t *ifp_list;
8188 u_int32_t count;
8189 u_int32_t ifp_on;
8190
8191 result = ENOTSUP;
8192
8193 if (ifnet_list_get(family: IFNET_FAMILY_ANY, interfaces: &ifp_list, count: &count) == 0) {
8194 for (ifp_on = 0; ifp_on < count; ifp_on++) {
8195 errno_t new_result;
8196 ifaddr_t source_hw = NULL;
8197 ifaddr_t source_ip = NULL;
8198 struct sockaddr_in source_ip_copy;
8199 struct ifnet *cur_ifp = ifp_list[ifp_on];
8200
8201 /*
8202 * Only arp on interfaces marked for IPv4LL
8203 * ARPing. This may mean that we don't ARP on
8204 * the interface the subnet route points to.
8205 */
8206 if (!(cur_ifp->if_eflags & IFEF_ARPLL)) {
8207 continue;
8208 }
8209
8210 /* Find the source IP address */
8211 ifnet_lock_shared(ifp: cur_ifp);
8212 source_hw = cur_ifp->if_lladdr;
8213 TAILQ_FOREACH(source_ip, &cur_ifp->if_addrhead,
8214 ifa_link) {
8215 IFA_LOCK(source_ip);
8216 if (source_ip->ifa_addr != NULL &&
8217 source_ip->ifa_addr->sa_family ==
8218 AF_INET) {
8219 /* Copy the source IP address */
8220 SOCKADDR_COPY(SIN(source_ip->ifa_addr), &source_ip_copy, sizeof(source_ip_copy));
8221 IFA_UNLOCK(source_ip);
8222 break;
8223 }
8224 IFA_UNLOCK(source_ip);
8225 }
8226
8227 /* No IP Source, don't arp */
8228 if (source_ip == NULL) {
8229 ifnet_lock_done(ifp: cur_ifp);
8230 continue;
8231 }
8232
8233 ifa_addref(ifa: source_hw);
8234 ifnet_lock_done(ifp: cur_ifp);
8235
8236 /* Send the ARP */
8237 new_result = dlil_send_arp_internal(ifp: cur_ifp,
8238 arpop, SDL(source_hw->ifa_addr),
8239 SA(&source_ip_copy), NULL,
8240 target_proto);
8241
8242 ifa_remref(ifa: source_hw);
8243 if (result == ENOTSUP) {
8244 result = new_result;
8245 }
8246 }
8247 ifnet_list_free(interfaces: ifp_list);
8248 }
8249 } else {
8250 result = dlil_send_arp_internal(ifp, arpop, sender_hw,
8251 sender_proto, target_hw, target_proto);
8252 }
8253
8254 return result;
8255}
8256
8257/*
8258 * Caller must hold ifnet head lock.
8259 */
8260static int
8261ifnet_lookup(struct ifnet *ifp)
8262{
8263 struct ifnet *_ifp;
8264
8265 LCK_RW_ASSERT(&ifnet_head_lock, LCK_RW_ASSERT_HELD);
8266 TAILQ_FOREACH(_ifp, &ifnet_head, if_link) {
8267 if (_ifp == ifp) {
8268 break;
8269 }
8270 }
8271 return _ifp != NULL;
8272}
8273
8274/*
8275  * Caller has to pass a non-zero refio argument to get an
8276  * IO reference count. This will prevent ifnet_detach from
8277  * being called while there are outstanding IO references.
8278 */
8279int
8280ifnet_is_attached(struct ifnet *ifp, int refio)
8281{
8282 int ret;
8283
8284 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8285 if ((ret = IF_FULLY_ATTACHED(ifp))) {
8286 if (refio > 0) {
8287 ifp->if_refio++;
8288 }
8289 }
8290 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8291
8292 return ret;
8293}
8294
8295void
8296ifnet_incr_pending_thread_count(struct ifnet *ifp)
8297{
8298 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8299 ifp->if_threads_pending++;
8300 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8301}
8302
8303void
8304ifnet_decr_pending_thread_count(struct ifnet *ifp)
8305{
8306 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8307 VERIFY(ifp->if_threads_pending > 0);
8308 ifp->if_threads_pending--;
8309 if (ifp->if_threads_pending == 0) {
8310 wakeup(chan: &ifp->if_threads_pending);
8311 }
8312 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8313}
8314
8315/*
8316 * Caller must ensure the interface is attached; the assumption is that
8317 * there is at least an outstanding IO reference count held already.
8318 * Most callers would call ifnet_is_{attached,data_ready}() instead.
8319 */
8320void
8321ifnet_incr_iorefcnt(struct ifnet *ifp)
8322{
8323 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8324 VERIFY(IF_FULLY_ATTACHED(ifp));
8325 VERIFY(ifp->if_refio > 0);
8326 ifp->if_refio++;
8327 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8328}
8329
8330__attribute__((always_inline))
8331static void
8332ifnet_decr_iorefcnt_locked(struct ifnet *ifp)
8333{
8334 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
8335
8336 VERIFY(ifp->if_refio > 0);
8337 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8338
8339 ifp->if_refio--;
8340 VERIFY(ifp->if_refio != 0 || ifp->if_datamov == 0);
8341
8342 /*
8343 	 * If there are no more outstanding IO references, wake up the
8344 	 * ifnet_detach thread if the detaching flag is set.
8345 */
8346 if (ifp->if_refio == 0 && (ifp->if_refflags & IFRF_DETACHING)) {
8347 wakeup(chan: &(ifp->if_refio));
8348 }
8349}
8350
8351void
8352ifnet_decr_iorefcnt(struct ifnet *ifp)
8353{
8354 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8355 ifnet_decr_iorefcnt_locked(ifp);
8356 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8357}
8358
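/*
 * ifnet_datamov_begin()/ifnet_datamov_end() bracket data-path activity on
 * an interface.  begin fails (returns FALSE) unless the interface is fully
 * attached and ready, i.e. not currently suspended for draining.
 */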
8359boolean_t
8360ifnet_datamov_begin(struct ifnet *ifp)
8361{
8362 boolean_t ret;
8363
8364 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8365 if ((ret = IF_FULLY_ATTACHED_AND_READY(ifp))) {
8366 ifp->if_refio++;
8367 ifp->if_datamov++;
8368 }
8369 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8370
8371 DTRACE_IP2(datamov__begin, struct ifnet *, ifp, boolean_t, ret);
8372 return ret;
8373}
8374
8375void
8376ifnet_datamov_end(struct ifnet *ifp)
8377{
8378 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8379 VERIFY(ifp->if_datamov > 0);
8380 /*
8381 	 * If no more threads are moving data, wake up any
8382 	 * drainers blocked waiting for this.
8383 */
8384 if (--ifp->if_datamov == 0 && ifp->if_drainers > 0) {
8385 DLIL_PRINTF("Waking up drainers on %s\n", if_name(ifp));
8386 DTRACE_IP1(datamov__drain__wake, struct ifnet *, ifp);
8387 wakeup(chan: &(ifp->if_datamov));
8388 }
8389 ifnet_decr_iorefcnt_locked(ifp);
8390 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8391
8392 DTRACE_IP1(datamov__end, struct ifnet *, ifp);
8393}
8394
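/*
 * The first suspend clears IFRF_READY so that subsequent
 * ifnet_datamov_begin() calls fail; each suspend also takes an IO
 * reference that is dropped again by ifnet_datamov_resume().
 */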
8395static void
8396ifnet_datamov_suspend_locked(struct ifnet *ifp)
8397{
8398 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_MTX_ASSERT_OWNED);
8399 ifp->if_refio++;
8400 if (ifp->if_suspend++ == 0) {
8401 VERIFY(ifp->if_refflags & IFRF_READY);
8402 ifp->if_refflags &= ~IFRF_READY;
8403 }
8404}
8405
8406void
8407ifnet_datamov_suspend(struct ifnet *ifp)
8408{
8409 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8410 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8411 ifnet_datamov_suspend_locked(ifp);
8412 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8413}
8414
8415boolean_t
8416ifnet_datamov_suspend_if_needed(struct ifnet *ifp)
8417{
8418 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8419 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8420 if (ifp->if_suspend > 0) {
8421 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8422 return FALSE;
8423 }
8424 ifnet_datamov_suspend_locked(ifp);
8425 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8426 return TRUE;
8427}
8428
8429void
8430ifnet_datamov_drain(struct ifnet *ifp)
8431{
8432 lck_mtx_lock(lck: &ifp->if_ref_lock);
8433 VERIFY(ifp->if_refflags & (IFRF_ATTACHED | IFRF_DETACHING));
8434 /* data movement must already be suspended */
8435 VERIFY(ifp->if_suspend > 0);
8436 VERIFY(!(ifp->if_refflags & IFRF_READY));
8437 ifp->if_drainers++;
8438 while (ifp->if_datamov != 0) {
8439 DLIL_PRINTF("Waiting for data path(s) to quiesce on %s\n",
8440 if_name(ifp));
8441 DTRACE_IP1(datamov__wait, struct ifnet *, ifp);
8442 (void) msleep(chan: &(ifp->if_datamov), mtx: &ifp->if_ref_lock,
8443 pri: (PZERO - 1), wmesg: __func__, NULL);
8444 DTRACE_IP1(datamov__wake, struct ifnet *, ifp);
8445 }
8446 VERIFY(!(ifp->if_refflags & IFRF_READY));
8447 VERIFY(ifp->if_drainers > 0);
8448 ifp->if_drainers--;
8449 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8450
8451 /* purge the interface queues */
8452 if ((ifp->if_eflags & IFEF_TXSTART) != 0) {
8453 if_qflush_snd(ifp, false);
8454 }
8455}
8456
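/*
 * Convenience wrapper: suspend new data movement on the interface and then
 * wait for in-flight data paths to quiesce.  A typical caller pairs this
 * with ifnet_datamov_resume() once its critical section is over, e.g.:
 *
 *	ifnet_datamov_suspend_and_drain(ifp);
 *	...	reconfigure the interface	...
 *	ifnet_datamov_resume(ifp);
 */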
8457void
8458ifnet_datamov_suspend_and_drain(struct ifnet *ifp)
8459{
8460 ifnet_datamov_suspend(ifp);
8461 ifnet_datamov_drain(ifp);
8462}
8463
8464void
8465ifnet_datamov_resume(struct ifnet *ifp)
8466{
8467 lck_mtx_lock(lck: &ifp->if_ref_lock);
8468 /* data movement must already be suspended */
8469 VERIFY(ifp->if_suspend > 0);
8470 if (--ifp->if_suspend == 0) {
8471 VERIFY(!(ifp->if_refflags & IFRF_READY));
8472 ifp->if_refflags |= IFRF_READY;
8473 }
8474 ifnet_decr_iorefcnt_locked(ifp);
8475 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8476}
8477
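/*
 * For interfaces allocated with DLIF_DEBUG, record a backtrace in the
 * refhold or refrele history ring so that reference count activity can be
 * examined later.
 */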
8478static void
8479dlil_if_trace(struct dlil_ifnet *dl_if, int refhold)
8480{
8481 struct dlil_ifnet_dbg *dl_if_dbg = (struct dlil_ifnet_dbg *)dl_if;
8482 ctrace_t *tr;
8483 u_int32_t idx;
8484 u_int16_t *cnt;
8485
8486 if (!(dl_if->dl_if_flags & DLIF_DEBUG)) {
8487 panic("%s: dl_if %p has no debug structure", __func__, dl_if);
8488 /* NOTREACHED */
8489 }
8490
8491 if (refhold) {
8492 cnt = &dl_if_dbg->dldbg_if_refhold_cnt;
8493 tr = dl_if_dbg->dldbg_if_refhold;
8494 } else {
8495 cnt = &dl_if_dbg->dldbg_if_refrele_cnt;
8496 tr = dl_if_dbg->dldbg_if_refrele;
8497 }
8498
8499 idx = os_atomic_inc_orig(cnt, relaxed) % IF_REF_TRACE_HIST_SIZE;
8500 ctrace_record(&tr[idx]);
8501}
8502
8503errno_t
8504dlil_if_ref(struct ifnet *ifp)
8505{
8506 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8507
8508 if (dl_if == NULL) {
8509 return EINVAL;
8510 }
8511
8512 lck_mtx_lock_spin(lck: &dl_if->dl_if_lock);
8513 ++dl_if->dl_if_refcnt;
8514 if (dl_if->dl_if_refcnt == 0) {
8515 panic("%s: wraparound refcnt for ifp=%p", __func__, ifp);
8516 /* NOTREACHED */
8517 }
8518 if (dl_if->dl_if_trace != NULL) {
8519 (*dl_if->dl_if_trace)(dl_if, TRUE);
8520 }
8521 lck_mtx_unlock(lck: &dl_if->dl_if_lock);
8522
8523 return 0;
8524}
8525
8526errno_t
8527dlil_if_free(struct ifnet *ifp)
8528{
8529 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8530 bool need_release = FALSE;
8531
8532 if (dl_if == NULL) {
8533 return EINVAL;
8534 }
8535
8536 lck_mtx_lock_spin(lck: &dl_if->dl_if_lock);
8537 switch (dl_if->dl_if_refcnt) {
8538 case 0:
8539 panic("%s: negative refcnt for ifp=%p", __func__, ifp);
8540 /* NOTREACHED */
8541 break;
8542 case 1:
8543 if ((ifp->if_refflags & IFRF_EMBRYONIC) != 0) {
8544 need_release = TRUE;
8545 }
8546 break;
8547 default:
8548 break;
8549 }
8550 --dl_if->dl_if_refcnt;
8551 if (dl_if->dl_if_trace != NULL) {
8552 (*dl_if->dl_if_trace)(dl_if, FALSE);
8553 }
8554 lck_mtx_unlock(lck: &dl_if->dl_if_lock);
8555 if (need_release) {
8556 _dlil_if_release(ifp, true);
8557 }
8558 return 0;
8559}
8560
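/*
 * Common attach path for the v1 and v2 protocol KPIs: verify the protocol
 * is not already attached, let the family module refine the demux
 * descriptors, insert the protocol at the tail of its hash chain, and post
 * a KEV_DL_PROTO_ATTACHED event carrying the updated protocol count.
 */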
8561static errno_t
8562dlil_attach_protocol(struct if_proto *proto,
8563 const struct ifnet_demux_desc *demux_list, u_int32_t demux_count,
8564 uint32_t * proto_count)
8565{
8566 struct kev_dl_proto_data ev_pr_data;
8567 struct ifnet *ifp = proto->ifp;
8568 errno_t retval = 0;
8569 u_int32_t hash_value = proto_hash_value(protocol_family: proto->protocol_family);
8570 struct if_proto *prev_proto;
8571 struct if_proto *_proto;
8572
8573 /* don't allow attaching anything but PF_BRIDGE to vmnet interfaces */
8574 if (IFNET_IS_VMNET(ifp) && proto->protocol_family != PF_BRIDGE) {
8575 return EINVAL;
8576 }
8577
8578 if (!ifnet_is_attached(ifp, refio: 1)) {
8579 os_log(OS_LOG_DEFAULT, "%s: %s is no longer attached",
8580 __func__, if_name(ifp));
8581 return ENXIO;
8582 }
8583 /* callee holds a proto refcnt upon success */
8584 ifnet_lock_exclusive(ifp);
8585 _proto = find_attached_proto(ifp, protocol_family: proto->protocol_family);
8586 if (_proto != NULL) {
8587 ifnet_lock_done(ifp);
8588 if_proto_free(proto: _proto);
8589 retval = EEXIST;
8590 goto ioref_done;
8591 }
8592
8593 /*
8594 * Call family module add_proto routine so it can refine the
8595 * demux descriptors as it wishes.
8596 */
8597 retval = ifp->if_add_proto(ifp, proto->protocol_family, demux_list,
8598 demux_count);
8599 if (retval) {
8600 ifnet_lock_done(ifp);
8601 goto ioref_done;
8602 }
8603
8604 /*
8605 * Insert the protocol in the hash
8606 */
8607 prev_proto = SLIST_FIRST(&ifp->if_proto_hash[hash_value]);
8608 while (prev_proto != NULL && SLIST_NEXT(prev_proto, next_hash) != NULL) {
8609 prev_proto = SLIST_NEXT(prev_proto, next_hash);
8610 }
8611 if (prev_proto) {
8612 SLIST_INSERT_AFTER(prev_proto, proto, next_hash);
8613 } else {
8614 SLIST_INSERT_HEAD(&ifp->if_proto_hash[hash_value],
8615 proto, next_hash);
8616 }
8617
8618 /* hold a proto refcnt for attach */
8619 if_proto_ref(proto);
8620
8621 /*
8622 	 * The reserved field carries the number of protocols still attached
8623 	 * (subject to change).
8624 */
8625 ev_pr_data.proto_family = proto->protocol_family;
8626 ev_pr_data.proto_remaining_count = dlil_ifp_protolist(ifp, NULL, list_count: 0);
8627
8628 ifnet_lock_done(ifp);
8629
8630 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_PROTO_ATTACHED,
8631 event_data: (struct net_event_data *)&ev_pr_data,
8632 event_data_len: sizeof(struct kev_dl_proto_data), FALSE);
8633 if (proto_count != NULL) {
8634 *proto_count = ev_pr_data.proto_remaining_count;
8635 }
8636ioref_done:
8637 ifnet_decr_iorefcnt(ifp);
8638 return retval;
8639}
8640
8641static void
8642dlil_handle_proto_attach(ifnet_t ifp, protocol_family_t protocol)
8643{
8644 /*
8645 * A protocol has been attached, mark the interface up.
8646 * This used to be done by configd.KernelEventMonitor, but that
8647 * is inherently prone to races (rdar://problem/30810208).
8648 */
8649 (void) ifnet_set_flags(interface: ifp, IFF_UP, IFF_UP);
8650 (void) ifnet_ioctl(ifp, proto_fam: 0, SIOCSIFFLAGS, NULL);
8651 dlil_post_sifflags_msg(ifp);
8652#if SKYWALK
8653 switch (protocol) {
8654 case AF_INET:
8655 case AF_INET6:
8656 /* don't attach the flowswitch unless attaching IP */
8657 dlil_attach_flowswitch_nexus(ifp);
8658 break;
8659 default:
8660 break;
8661 }
8662#endif /* SKYWALK */
8663}
8664
8665errno_t
8666ifnet_attach_protocol(ifnet_t ifp, protocol_family_t protocol,
8667 const struct ifnet_attach_proto_param *proto_details)
8668{
8669 int retval = 0;
8670 struct if_proto *ifproto = NULL;
8671 uint32_t proto_count = 0;
8672
8673 ifnet_head_lock_shared();
8674 if (ifp == NULL || protocol == 0 || proto_details == NULL) {
8675 retval = EINVAL;
8676 goto end;
8677 }
8678 /* Check that the interface is in the global list */
8679 if (!ifnet_lookup(ifp)) {
8680 retval = ENXIO;
8681 goto end;
8682 }
8683
8684 ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8685
8686 /* refcnt held above during lookup */
8687 ifproto->ifp = ifp;
8688 ifproto->protocol_family = protocol;
8689 ifproto->proto_kpi = kProtoKPI_v1;
8690 ifproto->kpi.v1.input = proto_details->input;
8691 ifproto->kpi.v1.pre_output = proto_details->pre_output;
8692 ifproto->kpi.v1.event = proto_details->event;
8693 ifproto->kpi.v1.ioctl = proto_details->ioctl;
8694 ifproto->kpi.v1.detached = proto_details->detached;
8695 ifproto->kpi.v1.resolve_multi = proto_details->resolve;
8696 ifproto->kpi.v1.send_arp = proto_details->send_arp;
8697
8698 retval = dlil_attach_protocol(proto: ifproto,
8699 demux_array: proto_details->demux_list, demux_count: proto_details->demux_count,
8700 proto_count: &proto_count);
8701
8702end:
8703 if (retval == EEXIST) {
8704 /* already attached */
8705 if (dlil_verbose) {
8706 DLIL_PRINTF("%s: protocol %d already attached\n",
8707 ifp != NULL ? if_name(ifp) : "N/A",
8708 protocol);
8709 }
8710 } else if (retval != 0) {
8711 DLIL_PRINTF("%s: failed to attach v1 protocol %d (err=%d)\n",
8712 ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
8713 } else if (dlil_verbose) {
8714 DLIL_PRINTF("%s: attached v1 protocol %d (count = %d)\n",
8715 ifp != NULL ? if_name(ifp) : "N/A",
8716 protocol, proto_count);
8717 }
8718 ifnet_head_done();
8719 if (retval == 0) {
8720 dlil_handle_proto_attach(ifp, protocol);
8721 } else if (ifproto != NULL) {
8722 zfree(dlif_proto_zone, ifproto);
8723 }
8724 return retval;
8725}
8726
8727errno_t
8728ifnet_attach_protocol_v2(ifnet_t ifp, protocol_family_t protocol,
8729 const struct ifnet_attach_proto_param_v2 *proto_details)
8730{
8731 int retval = 0;
8732 struct if_proto *ifproto = NULL;
8733 uint32_t proto_count = 0;
8734
8735 ifnet_head_lock_shared();
8736 if (ifp == NULL || protocol == 0 || proto_details == NULL) {
8737 retval = EINVAL;
8738 goto end;
8739 }
8740 /* Check that the interface is in the global list */
8741 if (!ifnet_lookup(ifp)) {
8742 retval = ENXIO;
8743 goto end;
8744 }
8745
8746 ifproto = zalloc_flags(dlif_proto_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
8747
8748 /* refcnt held above during lookup */
8749 ifproto->ifp = ifp;
8750 ifproto->protocol_family = protocol;
8751 ifproto->proto_kpi = kProtoKPI_v2;
8752 ifproto->kpi.v2.input = proto_details->input;
8753 ifproto->kpi.v2.pre_output = proto_details->pre_output;
8754 ifproto->kpi.v2.event = proto_details->event;
8755 ifproto->kpi.v2.ioctl = proto_details->ioctl;
8756 ifproto->kpi.v2.detached = proto_details->detached;
8757 ifproto->kpi.v2.resolve_multi = proto_details->resolve;
8758 ifproto->kpi.v2.send_arp = proto_details->send_arp;
8759
8760 retval = dlil_attach_protocol(proto: ifproto,
8761 demux_array: proto_details->demux_list, demux_count: proto_details->demux_count,
8762 proto_count: &proto_count);
8763
8764end:
8765 if (retval == EEXIST) {
8766 /* already attached */
8767 if (dlil_verbose) {
8768 DLIL_PRINTF("%s: protocol %d already attached\n",
8769 ifp != NULL ? if_name(ifp) : "N/A",
8770 protocol);
8771 }
8772 } else if (retval != 0) {
8773 DLIL_PRINTF("%s: failed to attach v2 protocol %d (err=%d)\n",
8774 ifp != NULL ? if_name(ifp) : "N/A", protocol, retval);
8775 } else if (dlil_verbose) {
8776 DLIL_PRINTF("%s: attached v2 protocol %d (count = %d)\n",
8777 ifp != NULL ? if_name(ifp) : "N/A",
8778 protocol, proto_count);
8779 }
8780 ifnet_head_done();
8781 if (retval == 0) {
8782 dlil_handle_proto_attach(ifp, protocol);
8783 } else if (ifproto != NULL) {
8784 zfree(dlif_proto_zone, ifproto);
8785 }
8786 return retval;
8787}
8788
8789errno_t
8790ifnet_detach_protocol(ifnet_t ifp, protocol_family_t proto_family)
8791{
8792 struct if_proto *proto = NULL;
8793 int retval = 0;
8794
8795 if (ifp == NULL || proto_family == 0) {
8796 retval = EINVAL;
8797 goto end;
8798 }
8799
8800 ifnet_lock_exclusive(ifp);
8801 /* callee holds a proto refcnt upon success */
8802 proto = find_attached_proto(ifp, protocol_family: proto_family);
8803 if (proto == NULL) {
8804 retval = ENXIO;
8805 ifnet_lock_done(ifp);
8806 goto end;
8807 }
8808
8809 /* call family module del_proto */
8810 if (ifp->if_del_proto) {
8811 ifp->if_del_proto(ifp, proto->protocol_family);
8812 }
8813
8814 SLIST_REMOVE(&ifp->if_proto_hash[proto_hash_value(proto_family)],
8815 proto, if_proto, next_hash);
8816
8817 if (proto->proto_kpi == kProtoKPI_v1) {
8818 proto->kpi.v1.input = ifproto_media_input_v1;
8819 proto->kpi.v1.pre_output = ifproto_media_preout;
8820 proto->kpi.v1.event = ifproto_media_event;
8821 proto->kpi.v1.ioctl = ifproto_media_ioctl;
8822 proto->kpi.v1.resolve_multi = ifproto_media_resolve_multi;
8823 proto->kpi.v1.send_arp = ifproto_media_send_arp;
8824 } else {
8825 proto->kpi.v2.input = ifproto_media_input_v2;
8826 proto->kpi.v2.pre_output = ifproto_media_preout;
8827 proto->kpi.v2.event = ifproto_media_event;
8828 proto->kpi.v2.ioctl = ifproto_media_ioctl;
8829 proto->kpi.v2.resolve_multi = ifproto_media_resolve_multi;
8830 proto->kpi.v2.send_arp = ifproto_media_send_arp;
8831 }
8832 proto->detached = 1;
8833 ifnet_lock_done(ifp);
8834
8835 if (dlil_verbose) {
8836 DLIL_PRINTF("%s: detached %s protocol %d\n", if_name(ifp),
8837 (proto->proto_kpi == kProtoKPI_v1) ?
8838 "v1" : "v2", proto_family);
8839 }
8840
8841 /* release proto refcnt held during protocol attach */
8842 if_proto_free(proto);
8843
8844 /*
8845 * Release proto refcnt held during lookup; the rest of
8846 * protocol detach steps will happen when the last proto
8847 * reference is released.
8848 */
8849 if_proto_free(proto);
8850
8851end:
8852 return retval;
8853}
8854
8855static errno_t
8856ifproto_media_input_v1(struct ifnet *ifp, protocol_family_t protocol,
8857 struct mbuf *packet, char *header)
8858{
8859#pragma unused(ifp, protocol, packet, header)
8860 return ENXIO;
8861}
8862
8863static errno_t
8864ifproto_media_input_v2(struct ifnet *ifp, protocol_family_t protocol,
8865 struct mbuf *packet)
8866{
8867#pragma unused(ifp, protocol, packet)
8868 return ENXIO;
8869}
8870
8871static errno_t
8872ifproto_media_preout(struct ifnet *ifp, protocol_family_t protocol,
8873 mbuf_t *packet, const struct sockaddr *dest, void *route, char *frame_type,
8874 char *link_layer_dest)
8875{
8876#pragma unused(ifp, protocol, packet, dest, route, frame_type, link_layer_dest)
8877 return ENXIO;
8878}
8879
8880static void
8881ifproto_media_event(struct ifnet *ifp, protocol_family_t protocol,
8882 const struct kev_msg *event)
8883{
8884#pragma unused(ifp, protocol, event)
8885}
8886
8887static errno_t
8888ifproto_media_ioctl(struct ifnet *ifp, protocol_family_t protocol,
8889 unsigned long command, void *argument)
8890{
8891#pragma unused(ifp, protocol, command, argument)
8892 return ENXIO;
8893}
8894
8895static errno_t
8896ifproto_media_resolve_multi(ifnet_t ifp, const struct sockaddr *proto_addr,
8897 struct sockaddr_dl *out_ll, size_t ll_len)
8898{
8899#pragma unused(ifp, proto_addr, out_ll, ll_len)
8900 return ENXIO;
8901}
8902
8903static errno_t
8904ifproto_media_send_arp(struct ifnet *ifp, u_short arpop,
8905 const struct sockaddr_dl *sender_hw, const struct sockaddr *sender_proto,
8906 const struct sockaddr_dl *target_hw, const struct sockaddr *target_proto)
8907{
8908#pragma unused(ifp, arpop, sender_hw, sender_proto, target_hw, target_proto)
8909 return ENXIO;
8910}
8911
8912extern int if_next_index(void);
8913extern int tcp_ecn_outbound;
8914
8915void
8916dlil_ifclassq_setup(struct ifnet *ifp, struct ifclassq *ifcq)
8917{
8918 uint32_t sflags = 0;
8919 int err;
8920
8921 if (if_flowadv) {
8922 sflags |= PKTSCHEDF_QALG_FLOWCTL;
8923 }
8924
8925 if (if_delaybased_queue) {
8926 sflags |= PKTSCHEDF_QALG_DELAYBASED;
8927 }
8928
8929 if (ifp->if_output_sched_model ==
8930 IFNET_SCHED_MODEL_DRIVER_MANAGED) {
8931 sflags |= PKTSCHEDF_QALG_DRIVER_MANAGED;
8932 }
8933 /* Inherit drop limit from the default queue */
8934 if (ifp->if_snd != ifcq) {
8935 IFCQ_PKT_DROP_LIMIT(ifcq) = IFCQ_PKT_DROP_LIMIT(ifp->if_snd);
8936 }
8937 /* Initialize transmit queue(s) */
8938 err = ifclassq_setup(ifcq, ifp, sflags);
8939 if (err != 0) {
8940 panic_plain("%s: ifp=%p couldn't initialize transmit queue; "
8941 "err=%d", __func__, ifp, err);
8942 /* NOTREACHED */
8943 }
8944}
8945
8946errno_t
8947ifnet_attach(ifnet_t ifp, const struct sockaddr_dl *ll_addr)
8948{
8949#if SKYWALK
8950 boolean_t netif_compat;
8951 if_nexus_netif nexus_netif;
8952#endif /* SKYWALK */
8953 struct ifnet *tmp_if;
8954 struct ifaddr *ifa;
8955 struct if_data_internal if_data_saved;
8956 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
8957 struct dlil_threading_info *dl_inp;
8958 thread_continue_t thfunc = NULL;
8959 int err;
8960
8961 if (ifp == NULL) {
8962 return EINVAL;
8963 }
8964
8965 /*
8966 * Serialize ifnet attach using dlil_ifnet_lock, in order to
8967 * prevent the interface from being configured while it is
8968 * embryonic, as ifnet_head_lock is dropped and reacquired
8969 * below prior to marking the ifnet with IFRF_ATTACHED.
8970 */
8971 dlil_if_lock();
8972 ifnet_head_lock_exclusive();
8973 /* Verify we aren't already on the list */
8974 TAILQ_FOREACH(tmp_if, &ifnet_head, if_link) {
8975 if (tmp_if == ifp) {
8976 ifnet_head_done();
8977 dlil_if_unlock();
8978 return EEXIST;
8979 }
8980 }
8981
8982 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
8983 if (!(ifp->if_refflags & IFRF_EMBRYONIC)) {
8984 panic_plain("%s: flags mismatch (embryonic not set) ifp=%p",
8985 __func__, ifp);
8986 /* NOTREACHED */
8987 }
8988 lck_mtx_unlock(lck: &ifp->if_ref_lock);
8989
8990 ifnet_lock_exclusive(ifp);
8991
8992 /* Sanity check */
8993 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
8994 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
8995 VERIFY(ifp->if_threads_pending == 0);
8996
8997 if (ll_addr != NULL) {
8998 if (ifp->if_addrlen == 0) {
8999 ifp->if_addrlen = ll_addr->sdl_alen;
9000 } else if (ll_addr->sdl_alen != ifp->if_addrlen) {
9001 ifnet_lock_done(ifp);
9002 ifnet_head_done();
9003 dlil_if_unlock();
9004 return EINVAL;
9005 }
9006 }
9007
9008 /*
9009 * Allow interfaces without protocol families to attach
9010 * only if they have the necessary fields filled out.
9011 */
9012 if (ifp->if_add_proto == NULL || ifp->if_del_proto == NULL) {
9013 DLIL_PRINTF("%s: Attempt to attach interface without "
9014 "family module - %d\n", __func__, ifp->if_family);
9015 ifnet_lock_done(ifp);
9016 ifnet_head_done();
9017 dlil_if_unlock();
9018 return ENODEV;
9019 }
9020
9021 /* Allocate protocol hash table */
9022 VERIFY(ifp->if_proto_hash == NULL);
9023 ifp->if_proto_hash = kalloc_type(struct proto_hash_entry,
9024 PROTO_HASH_SLOTS, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9025
9026 lck_mtx_lock_spin(lck: &ifp->if_flt_lock);
9027 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
9028 TAILQ_INIT(&ifp->if_flt_head);
9029 VERIFY(ifp->if_flt_busy == 0);
9030 VERIFY(ifp->if_flt_waiters == 0);
9031 VERIFY(ifp->if_flt_non_os_count == 0);
9032 VERIFY(ifp->if_flt_no_tso_count == 0);
9033 lck_mtx_unlock(lck: &ifp->if_flt_lock);
9034
9035 if (!(dl_if->dl_if_flags & DLIF_REUSE)) {
9036 VERIFY(LIST_EMPTY(&ifp->if_multiaddrs));
9037 LIST_INIT(&ifp->if_multiaddrs);
9038 }
9039
9040 VERIFY(ifp->if_allhostsinm == NULL);
9041 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
9042 TAILQ_INIT(&ifp->if_addrhead);
9043
9044 if (ifp->if_index == 0) {
9045 int idx = if_next_index();
9046
9047 /*
9048 * If we have exhausted the list of
9049 * if_index values, try to find an empty
9050 * slot in ifindex2ifnet.
9051 */
9052 if (idx == -1 && if_index >= UINT16_MAX) {
9053 for (int i = 1; i < if_index; i++) {
9054 if (ifindex2ifnet[i] == NULL &&
9055 ifnet_addrs[i - 1] == NULL) {
9056 idx = i;
9057 break;
9058 }
9059 }
9060 }
9061 if (idx == -1) {
9062 ifp->if_index = 0;
9063 ifnet_lock_done(ifp);
9064 ifnet_head_done();
9065 dlil_if_unlock();
9066 return ENOBUFS;
9067 }
9068 ifp->if_index = (uint16_t)idx;
9069
9070 /* the lladdr passed at attach time is the permanent address */
9071 if (ll_addr != NULL && ifp->if_type == IFT_ETHER &&
9072 ll_addr->sdl_alen == ETHER_ADDR_LEN) {
9073 bcopy(CONST_LLADDR(ll_addr),
9074 dst: dl_if->dl_if_permanent_ether,
9075 ETHER_ADDR_LEN);
9076 dl_if->dl_if_permanent_ether_is_set = 1;
9077 }
9078 }
9079 /* There should not be anything occupying this slot */
9080 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9081
9082 /* allocate (if needed) and initialize a link address */
9083 ifa = dlil_alloc_lladdr(ifp, ll_addr);
9084 if (ifa == NULL) {
9085 ifnet_lock_done(ifp);
9086 ifnet_head_done();
9087 dlil_if_unlock();
9088 return ENOBUFS;
9089 }
9090
9091 VERIFY(ifnet_addrs[ifp->if_index - 1] == NULL);
9092 ifnet_addrs[ifp->if_index - 1] = ifa;
9093
9094 /* make this address the first on the list */
9095 IFA_LOCK(ifa);
9096 /* hold a reference for ifnet_addrs[] */
9097 ifa_addref(ifa);
9098 /* if_attach_link_ifa() holds a reference for ifa_link */
9099 if_attach_link_ifa(ifp, ifa);
9100 IFA_UNLOCK(ifa);
9101
9102 TAILQ_INSERT_TAIL(&ifnet_head, ifp, if_link);
9103 ifindex2ifnet[ifp->if_index] = ifp;
9104
9105 /* Hold a reference to the underlying dlil_ifnet */
9106 ifnet_reference(interface: ifp);
9107
9108 /* Clear stats (save and restore the other fields that we care about) */
9109 if_data_saved = ifp->if_data;
9110 bzero(s: &ifp->if_data, n: sizeof(ifp->if_data));
9111 ifp->if_data.ifi_type = if_data_saved.ifi_type;
9112 ifp->if_data.ifi_typelen = if_data_saved.ifi_typelen;
9113 ifp->if_data.ifi_physical = if_data_saved.ifi_physical;
9114 ifp->if_data.ifi_addrlen = if_data_saved.ifi_addrlen;
9115 ifp->if_data.ifi_hdrlen = if_data_saved.ifi_hdrlen;
9116 ifp->if_data.ifi_mtu = if_data_saved.ifi_mtu;
9117 ifp->if_data.ifi_baudrate = if_data_saved.ifi_baudrate;
9118 ifp->if_data.ifi_hwassist = if_data_saved.ifi_hwassist;
9119 ifp->if_data.ifi_tso_v4_mtu = if_data_saved.ifi_tso_v4_mtu;
9120 ifp->if_data.ifi_tso_v6_mtu = if_data_saved.ifi_tso_v6_mtu;
9121 ifnet_touch_lastchange(interface: ifp);
9122
9123 VERIFY(ifp->if_output_sched_model == IFNET_SCHED_MODEL_NORMAL ||
9124 ifp->if_output_sched_model == IFNET_SCHED_MODEL_DRIVER_MANAGED ||
9125 ifp->if_output_sched_model == IFNET_SCHED_MODEL_FQ_CODEL);
9126
9127 dlil_ifclassq_setup(ifp, ifcq: ifp->if_snd);
9128
9129 /* Sanity checks on the input thread storage */
9130 dl_inp = &dl_if->dl_if_inpstorage;
9131 bzero(s: &dl_inp->dlth_stats, n: sizeof(dl_inp->dlth_stats));
9132 VERIFY(dl_inp->dlth_flags == 0);
9133 VERIFY(dl_inp->dlth_wtot == 0);
9134 VERIFY(dl_inp->dlth_ifp == NULL);
9135 VERIFY(qhead(&dl_inp->dlth_pkts) == NULL && qempty(&dl_inp->dlth_pkts));
9136 VERIFY(qlimit(&dl_inp->dlth_pkts) == 0);
9137 VERIFY(!dl_inp->dlth_affinity);
9138 VERIFY(ifp->if_inp == NULL);
9139 VERIFY(dl_inp->dlth_thread == THREAD_NULL);
9140 VERIFY(dl_inp->dlth_strategy == NULL);
9141 VERIFY(dl_inp->dlth_driver_thread == THREAD_NULL);
9142 VERIFY(dl_inp->dlth_poller_thread == THREAD_NULL);
9143 VERIFY(dl_inp->dlth_affinity_tag == 0);
9144
9145#if IFNET_INPUT_SANITY_CHK
9146 VERIFY(dl_inp->dlth_pkts_cnt == 0);
9147#endif /* IFNET_INPUT_SANITY_CHK */
9148
9149 VERIFY(ifp->if_poll_thread == THREAD_NULL);
9150 dlil_reset_rxpoll_params(ifp);
9151 /*
9152 * A specific DLIL input thread is created per non-loopback interface.
9153 */
9154 if (ifp->if_family != IFNET_FAMILY_LOOPBACK) {
9155 ifp->if_inp = dl_inp;
9156 ifnet_incr_pending_thread_count(ifp);
9157 err = dlil_create_input_thread(ifp, inp: ifp->if_inp, thfunc: &thfunc);
9158 if (err == ENODEV) {
9159 VERIFY(thfunc == NULL);
9160 ifnet_decr_pending_thread_count(ifp);
9161 } else if (err != 0) {
9162 panic_plain("%s: ifp=%p couldn't get an input thread; "
9163 "err=%d", __func__, ifp, err);
9164 /* NOTREACHED */
9165 }
9166 }
9167 /*
9168 * If the driver supports the new transmit model, calculate flow hash
9169 * and create a workloop starter thread to invoke the if_start callback
9170 * where the packets may be dequeued and transmitted.
9171 */
9172 if (ifp->if_eflags & IFEF_TXSTART) {
9173 thread_precedence_policy_data_t info;
9174 __unused kern_return_t kret;
9175
9176 ifp->if_flowhash = ifnet_calc_flowhash(ifp);
9177 VERIFY(ifp->if_flowhash != 0);
9178 VERIFY(ifp->if_start_thread == THREAD_NULL);
9179
9180 ifnet_set_start_cycle(ifp, NULL);
9181 ifp->if_start_active = 0;
9182 ifp->if_start_req = 0;
9183 ifp->if_start_flags = 0;
9184 VERIFY(ifp->if_start != NULL);
9185 ifnet_incr_pending_thread_count(ifp);
9186 if ((err = kernel_thread_start(continuation: ifnet_start_thread_func,
9187 parameter: ifp, new_thread: &ifp->if_start_thread)) != KERN_SUCCESS) {
9188 panic_plain("%s: "
9189 "ifp=%p couldn't get a start thread; "
9190 "err=%d", __func__, ifp, err);
9191 /* NOTREACHED */
9192 }
9193 bzero(s: &info, n: sizeof(info));
9194 info.importance = 1;
9195 kret = thread_policy_set(thread: ifp->if_start_thread,
9196 THREAD_PRECEDENCE_POLICY, policy_info: (thread_policy_t)&info,
9197 THREAD_PRECEDENCE_POLICY_COUNT);
9198 ASSERT(kret == KERN_SUCCESS);
9199 } else {
9200 ifp->if_flowhash = 0;
9201 }
9202
9203 /* Reset polling parameters */
9204 ifnet_set_poll_cycle(ifp, NULL);
9205 ifp->if_poll_update = 0;
9206 ifp->if_poll_flags = 0;
9207 ifp->if_poll_req = 0;
9208 VERIFY(ifp->if_poll_thread == THREAD_NULL);
9209
9210 /*
9211 * If the driver supports the new receive model, create a poller
9212 * thread to invoke the if_input_poll callback, where the packets may
9213 * be dequeued from the driver and processed for reception.
9214 * If the interface is netif compat, the poller thread is
9215 * managed by netif.
9216 */
9217 if (thfunc == dlil_rxpoll_input_thread_func) {
9218 thread_precedence_policy_data_t info;
9219 __unused kern_return_t kret;
9220#if SKYWALK
9221 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
9222#endif /* SKYWALK */
9223 VERIFY(ifp->if_input_poll != NULL);
9224 VERIFY(ifp->if_input_ctl != NULL);
9225 ifnet_incr_pending_thread_count(ifp);
9226 if ((err = kernel_thread_start(continuation: ifnet_poll_thread_func, parameter: ifp,
9227 new_thread: &ifp->if_poll_thread)) != KERN_SUCCESS) {
9228 panic_plain("%s: ifp=%p couldn't get a poll thread; "
9229 "err=%d", __func__, ifp, err);
9230 /* NOTREACHED */
9231 }
9232 bzero(s: &info, n: sizeof(info));
9233 info.importance = 1;
9234 kret = thread_policy_set(thread: ifp->if_poll_thread,
9235 THREAD_PRECEDENCE_POLICY, policy_info: (thread_policy_t)&info,
9236 THREAD_PRECEDENCE_POLICY_COUNT);
9237 ASSERT(kret == KERN_SUCCESS);
9238 }
9239
9240 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
9241 VERIFY(ifp->if_desc.ifd_len == 0);
9242 VERIFY(ifp->if_desc.ifd_desc != NULL);
9243
9244 /* Record attach PC stacktrace */
9245 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_attach);
9246
9247 ifp->if_updatemcasts = 0;
9248 if (!LIST_EMPTY(&ifp->if_multiaddrs)) {
9249 struct ifmultiaddr *ifma;
9250 LIST_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
9251 IFMA_LOCK(ifma);
9252 if (ifma->ifma_addr->sa_family == AF_LINK ||
9253 ifma->ifma_addr->sa_family == AF_UNSPEC) {
9254 ifp->if_updatemcasts++;
9255 }
9256 IFMA_UNLOCK(ifma);
9257 }
9258
9259 DLIL_PRINTF("%s: attached with %d suspended link-layer multicast "
9260 "membership(s)\n", if_name(ifp),
9261 ifp->if_updatemcasts);
9262 }
9263
9264 /* Clear logging parameters */
9265 bzero(s: &ifp->if_log, n: sizeof(ifp->if_log));
9266
9267 /* Clear foreground/realtime activity timestamps */
9268 ifp->if_fg_sendts = 0;
9269 ifp->if_rt_sendts = 0;
9270
9271 /* Clear throughput estimates and radio type */
9272 ifp->if_estimated_up_bucket = 0;
9273 ifp->if_estimated_down_bucket = 0;
9274 ifp->if_radio_type = 0;
9275 ifp->if_radio_channel = 0;
9276
9277 VERIFY(ifp->if_delegated.ifp == NULL);
9278 VERIFY(ifp->if_delegated.type == 0);
9279 VERIFY(ifp->if_delegated.family == 0);
9280 VERIFY(ifp->if_delegated.subfamily == 0);
9281 VERIFY(ifp->if_delegated.expensive == 0);
9282 VERIFY(ifp->if_delegated.constrained == 0);
9283
9284 VERIFY(ifp->if_agentids == NULL);
9285 VERIFY(ifp->if_agentcount == 0);
9286
9287 /* Reset interface state */
9288 bzero(s: &ifp->if_interface_state, n: sizeof(ifp->if_interface_state));
9289 ifp->if_interface_state.valid_bitmask |=
9290 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
9291 ifp->if_interface_state.interface_availability =
9292 IF_INTERFACE_STATE_INTERFACE_AVAILABLE;
9293
9294 /* Initialize Link Quality Metric (loopback [lo0] is always good) */
9295 if (ifp == lo_ifp) {
9296 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_GOOD;
9297 ifp->if_interface_state.valid_bitmask |=
9298 IF_INTERFACE_STATE_LQM_STATE_VALID;
9299 } else {
9300 ifp->if_interface_state.lqm_state = IFNET_LQM_THRESH_UNKNOWN;
9301 }
9302
9303 /*
9304 * Enable ECN capability on this interface depending on the
9305 * value of the ECN global setting.
9306 */
9307 if (tcp_ecn_outbound == 2 && !IFNET_IS_CELLULAR(ifp)) {
9308 if_set_eflags(ifp, IFEF_ECN_ENABLE);
9309 if_clear_eflags(ifp, IFEF_ECN_DISABLE);
9310 }
9311
9312 /*
9313 * Built-in Cyclops always on policy for WiFi infra
9314 */
9315 if (IFNET_IS_WIFI_INFRA(ifp) && net_qos_policy_wifi_enabled != 0) {
9316 errno_t error;
9317
9318 error = if_set_qosmarking_mode(ifp,
9319 IFRTYPE_QOSMARKING_FASTLANE);
9320 if (error != 0) {
9321 DLIL_PRINTF("%s if_set_qosmarking_mode(%s) error %d\n",
9322 __func__, ifp->if_xname, error);
9323 } else {
9324 if_set_eflags(ifp, IFEF_QOSMARKING_ENABLED);
9325#if (DEVELOPMENT || DEBUG)
9326 DLIL_PRINTF("%s fastlane enabled on %s\n",
9327 __func__, ifp->if_xname);
9328#endif /* (DEVELOPMENT || DEBUG) */
9329 }
9330 }
9331
9332 ifnet_lock_done(ifp);
9333 ifnet_head_done();
9334
9335#if SKYWALK
9336 netif_compat = dlil_attach_netif_compat_nexus(ifp, netif_nx: &nexus_netif);
9337#endif /* SKYWALK */
9338
9339 lck_mtx_lock(lck: &ifp->if_cached_route_lock);
9340 /* Enable forwarding cached route */
9341 ifp->if_fwd_cacheok = 1;
9342 /* Clean up any existing cached routes */
9343 ROUTE_RELEASE(&ifp->if_fwd_route);
9344 bzero(s: &ifp->if_fwd_route, n: sizeof(ifp->if_fwd_route));
9345 ROUTE_RELEASE(&ifp->if_src_route);
9346 bzero(s: &ifp->if_src_route, n: sizeof(ifp->if_src_route));
9347 ROUTE_RELEASE(&ifp->if_src_route6);
9348 bzero(s: &ifp->if_src_route6, n: sizeof(ifp->if_src_route6));
9349 lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
9350
9351 ifnet_llreach_ifattach(ifp, (dl_if->dl_if_flags & DLIF_REUSE));
9352
9353 /*
9354 * Allocate and attach IGMPv3/MLDv2 interface specific variables
9355 * and trees; do this before the ifnet is marked as attached.
9356 * The ifnet keeps the references to the info structures even after
9357 * it is detached, since network-layer records still refer
9358 * to them at that point. This also
9359 * makes it possible for them to still function after the ifnet
9360 * is recycled or reattached.
9361 */
9362#if INET
9363 if (IGMP_IFINFO(ifp) == NULL) {
9364 IGMP_IFINFO(ifp) = igmp_domifattach(ifp, Z_WAITOK);
9365 VERIFY(IGMP_IFINFO(ifp) != NULL);
9366 } else {
9367 VERIFY(IGMP_IFINFO(ifp)->igi_ifp == ifp);
9368 igmp_domifreattach(IGMP_IFINFO(ifp));
9369 }
9370#endif /* INET */
9371 if (MLD_IFINFO(ifp) == NULL) {
9372 MLD_IFINFO(ifp) = mld_domifattach(ifp, Z_WAITOK);
9373 VERIFY(MLD_IFINFO(ifp) != NULL);
9374 } else {
9375 VERIFY(MLD_IFINFO(ifp)->mli_ifp == ifp);
9376 mld_domifreattach(MLD_IFINFO(ifp));
9377 }
9378
9379 VERIFY(ifp->if_data_threshold == 0);
9380 VERIFY(ifp->if_dt_tcall != NULL);
9381
9382 /*
9383 * Wait for the kernel I/O threads created above to get
9384 * scheduled and run at least once before we proceed
9385 * to mark the interface as attached.
9386 */
9387 lck_mtx_lock(lck: &ifp->if_ref_lock);
9388 while (ifp->if_threads_pending != 0) {
9389 DLIL_PRINTF("%s: Waiting for all kernel threads created for "
9390 "interface %s to get scheduled at least once.\n",
9391 __func__, ifp->if_xname);
9392 (void) msleep(chan: &ifp->if_threads_pending, mtx: &ifp->if_ref_lock, pri: (PZERO - 1),
9393 wmesg: __func__, NULL);
9394 LCK_MTX_ASSERT(&ifp->if_ref_lock, LCK_ASSERT_OWNED);
9395 }
9396 lck_mtx_unlock(lck: &ifp->if_ref_lock);
9397 DLIL_PRINTF("%s: All kernel threads created for interface %s have been scheduled "
9398 "at least once. Proceeding.\n", __func__, ifp->if_xname);
9399
9400 /* Finally, mark this ifnet as attached. */
9401 ifnet_lock_exclusive(ifp);
9402 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
9403 ifp->if_refflags = (IFRF_ATTACHED | IFRF_READY); /* clears embryonic */
9404 lck_mtx_unlock(lck: &ifp->if_ref_lock);
9405 if (net_rtref) {
9406 /* boot-args override; enable idle notification */
9407 (void) ifnet_set_idle_flags_locked(ifp, IFRF_IDLE_NOTIFY,
9408 IFRF_IDLE_NOTIFY);
9409 } else {
9410 /* apply previous request(s) to set the idle flags, if any */
9411 (void) ifnet_set_idle_flags_locked(ifp, ifp->if_idle_new_flags,
9412 ifp->if_idle_new_flags_mask);
9413 }
9414#if SKYWALK
9415 /* the interface is fully attached; let the nexus adapter know */
9416 if (netif_compat || dlil_is_native_netif_nexus(ifp)) {
9417 if (netif_compat) {
9418 if (sk_netif_compat_txmodel ==
9419 NETIF_COMPAT_TXMODEL_ENQUEUE_MULTI) {
9420 ifnet_enqueue_multi_setup(ifp,
9421 delay_qlen: sk_tx_delay_qlen, delay_timeout: sk_tx_delay_timeout);
9422 }
9423 ifp->if_nx_netif = nexus_netif;
9424 }
9425 ifp->if_na_ops->ni_finalize(ifp->if_na, ifp);
9426 }
9427#endif /* SKYWALK */
9428 ifnet_lock_done(ifp);
9429 dlil_if_unlock();
9430
9431#if PF
9432 /*
9433 * Attach packet filter to this interface, if enabled.
9434 */
9435 pf_ifnet_hook(ifp, 1);
9436#endif /* PF */
9437
9438 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_ATTACHED, NULL, event_data_len: 0, FALSE);
9439
9440 if (dlil_verbose) {
9441 DLIL_PRINTF("%s: attached%s\n", if_name(ifp),
9442 (dl_if->dl_if_flags & DLIF_REUSE) ? " (recycled)" : "");
9443 }
9444
9445 return 0;
9446}
9447
9448/*
9449 * Prepare the storage for the first/permanent link address, which must
9450 * have the same lifetime as the ifnet itself. Although the link
9451 * address gets removed from if_addrhead and ifnet_addrs[] at detach time,
9452 * its location in memory must never change as it may still be referred
9453 * to by some parts of the system afterwards (unfortunate implementation
9454 * artifacts inherited from BSD.)
9455 *
9456 * Caller must hold ifnet lock as writer.
9457 */
9458static struct ifaddr *
9459dlil_alloc_lladdr(struct ifnet *ifp, const struct sockaddr_dl *ll_addr)
9460{
9461 struct ifaddr *ifa, *oifa = NULL;
9462 struct sockaddr_dl *addr_sdl, *mask_sdl;
9463 char workbuf[IFNAMSIZ * 2];
9464 int namelen, masklen, socksize;
9465 struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;
9466
9467 ifnet_lock_assert(ifp, what: IFNET_LCK_ASSERT_EXCLUSIVE);
9468 VERIFY(ll_addr == NULL || ll_addr->sdl_alen == ifp->if_addrlen);
9469
9470 namelen = scnprintf(workbuf, count: sizeof(workbuf), "%s",
9471 if_name(ifp));
9472 masklen = offsetof(struct sockaddr_dl, sdl_data[0])
9473 + ((namelen > 0) ? namelen : 0);
9474 socksize = masklen + ifp->if_addrlen;
9475#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof (u_int32_t) - 1)))
9476 if ((u_int32_t)socksize < sizeof(struct sockaddr_dl)) {
9477 socksize = sizeof(struct sockaddr_dl);
9478 }
9479 socksize = ROUNDUP(socksize);
9480#undef ROUNDUP
9481
9482 ifa = ifp->if_lladdr;
9483 if (socksize > DLIL_SDLMAXLEN ||
9484 (ifa != NULL && ifa != &dl_if->dl_if_lladdr.ifa)) {
9485 /*
9486 * Rare, but in the event that the link address requires
9487 * more storage space than DLIL_SDLMAXLEN, allocate the
9488 * largest possible storage for address and mask, such
9489 * that we can reuse the same space when if_addrlen grows.
9490 * This same space will be used when if_addrlen shrinks.
9491 */
9492 struct dl_if_lladdr_xtra_space *__single dl_if_lladdr_ext;
9493
9494 if (ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa) {
9495 dl_if_lladdr_ext = zalloc_permanent(
9496 sizeof(*dl_if_lladdr_ext), ZALIGN(struct ifaddr));
9497
9498 ifa = &dl_if_lladdr_ext->ifa;
9499 ifa_lock_init(ifa);
9500 ifa_initref(ifa);
9501 /* Don't set IFD_ALLOC, as this is permanent */
9502 ifa->ifa_debug = IFD_LINK;
9503 } else {
9504 dl_if_lladdr_ext = __unsafe_forge_single(
9505 struct dl_if_lladdr_xtra_space*, ifa);
9506 ifa = &dl_if_lladdr_ext->ifa;
9507 }
9508
9509 IFA_LOCK(ifa);
9510 /* address and mask sockaddr_dl locations */
9511 bzero(s: dl_if_lladdr_ext->addr_sdl_bytes,
9512 n: sizeof(dl_if_lladdr_ext->addr_sdl_bytes));
9513 bzero(s: dl_if_lladdr_ext->mask_sdl_bytes,
9514 n: sizeof(dl_if_lladdr_ext->mask_sdl_bytes));
9515 addr_sdl = SDL(dl_if_lladdr_ext->addr_sdl_bytes);
9516 mask_sdl = SDL(dl_if_lladdr_ext->mask_sdl_bytes);
9517 } else {
9518 VERIFY(ifa == NULL || ifa == &dl_if->dl_if_lladdr.ifa);
9519 /*
9520 * Use the storage areas for address and mask within the
9521 * dlil_ifnet structure. This is the most common case.
9522 */
9523 if (ifa == NULL) {
9524 ifa = &dl_if->dl_if_lladdr.ifa;
9525 ifa_lock_init(ifa);
9526 ifa_initref(ifa);
9527 /* Don't set IFD_ALLOC, as this is permanent */
9528 ifa->ifa_debug = IFD_LINK;
9529 }
9530 IFA_LOCK(ifa);
9531 /* address and mask sockaddr_dl locations */
9532 bzero(s: dl_if->dl_if_lladdr.addr_sdl_bytes,
9533 n: sizeof(dl_if->dl_if_lladdr.addr_sdl_bytes));
9534 bzero(s: dl_if->dl_if_lladdr.mask_sdl_bytes,
9535 n: sizeof(dl_if->dl_if_lladdr.mask_sdl_bytes));
9536 addr_sdl = SDL(dl_if->dl_if_lladdr.addr_sdl_bytes);
9537 mask_sdl = SDL(dl_if->dl_if_lladdr.mask_sdl_bytes);
9538 }
9539
9540 if (ifp->if_lladdr != ifa) {
9541 oifa = ifp->if_lladdr;
9542 ifp->if_lladdr = ifa;
9543 }
9544
9545 VERIFY(ifa->ifa_debug == IFD_LINK);
9546 ifa->ifa_ifp = ifp;
9547 ifa->ifa_rtrequest = link_rtrequest;
9548 ifa->ifa_addr = SA(addr_sdl);
9549 addr_sdl->sdl_len = (u_char)socksize;
9550 addr_sdl->sdl_family = AF_LINK;
9551 if (namelen > 0) {
9552 bcopy(src: workbuf, dst: addr_sdl->sdl_data, n: min(a: namelen,
9553 b: sizeof(addr_sdl->sdl_data)));
9554 addr_sdl->sdl_nlen = (u_char)namelen;
9555 } else {
9556 addr_sdl->sdl_nlen = 0;
9557 }
9558 addr_sdl->sdl_index = ifp->if_index;
9559 addr_sdl->sdl_type = ifp->if_type;
9560 if (ll_addr != NULL) {
9561 addr_sdl->sdl_alen = ll_addr->sdl_alen;
9562 bcopy(CONST_LLADDR(ll_addr), LLADDR(addr_sdl), n: addr_sdl->sdl_alen);
9563 } else {
9564 addr_sdl->sdl_alen = 0;
9565 }
9566 ifa->ifa_netmask = SA(mask_sdl);
9567 mask_sdl->sdl_len = (u_char)masklen;
9568 while (namelen > 0) {
9569 mask_sdl->sdl_data[--namelen] = 0xff;
9570 }
9571 IFA_UNLOCK(ifa);
9572
9573 if (oifa != NULL) {
9574 ifa_remref(ifa: oifa);
9575 }
9576
9577 return ifa;
9578}
9579
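/*
 * Ask the network layers to purge all IPv4 and IPv6 addresses that are
 * still assigned to the interface.
 */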
9580static void
9581if_purgeaddrs(struct ifnet *ifp)
9582{
9583#if INET
9584 in_purgeaddrs(ifp);
9585#endif /* INET */
9586 in6_purgeaddrs(ifp);
9587}
9588
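/*
 * Start detaching an ifnet: mark it down, flag it IFRF_DETACHING, remove
 * it from ifnet_head and ifindex2ifnet[] so it is no longer visible to
 * lookups, and hand it to the detacher thread, which performs the final
 * teardown in ifnet_detach_final().
 */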
9589errno_t
9590ifnet_detach(ifnet_t ifp)
9591{
9592 struct ifnet *delegated_ifp;
9593 struct nd_ifinfo *ndi = NULL;
9594
9595 if (ifp == NULL) {
9596 return EINVAL;
9597 }
9598
9599 ndi = ND_IFINFO(ifp);
9600 if (NULL != ndi) {
9601 ndi->cga_initialized = FALSE;
9602 }
9603
9604 /* Mark the interface down */
9605 if_down(ifp);
9606
9607 /*
9608 * IMPORTANT NOTE
9609 *
9610 * Any field in the ifnet that relies on IF_FULLY_ATTACHED()
9611 * or equivalently, ifnet_is_attached(ifp, 1), can't be modified
9612 * until after we've waited for all I/O references to drain
9613 * in ifnet_detach_final().
9614 */
9615
9616 ifnet_head_lock_exclusive();
9617 ifnet_lock_exclusive(ifp);
9618
9619 if (ifp->if_output_netem != NULL) {
9620 netem_destroy(ne: ifp->if_output_netem);
9621 ifp->if_output_netem = NULL;
9622 }
9623
9624 /*
9625 * Check to see if this interface has previously triggered
9626 * aggressive protocol draining; if so, decrement the global
9627 * refcnt and clear PR_AGGDRAIN on the route domain if
9628 * there are no more such interfaces around.
9629 */
9630 (void) ifnet_set_idle_flags_locked(ifp, 0, ~0);
9631
9632 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
9633 if (!(ifp->if_refflags & IFRF_ATTACHED)) {
9634 lck_mtx_unlock(lck: &ifp->if_ref_lock);
9635 ifnet_lock_done(ifp);
9636 ifnet_head_done();
9637 return EINVAL;
9638 } else if (ifp->if_refflags & IFRF_DETACHING) {
9639 /* Interface has already been detached */
9640 lck_mtx_unlock(lck: &ifp->if_ref_lock);
9641 ifnet_lock_done(ifp);
9642 ifnet_head_done();
9643 return ENXIO;
9644 }
9645 VERIFY(!(ifp->if_refflags & IFRF_EMBRYONIC));
9646 /* Indicate this interface is being detached */
9647 ifp->if_refflags &= ~IFRF_ATTACHED;
9648 ifp->if_refflags |= IFRF_DETACHING;
9649 lck_mtx_unlock(lck: &ifp->if_ref_lock);
9650
9651 if (dlil_verbose) {
9652 DLIL_PRINTF("%s: detaching\n", if_name(ifp));
9653 }
9654
9655 /* clean up flow control entry object if there's any */
9656 if (ifp->if_eflags & IFEF_TXSTART) {
9657 ifnet_flowadv(ifp->if_flowhash);
9658 }
9659
9660 /* Reset ECN enable/disable flags */
9661 /* Reset CLAT46 flag */
9662 if_clear_eflags(ifp, IFEF_ECN_ENABLE | IFEF_ECN_DISABLE | IFEF_CLAT46);
9663
9664 /*
9665 * We do not reset the TCP keep alive counters, in case
9666 * a TCP connection stays connected after the interface
9667 * goes down.
9668 */
9669 if (ifp->if_tcp_kao_cnt > 0) {
9670 os_log(OS_LOG_DEFAULT, "%s %s tcp_kao_cnt %u not zero",
9671 __func__, if_name(ifp), ifp->if_tcp_kao_cnt);
9672 }
9673 ifp->if_tcp_kao_max = 0;
9674
9675 /*
9676 * Remove ifnet from the ifnet_head, ifindex2ifnet[]; it will
9677 * no longer be visible during lookups from this point.
9678 */
9679 VERIFY(ifindex2ifnet[ifp->if_index] == ifp);
9680 TAILQ_REMOVE(&ifnet_head, ifp, if_link);
9681 ifp->if_link.tqe_next = NULL;
9682 ifp->if_link.tqe_prev = NULL;
9683 if (ifp->if_ordered_link.tqe_next != NULL ||
9684 ifp->if_ordered_link.tqe_prev != NULL) {
9685 ifnet_remove_from_ordered_list(ifp);
9686 }
9687 ifindex2ifnet[ifp->if_index] = NULL;
9688
9689 /* 18717626 - reset router mode */
9690 if_clear_eflags(ifp, IFEF_IPV4_ROUTER);
9691 ifp->if_ipv6_router_mode = IPV6_ROUTER_MODE_DISABLED;
9692
9693 /* Record detach PC stacktrace */
9694 ctrace_record(&((struct dlil_ifnet *)ifp)->dl_if_detach);
9695
9696 /* Clear logging parameters */
9697 bzero(s: &ifp->if_log, n: sizeof(ifp->if_log));
9698
9699 /* Clear delegated interface info (reference released below) */
9700 delegated_ifp = ifp->if_delegated.ifp;
9701 bzero(s: &ifp->if_delegated, n: sizeof(ifp->if_delegated));
9702
9703 /* Reset interface state */
9704 bzero(s: &ifp->if_interface_state, n: sizeof(ifp->if_interface_state));
9705
9706 /*
9707 * Increment the generation count on interface deletion
9708 */
9709 ifp->if_creation_generation_id = os_atomic_inc(&if_creation_generation_count, relaxed);
9710
9711 ifnet_lock_done(ifp);
9712 ifnet_head_done();
9713
9714 /* Release reference held on the delegated interface */
9715 if (delegated_ifp != NULL) {
9716 ifnet_release(interface: delegated_ifp);
9717 }
9718
9719 /* Reset Link Quality Metric (unless loopback [lo0]) */
9720 if (ifp != lo_ifp) {
9721 if_lqm_update(ifp, IFNET_LQM_THRESH_OFF, 0);
9722 }
9723
9724 /* Reset TCP local statistics */
9725 if (ifp->if_tcp_stat != NULL) {
9726 bzero(s: ifp->if_tcp_stat, n: sizeof(*ifp->if_tcp_stat));
9727 }
9728
9729 /* Reset UDP local statistics */
9730 if (ifp->if_udp_stat != NULL) {
9731 bzero(s: ifp->if_udp_stat, n: sizeof(*ifp->if_udp_stat));
9732 }
9733
9734 /* Reset ifnet IPv4 stats */
9735 if (ifp->if_ipv4_stat != NULL) {
9736 bzero(s: ifp->if_ipv4_stat, n: sizeof(*ifp->if_ipv4_stat));
9737 }
9738
9739 /* Reset ifnet IPv6 stats */
9740 if (ifp->if_ipv6_stat != NULL) {
9741 bzero(s: ifp->if_ipv6_stat, n: sizeof(*ifp->if_ipv6_stat));
9742 }
9743
9744 /* Release memory held for interface link status report */
9745 if (ifp->if_link_status != NULL) {
9746 kfree_type(struct if_link_status, ifp->if_link_status);
9747 ifp->if_link_status = NULL;
9748 }
9749
9750 /* Disable forwarding cached route */
9751 lck_mtx_lock(lck: &ifp->if_cached_route_lock);
9752 ifp->if_fwd_cacheok = 0;
9753 lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
9754
9755 /* Disable data threshold and wait for any pending event posting */
9756 ifp->if_data_threshold = 0;
9757 VERIFY(ifp->if_dt_tcall != NULL);
9758 (void) thread_call_cancel_wait(call: ifp->if_dt_tcall);
9759
9760 /*
9761 * Drain any deferred IGMPv3/MLDv2 query responses, but keep the
9762 * references to the info structures and leave them attached to
9763 * this ifnet.
9764 */
9765#if INET
9766 igmp_domifdetach(ifp);
9767#endif /* INET */
9768 mld_domifdetach(ifp);
9769
9770#if SKYWALK
9771 /* Clean up any netns tokens still pointing to this ifnet */
9772 netns_ifnet_detach(ifp);
9773#endif /* SKYWALK */
9774 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHING, NULL, event_data_len: 0, FALSE);
9775
9776 /* Let worker thread take care of the rest, to avoid reentrancy */
9777 dlil_if_lock();
9778 ifnet_detaching_enqueue(ifp);
9779 dlil_if_unlock();
9780
9781 return 0;
9782}
9783
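/*
 * Queue an ifnet for deferred detach and wake up the detacher thread;
 * the caller must hold dlil_if_lock.
 */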
9784static void
9785ifnet_detaching_enqueue(struct ifnet *ifp)
9786{
9787 dlil_if_lock_assert();
9788
9789 ++ifnet_detaching_cnt;
9790 VERIFY(ifnet_detaching_cnt != 0);
9791 TAILQ_INSERT_TAIL(&ifnet_detaching_head, ifp, if_detaching_link);
9792 wakeup(chan: (caddr_t)&ifnet_delayed_run);
9793}
9794
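/*
 * Remove and return the next ifnet awaiting final detach, or NULL if the
 * queue is empty; the caller must hold dlil_if_lock.
 */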
9795static struct ifnet *
9796ifnet_detaching_dequeue(void)
9797{
9798 struct ifnet *ifp;
9799
9800 dlil_if_lock_assert();
9801
9802 ifp = TAILQ_FIRST(&ifnet_detaching_head);
9803 VERIFY(ifnet_detaching_cnt != 0 || ifp == NULL);
9804 if (ifp != NULL) {
9805 VERIFY(ifnet_detaching_cnt != 0);
9806 --ifnet_detaching_cnt;
9807 TAILQ_REMOVE(&ifnet_detaching_head, ifp, if_detaching_link);
9808 ifp->if_detaching_link.tqe_next = NULL;
9809 ifp->if_detaching_link.tqe_prev = NULL;
9810 }
9811 return ifp;
9812}
9813
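/*
 * Continuation of the detacher thread: drain the detaching queue, calling
 * ifnet_detach_final() on each ifnet with dlil_if_lock dropped, then block
 * again until more work arrives.
 */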
9814__attribute__((noreturn))
9815static void
9816ifnet_detacher_thread_cont(void *v, wait_result_t wres)
9817{
9818#pragma unused(v, wres)
9819 struct ifnet *ifp;
9820
9821 dlil_if_lock();
9822 if (__improbable(ifnet_detaching_embryonic)) {
9823 ifnet_detaching_embryonic = FALSE;
9824 /* there's no lock ordering constraint, so it's OK to do this here */
9825 dlil_decr_pending_thread_count();
9826 }
9827
9828 for (;;) {
9829 dlil_if_lock_assert();
9830
9831 if (ifnet_detaching_cnt == 0) {
9832 break;
9833 }
9834
9835 net_update_uptime();
9836
9837 VERIFY(TAILQ_FIRST(&ifnet_detaching_head) != NULL);
9838
9839 /* Take care of detaching ifnet */
9840 ifp = ifnet_detaching_dequeue();
9841 if (ifp != NULL) {
9842 dlil_if_unlock();
9843 ifnet_detach_final(ifp);
9844 dlil_if_lock();
9845 }
9846 }
9847
9848 (void) assert_wait(event: &ifnet_delayed_run, THREAD_UNINT);
9849 dlil_if_unlock();
9850 (void) thread_block(continuation: ifnet_detacher_thread_cont);
9851
9852 VERIFY(0); /* we should never get here */
9853 /* NOTREACHED */
9854 __builtin_unreachable();
9855}
9856
9857__dead2
9858static void
9859ifnet_detacher_thread_func(void *v, wait_result_t w)
9860{
9861#pragma unused(v, w)
9862 dlil_if_lock();
9863 (void) assert_wait(event: &ifnet_delayed_run, THREAD_UNINT);
9864 ifnet_detaching_embryonic = TRUE;
9865 /* wake up once to get out of embryonic state */
9866 wakeup(chan: (caddr_t)&ifnet_delayed_run);
9867 dlil_if_unlock();
9868 (void) thread_block(continuation: ifnet_detacher_thread_cont);
9869 VERIFY(0);
9870 /* NOTREACHED */
9871 __builtin_unreachable();
9872}
9873
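/*
 * Final stage of interface detach, run from the detacher thread: wait for
 * outstanding I/O references to drain, detach filters and protocols, purge
 * addresses, terminate the starter/poller/input threads, point the ifnet
 * callbacks at harmless local stubs, and finally clear IFRF_DETACHING,
 * invoke the driver's if_free callback and drop the attach reference.
 */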
9874static void
9875ifnet_detach_final(struct ifnet *ifp)
9876{
9877 struct ifnet_filter *filter, *filter_next;
9878 struct dlil_ifnet *dlifp;
9879 struct ifnet_filter_head fhead;
9880 struct dlil_threading_info *inp;
9881 struct ifaddr *ifa;
9882 ifnet_detached_func if_free;
9883 int i;
9884
9885 /* Let BPF know we're detaching */
9886 bpfdetach(ifp);
9887
9888#if SKYWALK
9889 dlil_netif_detach_notify(ifp);
9890 /*
9891 * Wait for the datapath to quiesce before tearing down
9892 * netif/flowswitch nexuses.
9893 */
9894 dlil_quiesce_and_detach_nexuses(ifp);
9895#endif /* SKYWALK */
9896
9897 lck_mtx_lock(lck: &ifp->if_ref_lock);
9898 if (!(ifp->if_refflags & IFRF_DETACHING)) {
9899 panic("%s: flags mismatch (detaching not set) ifp=%p",
9900 __func__, ifp);
9901 /* NOTREACHED */
9902 }
9903
9904 /*
9905 * Wait until the existing IO references get released
9906 * before we proceed with ifnet_detach. This is not a
9907 * common case, so block without using a continuation.
9908 */
9909 while (ifp->if_refio > 0) {
9910 DLIL_PRINTF("%s: Waiting for IO references on %s interface "
9911 "to be released\n", __func__, if_name(ifp));
9912 (void) msleep(chan: &(ifp->if_refio), mtx: &ifp->if_ref_lock,
9913 pri: (PZERO - 1), wmesg: "ifnet_ioref_wait", NULL);
9914 }
9915
9916 VERIFY(ifp->if_datamov == 0);
9917 VERIFY(ifp->if_drainers == 0);
9918 VERIFY(ifp->if_suspend == 0);
9919 ifp->if_refflags &= ~IFRF_READY;
9920 lck_mtx_unlock(lck: &ifp->if_ref_lock);
9921
9922 /* Clear agent IDs */
9923 if (ifp->if_agentids != NULL) {
9924 kfree_data(ifp->if_agentids,
9925 sizeof(uuid_t) * ifp->if_agentcount);
9926 ifp->if_agentids = NULL;
9927 }
9928 ifp->if_agentcount = 0;
9929
9930#if SKYWALK
9931 VERIFY(SLIST_EMPTY(&ifp->if_netns_tokens));
9932#endif /* SKYWALK */
9933 /* Drain and destroy send queue */
9934 ifclassq_teardown(ifp->if_snd);
9935
9936 /* Detach interface filters */
9937 lck_mtx_lock(lck: &ifp->if_flt_lock);
9938 if_flt_monitor_enter(ifp);
9939
9940 LCK_MTX_ASSERT(&ifp->if_flt_lock, LCK_MTX_ASSERT_OWNED);
9941 fhead = ifp->if_flt_head;
9942 TAILQ_INIT(&ifp->if_flt_head);
9943
9944 for (filter = TAILQ_FIRST(&fhead); filter; filter = filter_next) {
9945 filter_next = TAILQ_NEXT(filter, filt_next);
9946 lck_mtx_unlock(lck: &ifp->if_flt_lock);
9947
9948 dlil_detach_filter_internal(filter, detached: 1);
9949 lck_mtx_lock(lck: &ifp->if_flt_lock);
9950 }
9951 if_flt_monitor_leave(ifp);
9952 lck_mtx_unlock(lck: &ifp->if_flt_lock);
9953
9954 /* Tell upper layers to drop their network addresses */
9955 if_purgeaddrs(ifp);
9956
9957 ifnet_lock_exclusive(ifp);
9958
9959 /* Unplumb all protocols */
9960 for (i = 0; i < PROTO_HASH_SLOTS; i++) {
9961 struct if_proto *proto;
9962
9963 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9964 while (proto != NULL) {
9965 protocol_family_t family = proto->protocol_family;
9966 ifnet_lock_done(ifp);
9967 proto_unplumb(protocol_family: family, ifp);
9968 ifnet_lock_exclusive(ifp);
9969 proto = SLIST_FIRST(&ifp->if_proto_hash[i]);
9970 }
9971 /* There should not be any protocols left */
9972 VERIFY(SLIST_EMPTY(&ifp->if_proto_hash[i]));
9973 }
9974 kfree_type(struct proto_hash_entry, PROTO_HASH_SLOTS, ifp->if_proto_hash);
9975 ifp->if_proto_hash = NULL;
9976
9977 /* Detach (permanent) link address from if_addrhead */
9978 ifa = TAILQ_FIRST(&ifp->if_addrhead);
9979 VERIFY(ifnet_addrs[ifp->if_index - 1] == ifa);
9980 IFA_LOCK(ifa);
9981 if_detach_link_ifa(ifp, ifa);
9982 IFA_UNLOCK(ifa);
9983
9984 /* Remove (permanent) link address from ifnet_addrs[] */
9985 ifa_remref(ifa);
9986 ifnet_addrs[ifp->if_index - 1] = NULL;
9987
9988 /* This interface should not be on {ifnet_head,detaching} */
9989 VERIFY(ifp->if_link.tqe_next == NULL);
9990 VERIFY(ifp->if_link.tqe_prev == NULL);
9991 VERIFY(ifp->if_detaching_link.tqe_next == NULL);
9992 VERIFY(ifp->if_detaching_link.tqe_prev == NULL);
9993 VERIFY(ifp->if_ordered_link.tqe_next == NULL);
9994 VERIFY(ifp->if_ordered_link.tqe_prev == NULL);
9995
9996 /* The slot should have been emptied */
9997 VERIFY(ifindex2ifnet[ifp->if_index] == NULL);
9998
9999 /* There should not be any addresses left */
10000 VERIFY(TAILQ_EMPTY(&ifp->if_addrhead));
10001
10002 /*
10003 * Signal the starter thread to terminate itself, and wait until
10004 * it has exited.
10005 */
10006 if (ifp->if_start_thread != THREAD_NULL) {
10007 lck_mtx_lock_spin(lck: &ifp->if_start_lock);
10008 ifp->if_start_flags |= IFSF_TERMINATING;
10009 wakeup_one(chan: (caddr_t)&ifp->if_start_thread);
10010 lck_mtx_unlock(lck: &ifp->if_start_lock);
10011
10012 /* wait for starter thread to terminate */
10013 lck_mtx_lock(lck: &ifp->if_start_lock);
10014 while (ifp->if_start_thread != THREAD_NULL) {
10015 if (dlil_verbose) {
10016 DLIL_PRINTF("%s: waiting for %s starter thread to terminate\n",
10017 __func__,
10018 if_name(ifp));
10019 }
10020 (void) msleep(chan: &ifp->if_start_thread,
10021 mtx: &ifp->if_start_lock, pri: (PZERO - 1),
10022 wmesg: "ifnet_start_thread_exit", NULL);
10023 }
10024 lck_mtx_unlock(lck: &ifp->if_start_lock);
10025 if (dlil_verbose) {
10026 DLIL_PRINTF("%s: %s starter thread termination complete\n",
10027 __func__, if_name(ifp));
10028 }
10029 }
10030
10031 /*
10032 * Signal the poller thread to terminate itself, and wait until
10033 * it has exited.
10034 */
10035 if (ifp->if_poll_thread != THREAD_NULL) {
10036#if SKYWALK
10037 VERIFY(!(ifp->if_eflags & IFEF_SKYWALK_NATIVE));
10038#endif /* SKYWALK */
10039 lck_mtx_lock_spin(lck: &ifp->if_poll_lock);
10040 ifp->if_poll_flags |= IF_POLLF_TERMINATING;
10041 wakeup_one(chan: (caddr_t)&ifp->if_poll_thread);
10042 lck_mtx_unlock(lck: &ifp->if_poll_lock);
10043
10044 /* wait for poller thread to terminate */
10045 lck_mtx_lock(lck: &ifp->if_poll_lock);
10046 while (ifp->if_poll_thread != THREAD_NULL) {
10047 if (dlil_verbose) {
10048 DLIL_PRINTF("%s: waiting for %s poller thread to terminate\n",
10049 __func__,
10050 if_name(ifp));
10051 }
10052 (void) msleep(chan: &ifp->if_poll_thread,
10053 mtx: &ifp->if_poll_lock, pri: (PZERO - 1),
10054 wmesg: "ifnet_poll_thread_exit", NULL);
10055 }
10056 lck_mtx_unlock(lck: &ifp->if_poll_lock);
10057 if (dlil_verbose) {
10058 DLIL_PRINTF("%s: %s poller thread termination complete\n",
10059 __func__, if_name(ifp));
10060 }
10061 }
10062
10063 /*
10064 * If thread affinity was set for the workloop thread, we will need
10065 * to tear down the affinity and release the extra reference count
10066 * taken at attach time. Does not apply to lo0 or other interfaces
10067 * without dedicated input threads.
10068 */
10069 if ((inp = ifp->if_inp) != NULL) {
10070 VERIFY(inp != dlil_main_input_thread);
10071
10072 if (inp->dlth_affinity) {
10073 struct thread *tp, *wtp, *ptp;
10074
10075 lck_mtx_lock_spin(lck: &inp->dlth_lock);
10076 wtp = inp->dlth_driver_thread;
10077 inp->dlth_driver_thread = THREAD_NULL;
10078 ptp = inp->dlth_poller_thread;
10079 inp->dlth_poller_thread = THREAD_NULL;
10080 ASSERT(inp->dlth_thread != THREAD_NULL);
10081 tp = inp->dlth_thread; /* don't nullify now */
10082 inp->dlth_affinity_tag = 0;
10083 inp->dlth_affinity = FALSE;
10084 lck_mtx_unlock(lck: &inp->dlth_lock);
10085
10086 /* Tear down poll thread affinity */
10087 if (ptp != NULL) {
10088 VERIFY(ifp->if_eflags & IFEF_RXPOLL);
10089 VERIFY(ifp->if_xflags & IFXF_LEGACY);
10090 (void) dlil_affinity_set(tp: ptp,
10091 THREAD_AFFINITY_TAG_NULL);
10092 thread_deallocate(thread: ptp);
10093 }
10094
10095 /* Tear down workloop thread affinity */
10096 if (wtp != NULL) {
10097 (void) dlil_affinity_set(tp: wtp,
10098 THREAD_AFFINITY_TAG_NULL);
10099 thread_deallocate(thread: wtp);
10100 }
10101
10102 /* Tear down DLIL input thread affinity */
10103 (void) dlil_affinity_set(tp, THREAD_AFFINITY_TAG_NULL);
10104 thread_deallocate(thread: tp);
10105 }
10106
10107 /* disassociate ifp DLIL input thread */
10108 ifp->if_inp = NULL;
10109
10110 /* if the worker thread was created, tell it to terminate */
10111 if (inp->dlth_thread != THREAD_NULL) {
10112 lck_mtx_lock_spin(lck: &inp->dlth_lock);
10113 inp->dlth_flags |= DLIL_INPUT_TERMINATE;
10114 if (!(inp->dlth_flags & DLIL_INPUT_RUNNING)) {
10115 wakeup_one(chan: (caddr_t)&inp->dlth_flags);
10116 }
10117 lck_mtx_unlock(lck: &inp->dlth_lock);
10118 ifnet_lock_done(ifp);
10119
10120 /* wait for the input thread to terminate */
10121 lck_mtx_lock_spin(lck: &inp->dlth_lock);
10122 while ((inp->dlth_flags & DLIL_INPUT_TERMINATE_COMPLETE)
10123 == 0) {
10124 (void) msleep(chan: &inp->dlth_flags, mtx: &inp->dlth_lock,
10125 pri: (PZERO - 1) | PSPIN, wmesg: inp->dlth_name, NULL);
10126 }
10127 lck_mtx_unlock(lck: &inp->dlth_lock);
10128 ifnet_lock_exclusive(ifp);
10129 }
10130
10131 /* clean-up input thread state */
10132 dlil_clean_threading_info(inp);
10133 /* clean-up poll parameters */
10134 VERIFY(ifp->if_poll_thread == THREAD_NULL);
10135 dlil_reset_rxpoll_params(ifp);
10136 }
10137
10138 /* The driver might unload, so point these to ourselves */
10139 if_free = ifp->if_free;
10140 ifp->if_output_dlil = ifp_if_output;
10141 ifp->if_output = ifp_if_output;
10142 ifp->if_pre_enqueue = ifp_if_output;
10143 ifp->if_start = ifp_if_start;
10144 ifp->if_output_ctl = ifp_if_ctl;
10145 ifp->if_input_dlil = ifp_if_input;
10146 ifp->if_input_poll = ifp_if_input_poll;
10147 ifp->if_input_ctl = ifp_if_ctl;
10148 ifp->if_ioctl = ifp_if_ioctl;
10149 ifp->if_set_bpf_tap = ifp_if_set_bpf_tap;
10150 ifp->if_free = ifp_if_free;
10151 ifp->if_demux = ifp_if_demux;
10152 ifp->if_event = ifp_if_event;
10153 ifp->if_framer_legacy = ifp_if_framer;
10154 ifp->if_framer = ifp_if_framer_extended;
10155 ifp->if_add_proto = ifp_if_add_proto;
10156 ifp->if_del_proto = ifp_if_del_proto;
10157 ifp->if_check_multi = ifp_if_check_multi;
10158
10159 /* wipe out interface description */
10160 VERIFY(ifp->if_desc.ifd_maxlen == IF_DESCSIZE);
10161 ifp->if_desc.ifd_len = 0;
10162 VERIFY(ifp->if_desc.ifd_desc != NULL);
10163 bzero(s: ifp->if_desc.ifd_desc, IF_DESCSIZE);
10164
10165 /* there shouldn't be any delegation by now */
10166 VERIFY(ifp->if_delegated.ifp == NULL);
10167 VERIFY(ifp->if_delegated.type == 0);
10168 VERIFY(ifp->if_delegated.family == 0);
10169 VERIFY(ifp->if_delegated.subfamily == 0);
10170 VERIFY(ifp->if_delegated.expensive == 0);
10171 VERIFY(ifp->if_delegated.constrained == 0);
10172
10173 /* QoS marking gets cleared */
10174 if_clear_eflags(ifp, IFEF_QOSMARKING_ENABLED);
10175 if_set_qosmarking_mode(ifp, IFRTYPE_QOSMARKING_MODE_NONE);
10176
10177#if SKYWALK
10178 /* the nexus destructor is responsible for clearing these */
10179 VERIFY(ifp->if_na_ops == NULL);
10180 VERIFY(ifp->if_na == NULL);
10181#endif /* SKYWALK */
10182
10183 /* promiscuous/allmulti counts need to start at zero again */
10184 ifp->if_pcount = 0;
10185 ifp->if_amcount = 0;
10186 ifp->if_flags &= ~(IFF_PROMISC | IFF_ALLMULTI);
10187
10188 ifnet_lock_done(ifp);
10189
10190#if PF
10191 /*
10192 * Detach this interface from packet filter, if enabled.
10193 */
10194 pf_ifnet_hook(ifp, 0);
10195#endif /* PF */
10196
10197 /* Filter list should be empty */
10198 lck_mtx_lock_spin(lck: &ifp->if_flt_lock);
10199 VERIFY(TAILQ_EMPTY(&ifp->if_flt_head));
10200 VERIFY(ifp->if_flt_busy == 0);
10201 VERIFY(ifp->if_flt_waiters == 0);
10202 VERIFY(ifp->if_flt_non_os_count == 0);
10203 VERIFY(ifp->if_flt_no_tso_count == 0);
10204 lck_mtx_unlock(lck: &ifp->if_flt_lock);
10205
10206 /* Last chance to drain send queue */
10207 if_qflush_snd(ifp, 0);
10208
10209 /* Last chance to cleanup any cached route */
10210 lck_mtx_lock(lck: &ifp->if_cached_route_lock);
10211 VERIFY(!ifp->if_fwd_cacheok);
10212 ROUTE_RELEASE(&ifp->if_fwd_route);
10213 bzero(s: &ifp->if_fwd_route, n: sizeof(ifp->if_fwd_route));
10214 ROUTE_RELEASE(&ifp->if_src_route);
10215 bzero(s: &ifp->if_src_route, n: sizeof(ifp->if_src_route));
10216 ROUTE_RELEASE(&ifp->if_src_route6);
10217 bzero(s: &ifp->if_src_route6, n: sizeof(ifp->if_src_route6));
10218 lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
10219
10220 /* Ignore any pending data threshold as the interface is gone anyway */
10221 ifp->if_data_threshold = 0;
10222
10223 VERIFY(ifp->if_dt_tcall != NULL);
10224 VERIFY(!thread_call_isactive(ifp->if_dt_tcall));
10225
10226 ifnet_llreach_ifdetach(ifp);
10227
10228 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_IF_DETACHED, NULL, event_data_len: 0, FALSE);
10229
10230 /*
10231 * Finally, mark this ifnet as detached.
10232 */
10233 if (dlil_verbose) {
10234 DLIL_PRINTF("%s: detached\n", if_name(ifp));
10235 }
10236 lck_mtx_lock_spin(lck: &ifp->if_ref_lock);
10237 if (!(ifp->if_refflags & IFRF_DETACHING)) {
10238 panic("%s: flags mismatch (detaching not set) ifp=%p",
10239 __func__, ifp);
10240 /* NOTREACHED */
10241 }
10242 ifp->if_refflags &= ~IFRF_DETACHING;
10243 lck_mtx_unlock(lck: &ifp->if_ref_lock);
10244 if (if_free != NULL) {
10245 if_free(ifp);
10246 }
10247
10248 ifclassq_release(&ifp->if_snd);
10249
10250 /* we're fully detached, clear the "in use" bit */
10251 dlifp = (struct dlil_ifnet *)ifp;
10252 lck_mtx_lock(lck: &dlifp->dl_if_lock);
10253 ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
10254 dlifp->dl_if_flags &= ~DLIF_INUSE;
10255 lck_mtx_unlock(lck: &dlifp->dl_if_lock);
10256
10257 /* Release reference held during ifnet attach */
10258 ifnet_release(interface: ifp);
10259}
10260
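/*
 * Default callbacks installed into a detached ifnet (see ifnet_detach_final
 * above).  They keep late callers harmless once the driver may have
 * unloaded: the output and input paths simply free the packets, and the
 * remaining handlers either return an error or do nothing.
 */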
10261errno_t
10262ifp_if_output(struct ifnet *ifp, struct mbuf *m)
10263{
10264#pragma unused(ifp)
10265 m_freem_list(m);
10266 return 0;
10267}
10268
10269void
10270ifp_if_start(struct ifnet *ifp)
10271{
10272 ifnet_purge(ifp);
10273}
10274
10275static errno_t
10276ifp_if_input(struct ifnet *ifp, struct mbuf *m_head,
10277 struct mbuf *m_tail, const struct ifnet_stat_increment_param *s,
10278 boolean_t poll, struct thread *tp)
10279{
10280#pragma unused(ifp, m_tail, s, poll, tp)
10281 m_freem_list(m_head);
10282 return ENXIO;
10283}
10284
10285static void
10286ifp_if_input_poll(struct ifnet *ifp, u_int32_t flags, u_int32_t max_cnt,
10287 struct mbuf **m_head, struct mbuf **m_tail, u_int32_t *cnt, u_int32_t *len)
10288{
10289#pragma unused(ifp, flags, max_cnt)
10290 if (m_head != NULL) {
10291 *m_head = NULL;
10292 }
10293 if (m_tail != NULL) {
10294 *m_tail = NULL;
10295 }
10296 if (cnt != NULL) {
10297 *cnt = 0;
10298 }
10299 if (len != NULL) {
10300 *len = 0;
10301 }
10302}
10303
10304static errno_t
10305ifp_if_ctl(struct ifnet *ifp, ifnet_ctl_cmd_t cmd, u_int32_t arglen, void *arg)
10306{
10307#pragma unused(ifp, cmd, arglen, arg)
10308 return EOPNOTSUPP;
10309}
10310
10311static errno_t
10312ifp_if_demux(struct ifnet *ifp, struct mbuf *m, char *fh, protocol_family_t *pf)
10313{
10314#pragma unused(ifp, fh, pf)
10315 m_freem(m);
10316 return EJUSTRETURN;
10317}
10318
10319static errno_t
10320ifp_if_add_proto(struct ifnet *ifp, protocol_family_t pf,
10321 const struct ifnet_demux_desc *da, u_int32_t dc)
10322{
10323#pragma unused(ifp, pf, da, dc)
10324 return EINVAL;
10325}
10326
10327static errno_t
10328ifp_if_del_proto(struct ifnet *ifp, protocol_family_t pf)
10329{
10330#pragma unused(ifp, pf)
10331 return EINVAL;
10332}
10333
10334static errno_t
10335ifp_if_check_multi(struct ifnet *ifp, const struct sockaddr *sa)
10336{
10337#pragma unused(ifp, sa)
10338 return EOPNOTSUPP;
10339}
10340
10341#if !XNU_TARGET_OS_OSX
10342static errno_t
10343ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
10344 const struct sockaddr *sa, const char *ll, const char *t,
10345 u_int32_t *pre, u_int32_t *post)
10346#else /* XNU_TARGET_OS_OSX */
10347static errno_t
10348ifp_if_framer(struct ifnet *ifp, struct mbuf **m,
10349 const struct sockaddr *sa, const char *ll, const char *t)
10350#endif /* XNU_TARGET_OS_OSX */
10351{
10352#pragma unused(ifp, m, sa, ll, t)
10353#if !XNU_TARGET_OS_OSX
10354 return ifp_if_framer_extended(ifp, m, sa, ll, t, pre, post);
10355#else /* XNU_TARGET_OS_OSX */
10356 return ifp_if_framer_extended(ifp, m, sa, ll, t, NULL, NULL);
10357#endif /* XNU_TARGET_OS_OSX */
10358}
10359
10360static errno_t
10361ifp_if_framer_extended(struct ifnet *ifp, struct mbuf **m,
10362 const struct sockaddr *sa, const char *ll, const char *t,
10363 u_int32_t *pre, u_int32_t *post)
10364{
10365#pragma unused(ifp, sa, ll, t)
10366 m_freem(*m);
10367 *m = NULL;
10368
10369 if (pre != NULL) {
10370 *pre = 0;
10371 }
10372 if (post != NULL) {
10373 *post = 0;
10374 }
10375
10376 return EJUSTRETURN;
10377}
10378
10379errno_t
10380ifp_if_ioctl(struct ifnet *ifp, unsigned long cmd, void *arg)
10381{
10382#pragma unused(ifp, cmd, arg)
10383 return EOPNOTSUPP;
10384}
10385
10386static errno_t
10387ifp_if_set_bpf_tap(struct ifnet *ifp, bpf_tap_mode tm, bpf_packet_func f)
10388{
10389#pragma unused(ifp, tm, f)
10390 /* XXX not sure what to do here */
10391 return 0;
10392}
10393
10394static void
10395ifp_if_free(struct ifnet *ifp)
10396{
10397#pragma unused(ifp)
10398}
10399
10400static void
10401ifp_if_event(struct ifnet *ifp, const struct kev_msg *e)
10402{
10403#pragma unused(ifp, e)
10404}
10405
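/*
 * Find or allocate the dlil_ifnet storage for an interface of the given
 * family: reuse a previously detached instance with a matching unique id
 * (marking it DLIF_REUSE), or carve a fresh, 64-bit-aligned dlil_ifnet out
 * of dlif_zone and initialize its locks and thread call.  Returns EBUSY if
 * an in-use interface with the same extended name or unique id exists.
 */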
10406int
10407dlil_if_acquire(u_int32_t family, const void *uniqueid,
10408 size_t uniqueid_len, const char *ifxname, struct ifnet **ifp)
10409{
10410 struct ifnet *ifp1 = NULL;
10411 struct dlil_ifnet *dlifp1 = NULL;
10412 struct dlil_ifnet *dlifp1_saved = NULL;
10413 void *buf, *base, **pbuf;
10414 int ret = 0;
10415
10416 VERIFY(*ifp == NULL);
10417 dlil_if_lock();
10418 /*
10419 * We absolutely can't have an interface with the same name
10420 * in the in-use state.
10421 * To make sure of that, the list has to be traversed completely.
10422 */
10423 TAILQ_FOREACH(dlifp1, &dlil_ifnet_head, dl_if_link) {
10424 ifp1 = (struct ifnet *)dlifp1;
10425
10426 if (ifp1->if_family != family) {
10427 continue;
10428 }
10429
10430 /*
10431 * If the interface is in use, return EBUSY if either the unique id
10432 * or the interface extended name is the same.
10433 */
10434 lck_mtx_lock(lck: &dlifp1->dl_if_lock);
10435 if (strncmp(s1: ifxname, s2: ifp1->if_xname, IFXNAMSIZ) == 0 &&
10436 (dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
10437 lck_mtx_unlock(lck: &dlifp1->dl_if_lock);
10438 ret = EBUSY;
10439 goto end;
10440 }
10441
10442 if (uniqueid_len != 0 &&
10443 uniqueid_len == dlifp1->dl_if_uniqueid_len &&
10444 bcmp(s1: uniqueid, s2: dlifp1->dl_if_uniqueid, n: uniqueid_len) == 0) {
10445 if ((dlifp1->dl_if_flags & DLIF_INUSE) != 0) {
10446 lck_mtx_unlock(lck: &dlifp1->dl_if_lock);
10447 ret = EBUSY;
10448 goto end;
10449 }
10450 if (dlifp1_saved == NULL) {
10451 /* cache the first match */
10452 dlifp1_saved = dlifp1;
10453 }
10454 /*
10455 * Do not break or jump to end as we have to traverse
10456 * the whole list to ensure there are no name collisions
10457 */
10458 }
10459 lck_mtx_unlock(lck: &dlifp1->dl_if_lock);
10460 }
10461
10462 /* If there's an interface that can be recycled, use that */
10463 if (dlifp1_saved != NULL) {
10464 lck_mtx_lock(lck: &dlifp1_saved->dl_if_lock);
10465 if ((dlifp1_saved->dl_if_flags & DLIF_INUSE) != 0) {
10466 /* some other thread got in ahead of us */
10467 lck_mtx_unlock(lck: &dlifp1_saved->dl_if_lock);
10468 ret = EBUSY;
10469 goto end;
10470 }
10471 dlifp1_saved->dl_if_flags |= (DLIF_INUSE | DLIF_REUSE);
10472 lck_mtx_unlock(lck: &dlifp1_saved->dl_if_lock);
10473 *ifp = (struct ifnet *)dlifp1_saved;
10474 dlil_if_ref(ifp: *ifp);
10475 goto end;
10476 }
10477
10478 /* no interface found, allocate a new one */
10479 buf = zalloc_flags(dlif_zone, Z_WAITOK | Z_ZERO | Z_NOFAIL);
10480
10481 /* Get the 64-bit aligned base address for this object */
10482 base = (void *)P2ROUNDUP((intptr_t)buf + sizeof(u_int64_t),
10483 sizeof(u_int64_t));
10484 VERIFY(((intptr_t)base + dlif_size) <= ((intptr_t)buf + dlif_bufsize));
10485
10486 /*
10487 * Wind back a pointer size from the aligned base and
10488 * save the original address so we can free it later.
10489 */
10490 pbuf = (void **)((intptr_t)base - sizeof(void *));
10491 *pbuf = buf;
10492 dlifp1 = base;
10493
10494 if (uniqueid_len) {
10495 dlifp1->dl_if_uniqueid = kalloc_data(uniqueid_len,
10496 Z_WAITOK);
10497 if (dlifp1->dl_if_uniqueid == NULL) {
10498 zfree(dlif_zone, buf);
10499 ret = ENOMEM;
10500 goto end;
10501 }
10502 bcopy(src: uniqueid, dst: dlifp1->dl_if_uniqueid, n: uniqueid_len);
10503 dlifp1->dl_if_uniqueid_len = uniqueid_len;
10504 }
10505
10506 ifp1 = (struct ifnet *)dlifp1;
10507 dlifp1->dl_if_flags = DLIF_INUSE;
10508 if (ifnet_debug) {
10509 dlifp1->dl_if_flags |= DLIF_DEBUG;
10510 dlifp1->dl_if_trace = dlil_if_trace;
10511 }
10512 ifp1->if_name = dlifp1->dl_if_namestorage;
10513 ifp1->if_xname = dlifp1->dl_if_xnamestorage;
10514
10515 /* initialize interface description */
10516 ifp1->if_desc.ifd_maxlen = IF_DESCSIZE;
10517 ifp1->if_desc.ifd_len = 0;
10518 ifp1->if_desc.ifd_desc = dlifp1->dl_if_descstorage;
10519
10520#if SKYWALK
10521 SLIST_INIT(&ifp1->if_netns_tokens);
10522#endif /* SKYWALK */
10523
10524 if ((ret = dlil_alloc_local_stats(ifp: ifp1)) != 0) {
10525 DLIL_PRINTF("%s: failed to allocate if local stats, "
10526 "error: %d\n", __func__, ret);
10527 /* This probably shouldn't be fatal */
10528 ret = 0;
10529 }
10530
10531 lck_mtx_init(lck: &dlifp1->dl_if_lock, grp: &ifnet_lock_group, attr: &ifnet_lock_attr);
10532 lck_rw_init(lck: &ifp1->if_lock, grp: &ifnet_lock_group, attr: &ifnet_lock_attr);
10533 lck_mtx_init(lck: &ifp1->if_ref_lock, grp: &ifnet_lock_group, attr: &ifnet_lock_attr);
10534 lck_mtx_init(lck: &ifp1->if_flt_lock, grp: &ifnet_lock_group, attr: &ifnet_lock_attr);
10535 lck_mtx_init(lck: &ifp1->if_addrconfig_lock, grp: &ifnet_lock_group,
10536 attr: &ifnet_lock_attr);
10537 lck_rw_init(lck: &ifp1->if_llreach_lock, grp: &ifnet_lock_group, attr: &ifnet_lock_attr);
10538#if INET
10539 lck_rw_init(lck: &ifp1->if_inetdata_lock, grp: &ifnet_lock_group,
10540 attr: &ifnet_lock_attr);
10541 ifp1->if_inetdata = NULL;
10542#endif
10543 lck_mtx_init(lck: &ifp1->if_inet6_ioctl_lock, grp: &ifnet_lock_group, attr: &ifnet_lock_attr);
10544 ifp1->if_inet6_ioctl_busy = FALSE;
10545 lck_rw_init(lck: &ifp1->if_inet6data_lock, grp: &ifnet_lock_group,
10546 attr: &ifnet_lock_attr);
10547 ifp1->if_inet6data = NULL;
10548 lck_rw_init(lck: &ifp1->if_link_status_lock, grp: &ifnet_lock_group,
10549 attr: &ifnet_lock_attr);
10550 ifp1->if_link_status = NULL;
10551 lck_mtx_init(lck: &ifp1->if_delegate_lock, grp: &ifnet_lock_group, attr: &ifnet_lock_attr);
10552
10553 /* for send data paths */
10554 lck_mtx_init(lck: &ifp1->if_start_lock, grp: &ifnet_snd_lock_group,
10555 attr: &ifnet_lock_attr);
10556 lck_mtx_init(lck: &ifp1->if_cached_route_lock, grp: &ifnet_snd_lock_group,
10557 attr: &ifnet_lock_attr);
10558
10559 /* for receive data paths */
10560 lck_mtx_init(lck: &ifp1->if_poll_lock, grp: &ifnet_rcv_lock_group,
10561 attr: &ifnet_lock_attr);
10562
10563 /* thread call allocation is done with sleeping zalloc */
10564 ifp1->if_dt_tcall = thread_call_allocate_with_options(func: dlil_dt_tcall_fn,
10565 param0: ifp1, pri: THREAD_CALL_PRIORITY_KERNEL, options: THREAD_CALL_OPTIONS_ONCE);
10566 if (ifp1->if_dt_tcall == NULL) {
10567 panic_plain("%s: couldn't create if_dt_tcall", __func__);
10568 /* NOTREACHED */
10569 }
10570
10571 TAILQ_INSERT_TAIL(&dlil_ifnet_head, dlifp1, dl_if_link);
10572
10573 *ifp = ifp1;
10574 dlil_if_ref(ifp: *ifp);
10575
10576end:
10577 dlil_if_unlock();
10578
10579 VERIFY(dlifp1 == NULL || (IS_P2ALIGNED(dlifp1, sizeof(u_int64_t)) &&
10580 IS_P2ALIGNED(&ifp1->if_data, sizeof(u_int64_t))));
10581
10582 return ret;
10583}
10584
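/*
 * Return the dlil_ifnet storage to a reusable state: drop the allocation
 * counters, free an oversized broadcast address buffer if one was
 * allocated, point if_name/if_xname back at the embedded storage, and
 * optionally clear the DLIF_INUSE flag.
 */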
10585static void
10586_dlil_if_release(ifnet_t ifp, bool clear_in_use)
10587{
10588 struct dlil_ifnet *dlifp = (struct dlil_ifnet *)ifp;
10589
10590 VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_count) > 0);
10591 if (!(ifp->if_xflags & IFXF_ALLOC_KPI)) {
10592 VERIFY(OSDecrementAtomic64(&net_api_stats.nas_ifnet_alloc_os_count) > 0);
10593 }
10594
10595 ifnet_lock_exclusive(ifp);
10596 if (ifp->if_broadcast.length > sizeof(ifp->if_broadcast.u.buffer)) {
10597 kfree_data(ifp->if_broadcast.u.ptr, ifp->if_broadcast.length);
10598 ifp->if_broadcast.length = 0;
10599 ifp->if_broadcast.u.ptr = NULL;
10600 }
10601 lck_mtx_lock(lck: &dlifp->dl_if_lock);
10602 strlcpy(dst: dlifp->dl_if_namestorage, src: ifp->if_name, IFNAMSIZ);
10603 ifp->if_name = dlifp->dl_if_namestorage;
10604 /* Reset external name (name + unit) */
10605 ifp->if_xname = dlifp->dl_if_xnamestorage;
10606 snprintf(__DECONST(char *, ifp->if_xname), IFXNAMSIZ,
10607 "%s?", ifp->if_name);
10608 if (clear_in_use) {
10609 ASSERT((dlifp->dl_if_flags & DLIF_INUSE) != 0);
10610 dlifp->dl_if_flags &= ~DLIF_INUSE;
10611 }
10612 lck_mtx_unlock(lck: &dlifp->dl_if_lock);
10613 ifnet_lock_done(ifp);
10614}
10615
10616__private_extern__ void
10617dlil_if_release(ifnet_t ifp)
10618{
10619 _dlil_if_release(ifp, false);
10620}
10621
10622__private_extern__ void
10623dlil_if_lock(void)
10624{
10625 lck_mtx_lock(lck: &dlil_ifnet_lock);
10626}
10627
10628__private_extern__ void
10629dlil_if_unlock(void)
10630{
10631 lck_mtx_unlock(lck: &dlil_ifnet_lock);
10632}
10633
10634__private_extern__ void
10635dlil_if_lock_assert(void)
10636{
10637 LCK_MTX_ASSERT(&dlil_ifnet_lock, LCK_MTX_ASSERT_OWNED);
10638}
10639
10640__private_extern__ void
10641dlil_proto_unplumb_all(struct ifnet *ifp)
10642{
10643 /*
10644 * if_proto_hash[0-2] are for PF_INET, PF_INET6 and PF_VLAN, where
10645 * each bucket contains exactly one entry; PF_VLAN does not need an
10646 * explicit unplumb.
10647 *
10648 * if_proto_hash[3] is for other protocols; we expect anything
10649 * in this bucket to respond to the DETACHING event (which would
10650 * have happened by now) and do the unplumb then.
10651 */
10652 (void) proto_unplumb(PF_INET, ifp);
10653 (void) proto_unplumb(PF_INET6, ifp);
10654}
10655
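/*
 * Helpers for the per-interface cached source routes: the copyout routines
 * take a snapshot of if_src_route/if_src_route6 under if_cached_route_lock,
 * while the copyin routines store an updated route back only while
 * if_fwd_cacheok is set, and release it otherwise.
 */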
10656static void
10657ifp_src_route_copyout(struct ifnet *ifp, struct route *dst)
10658{
10659 lck_mtx_lock_spin(lck: &ifp->if_cached_route_lock);
10660 lck_mtx_convert_spin(lck: &ifp->if_cached_route_lock);
10661
10662 route_copyout(dst, &ifp->if_src_route, sizeof(*dst));
10663
10664 lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
10665}
10666
10667static void
10668ifp_src_route_copyin(struct ifnet *ifp, struct route *src)
10669{
10670 lck_mtx_lock_spin(lck: &ifp->if_cached_route_lock);
10671 lck_mtx_convert_spin(lck: &ifp->if_cached_route_lock);
10672
10673 if (ifp->if_fwd_cacheok) {
10674 route_copyin(src, &ifp->if_src_route, sizeof(*src));
10675 } else {
10676 ROUTE_RELEASE(src);
10677 }
10678 lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
10679}
10680
10681static void
10682ifp_src_route6_copyout(struct ifnet *ifp, struct route_in6 *dst)
10683{
10684 lck_mtx_lock_spin(lck: &ifp->if_cached_route_lock);
10685 lck_mtx_convert_spin(lck: &ifp->if_cached_route_lock);
10686
10687 route_copyout((struct route *)dst, (struct route *)&ifp->if_src_route6,
10688 sizeof(*dst));
10689
10690 lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
10691}
10692
10693static void
10694ifp_src_route6_copyin(struct ifnet *ifp, struct route_in6 *src)
10695{
10696 lck_mtx_lock_spin(lck: &ifp->if_cached_route_lock);
10697 lck_mtx_convert_spin(lck: &ifp->if_cached_route_lock);
10698
10699 if (ifp->if_fwd_cacheok) {
10700 route_copyin((struct route *)src,
10701 (struct route *)&ifp->if_src_route6, sizeof(*src));
10702 } else {
10703 ROUTE_RELEASE(src);
10704 }
10705 lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
10706}
10707
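/*
 * Return a scoped route to src_ip on this interface, reusing the cached
 * if_src_route when it is still usable and matches the address; otherwise
 * perform a scoped lookup and refresh the cache.  The caller receives a
 * referenced rtentry (or NULL).
 */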
10708struct rtentry *
10709ifnet_cached_rtlookup_inet(struct ifnet *ifp, struct in_addr src_ip)
10710{
10711 struct route src_rt;
10712 struct sockaddr_in *dst;
10713
10714 dst = SIN(&src_rt.ro_dst);
10715
10716 ifp_src_route_copyout(ifp, dst: &src_rt);
10717
10718 if (ROUTE_UNUSABLE(&src_rt) || src_ip.s_addr != dst->sin_addr.s_addr) {
10719 ROUTE_RELEASE(&src_rt);
10720 if (dst->sin_family != AF_INET) {
10721 SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
10722 dst->sin_len = sizeof(src_rt.ro_dst);
10723 dst->sin_family = AF_INET;
10724 }
10725 dst->sin_addr = src_ip;
10726
10727 VERIFY(src_rt.ro_rt == NULL);
10728 src_rt.ro_rt = rtalloc1_scoped(SA(dst),
10729 0, 0, ifp->if_index);
10730
10731 if (src_rt.ro_rt != NULL) {
10732 /* retain a ref, copyin consumes one */
10733 struct rtentry *rte = src_rt.ro_rt;
10734 RT_ADDREF(rte);
10735 ifp_src_route_copyin(ifp, src: &src_rt);
10736 src_rt.ro_rt = rte;
10737 }
10738 }
10739
10740 return src_rt.ro_rt;
10741}
10742
10743struct rtentry *
10744ifnet_cached_rtlookup_inet6(struct ifnet *ifp, struct in6_addr *src_ip6)
10745{
10746 struct route_in6 src_rt;
10747
	ifp_src_route6_copyout(ifp, &src_rt);
10749
10750 if (ROUTE_UNUSABLE(&src_rt) ||
10751 !IN6_ARE_ADDR_EQUAL(src_ip6, &src_rt.ro_dst.sin6_addr)) {
10752 ROUTE_RELEASE(&src_rt);
10753 if (src_rt.ro_dst.sin6_family != AF_INET6) {
10754 SOCKADDR_ZERO(&src_rt.ro_dst, sizeof(src_rt.ro_dst));
10755 src_rt.ro_dst.sin6_len = sizeof(src_rt.ro_dst);
10756 src_rt.ro_dst.sin6_family = AF_INET6;
10757 }
10758 src_rt.ro_dst.sin6_scope_id = in6_addr2scopeid(ifp, src_ip6);
		bcopy(src_ip6, &src_rt.ro_dst.sin6_addr,
		    sizeof(src_rt.ro_dst.sin6_addr));
10761
10762 if (src_rt.ro_rt == NULL) {
10763 src_rt.ro_rt = rtalloc1_scoped(
10764 SA(&src_rt.ro_dst), 0, 0,
10765 ifp->if_index);
10766
10767 if (src_rt.ro_rt != NULL) {
10768 /* retain a ref, copyin consumes one */
10769 struct rtentry *rte = src_rt.ro_rt;
10770 RT_ADDREF(rte);
				ifp_src_route6_copyin(ifp, &src_rt);
10772 src_rt.ro_rt = rte;
10773 }
10774 }
10775 }
10776
10777 return src_rt.ro_rt;
10778}
10779
10780void
10781if_lqm_update(struct ifnet *ifp, int lqm, int locked)
10782{
10783 struct kev_dl_link_quality_metric_data ev_lqm_data;
10784
10785 VERIFY(lqm >= IFNET_LQM_MIN && lqm <= IFNET_LQM_MAX);
10786
10787 /* Normalize to edge */
10788 if (lqm >= 0 && lqm <= IFNET_LQM_THRESH_ABORT) {
10789 lqm = IFNET_LQM_THRESH_ABORT;
10790 os_atomic_or(&tcbinfo.ipi_flags, INPCBINFO_HANDLE_LQM_ABORT, relaxed);
		inpcb_timer_sched(&tcbinfo, INPCB_TIMER_FAST);
10792 } else if (lqm > IFNET_LQM_THRESH_ABORT &&
10793 lqm <= IFNET_LQM_THRESH_MINIMALLY_VIABLE) {
10794 lqm = IFNET_LQM_THRESH_MINIMALLY_VIABLE;
10795 } else if (lqm > IFNET_LQM_THRESH_MINIMALLY_VIABLE &&
10796 lqm <= IFNET_LQM_THRESH_POOR) {
10797 lqm = IFNET_LQM_THRESH_POOR;
10798 } else if (lqm > IFNET_LQM_THRESH_POOR &&
10799 lqm <= IFNET_LQM_THRESH_GOOD) {
10800 lqm = IFNET_LQM_THRESH_GOOD;
10801 }
10802
10803 /*
10804 * Take the lock if needed
10805 */
10806 if (!locked) {
10807 ifnet_lock_exclusive(ifp);
10808 }
10809
10810 if (lqm == ifp->if_interface_state.lqm_state &&
10811 (ifp->if_interface_state.valid_bitmask &
10812 IF_INTERFACE_STATE_LQM_STATE_VALID)) {
10813 /*
10814 * Release the lock if was not held by the caller
10815 */
10816 if (!locked) {
10817 ifnet_lock_done(ifp);
10818 }
10819 return; /* nothing to update */
10820 }
10821 ifp->if_interface_state.valid_bitmask |=
10822 IF_INTERFACE_STATE_LQM_STATE_VALID;
10823 ifp->if_interface_state.lqm_state = (int8_t)lqm;
10824
10825 /*
10826 * Don't want to hold the lock when issuing kernel events
10827 */
10828 ifnet_lock_done(ifp);
10829
	bzero(&ev_lqm_data, sizeof(ev_lqm_data));
10831 ev_lqm_data.link_quality_metric = lqm;
10832
10833 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_LINK_QUALITY_METRIC_CHANGED,
	    (struct net_event_data *)&ev_lqm_data, sizeof(ev_lqm_data), FALSE);
10835
10836 /*
10837 * Reacquire the lock for the caller
10838 */
10839 if (locked) {
10840 ifnet_lock_exclusive(ifp);
10841 }
10842}
10843
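/*
 * Update the interface's RRC (radio resource control) state.  Called with
 * the ifnet lock held exclusively; the lock is dropped while the kernel
 * event is posted and reacquired before returning.
 */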
10844static void
10845if_rrc_state_update(struct ifnet *ifp, unsigned int rrc_state)
10846{
10847 struct kev_dl_rrc_state kev;
10848
10849 if (rrc_state == ifp->if_interface_state.rrc_state &&
10850 (ifp->if_interface_state.valid_bitmask &
10851 IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10852 return;
10853 }
10854
10855 ifp->if_interface_state.valid_bitmask |=
10856 IF_INTERFACE_STATE_RRC_STATE_VALID;
10857
10858 ifp->if_interface_state.rrc_state = (uint8_t)rrc_state;
10859
10860 /*
10861 * Don't want to hold the lock when issuing kernel events
10862 */
10863 ifnet_lock_done(ifp);
10864
	bzero(&kev, sizeof(struct kev_dl_rrc_state));
10866 kev.rrc_state = rrc_state;
10867
10868 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_RRC_STATE_CHANGED,
	    (struct net_event_data *)&kev, sizeof(struct kev_dl_rrc_state), FALSE);
10870
10871 ifnet_lock_exclusive(ifp);
10872}
10873
10874errno_t
10875if_state_update(struct ifnet *ifp,
10876 struct if_interface_state *if_interface_state)
10877{
10878 u_short if_index_available = 0;
10879
10880 ifnet_lock_exclusive(ifp);
10881
10882 if ((ifp->if_type != IFT_CELLULAR) &&
10883 (if_interface_state->valid_bitmask &
10884 IF_INTERFACE_STATE_RRC_STATE_VALID)) {
10885 ifnet_lock_done(ifp);
10886 return ENOTSUP;
10887 }
10888 if ((if_interface_state->valid_bitmask &
10889 IF_INTERFACE_STATE_LQM_STATE_VALID) &&
10890 (if_interface_state->lqm_state < IFNET_LQM_MIN ||
10891 if_interface_state->lqm_state > IFNET_LQM_MAX)) {
10892 ifnet_lock_done(ifp);
10893 return EINVAL;
10894 }
10895 if ((if_interface_state->valid_bitmask &
10896 IF_INTERFACE_STATE_RRC_STATE_VALID) &&
10897 if_interface_state->rrc_state !=
10898 IF_INTERFACE_STATE_RRC_STATE_IDLE &&
10899 if_interface_state->rrc_state !=
10900 IF_INTERFACE_STATE_RRC_STATE_CONNECTED) {
10901 ifnet_lock_done(ifp);
10902 return EINVAL;
10903 }
10904
10905 if (if_interface_state->valid_bitmask &
10906 IF_INTERFACE_STATE_LQM_STATE_VALID) {
		if_lqm_update(ifp, if_interface_state->lqm_state, 1);
10908 }
10909 if (if_interface_state->valid_bitmask &
10910 IF_INTERFACE_STATE_RRC_STATE_VALID) {
		if_rrc_state_update(ifp, if_interface_state->rrc_state);
10912 }
10913 if (if_interface_state->valid_bitmask &
10914 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10915 ifp->if_interface_state.valid_bitmask |=
10916 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10917 ifp->if_interface_state.interface_availability =
10918 if_interface_state->interface_availability;
10919
10920 if (ifp->if_interface_state.interface_availability ==
10921 IF_INTERFACE_STATE_INTERFACE_AVAILABLE) {
10922 os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) available\n",
10923 __func__, if_name(ifp), ifp->if_index);
10924 if_index_available = ifp->if_index;
10925 } else {
			os_log(OS_LOG_DEFAULT, "%s: interface %s (%u) unavailable\n",
10927 __func__, if_name(ifp), ifp->if_index);
10928 }
10929 }
10930 ifnet_lock_done(ifp);
10931
10932 /*
10933 * Check if the TCP connections going on this interface should be
10934 * forced to send probe packets instead of waiting for TCP timers
10935 * to fire. This is done on an explicit notification such as
10936 * SIOCSIFINTERFACESTATE which marks the interface as available.
10937 */
10938 if (if_index_available > 0) {
10939 tcp_interface_send_probe(if_index_available);
10940 }
10941
10942 return 0;
10943}
10944
10945void
10946if_get_state(struct ifnet *ifp,
10947 struct if_interface_state *if_interface_state)
10948{
10949 ifnet_lock_shared(ifp);
10950
10951 if_interface_state->valid_bitmask = 0;
10952
10953 if (ifp->if_interface_state.valid_bitmask &
10954 IF_INTERFACE_STATE_RRC_STATE_VALID) {
10955 if_interface_state->valid_bitmask |=
10956 IF_INTERFACE_STATE_RRC_STATE_VALID;
10957 if_interface_state->rrc_state =
10958 ifp->if_interface_state.rrc_state;
10959 }
10960 if (ifp->if_interface_state.valid_bitmask &
10961 IF_INTERFACE_STATE_LQM_STATE_VALID) {
10962 if_interface_state->valid_bitmask |=
10963 IF_INTERFACE_STATE_LQM_STATE_VALID;
10964 if_interface_state->lqm_state =
10965 ifp->if_interface_state.lqm_state;
10966 }
10967 if (ifp->if_interface_state.valid_bitmask &
10968 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID) {
10969 if_interface_state->valid_bitmask |=
10970 IF_INTERFACE_STATE_INTERFACE_AVAILABILITY_VALID;
10971 if_interface_state->interface_availability =
10972 ifp->if_interface_state.interface_availability;
10973 }
10974
10975 ifnet_lock_done(ifp);
10976}
10977
10978errno_t
10979if_probe_connectivity(struct ifnet *ifp, u_int32_t conn_probe)
10980{
10981 if (conn_probe > 1) {
10982 return EINVAL;
10983 }
10984 if (conn_probe == 0) {
10985 if_clear_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10986 } else {
10987 if_set_eflags(ifp, IFEF_PROBE_CONNECTIVITY);
10988 }
10989
10990#if NECP
10991 necp_update_all_clients();
10992#endif /* NECP */
10993
	tcp_probe_connectivity(ifp, conn_probe);
10995 return 0;
10996}
10997
10998/* for uuid.c */
10999static int
11000get_ether_index(int * ret_other_index)
11001{
11002 struct ifnet *ifp;
11003 int en0_index = 0;
11004 int other_en_index = 0;
11005 int any_ether_index = 0;
11006 short best_unit = 0;
11007
11008 *ret_other_index = 0;
11009 TAILQ_FOREACH(ifp, &ifnet_head, if_link) {
11010 /*
11011 * find en0, or if not en0, the lowest unit en*, and if not
11012 * that, any ethernet
11013 */
11014 ifnet_lock_shared(ifp);
		if (strcmp(ifp->if_name, "en") == 0) {
11016 if (ifp->if_unit == 0) {
11017 /* found en0, we're done */
11018 en0_index = ifp->if_index;
11019 ifnet_lock_done(ifp);
11020 break;
11021 }
11022 if (other_en_index == 0 || ifp->if_unit < best_unit) {
11023 other_en_index = ifp->if_index;
11024 best_unit = ifp->if_unit;
11025 }
11026 } else if (ifp->if_type == IFT_ETHER && any_ether_index == 0) {
11027 any_ether_index = ifp->if_index;
11028 }
11029 ifnet_lock_done(ifp);
11030 }
11031 if (en0_index == 0) {
11032 if (other_en_index != 0) {
11033 *ret_other_index = other_en_index;
11034 } else if (any_ether_index != 0) {
11035 *ret_other_index = any_ether_index;
11036 }
11037 }
11038 return en0_index;
11039}
11040
11041int
11042uuid_get_ethernet(u_int8_t *node)
11043{
11044 static int en0_index;
11045 struct ifnet *ifp;
11046 int other_index = 0;
11047 int the_index = 0;
11048 int ret;
11049
11050 ifnet_head_lock_shared();
11051 if (en0_index == 0 || ifindex2ifnet[en0_index] == NULL) {
		en0_index = get_ether_index(&other_index);
11053 }
11054 if (en0_index != 0) {
11055 the_index = en0_index;
11056 } else if (other_index != 0) {
11057 the_index = other_index;
11058 }
11059 if (the_index != 0) {
11060 struct dlil_ifnet *dl_if;
11061
11062 ifp = ifindex2ifnet[the_index];
11063 VERIFY(ifp != NULL);
11064 dl_if = (struct dlil_ifnet *)ifp;
11065 if (dl_if->dl_if_permanent_ether_is_set != 0) {
11066 /*
11067 * Use the permanent ethernet address if it is
11068 * available because it will never change.
11069 */
			memcpy(node, dl_if->dl_if_permanent_ether,
			    ETHER_ADDR_LEN);
11072 } else {
			memcpy(node, IF_LLADDR(ifp), ETHER_ADDR_LEN);
11074 }
11075 ret = 0;
11076 } else {
11077 ret = -1;
11078 }
11079 ifnet_head_done();
11080 return ret;
11081}
11082
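/*
 * Sysctl handlers for the receive polling (rxpoll) and queue length
 * tunables.  Each handler reads the current value, lets sysctl update it,
 * then validates or clamps the new value before storing it.
 */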
11083static int
11084sysctl_rxpoll SYSCTL_HANDLER_ARGS
11085{
11086#pragma unused(arg1, arg2)
11087 uint32_t i;
11088 int err;
11089
11090 i = if_rxpoll;
11091
	err = sysctl_handle_int(oidp, &i, 0, req);
11093 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11094 return err;
11095 }
11096
11097 if (net_rxpoll == 0) {
11098 return ENXIO;
11099 }
11100
11101 if_rxpoll = i;
11102 return err;
11103}
11104
11105static int
11106sysctl_rxpoll_mode_holdtime SYSCTL_HANDLER_ARGS
11107{
11108#pragma unused(arg1, arg2)
11109 uint64_t q;
11110 int err;
11111
11112 q = if_rxpoll_mode_holdtime;
11113
	err = sysctl_handle_quad(oidp, &q, 0, req);
11115 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11116 return err;
11117 }
11118
11119 if (q < IF_RXPOLL_MODE_HOLDTIME_MIN) {
11120 q = IF_RXPOLL_MODE_HOLDTIME_MIN;
11121 }
11122
11123 if_rxpoll_mode_holdtime = q;
11124
11125 return err;
11126}
11127
11128static int
11129sysctl_rxpoll_sample_holdtime SYSCTL_HANDLER_ARGS
11130{
11131#pragma unused(arg1, arg2)
11132 uint64_t q;
11133 int err;
11134
11135 q = if_rxpoll_sample_holdtime;
11136
	err = sysctl_handle_quad(oidp, &q, 0, req);
11138 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11139 return err;
11140 }
11141
11142 if (q < IF_RXPOLL_SAMPLETIME_MIN) {
11143 q = IF_RXPOLL_SAMPLETIME_MIN;
11144 }
11145
11146 if_rxpoll_sample_holdtime = q;
11147
11148 return err;
11149}
11150
11151static int
11152sysctl_rxpoll_interval_time SYSCTL_HANDLER_ARGS
11153{
11154#pragma unused(arg1, arg2)
11155 uint64_t q;
11156 int err;
11157
11158 q = if_rxpoll_interval_time;
11159
	err = sysctl_handle_quad(oidp, &q, 0, req);
11161 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11162 return err;
11163 }
11164
11165 if (q < IF_RXPOLL_INTERVALTIME_MIN) {
11166 q = IF_RXPOLL_INTERVALTIME_MIN;
11167 }
11168
11169 if_rxpoll_interval_time = q;
11170
11171 return err;
11172}
11173
11174static int
11175sysctl_rxpoll_wlowat SYSCTL_HANDLER_ARGS
11176{
11177#pragma unused(arg1, arg2)
11178 uint32_t i;
11179 int err;
11180
11181 i = if_sysctl_rxpoll_wlowat;
11182
	err = sysctl_handle_int(oidp, &i, 0, req);
11184 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11185 return err;
11186 }
11187
11188 if (i == 0 || i >= if_sysctl_rxpoll_whiwat) {
11189 return EINVAL;
11190 }
11191
11192 if_sysctl_rxpoll_wlowat = i;
11193 return err;
11194}
11195
11196static int
11197sysctl_rxpoll_whiwat SYSCTL_HANDLER_ARGS
11198{
11199#pragma unused(arg1, arg2)
11200 uint32_t i;
11201 int err;
11202
11203 i = if_sysctl_rxpoll_whiwat;
11204
	err = sysctl_handle_int(oidp, &i, 0, req);
11206 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11207 return err;
11208 }
11209
11210 if (i <= if_sysctl_rxpoll_wlowat) {
11211 return EINVAL;
11212 }
11213
11214 if_sysctl_rxpoll_whiwat = i;
11215 return err;
11216}
11217
11218static int
11219sysctl_sndq_maxlen SYSCTL_HANDLER_ARGS
11220{
11221#pragma unused(arg1, arg2)
11222 int i, err;
11223
11224 i = if_sndq_maxlen;
11225
	err = sysctl_handle_int(oidp, &i, 0, req);
11227 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11228 return err;
11229 }
11230
11231 if (i < IF_SNDQ_MINLEN) {
11232 i = IF_SNDQ_MINLEN;
11233 }
11234
11235 if_sndq_maxlen = i;
11236 return err;
11237}
11238
11239static int
11240sysctl_rcvq_maxlen SYSCTL_HANDLER_ARGS
11241{
11242#pragma unused(arg1, arg2)
11243 int i, err;
11244
11245 i = if_rcvq_maxlen;
11246
	err = sysctl_handle_int(oidp, &i, 0, req);
11248 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11249 return err;
11250 }
11251
11252 if (i < IF_RCVQ_MINLEN) {
11253 i = IF_RCVQ_MINLEN;
11254 }
11255
11256 if_rcvq_maxlen = i;
11257 return err;
11258}
11259
11260static int
11261sysctl_rcvq_burst_limit SYSCTL_HANDLER_ARGS
11262{
11263#pragma unused(arg1, arg2)
11264 int i, err;
11265
11266 i = if_rcvq_burst_limit;
11267
	err = sysctl_handle_int(oidp, &i, 0, req);
11269 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11270 return err;
11271 }
11272
11273/*
11274 * Safeguard the burst limit to "sane" values on customer builds.
11275 */
11276#if !(DEVELOPMENT || DEBUG)
11277 if (i < IF_RCVQ_BURST_LIMIT_MIN) {
11278 i = IF_RCVQ_BURST_LIMIT_MIN;
11279 }
11280
11281 if (IF_RCVQ_BURST_LIMIT_MAX < i) {
11282 i = IF_RCVQ_BURST_LIMIT_MAX;
11283 }
11284#endif
11285
11286 if_rcvq_burst_limit = i;
11287 return err;
11288}
11289
11290static int
11291sysctl_rcvq_trim_pct SYSCTL_HANDLER_ARGS
11292{
11293#pragma unused(arg1, arg2)
11294 int i, err;
11295
	i = if_rcvq_trim_pct;

	err = sysctl_handle_int(oidp, &i, 0, req);
11299 if (err != 0 || req->newptr == USER_ADDR_NULL) {
11300 return err;
11301 }
11302
11303 if (IF_RCVQ_TRIM_PCT_MAX < i) {
11304 i = IF_RCVQ_TRIM_PCT_MAX;
11305 }
11306
11307 if (i < IF_RCVQ_TRIM_PCT_MIN) {
11308 i = IF_RCVQ_TRIM_PCT_MIN;
11309 }
11310
11311 if_rcvq_trim_pct = i;
11312 return err;
11313}
11314
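/*
 * Record the presence of a neighbor node: update the IPv6 neighbor state
 * via nd6_alt_node_present() and post a KEV_DL_NODE_PRESENCE kernel event
 * carrying the supplied RSSI, link quality and proximity metrics.
 */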
11315int
11316dlil_node_present(struct ifnet *ifp, struct sockaddr *sa,
11317 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11318{
11319 struct kev_dl_node_presence kev;
11320 struct sockaddr_dl *sdl;
11321 struct sockaddr_in6 *sin6;
11322 int ret = 0;
11323
11324 VERIFY(ifp);
11325 VERIFY(sa);
11326 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11327
	bzero(&kev, sizeof(kev));
11329 sin6 = &kev.sin6_node_address;
11330 sdl = &kev.sdl_node_address;
11331 nd6_alt_node_addr_decompose(ifp, sa, sdl, sin6);
11332 kev.rssi = rssi;
11333 kev.link_quality_metric = lqm;
11334 kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11336
11337 ret = nd6_alt_node_present(ifp, sin6, sdl, rssi, lqm, npm);
11338 if (ret == 0 || ret == EEXIST) {
11339 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
		if (err != 0) {
			log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with "
			    "error %d\n", __func__, err);
11344 }
11345 }
11346
11347 if (ret == EEXIST) {
11348 ret = 0;
11349 }
11350 return ret;
11351}
11352
11353void
11354dlil_node_absent(struct ifnet *ifp, struct sockaddr *sa)
11355{
11356 struct kev_dl_node_absence kev = {};
11357 struct sockaddr_in6 *kev_sin6 = NULL;
11358 struct sockaddr_dl *kev_sdl = NULL;
11359 int error = 0;
11360
11361 VERIFY(ifp != NULL);
11362 VERIFY(sa != NULL);
11363 VERIFY(sa->sa_family == AF_LINK || sa->sa_family == AF_INET6);
11364
11365 kev_sin6 = &kev.sin6_node_address;
11366 kev_sdl = &kev.sdl_node_address;
11367
11368 if (sa->sa_family == AF_INET6) {
11369 /*
11370 * If IPv6 address is given, get the link layer
11371 * address from what was cached in the neighbor cache
11372 */
11373 VERIFY(sa->sa_len <= sizeof(*kev_sin6));
		bcopy(sa, kev_sin6, sa->sa_len);
11375 error = nd6_alt_node_absent(ifp, kev_sin6, kev_sdl);
11376 } else {
11377 /*
11378 * If passed address is AF_LINK type, derive the address
11379 * based on the link address.
11380 */
11381 nd6_alt_node_addr_decompose(ifp, sa, kev_sdl, kev_sin6);
11382 error = nd6_alt_node_absent(ifp, kev_sin6, NULL);
11383 }
11384
11385 if (error == 0) {
11386 kev_sdl->sdl_type = ifp->if_type;
11387 kev_sdl->sdl_index = ifp->if_index;
11388
11389 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_ABSENCE,
		    &kev.link_data, sizeof(kev), FALSE);
11391 }
11392}
11393
11394int
11395dlil_node_present_v2(struct ifnet *ifp, struct sockaddr *sa, struct sockaddr_dl *sdl,
11396 int32_t rssi, int lqm, int npm, u_int8_t srvinfo[48])
11397{
11398 struct kev_dl_node_presence kev = {};
11399 struct sockaddr_dl *kev_sdl = NULL;
11400 struct sockaddr_in6 *kev_sin6 = NULL;
11401 int ret = 0;
11402
11403 VERIFY(ifp != NULL);
11404 VERIFY(sa != NULL && sdl != NULL);
11405 VERIFY(sa->sa_family == AF_INET6 && sdl->sdl_family == AF_LINK);
11406
11407 kev_sin6 = &kev.sin6_node_address;
11408 kev_sdl = &kev.sdl_node_address;
11409
11410 VERIFY(sdl->sdl_len <= sizeof(*kev_sdl));
	bcopy(sdl, kev_sdl, sdl->sdl_len);
11412 kev_sdl->sdl_type = ifp->if_type;
11413 kev_sdl->sdl_index = ifp->if_index;
11414
11415 VERIFY(sa->sa_len <= sizeof(*kev_sin6));
	bcopy(sa, kev_sin6, sa->sa_len);
11417
11418 kev.rssi = rssi;
11419 kev.link_quality_metric = lqm;
11420 kev.node_proximity_metric = npm;
	bcopy(srvinfo, kev.node_service_info, sizeof(kev.node_service_info));
11422
11423 ret = nd6_alt_node_present(ifp, SIN6(sa), sdl, rssi, lqm, npm);
11424 if (ret == 0 || ret == EEXIST) {
11425 int err = dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_NODE_PRESENCE,
		    &kev.link_data, sizeof(kev), (ret == EEXIST) ? TRUE : FALSE);
11427 if (err != 0) {
11428 log(LOG_ERR, "%s: Post DL_NODE_PRESENCE failed with error %d\n", __func__, err);
11429 }
11430 }
11431
11432 if (ret == EEXIST) {
11433 ret = 0;
11434 }
11435 return ret;
11436}
11437
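/*
 * Return a pointer to the link-layer address bytes of the given
 * sockaddr_dl.  For Ethernet and IEEE 1394 interfaces, when link-layer
 * address checks are required and the caller's credential fails the
 * "net.link.addr" MAC check, a constant placeholder address is returned
 * instead of the real bytes.
 */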
11438const void * __indexable
11439dlil_ifaddr_bytes(const struct sockaddr_dl *sdl, size_t *sizep,
11440 kauth_cred_t *credp)
11441{
11442 const u_int8_t *bytes;
11443 size_t size;
11444
11445 bytes = CONST_LLADDR(sdl);
11446 size = sdl->sdl_alen;
11447
11448#if CONFIG_MACF
11449 if (dlil_lladdr_ckreq) {
11450 switch (sdl->sdl_type) {
11451 case IFT_ETHER:
11452 case IFT_IEEE1394:
11453 break;
11454 default:
11455 credp = NULL;
11456 break;
11457 }

		if (credp && mac_system_check_info(*credp, "net.link.addr")) {
11461 static const u_int8_t unspec[FIREWIRE_EUI64_LEN] = {
11462 [0] = 2
11463 };
11464
11465 bytes = unspec;
11466 }
11467 }
11468#else
11469#pragma unused(credp)
11470#endif
11471
11472 if (sizep != NULL) {
11473 *sizep = size;
11474 }
11475 return bytes;
11476}
11477
11478void
11479dlil_report_issues(struct ifnet *ifp, u_int8_t modid[DLIL_MODIDLEN],
11480 u_int8_t info[DLIL_MODARGLEN])
11481{
11482 struct kev_dl_issues kev;
11483 struct timeval tv;
11484
11485 VERIFY(ifp != NULL);
11486 VERIFY(modid != NULL);
11487 _CASSERT(sizeof(kev.modid) == DLIL_MODIDLEN);
11488 _CASSERT(sizeof(kev.info) == DLIL_MODARGLEN);
11489
	bzero(&kev, sizeof(kev));
11491
	microtime(&tv);
	kev.timestamp = tv.tv_sec;
	bcopy(modid, &kev.modid, DLIL_MODIDLEN);
	if (info != NULL) {
		bcopy(info, &kev.info, DLIL_MODARGLEN);
11497 }
11498
11499 dlil_post_msg(ifp, KEV_DL_SUBCLASS, KEV_DL_ISSUES,
	    &kev.link_data, sizeof(kev), FALSE);
11501}
11502
11503errno_t
11504ifnet_getset_opportunistic(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11505 struct proc *p)
11506{
11507 u_int32_t level = IFNET_THROTTLE_OFF;
11508 errno_t result = 0;
11509
11510 VERIFY(cmd == SIOCSIFOPPORTUNISTIC || cmd == SIOCGIFOPPORTUNISTIC);
11511
11512 if (cmd == SIOCSIFOPPORTUNISTIC) {
11513 /*
11514 * XXX: Use priv_check_cred() instead of root check?
11515 */
11516 if ((result = proc_suser(p)) != 0) {
11517 return result;
11518 }
11519
11520 if (ifr->ifr_opportunistic.ifo_flags ==
11521 IFRIFOF_BLOCK_OPPORTUNISTIC) {
11522 level = IFNET_THROTTLE_OPPORTUNISTIC;
11523 } else if (ifr->ifr_opportunistic.ifo_flags == 0) {
11524 level = IFNET_THROTTLE_OFF;
11525 } else {
11526 result = EINVAL;
11527 }
11528
11529 if (result == 0) {
11530 result = ifnet_set_throttle(ifp, level);
11531 }
11532 } else if ((result = ifnet_get_throttle(ifp, &level)) == 0) {
11533 ifr->ifr_opportunistic.ifo_flags = 0;
11534 if (level == IFNET_THROTTLE_OPPORTUNISTIC) {
11535 ifr->ifr_opportunistic.ifo_flags |=
11536 IFRIFOF_BLOCK_OPPORTUNISTIC;
11537 }
11538 }
11539
11540 /*
11541 * Return the count of current opportunistic connections
11542 * over the interface.
11543 */
11544 if (result == 0) {
11545 uint32_t flags = 0;
11546 flags |= (cmd == SIOCSIFOPPORTUNISTIC) ?
11547 INPCB_OPPORTUNISTIC_SETCMD : 0;
11548 flags |= (level == IFNET_THROTTLE_OPPORTUNISTIC) ?
11549 INPCB_OPPORTUNISTIC_THROTTLEON : 0;
11550 ifr->ifr_opportunistic.ifo_inuse =
		    udp_count_opportunistic(ifp->if_index, flags) +
		    tcp_count_opportunistic(ifp->if_index, flags);
11553 }
11554
11555 if (result == EALREADY) {
11556 result = 0;
11557 }
11558
11559 return result;
11560}
11561
11562int
11563ifnet_get_throttle(struct ifnet *ifp, u_int32_t *level)
11564{
11565 struct ifclassq *ifq;
11566 int err = 0;
11567
11568 if (!(ifp->if_eflags & IFEF_TXSTART)) {
11569 return ENXIO;
11570 }
11571
11572 *level = IFNET_THROTTLE_OFF;
11573
11574 ifq = ifp->if_snd;
11575 IFCQ_LOCK(ifq);
11576 /* Throttling works only for IFCQ, not ALTQ instances */
11577 if (IFCQ_IS_ENABLED(ifq)) {
11578 cqrq_throttle_t req = { 0, IFNET_THROTTLE_OFF };
11579
		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11581 *level = req.level;
11582 }
11583 IFCQ_UNLOCK(ifq);
11584
11585 return err;
11586}
11587
11588int
11589ifnet_set_throttle(struct ifnet *ifp, u_int32_t level)
11590{
11591 struct ifclassq *ifq;
11592 int err = 0;
11593
11594 if (!(ifp->if_eflags & IFEF_TXSTART)) {
11595 return ENXIO;
11596 }
11597
11598 ifq = ifp->if_snd;
11599
11600 switch (level) {
11601 case IFNET_THROTTLE_OFF:
11602 case IFNET_THROTTLE_OPPORTUNISTIC:
11603 break;
11604 default:
11605 return EINVAL;
11606 }
11607
11608 IFCQ_LOCK(ifq);
11609 if (IFCQ_IS_ENABLED(ifq)) {
11610 cqrq_throttle_t req = { 1, level };
11611
		err = fq_if_request_classq(ifq, CLASSQRQ_THROTTLE, &req);
11613 }
11614 IFCQ_UNLOCK(ifq);
11615
11616 if (err == 0) {
11617 DLIL_PRINTF("%s: throttling level set to %d\n", if_name(ifp),
11618 level);
11619#if NECP
11620 necp_update_all_clients();
11621#endif /* NECP */
11622 if (level == IFNET_THROTTLE_OFF) {
11623 ifnet_start(ifp);
11624 }
11625 }
11626
11627 return err;
11628}
11629
11630errno_t
11631ifnet_getset_log(ifnet_t ifp, u_long cmd, struct ifreq *ifr,
11632 struct proc *p)
11633{
11634#pragma unused(p)
11635 errno_t result = 0;
11636 uint32_t flags;
11637 int level, category, subcategory;
11638
11639 VERIFY(cmd == SIOCSIFLOG || cmd == SIOCGIFLOG);
11640
11641 if (cmd == SIOCSIFLOG) {
		if ((result = priv_check_cred(kauth_cred_get(),
		    PRIV_NET_INTERFACE_CONTROL, 0)) != 0) {
11644 return result;
11645 }
11646
11647 level = ifr->ifr_log.ifl_level;
11648 if (level < IFNET_LOG_MIN || level > IFNET_LOG_MAX) {
11649 result = EINVAL;
11650 }
11651
11652 flags = ifr->ifr_log.ifl_flags;
11653 if ((flags &= IFNET_LOGF_MASK) == 0) {
11654 result = EINVAL;
11655 }
11656
11657 category = ifr->ifr_log.ifl_category;
11658 subcategory = ifr->ifr_log.ifl_subcategory;
11659
11660 if (result == 0) {
11661 result = ifnet_set_log(ifp, level, flags,
11662 category, subcategory);
11663 }
11664 } else {
11665 result = ifnet_get_log(ifp, &level, &flags, &category,
11666 &subcategory);
11667 if (result == 0) {
11668 ifr->ifr_log.ifl_level = level;
11669 ifr->ifr_log.ifl_flags = flags;
11670 ifr->ifr_log.ifl_category = category;
11671 ifr->ifr_log.ifl_subcategory = subcategory;
11672 }
11673 }
11674
11675 return result;
11676}
11677
11678int
11679ifnet_set_log(struct ifnet *ifp, int32_t level, uint32_t flags,
11680 int32_t category, int32_t subcategory)
11681{
11682 int err = 0;
11683
11684 VERIFY(level >= IFNET_LOG_MIN && level <= IFNET_LOG_MAX);
11685 VERIFY(flags & IFNET_LOGF_MASK);
11686
11687 /*
11688 * The logging level applies to all facilities; make sure to
11689 * update them all with the most current level.
11690 */
11691 flags |= ifp->if_log.flags;
11692
11693 if (ifp->if_output_ctl != NULL) {
11694 struct ifnet_log_params l;
11695
		bzero(&l, sizeof(l));
11697 l.level = level;
11698 l.flags = flags;
11699 l.flags &= ~IFNET_LOGF_DLIL;
11700 l.category = category;
11701 l.subcategory = subcategory;
11702
11703 /* Send this request to lower layers */
11704 if (l.flags != 0) {
11705 err = ifp->if_output_ctl(ifp, IFNET_CTL_SET_LOG,
11706 sizeof(l), &l);
11707 }
11708 } else if ((flags & ~IFNET_LOGF_DLIL) && ifp->if_output_ctl == NULL) {
11709 /*
11710 * If targeted to the lower layers without an output
11711 * control callback registered on the interface, just
11712 * silently ignore facilities other than ours.
11713 */
11714 flags &= IFNET_LOGF_DLIL;
11715 if (flags == 0 && (!(ifp->if_log.flags & IFNET_LOGF_DLIL))) {
11716 level = 0;
11717 }
11718 }
11719
11720 if (err == 0) {
11721 if ((ifp->if_log.level = level) == IFNET_LOG_DEFAULT) {
11722 ifp->if_log.flags = 0;
11723 } else {
11724 ifp->if_log.flags |= flags;
11725 }
11726
11727 log(LOG_INFO, "%s: logging level set to %d flags=0x%x "
11728 "arg=0x%x, category=%d subcategory=%d\n", if_name(ifp),
11729 ifp->if_log.level, ifp->if_log.flags, flags,
11730 category, subcategory);
11731 }
11732
11733 return err;
11734}
11735
11736int
11737ifnet_get_log(struct ifnet *ifp, int32_t *level, uint32_t *flags,
11738 int32_t *category, int32_t *subcategory)
11739{
11740 if (level != NULL) {
11741 *level = ifp->if_log.level;
11742 }
11743 if (flags != NULL) {
11744 *flags = ifp->if_log.flags;
11745 }
11746 if (category != NULL) {
11747 *category = ifp->if_log.category;
11748 }
11749 if (subcategory != NULL) {
11750 *subcategory = ifp->if_log.subcategory;
11751 }
11752
11753 return 0;
11754}
11755
11756int
11757ifnet_notify_address(struct ifnet *ifp, int af)
11758{
11759 struct ifnet_notify_address_params na;
11760
11761#if PF
11762 (void) pf_ifaddr_hook(ifp);
11763#endif /* PF */
11764
11765 if (ifp->if_output_ctl == NULL) {
11766 return EOPNOTSUPP;
11767 }
11768
	bzero(&na, sizeof(na));
11770 na.address_family = (sa_family_t)af;
11771
11772 return ifp->if_output_ctl(ifp, IFNET_CTL_NOTIFY_ADDRESS,
11773 sizeof(na), &na);
11774}
11775
11776errno_t
11777ifnet_flowid(struct ifnet *ifp, uint32_t *flowid)
11778{
11779 if (ifp == NULL || flowid == NULL) {
11780 return EINVAL;
11781 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11782 !IF_FULLY_ATTACHED(ifp)) {
11783 return ENXIO;
11784 }
11785
11786 *flowid = ifp->if_flowhash;
11787
11788 return 0;
11789}
11790
11791errno_t
11792ifnet_disable_output(struct ifnet *ifp)
11793{
11794 int err;
11795
11796 if (ifp == NULL) {
11797 return EINVAL;
11798 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11799 !IF_FULLY_ATTACHED(ifp)) {
11800 return ENXIO;
11801 }
11802
11803 if ((err = ifnet_fc_add(ifp)) == 0) {
		lck_mtx_lock_spin(&ifp->if_start_lock);
		ifp->if_start_flags |= IFSF_FLOW_CONTROLLED;
		lck_mtx_unlock(&ifp->if_start_lock);
11807 }
11808 return err;
11809}
11810
11811errno_t
11812ifnet_enable_output(struct ifnet *ifp)
11813{
11814 if (ifp == NULL) {
11815 return EINVAL;
11816 } else if (!(ifp->if_eflags & IFEF_TXSTART) ||
11817 !IF_FULLY_ATTACHED(ifp)) {
11818 return ENXIO;
11819 }
11820
11821 ifnet_start_common(ifp, TRUE, FALSE);
11822 return 0;
11823}
11824
11825void
11826ifnet_flowadv(uint32_t flowhash)
11827{
11828 struct ifnet_fc_entry *ifce;
11829 struct ifnet *ifp;
11830
11831 ifce = ifnet_fc_get(flowhash);
11832 if (ifce == NULL) {
11833 return;
11834 }
11835
11836 VERIFY(ifce->ifce_ifp != NULL);
11837 ifp = ifce->ifce_ifp;
11838
11839 /* flow hash gets recalculated per attach, so check */
	if (ifnet_is_attached(ifp, 1)) {
11841 if (ifp->if_flowhash == flowhash) {
11842 (void) ifnet_enable_output(ifp);
11843 }
11844 ifnet_decr_iorefcnt(ifp);
11845 }
11846 ifnet_fc_entry_free(ifce);
11847}
11848
11849/*
11850 * Function to compare ifnet_fc_entries in ifnet flow control tree
11851 */
11852static inline int
11853ifce_cmp(const struct ifnet_fc_entry *fc1, const struct ifnet_fc_entry *fc2)
11854{
11855 return fc1->ifce_flowhash - fc2->ifce_flowhash;
11856}
11857
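/*
 * Register the interface in the flow-control tree, keyed by its flow
 * hash.  Returns 0 if a matching entry already exists or was added, and
 * EAGAIN if a different interface already occupies the same flow hash
 * (hash collision).
 */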
11858static int
11859ifnet_fc_add(struct ifnet *ifp)
11860{
11861 struct ifnet_fc_entry keyfc, *ifce;
11862 uint32_t flowhash;
11863
11864 VERIFY(ifp != NULL && (ifp->if_eflags & IFEF_TXSTART));
11865 VERIFY(ifp->if_flowhash != 0);
11866 flowhash = ifp->if_flowhash;
11867
	bzero(&keyfc, sizeof(keyfc));
11869 keyfc.ifce_flowhash = flowhash;
11870
	lck_mtx_lock_spin(&ifnet_fc_lock);
11872 ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
11873 if (ifce != NULL && ifce->ifce_ifp == ifp) {
11874 /* Entry is already in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
11876 return 0;
11877 }
11878
11879 if (ifce != NULL) {
11880 /*
11881 * There is a different fc entry with the same flow hash
11882 * but different ifp pointer. There can be a collision
11883 * on flow hash but the probability is low. Let's just
11884 * avoid adding a second one when there is a collision.
11885 */
		lck_mtx_unlock(&ifnet_fc_lock);
11887 return EAGAIN;
11888 }
11889
11890 /* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);
11892
11893 ifce = zalloc_flags(ifnet_fc_zone, Z_WAITOK | Z_ZERO);
11894 ifce->ifce_flowhash = flowhash;
11895 ifce->ifce_ifp = ifp;
11896
11897 RB_INSERT(ifnet_fc_tree, &ifnet_fc_tree, ifce);
	lck_mtx_unlock(&ifnet_fc_lock);
11899 return 0;
11900}
11901
11902static struct ifnet_fc_entry *
11903ifnet_fc_get(uint32_t flowhash)
11904{
11905 struct ifnet_fc_entry keyfc, *ifce;
11906 struct ifnet *ifp;
11907
	bzero(&keyfc, sizeof(keyfc));
11909 keyfc.ifce_flowhash = flowhash;
11910
	lck_mtx_lock_spin(&ifnet_fc_lock);
11912 ifce = RB_FIND(ifnet_fc_tree, &ifnet_fc_tree, &keyfc);
11913 if (ifce == NULL) {
11914 /* Entry is not present in ifnet_fc_tree, return */
		lck_mtx_unlock(&ifnet_fc_lock);
11916 return NULL;
11917 }
11918
11919 RB_REMOVE(ifnet_fc_tree, &ifnet_fc_tree, ifce);
11920
11921 VERIFY(ifce->ifce_ifp != NULL);
11922 ifp = ifce->ifce_ifp;
11923
11924 /* become regular mutex */
	lck_mtx_convert_spin(&ifnet_fc_lock);
11926
	if (!ifnet_is_attached(ifp, 0)) {
11928 /*
11929 * This ifp is not attached or in the process of being
11930 * detached; just don't process it.
11931 */
11932 ifnet_fc_entry_free(ifce);
11933 ifce = NULL;
11934 }
	lck_mtx_unlock(&ifnet_fc_lock);
11936
11937 return ifce;
11938}
11939
11940static void
11941ifnet_fc_entry_free(struct ifnet_fc_entry *ifce)
11942{
11943 zfree(ifnet_fc_zone, ifce);
11944}
11945
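/*
 * Compute a non-zero flow hash for the interface from its name, unit,
 * flags, capabilities, output scheduling model and two random values.
 * The global seed is re-generated until the resulting hash is non-zero.
 */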
11946static uint32_t
11947ifnet_calc_flowhash(struct ifnet *ifp)
11948{
11949 struct ifnet_flowhash_key fh __attribute__((aligned(8)));
11950 uint32_t flowhash = 0;
11951
11952 if (ifnet_flowhash_seed == 0) {
11953 ifnet_flowhash_seed = RandomULong();
11954 }
11955
	bzero(&fh, sizeof(fh));

	(void) snprintf(fh.ifk_name, sizeof(fh.ifk_name), "%s", ifp->if_name);
11959 fh.ifk_unit = ifp->if_unit;
11960 fh.ifk_flags = ifp->if_flags;
11961 fh.ifk_eflags = ifp->if_eflags;
11962 fh.ifk_capabilities = ifp->if_capabilities;
11963 fh.ifk_capenable = ifp->if_capenable;
11964 fh.ifk_output_sched_model = ifp->if_output_sched_model;
11965 fh.ifk_rand1 = RandomULong();
11966 fh.ifk_rand2 = RandomULong();
11967
11968try_again:
11969 flowhash = net_flowhash(&fh, sizeof(fh), ifnet_flowhash_seed);
11970 if (flowhash == 0) {
11971 /* try to get a non-zero flowhash */
11972 ifnet_flowhash_seed = RandomULong();
11973 goto try_again;
11974 }
11975
11976 return flowhash;
11977}
11978
11979int
11980ifnet_set_netsignature(struct ifnet *ifp, uint8_t family, uint8_t len,
11981 uint16_t flags, uint8_t *data)
11982{
11983#pragma unused(flags)
11984 int error = 0;
11985
11986 switch (family) {
11987 case AF_INET:
11988 if_inetdata_lock_exclusive(ifp);
11989 if (IN_IFEXTRA(ifp) != NULL) {
11990 if (len == 0) {
11991 /* Allow clearing the signature */
11992 IN_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN_IFEXTRA(ifp)->netsig,
				    sizeof(IN_IFEXTRA(ifp)->netsig));
11995 if_inetdata_lock_done(ifp);
11996 break;
11997 } else if (len > sizeof(IN_IFEXTRA(ifp)->netsig)) {
11998 error = EINVAL;
11999 if_inetdata_lock_done(ifp);
12000 break;
12001 }
12002 IN_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN_IFEXTRA(ifp)->netsig, len);
12004 } else {
12005 error = ENOMEM;
12006 }
12007 if_inetdata_lock_done(ifp);
12008 break;
12009
12010 case AF_INET6:
12011 if_inet6data_lock_exclusive(ifp);
12012 if (IN6_IFEXTRA(ifp) != NULL) {
12013 if (len == 0) {
12014 /* Allow clearing the signature */
12015 IN6_IFEXTRA(ifp)->netsig_len = 0;
				bzero(IN6_IFEXTRA(ifp)->netsig,
				    sizeof(IN6_IFEXTRA(ifp)->netsig));
12018 if_inet6data_lock_done(ifp);
12019 break;
12020 } else if (len > sizeof(IN6_IFEXTRA(ifp)->netsig)) {
12021 error = EINVAL;
12022 if_inet6data_lock_done(ifp);
12023 break;
12024 }
12025 IN6_IFEXTRA(ifp)->netsig_len = len;
			bcopy(data, IN6_IFEXTRA(ifp)->netsig, len);
12027 } else {
12028 error = ENOMEM;
12029 }
12030 if_inet6data_lock_done(ifp);
12031 break;
12032
12033 default:
12034 error = EINVAL;
12035 break;
12036 }
12037
12038 return error;
12039}
12040
12041int
12042ifnet_get_netsignature(struct ifnet *ifp, uint8_t family, uint8_t *len,
12043 uint16_t *flags, uint8_t *data)
12044{
12045 int error = 0;
12046
12047 if (ifp == NULL || len == NULL || data == NULL) {
12048 return EINVAL;
12049 }
12050
12051 switch (family) {
12052 case AF_INET:
12053 if_inetdata_lock_shared(ifp);
12054 if (IN_IFEXTRA(ifp) != NULL) {
12055 if (*len == 0 || *len < IN_IFEXTRA(ifp)->netsig_len) {
12056 error = EINVAL;
12057 if_inetdata_lock_done(ifp);
12058 break;
12059 }
12060 if ((*len = (uint8_t)IN_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN_IFEXTRA(ifp)->netsig, data, *len);
12062 } else {
12063 error = ENOENT;
12064 }
12065 } else {
12066 error = ENOMEM;
12067 }
12068 if_inetdata_lock_done(ifp);
12069 break;
12070
12071 case AF_INET6:
12072 if_inet6data_lock_shared(ifp);
12073 if (IN6_IFEXTRA(ifp) != NULL) {
12074 if (*len == 0 || *len < IN6_IFEXTRA(ifp)->netsig_len) {
12075 error = EINVAL;
12076 if_inet6data_lock_done(ifp);
12077 break;
12078 }
12079 if ((*len = (uint8_t)IN6_IFEXTRA(ifp)->netsig_len) > 0) {
				bcopy(IN6_IFEXTRA(ifp)->netsig, data, *len);
12081 } else {
12082 error = ENOENT;
12083 }
12084 } else {
12085 error = ENOMEM;
12086 }
12087 if_inet6data_lock_done(ifp);
12088 break;
12089
12090 default:
12091 error = EINVAL;
12092 break;
12093 }
12094
12095 if (error == 0 && flags != NULL) {
12096 *flags = 0;
12097 }
12098
12099 return error;
12100}
12101
12102int
12103ifnet_set_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12104{
12105 int i, error = 0, one_set = 0;
12106
12107 if_inet6data_lock_exclusive(ifp);
12108
12109 if (IN6_IFEXTRA(ifp) == NULL) {
12110 error = ENOMEM;
12111 goto out;
12112 }
12113
12114 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12115 uint32_t prefix_len =
12116 prefixes[i].prefix_len;
12117 struct in6_addr *prefix =
12118 &prefixes[i].ipv6_prefix;
12119
12120 if (prefix_len == 0) {
12121 clat_log0((LOG_DEBUG,
12122 "NAT64 prefixes purged from Interface %s\n",
12123 if_name(ifp)));
12124 /* Allow clearing the signature */
12125 IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = 0;
			bzero(&IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
			    sizeof(struct in6_addr));
12128
12129 continue;
12130 } else if (prefix_len != NAT64_PREFIX_LEN_32 &&
12131 prefix_len != NAT64_PREFIX_LEN_40 &&
12132 prefix_len != NAT64_PREFIX_LEN_48 &&
12133 prefix_len != NAT64_PREFIX_LEN_56 &&
12134 prefix_len != NAT64_PREFIX_LEN_64 &&
12135 prefix_len != NAT64_PREFIX_LEN_96) {
12136 clat_log0((LOG_DEBUG,
12137 "NAT64 prefixlen is incorrect %d\n", prefix_len));
12138 error = EINVAL;
12139 goto out;
12140 }
12141
12142 if (IN6_IS_SCOPE_EMBED(prefix)) {
12143 clat_log0((LOG_DEBUG,
12144 "NAT64 prefix has interface/link local scope.\n"));
12145 error = EINVAL;
12146 goto out;
12147 }
12148
12149 IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len = prefix_len;
		bcopy(prefix, &IN6_IFEXTRA(ifp)->nat64_prefixes[i].ipv6_prefix,
		    sizeof(struct in6_addr));
12152 clat_log0((LOG_DEBUG,
12153 "NAT64 prefix set to %s with prefixlen: %d\n",
12154 ip6_sprintf(prefix), prefix_len));
12155 one_set = 1;
12156 }
12157
12158out:
12159 if_inet6data_lock_done(ifp);
12160
12161 if (error == 0 && one_set != 0) {
12162 necp_update_all_clients();
12163 }
12164
12165 return error;
12166}
12167
12168int
12169ifnet_get_nat64prefix(struct ifnet *ifp, struct ipv6_prefix *prefixes)
12170{
12171 int i, found_one = 0, error = 0;
12172
12173 if (ifp == NULL) {
12174 return EINVAL;
12175 }
12176
12177 if_inet6data_lock_shared(ifp);
12178
12179 if (IN6_IFEXTRA(ifp) == NULL) {
12180 error = ENOMEM;
12181 goto out;
12182 }
12183
12184 for (i = 0; i < NAT64_MAX_NUM_PREFIXES; i++) {
12185 if (IN6_IFEXTRA(ifp)->nat64_prefixes[i].prefix_len != 0) {
12186 found_one = 1;
12187 }
12188 }
12189
12190 if (found_one == 0) {
12191 error = ENOENT;
12192 goto out;
12193 }
12194
12195 if (prefixes) {
		bcopy(IN6_IFEXTRA(ifp)->nat64_prefixes, prefixes,
		    sizeof(IN6_IFEXTRA(ifp)->nat64_prefixes));
12198 }
12199
12200out:
12201 if_inet6data_lock_done(ifp);
12202
12203 return error;
12204}
12205
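/*
 * Debug-only output path hook: when HWCKSUM_DBG_FINALIZE_FORCED is set,
 * force software finalization of IPv4/IPv6 checksums (except for TSO
 * packets) and count how many headers and payloads were finalized.
 */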
12206__attribute__((noinline))
12207static void
12208dlil_output_cksum_dbg(struct ifnet *ifp, struct mbuf *m, uint32_t hoff,
12209 protocol_family_t pf)
12210{
12211#pragma unused(ifp)
12212 uint32_t did_sw;
12213
12214 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_FINALIZE_FORCED) ||
12215 (m->m_pkthdr.csum_flags & (CSUM_TSO_IPV4 | CSUM_TSO_IPV6))) {
12216 return;
12217 }
12218
12219 switch (pf) {
12220 case PF_INET:
12221 did_sw = in_finalize_cksum(m, hoff, m->m_pkthdr.csum_flags);
12222 if (did_sw & CSUM_DELAY_IP) {
12223 hwcksum_dbg_finalized_hdr++;
12224 }
12225 if (did_sw & CSUM_DELAY_DATA) {
12226 hwcksum_dbg_finalized_data++;
12227 }
12228 break;
12229 case PF_INET6:
12230 /*
12231 * Checksum offload should not have been enabled when
12232 * extension headers exist; that also means that we
12233 * cannot force-finalize packets with extension headers.
		 * Indicate to the callee that it should skip such cases
		 * by setting optlen to -1.
12236 */
12237 did_sw = in6_finalize_cksum(m, hoff, -1, -1,
12238 m->m_pkthdr.csum_flags);
12239 if (did_sw & CSUM_DELAY_IPV6_DATA) {
12240 hwcksum_dbg_finalized_data++;
12241 }
12242 break;
12243 default:
12244 return;
12245 }
12246}
12247
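/*
 * Debug-only input path hook: optionally force partial checksum offload
 * on received packets and/or verify (and re-base) the partial checksum
 * value reported by the driver against a software-computed sum.
 */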
12248static void
12249dlil_input_cksum_dbg(struct ifnet *ifp, struct mbuf *m, char *frame_header,
12250 protocol_family_t pf)
12251{
12252 uint16_t sum = 0;
12253 uint32_t hlen;
12254
12255 if (frame_header == NULL ||
	    frame_header < (char *)mbuf_datastart(m) ||
12257 frame_header > (char *)m->m_data) {
12258 DLIL_PRINTF("%s: frame header pointer 0x%llx out of range "
12259 "[0x%llx,0x%llx] for mbuf 0x%llx\n", if_name(ifp),
12260 (uint64_t)VM_KERNEL_ADDRPERM(frame_header),
12261 (uint64_t)VM_KERNEL_ADDRPERM(mbuf_datastart(m)),
12262 (uint64_t)VM_KERNEL_ADDRPERM(m->m_data),
12263 (uint64_t)VM_KERNEL_ADDRPERM(m));
12264 return;
12265 }
12266 hlen = (uint32_t)(m->m_data - (uintptr_t)frame_header);
12267
12268 switch (pf) {
12269 case PF_INET:
12270 case PF_INET6:
12271 break;
12272 default:
12273 return;
12274 }
12275
12276 /*
12277 * Force partial checksum offload; useful to simulate cases
12278 * where the hardware does not support partial checksum offload,
12279 * in order to validate correctness throughout the layers above.
12280 */
12281 if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED) {
12282 uint32_t foff = hwcksum_dbg_partial_rxoff_forced;
12283
12284 if (foff > (uint32_t)m->m_pkthdr.len) {
12285 return;
12286 }
12287
12288 m->m_pkthdr.csum_flags &= ~CSUM_RX_FLAGS;
12289
12290 /* Compute 16-bit 1's complement sum from forced offset */
12291 sum = m_sum16(m, foff, (m->m_pkthdr.len - foff));
12292
12293 m->m_pkthdr.csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
12294 m->m_pkthdr.csum_rx_val = sum;
12295 m->m_pkthdr.csum_rx_start = (uint16_t)(foff + hlen);
12296
12297 hwcksum_dbg_partial_forced++;
12298 hwcksum_dbg_partial_forced_bytes += m->m_pkthdr.len;
12299 }
12300
12301 /*
12302 * Partial checksum offload verification (and adjustment);
12303 * useful to validate and test cases where the hardware
12304 * supports partial checksum offload.
12305 */
12306 if ((m->m_pkthdr.csum_flags &
12307 (CSUM_DATA_VALID | CSUM_PARTIAL | CSUM_PSEUDO_HDR)) ==
12308 (CSUM_DATA_VALID | CSUM_PARTIAL)) {
12309 uint32_t rxoff;
12310
12311 /* Start offset must begin after frame header */
12312 rxoff = m->m_pkthdr.csum_rx_start;
12313 if (hlen > rxoff) {
12314 hwcksum_dbg_bad_rxoff++;
12315 if (dlil_verbose) {
12316 DLIL_PRINTF("%s: partial cksum start offset %d "
12317 "is less than frame header length %d for "
12318 "mbuf 0x%llx\n", if_name(ifp), rxoff, hlen,
12319 (uint64_t)VM_KERNEL_ADDRPERM(m));
12320 }
12321 return;
12322 }
12323 rxoff -= hlen;
12324
12325 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12326 /*
12327 * Compute the expected 16-bit 1's complement sum;
12328 * skip this if we've already computed it above
12329 * when partial checksum offload is forced.
12330 */
12331 sum = m_sum16(m, rxoff, (m->m_pkthdr.len - rxoff));
12332
12333 /* Hardware or driver is buggy */
12334 if (sum != m->m_pkthdr.csum_rx_val) {
12335 hwcksum_dbg_bad_cksum++;
12336 if (dlil_verbose) {
12337 DLIL_PRINTF("%s: bad partial cksum value "
12338 "0x%x (expected 0x%x) for mbuf "
12339 "0x%llx [rx_start %d]\n",
12340 if_name(ifp),
12341 m->m_pkthdr.csum_rx_val, sum,
12342 (uint64_t)VM_KERNEL_ADDRPERM(m),
12343 m->m_pkthdr.csum_rx_start);
12344 }
12345 return;
12346 }
12347 }
12348 hwcksum_dbg_verified++;
12349
12350 /*
		 * This code allows us to emulate various hardware
		 * implementations that perform the 16-bit 1's complement
		 * sum beginning at various start offset values.
12354 */
12355 if (hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ) {
12356 uint32_t aoff = hwcksum_dbg_partial_rxoff_adj;
12357
12358 if (aoff == rxoff || aoff > (uint32_t)m->m_pkthdr.len) {
12359 return;
12360 }
12361
12362 sum = m_adj_sum16(m, rxoff, aoff,
12363 m_pktlen(m) - aoff, sum);
12364
12365 m->m_pkthdr.csum_rx_val = sum;
12366 m->m_pkthdr.csum_rx_start = (uint16_t)(aoff + hlen);
12367
12368 hwcksum_dbg_adjusted++;
12369 }
12370 }
12371}
12372
12373static int
12374sysctl_hwcksum_dbg_mode SYSCTL_HANDLER_ARGS
12375{
12376#pragma unused(arg1, arg2)
12377 u_int32_t i;
12378 int err;
12379
12380 i = hwcksum_dbg_mode;
12381
	err = sysctl_handle_int(oidp, &i, 0, req);
12383 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12384 return err;
12385 }
12386
12387 if (hwcksum_dbg == 0) {
12388 return ENODEV;
12389 }
12390
12391 if ((i & ~HWCKSUM_DBG_MASK) != 0) {
12392 return EINVAL;
12393 }
12394
12395 hwcksum_dbg_mode = (i & HWCKSUM_DBG_MASK);
12396
12397 return err;
12398}
12399
12400static int
12401sysctl_hwcksum_dbg_partial_rxoff_forced SYSCTL_HANDLER_ARGS
12402{
12403#pragma unused(arg1, arg2)
12404 u_int32_t i;
12405 int err;
12406
12407 i = hwcksum_dbg_partial_rxoff_forced;
12408
	err = sysctl_handle_int(oidp, &i, 0, req);
12410 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12411 return err;
12412 }
12413
12414 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_FORCED)) {
12415 return ENODEV;
12416 }
12417
12418 hwcksum_dbg_partial_rxoff_forced = i;
12419
12420 return err;
12421}
12422
12423static int
12424sysctl_hwcksum_dbg_partial_rxoff_adj SYSCTL_HANDLER_ARGS
12425{
12426#pragma unused(arg1, arg2)
12427 u_int32_t i;
12428 int err;
12429
12430 i = hwcksum_dbg_partial_rxoff_adj;
12431
	err = sysctl_handle_int(oidp, &i, 0, req);
12433 if (err != 0 || req->newptr == USER_ADDR_NULL) {
12434 return err;
12435 }
12436
12437 if (!(hwcksum_dbg_mode & HWCKSUM_DBG_PARTIAL_RXOFF_ADJ)) {
12438 return ENODEV;
12439 }
12440
12441 hwcksum_dbg_partial_rxoff_adj = i;
12442
12443 return err;
12444}
12445
12446static int
12447sysctl_tx_chain_len_stats SYSCTL_HANDLER_ARGS
12448{
12449#pragma unused(oidp, arg1, arg2)
12450 int err;
12451
12452 if (req->oldptr == USER_ADDR_NULL) {
12453 }
12454 if (req->newptr != USER_ADDR_NULL) {
12455 return EPERM;
12456 }
12457 err = SYSCTL_OUT(req, &tx_chain_len_stats,
12458 sizeof(struct chain_len_stats));
12459
12460 return err;
12461}
12462
12463#if DEBUG || DEVELOPMENT
12464/* Blob for sum16 verification */
12465static uint8_t sumdata[] = {
12466 0x1f, 0x8b, 0x08, 0x08, 0x4c, 0xe5, 0x9a, 0x4f, 0x00, 0x03,
12467 0x5f, 0x00, 0x5d, 0x91, 0x41, 0x4e, 0xc4, 0x30, 0x0c, 0x45,
12468 0xf7, 0x9c, 0xc2, 0x07, 0x18, 0xf5, 0x0e, 0xb0, 0xe2, 0x00,
12469 0x48, 0x88, 0xa5, 0xdb, 0xba, 0x49, 0x34, 0x69, 0xdc, 0x71,
12470 0x92, 0xa9, 0xc2, 0x8a, 0x6b, 0x70, 0x3d, 0x4e, 0x82, 0x93,
12471 0xb4, 0x08, 0xd8, 0xc5, 0xb1, 0xfd, 0xff, 0xb3, 0xfd, 0x4c,
12472 0x42, 0x5f, 0x1f, 0x9f, 0x11, 0x12, 0x43, 0xb2, 0x04, 0x93,
12473 0xe0, 0x7b, 0x01, 0x0e, 0x14, 0x07, 0x78, 0xd1, 0x78, 0x75,
12474 0x71, 0x71, 0xe9, 0x08, 0x84, 0x46, 0xf2, 0xc7, 0x3b, 0x09,
12475 0xe7, 0xd1, 0xd3, 0x8a, 0x57, 0x92, 0x33, 0xcd, 0x39, 0xcc,
12476 0xb0, 0x91, 0x89, 0xe0, 0x42, 0x53, 0x8b, 0xb7, 0x8c, 0x42,
12477 0x60, 0xd9, 0x9f, 0x7a, 0x55, 0x19, 0x76, 0xcb, 0x10, 0x49,
12478 0x35, 0xac, 0x0b, 0x5a, 0x3c, 0xbb, 0x65, 0x51, 0x8c, 0x90,
12479 0x7c, 0x69, 0x45, 0x45, 0x81, 0xb4, 0x2b, 0x70, 0x82, 0x85,
12480 0x55, 0x91, 0x17, 0x90, 0xdc, 0x14, 0x1e, 0x35, 0x52, 0xdd,
12481 0x02, 0x16, 0xef, 0xb5, 0x40, 0x89, 0xe2, 0x46, 0x53, 0xad,
12482 0x93, 0x6e, 0x98, 0x30, 0xe5, 0x08, 0xb7, 0xcc, 0x03, 0xbc,
12483 0x71, 0x86, 0x09, 0x43, 0x0d, 0x52, 0xf5, 0xa2, 0xf5, 0xa2,
12484 0x56, 0x11, 0x8d, 0xa8, 0xf5, 0xee, 0x92, 0x3d, 0xfe, 0x8c,
12485 0x67, 0x71, 0x8b, 0x0e, 0x2d, 0x70, 0x77, 0xbe, 0xbe, 0xea,
12486 0xbf, 0x9a, 0x8d, 0x9c, 0x53, 0x53, 0xe5, 0xe0, 0x4b, 0x87,
12487 0x85, 0xd2, 0x45, 0x95, 0x30, 0xc1, 0xcc, 0xe0, 0x74, 0x54,
12488 0x13, 0x58, 0xe8, 0xe8, 0x79, 0xa2, 0x09, 0x73, 0xa4, 0x0e,
12489 0x39, 0x59, 0x0c, 0xe6, 0x9c, 0xb2, 0x4f, 0x06, 0x5b, 0x8e,
12490 0xcd, 0x17, 0x6c, 0x5e, 0x95, 0x4d, 0x70, 0xa2, 0x0a, 0xbf,
12491 0xa3, 0xcc, 0x03, 0xbc, 0x5a, 0xe7, 0x75, 0x06, 0x5e, 0x75,
12492 0xef, 0x58, 0x8e, 0x15, 0xd1, 0x0a, 0x18, 0xff, 0xdd, 0xe6,
12493 0x02, 0x3b, 0xb5, 0xb4, 0xa1, 0xe0, 0x72, 0xfc, 0xe3, 0xab,
12494 0x07, 0xe0, 0x4d, 0x65, 0xea, 0x92, 0xeb, 0xf2, 0x7b, 0x17,
12495 0x05, 0xce, 0xc6, 0xf6, 0x2b, 0xbb, 0x70, 0x3d, 0x00, 0x95,
12496 0xe0, 0x07, 0x52, 0x3b, 0x58, 0xfc, 0x7c, 0x69, 0x4d, 0xe9,
12497 0xf7, 0xa9, 0x66, 0x1e, 0x1e, 0xbe, 0x01, 0x69, 0x98, 0xfe,
12498 0xc8, 0x28, 0x02, 0x00, 0x00
12499};
12500
12501/* Precomputed 16-bit 1's complement sums for various spans of the above data */
12502static struct {
12503 boolean_t init;
12504 uint16_t len;
12505 uint16_t sumr; /* reference */
12506 uint16_t sumrp; /* reference, precomputed */
12507} sumtbl[] = {
12508 { FALSE, 0, 0, 0x0000 },
12509 { FALSE, 1, 0, 0x001f },
12510 { FALSE, 2, 0, 0x8b1f },
12511 { FALSE, 3, 0, 0x8b27 },
12512 { FALSE, 7, 0, 0x790e },
12513 { FALSE, 11, 0, 0xcb6d },
12514 { FALSE, 20, 0, 0x20dd },
12515 { FALSE, 27, 0, 0xbabd },
12516 { FALSE, 32, 0, 0xf3e8 },
12517 { FALSE, 37, 0, 0x197d },
12518 { FALSE, 43, 0, 0x9eae },
12519 { FALSE, 64, 0, 0x4678 },
12520 { FALSE, 127, 0, 0x9399 },
12521 { FALSE, 256, 0, 0xd147 },
12522 { FALSE, 325, 0, 0x0358 },
12523};
12524#define SUMTBL_MAX ((int)sizeof (sumtbl) / (int)sizeof (sumtbl[0]))
12525
12526static void
12527dlil_verify_sum16(void)
12528{
12529 struct mbuf *m;
12530 uint8_t *buf;
12531 int n;
12532
12533 /* Make sure test data plus extra room for alignment fits in cluster */
12534 _CASSERT((sizeof(sumdata) + (sizeof(uint64_t) * 2)) <= MCLBYTES);
12535
12536 kprintf("DLIL: running SUM16 self-tests ... ");
12537
12538 m = m_getcl(M_WAITOK, MT_DATA, M_PKTHDR);
12539 m_align(m, sizeof(sumdata) + (sizeof(uint64_t) * 2));
12540
12541 buf = mtod(m, uint8_t *); /* base address */
12542
12543 for (n = 0; n < SUMTBL_MAX; n++) {
12544 uint16_t len = sumtbl[n].len;
12545 int i;
12546
12547 /* Verify for all possible alignments */
12548 for (i = 0; i < (int)sizeof(uint64_t); i++) {
12549 uint16_t sum, sumr;
12550 uint8_t *c;
12551
12552 /* Copy over test data to mbuf */
12553 VERIFY(len <= sizeof(sumdata));
12554 c = buf + i;
12555 bcopy(sumdata, c, len);
12556
12557 /* Zero-offset test (align by data pointer) */
12558 m->m_data = (uintptr_t)c;
12559 m->m_len = len;
12560 sum = m_sum16(m, 0, len);
12561
12562 if (!sumtbl[n].init) {
12563 sumr = (uint16_t)in_cksum_mbuf_ref(m, len, 0, 0);
12564 sumtbl[n].sumr = sumr;
12565 sumtbl[n].init = TRUE;
12566 } else {
12567 sumr = sumtbl[n].sumr;
12568 }
12569
12570 /* Something is horribly broken; stop now */
12571 if (sumr != sumtbl[n].sumrp) {
12572 panic_plain("\n%s: broken in_cksum_mbuf_ref() "
12573 "for len=%d align=%d sum=0x%04x "
12574 "[expected=0x%04x]\n", __func__,
12575 len, i, sum, sumr);
12576 /* NOTREACHED */
12577 } else if (sum != sumr) {
12578 panic_plain("\n%s: broken m_sum16() for len=%d "
12579 "align=%d sum=0x%04x [expected=0x%04x]\n",
12580 __func__, len, i, sum, sumr);
12581 /* NOTREACHED */
12582 }
12583
12584 /* Alignment test by offset (fixed data pointer) */
12585 m->m_data = (uintptr_t)buf;
12586 m->m_len = i + len;
12587 sum = m_sum16(m, i, len);
12588
12589 /* Something is horribly broken; stop now */
12590 if (sum != sumr) {
12591 panic_plain("\n%s: broken m_sum16() for len=%d "
12592 "offset=%d sum=0x%04x [expected=0x%04x]\n",
12593 __func__, len, i, sum, sumr);
12594 /* NOTREACHED */
12595 }
12596#if INET
			/* Simple sum16 contiguous buffer test by alignment */
12598 sum = b_sum16(c, len);
12599
12600 /* Something is horribly broken; stop now */
12601 if (sum != sumr) {
12602 panic_plain("\n%s: broken b_sum16() for len=%d "
12603 "align=%d sum=0x%04x [expected=0x%04x]\n",
12604 __func__, len, i, sum, sumr);
12605 /* NOTREACHED */
12606 }
12607#endif /* INET */
12608 }
12609 }
12610 m_freem(m);
12611
12612 kprintf("PASSED\n");
12613}
12614#endif /* DEBUG || DEVELOPMENT */
12615
12616#define CASE_STRINGIFY(x) case x: return #x
12617
12618__private_extern__ const char *
12619dlil_kev_dl_code_str(u_int32_t event_code)
12620{
12621 switch (event_code) {
12622 CASE_STRINGIFY(KEV_DL_SIFFLAGS);
12623 CASE_STRINGIFY(KEV_DL_SIFMETRICS);
12624 CASE_STRINGIFY(KEV_DL_SIFMTU);
12625 CASE_STRINGIFY(KEV_DL_SIFPHYS);
12626 CASE_STRINGIFY(KEV_DL_SIFMEDIA);
12627 CASE_STRINGIFY(KEV_DL_SIFGENERIC);
12628 CASE_STRINGIFY(KEV_DL_ADDMULTI);
12629 CASE_STRINGIFY(KEV_DL_DELMULTI);
12630 CASE_STRINGIFY(KEV_DL_IF_ATTACHED);
12631 CASE_STRINGIFY(KEV_DL_IF_DETACHING);
12632 CASE_STRINGIFY(KEV_DL_IF_DETACHED);
12633 CASE_STRINGIFY(KEV_DL_LINK_OFF);
12634 CASE_STRINGIFY(KEV_DL_LINK_ON);
12635 CASE_STRINGIFY(KEV_DL_PROTO_ATTACHED);
12636 CASE_STRINGIFY(KEV_DL_PROTO_DETACHED);
12637 CASE_STRINGIFY(KEV_DL_LINK_ADDRESS_CHANGED);
12638 CASE_STRINGIFY(KEV_DL_WAKEFLAGS_CHANGED);
12639 CASE_STRINGIFY(KEV_DL_IF_IDLE_ROUTE_REFCNT);
12640 CASE_STRINGIFY(KEV_DL_IFCAP_CHANGED);
12641 CASE_STRINGIFY(KEV_DL_LINK_QUALITY_METRIC_CHANGED);
12642 CASE_STRINGIFY(KEV_DL_NODE_PRESENCE);
12643 CASE_STRINGIFY(KEV_DL_NODE_ABSENCE);
12644 CASE_STRINGIFY(KEV_DL_PRIMARY_ELECTED);
12645 CASE_STRINGIFY(KEV_DL_ISSUES);
12646 CASE_STRINGIFY(KEV_DL_IFDELEGATE_CHANGED);
12647 default:
12648 break;
12649 }
12650 return "";
12651}
12652
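/*
 * Thread call that fires when an interface crosses its data threshold;
 * it notifies the network statistics subsystem for that interface.
 */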
12653static void
12654dlil_dt_tcall_fn(thread_call_param_t arg0, thread_call_param_t arg1)
12655{
12656#pragma unused(arg1)
12657 struct ifnet *ifp = arg0;
12658
	if (ifnet_is_attached(ifp, 1)) {
		nstat_ifnet_threshold_reached(ifp->if_index);
12661 ifnet_decr_iorefcnt(ifp);
12662 }
12663}
12664
12665void
12666ifnet_notify_data_threshold(struct ifnet *ifp)
12667{
12668 uint64_t bytes = (ifp->if_ibytes + ifp->if_obytes);
12669 uint64_t oldbytes = ifp->if_dt_bytes;
12670
12671 ASSERT(ifp->if_dt_tcall != NULL);
12672
12673 /*
12674 * If we went over the threshold, notify NetworkStatistics.
12675 * We rate-limit it based on the threshold interval value.
12676 */
12677 if (threshold_notify && (bytes - oldbytes) > ifp->if_data_threshold &&
12678 OSCompareAndSwap64(oldbytes, bytes, &ifp->if_dt_bytes) &&
	    !thread_call_isactive(ifp->if_dt_tcall)) {
12680 uint64_t tival = (threshold_interval * NSEC_PER_SEC);
12681 uint64_t now = mach_absolute_time(), deadline = now;
12682 uint64_t ival;
12683
12684 if (tival != 0) {
			nanoseconds_to_absolutetime(tival, &ival);
			clock_deadline_for_periodic_event(ival, now, &deadline);
			(void) thread_call_enter_delayed(ifp->if_dt_tcall,
12688 deadline);
12689 } else {
			(void) thread_call_enter(ifp->if_dt_tcall);
12691 }
12692 }
12693}
12694
12695#if (DEVELOPMENT || DEBUG)
12696/*
12697 * The sysctl variable name contains the input parameters of
12698 * ifnet_get_keepalive_offload_frames()
12699 * ifp (interface index): name[0]
12700 * frames_array_count: name[1]
12701 * frame_data_offset: name[2]
12702 * The return length gives used_frames_count
12703 */
12704static int
12705sysctl_get_kao_frames SYSCTL_HANDLER_ARGS
12706{
12707#pragma unused(oidp)
12708 int *name = (int *)arg1;
12709 u_int namelen = arg2;
12710 int idx;
12711 ifnet_t ifp = NULL;
12712 u_int32_t frames_array_count;
12713 size_t frame_data_offset;
12714 u_int32_t used_frames_count;
12715 struct ifnet_keepalive_offload_frame *frames_array = NULL;
12716 int error = 0;
12717 u_int32_t i;
12718
12719 /*
	 * Only root is allowed to look at other processes' TCP frames.
12721 */
12722 error = proc_suser(current_proc());
12723 if (error != 0) {
12724 goto done;
12725 }
12726 /*
12727 * Validate the input parameters
12728 */
12729 if (req->newptr != USER_ADDR_NULL) {
12730 error = EPERM;
12731 goto done;
12732 }
12733 if (namelen != 3) {
12734 error = EINVAL;
12735 goto done;
12736 }
12737 if (req->oldptr == USER_ADDR_NULL) {
12738 error = EINVAL;
12739 goto done;
12740 }
12741 if (req->oldlen == 0) {
12742 error = EINVAL;
12743 goto done;
12744 }
12745 idx = name[0];
12746 frames_array_count = name[1];
12747 frame_data_offset = name[2];
12748
12749 /* Make sure the passed buffer is large enough */
12750 if (frames_array_count * sizeof(struct ifnet_keepalive_offload_frame) >
12751 req->oldlen) {
12752 error = ENOMEM;
12753 goto done;
12754 }
12755
12756 ifnet_head_lock_shared();
12757 if (!IF_INDEX_IN_RANGE(idx)) {
12758 ifnet_head_done();
12759 error = ENOENT;
12760 goto done;
12761 }
12762 ifp = ifindex2ifnet[idx];
12763 ifnet_head_done();
12764
12765 frames_array = (struct ifnet_keepalive_offload_frame *)kalloc_data(
12766 frames_array_count * sizeof(struct ifnet_keepalive_offload_frame),
12767 Z_WAITOK);
12768 if (frames_array == NULL) {
12769 error = ENOMEM;
12770 goto done;
12771 }
12772
12773 error = ifnet_get_keepalive_offload_frames(ifp, frames_array,
12774 frames_array_count, frame_data_offset, &used_frames_count);
12775 if (error != 0) {
12776 DLIL_PRINTF("%s: ifnet_get_keepalive_offload_frames error %d\n",
12777 __func__, error);
12778 goto done;
12779 }
12780
12781 for (i = 0; i < used_frames_count; i++) {
12782 error = SYSCTL_OUT(req, frames_array + i,
12783 sizeof(struct ifnet_keepalive_offload_frame));
12784 if (error != 0) {
12785 goto done;
12786 }
12787 }
12788done:
12789 if (frames_array != NULL) {
12790 kfree_data(frames_array, frames_array_count *
12791 sizeof(struct ifnet_keepalive_offload_frame));
12792 }
12793 return error;
12794}
12795#endif /* DEVELOPMENT || DEBUG */
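
/*
 * Illustrative sketch (not part of the original source): how a privileged
 * userspace tool might drive the DEVELOPMENT/DEBUG sysctl handler above.
 * The OID name "net.link.generic.system.get_kao_frames" and the visibility
 * of struct ifnet_keepalive_offload_frame to userspace are assumptions;
 * only the name[0..2] layout and the returned frame array come from the
 * comment and handler above.
 */
#if 0
#include <stdio.h>
#include <string.h>
#include <sys/sysctl.h>
#include <net/if.h>

static int
dump_kao_frames(unsigned int ifindex)
{
	int mib[CTL_MAXNAME];
	size_t miblen = CTL_MAXNAME - 3;
	struct ifnet_keepalive_offload_frame frames[8];
	size_t len = sizeof(frames);

	/* Resolve the (assumed) OID prefix, then append the three inputs
	 * expected by sysctl_get_kao_frames(). */
	if (sysctlnametomib("net.link.generic.system.get_kao_frames",
	    mib, &miblen) != 0) {
		return -1;
	}
	mib[miblen + 0] = (int)ifindex;  /* name[0]: interface index    */
	mib[miblen + 1] = 8;             /* name[1]: frames_array_count */
	mib[miblen + 2] = 0;             /* name[2]: frame_data_offset  */

	if (sysctl(mib, (u_int)(miblen + 3), frames, &len, NULL, 0) != 0) {
		return -1;
	}
	/* The returned length gives used_frames_count. */
	printf("%zu frame(s) returned\n",
	    len / sizeof(struct ifnet_keepalive_offload_frame));
	return 0;
}
#endif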

void
ifnet_update_stats_per_flow(struct ifnet_stats_per_flow *ifs,
    struct ifnet *ifp)
{
	tcp_update_stats_per_flow(ifs, ifp);
}

static inline u_int32_t
_set_flags(u_int32_t *flags_p, u_int32_t set_flags)
{
	return (u_int32_t)OSBitOrAtomic(set_flags, flags_p);
}

static inline void
_clear_flags(u_int32_t *flags_p, u_int32_t clear_flags)
{
	OSBitAndAtomic(~clear_flags, flags_p);
}

__private_extern__ u_int32_t
if_set_eflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_eflags, set_flags);
}

__private_extern__ void
if_clear_eflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_eflags, clear_flags);
}

__private_extern__ u_int32_t
if_set_xflags(ifnet_t interface, u_int32_t set_flags)
{
	return _set_flags(&interface->if_xflags, set_flags);
}

__private_extern__ void
if_clear_xflags(ifnet_t interface, u_int32_t clear_flags)
{
	_clear_flags(&interface->if_xflags, clear_flags);
}

__private_extern__ void
ifnet_update_traffic_rule_genid(ifnet_t ifp)
{
	os_atomic_inc(&ifp->if_traffic_rule_genid, relaxed);
}

__private_extern__ boolean_t
ifnet_sync_traffic_rule_genid(ifnet_t ifp, uint32_t *genid)
{
	if (*genid != ifp->if_traffic_rule_genid) {
		*genid = ifp->if_traffic_rule_genid;
		return TRUE;
	}
	return FALSE;
}

__private_extern__ void
ifnet_update_traffic_rule_count(ifnet_t ifp, uint32_t count)
{
	os_atomic_store(&ifp->if_traffic_rule_count, count, release);
	ifnet_update_traffic_rule_genid(ifp);
}
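
/*
 * Illustrative sketch (not part of the original source): how a consumer of
 * the generation counter above might decide when to refresh cached traffic
 * rule state.  The cached_genid variable and refresh_cached_rules() helper
 * are hypothetical names used only for this example.
 */
#if 0
static uint32_t cached_genid;

static void
traffic_rules_poll(ifnet_t ifp)
{
	/* ifnet_sync_traffic_rule_genid() returns TRUE (and updates
	 * cached_genid) only when the interface's generation id moved since
	 * the last call, i.e. after ifnet_update_traffic_rule_genid() or
	 * ifnet_update_traffic_rule_count() ran. */
	if (ifnet_sync_traffic_rule_genid(ifp, &cached_genid)) {
		refresh_cached_rules(ifp);      /* hypothetical helper */
	}
}
#endif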

static void
log_hexdump(void *data, size_t len)
{
	size_t i, j, k;
	unsigned char *ptr = (unsigned char *)data;
#define MAX_DUMP_BUF 32
	unsigned char buf[3 * MAX_DUMP_BUF + 1];

	for (i = 0; i < len; i += MAX_DUMP_BUF) {
		for (j = i, k = 0; j < i + MAX_DUMP_BUF && j < len; j++) {
			unsigned char msnbl = ptr[j] >> 4;
			unsigned char lsnbl = ptr[j] & 0x0f;

			buf[k++] = msnbl < 10 ? msnbl + '0' : msnbl + 'a' - 10;
			buf[k++] = lsnbl < 10 ? lsnbl + '0' : lsnbl + 'a' - 10;

			if ((j % 2) == 1) {
				buf[k++] = ' ';
			}
			if ((j % MAX_DUMP_BUF) == MAX_DUMP_BUF - 1) {
				buf[k++] = ' ';
			}
		}
		buf[k] = 0;
		os_log(OS_LOG_DEFAULT, "%3lu: %s", i, buf);
	}
}
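
/*
 * Illustrative sketch (not part of the original source): an example call to
 * log_hexdump() above and the approximate log line it produces.  The sample
 * buffer and the wrapper function name are assumptions for demonstration.
 */
#if 0
static void
log_hexdump_example(void)
{
	uint8_t sample[8] = { 0xde, 0xad, 0xbe, 0xef, 0x00, 0x01, 0x02, 0x03 };

	/* Emits roughly:
	 *   0: dead beef 0001 0203
	 * i.e. the byte offset, then hex digits grouped in pairs of bytes,
	 * one log line per 32 bytes of input (MAX_DUMP_BUF). */
	log_hexdump(sample, sizeof(sample));
}
#endif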

#if SKYWALK && defined(XNU_TARGET_OS_OSX)
static bool
net_check_compatible_if_filter(struct ifnet *ifp)
{
	if (ifp == NULL) {
		if (net_api_stats.nas_iflt_attach_count > net_api_stats.nas_iflt_attach_os_count) {
			return false;
		}
	} else {
		if (ifp->if_flt_non_os_count > 0) {
			return false;
		}
	}
	return true;
}
#endif /* SKYWALK && XNU_TARGET_OS_OSX */

#define DUMP_BUF_CHK() {        \
	clen -= k;              \
	if (clen < 1)           \
		goto done;      \
	c += k;                 \
}

int dlil_dump_top_if_qlen(char *, int);
int
dlil_dump_top_if_qlen(char *str, int str_len)
{
	char *c = str;
	int k, clen = str_len;
	struct ifnet *top_ifcq_ifp = NULL;
	uint32_t top_ifcq_len = 0;
	struct ifnet *top_inq_ifp = NULL;
	uint32_t top_inq_len = 0;

	for (int ifidx = 1; ifidx < if_index; ifidx++) {
		struct ifnet *ifp = ifindex2ifnet[ifidx];
		struct dlil_ifnet *dl_if = (struct dlil_ifnet *)ifp;

		if (ifp == NULL) {
			continue;
		}
		if (ifp->if_snd != NULL && ifp->if_snd->ifcq_len > top_ifcq_len) {
			top_ifcq_len = ifp->if_snd->ifcq_len;
			top_ifcq_ifp = ifp;
		}
		if (dl_if->dl_if_inpstorage.dlth_pkts.qlen > top_inq_len) {
			top_inq_len = dl_if->dl_if_inpstorage.dlth_pkts.qlen;
			top_inq_ifp = ifp;
		}
	}

	if (top_ifcq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop ifcq_len %u packets by %s\n",
		    top_ifcq_len, top_ifcq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
	if (top_inq_ifp != NULL) {
		k = scnprintf(c, clen, "\ntop inq_len %u packets by %s\n",
		    top_inq_len, top_inq_ifp->if_xname);
		DUMP_BUF_CHK();
	}
done:
	return str_len - clen;
}
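
/*
 * Illustrative sketch (not part of the original source): calling the dump
 * helper above into a stack buffer and logging the result.  The buffer
 * size, wrapper name and os_log call are assumptions; the helper returns
 * the number of bytes it wrote into str.
 */
#if 0
static void
dlil_log_top_if_qlen(void)
{
	char qlen_buf[256];
	int written;

	written = dlil_dump_top_if_qlen(qlen_buf, (int)sizeof(qlen_buf));
	if (written > 0) {
		os_log(OS_LOG_DEFAULT, "%.*s", written, qlen_buf);
	}
}
#endif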