1 | /* |
2 | * Copyright (c) 2015-2023 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | /* |
30 | * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. |
31 | * |
32 | * Redistribution and use in source and binary forms, with or without |
33 | * modification, are permitted provided that the following conditions |
34 | * are met: |
35 | * 1. Redistributions of source code must retain the above copyright |
36 | * notice, this list of conditions and the following disclaimer. |
37 | * 2. Redistributions in binary form must reproduce the above copyright |
38 | * notice, this list of conditions and the following disclaimer in the |
39 | * documentation and/or other materials provided with the distribution. |
40 | * |
41 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
51 | * SUCH DAMAGE. |
52 | */ |
53 | |
54 | /* |
55 | * BSD LICENSE |
56 | * |
57 | * Copyright(c) 2015 NEC Europe Ltd. All rights reserved. |
58 | * All rights reserved. |
59 | * |
60 | * Redistribution and use in source and binary forms, with or without |
61 | * modification, are permitted provided that the following conditions |
62 | * are met: |
63 | * |
64 | * * Redistributions of source code must retain the above copyright |
65 | * notice, this list of conditions and the following disclaimer. |
66 | * * Redistributions in binary form must reproduce the above copyright |
67 | * notice, this list of conditions and the following disclaimer in |
68 | * the documentation and/or other materials provided with the |
69 | * distribution. |
70 | * * Neither the name of NEC Europe Ltd. nor the names of |
71 | * its contributors may be used to endorse or promote products derived |
72 | * from this software without specific prior written permission. |
73 | * |
74 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
75 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
76 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
77 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
78 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
79 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
80 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
81 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
82 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
83 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
84 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
85 | */ |
86 | |
87 | #include <skywalk/os_skywalk_private.h> |
88 | #include <skywalk/nexus/flowswitch/nx_flowswitch.h> |
89 | #include <skywalk/nexus/flowswitch/fsw_var.h> |
90 | #include <skywalk/nexus/netif/nx_netif.h> |
91 | #include <skywalk/nexus/netif/nx_netif_compat.h> |
92 | #include <kern/sched_prim.h> |
93 | #include <sys/kdebug.h> |
94 | #include <sys/sdt.h> |
95 | #include <net/bpf.h> |
96 | #include <net/if_ports_used.h> |
97 | #include <net/pktap.h> |
98 | #include <net/pktsched/pktsched_netem.h> |
99 | #include <netinet/tcp.h> |
100 | #include <netinet/udp.h> |
101 | #include <netinet/ip.h> |
102 | #include <netinet/ip6.h> |
103 | #include <netinet/in_var.h> |
104 | |
105 | extern kern_return_t thread_terminate(thread_t); |
106 | |
107 | #define FSW_ZONE_MAX 256 |
108 | #define FSW_ZONE_NAME "skywalk.nx.fsw" |
109 | |
110 | static uint64_t fsw_reap_last __sk_aligned(8); |
111 | static uint64_t fsw_want_purge __sk_aligned(8); |
112 | |
113 | #define NX_FSW_FE_TABLESZ 256 /* some power of 2 */ |
114 | static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ; |
115 | |
116 | #define NX_FSW_FOB_HASHSZ 31 /* some mersenne prime */ |
117 | static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ; |
118 | |
119 | #define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */ |
120 | static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ; |
121 | |
#define NX_FSW_FRIB_HASHSZ 13  /* a small prime */
123 | static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ; |
124 | |
125 | #define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */ |
126 | static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL; |
127 | |
128 | #define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */ |
129 | static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES; |
130 | |
131 | #define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval)) |
132 | #define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5) |
133 | #define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5) |
134 | #define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5) |
135 | #define FSW_IFSTATS_THRES 1 |
136 | |
137 | #define NX_FSW_CHANNEL_REAP_THRES 1000 /* threshold (bytes/sec) for reaping*/ |
138 | uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES; |
139 | |
140 | #define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */ |
141 | |
142 | uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */ |
143 | uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */ |
144 | uint32_t fsw_gso_batch = 8; |
145 | #if (DEVELOPMENT || DEBUG) |
146 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch, |
147 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0, |
148 | "flowswitch Rx batch size" ); |
149 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch, |
150 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0, |
151 | "flowswitch Tx batch size" ); |
152 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch, |
153 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0, |
154 | "flowswitch GSO batch size" ); |
155 | SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput, |
156 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh, |
157 | "flowswitch channel reap threshold throughput (bytes/sec)" ); |
#endif /* DEVELOPMENT || DEBUG */
159 | |
160 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp, |
161 | CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0, |
162 | "flowswitch RX aggregation for tcp flows (enable/disable)" ); |
163 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host, |
164 | CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0, |
165 | "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))" ); |
166 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu, |
167 | CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0, |
168 | "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)" ); |
169 | |
170 | /* |
171 | * IP reassembly |
172 | * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force |
173 | * enable/disable the reassembly routine regardless of whether the |
174 | * transport netagent is enabled or not. |
175 | * |
176 | * 'fsw_ip_reass' is a tri-state: |
177 | * 0 means force IP reassembly off |
178 | * 1 means force IP reassembly on |
179 | * 2 means don't force the value, use what's appropriate for this flowswitch |
180 | */ |
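/*
 * For example (using the tri-state values described above), reassembly can
 * be forced on from user space with
 *     sysctl kern.skywalk.flowswitch.ip_reass=1
 * and returned to the default per-flowswitch behavior with a value of 2.
 */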
181 | #define FSW_IP_REASS_FORCE_OFF 0 |
182 | #define FSW_IP_REASS_FORCE_ON 1 |
183 | #define FSW_IP_REASS_NO_FORCE 2 |
184 | |
185 | uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE; |
186 | |
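/*
 * Sysctl handler for "kern.skywalk.flowswitch.ip_reass"; accepts only the
 * three values defined above and rejects anything larger with EINVAL.
 */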
187 | static int |
188 | fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS |
189 | { |
190 | #pragma unused(oidp, arg1, arg2) |
191 | unsigned int new_value; |
192 | int changed; |
193 | int error; |
194 | |
    error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
        &new_value, &changed);
197 | if (error == 0 && changed != 0) { |
198 | if (new_value > FSW_IP_REASS_NO_FORCE) { |
199 | return EINVAL; |
200 | } |
201 | fsw_ip_reass = new_value; |
202 | } |
203 | return error; |
204 | } |
205 | |
206 | SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass, |
207 | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, |
208 | 0, 0, fsw_ip_reass_sysctl, "IU" , |
209 | "adjust flowswitch IP reassembly" ); |
210 | |
211 | #if (DEVELOPMENT || DEBUG) |
212 | static uint64_t _fsw_inject_error = 0; |
213 | #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \ |
214 | _SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \ |
215 | &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__) |
216 | |
217 | #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \ |
218 | if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \ |
219 | SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\ |
220 | if ((_f) != NULL) \ |
221 | (_f)(__VA_ARGS__); \ |
222 | } \ |
223 | } while (0) |
224 | |
225 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets, |
226 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "" ); |
227 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size, |
228 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "" ); |
229 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets, |
230 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "" ); |
231 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, |
232 | flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED, |
233 | &fsw_flow_route_id_buckets, 0, "" ); |
234 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval, |
235 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "" ); |
236 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh, |
237 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "" ); |
238 | SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error, |
239 | CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "" ); |
240 | #else |
241 | #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0) |
242 | #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0) |
243 | #endif /* !DEVELOPMENT && !DEBUG */ |
244 | |
245 | static void fsw_linger_remove_internal(struct flow_entry_linger_head *, |
246 | struct flow_entry *); |
247 | static void fsw_reap_thread_func(void *, wait_result_t); |
248 | static void fsw_reap_thread_cont(void *, wait_result_t); |
249 | static void fsw_purge_cache(struct nx_flowswitch *, boolean_t); |
250 | static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t); |
251 | static uint32_t fsw_process_deferred(struct nx_flowswitch *); |
252 | static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *); |
253 | |
254 | static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *, |
255 | struct __kern_packet *); |
256 | |
257 | static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t); |
258 | static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t, |
259 | uint32_t, uint32_t); |
260 | |
261 | static int __fsw_dp_inited = 0; |
262 | |
263 | int |
264 | fsw_dp_init(void) |
265 | { |
266 | _CASSERT(FSW_VP_DEV == 0); |
267 | _CASSERT(FSW_VP_HOST == 1); |
268 | _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN); |
269 | _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT); |
270 | |
271 | ASSERT(!__fsw_dp_inited); |
272 | |
273 | flow_mgr_init(); |
274 | flow_init(); |
275 | |
276 | __fsw_dp_inited = 1; |
277 | |
278 | return 0; |
279 | } |
280 | |
281 | void |
282 | fsw_dp_uninit(void) |
283 | { |
284 | if (__fsw_dp_inited) { |
285 | flow_fini(); |
286 | flow_mgr_fini(); |
287 | |
288 | __fsw_dp_inited = 0; |
289 | } |
290 | } |
291 | |
292 | static void |
293 | dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq) |
294 | { |
295 | pp_free_pktq(pktq); |
296 | } |
297 | |
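/*
 * Convenience macro: account for and free every packet left in "pktq",
 * bumping the FSW_STATS_DROP counter by the queue length.
 */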
298 | #define dp_drop_pktq(fsw, pktq) do { \ |
299 | uint32_t _len = KPKTQ_LEN(pktq); \ |
300 | if (KPKTQ_EMPTY(pktq)) { \ |
301 | ASSERT(_len == 0); \ |
302 | return; \ |
303 | } \ |
304 | SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \ |
305 | FSW_STATS_ADD(FSW_STATS_DROP, _len); \ |
306 | DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \ |
307 | dp_free_pktq(fsw, pktq); \ |
308 | } while (0) |
309 | |
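/*
 * Mirror the packets queued on a flow entry (Rx or Tx, per "input") to the
 * pktap interface, tagging them with the flow's owning and effective
 * processes.  Host-port traffic is only tapped here early for TCP input;
 * everything else on the host port is tapped in the DLIL input path.
 */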
310 | SK_NO_INLINE_ATTRIBUTE |
311 | void |
312 | fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input) |
313 | { |
314 | pid_t pid; |
315 | char proc_name_buf[FLOW_PROCESS_NAME_LENGTH]; |
316 | char *proc_name = NULL; |
317 | pid_t epid; |
318 | char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH]; |
319 | char *eproc_name = NULL; |
320 | sa_family_t af; |
321 | bool tap_early = false; |
322 | struct __kern_packet *pkt; |
323 | |
324 | ASSERT(fe != NULL); |
325 | ASSERT(fsw->fsw_ifp != NULL); |
326 | |
327 | if (fe->fe_nx_port == FSW_VP_HOST) { |
328 | /* allow packets to be tapped before aggregation happens */ |
329 | tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP); |
330 | if (!tap_early) { |
331 | /* all other traffic will be tapped in the dlil input path */ |
332 | return; |
333 | } |
334 | } |
335 | if (fe->fe_key.fk_ipver == IPVERSION) { |
336 | af = AF_INET; |
337 | } else if (fe->fe_key.fk_ipver == IPV6_VERSION) { |
338 | af = AF_INET6; |
339 | } else { |
340 | return; |
341 | } |
342 | |
343 | pid = fe->fe_pid; |
344 | if (fe->fe_proc_name[0] != '\0') { |
        (void) strlcpy(proc_name_buf, fe->fe_proc_name,
            sizeof(proc_name_buf));
347 | proc_name = proc_name_buf; |
348 | } |
349 | epid = fe->fe_epid; |
350 | if (fe->fe_eproc_name[0] != '\0') { |
        (void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
            sizeof(eproc_name_buf));
353 | eproc_name = eproc_name_buf; |
354 | } |
355 | if (input) { |
356 | KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) { |
357 | pktap_input_packet(fsw->fsw_ifp, af, |
358 | fsw->fsw_ifp_dlt, pid, proc_name, epid, |
359 | eproc_name, SK_PKT2PH(pkt), NULL, 0, |
360 | IPPROTO_TCP, fe->fe_flowid, |
361 | tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN); |
362 | } |
363 | } else { |
364 | KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) { |
365 | pktap_output_packet(fsw->fsw_ifp, af, |
366 | fsw->fsw_ifp_dlt, pid, proc_name, epid, |
367 | eproc_name, SK_PKT2PH(pkt), NULL, 0, |
368 | 0, 0, PTH_FLAG_NEXUS_CHAN); |
369 | } |
370 | } |
371 | } |
372 | |
373 | #if (DEVELOPMENT || DEBUG) |
374 | static void |
375 | _fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt, |
376 | int *ret) |
377 | { |
378 | static boolean_t _err35_flag_modified = FALSE; |
379 | |
380 | switch (step) { |
381 | case 1: |
382 | if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) == |
383 | (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) { |
384 | fr->fr_flags &= ~FLOWRTF_RESOLVED; |
385 | _err35_flag_modified = TRUE; |
386 | } |
387 | break; |
388 | |
389 | case 2: |
390 | if (!_err35_flag_modified) { |
391 | return; |
392 | } |
393 | if (pkt->pkt_pflags & PKT_F_MBUF_DATA) { |
394 | m_freem(pkt->pkt_mbuf); |
395 | pkt->pkt_pflags &= ~PKT_F_MBUF_DATA; |
396 | pkt->pkt_mbuf = NULL; |
397 | } |
398 | *ret = EJUSTRETURN; |
399 | fr->fr_flags |= FLOWRTF_RESOLVED; |
400 | _err35_flag_modified = FALSE; |
401 | break; |
402 | |
403 | default: |
404 | VERIFY(0); |
405 | /* not reached */ |
406 | } |
407 | } |
408 | |
409 | static void |
410 | _fsw_error36_handler(int step, struct flow_route *fr, int *ret) |
411 | { |
412 | static boolean_t _err36_flag_modified = FALSE; |
413 | |
414 | switch (step) { |
415 | case 1: |
416 | if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) == |
417 | (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) { |
418 | fr->fr_flags &= ~FLOWRTF_RESOLVED; |
419 | _err36_flag_modified = TRUE; |
420 | } |
421 | break; |
422 | |
423 | case 2: |
424 | if (!_err36_flag_modified) { |
425 | return; |
426 | } |
427 | *ret = ENETUNREACH; |
428 | fr->fr_flags |= FLOWRTF_RESOLVED; |
429 | _err36_flag_modified = FALSE; |
430 | break; |
431 | |
432 | default: |
433 | VERIFY(0); |
434 | /* not reached */ |
435 | } |
436 | } |
437 | #else /* !DEVELOPMENT && !DEBUG */ |
438 | #define _fsw_error35_handler(...) |
439 | #define _fsw_error36_handler(...) |
440 | #endif /* DEVELOPMENT || DEBUG */ |
441 | |
/*
 * Check if the source packet content can fit into the destination
 * ring's packet. Returns TRUE if the source packet can fit.
 * Note: Failures could be caused by misconfigured packet pool sizes,
 * a missing packet size check against the MTU, or a source packet from
 * a compat netif whose attached mbuf is larger than the MTU due to LRO.
 */
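/*
 * Illustrative check with hypothetical sizes: a destination packet made of
 * two 2048-byte buflets and zero headroom can absorb 4096 bytes, so a
 * 3000-byte source payload passes while a 5000-byte one fails.
 */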
449 | static inline boolean_t |
450 | validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph, |
451 | uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom, |
452 | uint32_t *copy_len) |
453 | { |
454 | uint32_t tlen = 0; |
455 | uint32_t splen = spkt->pkt_length - skip_l2hlen; |
456 | |
457 | if (l2hlen != 0) { |
458 | VERIFY(skip_l2hlen == 0); |
459 | tlen += l2hlen; |
460 | } else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) { |
461 | splen -= ETHER_CRC_LEN; |
462 | } |
463 | |
464 | tlen += splen; |
465 | *copy_len = splen; |
466 | |
    return tlen <= ((__packet_get_buflet_count(dph) *
        PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
        headroom);
470 | } |
471 | |
472 | #if SK_LOG |
473 | /* Hoisted out of line to reduce kernel stack footprint */ |
474 | SK_LOG_ATTRIBUTE |
475 | static void |
476 | copy_packet_from_dev_log(struct __kern_packet *spkt, |
477 | struct __kern_packet *dpkt, struct proc *p) |
478 | { |
479 | uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) | |
480 | ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ? |
481 | SK_VERB_COPY_MBUF : SK_VERB_COPY)); |
482 | char *daddr; |
483 | MD_BUFLET_ADDR_ABS(dpkt, daddr); |
484 | SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u" , |
485 | sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length, |
486 | dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom, |
487 | (uint32_t)dpkt->pkt_l2_len); |
488 | SK_DF(logflags | SK_VERB_DUMP, "%s" , |
489 | sk_dump("buf" , daddr, dpkt->pkt_length, 128, NULL, 0)); |
490 | } |
491 | #else |
492 | #define copy_packet_from_dev_log(...) |
493 | #endif /* SK_LOG */ |
494 | |
495 | |
496 | static inline int |
497 | copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
498 | struct __kern_packet *dpkt) |
499 | { |
500 | /* |
501 | * source and destination nexus don't share the packet pool |
502 | * sync operation here is to |
503 | * - alloc packet for the rx(dst) ring |
504 | * - copy data/metadata from src packet to dst packet |
505 | * - attach alloc'd packet to rx(dst) ring |
506 | */ |
507 | kern_packet_t dph = SK_PTR_ENCODE(dpkt, |
508 | METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt)); |
509 | kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt), |
510 | METADATA_SUBTYPE(spkt)); |
511 | boolean_t do_cksum_rx; |
512 | uint16_t skip_l2h_len = spkt->pkt_l2_len; |
513 | uint16_t iphlen; |
514 | uint32_t dlen; |
515 | int err; |
516 | |
517 | if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0, |
518 | &dlen))) { |
519 | SK_ERR("bufcnt %d, bufsz %d" , __packet_get_buflet_count(dph), |
520 | PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp)); |
521 | FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN); |
522 | return EINVAL; |
523 | } |
524 | |
525 | /* Copy packet metadata */ |
526 | _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum); |
527 | _PKT_COPY(spkt, dpkt); |
528 | ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) || |
529 | PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp)); |
530 | ASSERT(dpkt->pkt_mbuf == NULL); |
531 | |
532 | dpkt->pkt_headroom = 0; |
533 | dpkt->pkt_l2_len = 0; |
534 | |
535 | /* don't include IP header from partial sum */ |
536 | if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) { |
537 | iphlen = spkt->pkt_flow_ip_hlen; |
538 | do_cksum_rx = sk_cksum_rx; |
539 | } else { |
540 | iphlen = 0; |
541 | do_cksum_rx = FALSE; |
542 | } |
543 | |
544 | /* Copy packet payload */ |
545 | if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) && |
546 | (spkt->pkt_pflags & PKT_F_TRUNCATED)) { |
547 | FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT); |
        /*
         * Source packet has truncated contents (just enough for
         * the classifier) of an mbuf from the compat driver; copy
         * the entire mbuf contents to the destination packet.
         */
553 | m_adj(spkt->pkt_mbuf, skip_l2h_len); |
554 | ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen); |
555 | fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0, |
556 | spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen); |
557 | } else { |
558 | FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT); |
559 | /* |
560 | * Source packet has full contents, either from an mbuf |
561 | * that came up from the compat driver, or because it |
562 | * originated on the native driver; copy to destination. |
563 | */ |
564 | fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph, |
565 | (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx, |
566 | iphlen, 0, FALSE); |
567 | } |
568 | |
569 | #if DEBUG || DEVELOPMENT |
570 | if (__improbable(pkt_trailers > 0)) { |
571 | dlen += pkt_add_trailers(dph, dlen, iphlen); |
572 | } |
573 | #endif /* DEBUG || DEVELOPMENT */ |
574 | |
575 | /* Finalize and attach packet to Rx ring */ |
576 | METADATA_ADJUST_LEN(dpkt, 0, 0); |
    err = __packet_finalize(dph);
578 | VERIFY(err == 0); |
579 | |
580 | copy_packet_from_dev_log(spkt, dpkt, kernproc); |
581 | |
582 | if (spkt->pkt_pflags & PKT_F_MBUF_DATA) { |
        ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
        mbuf_free(spkt->pkt_mbuf);
585 | KPKT_CLEAR_MBUF_DATA(spkt); |
586 | } else { |
587 | fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph); |
588 | } |
589 | |
590 | if (__probable(do_cksum_rx != 0)) { |
591 | FSW_STATS_INC(FSW_STATS_RX_COPY_SUM); |
592 | } |
593 | |
594 | return 0; |
595 | } |
596 | |
597 | SK_NO_INLINE_ATTRIBUTE |
598 | static struct __kern_packet * |
599 | rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
600 | { |
601 | char *pkt_buf; |
602 | void *l3_hdr; |
603 | uint16_t nfrags, tlen; |
604 | int err = 0; |
605 | |
606 | switch (fsw_ip_reass) { |
607 | case FSW_IP_REASS_FORCE_OFF: |
608 | return pkt; |
609 | case FSW_IP_REASS_FORCE_ON: |
610 | break; |
611 | default: |
612 | if (!FSW_NETAGENT_ENABLED(fsw) || |
            flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
614 | return pkt; |
615 | } |
616 | break; |
617 | } |
618 | |
619 | MD_BUFLET_ADDR_ABS(pkt, pkt_buf); |
620 | l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len; |
621 | |
622 | ASSERT(fsw->fsw_ipfm != NULL); |
623 | ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0); |
624 | |
625 | if (pkt->pkt_flow_ip_ver == IPVERSION) { |
        err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
            (struct ip *)l3_hdr, &nfrags, &tlen);
628 | } else { |
629 | ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION); |
630 | /* we only handle frag header immediately after v6 header */ |
        err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
            (struct ip6_hdr *)l3_hdr,
            (struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
            &nfrags, &tlen);
635 | } |
636 | if (__improbable(err != 0)) { |
637 | /* if we get a bad fragment, free it */ |
638 | pp_free_packet_single(pkt); |
639 | pkt = NULL; |
640 | } else { |
641 | ASSERT(!((pkt != NULL) ^ (nfrags > 0))); |
642 | } |
643 | |
644 | return pkt; |
645 | } |
646 | |
647 | SK_NO_INLINE_ATTRIBUTE |
648 | static void |
649 | rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
650 | { |
651 | ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA); |
652 | uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf); |
653 | kern_packet_t ph = SK_PTR_ENCODE(pkt, |
654 | METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt)); |
655 | /* |
656 | * This is the case when the packet is coming in from |
657 | * compat-netif. This packet only has valid metadata |
658 | * and an attached mbuf. We need to copy enough data |
659 | * from the mbuf to the packet buffer for the |
660 | * classifier. Compat netif packet pool is configured |
661 | * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY |
662 | * which is just enough to hold the protocol headers |
663 | * for the flowswitch classifier. |
664 | */ |
665 | |
666 | pkt->pkt_headroom = 0; |
667 | METADATA_ADJUST_LEN(pkt, 0, 0); |
668 | /* |
669 | * Copy the initial 128 bytes of the packet for |
670 | * classification. |
671 | * Ethernet(14) + IPv6 header(40) + |
672 | * + IPv6 fragment header(8) + |
673 | * TCP header with options(60). |
674 | */ |
675 | fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph, |
676 | pkt->pkt_headroom, pkt->pkt_mbuf, 0, |
677 | MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY), |
678 | FALSE, 0); |
679 | |
680 | int err = __packet_finalize_with_mbuf(pkt); |
681 | VERIFY(err == 0); |
682 | } |
683 | |
684 | static struct __kern_packet * |
685 | rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
686 | { |
687 | pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED; |
688 | |
689 | if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) { |
690 | rx_prepare_packet_mbuf(fsw, pkt); |
691 | } |
692 | |
693 | return pkt; |
694 | } |
695 | |
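/*
 * Map a classified packet to its flow entry.  The 5-tuple of "prev_fe" is
 * tried first as a fast path before falling back to a full flow-manager
 * lookup; parent/child (demuxed) entries are then resolved for both the
 * Rx and Tx directions.  Returns an entry with a reference held, or NULL.
 */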
696 | static struct flow_entry * |
697 | lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt, |
698 | bool input, struct flow_entry *prev_fe) |
699 | { |
700 | struct flow_key key __sk_aligned(16); |
701 | struct flow_entry *fe = NULL; |
702 | |
703 | ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED); |
    flow_pkt2key(pkt, input, &key);
705 | |
706 | if (__probable(prev_fe != NULL && |
707 | prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) { |
708 | uint16_t saved_mask = key.fk_mask; |
709 | key.fk_mask = FKMASK_5TUPLE; |
        if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
            flow_entry_retain(prev_fe);
712 | fe = prev_fe; |
713 | } else { |
714 | key.fk_mask = saved_mask; |
715 | } |
716 | } |
717 | |
718 | top: |
719 | if (__improbable(fe == NULL)) { |
720 | fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key); |
721 | } |
722 | |
723 | if (__improbable(fe != NULL && |
724 | (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) { |
725 | /* Rx */ |
726 | if (input) { |
727 | if (fe->fe_flags & FLOWENTF_PARENT) { |
728 | struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt); |
729 | if (child_fe != NULL) { |
                    flow_entry_release(&fe);
                    fe = child_fe;
                }
            } else {
                if (!rx_flow_demux_match(fsw, fe, pkt)) {
                    flow_entry_release(&fe);
736 | fe = NULL; |
737 | goto top; |
738 | } |
739 | } |
740 | } else { |
741 | /* Tx */ |
742 | if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) { |
743 | if (__probable(fe->fe_flags & FLOWENTF_PARENT)) { |
744 | struct flow_entry *parent_fe = fe; |
745 | fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id); |
                    flow_entry_release(&parent_fe);
                } else {
                    flow_entry_release(&fe);
749 | fe = NULL; |
750 | goto top; |
751 | } |
752 | } |
753 | } |
754 | } |
755 | |
756 | SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]); |
757 | SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP, |
758 | "%s %s %s \"%s\" fe 0x%llx" , |
759 | input ? "Rx" : "Tx" , if_name(fsw->fsw_ifp), |
760 | sk_proc_name_address(current_proc()), |
761 | fk_as_string(&key, fkbuf, sizeof(fkbuf)), |
762 | SK_KVA(fe)); |
763 | |
764 | return fe; |
765 | } |
766 | |
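/*
 * Decide whether an inbound packet may be delivered to a 2-tuple listener
 * flow: accept well-known local destinations (loopback, link-local,
 * multicast, broadcast, etc.) and unicast destinations that match an
 * address configured on an interface, subject to the forwarding checks
 * below.
 */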
767 | SK_NO_INLINE_ATTRIBUTE |
768 | static bool |
769 | pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt) |
770 | { |
771 | struct nx_flowswitch *fsw = fe->fe_fsw; |
772 | struct ifnet *ifp = fsw->fsw_ifp; |
773 | struct in_ifaddr *ia = NULL; |
774 | struct in_ifaddr *best_ia = NULL; |
775 | struct in6_ifaddr *ia6 = NULL; |
776 | struct in6_ifaddr *best_ia6 = NULL; |
777 | struct ifnet *match_ifp = NULL; |
778 | struct __flow *flow = pkt->pkt_flow; |
779 | bool result = false; |
780 | |
781 | ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED); |
782 | |
783 | if (flow->flow_ip_ver == IPVERSION) { |
784 | if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) || |
785 | IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) || |
786 | IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) || |
787 | IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) || |
788 | IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) || |
789 | IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) || |
790 | INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) { |
791 | result = true; |
792 | goto done; |
793 | } |
794 | |
795 | /* |
796 | * Check for a match in the hash bucket. |
797 | */ |
        lck_rw_lock_shared(&in_ifaddr_rwlock);
799 | TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) { |
800 | if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) { |
801 | best_ia = ia; |
802 | match_ifp = ia->ia_ifp; |
803 | |
804 | if (match_ifp == ifp) { |
805 | break; |
806 | } |
                /*
                 * Continue the loop in case there's an exact match
                 * with another interface.
                 */
811 | } |
812 | } |
813 | |
814 | if (best_ia != NULL) { |
815 | if (match_ifp != ifp && ipforwarding == 0 && |
816 | (match_ifp->if_family == IFNET_FAMILY_IPSEC || |
817 | match_ifp->if_family == IFNET_FAMILY_UTUN)) { |
818 | /* |
819 | * Drop when interface address check is strict and forwarding |
820 | * is disabled |
821 | */ |
822 | } else { |
                lck_rw_done(&in_ifaddr_rwlock);
824 | result = true; |
825 | goto done; |
826 | } |
827 | } |
        lck_rw_done(&in_ifaddr_rwlock);
829 | |
830 | if (ifp->if_flags & IFF_BROADCAST) { |
831 | /* |
832 | * Check for broadcast addresses. |
833 | * |
834 | * Only accept broadcast packets that arrive via the matching |
835 | * interface. Reception of forwarded directed broadcasts would be |
836 | * handled via ip_forward() and ether_frameout() with the loopback |
837 | * into the stack for SIMPLEX interfaces handled by ether_frameout(). |
838 | */ |
839 | struct ifaddr *ifa; |
840 | |
841 | ifnet_lock_shared(ifp); |
842 | TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { |
843 | if (ifa->ifa_addr->sa_family != AF_INET) { |
844 | continue; |
845 | } |
846 | ia = ifatoia(ifa); |
847 | if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr || |
848 | ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) { |
849 | ifnet_lock_done(ifp); |
850 | result = true; |
851 | goto done; |
852 | } |
853 | } |
854 | ifnet_lock_done(ifp); |
855 | } |
856 | } else { |
857 | if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) || |
858 | IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) || |
859 | IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) { |
860 | result = true; |
861 | goto done; |
862 | } |
863 | |
864 | /* |
865 | * Check for exact addresses in the hash bucket. |
866 | */ |
        lck_rw_lock_shared(&in6_ifaddr_rwlock);
868 | TAILQ_FOREACH(ia6, IN6ADDR_HASH(&flow->flow_ipv6_dst), ia6_hash) { |
869 | if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst, ia6->ia_ifp->if_index, ifp->if_index)) { |
870 | if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) { |
871 | continue; |
872 | } |
873 | best_ia6 = ia6; |
874 | if (ia6->ia_ifp == ifp) { |
875 | break; |
876 | } |
                /*
                 * Continue the loop in case there's an exact match
                 * with another interface.
                 */
881 | } |
882 | } |
883 | if (best_ia6 != NULL) { |
884 | if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 && |
885 | (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC || |
886 | best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) { |
887 | /* |
888 | * Drop when interface address check is strict and forwarding |
889 | * is disabled |
890 | */ |
891 | } else { |
                lck_rw_done(&in6_ifaddr_rwlock);
893 | result = true; |
894 | goto done; |
895 | } |
896 | } |
        lck_rw_done(&in6_ifaddr_rwlock);
898 | } |
899 | |
    /*
     * In forwarding mode, if the destination address of the packet
     * does not match any interface address, it may be destined to
     * the client device.
     */
905 | SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW, |
906 | "Rx flow does not match interface address" ); |
907 | done: |
908 | return result; |
909 | } |
910 | |
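/*
 * Rx-side wrapper around lookup_flow_with_pkt(): drops the reference and
 * returns NULL if the flow is missing, is a listener that should not see
 * this packet, or has already been torn down.
 */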
911 | static struct flow_entry * |
912 | rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, |
913 | struct flow_entry *prev_fe) |
914 | { |
915 | struct flow_entry *fe; |
916 | |
917 | fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe); |
918 | _FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe); |
919 | if (fe == NULL) { |
920 | FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND); |
921 | return NULL; |
922 | } |
923 | |
924 | if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE && |
925 | fe->fe_flags & FLOWENTF_LISTENER) && |
926 | !pkt_is_for_listener(fe, pkt)) { |
927 | FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER); |
        flow_entry_release(&fe);
929 | return NULL; |
930 | } |
931 | |
932 | if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) { |
933 | FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN); |
934 | SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW, |
935 | "Rx flow torn down" ); |
        flow_entry_release(&fe);
937 | fe = NULL; |
938 | } |
939 | |
940 | return fe; |
941 | } |
942 | |
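/*
 * Append an Rx packet (possibly a reassembled fragment chain) to the flow
 * entry's Rx queue; the entry is added to "fes" the first time it gets a
 * packet, otherwise the extra reference taken by the lookup is dropped.
 */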
943 | static inline void |
944 | rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe, |
945 | struct __kern_packet *pkt) |
946 | { |
947 | if (__improbable(pkt->pkt_flow_ip_is_frag)) { |
948 | fe->fe_rx_frag_count++; |
949 | } |
950 | |
951 | /* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */ |
952 | if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) { |
953 | ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0); |
954 | TAILQ_INSERT_TAIL(fes, fe, fe_rx_link); |
955 | KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt); |
956 | } else { |
957 | ASSERT(!TAILQ_EMPTY(fes)); |
958 | KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt); |
        flow_entry_release(&fe);
960 | } |
961 | } |
962 | |
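/*
 * Tx counterpart of rx_flow_batch_packet(): record fragment-continuation
 * state for the flow and append the packet to its Tx queue.
 */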
963 | static void |
964 | tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe, |
965 | struct __kern_packet *pkt) |
966 | { |
967 | /* record frag continuation */ |
968 | if (__improbable(pkt->pkt_flow_ip_is_first_frag)) { |
969 | ASSERT(pkt->pkt_flow_ip_is_frag); |
970 | fe->fe_tx_is_cont_frag = true; |
971 | fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id; |
972 | } else if (__probable(!pkt->pkt_flow_ip_is_frag)) { |
973 | fe->fe_tx_is_cont_frag = false; |
974 | fe->fe_tx_frag_id = 0; |
975 | } |
976 | |
977 | if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) { |
978 | ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0); |
979 | TAILQ_INSERT_TAIL(fes, fe, fe_tx_link); |
980 | KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt); |
981 | } else { |
982 | ASSERT(!TAILQ_EMPTY(fes)); |
983 | KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt); |
        flow_entry_release(&fe);
985 | } |
986 | } |
987 | |
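/*
 * Detach up to n_pkts_max packets from the device Rx ring slots in
 * [ckr_khead, ckr_rhead) into "pktq", dropping zero-length or
 * QUM_F_DROPPED packets along the way, and advance the ring pointers.
 */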
988 | static inline void |
989 | fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
990 | uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes) |
991 | { |
992 | uint32_t n_pkts = 0; |
993 | slot_idx_t idx, idx_end; |
994 | idx = r->ckr_khead; |
995 | idx_end = r->ckr_rhead; |
996 | |
997 | ASSERT(KPKTQ_EMPTY(pktq)); |
998 | *n_bytes = 0; |
999 | for (; n_pkts < n_pkts_max && idx != idx_end; |
        idx = SLOT_NEXT(idx, r->ckr_lim)) {
1001 | struct __kern_slot_desc *ksd = KR_KSD(r, idx); |
1002 | struct __kern_packet *pkt = ksd->sd_pkt; |
1003 | |
1004 | ASSERT(pkt->pkt_nextpkt == NULL); |
        KR_SLOT_DETACH_METADATA(r, ksd);
1006 | |
1007 | _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags, |
1008 | pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func); |
1009 | if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0)) |
1010 | || (pkt->pkt_length == 0)) { |
1011 | FSW_STATS_INC(FSW_STATS_DROP); |
1012 | pp_free_packet_single(pkt); |
1013 | continue; |
1014 | } |
1015 | n_pkts++; |
1016 | *n_bytes += pkt->pkt_length; |
1017 | |
1018 | KPKTQ_ENQUEUE(pktq, pkt); |
1019 | } |
1020 | r->ckr_khead = idx; |
    r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1022 | } |
1023 | |
1024 | /* |
1025 | * This is only for estimating how many packets each GSO packet will need. |
1026 | * The number does not need to be exact because any leftover packets allocated |
1027 | * will be freed. |
1028 | */ |
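/*
 * Worked example with hypothetical numbers: a 31,440-byte IPv4 TSO packet
 * (20-byte IP + 20-byte TCP headers) with an MSS of 1448 yields
 * SK_ROUNDUP(31400, 1448) / 1448 = 22 estimated packets.
 */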
1029 | static uint32_t |
1030 | estimate_gso_pkts(struct __kern_packet *pkt) |
1031 | { |
1032 | packet_tso_flags_t tso_flags; |
1033 | uint16_t mss; |
1034 | uint32_t n_pkts = 0, total_hlen = 0, total_len = 0; |
1035 | |
1036 | tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS; |
1037 | mss = pkt->pkt_proto_seg_sz; |
1038 | |
1039 | if (tso_flags == PACKET_TSO_IPV4) { |
1040 | total_hlen = sizeof(struct ip) + sizeof(struct tcphdr); |
1041 | } else if (tso_flags == PACKET_TSO_IPV6) { |
1042 | total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); |
1043 | } |
1044 | if (total_hlen != 0 && mss != 0) { |
1045 | total_len = pkt->pkt_length; |
1046 | n_pkts = (uint32_t) |
1047 | (SK_ROUNDUP((total_len - total_hlen), mss) / mss); |
1048 | } |
1049 | DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags, |
1050 | uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss, |
1051 | uint32_t, n_pkts); |
1052 | return n_pkts; |
1053 | } |
1054 | |
1055 | /* |
1056 | * This function retrieves a chain of packets of the same type only |
1057 | * (GSO or non-GSO). |
1058 | */ |
1059 | static inline void |
1060 | fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw, |
1061 | struct __kern_channel_ring *r, uint32_t n_pkts_max, |
1062 | struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate) |
1063 | { |
1064 | uint32_t n_pkts = 0; |
1065 | slot_idx_t idx, idx_end; |
1066 | idx = r->ckr_khead; |
1067 | idx_end = r->ckr_rhead; |
1068 | struct nexus_vp_adapter *vpna = VPNA(KRNA(r)); |
1069 | boolean_t gso_enabled, gso_required; |
1070 | uint32_t gso_pkts; |
1071 | |
1072 | gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW); |
1073 | ASSERT(KPKTQ_EMPTY(pktq)); |
1074 | *n_bytes = 0; |
1075 | for (; n_pkts < n_pkts_max && |
1076 | (!gso_enabled || fsw_gso_batch == 0 || |
1077 | *gso_pkts_estimate < fsw_gso_batch) && |
        idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
1079 | struct __kern_slot_desc *ksd = KR_KSD(r, idx); |
1080 | struct __kern_packet *pkt = ksd->sd_pkt; |
1081 | |
1082 | ASSERT(pkt->pkt_nextpkt == NULL); |
1083 | |
1084 | _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags, |
1085 | pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func); |
1086 | if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0)) |
1087 | || (pkt->pkt_length == 0)) { |
            KR_SLOT_DETACH_METADATA(r, ksd);
1089 | FSW_STATS_INC(FSW_STATS_DROP); |
1090 | pp_free_packet_single(pkt); |
1091 | continue; |
1092 | } |
1093 | if (gso_enabled) { |
1094 | gso_pkts = estimate_gso_pkts(pkt); |
1095 | |
1096 | /* |
1097 | * We use the first packet to determine what |
1098 | * type the subsequent ones need to be (GSO or |
1099 | * non-GSO). |
1100 | */ |
1101 | if (n_pkts == 0) { |
1102 | gso_required = (gso_pkts != 0); |
1103 | } else { |
1104 | if (gso_required != (gso_pkts != 0)) { |
1105 | break; |
1106 | } |
1107 | } |
1108 | *gso_pkts_estimate += gso_pkts; |
1109 | } |
        KR_SLOT_DETACH_METADATA(r, ksd);
1111 | if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) { |
            __packet_set_tx_nx_port(SK_PKT2PH(pkt),
                vpna->vpna_nx_port, vpna->vpna_gencnt);
1114 | } |
1115 | n_pkts++; |
1116 | *n_bytes += pkt->pkt_length; |
1117 | KPKTQ_ENQUEUE(pktq, pkt); |
1118 | } |
1119 | r->ckr_khead = idx; |
    r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1121 | DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw, |
1122 | ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes, |
1123 | uint32_t, *gso_pkts_estimate); |
1124 | } |
1125 | |
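/*
 * Attach as many packets from "pktq" as there are free slots in the
 * destination Rx ring, publish the new tail, and notify the channel.
 * Whatever could not be enqueued stays in "pktq" for the caller to drop.
 */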
1126 | static void |
1127 | fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
1128 | struct pktq *pktq) |
1129 | { |
1130 | #pragma unused(fsw) |
1131 | struct __kern_packet *pkt; |
1132 | struct __kern_quantum *kqum; |
1133 | uint32_t kr_space_avail = 0; |
1134 | uint32_t n, n_pkts = 0, n_bytes = 0; |
1135 | slot_idx_t idx = 0, idx_start = 0, idx_end = 0; |
1136 | |
1137 | kr_enter(r, TRUE); |
1138 | |
1139 | idx_start = r->ckr_ktail; |
    kr_space_avail = kr_available_slots_rxring(r);
1141 | _FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func); |
1142 | n = MIN(kr_space_avail, KPKTQ_LEN(pktq)); |
1143 | _FSW_INJECT_ERROR(41, n, 0, null_func); |
    idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);
1145 | |
1146 | idx = idx_start; |
1147 | while (idx != idx_end) { |
1148 | KPKTQ_DEQUEUE(pktq, pkt); |
1149 | kqum = SK_PTR_ADDR_KQUM(pkt); |
1150 | kqum->qum_qflags |= QUM_F_FINALIZED; |
1151 | n_pkts++; |
1152 | n_bytes += pkt->pkt_length; |
        KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
1154 | if (__improbable(pkt->pkt_trace_id != 0)) { |
1155 | KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id); |
1156 | KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id); |
1157 | } |
        idx = SLOT_NEXT(idx, r->ckr_lim);
1159 | } |
1160 | |
    kr_update_stats(r, n_pkts, n_bytes);
1162 | |
1163 | /* |
1164 | * ensure slot attachments are visible before updating the |
1165 | * tail pointer |
1166 | */ |
1167 | os_atomic_thread_fence(seq_cst); |
1168 | |
1169 | r->ckr_ktail = idx_end; |
1170 | |
1171 | kr_exit(r); |
1172 | |
1173 | r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH); |
1174 | |
1175 | SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts" , |
1176 | r->ckr_name, n_pkts); |
1177 | } |
1178 | |
1179 | static void |
1180 | pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq) |
1181 | { |
1182 | ASSERT(KPKTQ_EMPTY(pktq)); |
1183 | |
1184 | for (uint32_t i = 0; i < n_pkts; i++) { |
1185 | struct __kern_packet *pkt = pkts[i]; |
1186 | ASSERT(pkt->pkt_nextpkt == NULL); |
1187 | KPKTQ_ENQUEUE(pktq, pkt); |
1188 | } |
1189 | } |
1190 | |
1191 | /* |
1192 | * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c. |
1193 | */ |
1194 | SK_NO_INLINE_ATTRIBUTE |
1195 | static void |
1196 | convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq, |
1197 | struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes) |
1198 | { |
1199 | uint32_t tot_cnt; |
1200 | unsigned int num_segs = 1; |
1201 | struct mbuf *mhead, *head = NULL, *tail = NULL, **tailp = &head; |
1202 | uint32_t mhead_cnt, mhead_bufsize; |
1203 | uint32_t mhead_waste = 0; |
1204 | uint32_t mcnt = 0, mbytes = 0; |
1205 | uint32_t largest, max_pkt_len; |
1206 | struct __kern_packet *pkt; |
1207 | struct kern_pbufpool *pp; |
1208 | |
1209 | tot_cnt = KPKTQ_LEN(pktq); |
1210 | ASSERT(tot_cnt > 0); |
1211 | mhead_cnt = tot_cnt; |
1212 | |
    /*
     * Opportunistically batch-allocate the mbufs based on the largest
     * packet size we've seen in the recent past. Note that we reset
     * fsw_rx_largest_size below if we notice that we're under-utilizing
     * the allocated buffers (thus disabling this batch allocation).
     */
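    /*
     * The tiers below correspond to the standard mbuf cluster sizes
     * (2 KB, 4 KB and 16 KB), plus a two-cluster 2x16 KB case for
     * anything larger, up to 32 KB.
     */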
1219 | largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */ |
1220 | if (__probable(largest != 0)) { |
1221 | if (largest <= MCLBYTES) { |
1222 | mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES, |
1223 | &num_segs, M_NOWAIT, 1, 0); |
1224 | mhead_bufsize = MCLBYTES; |
1225 | } else if (largest <= MBIGCLBYTES) { |
1226 | mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES, |
1227 | &num_segs, M_NOWAIT, 1, 0); |
1228 | mhead_bufsize = MBIGCLBYTES; |
1229 | } else if (largest <= M16KCLBYTES) { |
1230 | mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES, |
1231 | &num_segs, M_NOWAIT, 1, 0); |
1232 | mhead_bufsize = M16KCLBYTES; |
1233 | } else if (largest <= M16KCLBYTES * 2) { |
1234 | num_segs = 2; |
1235 | mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2, |
1236 | &num_segs, M_NOWAIT, 1, 0); |
1237 | mhead_bufsize = M16KCLBYTES * 2; |
1238 | } else { |
1239 | mhead = NULL; |
1240 | mhead_bufsize = mhead_cnt = 0; |
1241 | } |
1242 | } else { |
1243 | mhead = NULL; |
1244 | mhead_bufsize = mhead_cnt = 0; |
1245 | } |
1246 | DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize, |
1247 | uint32_t, mhead_cnt, uint32_t, tot_cnt); |
1248 | |
1249 | pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp); |
1250 | max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags; |
1251 | |
1252 | KPKTQ_FOREACH(pkt, pktq) { |
1253 | uint32_t tot_len, len; |
1254 | uint16_t pad, llhlen, iphlen; |
1255 | boolean_t do_cksum_rx; |
1256 | struct mbuf *m; |
1257 | int error; |
1258 | |
1259 | llhlen = pkt->pkt_l2_len; |
1260 | len = pkt->pkt_length; |
1261 | if (__improbable(len > max_pkt_len || llhlen > len)) { |
1262 | DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw, |
1263 | struct __kern_packet *, pkt); |
1264 | FSW_STATS_INC(FSW_STATS_DROP); |
1265 | FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN); |
1266 | continue; |
1267 | } |
1268 | /* begin payload on 32-bit boundary; figure out the padding */ |
1269 | pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen; |
1270 | tot_len = pad + len; |
1271 | |
1272 | /* remember largest packet size */ |
1273 | if (__improbable(largest < tot_len)) { |
1274 | largest = MAX(tot_len, MCLBYTES); |
1275 | } |
1276 | |
1277 | /* |
1278 | * If the above batch allocation returned partial |
1279 | * success, we try a blocking allocation here again. |
1280 | */ |
1281 | m = mhead; |
1282 | if (__improbable(m == NULL || tot_len > mhead_bufsize)) { |
1283 | ASSERT(mhead != NULL || mhead_cnt == 0); |
1284 | num_segs = 1; |
1285 | if (tot_len > M16KCLBYTES) { |
1286 | num_segs = 0; |
1287 | } |
            if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
                &num_segs, &m)) != 0) {
1290 | DTRACE_SKYWALK2(bad__len, |
1291 | struct nx_flowswitch *, fsw, |
1292 | struct __kern_packet *, pkt); |
1293 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF); |
1294 | FSW_STATS_INC(FSW_STATS_DROP); |
1295 | continue; |
1296 | } |
1297 | } else { |
1298 | mhead = m->m_nextpkt; |
1299 | m->m_nextpkt = NULL; |
1300 | ASSERT(mhead_cnt != 0); |
1301 | --mhead_cnt; |
1302 | |
1303 | /* check if we're underutilizing large buffers */ |
1304 | if (__improbable(mhead_bufsize > MCLBYTES && |
1305 | tot_len < (mhead_bufsize >> 1))) { |
1306 | ++mhead_waste; |
1307 | } |
            /*
             * Clean up the unused mbuf.
             * Only needed when we pre-allocate 2x16K mbufs.
             */
            if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
                ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
                struct mbuf *m_extra = m->m_next;
1315 | ASSERT(m_extra != NULL); |
1316 | ASSERT(m_extra->m_len == 0); |
1317 | ASSERT(M_SIZE(m_extra) == M16KCLBYTES); |
1318 | m->m_next = NULL; |
1319 | m_freem(m_extra); |
1320 | FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF); |
1321 | } |
1322 | } |
1323 | m->m_data += pad; |
1324 | m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *); |
1325 | |
1326 | /* don't include IP header from partial sum */ |
1327 | if (__probable((pkt->pkt_qum_qflags & |
1328 | QUM_F_FLOW_CLASSIFIED) != 0)) { |
1329 | iphlen = pkt->pkt_flow_ip_hlen; |
1330 | do_cksum_rx = sk_cksum_rx; |
1331 | } else { |
1332 | iphlen = 0; |
1333 | do_cksum_rx = FALSE; |
1334 | } |
1335 | |
1336 | fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt), |
1337 | pkt->pkt_headroom, m, 0, len, do_cksum_rx, |
1338 | llhlen + iphlen); |
1339 | |
1340 | FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF); |
1341 | if (do_cksum_rx) { |
1342 | FSW_STATS_INC(FSW_STATS_RX_COPY_SUM); |
1343 | } |
1344 | #if DEBUG || DEVELOPMENT |
1345 | if (__improbable(pkt_trailers > 0)) { |
1346 | (void) pkt_add_trailers_mbuf(m, llhlen + iphlen); |
1347 | } |
1348 | #endif /* DEBUG || DEVELOPMENT */ |
1349 | m_adj(m, llhlen); |
1350 | |
1351 | m->m_pkthdr.rcvif = fsw->fsw_ifp; |
1352 | if (__improbable((pkt->pkt_link_flags & |
1353 | PKT_LINKF_ETHFCS) != 0)) { |
1354 | m->m_flags |= M_HASFCS; |
1355 | } |
1356 | if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) { |
1357 | m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT; |
1358 | } |
1359 | ASSERT(m->m_nextpkt == NULL); |
1360 | tail = m; |
1361 | *tailp = m; |
1362 | tailp = &m->m_nextpkt; |
1363 | mcnt++; |
1364 | mbytes += m_pktlen(m); |
1365 | } |
1366 | /* free any leftovers */ |
1367 | if (__improbable(mhead != NULL)) { |
1368 | DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt); |
1369 | ASSERT(mhead_cnt != 0); |
1370 | (void) m_freem_list(mhead); |
1371 | mhead = NULL; |
1372 | mhead_cnt = 0; |
1373 | } |
1374 | |
1375 | /* reset if most packets (>50%) are smaller than our batch buffers */ |
1376 | if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) { |
1377 | DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw, |
1378 | struct flow_entry *, NULL, uint32_t, mhead_waste, |
1379 | uint32_t, tot_cnt); |
1380 | largest = 0; |
1381 | } |
1382 | |
1383 | if (largest != fsw->fsw_rx_largest_size) { |
1384 | os_atomic_store(&fsw->fsw_rx_largest_size, largest, release); |
1385 | } |
1386 | |
1387 | pp_free_pktq(pktq); |
1388 | *m_headp = head; |
1389 | *m_tailp = tail; |
1390 | *cnt = mcnt; |
1391 | *bytes = mbytes; |
1392 | } |
1393 | |
1394 | /* |
1395 | * This function only extracts the mbuf from the packet. The caller frees |
1396 | * the packet. |
1397 | */ |
1398 | static inline struct mbuf * |
1399 | convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
1400 | { |
1401 | struct mbuf *m; |
1402 | struct pkthdr *mhdr; |
1403 | uint16_t llhlen; |
1404 | |
1405 | m = pkt->pkt_mbuf; |
1406 | ASSERT(m != NULL); |
1407 | |
1408 | llhlen = pkt->pkt_l2_len; |
1409 | if (llhlen > pkt->pkt_length) { |
1410 | m_freem(m); |
1411 | KPKT_CLEAR_MBUF_DATA(pkt); |
1412 | DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw, |
1413 | struct __kern_packet *, pkt); |
1414 | FSW_STATS_INC(FSW_STATS_DROP); |
1415 | FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN); |
1416 | return NULL; |
1417 | } |
1418 | mhdr = &m->m_pkthdr; |
1419 | if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 && |
1420 | PACKET_HAS_PARTIAL_CHECKSUM(pkt)) { |
1421 | mhdr->csum_flags &= ~CSUM_RX_FLAGS; |
1422 | mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL); |
1423 | mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off; |
1424 | mhdr->csum_rx_val = pkt->pkt_csum_rx_value; |
1425 | } |
1426 | #if DEBUG || DEVELOPMENT |
1427 | uint32_t extra = 0; |
1428 | if (__improbable(pkt_trailers > 0)) { |
1429 | extra = pkt_add_trailers_mbuf(m, llhlen); |
1430 | } |
1431 | #endif /* DEBUG || DEVELOPMENT */ |
1432 | m_adj(m, llhlen); |
1433 | ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra)); |
1434 | KPKT_CLEAR_MBUF_DATA(pkt); |
1435 | return m; |
1436 | } |
1437 | |
1438 | SK_NO_INLINE_ATTRIBUTE |
1439 | static void |
1440 | convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq, |
1441 | struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes) |
1442 | { |
1443 | struct __kern_packet *pkt; |
1444 | struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head; |
1445 | uint32_t c = 0, b = 0; |
1446 | |
1447 | KPKTQ_FOREACH(pkt, pktq) { |
1448 | m = convert_compat_pkt_to_mbuf(fsw, pkt); |
1449 | if (__improbable(m == NULL)) { |
1450 | continue; |
1451 | } |
1452 | tail = m; |
1453 | *tailp = m; |
1454 | tailp = &m->m_nextpkt; |
1455 | c++; |
1456 | b += m_pktlen(m); |
1457 | } |
1458 | pp_free_pktq(pktq); |
1459 | *m_head = head; |
1460 | *m_tail = tail; |
1461 | *cnt = c; |
1462 | *bytes = b; |
1463 | } |
1464 | |
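/*
 * Hand a chain of mbufs up to the regular kernel networking stack via the
 * DLIL input path, along with the packet/byte counts for interface stats.
 */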
1465 | void |
1466 | fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail, |
1467 | uint32_t cnt, uint32_t bytes) |
1468 | { |
1469 | struct ifnet_stat_increment_param s; |
1470 | |
    bzero(&s, sizeof(s));
1472 | s.packets_in = cnt; |
1473 | s.bytes_in = bytes; |
1474 | dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL); |
1475 | } |
1476 | |
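/*
 * Deliver host-bound Rx packets: if the interface registered a flowswitch
 * Rx callback, the callback consumes the packet queue directly; otherwise
 * the packets (all compat or all native) are converted to mbufs and sent
 * up through fsw_host_sendup().
 */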
1477 | void |
1478 | fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq) |
1479 | { |
1480 | struct mbuf *m_head = NULL, *m_tail = NULL; |
1481 | uint32_t cnt = 0, bytes = 0; |
1482 | ifnet_fsw_rx_cb_t cb; |
1483 | void *cb_arg; |
1484 | boolean_t compat; |
1485 | |
1486 | ASSERT(!KPKTQ_EMPTY(pktq)); |
    if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
1488 | ASSERT(cb != NULL); |
1489 | ASSERT(cb_arg != NULL); |
1490 | /* callback consumes packets */ |
1491 | (*cb)(cb_arg, pktq); |
        ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
1493 | return; |
1494 | } |
1495 | |
1496 | /* All packets in the pktq must have the same type */ |
1497 | compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0); |
1498 | if (compat) { |
        convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
            &bytes);
    } else {
        convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
            &bytes);
1504 | } |
1505 | if (__improbable(m_head == NULL)) { |
1506 | DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw); |
1507 | return; |
1508 | } |
    fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
1510 | } |
1511 | |
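/*
 * Enqueue what fits into the destination ring; anything left over is
 * counted as a ring-full drop and freed.
 */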
1512 | void |
1513 | fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw, |
1514 | struct __kern_channel_ring *r, struct pktq *pktq) |
1515 | { |
1516 | fsw_ring_enqueue_pktq(fsw, r, pktq); |
1517 | FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq)); |
1518 | dp_drop_pktq(fsw, pktq); |
1519 | } |
1520 | |
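/*
 * Resolve the nexus adapter backing the flow's destination nexus port,
 * returning NULL (with stats) if the port is the dev/host port or is no
 * longer valid, active, or has been defuncted.
 */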
1521 | static struct nexus_adapter * |
1522 | flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1523 | { |
1524 | struct kern_nexus *nx = fsw->fsw_nx; |
1525 | struct nexus_adapter *na = NULL; |
1526 | nexus_port_t port = fe->fe_nx_port; |
1527 | |
1528 | if (port == FSW_VP_DEV || port == FSW_VP_HOST) { |
1529 | SK_ERR("dev or host ports have no NA" ); |
1530 | return NULL; |
1531 | } |
1532 | |
1533 | if (__improbable(!nx_port_is_valid(nx, port))) { |
1534 | SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid" , |
1535 | if_name(fsw->fsw_ifp), port); |
1536 | return NULL; |
1537 | } |
1538 | |
1539 | na = nx_port_get_na(nx, port); |
1540 | if (__improbable(na == NULL)) { |
1541 | FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID); |
1542 | SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid" , |
1543 | if_name(fsw->fsw_ifp), port); |
1544 | return NULL; |
1545 | } |
1546 | |
1547 | if (__improbable(!NA_IS_ACTIVE(na))) { |
1548 | FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE); |
1549 | SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active" , |
1550 | if_name(fsw->fsw_ifp), port); |
1551 | return NULL; |
1552 | } |
1553 | |
1554 | if (__improbable(nx_port_is_defunct(nx, port))) { |
1555 | FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT); |
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1557 | if_name(fsw->fsw_ifp), port); |
1558 | return NULL; |
1559 | } |
1560 | |
1561 | return na; |
1562 | } |
1563 | |
1564 | static inline struct __kern_channel_ring * |
1565 | flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx) |
1566 | { |
1567 | struct nexus_vp_adapter *na = NULL; |
1568 | struct __kern_channel_ring *r = NULL; |
1569 | |
1570 | na = VPNA(flow_get_na(fsw, fe)); |
1571 | if (__improbable(na == NULL)) { |
1572 | return NULL; |
1573 | } |
1574 | |
1575 | switch (txrx) { |
1576 | case NR_RX: |
1577 | r = &na->vpna_up.na_rx_rings[0]; |
1578 | break; |
1579 | case NR_TX: |
1580 | r = &na->vpna_up.na_tx_rings[0]; |
1581 | break; |
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
1585 | } |
1586 | |
1587 | if (__improbable(KR_DROP(r))) { |
1588 | FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE); |
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %s 0x%llx drop mode",
		    r->ckr_name, SK_KVA(r));
1591 | return NULL; |
1592 | } |
1593 | |
1594 | ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET); |
1595 | |
1596 | #if (DEVELOPMENT || DEBUG) |
1597 | if (r != NULL) { |
1598 | _FSW_INJECT_ERROR(4, r, NULL, null_func); |
1599 | } |
1600 | #endif /* DEVELOPMENT || DEBUG */ |
1601 | |
1602 | return r; |
1603 | } |
1604 | |
1605 | struct __kern_channel_ring * |
1606 | fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1607 | { |
	return flow_get_ring(fsw, fe, NR_RX);
1609 | } |
1610 | |
1611 | static inline struct __kern_channel_ring * |
1612 | fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1613 | { |
	return flow_get_ring(fsw, fe, NR_TX);
1615 | } |
1616 | |
1617 | static bool |
1618 | dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1619 | { |
1620 | struct flow_route *fr = fe->fe_route; |
1621 | struct ifnet *ifp = fsw->fsw_ifp; |
1622 | |
1623 | if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) && |
1624 | !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) && |
1625 | fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt && |
1626 | !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) { |
1627 | /* |
1628 | * The source address is no longer around; we want this |
1629 | * flow to be nonviable, but that requires holding the lock |
1630 | * as writer (which isn't the case now.) Indicate that |
1631 | * we need to finalize the nonviable later down below. |
1632 | * |
1633 | * We also request that the flow route be re-configured, |
1634 | * if this is a connected mode flow. |
1635 | * |
1636 | */ |
1637 | if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) { |
1638 | /* |
1639 | * fsw_pending_nonviable is a hint for reaper thread; |
1640 | * due to the fact that setting fe_want_nonviable and |
1641 | * incrementing fsw_pending_nonviable counter is not |
1642 | * atomic, let the increment happen first, and the |
1643 | * thread losing the CAS does decrement. |
1644 | */ |
1645 | os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed); |
1646 | if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) { |
1647 | fsw_reap_sched(fsw); |
1648 | } else { |
1649 | os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed); |
1650 | } |
1651 | } |
1652 | if (fr != NULL) { |
1653 | os_atomic_inc(&fr->fr_want_configure, relaxed); |
1654 | } |
1655 | } |
1656 | |
1657 | /* if flow was (or is going to be) marked as nonviable, drop it */ |
1658 | if (__improbable(fe->fe_want_nonviable || |
1659 | (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) { |
		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
1661 | SK_KVA(fe)); |
1662 | return false; |
1663 | } |
1664 | return true; |
1665 | } |
1666 | |
1667 | bool |
1668 | dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1669 | { |
1670 | bool okay; |
1671 | okay = dp_flow_route_process(fsw, fe); |
1672 | #if (DEVELOPMENT || DEBUG) |
1673 | if (okay) { |
1674 | _FSW_INJECT_ERROR(5, okay, false, null_func); |
1675 | } |
1676 | #endif /* DEVELOPMENT || DEBUG */ |
1677 | |
1678 | return okay; |
1679 | } |
1680 | |
1681 | void |
1682 | dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
1683 | uint32_t flags) |
1684 | { |
1685 | #pragma unused(flags) |
1686 | struct pktq dpkts; /* dst pool alloc'ed packets */ |
1687 | struct pktq disposed_pkts; /* done src packets */ |
1688 | struct pktq dropped_pkts; /* dropped src packets */ |
1689 | struct pktq transferred_pkts; /* dst packet ready for ring */ |
1690 | struct __kern_packet *pkt, *tpkt; |
1691 | struct kern_pbufpool *dpp; |
1692 | uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq); |
1693 | uint64_t buf_array[RX_BUFLET_BATCH_COUNT]; |
1694 | uint16_t buf_array_iter = 0; |
1695 | uint32_t cnt, buf_cnt = 0; |
1696 | int err; |
1697 | |
1698 | KPKTQ_INIT(&dpkts); |
1699 | KPKTQ_INIT(&dropped_pkts); |
1700 | KPKTQ_INIT(&disposed_pkts); |
1701 | KPKTQ_INIT(&transferred_pkts); |
1702 | |
1703 | if (__improbable(!dp_flow_rx_route_process(fsw, fe))) { |
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
1706 | FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts); |
1707 | goto done; |
1708 | } |
1709 | |
1710 | if (fe->fe_nx_port == FSW_VP_HOST) { |
1711 | /* |
1712 | * The host ring does not exist anymore so we can't take |
1713 | * the enqueue path below. This path should only be hit |
1714 | * for the rare tcp fragmentation case. |
1715 | */ |
		fsw_host_rx(fsw, &fe->fe_rx_pktq);
1717 | return; |
1718 | } |
1719 | |
1720 | /* find the ring */ |
1721 | struct __kern_channel_ring *r; |
1722 | r = fsw_flow_get_rx_ring(fsw, fe); |
1723 | if (__improbable(r == NULL)) { |
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
1725 | goto done; |
1726 | } |
1727 | |
1728 | /* snoop before L2 is stripped */ |
1729 | if (__improbable(pktap_total_tap_count != 0)) { |
1730 | fsw_snoop(fsw, fe, true); |
1731 | } |
1732 | |
1733 | dpp = r->ckr_pp; |
1734 | /* batch allocate enough packets */ |
1735 | err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL, |
1736 | SKMEM_NOSLEEP); |
1737 | if (__improbable(err == ENOMEM)) { |
1738 | ASSERT(KPKTQ_EMPTY(&dpkts)); |
1739 | KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq); |
1740 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
		SK_ERR("failed to alloc %u pkts for kr %s, 0x%llx", n_pkts,
		    r->ckr_name, SK_KVA(r));
1743 | goto done; |
1744 | } |
1745 | |
1746 | /* |
1747 | * estimate total number of buflets for the packet chain. |
1748 | */ |
1749 | cnt = howmany(fe->fe_rx_pktq_bytes, PP_BUF_SIZE_DEF(dpp)); |
1750 | if (cnt > n_pkts) { |
1751 | ASSERT(dpp->pp_max_frags > 1); |
1752 | cnt -= n_pkts; |
1753 | buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt); |
		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
		    SKMEM_NOSLEEP, false);
1756 | if (__improbable(buf_cnt == 0)) { |
1757 | KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq); |
1758 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
			SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
			    "0x%llx", cnt, err, r->ckr_name, SK_KVA(r));
1761 | goto done; |
1762 | } |
1763 | err = 0; |
1764 | } |
1765 | |
1766 | /* extra processing for user flow */ |
1767 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) { |
1768 | err = 0; |
1769 | KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt); |
1770 | if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) { |
1771 | fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen; |
1772 | } else { |
1773 | fe->fe_rx_pktq_bytes = 0; |
1774 | } |
1775 | err = flow_pkt_track(fe, pkt, true); |
1776 | _FSW_INJECT_ERROR(33, err, EPROTO, null_func); |
1777 | if (__improbable(err != 0)) { |
			SK_ERR("flow_pkt_track failed (err %d)", err);
1779 | FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR); |
1780 | /* if need to trigger RST */ |
1781 | if (err == ENETRESET) { |
				flow_track_abort_tcp(fe, pkt, NULL);
1783 | } |
1784 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
1785 | continue; |
1786 | } |
1787 | |
1788 | /* transfer to dpkt */ |
1789 | if (pkt->pkt_qum.qum_pp != dpp) { |
1790 | struct __kern_buflet *bprev, *bnew; |
1791 | struct __kern_packet *dpkt = NULL; |
1792 | uint32_t n_bufs, i; |
1793 | |
1794 | KPKTQ_DEQUEUE(&dpkts, dpkt); |
1795 | if (__improbable(dpkt == NULL)) { |
1796 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT); |
1797 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
1798 | continue; |
1799 | } |
1800 | n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp)); |
1801 | n_bufs--; |
1802 | for (i = 0; i < n_bufs; i++) { |
1803 | if (__improbable(buf_cnt == 0)) { |
1804 | ASSERT(dpp->pp_max_frags > 1); |
1805 | buf_array_iter = 0; |
1806 | cnt = howmany(fe->fe_rx_pktq_bytes, |
1807 | PP_BUF_SIZE_DEF(dpp)); |
1808 | n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq); |
1809 | if (cnt >= n_pkts) { |
1810 | cnt -= n_pkts; |
1811 | } else { |
1812 | cnt = 0; |
1813 | } |
1814 | cnt += (n_bufs - i); |
1815 | buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, |
1816 | cnt); |
1817 | cnt = buf_cnt; |
					err = pp_alloc_buflet_batch(dpp,
					    buf_array, &buf_cnt,
					    SKMEM_NOSLEEP, false);
1821 | if (__improbable(buf_cnt == 0)) { |
1822 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT); |
1823 | KPKTQ_ENQUEUE(&dropped_pkts, |
1824 | pkt); |
1825 | pkt = NULL; |
1826 | pp_free_packet_single(dpkt); |
1827 | dpkt = NULL; |
					SK_ERR("failed to alloc %d "
					    "buflets (err %d) for "
					    "kr %s, 0x%llx", cnt, err,
					    r->ckr_name, SK_KVA(r));
1832 | break; |
1833 | } |
1834 | err = 0; |
1835 | } |
1836 | ASSERT(buf_cnt != 0); |
1837 | if (i == 0) { |
1838 | PKT_GET_FIRST_BUFLET(dpkt, 1, bprev); |
1839 | } |
1840 | bnew = (kern_buflet_t)buf_array[buf_array_iter]; |
1841 | buf_array[buf_array_iter] = 0; |
1842 | buf_array_iter++; |
1843 | buf_cnt--; |
1844 | VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt), |
1845 | bprev, bnew) == 0); |
1846 | bprev = bnew; |
1847 | } |
1848 | if (__improbable(err != 0)) { |
1849 | continue; |
1850 | } |
			err = copy_packet_from_dev(fsw, pkt, dpkt);
1852 | _FSW_INJECT_ERROR(43, err, EINVAL, null_func); |
1853 | if (__improbable(err != 0)) { |
				SK_ERR("copy packet failed (err %d)", err);
1855 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
1856 | pp_free_packet_single(dpkt); |
1857 | dpkt = NULL; |
1858 | continue; |
1859 | } |
1860 | KPKTQ_ENQUEUE(&disposed_pkts, pkt); |
1861 | pkt = dpkt; |
1862 | } |
1863 | _UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid); |
1864 | _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid); |
1865 | pkt->pkt_policy_id = fe->fe_policy_id; |
1866 | pkt->pkt_skip_policy_id = fe->fe_skip_policy_id; |
1867 | pkt->pkt_transport_protocol = fe->fe_transport_protocol; |
1868 | if (pkt->pkt_bufs_cnt > 1) { |
1869 | pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP; |
1870 | pkt->pkt_seg_cnt = 1; |
1871 | } |
1872 | KPKTQ_ENQUEUE(&transferred_pkts, pkt); |
1873 | } |
1874 | KPKTQ_FINI(&fe->fe_rx_pktq); |
1875 | KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts); |
1876 | KPKTQ_FINI(&transferred_pkts); |
1877 | |
	fsw_ring_enqueue_tail_drop(fsw, r, &fe->fe_rx_pktq);
1879 | |
1880 | done: |
1881 | /* Free unused buflets */ |
1882 | while (buf_cnt > 0) { |
1883 | pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter])); |
1884 | buf_array[buf_array_iter] = 0; |
1885 | buf_array_iter++; |
1886 | buf_cnt--; |
1887 | } |
	dp_free_pktq(fsw, &dpkts);
	dp_free_pktq(fsw, &disposed_pkts);
	dp_drop_pktq(fsw, &dropped_pkts);
1891 | } |
1892 | |
1893 | static inline void |
1894 | rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
1895 | uint32_t flags) |
1896 | { |
1897 | ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq)); |
1898 | ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0); |
1899 | |
	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
1901 | KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port); |
1902 | |
1903 | /* flow related processing (default, agg, fpd, etc.) */ |
1904 | fe->fe_rx_process(fsw, fe, flags); |
1905 | |
1906 | if (__improbable(fe->fe_want_withdraw)) { |
1907 | fsw_reap_sched(fsw); |
1908 | } |
1909 | |
1910 | KPKTQ_FINI(&fe->fe_rx_pktq); |
1911 | } |
1912 | |
1913 | static inline void |
1914 | dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
1915 | { |
1916 | /* |
1917 | * We only care about wake packets of flows that belong the flow switch |
1918 | * as wake packets for the host stack are handled by the host input |
1919 | * function |
1920 | */ |
1921 | #if (DEBUG || DEVELOPMENT) |
1922 | if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) { |
1923 | /* |
1924 | * This is a one shot command |
1925 | */ |
1926 | fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT; |
1927 | |
1928 | pkt->pkt_pflags |= PKT_F_WAKE_PKT; |
1929 | } |
1930 | #endif /* (DEBUG || DEVELOPMENT) */ |
1931 | if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) { |
		if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
1933 | } |
1934 | } |
1935 | |
1936 | static void |
1937 | _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq) |
1938 | { |
1939 | struct __kern_packet *pkt, *tpkt; |
1940 | struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); |
1941 | struct flow_entry *fe, *prev_fe; |
1942 | sa_family_t af; |
1943 | struct pktq host_pkts, dropped_pkts; |
1944 | int err; |
1945 | |
1946 | KPKTQ_INIT(&host_pkts); |
1947 | KPKTQ_INIT(&dropped_pkts); |
1948 | |
1949 | if (__improbable(FSW_QUIESCED(fsw))) { |
1950 | DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw); |
1951 | KPKTQ_CONCAT(&dropped_pkts, pktq); |
1952 | goto done; |
1953 | } |
1954 | if (__improbable(fsw->fsw_demux == NULL)) { |
1955 | KPKTQ_CONCAT(&dropped_pkts, pktq); |
1956 | goto done; |
1957 | } |
1958 | |
1959 | prev_fe = NULL; |
1960 | KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) { |
1961 | if (__probable(tpkt)) { |
1962 | void *baddr; |
1963 | MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr); |
1964 | SK_PREFETCH(baddr, 0); |
1965 | /* prefetch L3 and L4 flow structs */ |
1966 | SK_PREFETCHW(tpkt->pkt_flow, 0); |
1967 | SK_PREFETCHW(tpkt->pkt_flow, 128); |
1968 | } |
1969 | |
1970 | KPKTQ_REMOVE(pktq, pkt); |
1971 | |
1972 | pkt = rx_prepare_packet(fsw, pkt); |
1973 | |
1974 | af = fsw->fsw_demux(fsw, pkt); |
1975 | if (__improbable(af == AF_UNSPEC)) { |
1976 | KPKTQ_ENQUEUE(&host_pkts, pkt); |
1977 | continue; |
1978 | } |
1979 | |
		err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
1981 | _FSW_INJECT_ERROR(1, err, ENXIO, null_func); |
1982 | if (__improbable(err != 0)) { |
1983 | FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR); |
1984 | KPKTQ_ENQUEUE(&host_pkts, pkt); |
1985 | continue; |
1986 | } |
1987 | |
1988 | if (__improbable(pkt->pkt_flow_ip_is_frag)) { |
1989 | pkt = rx_process_ip_frag(fsw, pkt); |
1990 | if (pkt == NULL) { |
1991 | continue; |
1992 | } |
1993 | } |
1994 | |
1995 | prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe); |
1996 | if (__improbable(fe == NULL)) { |
1997 | KPKTQ_ENQUEUE_LIST(&host_pkts, pkt); |
1998 | continue; |
1999 | } |
2000 | |
2001 | fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen; |
2002 | |
2003 | dp_rx_process_wake_packet(fsw, pkt); |
2004 | |
		rx_flow_batch_packet(&fes, fe, pkt);
2006 | prev_fe = fe; |
2007 | } |
2008 | |
2009 | struct flow_entry *tfe = NULL; |
2010 | TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) { |
		rx_flow_process(fsw, fe, 0);
		TAILQ_REMOVE(&fes, fe, fe_rx_link);
		fe->fe_rx_pktq_bytes = 0;
		fe->fe_rx_frag_count = 0;
		flow_entry_release(&fe);
2016 | } |
2017 | |
2018 | if (!KPKTQ_EMPTY(&host_pkts)) { |
		fsw_host_rx(fsw, &host_pkts);
2020 | } |
2021 | |
2022 | done: |
2023 | dp_drop_pktq(fsw, &dropped_pkts); |
2024 | } |
2025 | |
2026 | #if (DEVELOPMENT || DEBUG) |
2027 | static void |
2028 | fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id, |
2029 | struct __kern_packet *pkt) |
2030 | { |
2031 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id]; |
2032 | |
2033 | lck_mtx_lock_spin(&frt->frt_lock); |
2034 | KPKTQ_ENQUEUE(&frt->frt_pktq, pkt); |
2035 | lck_mtx_unlock(&frt->frt_lock); |
2036 | } |
2037 | |
2038 | static void |
2039 | fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id) |
2040 | { |
2041 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id]; |
2042 | |
2043 | ASSERT(frt->frt_thread != THREAD_NULL); |
2044 | lck_mtx_lock_spin(&frt->frt_lock); |
2045 | ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED))); |
2046 | |
2047 | frt->frt_requests++; |
2048 | if (!(frt->frt_flags & FRT_RUNNING)) { |
2049 | thread_wakeup((caddr_t)frt); |
2050 | } |
2051 | lck_mtx_unlock(&frt->frt_lock); |
2052 | } |
2053 | |
2054 | __attribute__((noreturn)) |
2055 | static void |
2056 | fsw_rps_thread_cont(void *v, wait_result_t w) |
2057 | { |
2058 | struct fsw_rps_thread *frt = v; |
2059 | struct nx_flowswitch *fsw = frt->frt_fsw; |
2060 | |
2061 | lck_mtx_lock(&frt->frt_lock); |
	if (__improbable(w == THREAD_INTERRUPTED ||
2063 | (frt->frt_flags & FRT_TERMINATING) != 0)) { |
2064 | goto terminate; |
2065 | } |
2066 | if (KPKTQ_EMPTY(&frt->frt_pktq)) { |
2067 | goto done; |
2068 | } |
2069 | frt->frt_flags |= FRT_RUNNING; |
2070 | |
2071 | for (;;) { |
2072 | uint32_t requests = frt->frt_requests; |
2073 | struct pktq pkts; |
2074 | |
2075 | KPKTQ_INIT(&pkts); |
2076 | KPKTQ_CONCAT(&pkts, &frt->frt_pktq); |
2077 | lck_mtx_unlock(&frt->frt_lock); |
2078 | |
2079 | sk_protect_t protect; |
2080 | protect = sk_sync_protect(); |
2081 | FSW_RLOCK(fsw); |
2082 | _fsw_receive_locked(fsw, &pkts); |
2083 | FSW_RUNLOCK(fsw); |
2084 | sk_sync_unprotect(protect); |
2085 | |
2086 | lck_mtx_lock(&frt->frt_lock); |
2087 | if ((frt->frt_flags & FRT_TERMINATING) != 0 || |
2088 | requests == frt->frt_requests) { |
2089 | frt->frt_requests = 0; |
2090 | break; |
2091 | } |
2092 | } |
2093 | |
2094 | done: |
2095 | lck_mtx_unlock(&frt->frt_lock); |
2096 | if (!(frt->frt_flags & FRT_TERMINATING)) { |
2097 | frt->frt_flags &= ~FRT_RUNNING; |
2098 | assert_wait(frt, THREAD_UNINT); |
2099 | thread_block_parameter(fsw_rps_thread_cont, frt); |
2100 | __builtin_unreachable(); |
2101 | } else { |
2102 | terminate: |
2103 | LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED); |
2104 | frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING); |
2105 | frt->frt_flags |= FRT_TERMINATED; |
2106 | |
2107 | if (frt->frt_flags & FRT_TERMINATEBLOCK) { |
			thread_wakeup((caddr_t)&frt->frt_thread);
2109 | } |
2110 | lck_mtx_unlock(&frt->frt_lock); |
2111 | |
		SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2113 | frt->frt_idx); |
2114 | |
2115 | /* for the extra refcnt from kernel_thread_start() */ |
2116 | thread_deallocate(current_thread()); |
2117 | /* this is the end */ |
2118 | thread_terminate(current_thread()); |
2119 | /* NOTREACHED */ |
2120 | __builtin_unreachable(); |
2121 | } |
2122 | |
2123 | /* must never get here */ |
2124 | VERIFY(0); |
2125 | /* NOTREACHED */ |
2126 | __builtin_unreachable(); |
2127 | } |
2128 | |
2129 | __attribute__((noreturn)) |
2130 | static void |
2131 | fsw_rps_thread_func(void *v, wait_result_t w) |
2132 | { |
2133 | #pragma unused(w) |
2134 | struct fsw_rps_thread *frt = v; |
2135 | struct nx_flowswitch *fsw = frt->frt_fsw; |
2136 | |
2137 | char thread_name[MAXTHREADNAMESIZE]; |
2138 | bzero(thread_name, sizeof(thread_name)); |
	(void) snprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
2140 | if_name(fsw->fsw_ifp), frt->frt_idx); |
2141 | thread_set_thread_name(frt->frt_thread, thread_name); |
	SK_D("%s spawned", thread_name);
2143 | |
2144 | net_thread_marks_push(NET_THREAD_SYNC_RX); |
2145 | assert_wait(frt, THREAD_UNINT); |
2146 | (void) thread_block_parameter(fsw_rps_thread_cont, frt); |
2147 | |
2148 | __builtin_unreachable(); |
2149 | } |
2150 | |
2151 | static void |
2152 | fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i) |
2153 | { |
2154 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i]; |
2155 | uint64_t f = (1 * NSEC_PER_MSEC); |
2156 | uint64_t s = (1000 * NSEC_PER_SEC); |
2157 | uint32_t c = 0; |
2158 | |
2159 | lck_mtx_lock(&frt->frt_lock); |
2160 | frt->frt_flags |= FRT_TERMINATING; |
2161 | |
2162 | while (!(frt->frt_flags & FRT_TERMINATED)) { |
2163 | uint64_t t = 0; |
2164 | nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t); |
2165 | clock_absolutetime_interval_to_deadline(t, &t); |
2166 | ASSERT(t != 0); |
2167 | |
2168 | frt->frt_flags |= FRT_TERMINATEBLOCK; |
2169 | if (!(frt->frt_flags & FRT_RUNNING)) { |
2170 | thread_wakeup_one((caddr_t)frt); |
2171 | } |
2172 | (void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t); |
2173 | lck_mtx_unlock(&frt->frt_lock); |
2174 | thread_block(THREAD_CONTINUE_NULL); |
2175 | lck_mtx_lock(&frt->frt_lock); |
2176 | frt->frt_flags &= ~FRT_TERMINATEBLOCK; |
2177 | } |
2178 | ASSERT(frt->frt_flags & FRT_TERMINATED); |
2179 | lck_mtx_unlock(&frt->frt_lock); |
2180 | frt->frt_thread = THREAD_NULL; |
2181 | } |
2182 | |
2183 | static void |
2184 | fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i) |
2185 | { |
2186 | kern_return_t error; |
2187 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i]; |
2188 | lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr); |
2189 | frt->frt_idx = i; |
2190 | frt->frt_fsw = fsw; |
2191 | error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread); |
2192 | ASSERT(!error); |
2193 | KPKTQ_INIT(&frt->frt_pktq); |
2194 | } |
2195 | |
2196 | int |
2197 | fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n) |
2198 | { |
2199 | if (n > FSW_RPS_MAX_NTHREADS) { |
		SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
2201 | return EINVAL; |
2202 | } |
2203 | |
2204 | FSW_WLOCK(fsw); |
2205 | if (n < fsw->fsw_rps_nthreads) { |
2206 | for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) { |
2207 | fsw_rps_thread_join(fsw, i); |
2208 | } |
2209 | fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread, |
2210 | fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, |
2211 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
2212 | } else if (n > fsw->fsw_rps_nthreads) { |
2213 | fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread, |
2214 | fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, |
2215 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
2216 | for (uint32_t i = fsw->fsw_rps_nthreads; i < n; i++) { |
2217 | fsw_rps_thread_spawn(fsw, i); |
2218 | } |
2219 | } |
2220 | fsw->fsw_rps_nthreads = n; |
2221 | FSW_WUNLOCK(fsw); |
2222 | return 0; |
2223 | } |
2224 | |
2225 | static uint32_t |
2226 | get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
2227 | { |
2228 | sa_family_t af = fsw->fsw_demux(fsw, pkt); |
2229 | if (__improbable(af == AF_UNSPEC)) { |
2230 | return 0; |
2231 | } |
2232 | |
2233 | flow_pkt_classify(pkt, fsw->fsw_ifp, af, true); |
2234 | |
2235 | if (__improbable((pkt->pkt_qum_qflags & |
2236 | QUM_F_FLOW_CLASSIFIED) == 0)) { |
2237 | return 0; |
2238 | } |
2239 | |
2240 | struct flow_key key; |
2241 | flow_pkt2key(pkt, true, &key); |
2242 | key.fk_mask = FKMASK_5TUPLE; |
2243 | |
2244 | uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads; |
2245 | |
2246 | return id; |
2247 | } |
2248 | |
2249 | #endif /* !DEVELOPMENT && !DEBUG */ |
2250 | |
2251 | void |
2252 | fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq) |
2253 | { |
2254 | FSW_RLOCK(fsw); |
2255 | #if (DEVELOPMENT || DEBUG) |
2256 | if (fsw->fsw_rps_nthreads != 0) { |
2257 | struct __kern_packet *pkt, *tpkt; |
2258 | bitmap_t map = 0; |
2259 | |
2260 | _CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1); |
2261 | KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) { |
2262 | uint32_t id = get_rps_id(fsw, pkt); |
2263 | KPKTQ_REMOVE(pktq, pkt); |
2264 | fsw_rps_rx(fsw, id, pkt); |
2265 | bitmap_set(&map, id); |
2266 | } |
2267 | for (int i = bitmap_first(&map, 64); i >= 0; |
2268 | i = bitmap_next(&map, i)) { |
2269 | fsw_rps_thread_schedule(fsw, i); |
2270 | } |
2271 | } else |
2272 | #endif /* !DEVELOPMENT && !DEBUG */ |
2273 | { |
2274 | _fsw_receive_locked(fsw, pktq); |
2275 | } |
2276 | FSW_RUNLOCK(fsw); |
2277 | } |
2278 | |
2279 | int |
2280 | fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t * pkts, |
2281 | uint32_t n_pkts) |
2282 | { |
2283 | #pragma unused(handle) |
2284 | struct nx_flowswitch *fsw = handle; |
2285 | struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX]; |
2286 | struct pktq pktq; |
2287 | sk_protect_t protect; |
2288 | uint32_t i; |
2289 | |
2290 | ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX); |
2291 | |
2292 | for (i = 0; i < n_pkts; i++) { |
2293 | ASSERT(pkts[i].pktsched_ptype == QP_PACKET); |
2294 | ASSERT(pkts[i].pktsched_pkt_kpkt != NULL); |
2295 | kpkts[i] = pkts[i].pktsched_pkt_kpkt; |
2296 | } |
2297 | |
2298 | protect = sk_sync_protect(); |
2299 | KPKTQ_INIT(&pktq); |
	pkts_to_pktq(kpkts, n_pkts, &pktq);

	fsw_receive(fsw, &pktq);
2303 | KPKTQ_FINI(&pktq); |
2304 | sk_sync_unprotect(protect); |
2305 | |
2306 | return 0; |
2307 | } |
2308 | |
2309 | static void |
2310 | fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q) |
2311 | { |
2312 | classq_pkt_t p; |
2313 | struct netem *ne; |
2314 | struct __kern_packet *pkt, *tpkt; |
2315 | |
2316 | ASSERT(fsw->fsw_ifp != NULL); |
2317 | ne = fsw->fsw_ifp->if_input_netem; |
2318 | ASSERT(ne != NULL); |
2319 | KPKTQ_FOREACH_SAFE(pkt, q, tpkt) { |
2320 | bool pdrop; |
2321 | KPKTQ_REMOVE(q, pkt); |
2322 | CLASSQ_PKT_INIT_PACKET(&p, pkt); |
		netem_enqueue(ne, &p, &pdrop);
2324 | } |
2325 | } |
2326 | |
2327 | void |
2328 | fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head, |
2329 | struct nexus_pkt_stats *out_stats) |
2330 | { |
2331 | struct __kern_packet *pkt = pkt_head, *next; |
2332 | struct nx_flowswitch *fsw; |
2333 | uint32_t n_bytes = 0, n_pkts = 0; |
2334 | uint64_t total_pkts = 0, total_bytes = 0; |
2335 | struct pktq q; |
2336 | |
2337 | KPKTQ_INIT(&q); |
2338 | if (__improbable(devna->na_ifp == NULL || |
2339 | (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) { |
		SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
2341 | pp_free_packet_chain(pkt_head, NULL); |
2342 | return; |
2343 | } |
2344 | while (pkt != NULL) { |
2345 | if (__improbable(pkt->pkt_trace_id != 0)) { |
2346 | KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id); |
2347 | KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id); |
2348 | } |
2349 | next = pkt->pkt_nextpkt; |
2350 | pkt->pkt_nextpkt = NULL; |
2351 | |
2352 | if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) { |
2353 | KPKTQ_ENQUEUE(&q, pkt); |
2354 | n_bytes += pkt->pkt_length; |
2355 | } else { |
2356 | DTRACE_SKYWALK1(non__finalized__drop, |
2357 | struct __kern_packet *, pkt); |
2358 | FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED); |
2359 | pp_free_packet_single(pkt); |
2360 | pkt = NULL; |
2361 | } |
2362 | n_pkts = KPKTQ_LEN(&q); |
2363 | if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) { |
2364 | if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) { |
				fsw_dev_input_netem_enqueue(fsw, &q);
			} else {
				fsw_receive(fsw, &q);
2368 | } |
2369 | total_pkts += n_pkts; |
2370 | total_bytes += n_bytes; |
2371 | n_pkts = 0; |
2372 | n_bytes = 0; |
2373 | KPKTQ_FINI(&q); |
2374 | } |
2375 | pkt = next; |
2376 | } |
2377 | ASSERT(KPKTQ_LEN(&q) == 0); |
2378 | FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts); |
2379 | if (out_stats != NULL) { |
2380 | out_stats->nps_pkts = total_pkts; |
2381 | out_stats->nps_bytes = total_bytes; |
2382 | } |
2383 | KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes); |
2384 | } |
2385 | |
2386 | static int |
2387 | dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
2388 | struct __kern_packet *dpkt) |
2389 | { |
2390 | struct mbuf *m = NULL; |
2391 | uint32_t bdlen, bdlim, bdoff; |
2392 | uint8_t *bdaddr; |
2393 | unsigned int one = 1; |
2394 | int err = 0; |
2395 | |
	err = mbuf_allocpacket(MBUF_DONTWAIT,
	    (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2398 | #if (DEVELOPMENT || DEBUG) |
2399 | if (m != NULL) { |
2400 | _FSW_INJECT_ERROR(11, m, NULL, m_freem, m); |
2401 | } |
2402 | #endif /* DEVELOPMENT || DEBUG */ |
2403 | if (__improbable(m == NULL)) { |
2404 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF); |
2405 | err = ENOBUFS; |
2406 | goto done; |
2407 | } |
2408 | |
2409 | MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff); |
2410 | if (fsw->fsw_frame_headroom > bdlim) { |
		SK_ERR("not enough space in buffer for headroom");
2412 | err = EINVAL; |
2413 | goto done; |
2414 | } |
2415 | |
2416 | dpkt->pkt_headroom = fsw->fsw_frame_headroom; |
2417 | dpkt->pkt_mbuf = m; |
2418 | dpkt->pkt_pflags |= PKT_F_MBUF_DATA; |
2419 | |
2420 | /* packet copy into mbuf */ |
2421 | fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt, |
2422 | METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m, |
2423 | fsw->fsw_frame_headroom, spkt->pkt_length, |
2424 | PACKET_HAS_PARTIAL_CHECKSUM(spkt), |
2425 | spkt->pkt_csum_tx_start_off); |
2426 | FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF); |
2427 | |
2428 | /* header copy into dpkt buffer for classification */ |
2429 | kern_packet_t sph = SK_PTR_ENCODE(spkt, |
2430 | METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)); |
2431 | kern_packet_t dph = SK_PTR_ENCODE(dpkt, |
2432 | METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt)); |
2433 | uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom); |
2434 | fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom, |
2435 | sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0); |
2436 | |
2437 | /* |
2438 | * fsw->fsw_frame_headroom is after m_data, thus we treat m_data same as |
2439 | * buflet baddr m_data always points to the beginning of packet and |
2440 | * should represents the same as baddr + headroom |
2441 | */ |
2442 | ASSERT((uintptr_t)m->m_data == |
2443 | ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom)); |
2444 | |
2445 | done: |
2446 | return err; |
2447 | } |
2448 | |
2449 | static int |
2450 | dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
2451 | struct __kern_packet *dpkt) |
2452 | { |
2453 | struct ifnet *ifp = fsw->fsw_ifp; |
2454 | uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom; |
2455 | |
2456 | if (headroom > UINT8_MAX) { |
		SK_ERR("headroom too large %d", headroom);
2458 | return ERANGE; |
2459 | } |
2460 | dpkt->pkt_headroom = (uint8_t)headroom; |
2461 | ASSERT((dpkt->pkt_headroom & 0x7) == 0); |
2462 | dpkt->pkt_l2_len = 0; |
2463 | dpkt->pkt_link_flags = spkt->pkt_link_flags; |
2464 | |
2465 | kern_packet_t sph = SK_PTR_ENCODE(spkt, |
2466 | METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)); |
2467 | kern_packet_t dph = SK_PTR_ENCODE(dpkt, |
2468 | METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt)); |
2469 | fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, |
2470 | dpkt->pkt_headroom, sph, spkt->pkt_headroom, |
2471 | spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt), |
2472 | (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom), |
2473 | (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom), |
2474 | (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT)); |
2475 | |
2476 | FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT); |
2477 | |
2478 | return 0; |
2479 | } |
2480 | |
2481 | #if SK_LOG |
2482 | /* Hoisted out of line to reduce kernel stack footprint */ |
2483 | SK_LOG_ATTRIBUTE |
2484 | static void |
2485 | dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp, |
2486 | struct __kern_packet *spkt, struct __kern_packet *dpkt, int error) |
2487 | { |
2488 | struct proc *p = current_proc(); |
2489 | struct ifnet *ifp = fsw->fsw_ifp; |
2490 | uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX); |
2491 | |
2492 | if (error == ERANGE) { |
		SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
		    "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
		    (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
2497 | } else if (error == ENOBUFS) { |
		SK_DF(logflags, "%s(%d) packet allocation failure",
2499 | sk_proc_name_address(p), sk_proc_pid(p)); |
2500 | } else if (error == 0) { |
2501 | ASSERT(dpkt != NULL); |
2502 | char *daddr; |
2503 | MD_BUFLET_ADDR_ABS(dpkt, daddr); |
		SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
2505 | sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length, |
2506 | dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom, |
2507 | (uint32_t)fsw->fsw_frame_headroom, |
2508 | (uint32_t)ifp->if_tx_headroom); |
		SK_DF(logflags | SK_VERB_DUMP, "%s",
		    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
2511 | } else { |
		SK_DF(logflags, "%s(%d) error %d", sk_proc_name_address(p),
		    sk_proc_pid(p), error);
2513 | } |
2514 | } |
2515 | #else |
2516 | #define dp_copy_to_dev_log(...) |
2517 | #endif /* SK_LOG */ |
2518 | |
2519 | static void |
2520 | fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt) |
2521 | { |
2522 | ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK)); |
2523 | ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK)); |
2524 | |
2525 | SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0); |
2526 | /* Copy packet metadata */ |
2527 | _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum); |
2528 | _PKT_COPY(spkt, dpkt); |
2529 | _PKT_COPY_TX_PORT_DATA(spkt, dpkt); |
2530 | ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) || |
2531 | !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp)); |
2532 | ASSERT(dpkt->pkt_mbuf == NULL); |
2533 | |
2534 | /* Copy AQM metadata */ |
2535 | dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type; |
2536 | dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx; |
2537 | _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0); |
2538 | _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id); |
2539 | _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid); |
2540 | dpkt->pkt_policy_id = spkt->pkt_policy_id; |
2541 | dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id; |
2542 | } |
2543 | |
2544 | static int |
2545 | dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
2546 | struct __kern_packet *dpkt) |
2547 | { |
2548 | const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp; |
2549 | struct ifnet *ifp = fsw->fsw_ifp; |
2550 | uint32_t dev_pkt_len; |
2551 | int err = 0; |
2552 | |
2553 | fsw_pkt_copy_metadata(spkt, dpkt); |
2554 | switch (fsw->fsw_classq_enq_ptype) { |
2555 | case QP_MBUF: |
2556 | err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt); |
2557 | break; |
2558 | |
2559 | case QP_PACKET: |
2560 | dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom + |
2561 | spkt->pkt_length; |
2562 | if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) { |
2563 | FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN); |
2564 | err = ERANGE; |
2565 | goto done; |
2566 | } |
2567 | err = dp_copy_to_dev_pkt(fsw, spkt, dpkt); |
2568 | break; |
2569 | |
2570 | default: |
2571 | VERIFY(0); |
2572 | __builtin_unreachable(); |
2573 | } |
2574 | done: |
2575 | dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err); |
2576 | return err; |
2577 | } |
2578 | |
static int
dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
2582 | { |
2583 | uint8_t *sbaddr, *dbaddr; |
2584 | uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom; |
2585 | uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128); |
2586 | |
2587 | fsw_pkt_copy_metadata(spkt, dpkt); |
2588 | |
2589 | MD_BUFLET_ADDR_ABS(spkt, sbaddr); |
2590 | ASSERT(sbaddr != NULL); |
2591 | sbaddr += spkt->pkt_headroom; |
2592 | |
2593 | MD_BUFLET_ADDR_ABS(dpkt, dbaddr); |
2594 | ASSERT(dbaddr != NULL); |
2595 | dpkt->pkt_headroom = (uint8_t)headroom; |
2596 | dbaddr += headroom; |
2597 | |
	pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2599 | METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom); |
2600 | |
2601 | /* packet length is set to the full length */ |
2602 | dpkt->pkt_length = spkt->pkt_length; |
2603 | dpkt->pkt_pflags |= PKT_F_TRUNCATED; |
2604 | return 0; |
2605 | } |
2606 | |
2607 | static struct mbuf * |
2608 | convert_pkt_to_mbuf(struct __kern_packet *pkt) |
2609 | { |
2610 | ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA); |
2611 | ASSERT(pkt->pkt_mbuf != NULL); |
2612 | struct mbuf *m = pkt->pkt_mbuf; |
2613 | |
2614 | /* pass additional metadata generated from flow parse/lookup */ |
2615 | _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) == |
2616 | sizeof(pkt->pkt_flow_token)); |
2617 | _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == |
2618 | sizeof(pkt->pkt_flowsrc_token)); |
2619 | _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == |
2620 | sizeof(pkt->pkt_flowsrc_fidx)); |
2621 | m->m_pkthdr.pkt_svc = pkt->pkt_svc_class; |
2622 | m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto; |
2623 | m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token; |
2624 | m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt; |
2625 | m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type; |
2626 | m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token; |
2627 | m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx; |
2628 | |
2629 | if (pkt->pkt_transport_protocol == IPPROTO_QUIC) { |
2630 | m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC; |
2631 | } |
2632 | |
2633 | /* The packet should have a timestamp by the time we get here. */ |
2634 | m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp; |
2635 | m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID; |
2636 | |
2637 | m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK; |
2638 | m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK); |
2639 | /* set pkt_hdr so that AQM can find IP header and mark ECN bits */ |
2640 | m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len; |
2641 | |
2642 | if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) { |
2643 | m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq); |
2644 | } |
2645 | KPKT_CLEAR_MBUF_DATA(pkt); |
2646 | |
2647 | /* mbuf has been consumed, release packet as well */ |
2648 | ASSERT(pkt->pkt_qum.qum_ksd == NULL); |
2649 | pp_free_packet_single(pkt); |
2650 | return m; |
2651 | } |
2652 | |
2653 | static void |
2654 | convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list, |
2655 | struct mbuf **head, struct mbuf **tail, |
2656 | uint32_t *cnt, uint32_t *bytes) |
2657 | { |
2658 | struct __kern_packet *pkt = pkt_list, *next; |
2659 | struct mbuf *m_head = NULL, **m_tailp = &m_head, *m = NULL; |
2660 | uint32_t c = 0, b = 0; |
2661 | |
2662 | while (pkt != NULL) { |
2663 | next = pkt->pkt_nextpkt; |
2664 | pkt->pkt_nextpkt = NULL; |
2665 | m = convert_pkt_to_mbuf(pkt); |
2666 | ASSERT(m != NULL); |
2667 | |
2668 | *m_tailp = m; |
2669 | m_tailp = &m->m_nextpkt; |
2670 | c++; |
2671 | b += m_pktlen(m); |
2672 | pkt = next; |
2673 | } |
2674 | if (head != NULL) { |
2675 | *head = m_head; |
2676 | } |
2677 | if (tail != NULL) { |
2678 | *tail = m; |
2679 | } |
2680 | if (cnt != NULL) { |
2681 | *cnt = c; |
2682 | } |
2683 | if (bytes != NULL) { |
2684 | *bytes = b; |
2685 | } |
2686 | } |
2687 | |
2688 | SK_NO_INLINE_ATTRIBUTE |
2689 | static int |
2690 | classq_enqueue_flow_single(struct nx_flowswitch *fsw, |
2691 | struct __kern_packet *pkt) |
2692 | { |
2693 | struct ifnet *ifp = fsw->fsw_ifp; |
2694 | boolean_t pkt_drop = FALSE; |
2695 | int err; |
2696 | |
2697 | FSW_LOCK_ASSERT_HELD(fsw); |
2698 | ASSERT(fsw->fsw_classq_enabled); |
2699 | ASSERT(pkt->pkt_flow_token != 0); |
2700 | fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class, |
2701 | 1, pkt->pkt_length); |
2702 | |
2703 | if (__improbable(pkt->pkt_trace_id != 0)) { |
2704 | KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id); |
2705 | KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id); |
2706 | } |
2707 | |
2708 | switch (fsw->fsw_classq_enq_ptype) { |
2709 | case QP_MBUF: { /* compat interface */ |
2710 | struct mbuf *m; |
2711 | |
2712 | m = convert_pkt_to_mbuf(pkt); |
2713 | ASSERT(m != NULL); |
2714 | pkt = NULL; |
2715 | |
2716 | /* ifnet_enqueue consumes mbuf */ |
2717 | err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop); |
2718 | m = NULL; |
2719 | #if (DEVELOPMENT || DEBUG) |
2720 | if (__improbable(!pkt_drop)) { |
2721 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
2722 | } |
2723 | #endif /* DEVELOPMENT || DEBUG */ |
2724 | if (pkt_drop) { |
2725 | FSW_STATS_INC(FSW_STATS_DROP); |
2726 | FSW_STATS_INC(FSW_STATS_TX_AQM_DROP); |
2727 | } |
2728 | break; |
2729 | } |
2730 | case QP_PACKET: { /* native interface */ |
2731 | /* ifnet_enqueue consumes packet */ |
2732 | err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop); |
2733 | pkt = NULL; |
2734 | #if (DEVELOPMENT || DEBUG) |
2735 | if (__improbable(!pkt_drop)) { |
2736 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
2737 | } |
2738 | #endif /* DEVELOPMENT || DEBUG */ |
2739 | if (pkt_drop) { |
2740 | FSW_STATS_INC(FSW_STATS_DROP); |
2741 | FSW_STATS_INC(FSW_STATS_TX_AQM_DROP); |
2742 | } |
2743 | break; |
2744 | } |
2745 | default: |
2746 | err = EINVAL; |
2747 | VERIFY(0); |
2748 | /* NOTREACHED */ |
2749 | __builtin_unreachable(); |
2750 | } |
2751 | |
2752 | return err; |
2753 | } |
2754 | |
2755 | static int |
2756 | classq_enqueue_flow_chain(struct nx_flowswitch *fsw, |
2757 | struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail, |
2758 | uint32_t cnt, uint32_t bytes) |
2759 | { |
2760 | struct ifnet *ifp = fsw->fsw_ifp; |
2761 | boolean_t pkt_drop = FALSE; |
2762 | uint32_t svc; |
2763 | int err; |
2764 | |
2765 | FSW_LOCK_ASSERT_HELD(fsw); |
2766 | ASSERT(fsw->fsw_classq_enabled); |
2767 | ASSERT(pkt_head->pkt_flow_token != 0); |
2768 | |
2769 | /* |
2770 | * All packets in the flow should have the same svc. |
2771 | */ |
2772 | svc = pkt_head->pkt_svc_class; |
2773 | fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes); |
2774 | |
2775 | switch (fsw->fsw_classq_enq_ptype) { |
2776 | case QP_MBUF: { /* compat interface */ |
2777 | struct mbuf *m_head = NULL, *m_tail = NULL; |
2778 | uint32_t c = 0, b = 0; |
2779 | |
		convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
2781 | ASSERT(m_head != NULL && m_tail != NULL); |
2782 | ASSERT(c == cnt); |
2783 | ASSERT(b == bytes); |
2784 | pkt_head = NULL; |
2785 | |
2786 | /* ifnet_enqueue consumes mbuf */ |
2787 | err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt, |
2788 | bytes, FALSE, &pkt_drop); |
2789 | m_head = NULL; |
2790 | m_tail = NULL; |
2791 | #if (DEVELOPMENT || DEBUG) |
2792 | if (__improbable(!pkt_drop)) { |
2793 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
2794 | } |
2795 | #endif /* DEVELOPMENT || DEBUG */ |
2796 | if (pkt_drop) { |
2797 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt); |
2798 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, |
2799 | cnt); |
2800 | } |
2801 | break; |
2802 | } |
2803 | case QP_PACKET: { /* native interface */ |
2804 | /* ifnet_enqueue consumes packet */ |
2805 | err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt, |
2806 | bytes, FALSE, &pkt_drop); |
2807 | pkt_head = NULL; |
2808 | #if (DEVELOPMENT || DEBUG) |
2809 | if (__improbable(!pkt_drop)) { |
2810 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
2811 | } |
2812 | #endif /* DEVELOPMENT || DEBUG */ |
2813 | if (pkt_drop) { |
2814 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt); |
2815 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, |
2816 | cnt); |
2817 | } |
2818 | break; |
2819 | } |
2820 | default: |
2821 | err = EINVAL; |
2822 | VERIFY(0); |
2823 | /* NOTREACHED */ |
2824 | __builtin_unreachable(); |
2825 | } |
2826 | |
2827 | return err; |
2828 | } |
2829 | |
2830 | /* |
2831 | * This code path needs to be kept for interfaces without logical link support. |
2832 | */ |
2833 | static void |
2834 | classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe, |
2835 | bool chain, uint32_t cnt, uint32_t bytes) |
2836 | { |
2837 | bool flowadv_is_set = false; |
2838 | struct __kern_packet *pkt, *tail, *tpkt; |
2839 | flowadv_idx_t flow_adv_idx; |
2840 | bool flowadv_cap; |
2841 | flowadv_token_t flow_adv_token; |
2842 | int err; |
2843 | |
	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2846 | |
2847 | if (chain) { |
2848 | pkt = KPKTQ_FIRST(&fe->fe_tx_pktq); |
2849 | tail = KPKTQ_LAST(&fe->fe_tx_pktq); |
2850 | KPKTQ_INIT(&fe->fe_tx_pktq); |
2851 | if (pkt == NULL) { |
2852 | return; |
2853 | } |
2854 | flow_adv_idx = pkt->pkt_flowsrc_fidx; |
2855 | flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0); |
2856 | flow_adv_token = pkt->pkt_flow_token; |
2857 | |
		err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
2859 | |
2860 | /* set flow advisory if needed */ |
2861 | if (__improbable((err == EQFULL || err == EQSUSPENDED) && |
2862 | flowadv_cap)) { |
2863 | flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe), |
2864 | flow_adv_idx, flow_adv_token); |
2865 | } |
2866 | DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes, |
2867 | bool, flowadv_is_set); |
2868 | } else { |
2869 | uint32_t c = 0, b = 0; |
2870 | |
2871 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
2872 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
2873 | |
2874 | flow_adv_idx = pkt->pkt_flowsrc_fidx; |
2875 | flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0); |
2876 | flow_adv_token = pkt->pkt_flow_token; |
2877 | |
2878 | c++; |
2879 | b += pkt->pkt_length; |
2880 | err = classq_enqueue_flow_single(fsw, pkt); |
2881 | |
2882 | /* set flow advisory if needed */ |
2883 | if (__improbable(!flowadv_is_set && |
2884 | ((err == EQFULL || err == EQSUSPENDED) && |
2885 | flowadv_cap))) { |
2886 | flowadv_is_set = na_flowadv_set( |
2887 | flow_get_na(fsw, fe), flow_adv_idx, |
2888 | flow_adv_token); |
2889 | } |
2890 | } |
2891 | ASSERT(c == cnt); |
2892 | ASSERT(b == bytes); |
2893 | DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes, |
2894 | bool, flowadv_is_set); |
2895 | } |
2896 | |
2897 | /* notify flow advisory event */ |
2898 | if (__improbable(flowadv_is_set)) { |
2899 | struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe); |
2900 | if (__probable(r)) { |
2901 | na_flowadv_event(r); |
			SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
			    "%s(%d) notified of flow update",
2904 | sk_proc_name_address(current_proc()), |
2905 | sk_proc_pid(current_proc())); |
2906 | } |
2907 | } |
2908 | } |
2909 | |
2910 | /* |
2911 | * Logical link code path |
2912 | */ |
2913 | static void |
2914 | classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe, |
2915 | bool chain, uint32_t cnt, uint32_t bytes) |
2916 | { |
2917 | #pragma unused(chain) |
2918 | struct __kern_packet *pkt, *tail; |
2919 | flowadv_idx_t flow_adv_idx; |
2920 | bool flowadv_is_set = false; |
2921 | bool flowadv_cap; |
2922 | flowadv_token_t flow_adv_token; |
2923 | uint32_t flowctl = 0, dropped = 0; |
2924 | int err; |
2925 | |
	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2927 | if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq)); |
2928 | |
2929 | pkt = KPKTQ_FIRST(&fe->fe_tx_pktq); |
2930 | tail = KPKTQ_LAST(&fe->fe_tx_pktq); |
2931 | KPKTQ_INIT(&fe->fe_tx_pktq); |
2932 | if (pkt == NULL) { |
2933 | return; |
2934 | } |
2935 | flow_adv_idx = pkt->pkt_flowsrc_fidx; |
2936 | flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0); |
2937 | flow_adv_token = pkt->pkt_flow_token; |
2938 | |
2939 | err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes, |
2940 | &flowctl, &dropped); |
2941 | |
2942 | if (__improbable(err != 0)) { |
2943 | /* set flow advisory if needed */ |
2944 | if (flowctl > 0 && flowadv_cap) { |
2945 | flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe), |
2946 | flow_adv_idx, flow_adv_token); |
2947 | |
2948 | /* notify flow advisory event */ |
2949 | if (flowadv_is_set) { |
2950 | struct __kern_channel_ring *r = |
2951 | fsw_flow_get_tx_ring(fsw, fe); |
2952 | if (__probable(r)) { |
2953 | na_flowadv_event(r); |
					SK_DF(SK_VERB_FLOW_ADVISORY |
					    SK_VERB_TX,
					    "%s(%d) notified of flow update",
2957 | sk_proc_name_address(current_proc()), |
2958 | sk_proc_pid(current_proc())); |
2959 | } |
2960 | } |
2961 | } |
2962 | if (dropped > 0) { |
2963 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped); |
2964 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, |
2965 | dropped); |
2966 | } |
2967 | } |
2968 | } |
2969 | |
2970 | static void |
2971 | tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
2972 | { |
2973 | #pragma unused(fsw) |
2974 | /* finalize here; no more changes to buflets after classq */ |
2975 | if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) { |
2976 | kern_packet_t ph = SK_PTR_ENCODE(pkt, |
2977 | METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt)); |
2978 | int err = __packet_finalize(ph); |
2979 | VERIFY(err == 0); |
2980 | } |
2981 | } |
2982 | |
2983 | static bool |
2984 | dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
2985 | { |
2986 | struct flow_route *fr = fe->fe_route; |
2987 | int err; |
2988 | |
2989 | ASSERT(fr != NULL); |
2990 | |
2991 | if (__improbable(!dp_flow_route_process(fsw, fe))) { |
2992 | return false; |
2993 | } |
2994 | if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) { |
2995 | flow_qset_select_dynamic(fsw, fe, TRUE); |
2996 | } |
2997 | |
2998 | _FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags, |
2999 | _fsw_error35_handler, 1, fr, NULL, NULL); |
3000 | _FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags, |
3001 | _fsw_error36_handler, 1, fr, NULL); |
3002 | |
3003 | /* |
3004 | * See if we need to resolve the flow route; note the test against |
3005 | * fr_flags here is done without any lock for performance. Thus |
3006 | * it's possible that we race against the thread performing route |
3007 | * event updates for a packet (which is OK). In any case we should |
3008 | * not have any assertion on fr_flags value(s) due to the lack of |
3009 | * serialization. |
3010 | */ |
3011 | if (fr->fr_flags & FLOWRTF_RESOLVED) { |
3012 | goto frame; |
3013 | } |
3014 | |
3015 | struct __kern_packet *pkt, *tpkt; |
3016 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
3017 | err = fsw->fsw_resolve(fsw, fr, pkt); |
3018 | _FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err); |
3019 | _FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err); |
3020 | /* |
3021 | * If resolver returns EJUSTRETURN then we drop the pkt as the |
3022 | * resolver should have converted the pkt into mbuf (or |
3023 | * detached the attached mbuf from pkt) and added it to the |
3024 | * llinfo queue. If we do have a cached llinfo, then proceed |
3025 | * to using it even though it may be stale (very unlikely) |
3026 | * while the resolution is in progress. |
3027 | * Otherwise, any other error results in dropping pkt. |
3028 | */ |
3029 | if (err == EJUSTRETURN) { |
3030 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
3031 | pp_free_packet_single(pkt); |
3032 | FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING); |
3033 | continue; |
3034 | } else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) { |
3035 | /* use existing llinfo */ |
3036 | FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE); |
3037 | } else if (err != 0) { |
3038 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
3039 | pp_free_packet_single(pkt); |
3040 | FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL); |
3041 | continue; |
3042 | } |
3043 | } |
3044 | |
3045 | frame: |
3046 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
3047 | if (fsw->fsw_frame != NULL) { |
3048 | fsw->fsw_frame(fsw, fr, pkt); |
3049 | } |
3050 | } |
3051 | |
3052 | return true; |
3053 | } |
3054 | |
3055 | static void |
3056 | dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
3057 | { |
3058 | #pragma unused(fsw) |
3059 | struct __kern_packet *pkt, *tpkt; |
3060 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
3061 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
3062 | /* listener is only allowed TCP RST */ |
3063 | if (pkt->pkt_flow_ip_proto == IPPROTO_TCP && |
3064 | (pkt->pkt_flow_tcp_flags & TH_RST) != 0) { |
			flow_track_abort_tcp(fe, NULL, pkt);
3066 | } else { |
3067 | char *addr; |
3068 | MD_BUFLET_ADDR_ABS(pkt, addr); |
			SK_ERR("listener flow sends non-RST packet %s",
3070 | sk_dump(sk_proc_name_address(current_proc()), |
3071 | addr, pkt->pkt_length, 128, NULL, 0)); |
3072 | } |
3073 | pp_free_packet_single(pkt); |
3074 | } |
3075 | } |
3076 | |
3077 | static void |
3078 | fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts, |
3079 | volatile uint64_t *rt_ts, ifnet_t ifp) |
3080 | { |
3081 | struct timespec now; |
3082 | uint64_t now_nsec = 0; |
3083 | |
3084 | if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) { |
		nanouptime(&now);
3086 | net_timernsec(&now, &now_nsec); |
3087 | pkt->pkt_timestamp = now_nsec; |
3088 | } |
3089 | pkt->pkt_pflags &= ~PKT_F_TS_VALID; |
3090 | |
3091 | /* |
3092 | * If the packet service class is not background, |
3093 | * update the timestamps on the interface, as well as |
3094 | * the ones in nexus-wide advisory to indicate recent |
3095 | * activity on a foreground flow. |
3096 | */ |
3097 | if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) { |
3098 | ifp->if_fg_sendts = (uint32_t)_net_uptime; |
3099 | if (fg_ts != NULL) { |
3100 | *fg_ts = _net_uptime; |
3101 | } |
3102 | } |
3103 | if (pkt->pkt_pflags & PKT_F_REALTIME) { |
3104 | ifp->if_rt_sendts = (uint32_t)_net_uptime; |
3105 | if (rt_ts != NULL) { |
3106 | *rt_ts = _net_uptime; |
3107 | } |
3108 | } |
3109 | } |
3110 | |
3111 | static bool |
3112 | fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, bool gso_enabled) |
3113 | { |
3114 | return fsw_chain_enqueue != 0 && |
3115 | fsw->fsw_ifp->if_output_netem == NULL && |
3116 | (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 && |
3117 | gso_enabled; |
3118 | } |
3119 | |
3120 | void |
3121 | dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
3122 | uint32_t flags) |
3123 | { |
3124 | struct pktq dropped_pkts; |
3125 | bool chain, gso = ((flags & FLOW_PROC_FLAG_GSO) != 0); |
3126 | uint32_t cnt = 0, bytes = 0; |
3127 | volatile struct sk_nexusadv *nxadv = NULL; |
3128 | volatile uint64_t *fg_ts = NULL; |
3129 | volatile uint64_t *rt_ts = NULL; |
3130 | uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0; |
3131 | |
3132 | KPKTQ_INIT(&dropped_pkts); |
3133 | ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq)); |
3134 | if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) { |
3135 | dp_listener_flow_tx_process(fsw, fe); |
3136 | return; |
3137 | } |
3138 | if (__improbable(!dp_flow_tx_route_process(fsw, fe))) { |
		SK_RDERR(5, "Tx route bad");
3140 | FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE, |
3141 | KPKTQ_LEN(&fe->fe_tx_pktq)); |
3142 | KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq); |
3143 | goto done; |
3144 | } |
	chain = fsw_chain_enqueue_enabled(fsw, gso);
3146 | if (chain) { |
3147 | nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv; |
3148 | if (nxadv != NULL) { |
3149 | fg_ts = &nxadv->nxadv_fg_sendts; |
3150 | rt_ts = &nxadv->nxadv_rt_sendts; |
3151 | } |
3152 | } |
3153 | struct __kern_packet *pkt, *tpkt; |
3154 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
3155 | int err = 0; |
3156 | |
3157 | err = flow_pkt_track(fe, pkt, false); |
3158 | if (__improbable(err != 0)) { |
			SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
3160 | FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR); |
3161 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
3162 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3163 | continue; |
3164 | } |
3165 | _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid); |
3166 | pkt->pkt_transport_protocol = fe->fe_transport_protocol; |
3167 | |
3168 | /* set AQM related values for outgoing packet */ |
3169 | if (fe->fe_adv_idx != FLOWADV_IDX_NONE) { |
3170 | pkt->pkt_pflags |= PKT_F_FLOW_ADV; |
3171 | pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL; |
3172 | pkt->pkt_flowsrc_fidx = fe->fe_adv_idx; |
3173 | } else { |
3174 | pkt->pkt_pflags &= ~PKT_F_FLOW_ADV; |
3175 | } |
3176 | _UUID_CLEAR(pkt->pkt_flow_id); |
3177 | pkt->pkt_flow_token = fe->fe_flowid; |
3178 | pkt->pkt_pflags |= PKT_F_FLOW_ID; |
3179 | pkt->pkt_qset_idx = qset_idx; |
3180 | pkt->pkt_policy_id = fe->fe_policy_id; |
3181 | pkt->pkt_skip_policy_id = fe->fe_skip_policy_id; |
3182 | |
3183 | /* |
3184 | * The same code is exercised per packet for the non-chain case |
3185 | * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid |
3186 | * re-walking the chain later. |
3187 | */ |
3188 | if (chain) { |
			fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
3190 | } |
3191 | /* mark packet tos/svc_class */ |
3192 | fsw_qos_mark(fsw, fe, pkt); |
3193 | |
3194 | tx_finalize_packet(fsw, pkt); |
3195 | bytes += pkt->pkt_length; |
3196 | cnt++; |
3197 | } |
3198 | |
3199 | /* snoop after it's finalized */ |
3200 | if (__improbable(pktap_total_tap_count != 0)) { |
3201 | fsw_snoop(fsw, fe, false); |
3202 | } |
3203 | if (fe->fe_qset != NULL) { |
3204 | classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes); |
3205 | } else { |
3206 | classq_enqueue_flow(fsw, fe, chain, cnt, bytes); |
3207 | } |
3208 | done: |
3209 | dp_drop_pktq(fsw, &dropped_pkts); |
3210 | } |
3211 | |
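/*
 * Validate a non-first IP fragment against the previously seen flow
 * entry.  Returns prev_fe if the fragment id matches the one recorded
 * there, or NULL (with stats bumped) if the fragment cannot be matched.
 */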
3212 | static struct flow_entry * |
3213 | tx_process_continuous_ip_frag(struct nx_flowswitch *fsw, |
3214 | struct flow_entry *prev_fe, struct __kern_packet *pkt) |
3215 | { |
3216 | ASSERT(!pkt->pkt_flow_ip_is_first_frag); |
3217 | |
3218 | if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) { |
3219 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID); |
3220 | SK_ERR("%s(%d) invalid zero fragment id" , |
3221 | sk_proc_name_address(current_proc()), |
3222 | sk_proc_pid(current_proc())); |
3223 | return NULL; |
3224 | } |
3225 | |
3226 | SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, |
3227 | "%s(%d) continuation frag, id %u" , |
3228 | sk_proc_name_address(current_proc()), |
3229 | sk_proc_pid(current_proc()), |
3230 | pkt->pkt_flow_ip_frag_id); |
3231 | if (__improbable(prev_fe == NULL || |
3232 | !prev_fe->fe_tx_is_cont_frag)) { |
3233 | SK_ERR("%s(%d) unexpected continuation frag" , |
3234 | sk_proc_name_address(current_proc()), |
3235 | sk_proc_pid(current_proc()), |
3236 | pkt->pkt_flow_ip_frag_id); |
3237 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
3238 | return NULL; |
3239 | } |
3240 | if (__improbable(pkt->pkt_flow_ip_frag_id != |
3241 | prev_fe->fe_tx_frag_id)) { |
3242 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
3243 | SK_ERR("%s(%d) wrong continuation frag id %u expecting %u" , |
3244 | sk_proc_name_address(current_proc()), |
3245 | sk_proc_pid(current_proc()), |
3246 | pkt->pkt_flow_ip_frag_id, |
3247 | prev_fe->fe_tx_frag_id); |
3248 | return NULL; |
3249 | } |
3250 | |
3251 | return prev_fe; |
3252 | } |
3253 | |
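/*
 * Look up the flow entry for an outbound packet, rejecting entries that
 * are being torn down or whose UUID does not match the packet's flow id.
 * Returns a retained entry on success, NULL otherwise.
 */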
3254 | static struct flow_entry * |
3255 | tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, |
3256 | struct flow_entry *prev_fe) |
3257 | { |
3258 | struct flow_entry *fe; |
3259 | |
3260 | fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe); |
3261 | if (__improbable(fe == NULL)) { |
3262 | goto done; |
3263 | } |
3264 | |
3265 | if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) { |
3266 | SK_RDERR(5, "Tx flow torn down" ); |
3267 | FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN); |
		flow_entry_release(&fe);
3269 | goto done; |
3270 | } |
3271 | |
3272 | _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1, |
3273 | null_func); |
3274 | |
3275 | if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) { |
3276 | uuid_string_t flow_id_str, pkt_id_str; |
3277 | sk_uuid_unparse(fe->fe_uuid, flow_id_str); |
3278 | sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str); |
3279 | SK_ERR("pkt flow id %s != flow id %s" , pkt_id_str, flow_id_str); |
		flow_entry_release(&fe);
3281 | FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID); |
3282 | } |
3283 | |
3284 | done: |
3285 | return fe; |
3286 | } |
3287 | |
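/*
 * Run the per-flow Tx handler (default, aggregate, etc.) on the packets
 * queued on fe_tx_pktq, then tear down the temporary queue.
 */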
3288 | static inline void |
3289 | tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
3290 | uint32_t flags) |
3291 | { |
3292 | ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq)); |
3293 | ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0); |
3294 | |
3295 | SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d" , |
3296 | KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port); |
3297 | |
3298 | /* flow related processing (default, agg, etc.) */ |
3299 | fe->fe_tx_process(fsw, fe, flags); |
3300 | |
3301 | KPKTQ_FINI(&fe->fe_tx_pktq); |
3302 | } |
3303 | |
3304 | #if SK_LOG |
3305 | static void |
3306 | dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt) |
3307 | { |
3308 | char *pkt_buf; |
3309 | MD_BUFLET_ADDR_ABS(pkt, pkt_buf); |
3310 | SK_DF(verb, "%s(%d) %s %s" , sk_proc_name_address(current_proc()), |
3311 | sk_proc_pid(current_proc()), desc, sk_dump("buf" , pkt_buf, |
3312 | pkt->pkt_length, 128, NULL, 0)); |
3313 | } |
3314 | #else /* !SK_LOG */ |
3315 | #define dp_tx_log_pkt(...) |
3316 | #endif /* !SK_LOG */ |
3317 | |
3318 | static inline struct ifnet * |
3319 | fsw_datamov_begin(struct nx_flowswitch *fsw) |
3320 | { |
3321 | struct ifnet *ifp; |
3322 | |
3323 | ifp = fsw->fsw_ifp; |
3324 | if (!ifnet_datamov_begin(ifp)) { |
3325 | DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp); |
3326 | return NULL; |
3327 | } |
3328 | return ifp; |
3329 | } |
3330 | |
3331 | static inline void |
3332 | fsw_datamov_end(struct nx_flowswitch *fsw) |
3333 | { |
3334 | ifnet_datamov_end(fsw->fsw_ifp); |
3335 | } |
3336 | |
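/*
 * Tx datapath for packets arriving from a user channel: copy each source
 * packet into a device-pool packet, demux and classify it, batch packets
 * belonging to the same flow, run per-flow processing, and finally kick
 * the netif transmit path.
 */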
3337 | static void |
3338 | dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq) |
3339 | { |
3340 | struct __kern_packet *spkt, *pkt; |
3341 | struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); |
3342 | struct flow_entry *fe, *prev_fe; |
3343 | struct pktq dropped_pkts, dpktq; |
3344 | struct nexus_adapter *dev_na; |
3345 | struct kern_pbufpool *dev_pp; |
3346 | struct ifnet *ifp = NULL; |
3347 | sa_family_t af; |
3348 | uint32_t n_pkts, n_flows = 0; |
3349 | boolean_t do_pacing = FALSE; |
3350 | |
3351 | int err; |
3352 | KPKTQ_INIT(&dpktq); |
3353 | KPKTQ_INIT(&dropped_pkts); |
3354 | n_pkts = KPKTQ_LEN(spktq); |
3355 | |
3356 | FSW_RLOCK(fsw); |
3357 | if (__improbable(FSW_QUIESCED(fsw))) { |
3358 | DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw); |
3359 | SK_ERR("flowswitch detached, dropping %d pkts" , n_pkts); |
3360 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
3361 | goto done; |
3362 | } |
3363 | dev_na = fsw->fsw_dev_ch->ch_na; |
3364 | if (__improbable(dev_na == NULL)) { |
3365 | SK_ERR("dev port not attached, dropping %d pkts" , n_pkts); |
3366 | FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts); |
3367 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
3368 | goto done; |
3369 | } |
3370 | ifp = fsw_datamov_begin(fsw); |
3371 | if (ifp == NULL) { |
3372 | SK_ERR("ifnet not attached, dropping %d pkts" , n_pkts); |
3373 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
3374 | goto done; |
3375 | } |
3376 | |
3377 | /* batch allocate enough packets */ |
3378 | dev_pp = na_kr_get_pp(dev_na, NR_TX); |
3379 | |
3380 | err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL, |
3381 | NULL, SKMEM_NOSLEEP); |
3382 | #if DEVELOPMENT || DEBUG |
3383 | if (__probable(err != ENOMEM)) { |
3384 | _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq); |
3385 | } |
3386 | #endif /* DEVELOPMENT || DEBUG */ |
3387 | if (__improbable(err == ENOMEM)) { |
3388 | ASSERT(KPKTQ_EMPTY(&dpktq)); |
3389 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
3390 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
3391 | SK_ERR("failed to alloc %u pkts from device pool" , n_pkts); |
3392 | goto done; |
3393 | } else if (__improbable(err == EAGAIN)) { |
3394 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, |
3395 | (n_pkts - KPKTQ_LEN(&dpktq))); |
3396 | FSW_STATS_ADD(FSW_STATS_DROP, |
3397 | (n_pkts - KPKTQ_LEN(&dpktq))); |
3398 | } |
3399 | |
3400 | n_pkts = KPKTQ_LEN(&dpktq); |
3401 | prev_fe = NULL; |
3402 | KPKTQ_FOREACH(spkt, spktq) { |
3403 | if (n_pkts == 0) { |
3404 | break; |
3405 | } |
3406 | --n_pkts; |
3407 | |
3408 | KPKTQ_DEQUEUE(&dpktq, pkt); |
3409 | ASSERT(pkt != NULL); |
		err = dp_copy_to_dev(fsw, spkt, pkt);
3411 | if (__improbable(err != 0)) { |
3412 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3413 | continue; |
3414 | } |
3415 | |
3416 | do_pacing |= ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0); |
3417 | af = fsw_ip_demux(fsw, pkt); |
3418 | if (__improbable(af == AF_UNSPEC)) { |
3419 | dp_tx_log_pkt(SK_VERB_ERROR, "demux err" , pkt); |
3420 | FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR); |
3421 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3422 | continue; |
3423 | } |
3424 | |
3425 | err = flow_pkt_classify(pkt, ifp, af, false); |
3426 | if (__improbable(err != 0)) { |
3427 | dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err" , pkt); |
3428 | FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR); |
3429 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3430 | continue; |
3431 | } |
3432 | |
3433 | if (__improbable(pkt->pkt_flow_ip_is_frag && |
3434 | !pkt->pkt_flow_ip_is_first_frag)) { |
3435 | fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt); |
3436 | if (__probable(fe != NULL)) { |
3437 | flow_entry_retain(fe); |
3438 | goto flow_batch; |
3439 | } else { |
3440 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
3441 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3442 | continue; |
3443 | } |
3444 | } |
3445 | |
3446 | fe = tx_lookup_flow(fsw, pkt, prev_fe); |
3447 | if (__improbable(fe == NULL)) { |
3448 | FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND); |
3449 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3450 | prev_fe = NULL; |
3451 | continue; |
3452 | } |
3453 | flow_batch: |
		tx_flow_batch_packet(&fes, fe, pkt);
		prev_fe = fe;
	}

	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		tx_flow_process(fsw, fe, 0);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
		flow_entry_release(&fe);
3465 | n_flows++; |
3466 | } |
3467 | |
3468 | done: |
3469 | FSW_RUNLOCK(fsw); |
3470 | if (n_flows > 0) { |
3471 | netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0)); |
3472 | } |
3473 | if (ifp != NULL) { |
3474 | fsw_datamov_end(fsw); |
3475 | } |
3476 | dp_drop_pktq(fsw, &dropped_pkts); |
3477 | KPKTQ_FINI(&dropped_pkts); |
3478 | KPKTQ_FINI(&dpktq); |
3479 | } |
3480 | |
3481 | static sa_family_t |
3482 | get_tso_af(struct __kern_packet *pkt) |
3483 | { |
3484 | packet_tso_flags_t tso_flags; |
3485 | |
3486 | tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS; |
3487 | if (tso_flags == PACKET_TSO_IPV4) { |
3488 | return AF_INET; |
3489 | } else if (tso_flags == PACKET_TSO_IPV6) { |
3490 | return AF_INET6; |
3491 | } else { |
3492 | panic("invalid tso flags: 0x%x\n" , tso_flags); |
3493 | /* NOTREACHED */ |
3494 | __builtin_unreachable(); |
3495 | } |
3496 | } |
3497 | |
3498 | static inline void |
3499 | update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, |
3500 | uint16_t payload_sz) |
3501 | { |
3502 | struct tcphdr *tcp = tcphdr; |
3503 | |
3504 | DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt, |
3505 | void *, iphdr, void *, tcphdr, uint16_t, payload_sz); |
3506 | pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr; |
3507 | pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr; |
3508 | pkt->pkt_flow_tcp_flags = tcp->th_flags; |
3509 | pkt->pkt_flow_tcp_seq = tcp->th_seq; |
3510 | pkt->pkt_flow_ulen = payload_sz; |
3511 | } |
3512 | |
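/*
 * Software GSO: segment orig_pkt (a TSO packet handed up from the
 * channel) into MSS-sized packets drawn from dev_pktq.  Headers are
 * copied from the original packet and patched up (IP id/length, TCP
 * sequence/flags), and the IP/TCP checksums are computed using the
 * partial sum accumulated while copying the payload.  The resulting
 * packets are appended to gso_pktq; finalization happens later in
 * dp_flow_tx_process() after framing.
 */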
3513 | static int |
3514 | do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt, |
3515 | struct __kern_packet *first_pkt, struct pktq *dev_pktq, |
3516 | struct pktq *gso_pktq) |
3517 | { |
3518 | ifnet_t ifp = fsw->fsw_ifp; |
3519 | struct __kern_packet *pkt = first_pkt; |
3520 | uint8_t proto = pkt->pkt_flow_ip_proto; |
3521 | uint16_t ip_hlen = pkt->pkt_flow_ip_hlen; |
3522 | uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen; |
3523 | uint16_t total_hlen = ip_hlen + tcp_hlen; |
3524 | uint16_t mtu = (uint16_t)ifp->if_mtu; |
3525 | uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz; |
3526 | uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length; |
3527 | uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom; |
3528 | kern_packet_t orig_ph = SK_PKT2PH(orig_pkt); |
3529 | uint8_t *orig_pkt_baddr; |
3530 | struct tcphdr *tcp; |
3531 | struct ip *ip; |
3532 | struct ip6_hdr *ip6; |
3533 | uint32_t tcp_seq; |
3534 | uint16_t ipid; |
3535 | uint32_t pseudo_hdr_csum, bufsz; |
3536 | |
3537 | ASSERT(headroom <= UINT8_MAX); |
3538 | if (proto != IPPROTO_TCP) { |
3539 | SK_ERR("invalid proto: %d" , proto); |
3540 | DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *, |
3541 | fsw, ifnet_t, ifp, uint8_t, proto); |
3542 | return EINVAL; |
3543 | } |
3544 | if (mss == 0 || mss > (mtu - total_hlen)) { |
3545 | SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d" , |
3546 | mss, mtu, total_hlen); |
3547 | DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *, |
3548 | fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu, |
3549 | uint32_t, total_hlen); |
3550 | return EINVAL; |
3551 | } |
3552 | bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp); |
3553 | if ((headroom + total_hlen + mss) > bufsz) { |
3554 | SK_ERR("invalid args: headroom %d, total_hlen %d, " |
3555 | "mss %d, bufsz %d" , headroom, total_hlen, mss, bufsz); |
3556 | DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *, |
3557 | fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t, |
3558 | total_hlen, uint16_t, mss, uint32_t, bufsz); |
3559 | return EINVAL; |
3560 | } |
3561 | n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss); |
3562 | |
3563 | ASSERT(pkt->pkt_headroom == headroom); |
3564 | ASSERT(pkt->pkt_length == total_len); |
3565 | ASSERT(pkt->pkt_l2_len == 0); |
3566 | ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0); |
3567 | ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0); |
3568 | pkt->pkt_pflags &= ~PKT_F_TRUNCATED; |
3569 | pkt->pkt_proto_seg_sz = 0; |
3570 | pkt->pkt_csum_flags = 0; |
3571 | MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr); |
3572 | orig_pkt_baddr += orig_pkt->pkt_headroom; |
3573 | |
3574 | if (af == AF_INET) { |
3575 | ip = (struct ip *)pkt->pkt_flow_ip_hdr; |
3576 | tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr; |
3577 | ipid = ip->ip_id; |
3578 | pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr, |
3579 | pkt->pkt_flow_ipv4_dst.s_addr, 0); |
3580 | } else { |
3581 | ASSERT(af == AF_INET6); |
3582 | tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr; |
3583 | pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src, |
3584 | &pkt->pkt_flow_ipv6_dst, 0); |
3585 | } |
3586 | tcp_seq = ntohl(tcp->th_seq); |
3587 | |
3588 | for (n = 1, payload_sz = mss, off = total_hlen; off < total_len; |
3589 | off += payload_sz) { |
3590 | uint8_t *baddr, *baddr0; |
3591 | uint32_t partial; |
3592 | |
3593 | if (pkt == NULL) { |
3594 | n++; |
3595 | KPKTQ_DEQUEUE(dev_pktq, pkt); |
3596 | ASSERT(pkt != NULL); |
3597 | } |
3598 | MD_BUFLET_ADDR_ABS(pkt, baddr0); |
3599 | baddr = baddr0; |
3600 | baddr += headroom; |
3601 | |
3602 | /* Copy headers from the original packet */ |
3603 | if (n != 1) { |
3604 | ASSERT(pkt != first_pkt); |
			pkt_copy(orig_pkt_baddr, baddr, total_hlen);
			fsw_pkt_copy_metadata(first_pkt, pkt);

			ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
			/* flow info still needs to be updated below */
			bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
			    sizeof(*pkt->pkt_flow));
3612 | pkt->pkt_trace_id = 0; |
3613 | ASSERT(pkt->pkt_headroom == headroom); |
3614 | } else { |
3615 | METADATA_SET_LEN(pkt, 0, 0); |
3616 | } |
3617 | baddr += total_hlen; |
3618 | |
3619 | /* Copy/checksum the payload from the original packet */ |
3620 | if (off + payload_sz > total_len) { |
3621 | payload_sz = (uint16_t)(total_len - off); |
3622 | } |
3623 | pkt_copypkt_sum(orig_ph, |
3624 | (uint16_t)(orig_pkt->pkt_headroom + off), |
3625 | SK_PKT2PH(pkt), headroom + total_hlen, payload_sz, |
3626 | &partial, TRUE); |
3627 | |
3628 | DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw, |
3629 | ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz, |
3630 | uint16_t, mss, uint32_t, partial); |
3631 | FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT); |
3632 | |
3633 | /* |
3634 | * Adjust header information and fill in the missing fields. |
3635 | */ |
3636 | if (af == AF_INET) { |
3637 | ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom); |
3638 | tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen); |
3639 | |
3640 | if (n != n_pkts) { |
3641 | tcp->th_flags &= ~(TH_FIN | TH_PUSH); |
3642 | } |
3643 | if (n != 1) { |
3644 | tcp->th_flags &= ~TH_CWR; |
3645 | tcp->th_seq = htonl(tcp_seq); |
3646 | } |
			update_flow_info(pkt, ip, tcp, payload_sz);

			ip->ip_id = htons((ipid)++);
			ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
			ip->ip_sum = 0;
			ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
			tcp->th_sum = 0;
			partial = __packet_cksum(tcp, tcp_hlen, partial);
3655 | partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz); |
3656 | partial += pseudo_hdr_csum; |
3657 | ADDCARRY(partial); |
3658 | tcp->th_sum = ~(uint16_t)partial; |
3659 | } else { |
3660 | ASSERT(af == AF_INET6); |
3661 | ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom); |
3662 | tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen); |
3663 | |
3664 | if (n != n_pkts) { |
3665 | tcp->th_flags &= ~(TH_FIN | TH_PUSH); |
3666 | } |
3667 | if (n != 1) { |
3668 | tcp->th_flags &= ~TH_CWR; |
3669 | tcp->th_seq = htonl(tcp_seq); |
3670 | } |
			update_flow_info(pkt, ip6, tcp, payload_sz);

			ip6->ip6_plen = htons(tcp_hlen + payload_sz);
			tcp->th_sum = 0;
			partial = __packet_cksum(tcp, tcp_hlen, partial);
3676 | partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz); |
3677 | partial += pseudo_hdr_csum; |
3678 | ADDCARRY(partial); |
3679 | tcp->th_sum = ~(uint16_t)partial; |
3680 | } |
3681 | tcp_seq += payload_sz; |
3682 | METADATA_ADJUST_LEN(pkt, total_hlen, headroom); |
3683 | #if (DEVELOPMENT || DEBUG) |
3684 | struct __kern_buflet *bft; |
3685 | uint32_t blen; |
3686 | PKT_GET_FIRST_BUFLET(pkt, 1, bft); |
3687 | blen = __buflet_get_data_length(bft); |
3688 | if (blen != total_hlen + payload_sz) { |
3689 | panic("blen (%d) != total_len + payload_sz (%d)\n" , |
3690 | blen, total_hlen + payload_sz); |
3691 | } |
3692 | #endif /* DEVELOPMENT || DEBUG */ |
3693 | |
3694 | pkt->pkt_length = total_hlen + payload_sz; |
3695 | KPKTQ_ENQUEUE(gso_pktq, pkt); |
3696 | pkt = NULL; |
3697 | |
3698 | /* |
3699 | * Note that at this point the packet is not yet finalized. |
3700 | * The finalization happens in dp_flow_tx_process() after |
3701 | * the framing is done. |
3702 | */ |
3703 | } |
3704 | ASSERT(n == n_pkts); |
3705 | ASSERT(off == total_len); |
3706 | DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp, |
3707 | uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen, |
3708 | uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr); |
3709 | return 0; |
3710 | } |
3711 | |
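/*
 * Append a freshly built GSO chain to the flow entry's Tx queue.  The
 * first time a flow shows up in this batch its queue is empty, so the
 * entry is linked onto the fes list and keeps the reference taken by the
 * lookup; otherwise the extra reference is dropped here.
 */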
3712 | static void |
3713 | tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe, |
3714 | struct pktq *gso_pktq) |
3715 | { |
3716 | if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) { |
3717 | ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0); |
3718 | TAILQ_INSERT_TAIL(fes, fe, fe_tx_link); |
3719 | KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq), |
3720 | KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq)); |
3721 | KPKTQ_INIT(gso_pktq); |
3722 | } else { |
3723 | ASSERT(!TAILQ_EMPTY(fes)); |
3724 | KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq), |
3725 | KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq)); |
3726 | KPKTQ_INIT(gso_pktq); |
		flow_entry_release(&fe);
3728 | } |
3729 | } |
3730 | |
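/*
 * Tx path for packets that require software GSO.  Device-pool packets are
 * pre-allocated in bulk (all or nothing), the headers of each source
 * packet are copied and classified to find its flow, and do_gso() then
 * produces the per-MSS segments which are enqueued and processed per
 * flow.
 */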
3731 | static void |
3732 | dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq, |
3733 | uint32_t gso_pkts_estimate) |
3734 | { |
3735 | struct __kern_packet *spkt, *pkt; |
3736 | struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); |
3737 | struct flow_entry *fe, *prev_fe; |
3738 | struct pktq dpktq; |
3739 | struct nexus_adapter *dev_na; |
3740 | struct kern_pbufpool *dev_pp; |
3741 | struct ifnet *ifp = NULL; |
3742 | sa_family_t af; |
3743 | uint32_t n_pkts, n_flows = 0; |
3744 | int err; |
3745 | |
3746 | KPKTQ_INIT(&dpktq); |
3747 | n_pkts = KPKTQ_LEN(spktq); |
3748 | |
3749 | FSW_RLOCK(fsw); |
3750 | if (__improbable(FSW_QUIESCED(fsw))) { |
3751 | DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw); |
3752 | SK_ERR("flowswitch detached, dropping %d pkts" , n_pkts); |
3753 | dp_drop_pktq(fsw, spktq); |
3754 | goto done; |
3755 | } |
3756 | dev_na = fsw->fsw_dev_ch->ch_na; |
3757 | if (__improbable(dev_na == NULL)) { |
3758 | SK_ERR("dev port not attached, dropping %d pkts" , n_pkts); |
3759 | FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts); |
3760 | dp_drop_pktq(fsw, spktq); |
3761 | goto done; |
3762 | } |
3763 | ifp = fsw_datamov_begin(fsw); |
3764 | if (ifp == NULL) { |
3765 | SK_ERR("ifnet not attached, dropping %d pkts" , n_pkts); |
3766 | dp_drop_pktq(fsw, spktq); |
3767 | goto done; |
3768 | } |
3769 | |
3770 | dev_pp = na_kr_get_pp(dev_na, NR_TX); |
3771 | |
3772 | /* |
3773 | * Batch allocate enough packets to perform GSO on all |
3774 | * packets in spktq. |
3775 | */ |
3776 | err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, |
3777 | gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP); |
3778 | #if DEVELOPMENT || DEBUG |
3779 | if (__probable(err != ENOMEM)) { |
3780 | _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq); |
3781 | } |
3782 | #endif /* DEVELOPMENT || DEBUG */ |
3783 | /* |
3784 | * We either get all packets or none. No partial allocations. |
3785 | */ |
3786 | if (__improbable(err != 0)) { |
3787 | if (err == ENOMEM) { |
3788 | ASSERT(KPKTQ_EMPTY(&dpktq)); |
3789 | } else { |
			dp_free_pktq(fsw, &dpktq);
3791 | } |
3792 | DTRACE_SKYWALK1(gso__no__mem, int, err); |
3793 | dp_drop_pktq(fsw, spktq); |
3794 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
3795 | SK_ERR("failed to alloc %u pkts from device pool" , |
3796 | gso_pkts_estimate); |
3797 | goto done; |
3798 | } |
3799 | prev_fe = NULL; |
3800 | KPKTQ_FOREACH(spkt, spktq) { |
3801 | KPKTQ_DEQUEUE(&dpktq, pkt); |
3802 | ASSERT(pkt != NULL); |
3803 | /* |
3804 | * Copy only headers to the first packet of the GSO chain. |
3805 | * The headers will be used for classification below. |
3806 | */ |
		err = dp_copy_headers_to_dev(fsw, spkt, pkt);
3808 | if (__improbable(err != 0)) { |
3809 | pp_free_packet_single(pkt); |
3810 | DTRACE_SKYWALK2(copy__headers__failed, |
3811 | struct nx_flowswitch *, fsw, |
3812 | struct __kern_packet *, spkt); |
3813 | continue; |
3814 | } |
3815 | af = get_tso_af(pkt); |
3816 | ASSERT(af == AF_INET || af == AF_INET6); |
3817 | |
3818 | err = flow_pkt_classify(pkt, ifp, af, false); |
3819 | if (__improbable(err != 0)) { |
3820 | dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err" , pkt); |
3821 | FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR); |
3822 | pp_free_packet_single(pkt); |
3823 | DTRACE_SKYWALK4(classify__failed, |
3824 | struct nx_flowswitch *, fsw, |
3825 | struct __kern_packet *, spkt, |
3826 | struct __kern_packet *, pkt, |
3827 | int, err); |
3828 | continue; |
3829 | } |
3830 | /* |
3831 | * GSO cannot be done on a fragment and it's a bug in user |
3832 | * space to mark a fragment as needing GSO. |
3833 | */ |
3834 | if (__improbable(pkt->pkt_flow_ip_is_frag)) { |
3835 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
3836 | pp_free_packet_single(pkt); |
3837 | DTRACE_SKYWALK3(is__frag, |
3838 | struct nx_flowswitch *, fsw, |
3839 | struct __kern_packet *, spkt, |
3840 | struct __kern_packet *, pkt); |
3841 | continue; |
3842 | } |
3843 | fe = tx_lookup_flow(fsw, pkt, prev_fe); |
3844 | if (__improbable(fe == NULL)) { |
3845 | FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND); |
3846 | pp_free_packet_single(pkt); |
3847 | DTRACE_SKYWALK3(lookup__failed, |
3848 | struct nx_flowswitch *, fsw, |
3849 | struct __kern_packet *, spkt, |
3850 | struct __kern_packet *, pkt); |
3851 | prev_fe = NULL; |
3852 | continue; |
3853 | } |
3854 | /* |
3855 | * Perform GSO on spkt using the flow information |
3856 | * obtained above. |
3857 | */ |
3858 | struct pktq gso_pktq; |
3859 | KPKTQ_INIT(&gso_pktq); |
		err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
		if (__probable(err == 0)) {
			tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
			prev_fe = fe;
		} else {
			DTRACE_SKYWALK1(gso__error, int, err);
			/* TODO: increment error stat */
			pp_free_packet_single(pkt);
			flow_entry_release(&fe);
3869 | prev_fe = NULL; |
3870 | } |
3871 | KPKTQ_FINI(&gso_pktq); |
3872 | } |
3873 | struct flow_entry *tfe = NULL; |
3874 | TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) { |
3875 | /* Chain-enqueue can be used for GSO chains */ |
3876 | tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO); |
3877 | TAILQ_REMOVE(&fes, fe, fe_tx_link); |
		flow_entry_release(&fe);
3879 | n_flows++; |
3880 | } |
3881 | done: |
3882 | FSW_RUNLOCK(fsw); |
3883 | if (n_flows > 0) { |
3884 | netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL); |
3885 | } |
3886 | if (ifp != NULL) { |
3887 | fsw_datamov_end(fsw); |
3888 | } |
3889 | |
3890 | /* |
3891 | * It's possible for packets to be left in dpktq because |
3892 | * gso_pkts_estimate is only an estimate. The actual number |
3893 | * of packets needed could be less. |
3894 | */ |
3895 | uint32_t dpktq_len; |
3896 | if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) { |
3897 | DTRACE_SKYWALK2(leftover__dev__pkts, |
3898 | struct nx_flowswitch *, fsw, uint32_t, dpktq_len); |
		dp_free_pktq(fsw, &dpktq);
3900 | } |
3901 | KPKTQ_FINI(&dpktq); |
3902 | } |
3903 | |
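/*
 * Flush a device (Rx) ring: repeatedly dequeue batches of packets and
 * feed them into the flowswitch receive path (or the netem input shaper,
 * if one is configured), then report the totals to the netif mitigation
 * layer.
 */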
3904 | static inline void |
3905 | fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
3906 | struct proc *p) |
3907 | { |
3908 | #pragma unused(p) |
3909 | uint32_t total_pkts = 0, total_bytes = 0; |
3910 | |
3911 | for (;;) { |
3912 | struct pktq pktq; |
3913 | KPKTQ_INIT(&pktq); |
3914 | uint32_t n_bytes; |
		fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
3916 | if (n_bytes == 0) { |
3917 | break; |
3918 | } |
3919 | total_pkts += KPKTQ_LEN(&pktq); |
3920 | total_bytes += n_bytes; |
3921 | |
3922 | if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) { |
			fsw_receive(fsw, &pktq);
		} else {
			fsw_dev_input_netem_enqueue(fsw, &pktq);
3926 | } |
3927 | KPKTQ_FINI(&pktq); |
3928 | } |
3929 | |
3930 | KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes); |
3931 | DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts, |
3932 | uint32_t, total_bytes); |
3933 | |
3934 | /* compute mitigation rate for delivered traffic */ |
3935 | if (__probable(r->ckr_netif_mit_stats != NULL)) { |
3936 | r->ckr_netif_mit_stats(r, total_pkts, total_bytes); |
3937 | } |
3938 | } |
3939 | |
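/*
 * Flush a user (Tx) ring: dequeue batches of packets from the channel
 * ring and push them down either the GSO path or the regular Tx path,
 * tagging the first packet of each batch with a trace id.
 */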
3940 | static inline void |
3941 | fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
3942 | struct proc *p) |
3943 | { |
3944 | #pragma unused(p) |
3945 | static packet_trace_id_t trace_id = 0; |
3946 | uint32_t total_pkts = 0, total_bytes = 0; |
3947 | |
3948 | for (;;) { |
3949 | struct pktq pktq; |
3950 | KPKTQ_INIT(&pktq); |
3951 | uint32_t n_bytes; |
3952 | uint32_t gso_pkts_estimate = 0; |
3953 | |
		fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
		    &gso_pkts_estimate);
3956 | if (n_bytes == 0) { |
3957 | break; |
3958 | } |
3959 | total_pkts += KPKTQ_LEN(&pktq); |
3960 | total_bytes += n_bytes; |
3961 | |
3962 | KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id; |
3963 | KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START, |
3964 | KPKTQ_FIRST(&pktq)->pkt_trace_id); |
3965 | |
		if (gso_pkts_estimate > 0) {
			dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
		} else {
			dp_tx_pktq(fsw, &pktq);
		}
		dp_free_pktq(fsw, &pktq);
		KPKTQ_FINI(&pktq);
	}
	kr_update_stats(r, total_pkts, total_bytes);
3975 | |
3976 | KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes); |
3977 | DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts, |
3978 | uint32_t, total_bytes); |
3979 | } |
3980 | |
3981 | void |
3982 | fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
3983 | struct proc *p) |
3984 | { |
3985 | struct nexus_vp_adapter *vpna = VPNA(KRNA(r)); |
3986 | |
3987 | ASSERT(sk_is_sync_protected()); |
3988 | ASSERT(vpna->vpna_nx_port != FSW_VP_HOST); |
3989 | ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET); |
3990 | |
3991 | if (vpna->vpna_nx_port == FSW_VP_DEV) { |
3992 | fsw_dev_ring_flush(fsw, r, p); |
3993 | } else { |
3994 | fsw_user_ring_flush(fsw, r, p); |
3995 | } |
3996 | } |
3997 | |
3998 | int |
3999 | fsw_dp_ctor(struct nx_flowswitch *fsw) |
4000 | { |
4001 | uint32_t fe_cnt = fsw_fe_table_size; |
4002 | uint32_t fob_cnt = fsw_flow_owner_buckets; |
4003 | uint32_t frb_cnt = fsw_flow_route_buckets; |
4004 | uint32_t frib_cnt = fsw_flow_route_id_buckets; |
4005 | struct kern_nexus *nx = fsw->fsw_nx; |
4006 | char name[64]; |
4007 | int error = 0; |
4008 | |
4009 | /* just in case */ |
4010 | if (fe_cnt == 0) { |
4011 | fe_cnt = NX_FSW_FE_TABLESZ; |
4012 | ASSERT(fe_cnt != 0); |
4013 | } |
4014 | if (fob_cnt == 0) { |
4015 | fob_cnt = NX_FSW_FOB_HASHSZ; |
4016 | ASSERT(fob_cnt != 0); |
4017 | } |
4018 | if (frb_cnt == 0) { |
4019 | frb_cnt = NX_FSW_FRB_HASHSZ; |
4020 | ASSERT(frb_cnt != 0); |
4021 | } |
4022 | if (frib_cnt == 0) { |
4023 | frib_cnt = NX_FSW_FRIB_HASHSZ; |
4024 | ASSERT(frib_cnt != 0); |
4025 | } |
4026 | |
4027 | /* make sure fe_cnt is a power of two, else round up */ |
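	/* (bit-smear round-up, e.g. 5000 -> 4999 -> 8191 -> 8192) */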
4028 | if ((fe_cnt & (fe_cnt - 1)) != 0) { |
4029 | fe_cnt--; |
4030 | fe_cnt |= (fe_cnt >> 1); |
4031 | fe_cnt |= (fe_cnt >> 2); |
4032 | fe_cnt |= (fe_cnt >> 4); |
4033 | fe_cnt |= (fe_cnt >> 8); |
4034 | fe_cnt |= (fe_cnt >> 16); |
4035 | fe_cnt++; |
4036 | } |
4037 | |
4038 | /* make sure frb_cnt is a power of two, else round up */ |
4039 | if ((frb_cnt & (frb_cnt - 1)) != 0) { |
4040 | frb_cnt--; |
4041 | frb_cnt |= (frb_cnt >> 1); |
4042 | frb_cnt |= (frb_cnt >> 2); |
4043 | frb_cnt |= (frb_cnt >> 4); |
4044 | frb_cnt |= (frb_cnt >> 8); |
4045 | frb_cnt |= (frb_cnt >> 16); |
4046 | frb_cnt++; |
4047 | } |
4048 | |
	lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
	    &nexus_lock_attr);
	lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
	lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
	TAILQ_INIT(&fsw->fsw_linger_head);

	(void) snprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4056 | error = nx_advisory_alloc(nx, name, |
4057 | &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV], |
4058 | NEXUS_ADVISORY_TYPE_FLOWSWITCH); |
4059 | if (error != 0) { |
4060 | fsw_dp_dtor(fsw); |
4061 | return error; |
4062 | } |
4063 | |
4064 | fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt); |
4065 | if (fsw->fsw_flow_mgr == NULL) { |
4066 | fsw_dp_dtor(fsw); |
4067 | return error; |
4068 | } |
4069 | |
4070 | /* generic name; will be customized upon ifattach */ |
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, name, "");

	if (kernel_thread_start(fsw_reap_thread_func, fsw,
	    &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4076 | panic_plain("%s: can't create thread" , __func__); |
4077 | /* NOTREACHED */ |
4078 | __builtin_unreachable(); |
4079 | } |
4080 | /* this must not fail */ |
4081 | VERIFY(fsw->fsw_reap_thread != NULL); |
4082 | |
4083 | SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC" , SK_KVA(fsw)); |
4084 | |
4085 | |
4086 | return error; |
4087 | } |
4088 | |
4089 | void |
4090 | fsw_dp_dtor(struct nx_flowswitch *fsw) |
4091 | { |
	uint64_t f = (1 * NSEC_PER_MSEC);       /* 1 ms */
	uint64_t s = (1000 * NSEC_PER_MSEC);    /* 1 sec */
4094 | uint32_t i = 0; |
4095 | |
4096 | #if (DEVELOPMENT || DEBUG) |
4097 | if (fsw->fsw_rps_threads != NULL) { |
4098 | for (i = 0; i < fsw->fsw_rps_nthreads; i++) { |
4099 | fsw_rps_thread_join(fsw, i); |
4100 | } |
4101 | kfree_type(struct fsw_rps_thread, fsw->fsw_rps_threads); |
4102 | } |
#endif /* DEVELOPMENT || DEBUG */
4104 | |
4105 | nx_advisory_free(fsw->fsw_nx); |
4106 | |
4107 | if (fsw->fsw_reap_thread != THREAD_NULL) { |
4108 | /* signal thread to begin self-termination */ |
		lck_mtx_lock(&fsw->fsw_reap_lock);
4110 | fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING; |
4111 | |
4112 | /* |
4113 | * And wait for thread to terminate; use another |
4114 | * wait channel here other than fsw_reap_flags to |
4115 | * make it more explicit. In the event the reaper |
4116 | * thread misses a wakeup, we'll try again once |
4117 | * every second (except for the first time). |
4118 | */ |
4119 | while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) { |
4120 | uint64_t t = 0; |
4121 | |
			nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
			clock_absolutetime_interval_to_deadline(t, &t);
			ASSERT(t != 0);

			fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
			if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
				thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
			}
			(void) assert_wait_deadline(&fsw->fsw_reap_thread,
			    THREAD_UNINT, t);
			lck_mtx_unlock(&fsw->fsw_reap_lock);
			thread_block(THREAD_CONTINUE_NULL);
			lck_mtx_lock(&fsw->fsw_reap_lock);
			fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
		}
		ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
		lck_mtx_unlock(&fsw->fsw_reap_lock);
4139 | fsw->fsw_reap_thread = THREAD_NULL; |
4140 | } |
4141 | |
4142 | /* free any remaining flow entries in the linger list */ |
4143 | fsw_linger_purge(fsw); |
4144 | |
4145 | if (fsw->fsw_flow_mgr != NULL) { |
4146 | flow_mgr_destroy(fsw->fsw_flow_mgr); |
4147 | fsw->fsw_flow_mgr = NULL; |
4148 | } |
4149 | |
4150 | |
	lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
	lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
	lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
4154 | } |
4155 | |
4156 | void |
4157 | fsw_linger_insert(struct flow_entry *fe) |
4158 | { |
4159 | struct nx_flowswitch *fsw = fe->fe_fsw; |
4160 | SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); |
4161 | SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b" , |
4162 | fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), |
4163 | fe->fe_flags, FLOWENTF_BITS); |
4164 | |
4165 | net_update_uptime(); |
4166 | |
4167 | ASSERT(flow_entry_refcnt(fe) >= 1); |
4168 | ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); |
4169 | ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); |
4170 | ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING)); |
4171 | ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE); |
4172 | ASSERT(fe->fe_linger_wait != 0); |
4173 | fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait); |
4174 | os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed); |
4175 | |
	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
	fsw->fsw_linger_cnt++;
	VERIFY(fsw->fsw_linger_cnt != 0);
	lck_mtx_unlock(&fsw->fsw_linger_lock);
4181 | |
4182 | fsw_reap_sched(fsw); |
4183 | } |
4184 | |
4185 | static void |
4186 | fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head, |
4187 | struct flow_entry *fe) |
4188 | { |
4189 | SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); |
4190 | SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b" , |
4191 | fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), |
4192 | fe->fe_flags, FLOWENTF_BITS); |
4193 | |
4194 | ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); |
4195 | ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); |
4196 | ASSERT(fe->fe_flags & FLOWENTF_LINGERING); |
4197 | os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed); |
4198 | |
4199 | TAILQ_REMOVE(linger_head, fe, fe_linger_link); |
	flow_entry_release(&fe);
4201 | } |
4202 | |
4203 | static void |
4204 | fsw_linger_remove(struct flow_entry *fe) |
4205 | { |
4206 | struct nx_flowswitch *fsw = fe->fe_fsw; |
4207 | |
4208 | LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED); |
4209 | |
	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
4211 | VERIFY(fsw->fsw_linger_cnt != 0); |
4212 | fsw->fsw_linger_cnt--; |
4213 | } |
4214 | |
4215 | void |
4216 | fsw_linger_purge(struct nx_flowswitch *fsw) |
4217 | { |
4218 | struct flow_entry *fe, *tfe; |
4219 | |
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
		fsw_linger_remove(fe);
	}
	ASSERT(fsw->fsw_linger_cnt == 0);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);
4227 | } |
4228 | |
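/*
 * Kick the reaper thread if it is idle and not already terminating or
 * terminated.
 */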
4229 | void |
4230 | fsw_reap_sched(struct nx_flowswitch *fsw) |
4231 | { |
4232 | ASSERT(fsw->fsw_reap_thread != THREAD_NULL); |
	lck_mtx_lock_spin(&fsw->fsw_reap_lock);
	if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
	    !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
		thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
	}
	lck_mtx_unlock(&fsw->fsw_reap_lock);
4239 | } |
4240 | |
4241 | __attribute__((noreturn)) |
4242 | static void |
4243 | fsw_reap_thread_func(void *v, wait_result_t w) |
4244 | { |
4245 | #pragma unused(w) |
4246 | struct nx_flowswitch *fsw = v; |
4247 | |
4248 | ASSERT(fsw->fsw_reap_thread == current_thread()); |
	thread_set_thread_name(current_thread(), fsw->fsw_reap_name);
4250 | |
4251 | net_update_uptime(); |
4252 | |
	lck_mtx_lock(&fsw->fsw_reap_lock);
	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
	lck_mtx_unlock(&fsw->fsw_reap_lock);
	thread_block_parameter(fsw_reap_thread_cont, fsw);
4258 | /* NOTREACHED */ |
4259 | __builtin_unreachable(); |
4260 | } |
4261 | |
4262 | __attribute__((noreturn)) |
4263 | static void |
4264 | fsw_reap_thread_cont(void *v, wait_result_t wres) |
4265 | { |
4266 | struct nx_flowswitch *fsw = v; |
4267 | boolean_t low; |
4268 | uint64_t t = 0; |
4269 | |
4270 | SK_DF(SK_VERB_FLOW, "%s: running" , fsw->fsw_reap_name); |
4271 | |
	lck_mtx_lock(&fsw->fsw_reap_lock);
4273 | if (__improbable(wres == THREAD_INTERRUPTED || |
4274 | (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) { |
4275 | goto terminate; |
4276 | } |
4277 | |
4278 | ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)); |
4279 | fsw->fsw_reap_flags |= FSW_REAPF_RUNNING; |
	lck_mtx_unlock(&fsw->fsw_reap_lock);
4281 | |
4282 | net_update_uptime(); |
4283 | |
4284 | /* prevent detach from happening while we're here */ |
4285 | if (!fsw_detach_barrier_add(fsw)) { |
4286 | SK_ERR("%s: netagent detached" , fsw->fsw_reap_name); |
4287 | t = 0; |
4288 | } else { |
4289 | uint32_t fe_nonviable, fe_freed, fe_aborted; |
4290 | uint32_t fr_freed, fr_resid = 0; |
4291 | struct ifnet *ifp = fsw->fsw_ifp; |
4292 | uint64_t i = FSW_REAP_IVAL; |
4293 | uint64_t now = _net_uptime; |
4294 | uint64_t last; |
4295 | |
4296 | ASSERT(fsw->fsw_ifp != NULL); |
4297 | |
4298 | /* |
4299 | * Pass 1: process any deferred {withdrawn,nonviable} requests. |
4300 | */ |
4301 | fe_nonviable = fsw_process_deferred(fsw); |
4302 | |
4303 | /* |
4304 | * Pass 2: remove any expired lingering flows. |
4305 | */ |
4306 | fe_freed = fsw_process_linger(fsw, &fe_aborted); |
4307 | |
4308 | /* |
4309 | * Pass 3: prune idle flow routes. |
4310 | */ |
4311 | fr_freed = flow_route_prune(fsw->fsw_flow_mgr, |
4312 | ifp, &fr_resid); |
4313 | |
4314 | /* |
4315 | * Pass 4: prune flow table |
4316 | * |
4317 | */ |
		cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4319 | |
4320 | SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u " |
4321 | "fe_aborted %u fr_freed %u/%u" , |
4322 | fsw->fsw_flow_mgr->fm_name, fe_nonviable, |
4323 | (fe_nonviable + fsw->fsw_pending_nonviable), |
4324 | fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed, |
4325 | (fe_freed + fr_resid)); |
4326 | |
4327 | /* see if VM memory level is critical */ |
4328 | low = skmem_lowmem_check(); |
4329 | |
4330 | /* |
4331 | * If things appear to be idle, we can prune away cached |
4332 | * object that have fallen out of the working sets (this |
4333 | * is different than purging). Every once in a while, we |
4334 | * also purge the caches. Note that this is done across |
4335 | * all flowswitch instances, and so we limit this to no |
4336 | * more than once every FSW_REAP_SK_THRES seconds. |
4337 | */ |
4338 | last = os_atomic_load(&fsw_reap_last, relaxed); |
4339 | if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) && |
4340 | os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) { |
4341 | fsw_purge_cache(fsw, low); |
4342 | |
4343 | /* increase sleep interval if idle */ |
4344 | if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 && |
4345 | fsw->fsw_pending_nonviable == 0 && fr_resid == 0) { |
4346 | i <<= 3; |
4347 | } |
4348 | } else if (last == 0) { |
4349 | os_atomic_store(&fsw_reap_last, now, release); |
4350 | } |
4351 | |
4352 | /* |
4353 | * Additionally, run thru the list of channels and prune |
4354 | * or purge away cached objects on "idle" channels. This |
4355 | * check is rate limited to no more than once every |
4356 | * FSW_DRAIN_CH_THRES seconds. |
4357 | */ |
4358 | last = fsw->fsw_drain_channel_chk_last; |
4359 | if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) { |
4360 | SK_DF(SK_VERB_FLOW, "%s: pruning channels" , |
4361 | fsw->fsw_flow_mgr->fm_name); |
4362 | |
4363 | fsw->fsw_drain_channel_chk_last = now; |
4364 | fsw_drain_channels(fsw, now, low); |
4365 | } else if (__improbable(last == 0)) { |
4366 | fsw->fsw_drain_channel_chk_last = now; |
4367 | } |
4368 | |
4369 | /* |
4370 | * Finally, invoke the interface's reap callback to |
4371 | * tell it to prune or purge away cached objects if |
4372 | * it is idle. This check is rate limited to no more |
4373 | * than once every FSW_REAP_IF_THRES seconds. |
4374 | */ |
4375 | last = fsw->fsw_drain_netif_chk_last; |
4376 | if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) { |
4377 | ASSERT(fsw->fsw_nifna != NULL); |
4378 | |
4379 | if (ifp->if_na_ops != NULL && |
4380 | ifp->if_na_ops->ni_reap != NULL) { |
4381 | SK_DF(SK_VERB_FLOW, "%s: pruning netif" , |
4382 | fsw->fsw_flow_mgr->fm_name); |
4383 | ifp->if_na_ops->ni_reap(ifp->if_na, ifp, |
4384 | FSW_REAP_IF_THRES, low); |
4385 | } |
4386 | |
4387 | fsw->fsw_drain_netif_chk_last = now; |
4388 | } else if (__improbable(last == 0)) { |
4389 | fsw->fsw_drain_netif_chk_last = now; |
4390 | } |
4391 | |
4392 | /* emit periodic interface stats ktrace */ |
4393 | last = fsw->fsw_reap_last; |
4394 | if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) { |
4395 | KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets, |
4396 | ifp->if_data.ifi_ibytes * 8, |
4397 | ifp->if_data.ifi_opackets, |
4398 | ifp->if_data.ifi_obytes * 8); |
4399 | |
4400 | fsw->fsw_reap_last = now; |
4401 | } else if (__improbable(last == 0)) { |
4402 | fsw->fsw_reap_last = now; |
4403 | } |
4404 | |
		nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
		clock_absolutetime_interval_to_deadline(t, &t);
4407 | ASSERT(t != 0); |
4408 | |
4409 | /* allow any pending detach to proceed */ |
4410 | fsw_detach_barrier_remove(fsw); |
4411 | } |
4412 | |
	lck_mtx_lock(&fsw->fsw_reap_lock);
4414 | if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) { |
4415 | fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING; |
		(void) assert_wait_deadline(&fsw->fsw_reap_flags,
		    THREAD_UNINT, t);
		lck_mtx_unlock(&fsw->fsw_reap_lock);
		thread_block_parameter(fsw_reap_thread_cont, fsw);
4420 | /* NOTREACHED */ |
4421 | __builtin_unreachable(); |
4422 | } else { |
4423 | terminate: |
4424 | LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED); |
4425 | fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING); |
4426 | fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED; |
4427 | /* |
4428 | * And signal any thread waiting for us to terminate; |
4429 | * wait channel here other than fsw_reap_flags to make |
4430 | * it more explicit. |
4431 | */ |
4432 | if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) { |
4433 | thread_wakeup((caddr_t)&fsw->fsw_reap_thread); |
4434 | } |
		lck_mtx_unlock(&fsw->fsw_reap_lock);
4436 | |
4437 | SK_DF(SK_VERB_FLOW, "%s: terminating" , fsw->fsw_reap_name); |
4438 | |
4439 | /* for the extra refcnt from kernel_thread_start() */ |
		thread_deallocate(current_thread());
4441 | /* this is the end */ |
4442 | thread_terminate(current_thread()); |
4443 | /* NOTREACHED */ |
4444 | __builtin_unreachable(); |
4445 | } |
4446 | |
4447 | /* must never get here */ |
4448 | VERIFY(0); |
4449 | /* NOTREACHED */ |
4450 | __builtin_unreachable(); |
4451 | } |
4452 | |
4453 | static void |
4454 | fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low) |
4455 | { |
4456 | struct kern_nexus *nx = fsw->fsw_nx; |
4457 | |
4458 | /* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */ |
4459 | FSW_RLOCK(fsw); |
4460 | |
4461 | /* uncrustify doesn't handle C blocks properly */ |
4462 | /* BEGIN IGNORE CODESTYLE */ |
4463 | nx_port_foreach(nx, ^(nexus_port_t p) { |
4464 | struct nexus_adapter *na = nx_port_get_na(nx, p); |
4465 | if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) { |
4466 | return; |
4467 | } |
4468 | |
4469 | boolean_t purge; |
4470 | |
4471 | /* |
4472 | * If some activity happened in the last FSW_DRAIN_CH_THRES |
4473 | * seconds on this channel, we reclaim memory if the channel |
4474 | * throughput is less than the reap threshold value. |
4475 | */ |
4476 | if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) { |
4477 | struct __kern_channel_ring *ring; |
4478 | channel_ring_stats *stats; |
4479 | uint64_t bps; |
4480 | |
4481 | ring = na->na_rx_rings; |
4482 | stats = &ring->ckr_stats; |
4483 | bps = stats->crs_bytes_per_second; |
4484 | |
4485 | if (bps < fsw_channel_reap_thresh) { |
4486 | purge = FALSE; |
4487 | na_drain(na, purge); |
4488 | } |
4489 | return; |
4490 | } |
4491 | |
4492 | /* |
4493 | * If NA has been inactive for some time (twice the drain |
4494 | * threshold), we clear the work timestamp to temporarily skip |
4495 | * this channel until it's active again. Purging cached objects |
4496 | * can be expensive since we'd need to allocate and construct |
4497 | * them again, so we do it only when necessary. |
4498 | */ |
4499 | if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) { |
4500 | na->na_work_ts = 0; |
4501 | purge = TRUE; |
4502 | } else { |
4503 | purge = FALSE; |
4504 | } |
4505 | |
4506 | na_drain(na, purge); /* purge/prune caches */ |
4507 | }); |
4508 | /* END IGNORE CODESTYLE */ |
4509 | |
4510 | FSW_RUNLOCK(fsw); |
4511 | } |
4512 | |
4513 | static void |
4514 | fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low) |
4515 | { |
4516 | #pragma unused(fsw) |
4517 | uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed); |
4518 | uint32_t p = fsw_flow_purge_thresh; |
4519 | boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0)); |
4520 | |
4521 | SK_DF(SK_VERB_FLOW, "%s: %s caches" , |
4522 | fsw->fsw_flow_mgr->fm_name, |
4523 | (purge ? "purge" : "prune" )); |
4524 | |
4525 | skmem_cache_reap_now(sk_fo_cache, purge); |
4526 | skmem_cache_reap_now(sk_fe_cache, purge); |
4527 | skmem_cache_reap_now(sk_fab_cache, purge); |
4528 | skmem_cache_reap_now(flow_route_cache, purge); |
4529 | skmem_cache_reap_now(flow_stats_cache, purge); |
4530 | netns_reap_caches(purge); |
4531 | skmem_reap_caches(purge); |
4532 | |
4533 | #if CONFIG_MBUF_MCACHE |
4534 | if (if_is_fsw_transport_netagent_enabled() && purge) { |
4535 | mbuf_drain(FALSE); |
4536 | } |
4537 | #endif /* CONFIG_MBUF_MCACHE */ |
4538 | } |
4539 | |
4540 | static void |
4541 | fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe) |
4542 | { |
4543 | /* When the interface is in low power mode, the flow is nonviable */ |
4544 | if (!(fe->fe_flags & FLOWENTF_NONVIABLE) && |
4545 | os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) { |
4546 | os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed); |
4547 | } |
4548 | } |
4549 | |
4550 | static uint32_t |
4551 | fsw_process_deferred(struct nx_flowswitch *fsw) |
4552 | { |
4553 | struct flow_entry_dead sfed __sk_aligned(8); |
4554 | struct flow_mgr *fm = fsw->fsw_flow_mgr; |
4555 | struct flow_entry_dead *fed, *tfed; |
4556 | LIST_HEAD(, flow_entry_dead) fed_head = |
4557 | LIST_HEAD_INITIALIZER(fed_head); |
4558 | uint32_t i, nonviable = 0; |
4559 | boolean_t lowpowermode = FALSE; |
4560 | |
	bzero(&sfed, sizeof(sfed));
4562 | |
4563 | /* |
4564 | * The flows become nonviable when the interface |
4565 | * is in low power mode (edge trigger) |
4566 | */ |
4567 | if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) && |
4568 | fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) { |
4569 | lowpowermode = TRUE; |
4570 | fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt; |
4571 | } |
4572 | |
4573 | /* |
4574 | * Scan thru the flow entry tree, and commit any pending withdraw or |
4575 | * nonviable requests. We may need to push stats and/or unassign the |
4576 | * nexus from NECP, but we cannot do that while holding the locks; |
4577 | * build a temporary list for those entries. |
4578 | */ |
4579 | for (i = 0; i < fm->fm_owner_buckets_cnt; i++) { |
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
4581 | struct flow_owner *fo; |
4582 | |
4583 | /* |
4584 | * Grab the lock at all costs when handling low power mode |
4585 | */ |
4586 | if (__probable(!lowpowermode)) { |
4587 | if (!FOB_TRY_LOCK(fob)) { |
4588 | continue; |
4589 | } |
4590 | } else { |
4591 | FOB_LOCK(fob); |
4592 | } |
4593 | |
4594 | FOB_LOCK_ASSERT_HELD(fob); |
4595 | RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) { |
4596 | struct flow_entry *fe; |
4597 | |
4598 | RB_FOREACH(fe, flow_entry_id_tree, |
4599 | &fo->fo_flow_entry_id_head) { |
4600 | /* try first as reader; skip if we can't */ |
4601 | if (__improbable(lowpowermode)) { |
4602 | fsw_flow_handle_low_power(fsw, fe); |
4603 | } |
4604 | if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) { |
4605 | os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed); |
					flow_namespace_half_close(&fe->fe_port_reservation);
4607 | } |
4608 | |
4609 | /* if not withdrawn/nonviable, skip */ |
4610 | if (!fe->fe_want_withdraw && |
4611 | !fe->fe_want_nonviable) { |
4612 | continue; |
4613 | } |
4614 | /* |
4615 | * Here we're holding the lock as writer; |
4616 | * don't spend too much time as we're |
4617 | * blocking the data path now. |
4618 | */ |
4619 | ASSERT(!uuid_is_null(fe->fe_uuid)); |
4620 | /* only need flow UUID and booleans */ |
				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
4622 | sfed.fed_want_clonotify = |
4623 | (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY); |
4624 | sfed.fed_want_nonviable = fe->fe_want_nonviable; |
4625 | flow_entry_teardown(fo, fe); |
4626 | |
4627 | /* do this outside the flow bucket lock */ |
4628 | fed = flow_entry_dead_alloc(Z_WAITOK); |
4629 | ASSERT(fed != NULL); |
4630 | *fed = sfed; |
4631 | LIST_INSERT_HEAD(&fed_head, fed, fed_link); |
4632 | } |
4633 | } |
4634 | FOB_UNLOCK(fob); |
4635 | } |
4636 | |
4637 | /* |
4638 | * These nonviable flows are no longer useful since we've lost |
4639 | * the source IP address; in the event the client monitors the |
4640 | * viability of the flow, explicitly mark it as nonviable so |
4641 | * that a new flow can be created. |
4642 | */ |
4643 | LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) { |
4644 | LIST_REMOVE(fed, fed_link); |
4645 | ASSERT(fsw->fsw_agent_session != NULL); |
4646 | |
4647 | /* if flow is closed early */ |
4648 | if (fed->fed_want_clonotify) { |
			necp_client_early_close(fed->fed_uuid);
4650 | } |
4651 | |
4652 | /* if nonviable, unassign nexus attributes */ |
4653 | if (fed->fed_want_nonviable) { |
			(void) netagent_assign_nexus(fsw->fsw_agent_session,
			    fed->fed_uuid, NULL, 0);
4656 | } |
4657 | |
4658 | flow_entry_dead_free(fed); |
4659 | ++nonviable; |
4660 | } |
4661 | ASSERT(LIST_EMPTY(&fed_head)); |
4662 | |
4663 | return nonviable; |
4664 | } |
4665 | |
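/*
 * Walk the linger list: send a RST for any TCP flow that still needs one,
 * free entries whose linger period has expired, and put the rest back at
 * the head of the list.  Returns the number of entries freed; *abort is
 * set to the number of RSTs generated.
 */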
4666 | static uint32_t |
4667 | fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort) |
4668 | { |
4669 | struct flow_entry_linger_head linger_head = |
4670 | TAILQ_HEAD_INITIALIZER(linger_head); |
4671 | struct flow_entry *fe, *tfe; |
4672 | uint64_t now = _net_uptime; |
4673 | uint32_t i = 0, cnt = 0, freed = 0; |
4674 | |
4675 | ASSERT(fsw->fsw_ifp != NULL); |
4676 | ASSERT(abort != NULL); |
4677 | *abort = 0; |
4678 | |
4679 | /* |
4680 | * We don't want to contend with the datapath, so move |
4681 | * everything that's in the linger list into a local list. |
4682 | * This allows us to generate RSTs or free the flow entry |
4683 | * outside the lock. Any remaining flow entry in the local |
4684 | * list will get re-added back to the head of the linger |
4685 | * list, in front of any new ones added since then. |
4686 | */ |
	lck_mtx_lock(&fsw->fsw_linger_lock);
4688 | TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link); |
4689 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head)); |
4690 | cnt = fsw->fsw_linger_cnt; |
4691 | fsw->fsw_linger_cnt = 0; |
	lck_mtx_unlock(&fsw->fsw_linger_lock);
4693 | |
4694 | TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) { |
4695 | ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); |
4696 | ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); |
4697 | ASSERT(fe->fe_flags & FLOWENTF_LINGERING); |
4698 | |
4699 | /* |
4700 | * See if this is a TCP flow that needs to generate |
4701 | * a RST to the remote peer (if not already). |
4702 | */ |
4703 | if (flow_track_tcp_want_abort(fe)) { |
4704 | VERIFY(fe->fe_flags & FLOWENTF_ABORTED); |
4705 | ASSERT(!uuid_is_null(fe->fe_uuid)); |
4706 | flow_track_abort_tcp(fe, NULL, NULL); |
4707 | (*abort)++; |
4708 | SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); |
4709 | SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx " |
4710 | "flags 0x%b [RST]" , fe_as_string(fe, dbgbuf, |
4711 | sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags, |
4712 | FLOWENTF_BITS); |
4713 | } |
4714 | |
4715 | /* |
4716 | * If flow has expired, remove from list and free; |
4717 | * otherwise leave it around in the linger list. |
4718 | */ |
4719 | if (fe->fe_linger_expire <= now) { |
4720 | freed++; |
			fsw_linger_remove_internal(&linger_head, fe);
4722 | fe = NULL; |
4723 | } |
4724 | ++i; |
4725 | } |
4726 | VERIFY(i == cnt && cnt >= freed); |
4727 | |
4728 | /* |
4729 | * Add any remaining ones back into the linger list. |
4730 | */ |
	lck_mtx_lock(&fsw->fsw_linger_lock);
4732 | if (!TAILQ_EMPTY(&linger_head)) { |
4733 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt); |
4734 | TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link); |
4735 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head)); |
4736 | TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link); |
4737 | fsw->fsw_linger_cnt += (cnt - freed); |
4738 | } |
4739 | ASSERT(TAILQ_EMPTY(&linger_head)); |
	lck_mtx_unlock(&fsw->fsw_linger_lock);
4741 | |
4742 | return freed; |
4743 | } |
4744 | |
4745 | __attribute__((always_inline)) |
4746 | static inline void |
4747 | fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph) |
4748 | { |
4749 | switch (__packet_get_traffic_class(ph)) { |
4750 | case PKT_TC_BE: |
4751 | ifp->if_tc.ifi_ibepackets++; |
4752 | ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
4753 | break; |
4754 | case PKT_TC_BK: |
4755 | ifp->if_tc.ifi_ibkpackets++; |
4756 | ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
4757 | break; |
4758 | case PKT_TC_VI: |
4759 | ifp->if_tc.ifi_ivipackets++; |
4760 | ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
4761 | break; |
4762 | case PKT_TC_VO: |
4763 | ifp->if_tc.ifi_ivopackets++; |
4764 | ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
4765 | break; |
4766 | default: |
4767 | break; |
4768 | } |
4769 | } |
4770 | |
4771 | __attribute__((always_inline)) |
4772 | static inline void |
4773 | fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc, |
4774 | uint32_t cnt, uint32_t len) |
4775 | { |
4776 | switch (svc) { |
4777 | case PKT_TC_BE: |
4778 | ifp->if_tc.ifi_obepackets += cnt; |
4779 | ifp->if_tc.ifi_obebytes += len; |
4780 | break; |
4781 | case PKT_TC_BK: |
4782 | ifp->if_tc.ifi_obkpackets += cnt; |
4783 | ifp->if_tc.ifi_obkbytes += len; |
4784 | break; |
4785 | case PKT_TC_VI: |
4786 | ifp->if_tc.ifi_ovipackets += cnt; |
4787 | ifp->if_tc.ifi_ovibytes += len; |
4788 | break; |
4789 | case PKT_TC_VO: |
4790 | ifp->if_tc.ifi_ovopackets += cnt; |
4791 | ifp->if_tc.ifi_ovobytes += len; |
4792 | break; |
4793 | default: |
4794 | break; |
4795 | } |
4796 | } |
4797 | |