1/*
2 * Copyright (c) 2015-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 *
41 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
42 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
43 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
44 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
45 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
46 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
47 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
48 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
49 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
50 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
51 * SUCH DAMAGE.
52 */
53
54/*
55 * BSD LICENSE
56 *
57 * Copyright(c) 2015 NEC Europe Ltd. All rights reserved.
58 * All rights reserved.
59 *
60 * Redistribution and use in source and binary forms, with or without
61 * modification, are permitted provided that the following conditions
62 * are met:
63 *
64 * * Redistributions of source code must retain the above copyright
65 * notice, this list of conditions and the following disclaimer.
66 * * Redistributions in binary form must reproduce the above copyright
67 * notice, this list of conditions and the following disclaimer in
68 * the documentation and/or other materials provided with the
69 * distribution.
70 * * Neither the name of NEC Europe Ltd. nor the names of
71 * its contributors may be used to endorse or promote products derived
72 * from this software without specific prior written permission.
73 *
74 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
75 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
76 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
77 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
78 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
79 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
80 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
81 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
82 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
83 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
84 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
85 */
86
87#include <skywalk/os_skywalk_private.h>
88#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
89#include <skywalk/nexus/flowswitch/fsw_var.h>
90#include <skywalk/nexus/netif/nx_netif.h>
91#include <skywalk/nexus/netif/nx_netif_compat.h>
92#include <kern/sched_prim.h>
93#include <sys/kdebug.h>
94#include <sys/sdt.h>
95#include <net/bpf.h>
96#include <net/if_ports_used.h>
97#include <net/pktap.h>
98#include <net/pktsched/pktsched_netem.h>
99#include <netinet/tcp.h>
100#include <netinet/udp.h>
101#include <netinet/ip.h>
102#include <netinet/ip6.h>
103#include <netinet/in_var.h>
104
105extern kern_return_t thread_terminate(thread_t);
106
107#define FSW_ZONE_MAX 256
108#define FSW_ZONE_NAME "skywalk.nx.fsw"
109
110static uint64_t fsw_reap_last __sk_aligned(8);
111static uint64_t fsw_want_purge __sk_aligned(8);
112
113#define NX_FSW_FE_TABLESZ 256 /* some power of 2 */
114static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ;
115
116#define NX_FSW_FOB_HASHSZ 31 /* some mersenne prime */
117static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ;
118
119#define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */
120static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ;
121
122#define NX_FSW_FRIB_HASHSZ 13 /* some small prime */
123static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ;
124
125#define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */
126static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL;
127
128#define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */
129static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES;
130
131#define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval))
132#define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5)
133#define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5)
134#define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5)
135#define FSW_IFSTATS_THRES 1
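/*
 * Note: the shifted thresholds above are derived from the reap interval
 * (in seconds), i.e. each amounts to 32 reap intervals, roughly 32s with
 * the default 1-second interval.
 */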
136
137#define NX_FSW_CHANNEL_REAP_THRES 1000 /* threshold (bytes/sec) for reaping */
138uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES;
139
140#define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */
141
142uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */
143uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */
144uint32_t fsw_gso_batch = 8;
145#if (DEVELOPMENT || DEBUG)
146SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
147 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
148 "flowswitch Rx batch size");
149SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
150 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
151 "flowswitch Tx batch size");
152SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
153 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
154 "flowswitch GSO batch size");
155SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
156 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
157 "flowswitch channel reap threshold throughput (bytes/sec)");
158#endif /* DEVELOPMENT || DEBUG */
159
160SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp,
161 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0,
162 "flowswitch RX aggregation for tcp flows (enable/disable)");
163SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host,
164 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0,
165 "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))");
166SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu,
167 CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0,
168 "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)");
169
170/*
171 * IP reassembly
172 * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force
173 * enable/disable the reassembly routine regardless of whether the
174 * transport netagent is enabled or not.
175 *
176 * 'fsw_ip_reass' is a tri-state:
177 * 0 means force IP reassembly off
178 * 1 means force IP reassembly on
179 * 2 means don't force the value, use what's appropriate for this flowswitch
180 */
181#define FSW_IP_REASS_FORCE_OFF 0
182#define FSW_IP_REASS_FORCE_ON 1
183#define FSW_IP_REASS_NO_FORCE 2
184
185uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE;
186
187static int
188fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS
189{
190#pragma unused(oidp, arg1, arg2)
191 unsigned int new_value;
192 int changed;
193 int error;
194
195 error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
196 &new_value, &changed);
197 if (error == 0 && changed != 0) {
198 if (new_value > FSW_IP_REASS_NO_FORCE) {
199 return EINVAL;
200 }
201 fsw_ip_reass = new_value;
202 }
203 return error;
204}
205
206SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass,
207 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
208 0, 0, fsw_ip_reass_sysctl, "IU",
209 "adjust flowswitch IP reassembly");
210
211#if (DEVELOPMENT || DEBUG)
212static uint64_t _fsw_inject_error = 0;
213#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \
214 _SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \
215 &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__)
216
217#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \
218 if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \
219 SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\
220 if ((_f) != NULL) \
221 (_f)(__VA_ARGS__); \
222 } \
223} while (0)
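/*
 * Each injection point is identified by a small integer _en; it is armed by
 * setting bit _en in the kern.skywalk.flowswitch.fsw_inject_error sysctl
 * declared below.  For example, writing (1ULL << 2) arms injection point 2
 * in rx_lookup_flow(), which forces that flow lookup to fail.
 */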
224
225SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets,
226 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "");
227SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size,
228 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "");
229SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets,
230 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "");
231SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO,
232 flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED,
233 &fsw_flow_route_id_buckets, 0, "");
234SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval,
235 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "");
236SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh,
237 CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "");
238SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error,
239 CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "");
240#else
241#define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0)
242#define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0)
243#endif /* !DEVELOPMENT && !DEBUG */
244
245static void fsw_linger_remove_internal(struct flow_entry_linger_head *,
246 struct flow_entry *);
247static void fsw_reap_thread_func(void *, wait_result_t);
248static void fsw_reap_thread_cont(void *, wait_result_t);
249static void fsw_purge_cache(struct nx_flowswitch *, boolean_t);
250static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t);
251static uint32_t fsw_process_deferred(struct nx_flowswitch *);
252static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *);
253
254static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *,
255 struct __kern_packet *);
256
257static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t);
258static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t,
259 uint32_t, uint32_t);
260
261static int __fsw_dp_inited = 0;
262
263int
264fsw_dp_init(void)
265{
266 _CASSERT(FSW_VP_DEV == 0);
267 _CASSERT(FSW_VP_HOST == 1);
268 _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN);
269 _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT);
270
271 ASSERT(!__fsw_dp_inited);
272
273 flow_mgr_init();
274 flow_init();
275
276 __fsw_dp_inited = 1;
277
278 return 0;
279}
280
281void
282fsw_dp_uninit(void)
283{
284 if (__fsw_dp_inited) {
285 flow_fini();
286 flow_mgr_fini();
287
288 __fsw_dp_inited = 0;
289 }
290}
291
292static void
293dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq)
294{
295 pp_free_pktq(pktq);
296}
297
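/*
 * Account for and free every packet in the queue.  This is a macro rather
 * than a function because it returns early from the caller when the queue
 * is empty.
 */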
298#define dp_drop_pktq(fsw, pktq) do { \
299 uint32_t _len = KPKTQ_LEN(pktq); \
300 if (KPKTQ_EMPTY(pktq)) { \
301 ASSERT(_len == 0); \
302 return; \
303 } \
304 SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \
305 FSW_STATS_ADD(FSW_STATS_DROP, _len); \
306 DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \
307 dp_free_pktq(fsw, pktq); \
308} while (0)
309
310SK_NO_INLINE_ATTRIBUTE
311void
312fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input)
313{
314 pid_t pid;
315 char proc_name_buf[FLOW_PROCESS_NAME_LENGTH];
316 char *proc_name = NULL;
317 pid_t epid;
318 char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH];
319 char *eproc_name = NULL;
320 sa_family_t af;
321 bool tap_early = false;
322 struct __kern_packet *pkt;
323
324 ASSERT(fe != NULL);
325 ASSERT(fsw->fsw_ifp != NULL);
326
327 if (fe->fe_nx_port == FSW_VP_HOST) {
328 /* allow packets to be tapped before aggregation happens */
329 tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP);
330 if (!tap_early) {
331 /* all other traffic will be tapped in the dlil input path */
332 return;
333 }
334 }
335 if (fe->fe_key.fk_ipver == IPVERSION) {
336 af = AF_INET;
337 } else if (fe->fe_key.fk_ipver == IPV6_VERSION) {
338 af = AF_INET6;
339 } else {
340 return;
341 }
342
343 pid = fe->fe_pid;
344 if (fe->fe_proc_name[0] != '\0') {
345 (void) strlcpy(proc_name_buf, fe->fe_proc_name,
346 sizeof(proc_name_buf));
347 proc_name = proc_name_buf;
348 }
349 epid = fe->fe_epid;
350 if (fe->fe_eproc_name[0] != '\0') {
351 (void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
352 sizeof(eproc_name_buf));
353 eproc_name = eproc_name_buf;
354 }
355 if (input) {
356 KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) {
357 pktap_input_packet(fsw->fsw_ifp, af,
358 fsw->fsw_ifp_dlt, pid, proc_name, epid,
359 eproc_name, SK_PKT2PH(pkt), NULL, 0,
360 IPPROTO_TCP, fe->fe_flowid,
361 tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN);
362 }
363 } else {
364 KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) {
365 pktap_output_packet(fsw->fsw_ifp, af,
366 fsw->fsw_ifp_dlt, pid, proc_name, epid,
367 eproc_name, SK_PKT2PH(pkt), NULL, 0,
368 0, 0, PTH_FLAG_NEXUS_CHAN);
369 }
370 }
371}
372
373#if (DEVELOPMENT || DEBUG)
374static void
375_fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt,
376 int *ret)
377{
378 static boolean_t _err35_flag_modified = FALSE;
379
380 switch (step) {
381 case 1:
382 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
383 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
384 fr->fr_flags &= ~FLOWRTF_RESOLVED;
385 _err35_flag_modified = TRUE;
386 }
387 break;
388
389 case 2:
390 if (!_err35_flag_modified) {
391 return;
392 }
393 if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
394 m_freem(pkt->pkt_mbuf);
395 pkt->pkt_pflags &= ~PKT_F_MBUF_DATA;
396 pkt->pkt_mbuf = NULL;
397 }
398 *ret = EJUSTRETURN;
399 fr->fr_flags |= FLOWRTF_RESOLVED;
400 _err35_flag_modified = FALSE;
401 break;
402
403 default:
404 VERIFY(0);
405 /* not reached */
406 }
407}
408
409static void
410_fsw_error36_handler(int step, struct flow_route *fr, int *ret)
411{
412 static boolean_t _err36_flag_modified = FALSE;
413
414 switch (step) {
415 case 1:
416 if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) ==
417 (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) {
418 fr->fr_flags &= ~FLOWRTF_RESOLVED;
419 _err36_flag_modified = TRUE;
420 }
421 break;
422
423 case 2:
424 if (!_err36_flag_modified) {
425 return;
426 }
427 *ret = ENETUNREACH;
428 fr->fr_flags |= FLOWRTF_RESOLVED;
429 _err36_flag_modified = FALSE;
430 break;
431
432 default:
433 VERIFY(0);
434 /* not reached */
435 }
436}
437#else /* !DEVELOPMENT && !DEBUG */
438#define _fsw_error35_handler(...)
439#define _fsw_error36_handler(...)
440#endif /* DEVELOPMENT || DEBUG */
441
442/*
443 * Check if the source packet content can fit into the destination
444 * ring's packet. Returns TRUE if the source packet can fit.
445 * Note: Failures could be caused by misconfigured packet pool sizes,
446 * a missing packet size check against the MTU, or the source packet coming
447 * from a compat netif whose attached mbuf is larger than the MTU due to LRO.
448 */
449static inline boolean_t
450validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph,
451 uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom,
452 uint32_t *copy_len)
453{
454 uint32_t tlen = 0;
455 uint32_t splen = spkt->pkt_length - skip_l2hlen;
456
457 if (l2hlen != 0) {
458 VERIFY(skip_l2hlen == 0);
459 tlen += l2hlen;
460 } else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) {
461 splen -= ETHER_CRC_LEN;
462 }
463
464 tlen += splen;
465 *copy_len = splen;
466
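/*
 * The destination can hold (buflet count * default buflet size) bytes,
 * less the requested headroom.
 */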
467 return tlen <= ((__packet_get_buflet_count(dph) *
468 PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
469 headroom);
470}
471
472#if SK_LOG
473/* Hoisted out of line to reduce kernel stack footprint */
474SK_LOG_ATTRIBUTE
475static void
476copy_packet_from_dev_log(struct __kern_packet *spkt,
477 struct __kern_packet *dpkt, struct proc *p)
478{
479 uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) |
480 ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ?
481 SK_VERB_COPY_MBUF : SK_VERB_COPY));
482 char *daddr;
483 MD_BUFLET_ADDR_ABS(dpkt, daddr);
484 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u",
485 sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
486 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
487 (uint32_t)dpkt->pkt_l2_len);
488 SK_DF(logflags | SK_VERB_DUMP, "%s",
489 sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
490}
491#else
492#define copy_packet_from_dev_log(...)
493#endif /* SK_LOG */
494
495
496static inline int
497copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
498 struct __kern_packet *dpkt)
499{
500 /*
501 * The source and destination nexus don't share a packet pool, so the
502 * sync operation here has to:
503 * - alloc packet for the rx(dst) ring
504 * - copy data/metadata from src packet to dst packet
505 * - attach alloc'd packet to rx(dst) ring
506 */
507 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
508 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
509 kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt),
510 METADATA_SUBTYPE(spkt));
511 boolean_t do_cksum_rx;
512 uint16_t skip_l2h_len = spkt->pkt_l2_len;
513 uint16_t iphlen;
514 uint32_t dlen;
515 int err;
516
517 if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0,
518 &dlen))) {
519 SK_ERR("bufcnt %d, bufsz %d", __packet_get_buflet_count(dph),
520 PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp));
521 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
522 return EINVAL;
523 }
524
525 /* Copy packet metadata */
526 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
527 _PKT_COPY(spkt, dpkt);
528 ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
529 PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
530 ASSERT(dpkt->pkt_mbuf == NULL);
531
532 dpkt->pkt_headroom = 0;
533 dpkt->pkt_l2_len = 0;
534
535 /* don't include IP header from partial sum */
536 if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) {
537 iphlen = spkt->pkt_flow_ip_hlen;
538 do_cksum_rx = sk_cksum_rx;
539 } else {
540 iphlen = 0;
541 do_cksum_rx = FALSE;
542 }
543
544 /* Copy packet payload */
545 if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) &&
546 (spkt->pkt_pflags & PKT_F_TRUNCATED)) {
547 FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT);
548 /*
549 * Source packet has truncated contents (just enough for
550 * the classifier) of an mbuf from the compat driver; copy
551 * the entire mbuf contents to the destination packet.
552 */
553 m_adj(spkt->pkt_mbuf, skip_l2h_len);
554 ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen);
555 fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0,
556 spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen);
557 } else {
558 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT);
559 /*
560 * Source packet has full contents, either from an mbuf
561 * that came up from the compat driver, or because it
562 * originated on the native driver; copy to destination.
563 */
564 fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph,
565 (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx,
566 iphlen, 0, FALSE);
567 }
568
569#if DEBUG || DEVELOPMENT
570 if (__improbable(pkt_trailers > 0)) {
571 dlen += pkt_add_trailers(dph, dlen, iphlen);
572 }
573#endif /* DEBUG || DEVELOPMENT */
574
575 /* Finalize and attach packet to Rx ring */
576 METADATA_ADJUST_LEN(dpkt, 0, 0);
577 err = __packet_finalize(dph);
578 VERIFY(err == 0);
579
580 copy_packet_from_dev_log(spkt, dpkt, kernproc);
581
582 if (spkt->pkt_pflags & PKT_F_MBUF_DATA) {
583 ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
584 mbuf_free(spkt->pkt_mbuf);
585 KPKT_CLEAR_MBUF_DATA(spkt);
586 } else {
587 fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph);
588 }
589
590 if (__probable(do_cksum_rx != 0)) {
591 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
592 }
593
594 return 0;
595}
596
597SK_NO_INLINE_ATTRIBUTE
598static struct __kern_packet *
599rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
600{
601 char *pkt_buf;
602 void *l3_hdr;
603 uint16_t nfrags, tlen;
604 int err = 0;
605
606 switch (fsw_ip_reass) {
607 case FSW_IP_REASS_FORCE_OFF:
608 return pkt;
609 case FSW_IP_REASS_FORCE_ON:
610 break;
611 default:
612 if (!FSW_NETAGENT_ENABLED(fsw) ||
613 flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
614 return pkt;
615 }
616 break;
617 }
618
619 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
620 l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len;
621
622 ASSERT(fsw->fsw_ipfm != NULL);
623 ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
624
625 if (pkt->pkt_flow_ip_ver == IPVERSION) {
626 err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
627 (struct ip *)l3_hdr, &nfrags, &tlen);
628 } else {
629 ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
630 /* we only handle frag header immediately after v6 header */
631 err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
632 (struct ip6_hdr *)l3_hdr,
633 (struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
634 &nfrags, &tlen);
635 }
636 if (__improbable(err != 0)) {
637 /* if we get a bad fragment, free it */
638 pp_free_packet_single(pkt);
639 pkt = NULL;
640 } else {
641 ASSERT(!((pkt != NULL) ^ (nfrags > 0)));
642 }
643
644 return pkt;
645}
646
647SK_NO_INLINE_ATTRIBUTE
648static void
649rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
650{
651 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
652 uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf);
653 kern_packet_t ph = SK_PTR_ENCODE(pkt,
654 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
655 /*
656 * This is the case when the packet is coming in from
657 * compat-netif. This packet only has valid metadata
658 * and an attached mbuf. We need to copy enough data
659 * from the mbuf to the packet buffer for the
660 * classifier. Compat netif packet pool is configured
661 * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY
662 * which is just enough to hold the protocol headers
663 * for the flowswitch classifier.
664 */
665
666 pkt->pkt_headroom = 0;
667 METADATA_ADJUST_LEN(pkt, 0, 0);
668 /*
669 * Copy the initial 128 bytes of the packet for
670 * classification.
671 * Ethernet(14) + IPv6 header(40) +
672 * IPv6 fragment header(8) +
673 * TCP header with options(60).
674 */
675 fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph,
676 pkt->pkt_headroom, pkt->pkt_mbuf, 0,
677 MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY),
678 FALSE, 0);
679
680 int err = __packet_finalize_with_mbuf(pkt);
681 VERIFY(err == 0);
682}
683
684static struct __kern_packet *
685rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
686{
687 pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED;
688
689 if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) {
690 rx_prepare_packet_mbuf(fsw, pkt);
691 }
692
693 return pkt;
694}
695
696static struct flow_entry *
697lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
698 bool input, struct flow_entry *prev_fe)
699{
700 struct flow_key key __sk_aligned(16);
701 struct flow_entry *fe = NULL;
702
703 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
704 flow_pkt2key(pkt, input, &key);
705
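/*
 * Fast path: if the previous entry from this batch is a fully
 * specified 5-tuple flow, compare against it first and skip the
 * flow-manager lookup on a match.
 */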
706 if (__probable(prev_fe != NULL &&
707 prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
708 uint16_t saved_mask = key.fk_mask;
709 key.fk_mask = FKMASK_5TUPLE;
710 if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
711 flow_entry_retain(prev_fe);
712 fe = prev_fe;
713 } else {
714 key.fk_mask = saved_mask;
715 }
716 }
717
718top:
719 if (__improbable(fe == NULL)) {
720 fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key);
721 }
722
723 if (__improbable(fe != NULL &&
724 (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) {
725 /* Rx */
726 if (input) {
727 if (fe->fe_flags & FLOWENTF_PARENT) {
728 struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt);
729 if (child_fe != NULL) {
730 flow_entry_release(&fe);
731 fe = child_fe;
732 }
733 } else {
734 if (!rx_flow_demux_match(fsw, fe, pkt)) {
735 flow_entry_release(&fe);
736 fe = NULL;
737 goto top;
738 }
739 }
740 } else {
741 /* Tx */
742 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
743 if (__probable(fe->fe_flags & FLOWENTF_PARENT)) {
744 struct flow_entry *parent_fe = fe;
745 fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id);
746 flow_entry_release(&parent_fe);
747 } else {
748 flow_entry_release(&fe);
749 fe = NULL;
750 goto top;
751 }
752 }
753 }
754 }
755
756 SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]);
757 SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP,
758 "%s %s %s \"%s\" fe 0x%llx",
759 input ? "Rx" : "Tx", if_name(fsw->fsw_ifp),
760 sk_proc_name_address(current_proc()),
761 fk_as_string(&key, fkbuf, sizeof(fkbuf)),
762 SK_KVA(fe));
763
764 return fe;
765}
766
767SK_NO_INLINE_ATTRIBUTE
768static bool
769pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt)
770{
771 struct nx_flowswitch *fsw = fe->fe_fsw;
772 struct ifnet *ifp = fsw->fsw_ifp;
773 struct in_ifaddr *ia = NULL;
774 struct in_ifaddr *best_ia = NULL;
775 struct in6_ifaddr *ia6 = NULL;
776 struct in6_ifaddr *best_ia6 = NULL;
777 struct ifnet *match_ifp = NULL;
778 struct __flow *flow = pkt->pkt_flow;
779 bool result = false;
780
781 ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED);
782
783 if (flow->flow_ip_ver == IPVERSION) {
784 if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) ||
785 IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) ||
786 IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) ||
787 IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) ||
788 IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
789 IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) ||
790 INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) {
791 result = true;
792 goto done;
793 }
794
795 /*
796 * Check for a match in the hash bucket.
797 */
798 lck_rw_lock_shared(&in_ifaddr_rwlock);
799 TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) {
800 if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) {
801 best_ia = ia;
802 match_ifp = ia->ia_ifp;
803
804 if (match_ifp == ifp) {
805 break;
806 }
807 /*
808 * Continue the loop in case there's an exact match with another
809 * interface
810 */
811 }
812 }
813
814 if (best_ia != NULL) {
815 if (match_ifp != ifp && ipforwarding == 0 &&
816 (match_ifp->if_family == IFNET_FAMILY_IPSEC ||
817 match_ifp->if_family == IFNET_FAMILY_UTUN)) {
818 /*
819 * Drop when interface address check is strict and forwarding
820 * is disabled
821 */
822 } else {
823 lck_rw_done(&in_ifaddr_rwlock);
824 result = true;
825 goto done;
826 }
827 }
828 lck_rw_done(&in_ifaddr_rwlock);
829
830 if (ifp->if_flags & IFF_BROADCAST) {
831 /*
832 * Check for broadcast addresses.
833 *
834 * Only accept broadcast packets that arrive via the matching
835 * interface. Reception of forwarded directed broadcasts would be
836 * handled via ip_forward() and ether_frameout() with the loopback
837 * into the stack for SIMPLEX interfaces handled by ether_frameout().
838 */
839 struct ifaddr *ifa;
840
841 ifnet_lock_shared(ifp);
842 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
843 if (ifa->ifa_addr->sa_family != AF_INET) {
844 continue;
845 }
846 ia = ifatoia(ifa);
847 if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr ||
848 ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) {
849 ifnet_lock_done(ifp);
850 result = true;
851 goto done;
852 }
853 }
854 ifnet_lock_done(ifp);
855 }
856 } else {
857 if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) ||
858 IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) ||
859 IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) {
860 result = true;
861 goto done;
862 }
863
864 /*
865 * Check for exact addresses in the hash bucket.
866 */
867 lck_rw_lock_shared(&in6_ifaddr_rwlock);
868 TAILQ_FOREACH(ia6, IN6ADDR_HASH(&flow->flow_ipv6_dst), ia6_hash) {
869 if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst, ia6->ia_ifp->if_index, ifp->if_index)) {
870 if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) {
871 continue;
872 }
873 best_ia6 = ia6;
874 if (ia6->ia_ifp == ifp) {
875 break;
876 }
877 /*
878 * Continue the loop in case there's an exact match with another
879 * interface
880 */
881 }
882 }
883 if (best_ia6 != NULL) {
884 if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 &&
885 (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC ||
886 best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) {
887 /*
888 * Drop when interface address check is strict and forwarding
889 * is disabled
890 */
891 } else {
892 lck_rw_done(&in6_ifaddr_rwlock);
893 result = true;
894 goto done;
895 }
896 }
897 lck_rw_done(&in6_ifaddr_rwlock);
898 }
899
900 /*
901 * In forwarding mode, if the destination address
902 * of the packet does not match any interface
903 * address, it may be destined for the client device.
904 */
905 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
906 "Rx flow does not match interface address");
907done:
908 return result;
909}
910
911static struct flow_entry *
912rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
913 struct flow_entry *prev_fe)
914{
915 struct flow_entry *fe;
916
917 fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe);
918 _FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe);
919 if (fe == NULL) {
920 FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND);
921 return NULL;
922 }
923
924 if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
925 fe->fe_flags & FLOWENTF_LISTENER) &&
926 !pkt_is_for_listener(fe, pkt)) {
927 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER);
928 flow_entry_release(&fe);
929 return NULL;
930 }
931
932 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
933 FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN);
934 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
935 "Rx flow torn down");
936 flow_entry_release(&fe);
937 fe = NULL;
938 }
939
940 return fe;
941}
942
943static inline void
944rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
945 struct __kern_packet *pkt)
946{
947 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
948 fe->fe_rx_frag_count++;
949 }
950
951 /* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */
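	/*
	 * A flow entry joins the batch list only on its first packet; for
	 * subsequent packets the extra reference taken by the lookup is
	 * released here.
	 */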
952 if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) {
953 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0);
954 TAILQ_INSERT_TAIL(fes, fe, fe_rx_link);
955 KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
956 } else {
957 ASSERT(!TAILQ_EMPTY(fes));
958 KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt);
959 flow_entry_release(&fe);
960 }
961}
962
963static void
964tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe,
965 struct __kern_packet *pkt)
966{
967 /* record frag continuation */
968 if (__improbable(pkt->pkt_flow_ip_is_first_frag)) {
969 ASSERT(pkt->pkt_flow_ip_is_frag);
970 fe->fe_tx_is_cont_frag = true;
971 fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id;
972 } else if (__probable(!pkt->pkt_flow_ip_is_frag)) {
973 fe->fe_tx_is_cont_frag = false;
974 fe->fe_tx_frag_id = 0;
975 }
976
977 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
978 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
979 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
980 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
981 } else {
982 ASSERT(!TAILQ_EMPTY(fes));
983 KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt);
984 flow_entry_release(&fe);
985 }
986}
987
988static inline void
989fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
990 uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes)
991{
992 uint32_t n_pkts = 0;
993 slot_idx_t idx, idx_end;
994 idx = r->ckr_khead;
995 idx_end = r->ckr_rhead;
996
997 ASSERT(KPKTQ_EMPTY(pktq));
998 *n_bytes = 0;
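	/* consume up to n_pkts_max slots in [ckr_khead, ckr_rhead) */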
999 for (; n_pkts < n_pkts_max && idx != idx_end;
1000 idx = SLOT_NEXT(idx, r->ckr_lim)) {
1001 struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1002 struct __kern_packet *pkt = ksd->sd_pkt;
1003
1004 ASSERT(pkt->pkt_nextpkt == NULL);
1005 KR_SLOT_DETACH_METADATA(r, ksd);
1006
1007 _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1008 pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1009 if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1010 || (pkt->pkt_length == 0)) {
1011 FSW_STATS_INC(FSW_STATS_DROP);
1012 pp_free_packet_single(pkt);
1013 continue;
1014 }
1015 n_pkts++;
1016 *n_bytes += pkt->pkt_length;
1017
1018 KPKTQ_ENQUEUE(pktq, pkt);
1019 }
1020 r->ckr_khead = idx;
1021 r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1022}
1023
1024/*
1025 * This is only for estimating how many packets each GSO packet will need.
1026 * The number does not need to be exact because any leftover packets allocated
1027 * will be freed.
1028 */
1029static uint32_t
1030estimate_gso_pkts(struct __kern_packet *pkt)
1031{
1032 packet_tso_flags_t tso_flags;
1033 uint16_t mss;
1034 uint32_t n_pkts = 0, total_hlen = 0, total_len = 0;
1035
1036 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
1037 mss = pkt->pkt_proto_seg_sz;
1038
1039 if (tso_flags == PACKET_TSO_IPV4) {
1040 total_hlen = sizeof(struct ip) + sizeof(struct tcphdr);
1041 } else if (tso_flags == PACKET_TSO_IPV6) {
1042 total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
1043 }
1044 if (total_hlen != 0 && mss != 0) {
1045 total_len = pkt->pkt_length;
1046 n_pkts = (uint32_t)
1047 (SK_ROUNDUP((total_len - total_hlen), mss) / mss);
1048 }
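	/*
	 * Example: a 30000-byte IPv4 TSO packet (40 bytes of TCP/IP headers)
	 * with an MSS of 1460 estimates to
	 * SK_ROUNDUP(29960, 1460) / 1460 = 21 packets.
	 */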
1049 DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags,
1050 uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss,
1051 uint32_t, n_pkts);
1052 return n_pkts;
1053}
1054
1055/*
1056 * This function retrieves a chain of packets of the same type only
1057 * (GSO or non-GSO).
1058 */
1059static inline void
1060fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw,
1061 struct __kern_channel_ring *r, uint32_t n_pkts_max,
1062 struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate)
1063{
1064 uint32_t n_pkts = 0;
1065 slot_idx_t idx, idx_end;
1066 idx = r->ckr_khead;
1067 idx_end = r->ckr_rhead;
1068 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
1069 boolean_t gso_enabled, gso_required;
1070 uint32_t gso_pkts;
1071
1072 gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW);
1073 ASSERT(KPKTQ_EMPTY(pktq));
1074 *n_bytes = 0;
1075 for (; n_pkts < n_pkts_max &&
1076 (!gso_enabled || fsw_gso_batch == 0 ||
1077 *gso_pkts_estimate < fsw_gso_batch) &&
1078 idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
1079 struct __kern_slot_desc *ksd = KR_KSD(r, idx);
1080 struct __kern_packet *pkt = ksd->sd_pkt;
1081
1082 ASSERT(pkt->pkt_nextpkt == NULL);
1083
1084 _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags,
1085 pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func);
1086 if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0))
1087 || (pkt->pkt_length == 0)) {
1088 KR_SLOT_DETACH_METADATA(r, ksd);
1089 FSW_STATS_INC(FSW_STATS_DROP);
1090 pp_free_packet_single(pkt);
1091 continue;
1092 }
1093 if (gso_enabled) {
1094 gso_pkts = estimate_gso_pkts(pkt);
1095
1096 /*
1097 * We use the first packet to determine what
1098 * type the subsequent ones need to be (GSO or
1099 * non-GSO).
1100 */
1101 if (n_pkts == 0) {
1102 gso_required = (gso_pkts != 0);
1103 } else {
1104 if (gso_required != (gso_pkts != 0)) {
1105 break;
1106 }
1107 }
1108 *gso_pkts_estimate += gso_pkts;
1109 }
1110 KR_SLOT_DETACH_METADATA(r, ksd);
1111 if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
1112 __packet_set_tx_nx_port(SK_PKT2PH(pkt),
1113 vpna->vpna_nx_port, vpna->vpna_gencnt);
1114 }
1115 n_pkts++;
1116 *n_bytes += pkt->pkt_length;
1117 KPKTQ_ENQUEUE(pktq, pkt);
1118 }
1119 r->ckr_khead = idx;
1120 r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1121 DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw,
1122 ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes,
1123 uint32_t, *gso_pkts_estimate);
1124}
1125
1126static void
1127fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
1128 struct pktq *pktq)
1129{
1130#pragma unused(fsw)
1131 struct __kern_packet *pkt;
1132 struct __kern_quantum *kqum;
1133 uint32_t kr_space_avail = 0;
1134 uint32_t n, n_pkts = 0, n_bytes = 0;
1135 slot_idx_t idx = 0, idx_start = 0, idx_end = 0;
1136
1137 kr_enter(r, TRUE);
1138
1139 idx_start = r->ckr_ktail;
1140 kr_space_avail = kr_available_slots_rxring(r);
1141 _FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func);
1142 n = MIN(kr_space_avail, KPKTQ_LEN(pktq));
1143 _FSW_INJECT_ERROR(41, n, 0, null_func);
1144 idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);
1145
1146 idx = idx_start;
1147 while (idx != idx_end) {
1148 KPKTQ_DEQUEUE(pktq, pkt);
1149 kqum = SK_PTR_ADDR_KQUM(pkt);
1150 kqum->qum_qflags |= QUM_F_FINALIZED;
1151 n_pkts++;
1152 n_bytes += pkt->pkt_length;
1153 KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
1154 if (__improbable(pkt->pkt_trace_id != 0)) {
1155 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
1156 KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id);
1157 }
1158 idx = SLOT_NEXT(idx, r->ckr_lim);
1159 }
1160
1161 kr_update_stats(r, n_pkts, n_bytes);
1162
1163 /*
1164 * ensure slot attachments are visible before updating the
1165 * tail pointer
1166 */
1167 os_atomic_thread_fence(seq_cst);
1168
1169 r->ckr_ktail = idx_end;
1170
1171 kr_exit(r);
1172
1173 r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH);
1174
1175 SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts",
1176 r->ckr_name, n_pkts);
1177}
1178
1179static void
1180pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq)
1181{
1182 ASSERT(KPKTQ_EMPTY(pktq));
1183
1184 for (uint32_t i = 0; i < n_pkts; i++) {
1185 struct __kern_packet *pkt = pkts[i];
1186 ASSERT(pkt->pkt_nextpkt == NULL);
1187 KPKTQ_ENQUEUE(pktq, pkt);
1188 }
1189}
1190
1191/*
1192 * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c.
1193 */
1194SK_NO_INLINE_ATTRIBUTE
1195static void
1196convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1197 struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes)
1198{
1199 uint32_t tot_cnt;
1200 unsigned int num_segs = 1;
1201 struct mbuf *mhead, *head = NULL, *tail = NULL, **tailp = &head;
1202 uint32_t mhead_cnt, mhead_bufsize;
1203 uint32_t mhead_waste = 0;
1204 uint32_t mcnt = 0, mbytes = 0;
1205 uint32_t largest, max_pkt_len;
1206 struct __kern_packet *pkt;
1207 struct kern_pbufpool *pp;
1208
1209 tot_cnt = KPKTQ_LEN(pktq);
1210 ASSERT(tot_cnt > 0);
1211 mhead_cnt = tot_cnt;
1212
1213 /*
1214 * Opportunistically batch-allocate the mbufs based on the largest
1215 * packet size we've seen in the recent past. Note that we reset
1216 * fe_rx_largest_size below if we notice that we're under-utilizing the
1217 * allocated buffers (thus disabling this batch allocation).
1218 */
1219 largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */
1220 if (__probable(largest != 0)) {
1221 if (largest <= MCLBYTES) {
1222 mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES,
1223 &num_segs, M_NOWAIT, 1, 0);
1224 mhead_bufsize = MCLBYTES;
1225 } else if (largest <= MBIGCLBYTES) {
1226 mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES,
1227 &num_segs, M_NOWAIT, 1, 0);
1228 mhead_bufsize = MBIGCLBYTES;
1229 } else if (largest <= M16KCLBYTES) {
1230 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES,
1231 &num_segs, M_NOWAIT, 1, 0);
1232 mhead_bufsize = M16KCLBYTES;
1233 } else if (largest <= M16KCLBYTES * 2) {
1234 num_segs = 2;
1235 mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2,
1236 &num_segs, M_NOWAIT, 1, 0);
1237 mhead_bufsize = M16KCLBYTES * 2;
1238 } else {
1239 mhead = NULL;
1240 mhead_bufsize = mhead_cnt = 0;
1241 }
1242 } else {
1243 mhead = NULL;
1244 mhead_bufsize = mhead_cnt = 0;
1245 }
1246 DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize,
1247 uint32_t, mhead_cnt, uint32_t, tot_cnt);
1248
1249 pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp);
1250 max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags;
1251
1252 KPKTQ_FOREACH(pkt, pktq) {
1253 uint32_t tot_len, len;
1254 uint16_t pad, llhlen, iphlen;
1255 boolean_t do_cksum_rx;
1256 struct mbuf *m;
1257 int error;
1258
1259 llhlen = pkt->pkt_l2_len;
1260 len = pkt->pkt_length;
1261 if (__improbable(len > max_pkt_len || llhlen > len)) {
1262 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1263 struct __kern_packet *, pkt);
1264 FSW_STATS_INC(FSW_STATS_DROP);
1265 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1266 continue;
1267 }
1268 /* begin payload on 32-bit boundary; figure out the padding */
1269 pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen;
1270 tot_len = pad + len;
1271
1272 /* remember largest packet size */
1273 if (__improbable(largest < tot_len)) {
1274 largest = MAX(tot_len, MCLBYTES);
1275 }
1276
1277 /*
1278 * If the above batch allocation returned partial success (or none
1279 * at all), allocate an mbuf for this packet individually.
1280 */
1281 m = mhead;
1282 if (__improbable(m == NULL || tot_len > mhead_bufsize)) {
1283 ASSERT(mhead != NULL || mhead_cnt == 0);
1284 num_segs = 1;
1285 if (tot_len > M16KCLBYTES) {
1286 num_segs = 0;
1287 }
1288 if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
1289 &num_segs, &m)) != 0) {
1290 DTRACE_SKYWALK2(bad__len,
1291 struct nx_flowswitch *, fsw,
1292 struct __kern_packet *, pkt);
1293 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
1294 FSW_STATS_INC(FSW_STATS_DROP);
1295 continue;
1296 }
1297 } else {
1298 mhead = m->m_nextpkt;
1299 m->m_nextpkt = NULL;
1300 ASSERT(mhead_cnt != 0);
1301 --mhead_cnt;
1302
1303 /* check if we're underutilizing large buffers */
1304 if (__improbable(mhead_bufsize > MCLBYTES &&
1305 tot_len < (mhead_bufsize >> 1))) {
1306 ++mhead_waste;
1307 }
1308 /*
1309 * Clean up unused mbuf.
1310 * Only needed when we pre-allocated 2x16K mbufs.
1311 */
1312 if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
1313 ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
1314 struct mbuf *m_extra = m->m_next;
1315 ASSERT(m_extra != NULL);
1316 ASSERT(m_extra->m_len == 0);
1317 ASSERT(M_SIZE(m_extra) == M16KCLBYTES);
1318 m->m_next = NULL;
1319 m_freem(m_extra);
1320 FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF);
1321 }
1322 }
1323 m->m_data += pad;
1324 m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *);
1325
1326 /* don't include IP header from partial sum */
1327 if (__probable((pkt->pkt_qum_qflags &
1328 QUM_F_FLOW_CLASSIFIED) != 0)) {
1329 iphlen = pkt->pkt_flow_ip_hlen;
1330 do_cksum_rx = sk_cksum_rx;
1331 } else {
1332 iphlen = 0;
1333 do_cksum_rx = FALSE;
1334 }
1335
1336 fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt),
1337 pkt->pkt_headroom, m, 0, len, do_cksum_rx,
1338 llhlen + iphlen);
1339
1340 FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF);
1341 if (do_cksum_rx) {
1342 FSW_STATS_INC(FSW_STATS_RX_COPY_SUM);
1343 }
1344#if DEBUG || DEVELOPMENT
1345 if (__improbable(pkt_trailers > 0)) {
1346 (void) pkt_add_trailers_mbuf(m, llhlen + iphlen);
1347 }
1348#endif /* DEBUG || DEVELOPMENT */
1349 m_adj(m, llhlen);
1350
1351 m->m_pkthdr.rcvif = fsw->fsw_ifp;
1352 if (__improbable((pkt->pkt_link_flags &
1353 PKT_LINKF_ETHFCS) != 0)) {
1354 m->m_flags |= M_HASFCS;
1355 }
1356 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1357 m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT;
1358 }
1359 ASSERT(m->m_nextpkt == NULL);
1360 tail = m;
1361 *tailp = m;
1362 tailp = &m->m_nextpkt;
1363 mcnt++;
1364 mbytes += m_pktlen(m);
1365 }
1366 /* free any leftovers */
1367 if (__improbable(mhead != NULL)) {
1368 DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt);
1369 ASSERT(mhead_cnt != 0);
1370 (void) m_freem_list(mhead);
1371 mhead = NULL;
1372 mhead_cnt = 0;
1373 }
1374
1375 /* reset if most packets (>50%) used less than half of our batch buffers */
1376 if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) {
1377 DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw,
1378 struct flow_entry *, NULL, uint32_t, mhead_waste,
1379 uint32_t, tot_cnt);
1380 largest = 0;
1381 }
1382
1383 if (largest != fsw->fsw_rx_largest_size) {
1384 os_atomic_store(&fsw->fsw_rx_largest_size, largest, release);
1385 }
1386
1387 pp_free_pktq(pktq);
1388 *m_headp = head;
1389 *m_tailp = tail;
1390 *cnt = mcnt;
1391 *bytes = mbytes;
1392}
1393
1394/*
1395 * This function only extracts the mbuf from the packet. The caller frees
1396 * the packet.
1397 */
1398static inline struct mbuf *
1399convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1400{
1401 struct mbuf *m;
1402 struct pkthdr *mhdr;
1403 uint16_t llhlen;
1404
1405 m = pkt->pkt_mbuf;
1406 ASSERT(m != NULL);
1407
1408 llhlen = pkt->pkt_l2_len;
1409 if (llhlen > pkt->pkt_length) {
1410 m_freem(m);
1411 KPKT_CLEAR_MBUF_DATA(pkt);
1412 DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw,
1413 struct __kern_packet *, pkt);
1414 FSW_STATS_INC(FSW_STATS_DROP);
1415 FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN);
1416 return NULL;
1417 }
1418 mhdr = &m->m_pkthdr;
1419 if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 &&
1420 PACKET_HAS_PARTIAL_CHECKSUM(pkt)) {
1421 mhdr->csum_flags &= ~CSUM_RX_FLAGS;
1422 mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL);
1423 mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off;
1424 mhdr->csum_rx_val = pkt->pkt_csum_rx_value;
1425 }
1426#if DEBUG || DEVELOPMENT
1427 uint32_t extra = 0;
1428 if (__improbable(pkt_trailers > 0)) {
1429 extra = pkt_add_trailers_mbuf(m, llhlen);
1430 }
1431#endif /* DEBUG || DEVELOPMENT */
1432 m_adj(m, llhlen);
1433 ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra));
1434 KPKT_CLEAR_MBUF_DATA(pkt);
1435 return m;
1436}
1437
1438SK_NO_INLINE_ATTRIBUTE
1439static void
1440convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq,
1441 struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes)
1442{
1443 struct __kern_packet *pkt;
1444 struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head;
1445 uint32_t c = 0, b = 0;
1446
1447 KPKTQ_FOREACH(pkt, pktq) {
1448 m = convert_compat_pkt_to_mbuf(fsw, pkt);
1449 if (__improbable(m == NULL)) {
1450 continue;
1451 }
1452 tail = m;
1453 *tailp = m;
1454 tailp = &m->m_nextpkt;
1455 c++;
1456 b += m_pktlen(m);
1457 }
1458 pp_free_pktq(pktq);
1459 *m_head = head;
1460 *m_tail = tail;
1461 *cnt = c;
1462 *bytes = b;
1463}
1464
1465void
1466fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail,
1467 uint32_t cnt, uint32_t bytes)
1468{
1469 struct ifnet_stat_increment_param s;
1470
1471 bzero(&s, sizeof(s));
1472 s.packets_in = cnt;
1473 s.bytes_in = bytes;
1474 dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL);
1475}
1476
1477void
1478fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq)
1479{
1480 struct mbuf *m_head = NULL, *m_tail = NULL;
1481 uint32_t cnt = 0, bytes = 0;
1482 ifnet_fsw_rx_cb_t cb;
1483 void *cb_arg;
1484 boolean_t compat;
1485
1486 ASSERT(!KPKTQ_EMPTY(pktq));
1487 if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
1488 ASSERT(cb != NULL);
1489 ASSERT(cb_arg != NULL);
1490 /* callback consumes packets */
1491 (*cb)(cb_arg, pktq);
1492 ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
1493 return;
1494 }
1495
1496 /* All packets in the pktq must have the same type */
1497 compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0);
1498 if (compat) {
1499 convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1500 &bytes);
1501 } else {
1502 convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
1503 &bytes);
1504 }
1505 if (__improbable(m_head == NULL)) {
1506 DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw);
1507 return;
1508 }
1509 fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
1510}
1511
1512void
1513fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw,
1514 struct __kern_channel_ring *r, struct pktq *pktq)
1515{
1516 fsw_ring_enqueue_pktq(fsw, r, pktq);
1517 FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq));
1518 dp_drop_pktq(fsw, pktq);
1519}
1520
1521static struct nexus_adapter *
1522flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe)
1523{
1524 struct kern_nexus *nx = fsw->fsw_nx;
1525 struct nexus_adapter *na = NULL;
1526 nexus_port_t port = fe->fe_nx_port;
1527
1528 if (port == FSW_VP_DEV || port == FSW_VP_HOST) {
1529 SK_ERR("dev or host ports have no NA");
1530 return NULL;
1531 }
1532
1533 if (__improbable(!nx_port_is_valid(nx, port))) {
1534 SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid",
1535 if_name(fsw->fsw_ifp), port);
1536 return NULL;
1537 }
1538
1539 na = nx_port_get_na(nx, port);
1540 if (__improbable(na == NULL)) {
1541 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID);
1542 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid",
1543 if_name(fsw->fsw_ifp), port);
1544 return NULL;
1545 }
1546
1547 if (__improbable(!NA_IS_ACTIVE(na))) {
1548 FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE);
1549 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active",
1550 if_name(fsw->fsw_ifp), port);
1551 return NULL;
1552 }
1553
1554 if (__improbable(nx_port_is_defunct(nx, port))) {
1555 FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT);
1556 SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1557 if_name(fsw->fsw_ifp), port);
1558 return NULL;
1559 }
1560
1561 return na;
1562}
1563
1564static inline struct __kern_channel_ring *
1565flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx)
1566{
1567 struct nexus_vp_adapter *na = NULL;
1568 struct __kern_channel_ring *r = NULL;
1569
1570 na = VPNA(flow_get_na(fsw, fe));
1571 if (__improbable(na == NULL)) {
1572 return NULL;
1573 }
1574
1575 switch (txrx) {
1576 case NR_RX:
1577 r = &na->vpna_up.na_rx_rings[0];
1578 break;
1579 case NR_TX:
1580 r = &na->vpna_up.na_tx_rings[0];
1581 break;
1582 default:
1583 VERIFY(0);
1584 __builtin_unreachable();
1585 }
1586
1587 if (__improbable(KR_DROP(r))) {
1588 FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE);
1589 SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %s 0x%llx drop mode",
1590 r->ckr_name, SK_KVA(r));
1591 return NULL;
1592 }
1593
1594 ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET);
1595
1596#if (DEVELOPMENT || DEBUG)
1597 if (r != NULL) {
1598 _FSW_INJECT_ERROR(4, r, NULL, null_func);
1599 }
1600#endif /* DEVELOPMENT || DEBUG */
1601
1602 return r;
1603}
1604
1605struct __kern_channel_ring *
1606fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1607{
1608 return flow_get_ring(fsw, fe, NR_RX);
1609}
1610
1611static inline struct __kern_channel_ring *
1612fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe)
1613{
1614 return flow_get_ring(fsw, fe, NR_TX);
1615}
1616
1617static bool
1618dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1619{
1620 struct flow_route *fr = fe->fe_route;
1621 struct ifnet *ifp = fsw->fsw_ifp;
1622
1623 if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
1624 !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) &&
1625 fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt &&
1626 !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) {
1627 /*
1628 * The source address is no longer around; we want this
1629 * flow to be nonviable, but that requires holding the lock
1630 * as writer (which isn't the case now.) Indicate that
1631 * we need to finalize the nonviable later down below.
1632 *
1633 * We also request that the flow route be re-configured,
1634 * if this is a connected mode flow.
1635 *
1636 */
1637 if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
1638 /*
1639 * fsw_pending_nonviable is a hint for reaper thread;
1640 * due to the fact that setting fe_want_nonviable and
1641 * incrementing fsw_pending_nonviable counter is not
1642 * atomic, let the increment happen first, and the
1643 * thread losing the CAS does decrement.
1644 */
1645 os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
1646 if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
1647 fsw_reap_sched(fsw);
1648 } else {
1649 os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
1650 }
1651 }
1652 if (fr != NULL) {
1653 os_atomic_inc(&fr->fr_want_configure, relaxed);
1654 }
1655 }
1656
1657 /* if flow was (or is going to be) marked as nonviable, drop it */
1658 if (__improbable(fe->fe_want_nonviable ||
1659 (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) {
1660 SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
1661 SK_KVA(fe));
1662 return false;
1663 }
1664 return true;
1665}
1666
1667bool
1668dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
1669{
1670 bool okay;
1671 okay = dp_flow_route_process(fsw, fe);
1672#if (DEVELOPMENT || DEBUG)
1673 if (okay) {
1674 _FSW_INJECT_ERROR(5, okay, false, null_func);
1675 }
1676#endif /* DEVELOPMENT || DEBUG */
1677
1678 return okay;
1679}
1680
1681void
1682dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
1683 uint32_t flags)
1684{
1685#pragma unused(flags)
1686 struct pktq dpkts; /* dst pool alloc'ed packets */
1687 struct pktq disposed_pkts; /* done src packets */
1688 struct pktq dropped_pkts; /* dropped src packets */
1689 struct pktq transferred_pkts; /* dst packet ready for ring */
1690 struct __kern_packet *pkt, *tpkt;
1691 struct kern_pbufpool *dpp;
1692 uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
1693 uint64_t buf_array[RX_BUFLET_BATCH_COUNT];
1694 uint16_t buf_array_iter = 0;
1695 uint32_t cnt, buf_cnt = 0;
1696 int err;
1697
1698 KPKTQ_INIT(&dpkts);
1699 KPKTQ_INIT(&dropped_pkts);
1700 KPKTQ_INIT(&disposed_pkts);
1701 KPKTQ_INIT(&transferred_pkts);
1702
1703 if (__improbable(!dp_flow_rx_route_process(fsw, fe))) {
1704 SK_ERR("Rx route bad");
1705 fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
1706 FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts);
1707 goto done;
1708 }
1709
1710 if (fe->fe_nx_port == FSW_VP_HOST) {
1711 /*
1712 * The host ring does not exist anymore so we can't take
1713 * the enqueue path below. This path should only be hit
1714 * for the rare tcp fragmentation case.
1715 */
1716 fsw_host_rx(fsw, &fe->fe_rx_pktq);
1717 return;
1718 }
1719
1720 /* find the ring */
1721 struct __kern_channel_ring *r;
1722 r = fsw_flow_get_rx_ring(fsw, fe);
1723 if (__improbable(r == NULL)) {
1724 fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
1725 goto done;
1726 }
1727
1728 /* snoop before L2 is stripped */
1729 if (__improbable(pktap_total_tap_count != 0)) {
1730 fsw_snoop(fsw, fe, true);
1731 }
1732
1733 dpp = r->ckr_pp;
1734 /* batch allocate enough packets */
1735 err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL,
1736 SKMEM_NOSLEEP);
1737 if (__improbable(err == ENOMEM)) {
1738 ASSERT(KPKTQ_EMPTY(&dpkts));
1739 KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
1740 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1741 SK_ERR("failed to alloc %u pkts for kr %s, 0x%llu", n_pkts,
1742 r->ckr_name, SK_KVA(r));
1743 goto done;
1744 }
1745
1746 /*
1747 * estimate total number of buflets for the packet chain.
1748 */
1749 cnt = howmany(fe->fe_rx_pktq_bytes, PP_BUF_SIZE_DEF(dpp));
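	/*
	 * Each packet allocated above already carries one buflet, so only
	 * the excess beyond one buflet per packet is allocated here.
	 */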
1750 if (cnt > n_pkts) {
1751 ASSERT(dpp->pp_max_frags > 1);
1752 cnt -= n_pkts;
1753 buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt);
1754 err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
1755 SKMEM_NOSLEEP, false);
1756 if (__improbable(buf_cnt == 0)) {
1757 KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq);
1758 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
1759 SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
1760 "0x%llu", cnt, err, r->ckr_name, SK_KVA(r));
1761 goto done;
1762 }
1763 err = 0;
1764 }
1765
1766 /* extra processing for user flow */
1767 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) {
1768 err = 0;
1769 KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt);
1770 if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) {
1771 fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen;
1772 } else {
1773 fe->fe_rx_pktq_bytes = 0;
1774 }
1775 err = flow_pkt_track(fe, pkt, true);
1776 _FSW_INJECT_ERROR(33, err, EPROTO, null_func);
1777 if (__improbable(err != 0)) {
1778 SK_ERR("flow_pkt_track failed (err %d)", err);
1779 FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR);
1780 /* check if we need to trigger a RST */
1781 if (err == ENETRESET) {
1782 flow_track_abort_tcp(fe, pkt, NULL);
1783 }
1784 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
1785 continue;
1786 }
1787
1788 /* transfer to dpkt */
1789 if (pkt->pkt_qum.qum_pp != dpp) {
1790 struct __kern_buflet *bprev, *bnew;
1791 struct __kern_packet *dpkt = NULL;
1792 uint32_t n_bufs, i;
1793
1794 KPKTQ_DEQUEUE(&dpkts, dpkt);
1795 if (__improbable(dpkt == NULL)) {
1796 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1797 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
1798 continue;
1799 }
1800 n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp));
1801 n_bufs--;
1802 for (i = 0; i < n_bufs; i++) {
1803 if (__improbable(buf_cnt == 0)) {
1804 ASSERT(dpp->pp_max_frags > 1);
1805 buf_array_iter = 0;
1806 cnt = howmany(fe->fe_rx_pktq_bytes,
1807 PP_BUF_SIZE_DEF(dpp));
1808 n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq);
1809 if (cnt >= n_pkts) {
1810 cnt -= n_pkts;
1811 } else {
1812 cnt = 0;
1813 }
1814 cnt += (n_bufs - i);
1815 buf_cnt = MIN(RX_BUFLET_BATCH_COUNT,
1816 cnt);
1817 cnt = buf_cnt;
1818 err = pp_alloc_buflet_batch(dpp,
1819     buf_array, &buf_cnt,
1820 SKMEM_NOSLEEP, false);
1821 if (__improbable(buf_cnt == 0)) {
1822 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT);
1823 KPKTQ_ENQUEUE(&dropped_pkts,
1824 pkt);
1825 pkt = NULL;
1826 pp_free_packet_single(dpkt);
1827 dpkt = NULL;
1828 SK_ERR("failed to alloc %d "
1829 "buflets (err %d) for "
1830 "kr %s, 0x%llu", cnt, err,
1831 r->ckr_name, SK_KVA(r));
1832 break;
1833 }
1834 err = 0;
1835 }
1836 ASSERT(buf_cnt != 0);
1837 if (i == 0) {
1838 PKT_GET_FIRST_BUFLET(dpkt, 1, bprev);
1839 }
1840 bnew = (kern_buflet_t)buf_array[buf_array_iter];
1841 buf_array[buf_array_iter] = 0;
1842 buf_array_iter++;
1843 buf_cnt--;
1844 VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt),
1845 bprev, bnew) == 0);
1846 bprev = bnew;
1847 }
1848 if (__improbable(err != 0)) {
1849 continue;
1850 }
1851 err = copy_packet_from_dev(fsw, pkt, dpkt);
1852 _FSW_INJECT_ERROR(43, err, EINVAL, null_func);
1853 if (__improbable(err != 0)) {
1854 SK_ERR("copy packet failed (err %d)", err);
1855 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
1856 pp_free_packet_single(dpkt);
1857 dpkt = NULL;
1858 continue;
1859 }
1860 KPKTQ_ENQUEUE(&disposed_pkts, pkt);
1861 pkt = dpkt;
1862 }
1863 _UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid);
1864 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
1865 pkt->pkt_policy_id = fe->fe_policy_id;
1866 pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
1867 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
1868 if (pkt->pkt_bufs_cnt > 1) {
1869 pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP;
1870 pkt->pkt_seg_cnt = 1;
1871 }
1872 KPKTQ_ENQUEUE(&transferred_pkts, pkt);
1873 }
1874 KPKTQ_FINI(&fe->fe_rx_pktq);
1875 KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts);
1876 KPKTQ_FINI(&transferred_pkts);
1877
1878 fsw_ring_enqueue_tail_drop(fsw, r, pktq: &fe->fe_rx_pktq);
1879
1880done:
1881 /* Free unused buflets */
1882 while (buf_cnt > 0) {
1883 pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter]));
1884 buf_array[buf_array_iter] = 0;
1885 buf_array_iter++;
1886 buf_cnt--;
1887 }
1888 dp_free_pktq(fsw, &dpkts);
1889 dp_free_pktq(fsw, &disposed_pkts);
1890 dp_drop_pktq(fsw, &dropped_pkts);
1891}
1892
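/*
 * Run the flow's Rx handler (fe_rx_process) on the packets batched for it
 * and kick the reaper thread if the flow has requested withdrawal.
 */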
1893static inline void
1894rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
1895 uint32_t flags)
1896{
1897 ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq));
1898 ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0);
1899
1900 SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
1901 KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port);
1902
1903 /* flow related processing (default, agg, fpd, etc.) */
1904 fe->fe_rx_process(fsw, fe, flags);
1905
1906 if (__improbable(fe->fe_want_withdraw)) {
1907 fsw_reap_sched(fsw);
1908 }
1909
1910 KPKTQ_FINI(&fe->fe_rx_pktq);
1911}
1912
1913static inline void
1914dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
1915{
1916 /*
1917  * We only care about wake packets of flows that belong to the flowswitch,
1918  * as wake packets for the host stack are handled by the host input
1919  * function.
1920  */
1921#if (DEBUG || DEVELOPMENT)
1922 if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) {
1923 /*
1924  * This is a one-shot command.
1925  */
1926 fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT;
1927
1928 pkt->pkt_pflags |= PKT_F_WAKE_PKT;
1929 }
1930#endif /* (DEBUG || DEVELOPMENT) */
1931 if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) {
1932 if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
1933 }
1934}
1935
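/*
 * Core Rx path, called with the flowswitch lock held: each packet is demuxed
 * to an address family, classified, run through IP fragment handling if
 * needed, and looked up in the flow table.  Matching packets are batched per
 * flow entry and processed in one pass; everything else goes to the host
 * stack or is dropped.
 */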
1936static void
1937_fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq)
1938{
1939 struct __kern_packet *pkt, *tpkt;
1940 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
1941 struct flow_entry *fe, *prev_fe;
1942 sa_family_t af;
1943 struct pktq host_pkts, dropped_pkts;
1944 int err;
1945
1946 KPKTQ_INIT(&host_pkts);
1947 KPKTQ_INIT(&dropped_pkts);
1948
1949 if (__improbable(FSW_QUIESCED(fsw))) {
1950 DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw);
1951 KPKTQ_CONCAT(&dropped_pkts, pktq);
1952 goto done;
1953 }
1954 if (__improbable(fsw->fsw_demux == NULL)) {
1955 KPKTQ_CONCAT(&dropped_pkts, pktq);
1956 goto done;
1957 }
1958
1959 prev_fe = NULL;
1960 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
1961 if (__probable(tpkt)) {
1962 void *baddr;
1963 MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr);
1964 SK_PREFETCH(baddr, 0);
1965 /* prefetch L3 and L4 flow structs */
1966 SK_PREFETCHW(tpkt->pkt_flow, 0);
1967 SK_PREFETCHW(tpkt->pkt_flow, 128);
1968 }
1969
1970 KPKTQ_REMOVE(pktq, pkt);
1971
1972 pkt = rx_prepare_packet(fsw, pkt);
1973
1974 af = fsw->fsw_demux(fsw, pkt);
1975 if (__improbable(af == AF_UNSPEC)) {
1976 KPKTQ_ENQUEUE(&host_pkts, pkt);
1977 continue;
1978 }
1979
1980 err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
1981 _FSW_INJECT_ERROR(1, err, ENXIO, null_func);
1982 if (__improbable(err != 0)) {
1983 FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR);
1984 KPKTQ_ENQUEUE(&host_pkts, pkt);
1985 continue;
1986 }
1987
1988 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
1989 pkt = rx_process_ip_frag(fsw, pkt);
1990 if (pkt == NULL) {
1991 continue;
1992 }
1993 }
1994
1995 prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe);
1996 if (__improbable(fe == NULL)) {
1997 KPKTQ_ENQUEUE_LIST(&host_pkts, pkt);
1998 continue;
1999 }
2000
2001 fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen;
2002
2003 dp_rx_process_wake_packet(fsw, pkt);
2004
2005 rx_flow_batch_packet(&fes, fe, pkt);
2006 prev_fe = fe;
2007 }
2008
2009 struct flow_entry *tfe = NULL;
2010 TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) {
2011 rx_flow_process(fsw, fe, 0);
2012 TAILQ_REMOVE(&fes, fe, fe_rx_link);
2013 fe->fe_rx_pktq_bytes = 0;
2014 fe->fe_rx_frag_count = 0;
2015 flow_entry_release(&fe);
2016 }
2017
2018 if (!KPKTQ_EMPTY(&host_pkts)) {
2019 fsw_host_rx(fsw, &host_pkts);
2020 }
2021
2022done:
2023 dp_drop_pktq(fsw, &dropped_pkts);
2024}
2025
2026#if (DEVELOPMENT || DEBUG)
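/*
 * Receive packet steering (RPS), DEVELOPMENT/DEBUG only: Rx packets are
 * fanned out to a configurable set of worker threads, each draining its own
 * queue through _fsw_receive_locked().
 */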
2027static void
2028fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id,
2029 struct __kern_packet *pkt)
2030{
2031 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2032
2033 lck_mtx_lock_spin(&frt->frt_lock);
2034 KPKTQ_ENQUEUE(&frt->frt_pktq, pkt);
2035 lck_mtx_unlock(&frt->frt_lock);
2036}
2037
2038static void
2039fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id)
2040{
2041 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id];
2042
2043 ASSERT(frt->frt_thread != THREAD_NULL);
2044 lck_mtx_lock_spin(&frt->frt_lock);
2045 ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED)));
2046
2047 frt->frt_requests++;
2048 if (!(frt->frt_flags & FRT_RUNNING)) {
2049 thread_wakeup((caddr_t)frt);
2050 }
2051 lck_mtx_unlock(&frt->frt_lock);
2052}
2053
2054__attribute__((noreturn))
2055static void
2056fsw_rps_thread_cont(void *v, wait_result_t w)
2057{
2058 struct fsw_rps_thread *frt = v;
2059 struct nx_flowswitch *fsw = frt->frt_fsw;
2060
2061 lck_mtx_lock(&frt->frt_lock);
2062 if (__improbable(w == THREAD_INTERRUPTIBLE ||
2063 (frt->frt_flags & FRT_TERMINATING) != 0)) {
2064 goto terminate;
2065 }
2066 if (KPKTQ_EMPTY(&frt->frt_pktq)) {
2067 goto done;
2068 }
2069 frt->frt_flags |= FRT_RUNNING;
2070
2071 for (;;) {
2072 uint32_t requests = frt->frt_requests;
2073 struct pktq pkts;
2074
2075 KPKTQ_INIT(&pkts);
2076 KPKTQ_CONCAT(&pkts, &frt->frt_pktq);
2077 lck_mtx_unlock(&frt->frt_lock);
2078
2079 sk_protect_t protect;
2080 protect = sk_sync_protect();
2081 FSW_RLOCK(fsw);
2082 _fsw_receive_locked(fsw, &pkts);
2083 FSW_RUNLOCK(fsw);
2084 sk_sync_unprotect(protect);
2085
2086 lck_mtx_lock(&frt->frt_lock);
2087 if ((frt->frt_flags & FRT_TERMINATING) != 0 ||
2088 requests == frt->frt_requests) {
2089 frt->frt_requests = 0;
2090 break;
2091 }
2092 }
2093
2094done:
2095 lck_mtx_unlock(&frt->frt_lock);
2096 if (!(frt->frt_flags & FRT_TERMINATING)) {
2097 frt->frt_flags &= ~FRT_RUNNING;
2098 assert_wait(frt, THREAD_UNINT);
2099 thread_block_parameter(fsw_rps_thread_cont, frt);
2100 __builtin_unreachable();
2101 } else {
2102terminate:
2103 LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED);
2104 frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING);
2105 frt->frt_flags |= FRT_TERMINATED;
2106
2107 if (frt->frt_flags & FRT_TERMINATEBLOCK) {
2108 thread_wakeup((caddr_t)&frt);
2109 }
2110 lck_mtx_unlock(&frt->frt_lock);
2111
2112 SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2113 frt->frt_idx);
2114
2115 /* for the extra refcnt from kernel_thread_start() */
2116 thread_deallocate(current_thread());
2117 /* this is the end */
2118 thread_terminate(current_thread());
2119 /* NOTREACHED */
2120 __builtin_unreachable();
2121 }
2122
2123 /* must never get here */
2124 VERIFY(0);
2125 /* NOTREACHED */
2126 __builtin_unreachable();
2127}
2128
2129__attribute__((noreturn))
2130static void
2131fsw_rps_thread_func(void *v, wait_result_t w)
2132{
2133#pragma unused(w)
2134 struct fsw_rps_thread *frt = v;
2135 struct nx_flowswitch *fsw = frt->frt_fsw;
2136
2137 char thread_name[MAXTHREADNAMESIZE];
2138 bzero(thread_name, sizeof(thread_name));
2139 (void) snprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
2140 if_name(fsw->fsw_ifp), frt->frt_idx);
2141 thread_set_thread_name(frt->frt_thread, thread_name);
2142 SK_D("%s spawned", thread_name);
2143
2144 net_thread_marks_push(NET_THREAD_SYNC_RX);
2145 assert_wait(frt, THREAD_UNINT);
2146 (void) thread_block_parameter(fsw_rps_thread_cont, frt);
2147
2148 __builtin_unreachable();
2149}
2150
2151static void
2152fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i)
2153{
2154 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2155 uint64_t f = (1 * NSEC_PER_MSEC);
2156 uint64_t s = (1000 * NSEC_PER_SEC);
2157 uint32_t c = 0;
2158
2159 lck_mtx_lock(&frt->frt_lock);
2160 frt->frt_flags |= FRT_TERMINATING;
2161
2162 while (!(frt->frt_flags & FRT_TERMINATED)) {
2163 uint64_t t = 0;
2164 nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t);
2165 clock_absolutetime_interval_to_deadline(t, &t);
2166 ASSERT(t != 0);
2167
2168 frt->frt_flags |= FRT_TERMINATEBLOCK;
2169 if (!(frt->frt_flags & FRT_RUNNING)) {
2170 thread_wakeup_one((caddr_t)frt);
2171 }
2172 (void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t);
2173 lck_mtx_unlock(&frt->frt_lock);
2174 thread_block(THREAD_CONTINUE_NULL);
2175 lck_mtx_lock(&frt->frt_lock);
2176 frt->frt_flags &= ~FRT_TERMINATEBLOCK;
2177 }
2178 ASSERT(frt->frt_flags & FRT_TERMINATED);
2179 lck_mtx_unlock(&frt->frt_lock);
2180 frt->frt_thread = THREAD_NULL;
2181}
2182
2183static void
2184fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i)
2185{
2186 kern_return_t error;
2187 struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i];
2188 lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr);
2189 frt->frt_idx = i;
2190 frt->frt_fsw = fsw;
2191 error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread);
2192 ASSERT(!error);
2193 KPKTQ_INIT(&frt->frt_pktq);
2194}
2195
2196int
2197fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n)
2198{
2199 if (n > FSW_RPS_MAX_NTHREADS) {
2200 SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
2201 return EINVAL;
2202 }
2203
2204 FSW_WLOCK(fsw);
2205 if (n < fsw->fsw_rps_nthreads) {
2206 for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) {
2207 fsw_rps_thread_join(fsw, i);
2208 }
2209 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2210 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
2211 Z_WAITOK | Z_ZERO | Z_NOFAIL);
2212 } else if (n > fsw->fsw_rps_nthreads) {
2213 fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread,
2214 fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads,
2215 Z_WAITOK | Z_ZERO | Z_NOFAIL);
2216 for (uint32_t i = fsw->fsw_rps_nthreads; i < n; i++) {
2217 fsw_rps_thread_spawn(fsw, i);
2218 }
2219 }
2220 fsw->fsw_rps_nthreads = n;
2221 FSW_WUNLOCK(fsw);
2222 return 0;
2223}
2224
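/*
 * Pick an RPS thread by hashing the packet's 5-tuple flow key; packets that
 * cannot be demuxed or classified fall back to thread 0.
 */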
2225static uint32_t
2226get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2227{
2228 sa_family_t af = fsw->fsw_demux(fsw, pkt);
2229 if (__improbable(af == AF_UNSPEC)) {
2230 return 0;
2231 }
2232
2233 flow_pkt_classify(pkt, fsw->fsw_ifp, af, true);
2234
2235 if (__improbable((pkt->pkt_qum_qflags &
2236 QUM_F_FLOW_CLASSIFIED) == 0)) {
2237 return 0;
2238 }
2239
2240 struct flow_key key;
2241 flow_pkt2key(pkt, true, &key);
2242 key.fk_mask = FKMASK_5TUPLE;
2243
2244 uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads;
2245
2246 return id;
2247}
2248
2249#endif /* !DEVELOPMENT && !DEBUG */
2250
2251void
2252fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq)
2253{
2254 FSW_RLOCK(fsw);
2255#if (DEVELOPMENT || DEBUG)
2256 if (fsw->fsw_rps_nthreads != 0) {
2257 struct __kern_packet *pkt, *tpkt;
2258 bitmap_t map = 0;
2259
2260 _CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1);
2261 KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) {
2262 uint32_t id = get_rps_id(fsw, pkt);
2263 KPKTQ_REMOVE(pktq, pkt);
2264 fsw_rps_rx(fsw, id, pkt);
2265 bitmap_set(&map, id);
2266 }
2267 for (int i = bitmap_first(&map, 64); i >= 0;
2268 i = bitmap_next(&map, i)) {
2269 fsw_rps_thread_schedule(fsw, i);
2270 }
2271 } else
2272#endif /* !DEVELOPMENT && !DEBUG */
2273 {
2274 _fsw_receive_locked(fsw, pktq);
2275 }
2276 FSW_RUNLOCK(fsw);
2277}
2278
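/*
 * Dequeue-side handler for input netem (network emulation): packets released
 * by the emulator are regrouped into a pktq and fed to fsw_receive() under
 * sync protection.
 */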
2279int
2280fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t * pkts,
2281 uint32_t n_pkts)
2282{
2283#pragma unused(handle)
2284 struct nx_flowswitch *fsw = handle;
2285 struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX];
2286 struct pktq pktq;
2287 sk_protect_t protect;
2288 uint32_t i;
2289
2290 ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX);
2291
2292 for (i = 0; i < n_pkts; i++) {
2293 ASSERT(pkts[i].pktsched_ptype == QP_PACKET);
2294 ASSERT(pkts[i].pktsched_pkt_kpkt != NULL);
2295 kpkts[i] = pkts[i].pktsched_pkt_kpkt;
2296 }
2297
2298 protect = sk_sync_protect();
2299 KPKTQ_INIT(&pktq);
2300 pkts_to_pktq(kpkts, n_pkts, &pktq);
2301
2302 fsw_receive(fsw, &pktq);
2303 KPKTQ_FINI(&pktq);
2304 sk_sync_unprotect(protect);
2305
2306 return 0;
2307}
2308
2309static void
2310fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q)
2311{
2312 classq_pkt_t p;
2313 struct netem *ne;
2314 struct __kern_packet *pkt, *tpkt;
2315
2316 ASSERT(fsw->fsw_ifp != NULL);
2317 ne = fsw->fsw_ifp->if_input_netem;
2318 ASSERT(ne != NULL);
2319 KPKTQ_FOREACH_SAFE(pkt, q, tpkt) {
2320 bool pdrop;
2321 KPKTQ_REMOVE(q, pkt);
2322 CLASSQ_PKT_INIT_PACKET(&p, pkt);
2323 netem_enqueue(ne, &p, &pdrop);
2324 }
2325}
2326
2327void
2328fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head,
2329 struct nexus_pkt_stats *out_stats)
2330{
2331 struct __kern_packet *pkt = pkt_head, *next;
2332 struct nx_flowswitch *fsw;
2333 uint32_t n_bytes = 0, n_pkts = 0;
2334 uint64_t total_pkts = 0, total_bytes = 0;
2335 struct pktq q;
2336
2337 KPKTQ_INIT(&q);
2338 if (__improbable(devna->na_ifp == NULL ||
2339 (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) {
2340 SK_ERR("fsw not attached, dropping pkts");
2341 pp_free_packet_chain(pkt_head, NULL);
2342 return;
2343 }
2344 while (pkt != NULL) {
2345 if (__improbable(pkt->pkt_trace_id != 0)) {
2346 KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id);
2347 KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id);
2348 }
2349 next = pkt->pkt_nextpkt;
2350 pkt->pkt_nextpkt = NULL;
2351
2352 if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) {
2353 KPKTQ_ENQUEUE(&q, pkt);
2354 n_bytes += pkt->pkt_length;
2355 } else {
2356 DTRACE_SKYWALK1(non__finalized__drop,
2357 struct __kern_packet *, pkt);
2358 FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED);
2359 pp_free_packet_single(pkt);
2360 pkt = NULL;
2361 }
2362 n_pkts = KPKTQ_LEN(&q);
2363 if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) {
2364 if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) {
2365 fsw_dev_input_netem_enqueue(fsw, &q);
2366 } else {
2367 fsw_receive(fsw, &q);
2368 }
2369 total_pkts += n_pkts;
2370 total_bytes += n_bytes;
2371 n_pkts = 0;
2372 n_bytes = 0;
2373 KPKTQ_FINI(&q);
2374 }
2375 pkt = next;
2376 }
2377 ASSERT(KPKTQ_LEN(&q) == 0);
2378 FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts);
2379 if (out_stats != NULL) {
2380 out_stats->nps_pkts = total_pkts;
2381 out_stats->nps_bytes = total_bytes;
2382 }
2383 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes);
2384}
2385
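/*
 * Tx copy for compat (mbuf) interfaces: allocate an mbuf covering the frame
 * headroom plus the packet, copy the payload into it, and also copy the
 * leading bytes into the device packet's buffer so the headers can still be
 * classified.
 */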
2386static int
2387dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2388 struct __kern_packet *dpkt)
2389{
2390 struct mbuf *m = NULL;
2391 uint32_t bdlen, bdlim, bdoff;
2392 uint8_t *bdaddr;
2393 unsigned int one = 1;
2394 int err = 0;
2395
2396 err = mbuf_allocpacket(MBUF_DONTWAIT,
2397     (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2398#if (DEVELOPMENT || DEBUG)
2399 if (m != NULL) {
2400 _FSW_INJECT_ERROR(11, m, NULL, m_freem, m);
2401 }
2402#endif /* DEVELOPMENT || DEBUG */
2403 if (__improbable(m == NULL)) {
2404 FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF);
2405 err = ENOBUFS;
2406 goto done;
2407 }
2408
2409 MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff);
2410 if (fsw->fsw_frame_headroom > bdlim) {
2411 SK_ERR("not enough space in buffer for headroom");
2412 err = EINVAL;
2413 goto done;
2414 }
2415
2416 dpkt->pkt_headroom = fsw->fsw_frame_headroom;
2417 dpkt->pkt_mbuf = m;
2418 dpkt->pkt_pflags |= PKT_F_MBUF_DATA;
2419
2420 /* packet copy into mbuf */
2421 fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt,
2422 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m,
2423 fsw->fsw_frame_headroom, spkt->pkt_length,
2424 PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2425 spkt->pkt_csum_tx_start_off);
2426 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF);
2427
2428 /* header copy into dpkt buffer for classification */
2429 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2430 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2431 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2432 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2433 uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom);
2434 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom,
2435 sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0);
2436
2437 /*
2438  * fsw->fsw_frame_headroom is applied after m_data, thus we treat m_data
2439  * the same as the buflet baddr; m_data always points to the beginning
2440  * of the packet and should correspond to baddr + headroom.
2441  */
2442 ASSERT((uintptr_t)m->m_data ==
2443 ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom));
2444
2445done:
2446 return err;
2447}
2448
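/*
 * Tx copy for native interfaces: copy the source packet into the device
 * packet at the combined flowswitch and driver Tx headroom, carrying over
 * the partial-checksum offsets.
 */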
2449static int
2450dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2451 struct __kern_packet *dpkt)
2452{
2453 struct ifnet *ifp = fsw->fsw_ifp;
2454 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
2455
2456 if (headroom > UINT8_MAX) {
2457 SK_ERR("headroom too large %d", headroom);
2458 return ERANGE;
2459 }
2460 dpkt->pkt_headroom = (uint8_t)headroom;
2461 ASSERT((dpkt->pkt_headroom & 0x7) == 0);
2462 dpkt->pkt_l2_len = 0;
2463 dpkt->pkt_link_flags = spkt->pkt_link_flags;
2464
2465 kern_packet_t sph = SK_PTR_ENCODE(spkt,
2466 METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt));
2467 kern_packet_t dph = SK_PTR_ENCODE(dpkt,
2468 METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt));
2469 fsw->fsw_pkt_copy_from_pkt(NR_TX, dph,
2470 dpkt->pkt_headroom, sph, spkt->pkt_headroom,
2471 spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt),
2472 (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom),
2473 (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom),
2474 (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT));
2475
2476 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
2477
2478 return 0;
2479}
2480
2481#if SK_LOG
2482/* Hoisted out of line to reduce kernel stack footprint */
2483SK_LOG_ATTRIBUTE
2484static void
2485dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp,
2486 struct __kern_packet *spkt, struct __kern_packet *dpkt, int error)
2487{
2488 struct proc *p = current_proc();
2489 struct ifnet *ifp = fsw->fsw_ifp;
2490 uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX);
2491
2492 if (error == ERANGE) {
2493 SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
2494 "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
2495 (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
2496 (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
2497 } else if (error == ENOBUFS) {
2498 SK_DF(logflags, "%s(%d) packet allocation failure",
2499 sk_proc_name_address(p), sk_proc_pid(p));
2500 } else if (error == 0) {
2501 ASSERT(dpkt != NULL);
2502 char *daddr;
2503 MD_BUFLET_ADDR_ABS(dpkt, daddr);
2504 SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
2505 sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length,
2506 dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom,
2507 (uint32_t)fsw->fsw_frame_headroom,
2508 (uint32_t)ifp->if_tx_headroom);
2509 SK_DF(logflags | SK_VERB_DUMP, "%s",
2510 sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
2511 } else {
2512 SK_DF(logflags, "%s(%d) error %d", sk_proc_name_address(p), sk_proc_pid(p), error);
2513 }
2514}
2515#else
2516#define dp_copy_to_dev_log(...)
2517#endif /* SK_LOG */
2518
2519static void
2520fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt)
2521{
2522 ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK));
2523 ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK));
2524
2525 SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0);
2526 /* Copy packet metadata */
2527 _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum);
2528 _PKT_COPY(spkt, dpkt);
2529 _PKT_COPY_TX_PORT_DATA(spkt, dpkt);
2530 ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) ||
2531 !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp));
2532 ASSERT(dpkt->pkt_mbuf == NULL);
2533
2534 /* Copy AQM metadata */
2535 dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type;
2536 dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx;
2537 _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0);
2538 _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id);
2539 _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid);
2540 dpkt->pkt_policy_id = spkt->pkt_policy_id;
2541 dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id;
2542}
2543
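/*
 * Copy a source packet into a device-pool packet for transmit, dispatching
 * on the classq enqueue type: mbuf for compat drivers, native packet
 * otherwise.  Packets that cannot fit in the device pool are rejected with
 * ERANGE.
 */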
2544static int
2545dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2546 struct __kern_packet *dpkt)
2547{
2548 const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp;
2549 struct ifnet *ifp = fsw->fsw_ifp;
2550 uint32_t dev_pkt_len;
2551 int err = 0;
2552
2553 fsw_pkt_copy_metadata(spkt, dpkt);
2554 switch (fsw->fsw_classq_enq_ptype) {
2555 case QP_MBUF:
2556 err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt);
2557 break;
2558
2559 case QP_PACKET:
2560 dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom +
2561 spkt->pkt_length;
2562 if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) {
2563 FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN);
2564 err = ERANGE;
2565 goto done;
2566 }
2567 err = dp_copy_to_dev_pkt(fsw, spkt, dpkt);
2568 break;
2569
2570 default:
2571 VERIFY(0);
2572 __builtin_unreachable();
2573 }
2574done:
2575 dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err);
2576 return err;
2577}
2578
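/*
 * Header-only Tx copy: at most 128 bytes of headers are copied into the
 * device packet, which keeps the full packet length but is marked
 * PKT_F_TRUNCATED; do_gso() below expects such a truncated packet and fills
 * in the per-segment payloads later.
 */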
2579static int
2580dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
2581 struct __kern_packet *dpkt)
2582{
2583 uint8_t *sbaddr, *dbaddr;
2584 uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom;
2585 uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128);
2586
2587 fsw_pkt_copy_metadata(spkt, dpkt);
2588
2589 MD_BUFLET_ADDR_ABS(spkt, sbaddr);
2590 ASSERT(sbaddr != NULL);
2591 sbaddr += spkt->pkt_headroom;
2592
2593 MD_BUFLET_ADDR_ABS(dpkt, dbaddr);
2594 ASSERT(dbaddr != NULL);
2595 dpkt->pkt_headroom = (uint8_t)headroom;
2596 dbaddr += headroom;
2597
2598 pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2599 METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom);
2600
2601 /* packet length is set to the full length */
2602 dpkt->pkt_length = spkt->pkt_length;
2603 dpkt->pkt_pflags |= PKT_F_TRUNCATED;
2604 return 0;
2605}
2606
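/*
 * Detach the mbuf carried by a compat-path packet, transfer the flow
 * metadata (flow token, service class, timestamp, flags, etc.) into the mbuf
 * packet header, and free the now-empty packet.
 */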
2607static struct mbuf *
2608convert_pkt_to_mbuf(struct __kern_packet *pkt)
2609{
2610 ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA);
2611 ASSERT(pkt->pkt_mbuf != NULL);
2612 struct mbuf *m = pkt->pkt_mbuf;
2613
2614 /* pass additional metadata generated from flow parse/lookup */
2615 _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) ==
2616 sizeof(pkt->pkt_flow_token));
2617 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) ==
2618 sizeof(pkt->pkt_flowsrc_token));
2619 _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) ==
2620 sizeof(pkt->pkt_flowsrc_fidx));
2621 m->m_pkthdr.pkt_svc = pkt->pkt_svc_class;
2622 m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto;
2623 m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token;
2624 m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt;
2625 m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type;
2626 m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token;
2627 m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx;
2628
2629 if (pkt->pkt_transport_protocol == IPPROTO_QUIC) {
2630 m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC;
2631 }
2632
2633 /* The packet should have a timestamp by the time we get here. */
2634 m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp;
2635 m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID;
2636
2637 m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK;
2638 m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK);
2639 /* set pkt_hdr so that AQM can find IP header and mark ECN bits */
2640 m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len;
2641
2642 if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) {
2643 m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq);
2644 }
2645 KPKT_CLEAR_MBUF_DATA(pkt);
2646
2647 /* mbuf has been consumed, release packet as well */
2648 ASSERT(pkt->pkt_qum.qum_ksd == NULL);
2649 pp_free_packet_single(pkt);
2650 return m;
2651}
2652
2653static void
2654convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list,
2655 struct mbuf **head, struct mbuf **tail,
2656 uint32_t *cnt, uint32_t *bytes)
2657{
2658 struct __kern_packet *pkt = pkt_list, *next;
2659 struct mbuf *m_head = NULL, **m_tailp = &m_head, *m = NULL;
2660 uint32_t c = 0, b = 0;
2661
2662 while (pkt != NULL) {
2663 next = pkt->pkt_nextpkt;
2664 pkt->pkt_nextpkt = NULL;
2665 m = convert_pkt_to_mbuf(pkt);
2666 ASSERT(m != NULL);
2667
2668 *m_tailp = m;
2669 m_tailp = &m->m_nextpkt;
2670 c++;
2671 b += m_pktlen(m);
2672 pkt = next;
2673 }
2674 if (head != NULL) {
2675 *head = m_head;
2676 }
2677 if (tail != NULL) {
2678 *tail = m;
2679 }
2680 if (cnt != NULL) {
2681 *cnt = c;
2682 }
2683 if (bytes != NULL) {
2684 *bytes = b;
2685 }
2686}
2687
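/*
 * Enqueue a single packet on the interface's AQM classq, converting it to an
 * mbuf first on compat interfaces; AQM drops are charged to the flowswitch
 * drop statistics.
 */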
2688SK_NO_INLINE_ATTRIBUTE
2689static int
2690classq_enqueue_flow_single(struct nx_flowswitch *fsw,
2691 struct __kern_packet *pkt)
2692{
2693 struct ifnet *ifp = fsw->fsw_ifp;
2694 boolean_t pkt_drop = FALSE;
2695 int err;
2696
2697 FSW_LOCK_ASSERT_HELD(fsw);
2698 ASSERT(fsw->fsw_classq_enabled);
2699 ASSERT(pkt->pkt_flow_token != 0);
2700 fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class,
2701 1, pkt->pkt_length);
2702
2703 if (__improbable(pkt->pkt_trace_id != 0)) {
2704 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id);
2705 KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id);
2706 }
2707
2708 switch (fsw->fsw_classq_enq_ptype) {
2709 case QP_MBUF: { /* compat interface */
2710 struct mbuf *m;
2711
2712 m = convert_pkt_to_mbuf(pkt);
2713 ASSERT(m != NULL);
2714 pkt = NULL;
2715
2716 /* ifnet_enqueue consumes mbuf */
2717 err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop);
2718 m = NULL;
2719#if (DEVELOPMENT || DEBUG)
2720 if (__improbable(!pkt_drop)) {
2721 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2722 }
2723#endif /* DEVELOPMENT || DEBUG */
2724 if (pkt_drop) {
2725 FSW_STATS_INC(FSW_STATS_DROP);
2726 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2727 }
2728 break;
2729 }
2730 case QP_PACKET: { /* native interface */
2731 /* ifnet_enqueue consumes packet */
2732 err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop);
2733 pkt = NULL;
2734#if (DEVELOPMENT || DEBUG)
2735 if (__improbable(!pkt_drop)) {
2736 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2737 }
2738#endif /* DEVELOPMENT || DEBUG */
2739 if (pkt_drop) {
2740 FSW_STATS_INC(FSW_STATS_DROP);
2741 FSW_STATS_INC(FSW_STATS_TX_AQM_DROP);
2742 }
2743 break;
2744 }
2745 default:
2746 err = EINVAL;
2747 VERIFY(0);
2748 /* NOTREACHED */
2749 __builtin_unreachable();
2750 }
2751
2752 return err;
2753}
2754
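/*
 * Chain variant of the classq enqueue: the whole per-flow packet chain is
 * handed to the interface in one call to avoid per-packet enqueue overhead;
 * all packets in the chain share the same service class.
 */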
2755static int
2756classq_enqueue_flow_chain(struct nx_flowswitch *fsw,
2757 struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail,
2758 uint32_t cnt, uint32_t bytes)
2759{
2760 struct ifnet *ifp = fsw->fsw_ifp;
2761 boolean_t pkt_drop = FALSE;
2762 uint32_t svc;
2763 int err;
2764
2765 FSW_LOCK_ASSERT_HELD(fsw);
2766 ASSERT(fsw->fsw_classq_enabled);
2767 ASSERT(pkt_head->pkt_flow_token != 0);
2768
2769 /*
2770 * All packets in the flow should have the same svc.
2771 */
2772 svc = pkt_head->pkt_svc_class;
2773 fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes);
2774
2775 switch (fsw->fsw_classq_enq_ptype) {
2776 case QP_MBUF: { /* compat interface */
2777 struct mbuf *m_head = NULL, *m_tail = NULL;
2778 uint32_t c = 0, b = 0;
2779
2780 convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
2781 ASSERT(m_head != NULL && m_tail != NULL);
2782 ASSERT(c == cnt);
2783 ASSERT(b == bytes);
2784 pkt_head = NULL;
2785
2786 /* ifnet_enqueue consumes mbuf */
2787 err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt,
2788 bytes, FALSE, &pkt_drop);
2789 m_head = NULL;
2790 m_tail = NULL;
2791#if (DEVELOPMENT || DEBUG)
2792 if (__improbable(!pkt_drop)) {
2793 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2794 }
2795#endif /* DEVELOPMENT || DEBUG */
2796 if (pkt_drop) {
2797 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2798 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2799 cnt);
2800 }
2801 break;
2802 }
2803 case QP_PACKET: { /* native interface */
2804 /* ifnet_enqueue consumes packet */
2805 err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt,
2806 bytes, FALSE, &pkt_drop);
2807 pkt_head = NULL;
2808#if (DEVELOPMENT || DEBUG)
2809 if (__improbable(!pkt_drop)) {
2810 _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func);
2811 }
2812#endif /* DEVELOPMENT || DEBUG */
2813 if (pkt_drop) {
2814 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt);
2815 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2816 cnt);
2817 }
2818 break;
2819 }
2820 default:
2821 err = EINVAL;
2822 VERIFY(0);
2823 /* NOTREACHED */
2824 __builtin_unreachable();
2825 }
2826
2827 return err;
2828}
2829
2830/*
2831 * This code path needs to be kept for interfaces without logical link support.
2832 */
2833static void
2834classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2835 bool chain, uint32_t cnt, uint32_t bytes)
2836{
2837 bool flowadv_is_set = false;
2838 struct __kern_packet *pkt, *tail, *tpkt;
2839 flowadv_idx_t flow_adv_idx;
2840 bool flowadv_cap;
2841 flowadv_token_t flow_adv_token;
2842 int err;
2843
2844 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2845 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2846
2847 if (chain) {
2848 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2849 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2850 KPKTQ_INIT(&fe->fe_tx_pktq);
2851 if (pkt == NULL) {
2852 return;
2853 }
2854 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2855 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2856 flow_adv_token = pkt->pkt_flow_token;
2857
2858 err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
2859
2860 /* set flow advisory if needed */
2861 if (__improbable((err == EQFULL || err == EQSUSPENDED) &&
2862 flowadv_cap)) {
2863 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2864 flow_adv_idx, flow_adv_token);
2865 }
2866 DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes,
2867 bool, flowadv_is_set);
2868 } else {
2869 uint32_t c = 0, b = 0;
2870
2871 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
2872 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
2873
2874 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2875 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2876 flow_adv_token = pkt->pkt_flow_token;
2877
2878 c++;
2879 b += pkt->pkt_length;
2880 err = classq_enqueue_flow_single(fsw, pkt);
2881
2882 /* set flow advisory if needed */
2883 if (__improbable(!flowadv_is_set &&
2884 ((err == EQFULL || err == EQSUSPENDED) &&
2885 flowadv_cap))) {
2886 flowadv_is_set = na_flowadv_set(
2887 flow_get_na(fsw, fe), flow_adv_idx,
2888 flow_adv_token);
2889 }
2890 }
2891 ASSERT(c == cnt);
2892 ASSERT(b == bytes);
2893 DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes,
2894 bool, flowadv_is_set);
2895 }
2896
2897 /* notify flow advisory event */
2898 if (__improbable(flowadv_is_set)) {
2899 struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe);
2900 if (__probable(r)) {
2901 na_flowadv_event(r);
2902 SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
2903 "%s(%d) notified of flow update",
2904 sk_proc_name_address(current_proc()),
2905 sk_proc_pid(current_proc()));
2906 }
2907 }
2908}
2909
2910/*
2911 * Logical link code path
2912 */
2913static void
2914classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe,
2915 bool chain, uint32_t cnt, uint32_t bytes)
2916{
2917#pragma unused(chain)
2918 struct __kern_packet *pkt, *tail;
2919 flowadv_idx_t flow_adv_idx;
2920 bool flowadv_is_set = false;
2921 bool flowadv_cap;
2922 flowadv_token_t flow_adv_token;
2923 uint32_t flowctl = 0, dropped = 0;
2924 int err;
2925
2926 SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2927 if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2928
2929 pkt = KPKTQ_FIRST(&fe->fe_tx_pktq);
2930 tail = KPKTQ_LAST(&fe->fe_tx_pktq);
2931 KPKTQ_INIT(&fe->fe_tx_pktq);
2932 if (pkt == NULL) {
2933 return;
2934 }
2935 flow_adv_idx = pkt->pkt_flowsrc_fidx;
2936 flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0);
2937 flow_adv_token = pkt->pkt_flow_token;
2938
2939 err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes,
2940 &flowctl, &dropped);
2941
2942 if (__improbable(err != 0)) {
2943 /* set flow advisory if needed */
2944 if (flowctl > 0 && flowadv_cap) {
2945 flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe),
2946 flow_adv_idx, flow_adv_token);
2947
2948 /* notify flow advisory event */
2949 if (flowadv_is_set) {
2950 struct __kern_channel_ring *r =
2951 fsw_flow_get_tx_ring(fsw, fe);
2952 if (__probable(r)) {
2953 na_flowadv_event(r);
2954 SK_DF(SK_VERB_FLOW_ADVISORY |
2955 SK_VERB_TX,
2956 "%s(%d) notified of flow update",
2957 sk_proc_name_address(current_proc()),
2958 sk_proc_pid(current_proc()));
2959 }
2960 }
2961 }
2962 if (dropped > 0) {
2963 STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped);
2964 STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP,
2965 dropped);
2966 }
2967 }
2968}
2969
2970static void
2971tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt)
2972{
2973#pragma unused(fsw)
2974 /* finalize here; no more changes to buflets after classq */
2975 if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) {
2976 kern_packet_t ph = SK_PTR_ENCODE(pkt,
2977 METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt));
2978 int err = __packet_finalize(ph);
2979 VERIFY(err == 0);
2980 }
2981}
2982
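/*
 * Validate the flow route before transmit and, if it is not yet resolved,
 * run the resolver on each packet; packets whose resolution is pending or
 * fails (with no cached link-layer info to fall back on) are dropped, and
 * the survivors are framed for the link.
 */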
2983static bool
2984dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
2985{
2986 struct flow_route *fr = fe->fe_route;
2987 int err;
2988
2989 ASSERT(fr != NULL);
2990
2991 if (__improbable(!dp_flow_route_process(fsw, fe))) {
2992 return false;
2993 }
2994 if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) {
2995 flow_qset_select_dynamic(fsw, fe, TRUE);
2996 }
2997
2998 _FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags,
2999 _fsw_error35_handler, 1, fr, NULL, NULL);
3000 _FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags,
3001 _fsw_error36_handler, 1, fr, NULL);
3002
3003 /*
3004 * See if we need to resolve the flow route; note the test against
3005 * fr_flags here is done without any lock for performance. Thus
3006 * it's possible that we race against the thread performing route
3007 * event updates for a packet (which is OK). In any case we should
3008 * not have any assertion on fr_flags value(s) due to the lack of
3009 * serialization.
3010 */
3011 if (fr->fr_flags & FLOWRTF_RESOLVED) {
3012 goto frame;
3013 }
3014
3015 struct __kern_packet *pkt, *tpkt;
3016 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3017 err = fsw->fsw_resolve(fsw, fr, pkt);
3018 _FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err);
3019 _FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err);
3020 /*
3021 * If resolver returns EJUSTRETURN then we drop the pkt as the
3022 * resolver should have converted the pkt into mbuf (or
3023 * detached the attached mbuf from pkt) and added it to the
3024 * llinfo queue. If we do have a cached llinfo, then proceed
3025 * to using it even though it may be stale (very unlikely)
3026 * while the resolution is in progress.
3027 * Otherwise, any other error results in dropping pkt.
3028 */
3029 if (err == EJUSTRETURN) {
3030 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3031 pp_free_packet_single(pkt);
3032 FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING);
3033 continue;
3034 } else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
3035 /* use existing llinfo */
3036 FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE);
3037 } else if (err != 0) {
3038 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3039 pp_free_packet_single(pkt);
3040 FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL);
3041 continue;
3042 }
3043 }
3044
3045frame:
3046 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3047 if (fsw->fsw_frame != NULL) {
3048 fsw->fsw_frame(fsw, fr, pkt);
3049 }
3050 }
3051
3052 return true;
3053}
3054
3055static void
3056dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe)
3057{
3058#pragma unused(fsw)
3059 struct __kern_packet *pkt, *tpkt;
3060 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3061 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3062 /* listener is only allowed TCP RST */
3063 if (pkt->pkt_flow_ip_proto == IPPROTO_TCP &&
3064 (pkt->pkt_flow_tcp_flags & TH_RST) != 0) {
3065 flow_track_abort_tcp(fe, NULL, pkt);
3066 } else {
3067 char *addr;
3068 MD_BUFLET_ADDR_ABS(pkt, addr);
3069 SK_ERR("listener flow sends non-RST packet %s",
3070 sk_dump(sk_proc_name_address(current_proc()),
3071 addr, pkt->pkt_length, 128, NULL, 0));
3072 }
3073 pp_free_packet_single(pkt);
3074 }
3075}
3076
3077static void
3078fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts,
3079 volatile uint64_t *rt_ts, ifnet_t ifp)
3080{
3081 struct timespec now;
3082 uint64_t now_nsec = 0;
3083
3084 if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) {
3085 nanouptime(&now);
3086 net_timernsec(&now, &now_nsec);
3087 pkt->pkt_timestamp = now_nsec;
3088 }
3089 pkt->pkt_pflags &= ~PKT_F_TS_VALID;
3090
3091 /*
3092 * If the packet service class is not background,
3093 * update the timestamps on the interface, as well as
3094 * the ones in nexus-wide advisory to indicate recent
3095 * activity on a foreground flow.
3096 */
3097 if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) {
3098 ifp->if_fg_sendts = (uint32_t)_net_uptime;
3099 if (fg_ts != NULL) {
3100 *fg_ts = _net_uptime;
3101 }
3102 }
3103 if (pkt->pkt_pflags & PKT_F_REALTIME) {
3104 ifp->if_rt_sendts = (uint32_t)_net_uptime;
3105 if (rt_ts != NULL) {
3106 *rt_ts = _net_uptime;
3107 }
3108 }
3109}
3110
3111static bool
3112fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, bool gso_enabled)
3113{
3114 return fsw_chain_enqueue != 0 &&
3115 fsw->fsw_ifp->if_output_netem == NULL &&
3116 (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 &&
3117 gso_enabled;
3118}
3119
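/*
 * Per-flow Tx processing: verify (and if needed resolve) the route, run flow
 * tracking, stamp flow/policy/AQM metadata on each packet, finalize it, and
 * hand the chain to either the logical-link qset enqueue or the regular
 * classq enqueue.
 */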
3120void
3121dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3122 uint32_t flags)
3123{
3124 struct pktq dropped_pkts;
3125 bool chain, gso = ((flags & FLOW_PROC_FLAG_GSO) != 0);
3126 uint32_t cnt = 0, bytes = 0;
3127 volatile struct sk_nexusadv *nxadv = NULL;
3128 volatile uint64_t *fg_ts = NULL;
3129 volatile uint64_t *rt_ts = NULL;
3130 uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0;
3131
3132 KPKTQ_INIT(&dropped_pkts);
3133 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3134 if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) {
3135 dp_listener_flow_tx_process(fsw, fe);
3136 return;
3137 }
3138 if (__improbable(!dp_flow_tx_route_process(fsw, fe))) {
3139 SK_RDERR(5, "Tx route bad");
3140 FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE,
3141 KPKTQ_LEN(&fe->fe_tx_pktq));
3142 KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq);
3143 goto done;
3144 }
3145 chain = fsw_chain_enqueue_enabled(fsw, gso);
3146 if (chain) {
3147 nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv;
3148 if (nxadv != NULL) {
3149 fg_ts = &nxadv->nxadv_fg_sendts;
3150 rt_ts = &nxadv->nxadv_rt_sendts;
3151 }
3152 }
3153 struct __kern_packet *pkt, *tpkt;
3154 KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) {
3155 int err = 0;
3156
3157 err = flow_pkt_track(fe, pkt, false);
3158 if (__improbable(err != 0)) {
3159 SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
3160 FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR);
3161 KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt);
3162 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3163 continue;
3164 }
3165 _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid);
3166 pkt->pkt_transport_protocol = fe->fe_transport_protocol;
3167
3168 /* set AQM related values for outgoing packet */
3169 if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
3170 pkt->pkt_pflags |= PKT_F_FLOW_ADV;
3171 pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL;
3172 pkt->pkt_flowsrc_fidx = fe->fe_adv_idx;
3173 } else {
3174 pkt->pkt_pflags &= ~PKT_F_FLOW_ADV;
3175 }
3176 _UUID_CLEAR(pkt->pkt_flow_id);
3177 pkt->pkt_flow_token = fe->fe_flowid;
3178 pkt->pkt_pflags |= PKT_F_FLOW_ID;
3179 pkt->pkt_qset_idx = qset_idx;
3180 pkt->pkt_policy_id = fe->fe_policy_id;
3181 pkt->pkt_skip_policy_id = fe->fe_skip_policy_id;
3182
3183 /*
3184 * The same code is exercised per packet for the non-chain case
3185 * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid
3186 * re-walking the chain later.
3187 */
3188 if (chain) {
3189 fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
3190 }
3191 /* mark packet tos/svc_class */
3192 fsw_qos_mark(fsw, fe, pkt);
3193
3194 tx_finalize_packet(fsw, pkt);
3195 bytes += pkt->pkt_length;
3196 cnt++;
3197 }
3198
3199 /* snoop after it's finalized */
3200 if (__improbable(pktap_total_tap_count != 0)) {
3201 fsw_snoop(fsw, fe, false);
3202 }
3203 if (fe->fe_qset != NULL) {
3204 classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes);
3205 } else {
3206 classq_enqueue_flow(fsw, fe, chain, cnt, bytes);
3207 }
3208done:
3209 dp_drop_pktq(fsw, &dropped_pkts);
3210}
3211
3212static struct flow_entry *
3213tx_process_continuous_ip_frag(struct nx_flowswitch *fsw,
3214 struct flow_entry *prev_fe, struct __kern_packet *pkt)
3215{
3216 ASSERT(!pkt->pkt_flow_ip_is_first_frag);
3217
3218 if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) {
3219 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID);
3220 SK_ERR("%s(%d) invalid zero fragment id",
3221 sk_proc_name_address(current_proc()),
3222 sk_proc_pid(current_proc()));
3223 return NULL;
3224 }
3225
3226 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX,
3227 "%s(%d) continuation frag, id %u",
3228 sk_proc_name_address(current_proc()),
3229 sk_proc_pid(current_proc()),
3230 pkt->pkt_flow_ip_frag_id);
3231 if (__improbable(prev_fe == NULL ||
3232 !prev_fe->fe_tx_is_cont_frag)) {
3233 SK_ERR("%s(%d) unexpected continuation frag",
3234 sk_proc_name_address(current_proc()),
3235 sk_proc_pid(current_proc()),
3236 pkt->pkt_flow_ip_frag_id);
3237 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3238 return NULL;
3239 }
3240 if (__improbable(pkt->pkt_flow_ip_frag_id !=
3241 prev_fe->fe_tx_frag_id)) {
3242 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3243 SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
3244 sk_proc_name_address(current_proc()),
3245 sk_proc_pid(current_proc()),
3246 pkt->pkt_flow_ip_frag_id,
3247 prev_fe->fe_tx_frag_id);
3248 return NULL;
3249 }
3250
3251 return prev_fe;
3252}
3253
3254static struct flow_entry *
3255tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt,
3256 struct flow_entry *prev_fe)
3257{
3258 struct flow_entry *fe;
3259
3260 fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe);
3261 if (__improbable(fe == NULL)) {
3262 goto done;
3263 }
3264
3265 if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
3266 SK_RDERR(5, "Tx flow torn down");
3267 FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
3268 flow_entry_release(&fe);
3269 goto done;
3270 }
3271
3272 _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1,
3273 null_func);
3274
3275 if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) {
3276 uuid_string_t flow_id_str, pkt_id_str;
3277 sk_uuid_unparse(fe->fe_uuid, flow_id_str);
3278 sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str);
3279 SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
3280 flow_entry_release(&fe);
3281 FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID);
3282 }
3283
3284done:
3285 return fe;
3286}
3287
3288static inline void
3289tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe,
3290 uint32_t flags)
3291{
3292 ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq));
3293 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0);
3294
3295 SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
3296 KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port);
3297
3298 /* flow related processing (default, agg, etc.) */
3299 fe->fe_tx_process(fsw, fe, flags);
3300
3301 KPKTQ_FINI(&fe->fe_tx_pktq);
3302}
3303
3304#if SK_LOG
3305static void
3306dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt)
3307{
3308 char *pkt_buf;
3309 MD_BUFLET_ADDR_ABS(pkt, pkt_buf);
3310 SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
3311 sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf,
3312 pkt->pkt_length, 128, NULL, 0));
3313}
3314#else /* !SK_LOG */
3315#define dp_tx_log_pkt(...)
3316#endif /* !SK_LOG */
3317
3318static inline struct ifnet *
3319fsw_datamov_begin(struct nx_flowswitch *fsw)
3320{
3321 struct ifnet *ifp;
3322
3323 ifp = fsw->fsw_ifp;
3324 if (!ifnet_datamov_begin(ifp)) {
3325 DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp);
3326 return NULL;
3327 }
3328 return ifp;
3329}
3330
3331static inline void
3332fsw_datamov_end(struct nx_flowswitch *fsw)
3333{
3334 ifnet_datamov_end(fsw->fsw_ifp);
3335}
3336
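/*
 * Core Tx path: packets from the user channel are copied into the device
 * packet pool, demuxed and classified, matched against their flow entries,
 * batched per flow and processed, and the driver is finally kicked via
 * netif_transmit() once all flows have been handled.
 */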
3337static void
3338dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq)
3339{
3340 struct __kern_packet *spkt, *pkt;
3341 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3342 struct flow_entry *fe, *prev_fe;
3343 struct pktq dropped_pkts, dpktq;
3344 struct nexus_adapter *dev_na;
3345 struct kern_pbufpool *dev_pp;
3346 struct ifnet *ifp = NULL;
3347 sa_family_t af;
3348 uint32_t n_pkts, n_flows = 0;
3349 boolean_t do_pacing = FALSE;
3350
3351 int err;
3352 KPKTQ_INIT(&dpktq);
3353 KPKTQ_INIT(&dropped_pkts);
3354 n_pkts = KPKTQ_LEN(spktq);
3355
3356 FSW_RLOCK(fsw);
3357 if (__improbable(FSW_QUIESCED(fsw))) {
3358 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3359 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3360 KPKTQ_CONCAT(&dropped_pkts, spktq);
3361 goto done;
3362 }
3363 dev_na = fsw->fsw_dev_ch->ch_na;
3364 if (__improbable(dev_na == NULL)) {
3365 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3366 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3367 KPKTQ_CONCAT(&dropped_pkts, spktq);
3368 goto done;
3369 }
3370 ifp = fsw_datamov_begin(fsw);
3371 if (ifp == NULL) {
3372 SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
3373 KPKTQ_CONCAT(&dropped_pkts, spktq);
3374 goto done;
3375 }
3376
3377 /* batch allocate enough packets */
3378 dev_pp = na_kr_get_pp(dev_na, NR_TX);
3379
3380 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL,
3381 NULL, SKMEM_NOSLEEP);
3382#if DEVELOPMENT || DEBUG
3383 if (__probable(err != ENOMEM)) {
3384 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3385 }
3386#endif /* DEVELOPMENT || DEBUG */
3387 if (__improbable(err == ENOMEM)) {
3388 ASSERT(KPKTQ_EMPTY(&dpktq));
3389 KPKTQ_CONCAT(&dropped_pkts, spktq);
3390 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3391 SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
3392 goto done;
3393 } else if (__improbable(err == EAGAIN)) {
3394 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT,
3395 (n_pkts - KPKTQ_LEN(&dpktq)));
3396 FSW_STATS_ADD(FSW_STATS_DROP,
3397 (n_pkts - KPKTQ_LEN(&dpktq)));
3398 }
3399
3400 n_pkts = KPKTQ_LEN(&dpktq);
3401 prev_fe = NULL;
3402 KPKTQ_FOREACH(spkt, spktq) {
3403 if (n_pkts == 0) {
3404 break;
3405 }
3406 --n_pkts;
3407
3408 KPKTQ_DEQUEUE(&dpktq, pkt);
3409 ASSERT(pkt != NULL);
3410 err = dp_copy_to_dev(fsw, spkt, pkt);
3411 if (__improbable(err != 0)) {
3412 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3413 continue;
3414 }
3415
3416 do_pacing |= ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0);
3417 af = fsw_ip_demux(fsw, pkt);
3418 if (__improbable(af == AF_UNSPEC)) {
3419 dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
3420 FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR);
3421 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3422 continue;
3423 }
3424
3425 err = flow_pkt_classify(pkt, ifp, af, false);
3426 if (__improbable(err != 0)) {
3427 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
3428 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
3429 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3430 continue;
3431 }
3432
3433 if (__improbable(pkt->pkt_flow_ip_is_frag &&
3434 !pkt->pkt_flow_ip_is_first_frag)) {
3435 fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt);
3436 if (__probable(fe != NULL)) {
3437 flow_entry_retain(fe);
3438 goto flow_batch;
3439 } else {
3440 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3441 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3442 continue;
3443 }
3444 }
3445
3446 fe = tx_lookup_flow(fsw, pkt, prev_fe);
3447 if (__improbable(fe == NULL)) {
3448 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
3449 KPKTQ_ENQUEUE(&dropped_pkts, pkt);
3450 prev_fe = NULL;
3451 continue;
3452 }
3453flow_batch:
3454 tx_flow_batch_packet(&fes, fe, pkt);
3455 prev_fe = fe;
3456 }
3457
3458 struct flow_entry *tfe = NULL;
3459 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
3460 tx_flow_process(fsw, fe, 0);
3461 TAILQ_REMOVE(&fes, fe, fe_tx_link);
3462 fe->fe_tx_is_cont_frag = false;
3463 fe->fe_tx_frag_id = 0;
3464 flow_entry_release(&fe);
3465 n_flows++;
3466 }
3467
3468done:
3469 FSW_RUNLOCK(fsw);
3470 if (n_flows > 0) {
3471 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0));
3472 }
3473 if (ifp != NULL) {
3474 fsw_datamov_end(fsw);
3475 }
3476 dp_drop_pktq(fsw, &dropped_pkts);
3477 KPKTQ_FINI(&dropped_pkts);
3478 KPKTQ_FINI(&dpktq);
3479}
3480
3481static sa_family_t
3482get_tso_af(struct __kern_packet *pkt)
3483{
3484 packet_tso_flags_t tso_flags;
3485
3486 tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS;
3487 if (tso_flags == PACKET_TSO_IPV4) {
3488 return AF_INET;
3489 } else if (tso_flags == PACKET_TSO_IPV6) {
3490 return AF_INET6;
3491 } else {
3492 panic("invalid tso flags: 0x%x\n", tso_flags);
3493 /* NOTREACHED */
3494 __builtin_unreachable();
3495 }
3496}
3497
3498static inline void
3499update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr,
3500 uint16_t payload_sz)
3501{
3502 struct tcphdr *tcp = tcphdr;
3503
3504 DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt,
3505 void *, iphdr, void *, tcphdr, uint16_t, payload_sz);
3506 pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr;
3507 pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr;
3508 pkt->pkt_flow_tcp_flags = tcp->th_flags;
3509 pkt->pkt_flow_tcp_seq = tcp->th_seq;
3510 pkt->pkt_flow_ulen = payload_sz;
3511}
3512
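/*
 * Software GSO: split one large TCP packet into MSS-sized segments.  The
 * first device packet is expected to be a truncated, header-only copy (see
 * dp_copy_headers_to_dev()); every segment gets a fresh header copy, an
 * adjusted IP length/id, TCP sequence number and flags, and a checksum
 * recomputed over the pseudo header, TCP header and payload.  e.g. with
 * 40 bytes of IPv4 + TCP headers and an MSS of 1460, a 4420-byte packet
 * yields (4420 - 40) / 1460 == 3 segments.
 */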
3513static int
3514do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt,
3515 struct __kern_packet *first_pkt, struct pktq *dev_pktq,
3516 struct pktq *gso_pktq)
3517{
3518 ifnet_t ifp = fsw->fsw_ifp;
3519 struct __kern_packet *pkt = first_pkt;
3520 uint8_t proto = pkt->pkt_flow_ip_proto;
3521 uint16_t ip_hlen = pkt->pkt_flow_ip_hlen;
3522 uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen;
3523 uint16_t total_hlen = ip_hlen + tcp_hlen;
3524 uint16_t mtu = (uint16_t)ifp->if_mtu;
3525 uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz;
3526 uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length;
3527 uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom;
3528 kern_packet_t orig_ph = SK_PKT2PH(orig_pkt);
3529 uint8_t *orig_pkt_baddr;
3530 struct tcphdr *tcp;
3531 struct ip *ip;
3532 struct ip6_hdr *ip6;
3533 uint32_t tcp_seq;
3534 uint16_t ipid;
3535 uint32_t pseudo_hdr_csum, bufsz;
3536
3537 ASSERT(headroom <= UINT8_MAX);
3538 if (proto != IPPROTO_TCP) {
3539 SK_ERR("invalid proto: %d", proto);
3540 DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *,
3541 fsw, ifnet_t, ifp, uint8_t, proto);
3542 return EINVAL;
3543 }
3544 if (mss == 0 || mss > (mtu - total_hlen)) {
3545 SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
3546 mss, mtu, total_hlen);
3547 DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *,
3548 fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu,
3549 uint32_t, total_hlen);
3550 return EINVAL;
3551 }
3552 bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp);
3553 if ((headroom + total_hlen + mss) > bufsz) {
3554 SK_ERR("invalid args: headroom %d, total_hlen %d, "
3555 "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
3556 DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *,
3557 fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t,
3558 total_hlen, uint16_t, mss, uint32_t, bufsz);
3559 return EINVAL;
3560 }
3561 n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss);
3562
3563 ASSERT(pkt->pkt_headroom == headroom);
3564 ASSERT(pkt->pkt_length == total_len);
3565 ASSERT(pkt->pkt_l2_len == 0);
3566 ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0);
3567 ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0);
3568 pkt->pkt_pflags &= ~PKT_F_TRUNCATED;
3569 pkt->pkt_proto_seg_sz = 0;
3570 pkt->pkt_csum_flags = 0;
3571 MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr);
3572 orig_pkt_baddr += orig_pkt->pkt_headroom;
3573
3574 if (af == AF_INET) {
3575 ip = (struct ip *)pkt->pkt_flow_ip_hdr;
3576 tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
3577 ipid = ip->ip_id;
3578 pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr,
3579 pkt->pkt_flow_ipv4_dst.s_addr, 0);
3580 } else {
3581 ASSERT(af == AF_INET6);
3582 tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr;
3583 pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src,
3584 &pkt->pkt_flow_ipv6_dst, 0);
3585 }
3586 tcp_seq = ntohl(tcp->th_seq);
3587
3588 for (n = 1, payload_sz = mss, off = total_hlen; off < total_len;
3589 off += payload_sz) {
3590 uint8_t *baddr, *baddr0;
3591 uint32_t partial;
3592
3593 if (pkt == NULL) {
3594 n++;
3595 KPKTQ_DEQUEUE(dev_pktq, pkt);
3596 ASSERT(pkt != NULL);
3597 }
3598 MD_BUFLET_ADDR_ABS(pkt, baddr0);
3599 baddr = baddr0;
3600 baddr += headroom;
3601
3602 /* Copy headers from the original packet */
3603 if (n != 1) {
3604 ASSERT(pkt != first_pkt);
3605 pkt_copy(orig_pkt_baddr, baddr, total_hlen);
3606 fsw_pkt_copy_metadata(first_pkt, pkt);
3607
3608 ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
3609 /* flow info still needs to be updated below */
3610 bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
3611     sizeof(*pkt->pkt_flow));
3612 pkt->pkt_trace_id = 0;
3613 ASSERT(pkt->pkt_headroom == headroom);
3614 } else {
3615 METADATA_SET_LEN(pkt, 0, 0);
3616 }
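		/*
		 * The first segment (n == 1) already carries the headers copied
		 * by dp_copy_headers_to_dev() in dp_gso_pktq(), so only its
		 * buflet length is reset here; every subsequent segment has its
		 * headers, metadata and flow info copied above.
		 */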
3617 baddr += total_hlen;
3618
3619 /* Copy/checksum the payload from the original packet */
3620 if (off + payload_sz > total_len) {
3621 payload_sz = (uint16_t)(total_len - off);
3622 }
3623 pkt_copypkt_sum(orig_ph,
3624 (uint16_t)(orig_pkt->pkt_headroom + off),
3625 SK_PKT2PH(pkt), headroom + total_hlen, payload_sz,
3626 &partial, TRUE);
3627
3628 DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw,
3629 ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz,
3630 uint16_t, mss, uint32_t, partial);
3631 FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT);
3632
3633 /*
3634 * Adjust header information and fill in the missing fields.
3635 */
3636 if (af == AF_INET) {
3637 ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom);
3638 tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen);
3639
3640 if (n != n_pkts) {
3641 tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3642 }
3643 if (n != 1) {
3644 tcp->th_flags &= ~TH_CWR;
3645 tcp->th_seq = htonl(tcp_seq);
3646 }
3647 update_flow_info(pkt, iphdr: ip, tcphdr: tcp, payload_sz);
3648
3649 ip->ip_id = htons((ipid)++);
3650 ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
3651 ip->ip_sum = 0;
3652 ip->ip_sum = inet_cksum_buffer(ip, 0, 0, len: ip_hlen);
3653 tcp->th_sum = 0;
3654 partial = __packet_cksum(data: tcp, len: tcp_hlen, sum0: partial);
3655 partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz);
3656 partial += pseudo_hdr_csum;
3657 ADDCARRY(partial);
3658 tcp->th_sum = ~(uint16_t)partial;
3659 } else {
3660 ASSERT(af == AF_INET6);
3661 ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom);
3662 tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen);
3663
3664 if (n != n_pkts) {
3665 tcp->th_flags &= ~(TH_FIN | TH_PUSH);
3666 }
3667 if (n != 1) {
3668 tcp->th_flags &= ~TH_CWR;
3669 tcp->th_seq = htonl(tcp_seq);
3670 }
3671 update_flow_info(pkt, iphdr: ip6, tcphdr: tcp, payload_sz);
3672
3673 ip6->ip6_plen = htons(tcp_hlen + payload_sz);
3674 tcp->th_sum = 0;
3675 partial = __packet_cksum(data: tcp, len: tcp_hlen, sum0: partial);
3676 partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz);
3677 partial += pseudo_hdr_csum;
3678 ADDCARRY(partial);
3679 tcp->th_sum = ~(uint16_t)partial;
3680 }
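		/*
		 * In short: TH_FIN/TH_PUSH survive only on the last segment,
		 * TH_CWR only on the first, the IPv4 ID increments per segment,
		 * and th_seq advances by the previous segment's payload size
		 * (see the tcp_seq update below).
		 */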
3681 tcp_seq += payload_sz;
3682 METADATA_ADJUST_LEN(pkt, total_hlen, headroom);
3683#if (DEVELOPMENT || DEBUG)
3684 struct __kern_buflet *bft;
3685 uint32_t blen;
3686 PKT_GET_FIRST_BUFLET(pkt, 1, bft);
3687 blen = __buflet_get_data_length(bft);
3688 if (blen != total_hlen + payload_sz) {
3689 panic("blen (%d) != total_hlen + payload_sz (%d)\n",
3690 blen, total_hlen + payload_sz);
3691 }
3692#endif /* DEVELOPMENT || DEBUG */
3693
3694 pkt->pkt_length = total_hlen + payload_sz;
3695 KPKTQ_ENQUEUE(gso_pktq, pkt);
3696 pkt = NULL;
3697
3698 /*
3699 * Note that at this point the packet is not yet finalized.
3700 * The finalization happens in dp_flow_tx_process() after
3701 * the framing is done.
3702 */
3703 }
3704 ASSERT(n == n_pkts);
3705 ASSERT(off == total_len);
3706 DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp,
3707 uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen,
3708 uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr);
3709 return 0;
3710}
3711
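/*
 * Note: the flow entry reference taken by tx_lookup_flow() is kept (and the
 * entry linked onto 'fes') only the first time packets are queued onto an
 * empty fe_tx_pktq; otherwise the extra reference is dropped here and the
 * remaining one is released after tx_flow_process() in dp_gso_pktq().
 */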
3712static void
3713tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe,
3714 struct pktq *gso_pktq)
3715{
3716 if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) {
3717 ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0);
3718 TAILQ_INSERT_TAIL(fes, fe, fe_tx_link);
3719 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3720 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3721 KPKTQ_INIT(gso_pktq);
3722 } else {
3723 ASSERT(!TAILQ_EMPTY(fes));
3724 KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq),
3725 KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq));
3726 KPKTQ_INIT(gso_pktq);
3727 flow_entry_release(pfe: &fe);
3728 }
3729}
3730
3731static void
3732dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq,
3733 uint32_t gso_pkts_estimate)
3734{
3735 struct __kern_packet *spkt, *pkt;
3736 struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes);
3737 struct flow_entry *fe, *prev_fe;
3738 struct pktq dpktq;
3739 struct nexus_adapter *dev_na;
3740 struct kern_pbufpool *dev_pp;
3741 struct ifnet *ifp = NULL;
3742 sa_family_t af;
3743 uint32_t n_pkts, n_flows = 0;
3744 int err;
3745
3746 KPKTQ_INIT(&dpktq);
3747 n_pkts = KPKTQ_LEN(spktq);
3748
3749 FSW_RLOCK(fsw);
3750 if (__improbable(FSW_QUIESCED(fsw))) {
3751 DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw);
3752 SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
3753 dp_drop_pktq(fsw, spktq);
3754 goto done;
3755 }
3756 dev_na = fsw->fsw_dev_ch->ch_na;
3757 if (__improbable(dev_na == NULL)) {
3758 SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
3759 FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts);
3760 dp_drop_pktq(fsw, spktq);
3761 goto done;
3762 }
3763 ifp = fsw_datamov_begin(fsw);
3764 if (ifp == NULL) {
3765 SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
3766 dp_drop_pktq(fsw, spktq);
3767 goto done;
3768 }
3769
3770 dev_pp = na_kr_get_pp(dev_na, NR_TX);
3771
3772 /*
3773 * Batch allocate enough packets to perform GSO on all
3774 * packets in spktq.
3775 */
3776 err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq,
3777 gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP);
3778#if DEVELOPMENT || DEBUG
3779 if (__probable(err != ENOMEM)) {
3780 _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq);
3781 }
3782#endif /* DEVELOPMENT || DEBUG */
3783 /*
3784 * We either get all packets or none. No partial allocations.
3785 */
3786 if (__improbable(err != 0)) {
3787 if (err == ENOMEM) {
3788 ASSERT(KPKTQ_EMPTY(&dpktq));
3789 } else {
3790 dp_free_pktq(fsw, pktq: &dpktq);
3791 }
3792 DTRACE_SKYWALK1(gso__no__mem, int, err);
3793 dp_drop_pktq(fsw, spktq);
3794 FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
3795 SK_ERR("failed to alloc %u pkts from device pool",
3796 gso_pkts_estimate);
3797 goto done;
3798 }
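	/*
	 * Note: gso_pkts_estimate is an upper bound on the device packets
	 * needed; each source packet consumes one for its first segment
	 * (dequeued below, before do_gso()) plus n_pkts - 1 more inside
	 * do_gso(). For illustration, a ~64 KB send with a 1448-byte MSS
	 * needs on the order of 45 device packets. Any surplus is returned
	 * to the pool at the end of this function.
	 */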
3799 prev_fe = NULL;
3800 KPKTQ_FOREACH(spkt, spktq) {
3801 KPKTQ_DEQUEUE(&dpktq, pkt);
3802 ASSERT(pkt != NULL);
3803 /*
3804 * Copy only headers to the first packet of the GSO chain.
3805 * The headers will be used for classification below.
3806 */
3807 err = dp_copy_headers_to_dev(fsw, spkt, dpkt: pkt);
3808 if (__improbable(err != 0)) {
3809 pp_free_packet_single(pkt);
3810 DTRACE_SKYWALK2(copy__headers__failed,
3811 struct nx_flowswitch *, fsw,
3812 struct __kern_packet *, spkt);
3813 continue;
3814 }
3815 af = get_tso_af(pkt);
3816 ASSERT(af == AF_INET || af == AF_INET6);
3817
3818 err = flow_pkt_classify(pkt, ifp, af, false);
3819 if (__improbable(err != 0)) {
3820 dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
3821 FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR);
3822 pp_free_packet_single(pkt);
3823 DTRACE_SKYWALK4(classify__failed,
3824 struct nx_flowswitch *, fsw,
3825 struct __kern_packet *, spkt,
3826 struct __kern_packet *, pkt,
3827 int, err);
3828 continue;
3829 }
3830 /*
3831 * GSO cannot be done on a fragment and it's a bug in user
3832 * space to mark a fragment as needing GSO.
3833 */
3834 if (__improbable(pkt->pkt_flow_ip_is_frag)) {
3835 FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT);
3836 pp_free_packet_single(pkt);
3837 DTRACE_SKYWALK3(is__frag,
3838 struct nx_flowswitch *, fsw,
3839 struct __kern_packet *, spkt,
3840 struct __kern_packet *, pkt);
3841 continue;
3842 }
3843 fe = tx_lookup_flow(fsw, pkt, prev_fe);
3844 if (__improbable(fe == NULL)) {
3845 FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND);
3846 pp_free_packet_single(pkt);
3847 DTRACE_SKYWALK3(lookup__failed,
3848 struct nx_flowswitch *, fsw,
3849 struct __kern_packet *, spkt,
3850 struct __kern_packet *, pkt);
3851 prev_fe = NULL;
3852 continue;
3853 }
3854 /*
3855 * Perform GSO on spkt using the flow information
3856 * obtained above.
3857 */
3858 struct pktq gso_pktq;
3859 KPKTQ_INIT(&gso_pktq);
3860 err = do_gso(fsw, af, orig_pkt: spkt, first_pkt: pkt, dev_pktq: &dpktq, gso_pktq: &gso_pktq);
3861 if (__probable(err == 0)) {
3862 tx_flow_enqueue_gso_pktq(fes: &fes, fe, gso_pktq: &gso_pktq);
3863 prev_fe = fe;
3864 } else {
3865 DTRACE_SKYWALK1(gso__error, int, err);
3866 /* TODO: increment error stat */
3867 pp_free_packet_single(pkt);
3868 flow_entry_release(pfe: &fe);
3869 prev_fe = NULL;
3870 }
3871 KPKTQ_FINI(&gso_pktq);
3872 }
3873 struct flow_entry *tfe = NULL;
3874 TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
3875 /* Chain-enqueue can be used for GSO chains */
3876 tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO);
3877 TAILQ_REMOVE(&fes, fe, fe_tx_link);
3878 flow_entry_release(pfe: &fe);
3879 n_flows++;
3880 }
3881done:
3882 FSW_RUNLOCK(fsw);
3883 if (n_flows > 0) {
3884 netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL);
3885 }
3886 if (ifp != NULL) {
3887 fsw_datamov_end(fsw);
3888 }
3889
3890 /*
3891 * It's possible for packets to be left in dpktq because
3892 * gso_pkts_estimate is only an estimate. The actual number
3893 * of packets needed could be less.
3894 */
3895 uint32_t dpktq_len;
3896 if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) {
3897 DTRACE_SKYWALK2(leftover__dev__pkts,
3898 struct nx_flowswitch *, fsw, uint32_t, dpktq_len);
3899 dp_free_pktq(fsw, pktq: &dpktq);
3900 }
3901 KPKTQ_FINI(&dpktq);
3902}
3903
3904static inline void
3905fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
3906 struct proc *p)
3907{
3908#pragma unused(p)
3909 uint32_t total_pkts = 0, total_bytes = 0;
3910
3911 for (;;) {
3912 struct pktq pktq;
3913 KPKTQ_INIT(&pktq);
3914 uint32_t n_bytes;
3915 fsw_rx_ring_dequeue_pktq(fsw, r, n_pkts_max: fsw_rx_batch, pktq: &pktq, n_bytes: &n_bytes);
3916 if (n_bytes == 0) {
3917 break;
3918 }
3919 total_pkts += KPKTQ_LEN(&pktq);
3920 total_bytes += n_bytes;
3921
3922 if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) {
3923 fsw_receive(fsw, pktq: &pktq);
3924 } else {
3925 fsw_dev_input_netem_enqueue(fsw, q: &pktq);
3926 }
3927 KPKTQ_FINI(&pktq);
3928 }
3929
3930 KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
3931 DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts,
3932 uint32_t, total_bytes);
3933
3934 /* compute mitigation rate for delivered traffic */
3935 if (__probable(r->ckr_netif_mit_stats != NULL)) {
3936 r->ckr_netif_mit_stats(r, total_pkts, total_bytes);
3937 }
3938}
3939
3940static inline void
3941fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
3942 struct proc *p)
3943{
3944#pragma unused(p)
3945 static packet_trace_id_t trace_id = 0;
3946 uint32_t total_pkts = 0, total_bytes = 0;
3947
3948 for (;;) {
3949 struct pktq pktq;
3950 KPKTQ_INIT(&pktq);
3951 uint32_t n_bytes;
3952 uint32_t gso_pkts_estimate = 0;
3953
3954 fsw_tx_ring_dequeue_pktq(fsw, r, n_pkts_max: fsw_tx_batch, pktq: &pktq, n_bytes: &n_bytes,
3955 gso_pkts_estimate: &gso_pkts_estimate);
3956 if (n_bytes == 0) {
3957 break;
3958 }
3959 total_pkts += KPKTQ_LEN(&pktq);
3960 total_bytes += n_bytes;
3961
3962 KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id;
3963 KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START,
3964 KPKTQ_FIRST(&pktq)->pkt_trace_id);
3965
3966 if (gso_pkts_estimate > 0) {
3967 dp_gso_pktq(fsw, spktq: &pktq, gso_pkts_estimate);
3968 } else {
3969 dp_tx_pktq(fsw, spktq: &pktq);
3970 }
3971 dp_free_pktq(fsw, pktq: &pktq);
3972 KPKTQ_FINI(&pktq);
3973 }
3974 kr_update_stats(kring: r, slot_count: total_pkts, byte_count: total_bytes);
3975
3976 KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes);
3977 DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts,
3978 uint32_t, total_bytes);
3979}
3980
3981void
3982fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r,
3983 struct proc *p)
3984{
3985 struct nexus_vp_adapter *vpna = VPNA(KRNA(r));
3986
3987 ASSERT(sk_is_sync_protected());
3988 ASSERT(vpna->vpna_nx_port != FSW_VP_HOST);
3989 ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET);
3990
3991 if (vpna->vpna_nx_port == FSW_VP_DEV) {
3992 fsw_dev_ring_flush(fsw, r, p);
3993 } else {
3994 fsw_user_ring_flush(fsw, r, p);
3995 }
3996}
3997
3998int
3999fsw_dp_ctor(struct nx_flowswitch *fsw)
4000{
4001 uint32_t fe_cnt = fsw_fe_table_size;
4002 uint32_t fob_cnt = fsw_flow_owner_buckets;
4003 uint32_t frb_cnt = fsw_flow_route_buckets;
4004 uint32_t frib_cnt = fsw_flow_route_id_buckets;
4005 struct kern_nexus *nx = fsw->fsw_nx;
4006 char name[64];
4007 int error = 0;
4008
4009 /* just in case */
4010 if (fe_cnt == 0) {
4011 fe_cnt = NX_FSW_FE_TABLESZ;
4012 ASSERT(fe_cnt != 0);
4013 }
4014 if (fob_cnt == 0) {
4015 fob_cnt = NX_FSW_FOB_HASHSZ;
4016 ASSERT(fob_cnt != 0);
4017 }
4018 if (frb_cnt == 0) {
4019 frb_cnt = NX_FSW_FRB_HASHSZ;
4020 ASSERT(frb_cnt != 0);
4021 }
4022 if (frib_cnt == 0) {
4023 frib_cnt = NX_FSW_FRIB_HASHSZ;
4024 ASSERT(frib_cnt != 0);
4025 }
4026
4027 /* make sure fe_cnt is a power of two, else round up */
4028 if ((fe_cnt & (fe_cnt - 1)) != 0) {
4029 fe_cnt--;
4030 fe_cnt |= (fe_cnt >> 1);
4031 fe_cnt |= (fe_cnt >> 2);
4032 fe_cnt |= (fe_cnt >> 4);
4033 fe_cnt |= (fe_cnt >> 8);
4034 fe_cnt |= (fe_cnt >> 16);
4035 fe_cnt++;
4036 }
4037
4038 /* make sure frb_cnt is a power of two, else round up */
4039 if ((frb_cnt & (frb_cnt - 1)) != 0) {
4040 frb_cnt--;
4041 frb_cnt |= (frb_cnt >> 1);
4042 frb_cnt |= (frb_cnt >> 2);
4043 frb_cnt |= (frb_cnt >> 4);
4044 frb_cnt |= (frb_cnt >> 8);
4045 frb_cnt |= (frb_cnt >> 16);
4046 frb_cnt++;
4047 }
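	/*
	 * For illustration: the decrement/OR-smear/increment sequences above
	 * round a nonzero 32-bit count up to the next power of two
	 * (e.g. 1000 -> 1024, 5000 -> 8192; 4096 stays 4096 because the
	 * branch is skipped for exact powers of two), presumably so the
	 * counts can serve as mask-indexed bucket array sizes. An equivalent
	 * (hypothetical) one-liner:
	 *
	 *   cnt = ((cnt & (cnt - 1)) == 0) ? cnt :
	 *       1U << (32 - __builtin_clz(cnt - 1));
	 */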
4048
4049 lck_mtx_init(lck: &fsw->fsw_detach_barrier_lock, grp: &nexus_lock_group,
4050 attr: &nexus_lock_attr);
4051 lck_mtx_init(lck: &fsw->fsw_reap_lock, grp: &nexus_lock_group, attr: &nexus_lock_attr);
4052 lck_mtx_init(lck: &fsw->fsw_linger_lock, grp: &nexus_lock_group, attr: &nexus_lock_attr);
4053 TAILQ_INIT(&fsw->fsw_linger_head);
4054
4055 (void) snprintf(name, count: sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4056 error = nx_advisory_alloc(nx, name,
4057 &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV],
4058 NEXUS_ADVISORY_TYPE_FLOWSWITCH);
4059 if (error != 0) {
4060 fsw_dp_dtor(fsw);
4061 return error;
4062 }
4063
4064 fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt);
4065 if (fsw->fsw_flow_mgr == NULL) {
4066 fsw_dp_dtor(fsw);
4067 return ENOMEM;
4068 }
4069
4070 /* generic name; will be customized upon ifattach */
4071 (void) snprintf(fsw->fsw_reap_name, count: sizeof(fsw->fsw_reap_name),
4072 FSW_REAP_THREADNAME, name, "");
4073
4074 if (kernel_thread_start(continuation: fsw_reap_thread_func, parameter: fsw,
4075 new_thread: &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4076 panic_plain("%s: can't create thread", __func__);
4077 /* NOTREACHED */
4078 __builtin_unreachable();
4079 }
4080 /* this must not fail */
4081 VERIFY(fsw->fsw_reap_thread != NULL);
4082
4083 SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
4084
4085
4086 return error;
4087}
4088
4089void
4090fsw_dp_dtor(struct nx_flowswitch *fsw)
4091{
4092 uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */
4093 uint64_t s = (1000 * NSEC_PER_MSEC); /* 1 sec */
4094 uint32_t i = 0;
4095
4096#if (DEVELOPMENT || DEBUG)
4097 if (fsw->fsw_rps_threads != NULL) {
4098 for (i = 0; i < fsw->fsw_rps_nthreads; i++) {
4099 fsw_rps_thread_join(fsw, i);
4100 }
4101 kfree_type(struct fsw_rps_thread, fsw->fsw_rps_threads);
4102 }
4103#endif /* DEVELOPMENT || DEBUG */
4104
4105 nx_advisory_free(fsw->fsw_nx);
4106
4107 if (fsw->fsw_reap_thread != THREAD_NULL) {
4108 /* signal thread to begin self-termination */
4109 lck_mtx_lock(lck: &fsw->fsw_reap_lock);
4110 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING;
4111
4112 /*
4113 * And wait for the thread to terminate; use a wait
4114 * channel other than fsw_reap_flags to make it more
4115 * explicit. In the event the reaper thread misses a
4116 * wakeup, we'll retry once every second (after the
4117 * initial 1 ms wait).
4118 */
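		/*
		 * The reaper side acknowledges this by waking
		 * &fsw->fsw_reap_thread once it has set FSW_REAPF_TERMINATED
		 * (see fsw_reap_thread_cont()); that is the channel the
		 * deadline wait below blocks on.
		 */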
4119 while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) {
4120 uint64_t t = 0;
4121
4122 nanoseconds_to_absolutetime(nanoseconds: (i++ == 0) ? f : s, result: &t);
4123 clock_absolutetime_interval_to_deadline(abstime: t, result: &t);
4124 ASSERT(t != 0);
4125
4126 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
4127 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
4128 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4129 }
4130 (void) assert_wait_deadline(event: &fsw->fsw_reap_thread,
4131 THREAD_UNINT, deadline: t);
4132 lck_mtx_unlock(lck: &fsw->fsw_reap_lock);
4133 thread_block(THREAD_CONTINUE_NULL);
4134 lck_mtx_lock(lck: &fsw->fsw_reap_lock);
4135 fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
4136 }
4137 ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
4138 lck_mtx_unlock(lck: &fsw->fsw_reap_lock);
4139 fsw->fsw_reap_thread = THREAD_NULL;
4140 }
4141
4142 /* free any remaining flow entries in the linger list */
4143 fsw_linger_purge(fsw);
4144
4145 if (fsw->fsw_flow_mgr != NULL) {
4146 flow_mgr_destroy(fsw->fsw_flow_mgr);
4147 fsw->fsw_flow_mgr = NULL;
4148 }
4149
4150
4151 lck_mtx_destroy(lck: &fsw->fsw_detach_barrier_lock, grp: &nexus_lock_group);
4152 lck_mtx_destroy(lck: &fsw->fsw_reap_lock, grp: &nexus_lock_group);
4153 lck_mtx_destroy(lck: &fsw->fsw_linger_lock, grp: &nexus_lock_group);
4154}
4155
4156void
4157fsw_linger_insert(struct flow_entry *fe)
4158{
4159 struct nx_flowswitch *fsw = fe->fe_fsw;
4160 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4161 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
4162 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
4163 fe->fe_flags, FLOWENTF_BITS);
4164
4165 net_update_uptime();
4166
4167 ASSERT(flow_entry_refcnt(fe) >= 1);
4168 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4169 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4170 ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
4171 ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE);
4172 ASSERT(fe->fe_linger_wait != 0);
4173 fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait);
4174 os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4175
4176 lck_mtx_lock_spin(lck: &fsw->fsw_linger_lock);
4177 TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
4178 fsw->fsw_linger_cnt++;
4179 VERIFY(fsw->fsw_linger_cnt != 0);
4180 lck_mtx_unlock(lck: &fsw->fsw_linger_lock);
4181
4182 fsw_reap_sched(fsw);
4183}
4184
4185static void
4186fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head,
4187 struct flow_entry *fe)
4188{
4189 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4190 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
4191 fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
4192 fe->fe_flags, FLOWENTF_BITS);
4193
4194 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4195 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4196 ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4197 os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed);
4198
4199 TAILQ_REMOVE(linger_head, fe, fe_linger_link);
4200 flow_entry_release(pfe: &fe);
4201}
4202
4203static void
4204fsw_linger_remove(struct flow_entry *fe)
4205{
4206 struct nx_flowswitch *fsw = fe->fe_fsw;
4207
4208 LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED);
4209
4210 fsw_linger_remove_internal(linger_head: &fsw->fsw_linger_head, fe);
4211 VERIFY(fsw->fsw_linger_cnt != 0);
4212 fsw->fsw_linger_cnt--;
4213}
4214
4215void
4216fsw_linger_purge(struct nx_flowswitch *fsw)
4217{
4218 struct flow_entry *fe, *tfe;
4219
4220 lck_mtx_lock(lck: &fsw->fsw_linger_lock);
4221 TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
4222 fsw_linger_remove(fe);
4223 }
4224 ASSERT(fsw->fsw_linger_cnt == 0);
4225 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4226 lck_mtx_unlock(lck: &fsw->fsw_linger_lock);
4227}
4228
4229void
4230fsw_reap_sched(struct nx_flowswitch *fsw)
4231{
4232 ASSERT(fsw->fsw_reap_thread != THREAD_NULL);
4233 lck_mtx_lock_spin(lck: &fsw->fsw_reap_lock);
4234 if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
4235 !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
4236 thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
4237 }
4238 lck_mtx_unlock(lck: &fsw->fsw_reap_lock);
4239}
4240
4241__attribute__((noreturn))
4242static void
4243fsw_reap_thread_func(void *v, wait_result_t w)
4244{
4245#pragma unused(w)
4246 struct nx_flowswitch *fsw = v;
4247
4248 ASSERT(fsw->fsw_reap_thread == current_thread());
4249 thread_set_thread_name(th: current_thread(), name: fsw->fsw_reap_name);
4250
4251 net_update_uptime();
4252
4253 lck_mtx_lock(lck: &fsw->fsw_reap_lock);
4254 VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
4255 (void) assert_wait(event: &fsw->fsw_reap_flags, THREAD_UNINT);
4256 lck_mtx_unlock(lck: &fsw->fsw_reap_lock);
4257 thread_block_parameter(continuation: fsw_reap_thread_cont, parameter: fsw);
4258 /* NOTREACHED */
4259 __builtin_unreachable();
4260}
4261
4262__attribute__((noreturn))
4263static void
4264fsw_reap_thread_cont(void *v, wait_result_t wres)
4265{
4266 struct nx_flowswitch *fsw = v;
4267 boolean_t low;
4268 uint64_t t = 0;
4269
4270 SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
4271
4272 lck_mtx_lock(lck: &fsw->fsw_reap_lock);
4273 if (__improbable(wres == THREAD_INTERRUPTED ||
4274 (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) {
4275 goto terminate;
4276 }
4277
4278 ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED));
4279 fsw->fsw_reap_flags |= FSW_REAPF_RUNNING;
4280 lck_mtx_unlock(lck: &fsw->fsw_reap_lock);
4281
4282 net_update_uptime();
4283
4284 /* prevent detach from happening while we're here */
4285 if (!fsw_detach_barrier_add(fsw)) {
4286 SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
4287 t = 0;
4288 } else {
4289 uint32_t fe_nonviable, fe_freed, fe_aborted;
4290 uint32_t fr_freed, fr_resid = 0;
4291 struct ifnet *ifp = fsw->fsw_ifp;
4292 uint64_t i = FSW_REAP_IVAL;
4293 uint64_t now = _net_uptime;
4294 uint64_t last;
4295
4296 ASSERT(fsw->fsw_ifp != NULL);
4297
4298 /*
4299 * Pass 1: process any deferred {withdrawn,nonviable} requests.
4300 */
4301 fe_nonviable = fsw_process_deferred(fsw);
4302
4303 /*
4304 * Pass 2: remove any expired lingering flows.
4305 */
4306 fe_freed = fsw_process_linger(fsw, &fe_aborted);
4307
4308 /*
4309 * Pass 3: prune idle flow routes.
4310 */
4311 fr_freed = flow_route_prune(fsw->fsw_flow_mgr,
4312 ifp, &fr_resid);
4313
4314 /*
4315 * Pass 4: prune the flow table by attempting to
4316 * shrink its cuckoo hashtable.
4317 */
4318 cuckoo_hashtable_try_shrink(h: fsw->fsw_flow_mgr->fm_flow_table);
4319
4320 SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
4321 "fe_aborted %u fr_freed %u/%u",
4322 fsw->fsw_flow_mgr->fm_name, fe_nonviable,
4323 (fe_nonviable + fsw->fsw_pending_nonviable),
4324 fe_freed, fsw->fsw_linger_cnt, fe_aborted, fr_freed,
4325 (fr_freed + fr_resid));
4326
4327 /* see if VM memory level is critical */
4328 low = skmem_lowmem_check();
4329
4330 /*
4331 * If things appear to be idle, we can prune away cached
4332 * objects that have fallen out of the working sets (this
4333 * is different from purging). Every once in a while, we
4334 * also purge the caches. Note that this is done across
4335 * all flowswitch instances, and so we limit this to no
4336 * more than once every FSW_REAP_SK_THRES seconds.
4337 */
4338 last = os_atomic_load(&fsw_reap_last, relaxed);
4339 if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) &&
4340 os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) {
4341 fsw_purge_cache(fsw, low);
4342
4343 /* increase sleep interval if idle */
4344 if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 &&
4345 fsw->fsw_pending_nonviable == 0 && fr_resid == 0) {
4346 i <<= 3;
4347 }
4348 } else if (last == 0) {
4349 os_atomic_store(&fsw_reap_last, now, release);
4350 }
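		/*
		 * The compare-and-swap on fsw_reap_last above acts as a
		 * lock-free rate limiter shared by all flowswitch reaper
		 * threads: only the thread that successfully swaps the old
		 * timestamp for 'now' performs the cache reap for this
		 * interval, so concurrent reapers do not duplicate the work.
		 */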
4351
4352 /*
4353 * Additionally, run thru the list of channels and prune
4354 * or purge away cached objects on "idle" channels. This
4355 * check is rate limited to no more than once every
4356 * FSW_DRAIN_CH_THRES seconds.
4357 */
4358 last = fsw->fsw_drain_channel_chk_last;
4359 if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) {
4360 SK_DF(SK_VERB_FLOW, "%s: pruning channels",
4361 fsw->fsw_flow_mgr->fm_name);
4362
4363 fsw->fsw_drain_channel_chk_last = now;
4364 fsw_drain_channels(fsw, now, low);
4365 } else if (__improbable(last == 0)) {
4366 fsw->fsw_drain_channel_chk_last = now;
4367 }
4368
4369 /*
4370 * Finally, invoke the interface's reap callback to
4371 * tell it to prune or purge away cached objects if
4372 * it is idle. This check is rate limited to no more
4373 * than once every FSW_REAP_IF_THRES seconds.
4374 */
4375 last = fsw->fsw_drain_netif_chk_last;
4376 if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) {
4377 ASSERT(fsw->fsw_nifna != NULL);
4378
4379 if (ifp->if_na_ops != NULL &&
4380 ifp->if_na_ops->ni_reap != NULL) {
4381 SK_DF(SK_VERB_FLOW, "%s: pruning netif",
4382 fsw->fsw_flow_mgr->fm_name);
4383 ifp->if_na_ops->ni_reap(ifp->if_na, ifp,
4384 FSW_REAP_IF_THRES, low);
4385 }
4386
4387 fsw->fsw_drain_netif_chk_last = now;
4388 } else if (__improbable(last == 0)) {
4389 fsw->fsw_drain_netif_chk_last = now;
4390 }
4391
4392 /* emit periodic interface stats ktrace */
4393 last = fsw->fsw_reap_last;
4394 if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) {
4395 KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets,
4396 ifp->if_data.ifi_ibytes * 8,
4397 ifp->if_data.ifi_opackets,
4398 ifp->if_data.ifi_obytes * 8);
4399
4400 fsw->fsw_reap_last = now;
4401 } else if (__improbable(last == 0)) {
4402 fsw->fsw_reap_last = now;
4403 }
4404
4405 nanoseconds_to_absolutetime(nanoseconds: i * NSEC_PER_SEC, result: &t);
4406 clock_absolutetime_interval_to_deadline(abstime: t, result: &t);
4407 ASSERT(t != 0);
4408
4409 /* allow any pending detach to proceed */
4410 fsw_detach_barrier_remove(fsw);
4411 }
4412
4413 lck_mtx_lock(lck: &fsw->fsw_reap_lock);
4414 if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
4415 fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
4416 (void) assert_wait_deadline(event: &fsw->fsw_reap_flags,
4417 THREAD_UNINT, deadline: t);
4418 lck_mtx_unlock(lck: &fsw->fsw_reap_lock);
4419 thread_block_parameter(continuation: fsw_reap_thread_cont, parameter: fsw);
4420 /* NOTREACHED */
4421 __builtin_unreachable();
4422 } else {
4423terminate:
4424 LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED);
4425 fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING);
4426 fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED;
4427 /*
4428 * And signal any thread waiting for us to terminate;
4429 * use a wait channel other than fsw_reap_flags to make
4430 * it more explicit.
4431 */
4432 if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
4433 thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
4434 }
4435 lck_mtx_unlock(lck: &fsw->fsw_reap_lock);
4436
4437 SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
4438
4439 /* for the extra refcnt from kernel_thread_start() */
4440 thread_deallocate(thread: current_thread());
4441 /* this is the end */
4442 thread_terminate(current_thread());
4443 /* NOTREACHED */
4444 __builtin_unreachable();
4445 }
4446
4447 /* must never get here */
4448 VERIFY(0);
4449 /* NOTREACHED */
4450 __builtin_unreachable();
4451}
4452
4453static void
4454fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low)
4455{
4456 struct kern_nexus *nx = fsw->fsw_nx;
4457
4458 /* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */
4459 FSW_RLOCK(fsw);
4460
4461 /* uncrustify doesn't handle C blocks properly */
4462 /* BEGIN IGNORE CODESTYLE */
4463 nx_port_foreach(nx, ^(nexus_port_t p) {
4464 struct nexus_adapter *na = nx_port_get_na(nx, p);
4465 if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) {
4466 return;
4467 }
4468
4469 boolean_t purge;
4470
4471 /*
4472 * If some activity happened in the last FSW_DRAIN_CH_THRES
4473 * seconds on this channel, we reclaim memory if the channel
4474 * throughput is less than the reap threshold value.
4475 */
4476 if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) {
4477 struct __kern_channel_ring *ring;
4478 channel_ring_stats *stats;
4479 uint64_t bps;
4480
4481 ring = na->na_rx_rings;
4482 stats = &ring->ckr_stats;
4483 bps = stats->crs_bytes_per_second;
4484
4485 if (bps < fsw_channel_reap_thresh) {
4486 purge = FALSE;
4487 na_drain(na, purge);
4488 }
4489 return;
4490 }
4491
4492 /*
4493 * If NA has been inactive for some time (twice the drain
4494 * threshold), we clear the work timestamp to temporarily skip
4495 * this channel until it's active again. Purging cached objects
4496 * can be expensive since we'd need to allocate and construct
4497 * them again, so we do it only when necessary.
4498 */
4499 if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) {
4500 na->na_work_ts = 0;
4501 purge = TRUE;
4502 } else {
4503 purge = FALSE;
4504 }
4505
4506 na_drain(na, purge); /* purge/prune caches */
4507 });
4508 /* END IGNORE CODESTYLE */
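	/*
	 * In short: a channel active within the last FSW_DRAIN_CH_THRES
	 * seconds is only pruned, and only if its RX throughput is below
	 * fsw_channel_reap_thresh; a channel idle longer than that is
	 * pruned, or purged (with na_work_ts cleared so it is skipped until
	 * it becomes active again) when memory is low or it has been idle
	 * for twice the threshold.
	 */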
4509
4510 FSW_RUNLOCK(fsw);
4511}
4512
4513static void
4514fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low)
4515{
4516#pragma unused(fsw)
4517 uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed);
4518 uint32_t p = fsw_flow_purge_thresh;
4519 boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0));
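	/*
	 * For illustration: with fsw_flow_purge_thresh set to, say, 8
	 * (an arbitrary example value), roughly every 8th invocation purges
	 * the caches outright while the others merely prune them; a
	 * low-memory condition forces a purge regardless of the counter.
	 */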
4520
4521 SK_DF(SK_VERB_FLOW, "%s: %s caches",
4522 fsw->fsw_flow_mgr->fm_name,
4523 (purge ? "purge" : "prune"));
4524
4525 skmem_cache_reap_now(sk_fo_cache, purge);
4526 skmem_cache_reap_now(sk_fe_cache, purge);
4527 skmem_cache_reap_now(sk_fab_cache, purge);
4528 skmem_cache_reap_now(flow_route_cache, purge);
4529 skmem_cache_reap_now(flow_stats_cache, purge);
4530 netns_reap_caches(purge);
4531 skmem_reap_caches(purge);
4532
4533#if CONFIG_MBUF_MCACHE
4534 if (if_is_fsw_transport_netagent_enabled() && purge) {
4535 mbuf_drain(FALSE);
4536 }
4537#endif /* CONFIG_MBUF_MCACHE */
4538}
4539
4540static void
4541fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe)
4542{
4543 /* When the interface is in low power mode, the flow is nonviable */
4544 if (!(fe->fe_flags & FLOWENTF_NONVIABLE) &&
4545 os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
4546 os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
4547 }
4548}
4549
4550static uint32_t
4551fsw_process_deferred(struct nx_flowswitch *fsw)
4552{
4553 struct flow_entry_dead sfed __sk_aligned(8);
4554 struct flow_mgr *fm = fsw->fsw_flow_mgr;
4555 struct flow_entry_dead *fed, *tfed;
4556 LIST_HEAD(, flow_entry_dead) fed_head =
4557 LIST_HEAD_INITIALIZER(fed_head);
4558 uint32_t i, nonviable = 0;
4559 boolean_t lowpowermode = FALSE;
4560
4561 bzero(s: &sfed, n: sizeof(sfed));
4562
4563 /*
4564 * The flows become nonviable when the interface
4565 * is in low power mode (edge trigger)
4566 */
4567 if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) &&
4568 fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) {
4569 lowpowermode = TRUE;
4570 fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt;
4571 }
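	/*
	 * Comparing generation counts makes this an edge trigger: flows are
	 * marked nonviable once per transition into low power mode rather
	 * than on every reap pass while the interface remains in that state.
	 */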
4572
4573 /*
4574 * Scan thru the flow entry tree, and commit any pending withdraw or
4575 * nonviable requests. We may need to push stats and/or unassign the
4576 * nexus from NECP, but we cannot do that while holding the locks;
4577 * build a temporary list for those entries.
4578 */
4579 for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
4580 struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, idx: i);
4581 struct flow_owner *fo;
4582
4583 /*
4584 * In low power mode, block for the lock so that no owner bucket is skipped
4585 */
4586 if (__probable(!lowpowermode)) {
4587 if (!FOB_TRY_LOCK(fob)) {
4588 continue;
4589 }
4590 } else {
4591 FOB_LOCK(fob);
4592 }
4593
4594 FOB_LOCK_ASSERT_HELD(fob);
4595 RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
4596 struct flow_entry *fe;
4597
4598 RB_FOREACH(fe, flow_entry_id_tree,
4599 &fo->fo_flow_entry_id_head) {
4600 /* try first as reader; skip if we can't */
4601 if (__improbable(lowpowermode)) {
4602 fsw_flow_handle_low_power(fsw, fe);
4603 }
4604 if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) {
4605 os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed);
4606 flow_namespace_half_close(token: &fe->fe_port_reservation);
4607 }
4608
4609 /* if not withdrawn/nonviable, skip */
4610 if (!fe->fe_want_withdraw &&
4611 !fe->fe_want_nonviable) {
4612 continue;
4613 }
4614 /*
4615 * Here we're holding the lock as writer;
4616 * don't spend too much time as we're
4617 * blocking the data path now.
4618 */
4619 ASSERT(!uuid_is_null(fe->fe_uuid));
4620 /* only need flow UUID and booleans */
4621 uuid_copy(dst: sfed.fed_uuid, src: fe->fe_uuid);
4622 sfed.fed_want_clonotify =
4623 (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY);
4624 sfed.fed_want_nonviable = fe->fe_want_nonviable;
4625 flow_entry_teardown(fo, fe);
4626
4627 /* do this outside the flow bucket lock */
4628 fed = flow_entry_dead_alloc(Z_WAITOK);
4629 ASSERT(fed != NULL);
4630 *fed = sfed;
4631 LIST_INSERT_HEAD(&fed_head, fed, fed_link);
4632 }
4633 }
4634 FOB_UNLOCK(fob);
4635 }
4636
4637 /*
4638 * These nonviable flows are no longer useful since we've lost
4639 * the source IP address; in the event the client monitors the
4640 * viability of the flow, explicitly mark it as nonviable so
4641 * that a new flow can be created.
4642 */
4643 LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) {
4644 LIST_REMOVE(fed, fed_link);
4645 ASSERT(fsw->fsw_agent_session != NULL);
4646
4647 /* if flow is closed early */
4648 if (fed->fed_want_clonotify) {
4649 necp_client_early_close(client_id: fed->fed_uuid);
4650 }
4651
4652 /* if nonviable, unassign nexus attributes */
4653 if (fed->fed_want_nonviable) {
4654 (void) netagent_assign_nexus(session: fsw->fsw_agent_session,
4655 necp_client_uuid: fed->fed_uuid, NULL, assigned_results_length: 0);
4656 }
4657
4658 flow_entry_dead_free(fed);
4659 ++nonviable;
4660 }
4661 ASSERT(LIST_EMPTY(&fed_head));
4662
4663 return nonviable;
4664}
4665
4666static uint32_t
4667fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort)
4668{
4669 struct flow_entry_linger_head linger_head =
4670 TAILQ_HEAD_INITIALIZER(linger_head);
4671 struct flow_entry *fe, *tfe;
4672 uint64_t now = _net_uptime;
4673 uint32_t i = 0, cnt = 0, freed = 0;
4674
4675 ASSERT(fsw->fsw_ifp != NULL);
4676 ASSERT(abort != NULL);
4677 *abort = 0;
4678
4679 /*
4680 * We don't want to contend with the datapath, so move
4681 * everything that's in the linger list into a local list.
4682 * This allows us to generate RSTs or free the flow entry
4683 * outside the lock. Any remaining flow entry in the local
4684 * list will get re-added back to the head of the linger
4685 * list, in front of any new ones added since then.
4686 */
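	/*
	 * TAILQ_CONCAT() splices in O(1) and leaves the source queue empty,
	 * which is why fsw_linger_head can be asserted empty right after;
	 * fsw_linger_insert() may keep adding new entries while this thread
	 * works on the local list without holding fsw_linger_lock.
	 */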
4687 lck_mtx_lock(lck: &fsw->fsw_linger_lock);
4688 TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
4689 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4690 cnt = fsw->fsw_linger_cnt;
4691 fsw->fsw_linger_cnt = 0;
4692 lck_mtx_unlock(lck: &fsw->fsw_linger_lock);
4693
4694 TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) {
4695 ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
4696 ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
4697 ASSERT(fe->fe_flags & FLOWENTF_LINGERING);
4698
4699 /*
4700 * See if this is a TCP flow that needs to generate
4701 * a RST to the remote peer (if not already).
4702 */
4703 if (flow_track_tcp_want_abort(fe)) {
4704 VERIFY(fe->fe_flags & FLOWENTF_ABORTED);
4705 ASSERT(!uuid_is_null(fe->fe_uuid));
4706 flow_track_abort_tcp(fe, NULL, NULL);
4707 (*abort)++;
4708 SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
4709 SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
4710 "flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
4711 sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
4712 FLOWENTF_BITS);
4713 }
4714
4715 /*
4716 * If flow has expired, remove from list and free;
4717 * otherwise leave it around in the linger list.
4718 */
4719 if (fe->fe_linger_expire <= now) {
4720 freed++;
4721 fsw_linger_remove_internal(linger_head: &linger_head, fe);
4722 fe = NULL;
4723 }
4724 ++i;
4725 }
4726 VERIFY(i == cnt && cnt >= freed);
4727
4728 /*
4729 * Add any remaining ones back into the linger list.
4730 */
4731 lck_mtx_lock(lck: &fsw->fsw_linger_lock);
4732 if (!TAILQ_EMPTY(&linger_head)) {
4733 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt);
4734 TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link);
4735 ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
4736 TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link);
4737 fsw->fsw_linger_cnt += (cnt - freed);
4738 }
4739 ASSERT(TAILQ_EMPTY(&linger_head));
4740 lck_mtx_unlock(lck: &fsw->fsw_linger_lock);
4741
4742 return freed;
4743}
4744
4745__attribute__((always_inline))
4746static inline void
4747fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph)
4748{
4749 switch (__packet_get_traffic_class(ph)) {
4750 case PKT_TC_BE:
4751 ifp->if_tc.ifi_ibepackets++;
4752 ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4753 break;
4754 case PKT_TC_BK:
4755 ifp->if_tc.ifi_ibkpackets++;
4756 ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4757 break;
4758 case PKT_TC_VI:
4759 ifp->if_tc.ifi_ivipackets++;
4760 ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4761 break;
4762 case PKT_TC_VO:
4763 ifp->if_tc.ifi_ivopackets++;
4764 ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length;
4765 break;
4766 default:
4767 break;
4768 }
4769}
4770
4771__attribute__((always_inline))
4772static inline void
4773fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc,
4774 uint32_t cnt, uint32_t len)
4775{
4776 switch (svc) {
4777 case PKT_TC_BE:
4778 ifp->if_tc.ifi_obepackets += cnt;
4779 ifp->if_tc.ifi_obebytes += len;
4780 break;
4781 case PKT_TC_BK:
4782 ifp->if_tc.ifi_obkpackets += cnt;
4783 ifp->if_tc.ifi_obkbytes += len;
4784 break;
4785 case PKT_TC_VI:
4786 ifp->if_tc.ifi_ovipackets += cnt;
4787 ifp->if_tc.ifi_ovibytes += len;
4788 break;
4789 case PKT_TC_VO:
4790 ifp->if_tc.ifi_ovopackets += cnt;
4791 ifp->if_tc.ifi_ovobytes += len;
4792 break;
4793 default:
4794 break;
4795 }
4796}
4797