1 | /* |
2 | * Copyright (c) 2015-2023 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | /* |
30 | * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. |
31 | * |
32 | * Redistribution and use in source and binary forms, with or without |
33 | * modification, are permitted provided that the following conditions |
34 | * are met: |
35 | * 1. Redistributions of source code must retain the above copyright |
36 | * notice, this list of conditions and the following disclaimer. |
37 | * 2. Redistributions in binary form must reproduce the above copyright |
38 | * notice, this list of conditions and the following disclaimer in the |
39 | * documentation and/or other materials provided with the distribution. |
40 | * |
41 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
51 | * SUCH DAMAGE. |
52 | */ |
53 | |
54 | /* |
55 | * BSD LICENSE |
56 | * |
57 | * Copyright(c) 2015 NEC Europe Ltd. All rights reserved. |
58 | * All rights reserved. |
59 | * |
60 | * Redistribution and use in source and binary forms, with or without |
61 | * modification, are permitted provided that the following conditions |
62 | * are met: |
63 | * |
64 | * * Redistributions of source code must retain the above copyright |
65 | * notice, this list of conditions and the following disclaimer. |
66 | * * Redistributions in binary form must reproduce the above copyright |
67 | * notice, this list of conditions and the following disclaimer in |
68 | * the documentation and/or other materials provided with the |
69 | * distribution. |
70 | * * Neither the name of NEC Europe Ltd. nor the names of |
71 | * its contributors may be used to endorse or promote products derived |
72 | * from this software without specific prior written permission. |
73 | * |
74 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
75 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
76 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
77 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
78 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
79 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
80 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
81 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
82 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
83 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
84 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
85 | */ |
86 | |
87 | #include <skywalk/os_skywalk_private.h> |
88 | #include <skywalk/nexus/flowswitch/nx_flowswitch.h> |
89 | #include <skywalk/nexus/flowswitch/fsw_var.h> |
90 | #include <skywalk/nexus/netif/nx_netif.h> |
91 | #include <skywalk/nexus/netif/nx_netif_compat.h> |
92 | #include <kern/sched_prim.h> |
93 | #include <sys/kdebug.h> |
94 | #include <sys/sdt.h> |
95 | #include <net/bpf.h> |
96 | #include <net/if_ports_used.h> |
97 | #include <net/pktap.h> |
98 | #include <net/pktsched/pktsched_netem.h> |
99 | #include <netinet/tcp.h> |
100 | #include <netinet/udp.h> |
101 | #include <netinet/ip.h> |
102 | #include <netinet/ip6.h> |
103 | #include <netinet/in_var.h> |
104 | |
105 | extern kern_return_t thread_terminate(thread_t); |
106 | |
107 | #define FSW_ZONE_MAX 256 |
108 | #define FSW_ZONE_NAME "skywalk.nx.fsw" |
109 | |
110 | static uint64_t fsw_reap_last __sk_aligned(8); |
111 | static uint64_t fsw_want_purge __sk_aligned(8); |
112 | |
113 | #define NX_FSW_FE_TABLESZ 256 /* some power of 2 */ |
114 | static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ; |
115 | |
116 | #define NX_FSW_FOB_HASHSZ 31 /* some mersenne prime */ |
117 | static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ; |
118 | |
119 | #define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */ |
120 | static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ; |
121 | |
#define NX_FSW_FRIB_HASHSZ 13  /* a small prime */
123 | static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ; |
124 | |
125 | #define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */ |
126 | static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL; |
127 | |
128 | #define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */ |
129 | static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES; |
130 | |
131 | #define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval)) |
132 | #define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5) |
133 | #define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5) |
134 | #define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5) |
135 | #define FSW_IFSTATS_THRES 1 |
136 | |
137 | #define NX_FSW_CHANNEL_REAP_THRES 1000 /* threshold (bytes/sec) for reaping*/ |
138 | uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES; |
139 | |
140 | #define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */ |
141 | |
142 | uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */ |
143 | uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */ |
144 | uint32_t fsw_gso_batch = 8; |
145 | #if (DEVELOPMENT || DEBUG) |
146 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch, |
147 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0, |
148 | "flowswitch Rx batch size" ); |
149 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch, |
150 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0, |
151 | "flowswitch Tx batch size" ); |
152 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch, |
153 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0, |
154 | "flowswitch GSO batch size" ); |
155 | SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput, |
156 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh, |
157 | "flowswitch channel reap threshold throughput (bytes/sec)" ); |
#endif /* DEVELOPMENT || DEBUG */
159 | |
160 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp, |
161 | CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0, |
162 | "flowswitch RX aggregation for tcp flows (enable/disable)" ); |
163 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host, |
164 | CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0, |
165 | "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))" ); |
166 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu, |
167 | CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0, |
168 | "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)" ); |
169 | |
170 | /* |
171 | * IP reassembly |
172 | * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force |
173 | * enable/disable the reassembly routine regardless of whether the |
174 | * transport netagent is enabled or not. |
175 | * |
176 | * 'fsw_ip_reass' is a tri-state: |
177 | * 0 means force IP reassembly off |
178 | * 1 means force IP reassembly on |
179 | * 2 means don't force the value, use what's appropriate for this flowswitch |
180 | */ |
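/*
 * For example (using the tri-state values described above), reassembly can
 * be forced on from user space with
 *     sysctl kern.skywalk.flowswitch.ip_reass=1
 * and returned to the default per-flowswitch behavior with a value of 2.
 */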
181 | #define FSW_IP_REASS_FORCE_OFF 0 |
182 | #define FSW_IP_REASS_FORCE_ON 1 |
183 | #define FSW_IP_REASS_NO_FORCE 2 |
184 | |
185 | uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE; |
186 | |
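/*
 * Sysctl handler for "kern.skywalk.flowswitch.ip_reass"; accepts only the
 * three values defined above and rejects anything larger with EINVAL.
 */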
187 | static int |
188 | fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS |
189 | { |
190 | #pragma unused(oidp, arg1, arg2) |
191 | unsigned int new_value; |
192 | int changed; |
193 | int error; |
194 | |
    error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
        &new_value, &changed);
197 | if (error == 0 && changed != 0) { |
198 | if (new_value > FSW_IP_REASS_NO_FORCE) { |
199 | return EINVAL; |
200 | } |
201 | fsw_ip_reass = new_value; |
202 | } |
203 | return error; |
204 | } |
205 | |
206 | SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass, |
207 | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, |
208 | 0, 0, fsw_ip_reass_sysctl, "IU" , |
209 | "adjust flowswitch IP reassembly" ); |
210 | |
211 | #if (DEVELOPMENT || DEBUG) |
212 | static uint64_t _fsw_inject_error = 0; |
213 | #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \ |
214 | _SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \ |
215 | &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__) |
216 | |
217 | #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \ |
218 | if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \ |
219 | SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\ |
220 | if ((_f) != NULL) \ |
221 | (_f)(__VA_ARGS__); \ |
222 | } \ |
223 | } while (0) |
224 | |
225 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets, |
226 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "" ); |
227 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size, |
228 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "" ); |
229 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets, |
230 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "" ); |
231 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, |
232 | flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED, |
233 | &fsw_flow_route_id_buckets, 0, "" ); |
234 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval, |
235 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "" ); |
236 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh, |
237 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "" ); |
238 | SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error, |
239 | CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "" ); |
240 | #else |
241 | #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0) |
242 | #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0) |
243 | #endif /* !DEVELOPMENT && !DEBUG */ |
244 | |
245 | static void fsw_linger_remove_internal(struct flow_entry_linger_head *, |
246 | struct flow_entry *); |
247 | static void fsw_reap_thread_func(void *, wait_result_t); |
248 | static void fsw_reap_thread_cont(void *, wait_result_t); |
249 | static void fsw_purge_cache(struct nx_flowswitch *, boolean_t); |
250 | static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t); |
251 | static uint32_t fsw_process_deferred(struct nx_flowswitch *); |
252 | static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *); |
253 | |
254 | static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *, |
255 | struct __kern_packet *); |
256 | |
257 | static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t); |
258 | static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t, |
259 | uint32_t, uint32_t); |
260 | |
261 | static int __fsw_dp_inited = 0; |
262 | |
263 | int |
264 | fsw_dp_init(void) |
265 | { |
266 | _CASSERT(FSW_VP_DEV == 0); |
267 | _CASSERT(FSW_VP_HOST == 1); |
268 | _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN); |
269 | _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT); |
270 | |
271 | ASSERT(!__fsw_dp_inited); |
272 | |
273 | flow_mgr_init(); |
274 | flow_init(); |
275 | |
276 | __fsw_dp_inited = 1; |
277 | |
278 | return 0; |
279 | } |
280 | |
281 | void |
282 | fsw_dp_uninit(void) |
283 | { |
284 | if (__fsw_dp_inited) { |
285 | flow_fini(); |
286 | flow_mgr_fini(); |
287 | |
288 | __fsw_dp_inited = 0; |
289 | } |
290 | } |
291 | |
292 | static void |
293 | dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq) |
294 | { |
295 | pp_free_pktq(pktq); |
296 | } |
297 | |
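/*
 * Convenience macro: account for and free every packet left in "pktq",
 * bumping the FSW_STATS_DROP counter by the queue length.
 */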
298 | #define dp_drop_pktq(fsw, pktq) do { \ |
299 | uint32_t _len = KPKTQ_LEN(pktq); \ |
300 | if (KPKTQ_EMPTY(pktq)) { \ |
301 | ASSERT(_len == 0); \ |
302 | return; \ |
303 | } \ |
304 | SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \ |
305 | FSW_STATS_ADD(FSW_STATS_DROP, _len); \ |
306 | DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \ |
307 | dp_free_pktq(fsw, pktq); \ |
308 | } while (0) |
309 | |
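/*
 * Mirror the packets queued on a flow entry (Rx or Tx, per "input") to the
 * pktap interface, tagging them with the flow's owning and effective
 * processes.  Host-port traffic is only tapped here early for TCP input;
 * everything else on the host port is tapped in the DLIL input path.
 */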
310 | SK_NO_INLINE_ATTRIBUTE |
311 | void |
312 | fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input) |
313 | { |
314 | pid_t pid; |
315 | char proc_name_buf[FLOW_PROCESS_NAME_LENGTH]; |
316 | char *proc_name = NULL; |
317 | pid_t epid; |
318 | char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH]; |
319 | char *eproc_name = NULL; |
320 | sa_family_t af; |
321 | bool tap_early = false; |
322 | struct __kern_packet *pkt; |
323 | |
324 | ASSERT(fe != NULL); |
325 | ASSERT(fsw->fsw_ifp != NULL); |
326 | |
327 | if (fe->fe_nx_port == FSW_VP_HOST) { |
328 | /* allow packets to be tapped before aggregation happens */ |
329 | tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP); |
330 | if (!tap_early) { |
331 | /* all other traffic will be tapped in the dlil input path */ |
332 | return; |
333 | } |
334 | } |
335 | if (fe->fe_key.fk_ipver == IPVERSION) { |
336 | af = AF_INET; |
337 | } else if (fe->fe_key.fk_ipver == IPV6_VERSION) { |
338 | af = AF_INET6; |
339 | } else { |
340 | return; |
341 | } |
342 | |
343 | pid = fe->fe_pid; |
344 | if (fe->fe_proc_name[0] != '\0') { |
        (void) strlcpy(proc_name_buf, fe->fe_proc_name,
            sizeof(proc_name_buf));
347 | proc_name = proc_name_buf; |
348 | } |
349 | epid = fe->fe_epid; |
350 | if (fe->fe_eproc_name[0] != '\0') { |
        (void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
            sizeof(eproc_name_buf));
353 | eproc_name = eproc_name_buf; |
354 | } |
355 | if (input) { |
356 | KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) { |
357 | pktap_input_packet(fsw->fsw_ifp, af, |
358 | fsw->fsw_ifp_dlt, pid, proc_name, epid, |
359 | eproc_name, SK_PKT2PH(pkt), NULL, 0, |
360 | IPPROTO_TCP, fe->fe_flowid, |
361 | tap_early ? PTH_FLAG_SOCKET: PTH_FLAG_NEXUS_CHAN); |
362 | } |
363 | } else { |
364 | KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) { |
365 | pktap_output_packet(fsw->fsw_ifp, af, |
366 | fsw->fsw_ifp_dlt, pid, proc_name, epid, |
367 | eproc_name, SK_PKT2PH(pkt), NULL, 0, |
368 | 0, 0, PTH_FLAG_NEXUS_CHAN); |
369 | } |
370 | } |
371 | } |
372 | |
373 | #if (DEVELOPMENT || DEBUG) |
374 | static void |
375 | _fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt, |
376 | int *ret) |
377 | { |
378 | static boolean_t _err35_flag_modified = FALSE; |
379 | |
380 | switch (step) { |
381 | case 1: |
382 | if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) == |
383 | (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) { |
384 | fr->fr_flags &= ~FLOWRTF_RESOLVED; |
385 | _err35_flag_modified = TRUE; |
386 | } |
387 | break; |
388 | |
389 | case 2: |
390 | if (!_err35_flag_modified) { |
391 | return; |
392 | } |
393 | if (pkt->pkt_pflags & PKT_F_MBUF_DATA) { |
394 | m_freem(pkt->pkt_mbuf); |
395 | pkt->pkt_pflags &= ~PKT_F_MBUF_DATA; |
396 | pkt->pkt_mbuf = NULL; |
397 | } |
398 | *ret = EJUSTRETURN; |
399 | fr->fr_flags |= FLOWRTF_RESOLVED; |
400 | _err35_flag_modified = FALSE; |
401 | break; |
402 | |
403 | default: |
404 | VERIFY(0); |
405 | /* not reached */ |
406 | } |
407 | } |
408 | |
409 | static void |
410 | _fsw_error36_handler(int step, struct flow_route *fr, int *ret) |
411 | { |
412 | static boolean_t _err36_flag_modified = FALSE; |
413 | |
414 | switch (step) { |
415 | case 1: |
416 | if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) == |
417 | (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) { |
418 | fr->fr_flags &= ~FLOWRTF_RESOLVED; |
419 | _err36_flag_modified = TRUE; |
420 | } |
421 | break; |
422 | |
423 | case 2: |
424 | if (!_err36_flag_modified) { |
425 | return; |
426 | } |
427 | *ret = ENETUNREACH; |
428 | fr->fr_flags |= FLOWRTF_RESOLVED; |
429 | _err36_flag_modified = FALSE; |
430 | break; |
431 | |
432 | default: |
433 | VERIFY(0); |
434 | /* not reached */ |
435 | } |
436 | } |
437 | #else /* !DEVELOPMENT && !DEBUG */ |
438 | #define _fsw_error35_handler(...) |
439 | #define _fsw_error36_handler(...) |
440 | #endif /* DEVELOPMENT || DEBUG */ |
441 | |
/*
 * Check if the source packet content can fit into the destination
 * ring's packet. Returns TRUE if the source packet can fit.
 * Note: Failures could be caused by misconfigured packet pool sizes,
 * a missing packet size check against the MTU, or a source packet from
 * a compat netif whose attached mbuf is larger than the MTU due to LRO.
 */
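/*
 * Illustrative check with hypothetical sizes: a destination packet made of
 * two 2048-byte buflets and zero headroom can absorb 4096 bytes, so a
 * 3000-byte source payload passes while a 5000-byte one fails.
 */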
449 | static inline boolean_t |
450 | validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph, |
451 | uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom, |
452 | uint32_t *copy_len) |
453 | { |
454 | uint32_t tlen = 0; |
455 | uint32_t splen = spkt->pkt_length - skip_l2hlen; |
456 | |
457 | if (l2hlen != 0) { |
458 | VERIFY(skip_l2hlen == 0); |
459 | tlen += l2hlen; |
460 | } else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) { |
461 | splen -= ETHER_CRC_LEN; |
462 | } |
463 | |
464 | tlen += splen; |
465 | *copy_len = splen; |
466 | |
    return tlen <= ((__packet_get_buflet_count(dph) *
        PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) -
        headroom);
470 | } |
471 | |
472 | #if SK_LOG |
473 | /* Hoisted out of line to reduce kernel stack footprint */ |
474 | SK_LOG_ATTRIBUTE |
475 | static void |
476 | copy_packet_from_dev_log(struct __kern_packet *spkt, |
477 | struct __kern_packet *dpkt, struct proc *p) |
478 | { |
479 | uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) | |
480 | ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ? |
481 | SK_VERB_COPY_MBUF : SK_VERB_COPY)); |
482 | char *daddr; |
483 | MD_BUFLET_ADDR_ABS(dpkt, daddr); |
484 | SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u" , |
485 | sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length, |
486 | dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom, |
487 | (uint32_t)dpkt->pkt_l2_len); |
488 | SK_DF(logflags | SK_VERB_DUMP, "%s" , |
489 | sk_dump("buf" , daddr, dpkt->pkt_length, 128, NULL, 0)); |
490 | } |
491 | #else |
492 | #define copy_packet_from_dev_log(...) |
493 | #endif /* SK_LOG */ |
494 | |
495 | |
496 | static inline int |
497 | copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
498 | struct __kern_packet *dpkt) |
499 | { |
500 | /* |
501 | * source and destination nexus don't share the packet pool |
502 | * sync operation here is to |
503 | * - alloc packet for the rx(dst) ring |
504 | * - copy data/metadata from src packet to dst packet |
505 | * - attach alloc'd packet to rx(dst) ring |
506 | */ |
507 | kern_packet_t dph = SK_PTR_ENCODE(dpkt, |
508 | METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt)); |
509 | kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt), |
510 | METADATA_SUBTYPE(spkt)); |
511 | boolean_t do_cksum_rx; |
512 | uint16_t skip_l2h_len = spkt->pkt_l2_len; |
513 | uint16_t iphlen; |
514 | uint32_t dlen; |
515 | int err; |
516 | |
517 | if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0, |
518 | &dlen))) { |
519 | SK_ERR("bufcnt %d, bufsz %d" , __packet_get_buflet_count(dph), |
520 | PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp)); |
521 | FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN); |
522 | return EINVAL; |
523 | } |
524 | |
525 | /* Copy packet metadata */ |
526 | _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum); |
527 | _PKT_COPY(spkt, dpkt); |
528 | ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) || |
529 | PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp)); |
530 | ASSERT(dpkt->pkt_mbuf == NULL); |
531 | |
532 | dpkt->pkt_headroom = 0; |
533 | dpkt->pkt_l2_len = 0; |
534 | |
535 | /* don't include IP header from partial sum */ |
536 | if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) { |
537 | iphlen = spkt->pkt_flow_ip_hlen; |
538 | do_cksum_rx = sk_cksum_rx; |
539 | } else { |
540 | iphlen = 0; |
541 | do_cksum_rx = FALSE; |
542 | } |
543 | |
544 | /* Copy packet payload */ |
545 | if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) && |
546 | (spkt->pkt_pflags & PKT_F_TRUNCATED)) { |
547 | FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT); |
        /*
         * Source packet has truncated contents (just enough for
         * the classifier) of an mbuf from the compat driver; copy
         * the entire mbuf contents to the destination packet.
         */
553 | m_adj(spkt->pkt_mbuf, skip_l2h_len); |
554 | ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen); |
555 | fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0, |
556 | spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen); |
557 | } else { |
558 | FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT); |
559 | /* |
560 | * Source packet has full contents, either from an mbuf |
561 | * that came up from the compat driver, or because it |
562 | * originated on the native driver; copy to destination. |
563 | */ |
564 | fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph, |
565 | (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx, |
566 | iphlen, 0, FALSE); |
567 | } |
568 | |
569 | #if DEBUG || DEVELOPMENT |
570 | if (__improbable(pkt_trailers > 0)) { |
571 | dlen += pkt_add_trailers(dph, dlen, iphlen); |
572 | } |
573 | #endif /* DEBUG || DEVELOPMENT */ |
574 | |
575 | /* Finalize and attach packet to Rx ring */ |
576 | METADATA_ADJUST_LEN(dpkt, 0, 0); |
    err = __packet_finalize(dph);
578 | VERIFY(err == 0); |
579 | |
580 | copy_packet_from_dev_log(spkt, dpkt, kernproc); |
581 | |
582 | if (spkt->pkt_pflags & PKT_F_MBUF_DATA) { |
        ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
        mbuf_free(spkt->pkt_mbuf);
585 | KPKT_CLEAR_MBUF_DATA(spkt); |
586 | } else { |
587 | fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph); |
588 | } |
589 | |
590 | if (__probable(do_cksum_rx != 0)) { |
591 | FSW_STATS_INC(FSW_STATS_RX_COPY_SUM); |
592 | } |
593 | |
594 | return 0; |
595 | } |
596 | |
597 | SK_NO_INLINE_ATTRIBUTE |
598 | static struct __kern_packet * |
599 | rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
600 | { |
601 | char *pkt_buf; |
602 | void *l3_hdr; |
603 | uint16_t nfrags, tlen; |
604 | int err = 0; |
605 | |
606 | switch (fsw_ip_reass) { |
607 | case FSW_IP_REASS_FORCE_OFF: |
608 | return pkt; |
609 | case FSW_IP_REASS_FORCE_ON: |
610 | break; |
611 | default: |
612 | if (!FSW_NETAGENT_ENABLED(fsw) || |
            flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
614 | return pkt; |
615 | } |
616 | break; |
617 | } |
618 | |
619 | MD_BUFLET_ADDR_ABS(pkt, pkt_buf); |
620 | l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len; |
621 | |
622 | ASSERT(fsw->fsw_ipfm != NULL); |
623 | ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0); |
624 | |
625 | if (pkt->pkt_flow_ip_ver == IPVERSION) { |
        err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
            (struct ip *)l3_hdr, &nfrags, &tlen);
628 | } else { |
629 | ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION); |
630 | /* we only handle frag header immediately after v6 header */ |
        err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
            (struct ip6_hdr *)l3_hdr,
            (struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
            &nfrags, &tlen);
635 | } |
636 | if (__improbable(err != 0)) { |
637 | /* if we get a bad fragment, free it */ |
638 | pp_free_packet_single(pkt); |
639 | pkt = NULL; |
640 | } else { |
641 | ASSERT(!((pkt != NULL) ^ (nfrags > 0))); |
642 | } |
643 | |
644 | return pkt; |
645 | } |
646 | |
647 | SK_NO_INLINE_ATTRIBUTE |
648 | static void |
649 | rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
650 | { |
651 | ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA); |
652 | uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf); |
653 | kern_packet_t ph = SK_PTR_ENCODE(pkt, |
654 | METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt)); |
655 | /* |
656 | * This is the case when the packet is coming in from |
657 | * compat-netif. This packet only has valid metadata |
658 | * and an attached mbuf. We need to copy enough data |
659 | * from the mbuf to the packet buffer for the |
660 | * classifier. Compat netif packet pool is configured |
661 | * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY |
662 | * which is just enough to hold the protocol headers |
663 | * for the flowswitch classifier. |
664 | */ |
665 | |
666 | pkt->pkt_headroom = 0; |
667 | METADATA_ADJUST_LEN(pkt, 0, 0); |
668 | /* |
669 | * Copy the initial 128 bytes of the packet for |
670 | * classification. |
671 | * Ethernet(14) + IPv6 header(40) + |
672 | * + IPv6 fragment header(8) + |
673 | * TCP header with options(60). |
674 | */ |
675 | fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph, |
676 | pkt->pkt_headroom, pkt->pkt_mbuf, 0, |
677 | MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY), |
678 | FALSE, 0); |
679 | |
680 | int err = __packet_finalize_with_mbuf(pkt); |
681 | VERIFY(err == 0); |
682 | } |
683 | |
684 | static struct __kern_packet * |
685 | rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
686 | { |
687 | pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED; |
688 | |
689 | if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) { |
690 | rx_prepare_packet_mbuf(fsw, pkt); |
691 | } |
692 | |
693 | return pkt; |
694 | } |
695 | |
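/*
 * Map a classified packet to its flow entry.  The 5-tuple of "prev_fe" is
 * tried first as a fast path before falling back to a full flow-manager
 * lookup; parent/child (demuxed) entries are then resolved for both the
 * Rx and Tx directions.  Returns an entry with a reference held, or NULL.
 */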
696 | static struct flow_entry * |
697 | lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt, |
698 | bool input, struct flow_entry *prev_fe) |
699 | { |
700 | struct flow_key key __sk_aligned(16); |
701 | struct flow_entry *fe = NULL; |
702 | |
703 | ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED); |
    flow_pkt2key(pkt, input, &key);
705 | |
706 | if (__probable(prev_fe != NULL && |
707 | prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) { |
708 | uint16_t saved_mask = key.fk_mask; |
709 | key.fk_mask = FKMASK_5TUPLE; |
        if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
            flow_entry_retain(prev_fe);
712 | fe = prev_fe; |
713 | } else { |
714 | key.fk_mask = saved_mask; |
715 | } |
716 | } |
717 | |
718 | top: |
719 | if (__improbable(fe == NULL)) { |
720 | fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key); |
721 | } |
722 | |
723 | if (__improbable(fe != NULL && |
724 | (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) { |
725 | /* Rx */ |
726 | if (input) { |
727 | if (fe->fe_flags & FLOWENTF_PARENT) { |
728 | struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt); |
729 | if (child_fe != NULL) { |
                    flow_entry_release(&fe);
                    fe = child_fe;
                }
            } else {
                if (!rx_flow_demux_match(fsw, fe, pkt)) {
                    flow_entry_release(&fe);
736 | fe = NULL; |
737 | goto top; |
738 | } |
739 | } |
740 | } else { |
741 | /* Tx */ |
742 | if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) { |
743 | if (__probable(fe->fe_flags & FLOWENTF_PARENT)) { |
744 | struct flow_entry *parent_fe = fe; |
745 | fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id); |
                    flow_entry_release(&parent_fe);
                } else {
                    flow_entry_release(&fe);
749 | fe = NULL; |
750 | goto top; |
751 | } |
752 | } |
753 | } |
754 | } |
755 | |
756 | SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]); |
757 | SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP, |
758 | "%s %s %s \"%s\" fe 0x%llx" , |
759 | input ? "Rx" : "Tx" , if_name(fsw->fsw_ifp), |
760 | sk_proc_name_address(current_proc()), |
761 | fk_as_string(&key, fkbuf, sizeof(fkbuf)), |
762 | SK_KVA(fe)); |
763 | |
764 | return fe; |
765 | } |
766 | |
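/*
 * Decide whether an inbound packet may be delivered to a 2-tuple listener
 * flow: accept well-known local destinations (loopback, link-local,
 * multicast, broadcast, etc.) and unicast destinations that match an
 * address configured on an interface, subject to the forwarding checks
 * below.
 */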
767 | SK_NO_INLINE_ATTRIBUTE |
768 | static bool |
769 | pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt) |
770 | { |
771 | struct nx_flowswitch *fsw = fe->fe_fsw; |
772 | struct ifnet *ifp = fsw->fsw_ifp; |
773 | struct in_ifaddr *ia = NULL; |
774 | struct in_ifaddr *best_ia = NULL; |
775 | struct in6_ifaddr *ia6 = NULL; |
776 | struct in6_ifaddr *best_ia6 = NULL; |
777 | struct ifnet *match_ifp = NULL; |
778 | struct __flow *flow = pkt->pkt_flow; |
779 | bool result = false; |
780 | |
781 | ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED); |
782 | |
783 | if (flow->flow_ip_ver == IPVERSION) { |
784 | if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) || |
785 | IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) || |
786 | IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) || |
787 | IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) || |
788 | IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) || |
789 | IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) || |
790 | INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) { |
791 | result = true; |
792 | goto done; |
793 | } |
794 | |
795 | /* |
796 | * Check for a match in the hash bucket. |
797 | */ |
        lck_rw_lock_shared(&in_ifaddr_rwlock);
799 | TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) { |
800 | if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) { |
801 | best_ia = ia; |
802 | match_ifp = ia->ia_ifp; |
803 | |
804 | if (match_ifp == ifp) { |
805 | break; |
806 | } |
                /*
                 * Continue the loop in case there's an exact match
                 * with another interface.
                 */
811 | } |
812 | } |
813 | |
814 | if (best_ia != NULL) { |
815 | if (match_ifp != ifp && ipforwarding == 0 && |
816 | (match_ifp->if_family == IFNET_FAMILY_IPSEC || |
817 | match_ifp->if_family == IFNET_FAMILY_UTUN)) { |
818 | /* |
819 | * Drop when interface address check is strict and forwarding |
820 | * is disabled |
821 | */ |
822 | } else { |
                lck_rw_done(&in_ifaddr_rwlock);
824 | result = true; |
825 | goto done; |
826 | } |
827 | } |
        lck_rw_done(&in_ifaddr_rwlock);
829 | |
830 | if (ifp->if_flags & IFF_BROADCAST) { |
831 | /* |
832 | * Check for broadcast addresses. |
833 | * |
834 | * Only accept broadcast packets that arrive via the matching |
835 | * interface. Reception of forwarded directed broadcasts would be |
836 | * handled via ip_forward() and ether_frameout() with the loopback |
837 | * into the stack for SIMPLEX interfaces handled by ether_frameout(). |
838 | */ |
839 | struct ifaddr *ifa; |
840 | |
841 | ifnet_lock_shared(ifp); |
842 | TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { |
843 | if (ifa->ifa_addr->sa_family != AF_INET) { |
844 | continue; |
845 | } |
846 | ia = ifatoia(ifa); |
847 | if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr || |
848 | ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) { |
849 | ifnet_lock_done(ifp); |
850 | result = true; |
851 | goto done; |
852 | } |
853 | } |
854 | ifnet_lock_done(ifp); |
855 | } |
856 | } else { |
857 | if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) || |
858 | IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) || |
859 | IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) { |
860 | result = true; |
861 | goto done; |
862 | } |
863 | |
864 | /* |
865 | * Check for exact addresses in the hash bucket. |
866 | */ |
        lck_rw_lock_shared(&in6_ifaddr_rwlock);
868 | TAILQ_FOREACH(ia6, IN6ADDR_HASH(&flow->flow_ipv6_dst), ia6_hash) { |
869 | if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst, ia6->ia_ifp->if_index, ifp->if_index)) { |
870 | if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) { |
871 | continue; |
872 | } |
873 | best_ia6 = ia6; |
874 | if (ia6->ia_ifp == ifp) { |
875 | break; |
876 | } |
                /*
                 * Continue the loop in case there's an exact match
                 * with another interface.
                 */
881 | } |
882 | } |
883 | if (best_ia6 != NULL) { |
884 | if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 && |
885 | (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC || |
886 | best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) { |
887 | /* |
888 | * Drop when interface address check is strict and forwarding |
889 | * is disabled |
890 | */ |
891 | } else { |
                lck_rw_done(&in6_ifaddr_rwlock);
893 | result = true; |
894 | goto done; |
895 | } |
896 | } |
        lck_rw_done(&in6_ifaddr_rwlock);
898 | } |
899 | |
    /*
     * In forwarding mode, if the destination address of the packet
     * does not match any interface address, it may be destined to
     * the client device.
     */
905 | SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW, |
906 | "Rx flow does not match interface address" ); |
907 | done: |
908 | return result; |
909 | } |
910 | |
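/*
 * Rx-side wrapper around lookup_flow_with_pkt(): drops the reference and
 * returns NULL if the flow is missing, is a listener that should not see
 * this packet, or has already been torn down.
 */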
911 | static struct flow_entry * |
912 | rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, |
913 | struct flow_entry *prev_fe) |
914 | { |
915 | struct flow_entry *fe; |
916 | |
917 | fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe); |
918 | _FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe); |
919 | if (fe == NULL) { |
920 | FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND); |
921 | return NULL; |
922 | } |
923 | |
924 | if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE && |
925 | fe->fe_flags & FLOWENTF_LISTENER) && |
926 | !pkt_is_for_listener(fe, pkt)) { |
927 | FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER); |
        flow_entry_release(&fe);
929 | return NULL; |
930 | } |
931 | |
932 | if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) { |
933 | FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN); |
934 | SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW, |
935 | "Rx flow torn down" ); |
        flow_entry_release(&fe);
937 | fe = NULL; |
938 | } |
939 | |
940 | return fe; |
941 | } |
942 | |
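/*
 * Append an Rx packet (possibly a reassembled fragment chain) to the flow
 * entry's Rx queue; the entry is added to "fes" the first time it gets a
 * packet, otherwise the extra reference taken by the lookup is dropped.
 */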
943 | static inline void |
944 | rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe, |
945 | struct __kern_packet *pkt) |
946 | { |
947 | if (__improbable(pkt->pkt_flow_ip_is_frag)) { |
948 | fe->fe_rx_frag_count++; |
949 | } |
950 | |
951 | /* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */ |
952 | if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) { |
953 | ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0); |
954 | TAILQ_INSERT_TAIL(fes, fe, fe_rx_link); |
955 | KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt); |
956 | } else { |
957 | ASSERT(!TAILQ_EMPTY(fes)); |
958 | KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt); |
        flow_entry_release(&fe);
960 | } |
961 | } |
962 | |
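/*
 * Tx counterpart of rx_flow_batch_packet(): record fragment-continuation
 * state for the flow and append the packet to its Tx queue.
 */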
963 | static void |
964 | tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe, |
965 | struct __kern_packet *pkt) |
966 | { |
967 | /* record frag continuation */ |
968 | if (__improbable(pkt->pkt_flow_ip_is_first_frag)) { |
969 | ASSERT(pkt->pkt_flow_ip_is_frag); |
970 | fe->fe_tx_is_cont_frag = true; |
971 | fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id; |
972 | } else if (__probable(!pkt->pkt_flow_ip_is_frag)) { |
973 | fe->fe_tx_is_cont_frag = false; |
974 | fe->fe_tx_frag_id = 0; |
975 | } |
976 | |
977 | if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) { |
978 | ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0); |
979 | TAILQ_INSERT_TAIL(fes, fe, fe_tx_link); |
980 | KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt); |
981 | } else { |
982 | ASSERT(!TAILQ_EMPTY(fes)); |
983 | KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt); |
        flow_entry_release(&fe);
985 | } |
986 | } |
987 | |
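/*
 * Detach up to n_pkts_max packets from the device Rx ring slots in
 * [ckr_khead, ckr_rhead) into "pktq", dropping zero-length or
 * QUM_F_DROPPED packets along the way, and advance the ring pointers.
 */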
988 | static inline void |
989 | fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
990 | uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes) |
991 | { |
992 | uint32_t n_pkts = 0; |
993 | slot_idx_t idx, idx_end; |
994 | idx = r->ckr_khead; |
995 | idx_end = r->ckr_rhead; |
996 | |
997 | ASSERT(KPKTQ_EMPTY(pktq)); |
998 | *n_bytes = 0; |
999 | for (; n_pkts < n_pkts_max && idx != idx_end; |
        idx = SLOT_NEXT(idx, r->ckr_lim)) {
1001 | struct __kern_slot_desc *ksd = KR_KSD(r, idx); |
1002 | struct __kern_packet *pkt = ksd->sd_pkt; |
1003 | |
1004 | ASSERT(pkt->pkt_nextpkt == NULL); |
        KR_SLOT_DETACH_METADATA(r, ksd);
1006 | |
1007 | _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags, |
1008 | pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func); |
1009 | if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0)) |
1010 | || (pkt->pkt_length == 0)) { |
1011 | FSW_STATS_INC(FSW_STATS_DROP); |
1012 | pp_free_packet_single(pkt); |
1013 | continue; |
1014 | } |
1015 | n_pkts++; |
1016 | *n_bytes += pkt->pkt_length; |
1017 | |
1018 | KPKTQ_ENQUEUE(pktq, pkt); |
1019 | } |
1020 | r->ckr_khead = idx; |
    r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1022 | } |
1023 | |
1024 | /* |
1025 | * This is only for estimating how many packets each GSO packet will need. |
1026 | * The number does not need to be exact because any leftover packets allocated |
1027 | * will be freed. |
1028 | */ |
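/*
 * Worked example with hypothetical numbers: a 31,440-byte IPv4 TSO packet
 * (20-byte IP + 20-byte TCP headers) with an MSS of 1448 yields
 * SK_ROUNDUP(31400, 1448) / 1448 = 22 estimated packets.
 */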
1029 | static uint32_t |
1030 | estimate_gso_pkts(struct __kern_packet *pkt) |
1031 | { |
1032 | packet_tso_flags_t tso_flags; |
1033 | uint16_t mss; |
1034 | uint32_t n_pkts = 0, total_hlen = 0, total_len = 0; |
1035 | |
1036 | tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS; |
1037 | mss = pkt->pkt_proto_seg_sz; |
1038 | |
1039 | if (tso_flags == PACKET_TSO_IPV4) { |
1040 | total_hlen = sizeof(struct ip) + sizeof(struct tcphdr); |
1041 | } else if (tso_flags == PACKET_TSO_IPV6) { |
1042 | total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); |
1043 | } |
1044 | if (total_hlen != 0 && mss != 0) { |
1045 | total_len = pkt->pkt_length; |
1046 | n_pkts = (uint32_t) |
1047 | (SK_ROUNDUP((total_len - total_hlen), mss) / mss); |
1048 | } |
1049 | DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags, |
1050 | uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss, |
1051 | uint32_t, n_pkts); |
1052 | return n_pkts; |
1053 | } |
1054 | |
1055 | /* |
1056 | * This function retrieves a chain of packets of the same type only |
1057 | * (GSO or non-GSO). |
1058 | */ |
1059 | static inline void |
1060 | fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw, |
1061 | struct __kern_channel_ring *r, uint32_t n_pkts_max, |
1062 | struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate) |
1063 | { |
1064 | uint32_t n_pkts = 0; |
1065 | slot_idx_t idx, idx_end; |
1066 | idx = r->ckr_khead; |
1067 | idx_end = r->ckr_rhead; |
1068 | struct nexus_vp_adapter *vpna = VPNA(KRNA(r)); |
1069 | boolean_t gso_enabled, gso_required; |
1070 | uint32_t gso_pkts; |
1071 | |
1072 | gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW); |
1073 | ASSERT(KPKTQ_EMPTY(pktq)); |
1074 | *n_bytes = 0; |
1075 | for (; n_pkts < n_pkts_max && |
1076 | (!gso_enabled || fsw_gso_batch == 0 || |
1077 | *gso_pkts_estimate < fsw_gso_batch) && |
        idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
1079 | struct __kern_slot_desc *ksd = KR_KSD(r, idx); |
1080 | struct __kern_packet *pkt = ksd->sd_pkt; |
1081 | |
1082 | ASSERT(pkt->pkt_nextpkt == NULL); |
1083 | |
1084 | _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags, |
1085 | pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func); |
1086 | if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0)) |
1087 | || (pkt->pkt_length == 0)) { |
            KR_SLOT_DETACH_METADATA(r, ksd);
1089 | FSW_STATS_INC(FSW_STATS_DROP); |
1090 | pp_free_packet_single(pkt); |
1091 | continue; |
1092 | } |
1093 | if (gso_enabled) { |
1094 | gso_pkts = estimate_gso_pkts(pkt); |
1095 | |
1096 | /* |
1097 | * We use the first packet to determine what |
1098 | * type the subsequent ones need to be (GSO or |
1099 | * non-GSO). |
1100 | */ |
1101 | if (n_pkts == 0) { |
1102 | gso_required = (gso_pkts != 0); |
1103 | } else { |
1104 | if (gso_required != (gso_pkts != 0)) { |
1105 | break; |
1106 | } |
1107 | } |
1108 | *gso_pkts_estimate += gso_pkts; |
1109 | } |
        KR_SLOT_DETACH_METADATA(r, ksd);
1111 | if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) { |
            __packet_set_tx_nx_port(SK_PKT2PH(pkt),
                vpna->vpna_nx_port, vpna->vpna_gencnt);
1114 | } |
1115 | n_pkts++; |
1116 | *n_bytes += pkt->pkt_length; |
1117 | KPKTQ_ENQUEUE(pktq, pkt); |
1118 | } |
1119 | r->ckr_khead = idx; |
    r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
1121 | DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw, |
1122 | ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes, |
1123 | uint32_t, *gso_pkts_estimate); |
1124 | } |
1125 | |
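/*
 * Attach as many packets from "pktq" as there are free slots in the
 * destination Rx ring, publish the new tail, and notify the channel.
 * Whatever could not be enqueued stays in "pktq" for the caller to drop.
 */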
1126 | static void |
1127 | fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
1128 | struct pktq *pktq) |
1129 | { |
1130 | #pragma unused(fsw) |
1131 | struct __kern_packet *pkt; |
1132 | struct __kern_quantum *kqum; |
1133 | uint32_t kr_space_avail = 0; |
1134 | uint32_t n, n_pkts = 0, n_bytes = 0; |
1135 | slot_idx_t idx = 0, idx_start = 0, idx_end = 0; |
1136 | |
1137 | kr_enter(r, TRUE); |
1138 | |
1139 | idx_start = r->ckr_ktail; |
    kr_space_avail = kr_available_slots_rxring(r);
1141 | _FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func); |
1142 | n = MIN(kr_space_avail, KPKTQ_LEN(pktq)); |
1143 | _FSW_INJECT_ERROR(41, n, 0, null_func); |
    idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);
1145 | |
1146 | idx = idx_start; |
1147 | while (idx != idx_end) { |
1148 | KPKTQ_DEQUEUE(pktq, pkt); |
1149 | kqum = SK_PTR_ADDR_KQUM(pkt); |
1150 | kqum->qum_qflags |= QUM_F_FINALIZED; |
1151 | n_pkts++; |
1152 | n_bytes += pkt->pkt_length; |
        KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
1154 | if (__improbable(pkt->pkt_trace_id != 0)) { |
1155 | KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id); |
1156 | KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id); |
1157 | } |
        idx = SLOT_NEXT(idx, r->ckr_lim);
1159 | } |
1160 | |
    kr_update_stats(r, n_pkts, n_bytes);
1162 | |
1163 | /* |
1164 | * ensure slot attachments are visible before updating the |
1165 | * tail pointer |
1166 | */ |
1167 | os_atomic_thread_fence(seq_cst); |
1168 | |
1169 | r->ckr_ktail = idx_end; |
1170 | |
1171 | kr_exit(r); |
1172 | |
1173 | r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH); |
1174 | |
1175 | SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts" , |
1176 | r->ckr_name, n_pkts); |
1177 | } |
1178 | |
1179 | static void |
1180 | pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq) |
1181 | { |
1182 | ASSERT(KPKTQ_EMPTY(pktq)); |
1183 | |
1184 | for (uint32_t i = 0; i < n_pkts; i++) { |
1185 | struct __kern_packet *pkt = pkts[i]; |
1186 | ASSERT(pkt->pkt_nextpkt == NULL); |
1187 | KPKTQ_ENQUEUE(pktq, pkt); |
1188 | } |
1189 | } |
1190 | |
1191 | /* |
1192 | * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c. |
1193 | */ |
1194 | SK_NO_INLINE_ATTRIBUTE |
1195 | static void |
1196 | convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq, |
1197 | struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes) |
1198 | { |
1199 | uint32_t tot_cnt; |
1200 | unsigned int num_segs = 1; |
1201 | struct mbuf *mhead, *head = NULL, *tail = NULL, **tailp = &head; |
1202 | uint32_t mhead_cnt, mhead_bufsize; |
1203 | uint32_t mhead_waste = 0; |
1204 | uint32_t mcnt = 0, mbytes = 0; |
1205 | uint32_t largest, max_pkt_len; |
1206 | struct __kern_packet *pkt; |
1207 | struct kern_pbufpool *pp; |
1208 | |
1209 | tot_cnt = KPKTQ_LEN(pktq); |
1210 | ASSERT(tot_cnt > 0); |
1211 | mhead_cnt = tot_cnt; |
1212 | |
    /*
     * Opportunistically batch-allocate the mbufs based on the largest
     * packet size we've seen in the recent past. Note that we reset
     * fsw_rx_largest_size below if we notice that we're under-utilizing
     * the allocated buffers (thus disabling this batch allocation).
     */
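    /*
     * The tiers below correspond to the standard mbuf cluster sizes
     * (2 KB, 4 KB and 16 KB), plus a two-cluster 2x16 KB case for
     * anything larger, up to 32 KB.
     */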
1219 | largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */ |
1220 | if (__probable(largest != 0)) { |
1221 | if (largest <= MCLBYTES) { |
1222 | mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES, |
1223 | &num_segs, M_NOWAIT, 1, 0); |
1224 | mhead_bufsize = MCLBYTES; |
1225 | } else if (largest <= MBIGCLBYTES) { |
1226 | mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES, |
1227 | &num_segs, M_NOWAIT, 1, 0); |
1228 | mhead_bufsize = MBIGCLBYTES; |
1229 | } else if (largest <= M16KCLBYTES) { |
1230 | mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES, |
1231 | &num_segs, M_NOWAIT, 1, 0); |
1232 | mhead_bufsize = M16KCLBYTES; |
1233 | } else if (largest <= M16KCLBYTES * 2) { |
1234 | num_segs = 2; |
1235 | mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2, |
1236 | &num_segs, M_NOWAIT, 1, 0); |
1237 | mhead_bufsize = M16KCLBYTES * 2; |
1238 | } else { |
1239 | mhead = NULL; |
1240 | mhead_bufsize = mhead_cnt = 0; |
1241 | } |
1242 | } else { |
1243 | mhead = NULL; |
1244 | mhead_bufsize = mhead_cnt = 0; |
1245 | } |
1246 | DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize, |
1247 | uint32_t, mhead_cnt, uint32_t, tot_cnt); |
1248 | |
1249 | pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp); |
1250 | max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags; |
1251 | |
1252 | KPKTQ_FOREACH(pkt, pktq) { |
1253 | uint32_t tot_len, len; |
1254 | uint16_t pad, llhlen, iphlen; |
1255 | boolean_t do_cksum_rx; |
1256 | struct mbuf *m; |
1257 | int error; |
1258 | |
1259 | llhlen = pkt->pkt_l2_len; |
1260 | len = pkt->pkt_length; |
1261 | if (__improbable(len > max_pkt_len || llhlen > len)) { |
1262 | DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw, |
1263 | struct __kern_packet *, pkt); |
1264 | FSW_STATS_INC(FSW_STATS_DROP); |
1265 | FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN); |
1266 | continue; |
1267 | } |
1268 | /* begin payload on 32-bit boundary; figure out the padding */ |
1269 | pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen; |
1270 | tot_len = pad + len; |
1271 | |
1272 | /* remember largest packet size */ |
1273 | if (__improbable(largest < tot_len)) { |
1274 | largest = MAX(tot_len, MCLBYTES); |
1275 | } |
1276 | |
1277 | /* |
1278 | * If the above batch allocation returned partial |
1279 | * success, we try a blocking allocation here again. |
1280 | */ |
1281 | m = mhead; |
1282 | if (__improbable(m == NULL || tot_len > mhead_bufsize)) { |
1283 | ASSERT(mhead != NULL || mhead_cnt == 0); |
1284 | num_segs = 1; |
1285 | if (tot_len > M16KCLBYTES) { |
1286 | num_segs = 0; |
1287 | } |
            if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
                &num_segs, &m)) != 0) {
1290 | DTRACE_SKYWALK2(bad__len, |
1291 | struct nx_flowswitch *, fsw, |
1292 | struct __kern_packet *, pkt); |
1293 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF); |
1294 | FSW_STATS_INC(FSW_STATS_DROP); |
1295 | continue; |
1296 | } |
1297 | } else { |
1298 | mhead = m->m_nextpkt; |
1299 | m->m_nextpkt = NULL; |
1300 | ASSERT(mhead_cnt != 0); |
1301 | --mhead_cnt; |
1302 | |
1303 | /* check if we're underutilizing large buffers */ |
1304 | if (__improbable(mhead_bufsize > MCLBYTES && |
1305 | tot_len < (mhead_bufsize >> 1))) { |
1306 | ++mhead_waste; |
1307 | } |
            /*
             * Clean up the unused mbuf.
             * Only needed when we pre-allocate 2x16K mbufs.
             */
            if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) {
                ASSERT(mhead_bufsize == 2 * M16KCLBYTES);
                struct mbuf *m_extra = m->m_next;
1315 | ASSERT(m_extra != NULL); |
1316 | ASSERT(m_extra->m_len == 0); |
1317 | ASSERT(M_SIZE(m_extra) == M16KCLBYTES); |
1318 | m->m_next = NULL; |
1319 | m_freem(m_extra); |
1320 | FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF); |
1321 | } |
1322 | } |
1323 | m->m_data += pad; |
1324 | m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *); |
1325 | |
1326 | /* don't include IP header from partial sum */ |
1327 | if (__probable((pkt->pkt_qum_qflags & |
1328 | QUM_F_FLOW_CLASSIFIED) != 0)) { |
1329 | iphlen = pkt->pkt_flow_ip_hlen; |
1330 | do_cksum_rx = sk_cksum_rx; |
1331 | } else { |
1332 | iphlen = 0; |
1333 | do_cksum_rx = FALSE; |
1334 | } |
1335 | |
1336 | fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt), |
1337 | pkt->pkt_headroom, m, 0, len, do_cksum_rx, |
1338 | llhlen + iphlen); |
1339 | |
1340 | FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF); |
1341 | if (do_cksum_rx) { |
1342 | FSW_STATS_INC(FSW_STATS_RX_COPY_SUM); |
1343 | } |
1344 | #if DEBUG || DEVELOPMENT |
1345 | if (__improbable(pkt_trailers > 0)) { |
1346 | (void) pkt_add_trailers_mbuf(m, llhlen + iphlen); |
1347 | } |
1348 | #endif /* DEBUG || DEVELOPMENT */ |
1349 | m_adj(m, llhlen); |
1350 | |
1351 | m->m_pkthdr.rcvif = fsw->fsw_ifp; |
1352 | if (__improbable((pkt->pkt_link_flags & |
1353 | PKT_LINKF_ETHFCS) != 0)) { |
1354 | m->m_flags |= M_HASFCS; |
1355 | } |
1356 | if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) { |
1357 | m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT; |
1358 | } |
1359 | ASSERT(m->m_nextpkt == NULL); |
1360 | tail = m; |
1361 | *tailp = m; |
1362 | tailp = &m->m_nextpkt; |
1363 | mcnt++; |
1364 | mbytes += m_pktlen(m); |
1365 | } |
1366 | /* free any leftovers */ |
1367 | if (__improbable(mhead != NULL)) { |
1368 | DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt); |
1369 | ASSERT(mhead_cnt != 0); |
1370 | (void) m_freem_list(mhead); |
1371 | mhead = NULL; |
1372 | mhead_cnt = 0; |
1373 | } |
1374 | |
1375 | /* reset if most packets (>50%) are smaller than our batch buffers */ |
1376 | if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) { |
1377 | DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw, |
1378 | struct flow_entry *, NULL, uint32_t, mhead_waste, |
1379 | uint32_t, tot_cnt); |
1380 | largest = 0; |
1381 | } |
1382 | |
1383 | if (largest != fsw->fsw_rx_largest_size) { |
1384 | os_atomic_store(&fsw->fsw_rx_largest_size, largest, release); |
1385 | } |
1386 | |
1387 | pp_free_pktq(pktq); |
1388 | *m_headp = head; |
1389 | *m_tailp = tail; |
1390 | *cnt = mcnt; |
1391 | *bytes = mbytes; |
1392 | } |
1393 | |
1394 | /* |
1395 | * This function only extracts the mbuf from the packet. The caller frees |
1396 | * the packet. |
1397 | */ |
1398 | static inline struct mbuf * |
1399 | convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
1400 | { |
1401 | struct mbuf *m; |
1402 | struct pkthdr *mhdr; |
1403 | uint16_t llhlen; |
1404 | |
1405 | m = pkt->pkt_mbuf; |
1406 | ASSERT(m != NULL); |
1407 | |
1408 | llhlen = pkt->pkt_l2_len; |
1409 | if (llhlen > pkt->pkt_length) { |
1410 | m_freem(m); |
1411 | KPKT_CLEAR_MBUF_DATA(pkt); |
1412 | DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw, |
1413 | struct __kern_packet *, pkt); |
1414 | FSW_STATS_INC(FSW_STATS_DROP); |
1415 | FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN); |
1416 | return NULL; |
1417 | } |
1418 | mhdr = &m->m_pkthdr; |
1419 | if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 && |
1420 | PACKET_HAS_PARTIAL_CHECKSUM(pkt)) { |
1421 | mhdr->csum_flags &= ~CSUM_RX_FLAGS; |
1422 | mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL); |
1423 | mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off; |
1424 | mhdr->csum_rx_val = pkt->pkt_csum_rx_value; |
1425 | } |
1426 | #if DEBUG || DEVELOPMENT |
1427 | uint32_t extra = 0; |
1428 | if (__improbable(pkt_trailers > 0)) { |
1429 | extra = pkt_add_trailers_mbuf(m, llhlen); |
1430 | } |
1431 | #endif /* DEBUG || DEVELOPMENT */ |
1432 | m_adj(m, llhlen); |
1433 | ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra)); |
1434 | KPKT_CLEAR_MBUF_DATA(pkt); |
1435 | return m; |
1436 | } |
1437 | |
1438 | SK_NO_INLINE_ATTRIBUTE |
1439 | static void |
1440 | convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq, |
1441 | struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes) |
1442 | { |
1443 | struct __kern_packet *pkt; |
1444 | struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head; |
1445 | uint32_t c = 0, b = 0; |
1446 | |
1447 | KPKTQ_FOREACH(pkt, pktq) { |
1448 | m = convert_compat_pkt_to_mbuf(fsw, pkt); |
1449 | if (__improbable(m == NULL)) { |
1450 | continue; |
1451 | } |
1452 | tail = m; |
1453 | *tailp = m; |
1454 | tailp = &m->m_nextpkt; |
1455 | c++; |
1456 | b += m_pktlen(m); |
1457 | } |
1458 | pp_free_pktq(pktq); |
1459 | *m_head = head; |
1460 | *m_tail = tail; |
1461 | *cnt = c; |
1462 | *bytes = b; |
1463 | } |
1464 | |
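/*
 * Hand a chain of mbufs up to the regular kernel networking stack via the
 * DLIL input path, along with the packet/byte counts for interface stats.
 */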
1465 | void |
1466 | fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail, |
1467 | uint32_t cnt, uint32_t bytes) |
1468 | { |
1469 | struct ifnet_stat_increment_param s; |
1470 | |
    bzero(&s, sizeof(s));
1472 | s.packets_in = cnt; |
1473 | s.bytes_in = bytes; |
1474 | dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL); |
1475 | } |
1476 | |
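/*
 * Deliver host-bound Rx packets: if the interface registered a flowswitch
 * Rx callback, the callback consumes the packet queue directly; otherwise
 * the packets (all compat or all native) are converted to mbufs and sent
 * up through fsw_host_sendup().
 */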
1477 | void |
1478 | fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq) |
1479 | { |
1480 | struct mbuf *m_head = NULL, *m_tail = NULL; |
1481 | uint32_t cnt = 0, bytes = 0; |
1482 | ifnet_fsw_rx_cb_t cb; |
1483 | void *cb_arg; |
1484 | boolean_t compat; |
1485 | |
1486 | ASSERT(!KPKTQ_EMPTY(pktq)); |
    if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
1488 | ASSERT(cb != NULL); |
1489 | ASSERT(cb_arg != NULL); |
1490 | /* callback consumes packets */ |
1491 | (*cb)(cb_arg, pktq); |
        ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
1493 | return; |
1494 | } |
1495 | |
1496 | /* All packets in the pktq must have the same type */ |
1497 | compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0); |
1498 | if (compat) { |
        convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
            &bytes);
    } else {
        convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
            &bytes);
1504 | } |
1505 | if (__improbable(m_head == NULL)) { |
1506 | DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw); |
1507 | return; |
1508 | } |
    fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
1510 | } |
1511 | |
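/*
 * Enqueue what fits into the destination ring; anything left over is
 * counted as a ring-full drop and freed.
 */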
1512 | void |
1513 | fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw, |
1514 | struct __kern_channel_ring *r, struct pktq *pktq) |
1515 | { |
1516 | fsw_ring_enqueue_pktq(fsw, r, pktq); |
1517 | FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq)); |
1518 | dp_drop_pktq(fsw, pktq); |
1519 | } |
1520 | |
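/*
 * Resolve the nexus adapter backing the flow's destination nexus port,
 * returning NULL (with stats) if the port is the dev/host port or is no
 * longer valid, active, or has been defuncted.
 */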
1521 | static struct nexus_adapter * |
1522 | flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1523 | { |
1524 | struct kern_nexus *nx = fsw->fsw_nx; |
1525 | struct nexus_adapter *na = NULL; |
1526 | nexus_port_t port = fe->fe_nx_port; |
1527 | |
1528 | if (port == FSW_VP_DEV || port == FSW_VP_HOST) { |
1529 | SK_ERR("dev or host ports have no NA" ); |
1530 | return NULL; |
1531 | } |
1532 | |
1533 | if (__improbable(!nx_port_is_valid(nx, port))) { |
1534 | SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid" , |
1535 | if_name(fsw->fsw_ifp), port); |
1536 | return NULL; |
1537 | } |
1538 | |
1539 | na = nx_port_get_na(nx, port); |
1540 | if (__improbable(na == NULL)) { |
1541 | FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID); |
1542 | SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid" , |
1543 | if_name(fsw->fsw_ifp), port); |
1544 | return NULL; |
1545 | } |
1546 | |
1547 | if (__improbable(!NA_IS_ACTIVE(na))) { |
1548 | FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE); |
1549 | SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active" , |
1550 | if_name(fsw->fsw_ifp), port); |
1551 | return NULL; |
1552 | } |
1553 | |
1554 | if (__improbable(nx_port_is_defunct(nx, port))) { |
1555 | FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT); |
		SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted",
1557 | if_name(fsw->fsw_ifp), port); |
1558 | return NULL; |
1559 | } |
1560 | |
1561 | return na; |
1562 | } |
1563 | |
1564 | static inline struct __kern_channel_ring * |
1565 | flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx) |
1566 | { |
1567 | struct nexus_vp_adapter *na = NULL; |
1568 | struct __kern_channel_ring *r = NULL; |
1569 | |
1570 | na = VPNA(flow_get_na(fsw, fe)); |
1571 | if (__improbable(na == NULL)) { |
1572 | return NULL; |
1573 | } |
1574 | |
1575 | switch (txrx) { |
1576 | case NR_RX: |
1577 | r = &na->vpna_up.na_rx_rings[0]; |
1578 | break; |
1579 | case NR_TX: |
1580 | r = &na->vpna_up.na_tx_rings[0]; |
1581 | break; |
	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
1585 | } |
1586 | |
1587 | if (__improbable(KR_DROP(r))) { |
1588 | FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE); |
		SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %s 0x%llx drop mode",
		    r->ckr_name, SK_KVA(r));
1591 | return NULL; |
1592 | } |
1593 | |
1594 | ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET); |
1595 | |
1596 | #if (DEVELOPMENT || DEBUG) |
1597 | if (r != NULL) { |
1598 | _FSW_INJECT_ERROR(4, r, NULL, null_func); |
1599 | } |
1600 | #endif /* DEVELOPMENT || DEBUG */ |
1601 | |
1602 | return r; |
1603 | } |
1604 | |
1605 | struct __kern_channel_ring * |
1606 | fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1607 | { |
	return flow_get_ring(fsw, fe, NR_RX);
1609 | } |
1610 | |
1611 | static inline struct __kern_channel_ring * |
1612 | fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1613 | { |
	return flow_get_ring(fsw, fe, NR_TX);
1615 | } |
1616 | |
1617 | static bool |
1618 | dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1619 | { |
1620 | struct flow_route *fr = fe->fe_route; |
1621 | struct ifnet *ifp = fsw->fsw_ifp; |
1622 | |
1623 | if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) && |
1624 | !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) && |
1625 | fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt && |
1626 | !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) { |
1627 | /* |
1628 | * The source address is no longer around; we want this |
1629 | * flow to be nonviable, but that requires holding the lock |
1630 | * as writer (which isn't the case now.) Indicate that |
1631 | * we need to finalize the nonviable later down below. |
1632 | * |
1633 | * We also request that the flow route be re-configured, |
1634 | * if this is a connected mode flow. |
1635 | * |
1636 | */ |
1637 | if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) { |
1638 | /* |
1639 | * fsw_pending_nonviable is a hint for reaper thread; |
1640 | * due to the fact that setting fe_want_nonviable and |
1641 | * incrementing fsw_pending_nonviable counter is not |
1642 | * atomic, let the increment happen first, and the |
1643 | * thread losing the CAS does decrement. |
1644 | */ |
1645 | os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed); |
1646 | if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) { |
1647 | fsw_reap_sched(fsw); |
1648 | } else { |
1649 | os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed); |
1650 | } |
1651 | } |
1652 | if (fr != NULL) { |
1653 | os_atomic_inc(&fr->fr_want_configure, relaxed); |
1654 | } |
1655 | } |
1656 | |
1657 | /* if flow was (or is going to be) marked as nonviable, drop it */ |
1658 | if (__improbable(fe->fe_want_nonviable || |
1659 | (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) { |
		SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable",
1661 | SK_KVA(fe)); |
1662 | return false; |
1663 | } |
1664 | return true; |
1665 | } |
1666 | |
1667 | bool |
1668 | dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
1669 | { |
1670 | bool okay; |
1671 | okay = dp_flow_route_process(fsw, fe); |
1672 | #if (DEVELOPMENT || DEBUG) |
1673 | if (okay) { |
1674 | _FSW_INJECT_ERROR(5, okay, false, null_func); |
1675 | } |
1676 | #endif /* DEVELOPMENT || DEBUG */ |
1677 | |
1678 | return okay; |
1679 | } |
1680 | |
1681 | void |
1682 | dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
1683 | uint32_t flags) |
1684 | { |
1685 | #pragma unused(flags) |
1686 | struct pktq dpkts; /* dst pool alloc'ed packets */ |
1687 | struct pktq disposed_pkts; /* done src packets */ |
1688 | struct pktq dropped_pkts; /* dropped src packets */ |
1689 | struct pktq transferred_pkts; /* dst packet ready for ring */ |
1690 | struct __kern_packet *pkt, *tpkt; |
1691 | struct kern_pbufpool *dpp; |
1692 | uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq); |
1693 | uint64_t buf_array[RX_BUFLET_BATCH_COUNT]; |
1694 | uint16_t buf_array_iter = 0; |
1695 | uint32_t cnt, buf_cnt = 0; |
1696 | int err; |
1697 | |
1698 | KPKTQ_INIT(&dpkts); |
1699 | KPKTQ_INIT(&dropped_pkts); |
1700 | KPKTQ_INIT(&disposed_pkts); |
1701 | KPKTQ_INIT(&transferred_pkts); |
1702 | |
1703 | if (__improbable(!dp_flow_rx_route_process(fsw, fe))) { |
		SK_ERR("Rx route bad");
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
1706 | FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts); |
1707 | goto done; |
1708 | } |
1709 | |
1710 | if (fe->fe_nx_port == FSW_VP_HOST) { |
1711 | /* |
1712 | * The host ring does not exist anymore so we can't take |
1713 | * the enqueue path below. This path should only be hit |
1714 | * for the rare tcp fragmentation case. |
1715 | */ |
		fsw_host_rx(fsw, &fe->fe_rx_pktq);
1717 | return; |
1718 | } |
1719 | |
1720 | /* find the ring */ |
1721 | struct __kern_channel_ring *r; |
1722 | r = fsw_flow_get_rx_ring(fsw, fe); |
1723 | if (__improbable(r == NULL)) { |
		fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
1725 | goto done; |
1726 | } |
1727 | |
1728 | /* snoop before L2 is stripped */ |
1729 | if (__improbable(pktap_total_tap_count != 0)) { |
1730 | fsw_snoop(fsw, fe, true); |
1731 | } |
1732 | |
1733 | dpp = r->ckr_pp; |
1734 | /* batch allocate enough packets */ |
1735 | err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL, |
1736 | SKMEM_NOSLEEP); |
1737 | if (__improbable(err == ENOMEM)) { |
1738 | ASSERT(KPKTQ_EMPTY(&dpkts)); |
1739 | KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq); |
1740 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
		SK_ERR("failed to alloc %u pkts for kr %s, 0x%llx", n_pkts,
		    r->ckr_name, SK_KVA(r));
1743 | goto done; |
1744 | } |
1745 | |
1746 | /* |
1747 | * estimate total number of buflets for the packet chain. |
1748 | */ |
1749 | cnt = howmany(fe->fe_rx_pktq_bytes, PP_BUF_SIZE_DEF(dpp)); |
1750 | if (cnt > n_pkts) { |
1751 | ASSERT(dpp->pp_max_frags > 1); |
1752 | cnt -= n_pkts; |
1753 | buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt); |
		err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
		    SKMEM_NOSLEEP, false);
1756 | if (__improbable(buf_cnt == 0)) { |
1757 | KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq); |
1758 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
			SK_ERR("failed to alloc %d buflets (err %d) for kr %s, "
			    "0x%llx", cnt, err, r->ckr_name, SK_KVA(r));
1761 | goto done; |
1762 | } |
1763 | err = 0; |
1764 | } |
1765 | |
1766 | /* extra processing for user flow */ |
1767 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) { |
1768 | err = 0; |
1769 | KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt); |
1770 | if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) { |
1771 | fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen; |
1772 | } else { |
1773 | fe->fe_rx_pktq_bytes = 0; |
1774 | } |
1775 | err = flow_pkt_track(fe, pkt, true); |
1776 | _FSW_INJECT_ERROR(33, err, EPROTO, null_func); |
1777 | if (__improbable(err != 0)) { |
			SK_ERR("flow_pkt_track failed (err %d)", err);
1779 | FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR); |
1780 | /* if need to trigger RST */ |
1781 | if (err == ENETRESET) { |
				flow_track_abort_tcp(fe, pkt, NULL);
1783 | } |
1784 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
1785 | continue; |
1786 | } |
1787 | |
1788 | /* transfer to dpkt */ |
1789 | if (pkt->pkt_qum.qum_pp != dpp) { |
1790 | struct __kern_buflet *bprev, *bnew; |
1791 | struct __kern_packet *dpkt = NULL; |
1792 | uint32_t n_bufs, i; |
1793 | |
1794 | KPKTQ_DEQUEUE(&dpkts, dpkt); |
1795 | if (__improbable(dpkt == NULL)) { |
1796 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT); |
1797 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
1798 | continue; |
1799 | } |
1800 | n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp)); |
1801 | n_bufs--; |
1802 | for (i = 0; i < n_bufs; i++) { |
1803 | if (__improbable(buf_cnt == 0)) { |
1804 | ASSERT(dpp->pp_max_frags > 1); |
1805 | buf_array_iter = 0; |
1806 | cnt = howmany(fe->fe_rx_pktq_bytes, |
1807 | PP_BUF_SIZE_DEF(dpp)); |
1808 | n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq); |
1809 | if (cnt >= n_pkts) { |
1810 | cnt -= n_pkts; |
1811 | } else { |
1812 | cnt = 0; |
1813 | } |
1814 | cnt += (n_bufs - i); |
1815 | buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, |
1816 | cnt); |
1817 | cnt = buf_cnt; |
					err = pp_alloc_buflet_batch(dpp,
					    buf_array, &buf_cnt,
					    SKMEM_NOSLEEP, false);
1821 | if (__improbable(buf_cnt == 0)) { |
1822 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT); |
1823 | KPKTQ_ENQUEUE(&dropped_pkts, |
1824 | pkt); |
1825 | pkt = NULL; |
1826 | pp_free_packet_single(dpkt); |
1827 | dpkt = NULL; |
					SK_ERR("failed to alloc %d "
					    "buflets (err %d) for "
					    "kr %s, 0x%llx", cnt, err,
					    r->ckr_name, SK_KVA(r));
1832 | break; |
1833 | } |
1834 | err = 0; |
1835 | } |
1836 | ASSERT(buf_cnt != 0); |
1837 | if (i == 0) { |
1838 | PKT_GET_FIRST_BUFLET(dpkt, 1, bprev); |
1839 | } |
1840 | bnew = (kern_buflet_t)buf_array[buf_array_iter]; |
1841 | buf_array[buf_array_iter] = 0; |
1842 | buf_array_iter++; |
1843 | buf_cnt--; |
1844 | VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt), |
1845 | bprev, bnew) == 0); |
1846 | bprev = bnew; |
1847 | } |
1848 | if (__improbable(err != 0)) { |
1849 | continue; |
1850 | } |
			err = copy_packet_from_dev(fsw, pkt, dpkt);
1852 | _FSW_INJECT_ERROR(43, err, EINVAL, null_func); |
1853 | if (__improbable(err != 0)) { |
				SK_ERR("copy packet failed (err %d)", err);
1855 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
1856 | pp_free_packet_single(dpkt); |
1857 | dpkt = NULL; |
1858 | continue; |
1859 | } |
1860 | KPKTQ_ENQUEUE(&disposed_pkts, pkt); |
1861 | pkt = dpkt; |
1862 | } |
1863 | _UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid); |
1864 | _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid); |
1865 | pkt->pkt_policy_id = fe->fe_policy_id; |
1866 | pkt->pkt_skip_policy_id = fe->fe_skip_policy_id; |
1867 | pkt->pkt_transport_protocol = fe->fe_transport_protocol; |
1868 | if (pkt->pkt_bufs_cnt > 1) { |
1869 | pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP; |
1870 | pkt->pkt_seg_cnt = 1; |
1871 | } |
1872 | KPKTQ_ENQUEUE(&transferred_pkts, pkt); |
1873 | } |
1874 | KPKTQ_FINI(&fe->fe_rx_pktq); |
1875 | KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts); |
1876 | KPKTQ_FINI(&transferred_pkts); |
1877 | |
	fsw_ring_enqueue_tail_drop(fsw, r, &fe->fe_rx_pktq);
1879 | |
1880 | done: |
1881 | /* Free unused buflets */ |
1882 | while (buf_cnt > 0) { |
1883 | pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter])); |
1884 | buf_array[buf_array_iter] = 0; |
1885 | buf_array_iter++; |
1886 | buf_cnt--; |
1887 | } |
	dp_free_pktq(fsw, &dpkts);
	dp_free_pktq(fsw, &disposed_pkts);
	dp_drop_pktq(fsw, &dropped_pkts);
1891 | } |
1892 | |
1893 | static inline void |
1894 | rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
1895 | uint32_t flags) |
1896 | { |
1897 | ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq)); |
1898 | ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0); |
1899 | |
	SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d",
1901 | KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port); |
1902 | |
1903 | /* flow related processing (default, agg, fpd, etc.) */ |
1904 | fe->fe_rx_process(fsw, fe, flags); |
1905 | |
1906 | if (__improbable(fe->fe_want_withdraw)) { |
1907 | fsw_reap_sched(fsw); |
1908 | } |
1909 | |
1910 | KPKTQ_FINI(&fe->fe_rx_pktq); |
1911 | } |
1912 | |
1913 | static inline void |
1914 | dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
1915 | { |
1916 | /* |
1917 | * We only care about wake packets of flows that belong the flow switch |
1918 | * as wake packets for the host stack are handled by the host input |
1919 | * function |
1920 | */ |
1921 | #if (DEBUG || DEVELOPMENT) |
1922 | if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) { |
1923 | /* |
1924 | * This is a one shot command |
1925 | */ |
1926 | fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT; |
1927 | |
1928 | pkt->pkt_pflags |= PKT_F_WAKE_PKT; |
1929 | } |
1930 | #endif /* (DEBUG || DEVELOPMENT) */ |
1931 | if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) { |
		if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
1933 | } |
1934 | } |
1935 | |
1936 | static void |
1937 | _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq) |
1938 | { |
1939 | struct __kern_packet *pkt, *tpkt; |
1940 | struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); |
1941 | struct flow_entry *fe, *prev_fe; |
1942 | sa_family_t af; |
1943 | struct pktq host_pkts, dropped_pkts; |
1944 | int err; |
1945 | |
1946 | KPKTQ_INIT(&host_pkts); |
1947 | KPKTQ_INIT(&dropped_pkts); |
1948 | |
1949 | if (__improbable(FSW_QUIESCED(fsw))) { |
1950 | DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw); |
1951 | KPKTQ_CONCAT(&dropped_pkts, pktq); |
1952 | goto done; |
1953 | } |
1954 | if (__improbable(fsw->fsw_demux == NULL)) { |
1955 | KPKTQ_CONCAT(&dropped_pkts, pktq); |
1956 | goto done; |
1957 | } |
1958 | |
1959 | prev_fe = NULL; |
1960 | KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) { |
1961 | if (__probable(tpkt)) { |
1962 | void *baddr; |
1963 | MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr); |
1964 | SK_PREFETCH(baddr, 0); |
1965 | /* prefetch L3 and L4 flow structs */ |
1966 | SK_PREFETCHW(tpkt->pkt_flow, 0); |
1967 | SK_PREFETCHW(tpkt->pkt_flow, 128); |
1968 | } |
1969 | |
1970 | KPKTQ_REMOVE(pktq, pkt); |
1971 | |
1972 | pkt = rx_prepare_packet(fsw, pkt); |
1973 | |
1974 | af = fsw->fsw_demux(fsw, pkt); |
1975 | if (__improbable(af == AF_UNSPEC)) { |
1976 | KPKTQ_ENQUEUE(&host_pkts, pkt); |
1977 | continue; |
1978 | } |
1979 | |
		err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
1981 | _FSW_INJECT_ERROR(1, err, ENXIO, null_func); |
1982 | if (__improbable(err != 0)) { |
1983 | FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR); |
1984 | KPKTQ_ENQUEUE(&host_pkts, pkt); |
1985 | continue; |
1986 | } |
1987 | |
1988 | if (__improbable(pkt->pkt_flow_ip_is_frag)) { |
1989 | pkt = rx_process_ip_frag(fsw, pkt); |
1990 | if (pkt == NULL) { |
1991 | continue; |
1992 | } |
1993 | } |
1994 | |
1995 | prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe); |
1996 | if (__improbable(fe == NULL)) { |
1997 | KPKTQ_ENQUEUE_LIST(&host_pkts, pkt); |
1998 | continue; |
1999 | } |
2000 | |
2001 | fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen; |
2002 | |
2003 | dp_rx_process_wake_packet(fsw, pkt); |
2004 | |
		rx_flow_batch_packet(&fes, fe, pkt);
2006 | prev_fe = fe; |
2007 | } |
2008 | |
2009 | struct flow_entry *tfe = NULL; |
2010 | TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) { |
		rx_flow_process(fsw, fe, 0);
		TAILQ_REMOVE(&fes, fe, fe_rx_link);
		fe->fe_rx_pktq_bytes = 0;
		fe->fe_rx_frag_count = 0;
		flow_entry_release(&fe);
2016 | } |
2017 | |
2018 | if (!KPKTQ_EMPTY(&host_pkts)) { |
		fsw_host_rx(fsw, &host_pkts);
2020 | } |
2021 | |
2022 | done: |
2023 | dp_drop_pktq(fsw, &dropped_pkts); |
2024 | } |
2025 | |
2026 | #if (DEVELOPMENT || DEBUG) |
2027 | static void |
2028 | fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id, |
2029 | struct __kern_packet *pkt) |
2030 | { |
2031 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id]; |
2032 | |
2033 | lck_mtx_lock_spin(&frt->frt_lock); |
2034 | KPKTQ_ENQUEUE(&frt->frt_pktq, pkt); |
2035 | lck_mtx_unlock(&frt->frt_lock); |
2036 | } |
2037 | |
2038 | static void |
2039 | fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id) |
2040 | { |
2041 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id]; |
2042 | |
2043 | ASSERT(frt->frt_thread != THREAD_NULL); |
2044 | lck_mtx_lock_spin(&frt->frt_lock); |
2045 | ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED))); |
2046 | |
2047 | frt->frt_requests++; |
2048 | if (!(frt->frt_flags & FRT_RUNNING)) { |
2049 | thread_wakeup((caddr_t)frt); |
2050 | } |
2051 | lck_mtx_unlock(&frt->frt_lock); |
2052 | } |
2053 | |
2054 | __attribute__((noreturn)) |
2055 | static void |
2056 | fsw_rps_thread_cont(void *v, wait_result_t w) |
2057 | { |
2058 | struct fsw_rps_thread *frt = v; |
2059 | struct nx_flowswitch *fsw = frt->frt_fsw; |
2060 | |
2061 | lck_mtx_lock(&frt->frt_lock); |
	if (__improbable(w == THREAD_INTERRUPTED ||
2063 | (frt->frt_flags & FRT_TERMINATING) != 0)) { |
2064 | goto terminate; |
2065 | } |
2066 | if (KPKTQ_EMPTY(&frt->frt_pktq)) { |
2067 | goto done; |
2068 | } |
2069 | frt->frt_flags |= FRT_RUNNING; |
2070 | |
2071 | for (;;) { |
2072 | uint32_t requests = frt->frt_requests; |
2073 | struct pktq pkts; |
2074 | |
2075 | KPKTQ_INIT(&pkts); |
2076 | KPKTQ_CONCAT(&pkts, &frt->frt_pktq); |
2077 | lck_mtx_unlock(&frt->frt_lock); |
2078 | |
2079 | sk_protect_t protect; |
2080 | protect = sk_sync_protect(); |
2081 | FSW_RLOCK(fsw); |
2082 | _fsw_receive_locked(fsw, &pkts); |
2083 | FSW_RUNLOCK(fsw); |
2084 | sk_sync_unprotect(protect); |
2085 | |
2086 | lck_mtx_lock(&frt->frt_lock); |
2087 | if ((frt->frt_flags & FRT_TERMINATING) != 0 || |
2088 | requests == frt->frt_requests) { |
2089 | frt->frt_requests = 0; |
2090 | break; |
2091 | } |
2092 | } |
2093 | |
2094 | done: |
2095 | lck_mtx_unlock(&frt->frt_lock); |
2096 | if (!(frt->frt_flags & FRT_TERMINATING)) { |
2097 | frt->frt_flags &= ~FRT_RUNNING; |
2098 | assert_wait(frt, THREAD_UNINT); |
2099 | thread_block_parameter(fsw_rps_thread_cont, frt); |
2100 | __builtin_unreachable(); |
2101 | } else { |
2102 | terminate: |
2103 | LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED); |
2104 | frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING); |
2105 | frt->frt_flags |= FRT_TERMINATED; |
2106 | |
2107 | if (frt->frt_flags & FRT_TERMINATEBLOCK) { |
			thread_wakeup((caddr_t)&frt->frt_thread);
2109 | } |
2110 | lck_mtx_unlock(&frt->frt_lock); |
2111 | |
		SK_D("fsw_rx_%s_%d terminated", if_name(fsw->fsw_ifp),
2113 | frt->frt_idx); |
2114 | |
2115 | /* for the extra refcnt from kernel_thread_start() */ |
2116 | thread_deallocate(current_thread()); |
2117 | /* this is the end */ |
2118 | thread_terminate(current_thread()); |
2119 | /* NOTREACHED */ |
2120 | __builtin_unreachable(); |
2121 | } |
2122 | |
2123 | /* must never get here */ |
2124 | VERIFY(0); |
2125 | /* NOTREACHED */ |
2126 | __builtin_unreachable(); |
2127 | } |
2128 | |
2129 | __attribute__((noreturn)) |
2130 | static void |
2131 | fsw_rps_thread_func(void *v, wait_result_t w) |
2132 | { |
2133 | #pragma unused(w) |
2134 | struct fsw_rps_thread *frt = v; |
2135 | struct nx_flowswitch *fsw = frt->frt_fsw; |
2136 | |
2137 | char thread_name[MAXTHREADNAMESIZE]; |
2138 | bzero(thread_name, sizeof(thread_name)); |
	(void) snprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d",
2140 | if_name(fsw->fsw_ifp), frt->frt_idx); |
2141 | thread_set_thread_name(frt->frt_thread, thread_name); |
	SK_D("%s spawned", thread_name);
2143 | |
2144 | net_thread_marks_push(NET_THREAD_SYNC_RX); |
2145 | assert_wait(frt, THREAD_UNINT); |
2146 | (void) thread_block_parameter(fsw_rps_thread_cont, frt); |
2147 | |
2148 | __builtin_unreachable(); |
2149 | } |
2150 | |
2151 | static void |
2152 | fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i) |
2153 | { |
2154 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i]; |
2155 | uint64_t f = (1 * NSEC_PER_MSEC); |
2156 | uint64_t s = (1000 * NSEC_PER_SEC); |
2157 | uint32_t c = 0; |
2158 | |
2159 | lck_mtx_lock(&frt->frt_lock); |
2160 | frt->frt_flags |= FRT_TERMINATING; |
2161 | |
2162 | while (!(frt->frt_flags & FRT_TERMINATED)) { |
2163 | uint64_t t = 0; |
2164 | nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t); |
2165 | clock_absolutetime_interval_to_deadline(t, &t); |
2166 | ASSERT(t != 0); |
2167 | |
2168 | frt->frt_flags |= FRT_TERMINATEBLOCK; |
2169 | if (!(frt->frt_flags & FRT_RUNNING)) { |
2170 | thread_wakeup_one((caddr_t)frt); |
2171 | } |
2172 | (void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t); |
2173 | lck_mtx_unlock(&frt->frt_lock); |
2174 | thread_block(THREAD_CONTINUE_NULL); |
2175 | lck_mtx_lock(&frt->frt_lock); |
2176 | frt->frt_flags &= ~FRT_TERMINATEBLOCK; |
2177 | } |
2178 | ASSERT(frt->frt_flags & FRT_TERMINATED); |
2179 | lck_mtx_unlock(&frt->frt_lock); |
2180 | frt->frt_thread = THREAD_NULL; |
2181 | } |
2182 | |
2183 | static void |
2184 | fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i) |
2185 | { |
2186 | kern_return_t error; |
2187 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i]; |
2188 | lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr); |
2189 | frt->frt_idx = i; |
2190 | frt->frt_fsw = fsw; |
2191 | error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread); |
2192 | ASSERT(!error); |
2193 | KPKTQ_INIT(&frt->frt_pktq); |
2194 | } |
2195 | |
2196 | int |
2197 | fsw_rps_set_nthreads(struct nx_flowswitch* fsw, uint32_t n) |
2198 | { |
2199 | if (n > FSW_RPS_MAX_NTHREADS) { |
		SK_ERR("rps nthreads %d, max %d", n, FSW_RPS_MAX_NTHREADS);
2201 | return EINVAL; |
2202 | } |
2203 | |
2204 | FSW_WLOCK(fsw); |
2205 | if (n < fsw->fsw_rps_nthreads) { |
2206 | for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) { |
2207 | fsw_rps_thread_join(fsw, i); |
2208 | } |
2209 | fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread, |
2210 | fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, |
2211 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
2212 | } else if (n > fsw->fsw_rps_nthreads) { |
2213 | fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread, |
2214 | fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, |
2215 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
2216 | for (uint32_t i = fsw->fsw_rps_nthreads; i < n; i++) { |
2217 | fsw_rps_thread_spawn(fsw, i); |
2218 | } |
2219 | } |
2220 | fsw->fsw_rps_nthreads = n; |
2221 | FSW_WUNLOCK(fsw); |
2222 | return 0; |
2223 | } |
2224 | |
2225 | static uint32_t |
2226 | get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
2227 | { |
2228 | sa_family_t af = fsw->fsw_demux(fsw, pkt); |
2229 | if (__improbable(af == AF_UNSPEC)) { |
2230 | return 0; |
2231 | } |
2232 | |
2233 | flow_pkt_classify(pkt, fsw->fsw_ifp, af, true); |
2234 | |
2235 | if (__improbable((pkt->pkt_qum_qflags & |
2236 | QUM_F_FLOW_CLASSIFIED) == 0)) { |
2237 | return 0; |
2238 | } |
2239 | |
2240 | struct flow_key key; |
2241 | flow_pkt2key(pkt, true, &key); |
2242 | key.fk_mask = FKMASK_5TUPLE; |
2243 | |
2244 | uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads; |
2245 | |
2246 | return id; |
2247 | } |
2248 | |
2249 | #endif /* !DEVELOPMENT && !DEBUG */ |
2250 | |
2251 | void |
2252 | fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq) |
2253 | { |
2254 | FSW_RLOCK(fsw); |
2255 | #if (DEVELOPMENT || DEBUG) |
2256 | if (fsw->fsw_rps_nthreads != 0) { |
2257 | struct __kern_packet *pkt, *tpkt; |
2258 | bitmap_t map = 0; |
2259 | |
2260 | _CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1); |
2261 | KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) { |
2262 | uint32_t id = get_rps_id(fsw, pkt); |
2263 | KPKTQ_REMOVE(pktq, pkt); |
2264 | fsw_rps_rx(fsw, id, pkt); |
2265 | bitmap_set(&map, id); |
2266 | } |
2267 | for (int i = bitmap_first(&map, 64); i >= 0; |
2268 | i = bitmap_next(&map, i)) { |
2269 | fsw_rps_thread_schedule(fsw, i); |
2270 | } |
2271 | } else |
2272 | #endif /* !DEVELOPMENT && !DEBUG */ |
2273 | { |
2274 | _fsw_receive_locked(fsw, pktq); |
2275 | } |
2276 | FSW_RUNLOCK(fsw); |
2277 | } |
2278 | |
2279 | int |
2280 | fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t * pkts, |
2281 | uint32_t n_pkts) |
2282 | { |
2283 | #pragma unused(handle) |
2284 | struct nx_flowswitch *fsw = handle; |
2285 | struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX]; |
2286 | struct pktq pktq; |
2287 | sk_protect_t protect; |
2288 | uint32_t i; |
2289 | |
2290 | ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX); |
2291 | |
2292 | for (i = 0; i < n_pkts; i++) { |
2293 | ASSERT(pkts[i].pktsched_ptype == QP_PACKET); |
2294 | ASSERT(pkts[i].pktsched_pkt_kpkt != NULL); |
2295 | kpkts[i] = pkts[i].pktsched_pkt_kpkt; |
2296 | } |
2297 | |
2298 | protect = sk_sync_protect(); |
2299 | KPKTQ_INIT(&pktq); |
	pkts_to_pktq(kpkts, n_pkts, &pktq);

	fsw_receive(fsw, &pktq);
2303 | KPKTQ_FINI(&pktq); |
2304 | sk_sync_unprotect(protect); |
2305 | |
2306 | return 0; |
2307 | } |
2308 | |
2309 | static void |
2310 | fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q) |
2311 | { |
2312 | classq_pkt_t p; |
2313 | struct netem *ne; |
2314 | struct __kern_packet *pkt, *tpkt; |
2315 | |
2316 | ASSERT(fsw->fsw_ifp != NULL); |
2317 | ne = fsw->fsw_ifp->if_input_netem; |
2318 | ASSERT(ne != NULL); |
2319 | KPKTQ_FOREACH_SAFE(pkt, q, tpkt) { |
2320 | bool pdrop; |
2321 | KPKTQ_REMOVE(q, pkt); |
2322 | CLASSQ_PKT_INIT_PACKET(&p, pkt); |
		netem_enqueue(ne, &p, &pdrop);
2324 | } |
2325 | } |
2326 | |
2327 | void |
2328 | fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head, |
2329 | struct nexus_pkt_stats *out_stats) |
2330 | { |
2331 | struct __kern_packet *pkt = pkt_head, *next; |
2332 | struct nx_flowswitch *fsw; |
2333 | uint32_t n_bytes = 0, n_pkts = 0; |
2334 | uint64_t total_pkts = 0, total_bytes = 0; |
2335 | struct pktq q; |
2336 | |
2337 | KPKTQ_INIT(&q); |
2338 | if (__improbable(devna->na_ifp == NULL || |
2339 | (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) { |
		SK_ERR("fsw not attached, dropping %d pkts", KPKTQ_LEN(&q));
2341 | pp_free_packet_chain(pkt_head, NULL); |
2342 | return; |
2343 | } |
2344 | while (pkt != NULL) { |
2345 | if (__improbable(pkt->pkt_trace_id != 0)) { |
2346 | KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id); |
2347 | KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id); |
2348 | } |
2349 | next = pkt->pkt_nextpkt; |
2350 | pkt->pkt_nextpkt = NULL; |
2351 | |
2352 | if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) { |
2353 | KPKTQ_ENQUEUE(&q, pkt); |
2354 | n_bytes += pkt->pkt_length; |
2355 | } else { |
2356 | DTRACE_SKYWALK1(non__finalized__drop, |
2357 | struct __kern_packet *, pkt); |
2358 | FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED); |
2359 | pp_free_packet_single(pkt); |
2360 | pkt = NULL; |
2361 | } |
2362 | n_pkts = KPKTQ_LEN(&q); |
2363 | if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) { |
2364 | if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) { |
				fsw_dev_input_netem_enqueue(fsw, &q);
			} else {
				fsw_receive(fsw, &q);
2368 | } |
2369 | total_pkts += n_pkts; |
2370 | total_bytes += n_bytes; |
2371 | n_pkts = 0; |
2372 | n_bytes = 0; |
2373 | KPKTQ_FINI(&q); |
2374 | } |
2375 | pkt = next; |
2376 | } |
2377 | ASSERT(KPKTQ_LEN(&q) == 0); |
2378 | FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts); |
2379 | if (out_stats != NULL) { |
2380 | out_stats->nps_pkts = total_pkts; |
2381 | out_stats->nps_bytes = total_bytes; |
2382 | } |
2383 | KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes); |
2384 | } |
2385 | |
2386 | static int |
2387 | dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
2388 | struct __kern_packet *dpkt) |
2389 | { |
2390 | struct mbuf *m = NULL; |
2391 | uint32_t bdlen, bdlim, bdoff; |
2392 | uint8_t *bdaddr; |
2393 | unsigned int one = 1; |
2394 | int err = 0; |
2395 | |
	err = mbuf_allocpacket(MBUF_DONTWAIT,
	    (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
2398 | #if (DEVELOPMENT || DEBUG) |
2399 | if (m != NULL) { |
2400 | _FSW_INJECT_ERROR(11, m, NULL, m_freem, m); |
2401 | } |
2402 | #endif /* DEVELOPMENT || DEBUG */ |
2403 | if (__improbable(m == NULL)) { |
2404 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF); |
2405 | err = ENOBUFS; |
2406 | goto done; |
2407 | } |
2408 | |
2409 | MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff); |
2410 | if (fsw->fsw_frame_headroom > bdlim) { |
		SK_ERR("not enough space in buffer for headroom");
2412 | err = EINVAL; |
2413 | goto done; |
2414 | } |
2415 | |
2416 | dpkt->pkt_headroom = fsw->fsw_frame_headroom; |
2417 | dpkt->pkt_mbuf = m; |
2418 | dpkt->pkt_pflags |= PKT_F_MBUF_DATA; |
2419 | |
2420 | /* packet copy into mbuf */ |
2421 | fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt, |
2422 | METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m, |
2423 | fsw->fsw_frame_headroom, spkt->pkt_length, |
2424 | PACKET_HAS_PARTIAL_CHECKSUM(spkt), |
2425 | spkt->pkt_csum_tx_start_off); |
2426 | FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF); |
2427 | |
2428 | /* header copy into dpkt buffer for classification */ |
2429 | kern_packet_t sph = SK_PTR_ENCODE(spkt, |
2430 | METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)); |
2431 | kern_packet_t dph = SK_PTR_ENCODE(dpkt, |
2432 | METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt)); |
2433 | uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom); |
2434 | fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom, |
2435 | sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0); |
2436 | |
2437 | /* |
2438 | * fsw->fsw_frame_headroom is after m_data, thus we treat m_data same as |
2439 | * buflet baddr m_data always points to the beginning of packet and |
2440 | * should represents the same as baddr + headroom |
2441 | */ |
2442 | ASSERT((uintptr_t)m->m_data == |
2443 | ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom)); |
2444 | |
2445 | done: |
2446 | return err; |
2447 | } |
2448 | |
2449 | static int |
2450 | dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
2451 | struct __kern_packet *dpkt) |
2452 | { |
2453 | struct ifnet *ifp = fsw->fsw_ifp; |
2454 | uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom; |
2455 | |
2456 | if (headroom > UINT8_MAX) { |
		SK_ERR("headroom too large %d", headroom);
2458 | return ERANGE; |
2459 | } |
2460 | dpkt->pkt_headroom = (uint8_t)headroom; |
2461 | ASSERT((dpkt->pkt_headroom & 0x7) == 0); |
2462 | dpkt->pkt_l2_len = 0; |
2463 | dpkt->pkt_link_flags = spkt->pkt_link_flags; |
2464 | |
2465 | kern_packet_t sph = SK_PTR_ENCODE(spkt, |
2466 | METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)); |
2467 | kern_packet_t dph = SK_PTR_ENCODE(dpkt, |
2468 | METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt)); |
2469 | fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, |
2470 | dpkt->pkt_headroom, sph, spkt->pkt_headroom, |
2471 | spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt), |
2472 | (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom), |
2473 | (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom), |
2474 | (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT)); |
2475 | |
2476 | FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT); |
2477 | |
2478 | return 0; |
2479 | } |
2480 | |
2481 | #if SK_LOG |
2482 | /* Hoisted out of line to reduce kernel stack footprint */ |
2483 | SK_LOG_ATTRIBUTE |
2484 | static void |
2485 | dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp, |
2486 | struct __kern_packet *spkt, struct __kern_packet *dpkt, int error) |
2487 | { |
2488 | struct proc *p = current_proc(); |
2489 | struct ifnet *ifp = fsw->fsw_ifp; |
2490 | uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX); |
2491 | |
2492 | if (error == ERANGE) { |
		SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > "
		    "dev_pp_max %u", (uint32_t)fsw->fsw_frame_headroom,
		    (uint32_t)ifp->if_tx_headroom, spkt->pkt_length,
		    (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp));
2497 | } else if (error == ENOBUFS) { |
		SK_DF(logflags, "%s(%d) packet allocation failure",
2499 | sk_proc_name_address(p), sk_proc_pid(p)); |
2500 | } else if (error == 0) { |
2501 | ASSERT(dpkt != NULL); |
2502 | char *daddr; |
2503 | MD_BUFLET_ADDR_ABS(dpkt, daddr); |
		SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)",
2505 | sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length, |
2506 | dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom, |
2507 | (uint32_t)fsw->fsw_frame_headroom, |
2508 | (uint32_t)ifp->if_tx_headroom); |
		SK_DF(logflags | SK_VERB_DUMP, "%s",
		    sk_dump("buf", daddr, dpkt->pkt_length, 128, NULL, 0));
2511 | } else { |
		SK_DF(logflags, "%s(%d) error %d", sk_proc_name_address(p),
		    sk_proc_pid(p), error);
2513 | } |
2514 | } |
2515 | #else |
2516 | #define dp_copy_to_dev_log(...) |
2517 | #endif /* SK_LOG */ |
2518 | |
2519 | static void |
2520 | fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt) |
2521 | { |
2522 | ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK)); |
2523 | ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK)); |
2524 | |
2525 | SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0); |
2526 | /* Copy packet metadata */ |
2527 | _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum); |
2528 | _PKT_COPY(spkt, dpkt); |
2529 | _PKT_COPY_TX_PORT_DATA(spkt, dpkt); |
2530 | ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) || |
2531 | !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp)); |
2532 | ASSERT(dpkt->pkt_mbuf == NULL); |
2533 | |
2534 | /* Copy AQM metadata */ |
2535 | dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type; |
2536 | dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx; |
2537 | _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0); |
2538 | _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id); |
2539 | _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid); |
2540 | dpkt->pkt_policy_id = spkt->pkt_policy_id; |
2541 | dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id; |
2542 | } |
2543 | |
2544 | static int |
2545 | dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
2546 | struct __kern_packet *dpkt) |
2547 | { |
2548 | const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp; |
2549 | struct ifnet *ifp = fsw->fsw_ifp; |
2550 | uint32_t dev_pkt_len; |
2551 | int err = 0; |
2552 | |
2553 | fsw_pkt_copy_metadata(spkt, dpkt); |
2554 | switch (fsw->fsw_classq_enq_ptype) { |
2555 | case QP_MBUF: |
2556 | err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt); |
2557 | break; |
2558 | |
2559 | case QP_PACKET: |
2560 | dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom + |
2561 | spkt->pkt_length; |
2562 | if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) { |
2563 | FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN); |
2564 | err = ERANGE; |
2565 | goto done; |
2566 | } |
2567 | err = dp_copy_to_dev_pkt(fsw, spkt, dpkt); |
2568 | break; |
2569 | |
2570 | default: |
2571 | VERIFY(0); |
2572 | __builtin_unreachable(); |
2573 | } |
2574 | done: |
2575 | dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err); |
2576 | return err; |
2577 | } |
2578 | |
static int
dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
    struct __kern_packet *dpkt)
2582 | { |
2583 | uint8_t *sbaddr, *dbaddr; |
2584 | uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom; |
2585 | uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128); |
2586 | |
2587 | fsw_pkt_copy_metadata(spkt, dpkt); |
2588 | |
2589 | MD_BUFLET_ADDR_ABS(spkt, sbaddr); |
2590 | ASSERT(sbaddr != NULL); |
2591 | sbaddr += spkt->pkt_headroom; |
2592 | |
2593 | MD_BUFLET_ADDR_ABS(dpkt, dbaddr); |
2594 | ASSERT(dbaddr != NULL); |
2595 | dpkt->pkt_headroom = (uint8_t)headroom; |
2596 | dbaddr += headroom; |
2597 | |
	pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
2599 | METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom); |
2600 | |
2601 | /* packet length is set to the full length */ |
2602 | dpkt->pkt_length = spkt->pkt_length; |
2603 | dpkt->pkt_pflags |= PKT_F_TRUNCATED; |
2604 | return 0; |
2605 | } |
2606 | |
2607 | static struct mbuf * |
2608 | convert_pkt_to_mbuf(struct __kern_packet *pkt) |
2609 | { |
2610 | ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA); |
2611 | ASSERT(pkt->pkt_mbuf != NULL); |
2612 | struct mbuf *m = pkt->pkt_mbuf; |
2613 | |
2614 | /* pass additional metadata generated from flow parse/lookup */ |
2615 | _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) == |
2616 | sizeof(pkt->pkt_flow_token)); |
2617 | _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == |
2618 | sizeof(pkt->pkt_flowsrc_token)); |
2619 | _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == |
2620 | sizeof(pkt->pkt_flowsrc_fidx)); |
2621 | m->m_pkthdr.pkt_svc = pkt->pkt_svc_class; |
2622 | m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto; |
2623 | m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token; |
2624 | m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt; |
2625 | m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type; |
2626 | m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token; |
2627 | m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx; |
2628 | |
2629 | if (pkt->pkt_transport_protocol == IPPROTO_QUIC) { |
2630 | m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC; |
2631 | } |
2632 | |
2633 | /* The packet should have a timestamp by the time we get here. */ |
2634 | m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp; |
2635 | m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID; |
2636 | |
2637 | m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK; |
2638 | m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK); |
2639 | /* set pkt_hdr so that AQM can find IP header and mark ECN bits */ |
2640 | m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len; |
2641 | |
2642 | if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) { |
2643 | m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq); |
2644 | } |
2645 | KPKT_CLEAR_MBUF_DATA(pkt); |
2646 | |
2647 | /* mbuf has been consumed, release packet as well */ |
2648 | ASSERT(pkt->pkt_qum.qum_ksd == NULL); |
2649 | pp_free_packet_single(pkt); |
2650 | return m; |
2651 | } |
2652 | |
2653 | static void |
2654 | convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list, |
2655 | struct mbuf **head, struct mbuf **tail, |
2656 | uint32_t *cnt, uint32_t *bytes) |
2657 | { |
2658 | struct __kern_packet *pkt = pkt_list, *next; |
2659 | struct mbuf *m_head = NULL, **m_tailp = &m_head, *m = NULL; |
2660 | uint32_t c = 0, b = 0; |
2661 | |
2662 | while (pkt != NULL) { |
2663 | next = pkt->pkt_nextpkt; |
2664 | pkt->pkt_nextpkt = NULL; |
2665 | m = convert_pkt_to_mbuf(pkt); |
2666 | ASSERT(m != NULL); |
2667 | |
2668 | *m_tailp = m; |
2669 | m_tailp = &m->m_nextpkt; |
2670 | c++; |
2671 | b += m_pktlen(m); |
2672 | pkt = next; |
2673 | } |
2674 | if (head != NULL) { |
2675 | *head = m_head; |
2676 | } |
2677 | if (tail != NULL) { |
2678 | *tail = m; |
2679 | } |
2680 | if (cnt != NULL) { |
2681 | *cnt = c; |
2682 | } |
2683 | if (bytes != NULL) { |
2684 | *bytes = b; |
2685 | } |
2686 | } |
2687 | |
2688 | SK_NO_INLINE_ATTRIBUTE |
2689 | static int |
2690 | classq_enqueue_flow_single(struct nx_flowswitch *fsw, |
2691 | struct __kern_packet *pkt) |
2692 | { |
2693 | struct ifnet *ifp = fsw->fsw_ifp; |
2694 | boolean_t pkt_drop = FALSE; |
2695 | int err; |
2696 | |
2697 | FSW_LOCK_ASSERT_HELD(fsw); |
2698 | ASSERT(fsw->fsw_classq_enabled); |
2699 | ASSERT(pkt->pkt_flow_token != 0); |
2700 | fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class, |
2701 | 1, pkt->pkt_length); |
2702 | |
2703 | if (__improbable(pkt->pkt_trace_id != 0)) { |
2704 | KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id); |
2705 | KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id); |
2706 | } |
2707 | |
2708 | switch (fsw->fsw_classq_enq_ptype) { |
2709 | case QP_MBUF: { /* compat interface */ |
2710 | struct mbuf *m; |
2711 | |
2712 | m = convert_pkt_to_mbuf(pkt); |
2713 | ASSERT(m != NULL); |
2714 | pkt = NULL; |
2715 | |
2716 | /* ifnet_enqueue consumes mbuf */ |
2717 | err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop); |
2718 | m = NULL; |
2719 | #if (DEVELOPMENT || DEBUG) |
2720 | if (__improbable(!pkt_drop)) { |
2721 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
2722 | } |
2723 | #endif /* DEVELOPMENT || DEBUG */ |
2724 | if (pkt_drop) { |
2725 | FSW_STATS_INC(FSW_STATS_DROP); |
2726 | FSW_STATS_INC(FSW_STATS_TX_AQM_DROP); |
2727 | } |
2728 | break; |
2729 | } |
2730 | case QP_PACKET: { /* native interface */ |
2731 | /* ifnet_enqueue consumes packet */ |
2732 | err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop); |
2733 | pkt = NULL; |
2734 | #if (DEVELOPMENT || DEBUG) |
2735 | if (__improbable(!pkt_drop)) { |
2736 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
2737 | } |
2738 | #endif /* DEVELOPMENT || DEBUG */ |
2739 | if (pkt_drop) { |
2740 | FSW_STATS_INC(FSW_STATS_DROP); |
2741 | FSW_STATS_INC(FSW_STATS_TX_AQM_DROP); |
2742 | } |
2743 | break; |
2744 | } |
2745 | default: |
2746 | err = EINVAL; |
2747 | VERIFY(0); |
2748 | /* NOTREACHED */ |
2749 | __builtin_unreachable(); |
2750 | } |
2751 | |
2752 | return err; |
2753 | } |
2754 | |
2755 | static int |
2756 | classq_enqueue_flow_chain(struct nx_flowswitch *fsw, |
2757 | struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail, |
2758 | uint32_t cnt, uint32_t bytes) |
2759 | { |
2760 | struct ifnet *ifp = fsw->fsw_ifp; |
2761 | boolean_t pkt_drop = FALSE; |
2762 | uint32_t svc; |
2763 | int err; |
2764 | |
2765 | FSW_LOCK_ASSERT_HELD(fsw); |
2766 | ASSERT(fsw->fsw_classq_enabled); |
2767 | ASSERT(pkt_head->pkt_flow_token != 0); |
2768 | |
2769 | /* |
2770 | * All packets in the flow should have the same svc. |
2771 | */ |
2772 | svc = pkt_head->pkt_svc_class; |
2773 | fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes); |
2774 | |
2775 | switch (fsw->fsw_classq_enq_ptype) { |
2776 | case QP_MBUF: { /* compat interface */ |
2777 | struct mbuf *m_head = NULL, *m_tail = NULL; |
2778 | uint32_t c = 0, b = 0; |
2779 | |
		convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
2781 | ASSERT(m_head != NULL && m_tail != NULL); |
2782 | ASSERT(c == cnt); |
2783 | ASSERT(b == bytes); |
2784 | pkt_head = NULL; |
2785 | |
2786 | /* ifnet_enqueue consumes mbuf */ |
2787 | err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt, |
2788 | bytes, FALSE, &pkt_drop); |
2789 | m_head = NULL; |
2790 | m_tail = NULL; |
2791 | #if (DEVELOPMENT || DEBUG) |
2792 | if (__improbable(!pkt_drop)) { |
2793 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
2794 | } |
2795 | #endif /* DEVELOPMENT || DEBUG */ |
2796 | if (pkt_drop) { |
2797 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt); |
2798 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, |
2799 | cnt); |
2800 | } |
2801 | break; |
2802 | } |
2803 | case QP_PACKET: { /* native interface */ |
2804 | /* ifnet_enqueue consumes packet */ |
2805 | err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt, |
2806 | bytes, FALSE, &pkt_drop); |
2807 | pkt_head = NULL; |
2808 | #if (DEVELOPMENT || DEBUG) |
2809 | if (__improbable(!pkt_drop)) { |
2810 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
2811 | } |
2812 | #endif /* DEVELOPMENT || DEBUG */ |
2813 | if (pkt_drop) { |
2814 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt); |
2815 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, |
2816 | cnt); |
2817 | } |
2818 | break; |
2819 | } |
2820 | default: |
2821 | err = EINVAL; |
2822 | VERIFY(0); |
2823 | /* NOTREACHED */ |
2824 | __builtin_unreachable(); |
2825 | } |
2826 | |
2827 | return err; |
2828 | } |
2829 | |
2830 | /* |
2831 | * This code path needs to be kept for interfaces without logical link support. |
2832 | */ |
2833 | static void |
2834 | classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe, |
2835 | bool chain, uint32_t cnt, uint32_t bytes) |
2836 | { |
2837 | bool flowadv_is_set = false; |
2838 | struct __kern_packet *pkt, *tail, *tpkt; |
2839 | flowadv_idx_t flow_adv_idx; |
2840 | bool flowadv_cap; |
2841 | flowadv_token_t flow_adv_token; |
2842 | int err; |
2843 | |
	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
	    if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq));
2846 | |
2847 | if (chain) { |
2848 | pkt = KPKTQ_FIRST(&fe->fe_tx_pktq); |
2849 | tail = KPKTQ_LAST(&fe->fe_tx_pktq); |
2850 | KPKTQ_INIT(&fe->fe_tx_pktq); |
2851 | if (pkt == NULL) { |
2852 | return; |
2853 | } |
2854 | flow_adv_idx = pkt->pkt_flowsrc_fidx; |
2855 | flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0); |
2856 | flow_adv_token = pkt->pkt_flow_token; |
2857 | |
		err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
2859 | |
2860 | /* set flow advisory if needed */ |
2861 | if (__improbable((err == EQFULL || err == EQSUSPENDED) && |
2862 | flowadv_cap)) { |
2863 | flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe), |
2864 | flow_adv_idx, flow_adv_token); |
2865 | } |
2866 | DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes, |
2867 | bool, flowadv_is_set); |
2868 | } else { |
2869 | uint32_t c = 0, b = 0; |
2870 | |
2871 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
2872 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
2873 | |
2874 | flow_adv_idx = pkt->pkt_flowsrc_fidx; |
2875 | flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0); |
2876 | flow_adv_token = pkt->pkt_flow_token; |
2877 | |
2878 | c++; |
2879 | b += pkt->pkt_length; |
2880 | err = classq_enqueue_flow_single(fsw, pkt); |
2881 | |
2882 | /* set flow advisory if needed */ |
2883 | if (__improbable(!flowadv_is_set && |
2884 | ((err == EQFULL || err == EQSUSPENDED) && |
2885 | flowadv_cap))) { |
2886 | flowadv_is_set = na_flowadv_set( |
2887 | flow_get_na(fsw, fe), flow_adv_idx, |
2888 | flow_adv_token); |
2889 | } |
2890 | } |
2891 | ASSERT(c == cnt); |
2892 | ASSERT(b == bytes); |
2893 | DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes, |
2894 | bool, flowadv_is_set); |
2895 | } |
2896 | |
2897 | /* notify flow advisory event */ |
2898 | if (__improbable(flowadv_is_set)) { |
2899 | struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe); |
2900 | if (__probable(r)) { |
2901 | na_flowadv_event(r); |
			SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX,
			    "%s(%d) notified of flow update",
2904 | sk_proc_name_address(current_proc()), |
2905 | sk_proc_pid(current_proc())); |
2906 | } |
2907 | } |
2908 | } |
2909 | |
2910 | /* |
2911 | * Logical link code path |
2912 | */ |
2913 | static void |
2914 | classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe, |
2915 | bool chain, uint32_t cnt, uint32_t bytes) |
2916 | { |
2917 | #pragma unused(chain) |
2918 | struct __kern_packet *pkt, *tail; |
2919 | flowadv_idx_t flow_adv_idx; |
2920 | bool flowadv_is_set = false; |
2921 | bool flowadv_cap; |
2922 | flowadv_token_t flow_adv_token; |
2923 | uint32_t flowctl = 0, dropped = 0; |
2924 | int err; |
2925 | |
	SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts",
2927 | if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq)); |
2928 | |
2929 | pkt = KPKTQ_FIRST(&fe->fe_tx_pktq); |
2930 | tail = KPKTQ_LAST(&fe->fe_tx_pktq); |
2931 | KPKTQ_INIT(&fe->fe_tx_pktq); |
2932 | if (pkt == NULL) { |
2933 | return; |
2934 | } |
2935 | flow_adv_idx = pkt->pkt_flowsrc_fidx; |
2936 | flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0); |
2937 | flow_adv_token = pkt->pkt_flow_token; |
2938 | |
2939 | err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes, |
2940 | &flowctl, &dropped); |
2941 | |
2942 | if (__improbable(err != 0)) { |
2943 | /* set flow advisory if needed */ |
2944 | if (flowctl > 0 && flowadv_cap) { |
2945 | flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe), |
2946 | flow_adv_idx, flow_adv_token); |
2947 | |
2948 | /* notify flow advisory event */ |
2949 | if (flowadv_is_set) { |
2950 | struct __kern_channel_ring *r = |
2951 | fsw_flow_get_tx_ring(fsw, fe); |
2952 | if (__probable(r)) { |
2953 | na_flowadv_event(r); |
					SK_DF(SK_VERB_FLOW_ADVISORY |
					    SK_VERB_TX,
					    "%s(%d) notified of flow update",
2957 | sk_proc_name_address(current_proc()), |
2958 | sk_proc_pid(current_proc())); |
2959 | } |
2960 | } |
2961 | } |
2962 | if (dropped > 0) { |
2963 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped); |
2964 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, |
2965 | dropped); |
2966 | } |
2967 | } |
2968 | } |
2969 | |
2970 | static void |
2971 | tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
2972 | { |
2973 | #pragma unused(fsw) |
2974 | /* finalize here; no more changes to buflets after classq */ |
2975 | if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) { |
2976 | kern_packet_t ph = SK_PTR_ENCODE(pkt, |
2977 | METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt)); |
2978 | int err = __packet_finalize(ph); |
2979 | VERIFY(err == 0); |
2980 | } |
2981 | } |
2982 | |
2983 | static bool |
2984 | dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
2985 | { |
2986 | struct flow_route *fr = fe->fe_route; |
2987 | int err; |
2988 | |
2989 | ASSERT(fr != NULL); |
2990 | |
2991 | if (__improbable(!dp_flow_route_process(fsw, fe))) { |
2992 | return false; |
2993 | } |
2994 | if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) { |
2995 | flow_qset_select_dynamic(fsw, fe, TRUE); |
2996 | } |
2997 | |
2998 | _FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags, |
2999 | _fsw_error35_handler, 1, fr, NULL, NULL); |
3000 | _FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags, |
3001 | _fsw_error36_handler, 1, fr, NULL); |
3002 | |
3003 | /* |
3004 | * See if we need to resolve the flow route; note the test against |
3005 | * fr_flags here is done without any lock for performance. Thus |
3006 | * it's possible that we race against the thread performing route |
3007 | * event updates for a packet (which is OK). In any case we should |
3008 | * not have any assertion on fr_flags value(s) due to the lack of |
3009 | * serialization. |
3010 | */ |
3011 | if (fr->fr_flags & FLOWRTF_RESOLVED) { |
3012 | goto frame; |
3013 | } |
3014 | |
3015 | struct __kern_packet *pkt, *tpkt; |
3016 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
3017 | err = fsw->fsw_resolve(fsw, fr, pkt); |
3018 | _FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err); |
3019 | _FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err); |
3020 | /* |
3021 | * If resolver returns EJUSTRETURN then we drop the pkt as the |
3022 | * resolver should have converted the pkt into mbuf (or |
3023 | * detached the attached mbuf from pkt) and added it to the |
3024 | * llinfo queue. If we do have a cached llinfo, then proceed |
3025 | * to using it even though it may be stale (very unlikely) |
3026 | * while the resolution is in progress. |
3027 | * Otherwise, any other error results in dropping pkt. |
3028 | */ |
3029 | if (err == EJUSTRETURN) { |
3030 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
3031 | pp_free_packet_single(pkt); |
3032 | FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING); |
3033 | continue; |
3034 | } else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) { |
3035 | /* use existing llinfo */ |
3036 | FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE); |
3037 | } else if (err != 0) { |
3038 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
3039 | pp_free_packet_single(pkt); |
3040 | FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL); |
3041 | continue; |
3042 | } |
3043 | } |
3044 | |
3045 | frame: |
3046 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
3047 | if (fsw->fsw_frame != NULL) { |
3048 | fsw->fsw_frame(fsw, fr, pkt); |
3049 | } |
3050 | } |
3051 | |
3052 | return true; |
3053 | } |
3054 | |
3055 | static void |
3056 | dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
3057 | { |
3058 | #pragma unused(fsw) |
3059 | struct __kern_packet *pkt, *tpkt; |
3060 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
3061 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
3062 | /* listener is only allowed TCP RST */ |
3063 | if (pkt->pkt_flow_ip_proto == IPPROTO_TCP && |
3064 | (pkt->pkt_flow_tcp_flags & TH_RST) != 0) { |
			flow_track_abort_tcp(fe, NULL, pkt);
3066 | } else { |
3067 | char *addr; |
3068 | MD_BUFLET_ADDR_ABS(pkt, addr); |
			SK_ERR("listener flow sends non-RST packet %s",
3070 | sk_dump(sk_proc_name_address(current_proc()), |
3071 | addr, pkt->pkt_length, 128, NULL, 0)); |
3072 | } |
3073 | pp_free_packet_single(pkt); |
3074 | } |
3075 | } |
3076 | |
3077 | static void |
3078 | fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts, |
3079 | volatile uint64_t *rt_ts, ifnet_t ifp) |
3080 | { |
3081 | struct timespec now; |
3082 | uint64_t now_nsec = 0; |
3083 | |
3084 | if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) { |
		nanouptime(&now);
3086 | net_timernsec(&now, &now_nsec); |
3087 | pkt->pkt_timestamp = now_nsec; |
3088 | } |
3089 | pkt->pkt_pflags &= ~PKT_F_TS_VALID; |
3090 | |
3091 | /* |
3092 | * If the packet service class is not background, |
3093 | * update the timestamps on the interface, as well as |
3094 | * the ones in nexus-wide advisory to indicate recent |
3095 | * activity on a foreground flow. |
3096 | */ |
3097 | if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) { |
3098 | ifp->if_fg_sendts = (uint32_t)_net_uptime; |
3099 | if (fg_ts != NULL) { |
3100 | *fg_ts = _net_uptime; |
3101 | } |
3102 | } |
3103 | if (pkt->pkt_pflags & PKT_F_REALTIME) { |
3104 | ifp->if_rt_sendts = (uint32_t)_net_uptime; |
3105 | if (rt_ts != NULL) { |
3106 | *rt_ts = _net_uptime; |
3107 | } |
3108 | } |
3109 | } |
3110 | |
3111 | static bool |
3112 | fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, bool gso_enabled) |
3113 | { |
3114 | return fsw_chain_enqueue != 0 && |
3115 | fsw->fsw_ifp->if_output_netem == NULL && |
3116 | (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 && |
3117 | gso_enabled; |
3118 | } |
3119 | |
3120 | void |
3121 | dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
3122 | uint32_t flags) |
3123 | { |
3124 | struct pktq dropped_pkts; |
3125 | bool chain, gso = ((flags & FLOW_PROC_FLAG_GSO) != 0); |
3126 | uint32_t cnt = 0, bytes = 0; |
3127 | volatile struct sk_nexusadv *nxadv = NULL; |
3128 | volatile uint64_t *fg_ts = NULL; |
3129 | volatile uint64_t *rt_ts = NULL; |
3130 | uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0; |
3131 | |
3132 | KPKTQ_INIT(&dropped_pkts); |
3133 | ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq)); |
3134 | if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) { |
3135 | dp_listener_flow_tx_process(fsw, fe); |
3136 | return; |
3137 | } |
3138 | if (__improbable(!dp_flow_tx_route_process(fsw, fe))) { |
		SK_RDERR(5, "Tx route bad");
3140 | FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE, |
3141 | KPKTQ_LEN(&fe->fe_tx_pktq)); |
3142 | KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq); |
3143 | goto done; |
3144 | } |
	chain = fsw_chain_enqueue_enabled(fsw, gso);
3146 | if (chain) { |
3147 | nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv; |
3148 | if (nxadv != NULL) { |
3149 | fg_ts = &nxadv->nxadv_fg_sendts; |
3150 | rt_ts = &nxadv->nxadv_rt_sendts; |
3151 | } |
3152 | } |
3153 | struct __kern_packet *pkt, *tpkt; |
3154 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
3155 | int err = 0; |
3156 | |
3157 | err = flow_pkt_track(fe, pkt, false); |
3158 | if (__improbable(err != 0)) { |
			SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
3160 | FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR); |
3161 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
3162 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3163 | continue; |
3164 | } |
3165 | _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid); |
3166 | pkt->pkt_transport_protocol = fe->fe_transport_protocol; |
3167 | |
3168 | /* set AQM related values for outgoing packet */ |
3169 | if (fe->fe_adv_idx != FLOWADV_IDX_NONE) { |
3170 | pkt->pkt_pflags |= PKT_F_FLOW_ADV; |
3171 | pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL; |
3172 | pkt->pkt_flowsrc_fidx = fe->fe_adv_idx; |
3173 | } else { |
3174 | pkt->pkt_pflags &= ~PKT_F_FLOW_ADV; |
3175 | } |
3176 | _UUID_CLEAR(pkt->pkt_flow_id); |
3177 | pkt->pkt_flow_token = fe->fe_flowid; |
3178 | pkt->pkt_pflags |= PKT_F_FLOW_ID; |
3179 | pkt->pkt_qset_idx = qset_idx; |
3180 | pkt->pkt_policy_id = fe->fe_policy_id; |
3181 | pkt->pkt_skip_policy_id = fe->fe_skip_policy_id; |
3182 | |
3183 | /* |
3184 | * The same code is exercised per packet for the non-chain case |
3185 | * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid |
3186 | * re-walking the chain later. |
3187 | */ |
3188 | if (chain) { |
			fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
3190 | } |
3191 | /* mark packet tos/svc_class */ |
3192 | fsw_qos_mark(fsw, fe, pkt); |
3193 | |
3194 | tx_finalize_packet(fsw, pkt); |
3195 | bytes += pkt->pkt_length; |
3196 | cnt++; |
3197 | } |
3198 | |
3199 | /* snoop after it's finalized */ |
3200 | if (__improbable(pktap_total_tap_count != 0)) { |
3201 | fsw_snoop(fsw, fe, false); |
3202 | } |
3203 | if (fe->fe_qset != NULL) { |
3204 | classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes); |
3205 | } else { |
3206 | classq_enqueue_flow(fsw, fe, chain, cnt, bytes); |
3207 | } |
3208 | done: |
3209 | dp_drop_pktq(fsw, &dropped_pkts); |
3210 | } |
3211 | |
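/*
 * Validate a non-first IP fragment against the previously seen flow
 * entry.  Returns prev_fe if the fragment id matches the one recorded
 * there, or NULL (with stats bumped) if the fragment cannot be matched.
 */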
3212 | static struct flow_entry * |
3213 | tx_process_continuous_ip_frag(struct nx_flowswitch *fsw, |
3214 | struct flow_entry *prev_fe, struct __kern_packet *pkt) |
3215 | { |
3216 | ASSERT(!pkt->pkt_flow_ip_is_first_frag); |
3217 | |
3218 | if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) { |
3219 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID); |
3220 | SK_ERR("%s(%d) invalid zero fragment id" , |
3221 | sk_proc_name_address(current_proc()), |
3222 | sk_proc_pid(current_proc())); |
3223 | return NULL; |
3224 | } |
3225 | |
3226 | SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, |
3227 | "%s(%d) continuation frag, id %u" , |
3228 | sk_proc_name_address(current_proc()), |
3229 | sk_proc_pid(current_proc()), |
3230 | pkt->pkt_flow_ip_frag_id); |
3231 | if (__improbable(prev_fe == NULL || |
3232 | !prev_fe->fe_tx_is_cont_frag)) { |
3233 | SK_ERR("%s(%d) unexpected continuation frag" , |
3234 | sk_proc_name_address(current_proc()), |
3235 | sk_proc_pid(current_proc()), |
3236 | pkt->pkt_flow_ip_frag_id); |
3237 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
3238 | return NULL; |
3239 | } |
3240 | if (__improbable(pkt->pkt_flow_ip_frag_id != |
3241 | prev_fe->fe_tx_frag_id)) { |
3242 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
3243 | SK_ERR("%s(%d) wrong continuation frag id %u expecting %u" , |
3244 | sk_proc_name_address(current_proc()), |
3245 | sk_proc_pid(current_proc()), |
3246 | pkt->pkt_flow_ip_frag_id, |
3247 | prev_fe->fe_tx_frag_id); |
3248 | return NULL; |
3249 | } |
3250 | |
3251 | return prev_fe; |
3252 | } |
3253 | |
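/*
 * Look up the flow entry for an outbound packet, rejecting entries that
 * are being torn down or whose UUID does not match the packet's flow id.
 * Returns a retained entry on success, NULL otherwise.
 */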
3254 | static struct flow_entry * |
3255 | tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, |
3256 | struct flow_entry *prev_fe) |
3257 | { |
3258 | struct flow_entry *fe; |
3259 | |
3260 | fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe); |
3261 | if (__improbable(fe == NULL)) { |
3262 | goto done; |
3263 | } |
3264 | |
3265 | if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) { |
3266 | SK_RDERR(5, "Tx flow torn down" ); |
3267 | FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN); |
		flow_entry_release(&fe);
3269 | goto done; |
3270 | } |
3271 | |
3272 | _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1, |
3273 | null_func); |
3274 | |
3275 | if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) { |
3276 | uuid_string_t flow_id_str, pkt_id_str; |
3277 | sk_uuid_unparse(fe->fe_uuid, flow_id_str); |
3278 | sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str); |
3279 | SK_ERR("pkt flow id %s != flow id %s" , pkt_id_str, flow_id_str); |
		flow_entry_release(&fe);
3281 | FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID); |
3282 | } |
3283 | |
3284 | done: |
3285 | return fe; |
3286 | } |
3287 | |
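/*
 * Run the per-flow Tx handler (default, aggregate, etc.) on the packets
 * queued on fe_tx_pktq, then tear down the temporary queue.
 */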
3288 | static inline void |
3289 | tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
3290 | uint32_t flags) |
3291 | { |
3292 | ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq)); |
3293 | ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0); |
3294 | |
3295 | SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d" , |
3296 | KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port); |
3297 | |
3298 | /* flow related processing (default, agg, etc.) */ |
3299 | fe->fe_tx_process(fsw, fe, flags); |
3300 | |
3301 | KPKTQ_FINI(&fe->fe_tx_pktq); |
3302 | } |
3303 | |
3304 | #if SK_LOG |
3305 | static void |
3306 | dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt) |
3307 | { |
3308 | char *pkt_buf; |
3309 | MD_BUFLET_ADDR_ABS(pkt, pkt_buf); |
3310 | SK_DF(verb, "%s(%d) %s %s" , sk_proc_name_address(current_proc()), |
3311 | sk_proc_pid(current_proc()), desc, sk_dump("buf" , pkt_buf, |
3312 | pkt->pkt_length, 128, NULL, 0)); |
3313 | } |
3314 | #else /* !SK_LOG */ |
3315 | #define dp_tx_log_pkt(...) |
3316 | #endif /* !SK_LOG */ |
3317 | |
3318 | static inline struct ifnet * |
3319 | fsw_datamov_begin(struct nx_flowswitch *fsw) |
3320 | { |
3321 | struct ifnet *ifp; |
3322 | |
3323 | ifp = fsw->fsw_ifp; |
3324 | if (!ifnet_datamov_begin(ifp)) { |
3325 | DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp); |
3326 | return NULL; |
3327 | } |
3328 | return ifp; |
3329 | } |
3330 | |
3331 | static inline void |
3332 | fsw_datamov_end(struct nx_flowswitch *fsw) |
3333 | { |
3334 | ifnet_datamov_end(fsw->fsw_ifp); |
3335 | } |
3336 | |
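/*
 * Tx datapath for packets arriving from a user channel: copy each source
 * packet into a device-pool packet, demux and classify it, batch packets
 * belonging to the same flow, run per-flow processing, and finally kick
 * the netif transmit path.
 */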
3337 | static void |
3338 | dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq) |
3339 | { |
3340 | struct __kern_packet *spkt, *pkt; |
3341 | struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); |
3342 | struct flow_entry *fe, *prev_fe; |
3343 | struct pktq dropped_pkts, dpktq; |
3344 | struct nexus_adapter *dev_na; |
3345 | struct kern_pbufpool *dev_pp; |
3346 | struct ifnet *ifp = NULL; |
3347 | sa_family_t af; |
3348 | uint32_t n_pkts, n_flows = 0; |
3349 | boolean_t do_pacing = FALSE; |
3350 | |
3351 | int err; |
3352 | KPKTQ_INIT(&dpktq); |
3353 | KPKTQ_INIT(&dropped_pkts); |
3354 | n_pkts = KPKTQ_LEN(spktq); |
3355 | |
3356 | FSW_RLOCK(fsw); |
3357 | if (__improbable(FSW_QUIESCED(fsw))) { |
3358 | DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw); |
3359 | SK_ERR("flowswitch detached, dropping %d pkts" , n_pkts); |
3360 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
3361 | goto done; |
3362 | } |
3363 | dev_na = fsw->fsw_dev_ch->ch_na; |
3364 | if (__improbable(dev_na == NULL)) { |
3365 | SK_ERR("dev port not attached, dropping %d pkts" , n_pkts); |
3366 | FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts); |
3367 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
3368 | goto done; |
3369 | } |
3370 | ifp = fsw_datamov_begin(fsw); |
3371 | if (ifp == NULL) { |
3372 | SK_ERR("ifnet not attached, dropping %d pkts" , n_pkts); |
3373 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
3374 | goto done; |
3375 | } |
3376 | |
3377 | /* batch allocate enough packets */ |
3378 | dev_pp = na_kr_get_pp(dev_na, NR_TX); |
3379 | |
3380 | err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL, |
3381 | NULL, SKMEM_NOSLEEP); |
3382 | #if DEVELOPMENT || DEBUG |
3383 | if (__probable(err != ENOMEM)) { |
3384 | _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq); |
3385 | } |
3386 | #endif /* DEVELOPMENT || DEBUG */ |
3387 | if (__improbable(err == ENOMEM)) { |
3388 | ASSERT(KPKTQ_EMPTY(&dpktq)); |
3389 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
3390 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
3391 | SK_ERR("failed to alloc %u pkts from device pool" , n_pkts); |
3392 | goto done; |
3393 | } else if (__improbable(err == EAGAIN)) { |
3394 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, |
3395 | (n_pkts - KPKTQ_LEN(&dpktq))); |
3396 | FSW_STATS_ADD(FSW_STATS_DROP, |
3397 | (n_pkts - KPKTQ_LEN(&dpktq))); |
3398 | } |
3399 | |
3400 | n_pkts = KPKTQ_LEN(&dpktq); |
3401 | prev_fe = NULL; |
3402 | KPKTQ_FOREACH(spkt, spktq) { |
3403 | if (n_pkts == 0) { |
3404 | break; |
3405 | } |
3406 | --n_pkts; |
3407 | |
3408 | KPKTQ_DEQUEUE(&dpktq, pkt); |
3409 | ASSERT(pkt != NULL); |
		err = dp_copy_to_dev(fsw, spkt, pkt);
3411 | if (__improbable(err != 0)) { |
3412 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3413 | continue; |
3414 | } |
3415 | |
3416 | do_pacing |= ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0); |
3417 | af = fsw_ip_demux(fsw, pkt); |
3418 | if (__improbable(af == AF_UNSPEC)) { |
3419 | dp_tx_log_pkt(SK_VERB_ERROR, "demux err" , pkt); |
3420 | FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR); |
3421 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3422 | continue; |
3423 | } |
3424 | |
3425 | err = flow_pkt_classify(pkt, ifp, af, false); |
3426 | if (__improbable(err != 0)) { |
3427 | dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err" , pkt); |
3428 | FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR); |
3429 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3430 | continue; |
3431 | } |
3432 | |
3433 | if (__improbable(pkt->pkt_flow_ip_is_frag && |
3434 | !pkt->pkt_flow_ip_is_first_frag)) { |
3435 | fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt); |
3436 | if (__probable(fe != NULL)) { |
3437 | flow_entry_retain(fe); |
3438 | goto flow_batch; |
3439 | } else { |
3440 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
3441 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3442 | continue; |
3443 | } |
3444 | } |
3445 | |
3446 | fe = tx_lookup_flow(fsw, pkt, prev_fe); |
3447 | if (__improbable(fe == NULL)) { |
3448 | FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND); |
3449 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
3450 | prev_fe = NULL; |
3451 | continue; |
3452 | } |
3453 | flow_batch: |
		tx_flow_batch_packet(&fes, fe, pkt);
		prev_fe = fe;
	}

	struct flow_entry *tfe = NULL;
	TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) {
		tx_flow_process(fsw, fe, 0);
		TAILQ_REMOVE(&fes, fe, fe_tx_link);
		fe->fe_tx_is_cont_frag = false;
		fe->fe_tx_frag_id = 0;
		flow_entry_release(&fe);
3465 | n_flows++; |
3466 | } |
3467 | |
3468 | done: |
3469 | FSW_RUNLOCK(fsw); |
3470 | if (n_flows > 0) { |
3471 | netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0)); |
3472 | } |
3473 | if (ifp != NULL) { |
3474 | fsw_datamov_end(fsw); |
3475 | } |
3476 | dp_drop_pktq(fsw, &dropped_pkts); |
3477 | KPKTQ_FINI(&dropped_pkts); |
3478 | KPKTQ_FINI(&dpktq); |
3479 | } |
3480 | |
3481 | static sa_family_t |
3482 | get_tso_af(struct __kern_packet *pkt) |
3483 | { |
3484 | packet_tso_flags_t tso_flags; |
3485 | |
3486 | tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS; |
3487 | if (tso_flags == PACKET_TSO_IPV4) { |
3488 | return AF_INET; |
3489 | } else if (tso_flags == PACKET_TSO_IPV6) { |
3490 | return AF_INET6; |
3491 | } else { |
3492 | panic("invalid tso flags: 0x%x\n" , tso_flags); |
3493 | /* NOTREACHED */ |
3494 | __builtin_unreachable(); |
3495 | } |
3496 | } |
3497 | |
3498 | static inline void |
3499 | update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, |
3500 | uint16_t payload_sz) |
3501 | { |
3502 | struct tcphdr *tcp = tcphdr; |
3503 | |
3504 | DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt, |
3505 | void *, iphdr, void *, tcphdr, uint16_t, payload_sz); |
3506 | pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr; |
3507 | pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr; |
3508 | pkt->pkt_flow_tcp_flags = tcp->th_flags; |
3509 | pkt->pkt_flow_tcp_seq = tcp->th_seq; |
3510 | pkt->pkt_flow_ulen = payload_sz; |
3511 | } |
3512 | |
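/*
 * Software GSO: segment orig_pkt (a TSO packet handed up from the
 * channel) into MSS-sized packets drawn from dev_pktq.  Headers are
 * copied from the original packet and patched up (IP id/length, TCP
 * sequence/flags), and the IP/TCP checksums are computed using the
 * partial sum accumulated while copying the payload.  The resulting
 * packets are appended to gso_pktq; finalization happens later in
 * dp_flow_tx_process() after framing.
 */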
3513 | static int |
3514 | do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt, |
3515 | struct __kern_packet *first_pkt, struct pktq *dev_pktq, |
3516 | struct pktq *gso_pktq) |
3517 | { |
3518 | ifnet_t ifp = fsw->fsw_ifp; |
3519 | struct __kern_packet *pkt = first_pkt; |
3520 | uint8_t proto = pkt->pkt_flow_ip_proto; |
3521 | uint16_t ip_hlen = pkt->pkt_flow_ip_hlen; |
3522 | uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen; |
3523 | uint16_t total_hlen = ip_hlen + tcp_hlen; |
3524 | uint16_t mtu = (uint16_t)ifp->if_mtu; |
3525 | uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz; |
3526 | uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length; |
3527 | uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom; |
3528 | kern_packet_t orig_ph = SK_PKT2PH(orig_pkt); |
3529 | uint8_t *orig_pkt_baddr; |
3530 | struct tcphdr *tcp; |
3531 | struct ip *ip; |
3532 | struct ip6_hdr *ip6; |
3533 | uint32_t tcp_seq; |
3534 | uint16_t ipid; |
3535 | uint32_t pseudo_hdr_csum, bufsz; |
3536 | |
3537 | ASSERT(headroom <= UINT8_MAX); |
3538 | if (proto != IPPROTO_TCP) { |
3539 | SK_ERR("invalid proto: %d" , proto); |
3540 | DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *, |
3541 | fsw, ifnet_t, ifp, uint8_t, proto); |
3542 | return EINVAL; |
3543 | } |
3544 | if (mss == 0 || mss > (mtu - total_hlen)) { |
3545 | SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d" , |
3546 | mss, mtu, total_hlen); |
3547 | DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *, |
3548 | fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu, |
3549 | uint32_t, total_hlen); |
3550 | return EINVAL; |
3551 | } |
3552 | bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp); |
3553 | if ((headroom + total_hlen + mss) > bufsz) { |
3554 | SK_ERR("invalid args: headroom %d, total_hlen %d, " |
3555 | "mss %d, bufsz %d" , headroom, total_hlen, mss, bufsz); |
3556 | DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *, |
3557 | fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t, |
3558 | total_hlen, uint16_t, mss, uint32_t, bufsz); |
3559 | return EINVAL; |
3560 | } |
3561 | n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss); |
3562 | |
3563 | ASSERT(pkt->pkt_headroom == headroom); |
3564 | ASSERT(pkt->pkt_length == total_len); |
3565 | ASSERT(pkt->pkt_l2_len == 0); |
3566 | ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0); |
3567 | ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0); |
3568 | pkt->pkt_pflags &= ~PKT_F_TRUNCATED; |
3569 | pkt->pkt_proto_seg_sz = 0; |
3570 | pkt->pkt_csum_flags = 0; |
3571 | MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr); |
3572 | orig_pkt_baddr += orig_pkt->pkt_headroom; |
3573 | |
3574 | if (af == AF_INET) { |
3575 | ip = (struct ip *)pkt->pkt_flow_ip_hdr; |
3576 | tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr; |
3577 | ipid = ip->ip_id; |
3578 | pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr, |
3579 | pkt->pkt_flow_ipv4_dst.s_addr, 0); |
3580 | } else { |
3581 | ASSERT(af == AF_INET6); |
3582 | tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr; |
3583 | pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src, |
3584 | &pkt->pkt_flow_ipv6_dst, 0); |
3585 | } |
3586 | tcp_seq = ntohl(tcp->th_seq); |
3587 | |
3588 | for (n = 1, payload_sz = mss, off = total_hlen; off < total_len; |
3589 | off += payload_sz) { |
3590 | uint8_t *baddr, *baddr0; |
3591 | uint32_t partial; |
3592 | |
3593 | if (pkt == NULL) { |
3594 | n++; |
3595 | KPKTQ_DEQUEUE(dev_pktq, pkt); |
3596 | ASSERT(pkt != NULL); |
3597 | } |
3598 | MD_BUFLET_ADDR_ABS(pkt, baddr0); |
3599 | baddr = baddr0; |
3600 | baddr += headroom; |
3601 | |
3602 | /* Copy headers from the original packet */ |
3603 | if (n != 1) { |
3604 | ASSERT(pkt != first_pkt); |
			pkt_copy(orig_pkt_baddr, baddr, total_hlen);
			fsw_pkt_copy_metadata(first_pkt, pkt);

			ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
			/* flow info still needs to be updated below */
			bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
			    sizeof(*pkt->pkt_flow));
3612 | pkt->pkt_trace_id = 0; |
3613 | ASSERT(pkt->pkt_headroom == headroom); |
3614 | } else { |
3615 | METADATA_SET_LEN(pkt, 0, 0); |
3616 | } |
3617 | baddr += total_hlen; |
3618 | |
3619 | /* Copy/checksum the payload from the original packet */ |
3620 | if (off + payload_sz > total_len) { |
3621 | payload_sz = (uint16_t)(total_len - off); |
3622 | } |
3623 | pkt_copypkt_sum(orig_ph, |
3624 | (uint16_t)(orig_pkt->pkt_headroom + off), |
3625 | SK_PKT2PH(pkt), headroom + total_hlen, payload_sz, |
3626 | &partial, TRUE); |
3627 | |
3628 | DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw, |
3629 | ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz, |
3630 | uint16_t, mss, uint32_t, partial); |
3631 | FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT); |
3632 | |
3633 | /* |
3634 | * Adjust header information and fill in the missing fields. |
3635 | */ |
3636 | if (af == AF_INET) { |
3637 | ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom); |
3638 | tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen); |
3639 | |
3640 | if (n != n_pkts) { |
3641 | tcp->th_flags &= ~(TH_FIN | TH_PUSH); |
3642 | } |
3643 | if (n != 1) { |
3644 | tcp->th_flags &= ~TH_CWR; |
3645 | tcp->th_seq = htonl(tcp_seq); |
3646 | } |
			update_flow_info(pkt, ip, tcp, payload_sz);

			ip->ip_id = htons((ipid)++);
			ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
			ip->ip_sum = 0;
			ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
			tcp->th_sum = 0;
			partial = __packet_cksum(tcp, tcp_hlen, partial);
3655 | partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz); |
3656 | partial += pseudo_hdr_csum; |
3657 | ADDCARRY(partial); |
3658 | tcp->th_sum = ~(uint16_t)partial; |
3659 | } else { |
3660 | ASSERT(af == AF_INET6); |
3661 | ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom); |
3662 | tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen); |
3663 | |
3664 | if (n != n_pkts) { |
3665 | tcp->th_flags &= ~(TH_FIN | TH_PUSH); |
3666 | } |
3667 | if (n != 1) { |
3668 | tcp->th_flags &= ~TH_CWR; |
3669 | tcp->th_seq = htonl(tcp_seq); |
3670 | } |
			update_flow_info(pkt, ip6, tcp, payload_sz);

			ip6->ip6_plen = htons(tcp_hlen + payload_sz);
			tcp->th_sum = 0;
			partial = __packet_cksum(tcp, tcp_hlen, partial);
3676 | partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz); |
3677 | partial += pseudo_hdr_csum; |
3678 | ADDCARRY(partial); |
3679 | tcp->th_sum = ~(uint16_t)partial; |
3680 | } |
3681 | tcp_seq += payload_sz; |
3682 | METADATA_ADJUST_LEN(pkt, total_hlen, headroom); |
3683 | #if (DEVELOPMENT || DEBUG) |
3684 | struct __kern_buflet *bft; |
3685 | uint32_t blen; |
3686 | PKT_GET_FIRST_BUFLET(pkt, 1, bft); |
3687 | blen = __buflet_get_data_length(bft); |
3688 | if (blen != total_hlen + payload_sz) { |
3689 | panic("blen (%d) != total_len + payload_sz (%d)\n" , |
3690 | blen, total_hlen + payload_sz); |
3691 | } |
3692 | #endif /* DEVELOPMENT || DEBUG */ |
3693 | |
3694 | pkt->pkt_length = total_hlen + payload_sz; |
3695 | KPKTQ_ENQUEUE(gso_pktq, pkt); |
3696 | pkt = NULL; |
3697 | |
3698 | /* |
3699 | * Note that at this point the packet is not yet finalized. |
3700 | * The finalization happens in dp_flow_tx_process() after |
3701 | * the framing is done. |
3702 | */ |
3703 | } |
3704 | ASSERT(n == n_pkts); |
3705 | ASSERT(off == total_len); |
3706 | DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp, |
3707 | uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen, |
3708 | uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr); |
3709 | return 0; |
3710 | } |
3711 | |
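/*
 * Append a freshly built GSO chain to the flow entry's Tx queue.  The
 * first time a flow shows up in this batch its queue is empty, so the
 * entry is linked onto the fes list and keeps the reference taken by the
 * lookup; otherwise the extra reference is dropped here.
 */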
3712 | static void |
3713 | tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe, |
3714 | struct pktq *gso_pktq) |
3715 | { |
3716 | if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) { |
3717 | ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0); |
3718 | TAILQ_INSERT_TAIL(fes, fe, fe_tx_link); |
3719 | KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq), |
3720 | KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq)); |
3721 | KPKTQ_INIT(gso_pktq); |
3722 | } else { |
3723 | ASSERT(!TAILQ_EMPTY(fes)); |
3724 | KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq), |
3725 | KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq)); |
3726 | KPKTQ_INIT(gso_pktq); |
		flow_entry_release(&fe);
3728 | } |
3729 | } |
3730 | |
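/*
 * Tx path for packets that require software GSO.  Device-pool packets are
 * pre-allocated in bulk (all or nothing), the headers of each source
 * packet are copied and classified to find its flow, and do_gso() then
 * produces the per-MSS segments which are enqueued and processed per
 * flow.
 */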
3731 | static void |
3732 | dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq, |
3733 | uint32_t gso_pkts_estimate) |
3734 | { |
3735 | struct __kern_packet *spkt, *pkt; |
3736 | struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); |
3737 | struct flow_entry *fe, *prev_fe; |
3738 | struct pktq dpktq; |
3739 | struct nexus_adapter *dev_na; |
3740 | struct kern_pbufpool *dev_pp; |
3741 | struct ifnet *ifp = NULL; |
3742 | sa_family_t af; |
3743 | uint32_t n_pkts, n_flows = 0; |
3744 | int err; |
3745 | |
3746 | KPKTQ_INIT(&dpktq); |
3747 | n_pkts = KPKTQ_LEN(spktq); |
3748 | |
3749 | FSW_RLOCK(fsw); |
3750 | if (__improbable(FSW_QUIESCED(fsw))) { |
3751 | DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw); |
3752 | SK_ERR("flowswitch detached, dropping %d pkts" , n_pkts); |
3753 | dp_drop_pktq(fsw, spktq); |
3754 | goto done; |
3755 | } |
3756 | dev_na = fsw->fsw_dev_ch->ch_na; |
3757 | if (__improbable(dev_na == NULL)) { |
3758 | SK_ERR("dev port not attached, dropping %d pkts" , n_pkts); |
3759 | FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts); |
3760 | dp_drop_pktq(fsw, spktq); |
3761 | goto done; |
3762 | } |
3763 | ifp = fsw_datamov_begin(fsw); |
3764 | if (ifp == NULL) { |
3765 | SK_ERR("ifnet not attached, dropping %d pkts" , n_pkts); |
3766 | dp_drop_pktq(fsw, spktq); |
3767 | goto done; |
3768 | } |
3769 | |
3770 | dev_pp = na_kr_get_pp(dev_na, NR_TX); |
3771 | |
3772 | /* |
3773 | * Batch allocate enough packets to perform GSO on all |
3774 | * packets in spktq. |
3775 | */ |
3776 | err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, |
3777 | gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP); |
3778 | #if DEVELOPMENT || DEBUG |
3779 | if (__probable(err != ENOMEM)) { |
3780 | _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq); |
3781 | } |
3782 | #endif /* DEVELOPMENT || DEBUG */ |
3783 | /* |
3784 | * We either get all packets or none. No partial allocations. |
3785 | */ |
3786 | if (__improbable(err != 0)) { |
3787 | if (err == ENOMEM) { |
3788 | ASSERT(KPKTQ_EMPTY(&dpktq)); |
3789 | } else { |
			dp_free_pktq(fsw, &dpktq);
3791 | } |
3792 | DTRACE_SKYWALK1(gso__no__mem, int, err); |
3793 | dp_drop_pktq(fsw, spktq); |
3794 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
3795 | SK_ERR("failed to alloc %u pkts from device pool" , |
3796 | gso_pkts_estimate); |
3797 | goto done; |
3798 | } |
3799 | prev_fe = NULL; |
3800 | KPKTQ_FOREACH(spkt, spktq) { |
3801 | KPKTQ_DEQUEUE(&dpktq, pkt); |
3802 | ASSERT(pkt != NULL); |
3803 | /* |
3804 | * Copy only headers to the first packet of the GSO chain. |
3805 | * The headers will be used for classification below. |
3806 | */ |
		err = dp_copy_headers_to_dev(fsw, spkt, pkt);
3808 | if (__improbable(err != 0)) { |
3809 | pp_free_packet_single(pkt); |
3810 | DTRACE_SKYWALK2(copy__headers__failed, |
3811 | struct nx_flowswitch *, fsw, |
3812 | struct __kern_packet *, spkt); |
3813 | continue; |
3814 | } |
3815 | af = get_tso_af(pkt); |
3816 | ASSERT(af == AF_INET || af == AF_INET6); |
3817 | |
3818 | err = flow_pkt_classify(pkt, ifp, af, false); |
3819 | if (__improbable(err != 0)) { |
3820 | dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err" , pkt); |
3821 | FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR); |
3822 | pp_free_packet_single(pkt); |
3823 | DTRACE_SKYWALK4(classify__failed, |
3824 | struct nx_flowswitch *, fsw, |
3825 | struct __kern_packet *, spkt, |
3826 | struct __kern_packet *, pkt, |
3827 | int, err); |
3828 | continue; |
3829 | } |
3830 | /* |
3831 | * GSO cannot be done on a fragment and it's a bug in user |
3832 | * space to mark a fragment as needing GSO. |
3833 | */ |
3834 | if (__improbable(pkt->pkt_flow_ip_is_frag)) { |
3835 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
3836 | pp_free_packet_single(pkt); |
3837 | DTRACE_SKYWALK3(is__frag, |
3838 | struct nx_flowswitch *, fsw, |
3839 | struct __kern_packet *, spkt, |
3840 | struct __kern_packet *, pkt); |
3841 | continue; |
3842 | } |
3843 | fe = tx_lookup_flow(fsw, pkt, prev_fe); |
3844 | if (__improbable(fe == NULL)) { |
3845 | FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND); |
3846 | pp_free_packet_single(pkt); |
3847 | DTRACE_SKYWALK3(lookup__failed, |
3848 | struct nx_flowswitch *, fsw, |
3849 | struct __kern_packet *, spkt, |
3850 | struct __kern_packet *, pkt); |
3851 | prev_fe = NULL; |
3852 | continue; |
3853 | } |
3854 | /* |
3855 | * Perform GSO on spkt using the flow information |
3856 | * obtained above. |
3857 | */ |
3858 | struct pktq gso_pktq; |
3859 | KPKTQ_INIT(&gso_pktq); |
		err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
		if (__probable(err == 0)) {
			tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
			prev_fe = fe;
		} else {
			DTRACE_SKYWALK1(gso__error, int, err);
			/* TODO: increment error stat */
			pp_free_packet_single(pkt);
			flow_entry_release(&fe);
3869 | prev_fe = NULL; |
3870 | } |
3871 | KPKTQ_FINI(&gso_pktq); |
3872 | } |
3873 | struct flow_entry *tfe = NULL; |
3874 | TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) { |
3875 | /* Chain-enqueue can be used for GSO chains */ |
3876 | tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO); |
3877 | TAILQ_REMOVE(&fes, fe, fe_tx_link); |
		flow_entry_release(&fe);
3879 | n_flows++; |
3880 | } |
3881 | done: |
3882 | FSW_RUNLOCK(fsw); |
3883 | if (n_flows > 0) { |
3884 | netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL); |
3885 | } |
3886 | if (ifp != NULL) { |
3887 | fsw_datamov_end(fsw); |
3888 | } |
3889 | |
3890 | /* |
3891 | * It's possible for packets to be left in dpktq because |
3892 | * gso_pkts_estimate is only an estimate. The actual number |
3893 | * of packets needed could be less. |
3894 | */ |
3895 | uint32_t dpktq_len; |
3896 | if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) { |
3897 | DTRACE_SKYWALK2(leftover__dev__pkts, |
3898 | struct nx_flowswitch *, fsw, uint32_t, dpktq_len); |
		dp_free_pktq(fsw, &dpktq);
3900 | } |
3901 | KPKTQ_FINI(&dpktq); |
3902 | } |
3903 | |
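/*
 * Flush a device (Rx) ring: repeatedly dequeue batches of packets and
 * feed them into the flowswitch receive path (or the netem input shaper,
 * if one is configured), then report the totals to the netif mitigation
 * layer.
 */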
3904 | static inline void |
3905 | fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
3906 | struct proc *p) |
3907 | { |
3908 | #pragma unused(p) |
3909 | uint32_t total_pkts = 0, total_bytes = 0; |
3910 | |
3911 | for (;;) { |
3912 | struct pktq pktq; |
3913 | KPKTQ_INIT(&pktq); |
3914 | uint32_t n_bytes; |
		fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
3916 | if (n_bytes == 0) { |
3917 | break; |
3918 | } |
3919 | total_pkts += KPKTQ_LEN(&pktq); |
3920 | total_bytes += n_bytes; |
3921 | |
3922 | if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) { |
			fsw_receive(fsw, &pktq);
		} else {
			fsw_dev_input_netem_enqueue(fsw, &pktq);
3926 | } |
3927 | KPKTQ_FINI(&pktq); |
3928 | } |
3929 | |
3930 | KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes); |
3931 | DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts, |
3932 | uint32_t, total_bytes); |
3933 | |
3934 | /* compute mitigation rate for delivered traffic */ |
3935 | if (__probable(r->ckr_netif_mit_stats != NULL)) { |
3936 | r->ckr_netif_mit_stats(r, total_pkts, total_bytes); |
3937 | } |
3938 | } |
3939 | |
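/*
 * Flush a user (Tx) ring: dequeue batches of packets from the channel
 * ring and push them down either the GSO path or the regular Tx path,
 * tagging the first packet of each batch with a trace id.
 */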
3940 | static inline void |
3941 | fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
3942 | struct proc *p) |
3943 | { |
3944 | #pragma unused(p) |
3945 | static packet_trace_id_t trace_id = 0; |
3946 | uint32_t total_pkts = 0, total_bytes = 0; |
3947 | |
3948 | for (;;) { |
3949 | struct pktq pktq; |
3950 | KPKTQ_INIT(&pktq); |
3951 | uint32_t n_bytes; |
3952 | uint32_t gso_pkts_estimate = 0; |
3953 | |
		fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
		    &gso_pkts_estimate);
3956 | if (n_bytes == 0) { |
3957 | break; |
3958 | } |
3959 | total_pkts += KPKTQ_LEN(&pktq); |
3960 | total_bytes += n_bytes; |
3961 | |
3962 | KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id; |
3963 | KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START, |
3964 | KPKTQ_FIRST(&pktq)->pkt_trace_id); |
3965 | |
		if (gso_pkts_estimate > 0) {
			dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
		} else {
			dp_tx_pktq(fsw, &pktq);
		}
		dp_free_pktq(fsw, &pktq);
		KPKTQ_FINI(&pktq);
	}
	kr_update_stats(r, total_pkts, total_bytes);
3975 | |
3976 | KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes); |
3977 | DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts, |
3978 | uint32_t, total_bytes); |
3979 | } |
3980 | |
3981 | void |
3982 | fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
3983 | struct proc *p) |
3984 | { |
3985 | struct nexus_vp_adapter *vpna = VPNA(KRNA(r)); |
3986 | |
3987 | ASSERT(sk_is_sync_protected()); |
3988 | ASSERT(vpna->vpna_nx_port != FSW_VP_HOST); |
3989 | ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET); |
3990 | |
3991 | if (vpna->vpna_nx_port == FSW_VP_DEV) { |
3992 | fsw_dev_ring_flush(fsw, r, p); |
3993 | } else { |
3994 | fsw_user_ring_flush(fsw, r, p); |
3995 | } |
3996 | } |
3997 | |
3998 | int |
3999 | fsw_dp_ctor(struct nx_flowswitch *fsw) |
4000 | { |
4001 | uint32_t fe_cnt = fsw_fe_table_size; |
4002 | uint32_t fob_cnt = fsw_flow_owner_buckets; |
4003 | uint32_t frb_cnt = fsw_flow_route_buckets; |
4004 | uint32_t frib_cnt = fsw_flow_route_id_buckets; |
4005 | struct kern_nexus *nx = fsw->fsw_nx; |
4006 | char name[64]; |
4007 | int error = 0; |
4008 | |
4009 | /* just in case */ |
4010 | if (fe_cnt == 0) { |
4011 | fe_cnt = NX_FSW_FE_TABLESZ; |
4012 | ASSERT(fe_cnt != 0); |
4013 | } |
4014 | if (fob_cnt == 0) { |
4015 | fob_cnt = NX_FSW_FOB_HASHSZ; |
4016 | ASSERT(fob_cnt != 0); |
4017 | } |
4018 | if (frb_cnt == 0) { |
4019 | frb_cnt = NX_FSW_FRB_HASHSZ; |
4020 | ASSERT(frb_cnt != 0); |
4021 | } |
4022 | if (frib_cnt == 0) { |
4023 | frib_cnt = NX_FSW_FRIB_HASHSZ; |
4024 | ASSERT(frib_cnt != 0); |
4025 | } |
4026 | |
4027 | /* make sure fe_cnt is a power of two, else round up */ |
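	/* (bit-smear round-up, e.g. 5000 -> 4999 -> 8191 -> 8192) */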
4028 | if ((fe_cnt & (fe_cnt - 1)) != 0) { |
4029 | fe_cnt--; |
4030 | fe_cnt |= (fe_cnt >> 1); |
4031 | fe_cnt |= (fe_cnt >> 2); |
4032 | fe_cnt |= (fe_cnt >> 4); |
4033 | fe_cnt |= (fe_cnt >> 8); |
4034 | fe_cnt |= (fe_cnt >> 16); |
4035 | fe_cnt++; |
4036 | } |
4037 | |
4038 | /* make sure frb_cnt is a power of two, else round up */ |
4039 | if ((frb_cnt & (frb_cnt - 1)) != 0) { |
4040 | frb_cnt--; |
4041 | frb_cnt |= (frb_cnt >> 1); |
4042 | frb_cnt |= (frb_cnt >> 2); |
4043 | frb_cnt |= (frb_cnt >> 4); |
4044 | frb_cnt |= (frb_cnt >> 8); |
4045 | frb_cnt |= (frb_cnt >> 16); |
4046 | frb_cnt++; |
4047 | } |
4048 | |
	lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
	    &nexus_lock_attr);
	lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
	lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
	TAILQ_INIT(&fsw->fsw_linger_head);

	(void) snprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
4056 | error = nx_advisory_alloc(nx, name, |
4057 | &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV], |
4058 | NEXUS_ADVISORY_TYPE_FLOWSWITCH); |
4059 | if (error != 0) { |
4060 | fsw_dp_dtor(fsw); |
4061 | return error; |
4062 | } |
4063 | |
4064 | fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt); |
4065 | if (fsw->fsw_flow_mgr == NULL) { |
4066 | fsw_dp_dtor(fsw); |
4067 | return error; |
4068 | } |
4069 | |
4070 | /* generic name; will be customized upon ifattach */ |
	(void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
	    FSW_REAP_THREADNAME, name, "");

	if (kernel_thread_start(fsw_reap_thread_func, fsw,
	    &fsw->fsw_reap_thread) != KERN_SUCCESS) {
4076 | panic_plain("%s: can't create thread" , __func__); |
4077 | /* NOTREACHED */ |
4078 | __builtin_unreachable(); |
4079 | } |
4080 | /* this must not fail */ |
4081 | VERIFY(fsw->fsw_reap_thread != NULL); |
4082 | |
4083 | SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC" , SK_KVA(fsw)); |
4084 | |
4085 | |
4086 | return error; |
4087 | } |
4088 | |
4089 | void |
4090 | fsw_dp_dtor(struct nx_flowswitch *fsw) |
4091 | { |
	uint64_t f = (1 * NSEC_PER_MSEC);       /* 1 ms */
	uint64_t s = (1000 * NSEC_PER_MSEC);    /* 1 sec */
4094 | uint32_t i = 0; |
4095 | |
4096 | #if (DEVELOPMENT || DEBUG) |
4097 | if (fsw->fsw_rps_threads != NULL) { |
4098 | for (i = 0; i < fsw->fsw_rps_nthreads; i++) { |
4099 | fsw_rps_thread_join(fsw, i); |
4100 | } |
4101 | kfree_type(struct fsw_rps_thread, fsw->fsw_rps_threads); |
4102 | } |
#endif /* DEVELOPMENT || DEBUG */
4104 | |
4105 | nx_advisory_free(fsw->fsw_nx); |
4106 | |
4107 | if (fsw->fsw_reap_thread != THREAD_NULL) { |
4108 | /* signal thread to begin self-termination */ |
		lck_mtx_lock(&fsw->fsw_reap_lock);
4110 | fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING; |
4111 | |
4112 | /* |
4113 | * And wait for thread to terminate; use another |
4114 | * wait channel here other than fsw_reap_flags to |
4115 | * make it more explicit. In the event the reaper |
4116 | * thread misses a wakeup, we'll try again once |
4117 | * every second (except for the first time). |
4118 | */ |
4119 | while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) { |
4120 | uint64_t t = 0; |
4121 | |
			nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
			clock_absolutetime_interval_to_deadline(t, &t);
			ASSERT(t != 0);

			fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK;
			if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) {
				thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
			}
			(void) assert_wait_deadline(&fsw->fsw_reap_thread,
			    THREAD_UNINT, t);
			lck_mtx_unlock(&fsw->fsw_reap_lock);
			thread_block(THREAD_CONTINUE_NULL);
			lck_mtx_lock(&fsw->fsw_reap_lock);
			fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK;
		}
		ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED);
		lck_mtx_unlock(&fsw->fsw_reap_lock);
4139 | fsw->fsw_reap_thread = THREAD_NULL; |
4140 | } |
4141 | |
4142 | /* free any remaining flow entries in the linger list */ |
4143 | fsw_linger_purge(fsw); |
4144 | |
4145 | if (fsw->fsw_flow_mgr != NULL) { |
4146 | flow_mgr_destroy(fsw->fsw_flow_mgr); |
4147 | fsw->fsw_flow_mgr = NULL; |
4148 | } |
4149 | |
4150 | |
	lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
	lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
	lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
4154 | } |
4155 | |
4156 | void |
4157 | fsw_linger_insert(struct flow_entry *fe) |
4158 | { |
4159 | struct nx_flowswitch *fsw = fe->fe_fsw; |
4160 | SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); |
4161 | SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b" , |
4162 | fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), |
4163 | fe->fe_flags, FLOWENTF_BITS); |
4164 | |
4165 | net_update_uptime(); |
4166 | |
4167 | ASSERT(flow_entry_refcnt(fe) >= 1); |
4168 | ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); |
4169 | ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); |
4170 | ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING)); |
4171 | ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE); |
4172 | ASSERT(fe->fe_linger_wait != 0); |
4173 | fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait); |
4174 | os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed); |
4175 | |
	lck_mtx_lock_spin(&fsw->fsw_linger_lock);
	TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
	fsw->fsw_linger_cnt++;
	VERIFY(fsw->fsw_linger_cnt != 0);
	lck_mtx_unlock(&fsw->fsw_linger_lock);
4181 | |
4182 | fsw_reap_sched(fsw); |
4183 | } |
4184 | |
4185 | static void |
4186 | fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head, |
4187 | struct flow_entry *fe) |
4188 | { |
4189 | SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); |
4190 | SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b" , |
4191 | fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), |
4192 | fe->fe_flags, FLOWENTF_BITS); |
4193 | |
4194 | ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); |
4195 | ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); |
4196 | ASSERT(fe->fe_flags & FLOWENTF_LINGERING); |
4197 | os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed); |
4198 | |
4199 | TAILQ_REMOVE(linger_head, fe, fe_linger_link); |
	flow_entry_release(&fe);
4201 | } |
4202 | |
4203 | static void |
4204 | fsw_linger_remove(struct flow_entry *fe) |
4205 | { |
4206 | struct nx_flowswitch *fsw = fe->fe_fsw; |
4207 | |
4208 | LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED); |
4209 | |
	fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
4211 | VERIFY(fsw->fsw_linger_cnt != 0); |
4212 | fsw->fsw_linger_cnt--; |
4213 | } |
4214 | |
4215 | void |
4216 | fsw_linger_purge(struct nx_flowswitch *fsw) |
4217 | { |
4218 | struct flow_entry *fe, *tfe; |
4219 | |
	lck_mtx_lock(&fsw->fsw_linger_lock);
	TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) {
		fsw_linger_remove(fe);
	}
	ASSERT(fsw->fsw_linger_cnt == 0);
	ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head));
	lck_mtx_unlock(&fsw->fsw_linger_lock);
4227 | } |
4228 | |
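/*
 * Kick the reaper thread if it is idle and not already terminating or
 * terminated.
 */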
4229 | void |
4230 | fsw_reap_sched(struct nx_flowswitch *fsw) |
4231 | { |
4232 | ASSERT(fsw->fsw_reap_thread != THREAD_NULL); |
	lck_mtx_lock_spin(&fsw->fsw_reap_lock);
	if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) &&
	    !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) {
		thread_wakeup((caddr_t)&fsw->fsw_reap_flags);
	}
	lck_mtx_unlock(&fsw->fsw_reap_lock);
4239 | } |
4240 | |
4241 | __attribute__((noreturn)) |
4242 | static void |
4243 | fsw_reap_thread_func(void *v, wait_result_t w) |
4244 | { |
4245 | #pragma unused(w) |
4246 | struct nx_flowswitch *fsw = v; |
4247 | |
4248 | ASSERT(fsw->fsw_reap_thread == current_thread()); |
	thread_set_thread_name(current_thread(), fsw->fsw_reap_name);
4250 | |
4251 | net_update_uptime(); |
4252 | |
	lck_mtx_lock(&fsw->fsw_reap_lock);
	VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
	(void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
	lck_mtx_unlock(&fsw->fsw_reap_lock);
	thread_block_parameter(fsw_reap_thread_cont, fsw);
4258 | /* NOTREACHED */ |
4259 | __builtin_unreachable(); |
4260 | } |
4261 | |
4262 | __attribute__((noreturn)) |
4263 | static void |
4264 | fsw_reap_thread_cont(void *v, wait_result_t wres) |
4265 | { |
4266 | struct nx_flowswitch *fsw = v; |
4267 | boolean_t low; |
4268 | uint64_t t = 0; |
4269 | |
4270 | SK_DF(SK_VERB_FLOW, "%s: running" , fsw->fsw_reap_name); |
4271 | |
	lck_mtx_lock(&fsw->fsw_reap_lock);
4273 | if (__improbable(wres == THREAD_INTERRUPTED || |
4274 | (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) { |
4275 | goto terminate; |
4276 | } |
4277 | |
4278 | ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)); |
4279 | fsw->fsw_reap_flags |= FSW_REAPF_RUNNING; |
	lck_mtx_unlock(&fsw->fsw_reap_lock);
4281 | |
4282 | net_update_uptime(); |
4283 | |
4284 | /* prevent detach from happening while we're here */ |
4285 | if (!fsw_detach_barrier_add(fsw)) { |
4286 | SK_ERR("%s: netagent detached" , fsw->fsw_reap_name); |
4287 | t = 0; |
4288 | } else { |
4289 | uint32_t fe_nonviable, fe_freed, fe_aborted; |
4290 | uint32_t fr_freed, fr_resid = 0; |
4291 | struct ifnet *ifp = fsw->fsw_ifp; |
4292 | uint64_t i = FSW_REAP_IVAL; |
4293 | uint64_t now = _net_uptime; |
4294 | uint64_t last; |
4295 | |
4296 | ASSERT(fsw->fsw_ifp != NULL); |
4297 | |
4298 | /* |
4299 | * Pass 1: process any deferred {withdrawn,nonviable} requests. |
4300 | */ |
4301 | fe_nonviable = fsw_process_deferred(fsw); |
4302 | |
4303 | /* |
4304 | * Pass 2: remove any expired lingering flows. |
4305 | */ |
4306 | fe_freed = fsw_process_linger(fsw, &fe_aborted); |
4307 | |
4308 | /* |
4309 | * Pass 3: prune idle flow routes. |
4310 | */ |
4311 | fr_freed = flow_route_prune(fsw->fsw_flow_mgr, |
4312 | ifp, &fr_resid); |
4313 | |
4314 | /* |
4315 | * Pass 4: prune flow table |
4316 | * |
4317 | */ |
		cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
4319 | |
4320 | SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u " |
4321 | "fe_aborted %u fr_freed %u/%u" , |
4322 | fsw->fsw_flow_mgr->fm_name, fe_nonviable, |
4323 | (fe_nonviable + fsw->fsw_pending_nonviable), |
4324 | fe_freed, fsw->fsw_linger_cnt, fe_aborted, fe_freed, |
4325 | (fe_freed + fr_resid)); |
4326 | |
4327 | /* see if VM memory level is critical */ |
4328 | low = skmem_lowmem_check(); |
4329 | |
4330 | /* |
4331 | * If things appear to be idle, we can prune away cached |
4332 | * object that have fallen out of the working sets (this |
4333 | * is different than purging). Every once in a while, we |
4334 | * also purge the caches. Note that this is done across |
4335 | * all flowswitch instances, and so we limit this to no |
4336 | * more than once every FSW_REAP_SK_THRES seconds. |
4337 | */ |
4338 | last = os_atomic_load(&fsw_reap_last, relaxed); |
4339 | if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) && |
4340 | os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) { |
4341 | fsw_purge_cache(fsw, low); |
4342 | |
4343 | /* increase sleep interval if idle */ |
4344 | if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 && |
4345 | fsw->fsw_pending_nonviable == 0 && fr_resid == 0) { |
4346 | i <<= 3; |
4347 | } |
4348 | } else if (last == 0) { |
4349 | os_atomic_store(&fsw_reap_last, now, release); |
4350 | } |
4351 | |
4352 | /* |
4353 | * Additionally, run thru the list of channels and prune |
4354 | * or purge away cached objects on "idle" channels. This |
4355 | * check is rate limited to no more than once every |
4356 | * FSW_DRAIN_CH_THRES seconds. |
4357 | */ |
4358 | last = fsw->fsw_drain_channel_chk_last; |
4359 | if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) { |
4360 | SK_DF(SK_VERB_FLOW, "%s: pruning channels" , |
4361 | fsw->fsw_flow_mgr->fm_name); |
4362 | |
4363 | fsw->fsw_drain_channel_chk_last = now; |
4364 | fsw_drain_channels(fsw, now, low); |
4365 | } else if (__improbable(last == 0)) { |
4366 | fsw->fsw_drain_channel_chk_last = now; |
4367 | } |
4368 | |
4369 | /* |
4370 | * Finally, invoke the interface's reap callback to |
4371 | * tell it to prune or purge away cached objects if |
4372 | * it is idle. This check is rate limited to no more |
4373 | * than once every FSW_REAP_IF_THRES seconds. |
4374 | */ |
4375 | last = fsw->fsw_drain_netif_chk_last; |
4376 | if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) { |
4377 | ASSERT(fsw->fsw_nifna != NULL); |
4378 | |
4379 | if (ifp->if_na_ops != NULL && |
4380 | ifp->if_na_ops->ni_reap != NULL) { |
4381 | SK_DF(SK_VERB_FLOW, "%s: pruning netif" , |
4382 | fsw->fsw_flow_mgr->fm_name); |
4383 | ifp->if_na_ops->ni_reap(ifp->if_na, ifp, |
4384 | FSW_REAP_IF_THRES, low); |
4385 | } |
4386 | |
4387 | fsw->fsw_drain_netif_chk_last = now; |
4388 | } else if (__improbable(last == 0)) { |
4389 | fsw->fsw_drain_netif_chk_last = now; |
4390 | } |
4391 | |
4392 | /* emit periodic interface stats ktrace */ |
4393 | last = fsw->fsw_reap_last; |
4394 | if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) { |
4395 | KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets, |
4396 | ifp->if_data.ifi_ibytes * 8, |
4397 | ifp->if_data.ifi_opackets, |
4398 | ifp->if_data.ifi_obytes * 8); |
4399 | |
4400 | fsw->fsw_reap_last = now; |
4401 | } else if (__improbable(last == 0)) { |
4402 | fsw->fsw_reap_last = now; |
4403 | } |
4404 | |
		nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
		clock_absolutetime_interval_to_deadline(t, &t);
4407 | ASSERT(t != 0); |
4408 | |
4409 | /* allow any pending detach to proceed */ |
4410 | fsw_detach_barrier_remove(fsw); |
4411 | } |
4412 | |
	lck_mtx_lock(&fsw->fsw_reap_lock);
4414 | if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) { |
4415 | fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING; |
		(void) assert_wait_deadline(&fsw->fsw_reap_flags,
		    THREAD_UNINT, t);
		lck_mtx_unlock(&fsw->fsw_reap_lock);
		thread_block_parameter(fsw_reap_thread_cont, fsw);
4420 | /* NOTREACHED */ |
4421 | __builtin_unreachable(); |
4422 | } else { |
4423 | terminate: |
4424 | LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED); |
4425 | fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING); |
4426 | fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED; |
4427 | /* |
4428 | * And signal any thread waiting for us to terminate; |
4429 | * wait channel here other than fsw_reap_flags to make |
4430 | * it more explicit. |
4431 | */ |
4432 | if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) { |
4433 | thread_wakeup((caddr_t)&fsw->fsw_reap_thread); |
4434 | } |
		lck_mtx_unlock(&fsw->fsw_reap_lock);
4436 | |
4437 | SK_DF(SK_VERB_FLOW, "%s: terminating" , fsw->fsw_reap_name); |
4438 | |
4439 | /* for the extra refcnt from kernel_thread_start() */ |
		thread_deallocate(current_thread());
4441 | /* this is the end */ |
4442 | thread_terminate(current_thread()); |
4443 | /* NOTREACHED */ |
4444 | __builtin_unreachable(); |
4445 | } |
4446 | |
4447 | /* must never get here */ |
4448 | VERIFY(0); |
4449 | /* NOTREACHED */ |
4450 | __builtin_unreachable(); |
4451 | } |
4452 | |
4453 | static void |
4454 | fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low) |
4455 | { |
4456 | struct kern_nexus *nx = fsw->fsw_nx; |
4457 | |
4458 | /* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */ |
4459 | FSW_RLOCK(fsw); |
4460 | |
4461 | /* uncrustify doesn't handle C blocks properly */ |
4462 | /* BEGIN IGNORE CODESTYLE */ |
4463 | nx_port_foreach(nx, ^(nexus_port_t p) { |
4464 | struct nexus_adapter *na = nx_port_get_na(nx, p); |
4465 | if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) { |
4466 | return; |
4467 | } |
4468 | |
4469 | boolean_t purge; |
4470 | |
4471 | /* |
4472 | * If some activity happened in the last FSW_DRAIN_CH_THRES |
4473 | * seconds on this channel, we reclaim memory if the channel |
4474 | * throughput is less than the reap threshold value. |
4475 | */ |
4476 | if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) { |
4477 | struct __kern_channel_ring *ring; |
4478 | channel_ring_stats *stats; |
4479 | uint64_t bps; |
4480 | |
4481 | ring = na->na_rx_rings; |
4482 | stats = &ring->ckr_stats; |
4483 | bps = stats->crs_bytes_per_second; |
4484 | |
4485 | if (bps < fsw_channel_reap_thresh) { |
4486 | purge = FALSE; |
4487 | na_drain(na, purge); |
4488 | } |
4489 | return; |
4490 | } |
4491 | |
4492 | /* |
4493 | * If NA has been inactive for some time (twice the drain |
4494 | * threshold), we clear the work timestamp to temporarily skip |
4495 | * this channel until it's active again. Purging cached objects |
4496 | * can be expensive since we'd need to allocate and construct |
4497 | * them again, so we do it only when necessary. |
4498 | */ |
4499 | if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) { |
4500 | na->na_work_ts = 0; |
4501 | purge = TRUE; |
4502 | } else { |
4503 | purge = FALSE; |
4504 | } |
4505 | |
4506 | na_drain(na, purge); /* purge/prune caches */ |
4507 | }); |
4508 | /* END IGNORE CODESTYLE */ |
4509 | |
4510 | FSW_RUNLOCK(fsw); |
4511 | } |
4512 | |
4513 | static void |
4514 | fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low) |
4515 | { |
4516 | #pragma unused(fsw) |
4517 | uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed); |
4518 | uint32_t p = fsw_flow_purge_thresh; |
4519 | boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0)); |
4520 | |
4521 | SK_DF(SK_VERB_FLOW, "%s: %s caches" , |
4522 | fsw->fsw_flow_mgr->fm_name, |
4523 | (purge ? "purge" : "prune" )); |
4524 | |
4525 | skmem_cache_reap_now(sk_fo_cache, purge); |
4526 | skmem_cache_reap_now(sk_fe_cache, purge); |
4527 | skmem_cache_reap_now(sk_fab_cache, purge); |
4528 | skmem_cache_reap_now(flow_route_cache, purge); |
4529 | skmem_cache_reap_now(flow_stats_cache, purge); |
4530 | netns_reap_caches(purge); |
4531 | skmem_reap_caches(purge); |
4532 | |
4533 | #if CONFIG_MBUF_MCACHE |
4534 | if (if_is_fsw_transport_netagent_enabled() && purge) { |
4535 | mbuf_drain(FALSE); |
4536 | } |
4537 | #endif /* CONFIG_MBUF_MCACHE */ |
4538 | } |
4539 | |
4540 | static void |
4541 | fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe) |
4542 | { |
4543 | /* When the interface is in low power mode, the flow is nonviable */ |
4544 | if (!(fe->fe_flags & FLOWENTF_NONVIABLE) && |
4545 | os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) { |
4546 | os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed); |
4547 | } |
4548 | } |
4549 | |
4550 | static uint32_t |
4551 | fsw_process_deferred(struct nx_flowswitch *fsw) |
4552 | { |
4553 | struct flow_entry_dead sfed __sk_aligned(8); |
4554 | struct flow_mgr *fm = fsw->fsw_flow_mgr; |
4555 | struct flow_entry_dead *fed, *tfed; |
4556 | LIST_HEAD(, flow_entry_dead) fed_head = |
4557 | LIST_HEAD_INITIALIZER(fed_head); |
4558 | uint32_t i, nonviable = 0; |
4559 | boolean_t lowpowermode = FALSE; |
4560 | |
	bzero(&sfed, sizeof(sfed));
4562 | |
4563 | /* |
4564 | * The flows become nonviable when the interface |
4565 | * is in low power mode (edge trigger) |
4566 | */ |
4567 | if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) && |
4568 | fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) { |
4569 | lowpowermode = TRUE; |
4570 | fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt; |
4571 | } |
4572 | |
4573 | /* |
4574 | * Scan thru the flow entry tree, and commit any pending withdraw or |
4575 | * nonviable requests. We may need to push stats and/or unassign the |
4576 | * nexus from NECP, but we cannot do that while holding the locks; |
4577 | * build a temporary list for those entries. |
4578 | */ |
4579 | for (i = 0; i < fm->fm_owner_buckets_cnt; i++) { |
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
4581 | struct flow_owner *fo; |
4582 | |
4583 | /* |
4584 | * Grab the lock at all costs when handling low power mode |
4585 | */ |
4586 | if (__probable(!lowpowermode)) { |
4587 | if (!FOB_TRY_LOCK(fob)) { |
4588 | continue; |
4589 | } |
4590 | } else { |
4591 | FOB_LOCK(fob); |
4592 | } |
4593 | |
4594 | FOB_LOCK_ASSERT_HELD(fob); |
4595 | RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) { |
4596 | struct flow_entry *fe; |
4597 | |
4598 | RB_FOREACH(fe, flow_entry_id_tree, |
4599 | &fo->fo_flow_entry_id_head) { |
4600 | /* try first as reader; skip if we can't */ |
4601 | if (__improbable(lowpowermode)) { |
4602 | fsw_flow_handle_low_power(fsw, fe); |
4603 | } |
4604 | if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) { |
4605 | os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed); |
					flow_namespace_half_close(&fe->fe_port_reservation);
4607 | } |
4608 | |
4609 | /* if not withdrawn/nonviable, skip */ |
4610 | if (!fe->fe_want_withdraw && |
4611 | !fe->fe_want_nonviable) { |
4612 | continue; |
4613 | } |
4614 | /* |
4615 | * Here we're holding the lock as writer; |
4616 | * don't spend too much time as we're |
4617 | * blocking the data path now. |
4618 | */ |
4619 | ASSERT(!uuid_is_null(fe->fe_uuid)); |
4620 | /* only need flow UUID and booleans */ |
				uuid_copy(sfed.fed_uuid, fe->fe_uuid);
4622 | sfed.fed_want_clonotify = |
4623 | (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY); |
4624 | sfed.fed_want_nonviable = fe->fe_want_nonviable; |
4625 | flow_entry_teardown(fo, fe); |
4626 | |
4627 | /* do this outside the flow bucket lock */ |
4628 | fed = flow_entry_dead_alloc(Z_WAITOK); |
4629 | ASSERT(fed != NULL); |
4630 | *fed = sfed; |
4631 | LIST_INSERT_HEAD(&fed_head, fed, fed_link); |
4632 | } |
4633 | } |
4634 | FOB_UNLOCK(fob); |
4635 | } |
4636 | |
4637 | /* |
4638 | * These nonviable flows are no longer useful since we've lost |
4639 | * the source IP address; in the event the client monitors the |
4640 | * viability of the flow, explicitly mark it as nonviable so |
4641 | * that a new flow can be created. |
4642 | */ |
4643 | LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) { |
4644 | LIST_REMOVE(fed, fed_link); |
4645 | ASSERT(fsw->fsw_agent_session != NULL); |
4646 | |
4647 | /* if flow is closed early */ |
4648 | if (fed->fed_want_clonotify) { |
			necp_client_early_close(fed->fed_uuid);
4650 | } |
4651 | |
4652 | /* if nonviable, unassign nexus attributes */ |
4653 | if (fed->fed_want_nonviable) { |
			(void) netagent_assign_nexus(fsw->fsw_agent_session,
			    fed->fed_uuid, NULL, 0);
4656 | } |
4657 | |
4658 | flow_entry_dead_free(fed); |
4659 | ++nonviable; |
4660 | } |
4661 | ASSERT(LIST_EMPTY(&fed_head)); |
4662 | |
4663 | return nonviable; |
4664 | } |
4665 | |
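/*
 * Walk the linger list: send a RST for any TCP flow that still needs one,
 * free entries whose linger period has expired, and put the rest back at
 * the head of the list.  Returns the number of entries freed; *abort is
 * set to the number of RSTs generated.
 */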
4666 | static uint32_t |
4667 | fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort) |
4668 | { |
4669 | struct flow_entry_linger_head linger_head = |
4670 | TAILQ_HEAD_INITIALIZER(linger_head); |
4671 | struct flow_entry *fe, *tfe; |
4672 | uint64_t now = _net_uptime; |
4673 | uint32_t i = 0, cnt = 0, freed = 0; |
4674 | |
4675 | ASSERT(fsw->fsw_ifp != NULL); |
4676 | ASSERT(abort != NULL); |
4677 | *abort = 0; |
4678 | |
4679 | /* |
4680 | * We don't want to contend with the datapath, so move |
4681 | * everything that's in the linger list into a local list. |
4682 | * This allows us to generate RSTs or free the flow entry |
4683 | * outside the lock. Any remaining flow entry in the local |
4684 | * list will get re-added back to the head of the linger |
4685 | * list, in front of any new ones added since then. |
4686 | */ |
	lck_mtx_lock(&fsw->fsw_linger_lock);
4688 | TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link); |
4689 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head)); |
4690 | cnt = fsw->fsw_linger_cnt; |
4691 | fsw->fsw_linger_cnt = 0; |
	lck_mtx_unlock(&fsw->fsw_linger_lock);
4693 | |
4694 | TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) { |
4695 | ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); |
4696 | ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); |
4697 | ASSERT(fe->fe_flags & FLOWENTF_LINGERING); |
4698 | |
4699 | /* |
4700 | * See if this is a TCP flow that needs to generate |
4701 | * a RST to the remote peer (if not already). |
4702 | */ |
4703 | if (flow_track_tcp_want_abort(fe)) { |
4704 | VERIFY(fe->fe_flags & FLOWENTF_ABORTED); |
4705 | ASSERT(!uuid_is_null(fe->fe_uuid)); |
4706 | flow_track_abort_tcp(fe, NULL, NULL); |
4707 | (*abort)++; |
4708 | SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); |
4709 | SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx " |
4710 | "flags 0x%b [RST]" , fe_as_string(fe, dbgbuf, |
4711 | sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags, |
4712 | FLOWENTF_BITS); |
4713 | } |
4714 | |
4715 | /* |
4716 | * If flow has expired, remove from list and free; |
4717 | * otherwise leave it around in the linger list. |
4718 | */ |
4719 | if (fe->fe_linger_expire <= now) { |
4720 | freed++; |
			fsw_linger_remove_internal(&linger_head, fe);
4722 | fe = NULL; |
4723 | } |
4724 | ++i; |
4725 | } |
4726 | VERIFY(i == cnt && cnt >= freed); |
4727 | |
4728 | /* |
4729 | * Add any remaining ones back into the linger list. |
4730 | */ |
	lck_mtx_lock(&fsw->fsw_linger_lock);
4732 | if (!TAILQ_EMPTY(&linger_head)) { |
4733 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt); |
4734 | TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link); |
4735 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head)); |
4736 | TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link); |
4737 | fsw->fsw_linger_cnt += (cnt - freed); |
4738 | } |
4739 | ASSERT(TAILQ_EMPTY(&linger_head)); |
	lck_mtx_unlock(&fsw->fsw_linger_lock);
4741 | |
4742 | return freed; |
4743 | } |
4744 | |
4745 | __attribute__((always_inline)) |
4746 | static inline void |
4747 | fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph) |
4748 | { |
4749 | switch (__packet_get_traffic_class(ph)) { |
4750 | case PKT_TC_BE: |
4751 | ifp->if_tc.ifi_ibepackets++; |
4752 | ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
4753 | break; |
4754 | case PKT_TC_BK: |
4755 | ifp->if_tc.ifi_ibkpackets++; |
4756 | ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
4757 | break; |
4758 | case PKT_TC_VI: |
4759 | ifp->if_tc.ifi_ivipackets++; |
4760 | ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
4761 | break; |
4762 | case PKT_TC_VO: |
4763 | ifp->if_tc.ifi_ivopackets++; |
4764 | ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
4765 | break; |
4766 | default: |
4767 | break; |
4768 | } |
4769 | } |
4770 | |
4771 | __attribute__((always_inline)) |
4772 | static inline void |
4773 | fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc, |
4774 | uint32_t cnt, uint32_t len) |
4775 | { |
4776 | switch (svc) { |
4777 | case PKT_TC_BE: |
4778 | ifp->if_tc.ifi_obepackets += cnt; |
4779 | ifp->if_tc.ifi_obebytes += len; |
4780 | break; |
4781 | case PKT_TC_BK: |
4782 | ifp->if_tc.ifi_obkpackets += cnt; |
4783 | ifp->if_tc.ifi_obkbytes += len; |
4784 | break; |
4785 | case PKT_TC_VI: |
4786 | ifp->if_tc.ifi_ovipackets += cnt; |
4787 | ifp->if_tc.ifi_ovibytes += len; |
4788 | break; |
4789 | case PKT_TC_VO: |
4790 | ifp->if_tc.ifi_ovopackets += cnt; |
4791 | ifp->if_tc.ifi_ovobytes += len; |
4792 | break; |
4793 | default: |
4794 | break; |
4795 | } |
4796 | } |
4797 | |