| 1 | /* |
| 2 | * Copyright (c) 2015-2023 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | |
| 29 | /* |
| 30 | * Copyright (C) 2013-2014 Universita` di Pisa. All rights reserved. |
| 31 | * |
| 32 | * Redistribution and use in source and binary forms, with or without |
| 33 | * modification, are permitted provided that the following conditions |
| 34 | * are met: |
| 35 | * 1. Redistributions of source code must retain the above copyright |
| 36 | * notice, this list of conditions and the following disclaimer. |
| 37 | * 2. Redistributions in binary form must reproduce the above copyright |
| 38 | * notice, this list of conditions and the following disclaimer in the |
| 39 | * documentation and/or other materials provided with the distribution. |
| 40 | * |
| 41 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND |
| 42 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 43 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 44 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE |
| 45 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 46 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 47 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 48 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 49 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 50 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 51 | * SUCH DAMAGE. |
| 52 | */ |
| 53 | |
| 54 | /* |
| 55 | * BSD LICENSE |
| 56 | * |
| 57 | * Copyright(c) 2015 NEC Europe Ltd. All rights reserved. |
| 58 | * All rights reserved. |
| 59 | * |
| 60 | * Redistribution and use in source and binary forms, with or without |
| 61 | * modification, are permitted provided that the following conditions |
| 62 | * are met: |
| 63 | * |
| 64 | * * Redistributions of source code must retain the above copyright |
| 65 | * notice, this list of conditions and the following disclaimer. |
| 66 | * * Redistributions in binary form must reproduce the above copyright |
| 67 | * notice, this list of conditions and the following disclaimer in |
| 68 | * the documentation and/or other materials provided with the |
| 69 | * distribution. |
| 70 | * * Neither the name of NEC Europe Ltd. nor the names of |
| 71 | * its contributors may be used to endorse or promote products derived |
| 72 | * from this software without specific prior written permission. |
| 73 | * |
| 74 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 75 | * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 76 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 77 | * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 78 | * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 79 | * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 80 | * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 81 | * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 82 | * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 83 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 84 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 85 | */ |
| 86 | |
| 87 | #include <skywalk/os_skywalk_private.h> |
| 88 | #include <skywalk/nexus/flowswitch/nx_flowswitch.h> |
| 89 | #include <skywalk/nexus/flowswitch/fsw_var.h> |
| 90 | #include <skywalk/nexus/netif/nx_netif.h> |
| 91 | #include <skywalk/nexus/netif/nx_netif_compat.h> |
| 92 | #include <kern/sched_prim.h> |
| 93 | #include <sys/kdebug.h> |
| 94 | #include <sys/sdt.h> |
| 95 | #include <net/bpf.h> |
| 96 | #include <net/if_ports_used.h> |
| 97 | #include <net/pktap.h> |
| 98 | #include <net/pktsched/pktsched_netem.h> |
| 99 | #include <netinet/tcp.h> |
| 100 | #include <netinet/udp.h> |
| 101 | #include <netinet/ip.h> |
| 102 | #include <netinet/ip6.h> |
| 103 | #include <netinet/in_var.h> |
| 104 | |
| 105 | extern kern_return_t thread_terminate(thread_t); |
| 106 | |
| 107 | #define FSW_ZONE_MAX 256 |
| 108 | #define FSW_ZONE_NAME "skywalk.nx.fsw" |
| 109 | |
| 110 | static uint64_t fsw_reap_last __sk_aligned(8); |
| 111 | static uint64_t fsw_want_purge __sk_aligned(8); |
| 112 | |
| 113 | #define NX_FSW_FE_TABLESZ 256 /* some power of 2 */ |
| 114 | static uint32_t fsw_fe_table_size = NX_FSW_FE_TABLESZ; |
| 115 | |
#define NX_FSW_FOB_HASHSZ 31 /* a Mersenne prime */
| 117 | static uint32_t fsw_flow_owner_buckets = NX_FSW_FOB_HASHSZ; |
| 118 | |
| 119 | #define NX_FSW_FRB_HASHSZ 128 /* some power of 2 */ |
| 120 | static uint32_t fsw_flow_route_buckets = NX_FSW_FRB_HASHSZ; |
| 121 | |
#define NX_FSW_FRIB_HASHSZ 13 /* a small prime */
| 123 | static uint32_t fsw_flow_route_id_buckets = NX_FSW_FRIB_HASHSZ; |
| 124 | |
| 125 | #define NX_FSW_FLOW_REAP_INTERVAL 1 /* seconds */ |
| 126 | static uint32_t fsw_flow_reap_interval = NX_FSW_FLOW_REAP_INTERVAL; |
| 127 | |
| 128 | #define NX_FSW_FLOW_PURGE_THRES 0 /* purge every N reaps (0 = disable) */ |
| 129 | static uint32_t fsw_flow_purge_thresh = NX_FSW_FLOW_PURGE_THRES; |
| 130 | |
| 131 | #define FSW_REAP_IVAL (MAX(1, fsw_flow_reap_interval)) |
| 132 | #define FSW_REAP_SK_THRES (FSW_REAP_IVAL << 5) |
| 133 | #define FSW_REAP_IF_THRES (FSW_REAP_IVAL << 5) |
| 134 | #define FSW_DRAIN_CH_THRES (FSW_REAP_IVAL << 5) |
| 135 | #define FSW_IFSTATS_THRES 1 |
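/*
 * Worked example, assuming the default fsw_flow_reap_interval of 1 second:
 * FSW_REAP_IVAL is MAX(1, 1) == 1, so FSW_REAP_SK_THRES, FSW_REAP_IF_THRES
 * and FSW_DRAIN_CH_THRES each evaluate to (1 << 5) == 32, i.e. roughly half
 * a minute's worth of reap intervals before the corresponding action is
 * considered.
 */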
| 136 | |
#define NX_FSW_CHANNEL_REAP_THRES 1000 /* threshold (bytes/sec) for reaping */
| 138 | uint64_t fsw_channel_reap_thresh = NX_FSW_CHANNEL_REAP_THRES; |
| 139 | |
| 140 | #define RX_BUFLET_BATCH_COUNT 64 /* max batch size for buflet allocation */ |
| 141 | |
| 142 | uint32_t fsw_rx_batch = NX_FSW_RXBATCH; /* # of packets per batch (RX) */ |
| 143 | uint32_t fsw_tx_batch = NX_FSW_TXBATCH; /* # of packets per batch (TX) */ |
| 144 | uint32_t fsw_gso_batch = 8; |
#if (DEVELOPMENT || DEBUG)
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_batch,
CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_rx_batch, 0,
"flowswitch Rx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, tx_batch,
CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_tx_batch, 0,
"flowswitch Tx batch size");
SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_batch,
CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_gso_batch, 0,
"flowswitch GSO batch size");
SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, reap_throughput,
CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_channel_reap_thresh,
"flowswitch channel reap threshold throughput (bytes/sec)");
#endif /* DEVELOPMENT || DEBUG */
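/*
 * On a DEVELOPMENT/DEBUG kernel the batch-size and reap-throughput knobs
 * above are exposed as read-write sysctls and could be tuned at runtime,
 * e.g. (illustrative values only):
 *
 *   sysctl -w kern.skywalk.flowswitch.rx_batch=64
 *   sysctl -w kern.skywalk.flowswitch.reap_throughput=2000
 */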
| 159 | |
| 160 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp, |
| 161 | CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp, 0, |
| 162 | "flowswitch RX aggregation for tcp flows (enable/disable)" ); |
| 163 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, rx_agg_tcp_host, |
| 164 | CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_rx_agg_tcp_host, 0, |
| 165 | "flowswitch RX aggregation for tcp kernel path (0/1/2 (off/on/auto))" ); |
| 166 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, gso_mtu, |
| 167 | CTLFLAG_RW | CTLFLAG_LOCKED, &sk_fsw_gso_mtu, 0, |
| 168 | "flowswitch GSO for tcp flows (mtu > 0: enable, mtu == 0: disable)" ); |
| 169 | |
| 170 | /* |
| 171 | * IP reassembly |
| 172 | * The "kern.skywalk.flowswitch.ip_reass" sysctl can be used to force |
| 173 | * enable/disable the reassembly routine regardless of whether the |
| 174 | * transport netagent is enabled or not. |
| 175 | * |
| 176 | * 'fsw_ip_reass' is a tri-state: |
| 177 | * 0 means force IP reassembly off |
| 178 | * 1 means force IP reassembly on |
| 179 | * 2 means don't force the value, use what's appropriate for this flowswitch |
| 180 | */ |
| 181 | #define FSW_IP_REASS_FORCE_OFF 0 |
| 182 | #define FSW_IP_REASS_FORCE_ON 1 |
| 183 | #define FSW_IP_REASS_NO_FORCE 2 |
| 184 | |
| 185 | uint32_t fsw_ip_reass = FSW_IP_REASS_NO_FORCE; |
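/*
 * Illustrative usage of the tri-state described above (commands assumed to
 * be issued from a shell with the appropriate privileges):
 *
 *   sysctl -w kern.skywalk.flowswitch.ip_reass=1   # force reassembly on
 *   sysctl -w kern.skywalk.flowswitch.ip_reass=2   # don't force (default)
 *
 * The handler below rejects any value greater than FSW_IP_REASS_NO_FORCE.
 */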
| 186 | |
| 187 | static int |
| 188 | fsw_ip_reass_sysctl SYSCTL_HANDLER_ARGS |
| 189 | { |
| 190 | #pragma unused(oidp, arg1, arg2) |
| 191 | unsigned int new_value; |
| 192 | int changed; |
| 193 | int error; |
| 194 | |
error = sysctl_io_number(req, fsw_ip_reass, sizeof(fsw_ip_reass),
&new_value, &changed);
| 197 | if (error == 0 && changed != 0) { |
| 198 | if (new_value > FSW_IP_REASS_NO_FORCE) { |
| 199 | return EINVAL; |
| 200 | } |
| 201 | fsw_ip_reass = new_value; |
| 202 | } |
| 203 | return error; |
| 204 | } |
| 205 | |
| 206 | SYSCTL_PROC(_kern_skywalk_flowswitch, OID_AUTO, ip_reass, |
| 207 | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED, |
| 208 | 0, 0, fsw_ip_reass_sysctl, "IU" , |
| 209 | "adjust flowswitch IP reassembly" ); |
| 210 | |
| 211 | #if (DEVELOPMENT || DEBUG) |
| 212 | static uint64_t _fsw_inject_error = 0; |
| 213 | #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) \ |
| 214 | _SK_INJECT_ERROR(_fsw_inject_error, _en, _ev, _ec, \ |
| 215 | &FSW_STATS_VAL(_FSW_STATS_ERROR_INJECTIONS), _f, __VA_ARGS__) |
| 216 | |
| 217 | #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { \ |
| 218 | if (__improbable(((_fsw_inject_error) & (1ULL << (_en))) != 0)) { \ |
| 219 | SK_DF(SK_VERB_ERROR_INJECT, "injecting error %d", (_en));\ |
| 220 | if ((_f) != NULL) \ |
| 221 | (_f)(__VA_ARGS__); \ |
| 222 | } \ |
| 223 | } while (0) |
| 224 | |
| 225 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_owner_buckets, |
| 226 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_owner_buckets, 0, "" ); |
| 227 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, fe_table_size, |
| 228 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_fe_table_size, 0, "" ); |
| 229 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_buckets, |
| 230 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_route_buckets, 0, "" ); |
| 231 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, |
| 232 | flow_route_id_buckets, CTLFLAG_RW | CTLFLAG_LOCKED, |
| 233 | &fsw_flow_route_id_buckets, 0, "" ); |
| 234 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_reap_interval, |
| 235 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_reap_interval, 0, "" ); |
| 236 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_purge_thresh, |
| 237 | CTLFLAG_RW | CTLFLAG_LOCKED, &fsw_flow_purge_thresh, 0, "" ); |
| 238 | SYSCTL_QUAD(_kern_skywalk_flowswitch, OID_AUTO, fsw_inject_error, |
| 239 | CTLFLAG_RW | CTLFLAG_LOCKED, &_fsw_inject_error, "" ); |
| 240 | #else |
| 241 | #define _FSW_INJECT_ERROR(_en, _ev, _ec, _f, ...) do { } while (0) |
| 242 | #define _FSW_INJECT_ERROR_SET(_en, _f, ...) do { } while (0) |
| 243 | #endif /* !DEVELOPMENT && !DEBUG */ |
| 244 | |
| 245 | static void fsw_linger_remove_internal(struct flow_entry_linger_head *, |
| 246 | struct flow_entry *); |
| 247 | static void fsw_reap_thread_func(void *, wait_result_t); |
| 248 | static void fsw_reap_thread_cont(void *, wait_result_t); |
| 249 | static void fsw_purge_cache(struct nx_flowswitch *, boolean_t); |
| 250 | static void fsw_drain_channels(struct nx_flowswitch *, uint64_t, boolean_t); |
| 251 | static uint32_t fsw_process_deferred(struct nx_flowswitch *); |
| 252 | static uint32_t fsw_process_linger(struct nx_flowswitch *, uint32_t *); |
| 253 | |
| 254 | static int copy_packet_from_dev(struct nx_flowswitch *, struct __kern_packet *, |
| 255 | struct __kern_packet *); |
| 256 | |
| 257 | static void fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *, kern_packet_t); |
| 258 | static void fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *, uint32_t, |
| 259 | uint32_t, uint32_t); |
| 260 | |
| 261 | static int __fsw_dp_inited = 0; |
| 262 | |
| 263 | int |
| 264 | fsw_dp_init(void) |
| 265 | { |
| 266 | _CASSERT(FSW_VP_DEV == 0); |
| 267 | _CASSERT(FSW_VP_HOST == 1); |
| 268 | _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < FSW_VP_USER_MIN); |
| 269 | _CASSERT((FSW_VP_HOST + FSW_VP_DEV) < NEXUS_PORT_FLOW_SWITCH_CLIENT); |
| 270 | |
| 271 | ASSERT(!__fsw_dp_inited); |
| 272 | |
| 273 | flow_mgr_init(); |
| 274 | flow_init(); |
| 275 | |
| 276 | __fsw_dp_inited = 1; |
| 277 | |
| 278 | return 0; |
| 279 | } |
| 280 | |
| 281 | void |
| 282 | fsw_dp_uninit(void) |
| 283 | { |
| 284 | if (__fsw_dp_inited) { |
| 285 | flow_fini(); |
| 286 | flow_mgr_fini(); |
| 287 | |
| 288 | __fsw_dp_inited = 0; |
| 289 | } |
| 290 | } |
| 291 | |
| 292 | static void |
| 293 | dp_free_pktq(struct nx_flowswitch *fsw __sk_unused, struct pktq *pktq) |
| 294 | { |
| 295 | pp_free_pktq(pktq); |
| 296 | } |
| 297 | |
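/*
 * Note: dp_drop_pktq() below is a statement macro rather than a function
 * because it contains an early "return" when the queue is empty; it can
 * therefore only be used inside functions that return void, as its
 * callers in this file do.
 */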
| 298 | #define dp_drop_pktq(fsw, pktq) do { \ |
| 299 | uint32_t _len = KPKTQ_LEN(pktq); \ |
| 300 | if (KPKTQ_EMPTY(pktq)) { \ |
| 301 | ASSERT(_len == 0); \ |
| 302 | return; \ |
| 303 | } \ |
| 304 | SK_DF(SK_VERB_FSW_DP | SK_VERB_DROP, "drop %d packets", _len); \ |
| 305 | FSW_STATS_ADD(FSW_STATS_DROP, _len); \ |
| 306 | DTRACE_SKYWALK1(fsw__dp__drop, int, _len); \ |
| 307 | dp_free_pktq(fsw, pktq); \ |
| 308 | } while (0) |
| 309 | |
| 310 | SK_NO_INLINE_ATTRIBUTE |
| 311 | void |
| 312 | fsw_snoop(struct nx_flowswitch *fsw, struct flow_entry *fe, bool input) |
| 313 | { |
| 314 | pid_t pid; |
| 315 | char proc_name_buf[FLOW_PROCESS_NAME_LENGTH]; |
| 316 | char *proc_name = NULL; |
| 317 | pid_t epid; |
| 318 | char eproc_name_buf[FLOW_PROCESS_NAME_LENGTH]; |
| 319 | char *eproc_name = NULL; |
| 320 | sa_family_t af; |
| 321 | bool tap_early = false; |
| 322 | struct __kern_packet *pkt; |
| 323 | |
| 324 | ASSERT(fe != NULL); |
| 325 | ASSERT(fsw->fsw_ifp != NULL); |
| 326 | |
| 327 | if (fe->fe_nx_port == FSW_VP_HOST) { |
| 328 | /* allow packets to be tapped before aggregation happens */ |
| 329 | tap_early = (input && fe->fe_key.fk_proto == IPPROTO_TCP); |
| 330 | if (!tap_early) { |
| 331 | /* all other traffic will be tapped in the dlil input path */ |
| 332 | return; |
| 333 | } |
| 334 | } |
| 335 | if (fe->fe_key.fk_ipver == IPVERSION) { |
| 336 | af = AF_INET; |
| 337 | } else if (fe->fe_key.fk_ipver == IPV6_VERSION) { |
| 338 | af = AF_INET6; |
| 339 | } else { |
| 340 | return; |
| 341 | } |
| 342 | |
| 343 | pid = fe->fe_pid; |
| 344 | if (fe->fe_proc_name[0] != '\0') { |
(void) strlcpy(proc_name_buf, fe->fe_proc_name,
sizeof(proc_name_buf));
| 347 | proc_name = proc_name_buf; |
| 348 | } |
| 349 | epid = fe->fe_epid; |
| 350 | if (fe->fe_eproc_name[0] != '\0') { |
(void) strlcpy(eproc_name_buf, fe->fe_eproc_name,
sizeof(eproc_name_buf));
| 353 | eproc_name = eproc_name_buf; |
| 354 | } |
| 355 | if (input) { |
| 356 | KPKTQ_FOREACH(pkt, &fe->fe_rx_pktq) { |
| 357 | pktap_input_packet(fsw->fsw_ifp, af, |
| 358 | fsw->fsw_ifp_dlt, pid, proc_name, epid, |
| 359 | eproc_name, SK_PKT2PH(pkt), NULL, 0, |
| 360 | IPPROTO_TCP, fe->fe_flowid, |
tap_early ? PTH_FLAG_SOCKET : PTH_FLAG_NEXUS_CHAN);
| 362 | } |
| 363 | } else { |
| 364 | KPKTQ_FOREACH(pkt, &fe->fe_tx_pktq) { |
| 365 | pktap_output_packet(fsw->fsw_ifp, af, |
| 366 | fsw->fsw_ifp_dlt, pid, proc_name, epid, |
| 367 | eproc_name, SK_PKT2PH(pkt), NULL, 0, |
| 368 | 0, 0, PTH_FLAG_NEXUS_CHAN); |
| 369 | } |
| 370 | } |
| 371 | } |
| 372 | |
| 373 | #if (DEVELOPMENT || DEBUG) |
| 374 | static void |
| 375 | _fsw_error35_handler(int step, struct flow_route *fr, struct __kern_packet *pkt, |
| 376 | int *ret) |
| 377 | { |
| 378 | static boolean_t _err35_flag_modified = FALSE; |
| 379 | |
| 380 | switch (step) { |
| 381 | case 1: |
| 382 | if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) == |
| 383 | (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) { |
| 384 | fr->fr_flags &= ~FLOWRTF_RESOLVED; |
| 385 | _err35_flag_modified = TRUE; |
| 386 | } |
| 387 | break; |
| 388 | |
| 389 | case 2: |
| 390 | if (!_err35_flag_modified) { |
| 391 | return; |
| 392 | } |
| 393 | if (pkt->pkt_pflags & PKT_F_MBUF_DATA) { |
| 394 | m_freem(pkt->pkt_mbuf); |
| 395 | pkt->pkt_pflags &= ~PKT_F_MBUF_DATA; |
| 396 | pkt->pkt_mbuf = NULL; |
| 397 | } |
| 398 | *ret = EJUSTRETURN; |
| 399 | fr->fr_flags |= FLOWRTF_RESOLVED; |
| 400 | _err35_flag_modified = FALSE; |
| 401 | break; |
| 402 | |
| 403 | default: |
| 404 | VERIFY(0); |
| 405 | /* not reached */ |
| 406 | } |
| 407 | } |
| 408 | |
| 409 | static void |
| 410 | _fsw_error36_handler(int step, struct flow_route *fr, int *ret) |
| 411 | { |
| 412 | static boolean_t _err36_flag_modified = FALSE; |
| 413 | |
| 414 | switch (step) { |
| 415 | case 1: |
| 416 | if ((fr->fr_flags & (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) == |
| 417 | (FLOWRTF_RESOLVED | FLOWRTF_HAS_LLINFO)) { |
| 418 | fr->fr_flags &= ~FLOWRTF_RESOLVED; |
| 419 | _err36_flag_modified = TRUE; |
| 420 | } |
| 421 | break; |
| 422 | |
| 423 | case 2: |
| 424 | if (!_err36_flag_modified) { |
| 425 | return; |
| 426 | } |
| 427 | *ret = ENETUNREACH; |
| 428 | fr->fr_flags |= FLOWRTF_RESOLVED; |
| 429 | _err36_flag_modified = FALSE; |
| 430 | break; |
| 431 | |
| 432 | default: |
| 433 | VERIFY(0); |
| 434 | /* not reached */ |
| 435 | } |
| 436 | } |
| 437 | #else /* !DEVELOPMENT && !DEBUG */ |
| 438 | #define _fsw_error35_handler(...) |
| 439 | #define _fsw_error36_handler(...) |
| 440 | #endif /* DEVELOPMENT || DEBUG */ |
| 441 | |
| 442 | /* |
| 443 | * Check if the source packet content can fit into the destination |
| 444 | * ring's packet. Returns TRUE if the source packet can fit. |
| 445 | * Note: Failures could be caused by misconfigured packet pool sizes, |
* missing packet size check against MTU, or if the source packet is from
| 447 | * a compat netif and the attached mbuf is larger than MTU due to LRO. |
| 448 | */ |
| 449 | static inline boolean_t |
| 450 | validate_pkt_len(struct __kern_packet *spkt, kern_packet_t dph, |
| 451 | uint32_t skip_l2hlen, uint32_t l2hlen, uint16_t headroom, |
| 452 | uint32_t *copy_len) |
| 453 | { |
| 454 | uint32_t tlen = 0; |
| 455 | uint32_t splen = spkt->pkt_length - skip_l2hlen; |
| 456 | |
| 457 | if (l2hlen != 0) { |
| 458 | VERIFY(skip_l2hlen == 0); |
| 459 | tlen += l2hlen; |
| 460 | } else if ((spkt->pkt_link_flags & PKT_LINKF_ETHFCS) != 0) { |
| 461 | splen -= ETHER_CRC_LEN; |
| 462 | } |
| 463 | |
| 464 | tlen += splen; |
| 465 | *copy_len = splen; |
| 466 | |
return tlen <= ((__packet_get_buflet_count(dph) *
| 468 | PP_BUF_SIZE_DEF(SK_PTR_ADDR_KPKT(dph)->pkt_qum.qum_pp)) - |
| 469 | headroom); |
| 470 | } |
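/*
 * Worked example with assumed values: with two buflets of
 * PP_BUF_SIZE_DEF() == 2048 bytes each and zero headroom, the destination
 * can hold 4096 bytes, so any source packet whose l2hlen plus payload
 * exceeds 4096 fails this check and is counted by the caller under
 * FSW_STATS_RX_COPY_BAD_LEN.
 */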
| 471 | |
| 472 | #if SK_LOG |
| 473 | /* Hoisted out of line to reduce kernel stack footprint */ |
| 474 | SK_LOG_ATTRIBUTE |
| 475 | static void |
| 476 | copy_packet_from_dev_log(struct __kern_packet *spkt, |
| 477 | struct __kern_packet *dpkt, struct proc *p) |
| 478 | { |
| 479 | uint64_t logflags = ((SK_VERB_FSW | SK_VERB_RX) | |
| 480 | ((spkt->pkt_pflags & PKT_F_MBUF_DATA) ? |
| 481 | SK_VERB_COPY_MBUF : SK_VERB_COPY)); |
| 482 | char *daddr; |
| 483 | MD_BUFLET_ADDR_ABS(dpkt, daddr); |
| 484 | SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u l2 %u" , |
| 485 | sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length, |
| 486 | dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom, |
| 487 | (uint32_t)dpkt->pkt_l2_len); |
| 488 | SK_DF(logflags | SK_VERB_DUMP, "%s" , |
| 489 | sk_dump("buf" , daddr, dpkt->pkt_length, 128, NULL, 0)); |
| 490 | } |
| 491 | #else |
| 492 | #define copy_packet_from_dev_log(...) |
| 493 | #endif /* SK_LOG */ |
| 494 | |
| 495 | |
| 496 | static inline int |
| 497 | copy_packet_from_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
| 498 | struct __kern_packet *dpkt) |
| 499 | { |
| 500 | /* |
* The source and destination nexus don't share the packet pool.
* The sync operation here is to:
| 503 | * - alloc packet for the rx(dst) ring |
| 504 | * - copy data/metadata from src packet to dst packet |
| 505 | * - attach alloc'd packet to rx(dst) ring |
| 506 | */ |
| 507 | kern_packet_t dph = SK_PTR_ENCODE(dpkt, |
| 508 | METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt)); |
| 509 | kern_packet_t sph = SK_PTR_ENCODE(spkt, METADATA_TYPE(spkt), |
| 510 | METADATA_SUBTYPE(spkt)); |
| 511 | boolean_t do_cksum_rx; |
| 512 | uint16_t skip_l2h_len = spkt->pkt_l2_len; |
| 513 | uint16_t iphlen; |
| 514 | uint32_t dlen; |
| 515 | int err; |
| 516 | |
| 517 | if (__improbable(!validate_pkt_len(spkt, dph, skip_l2h_len, 0, 0, |
| 518 | &dlen))) { |
| 519 | SK_ERR("bufcnt %d, bufsz %d" , __packet_get_buflet_count(dph), |
| 520 | PP_BUF_SIZE_DEF(dpkt->pkt_qum.qum_pp)); |
| 521 | FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN); |
| 522 | return EINVAL; |
| 523 | } |
| 524 | |
| 525 | /* Copy packet metadata */ |
| 526 | _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum); |
| 527 | _PKT_COPY(spkt, dpkt); |
| 528 | ASSERT(!(dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) || |
| 529 | PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp)); |
| 530 | ASSERT(dpkt->pkt_mbuf == NULL); |
| 531 | |
| 532 | dpkt->pkt_headroom = 0; |
| 533 | dpkt->pkt_l2_len = 0; |
| 534 | |
| 535 | /* don't include IP header from partial sum */ |
| 536 | if (__probable((spkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0)) { |
| 537 | iphlen = spkt->pkt_flow_ip_hlen; |
| 538 | do_cksum_rx = sk_cksum_rx; |
| 539 | } else { |
| 540 | iphlen = 0; |
| 541 | do_cksum_rx = FALSE; |
| 542 | } |
| 543 | |
| 544 | /* Copy packet payload */ |
| 545 | if ((spkt->pkt_pflags & PKT_F_MBUF_DATA) && |
| 546 | (spkt->pkt_pflags & PKT_F_TRUNCATED)) { |
| 547 | FSW_STATS_INC(FSW_STATS_RX_COPY_MBUF2PKT); |
| 548 | /* |
* Source packet has truncated contents (just enough for
* the classifier) of an mbuf from the compat driver; copy
* the entire mbuf contents to the destination packet.
| 552 | */ |
| 553 | m_adj(spkt->pkt_mbuf, skip_l2h_len); |
| 554 | ASSERT((uint32_t)m_pktlen(spkt->pkt_mbuf) >= dlen); |
| 555 | fsw->fsw_pkt_copy_from_mbuf(NR_RX, dph, 0, |
| 556 | spkt->pkt_mbuf, 0, dlen, do_cksum_rx, iphlen); |
| 557 | } else { |
| 558 | FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2PKT); |
| 559 | /* |
| 560 | * Source packet has full contents, either from an mbuf |
| 561 | * that came up from the compat driver, or because it |
| 562 | * originated on the native driver; copy to destination. |
| 563 | */ |
| 564 | fsw->fsw_pkt_copy_from_pkt(NR_RX, dph, 0, sph, |
| 565 | (spkt->pkt_headroom + spkt->pkt_l2_len), dlen, do_cksum_rx, |
| 566 | iphlen, 0, FALSE); |
| 567 | } |
| 568 | |
| 569 | #if DEBUG || DEVELOPMENT |
| 570 | if (__improbable(pkt_trailers > 0)) { |
| 571 | dlen += pkt_add_trailers(dph, dlen, iphlen); |
| 572 | } |
| 573 | #endif /* DEBUG || DEVELOPMENT */ |
| 574 | |
| 575 | /* Finalize and attach packet to Rx ring */ |
| 576 | METADATA_ADJUST_LEN(dpkt, 0, 0); |
err = __packet_finalize(dph);
| 578 | VERIFY(err == 0); |
| 579 | |
| 580 | copy_packet_from_dev_log(spkt, dpkt, kernproc); |
| 581 | |
| 582 | if (spkt->pkt_pflags & PKT_F_MBUF_DATA) { |
ifp_inc_traffic_class_in(fsw->fsw_ifp, spkt->pkt_mbuf);
mbuf_free(spkt->pkt_mbuf);
| 585 | KPKT_CLEAR_MBUF_DATA(spkt); |
| 586 | } else { |
| 587 | fsw_ifp_inc_traffic_class_in_pkt(fsw->fsw_ifp, dph); |
| 588 | } |
| 589 | |
| 590 | if (__probable(do_cksum_rx != 0)) { |
| 591 | FSW_STATS_INC(FSW_STATS_RX_COPY_SUM); |
| 592 | } |
| 593 | |
| 594 | return 0; |
| 595 | } |
| 596 | |
| 597 | SK_NO_INLINE_ATTRIBUTE |
| 598 | static struct __kern_packet * |
| 599 | rx_process_ip_frag(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
| 600 | { |
| 601 | char *pkt_buf; |
| 602 | void *l3_hdr; |
| 603 | uint16_t nfrags, tlen; |
| 604 | int err = 0; |
| 605 | |
| 606 | switch (fsw_ip_reass) { |
| 607 | case FSW_IP_REASS_FORCE_OFF: |
| 608 | return pkt; |
| 609 | case FSW_IP_REASS_FORCE_ON: |
| 610 | break; |
| 611 | default: |
| 612 | if (!FSW_NETAGENT_ENABLED(fsw) || |
flow_mgr_get_num_flows(fsw->fsw_flow_mgr) == 0) {
| 614 | return pkt; |
| 615 | } |
| 616 | break; |
| 617 | } |
| 618 | |
| 619 | MD_BUFLET_ADDR_ABS(pkt, pkt_buf); |
| 620 | l3_hdr = pkt_buf + pkt->pkt_headroom + pkt->pkt_l2_len; |
| 621 | |
| 622 | ASSERT(fsw->fsw_ipfm != NULL); |
| 623 | ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0); |
| 624 | |
| 625 | if (pkt->pkt_flow_ip_ver == IPVERSION) { |
err = fsw_ip_frag_reass_v4(fsw->fsw_ipfm, &pkt,
(struct ip *)l3_hdr, &nfrags, &tlen);
} else {
ASSERT(pkt->pkt_flow_ip_ver == IPV6_VERSION);
/* we only handle frag header immediately after v6 header */
err = fsw_ip_frag_reass_v6(fsw->fsw_ipfm, &pkt,
(struct ip6_hdr *)l3_hdr,
(struct ip6_frag *)((uintptr_t)l3_hdr + sizeof(struct ip6_hdr)),
&nfrags, &tlen);
| 635 | } |
| 636 | if (__improbable(err != 0)) { |
| 637 | /* if we get a bad fragment, free it */ |
| 638 | pp_free_packet_single(pkt); |
| 639 | pkt = NULL; |
| 640 | } else { |
| 641 | ASSERT(!((pkt != NULL) ^ (nfrags > 0))); |
| 642 | } |
| 643 | |
| 644 | return pkt; |
| 645 | } |
| 646 | |
| 647 | SK_NO_INLINE_ATTRIBUTE |
| 648 | static void |
| 649 | rx_prepare_packet_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
| 650 | { |
| 651 | ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA); |
| 652 | uint32_t mlen = (uint32_t)m_pktlen(pkt->pkt_mbuf); |
| 653 | kern_packet_t ph = SK_PTR_ENCODE(pkt, |
| 654 | METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt)); |
| 655 | /* |
| 656 | * This is the case when the packet is coming in from |
| 657 | * compat-netif. This packet only has valid metadata |
| 658 | * and an attached mbuf. We need to copy enough data |
| 659 | * from the mbuf to the packet buffer for the |
| 660 | * classifier. Compat netif packet pool is configured |
| 661 | * with buffer size of NETIF_COMPAT_MAX_MBUF_DATA_COPY |
| 662 | * which is just enough to hold the protocol headers |
| 663 | * for the flowswitch classifier. |
| 664 | */ |
| 665 | |
| 666 | pkt->pkt_headroom = 0; |
| 667 | METADATA_ADJUST_LEN(pkt, 0, 0); |
| 668 | /* |
| 669 | * Copy the initial 128 bytes of the packet for |
| 670 | * classification. |
* Ethernet(14) + IPv6 header(40) +
* IPv6 fragment header(8) +
* TCP header with options(60).
| 674 | */ |
| 675 | fsw->fsw_pkt_copy_from_mbuf(NR_RX, ph, |
| 676 | pkt->pkt_headroom, pkt->pkt_mbuf, 0, |
| 677 | MIN(mlen, NETIF_COMPAT_MAX_MBUF_DATA_COPY), |
| 678 | FALSE, 0); |
| 679 | |
| 680 | int err = __packet_finalize_with_mbuf(pkt); |
| 681 | VERIFY(err == 0); |
| 682 | } |
| 683 | |
| 684 | static struct __kern_packet * |
| 685 | rx_prepare_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
| 686 | { |
| 687 | pkt->pkt_qum_qflags &= ~QUM_F_FLOW_CLASSIFIED; |
| 688 | |
| 689 | if (__improbable(pkt->pkt_pflags & PKT_F_MBUF_DATA)) { |
| 690 | rx_prepare_packet_mbuf(fsw, pkt); |
| 691 | } |
| 692 | |
| 693 | return pkt; |
| 694 | } |
| 695 | |
| 696 | static struct flow_entry * |
| 697 | lookup_flow_with_pkt(struct nx_flowswitch *fsw, struct __kern_packet *pkt, |
| 698 | bool input, struct flow_entry *prev_fe) |
| 699 | { |
| 700 | struct flow_key key __sk_aligned(16); |
| 701 | struct flow_entry *fe = NULL; |
| 702 | |
| 703 | ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED); |
flow_pkt2key(pkt, input, &key);
| 705 | |
| 706 | if (__probable(prev_fe != NULL && |
| 707 | prev_fe->fe_key.fk_mask == FKMASK_5TUPLE)) { |
| 708 | uint16_t saved_mask = key.fk_mask; |
| 709 | key.fk_mask = FKMASK_5TUPLE; |
if (flow_key_cmp_mask(&prev_fe->fe_key, &key, &fk_mask_5tuple) == 0) {
flow_entry_retain(prev_fe);
| 712 | fe = prev_fe; |
| 713 | } else { |
| 714 | key.fk_mask = saved_mask; |
| 715 | } |
| 716 | } |
| 717 | |
| 718 | top: |
| 719 | if (__improbable(fe == NULL)) { |
| 720 | fe = flow_mgr_find_fe_by_key(fsw->fsw_flow_mgr, &key); |
| 721 | } |
| 722 | |
| 723 | if (__improbable(fe != NULL && |
| 724 | (fe->fe_flags & (FLOWENTF_PARENT | FLOWENTF_CHILD)) != 0)) { |
| 725 | /* Rx */ |
| 726 | if (input) { |
| 727 | if (fe->fe_flags & FLOWENTF_PARENT) { |
| 728 | struct flow_entry *child_fe = rx_lookup_child_flow(fsw, fe, pkt); |
| 729 | if (child_fe != NULL) { |
flow_entry_release(&fe);
| 731 | fe = child_fe; |
| 732 | } |
| 733 | } else { |
| 734 | if (!rx_flow_demux_match(fsw, fe, pkt)) { |
flow_entry_release(&fe);
| 736 | fe = NULL; |
| 737 | goto top; |
| 738 | } |
| 739 | } |
| 740 | } else { |
| 741 | /* Tx */ |
| 742 | if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) { |
| 743 | if (__probable(fe->fe_flags & FLOWENTF_PARENT)) { |
| 744 | struct flow_entry *parent_fe = fe; |
| 745 | fe = tx_lookup_child_flow(parent_fe, pkt->pkt_flow_id); |
flow_entry_release(&parent_fe);
| 747 | } else { |
flow_entry_release(&fe);
| 749 | fe = NULL; |
| 750 | goto top; |
| 751 | } |
| 752 | } |
| 753 | } |
| 754 | } |
| 755 | |
| 756 | SK_LOG_VAR(char fkbuf[FLOWKEY_DBGBUF_SIZE]); |
| 757 | SK_DF(SK_VERB_FSW_DP | SK_VERB_LOOKUP, |
| 758 | "%s %s %s \"%s\" fe 0x%llx" , |
| 759 | input ? "Rx" : "Tx" , if_name(fsw->fsw_ifp), |
| 760 | sk_proc_name_address(current_proc()), |
| 761 | fk_as_string(&key, fkbuf, sizeof(fkbuf)), |
| 762 | SK_KVA(fe)); |
| 763 | |
| 764 | return fe; |
| 765 | } |
| 766 | |
| 767 | SK_NO_INLINE_ATTRIBUTE |
| 768 | static bool |
| 769 | pkt_is_for_listener(struct flow_entry *fe, struct __kern_packet *pkt) |
| 770 | { |
| 771 | struct nx_flowswitch *fsw = fe->fe_fsw; |
| 772 | struct ifnet *ifp = fsw->fsw_ifp; |
| 773 | struct in_ifaddr *ia = NULL; |
| 774 | struct in_ifaddr *best_ia = NULL; |
| 775 | struct in6_ifaddr *ia6 = NULL; |
| 776 | struct in6_ifaddr *best_ia6 = NULL; |
| 777 | struct ifnet *match_ifp = NULL; |
| 778 | struct __flow *flow = pkt->pkt_flow; |
| 779 | bool result = false; |
| 780 | |
| 781 | ASSERT(pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED); |
| 782 | |
| 783 | if (flow->flow_ip_ver == IPVERSION) { |
| 784 | if (IN_ZERONET(ntohl(flow->flow_ipv4_dst.s_addr)) || |
| 785 | IN_LOOPBACK(ntohl(flow->flow_ipv4_dst.s_addr)) || |
| 786 | IN_LINKLOCAL(ntohl(flow->flow_ipv4_dst.s_addr)) || |
| 787 | IN_DS_LITE(ntohl(flow->flow_ipv4_dst.s_addr)) || |
| 788 | IN_6TO4_RELAY_ANYCAST(ntohl(flow->flow_ipv4_dst.s_addr)) || |
| 789 | IN_MULTICAST(ntohl(flow->flow_ipv4_dst.s_addr)) || |
| 790 | INADDR_BROADCAST == flow->flow_ipv4_dst.s_addr) { |
| 791 | result = true; |
| 792 | goto done; |
| 793 | } |
| 794 | |
| 795 | /* |
| 796 | * Check for a match in the hash bucket. |
| 797 | */ |
lck_rw_lock_shared(&in_ifaddr_rwlock);
| 799 | TAILQ_FOREACH(ia, INADDR_HASH(flow->flow_ipv4_dst.s_addr), ia_hash) { |
| 800 | if (IA_SIN(ia)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr) { |
| 801 | best_ia = ia; |
| 802 | match_ifp = ia->ia_ifp; |
| 803 | |
| 804 | if (match_ifp == ifp) { |
| 805 | break; |
| 806 | } |
| 807 | /* |
* Continue the loop in case there's an exact match with another
* interface.
| 810 | */ |
| 811 | } |
| 812 | } |
| 813 | |
| 814 | if (best_ia != NULL) { |
| 815 | if (match_ifp != ifp && ipforwarding == 0 && |
| 816 | (match_ifp->if_family == IFNET_FAMILY_IPSEC || |
| 817 | match_ifp->if_family == IFNET_FAMILY_UTUN)) { |
| 818 | /* |
| 819 | * Drop when interface address check is strict and forwarding |
| 820 | * is disabled |
| 821 | */ |
| 822 | } else { |
lck_rw_done(&in_ifaddr_rwlock);
| 824 | result = true; |
| 825 | goto done; |
| 826 | } |
| 827 | } |
lck_rw_done(&in_ifaddr_rwlock);
| 829 | |
| 830 | if (ifp->if_flags & IFF_BROADCAST) { |
| 831 | /* |
| 832 | * Check for broadcast addresses. |
| 833 | * |
| 834 | * Only accept broadcast packets that arrive via the matching |
| 835 | * interface. Reception of forwarded directed broadcasts would be |
| 836 | * handled via ip_forward() and ether_frameout() with the loopback |
| 837 | * into the stack for SIMPLEX interfaces handled by ether_frameout(). |
| 838 | */ |
| 839 | struct ifaddr *ifa; |
| 840 | |
| 841 | ifnet_lock_shared(ifp); |
| 842 | TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { |
| 843 | if (ifa->ifa_addr->sa_family != AF_INET) { |
| 844 | continue; |
| 845 | } |
| 846 | ia = ifatoia(ifa); |
| 847 | if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == flow->flow_ipv4_dst.s_addr || |
| 848 | ia->ia_netbroadcast.s_addr == flow->flow_ipv4_dst.s_addr) { |
| 849 | ifnet_lock_done(ifp); |
| 850 | result = true; |
| 851 | goto done; |
| 852 | } |
| 853 | } |
| 854 | ifnet_lock_done(ifp); |
| 855 | } |
| 856 | } else { |
| 857 | if (IN6_IS_ADDR_LOOPBACK(&flow->flow_ipv6_dst) || |
| 858 | IN6_IS_ADDR_LINKLOCAL(&flow->flow_ipv6_dst) || |
| 859 | IN6_IS_ADDR_MULTICAST(&flow->flow_ipv6_dst)) { |
| 860 | result = true; |
| 861 | goto done; |
| 862 | } |
| 863 | |
| 864 | /* |
| 865 | * Check for exact addresses in the hash bucket. |
| 866 | */ |
lck_rw_lock_shared(&in6_ifaddr_rwlock);
| 868 | TAILQ_FOREACH(ia6, IN6ADDR_HASH(&flow->flow_ipv6_dst), ia6_hash) { |
| 869 | if (in6_are_addr_equal_scoped(&ia6->ia_addr.sin6_addr, &flow->flow_ipv6_dst, ia6->ia_ifp->if_index, ifp->if_index)) { |
| 870 | if ((ia6->ia6_flags & (IN6_IFF_NOTREADY | IN6_IFF_CLAT46))) { |
| 871 | continue; |
| 872 | } |
| 873 | best_ia6 = ia6; |
| 874 | if (ia6->ia_ifp == ifp) { |
| 875 | break; |
| 876 | } |
| 877 | /* |
* Continue the loop in case there's an exact match with another
* interface.
| 880 | */ |
| 881 | } |
| 882 | } |
| 883 | if (best_ia6 != NULL) { |
| 884 | if (best_ia6->ia_ifp != ifp && ip6_forwarding == 0 && |
| 885 | (best_ia6->ia_ifp->if_family == IFNET_FAMILY_IPSEC || |
| 886 | best_ia6->ia_ifp->if_family == IFNET_FAMILY_UTUN)) { |
| 887 | /* |
| 888 | * Drop when interface address check is strict and forwarding |
| 889 | * is disabled |
| 890 | */ |
| 891 | } else { |
lck_rw_done(&in6_ifaddr_rwlock);
| 893 | result = true; |
| 894 | goto done; |
| 895 | } |
| 896 | } |
lck_rw_done(&in6_ifaddr_rwlock);
| 898 | } |
| 899 | |
| 900 | /* |
| 901 | * In forwarding mode, if the destination address |
| 902 | * of the packet does not match any interface |
* address, it may be destined to the client device.
*/
SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
"Rx flow does not match interface address");
| 907 | done: |
| 908 | return result; |
| 909 | } |
| 910 | |
| 911 | static struct flow_entry * |
| 912 | rx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, |
| 913 | struct flow_entry *prev_fe) |
| 914 | { |
| 915 | struct flow_entry *fe; |
| 916 | |
| 917 | fe = lookup_flow_with_pkt(fsw, pkt, true, prev_fe); |
| 918 | _FSW_INJECT_ERROR(2, fe, NULL, flow_entry_release, &fe); |
| 919 | if (fe == NULL) { |
| 920 | FSW_STATS_INC(FSW_STATS_RX_FLOW_NOT_FOUND); |
| 921 | return NULL; |
| 922 | } |
| 923 | |
| 924 | if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE && |
| 925 | fe->fe_flags & FLOWENTF_LISTENER) && |
| 926 | !pkt_is_for_listener(fe, pkt)) { |
| 927 | FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_LISTENER); |
flow_entry_release(&fe);
| 929 | return NULL; |
| 930 | } |
| 931 | |
| 932 | if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) { |
| 933 | FSW_STATS_INC(FSW_STATS_RX_FLOW_TORNDOWN); |
SK_DF(SK_VERB_FSW_DP | SK_VERB_RX | SK_VERB_FLOW,
"Rx flow torn down");
flow_entry_release(&fe);
| 937 | fe = NULL; |
| 938 | } |
| 939 | |
| 940 | return fe; |
| 941 | } |
| 942 | |
| 943 | static inline void |
| 944 | rx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe, |
| 945 | struct __kern_packet *pkt) |
| 946 | { |
| 947 | if (__improbable(pkt->pkt_flow_ip_is_frag)) { |
| 948 | fe->fe_rx_frag_count++; |
| 949 | } |
| 950 | |
| 951 | /* KPKTQ_ENQUEUE_LIST is needed until frags become chained buflet */ |
| 952 | if (KPKTQ_EMPTY(&fe->fe_rx_pktq)) { |
| 953 | ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) == 0); |
| 954 | TAILQ_INSERT_TAIL(fes, fe, fe_rx_link); |
| 955 | KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt); |
| 956 | } else { |
| 957 | ASSERT(!TAILQ_EMPTY(fes)); |
| 958 | KPKTQ_ENQUEUE_LIST(&fe->fe_rx_pktq, pkt); |
flow_entry_release(&fe);
| 960 | } |
| 961 | } |
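/*
 * Reference-count convention shared by rx_flow_batch_packet() above and
 * tx_flow_batch_packet() below: the flow entry reference taken at lookup
 * time is kept only when the entry is first linked onto the caller's
 * flow_entry_list (i.e. its pktq was empty); for every subsequent packet
 * batched onto the same entry the extra reference is dropped right away,
 * leaving a single reference per listed entry for the rest of the batch.
 */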
| 962 | |
| 963 | static void |
| 964 | tx_flow_batch_packet(struct flow_entry_list *fes, struct flow_entry *fe, |
| 965 | struct __kern_packet *pkt) |
| 966 | { |
| 967 | /* record frag continuation */ |
| 968 | if (__improbable(pkt->pkt_flow_ip_is_first_frag)) { |
| 969 | ASSERT(pkt->pkt_flow_ip_is_frag); |
| 970 | fe->fe_tx_is_cont_frag = true; |
| 971 | fe->fe_tx_frag_id = pkt->pkt_flow_ip_frag_id; |
| 972 | } else if (__probable(!pkt->pkt_flow_ip_is_frag)) { |
| 973 | fe->fe_tx_is_cont_frag = false; |
| 974 | fe->fe_tx_frag_id = 0; |
| 975 | } |
| 976 | |
| 977 | if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) { |
| 978 | ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0); |
| 979 | TAILQ_INSERT_TAIL(fes, fe, fe_tx_link); |
| 980 | KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt); |
| 981 | } else { |
| 982 | ASSERT(!TAILQ_EMPTY(fes)); |
| 983 | KPKTQ_ENQUEUE(&fe->fe_tx_pktq, pkt); |
flow_entry_release(&fe);
| 985 | } |
| 986 | } |
| 987 | |
| 988 | static inline void |
| 989 | fsw_rx_ring_dequeue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
| 990 | uint32_t n_pkts_max, struct pktq *pktq, uint32_t *n_bytes) |
| 991 | { |
| 992 | uint32_t n_pkts = 0; |
| 993 | slot_idx_t idx, idx_end; |
| 994 | idx = r->ckr_khead; |
| 995 | idx_end = r->ckr_rhead; |
| 996 | |
| 997 | ASSERT(KPKTQ_EMPTY(pktq)); |
| 998 | *n_bytes = 0; |
| 999 | for (; n_pkts < n_pkts_max && idx != idx_end; |
idx = SLOT_NEXT(idx, r->ckr_lim)) {
| 1001 | struct __kern_slot_desc *ksd = KR_KSD(r, idx); |
| 1002 | struct __kern_packet *pkt = ksd->sd_pkt; |
| 1003 | |
| 1004 | ASSERT(pkt->pkt_nextpkt == NULL); |
KR_SLOT_DETACH_METADATA(r, ksd);
| 1006 | |
| 1007 | _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags, |
| 1008 | pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func); |
| 1009 | if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0)) |
| 1010 | || (pkt->pkt_length == 0)) { |
| 1011 | FSW_STATS_INC(FSW_STATS_DROP); |
| 1012 | pp_free_packet_single(pkt); |
| 1013 | continue; |
| 1014 | } |
| 1015 | n_pkts++; |
| 1016 | *n_bytes += pkt->pkt_length; |
| 1017 | |
| 1018 | KPKTQ_ENQUEUE(pktq, pkt); |
| 1019 | } |
| 1020 | r->ckr_khead = idx; |
r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
| 1022 | } |
| 1023 | |
| 1024 | /* |
| 1025 | * This is only for estimating how many packets each GSO packet will need. |
| 1026 | * The number does not need to be exact because any leftover packets allocated |
| 1027 | * will be freed. |
| 1028 | */ |
| 1029 | static uint32_t |
| 1030 | estimate_gso_pkts(struct __kern_packet *pkt) |
| 1031 | { |
| 1032 | packet_tso_flags_t tso_flags; |
| 1033 | uint16_t mss; |
| 1034 | uint32_t n_pkts = 0, total_hlen = 0, total_len = 0; |
| 1035 | |
| 1036 | tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS; |
| 1037 | mss = pkt->pkt_proto_seg_sz; |
| 1038 | |
| 1039 | if (tso_flags == PACKET_TSO_IPV4) { |
| 1040 | total_hlen = sizeof(struct ip) + sizeof(struct tcphdr); |
| 1041 | } else if (tso_flags == PACKET_TSO_IPV6) { |
| 1042 | total_hlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); |
| 1043 | } |
| 1044 | if (total_hlen != 0 && mss != 0) { |
| 1045 | total_len = pkt->pkt_length; |
| 1046 | n_pkts = (uint32_t) |
| 1047 | (SK_ROUNDUP((total_len - total_hlen), mss) / mss); |
| 1048 | } |
| 1049 | DTRACE_SKYWALK5(estimate__gso, packet_tso_flags_t, tso_flags, |
| 1050 | uint32_t, total_hlen, uint32_t, total_len, uint16_t, mss, |
| 1051 | uint32_t, n_pkts); |
| 1052 | return n_pkts; |
| 1053 | } |
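/*
 * Worked example with illustrative numbers: a PACKET_TSO_IPV4 packet with
 * pkt_length 64040 and pkt_proto_seg_sz (MSS) 1460 has total_hlen
 * 20 + 20 == 40, so the estimate is SK_ROUNDUP(64000, 1460) / 1460 == 44
 * packets. Over-estimates are harmless since, as noted above, leftover
 * allocations are freed.
 */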
| 1054 | |
| 1055 | /* |
| 1056 | * This function retrieves a chain of packets of the same type only |
| 1057 | * (GSO or non-GSO). |
| 1058 | */ |
| 1059 | static inline void |
| 1060 | fsw_tx_ring_dequeue_pktq(struct nx_flowswitch *fsw, |
| 1061 | struct __kern_channel_ring *r, uint32_t n_pkts_max, |
| 1062 | struct pktq *pktq, uint32_t *n_bytes, uint32_t *gso_pkts_estimate) |
| 1063 | { |
| 1064 | uint32_t n_pkts = 0; |
| 1065 | slot_idx_t idx, idx_end; |
| 1066 | idx = r->ckr_khead; |
| 1067 | idx_end = r->ckr_rhead; |
| 1068 | struct nexus_vp_adapter *vpna = VPNA(KRNA(r)); |
| 1069 | boolean_t gso_enabled, gso_required; |
| 1070 | uint32_t gso_pkts; |
| 1071 | |
| 1072 | gso_enabled = (fsw->fsw_tso_mode == FSW_TSO_MODE_SW); |
| 1073 | ASSERT(KPKTQ_EMPTY(pktq)); |
| 1074 | *n_bytes = 0; |
| 1075 | for (; n_pkts < n_pkts_max && |
| 1076 | (!gso_enabled || fsw_gso_batch == 0 || |
| 1077 | *gso_pkts_estimate < fsw_gso_batch) && |
idx != idx_end; idx = SLOT_NEXT(idx, r->ckr_lim)) {
| 1079 | struct __kern_slot_desc *ksd = KR_KSD(r, idx); |
| 1080 | struct __kern_packet *pkt = ksd->sd_pkt; |
| 1081 | |
| 1082 | ASSERT(pkt->pkt_nextpkt == NULL); |
| 1083 | |
| 1084 | _FSW_INJECT_ERROR(20, pkt->pkt_qum_qflags, |
| 1085 | pkt->pkt_qum_qflags | QUM_F_DROPPED, null_func); |
| 1086 | if (__improbable(((pkt->pkt_qum_qflags & QUM_F_DROPPED) != 0)) |
| 1087 | || (pkt->pkt_length == 0)) { |
KR_SLOT_DETACH_METADATA(r, ksd);
| 1089 | FSW_STATS_INC(FSW_STATS_DROP); |
| 1090 | pp_free_packet_single(pkt); |
| 1091 | continue; |
| 1092 | } |
| 1093 | if (gso_enabled) { |
| 1094 | gso_pkts = estimate_gso_pkts(pkt); |
| 1095 | |
| 1096 | /* |
| 1097 | * We use the first packet to determine what |
| 1098 | * type the subsequent ones need to be (GSO or |
| 1099 | * non-GSO). |
| 1100 | */ |
| 1101 | if (n_pkts == 0) { |
| 1102 | gso_required = (gso_pkts != 0); |
| 1103 | } else { |
| 1104 | if (gso_required != (gso_pkts != 0)) { |
| 1105 | break; |
| 1106 | } |
| 1107 | } |
| 1108 | *gso_pkts_estimate += gso_pkts; |
| 1109 | } |
KR_SLOT_DETACH_METADATA(r, ksd);
if (NA_CHANNEL_EVENT_ATTACHED(&vpna->vpna_up)) {
__packet_set_tx_nx_port(SK_PKT2PH(pkt),
vpna->vpna_nx_port, vpna->vpna_gencnt);
| 1114 | } |
| 1115 | n_pkts++; |
| 1116 | *n_bytes += pkt->pkt_length; |
| 1117 | KPKTQ_ENQUEUE(pktq, pkt); |
| 1118 | } |
| 1119 | r->ckr_khead = idx; |
r->ckr_ktail = SLOT_PREV(idx, r->ckr_lim);
| 1121 | DTRACE_SKYWALK5(tx__ring__dequeue, struct nx_flowswitch *, fsw, |
| 1122 | ifnet_t, fsw->fsw_ifp, uint32_t, n_pkts, uint32_t, *n_bytes, |
| 1123 | uint32_t, *gso_pkts_estimate); |
| 1124 | } |
| 1125 | |
| 1126 | static void |
| 1127 | fsw_ring_enqueue_pktq(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
| 1128 | struct pktq *pktq) |
| 1129 | { |
| 1130 | #pragma unused(fsw) |
| 1131 | struct __kern_packet *pkt; |
| 1132 | struct __kern_quantum *kqum; |
| 1133 | uint32_t kr_space_avail = 0; |
| 1134 | uint32_t n, n_pkts = 0, n_bytes = 0; |
| 1135 | slot_idx_t idx = 0, idx_start = 0, idx_end = 0; |
| 1136 | |
| 1137 | kr_enter(r, TRUE); |
| 1138 | |
| 1139 | idx_start = r->ckr_ktail; |
kr_space_avail = kr_available_slots_rxring(r);
| 1141 | _FSW_INJECT_ERROR(40, kr_space_avail, 0, null_func); |
| 1142 | n = MIN(kr_space_avail, KPKTQ_LEN(pktq)); |
| 1143 | _FSW_INJECT_ERROR(41, n, 0, null_func); |
idx_end = SLOT_INCREMENT(idx_start, n, r->ckr_lim);
| 1145 | |
| 1146 | idx = idx_start; |
| 1147 | while (idx != idx_end) { |
| 1148 | KPKTQ_DEQUEUE(pktq, pkt); |
| 1149 | kqum = SK_PTR_ADDR_KQUM(pkt); |
| 1150 | kqum->qum_qflags |= QUM_F_FINALIZED; |
| 1151 | n_pkts++; |
| 1152 | n_bytes += pkt->pkt_length; |
KR_SLOT_ATTACH_METADATA(r, KR_KSD(r, idx), kqum);
| 1154 | if (__improbable(pkt->pkt_trace_id != 0)) { |
| 1155 | KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_END, pkt->pkt_trace_id); |
| 1156 | KDBG(SK_KTRACE_PKT_RX_CHN | DBG_FUNC_START, pkt->pkt_trace_id); |
| 1157 | } |
idx = SLOT_NEXT(idx, r->ckr_lim);
| 1159 | } |
| 1160 | |
kr_update_stats(r, n_pkts, n_bytes);
| 1162 | |
| 1163 | /* |
| 1164 | * ensure slot attachments are visible before updating the |
| 1165 | * tail pointer |
| 1166 | */ |
| 1167 | os_atomic_thread_fence(seq_cst); |
| 1168 | |
| 1169 | r->ckr_ktail = idx_end; |
| 1170 | |
| 1171 | kr_exit(r); |
| 1172 | |
| 1173 | r->ckr_na_notify(r, kernproc, NA_NOTEF_PUSH); |
| 1174 | |
| 1175 | SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "%s enqueued %d pkts" , |
| 1176 | r->ckr_name, n_pkts); |
| 1177 | } |
| 1178 | |
| 1179 | static void |
| 1180 | pkts_to_pktq(struct __kern_packet *pkts[], uint32_t n_pkts, struct pktq *pktq) |
| 1181 | { |
| 1182 | ASSERT(KPKTQ_EMPTY(pktq)); |
| 1183 | |
| 1184 | for (uint32_t i = 0; i < n_pkts; i++) { |
| 1185 | struct __kern_packet *pkt = pkts[i]; |
| 1186 | ASSERT(pkt->pkt_nextpkt == NULL); |
| 1187 | KPKTQ_ENQUEUE(pktq, pkt); |
| 1188 | } |
| 1189 | } |
| 1190 | |
| 1191 | /* |
| 1192 | * This function is modeled after nx_netif_host_grab_pkts() in nx_netif_host.c. |
| 1193 | */ |
| 1194 | SK_NO_INLINE_ATTRIBUTE |
| 1195 | static void |
| 1196 | convert_native_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq, |
| 1197 | struct mbuf **m_headp, struct mbuf **m_tailp, uint32_t *cnt, uint32_t *bytes) |
| 1198 | { |
| 1199 | uint32_t tot_cnt; |
| 1200 | unsigned int num_segs = 1; |
| 1201 | struct mbuf *mhead, *head = NULL, *tail = NULL, **tailp = &head; |
| 1202 | uint32_t mhead_cnt, mhead_bufsize; |
| 1203 | uint32_t mhead_waste = 0; |
| 1204 | uint32_t mcnt = 0, mbytes = 0; |
| 1205 | uint32_t largest, max_pkt_len; |
| 1206 | struct __kern_packet *pkt; |
| 1207 | struct kern_pbufpool *pp; |
| 1208 | |
| 1209 | tot_cnt = KPKTQ_LEN(pktq); |
| 1210 | ASSERT(tot_cnt > 0); |
| 1211 | mhead_cnt = tot_cnt; |
| 1212 | |
| 1213 | /* |
| 1214 | * Opportunistically batch-allocate the mbufs based on the largest |
| 1215 | * packet size we've seen in the recent past. Note that we reset |
* fsw_rx_largest_size below if we notice that we're under-utilizing the
| 1217 | * allocated buffers (thus disabling this batch allocation). |
| 1218 | */ |
| 1219 | largest = *(volatile uint32_t*)&fsw->fsw_rx_largest_size; /* read once */ |
| 1220 | if (__probable(largest != 0)) { |
| 1221 | if (largest <= MCLBYTES) { |
| 1222 | mhead = m_allocpacket_internal(&mhead_cnt, MCLBYTES, |
| 1223 | &num_segs, M_NOWAIT, 1, 0); |
| 1224 | mhead_bufsize = MCLBYTES; |
| 1225 | } else if (largest <= MBIGCLBYTES) { |
| 1226 | mhead = m_allocpacket_internal(&mhead_cnt, MBIGCLBYTES, |
| 1227 | &num_segs, M_NOWAIT, 1, 0); |
| 1228 | mhead_bufsize = MBIGCLBYTES; |
| 1229 | } else if (largest <= M16KCLBYTES) { |
| 1230 | mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES, |
| 1231 | &num_segs, M_NOWAIT, 1, 0); |
| 1232 | mhead_bufsize = M16KCLBYTES; |
| 1233 | } else if (largest <= M16KCLBYTES * 2) { |
| 1234 | num_segs = 2; |
| 1235 | mhead = m_allocpacket_internal(&mhead_cnt, M16KCLBYTES * 2, |
| 1236 | &num_segs, M_NOWAIT, 1, 0); |
| 1237 | mhead_bufsize = M16KCLBYTES * 2; |
| 1238 | } else { |
| 1239 | mhead = NULL; |
| 1240 | mhead_bufsize = mhead_cnt = 0; |
| 1241 | } |
| 1242 | } else { |
| 1243 | mhead = NULL; |
| 1244 | mhead_bufsize = mhead_cnt = 0; |
| 1245 | } |
| 1246 | DTRACE_SKYWALK4(bufstats, uint32_t, largest, uint32_t, mhead_bufsize, |
| 1247 | uint32_t, mhead_cnt, uint32_t, tot_cnt); |
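/*
 * Example of the size-class selection above (assumed numbers): if the
 * recently observed largest packet was 3000 bytes, MCLBYTES (2048) is too
 * small and the batch is drawn from MBIGCLBYTES (4096) clusters; a largest
 * size of 0, or anything beyond 2 * M16KCLBYTES, skips the batch
 * allocation so each packet falls back to the per-packet
 * mbuf_allocpacket() in the loop below.
 */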
| 1248 | |
| 1249 | pp = __DECONST(struct kern_pbufpool *, KPKTQ_FIRST(pktq)->pkt_qum.qum_pp); |
| 1250 | max_pkt_len = PP_BUF_SIZE_DEF(pp) * pp->pp_max_frags; |
| 1251 | |
| 1252 | KPKTQ_FOREACH(pkt, pktq) { |
| 1253 | uint32_t tot_len, len; |
| 1254 | uint16_t pad, llhlen, iphlen; |
| 1255 | boolean_t do_cksum_rx; |
| 1256 | struct mbuf *m; |
| 1257 | int error; |
| 1258 | |
| 1259 | llhlen = pkt->pkt_l2_len; |
| 1260 | len = pkt->pkt_length; |
| 1261 | if (__improbable(len > max_pkt_len || llhlen > len)) { |
| 1262 | DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw, |
| 1263 | struct __kern_packet *, pkt); |
| 1264 | FSW_STATS_INC(FSW_STATS_DROP); |
| 1265 | FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN); |
| 1266 | continue; |
| 1267 | } |
| 1268 | /* begin payload on 32-bit boundary; figure out the padding */ |
| 1269 | pad = (uint16_t)P2ROUNDUP(llhlen, sizeof(uint32_t)) - llhlen; |
| 1270 | tot_len = pad + len; |
| 1271 | |
| 1272 | /* remember largest packet size */ |
| 1273 | if (__improbable(largest < tot_len)) { |
| 1274 | largest = MAX(tot_len, MCLBYTES); |
| 1275 | } |
| 1276 | |
| 1277 | /* |
| 1278 | * If the above batch allocation returned partial |
| 1279 | * success, we try a blocking allocation here again. |
| 1280 | */ |
| 1281 | m = mhead; |
| 1282 | if (__improbable(m == NULL || tot_len > mhead_bufsize)) { |
| 1283 | ASSERT(mhead != NULL || mhead_cnt == 0); |
| 1284 | num_segs = 1; |
| 1285 | if (tot_len > M16KCLBYTES) { |
| 1286 | num_segs = 0; |
| 1287 | } |
if ((error = mbuf_allocpacket(MBUF_DONTWAIT, tot_len,
&num_segs, &m)) != 0) {
| 1290 | DTRACE_SKYWALK2(bad__len, |
| 1291 | struct nx_flowswitch *, fsw, |
| 1292 | struct __kern_packet *, pkt); |
| 1293 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF); |
| 1294 | FSW_STATS_INC(FSW_STATS_DROP); |
| 1295 | continue; |
| 1296 | } |
| 1297 | } else { |
| 1298 | mhead = m->m_nextpkt; |
| 1299 | m->m_nextpkt = NULL; |
| 1300 | ASSERT(mhead_cnt != 0); |
| 1301 | --mhead_cnt; |
| 1302 | |
| 1303 | /* check if we're underutilizing large buffers */ |
| 1304 | if (__improbable(mhead_bufsize > MCLBYTES && |
| 1305 | tot_len < (mhead_bufsize >> 1))) { |
| 1306 | ++mhead_waste; |
| 1307 | } |
| 1308 | /* |
* Clean up the unused mbuf.
* Only need to do this when we pre-alloc 2x16K mbufs.
| 1311 | */ |
| 1312 | if (__improbable(mhead_bufsize >= tot_len + M16KCLBYTES)) { |
| 1313 | ASSERT(mhead_bufsize == 2 * M16KCLBYTES); |
struct mbuf *m_extra = m->m_next;
| 1315 | ASSERT(m_extra != NULL); |
| 1316 | ASSERT(m_extra->m_len == 0); |
| 1317 | ASSERT(M_SIZE(m_extra) == M16KCLBYTES); |
| 1318 | m->m_next = NULL; |
| 1319 | m_freem(m_extra); |
| 1320 | FSW_STATS_INC(FSW_STATS_RX_WASTED_16KMBUF); |
| 1321 | } |
| 1322 | } |
| 1323 | m->m_data += pad; |
| 1324 | m->m_pkthdr.pkt_hdr = mtod(m, uint8_t *); |
| 1325 | |
| 1326 | /* don't include IP header from partial sum */ |
| 1327 | if (__probable((pkt->pkt_qum_qflags & |
| 1328 | QUM_F_FLOW_CLASSIFIED) != 0)) { |
| 1329 | iphlen = pkt->pkt_flow_ip_hlen; |
| 1330 | do_cksum_rx = sk_cksum_rx; |
| 1331 | } else { |
| 1332 | iphlen = 0; |
| 1333 | do_cksum_rx = FALSE; |
| 1334 | } |
| 1335 | |
| 1336 | fsw->fsw_pkt_copy_to_mbuf(NR_RX, SK_PKT2PH(pkt), |
| 1337 | pkt->pkt_headroom, m, 0, len, do_cksum_rx, |
| 1338 | llhlen + iphlen); |
| 1339 | |
| 1340 | FSW_STATS_INC(FSW_STATS_RX_COPY_PKT2MBUF); |
| 1341 | if (do_cksum_rx) { |
| 1342 | FSW_STATS_INC(FSW_STATS_RX_COPY_SUM); |
| 1343 | } |
| 1344 | #if DEBUG || DEVELOPMENT |
| 1345 | if (__improbable(pkt_trailers > 0)) { |
| 1346 | (void) pkt_add_trailers_mbuf(m, llhlen + iphlen); |
| 1347 | } |
| 1348 | #endif /* DEBUG || DEVELOPMENT */ |
| 1349 | m_adj(m, llhlen); |
| 1350 | |
| 1351 | m->m_pkthdr.rcvif = fsw->fsw_ifp; |
| 1352 | if (__improbable((pkt->pkt_link_flags & |
| 1353 | PKT_LINKF_ETHFCS) != 0)) { |
| 1354 | m->m_flags |= M_HASFCS; |
| 1355 | } |
| 1356 | if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) { |
| 1357 | m->m_pkthdr.pkt_flags |= PKTF_WAKE_PKT; |
| 1358 | } |
| 1359 | ASSERT(m->m_nextpkt == NULL); |
| 1360 | tail = m; |
| 1361 | *tailp = m; |
| 1362 | tailp = &m->m_nextpkt; |
| 1363 | mcnt++; |
| 1364 | mbytes += m_pktlen(m); |
| 1365 | } |
| 1366 | /* free any leftovers */ |
| 1367 | if (__improbable(mhead != NULL)) { |
| 1368 | DTRACE_SKYWALK1(mhead__leftover, uint32_t, mhead_cnt); |
| 1369 | ASSERT(mhead_cnt != 0); |
| 1370 | (void) m_freem_list(mhead); |
| 1371 | mhead = NULL; |
| 1372 | mhead_cnt = 0; |
| 1373 | } |
| 1374 | |
| 1375 | /* reset if most packets (>50%) are smaller than our batch buffers */ |
| 1376 | if (__improbable(mhead_waste > ((uint32_t)tot_cnt >> 1))) { |
| 1377 | DTRACE_SKYWALK4(mhead__waste, struct nx_flowswitch *, fsw, |
| 1378 | struct flow_entry *, NULL, uint32_t, mhead_waste, |
| 1379 | uint32_t, tot_cnt); |
| 1380 | largest = 0; |
| 1381 | } |
| 1382 | |
| 1383 | if (largest != fsw->fsw_rx_largest_size) { |
| 1384 | os_atomic_store(&fsw->fsw_rx_largest_size, largest, release); |
| 1385 | } |
| 1386 | |
| 1387 | pp_free_pktq(pktq); |
| 1388 | *m_headp = head; |
| 1389 | *m_tailp = tail; |
| 1390 | *cnt = mcnt; |
| 1391 | *bytes = mbytes; |
| 1392 | } |
| 1393 | |
| 1394 | /* |
| 1395 | * This function only extracts the mbuf from the packet. The caller frees |
| 1396 | * the packet. |
| 1397 | */ |
| 1398 | static inline struct mbuf * |
| 1399 | convert_compat_pkt_to_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
| 1400 | { |
| 1401 | struct mbuf *m; |
| 1402 | struct pkthdr *mhdr; |
| 1403 | uint16_t llhlen; |
| 1404 | |
| 1405 | m = pkt->pkt_mbuf; |
| 1406 | ASSERT(m != NULL); |
| 1407 | |
| 1408 | llhlen = pkt->pkt_l2_len; |
| 1409 | if (llhlen > pkt->pkt_length) { |
| 1410 | m_freem(m); |
| 1411 | KPKT_CLEAR_MBUF_DATA(pkt); |
| 1412 | DTRACE_SKYWALK2(bad__len, struct nx_flowswitch *, fsw, |
| 1413 | struct __kern_packet *, pkt); |
| 1414 | FSW_STATS_INC(FSW_STATS_DROP); |
| 1415 | FSW_STATS_INC(FSW_STATS_RX_COPY_BAD_LEN); |
| 1416 | return NULL; |
| 1417 | } |
| 1418 | mhdr = &m->m_pkthdr; |
| 1419 | if ((mhdr->csum_flags & CSUM_DATA_VALID) == 0 && |
| 1420 | PACKET_HAS_PARTIAL_CHECKSUM(pkt)) { |
| 1421 | mhdr->csum_flags &= ~CSUM_RX_FLAGS; |
| 1422 | mhdr->csum_flags |= (CSUM_DATA_VALID | CSUM_PARTIAL); |
| 1423 | mhdr->csum_rx_start = pkt->pkt_csum_rx_start_off; |
| 1424 | mhdr->csum_rx_val = pkt->pkt_csum_rx_value; |
| 1425 | } |
| 1426 | #if DEBUG || DEVELOPMENT |
| 1427 | uint32_t extra = 0; |
| 1428 | if (__improbable(pkt_trailers > 0)) { |
| 1429 | extra = pkt_add_trailers_mbuf(m, llhlen); |
| 1430 | } |
| 1431 | #endif /* DEBUG || DEVELOPMENT */ |
| 1432 | m_adj(m, llhlen); |
| 1433 | ASSERT((uint32_t)m_pktlen(m) == ((pkt->pkt_length - llhlen) + extra)); |
| 1434 | KPKT_CLEAR_MBUF_DATA(pkt); |
| 1435 | return m; |
| 1436 | } |
| 1437 | |
| 1438 | SK_NO_INLINE_ATTRIBUTE |
| 1439 | static void |
| 1440 | convert_compat_pktq_to_mbufs(struct nx_flowswitch *fsw, struct pktq *pktq, |
| 1441 | struct mbuf **m_head, struct mbuf **m_tail, uint32_t *cnt, uint32_t *bytes) |
| 1442 | { |
| 1443 | struct __kern_packet *pkt; |
| 1444 | struct mbuf *m, *head = NULL, *tail = NULL, **tailp = &head; |
| 1445 | uint32_t c = 0, b = 0; |
| 1446 | |
| 1447 | KPKTQ_FOREACH(pkt, pktq) { |
| 1448 | m = convert_compat_pkt_to_mbuf(fsw, pkt); |
| 1449 | if (__improbable(m == NULL)) { |
| 1450 | continue; |
| 1451 | } |
| 1452 | tail = m; |
| 1453 | *tailp = m; |
| 1454 | tailp = &m->m_nextpkt; |
| 1455 | c++; |
| 1456 | b += m_pktlen(m); |
| 1457 | } |
| 1458 | pp_free_pktq(pktq); |
| 1459 | *m_head = head; |
| 1460 | *m_tail = tail; |
| 1461 | *cnt = c; |
| 1462 | *bytes = b; |
| 1463 | } |
| 1464 | |
| 1465 | void |
| 1466 | fsw_host_sendup(ifnet_t ifp, struct mbuf *m_head, struct mbuf *m_tail, |
| 1467 | uint32_t cnt, uint32_t bytes) |
| 1468 | { |
| 1469 | struct ifnet_stat_increment_param s; |
| 1470 | |
bzero(&s, sizeof(s));
| 1472 | s.packets_in = cnt; |
| 1473 | s.bytes_in = bytes; |
| 1474 | dlil_input_handler(ifp, m_head, m_tail, &s, FALSE, NULL); |
| 1475 | } |
| 1476 | |
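|      | /*
|      |  * Deliver a queue of packets destined to the host stack. If the
|      |  * interface has a flowswitch RX callback registered, the callback
|      |  * consumes the entire pktq; otherwise the packets are converted to
|      |  * mbufs (compat or native path) and sent up through DLIL.
|      |  */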
| 1477 | void |
| 1478 | fsw_host_rx(struct nx_flowswitch *fsw, struct pktq *pktq) |
| 1479 | { |
| 1480 | struct mbuf *m_head = NULL, *m_tail = NULL; |
| 1481 | uint32_t cnt = 0, bytes = 0; |
| 1482 | ifnet_fsw_rx_cb_t cb; |
| 1483 | void *cb_arg; |
| 1484 | boolean_t compat; |
| 1485 | |
| 1486 | ASSERT(!KPKTQ_EMPTY(pktq)); |
| 1487 | if (ifnet_get_flowswitch_rx_callback(fsw->fsw_ifp, &cb, &cb_arg) == 0) {
| 1488 | ASSERT(cb != NULL); |
| 1489 | ASSERT(cb_arg != NULL); |
| 1490 | /* callback consumes packets */ |
| 1491 | (*cb)(cb_arg, pktq); |
| 1492 | ifnet_release_flowswitch_rx_callback(fsw->fsw_ifp);
| 1493 | return; |
| 1494 | } |
| 1495 | |
| 1496 | /* All packets in the pktq must have the same type */ |
| 1497 | compat = ((KPKTQ_FIRST(pktq)->pkt_pflags & PKT_F_MBUF_DATA) != 0); |
| 1498 | if (compat) { |
| 1499 | convert_compat_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
| 1500 | &bytes);
| 1501 | } else {
| 1502 | convert_native_pktq_to_mbufs(fsw, pktq, &m_head, &m_tail, &cnt,
| 1503 | &bytes);
| 1504 | } |
| 1505 | if (__improbable(m_head == NULL)) { |
| 1506 | DTRACE_SKYWALK1(empty__head, struct nx_flowswitch *, fsw); |
| 1507 | return; |
| 1508 | } |
| 1509 | fsw_host_sendup(fsw->fsw_ifp, m_head, m_tail, cnt, bytes);
| 1510 | } |
| 1511 | |
| 1512 | void |
| 1513 | fsw_ring_enqueue_tail_drop(struct nx_flowswitch *fsw, |
| 1514 | struct __kern_channel_ring *r, struct pktq *pktq) |
| 1515 | { |
| 1516 | fsw_ring_enqueue_pktq(fsw, r, pktq); |
| 1517 | FSW_STATS_ADD(FSW_STATS_RX_DST_RING_FULL, KPKTQ_LEN(pktq)); |
| 1518 | dp_drop_pktq(fsw, pktq); |
| 1519 | } |
| 1520 | |
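|      | /*
|      |  * Look up the nexus adapter backing a flow's nexus port, returning
|      |  * NULL if the port is the dev/host port or if the port is no longer
|      |  * valid, has no adapter, is inactive, or has been defuncted.
|      |  */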
| 1521 | static struct nexus_adapter * |
| 1522 | flow_get_na(struct nx_flowswitch *fsw, struct flow_entry *fe) |
| 1523 | { |
| 1524 | struct kern_nexus *nx = fsw->fsw_nx; |
| 1525 | struct nexus_adapter *na = NULL; |
| 1526 | nexus_port_t port = fe->fe_nx_port; |
| 1527 | |
| 1528 | if (port == FSW_VP_DEV || port == FSW_VP_HOST) { |
| 1529 | SK_ERR("dev or host ports have no NA" ); |
| 1530 | return NULL; |
| 1531 | } |
| 1532 | |
| 1533 | if (__improbable(!nx_port_is_valid(nx, port))) { |
| 1534 | SK_DF(SK_VERB_FSW_DP, "%s[%d] port no longer valid" , |
| 1535 | if_name(fsw->fsw_ifp), port); |
| 1536 | return NULL; |
| 1537 | } |
| 1538 | |
| 1539 | na = nx_port_get_na(nx, port); |
| 1540 | if (__improbable(na == NULL)) { |
| 1541 | FSW_STATS_INC(FSW_STATS_DST_NXPORT_INVALID); |
| 1542 | SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer valid" , |
| 1543 | if_name(fsw->fsw_ifp), port); |
| 1544 | return NULL; |
| 1545 | } |
| 1546 | |
| 1547 | if (__improbable(!NA_IS_ACTIVE(na))) { |
| 1548 | FSW_STATS_INC(FSW_STATS_DST_NXPORT_INACTIVE); |
| 1549 | SK_DF(SK_VERB_FSW_DP, "%s[%d] NA no longer active" , |
| 1550 | if_name(fsw->fsw_ifp), port); |
| 1551 | return NULL; |
| 1552 | } |
| 1553 | |
| 1554 | if (__improbable(nx_port_is_defunct(nx, port))) { |
| 1555 | FSW_STATS_INC(FSW_STATS_DST_NXPORT_DEFUNCT); |
| 1556 | SK_DF(SK_VERB_FSW_DP, "%s[%d] NA defuncted" , |
| 1557 | if_name(fsw->fsw_ifp), port); |
| 1558 | return NULL; |
| 1559 | } |
| 1560 | |
| 1561 | return na; |
| 1562 | } |
| 1563 | |
| 1564 | static inline struct __kern_channel_ring * |
| 1565 | flow_get_ring(struct nx_flowswitch *fsw, struct flow_entry *fe, enum txrx txrx) |
| 1566 | { |
| 1567 | struct nexus_vp_adapter *na = NULL; |
| 1568 | struct __kern_channel_ring *r = NULL; |
| 1569 | |
| 1570 | na = VPNA(flow_get_na(fsw, fe)); |
| 1571 | if (__improbable(na == NULL)) { |
| 1572 | return NULL; |
| 1573 | } |
| 1574 | |
| 1575 | switch (txrx) { |
| 1576 | case NR_RX: |
| 1577 | r = &na->vpna_up.na_rx_rings[0]; |
| 1578 | break; |
| 1579 | case NR_TX: |
| 1580 | r = &na->vpna_up.na_tx_rings[0]; |
| 1581 | break; |
| 1582 | default: |
| 1583 | VERIFY(0);
| 1584 | __builtin_unreachable();
| 1585 | } |
| 1586 | |
| 1587 | if (__improbable(KR_DROP(r))) { |
| 1588 | FSW_STATS_INC(FSW_STATS_DST_RING_DROPMODE); |
| 1589 | SK_DF(SK_VERB_FSW_DP | SK_VERB_RING, "r %s 0x%llx drop mode",
| 1590 | r->ckr_name, SK_KVA(r)); |
| 1591 | return NULL; |
| 1592 | } |
| 1593 | |
| 1594 | ASSERT(KRNA(r)->na_md_type == NEXUS_META_TYPE_PACKET); |
| 1595 | |
| 1596 | #if (DEVELOPMENT || DEBUG) |
| 1597 | if (r != NULL) { |
| 1598 | _FSW_INJECT_ERROR(4, r, NULL, null_func); |
| 1599 | } |
| 1600 | #endif /* DEVELOPMENT || DEBUG */ |
| 1601 | |
| 1602 | return r; |
| 1603 | } |
| 1604 | |
| 1605 | struct __kern_channel_ring * |
| 1606 | fsw_flow_get_rx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe) |
| 1607 | { |
| 1608 | return flow_get_ring(fsw, fe, NR_RX);
| 1609 | } |
| 1610 | |
| 1611 | static inline struct __kern_channel_ring * |
| 1612 | fsw_flow_get_tx_ring(struct nx_flowswitch *fsw, struct flow_entry *fe) |
| 1613 | { |
| 1614 | return flow_get_ring(fsw, fe, NR_TX);
| 1615 | } |
| 1616 | |
| 1617 | static bool |
| 1618 | dp_flow_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
| 1619 | { |
| 1620 | struct flow_route *fr = fe->fe_route; |
| 1621 | struct ifnet *ifp = fsw->fsw_ifp; |
| 1622 | |
| 1623 | if (__improbable(!(fe->fe_flags & FLOWENTF_NONVIABLE) && |
| 1624 | !fe->fe_want_nonviable && (fe->fe_key.fk_mask & FKMASK_SRC) && |
| 1625 | fe->fe_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt && |
| 1626 | !flow_route_key_validate(&fe->fe_key, ifp, &fe->fe_laddr_gencnt))) { |
| 1627 | /* |
| 1628 | * The source address is no longer around; we want this |
| 1629 | * flow to be nonviable, but that requires holding the lock |
| 1630 | * as writer (which isn't the case now).  Indicate that
| 1631 | * we need to finalize making the flow nonviable further below.
| 1632 | * |
| 1633 | * We also request that the flow route be re-configured, |
| 1634 | * if this is a connected mode flow. |
| 1635 | * |
| 1636 | */ |
| 1637 | if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) { |
| 1638 | /* |
| 1639 | * fsw_pending_nonviable is a hint for reaper thread; |
| 1640 | * due to the fact that setting fe_want_nonviable and |
| 1641 | * incrementing fsw_pending_nonviable counter is not |
| 1642 | * atomic, let the increment happen first, and the |
| 1643 | * thread losing the CAS does decrement. |
| 1644 | */ |
| 1645 | os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed); |
| 1646 | if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) { |
| 1647 | fsw_reap_sched(fsw); |
| 1648 | } else { |
| 1649 | os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed); |
| 1650 | } |
| 1651 | } |
| 1652 | if (fr != NULL) { |
| 1653 | os_atomic_inc(&fr->fr_want_configure, relaxed); |
| 1654 | } |
| 1655 | } |
| 1656 | |
| 1657 | /* if flow was (or is going to be) marked as nonviable, drop it */ |
| 1658 | if (__improbable(fe->fe_want_nonviable || |
| 1659 | (fe->fe_flags & FLOWENTF_NONVIABLE) != 0)) { |
| 1660 | SK_DF(SK_VERB_FSW_DP | SK_VERB_FLOW, "flow 0x%llx non-viable" , |
| 1661 | SK_KVA(fe)); |
| 1662 | return false; |
| 1663 | } |
| 1664 | return true; |
| 1665 | } |
| 1666 | |
| 1667 | bool |
| 1668 | dp_flow_rx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
| 1669 | { |
| 1670 | bool okay; |
| 1671 | okay = dp_flow_route_process(fsw, fe); |
| 1672 | #if (DEVELOPMENT || DEBUG) |
| 1673 | if (okay) { |
| 1674 | _FSW_INJECT_ERROR(5, okay, false, null_func); |
| 1675 | } |
| 1676 | #endif /* DEVELOPMENT || DEBUG */ |
| 1677 | |
| 1678 | return okay; |
| 1679 | } |
| 1680 | |
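|      | /*
|      |  * RX datapath for a user flow: validate the flow's route, deliver
|      |  * host-port traffic to the host stack, then copy the source packets
|      |  * into packets (and extra buflets) allocated from the destination
|      |  * channel's pool, run flow tracking on each one, and finally enqueue
|      |  * the result onto the destination RX ring with tail-drop semantics.
|      |  */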
| 1681 | void |
| 1682 | dp_flow_rx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
| 1683 | uint32_t flags) |
| 1684 | { |
| 1685 | #pragma unused(flags) |
| 1686 | struct pktq dpkts; /* dst pool alloc'ed packets */ |
| 1687 | struct pktq disposed_pkts; /* done src packets */ |
| 1688 | struct pktq dropped_pkts; /* dropped src packets */ |
| 1689 | struct pktq transferred_pkts; /* dst packet ready for ring */ |
| 1690 | struct __kern_packet *pkt, *tpkt; |
| 1691 | struct kern_pbufpool *dpp; |
| 1692 | uint32_t n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq); |
| 1693 | uint64_t buf_array[RX_BUFLET_BATCH_COUNT]; |
| 1694 | uint16_t buf_array_iter = 0; |
| 1695 | uint32_t cnt, buf_cnt = 0; |
| 1696 | int err; |
| 1697 | |
| 1698 | KPKTQ_INIT(&dpkts); |
| 1699 | KPKTQ_INIT(&dropped_pkts); |
| 1700 | KPKTQ_INIT(&disposed_pkts); |
| 1701 | KPKTQ_INIT(&transferred_pkts); |
| 1702 | |
| 1703 | if (__improbable(!dp_flow_rx_route_process(fsw, fe))) { |
| 1704 | SK_ERR("Rx route bad" ); |
| 1705 | fsw_snoop_and_dequeue(fe, target: &dropped_pkts, true); |
| 1706 | FSW_STATS_ADD(FSW_STATS_RX_FLOW_NONVIABLE, n_pkts); |
| 1707 | goto done; |
| 1708 | } |
| 1709 | |
| 1710 | if (fe->fe_nx_port == FSW_VP_HOST) { |
| 1711 | /* |
| 1712 | * The host ring does not exist anymore so we can't take |
| 1713 | * the enqueue path below. This path should only be hit |
| 1714 | * for the rare TCP fragmentation case.
| 1715 | */ |
| 1716 | fsw_host_rx(fsw, &fe->fe_rx_pktq);
| 1717 | return; |
| 1718 | } |
| 1719 | |
| 1720 | /* find the ring */ |
| 1721 | struct __kern_channel_ring *r; |
| 1722 | r = fsw_flow_get_rx_ring(fsw, fe); |
| 1723 | if (__improbable(r == NULL)) { |
| 1724 | fsw_snoop_and_dequeue(fe, &dropped_pkts, true);
| 1725 | goto done; |
| 1726 | } |
| 1727 | |
| 1728 | /* snoop before L2 is stripped */ |
| 1729 | if (__improbable(pktap_total_tap_count != 0)) { |
| 1730 | fsw_snoop(fsw, fe, true); |
| 1731 | } |
| 1732 | |
| 1733 | dpp = r->ckr_pp; |
| 1734 | /* batch allocate enough packets */ |
| 1735 | err = pp_alloc_pktq(dpp, 1, &dpkts, n_pkts, NULL, NULL, |
| 1736 | SKMEM_NOSLEEP); |
| 1737 | if (__improbable(err == ENOMEM)) { |
| 1738 | ASSERT(KPKTQ_EMPTY(&dpkts)); |
| 1739 | KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq); |
| 1740 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
| 1741 | SK_ERR("failed to alloc %u pkts for kr %s, 0x%llu" , n_pkts, |
| 1742 | r->ckr_name, SK_KVA(r)); |
| 1743 | goto done; |
| 1744 | } |
| 1745 | |
| 1746 | /* |
| 1747 | * estimate total number of buflets for the packet chain. |
| 1748 | */ |
| 1749 | cnt = howmany(fe->fe_rx_pktq_bytes, PP_BUF_SIZE_DEF(dpp)); |
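|      | /*
|      |  * Each allocated packet already carries one buflet, so only the
|      |  * excess beyond one-per-packet needs to be allocated up front,
|      |  * in batches of at most RX_BUFLET_BATCH_COUNT buflets.
|      |  */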
| 1750 | if (cnt > n_pkts) { |
| 1751 | ASSERT(dpp->pp_max_frags > 1); |
| 1752 | cnt -= n_pkts; |
| 1753 | buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, cnt); |
| 1754 | err = pp_alloc_buflet_batch(dpp, buf_array, &buf_cnt,
| 1755 | SKMEM_NOSLEEP, false); |
| 1756 | if (__improbable(buf_cnt == 0)) { |
| 1757 | KPKTQ_CONCAT(&dropped_pkts, &fe->fe_rx_pktq); |
| 1758 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
| 1759 | SK_ERR("failed to alloc %d buflets (err %d) for kr %s, " |
| 1760 | "0x%llu" , cnt, err, r->ckr_name, SK_KVA(r)); |
| 1761 | goto done; |
| 1762 | } |
| 1763 | err = 0; |
| 1764 | } |
| 1765 | |
| 1766 | /* extra processing for user flow */ |
| 1767 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_rx_pktq, tpkt) { |
| 1768 | err = 0; |
| 1769 | KPKTQ_REMOVE(&fe->fe_rx_pktq, pkt); |
| 1770 | if (fe->fe_rx_pktq_bytes > pkt->pkt_flow_ulen) { |
| 1771 | fe->fe_rx_pktq_bytes -= pkt->pkt_flow_ulen; |
| 1772 | } else { |
| 1773 | fe->fe_rx_pktq_bytes = 0; |
| 1774 | } |
| 1775 | err = flow_pkt_track(fe, pkt, true); |
| 1776 | _FSW_INJECT_ERROR(33, err, EPROTO, null_func); |
| 1777 | if (__improbable(err != 0)) { |
| 1778 | SK_ERR("flow_pkt_track failed (err %d)" , err); |
| 1779 | FSW_STATS_INC(FSW_STATS_RX_FLOW_TRACK_ERR); |
| 1780 | /* if need to trigger RST */ |
| 1781 | if (err == ENETRESET) { |
| 1782 | flow_track_abort_tcp(fe, pkt, NULL);
| 1783 | } |
| 1784 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
| 1785 | continue; |
| 1786 | } |
| 1787 | |
| 1788 | /* transfer to dpkt */ |
| 1789 | if (pkt->pkt_qum.qum_pp != dpp) { |
| 1790 | struct __kern_buflet *bprev, *bnew; |
| 1791 | struct __kern_packet *dpkt = NULL; |
| 1792 | uint32_t n_bufs, i; |
| 1793 | |
| 1794 | KPKTQ_DEQUEUE(&dpkts, dpkt); |
| 1795 | if (__improbable(dpkt == NULL)) { |
| 1796 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT); |
| 1797 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
| 1798 | continue; |
| 1799 | } |
| 1800 | n_bufs = howmany(pkt->pkt_length, PP_BUF_SIZE_DEF(dpp)); |
| 1801 | n_bufs--; |
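|      | /* the first buflet is already attached to dpkt; only add the rest */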
| 1802 | for (i = 0; i < n_bufs; i++) { |
| 1803 | if (__improbable(buf_cnt == 0)) { |
| 1804 | ASSERT(dpp->pp_max_frags > 1); |
| 1805 | buf_array_iter = 0; |
| 1806 | cnt = howmany(fe->fe_rx_pktq_bytes, |
| 1807 | PP_BUF_SIZE_DEF(dpp)); |
| 1808 | n_pkts = KPKTQ_LEN(&fe->fe_rx_pktq); |
| 1809 | if (cnt >= n_pkts) { |
| 1810 | cnt -= n_pkts; |
| 1811 | } else { |
| 1812 | cnt = 0; |
| 1813 | } |
| 1814 | cnt += (n_bufs - i); |
| 1815 | buf_cnt = MIN(RX_BUFLET_BATCH_COUNT, |
| 1816 | cnt); |
| 1817 | cnt = buf_cnt; |
| 1818 | err = pp_alloc_buflet_batch(dpp,
| 1819 | buf_array, &buf_cnt,
| 1820 | SKMEM_NOSLEEP, false); |
| 1821 | if (__improbable(buf_cnt == 0)) { |
| 1822 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_PKT); |
| 1823 | KPKTQ_ENQUEUE(&dropped_pkts, |
| 1824 | pkt); |
| 1825 | pkt = NULL; |
| 1826 | pp_free_packet_single(dpkt); |
| 1827 | dpkt = NULL; |
| 1828 | SK_ERR("failed to alloc %d " |
| 1829 | "buflets (err %d) for " |
| 1830 | "kr %s, 0x%llu" , cnt, err, |
| 1831 | r->ckr_name, SK_KVA(r)); |
| 1832 | break; |
| 1833 | } |
| 1834 | err = 0; |
| 1835 | } |
| 1836 | ASSERT(buf_cnt != 0); |
| 1837 | if (i == 0) { |
| 1838 | PKT_GET_FIRST_BUFLET(dpkt, 1, bprev); |
| 1839 | } |
| 1840 | bnew = (kern_buflet_t)buf_array[buf_array_iter]; |
| 1841 | buf_array[buf_array_iter] = 0; |
| 1842 | buf_array_iter++; |
| 1843 | buf_cnt--; |
| 1844 | VERIFY(kern_packet_add_buflet(SK_PKT2PH(dpkt), |
| 1845 | bprev, bnew) == 0); |
| 1846 | bprev = bnew; |
| 1847 | } |
| 1848 | if (__improbable(err != 0)) { |
| 1849 | continue; |
| 1850 | } |
| 1851 | err = copy_packet_from_dev(fsw, pkt, dpkt);
| 1852 | _FSW_INJECT_ERROR(43, err, EINVAL, null_func); |
| 1853 | if (__improbable(err != 0)) { |
| 1854 | SK_ERR("copy packet failed (err %d)" , err); |
| 1855 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
| 1856 | pp_free_packet_single(dpkt); |
| 1857 | dpkt = NULL; |
| 1858 | continue; |
| 1859 | } |
| 1860 | KPKTQ_ENQUEUE(&disposed_pkts, pkt); |
| 1861 | pkt = dpkt; |
| 1862 | } |
| 1863 | _UUID_COPY(pkt->pkt_flow_id, fe->fe_uuid); |
| 1864 | _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid); |
| 1865 | pkt->pkt_policy_id = fe->fe_policy_id; |
| 1866 | pkt->pkt_skip_policy_id = fe->fe_skip_policy_id; |
| 1867 | pkt->pkt_transport_protocol = fe->fe_transport_protocol; |
| 1868 | if (pkt->pkt_bufs_cnt > 1) { |
| 1869 | pkt->pkt_aggr_type = PKT_AGGR_SINGLE_IP; |
| 1870 | pkt->pkt_seg_cnt = 1; |
| 1871 | } |
| 1872 | KPKTQ_ENQUEUE(&transferred_pkts, pkt); |
| 1873 | } |
| 1874 | KPKTQ_FINI(&fe->fe_rx_pktq); |
| 1875 | KPKTQ_CONCAT(&fe->fe_rx_pktq, &transferred_pkts); |
| 1876 | KPKTQ_FINI(&transferred_pkts); |
| 1877 | |
| 1878 | fsw_ring_enqueue_tail_drop(fsw, r, &fe->fe_rx_pktq);
| 1879 | |
| 1880 | done: |
| 1881 | /* Free unused buflets */ |
| 1882 | while (buf_cnt > 0) { |
| 1883 | pp_free_buflet(dpp, (kern_buflet_t)(buf_array[buf_array_iter])); |
| 1884 | buf_array[buf_array_iter] = 0; |
| 1885 | buf_array_iter++; |
| 1886 | buf_cnt--; |
| 1887 | } |
| 1888 | dp_free_pktq(fsw, &dpkts);
| 1889 | dp_free_pktq(fsw, &disposed_pkts);
| 1890 | dp_drop_pktq(fsw, &dropped_pkts); |
| 1891 | } |
| 1892 | |
| 1893 | static inline void |
| 1894 | rx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
| 1895 | uint32_t flags) |
| 1896 | { |
| 1897 | ASSERT(!KPKTQ_EMPTY(&fe->fe_rx_pktq)); |
| 1898 | ASSERT(KPKTQ_LEN(&fe->fe_rx_pktq) != 0); |
| 1899 | |
| 1900 | SK_DF(SK_VERB_FSW_DP | SK_VERB_RX, "Rx %d pkts for fe %p port %d" , |
| 1901 | KPKTQ_LEN(&fe->fe_rx_pktq), fe, fe->fe_nx_port); |
| 1902 | |
| 1903 | /* flow related processing (default, agg, fpd, etc.) */ |
| 1904 | fe->fe_rx_process(fsw, fe, flags); |
| 1905 | |
| 1906 | if (__improbable(fe->fe_want_withdraw)) { |
| 1907 | fsw_reap_sched(fsw); |
| 1908 | } |
| 1909 | |
| 1910 | KPKTQ_FINI(&fe->fe_rx_pktq); |
| 1911 | } |
| 1912 | |
| 1913 | static inline void |
| 1914 | dp_rx_process_wake_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
| 1915 | { |
| 1916 | /* |
| 1917 | * We only care about wake packets of flows that belong to the
| 1918 | * flowswitch, as wake packets for the host stack are handled by the
| 1919 | * host input function.
| 1920 | */ |
| 1921 | #if (DEBUG || DEVELOPMENT) |
| 1922 | if (__improbable(fsw->fsw_ifp->if_xflags & IFXF_MARK_WAKE_PKT)) { |
| 1923 | /* |
| 1924 | * This is a one shot command |
| 1925 | */ |
| 1926 | fsw->fsw_ifp->if_xflags &= ~IFXF_MARK_WAKE_PKT; |
| 1927 | |
| 1928 | pkt->pkt_pflags |= PKT_F_WAKE_PKT; |
| 1929 | } |
| 1930 | #endif /* (DEBUG || DEVELOPMENT) */ |
| 1931 | if (__improbable(pkt->pkt_pflags & PKT_F_WAKE_PKT)) { |
| 1932 | if_ports_used_match_pkt(fsw->fsw_ifp, pkt);
| 1933 | } |
| 1934 | } |
| 1935 | |
| 1936 | static void |
| 1937 | _fsw_receive_locked(struct nx_flowswitch *fsw, struct pktq *pktq) |
| 1938 | { |
| 1939 | struct __kern_packet *pkt, *tpkt; |
| 1940 | struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); |
| 1941 | struct flow_entry *fe, *prev_fe; |
| 1942 | sa_family_t af; |
| 1943 | struct pktq host_pkts, dropped_pkts; |
| 1944 | int err; |
| 1945 | |
| 1946 | KPKTQ_INIT(&host_pkts); |
| 1947 | KPKTQ_INIT(&dropped_pkts); |
| 1948 | |
| 1949 | if (__improbable(FSW_QUIESCED(fsw))) { |
| 1950 | DTRACE_SKYWALK1(rx__quiesced, struct nx_flowswitch *, fsw); |
| 1951 | KPKTQ_CONCAT(&dropped_pkts, pktq); |
| 1952 | goto done; |
| 1953 | } |
| 1954 | if (__improbable(fsw->fsw_demux == NULL)) { |
| 1955 | KPKTQ_CONCAT(&dropped_pkts, pktq); |
| 1956 | goto done; |
| 1957 | } |
| 1958 | |
| 1959 | prev_fe = NULL; |
| 1960 | KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) { |
| 1961 | if (__probable(tpkt)) { |
| 1962 | void *baddr; |
| 1963 | MD_BUFLET_ADDR_ABS_PKT(tpkt, baddr); |
| 1964 | SK_PREFETCH(baddr, 0); |
| 1965 | /* prefetch L3 and L4 flow structs */ |
| 1966 | SK_PREFETCHW(tpkt->pkt_flow, 0); |
| 1967 | SK_PREFETCHW(tpkt->pkt_flow, 128); |
| 1968 | } |
| 1969 | |
| 1970 | KPKTQ_REMOVE(pktq, pkt); |
| 1971 | |
| 1972 | pkt = rx_prepare_packet(fsw, pkt); |
| 1973 | |
| 1974 | af = fsw->fsw_demux(fsw, pkt); |
| 1975 | if (__improbable(af == AF_UNSPEC)) { |
| 1976 | KPKTQ_ENQUEUE(&host_pkts, pkt); |
| 1977 | continue; |
| 1978 | } |
| 1979 | |
| 1980 | err = flow_pkt_classify(pkt, fsw->fsw_ifp, af, TRUE);
| 1981 | _FSW_INJECT_ERROR(1, err, ENXIO, null_func); |
| 1982 | if (__improbable(err != 0)) { |
| 1983 | FSW_STATS_INC(FSW_STATS_RX_FLOW_EXTRACT_ERR); |
| 1984 | KPKTQ_ENQUEUE(&host_pkts, pkt); |
| 1985 | continue; |
| 1986 | } |
| 1987 | |
| 1988 | if (__improbable(pkt->pkt_flow_ip_is_frag)) { |
| 1989 | pkt = rx_process_ip_frag(fsw, pkt); |
| 1990 | if (pkt == NULL) { |
| 1991 | continue; |
| 1992 | } |
| 1993 | } |
| 1994 | |
| 1995 | prev_fe = fe = rx_lookup_flow(fsw, pkt, prev_fe); |
| 1996 | if (__improbable(fe == NULL)) { |
| 1997 | KPKTQ_ENQUEUE_LIST(&host_pkts, pkt); |
| 1998 | continue; |
| 1999 | } |
| 2000 | |
| 2001 | fe->fe_rx_pktq_bytes += pkt->pkt_flow_ulen; |
| 2002 | |
| 2003 | dp_rx_process_wake_packet(fsw, pkt); |
| 2004 | |
| 2005 | rx_flow_batch_packet(&fes, fe, pkt);
| 2006 | prev_fe = fe; |
| 2007 | } |
| 2008 | |
| 2009 | struct flow_entry *tfe = NULL; |
| 2010 | TAILQ_FOREACH_SAFE(fe, &fes, fe_rx_link, tfe) { |
| 2011 | rx_flow_process(fsw, fe, 0);
| 2012 | TAILQ_REMOVE(&fes, fe, fe_rx_link); |
| 2013 | fe->fe_rx_pktq_bytes = 0; |
| 2014 | fe->fe_rx_frag_count = 0; |
| 2015 | flow_entry_release(&fe);
| 2016 | } |
| 2017 | |
| 2018 | if (!KPKTQ_EMPTY(&host_pkts)) { |
| 2019 | fsw_host_rx(fsw, &host_pkts);
| 2020 | } |
| 2021 | |
| 2022 | done: |
| 2023 | dp_drop_pktq(fsw, &dropped_pkts); |
| 2024 | } |
| 2025 | |
| 2026 | #if (DEVELOPMENT || DEBUG) |
| 2027 | static void |
| 2028 | fsw_rps_rx(struct nx_flowswitch *fsw, uint32_t id, |
| 2029 | struct __kern_packet *pkt) |
| 2030 | { |
| 2031 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id]; |
| 2032 | |
| 2033 | lck_mtx_lock_spin(&frt->frt_lock); |
| 2034 | KPKTQ_ENQUEUE(&frt->frt_pktq, pkt); |
| 2035 | lck_mtx_unlock(&frt->frt_lock); |
| 2036 | } |
| 2037 | |
| 2038 | static void |
| 2039 | fsw_rps_thread_schedule(struct nx_flowswitch *fsw, uint32_t id) |
| 2040 | { |
| 2041 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[id]; |
| 2042 | |
| 2043 | ASSERT(frt->frt_thread != THREAD_NULL); |
| 2044 | lck_mtx_lock_spin(&frt->frt_lock); |
| 2045 | ASSERT(!(frt->frt_flags & (FRT_TERMINATING | FRT_TERMINATED))); |
| 2046 | |
| 2047 | frt->frt_requests++; |
| 2048 | if (!(frt->frt_flags & FRT_RUNNING)) { |
| 2049 | thread_wakeup((caddr_t)frt); |
| 2050 | } |
| 2051 | lck_mtx_unlock(&frt->frt_lock); |
| 2052 | } |
| 2053 | |
| 2054 | __attribute__((noreturn)) |
| 2055 | static void |
| 2056 | fsw_rps_thread_cont(void *v, wait_result_t w) |
| 2057 | { |
| 2058 | struct fsw_rps_thread *frt = v; |
| 2059 | struct nx_flowswitch *fsw = frt->frt_fsw; |
| 2060 | |
| 2061 | lck_mtx_lock(&frt->frt_lock); |
| 2062 | if (__improbable(w == THREAD_INTERRUPTIBLE || |
| 2063 | (frt->frt_flags & FRT_TERMINATING) != 0)) { |
| 2064 | goto terminate; |
| 2065 | } |
| 2066 | if (KPKTQ_EMPTY(&frt->frt_pktq)) { |
| 2067 | goto done; |
| 2068 | } |
| 2069 | frt->frt_flags |= FRT_RUNNING; |
| 2070 | |
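|      | /*
|      |  * Drain the thread's pktq, then re-check: if new enqueue requests
|      |  * arrived while processing (frt_requests changed), loop and drain
|      |  * again before going back to sleep.
|      |  */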
| 2071 | for (;;) { |
| 2072 | uint32_t requests = frt->frt_requests; |
| 2073 | struct pktq pkts; |
| 2074 | |
| 2075 | KPKTQ_INIT(&pkts); |
| 2076 | KPKTQ_CONCAT(&pkts, &frt->frt_pktq); |
| 2077 | lck_mtx_unlock(&frt->frt_lock); |
| 2078 | |
| 2079 | sk_protect_t protect; |
| 2080 | protect = sk_sync_protect(); |
| 2081 | FSW_RLOCK(fsw); |
| 2082 | _fsw_receive_locked(fsw, &pkts); |
| 2083 | FSW_RUNLOCK(fsw); |
| 2084 | sk_sync_unprotect(protect); |
| 2085 | |
| 2086 | lck_mtx_lock(&frt->frt_lock); |
| 2087 | if ((frt->frt_flags & FRT_TERMINATING) != 0 || |
| 2088 | requests == frt->frt_requests) { |
| 2089 | frt->frt_requests = 0; |
| 2090 | break; |
| 2091 | } |
| 2092 | } |
| 2093 | |
| 2094 | done: |
| 2095 | lck_mtx_unlock(&frt->frt_lock); |
| 2096 | if (!(frt->frt_flags & FRT_TERMINATING)) { |
| 2097 | frt->frt_flags &= ~FRT_RUNNING; |
| 2098 | assert_wait(frt, THREAD_UNINT); |
| 2099 | thread_block_parameter(fsw_rps_thread_cont, frt); |
| 2100 | __builtin_unreachable(); |
| 2101 | } else { |
| 2102 | terminate: |
| 2103 | LCK_MTX_ASSERT(&frt->frt_lock, LCK_MTX_ASSERT_OWNED); |
| 2104 | frt->frt_flags &= ~(FRT_RUNNING | FRT_TERMINATING); |
| 2105 | frt->frt_flags |= FRT_TERMINATED; |
| 2106 | |
| 2107 | if (frt->frt_flags & FRT_TERMINATEBLOCK) { |
| 2108 | thread_wakeup((caddr_t)&frt); |
| 2109 | } |
| 2110 | lck_mtx_unlock(&frt->frt_lock); |
| 2111 | |
| 2112 | SK_D("fsw_rx_%s_%d terminated" , if_name(fsw->fsw_ifp), |
| 2113 | frt->frt_idx); |
| 2114 | |
| 2115 | /* for the extra refcnt from kernel_thread_start() */ |
| 2116 | thread_deallocate(current_thread()); |
| 2117 | /* this is the end */ |
| 2118 | thread_terminate(current_thread()); |
| 2119 | /* NOTREACHED */ |
| 2120 | __builtin_unreachable(); |
| 2121 | } |
| 2122 | |
| 2123 | /* must never get here */ |
| 2124 | VERIFY(0); |
| 2125 | /* NOTREACHED */ |
| 2126 | __builtin_unreachable(); |
| 2127 | } |
| 2128 | |
| 2129 | __attribute__((noreturn)) |
| 2130 | static void |
| 2131 | fsw_rps_thread_func(void *v, wait_result_t w) |
| 2132 | { |
| 2133 | #pragma unused(w) |
| 2134 | struct fsw_rps_thread *frt = v; |
| 2135 | struct nx_flowswitch *fsw = frt->frt_fsw; |
| 2136 | |
| 2137 | char thread_name[MAXTHREADNAMESIZE]; |
| 2138 | bzero(thread_name, sizeof(thread_name)); |
| 2139 | (void) snprintf(thread_name, sizeof(thread_name), "fsw_rx_%s_%d" , |
| 2140 | if_name(fsw->fsw_ifp), frt->frt_idx); |
| 2141 | thread_set_thread_name(frt->frt_thread, thread_name); |
| 2142 | SK_D("%s spawned" , thread_name); |
| 2143 | |
| 2144 | net_thread_marks_push(NET_THREAD_SYNC_RX); |
| 2145 | assert_wait(frt, THREAD_UNINT); |
| 2146 | (void) thread_block_parameter(fsw_rps_thread_cont, frt); |
| 2147 | |
| 2148 | __builtin_unreachable(); |
| 2149 | } |
| 2150 | |
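|      | /*
|      |  * Tear down an RPS worker thread: mark it FRT_TERMINATING, wake it
|      |  * if idle, and wait (with a short first deadline, then longer ones)
|      |  * until the thread has marked itself FRT_TERMINATED.
|      |  */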
| 2151 | static void |
| 2152 | fsw_rps_thread_join(struct nx_flowswitch *fsw, uint32_t i) |
| 2153 | { |
| 2154 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i]; |
| 2155 | uint64_t f = (1 * NSEC_PER_MSEC); |
| 2156 | uint64_t s = (1000 * NSEC_PER_SEC); |
| 2157 | uint32_t c = 0; |
| 2158 | |
| 2159 | lck_mtx_lock(&frt->frt_lock); |
| 2160 | frt->frt_flags |= FRT_TERMINATING; |
| 2161 | |
| 2162 | while (!(frt->frt_flags & FRT_TERMINATED)) { |
| 2163 | uint64_t t = 0; |
| 2164 | nanoseconds_to_absolutetime((c++ == 0) ? f : s, &t); |
| 2165 | clock_absolutetime_interval_to_deadline(t, &t); |
| 2166 | ASSERT(t != 0); |
| 2167 | |
| 2168 | frt->frt_flags |= FRT_TERMINATEBLOCK; |
| 2169 | if (!(frt->frt_flags & FRT_RUNNING)) { |
| 2170 | thread_wakeup_one((caddr_t)frt); |
| 2171 | } |
| 2172 | (void) assert_wait_deadline(&frt->frt_thread, THREAD_UNINT, t); |
| 2173 | lck_mtx_unlock(&frt->frt_lock); |
| 2174 | thread_block(THREAD_CONTINUE_NULL); |
| 2175 | lck_mtx_lock(&frt->frt_lock); |
| 2176 | frt->frt_flags &= ~FRT_TERMINATEBLOCK; |
| 2177 | } |
| 2178 | ASSERT(frt->frt_flags & FRT_TERMINATED); |
| 2179 | lck_mtx_unlock(&frt->frt_lock); |
| 2180 | frt->frt_thread = THREAD_NULL; |
| 2181 | } |
| 2182 | |
| 2183 | static void |
| 2184 | fsw_rps_thread_spawn(struct nx_flowswitch *fsw, uint32_t i) |
| 2185 | { |
| 2186 | kern_return_t error; |
| 2187 | struct fsw_rps_thread *frt = &fsw->fsw_rps_threads[i]; |
| 2188 | lck_mtx_init(&frt->frt_lock, &nexus_lock_group, &nexus_lock_attr); |
| 2189 | frt->frt_idx = i; |
| 2190 | frt->frt_fsw = fsw; |
| 2191 | error = kernel_thread_start(fsw_rps_thread_func, frt, &frt->frt_thread); |
| 2192 | ASSERT(!error); |
| 2193 | KPKTQ_INIT(&frt->frt_pktq); |
| 2194 | } |
| 2195 | |
| 2196 | int |
| 2197 | fsw_rps_set_nthreads(struct nx_flowswitch *fsw, uint32_t n)
| 2198 | { |
| 2199 | if (n > FSW_RPS_MAX_NTHREADS) { |
| 2200 | SK_ERR("rps nthreads %d, max %d" , n, FSW_RPS_MAX_NTHREADS); |
| 2201 | return EINVAL; |
| 2202 | } |
| 2203 | |
| 2204 | FSW_WLOCK(fsw); |
| 2205 | if (n < fsw->fsw_rps_nthreads) { |
| 2206 | for (uint32_t i = n; i < fsw->fsw_rps_nthreads; i++) { |
| 2207 | fsw_rps_thread_join(fsw, i); |
| 2208 | } |
| 2209 | fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread, |
| 2210 | fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, |
| 2211 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
| 2212 | } else if (n > fsw->fsw_rps_nthreads) { |
| 2213 | fsw->fsw_rps_threads = krealloc_type(struct fsw_rps_thread, |
| 2214 | fsw->fsw_rps_nthreads, n, fsw->fsw_rps_threads, |
| 2215 | Z_WAITOK | Z_ZERO | Z_NOFAIL); |
| 2216 | for (uint32_t i = fsw->fsw_rps_nthreads; i < n; i++) { |
| 2217 | fsw_rps_thread_spawn(fsw, i); |
| 2218 | } |
| 2219 | } |
| 2220 | fsw->fsw_rps_nthreads = n; |
| 2221 | FSW_WUNLOCK(fsw); |
| 2222 | return 0; |
| 2223 | } |
| 2224 | |
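|      | /*
|      |  * Pick the RPS worker thread for a packet by hashing its 5-tuple
|      |  * flow key modulo the number of RPS threads, so packets of the same
|      |  * flow always land on the same thread and stay in order. Packets
|      |  * that cannot be classified fall back to thread 0.
|      |  */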
| 2225 | static uint32_t |
| 2226 | get_rps_id(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
| 2227 | { |
| 2228 | sa_family_t af = fsw->fsw_demux(fsw, pkt); |
| 2229 | if (__improbable(af == AF_UNSPEC)) { |
| 2230 | return 0; |
| 2231 | } |
| 2232 | |
| 2233 | flow_pkt_classify(pkt, fsw->fsw_ifp, af, true); |
| 2234 | |
| 2235 | if (__improbable((pkt->pkt_qum_qflags & |
| 2236 | QUM_F_FLOW_CLASSIFIED) == 0)) { |
| 2237 | return 0; |
| 2238 | } |
| 2239 | |
| 2240 | struct flow_key key; |
| 2241 | flow_pkt2key(pkt, true, &key); |
| 2242 | key.fk_mask = FKMASK_5TUPLE; |
| 2243 | |
| 2244 | uint32_t id = flow_key_hash(&key) % fsw->fsw_rps_nthreads; |
| 2245 | |
| 2246 | return id; |
| 2247 | } |
| 2248 | |
| 2249 | #endif /* !DEVELOPMENT && !DEBUG */ |
| 2250 | |
| 2251 | void |
| 2252 | fsw_receive(struct nx_flowswitch *fsw, struct pktq *pktq) |
| 2253 | { |
| 2254 | FSW_RLOCK(fsw); |
| 2255 | #if (DEVELOPMENT || DEBUG) |
| 2256 | if (fsw->fsw_rps_nthreads != 0) { |
| 2257 | struct __kern_packet *pkt, *tpkt; |
| 2258 | bitmap_t map = 0; |
| 2259 | |
| 2260 | _CASSERT(BITMAP_LEN(FSW_RPS_MAX_NTHREADS) == 1); |
| 2261 | KPKTQ_FOREACH_SAFE(pkt, pktq, tpkt) { |
| 2262 | uint32_t id = get_rps_id(fsw, pkt); |
| 2263 | KPKTQ_REMOVE(pktq, pkt); |
| 2264 | fsw_rps_rx(fsw, id, pkt); |
| 2265 | bitmap_set(&map, id); |
| 2266 | } |
| 2267 | for (int i = bitmap_first(&map, 64); i >= 0; |
| 2268 | i = bitmap_next(&map, i)) { |
| 2269 | fsw_rps_thread_schedule(fsw, i); |
| 2270 | } |
| 2271 | } else |
| 2272 | #endif /* !DEVELOPMENT && !DEBUG */ |
| 2273 | { |
| 2274 | _fsw_receive_locked(fsw, pktq); |
| 2275 | } |
| 2276 | FSW_RUNLOCK(fsw); |
| 2277 | } |
| 2278 | |
| 2279 | int |
| 2280 | fsw_dev_input_netem_dequeue(void *handle, pktsched_pkt_t * pkts, |
| 2281 | uint32_t n_pkts) |
| 2282 | { |
| 2284 | struct nx_flowswitch *fsw = handle; |
| 2285 | struct __kern_packet *kpkts[FSW_VP_DEV_BATCH_MAX]; |
| 2286 | struct pktq pktq; |
| 2287 | sk_protect_t protect; |
| 2288 | uint32_t i; |
| 2289 | |
| 2290 | ASSERT(n_pkts <= FSW_VP_DEV_BATCH_MAX); |
| 2291 | |
| 2292 | for (i = 0; i < n_pkts; i++) { |
| 2293 | ASSERT(pkts[i].pktsched_ptype == QP_PACKET); |
| 2294 | ASSERT(pkts[i].pktsched_pkt_kpkt != NULL); |
| 2295 | kpkts[i] = pkts[i].pktsched_pkt_kpkt; |
| 2296 | } |
| 2297 | |
| 2298 | protect = sk_sync_protect(); |
| 2299 | KPKTQ_INIT(&pktq); |
| 2300 | pkts_to_pktq(kpkts, n_pkts, &pktq);
| 2301 | |
| 2302 | fsw_receive(fsw, &pktq);
| 2303 | KPKTQ_FINI(&pktq); |
| 2304 | sk_sync_unprotect(protect); |
| 2305 | |
| 2306 | return 0; |
| 2307 | } |
| 2308 | |
| 2309 | static void |
| 2310 | fsw_dev_input_netem_enqueue(struct nx_flowswitch *fsw, struct pktq *q) |
| 2311 | { |
| 2312 | classq_pkt_t p; |
| 2313 | struct netem *ne; |
| 2314 | struct __kern_packet *pkt, *tpkt; |
| 2315 | |
| 2316 | ASSERT(fsw->fsw_ifp != NULL); |
| 2317 | ne = fsw->fsw_ifp->if_input_netem; |
| 2318 | ASSERT(ne != NULL); |
| 2319 | KPKTQ_FOREACH_SAFE(pkt, q, tpkt) { |
| 2320 | bool pdrop; |
| 2321 | KPKTQ_REMOVE(q, pkt); |
| 2322 | CLASSQ_PKT_INIT_PACKET(&p, pkt); |
| 2323 | netem_enqueue(ne, &p, &pdrop);
| 2324 | } |
| 2325 | } |
| 2326 | |
| 2327 | void |
| 2328 | fsw_devna_rx(struct nexus_adapter *devna, struct __kern_packet *pkt_head, |
| 2329 | struct nexus_pkt_stats *out_stats) |
| 2330 | { |
| 2331 | struct __kern_packet *pkt = pkt_head, *next; |
| 2332 | struct nx_flowswitch *fsw; |
| 2333 | uint32_t n_bytes = 0, n_pkts = 0; |
| 2334 | uint64_t total_pkts = 0, total_bytes = 0; |
| 2335 | struct pktq q; |
| 2336 | |
| 2337 | KPKTQ_INIT(&q); |
| 2338 | if (__improbable(devna->na_ifp == NULL || |
| 2339 | (fsw = fsw_ifp_to_fsw(devna->na_ifp)) == NULL)) { |
| 2340 | SK_ERR("fsw not attached, dropping %d pkts" , KPKTQ_LEN(&q)); |
| 2341 | pp_free_packet_chain(pkt_head, NULL); |
| 2342 | return; |
| 2343 | } |
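|      | /*
|      |  * Walk the packet chain, dropping packets that were never finalized,
|      |  * and accumulate the rest into a pktq. The queue is flushed into the
|      |  * flowswitch (or into netem, if input emulation is active) whenever
|      |  * it reaches fsw_rx_batch packets or the chain ends.
|      |  */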
| 2344 | while (pkt != NULL) { |
| 2345 | if (__improbable(pkt->pkt_trace_id != 0)) { |
| 2346 | KDBG(SK_KTRACE_PKT_RX_DRV | DBG_FUNC_END, pkt->pkt_trace_id); |
| 2347 | KDBG(SK_KTRACE_PKT_RX_FSW | DBG_FUNC_START, pkt->pkt_trace_id); |
| 2348 | } |
| 2349 | next = pkt->pkt_nextpkt; |
| 2350 | pkt->pkt_nextpkt = NULL; |
| 2351 | |
| 2352 | if (__probable((pkt->pkt_qum_qflags & QUM_F_DROPPED) == 0)) { |
| 2353 | KPKTQ_ENQUEUE(&q, pkt); |
| 2354 | n_bytes += pkt->pkt_length; |
| 2355 | } else { |
| 2356 | DTRACE_SKYWALK1(non__finalized__drop, |
| 2357 | struct __kern_packet *, pkt); |
| 2358 | FSW_STATS_INC(FSW_STATS_RX_PKT_NOT_FINALIZED); |
| 2359 | pp_free_packet_single(pkt); |
| 2360 | pkt = NULL; |
| 2361 | } |
| 2362 | n_pkts = KPKTQ_LEN(&q); |
| 2363 | if (n_pkts == fsw_rx_batch || (next == NULL && n_pkts > 0)) { |
| 2364 | if (__improbable(fsw->fsw_ifp->if_input_netem != NULL)) { |
| 2365 | fsw_dev_input_netem_enqueue(fsw, &q);
| 2366 | } else { |
| 2367 | fsw_receive(fsw, &q);
| 2368 | } |
| 2369 | total_pkts += n_pkts; |
| 2370 | total_bytes += n_bytes; |
| 2371 | n_pkts = 0; |
| 2372 | n_bytes = 0; |
| 2373 | KPKTQ_FINI(&q); |
| 2374 | } |
| 2375 | pkt = next; |
| 2376 | } |
| 2377 | ASSERT(KPKTQ_LEN(&q) == 0); |
| 2378 | FSW_STATS_ADD(FSW_STATS_RX_PACKETS, total_pkts); |
| 2379 | if (out_stats != NULL) { |
| 2380 | out_stats->nps_pkts = total_pkts; |
| 2381 | out_stats->nps_bytes = total_bytes; |
| 2382 | } |
| 2383 | KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(devna), total_pkts, total_bytes); |
| 2384 | } |
| 2385 | |
| 2386 | static int |
| 2387 | dp_copy_to_dev_mbuf(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
| 2388 | struct __kern_packet *dpkt) |
| 2389 | { |
| 2390 | struct mbuf *m = NULL; |
| 2391 | uint32_t bdlen, bdlim, bdoff; |
| 2392 | uint8_t *bdaddr; |
| 2393 | unsigned int one = 1; |
| 2394 | int err = 0; |
| 2395 | |
| 2396 | err = mbuf_allocpacket(MBUF_DONTWAIT,
| 2397 | (fsw->fsw_frame_headroom + spkt->pkt_length), &one, &m);
| 2398 | #if (DEVELOPMENT || DEBUG) |
| 2399 | if (m != NULL) { |
| 2400 | _FSW_INJECT_ERROR(11, m, NULL, m_freem, m); |
| 2401 | } |
| 2402 | #endif /* DEVELOPMENT || DEBUG */ |
| 2403 | if (__improbable(m == NULL)) { |
| 2404 | FSW_STATS_INC(FSW_STATS_DROP_NOMEM_MBUF); |
| 2405 | err = ENOBUFS; |
| 2406 | goto done; |
| 2407 | } |
| 2408 | |
| 2409 | MD_BUFLET_ADDR_ABS_DLEN(dpkt, bdaddr, bdlen, bdlim, bdoff); |
| 2410 | if (fsw->fsw_frame_headroom > bdlim) { |
| 2411 | SK_ERR("not enough space in buffer for headroom" ); |
| 2412 | err = EINVAL; |
| 2413 | goto done; |
| 2414 | } |
| 2415 | |
| 2416 | dpkt->pkt_headroom = fsw->fsw_frame_headroom; |
| 2417 | dpkt->pkt_mbuf = m; |
| 2418 | dpkt->pkt_pflags |= PKT_F_MBUF_DATA; |
| 2419 | |
| 2420 | /* packet copy into mbuf */ |
| 2421 | fsw->fsw_pkt_copy_to_mbuf(NR_TX, SK_PTR_ENCODE(spkt, |
| 2422 | METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)), 0, m, |
| 2423 | fsw->fsw_frame_headroom, spkt->pkt_length, |
| 2424 | PACKET_HAS_PARTIAL_CHECKSUM(spkt), |
| 2425 | spkt->pkt_csum_tx_start_off); |
| 2426 | FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2MBUF); |
| 2427 | |
| 2428 | /* header copy into dpkt buffer for classification */ |
| 2429 | kern_packet_t sph = SK_PTR_ENCODE(spkt, |
| 2430 | METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)); |
| 2431 | kern_packet_t dph = SK_PTR_ENCODE(dpkt, |
| 2432 | METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt)); |
| 2433 | uint32_t copy_len = MIN(spkt->pkt_length, bdlim - dpkt->pkt_headroom); |
| 2434 | fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, dpkt->pkt_headroom, |
| 2435 | sph, spkt->pkt_headroom, copy_len, FALSE, 0, 0, 0); |
| 2436 | |
| 2437 | /* |
| 2438 | * fsw->fsw_frame_headroom is after m_data, so m_data is treated the same
| 2439 | * as the buflet's baddr: m_data always points to the beginning of the
| 2440 | * packet and should correspond to baddr + headroom.
| 2441 | */ |
| 2442 | ASSERT((uintptr_t)m->m_data == |
| 2443 | ((uintptr_t)mbuf_datastart(m) + fsw->fsw_frame_headroom)); |
| 2444 | |
| 2445 | done: |
| 2446 | return err; |
| 2447 | } |
| 2448 | |
| 2449 | static int |
| 2450 | dp_copy_to_dev_pkt(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
| 2451 | struct __kern_packet *dpkt) |
| 2452 | { |
| 2453 | struct ifnet *ifp = fsw->fsw_ifp; |
| 2454 | uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom; |
| 2455 | |
| 2456 | if (headroom > UINT8_MAX) { |
| 2457 | SK_ERR("headroom too large %d" , headroom); |
| 2458 | return ERANGE; |
| 2459 | } |
| 2460 | dpkt->pkt_headroom = (uint8_t)headroom; |
| 2461 | ASSERT((dpkt->pkt_headroom & 0x7) == 0); |
| 2462 | dpkt->pkt_l2_len = 0; |
| 2463 | dpkt->pkt_link_flags = spkt->pkt_link_flags; |
| 2464 | |
| 2465 | kern_packet_t sph = SK_PTR_ENCODE(spkt, |
| 2466 | METADATA_TYPE(spkt), METADATA_SUBTYPE(spkt)); |
| 2467 | kern_packet_t dph = SK_PTR_ENCODE(dpkt, |
| 2468 | METADATA_TYPE(dpkt), METADATA_SUBTYPE(dpkt)); |
| 2469 | fsw->fsw_pkt_copy_from_pkt(NR_TX, dph, |
| 2470 | dpkt->pkt_headroom, sph, spkt->pkt_headroom, |
| 2471 | spkt->pkt_length, PACKET_HAS_PARTIAL_CHECKSUM(spkt), |
| 2472 | (spkt->pkt_csum_tx_start_off - spkt->pkt_headroom), |
| 2473 | (spkt->pkt_csum_tx_stuff_off - spkt->pkt_headroom), |
| 2474 | (spkt->pkt_csum_flags & PACKET_CSUM_ZERO_INVERT)); |
| 2475 | |
| 2476 | FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT); |
| 2477 | |
| 2478 | return 0; |
| 2479 | } |
| 2480 | |
| 2481 | #if SK_LOG |
| 2482 | /* Hoisted out of line to reduce kernel stack footprint */ |
| 2483 | SK_LOG_ATTRIBUTE |
| 2484 | static void |
| 2485 | dp_copy_to_dev_log(struct nx_flowswitch *fsw, const struct kern_pbufpool *pp, |
| 2486 | struct __kern_packet *spkt, struct __kern_packet *dpkt, int error) |
| 2487 | { |
| 2488 | struct proc *p = current_proc(); |
| 2489 | struct ifnet *ifp = fsw->fsw_ifp; |
| 2490 | uint64_t logflags = (SK_VERB_FSW_DP | SK_VERB_TX); |
| 2491 | |
| 2492 | if (error == ERANGE) { |
| 2493 | SK_ERR("packet too long, hr(fr+tx)+slen (%u+%u)+%u > " |
| 2494 | "dev_pp_max %u" , (uint32_t)fsw->fsw_frame_headroom, |
| 2495 | (uint32_t)ifp->if_tx_headroom, spkt->pkt_length, |
| 2496 | (uint32_t)pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)); |
| 2497 | } else if (error == ENOBUFS) { |
| 2498 | SK_DF(logflags, "%s(%d) packet allocation failure" , |
| 2499 | sk_proc_name_address(p), sk_proc_pid(p)); |
| 2500 | } else if (error == 0) { |
| 2501 | ASSERT(dpkt != NULL); |
| 2502 | char *daddr; |
| 2503 | MD_BUFLET_ADDR_ABS(dpkt, daddr); |
| 2504 | SK_DF(logflags, "%s(%d) splen %u dplen %u hr %u (fr/tx %u/%u)" , |
| 2505 | sk_proc_name_address(p), sk_proc_pid(p), spkt->pkt_length, |
| 2506 | dpkt->pkt_length, (uint32_t)dpkt->pkt_headroom, |
| 2507 | (uint32_t)fsw->fsw_frame_headroom, |
| 2508 | (uint32_t)ifp->if_tx_headroom); |
| 2509 | SK_DF(logflags | SK_VERB_DUMP, "%s" , |
| 2510 | sk_dump("buf" , daddr, dpkt->pkt_length, 128, NULL, 0)); |
| 2511 | } else { |
| 2512 | SK_DF(logflags, "%s(%d) error %d" , error); |
| 2513 | } |
| 2514 | } |
| 2515 | #else |
| 2516 | #define dp_copy_to_dev_log(...) |
| 2517 | #endif /* SK_LOG */ |
| 2518 | |
| 2519 | static void |
| 2520 | fsw_pkt_copy_metadata(struct __kern_packet *spkt, struct __kern_packet *dpkt) |
| 2521 | { |
| 2522 | ASSERT(!(spkt->pkt_pflags & PKT_F_MBUF_MASK)); |
| 2523 | ASSERT(!(spkt->pkt_pflags & PKT_F_PKT_MASK)); |
| 2524 | |
| 2525 | SK_PREFETCHW(dpkt->pkt_qum_buf.buf_addr, 0); |
| 2526 | /* Copy packet metadata */ |
| 2527 | _QUM_COPY(&(spkt)->pkt_qum, &(dpkt)->pkt_qum); |
| 2528 | _PKT_COPY(spkt, dpkt); |
| 2529 | _PKT_COPY_TX_PORT_DATA(spkt, dpkt); |
| 2530 | ASSERT((dpkt->pkt_qum.qum_qflags & QUM_F_KERNEL_ONLY) || |
| 2531 | !PP_KERNEL_ONLY(dpkt->pkt_qum.qum_pp)); |
| 2532 | ASSERT(dpkt->pkt_mbuf == NULL); |
| 2533 | |
| 2534 | /* Copy AQM metadata */ |
| 2535 | dpkt->pkt_flowsrc_type = spkt->pkt_flowsrc_type; |
| 2536 | dpkt->pkt_flowsrc_fidx = spkt->pkt_flowsrc_fidx; |
| 2537 | _CASSERT((offsetof(struct __flow, flow_src_id) % 8) == 0); |
| 2538 | _UUID_COPY(dpkt->pkt_flowsrc_id, spkt->pkt_flowsrc_id); |
| 2539 | _UUID_COPY(dpkt->pkt_policy_euuid, spkt->pkt_policy_euuid); |
| 2540 | dpkt->pkt_policy_id = spkt->pkt_policy_id; |
| 2541 | dpkt->pkt_skip_policy_id = spkt->pkt_skip_policy_id; |
| 2542 | } |
| 2543 | |
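|      | /*
|      |  * Copy a packet bound for the device: metadata is copied first, then
|      |  * the payload goes either into an attached mbuf (compat, mbuf-based
|      |  * drivers) or into the device pool packet itself (native drivers),
|      |  * depending on the interface's classq enqueue type.
|      |  */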
| 2544 | static int |
| 2545 | dp_copy_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt, |
| 2546 | struct __kern_packet *dpkt) |
| 2547 | { |
| 2548 | const struct kern_pbufpool *pp = dpkt->pkt_qum.qum_pp; |
| 2549 | struct ifnet *ifp = fsw->fsw_ifp; |
| 2550 | uint32_t dev_pkt_len; |
| 2551 | int err = 0; |
| 2552 | |
| 2553 | fsw_pkt_copy_metadata(spkt, dpkt); |
| 2554 | switch (fsw->fsw_classq_enq_ptype) { |
| 2555 | case QP_MBUF: |
| 2556 | err = dp_copy_to_dev_mbuf(fsw, spkt, dpkt); |
| 2557 | break; |
| 2558 | |
| 2559 | case QP_PACKET: |
| 2560 | dev_pkt_len = fsw->fsw_frame_headroom + ifp->if_tx_headroom + |
| 2561 | spkt->pkt_length; |
| 2562 | if (dev_pkt_len > pp->pp_max_frags * PP_BUF_SIZE_DEF(pp)) { |
| 2563 | FSW_STATS_INC(FSW_STATS_TX_COPY_BAD_LEN); |
| 2564 | err = ERANGE; |
| 2565 | goto done; |
| 2566 | } |
| 2567 | err = dp_copy_to_dev_pkt(fsw, spkt, dpkt); |
| 2568 | break; |
| 2569 | |
| 2570 | default: |
| 2571 | VERIFY(0); |
| 2572 | __builtin_unreachable(); |
| 2573 | } |
| 2574 | done: |
| 2575 | dp_copy_to_dev_log(fsw, pp, spkt, dpkt, err); |
| 2576 | return err; |
| 2577 | } |
| 2578 | |
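|      | /*
|      |  * Copy only the packet metadata and an initial header region (at most
|      |  * 128 bytes) into the device packet, recording the full length and
|      |  * marking the result PKT_F_TRUNCATED; the remaining payload is
|      |  * presumably copied later by the caller (e.g. a segmentation path).
|      |  */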
| 2579 | static int |
| 2580 | dp_copy_headers_to_dev(struct nx_flowswitch *fsw, struct __kern_packet *spkt,
| 2581 | struct __kern_packet *dpkt) |
| 2582 | { |
| 2583 | uint8_t *sbaddr, *dbaddr; |
| 2584 | uint16_t headroom = fsw->fsw_frame_headroom + fsw->fsw_ifp->if_tx_headroom; |
| 2585 | uint16_t hdrs_len_estimate = (uint16_t)MIN(spkt->pkt_length, 128); |
| 2586 | |
| 2587 | fsw_pkt_copy_metadata(spkt, dpkt); |
| 2588 | |
| 2589 | MD_BUFLET_ADDR_ABS(spkt, sbaddr); |
| 2590 | ASSERT(sbaddr != NULL); |
| 2591 | sbaddr += spkt->pkt_headroom; |
| 2592 | |
| 2593 | MD_BUFLET_ADDR_ABS(dpkt, dbaddr); |
| 2594 | ASSERT(dbaddr != NULL); |
| 2595 | dpkt->pkt_headroom = (uint8_t)headroom; |
| 2596 | dbaddr += headroom; |
| 2597 | |
| 2598 | pkt_copy(sbaddr, dbaddr, hdrs_len_estimate);
| 2599 | METADATA_SET_LEN(dpkt, hdrs_len_estimate, headroom); |
| 2600 | |
| 2601 | /* packet length is set to the full length */ |
| 2602 | dpkt->pkt_length = spkt->pkt_length; |
| 2603 | dpkt->pkt_pflags |= PKT_F_TRUNCATED; |
| 2604 | return 0; |
| 2605 | } |
| 2606 | |
| 2607 | static struct mbuf * |
| 2608 | convert_pkt_to_mbuf(struct __kern_packet *pkt) |
| 2609 | { |
| 2610 | ASSERT(pkt->pkt_pflags & PKT_F_MBUF_DATA); |
| 2611 | ASSERT(pkt->pkt_mbuf != NULL); |
| 2612 | struct mbuf *m = pkt->pkt_mbuf; |
| 2613 | |
| 2614 | /* pass additional metadata generated from flow parse/lookup */ |
| 2615 | _CASSERT(sizeof(m->m_pkthdr.pkt_flowid) == |
| 2616 | sizeof(pkt->pkt_flow_token)); |
| 2617 | _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_srcid) == |
| 2618 | sizeof(pkt->pkt_flowsrc_token)); |
| 2619 | _CASSERT(sizeof(m->m_pkthdr.pkt_mpriv_fidx) == |
| 2620 | sizeof(pkt->pkt_flowsrc_fidx)); |
| 2621 | m->m_pkthdr.pkt_svc = pkt->pkt_svc_class; |
| 2622 | m->m_pkthdr.pkt_proto = pkt->pkt_flow->flow_ip_proto; |
| 2623 | m->m_pkthdr.pkt_flowid = pkt->pkt_flow_token; |
| 2624 | m->m_pkthdr.comp_gencnt = pkt->pkt_comp_gencnt; |
| 2625 | m->m_pkthdr.pkt_flowsrc = pkt->pkt_flowsrc_type; |
| 2626 | m->m_pkthdr.pkt_mpriv_srcid = pkt->pkt_flowsrc_token; |
| 2627 | m->m_pkthdr.pkt_mpriv_fidx = pkt->pkt_flowsrc_fidx; |
| 2628 | |
| 2629 | if (pkt->pkt_transport_protocol == IPPROTO_QUIC) { |
| 2630 | m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_QUIC; |
| 2631 | } |
| 2632 | |
| 2633 | /* The packet should have a timestamp by the time we get here. */ |
| 2634 | m->m_pkthdr.pkt_timestamp = pkt->pkt_timestamp; |
| 2635 | m->m_pkthdr.pkt_flags &= ~PKTF_TS_VALID; |
| 2636 | |
| 2637 | m->m_pkthdr.pkt_flags &= ~PKT_F_COMMON_MASK; |
| 2638 | m->m_pkthdr.pkt_flags |= (pkt->pkt_pflags & PKT_F_COMMON_MASK); |
| 2639 | /* set pkt_hdr so that AQM can find IP header and mark ECN bits */ |
| 2640 | m->m_pkthdr.pkt_hdr = m_mtod_current(m) + pkt->pkt_l2_len; |
| 2641 | |
| 2642 | if ((pkt->pkt_pflags & PKT_F_START_SEQ) != 0) { |
| 2643 | m->m_pkthdr.tx_start_seq = ntohl(pkt->pkt_flow_tcp_seq); |
| 2644 | } |
| 2645 | KPKT_CLEAR_MBUF_DATA(pkt); |
| 2646 | |
| 2647 | /* mbuf has been consumed, release packet as well */ |
| 2648 | ASSERT(pkt->pkt_qum.qum_ksd == NULL); |
| 2649 | pp_free_packet_single(pkt); |
| 2650 | return m; |
| 2651 | } |
| 2652 | |
| 2653 | static void |
| 2654 | convert_pkt_to_mbuf_list(struct __kern_packet *pkt_list, |
| 2655 | struct mbuf **head, struct mbuf **tail, |
| 2656 | uint32_t *cnt, uint32_t *bytes) |
| 2657 | { |
| 2658 | struct __kern_packet *pkt = pkt_list, *next; |
| 2659 | struct mbuf *m_head = NULL, **m_tailp = &m_head, *m = NULL; |
| 2660 | uint32_t c = 0, b = 0; |
| 2661 | |
| 2662 | while (pkt != NULL) { |
| 2663 | next = pkt->pkt_nextpkt; |
| 2664 | pkt->pkt_nextpkt = NULL; |
| 2665 | m = convert_pkt_to_mbuf(pkt); |
| 2666 | ASSERT(m != NULL); |
| 2667 | |
| 2668 | *m_tailp = m; |
| 2669 | m_tailp = &m->m_nextpkt; |
| 2670 | c++; |
| 2671 | b += m_pktlen(m); |
| 2672 | pkt = next; |
| 2673 | } |
| 2674 | if (head != NULL) { |
| 2675 | *head = m_head; |
| 2676 | } |
| 2677 | if (tail != NULL) { |
| 2678 | *tail = m; |
| 2679 | } |
| 2680 | if (cnt != NULL) { |
| 2681 | *cnt = c; |
| 2682 | } |
| 2683 | if (bytes != NULL) { |
| 2684 | *bytes = b; |
| 2685 | } |
| 2686 | } |
| 2687 | |
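|      | /*
|      |  * Enqueue one packet onto the interface's AQM/classq. On compat
|      |  * (mbuf-based) interfaces the packet is first converted to an mbuf;
|      |  * in both cases ifnet_enqueue_*() consumes what it is given.
|      |  */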
| 2688 | SK_NO_INLINE_ATTRIBUTE |
| 2689 | static int |
| 2690 | classq_enqueue_flow_single(struct nx_flowswitch *fsw, |
| 2691 | struct __kern_packet *pkt) |
| 2692 | { |
| 2693 | struct ifnet *ifp = fsw->fsw_ifp; |
| 2694 | boolean_t pkt_drop = FALSE; |
| 2695 | int err; |
| 2696 | |
| 2697 | FSW_LOCK_ASSERT_HELD(fsw); |
| 2698 | ASSERT(fsw->fsw_classq_enabled); |
| 2699 | ASSERT(pkt->pkt_flow_token != 0); |
| 2700 | fsw_ifp_inc_traffic_class_out_pkt(ifp, pkt->pkt_svc_class, |
| 2701 | 1, pkt->pkt_length); |
| 2702 | |
| 2703 | if (__improbable(pkt->pkt_trace_id != 0)) { |
| 2704 | KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_END, pkt->pkt_trace_id); |
| 2705 | KDBG(SK_KTRACE_PKT_TX_AQM | DBG_FUNC_START, pkt->pkt_trace_id); |
| 2706 | } |
| 2707 | |
| 2708 | switch (fsw->fsw_classq_enq_ptype) { |
| 2709 | case QP_MBUF: { /* compat interface */ |
| 2710 | struct mbuf *m; |
| 2711 | |
| 2712 | m = convert_pkt_to_mbuf(pkt); |
| 2713 | ASSERT(m != NULL); |
| 2714 | pkt = NULL; |
| 2715 | |
| 2716 | /* ifnet_enqueue consumes mbuf */ |
| 2717 | err = ifnet_enqueue_mbuf(ifp, m, false, &pkt_drop); |
| 2718 | m = NULL; |
| 2719 | #if (DEVELOPMENT || DEBUG) |
| 2720 | if (__improbable(!pkt_drop)) { |
| 2721 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
| 2722 | } |
| 2723 | #endif /* DEVELOPMENT || DEBUG */ |
| 2724 | if (pkt_drop) { |
| 2725 | FSW_STATS_INC(FSW_STATS_DROP); |
| 2726 | FSW_STATS_INC(FSW_STATS_TX_AQM_DROP); |
| 2727 | } |
| 2728 | break; |
| 2729 | } |
| 2730 | case QP_PACKET: { /* native interface */ |
| 2731 | /* ifnet_enqueue consumes packet */ |
| 2732 | err = ifnet_enqueue_pkt(ifp, pkt, false, &pkt_drop); |
| 2733 | pkt = NULL; |
| 2734 | #if (DEVELOPMENT || DEBUG) |
| 2735 | if (__improbable(!pkt_drop)) { |
| 2736 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
| 2737 | } |
| 2738 | #endif /* DEVELOPMENT || DEBUG */ |
| 2739 | if (pkt_drop) { |
| 2740 | FSW_STATS_INC(FSW_STATS_DROP); |
| 2741 | FSW_STATS_INC(FSW_STATS_TX_AQM_DROP); |
| 2742 | } |
| 2743 | break; |
| 2744 | } |
| 2745 | default: |
| 2746 | err = EINVAL; |
| 2747 | VERIFY(0); |
| 2748 | /* NOTREACHED */ |
| 2749 | __builtin_unreachable(); |
| 2750 | } |
| 2751 | |
| 2752 | return err; |
| 2753 | } |
| 2754 | |
| 2755 | static int |
| 2756 | classq_enqueue_flow_chain(struct nx_flowswitch *fsw, |
| 2757 | struct __kern_packet *pkt_head, struct __kern_packet *pkt_tail, |
| 2758 | uint32_t cnt, uint32_t bytes) |
| 2759 | { |
| 2760 | struct ifnet *ifp = fsw->fsw_ifp; |
| 2761 | boolean_t pkt_drop = FALSE; |
| 2762 | uint32_t svc; |
| 2763 | int err; |
| 2764 | |
| 2765 | FSW_LOCK_ASSERT_HELD(fsw); |
| 2766 | ASSERT(fsw->fsw_classq_enabled); |
| 2767 | ASSERT(pkt_head->pkt_flow_token != 0); |
| 2768 | |
| 2769 | /* |
| 2770 | * All packets in the flow should have the same svc. |
| 2771 | */ |
| 2772 | svc = pkt_head->pkt_svc_class; |
| 2773 | fsw_ifp_inc_traffic_class_out_pkt(ifp, svc, cnt, bytes); |
| 2774 | |
| 2775 | switch (fsw->fsw_classq_enq_ptype) { |
| 2776 | case QP_MBUF: { /* compat interface */ |
| 2777 | struct mbuf *m_head = NULL, *m_tail = NULL; |
| 2778 | uint32_t c = 0, b = 0; |
| 2779 | |
| 2780 | convert_pkt_to_mbuf_list(pkt_head, &m_head, &m_tail, &c, &b);
| 2781 | ASSERT(m_head != NULL && m_tail != NULL); |
| 2782 | ASSERT(c == cnt); |
| 2783 | ASSERT(b == bytes); |
| 2784 | pkt_head = NULL; |
| 2785 | |
| 2786 | /* ifnet_enqueue consumes mbuf */ |
| 2787 | err = ifnet_enqueue_mbuf_chain(ifp, m_head, m_tail, cnt, |
| 2788 | bytes, FALSE, &pkt_drop); |
| 2789 | m_head = NULL; |
| 2790 | m_tail = NULL; |
| 2791 | #if (DEVELOPMENT || DEBUG) |
| 2792 | if (__improbable(!pkt_drop)) { |
| 2793 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
| 2794 | } |
| 2795 | #endif /* DEVELOPMENT || DEBUG */ |
| 2796 | if (pkt_drop) { |
| 2797 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt); |
| 2798 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, |
| 2799 | cnt); |
| 2800 | } |
| 2801 | break; |
| 2802 | } |
| 2803 | case QP_PACKET: { /* native interface */ |
| 2804 | /* ifnet_enqueue consumes packet */ |
| 2805 | err = ifnet_enqueue_pkt_chain(ifp, pkt_head, pkt_tail, cnt, |
| 2806 | bytes, FALSE, &pkt_drop); |
| 2807 | pkt_head = NULL; |
| 2808 | #if (DEVELOPMENT || DEBUG) |
| 2809 | if (__improbable(!pkt_drop)) { |
| 2810 | _FSW_INJECT_ERROR(14, pkt_drop, TRUE, null_func); |
| 2811 | } |
| 2812 | #endif /* DEVELOPMENT || DEBUG */ |
| 2813 | if (pkt_drop) { |
| 2814 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, cnt); |
| 2815 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, |
| 2816 | cnt); |
| 2817 | } |
| 2818 | break; |
| 2819 | } |
| 2820 | default: |
| 2821 | err = EINVAL; |
| 2822 | VERIFY(0); |
| 2823 | /* NOTREACHED */ |
| 2824 | __builtin_unreachable(); |
| 2825 | } |
| 2826 | |
| 2827 | return err; |
| 2828 | } |
| 2829 | |
| 2830 | /* |
| 2831 | * This code path needs to be kept for interfaces without logical link support. |
| 2832 | */ |
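|      | /*
|      |  * Enqueue a flow's TX pktq onto the interface classq, either as one
|      |  * chain or packet by packet. If AQM reports EQFULL/EQSUSPENDED and
|      |  * the flow supports flow advisories, the flow-advisory entry is set
|      |  * and the owning channel ring is notified so the sender backs off.
|      |  */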
| 2833 | static void |
| 2834 | classq_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe, |
| 2835 | bool chain, uint32_t cnt, uint32_t bytes) |
| 2836 | { |
| 2837 | bool flowadv_is_set = false; |
| 2838 | struct __kern_packet *pkt, *tail, *tpkt; |
| 2839 | flowadv_idx_t flow_adv_idx; |
| 2840 | bool flowadv_cap; |
| 2841 | flowadv_token_t flow_adv_token; |
| 2842 | int err; |
| 2843 | |
| 2844 | SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts" , |
| 2845 | if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq)); |
| 2846 | |
| 2847 | if (chain) { |
| 2848 | pkt = KPKTQ_FIRST(&fe->fe_tx_pktq); |
| 2849 | tail = KPKTQ_LAST(&fe->fe_tx_pktq); |
| 2850 | KPKTQ_INIT(&fe->fe_tx_pktq); |
| 2851 | if (pkt == NULL) { |
| 2852 | return; |
| 2853 | } |
| 2854 | flow_adv_idx = pkt->pkt_flowsrc_fidx; |
| 2855 | flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0); |
| 2856 | flow_adv_token = pkt->pkt_flow_token; |
| 2857 | |
| 2858 | err = classq_enqueue_flow_chain(fsw, pkt, tail, cnt, bytes);
| 2859 | |
| 2860 | /* set flow advisory if needed */ |
| 2861 | if (__improbable((err == EQFULL || err == EQSUSPENDED) && |
| 2862 | flowadv_cap)) { |
| 2863 | flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe), |
| 2864 | flow_adv_idx, flow_adv_token); |
| 2865 | } |
| 2866 | DTRACE_SKYWALK3(chain__enqueue, uint32_t, cnt, uint32_t, bytes, |
| 2867 | bool, flowadv_is_set); |
| 2868 | } else { |
| 2869 | uint32_t c = 0, b = 0; |
| 2870 | |
| 2871 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
| 2872 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
| 2873 | |
| 2874 | flow_adv_idx = pkt->pkt_flowsrc_fidx; |
| 2875 | flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0); |
| 2876 | flow_adv_token = pkt->pkt_flow_token; |
| 2877 | |
| 2878 | c++; |
| 2879 | b += pkt->pkt_length; |
| 2880 | err = classq_enqueue_flow_single(fsw, pkt); |
| 2881 | |
| 2882 | /* set flow advisory if needed */ |
| 2883 | if (__improbable(!flowadv_is_set && |
| 2884 | ((err == EQFULL || err == EQSUSPENDED) && |
| 2885 | flowadv_cap))) { |
| 2886 | flowadv_is_set = na_flowadv_set( |
| 2887 | flow_get_na(fsw, fe), flow_adv_idx, |
| 2888 | flow_adv_token); |
| 2889 | } |
| 2890 | } |
| 2891 | ASSERT(c == cnt); |
| 2892 | ASSERT(b == bytes); |
| 2893 | DTRACE_SKYWALK3(non__chain__enqueue, uint32_t, cnt, uint32_t, bytes, |
| 2894 | bool, flowadv_is_set); |
| 2895 | } |
| 2896 | |
| 2897 | /* notify flow advisory event */ |
| 2898 | if (__improbable(flowadv_is_set)) { |
| 2899 | struct __kern_channel_ring *r = fsw_flow_get_tx_ring(fsw, fe); |
| 2900 | if (__probable(r)) { |
| 2901 | na_flowadv_event(r); |
| 2902 | SK_DF(SK_VERB_FLOW_ADVISORY | SK_VERB_TX, |
| 2903 | "%s(%d) notified of flow update" , |
| 2904 | sk_proc_name_address(current_proc()), |
| 2905 | sk_proc_pid(current_proc())); |
| 2906 | } |
| 2907 | } |
| 2908 | } |
| 2909 | |
| 2910 | /* |
| 2911 | * Logical link code path |
| 2912 | */ |
| 2913 | static void |
| 2914 | classq_qset_enqueue_flow(struct nx_flowswitch *fsw, struct flow_entry *fe, |
| 2915 | bool chain, uint32_t cnt, uint32_t bytes) |
| 2916 | { |
| 2917 | #pragma unused(chain) |
| 2918 | struct __kern_packet *pkt, *tail; |
| 2919 | flowadv_idx_t flow_adv_idx; |
| 2920 | bool flowadv_is_set = false; |
| 2921 | bool flowadv_cap; |
| 2922 | flowadv_token_t flow_adv_token; |
| 2923 | uint32_t flowctl = 0, dropped = 0; |
| 2924 | int err; |
| 2925 | |
| 2926 | SK_DF(SK_VERB_FSW_DP | SK_VERB_AQM, "%s classq enqueued %d pkts" , |
| 2927 | if_name(fsw->fsw_ifp), KPKTQ_LEN(&fe->fe_tx_pktq)); |
| 2928 | |
| 2929 | pkt = KPKTQ_FIRST(&fe->fe_tx_pktq); |
| 2930 | tail = KPKTQ_LAST(&fe->fe_tx_pktq); |
| 2931 | KPKTQ_INIT(&fe->fe_tx_pktq); |
| 2932 | if (pkt == NULL) { |
| 2933 | return; |
| 2934 | } |
| 2935 | flow_adv_idx = pkt->pkt_flowsrc_fidx; |
| 2936 | flowadv_cap = ((pkt->pkt_pflags & PKT_F_FLOW_ADV) != 0); |
| 2937 | flow_adv_token = pkt->pkt_flow_token; |
| 2938 | |
| 2939 | err = netif_qset_enqueue(fe->fe_qset, pkt, tail, cnt, bytes, |
| 2940 | &flowctl, &dropped); |
| 2941 | |
| 2942 | if (__improbable(err != 0)) { |
| 2943 | /* set flow advisory if needed */ |
| 2944 | if (flowctl > 0 && flowadv_cap) { |
| 2945 | flowadv_is_set = na_flowadv_set(flow_get_na(fsw, fe), |
| 2946 | flow_adv_idx, flow_adv_token); |
| 2947 | |
| 2948 | /* notify flow advisory event */ |
| 2949 | if (flowadv_is_set) { |
| 2950 | struct __kern_channel_ring *r = |
| 2951 | fsw_flow_get_tx_ring(fsw, fe); |
| 2952 | if (__probable(r)) { |
| 2953 | na_flowadv_event(r); |
| 2954 | SK_DF(SK_VERB_FLOW_ADVISORY | |
| 2955 | SK_VERB_TX, |
| 2956 | "%s(%d) notified of flow update" , |
| 2957 | sk_proc_name_address(current_proc()), |
| 2958 | sk_proc_pid(current_proc())); |
| 2959 | } |
| 2960 | } |
| 2961 | } |
| 2962 | if (dropped > 0) { |
| 2963 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_DROP, dropped); |
| 2964 | STATS_ADD(&fsw->fsw_stats, FSW_STATS_TX_AQM_DROP, |
| 2965 | dropped); |
| 2966 | } |
| 2967 | } |
| 2968 | } |
| 2969 | |
| 2970 | static void |
| 2971 | tx_finalize_packet(struct nx_flowswitch *fsw, struct __kern_packet *pkt) |
| 2972 | { |
| 2973 | #pragma unused(fsw) |
| 2974 | /* finalize here; no more changes to buflets after classq */ |
| 2975 | if (__probable(!(pkt->pkt_pflags & PKT_F_MBUF_DATA))) { |
| 2976 | kern_packet_t ph = SK_PTR_ENCODE(pkt, |
| 2977 | METADATA_TYPE(pkt), METADATA_SUBTYPE(pkt)); |
| 2978 | int err = __packet_finalize(ph); |
| 2979 | VERIFY(err == 0); |
| 2980 | } |
| 2981 | } |
| 2982 | |
| 2983 | static bool |
| 2984 | dp_flow_tx_route_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
| 2985 | { |
| 2986 | struct flow_route *fr = fe->fe_route; |
| 2987 | int err; |
| 2988 | |
| 2989 | ASSERT(fr != NULL); |
| 2990 | |
| 2991 | if (__improbable(!dp_flow_route_process(fsw, fe))) { |
| 2992 | return false; |
| 2993 | } |
| 2994 | if (fe->fe_qset_select == FE_QSET_SELECT_DYNAMIC) { |
| 2995 | flow_qset_select_dynamic(fsw, fe, TRUE); |
| 2996 | } |
| 2997 | |
| 2998 | _FSW_INJECT_ERROR(35, fr->fr_flags, fr->fr_flags, |
| 2999 | _fsw_error35_handler, 1, fr, NULL, NULL); |
| 3000 | _FSW_INJECT_ERROR(36, fr->fr_flags, fr->fr_flags, |
| 3001 | _fsw_error36_handler, 1, fr, NULL); |
| 3002 | |
| 3003 | /* |
| 3004 | * See if we need to resolve the flow route; note the test against |
| 3005 | * fr_flags here is done without any lock for performance. Thus |
| 3006 | * it's possible that we race against the thread performing route |
| 3007 | * event updates for a packet (which is OK). In any case we should |
| 3008 | * not have any assertion on fr_flags value(s) due to the lack of |
| 3009 | * serialization. |
| 3010 | */ |
| 3011 | if (fr->fr_flags & FLOWRTF_RESOLVED) { |
| 3012 | goto frame; |
| 3013 | } |
| 3014 | |
| 3015 | struct __kern_packet *pkt, *tpkt; |
| 3016 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
| 3017 | err = fsw->fsw_resolve(fsw, fr, pkt); |
| 3018 | _FSW_INJECT_ERROR_SET(35, _fsw_error35_handler, 2, fr, pkt, &err); |
| 3019 | _FSW_INJECT_ERROR_SET(36, _fsw_error36_handler, 2, fr, &err); |
| 3020 | /* |
| 3021 | * If resolver returns EJUSTRETURN then we drop the pkt as the |
| 3022 | * resolver should have converted the pkt into mbuf (or |
| 3023 | * detached the attached mbuf from pkt) and added it to the |
| 3024 | * llinfo queue. If we do have a cached llinfo, then proceed |
| 3025 | * to using it even though it may be stale (very unlikely) |
| 3026 | * while the resolution is in progress. |
| 3027 | * Otherwise, any other error results in dropping pkt. |
| 3028 | */ |
| 3029 | if (err == EJUSTRETURN) { |
| 3030 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
| 3031 | pp_free_packet_single(pkt); |
| 3032 | FSW_STATS_INC(FSW_STATS_TX_RESOLV_PENDING); |
| 3033 | continue; |
| 3034 | } else if (err != 0 && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) { |
| 3035 | /* use existing llinfo */ |
| 3036 | FSW_STATS_INC(FSW_STATS_TX_RESOLV_STALE); |
| 3037 | } else if (err != 0) { |
| 3038 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
| 3039 | pp_free_packet_single(pkt); |
| 3040 | FSW_STATS_INC(FSW_STATS_TX_RESOLV_FAIL); |
| 3041 | continue; |
| 3042 | } |
| 3043 | } |
| 3044 | |
| 3045 | frame: |
| 3046 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
| 3047 | if (fsw->fsw_frame != NULL) { |
| 3048 | fsw->fsw_frame(fsw, fr, pkt); |
| 3049 | } |
| 3050 | } |
| 3051 | |
| 3052 | return true; |
| 3053 | } |
| 3054 | |
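| | /*
| |  * A listener (passive) flow has no established peer, so the only thing
| |  * it is allowed to transmit is a TCP RST (e.g. to refuse a connection).
| |  * Anything else is logged and dropped below.
| |  */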
| 3055 | static void |
| 3056 | dp_listener_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe) |
| 3057 | { |
| 3058 | #pragma unused(fsw) |
| 3059 | struct __kern_packet *pkt, *tpkt; |
| 3060 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
| 3061 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
| 3062 | /* listener is only allowed TCP RST */ |
| 3063 | if (pkt->pkt_flow_ip_proto == IPPROTO_TCP && |
| 3064 | (pkt->pkt_flow_tcp_flags & TH_RST) != 0) { |
| 3065 | flow_track_abort_tcp(fe, NULL, pkt);
| 3066 | } else {
| 3067 | char *addr;
| 3068 | MD_BUFLET_ADDR_ABS(pkt, addr);
| 3069 | SK_ERR("listener flow sends non-RST packet %s",
| 3070 | sk_dump(sk_proc_name_address(current_proc()), |
| 3071 | addr, pkt->pkt_length, 128, NULL, 0)); |
| 3072 | } |
| 3073 | pp_free_packet_single(pkt); |
| 3074 | } |
| 3075 | } |
| 3076 | |
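| | /*
| |  * Stamp the packet with the current uptime if the caller didn't supply
| |  * a valid timestamp, then record recent foreground/realtime send
| |  * activity on the interface and, when provided, in the nexus advisory.
| |  */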
| 3077 | static void |
| 3078 | fsw_update_timestamps(struct __kern_packet *pkt, volatile uint64_t *fg_ts, |
| 3079 | volatile uint64_t *rt_ts, ifnet_t ifp) |
| 3080 | { |
| 3081 | struct timespec now; |
| 3082 | uint64_t now_nsec = 0; |
| 3083 | |
| 3084 | if (!(pkt->pkt_pflags & PKT_F_TS_VALID) || pkt->pkt_timestamp == 0) { |
| 3085 | nanouptime(&now);
| 3086 | net_timernsec(&now, &now_nsec); |
| 3087 | pkt->pkt_timestamp = now_nsec; |
| 3088 | } |
| 3089 | pkt->pkt_pflags &= ~PKT_F_TS_VALID; |
| 3090 | |
| 3091 | /* |
| 3092 | * If the packet service class is not background, |
| 3093 | * update the timestamps on the interface, as well as |
| 3094 | * the ones in nexus-wide advisory to indicate recent |
| 3095 | * activity on a foreground flow. |
| 3096 | */ |
| 3097 | if (!(pkt->pkt_pflags & PKT_F_BACKGROUND)) { |
| 3098 | ifp->if_fg_sendts = (uint32_t)_net_uptime; |
| 3099 | if (fg_ts != NULL) { |
| 3100 | *fg_ts = _net_uptime; |
| 3101 | } |
| 3102 | } |
| 3103 | if (pkt->pkt_pflags & PKT_F_REALTIME) { |
| 3104 | ifp->if_rt_sendts = (uint32_t)_net_uptime; |
| 3105 | if (rt_ts != NULL) { |
| 3106 | *rt_ts = _net_uptime; |
| 3107 | } |
| 3108 | } |
| 3109 | } |
| 3110 | |
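| | /*
| |  * Chain enqueue hands a whole packet chain to AQM in one call; it is
| |  * used only when fsw_chain_enqueue is set, the interface has no netem
| |  * output shaper attached, IFEF_ENQUEUE_MULTI is not set, and the
| |  * caller is on the GSO path.
| |  */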
| 3111 | static bool |
| 3112 | fsw_chain_enqueue_enabled(struct nx_flowswitch *fsw, bool gso_enabled) |
| 3113 | { |
| 3114 | return fsw_chain_enqueue != 0 && |
| 3115 | fsw->fsw_ifp->if_output_netem == NULL && |
| 3116 | (fsw->fsw_ifp->if_eflags & IFEF_ENQUEUE_MULTI) == 0 && |
| 3117 | gso_enabled; |
| 3118 | } |
| 3119 | |
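| | /*
| |  * Per-flow TX processing: run flow tracking on each packet, tag it with
| |  * flow/policy/AQM metadata, optionally pre-stamp timestamps for chain
| |  * enqueue, finalize it, then hand the flow's queue to classq (or to the
| |  * flow's qset when one is configured).
| |  */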
| 3120 | void |
| 3121 | dp_flow_tx_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
| 3122 | uint32_t flags) |
| 3123 | { |
| 3124 | struct pktq dropped_pkts; |
| 3125 | bool chain, gso = ((flags & FLOW_PROC_FLAG_GSO) != 0); |
| 3126 | uint32_t cnt = 0, bytes = 0; |
| 3127 | volatile struct sk_nexusadv *nxadv = NULL; |
| 3128 | volatile uint64_t *fg_ts = NULL; |
| 3129 | volatile uint64_t *rt_ts = NULL; |
| 3130 | uint8_t qset_idx = (fe->fe_qset != NULL) ? fe->fe_qset->nqs_idx : 0; |
| 3131 | |
| 3132 | KPKTQ_INIT(&dropped_pkts); |
| 3133 | ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq)); |
| 3134 | if (__improbable(fe->fe_flags & FLOWENTF_LISTENER)) { |
| 3135 | dp_listener_flow_tx_process(fsw, fe); |
| 3136 | return; |
| 3137 | } |
| 3138 | if (__improbable(!dp_flow_tx_route_process(fsw, fe))) { |
| 3139 | SK_RDERR(5, "Tx route bad");
| 3140 | FSW_STATS_ADD(FSW_STATS_TX_FLOW_NONVIABLE, |
| 3141 | KPKTQ_LEN(&fe->fe_tx_pktq)); |
| 3142 | KPKTQ_CONCAT(&dropped_pkts, &fe->fe_tx_pktq); |
| 3143 | goto done; |
| 3144 | } |
| 3145 | chain = fsw_chain_enqueue_enabled(fsw, gso);
| 3146 | if (chain) { |
| 3147 | nxadv = fsw->fsw_nx->nx_adv.flowswitch_nxv_adv; |
| 3148 | if (nxadv != NULL) { |
| 3149 | fg_ts = &nxadv->nxadv_fg_sendts; |
| 3150 | rt_ts = &nxadv->nxadv_rt_sendts; |
| 3151 | } |
| 3152 | } |
| 3153 | struct __kern_packet *pkt, *tpkt; |
| 3154 | KPKTQ_FOREACH_SAFE(pkt, &fe->fe_tx_pktq, tpkt) { |
| 3155 | int err = 0; |
| 3156 | |
| 3157 | err = flow_pkt_track(fe, pkt, false); |
| 3158 | if (__improbable(err != 0)) { |
| 3159 | SK_RDERR(5, "flow_pkt_track failed (err %d)", err);
| 3160 | FSW_STATS_INC(FSW_STATS_TX_FLOW_TRACK_ERR); |
| 3161 | KPKTQ_REMOVE(&fe->fe_tx_pktq, pkt); |
| 3162 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
| 3163 | continue; |
| 3164 | } |
| 3165 | _UUID_COPY(pkt->pkt_policy_euuid, fe->fe_eproc_uuid); |
| 3166 | pkt->pkt_transport_protocol = fe->fe_transport_protocol; |
| 3167 | |
| 3168 | /* set AQM related values for outgoing packet */ |
| 3169 | if (fe->fe_adv_idx != FLOWADV_IDX_NONE) { |
| 3170 | pkt->pkt_pflags |= PKT_F_FLOW_ADV; |
| 3171 | pkt->pkt_flowsrc_type = FLOWSRC_CHANNEL; |
| 3172 | pkt->pkt_flowsrc_fidx = fe->fe_adv_idx; |
| 3173 | } else { |
| 3174 | pkt->pkt_pflags &= ~PKT_F_FLOW_ADV; |
| 3175 | } |
| 3176 | _UUID_CLEAR(pkt->pkt_flow_id); |
| 3177 | pkt->pkt_flow_token = fe->fe_flowid; |
| 3178 | pkt->pkt_pflags |= PKT_F_FLOW_ID; |
| 3179 | pkt->pkt_qset_idx = qset_idx; |
| 3180 | pkt->pkt_policy_id = fe->fe_policy_id; |
| 3181 | pkt->pkt_skip_policy_id = fe->fe_skip_policy_id; |
| 3182 | |
| 3183 | /* |
| 3184 | * The same code is exercised per packet for the non-chain case |
| 3185 | * (see ifnet_enqueue_ifclassq()). It's replicated here to avoid |
| 3186 | * re-walking the chain later. |
| 3187 | */ |
| 3188 | if (chain) { |
| 3189 | fsw_update_timestamps(pkt, fg_ts, rt_ts, fsw->fsw_ifp);
| 3190 | } |
| 3191 | /* mark packet tos/svc_class */ |
| 3192 | fsw_qos_mark(fsw, fe, pkt); |
| 3193 | |
| 3194 | tx_finalize_packet(fsw, pkt); |
| 3195 | bytes += pkt->pkt_length; |
| 3196 | cnt++; |
| 3197 | } |
| 3198 | |
| 3199 | /* snoop after it's finalized */ |
| 3200 | if (__improbable(pktap_total_tap_count != 0)) { |
| 3201 | fsw_snoop(fsw, fe, false); |
| 3202 | } |
| 3203 | if (fe->fe_qset != NULL) { |
| 3204 | classq_qset_enqueue_flow(fsw, fe, chain, cnt, bytes); |
| 3205 | } else { |
| 3206 | classq_enqueue_flow(fsw, fe, chain, cnt, bytes); |
| 3207 | } |
| 3208 | done: |
| 3209 | dp_drop_pktq(fsw, &dropped_pkts); |
| 3210 | } |
| 3211 | |
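| | /*
| |  * A non-first IP fragment carries no transport header, so it can only
| |  * be matched by fragment id against the flow that sent the first
| |  * fragment (prev_fe). Returns prev_fe on a match, NULL otherwise.
| |  */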
| 3212 | static struct flow_entry * |
| 3213 | tx_process_continuous_ip_frag(struct nx_flowswitch *fsw, |
| 3214 | struct flow_entry *prev_fe, struct __kern_packet *pkt) |
| 3215 | { |
| 3216 | ASSERT(!pkt->pkt_flow_ip_is_first_frag); |
| 3217 | |
| 3218 | if (__improbable(pkt->pkt_flow_ip_frag_id == 0)) { |
| 3219 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_ID); |
| 3220 | SK_ERR("%s(%d) invalid zero fragment id",
| 3221 | sk_proc_name_address(current_proc()), |
| 3222 | sk_proc_pid(current_proc())); |
| 3223 | return NULL; |
| 3224 | } |
| 3225 | |
| 3226 | SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, |
| 3227 | "%s(%d) continuation frag, id %u",
| 3228 | sk_proc_name_address(current_proc()), |
| 3229 | sk_proc_pid(current_proc()), |
| 3230 | pkt->pkt_flow_ip_frag_id); |
| 3231 | if (__improbable(prev_fe == NULL || |
| 3232 | !prev_fe->fe_tx_is_cont_frag)) { |
| 3233 | SK_ERR("%s(%d) unexpected continuation frag, id %u",
| 3234 | sk_proc_name_address(current_proc()),
| 3235 | sk_proc_pid(current_proc()),
| 3236 | pkt->pkt_flow_ip_frag_id);
| 3237 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
| 3238 | return NULL; |
| 3239 | } |
| 3240 | if (__improbable(pkt->pkt_flow_ip_frag_id != |
| 3241 | prev_fe->fe_tx_frag_id)) { |
| 3242 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
| 3243 | SK_ERR("%s(%d) wrong continuation frag id %u expecting %u",
| 3244 | sk_proc_name_address(current_proc()), |
| 3245 | sk_proc_pid(current_proc()), |
| 3246 | pkt->pkt_flow_ip_frag_id, |
| 3247 | prev_fe->fe_tx_frag_id); |
| 3248 | return NULL; |
| 3249 | } |
| 3250 | |
| 3251 | return prev_fe; |
| 3252 | } |
| 3253 | |
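| | /*
| |  * Look up the flow entry for an outbound packet and sanity-check it:
| |  * the entry must not be torn down and its UUID must match the packet's
| |  * flow id; otherwise the reference is released and the lookup fails.
| |  */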
| 3254 | static struct flow_entry * |
| 3255 | tx_lookup_flow(struct nx_flowswitch *fsw, struct __kern_packet *pkt, |
| 3256 | struct flow_entry *prev_fe) |
| 3257 | { |
| 3258 | struct flow_entry *fe; |
| 3259 | |
| 3260 | fe = lookup_flow_with_pkt(fsw, pkt, false, prev_fe); |
| 3261 | if (__improbable(fe == NULL)) { |
| 3262 | goto done; |
| 3263 | } |
| 3264 | |
| 3265 | if (__improbable(fe->fe_flags & FLOWENTF_TORN_DOWN)) { |
| 3266 | SK_RDERR(5, "Tx flow torn down");
| 3267 | FSW_STATS_INC(FSW_STATS_TX_FLOW_TORNDOWN);
| 3268 | flow_entry_release(&fe);
| 3269 | goto done; |
| 3270 | } |
| 3271 | |
| 3272 | _FSW_INJECT_ERROR(34, pkt->pkt_flow_id[0], fe->fe_uuid[0] + 1, |
| 3273 | null_func); |
| 3274 | |
| 3275 | if (__improbable(!_UUID_MATCH(pkt->pkt_flow_id, fe->fe_uuid))) { |
| 3276 | uuid_string_t flow_id_str, pkt_id_str; |
| 3277 | sk_uuid_unparse(fe->fe_uuid, flow_id_str); |
| 3278 | sk_uuid_unparse(pkt->pkt_flow_id, pkt_id_str); |
| 3279 | SK_ERR("pkt flow id %s != flow id %s", pkt_id_str, flow_id_str);
| 3280 | flow_entry_release(&fe);
| 3281 | FSW_STATS_INC(FSW_STATS_TX_FLOW_BAD_ID); |
| 3282 | } |
| 3283 | |
| 3284 | done: |
| 3285 | return fe; |
| 3286 | } |
| 3287 | |
| 3288 | static inline void |
| 3289 | tx_flow_process(struct nx_flowswitch *fsw, struct flow_entry *fe, |
| 3290 | uint32_t flags) |
| 3291 | { |
| 3292 | ASSERT(!KPKTQ_EMPTY(&fe->fe_tx_pktq)); |
| 3293 | ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) != 0); |
| 3294 | |
| 3295 | SK_DF(SK_VERB_FSW_DP | SK_VERB_TX, "TX %d pkts from fe %p port %d",
| 3296 | KPKTQ_LEN(&fe->fe_tx_pktq), fe, fe->fe_nx_port); |
| 3297 | |
| 3298 | /* flow related processing (default, agg, etc.) */ |
| 3299 | fe->fe_tx_process(fsw, fe, flags); |
| 3300 | |
| 3301 | KPKTQ_FINI(&fe->fe_tx_pktq); |
| 3302 | } |
| 3303 | |
| 3304 | #if SK_LOG |
| 3305 | static void |
| 3306 | dp_tx_log_pkt(uint64_t verb, char *desc, struct __kern_packet *pkt) |
| 3307 | { |
| 3308 | char *pkt_buf; |
| 3309 | MD_BUFLET_ADDR_ABS(pkt, pkt_buf); |
| 3310 | SK_DF(verb, "%s(%d) %s %s", sk_proc_name_address(current_proc()),
| 3311 | sk_proc_pid(current_proc()), desc, sk_dump("buf", pkt_buf,
| 3312 | pkt->pkt_length, 128, NULL, 0)); |
| 3313 | } |
| 3314 | #else /* !SK_LOG */ |
| 3315 | #define dp_tx_log_pkt(...) |
| 3316 | #endif /* !SK_LOG */ |
| 3317 | |
| 3318 | static inline struct ifnet * |
| 3319 | fsw_datamov_begin(struct nx_flowswitch *fsw) |
| 3320 | { |
| 3321 | struct ifnet *ifp; |
| 3322 | |
| 3323 | ifp = fsw->fsw_ifp; |
| 3324 | if (!ifnet_datamov_begin(ifp)) { |
| 3325 | DTRACE_SKYWALK1(ifnet__detached, struct ifnet *, ifp); |
| 3326 | return NULL; |
| 3327 | } |
| 3328 | return ifp; |
| 3329 | } |
| 3330 | |
| 3331 | static inline void |
| 3332 | fsw_datamov_end(struct nx_flowswitch *fsw) |
| 3333 | { |
| 3334 | ifnet_datamov_end(fsw->fsw_ifp); |
| 3335 | } |
| 3336 | |
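| | /*
| |  * Main TX path for a batch of packets from a user channel: copy each
| |  * source packet into a device-pool packet, demux and classify it, batch
| |  * packets per flow entry, run per-flow processing, and finally kick the
| |  * netif transmit.
| |  */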
| 3337 | static void |
| 3338 | dp_tx_pktq(struct nx_flowswitch *fsw, struct pktq *spktq) |
| 3339 | { |
| 3340 | struct __kern_packet *spkt, *pkt; |
| 3341 | struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); |
| 3342 | struct flow_entry *fe, *prev_fe; |
| 3343 | struct pktq dropped_pkts, dpktq; |
| 3344 | struct nexus_adapter *dev_na; |
| 3345 | struct kern_pbufpool *dev_pp; |
| 3346 | struct ifnet *ifp = NULL; |
| 3347 | sa_family_t af; |
| 3348 | uint32_t n_pkts, n_flows = 0; |
| 3349 | boolean_t do_pacing = FALSE; |
| 3350 | |
| 3351 | int err; |
| 3352 | KPKTQ_INIT(&dpktq); |
| 3353 | KPKTQ_INIT(&dropped_pkts); |
| 3354 | n_pkts = KPKTQ_LEN(spktq); |
| 3355 | |
| 3356 | FSW_RLOCK(fsw); |
| 3357 | if (__improbable(FSW_QUIESCED(fsw))) { |
| 3358 | DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw); |
| 3359 | SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
| 3360 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
| 3361 | goto done; |
| 3362 | } |
| 3363 | dev_na = fsw->fsw_dev_ch->ch_na; |
| 3364 | if (__improbable(dev_na == NULL)) { |
| 3365 | SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
| 3366 | FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts); |
| 3367 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
| 3368 | goto done; |
| 3369 | } |
| 3370 | ifp = fsw_datamov_begin(fsw); |
| 3371 | if (ifp == NULL) { |
| 3372 | SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
| 3373 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
| 3374 | goto done; |
| 3375 | } |
| 3376 | |
| 3377 | /* batch allocate enough packets */ |
| 3378 | dev_pp = na_kr_get_pp(dev_na, NR_TX); |
| 3379 | |
| 3380 | err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, n_pkts, NULL, |
| 3381 | NULL, SKMEM_NOSLEEP); |
| 3382 | #if DEVELOPMENT || DEBUG |
| 3383 | if (__probable(err != ENOMEM)) { |
| 3384 | _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq); |
| 3385 | } |
| 3386 | #endif /* DEVELOPMENT || DEBUG */ |
| 3387 | if (__improbable(err == ENOMEM)) { |
| 3388 | ASSERT(KPKTQ_EMPTY(&dpktq)); |
| 3389 | KPKTQ_CONCAT(&dropped_pkts, spktq); |
| 3390 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts); |
| 3391 | SK_ERR("failed to alloc %u pkts from device pool", n_pkts);
| 3392 | goto done; |
| 3393 | } else if (__improbable(err == EAGAIN)) { |
| 3394 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, |
| 3395 | (n_pkts - KPKTQ_LEN(&dpktq))); |
| 3396 | FSW_STATS_ADD(FSW_STATS_DROP, |
| 3397 | (n_pkts - KPKTQ_LEN(&dpktq))); |
| 3398 | } |
| 3399 | |
| 3400 | n_pkts = KPKTQ_LEN(&dpktq); |
| 3401 | prev_fe = NULL; |
| 3402 | KPKTQ_FOREACH(spkt, spktq) { |
| 3403 | if (n_pkts == 0) { |
| 3404 | break; |
| 3405 | } |
| 3406 | --n_pkts; |
| 3407 | |
| 3408 | KPKTQ_DEQUEUE(&dpktq, pkt); |
| 3409 | ASSERT(pkt != NULL); |
| 3410 | err = dp_copy_to_dev(fsw, spkt, pkt);
| 3411 | if (__improbable(err != 0)) { |
| 3412 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
| 3413 | continue; |
| 3414 | } |
| 3415 | |
| 3416 | do_pacing |= ((pkt->pkt_pflags & PKT_F_OPT_TX_TIMESTAMP) != 0); |
| 3417 | af = fsw_ip_demux(fsw, pkt); |
| 3418 | if (__improbable(af == AF_UNSPEC)) { |
| 3419 | dp_tx_log_pkt(SK_VERB_ERROR, "demux err", pkt);
| 3420 | FSW_STATS_INC(FSW_STATS_TX_DEMUX_ERR); |
| 3421 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
| 3422 | continue; |
| 3423 | } |
| 3424 | |
| 3425 | err = flow_pkt_classify(pkt, ifp, af, false); |
| 3426 | if (__improbable(err != 0)) { |
| 3427 | dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
| 3428 | FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR); |
| 3429 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
| 3430 | continue; |
| 3431 | } |
| 3432 | |
| 3433 | if (__improbable(pkt->pkt_flow_ip_is_frag && |
| 3434 | !pkt->pkt_flow_ip_is_first_frag)) { |
| 3435 | fe = tx_process_continuous_ip_frag(fsw, prev_fe, pkt); |
| 3436 | if (__probable(fe != NULL)) { |
| 3437 | flow_entry_retain(fe); |
| 3438 | goto flow_batch; |
| 3439 | } else { |
| 3440 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
| 3441 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
| 3442 | continue; |
| 3443 | } |
| 3444 | } |
| 3445 | |
| 3446 | fe = tx_lookup_flow(fsw, pkt, prev_fe); |
| 3447 | if (__improbable(fe == NULL)) { |
| 3448 | FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND); |
| 3449 | KPKTQ_ENQUEUE(&dropped_pkts, pkt); |
| 3450 | prev_fe = NULL; |
| 3451 | continue; |
| 3452 | } |
| 3453 | flow_batch: |
| 3454 | tx_flow_batch_packet(&fes, fe, pkt);
| 3455 | prev_fe = fe; |
| 3456 | } |
| 3457 | |
| 3458 | struct flow_entry *tfe = NULL; |
| 3459 | TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) { |
| 3460 | tx_flow_process(fsw, fe, 0);
| 3461 | TAILQ_REMOVE(&fes, fe, fe_tx_link);
| 3462 | fe->fe_tx_is_cont_frag = false;
| 3463 | fe->fe_tx_frag_id = 0;
| 3464 | flow_entry_release(&fe);
| 3465 | n_flows++; |
| 3466 | } |
| 3467 | |
| 3468 | done: |
| 3469 | FSW_RUNLOCK(fsw); |
| 3470 | if (n_flows > 0) { |
| 3471 | netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL | (do_pacing ? NETIF_XMIT_FLAG_PACING : 0)); |
| 3472 | } |
| 3473 | if (ifp != NULL) { |
| 3474 | fsw_datamov_end(fsw); |
| 3475 | } |
| 3476 | dp_drop_pktq(fsw, &dropped_pkts); |
| 3477 | KPKTQ_FINI(&dropped_pkts); |
| 3478 | KPKTQ_FINI(&dpktq); |
| 3479 | } |
| 3480 | |
| 3481 | static sa_family_t |
| 3482 | get_tso_af(struct __kern_packet *pkt) |
| 3483 | { |
| 3484 | packet_tso_flags_t tso_flags; |
| 3485 | |
| 3486 | tso_flags = pkt->pkt_csum_flags & PACKET_CSUM_TSO_FLAGS; |
| 3487 | if (tso_flags == PACKET_TSO_IPV4) { |
| 3488 | return AF_INET; |
| 3489 | } else if (tso_flags == PACKET_TSO_IPV6) { |
| 3490 | return AF_INET6; |
| 3491 | } else { |
| 3492 | panic("invalid tso flags: 0x%x\n", tso_flags);
| 3493 | /* NOTREACHED */ |
| 3494 | __builtin_unreachable(); |
| 3495 | } |
| 3496 | } |
| 3497 | |
| 3498 | static inline void |
| 3499 | update_flow_info(struct __kern_packet *pkt, void *iphdr, void *tcphdr, |
| 3500 | uint16_t payload_sz) |
| 3501 | { |
| 3502 | struct tcphdr *tcp = tcphdr; |
| 3503 | |
| 3504 | DTRACE_SKYWALK4(update__flow__info, struct __kern_packet *, pkt, |
| 3505 | void *, iphdr, void *, tcphdr, uint16_t, payload_sz); |
| 3506 | pkt->pkt_flow_ip_hdr = (mach_vm_address_t)iphdr; |
| 3507 | pkt->pkt_flow_tcp_hdr = (mach_vm_address_t)tcphdr; |
| 3508 | pkt->pkt_flow_tcp_flags = tcp->th_flags; |
| 3509 | pkt->pkt_flow_tcp_seq = tcp->th_seq; |
| 3510 | pkt->pkt_flow_ulen = payload_sz; |
| 3511 | } |
| 3512 | |
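| | /*
| |  * Software GSO: split one large TCP packet (orig_pkt) into MSS-sized
| |  * segments drawn from dev_pktq, fixing up the IP/TCP headers and
| |  * checksums of each segment. For example, with total_len = 4052,
| |  * total_hlen = 52 and mss = 1000, n_pkts = roundup(4052 - 52, 1000) /
| |  * 1000 = 4 segments.
| |  */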
| 3513 | static int |
| 3514 | do_gso(struct nx_flowswitch *fsw, int af, struct __kern_packet *orig_pkt, |
| 3515 | struct __kern_packet *first_pkt, struct pktq *dev_pktq, |
| 3516 | struct pktq *gso_pktq) |
| 3517 | { |
| 3518 | ifnet_t ifp = fsw->fsw_ifp; |
| 3519 | struct __kern_packet *pkt = first_pkt; |
| 3520 | uint8_t proto = pkt->pkt_flow_ip_proto; |
| 3521 | uint16_t ip_hlen = pkt->pkt_flow_ip_hlen; |
| 3522 | uint16_t tcp_hlen = pkt->pkt_flow_tcp_hlen; |
| 3523 | uint16_t total_hlen = ip_hlen + tcp_hlen; |
| 3524 | uint16_t mtu = (uint16_t)ifp->if_mtu; |
| 3525 | uint16_t mss = pkt->pkt_proto_seg_sz, payload_sz; |
| 3526 | uint32_t n, n_pkts, off = 0, total_len = orig_pkt->pkt_length; |
| 3527 | uint16_t headroom = fsw->fsw_frame_headroom + ifp->if_tx_headroom; |
| 3528 | kern_packet_t orig_ph = SK_PKT2PH(orig_pkt); |
| 3529 | uint8_t *orig_pkt_baddr; |
| 3530 | struct tcphdr *tcp; |
| 3531 | struct ip *ip; |
| 3532 | struct ip6_hdr *ip6; |
| 3533 | uint32_t tcp_seq; |
| 3534 | uint16_t ipid; |
| 3535 | uint32_t pseudo_hdr_csum, bufsz; |
| 3536 | |
| 3537 | ASSERT(headroom <= UINT8_MAX); |
| 3538 | if (proto != IPPROTO_TCP) { |
| 3539 | SK_ERR("invalid proto: %d", proto);
| 3540 | DTRACE_SKYWALK3(invalid__proto, struct nx_flowswitch *, |
| 3541 | fsw, ifnet_t, ifp, uint8_t, proto); |
| 3542 | return EINVAL; |
| 3543 | } |
| 3544 | if (mss == 0 || mss > (mtu - total_hlen)) { |
| 3545 | SK_ERR("invalid args: mss %d, mtu %d, total_hlen %d",
| 3546 | mss, mtu, total_hlen); |
| 3547 | DTRACE_SKYWALK5(invalid__args1, struct nx_flowswitch *, |
| 3548 | fsw, ifnet_t, ifp, uint16_t, mss, uint16_t, mtu, |
| 3549 | uint32_t, total_hlen); |
| 3550 | return EINVAL; |
| 3551 | } |
| 3552 | bufsz = PP_BUF_SIZE_DEF(pkt->pkt_qum.qum_pp); |
| 3553 | if ((headroom + total_hlen + mss) > bufsz) { |
| 3554 | SK_ERR("invalid args: headroom %d, total_hlen %d, "
| 3555 | "mss %d, bufsz %d", headroom, total_hlen, mss, bufsz);
| 3556 | DTRACE_SKYWALK6(invalid__args2, struct nx_flowswitch *, |
| 3557 | fsw, ifnet_t, ifp, uint16_t, headroom, uint16_t, |
| 3558 | total_hlen, uint16_t, mss, uint32_t, bufsz); |
| 3559 | return EINVAL; |
| 3560 | } |
| 3561 | n_pkts = (uint32_t)(SK_ROUNDUP((total_len - total_hlen), mss) / mss); |
| 3562 | |
| 3563 | ASSERT(pkt->pkt_headroom == headroom); |
| 3564 | ASSERT(pkt->pkt_length == total_len); |
| 3565 | ASSERT(pkt->pkt_l2_len == 0); |
| 3566 | ASSERT((pkt->pkt_qum.qum_qflags & QUM_F_FINALIZED) == 0); |
| 3567 | ASSERT((pkt->pkt_pflags & PKT_F_TRUNCATED) != 0); |
| 3568 | pkt->pkt_pflags &= ~PKT_F_TRUNCATED; |
| 3569 | pkt->pkt_proto_seg_sz = 0; |
| 3570 | pkt->pkt_csum_flags = 0; |
| 3571 | MD_BUFLET_ADDR_ABS(orig_pkt, orig_pkt_baddr); |
| 3572 | orig_pkt_baddr += orig_pkt->pkt_headroom; |
| 3573 | |
| 3574 | if (af == AF_INET) { |
| 3575 | ip = (struct ip *)pkt->pkt_flow_ip_hdr; |
| 3576 | tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr; |
| 3577 | ipid = ip->ip_id; |
| 3578 | pseudo_hdr_csum = in_pseudo(pkt->pkt_flow_ipv4_src.s_addr, |
| 3579 | pkt->pkt_flow_ipv4_dst.s_addr, 0); |
| 3580 | } else { |
| 3581 | ASSERT(af == AF_INET6); |
| 3582 | tcp = (struct tcphdr *)pkt->pkt_flow_tcp_hdr; |
| 3583 | pseudo_hdr_csum = in6_pseudo(&pkt->pkt_flow_ipv6_src, |
| 3584 | &pkt->pkt_flow_ipv6_dst, 0); |
| 3585 | } |
| 3586 | tcp_seq = ntohl(tcp->th_seq); |
| 3587 | |
| 3588 | for (n = 1, payload_sz = mss, off = total_hlen; off < total_len; |
| 3589 | off += payload_sz) { |
| 3590 | uint8_t *baddr, *baddr0; |
| 3591 | uint32_t partial; |
| 3592 | |
| 3593 | if (pkt == NULL) { |
| 3594 | n++; |
| 3595 | KPKTQ_DEQUEUE(dev_pktq, pkt); |
| 3596 | ASSERT(pkt != NULL); |
| 3597 | } |
| 3598 | MD_BUFLET_ADDR_ABS(pkt, baddr0); |
| 3599 | baddr = baddr0; |
| 3600 | baddr += headroom; |
| 3601 | |
| 3602 | /* Copy headers from the original packet */ |
| 3603 | if (n != 1) { |
| 3604 | ASSERT(pkt != first_pkt); |
| 3605 | pkt_copy(orig_pkt_baddr, baddr, total_hlen);
| 3606 | fsw_pkt_copy_metadata(first_pkt, pkt);
| 3607 | 
| 3608 | ASSERT((pkt->pkt_qum_qflags & QUM_F_FLOW_CLASSIFIED) != 0);
| 3609 | /* flow info still needs to be updated below */
| 3610 | bcopy(first_pkt->pkt_flow, pkt->pkt_flow,
| 3611 | sizeof(*pkt->pkt_flow));
| 3612 | pkt->pkt_trace_id = 0; |
| 3613 | ASSERT(pkt->pkt_headroom == headroom); |
| 3614 | } else { |
| 3615 | METADATA_SET_LEN(pkt, 0, 0); |
| 3616 | } |
| 3617 | baddr += total_hlen; |
| 3618 | |
| 3619 | /* Copy/checksum the payload from the original packet */ |
| 3620 | if (off + payload_sz > total_len) { |
| 3621 | payload_sz = (uint16_t)(total_len - off); |
| 3622 | } |
| 3623 | pkt_copypkt_sum(orig_ph, |
| 3624 | (uint16_t)(orig_pkt->pkt_headroom + off), |
| 3625 | SK_PKT2PH(pkt), headroom + total_hlen, payload_sz, |
| 3626 | &partial, TRUE); |
| 3627 | |
| 3628 | DTRACE_SKYWALK6(copy__csum, struct nx_flowswitch *, fsw, |
| 3629 | ifnet_t, ifp, uint8_t *, baddr, uint16_t, payload_sz, |
| 3630 | uint16_t, mss, uint32_t, partial); |
| 3631 | FSW_STATS_INC(FSW_STATS_TX_COPY_PKT2PKT); |
| 3632 | |
| 3633 | /* |
| 3634 | * Adjust header information and fill in the missing fields. |
| 3635 | */ |
| 3636 | if (af == AF_INET) { |
| 3637 | ip = (struct ip *)(void *)(baddr0 + pkt->pkt_headroom); |
| 3638 | tcp = (struct tcphdr *)(void *)((caddr_t)ip + ip_hlen); |
| 3639 | |
| 3640 | if (n != n_pkts) { |
| 3641 | tcp->th_flags &= ~(TH_FIN | TH_PUSH); |
| 3642 | } |
| 3643 | if (n != 1) { |
| 3644 | tcp->th_flags &= ~TH_CWR; |
| 3645 | tcp->th_seq = htonl(tcp_seq); |
| 3646 | } |
| 3647 | update_flow_info(pkt, ip, tcp, payload_sz);
| 3648 | 
| 3649 | ip->ip_id = htons((ipid)++);
| 3650 | ip->ip_len = htons(ip_hlen + tcp_hlen + payload_sz);
| 3651 | ip->ip_sum = 0;
| 3652 | ip->ip_sum = inet_cksum_buffer(ip, 0, 0, ip_hlen);
| 3653 | tcp->th_sum = 0;
| 3654 | partial = __packet_cksum(tcp, tcp_hlen, partial);
| 3655 | partial += htons(tcp_hlen + IPPROTO_TCP + payload_sz); |
| 3656 | partial += pseudo_hdr_csum; |
| 3657 | ADDCARRY(partial); |
| 3658 | tcp->th_sum = ~(uint16_t)partial; |
| 3659 | } else { |
| 3660 | ASSERT(af == AF_INET6); |
| 3661 | ip6 = (struct ip6_hdr *)(baddr0 + pkt->pkt_headroom); |
| 3662 | tcp = (struct tcphdr *)(void *)((caddr_t)ip6 + ip_hlen); |
| 3663 | |
| 3664 | if (n != n_pkts) { |
| 3665 | tcp->th_flags &= ~(TH_FIN | TH_PUSH); |
| 3666 | } |
| 3667 | if (n != 1) { |
| 3668 | tcp->th_flags &= ~TH_CWR; |
| 3669 | tcp->th_seq = htonl(tcp_seq); |
| 3670 | } |
| 3671 | update_flow_info(pkt, ip6, tcp, payload_sz);
| 3672 | 
| 3673 | ip6->ip6_plen = htons(tcp_hlen + payload_sz);
| 3674 | tcp->th_sum = 0;
| 3675 | partial = __packet_cksum(tcp, tcp_hlen, partial);
| 3676 | partial += htonl(tcp_hlen + IPPROTO_TCP + payload_sz); |
| 3677 | partial += pseudo_hdr_csum; |
| 3678 | ADDCARRY(partial); |
| 3679 | tcp->th_sum = ~(uint16_t)partial; |
| 3680 | } |
| 3681 | tcp_seq += payload_sz; |
| 3682 | METADATA_ADJUST_LEN(pkt, total_hlen, headroom); |
| 3683 | #if (DEVELOPMENT || DEBUG) |
| 3684 | struct __kern_buflet *bft; |
| 3685 | uint32_t blen; |
| 3686 | PKT_GET_FIRST_BUFLET(pkt, 1, bft); |
| 3687 | blen = __buflet_get_data_length(bft); |
| 3688 | if (blen != total_hlen + payload_sz) { |
| 3689 | panic("blen (%d) != total_len + payload_sz (%d)\n",
| 3690 | blen, total_hlen + payload_sz); |
| 3691 | } |
| 3692 | #endif /* DEVELOPMENT || DEBUG */ |
| 3693 | |
| 3694 | pkt->pkt_length = total_hlen + payload_sz; |
| 3695 | KPKTQ_ENQUEUE(gso_pktq, pkt); |
| 3696 | pkt = NULL; |
| 3697 | |
| 3698 | /* |
| 3699 | * Note that at this point the packet is not yet finalized. |
| 3700 | * The finalization happens in dp_flow_tx_process() after |
| 3701 | * the framing is done. |
| 3702 | */ |
| 3703 | } |
| 3704 | ASSERT(n == n_pkts); |
| 3705 | ASSERT(off == total_len); |
| 3706 | DTRACE_SKYWALK7(gso__done, struct nx_flowswitch *, fsw, ifnet_t, ifp, |
| 3707 | uint32_t, n_pkts, uint32_t, total_len, uint16_t, ip_hlen, |
| 3708 | uint16_t, tcp_hlen, uint8_t *, orig_pkt_baddr); |
| 3709 | return 0; |
| 3710 | } |
| 3711 | |
| 3712 | static void |
| 3713 | tx_flow_enqueue_gso_pktq(struct flow_entry_list *fes, struct flow_entry *fe, |
| 3714 | struct pktq *gso_pktq) |
| 3715 | { |
| 3716 | if (KPKTQ_EMPTY(&fe->fe_tx_pktq)) { |
| 3717 | ASSERT(KPKTQ_LEN(&fe->fe_tx_pktq) == 0); |
| 3718 | TAILQ_INSERT_TAIL(fes, fe, fe_tx_link); |
| 3719 | KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq), |
| 3720 | KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq)); |
| 3721 | KPKTQ_INIT(gso_pktq); |
| 3722 | } else { |
| 3723 | ASSERT(!TAILQ_EMPTY(fes)); |
| 3724 | KPKTQ_ENQUEUE_MULTI(&fe->fe_tx_pktq, KPKTQ_FIRST(gso_pktq), |
| 3725 | KPKTQ_LAST(gso_pktq), KPKTQ_LEN(gso_pktq)); |
| 3726 | KPKTQ_INIT(gso_pktq); |
| 3727 | flow_entry_release(&fe);
| 3728 | } |
| 3729 | } |
| 3730 | |
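| | /*
| |  * GSO variant of the TX path: pre-allocate gso_pkts_estimate device
| |  * packets, copy only the headers of each source packet for
| |  * classification, then segment it via do_gso() and enqueue the
| |  * resulting chain on its flow entry.
| |  */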
| 3731 | static void |
| 3732 | dp_gso_pktq(struct nx_flowswitch *fsw, struct pktq *spktq, |
| 3733 | uint32_t gso_pkts_estimate) |
| 3734 | { |
| 3735 | struct __kern_packet *spkt, *pkt; |
| 3736 | struct flow_entry_list fes = TAILQ_HEAD_INITIALIZER(fes); |
| 3737 | struct flow_entry *fe, *prev_fe; |
| 3738 | struct pktq dpktq; |
| 3739 | struct nexus_adapter *dev_na; |
| 3740 | struct kern_pbufpool *dev_pp; |
| 3741 | struct ifnet *ifp = NULL; |
| 3742 | sa_family_t af; |
| 3743 | uint32_t n_pkts, n_flows = 0; |
| 3744 | int err; |
| 3745 | |
| 3746 | KPKTQ_INIT(&dpktq); |
| 3747 | n_pkts = KPKTQ_LEN(spktq); |
| 3748 | |
| 3749 | FSW_RLOCK(fsw); |
| 3750 | if (__improbable(FSW_QUIESCED(fsw))) { |
| 3751 | DTRACE_SKYWALK1(tx__quiesced, struct nx_flowswitch *, fsw); |
| 3752 | SK_ERR("flowswitch detached, dropping %d pkts", n_pkts);
| 3753 | dp_drop_pktq(fsw, spktq); |
| 3754 | goto done; |
| 3755 | } |
| 3756 | dev_na = fsw->fsw_dev_ch->ch_na; |
| 3757 | if (__improbable(dev_na == NULL)) { |
| 3758 | SK_ERR("dev port not attached, dropping %d pkts", n_pkts);
| 3759 | FSW_STATS_ADD(FSW_STATS_DST_NXPORT_INACTIVE, n_pkts); |
| 3760 | dp_drop_pktq(fsw, spktq); |
| 3761 | goto done; |
| 3762 | } |
| 3763 | ifp = fsw_datamov_begin(fsw); |
| 3764 | if (ifp == NULL) { |
| 3765 | SK_ERR("ifnet not attached, dropping %d pkts", n_pkts);
| 3766 | dp_drop_pktq(fsw, spktq); |
| 3767 | goto done; |
| 3768 | } |
| 3769 | |
| 3770 | dev_pp = na_kr_get_pp(dev_na, NR_TX); |
| 3771 | |
| 3772 | /* |
| 3773 | * Batch allocate enough packets to perform GSO on all |
| 3774 | * packets in spktq. |
| 3775 | */ |
| 3776 | err = pp_alloc_pktq(dev_pp, dev_pp->pp_max_frags, &dpktq, |
| 3777 | gso_pkts_estimate, NULL, NULL, SKMEM_NOSLEEP); |
| 3778 | #if DEVELOPMENT || DEBUG |
| 3779 | if (__probable(err != ENOMEM)) { |
| 3780 | _FSW_INJECT_ERROR(12, err, ENOMEM, pp_free_pktq, &dpktq); |
| 3781 | } |
| 3782 | #endif /* DEVELOPMENT || DEBUG */ |
| 3783 | /* |
| 3784 | * We either get all packets or none. No partial allocations. |
| 3785 | */ |
| 3786 | if (__improbable(err != 0)) { |
| 3787 | if (err == ENOMEM) { |
| 3788 | ASSERT(KPKTQ_EMPTY(&dpktq)); |
| 3789 | } else { |
| 3790 | dp_free_pktq(fsw, &dpktq);
| 3791 | }
| 3792 | DTRACE_SKYWALK1(gso__no__mem, int, err);
| 3793 | dp_drop_pktq(fsw, spktq);
| 3794 | FSW_STATS_ADD(FSW_STATS_DROP_NOMEM_PKT, n_pkts);
| 3795 | SK_ERR("failed to alloc %u pkts from device pool",
| 3796 | gso_pkts_estimate); |
| 3797 | goto done; |
| 3798 | } |
| 3799 | prev_fe = NULL; |
| 3800 | KPKTQ_FOREACH(spkt, spktq) { |
| 3801 | KPKTQ_DEQUEUE(&dpktq, pkt); |
| 3802 | ASSERT(pkt != NULL); |
| 3803 | /* |
| 3804 | * Copy only headers to the first packet of the GSO chain. |
| 3805 | * The headers will be used for classification below. |
| 3806 | */ |
| 3807 | err = dp_copy_headers_to_dev(fsw, spkt, pkt);
| 3808 | if (__improbable(err != 0)) { |
| 3809 | pp_free_packet_single(pkt); |
| 3810 | DTRACE_SKYWALK2(copy__headers__failed, |
| 3811 | struct nx_flowswitch *, fsw, |
| 3812 | struct __kern_packet *, spkt); |
| 3813 | continue; |
| 3814 | } |
| 3815 | af = get_tso_af(pkt); |
| 3816 | ASSERT(af == AF_INET || af == AF_INET6); |
| 3817 | |
| 3818 | err = flow_pkt_classify(pkt, ifp, af, false); |
| 3819 | if (__improbable(err != 0)) { |
| 3820 | dp_tx_log_pkt(SK_VERB_ERROR, "flow extract err", pkt);
| 3821 | FSW_STATS_INC(FSW_STATS_TX_FLOW_EXTRACT_ERR); |
| 3822 | pp_free_packet_single(pkt); |
| 3823 | DTRACE_SKYWALK4(classify__failed, |
| 3824 | struct nx_flowswitch *, fsw, |
| 3825 | struct __kern_packet *, spkt, |
| 3826 | struct __kern_packet *, pkt, |
| 3827 | int, err); |
| 3828 | continue; |
| 3829 | } |
| 3830 | /* |
| 3831 | * GSO cannot be done on a fragment and it's a bug in user |
| 3832 | * space to mark a fragment as needing GSO. |
| 3833 | */ |
| 3834 | if (__improbable(pkt->pkt_flow_ip_is_frag)) { |
| 3835 | FSW_STATS_INC(FSW_STATS_TX_FRAG_BAD_CONT); |
| 3836 | pp_free_packet_single(pkt); |
| 3837 | DTRACE_SKYWALK3(is__frag, |
| 3838 | struct nx_flowswitch *, fsw, |
| 3839 | struct __kern_packet *, spkt, |
| 3840 | struct __kern_packet *, pkt); |
| 3841 | continue; |
| 3842 | } |
| 3843 | fe = tx_lookup_flow(fsw, pkt, prev_fe); |
| 3844 | if (__improbable(fe == NULL)) { |
| 3845 | FSW_STATS_INC(FSW_STATS_TX_FLOW_NOT_FOUND); |
| 3846 | pp_free_packet_single(pkt); |
| 3847 | DTRACE_SKYWALK3(lookup__failed, |
| 3848 | struct nx_flowswitch *, fsw, |
| 3849 | struct __kern_packet *, spkt, |
| 3850 | struct __kern_packet *, pkt); |
| 3851 | prev_fe = NULL; |
| 3852 | continue; |
| 3853 | } |
| 3854 | /* |
| 3855 | * Perform GSO on spkt using the flow information |
| 3856 | * obtained above. |
| 3857 | */ |
| 3858 | struct pktq gso_pktq; |
| 3859 | KPKTQ_INIT(&gso_pktq); |
| 3860 | err = do_gso(fsw, af, spkt, pkt, &dpktq, &gso_pktq);
| 3861 | if (__probable(err == 0)) {
| 3862 | tx_flow_enqueue_gso_pktq(&fes, fe, &gso_pktq);
| 3863 | prev_fe = fe;
| 3864 | } else {
| 3865 | DTRACE_SKYWALK1(gso__error, int, err);
| 3866 | /* TODO: increment error stat */
| 3867 | pp_free_packet_single(pkt);
| 3868 | flow_entry_release(&fe);
| 3869 | prev_fe = NULL; |
| 3870 | } |
| 3871 | KPKTQ_FINI(&gso_pktq); |
| 3872 | } |
| 3873 | struct flow_entry *tfe = NULL; |
| 3874 | TAILQ_FOREACH_SAFE(fe, &fes, fe_tx_link, tfe) { |
| 3875 | /* Chain-enqueue can be used for GSO chains */ |
| 3876 | tx_flow_process(fsw, fe, FLOW_PROC_FLAG_GSO); |
| 3877 | TAILQ_REMOVE(&fes, fe, fe_tx_link); |
| 3878 | flow_entry_release(&fe);
| 3879 | n_flows++; |
| 3880 | } |
| 3881 | done: |
| 3882 | FSW_RUNLOCK(fsw); |
| 3883 | if (n_flows > 0) { |
| 3884 | netif_transmit(ifp, NETIF_XMIT_FLAG_CHANNEL); |
| 3885 | } |
| 3886 | if (ifp != NULL) { |
| 3887 | fsw_datamov_end(fsw); |
| 3888 | } |
| 3889 | |
| 3890 | /* |
| 3891 | * It's possible for packets to be left in dpktq because |
| 3892 | * gso_pkts_estimate is only an estimate. The actual number |
| 3893 | * of packets needed could be less. |
| 3894 | */ |
| 3895 | uint32_t dpktq_len; |
| 3896 | if ((dpktq_len = KPKTQ_LEN(&dpktq)) > 0) { |
| 3897 | DTRACE_SKYWALK2(leftover__dev__pkts, |
| 3898 | struct nx_flowswitch *, fsw, uint32_t, dpktq_len); |
| 3899 | dp_free_pktq(fsw, &dpktq);
| 3900 | } |
| 3901 | KPKTQ_FINI(&dpktq); |
| 3902 | } |
| 3903 | |
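| | /*
| |  * Drain a device RX ring in batches of fsw_rx_batch packets and feed
| |  * them into the flowswitch RX path (or into netem when an input shaper
| |  * is attached), then update the netif mitigation stats.
| |  */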
| 3904 | static inline void |
| 3905 | fsw_dev_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
| 3906 | struct proc *p) |
| 3907 | { |
| 3908 | #pragma unused(p) |
| 3909 | uint32_t total_pkts = 0, total_bytes = 0; |
| 3910 | |
| 3911 | for (;;) { |
| 3912 | struct pktq pktq; |
| 3913 | KPKTQ_INIT(&pktq); |
| 3914 | uint32_t n_bytes; |
| 3915 | fsw_rx_ring_dequeue_pktq(fsw, r, fsw_rx_batch, &pktq, &n_bytes);
| 3916 | if (n_bytes == 0) { |
| 3917 | break; |
| 3918 | } |
| 3919 | total_pkts += KPKTQ_LEN(&pktq); |
| 3920 | total_bytes += n_bytes; |
| 3921 | |
| 3922 | if (__probable(fsw->fsw_ifp->if_input_netem == NULL)) { |
| 3923 | fsw_receive(fsw, &pktq);
| 3924 | } else {
| 3925 | fsw_dev_input_netem_enqueue(fsw, &pktq);
| 3926 | } |
| 3927 | KPKTQ_FINI(&pktq); |
| 3928 | } |
| 3929 | |
| 3930 | KDBG(SK_KTRACE_FSW_DEV_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes); |
| 3931 | DTRACE_SKYWALK2(fsw__dp__dev__ring__flush, uint32_t, total_pkts, |
| 3932 | uint32_t, total_bytes); |
| 3933 | |
| 3934 | /* compute mitigation rate for delivered traffic */ |
| 3935 | if (__probable(r->ckr_netif_mit_stats != NULL)) { |
| 3936 | r->ckr_netif_mit_stats(r, total_pkts, total_bytes); |
| 3937 | } |
| 3938 | } |
| 3939 | |
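| | /*
| |  * Drain a user channel TX ring in batches of fsw_tx_batch packets;
| |  * batches that contain GSO work go through dp_gso_pktq(), the rest
| |  * through dp_tx_pktq().
| |  */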
| 3940 | static inline void |
| 3941 | fsw_user_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
| 3942 | struct proc *p) |
| 3943 | { |
| 3944 | #pragma unused(p) |
| 3945 | static packet_trace_id_t trace_id = 0; |
| 3946 | uint32_t total_pkts = 0, total_bytes = 0; |
| 3947 | |
| 3948 | for (;;) { |
| 3949 | struct pktq pktq; |
| 3950 | KPKTQ_INIT(&pktq); |
| 3951 | uint32_t n_bytes; |
| 3952 | uint32_t gso_pkts_estimate = 0; |
| 3953 | |
| 3954 | fsw_tx_ring_dequeue_pktq(fsw, r, fsw_tx_batch, &pktq, &n_bytes,
| 3955 | &gso_pkts_estimate);
| 3956 | if (n_bytes == 0) { |
| 3957 | break; |
| 3958 | } |
| 3959 | total_pkts += KPKTQ_LEN(&pktq); |
| 3960 | total_bytes += n_bytes; |
| 3961 | |
| 3962 | KPKTQ_FIRST(&pktq)->pkt_trace_id = ++trace_id; |
| 3963 | KDBG(SK_KTRACE_PKT_TX_FSW | DBG_FUNC_START, |
| 3964 | KPKTQ_FIRST(&pktq)->pkt_trace_id); |
| 3965 | |
| 3966 | if (gso_pkts_estimate > 0) { |
| 3967 | dp_gso_pktq(fsw, &pktq, gso_pkts_estimate);
| 3968 | } else {
| 3969 | dp_tx_pktq(fsw, &pktq);
| 3970 | }
| 3971 | dp_free_pktq(fsw, &pktq);
| 3972 | KPKTQ_FINI(&pktq); |
| 3973 | } |
| 3974 | kr_update_stats(r, total_pkts, total_bytes);
| 3975 | |
| 3976 | KDBG(SK_KTRACE_FSW_USER_RING_FLUSH, SK_KVA(r), total_pkts, total_bytes); |
| 3977 | DTRACE_SKYWALK2(fsw__dp__user__ring__flush, uint32_t, total_pkts, |
| 3978 | uint32_t, total_bytes); |
| 3979 | } |
| 3980 | |
| 3981 | void |
| 3982 | fsw_ring_flush(struct nx_flowswitch *fsw, struct __kern_channel_ring *r, |
| 3983 | struct proc *p) |
| 3984 | { |
| 3985 | struct nexus_vp_adapter *vpna = VPNA(KRNA(r)); |
| 3986 | |
| 3987 | ASSERT(sk_is_sync_protected()); |
| 3988 | ASSERT(vpna->vpna_nx_port != FSW_VP_HOST); |
| 3989 | ASSERT(vpna->vpna_up.na_md_type == NEXUS_META_TYPE_PACKET); |
| 3990 | |
| 3991 | if (vpna->vpna_nx_port == FSW_VP_DEV) { |
| 3992 | fsw_dev_ring_flush(fsw, r, p); |
| 3993 | } else { |
| 3994 | fsw_user_ring_flush(fsw, r, p); |
| 3995 | } |
| 3996 | } |
| 3997 | |
| 3998 | int |
| 3999 | fsw_dp_ctor(struct nx_flowswitch *fsw) |
| 4000 | { |
| 4001 | uint32_t fe_cnt = fsw_fe_table_size; |
| 4002 | uint32_t fob_cnt = fsw_flow_owner_buckets; |
| 4003 | uint32_t frb_cnt = fsw_flow_route_buckets; |
| 4004 | uint32_t frib_cnt = fsw_flow_route_id_buckets; |
| 4005 | struct kern_nexus *nx = fsw->fsw_nx; |
| 4006 | char name[64]; |
| 4007 | int error = 0; |
| 4008 | |
| 4009 | /* just in case */ |
| 4010 | if (fe_cnt == 0) { |
| 4011 | fe_cnt = NX_FSW_FE_TABLESZ; |
| 4012 | ASSERT(fe_cnt != 0); |
| 4013 | } |
| 4014 | if (fob_cnt == 0) { |
| 4015 | fob_cnt = NX_FSW_FOB_HASHSZ; |
| 4016 | ASSERT(fob_cnt != 0); |
| 4017 | } |
| 4018 | if (frb_cnt == 0) { |
| 4019 | frb_cnt = NX_FSW_FRB_HASHSZ; |
| 4020 | ASSERT(frb_cnt != 0); |
| 4021 | } |
| 4022 | if (frib_cnt == 0) { |
| 4023 | frib_cnt = NX_FSW_FRIB_HASHSZ; |
| 4024 | ASSERT(frib_cnt != 0); |
| 4025 | } |
| 4026 | |
| 4027 | /* make sure fe_cnt is a power of two, else round up */ |
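| | /* (the bit-smearing below rounds up, e.g. 5000 becomes 8192) */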
| 4028 | if ((fe_cnt & (fe_cnt - 1)) != 0) { |
| 4029 | fe_cnt--; |
| 4030 | fe_cnt |= (fe_cnt >> 1); |
| 4031 | fe_cnt |= (fe_cnt >> 2); |
| 4032 | fe_cnt |= (fe_cnt >> 4); |
| 4033 | fe_cnt |= (fe_cnt >> 8); |
| 4034 | fe_cnt |= (fe_cnt >> 16); |
| 4035 | fe_cnt++; |
| 4036 | } |
| 4037 | |
| 4038 | /* make sure frb_cnt is a power of two, else round up */ |
| 4039 | if ((frb_cnt & (frb_cnt - 1)) != 0) { |
| 4040 | frb_cnt--; |
| 4041 | frb_cnt |= (frb_cnt >> 1); |
| 4042 | frb_cnt |= (frb_cnt >> 2); |
| 4043 | frb_cnt |= (frb_cnt >> 4); |
| 4044 | frb_cnt |= (frb_cnt >> 8); |
| 4045 | frb_cnt |= (frb_cnt >> 16); |
| 4046 | frb_cnt++; |
| 4047 | } |
| 4048 | |
| 4049 | lck_mtx_init(&fsw->fsw_detach_barrier_lock, &nexus_lock_group,
| 4050 | &nexus_lock_attr);
| 4051 | lck_mtx_init(&fsw->fsw_reap_lock, &nexus_lock_group, &nexus_lock_attr);
| 4052 | lck_mtx_init(&fsw->fsw_linger_lock, &nexus_lock_group, &nexus_lock_attr);
| 4053 | TAILQ_INIT(&fsw->fsw_linger_head); |
| 4054 | |
| 4055 | (void) snprintf(name, sizeof(name), "%s_%llu", NX_FSW_NAME, nx->nx_id);
| 4056 | error = nx_advisory_alloc(nx, name, |
| 4057 | &NX_PROV(nx)->nxprov_region_params[SKMEM_REGION_NEXUSADV], |
| 4058 | NEXUS_ADVISORY_TYPE_FLOWSWITCH); |
| 4059 | if (error != 0) { |
| 4060 | fsw_dp_dtor(fsw); |
| 4061 | return error; |
| 4062 | } |
| 4063 | |
| 4064 | fsw->fsw_flow_mgr = flow_mgr_create(fe_cnt, fob_cnt, frb_cnt, frib_cnt); |
| 4065 | if (fsw->fsw_flow_mgr == NULL) { |
| 4066 | fsw_dp_dtor(fsw); |
| 4067 | return error; |
| 4068 | } |
| 4069 | |
| 4070 | /* generic name; will be customized upon ifattach */ |
| 4071 | (void) snprintf(fsw->fsw_reap_name, sizeof(fsw->fsw_reap_name),
| 4072 | FSW_REAP_THREADNAME, name, "");
| 4073 | |
| 4074 | if (kernel_thread_start(fsw_reap_thread_func, fsw,
| 4075 | &fsw->fsw_reap_thread) != KERN_SUCCESS) {
| 4076 | panic_plain("%s: can't create thread", __func__);
| 4077 | /* NOTREACHED */ |
| 4078 | __builtin_unreachable(); |
| 4079 | } |
| 4080 | /* this must not fail */ |
| 4081 | VERIFY(fsw->fsw_reap_thread != NULL); |
| 4082 | |
| 4083 | SK_DF(SK_VERB_MEM, "fsw 0x%llx ALLOC", SK_KVA(fsw));
| 4084 | |
| 4085 | |
| 4086 | return error; |
| 4087 | } |
| 4088 | |
| 4089 | void |
| 4090 | fsw_dp_dtor(struct nx_flowswitch *fsw) |
| 4091 | { |
| 4092 | uint64_t f = (1 * NSEC_PER_MSEC); /* 1 ms */ |
| 4093 | uint64_t s = (1000 * NSEC_PER_MSEC); /* 1 sec */
| 4094 | uint32_t i = 0; |
| 4095 | |
| 4096 | #if (DEVELOPMENT || DEBUG) |
| 4097 | if (fsw->fsw_rps_threads != NULL) { |
| 4098 | for (i = 0; i < fsw->fsw_rps_nthreads; i++) { |
| 4099 | fsw_rps_thread_join(fsw, i); |
| 4100 | } |
| 4101 | kfree_type(struct fsw_rps_thread, fsw->fsw_rps_threads); |
| 4102 | } |
| 4103 | #endif /* !DEVELOPMENT && !DEBUG */ |
| 4104 | |
| 4105 | nx_advisory_free(fsw->fsw_nx); |
| 4106 | |
| 4107 | if (fsw->fsw_reap_thread != THREAD_NULL) { |
| 4108 | /* signal thread to begin self-termination */ |
| 4109 | lck_mtx_lock(&fsw->fsw_reap_lock);
| 4110 | fsw->fsw_reap_flags |= FSW_REAPF_TERMINATING; |
| 4111 | |
| 4112 | /* |
| 4113 | * And wait for thread to terminate; use another |
| 4114 | * wait channel here other than fsw_reap_flags to |
| 4115 | * make it more explicit. In the event the reaper |
| 4116 | * thread misses a wakeup, we'll try again once |
| 4117 | * every second (except for the first time). |
| 4118 | */ |
| 4119 | while (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)) { |
| 4120 | uint64_t t = 0; |
| 4121 | |
| 4122 | nanoseconds_to_absolutetime((i++ == 0) ? f : s, &t);
| 4123 | clock_absolutetime_interval_to_deadline(t, &t);
| 4124 | ASSERT(t != 0); |
| 4125 | |
| 4126 | fsw->fsw_reap_flags |= FSW_REAPF_TERMINATEBLOCK; |
| 4127 | if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING)) { |
| 4128 | thread_wakeup((caddr_t)&fsw->fsw_reap_flags); |
| 4129 | } |
| 4130 | (void) assert_wait_deadline(&fsw->fsw_reap_thread,
| 4131 | THREAD_UNINT, t);
| 4132 | lck_mtx_unlock(&fsw->fsw_reap_lock);
| 4133 | thread_block(THREAD_CONTINUE_NULL);
| 4134 | lck_mtx_lock(&fsw->fsw_reap_lock);
| 4135 | fsw->fsw_reap_flags &= ~FSW_REAPF_TERMINATEBLOCK; |
| 4136 | } |
| 4137 | ASSERT(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED); |
| 4138 | lck_mtx_unlock(&fsw->fsw_reap_lock);
| 4139 | fsw->fsw_reap_thread = THREAD_NULL; |
| 4140 | } |
| 4141 | |
| 4142 | /* free any remaining flow entries in the linger list */ |
| 4143 | fsw_linger_purge(fsw); |
| 4144 | |
| 4145 | if (fsw->fsw_flow_mgr != NULL) { |
| 4146 | flow_mgr_destroy(fsw->fsw_flow_mgr); |
| 4147 | fsw->fsw_flow_mgr = NULL; |
| 4148 | } |
| 4149 | |
| 4150 | |
| 4151 | lck_mtx_destroy(&fsw->fsw_detach_barrier_lock, &nexus_lock_group);
| 4152 | lck_mtx_destroy(&fsw->fsw_reap_lock, &nexus_lock_group);
| 4153 | lck_mtx_destroy(&fsw->fsw_linger_lock, &nexus_lock_group);
| 4154 | } |
| 4155 | |
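| | /*
| |  * Park a torn-down flow entry on the per-flowswitch linger list; it
| |  * stays there until fe_linger_expire (now + fe_linger_wait) and is
| |  * eventually freed by the reaper thread scheduled below.
| |  */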
| 4156 | void |
| 4157 | fsw_linger_insert(struct flow_entry *fe) |
| 4158 | { |
| 4159 | struct nx_flowswitch *fsw = fe->fe_fsw; |
| 4160 | SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); |
| 4161 | SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
| 4162 | fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), |
| 4163 | fe->fe_flags, FLOWENTF_BITS); |
| 4164 | |
| 4165 | net_update_uptime(); |
| 4166 | |
| 4167 | ASSERT(flow_entry_refcnt(fe) >= 1); |
| 4168 | ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); |
| 4169 | ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); |
| 4170 | ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING)); |
| 4171 | ASSERT(fe->fe_flags & FLOWENTF_WAIT_CLOSE); |
| 4172 | ASSERT(fe->fe_linger_wait != 0); |
| 4173 | fe->fe_linger_expire = (_net_uptime + fe->fe_linger_wait); |
| 4174 | os_atomic_or(&fe->fe_flags, FLOWENTF_LINGERING, relaxed); |
| 4175 | |
| 4176 | lck_mtx_lock_spin(&fsw->fsw_linger_lock);
| 4177 | TAILQ_INSERT_TAIL(&fsw->fsw_linger_head, fe, fe_linger_link);
| 4178 | fsw->fsw_linger_cnt++;
| 4179 | VERIFY(fsw->fsw_linger_cnt != 0);
| 4180 | lck_mtx_unlock(&fsw->fsw_linger_lock);
| 4181 | |
| 4182 | fsw_reap_sched(fsw); |
| 4183 | } |
| 4184 | |
| 4185 | static void |
| 4186 | fsw_linger_remove_internal(struct flow_entry_linger_head *linger_head, |
| 4187 | struct flow_entry *fe) |
| 4188 | { |
| 4189 | SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); |
| 4190 | SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
| 4191 | fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), |
| 4192 | fe->fe_flags, FLOWENTF_BITS); |
| 4193 | |
| 4194 | ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); |
| 4195 | ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); |
| 4196 | ASSERT(fe->fe_flags & FLOWENTF_LINGERING); |
| 4197 | os_atomic_andnot(&fe->fe_flags, FLOWENTF_LINGERING, relaxed); |
| 4198 | |
| 4199 | TAILQ_REMOVE(linger_head, fe, fe_linger_link); |
| 4200 | flow_entry_release(&fe);
| 4201 | } |
| 4202 | |
| 4203 | static void |
| 4204 | fsw_linger_remove(struct flow_entry *fe) |
| 4205 | { |
| 4206 | struct nx_flowswitch *fsw = fe->fe_fsw; |
| 4207 | |
| 4208 | LCK_MTX_ASSERT(&fsw->fsw_linger_lock, LCK_MTX_ASSERT_OWNED); |
| 4209 | |
| 4210 | fsw_linger_remove_internal(&fsw->fsw_linger_head, fe);
| 4211 | VERIFY(fsw->fsw_linger_cnt != 0); |
| 4212 | fsw->fsw_linger_cnt--; |
| 4213 | } |
| 4214 | |
| 4215 | void |
| 4216 | fsw_linger_purge(struct nx_flowswitch *fsw) |
| 4217 | { |
| 4218 | struct flow_entry *fe, *tfe; |
| 4219 | |
| 4220 | lck_mtx_lock(&fsw->fsw_linger_lock);
| 4221 | TAILQ_FOREACH_SAFE(fe, &fsw->fsw_linger_head, fe_linger_link, tfe) { |
| 4222 | fsw_linger_remove(fe); |
| 4223 | } |
| 4224 | ASSERT(fsw->fsw_linger_cnt == 0); |
| 4225 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head)); |
| 4226 | lck_mtx_unlock(&fsw->fsw_linger_lock);
| 4227 | } |
| 4228 | |
| 4229 | void |
| 4230 | fsw_reap_sched(struct nx_flowswitch *fsw) |
| 4231 | { |
| 4232 | ASSERT(fsw->fsw_reap_thread != THREAD_NULL); |
| 4233 | lck_mtx_lock_spin(&fsw->fsw_reap_lock);
| 4234 | if (!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING) && |
| 4235 | !(fsw->fsw_reap_flags & (FSW_REAPF_TERMINATING | FSW_REAPF_TERMINATED))) { |
| 4236 | thread_wakeup((caddr_t)&fsw->fsw_reap_flags); |
| 4237 | } |
| 4238 | lck_mtx_unlock(&fsw->fsw_reap_lock);
| 4239 | } |
| 4240 | |
| 4241 | __attribute__((noreturn)) |
| 4242 | static void |
| 4243 | fsw_reap_thread_func(void *v, wait_result_t w) |
| 4244 | { |
| 4245 | #pragma unused(w) |
| 4246 | struct nx_flowswitch *fsw = v; |
| 4247 | |
| 4248 | ASSERT(fsw->fsw_reap_thread == current_thread()); |
| 4249 | thread_set_thread_name(current_thread(), fsw->fsw_reap_name);
| 4250 | |
| 4251 | net_update_uptime(); |
| 4252 | |
| 4253 | lck_mtx_lock(&fsw->fsw_reap_lock);
| 4254 | VERIFY(!(fsw->fsw_reap_flags & FSW_REAPF_RUNNING));
| 4255 | (void) assert_wait(&fsw->fsw_reap_flags, THREAD_UNINT);
| 4256 | lck_mtx_unlock(&fsw->fsw_reap_lock);
| 4257 | thread_block_parameter(fsw_reap_thread_cont, fsw);
| 4258 | /* NOTREACHED */ |
| 4259 | __builtin_unreachable(); |
| 4260 | } |
| 4261 | |
| 4262 | __attribute__((noreturn)) |
| 4263 | static void |
| 4264 | fsw_reap_thread_cont(void *v, wait_result_t wres) |
| 4265 | { |
| 4266 | struct nx_flowswitch *fsw = v; |
| 4267 | boolean_t low; |
| 4268 | uint64_t t = 0; |
| 4269 | |
| 4270 | SK_DF(SK_VERB_FLOW, "%s: running", fsw->fsw_reap_name);
| 4271 | 
| 4272 | lck_mtx_lock(&fsw->fsw_reap_lock);
| 4273 | if (__improbable(wres == THREAD_INTERRUPTED || |
| 4274 | (fsw->fsw_reap_flags & FSW_REAPF_TERMINATING) != 0)) { |
| 4275 | goto terminate; |
| 4276 | } |
| 4277 | |
| 4278 | ASSERT(!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATED)); |
| 4279 | fsw->fsw_reap_flags |= FSW_REAPF_RUNNING; |
| 4280 | lck_mtx_unlock(&fsw->fsw_reap_lock);
| 4281 | |
| 4282 | net_update_uptime(); |
| 4283 | |
| 4284 | /* prevent detach from happening while we're here */ |
| 4285 | if (!fsw_detach_barrier_add(fsw)) { |
| 4286 | SK_ERR("%s: netagent detached", fsw->fsw_reap_name);
| 4287 | t = 0; |
| 4288 | } else { |
| 4289 | uint32_t fe_nonviable, fe_freed, fe_aborted; |
| 4290 | uint32_t fr_freed, fr_resid = 0; |
| 4291 | struct ifnet *ifp = fsw->fsw_ifp; |
| 4292 | uint64_t i = FSW_REAP_IVAL; |
| 4293 | uint64_t now = _net_uptime; |
| 4294 | uint64_t last; |
| 4295 | |
| 4296 | ASSERT(fsw->fsw_ifp != NULL); |
| 4297 | |
| 4298 | /* |
| 4299 | * Pass 1: process any deferred {withdrawn,nonviable} requests. |
| 4300 | */ |
| 4301 | fe_nonviable = fsw_process_deferred(fsw); |
| 4302 | |
| 4303 | /* |
| 4304 | * Pass 2: remove any expired lingering flows. |
| 4305 | */ |
| 4306 | fe_freed = fsw_process_linger(fsw, &fe_aborted); |
| 4307 | |
| 4308 | /* |
| 4309 | * Pass 3: prune idle flow routes. |
| 4310 | */ |
| 4311 | fr_freed = flow_route_prune(fsw->fsw_flow_mgr, |
| 4312 | ifp, &fr_resid); |
| 4313 | |
| 4314 | /* |
| 4315 | * Pass 4: prune flow table |
| 4316 | * |
| 4317 | */ |
| 4318 | cuckoo_hashtable_try_shrink(fsw->fsw_flow_mgr->fm_flow_table);
| 4319 | |
| 4320 | SK_DF(SK_VERB_FLOW, "%s: fe_nonviable %u/%u fe_freed %u/%u "
| 4321 | "fe_aborted %u fr_freed %u/%u",
| 4322 | fsw->fsw_flow_mgr->fm_name, fe_nonviable,
| 4323 | (fe_nonviable + fsw->fsw_pending_nonviable),
| 4324 | fe_freed, fsw->fsw_linger_cnt, fe_aborted, fr_freed,
| 4325 | (fr_freed + fr_resid));
| 4326 | |
| 4327 | /* see if VM memory level is critical */ |
| 4328 | low = skmem_lowmem_check(); |
| 4329 | |
| 4330 | /* |
| 4331 | * If things appear to be idle, we can prune away cached |
| 4332 | * object that have fallen out of the working sets (this |
| 4333 | * is different than purging). Every once in a while, we |
| 4334 | * also purge the caches. Note that this is done across |
| 4335 | * all flowswitch instances, and so we limit this to no |
| 4336 | * more than once every FSW_REAP_SK_THRES seconds. |
| 4337 | */ |
| 4338 | last = os_atomic_load(&fsw_reap_last, relaxed); |
| 4339 | if ((low || (last != 0 && (now - last) >= FSW_REAP_SK_THRES)) && |
| 4340 | os_atomic_cmpxchg(&fsw_reap_last, last, now, acq_rel)) { |
| 4341 | fsw_purge_cache(fsw, low); |
| 4342 | |
| 4343 | /* increase sleep interval if idle */ |
| 4344 | if (kdebug_enable == 0 && fsw->fsw_linger_cnt == 0 && |
| 4345 | fsw->fsw_pending_nonviable == 0 && fr_resid == 0) { |
| 4346 | i <<= 3; |
| 4347 | } |
| 4348 | } else if (last == 0) { |
| 4349 | os_atomic_store(&fsw_reap_last, now, release); |
| 4350 | } |
| 4351 | |
| 4352 | /* |
| 4353 | * Additionally, run thru the list of channels and prune |
| 4354 | * or purge away cached objects on "idle" channels. This |
| 4355 | * check is rate limited to no more than once every |
| 4356 | * FSW_DRAIN_CH_THRES seconds. |
| 4357 | */ |
| 4358 | last = fsw->fsw_drain_channel_chk_last; |
| 4359 | if (low || (last != 0 && (now - last) >= FSW_DRAIN_CH_THRES)) { |
| 4360 | SK_DF(SK_VERB_FLOW, "%s: pruning channels",
| 4361 | fsw->fsw_flow_mgr->fm_name); |
| 4362 | |
| 4363 | fsw->fsw_drain_channel_chk_last = now; |
| 4364 | fsw_drain_channels(fsw, now, low); |
| 4365 | } else if (__improbable(last == 0)) { |
| 4366 | fsw->fsw_drain_channel_chk_last = now; |
| 4367 | } |
| 4368 | |
| 4369 | /* |
| 4370 | * Finally, invoke the interface's reap callback to |
| 4371 | * tell it to prune or purge away cached objects if |
| 4372 | * it is idle. This check is rate limited to no more |
| 4373 | * than once every FSW_REAP_IF_THRES seconds. |
| 4374 | */ |
| 4375 | last = fsw->fsw_drain_netif_chk_last; |
| 4376 | if (low || (last != 0 && (now - last) >= FSW_REAP_IF_THRES)) { |
| 4377 | ASSERT(fsw->fsw_nifna != NULL); |
| 4378 | |
| 4379 | if (ifp->if_na_ops != NULL && |
| 4380 | ifp->if_na_ops->ni_reap != NULL) { |
| 4381 | SK_DF(SK_VERB_FLOW, "%s: pruning netif",
| 4382 | fsw->fsw_flow_mgr->fm_name); |
| 4383 | ifp->if_na_ops->ni_reap(ifp->if_na, ifp, |
| 4384 | FSW_REAP_IF_THRES, low); |
| 4385 | } |
| 4386 | |
| 4387 | fsw->fsw_drain_netif_chk_last = now; |
| 4388 | } else if (__improbable(last == 0)) { |
| 4389 | fsw->fsw_drain_netif_chk_last = now; |
| 4390 | } |
| 4391 | |
| 4392 | /* emit periodic interface stats ktrace */ |
| 4393 | last = fsw->fsw_reap_last; |
| 4394 | if (last != 0 && (now - last) >= FSW_IFSTATS_THRES) { |
| 4395 | KDBG(SK_KTRACE_AON_IF_STATS, ifp->if_data.ifi_ipackets, |
| 4396 | ifp->if_data.ifi_ibytes * 8, |
| 4397 | ifp->if_data.ifi_opackets, |
| 4398 | ifp->if_data.ifi_obytes * 8); |
| 4399 | |
| 4400 | fsw->fsw_reap_last = now; |
| 4401 | } else if (__improbable(last == 0)) { |
| 4402 | fsw->fsw_reap_last = now; |
| 4403 | } |
| 4404 | |
| 4405 | nanoseconds_to_absolutetime(i * NSEC_PER_SEC, &t);
| 4406 | clock_absolutetime_interval_to_deadline(t, &t);
| 4407 | ASSERT(t != 0); |
| 4408 | |
| 4409 | /* allow any pending detach to proceed */ |
| 4410 | fsw_detach_barrier_remove(fsw); |
| 4411 | } |
| 4412 | |
| 4413 | lck_mtx_lock(&fsw->fsw_reap_lock);
| 4414 | if (!(fsw->fsw_reap_flags & FSW_REAPF_TERMINATING)) {
| 4415 | fsw->fsw_reap_flags &= ~FSW_REAPF_RUNNING;
| 4416 | (void) assert_wait_deadline(&fsw->fsw_reap_flags,
| 4417 | THREAD_UNINT, t);
| 4418 | lck_mtx_unlock(&fsw->fsw_reap_lock);
| 4419 | thread_block_parameter(fsw_reap_thread_cont, fsw);
| 4420 | /* NOTREACHED */ |
| 4421 | __builtin_unreachable(); |
| 4422 | } else { |
| 4423 | terminate: |
| 4424 | LCK_MTX_ASSERT(&fsw->fsw_reap_lock, LCK_MTX_ASSERT_OWNED); |
| 4425 | fsw->fsw_reap_flags &= ~(FSW_REAPF_RUNNING | FSW_REAPF_TERMINATING); |
| 4426 | fsw->fsw_reap_flags |= FSW_REAPF_TERMINATED; |
| 4427 | /*
| 4428 | * And signal any thread waiting for us to terminate; as
| 4429 | * above, we use a wait channel other than fsw_reap_flags
| 4430 | * to make it more explicit.
| 4431 | */
| 4432 | if (fsw->fsw_reap_flags & FSW_REAPF_TERMINATEBLOCK) {
| 4433 | thread_wakeup((caddr_t)&fsw->fsw_reap_thread);
| 4434 | }
| 4435 | lck_mtx_unlock(&fsw->fsw_reap_lock);
| 4436 | 
| 4437 | SK_DF(SK_VERB_FLOW, "%s: terminating", fsw->fsw_reap_name);
| 4438 | 
| 4439 | /* for the extra refcnt from kernel_thread_start() */
| 4440 | thread_deallocate(current_thread());
| 4441 | /* this is the end */ |
| 4442 | thread_terminate(current_thread()); |
| 4443 | /* NOTREACHED */ |
| 4444 | __builtin_unreachable(); |
| 4445 | } |
| 4446 | |
| 4447 | /* must never get here */ |
| 4448 | VERIFY(0); |
| 4449 | /* NOTREACHED */ |
| 4450 | __builtin_unreachable(); |
| 4451 | } |
| 4452 | |
| 4453 | static void |
| 4454 | fsw_drain_channels(struct nx_flowswitch *fsw, uint64_t now, boolean_t low) |
| 4455 | { |
| 4456 | struct kern_nexus *nx = fsw->fsw_nx; |
| 4457 | |
| 4458 | /* flowswitch protects NA via fsw_lock, see fsw_port_alloc/free */ |
| 4459 | FSW_RLOCK(fsw); |
| 4460 | |
| 4461 | /* uncrustify doesn't handle C blocks properly */ |
| 4462 | /* BEGIN IGNORE CODESTYLE */ |
| 4463 | nx_port_foreach(nx, ^(nexus_port_t p) { |
| 4464 | struct nexus_adapter *na = nx_port_get_na(nx, p); |
| 4465 | if (na == NULL || na->na_work_ts == 0 || na->na_rx_rings == NULL) { |
| 4466 | return; |
| 4467 | } |
| 4468 | |
| 4469 | boolean_t purge; |
| 4470 | |
| 4471 | /* |
| 4472 | * If some activity happened in the last FSW_DRAIN_CH_THRES |
| 4473 | * seconds on this channel, we reclaim memory if the channel |
| 4474 | * throughput is less than the reap threshold value. |
| 4475 | */ |
| 4476 | if ((now - na->na_work_ts) < FSW_DRAIN_CH_THRES) { |
| 4477 | struct __kern_channel_ring *ring; |
| 4478 | channel_ring_stats *stats; |
| 4479 | uint64_t bps; |
| 4480 | |
| 4481 | ring = na->na_rx_rings; |
| 4482 | stats = &ring->ckr_stats; |
| 4483 | bps = stats->crs_bytes_per_second; |
| 4484 | |
| 4485 | if (bps < fsw_channel_reap_thresh) { |
| 4486 | purge = FALSE; |
| 4487 | na_drain(na, purge); |
| 4488 | } |
| 4489 | return; |
| 4490 | } |
| 4491 | |
| 4492 | /* |
| 4493 | * If NA has been inactive for some time (twice the drain |
| 4494 | * threshold), we clear the work timestamp to temporarily skip |
| 4495 | * this channel until it's active again. Purging cached objects |
| 4496 | * can be expensive since we'd need to allocate and construct |
| 4497 | * them again, so we do it only when necessary. |
| 4498 | */ |
| 4499 | if (low || ((now - na->na_work_ts) >= (FSW_DRAIN_CH_THRES << 1))) { |
| 4500 | na->na_work_ts = 0; |
| 4501 | purge = TRUE; |
| 4502 | } else { |
| 4503 | purge = FALSE; |
| 4504 | } |
| 4505 | |
| 4506 | na_drain(na, purge); /* purge/prune caches */ |
| 4507 | }); |
| 4508 | /* END IGNORE CODESTYLE */ |
| 4509 | |
| 4510 | FSW_RUNLOCK(fsw); |
| 4511 | } |
| 4512 | |
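| | /*
| |  * Reclaim memory from the flow-related caches. Most invocations only
| |  * prune; every fsw_flow_purge_thresh-th invocation (or under low-memory
| |  * conditions) the caches are purged outright.
| |  */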
| 4513 | static void |
| 4514 | fsw_purge_cache(struct nx_flowswitch *fsw, boolean_t low) |
| 4515 | { |
| 4516 | #pragma unused(fsw) |
| 4517 | uint64_t o = os_atomic_inc_orig(&fsw_want_purge, relaxed); |
| 4518 | uint32_t p = fsw_flow_purge_thresh; |
| 4519 | boolean_t purge = (low || (o != 0 && p != 0 && (o % p) == 0)); |
| 4520 | |
SK_DF(SK_VERB_FLOW, "%s: %s caches",
fsw->fsw_flow_mgr->fm_name,
(purge ? "purge" : "prune"));
| 4524 | |
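/* reap the flow-related caches first, then the namespace and global skmem caches */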
| 4525 | skmem_cache_reap_now(sk_fo_cache, purge); |
| 4526 | skmem_cache_reap_now(sk_fe_cache, purge); |
| 4527 | skmem_cache_reap_now(sk_fab_cache, purge); |
| 4528 | skmem_cache_reap_now(flow_route_cache, purge); |
| 4529 | skmem_cache_reap_now(flow_stats_cache, purge); |
| 4530 | netns_reap_caches(purge); |
| 4531 | skmem_reap_caches(purge); |
| 4532 | |
| 4533 | #if CONFIG_MBUF_MCACHE |
| 4534 | if (if_is_fsw_transport_netagent_enabled() && purge) { |
| 4535 | mbuf_drain(FALSE); |
| 4536 | } |
| 4537 | #endif /* CONFIG_MBUF_MCACHE */ |
| 4538 | } |
| 4539 | |
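/*
 * Mark a single flow entry as wanting the nonviable treatment in
 * response to the interface entering low power mode; the entry is
 * then picked up by the withdraw/nonviable handling later in the
 * same fsw_process_deferred() pass.
 */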
| 4540 | static void |
| 4541 | fsw_flow_handle_low_power(struct nx_flowswitch *fsw, struct flow_entry *fe) |
| 4542 | { |
| 4543 | /* When the interface is in low power mode, the flow is nonviable */ |
| 4544 | if (!(fe->fe_flags & FLOWENTF_NONVIABLE) && |
| 4545 | os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) { |
| 4546 | os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed); |
| 4547 | } |
| 4548 | } |
| 4549 | |
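/*
 * Commit pending withdraw/nonviable requests for all flow entries.
 * Work that cannot be done under the flow owner bucket locks (NECP
 * early close, netagent nexus unassignment) is deferred onto a local
 * list of flow_entry_dead records and processed after the locks are
 * dropped.  Returns the number of entries handled this way.
 */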
| 4550 | static uint32_t |
| 4551 | fsw_process_deferred(struct nx_flowswitch *fsw) |
| 4552 | { |
| 4553 | struct flow_entry_dead sfed __sk_aligned(8); |
| 4554 | struct flow_mgr *fm = fsw->fsw_flow_mgr; |
| 4555 | struct flow_entry_dead *fed, *tfed; |
| 4556 | LIST_HEAD(, flow_entry_dead) fed_head = |
| 4557 | LIST_HEAD_INITIALIZER(fed_head); |
| 4558 | uint32_t i, nonviable = 0; |
| 4559 | boolean_t lowpowermode = FALSE; |
| 4560 | |
bzero(&sfed, sizeof(sfed));
| 4562 | |
| 4563 | /* |
* Flows become nonviable when the interface enters low power
* mode; this is edge-triggered off the low power generation
* count, so it fires only once per transition.
| 4566 | */ |
| 4567 | if ((fsw->fsw_ifp->if_xflags & IFXF_LOW_POWER) && |
| 4568 | fsw->fsw_ifp->if_low_power_gencnt != fsw->fsw_low_power_gencnt) { |
| 4569 | lowpowermode = TRUE; |
| 4570 | fsw->fsw_low_power_gencnt = fsw->fsw_ifp->if_low_power_gencnt; |
| 4571 | } |
| 4572 | |
| 4573 | /* |
| 4574 | * Scan thru the flow entry tree, and commit any pending withdraw or |
| 4575 | * nonviable requests. We may need to push stats and/or unassign the |
| 4576 | * nexus from NECP, but we cannot do that while holding the locks; |
| 4577 | * build a temporary list for those entries. |
| 4578 | */ |
| 4579 | for (i = 0; i < fm->fm_owner_buckets_cnt; i++) { |
struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
| 4581 | struct flow_owner *fo; |
| 4582 | |
| 4583 | /* |
| 4584 | * Grab the lock at all costs when handling low power mode |
| 4585 | */ |
| 4586 | if (__probable(!lowpowermode)) { |
| 4587 | if (!FOB_TRY_LOCK(fob)) { |
| 4588 | continue; |
| 4589 | } |
| 4590 | } else { |
| 4591 | FOB_LOCK(fob); |
| 4592 | } |
| 4593 | |
| 4594 | FOB_LOCK_ASSERT_HELD(fob); |
| 4595 | RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) { |
| 4596 | struct flow_entry *fe; |
| 4597 | |
| 4598 | RB_FOREACH(fe, flow_entry_id_tree, |
| 4599 | &fo->fo_flow_entry_id_head) { |
| 4600 | /* try first as reader; skip if we can't */ |
| 4601 | if (__improbable(lowpowermode)) { |
| 4602 | fsw_flow_handle_low_power(fsw, fe); |
| 4603 | } |
| 4604 | if (__improbable(fe->fe_flags & FLOWENTF_HALF_CLOSED)) { |
| 4605 | os_atomic_andnot(&fe->fe_flags, FLOWENTF_HALF_CLOSED, relaxed); |
flow_namespace_half_close(&fe->fe_port_reservation);
| 4607 | } |
| 4608 | |
| 4609 | /* if not withdrawn/nonviable, skip */ |
| 4610 | if (!fe->fe_want_withdraw && |
| 4611 | !fe->fe_want_nonviable) { |
| 4612 | continue; |
| 4613 | } |
| 4614 | /* |
| 4615 | * Here we're holding the lock as writer; |
| 4616 | * don't spend too much time as we're |
| 4617 | * blocking the data path now. |
| 4618 | */ |
| 4619 | ASSERT(!uuid_is_null(fe->fe_uuid)); |
| 4620 | /* only need flow UUID and booleans */ |
uuid_copy(sfed.fed_uuid, fe->fe_uuid);
| 4622 | sfed.fed_want_clonotify = |
| 4623 | (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY); |
| 4624 | sfed.fed_want_nonviable = fe->fe_want_nonviable; |
| 4625 | flow_entry_teardown(fo, fe); |
| 4626 | |
/* queue it up; the NECP/netagent work happens outside the locks below */
| 4628 | fed = flow_entry_dead_alloc(Z_WAITOK); |
| 4629 | ASSERT(fed != NULL); |
| 4630 | *fed = sfed; |
| 4631 | LIST_INSERT_HEAD(&fed_head, fed, fed_link); |
| 4632 | } |
| 4633 | } |
| 4634 | FOB_UNLOCK(fob); |
| 4635 | } |
| 4636 | |
| 4637 | /* |
| 4638 | * These nonviable flows are no longer useful since we've lost |
| 4639 | * the source IP address; in the event the client monitors the |
| 4640 | * viability of the flow, explicitly mark it as nonviable so |
| 4641 | * that a new flow can be created. |
| 4642 | */ |
| 4643 | LIST_FOREACH_SAFE(fed, &fed_head, fed_link, tfed) { |
| 4644 | LIST_REMOVE(fed, fed_link); |
| 4645 | ASSERT(fsw->fsw_agent_session != NULL); |
| 4646 | |
| 4647 | /* if flow is closed early */ |
| 4648 | if (fed->fed_want_clonotify) { |
necp_client_early_close(fed->fed_uuid);
| 4650 | } |
| 4651 | |
| 4652 | /* if nonviable, unassign nexus attributes */ |
| 4653 | if (fed->fed_want_nonviable) { |
(void) netagent_assign_nexus(fsw->fsw_agent_session,
fed->fed_uuid, NULL, 0);
| 4656 | } |
| 4657 | |
| 4658 | flow_entry_dead_free(fed); |
| 4659 | ++nonviable; |
| 4660 | } |
| 4661 | ASSERT(LIST_EMPTY(&fed_head)); |
| 4662 | |
| 4663 | return nonviable; |
| 4664 | } |
| 4665 | |
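/*
 * Process the lingering (torn-down) flow entries: generate TCP RSTs
 * for flows that still need to abort the remote peer, free the
 * entries whose linger period has expired, and put the rest back at
 * the head of the linger list.  Returns the number of entries freed;
 * *abort is set to the number of RSTs generated.
 */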
| 4666 | static uint32_t |
| 4667 | fsw_process_linger(struct nx_flowswitch *fsw, uint32_t *abort) |
| 4668 | { |
| 4669 | struct flow_entry_linger_head linger_head = |
| 4670 | TAILQ_HEAD_INITIALIZER(linger_head); |
| 4671 | struct flow_entry *fe, *tfe; |
| 4672 | uint64_t now = _net_uptime; |
| 4673 | uint32_t i = 0, cnt = 0, freed = 0; |
| 4674 | |
| 4675 | ASSERT(fsw->fsw_ifp != NULL); |
| 4676 | ASSERT(abort != NULL); |
| 4677 | *abort = 0; |
| 4678 | |
| 4679 | /* |
| 4680 | * We don't want to contend with the datapath, so move |
| 4681 | * everything that's in the linger list into a local list. |
| 4682 | * This allows us to generate RSTs or free the flow entry |
| 4683 | * outside the lock. Any remaining flow entry in the local |
| 4684 | * list will get re-added back to the head of the linger |
| 4685 | * list, in front of any new ones added since then. |
| 4686 | */ |
lck_mtx_lock(&fsw->fsw_linger_lock);
| 4688 | TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link); |
| 4689 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head)); |
| 4690 | cnt = fsw->fsw_linger_cnt; |
| 4691 | fsw->fsw_linger_cnt = 0; |
lck_mtx_unlock(&fsw->fsw_linger_lock);
| 4693 | |
| 4694 | TAILQ_FOREACH_SAFE(fe, &linger_head, fe_linger_link, tfe) { |
| 4695 | ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN); |
| 4696 | ASSERT(fe->fe_flags & FLOWENTF_DESTROYED); |
| 4697 | ASSERT(fe->fe_flags & FLOWENTF_LINGERING); |
| 4698 | |
| 4699 | /* |
| 4700 | * See if this is a TCP flow that needs to generate |
| 4701 | * a RST to the remote peer (if not already). |
| 4702 | */ |
| 4703 | if (flow_track_tcp_want_abort(fe)) { |
| 4704 | VERIFY(fe->fe_flags & FLOWENTF_ABORTED); |
| 4705 | ASSERT(!uuid_is_null(fe->fe_uuid)); |
| 4706 | flow_track_abort_tcp(fe, NULL, NULL); |
| 4707 | (*abort)++; |
| 4708 | SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]); |
SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx "
"flags 0x%b [RST]", fe_as_string(fe, dbgbuf,
sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
FLOWENTF_BITS);
| 4713 | } |
| 4714 | |
| 4715 | /* |
| 4716 | * If flow has expired, remove from list and free; |
| 4717 | * otherwise leave it around in the linger list. |
| 4718 | */ |
| 4719 | if (fe->fe_linger_expire <= now) { |
| 4720 | freed++; |
fsw_linger_remove_internal(&linger_head, fe);
| 4722 | fe = NULL; |
| 4723 | } |
| 4724 | ++i; |
| 4725 | } |
| 4726 | VERIFY(i == cnt && cnt >= freed); |
| 4727 | |
| 4728 | /* |
| 4729 | * Add any remaining ones back into the linger list. |
| 4730 | */ |
lck_mtx_lock(&fsw->fsw_linger_lock);
| 4732 | if (!TAILQ_EMPTY(&linger_head)) { |
| 4733 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head) || fsw->fsw_linger_cnt); |
| 4734 | TAILQ_CONCAT(&linger_head, &fsw->fsw_linger_head, fe_linger_link); |
| 4735 | ASSERT(TAILQ_EMPTY(&fsw->fsw_linger_head)); |
| 4736 | TAILQ_CONCAT(&fsw->fsw_linger_head, &linger_head, fe_linger_link); |
| 4737 | fsw->fsw_linger_cnt += (cnt - freed); |
| 4738 | } |
| 4739 | ASSERT(TAILQ_EMPTY(&linger_head)); |
lck_mtx_unlock(&fsw->fsw_linger_lock);
| 4741 | |
| 4742 | return freed; |
| 4743 | } |
| 4744 | |
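/*
 * Update the interface's inbound per-traffic-class packet and byte
 * counters for a single packet, keyed off the packet's traffic class.
 */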
| 4745 | __attribute__((always_inline)) |
| 4746 | static inline void |
| 4747 | fsw_ifp_inc_traffic_class_in_pkt(struct ifnet *ifp, kern_packet_t ph) |
| 4748 | { |
| 4749 | switch (__packet_get_traffic_class(ph)) { |
| 4750 | case PKT_TC_BE: |
| 4751 | ifp->if_tc.ifi_ibepackets++; |
| 4752 | ifp->if_tc.ifi_ibebytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
| 4753 | break; |
| 4754 | case PKT_TC_BK: |
| 4755 | ifp->if_tc.ifi_ibkpackets++; |
| 4756 | ifp->if_tc.ifi_ibkbytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
| 4757 | break; |
| 4758 | case PKT_TC_VI: |
| 4759 | ifp->if_tc.ifi_ivipackets++; |
| 4760 | ifp->if_tc.ifi_ivibytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
| 4761 | break; |
| 4762 | case PKT_TC_VO: |
| 4763 | ifp->if_tc.ifi_ivopackets++; |
| 4764 | ifp->if_tc.ifi_ivobytes += SK_PTR_ADDR_KPKT(ph)->pkt_length; |
| 4765 | break; |
| 4766 | default: |
| 4767 | break; |
| 4768 | } |
| 4769 | } |
| 4770 | |
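/*
 * Update the interface's outbound per-traffic-class counters; unlike
 * the inbound variant, the caller supplies the service class along
 * with aggregate packet and byte counts.  For example (illustrative
 * only; "cnt" and "bytes" are caller-maintained totals), after
 * transmitting a batch of best-effort packets a caller might do:
 *
 *	fsw_ifp_inc_traffic_class_out_pkt(ifp, PKT_TC_BE, cnt, bytes);
 */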
| 4771 | __attribute__((always_inline)) |
| 4772 | static inline void |
| 4773 | fsw_ifp_inc_traffic_class_out_pkt(struct ifnet *ifp, uint32_t svc, |
| 4774 | uint32_t cnt, uint32_t len) |
| 4775 | { |
| 4776 | switch (svc) { |
| 4777 | case PKT_TC_BE: |
| 4778 | ifp->if_tc.ifi_obepackets += cnt; |
| 4779 | ifp->if_tc.ifi_obebytes += len; |
| 4780 | break; |
| 4781 | case PKT_TC_BK: |
| 4782 | ifp->if_tc.ifi_obkpackets += cnt; |
| 4783 | ifp->if_tc.ifi_obkbytes += len; |
| 4784 | break; |
| 4785 | case PKT_TC_VI: |
| 4786 | ifp->if_tc.ifi_ovipackets += cnt; |
| 4787 | ifp->if_tc.ifi_ovibytes += len; |
| 4788 | break; |
| 4789 | case PKT_TC_VO: |
| 4790 | ifp->if_tc.ifi_ovopackets += cnt; |
| 4791 | ifp->if_tc.ifi_ovobytes += len; |
| 4792 | break; |
| 4793 | default: |
| 4794 | break; |
| 4795 | } |
| 4796 | } |
| 4797 | |