/*
 * Copyright (c) 2016-2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>

#include <dev/random/randomdev.h>
#include <net/flowhash.h>
#include <netkey/key.h>

#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/flow/flow_var.h>
#include <skywalk/nexus/netif/nx_netif.h>
#include <skywalk/namespace/flowidns.h>

struct flow_entry *fe_alloc(boolean_t);
static void fe_free(struct flow_entry *);
static int fe_id_cmp(const struct flow_entry *, const struct flow_entry *);
static void fe_stats_init(struct flow_entry *);
static void fe_stats_update(struct flow_entry *);

RB_GENERATE_PREV(flow_entry_id_tree, flow_entry, fe_id_link, fe_id_cmp);

os_refgrp_decl(static, flow_entry_refgrp, "flow_entry", NULL);

KALLOC_TYPE_DECLARE(sk_fed_zone);

const struct flow_key fk_mask_2tuple
__sk_aligned(16) =
{
	.fk_mask = FKMASK_2TUPLE,
	.fk_ipver = 0,
	.fk_proto = 0xff,
	.fk_sport = 0xffff,
	.fk_dport = 0,
	.fk_src._addr64[0] = 0,
	.fk_src._addr64[1] = 0,
	.fk_dst._addr64[0] = 0,
	.fk_dst._addr64[1] = 0,
	.fk_pad[0] = 0,
};

const struct flow_key fk_mask_3tuple
__sk_aligned(16) =
{
	.fk_mask = FKMASK_3TUPLE,
	.fk_ipver = 0xff,
	.fk_proto = 0xff,
	.fk_sport = 0xffff,
	.fk_dport = 0,
	.fk_src._addr64[0] = 0xffffffffffffffffULL,
	.fk_src._addr64[1] = 0xffffffffffffffffULL,
	.fk_dst._addr64[0] = 0,
	.fk_dst._addr64[1] = 0,
	.fk_pad[0] = 0,
};

const struct flow_key fk_mask_4tuple
__sk_aligned(16) =
{
	.fk_mask = FKMASK_4TUPLE,
	.fk_ipver = 0xff,
	.fk_proto = 0xff,
	.fk_sport = 0xffff,
	.fk_dport = 0xffff,
	.fk_src._addr64[0] = 0xffffffffffffffffULL,
	.fk_src._addr64[1] = 0xffffffffffffffffULL,
	.fk_dst._addr64[0] = 0,
	.fk_dst._addr64[1] = 0,
	.fk_pad[0] = 0,
};

const struct flow_key fk_mask_5tuple
__sk_aligned(16) =
{
	.fk_mask = FKMASK_5TUPLE,
	.fk_ipver = 0xff,
	.fk_proto = 0xff,
	.fk_sport = 0xffff,
	.fk_dport = 0xffff,
	.fk_src._addr64[0] = 0xffffffffffffffffULL,
	.fk_src._addr64[1] = 0xffffffffffffffffULL,
	.fk_dst._addr64[0] = 0xffffffffffffffffULL,
	.fk_dst._addr64[1] = 0xffffffffffffffffULL,
	.fk_pad[0] = 0,
};

const struct flow_key fk_mask_ipflow1
__sk_aligned(16) =
{
	.fk_mask = FKMASK_IPFLOW1,
	.fk_ipver = 0,
	.fk_proto = 0xff,
	.fk_sport = 0,
	.fk_dport = 0,
	.fk_src._addr64[0] = 0,
	.fk_src._addr64[1] = 0,
	.fk_dst._addr64[0] = 0,
	.fk_dst._addr64[1] = 0,
	.fk_pad[0] = 0,
};

const struct flow_key fk_mask_ipflow2
__sk_aligned(16) =
{
	.fk_mask = FKMASK_IPFLOW2,
	.fk_ipver = 0xff,
	.fk_proto = 0xff,
	.fk_sport = 0,
	.fk_dport = 0,
	.fk_src._addr64[0] = 0xffffffffffffffffULL,
	.fk_src._addr64[1] = 0xffffffffffffffffULL,
	.fk_dst._addr64[0] = 0,
	.fk_dst._addr64[1] = 0,
	.fk_pad[0] = 0,
};

const struct flow_key fk_mask_ipflow3
__sk_aligned(16) =
{
	.fk_mask = FKMASK_IPFLOW3,
	.fk_ipver = 0xff,
	.fk_proto = 0xff,
	.fk_sport = 0,
	.fk_dport = 0,
	.fk_src._addr64[0] = 0xffffffffffffffffULL,
	.fk_src._addr64[1] = 0xffffffffffffffffULL,
	.fk_dst._addr64[0] = 0xffffffffffffffffULL,
	.fk_dst._addr64[1] = 0xffffffffffffffffULL,
	.fk_pad[0] = 0,
};

struct flow_owner *
flow_owner_find_by_pid(struct flow_owner_bucket *fob, pid_t pid, void *context,
    bool low_latency)
{
	struct flow_owner find = { .fo_context = context, .fo_pid = pid,
	    .fo_low_latency = low_latency};

	ASSERT(low_latency == true || low_latency == false);
	FOB_LOCK_ASSERT_HELD(fob);
	return RB_FIND(flow_owner_tree, &fob->fob_owner_head, &find);
}

struct flow_entry *
flow_entry_find_by_uuid(struct flow_owner *fo, uuid_t uuid)
{
	struct flow_entry find, *fe = NULL;
	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	uuid_copy(find.fe_uuid, uuid);
	fe = RB_FIND(flow_entry_id_tree, &fo->fo_flow_entry_id_head, &find);
	if (fe != NULL) {
		flow_entry_retain(fe);
	}

	return fe;
}

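/*
 * Derive the flowswitch-domain flow ID for an entry: the flow key's
 * addresses, ports, protocol and address family are packed into a
 * flowidns_flow_key and handed to the flow ID namespace allocator.
 */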
static uint32_t
flow_entry_calc_flowid(struct flow_entry *fe)
{
	uint32_t flowid;
	struct flowidns_flow_key fk;

	bzero(&fk, sizeof(fk));
	_CASSERT(sizeof(fe->fe_key.fk_src) == sizeof(fk.ffk_laddr));
	_CASSERT(sizeof(fe->fe_key.fk_dst) == sizeof(fk.ffk_raddr));
	bcopy(&fe->fe_key.fk_src, &fk.ffk_laddr, sizeof(fk.ffk_laddr));
	bcopy(&fe->fe_key.fk_dst, &fk.ffk_raddr, sizeof(fk.ffk_raddr));

	fk.ffk_lport = fe->fe_key.fk_sport;
	fk.ffk_rport = fe->fe_key.fk_dport;
	fk.ffk_af = (fe->fe_key.fk_ipver == 4) ? AF_INET : AF_INET6;
	fk.ffk_proto = fe->fe_key.fk_proto;

	flowidns_allocate_flowid(FLOWIDNS_DOMAIN_FLOWSWITCH, &fk, &flowid);
	return flowid;
}

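/*
 * Attach child_fe to parent_fe's child list and take a reference on it.
 * Fails if the parent has gone nonviable or a child with the same UUID is
 * already present; nonviable children encountered during the scan are
 * unlinked and released along the way.
 */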
static bool
flow_entry_add_child(struct flow_entry *parent_fe, struct flow_entry *child_fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	ASSERT(parent_fe->fe_flags & FLOWENTF_PARENT);

	lck_rw_lock_exclusive(&parent_fe->fe_child_list_lock);

	if (parent_fe->fe_flags & FLOWENTF_NONVIABLE) {
		SK_ERR("child entry add failed, parent fe \"%s\" non viable 0x%llx "
		    "flags 0x%b %s(%d)", fe_as_string(parent_fe,
		    dbgbuf, sizeof(dbgbuf)), SK_KVA(parent_fe), parent_fe->fe_flags,
		    FLOWENTF_BITS, parent_fe->fe_proc_name,
		    parent_fe->fe_pid);
		lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock);
		return false;
	}

	struct flow_entry *fe, *tfe;
	TAILQ_FOREACH_SAFE(fe, &parent_fe->fe_child_list, fe_child_link, tfe) {
		if (!fe_id_cmp(fe, child_fe)) {
			lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock);
			SK_ERR("child entry \"%s\" already exists at fe 0x%llx "
			    "flags 0x%b %s(%d)", fe_as_string(fe,
			    dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
			    FLOWENTF_BITS, fe->fe_proc_name,
			    fe->fe_pid);
			return false;
		}

		if (fe->fe_flags & FLOWENTF_NONVIABLE) {
			TAILQ_REMOVE(&parent_fe->fe_child_list, fe, fe_child_link);
			ASSERT(--parent_fe->fe_child_count >= 0);
			flow_entry_release(&fe);
		}
	}

	flow_entry_retain(child_fe);
	TAILQ_INSERT_TAIL(&parent_fe->fe_child_list, child_fe, fe_child_link);
	ASSERT(++parent_fe->fe_child_count > 0);

	lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock);

	return true;
}

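/*
 * Unlink and release every child of parent_fe.  Children not already
 * nonviable are flagged as wanting-nonviable (bumping the flowswitch's
 * pending-nonviable hint), and the reaper thread is scheduled if any
 * such child was found.
 */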
static void
flow_entry_remove_all_children(struct flow_entry *parent_fe, struct nx_flowswitch *fsw)
{
	bool sched_reaper_thread = false;

	ASSERT(parent_fe->fe_flags & FLOWENTF_PARENT);

	lck_rw_lock_exclusive(&parent_fe->fe_child_list_lock);

	struct flow_entry *fe, *tfe;
	TAILQ_FOREACH_SAFE(fe, &parent_fe->fe_child_list, fe_child_link, tfe) {
		if (!(fe->fe_flags & FLOWENTF_NONVIABLE)) {
			/*
			 * fsw_pending_nonviable is a hint for reaper thread;
			 * due to the fact that setting fe_want_nonviable and
			 * incrementing fsw_pending_nonviable counter is not
			 * atomic, let the increment happen first, and the
			 * thread losing the CAS does decrement.
			 */
			os_atomic_inc(&fsw->fsw_pending_nonviable, relaxed);
			if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 0, 1, acq_rel)) {
				sched_reaper_thread = true;
			} else {
				os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
			}
		}

		TAILQ_REMOVE(&parent_fe->fe_child_list, fe, fe_child_link);
		ASSERT(--parent_fe->fe_child_count >= 0);
		flow_entry_release(&fe);
	}

	lck_rw_unlock_exclusive(&parent_fe->fe_child_list_lock);

	if (sched_reaper_thread) {
		fsw_reap_sched(fsw);
	}
}

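/*
 * Copy the demux patterns from the flow request into a child entry and
 * pick the optimized memcmp-with-mask routine for 16- and 32-byte
 * patterns; longer patterns are not supported.
 */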
static void
flow_entry_set_demux_patterns(struct flow_entry *fe, struct nx_flow_req *req)
{
	ASSERT(fe->fe_flags & FLOWENTF_CHILD);
	ASSERT(req->nfr_flow_demux_count > 0);

	fe->fe_demux_patterns = sk_alloc_type_array(struct kern_flow_demux_pattern,
	    req->nfr_flow_demux_count, Z_WAITOK | Z_NOFAIL, skmem_tag_flow_demux);

	for (int i = 0; i < req->nfr_flow_demux_count; i++) {
		bcopy(&req->nfr_flow_demux_patterns[i],
		    &fe->fe_demux_patterns[i].fdp_demux_pattern,
		    sizeof(struct flow_demux_pattern));

		fe->fe_demux_patterns[i].fdp_memcmp_mask = NULL;
		if (req->nfr_flow_demux_patterns[i].fdp_len == 16) {
			fe->fe_demux_patterns[i].fdp_memcmp_mask = sk_memcmp_mask_16B;
		} else if (req->nfr_flow_demux_patterns[i].fdp_len == 32) {
			fe->fe_demux_patterns[i].fdp_memcmp_mask = sk_memcmp_mask_32B;
		} else if (req->nfr_flow_demux_patterns[i].fdp_len > 32) {
			VERIFY(0);
		}
	}

	fe->fe_demux_pattern_count = req->nfr_flow_demux_count;
}

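/*
 * Translate a flow key into an inet traffic descriptor, filling in only
 * the fields covered by the key's mask; used below when matching a flow
 * against interface traffic rules for queue set selection.
 */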
static int
convert_flowkey_to_inet_td(struct flow_key *key,
    struct ifnet_traffic_descriptor_inet *td)
{
	if ((key->fk_mask & FKMASK_IPVER) != 0) {
		td->inet_ipver = key->fk_ipver;
		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_IPVER;
	}
	if ((key->fk_mask & FKMASK_PROTO) != 0) {
		td->inet_proto = key->fk_proto;
		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_PROTO;
	}
	if ((key->fk_mask & FKMASK_SRC) != 0) {
		if (td->inet_ipver == IPVERSION) {
			bcopy(&key->fk_src4, &td->inet_laddr.iia_v4addr,
			    sizeof(key->fk_src4));
		} else {
			bcopy(&key->fk_src6, &td->inet_laddr,
			    sizeof(key->fk_src6));
		}
		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_LADDR;
	}
	if ((key->fk_mask & FKMASK_DST) != 0) {
		if (td->inet_ipver == IPVERSION) {
			bcopy(&key->fk_dst4, &td->inet_raddr.iia_v4addr,
			    sizeof(key->fk_dst4));
		} else {
			bcopy(&key->fk_dst6, &td->inet_raddr,
			    sizeof(key->fk_dst6));
		}
		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_RADDR;
	}
	if ((key->fk_mask & FKMASK_SPORT) != 0) {
		td->inet_lport = key->fk_sport;
		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_LPORT;
	}
	if ((key->fk_mask & FKMASK_DPORT) != 0) {
		td->inet_rport = key->fk_dport;
		td->inet_mask |= IFNET_TRAFFIC_DESCRIPTOR_INET_RPORT;
	}
	td->inet_common.itd_type = IFNET_TRAFFIC_DESCRIPTOR_TYPE_INET;
	td->inet_common.itd_len = sizeof(*td);
	td->inet_common.itd_flags = IFNET_TRAFFIC_DESCRIPTOR_FLAG_INBOUND |
	    IFNET_TRAFFIC_DESCRIPTOR_FLAG_OUTBOUND;
	return 0;
}

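/*
 * (Re)evaluate the dynamic queue set selection for a flow against the
 * interface's traffic rules.  If skip_if_no_change is set and the rule
 * generation count has not moved, the current selection is left alone.
 */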
void
flow_qset_select_dynamic(struct nx_flowswitch *fsw, struct flow_entry *fe,
    boolean_t skip_if_no_change)
{
	struct ifnet_traffic_descriptor_inet td;
	struct ifnet *ifp;
	uint64_t qset_id;
	struct nx_netif *nif;
	boolean_t changed;
	int err;

	ifp = fsw->fsw_ifp;
	changed = ifnet_sync_traffic_rule_genid(ifp, &fe->fe_tr_genid);
	if (!changed && skip_if_no_change) {
		return;
	}
	if (fe->fe_qset != NULL) {
		nx_netif_qset_release(&fe->fe_qset);
		ASSERT(fe->fe_qset == NULL);
	}
	if (ifp->if_traffic_rule_count == 0) {
		DTRACE_SKYWALK2(no__rules, struct nx_flowswitch *, fsw,
		    struct flow_entry *, fe);
		return;
	}
	err = convert_flowkey_to_inet_td(&fe->fe_key, &td);
	ASSERT(err == 0);
	err = nxctl_inet_traffic_rule_find_qset_id(ifp->if_xname, &td, &qset_id);
	if (err != 0) {
		DTRACE_SKYWALK3(qset__id__not__found,
		    struct nx_flowswitch *, fsw,
		    struct flow_entry *, fe,
		    struct ifnet_traffic_descriptor_inet *, &td);
		return;
	}
	DTRACE_SKYWALK4(qset__id__found, struct nx_flowswitch *, fsw,
	    struct flow_entry *, fe, struct ifnet_traffic_descriptor_inet *,
	    &td, uint64_t, qset_id);
	nif = NX_NETIF_PRIVATE(fsw->fsw_dev_ch->ch_na->na_nx);
	ASSERT(fe->fe_qset == NULL);
	fe->fe_qset = nx_netif_find_qset(nif, qset_id);
}

/* writer-lock must be owned for memory management functions */
struct flow_entry *
flow_entry_alloc(struct flow_owner *fo, struct nx_flow_req *req, int *perr)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	nexus_port_t nx_port = req->nfr_nx_port;
	struct flow_entry *fe = NULL;
	struct flow_entry *parent_fe = NULL;
	flowadv_idx_t fadv_idx = FLOWADV_IDX_NONE;
	struct nexus_adapter *dev_na;
	struct nx_netif *nif;
	int err;

	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));
	ASSERT(nx_port != NEXUS_PORT_ANY);
	ASSERT(!fo->fo_nx_port_destroyed);

	*perr = 0;

	struct flow_key key __sk_aligned(16);
	err = flow_req2key(req, &key);
	if (__improbable(err != 0)) {
		SK_ERR("invalid request (err %d)", err);
		goto done;
	}

	struct flow_mgr *fm = fo->fo_fsw->fsw_flow_mgr;
	fe = flow_mgr_find_conflicting_fe(fm, &key);
	if (fe != NULL) {
		if ((fe->fe_flags & FLOWENTF_PARENT) &&
		    uuid_compare(fe->fe_uuid, req->nfr_parent_flow_uuid) == 0) {
			parent_fe = fe;
			fe = NULL;
		} else {
			SK_ERR("entry \"%s\" already exists at fe 0x%llx "
			    "flags 0x%b %s(%d)", fe_as_string(fe,
			    dbgbuf, sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags,
			    FLOWENTF_BITS, fe->fe_proc_name,
			    fe->fe_pid);
			/* don't return it */
			flow_entry_release(&fe);
			err = EEXIST;
			goto done;
		}
	} else if (!uuid_is_null(req->nfr_parent_flow_uuid)) {
		uuid_string_t uuid_str;
		sk_uuid_unparse(req->nfr_parent_flow_uuid, uuid_str);
		SK_ERR("parent entry \"%s\" does not exist", uuid_str);
		err = ENOENT;
		goto done;
	}

	if ((req->nfr_flags & NXFLOWREQF_FLOWADV) &&
	    (flow_owner_flowadv_index_alloc(fo, &fadv_idx) != 0)) {
		SK_ERR("failed to alloc flowadv index for flow %s",
		    sk_uuid_unparse(req->nfr_flow_uuid, dbgbuf));
		/* XXX: what is the most appropriate error code ? */
		err = ENOSPC;
		goto done;
	}

	fe = fe_alloc(TRUE);
	if (__improbable(fe == NULL)) {
		err = ENOMEM;
		goto done;
	}

	fe->fe_key = key;
	if (req->nfr_route != NULL) {
		fe->fe_laddr_gencnt = req->nfr_route->fr_laddr_gencnt;
	} else {
		fe->fe_laddr_gencnt = req->nfr_saddr_gencnt;
	}

	if (__improbable(req->nfr_flags & NXFLOWREQF_LISTENER)) {
		/* mark this as listener mode */
		os_atomic_or(&fe->fe_flags, FLOWENTF_LISTENER, relaxed);
	} else {
		ASSERT((fe->fe_key.fk_ipver == IPVERSION &&
		    fe->fe_key.fk_src4.s_addr != INADDR_ANY) ||
		    (fe->fe_key.fk_ipver == IPV6_VERSION &&
		    !IN6_IS_ADDR_UNSPECIFIED(&fe->fe_key.fk_src6)));

		/* mark this as connected mode */
		os_atomic_or(&fe->fe_flags, FLOWENTF_CONNECTED, relaxed);
	}

	if (req->nfr_flags & NXFLOWREQF_NOWAKEFROMSLEEP) {
		fe->fe_flags |= FLOWENTF_NOWAKEFROMSLEEP;
	}
	fe->fe_port_reservation = req->nfr_port_reservation;
	req->nfr_port_reservation = NULL;
	if (req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV) {
		fe->fe_flags |= FLOWENTF_EXTRL_PORT;
	}
	fe->fe_proto_reservation = req->nfr_proto_reservation;
	req->nfr_proto_reservation = NULL;
	if (req->nfr_flags & NXFLOWREQF_EXT_PROTO_RSV) {
		fe->fe_flags |= FLOWENTF_EXTRL_PROTO;
	}
	fe->fe_ipsec_reservation = req->nfr_ipsec_reservation;
	req->nfr_ipsec_reservation = NULL;

	fe->fe_tx_process = dp_flow_tx_process;
	fe->fe_rx_process = dp_flow_rx_process;

	dev_na = fo->fo_fsw->fsw_dev_ch->ch_na;
	nif = NX_NETIF_PRIVATE(dev_na->na_nx);
	if (NX_LLINK_PROV(nif->nif_nx) &&
	    (fe->fe_key.fk_mask & (FKMASK_IPVER | FKMASK_PROTO | FKMASK_DST)) ==
	    (FKMASK_IPVER | FKMASK_PROTO | FKMASK_DST)) {
		if (req->nfr_qset_id != 0) {
			fe->fe_qset_select = FE_QSET_SELECT_FIXED;
			fe->fe_qset_id = req->nfr_qset_id;
			fe->fe_qset = nx_netif_find_qset(nif, req->nfr_qset_id);
		} else {
			fe->fe_qset_select = FE_QSET_SELECT_DYNAMIC;
			fe->fe_qset_id = 0;
			flow_qset_select_dynamic(fo->fo_fsw, fe, FALSE);
		}
	} else {
		fe->fe_qset_select = FE_QSET_SELECT_NONE;
	}
	if (req->nfr_flags & NXFLOWREQF_LOW_LATENCY) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_LOW_LATENCY, relaxed);
	}

	fe->fe_transport_protocol = req->nfr_transport_protocol;
	if (NX_FSW_TCP_RX_AGG_ENABLED() &&
	    (fo->fo_fsw->fsw_nx->nx_prov->nxprov_params->nxp_max_frags > 1) &&
	    (fe->fe_key.fk_proto == IPPROTO_TCP) &&
	    (fe->fe_key.fk_mask == FKMASK_5TUPLE)) {
		fe->fe_rx_process = flow_rx_agg_tcp;
	}
	uuid_copy(fe->fe_uuid, req->nfr_flow_uuid);
	if ((req->nfr_flags & NXFLOWREQF_LISTENER) == 0 &&
	    (req->nfr_flags & NXFLOWREQF_TRACK) != 0) {
		switch (req->nfr_ip_protocol) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
			os_atomic_or(&fe->fe_flags, FLOWENTF_TRACK, relaxed);
			break;
		default:
			break;
		}
	}

	if (req->nfr_flags & NXFLOWREQF_QOS_MARKING) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_QOS_MARKING, relaxed);
	}

	if (req->nfr_flags & NXFLOWREQF_PARENT) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_PARENT, relaxed);
		TAILQ_INIT(&fe->fe_child_list);
		lck_rw_init(&fe->fe_child_list_lock, &nexus_lock_group,
		    &nexus_lock_attr);
	}

	if (req->nfr_route != NULL) {
		fe->fe_route = req->nfr_route;
		req->nfr_route = NULL;
	}

	fe->fe_nx_port = nx_port;
	fe->fe_adv_idx = fadv_idx;

	if (req->nfr_inp_flowhash != 0) {
		/*
		 * BSD flow, use the inpcb flow hash value
		 */
		fe->fe_flowid = req->nfr_inp_flowhash;
		fe->fe_flags |= FLOWENTF_EXTRL_FLOWID;
	} else {
		fe->fe_flowid = flow_entry_calc_flowid(fe);
	}

	if (fe->fe_adv_idx != FLOWADV_IDX_NONE && fo->fo_nx_port_na != NULL) {
		na_flowadv_entry_alloc(fo->fo_nx_port_na, fe->fe_uuid,
		    fe->fe_adv_idx, fe->fe_flowid);
	}

	if (KPKT_VALID_SVC(req->nfr_svc_class)) {
		fe->fe_svc_class = (kern_packet_svc_class_t)req->nfr_svc_class;
	} else {
		fe->fe_svc_class = KPKT_SC_BE;
	}

	uuid_copy(fe->fe_eproc_uuid, req->nfr_euuid);
	fe->fe_policy_id = req->nfr_policy_id;
	fe->fe_skip_policy_id = req->nfr_skip_policy_id;

	err = flow_mgr_flow_hash_mask_add(fm, fe->fe_key.fk_mask);
	ASSERT(err == 0);

	if (parent_fe != NULL) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_CHILD, relaxed);
		flow_entry_set_demux_patterns(fe, req);
		fe->fe_demux_pkt_data = sk_alloc_data(FLOW_DEMUX_MAX_LEN,
		    Z_WAITOK | Z_NOFAIL, skmem_tag_flow_demux);
		if (!flow_entry_add_child(parent_fe, fe)) {
			goto done;
		}
	} else {
		fe->fe_key_hash = flow_key_hash(&fe->fe_key);
		err = cuckoo_hashtable_add_with_hash(fm->fm_flow_table,
		    &fe->fe_cnode, fe->fe_key_hash);
		if (err != 0) {
			SK_ERR("flow table add failed (err %d)", err);
			flow_mgr_flow_hash_mask_del(fm, fe->fe_key.fk_mask);
			goto done;
		}
	}

	RB_INSERT(flow_entry_id_tree, &fo->fo_flow_entry_id_head, fe);
	flow_entry_retain(fe); /* one refcnt in id_tree */

	*(struct nx_flowswitch **)(uintptr_t)&fe->fe_fsw = fo->fo_fsw;
	fe->fe_pid = fo->fo_pid;
	if (req->nfr_epid != -1 && req->nfr_epid != fo->fo_pid) {
		fe->fe_epid = req->nfr_epid;
		proc_name(fe->fe_epid, fe->fe_eproc_name,
		    sizeof(fe->fe_eproc_name));
	} else {
		fe->fe_epid = -1;
	}

	(void) snprintf(fe->fe_proc_name, sizeof(fe->fe_proc_name), "%s",
	    fo->fo_name);

	fe_stats_init(fe);
	flow_stats_retain(fe->fe_stats);
	req->nfr_flow_stats = fe->fe_stats;

#if SK_LOG
	SK_DF(SK_VERB_FLOW, "allocated entry \"%s\" fe 0x%llx flags 0x%b "
	    "[fo 0x%llx ]", fe_as_string(fe, dbgbuf,
	    sizeof(dbgbuf)), SK_KVA(fe), fe->fe_flags, FLOWENTF_BITS,
	    SK_KVA(fo));
#endif /* SK_LOG */

done:
	if (parent_fe != NULL) {
		flow_entry_release(&parent_fe);
	}
	if (err != 0) {
		if (fadv_idx != FLOWADV_IDX_NONE) {
			flow_owner_flowadv_index_free(fo, fadv_idx);
		}
		if (fe != NULL) {
			flow_entry_release(&fe);
		}
	}
	*perr = err;
	return fe;
}

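/*
 * Quiesce a flow entry: resolve any pending nonviable request, withdraw
 * the local port namespace reservation, free the flow advisory slot, and
 * detach all child flows.  May be called more than once on the same entry.
 */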
void
flow_entry_teardown(struct flow_owner *fo, struct flow_entry *fe)
{
#if SK_LOG
	char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
	SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b [fo 0x%llx] "
	    "non_via %d withdrawn %d", fe_as_string(fe, dbgbuf, sizeof(dbgbuf)),
	    SK_KVA(fe), fe->fe_flags, FLOWENTF_BITS, SK_KVA(fo),
	    fe->fe_want_nonviable, fe->fe_want_withdraw);
#endif /* SK_LOG */
	struct nx_flowswitch *fsw = fo->fo_fsw;

	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	ASSERT(!(fe->fe_flags & FLOWENTF_DESTROYED));
	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
	ASSERT(fsw != NULL);

	if (os_atomic_cmpxchg(&fe->fe_want_nonviable, 1, 0, acq_rel)) {
		ASSERT(fsw->fsw_pending_nonviable != 0);
		os_atomic_dec(&fsw->fsw_pending_nonviable, relaxed);
		os_atomic_or(&fe->fe_flags, FLOWENTF_NONVIABLE, relaxed);
	}

	/* always withdraw namespace during tear down */
	if (!(fe->fe_flags & FLOWENTF_EXTRL_PORT) &&
	    !(fe->fe_flags & FLOWENTF_WITHDRAWN)) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_WITHDRAWN, relaxed);
		os_atomic_store(&fe->fe_want_withdraw, 0, release);
		/* local port is now inactive; not eligible for offload */
		flow_namespace_withdraw(&fe->fe_port_reservation);
	}

	/* we may get here multiple times, so check */
	if (!(fe->fe_flags & FLOWENTF_TORN_DOWN)) {
		os_atomic_or(&fe->fe_flags, FLOWENTF_TORN_DOWN, relaxed);
		if (fe->fe_adv_idx != FLOWADV_IDX_NONE) {
			if (fo->fo_nx_port_na != NULL) {
				na_flowadv_entry_free(fo->fo_nx_port_na,
				    fe->fe_uuid, fe->fe_adv_idx, fe->fe_flowid);
			}
			flow_owner_flowadv_index_free(fo, fe->fe_adv_idx);
			fe->fe_adv_idx = FLOWADV_IDX_NONE;
		}
	}
	ASSERT(fe->fe_adv_idx == FLOWADV_IDX_NONE);
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);

	/* mark child flows as nonviable and remove them */
	if (fe->fe_flags & FLOWENTF_PARENT) {
		flow_entry_remove_all_children(fe, fsw);
	}
}

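/*
 * Tear down a flow entry and remove it from the per-owner id_tree and,
 * for non-child flows, the flow table; the entry is then released or,
 * for flows marked to wait for close, handed to the linger list.
 */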
void
flow_entry_destroy(struct flow_owner *fo, struct flow_entry *fe, bool nolinger,
    void *close_params)
{
	struct flow_mgr *fm = fo->fo_fsw->fsw_flow_mgr;
	int err;

	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	/*
	 * regular flow: one in flow_table, one in id_tree, one here
	 * child flow: one in id_tree, one here
	 */
	ASSERT(flow_entry_refcnt(fe) > 2 ||
	    ((fe->fe_flags & FLOWENTF_CHILD) && flow_entry_refcnt(fe) > 1));

	flow_entry_teardown(fo, fe);

	err = flow_mgr_flow_hash_mask_del(fm, fe->fe_key.fk_mask);
	ASSERT(err == 0);

	/* only regular or parent flows have entries in flow_table */
	if (__probable(!(fe->fe_flags & FLOWENTF_CHILD))) {
		uint32_t hash;
		hash = flow_key_hash(&fe->fe_key);
		cuckoo_hashtable_del(fm->fm_flow_table, &fe->fe_cnode, hash);
	}

	RB_REMOVE(flow_entry_id_tree, &fo->fo_flow_entry_id_head, fe);
	struct flow_entry *tfe = fe;
	flow_entry_release(&tfe);

	ASSERT(!(fe->fe_flags & FLOWENTF_DESTROYED));
	os_atomic_or(&fe->fe_flags, FLOWENTF_DESTROYED, relaxed);

	if (fe->fe_transport_protocol == IPPROTO_QUIC) {
		if (!nolinger && close_params != NULL) {
			flow_track_abort_quic(fe, close_params);
		}
		flow_entry_release(&fe);
	} else if (nolinger || !(fe->fe_flags & FLOWENTF_WAIT_CLOSE)) {
		flow_entry_release(&fe);
	} else {
		fsw_linger_insert(fe);
	}
}

uint32_t
flow_entry_refcnt(struct flow_entry *fe)
{
	return os_ref_get_count(&fe->fe_refcnt);
}

void
flow_entry_retain(struct flow_entry *fe)
{
	os_ref_retain(&fe->fe_refcnt);
}

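/*
 * Drop a reference on a flow entry; the caller's pointer is cleared.  On
 * the last release, the route, queue set and demux state are freed and
 * the entry itself is returned via fe_free().
 */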
void
flow_entry_release(struct flow_entry **pfe)
{
	struct flow_entry *fe = *pfe;
	ASSERT(fe != NULL);
	*pfe = NULL; /* caller loses its reference */
#if SK_LOG
	if (__improbable(sk_verbose != 0)) {
		char dbgbuf[FLOWENTRY_DBGBUF_SIZE];
		SK_DF(SK_VERB_FLOW, "entry \"%s\" fe 0x%llx flags 0x%b",
		    fe_as_string(fe, dbgbuf, sizeof(dbgbuf)), SK_KVA(fe),
		    fe->fe_flags, FLOWENTF_BITS);
	}
#endif /* SK_LOG */

	if (__improbable(os_ref_release(&fe->fe_refcnt) == 0)) {
		fe->fe_nx_port = NEXUS_PORT_ANY;
		if (fe->fe_route != NULL) {
			flow_route_release(fe->fe_route);
			fe->fe_route = NULL;
		}
		if (fe->fe_qset != NULL) {
			nx_netif_qset_release(&fe->fe_qset);
			ASSERT(fe->fe_qset == NULL);
		}
		if (fe->fe_demux_patterns != NULL) {
			sk_free_type_array(struct kern_flow_demux_pattern,
			    fe->fe_demux_pattern_count, fe->fe_demux_patterns);
			fe->fe_demux_patterns = NULL;
			fe->fe_demux_pattern_count = 0;
		}
		if (fe->fe_demux_pkt_data != NULL) {
			sk_free_data(fe->fe_demux_pkt_data, FLOW_DEMUX_MAX_LEN);
			fe->fe_demux_pkt_data = NULL;
		}
		fe_free(fe);
	}
}

struct flow_entry_dead *
flow_entry_dead_alloc(zalloc_flags_t how)
{
	struct flow_entry_dead *fed;

	fed = zalloc_flags(sk_fed_zone, how | Z_ZERO);
	if (fed != NULL) {
		SK_DF(SK_VERB_MEM, "fed 0x%llx ALLOC", SK_KVA(fed));
	}
	return fed;
}

void
flow_entry_dead_free(struct flow_entry_dead *fed)
{
	SK_DF(SK_VERB_MEM, "fed 0x%llx FREE", SK_KVA(fed));
	zfree(sk_fed_zone, fed);
}

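/*
 * Initialize the externally visible sk_stats_flow record of a newly
 * allocated entry from the owning flowswitch and the flow's identity,
 * then sync the flag-derived fields via fe_stats_update().
 */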
static void
fe_stats_init(struct flow_entry *fe)
{
	struct nx_flowswitch *fsw = fe->fe_fsw;
	struct sk_stats_flow *sf = &fe->fe_stats->fs_stats;

	ASSERT(fe->fe_stats != NULL);
	ASSERT(os_ref_get_count(&fe->fe_stats->fs_refcnt) >= 1);

	bzero(sf, sizeof(*sf));
	uuid_copy(sf->sf_nx_uuid, fsw->fsw_nx->nx_uuid);
	uuid_copy(sf->sf_uuid, fe->fe_uuid);
	(void) strlcpy(sf->sf_if_name, fsw->fsw_flow_mgr->fm_name, IFNAMSIZ);
	sf->sf_if_index = fsw->fsw_ifp->if_index;
	sf->sf_pid = fe->fe_pid;
	sf->sf_epid = fe->fe_epid;
	(void) snprintf(sf->sf_proc_name, sizeof(sf->sf_proc_name), "%s",
	    fe->fe_proc_name);
	(void) snprintf(sf->sf_eproc_name, sizeof(sf->sf_eproc_name), "%s",
	    fe->fe_eproc_name);

	sf->sf_nx_port = fe->fe_nx_port;
	sf->sf_key = fe->fe_key;
	sf->sf_protocol = fe->fe_transport_protocol;
	sf->sf_svc_class = (packet_svc_class_t)fe->fe_svc_class;
	sf->sf_adv_idx = fe->fe_adv_idx;

	if (fe->fe_flags & FLOWENTF_TRACK) {
		sf->sf_flags |= SFLOWF_TRACK;
	}
	if (fe->fe_flags & FLOWENTF_LISTENER) {
		sf->sf_flags |= SFLOWF_LISTENER;
	}
	if (fe->fe_route != NULL && fe->fe_route->fr_flags & FLOWRTF_ONLINK) {
		sf->sf_flags |= SFLOWF_ONLINK;
	}

	fe_stats_update(fe);
}

static void
fe_stats_update(struct flow_entry *fe)
{
	struct sk_stats_flow *sf = &fe->fe_stats->fs_stats;

	ASSERT(fe->fe_stats != NULL);
	ASSERT(os_ref_get_count(&fe->fe_stats->fs_refcnt) >= 1);

	if (fe->fe_flags & FLOWENTF_CONNECTED) {
		sf->sf_flags |= SFLOWF_CONNECTED;
	}
	if (fe->fe_flags & FLOWENTF_QOS_MARKING) {
		sf->sf_flags |= SFLOWF_QOS_MARKING;
	}
	if (fe->fe_flags & FLOWENTF_WAIT_CLOSE) {
		sf->sf_flags |= SFLOWF_WAIT_CLOSE;
	}
	if (fe->fe_flags & FLOWENTF_CLOSE_NOTIFY) {
		sf->sf_flags |= SFLOWF_CLOSE_NOTIFY;
	}
	if (fe->fe_flags & FLOWENTF_ABORTED) {
		sf->sf_flags |= SFLOWF_ABORTED;
	}
	if (fe->fe_flags & FLOWENTF_NONVIABLE) {
		sf->sf_flags |= SFLOWF_NONVIABLE;
	}
	if (fe->fe_flags & FLOWENTF_WITHDRAWN) {
		sf->sf_flags |= SFLOWF_WITHDRAWN;
	}
	if (fe->fe_flags & FLOWENTF_TORN_DOWN) {
		sf->sf_flags |= SFLOWF_TORN_DOWN;
	}
	if (fe->fe_flags & FLOWENTF_DESTROYED) {
		sf->sf_flags |= SFLOWF_DESTROYED;
	}
	if (fe->fe_flags & FLOWENTF_LINGERING) {
		sf->sf_flags |= SFLOWF_LINGERING;
	}
	if (fe->fe_flags & FLOWENTF_LOW_LATENCY) {
		sf->sf_flags |= SFLOWF_LOW_LATENCY;
	}
	if (fe->fe_flags & FLOWENTF_PARENT) {
		sf->sf_flags |= SFLOWF_PARENT;
	}
	if (fe->fe_flags & FLOWENTF_CHILD) {
		sf->sf_flags |= SFLOWF_CHILD;
	}
	if (fe->fe_flags & FLOWENTF_NOWAKEFROMSLEEP) {
		sf->sf_flags |= SFLOWF_NOWAKEFROMSLEEP;
	} else {
		sf->sf_flags &= ~SFLOWF_NOWAKEFROMSLEEP;
	}

	sf->sf_bucket_idx = SFLOW_BUCKET_NONE;

	sf->sf_ltrack.sft_state = fe->fe_ltrack.fse_state;
	sf->sf_ltrack.sft_seq = fe->fe_ltrack.fse_seqlo;
	sf->sf_ltrack.sft_max_win = fe->fe_ltrack.fse_max_win;
	sf->sf_ltrack.sft_wscale = fe->fe_ltrack.fse_wscale;
	sf->sf_rtrack.sft_state = fe->fe_rtrack.fse_state;
	sf->sf_rtrack.sft_seq = fe->fe_rtrack.fse_seqlo;
	sf->sf_rtrack.sft_max_win = fe->fe_rtrack.fse_max_win;
}

void
flow_entry_stats_get(struct flow_entry *fe, struct sk_stats_flow *sf)
{
	_CASSERT(sizeof(fe->fe_stats->fs_stats) == sizeof(*sf));

	fe_stats_update(fe);
	bcopy(&fe->fe_stats->fs_stats, sf, sizeof(*sf));
}

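/*
 * Allocate a zeroed flow entry from sk_fe_cache together with its shared
 * stats object; returns NULL if either allocation fails.
 */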
struct flow_entry *
fe_alloc(boolean_t can_block)
{
	struct flow_entry *fe;

	_CASSERT((offsetof(struct flow_entry, fe_key) % 16) == 0);

	fe = skmem_cache_alloc(sk_fe_cache,
	    can_block ? SKMEM_SLEEP : SKMEM_NOSLEEP);
	if (fe == NULL) {
		return NULL;
	}

	/*
	 * fe_key is 16-byte aligned, which requires fe to begin on a
	 * 16-byte boundary as well.  This alignment is specified at
	 * sk_fe_cache creation time, and we assert that here.
	 */
	ASSERT(IS_P2ALIGNED(fe, 16));
	bzero(fe, sk_fe_size);

	fe->fe_stats = flow_stats_alloc(can_block);
	if (fe->fe_stats == NULL) {
		skmem_cache_free(sk_fe_cache, fe);
		return NULL;
	}

	SK_DF(SK_VERB_MEM, "fe 0x%llx ALLOC", SK_KVA(fe));

	os_ref_init(&fe->fe_refcnt, &flow_entry_refgrp);

	KPKTQ_INIT(&fe->fe_rx_pktq);
	KPKTQ_INIT(&fe->fe_tx_pktq);

	return fe;
}

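/*
 * Last-reference teardown of a flow entry: release the stats object and
 * any port, protocol, custom IPsec and flow ID reservations still held,
 * then return the entry to sk_fe_cache.
 */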
static void
fe_free(struct flow_entry *fe)
{
	ASSERT(fe->fe_flags & FLOWENTF_TORN_DOWN);
	ASSERT(fe->fe_flags & FLOWENTF_DESTROYED);
	ASSERT(!(fe->fe_flags & FLOWENTF_LINGERING));
	ASSERT(fe->fe_route == NULL);

	ASSERT(fe->fe_stats != NULL);
	flow_stats_release(fe->fe_stats);
	fe->fe_stats = NULL;

	/* only the very last release of the flow gives up its namespace reservation */
	if (!(fe->fe_flags & FLOWENTF_EXTRL_PORT) &&
	    NETNS_TOKEN_VALID(&fe->fe_port_reservation)) {
		flow_namespace_destroy(&fe->fe_port_reservation);
		ASSERT(!NETNS_TOKEN_VALID(&fe->fe_port_reservation));
	}
	fe->fe_port_reservation = NULL;

	if (!(fe->fe_flags & FLOWENTF_EXTRL_PROTO) &&
	    protons_token_is_valid(fe->fe_proto_reservation)) {
		protons_release(&fe->fe_proto_reservation);
	}
	fe->fe_proto_reservation = NULL;

	if (key_custom_ipsec_token_is_valid(fe->fe_ipsec_reservation)) {
		key_release_custom_ipsec(&fe->fe_ipsec_reservation);
	}
	fe->fe_ipsec_reservation = NULL;

	if (!(fe->fe_flags & FLOWENTF_EXTRL_FLOWID) && (fe->fe_flowid != 0)) {
		flowidns_release_flowid(fe->fe_flowid);
		fe->fe_flowid = 0;
	}

	skmem_cache_free(sk_fe_cache, fe);
}

static __inline__ int
fe_id_cmp(const struct flow_entry *a, const struct flow_entry *b)
{
	return uuid_compare(a->fe_uuid, b->fe_uuid);
}

#if SK_LOG
SK_NO_INLINE_ATTRIBUTE
char *
fk_as_string(const struct flow_key *fk, char *dst, size_t dsz)
{
	int af;
	char src_s[MAX_IPv6_STR_LEN];
	char dst_s[MAX_IPv6_STR_LEN];

	af = fk->fk_ipver == 4 ? AF_INET : AF_INET6;

	(void) inet_ntop(af, &fk->fk_src, src_s, sizeof(src_s));
	(void) inet_ntop(af, &fk->fk_dst, dst_s, sizeof(dst_s));
	(void) snprintf(dst, dsz,
	    "ipver=%u,src=%s,dst=%s,proto=0x%02u,sport=%u,dport=%u "
	    "mask=%08x,hash=%08x",
	    fk->fk_ipver, src_s, dst_s, fk->fk_proto, ntohs(fk->fk_sport),
	    ntohs(fk->fk_dport), fk->fk_mask, flow_key_hash(fk));

	return dst;
}

SK_NO_INLINE_ATTRIBUTE
char *
fe_as_string(const struct flow_entry *fe, char *dst, size_t dsz)
{
	char keybuf[FLOWKEY_DBGBUF_SIZE]; /* just for debug message */
	uuid_string_t uuidstr;

	fk_as_string(&fe->fe_key, keybuf, sizeof(keybuf));

	(void) snprintf(dst, dsz,
	    "fe 0x%llx proc %s nx_port %d flow_uuid %s %s tp_proto=0x%02u",
	    SK_KVA(fe), fe->fe_proc_name, (int)fe->fe_nx_port,
	    sk_uuid_unparse(fe->fe_uuid, uuidstr),
	    keybuf, fe->fe_transport_protocol);

	return dst;
}
#endif /* SK_LOG */
