/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/os_skywalk.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/ip6_var.h>
#include <netkey/key.h>
#include <netinet/udp.h>

#include <skywalk/nexus/flowswitch/flow/flow_var.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* CONFIG_MACF */

#include <net/net_api_stats.h>

#define SKMEM_TAG_FSW_FLOW_MGR "com.apple.skywalk.fsw.flow_mgr"
static SKMEM_TAG_DEFINE(skmem_tag_fsw_flow_mgr, SKMEM_TAG_FSW_FLOW_MGR);

static LCK_GRP_DECLARE(flow_mgr_lock_group, "sk_flow_mgr_lock");
static LCK_RW_DECLARE(flow_mgr_lock, &flow_mgr_lock_group);

static int fm_cmp(const struct flow_mgr *,
    const struct flow_mgr *);

RB_HEAD(flow_mgr_tree, flow_mgr);
RB_PROTOTYPE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);
RB_GENERATE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);

/* protected by the global lock flow_mgr_lock */
static struct flow_mgr_tree flow_mgr_head;

static int __flow_mgr_inited = 0;

void
flow_mgr_init(void)
{
	ASSERT(!__flow_mgr_inited);

	RB_INIT(&flow_mgr_head);
	__flow_mgr_inited = 1;
}

void
flow_mgr_fini(void)
{
	if (__flow_mgr_inited) {
		VERIFY(RB_EMPTY(&flow_mgr_head));

		__flow_mgr_inited = 0;
	}
}

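/*
 * Cuckoo hashtable compare callback: match a candidate flow entry against
 * the lookup key, using the precomputed mask that corresponds to the key's
 * fk_mask value.  Unknown masks fall back to a full key comparison.
 */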
static int
__fe_cuckoo_cmp(struct cuckoo_node *node, void *key0)
{
	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
	struct flow_key *key = key0;
	const struct flow_key *mask;

	/*
	 * This can probably be made more efficient by having "mask" be
	 * set by the original caller at the time the key is initialized,
	 * though that needs to be done carefully to ensure there is no
	 * mismatch between fk_mask value and "mask" itself.
	 */
	switch (key->fk_mask) {
	case FKMASK_5TUPLE:
		mask = &fk_mask_5tuple;
		break;
	case FKMASK_4TUPLE:
		mask = &fk_mask_4tuple;
		break;
	case FKMASK_3TUPLE:
		mask = &fk_mask_3tuple;
		break;
	case FKMASK_2TUPLE:
		mask = &fk_mask_2tuple;
		break;
	case FKMASK_IPFLOW3:
		mask = &fk_mask_ipflow3;
		break;
	case FKMASK_IPFLOW2:
		mask = &fk_mask_ipflow2;
		break;
	case FKMASK_IPFLOW1:
		mask = &fk_mask_ipflow1;
		break;
	default:
		return flow_key_cmp(&fe->fe_key, key);
	}

	return flow_key_cmp_mask(&fe->fe_key, key, mask);
}

static void
__fe_cuckoo_retain(struct cuckoo_node *node)
{
	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
	return flow_entry_retain(fe);
}

static void
__fe_cuckoo_release(struct cuckoo_node *node)
{
#pragma unused(node)
	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
	flow_entry_release(&fe);
}

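/*
 * Allocate and initialize a flow manager instance: its cuckoo flow table
 * (sized for fe_cnt entries) plus the flow owner, flow route and flow
 * route ID bucket arrays, then insert it into the global flow_mgr tree.
 * Returns NULL if any of the allocations fail.
 */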
struct flow_mgr *
flow_mgr_create(size_t fe_cnt, size_t fob_cnt,
    size_t frb_cnt, size_t frib_cnt)
{
	struct flow_mgr *fm = NULL;
	size_t fob_sz, frb_sz, frib_sz;
	size_t fob_tot_sz, frb_tot_sz, frib_tot_sz;
	uint32_t i;

	/* caller needs to ensure frb_cnt is a power of two */
	ASSERT(frb_cnt != 0 && ((frb_cnt & (frb_cnt - 1)) == 0));
	ASSERT(fob_cnt != 0);
	ASSERT(frib_cnt != 0);

	fm = sk_alloc_type(struct flow_mgr, Z_WAITOK | Z_NOFAIL, skmem_tag_fsw_flow_mgr);

	struct cuckoo_hashtable_params p = {
		.cht_capacity = fe_cnt,
		.cht_obj_cmp = __fe_cuckoo_cmp,
		.cht_obj_retain = __fe_cuckoo_retain,
		.cht_obj_release = __fe_cuckoo_release,
	};
	fm->fm_flow_table = cuckoo_hashtable_create(&p);
	if (fm->fm_flow_table == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}

	/*
	 * flow_owner_bucket cache-aligned objects.
	 */
	fm->fm_owner_buckets = flow_owner_buckets_alloc(fob_cnt, &fob_sz, &fob_tot_sz);
	if (fm->fm_owner_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = fob_cnt;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_sz = fob_sz;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = fob_tot_sz;

	/*
	 * flow_route_bucket cache-aligned objects.
	 */
	fm->fm_route_buckets = flow_route_buckets_alloc(frb_cnt, &frb_sz, &frb_tot_sz);
	if (fm->fm_route_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_buckets_cnt = frb_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_sz = frb_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = frb_tot_sz;

	/*
	 * flow_route_id_bucket cache-aligned objects.
	 */
	fm->fm_route_id_buckets =
	    flow_route_id_buckets_alloc(frib_cnt, &frib_sz, &frib_tot_sz);
	if (fm->fm_route_id_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = frib_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = frib_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = frib_tot_sz;

	/* construct flow_owner_buckets */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		flow_owner_bucket_init(fob);
		/* const override */
		*(size_t *)(uintptr_t)&fob->fob_idx = i;
	}

	/* construct flow_route_buckets */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		flow_route_bucket_init(frb);
		/* const override */
		*(size_t *)(uintptr_t)&frb->frb_idx = i;
	}

	/* construct flow_route_id_buckets */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		struct flow_route_id_bucket *frib =
		    flow_mgr_get_frib_at_idx(fm, i);
		flow_route_id_bucket_init(frib);
		/* const override */
		*(size_t *)(uintptr_t)&frib->frib_idx = i;
	}

	uuid_generate_random(fm->fm_uuid);

	lck_rw_lock_exclusive(&flow_mgr_lock);
	RB_INSERT(flow_mgr_tree, &flow_mgr_head, fm);
#if DEBUG
	struct flow_mgr find;
	uuid_copy(find.fm_uuid, fm->fm_uuid);
	/* make sure our tree compare routine is sane */
	ASSERT(RB_FIND(flow_mgr_tree,
	    &flow_mgr_head, &find) == fm);
#endif /* DEBUG */
	lck_rw_done(&flow_mgr_lock);

	fm->fm_flow_hash_masks[0] = FKMASK_5TUPLE;
	fm->fm_flow_hash_masks[1] = FKMASK_4TUPLE;
	fm->fm_flow_hash_masks[2] = FKMASK_3TUPLE;
	fm->fm_flow_hash_masks[3] = FKMASK_2TUPLE;
	fm->fm_flow_hash_masks[4] = FKMASK_IPFLOW3;
	fm->fm_flow_hash_masks[5] = FKMASK_IPFLOW2;
	fm->fm_flow_hash_masks[6] = FKMASK_IPFLOW1;

	memset(&fm->fm_flow_hash_count, 0, sizeof(fm->fm_flow_hash_count));

	return fm;
}

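/*
 * Tear down a flow manager: free the cuckoo flow table, destroy and free
 * all owner/route/route-ID buckets, remove the instance from the global
 * flow_mgr tree, and release its memory.  Also called on the partial-setup
 * error paths in flow_mgr_create().
 */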
void
flow_mgr_destroy(struct flow_mgr *fm)
{
	uint32_t i;

	lck_rw_lock_exclusive(&flow_mgr_lock);
	ASSERT(!uuid_is_null(fm->fm_uuid));

	if (fm->fm_flow_table != NULL) {
		cuckoo_hashtable_free(fm->fm_flow_table);
	}

	if (fm->fm_owner_buckets != NULL) {
		for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
			struct flow_owner_bucket *fob =
			    flow_mgr_get_fob_at_idx(fm, i);
			ASSERT(fob->fob_idx == i);
			flow_owner_bucket_destroy(fob);
		}
		flow_owner_buckets_free(fm->fm_owner_buckets,
		    fm->fm_owner_bucket_tot_sz);
		fm->fm_owner_buckets = NULL;
		*(uint32_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_owner_bucket_sz = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_owner_buckets_cnt == 0);
	ASSERT(fm->fm_owner_bucket_sz == 0);
	ASSERT(fm->fm_owner_bucket_tot_sz == 0);

	if (fm->fm_route_buckets != NULL) {
		for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
			struct flow_route_bucket *frb =
			    flow_mgr_get_frb_at_idx(fm, i);
			ASSERT(frb->frb_idx == i);
			flow_route_bucket_destroy(frb);
		}
		flow_route_buckets_free(fm->fm_route_buckets,
		    fm->fm_route_bucket_tot_sz);
		fm->fm_route_buckets = NULL;
		*(uint32_t *)(uintptr_t)&fm->fm_route_buckets_cnt = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_route_bucket_sz = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_route_buckets_cnt == 0);
	ASSERT(fm->fm_route_bucket_sz == 0);
	ASSERT(fm->fm_route_bucket_tot_sz == 0);

	if (fm->fm_route_id_buckets != NULL) {
		for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
			struct flow_route_id_bucket *frib =
			    flow_mgr_get_frib_at_idx(fm, i);
			ASSERT(frib->frib_idx == i);
			flow_route_id_bucket_destroy(frib);
		}
		flow_route_id_buckets_free(fm->fm_route_id_buckets,
		    fm->fm_route_id_bucket_tot_sz);
		fm->fm_route_id_buckets = NULL;
		*(uint32_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = 0;
		*(uint32_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_route_id_buckets_cnt == 0);
	ASSERT(fm->fm_route_id_bucket_sz == 0);
	ASSERT(fm->fm_route_id_bucket_tot_sz == 0);

	uuid_clear(fm->fm_uuid);
	RB_REMOVE(flow_mgr_tree, &flow_mgr_head, fm);
	lck_rw_done(&flow_mgr_lock);

	sk_free_type(struct flow_mgr, fm);
}

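/*
 * Purge all flow entries and flow routes from a flow manager.  All owner
 * bucket locks are taken (and the buckets marked FOBF_DEAD) before the
 * entries are purged, and all route/route-ID bucket locks are taken
 * before the routes are purged.
 */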
void
flow_mgr_terminate(struct flow_mgr *fm)
{
	uint32_t i;

	/*
	 * Purge all flow entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		fob->fob_busy_flags |= FOBF_DEAD;
	}
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging fob 0x%llx [%u]", SK_KVA(fob), i);
		flow_owner_bucket_purge_all(fob);
	}

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		FOB_UNLOCK(flow_mgr_get_fob_at_idx(fm, i));
	}

	/*
	 * Purge all flow routes.
	 */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		FRB_WLOCK(frb);
	}
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging frb 0x%llx [%u]", SK_KVA(frb), i);
		flow_route_bucket_purge_all(frb);
	}

	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WUNLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		FRB_WUNLOCK(flow_mgr_get_frb_at_idx(fm, i));
	}
}

/*
 * Upon success, returns the flow manager registered under the specified
 * UUID, with the global flow_mgr_lock held as reader; the caller is then
 * expected to release the lock.  A successful call must be matched with
 * a call to flow_mgr_unlock().
 */
struct flow_mgr *
flow_mgr_find_lock(uuid_t uuid)
{
	struct flow_mgr *fm, find;

	uuid_copy(find.fm_uuid, uuid);

	lck_rw_lock_shared(&flow_mgr_lock);

	fm = RB_FIND(flow_mgr_tree, &flow_mgr_head, &find);
	if (fm == NULL) {
		lck_rw_done(&flow_mgr_lock);
		return NULL;
	}

	/* caller is expected to call flow_mgr_unlock() when done */
	LCK_RW_ASSERT(&flow_mgr_lock, LCK_RW_ASSERT_SHARED);
	return fm;
}

/*
 * Must be matched with a successful call to flow_mgr_find_lock().
 */
void
flow_mgr_unlock(void)
{
	lck_rw_done(&flow_mgr_lock);
}

static inline int
fm_cmp(const struct flow_mgr *a, const struct flow_mgr *b)
{
	return uuid_compare(a->fm_uuid, b->fm_uuid);
}

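/*
 * Undo KAME-style scope embedding: if the address carries an embedded
 * scope, move it into sin6_scope_id and clear the embedded bytes.
 */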
static void
flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 *addr)
{
	struct in6_addr *in6;
	in6 = &addr->sin6_addr;
	if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
		addr->sin6_scope_id = ntohs(in6->s6_addr16[1]);
		in6->s6_addr16[1] = 0;
	}
}

#if CONFIG_MACF
static int
flow_req_check_mac_allowed(struct nx_flow_req *req)
{
	int socktype;
	switch (req->nfr_ip_protocol) {
	case IPPROTO_TCP:
		socktype = SOCK_STREAM;
		break;

	case IPPROTO_UDP:
		socktype = SOCK_DGRAM;
		break;

	default:
		/* custom IP protocol, treated as an IP datagram type */
		socktype = SOCK_DGRAM;
		return 0;
	}

	if (req->nfr_flags & NXFLOWREQF_LISTENER) {
		return mac_skywalk_flow_check_listen(req->nfr_proc, NULL,
		    SA(&req->nfr_saddr.sa), socktype, req->nfr_ip_protocol);
	} else {
		return mac_skywalk_flow_check_connect(req->nfr_proc, NULL,
		    SA(&req->nfr_daddr.sa), socktype, req->nfr_ip_protocol);
	}
}
#endif /* CONFIG_MACF */

static bool
flow_req_needs_netns_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
}

static bool
flow_req_needs_protons_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
	    proto != IPPROTO_ESP && proto != IPPROTO_AH;
}

static bool
flow_req_needs_ipsec_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto == IPPROTO_ESP || proto == IPPROTO_AH;
}

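/*
 * Populate an ns_flow_info from the flow request: local/foreign addresses,
 * IP protocol, flow UUID, plus owner and effective PIDs and their process
 * names, for use when creating the netns port reservation.
 */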
static void
flow_set_port_info(struct ns_flow_info *nfi, struct nx_flow_req *req)
{
	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;

	bzero(nfi, sizeof(struct ns_flow_info));

	nfi->nfi_ifp = req->nfr_ifp;

	nfi->nfi_laddr = *saddr;
	nfi->nfi_faddr = *daddr;

	nfi->nfi_protocol = req->nfr_ip_protocol;

	uuid_copy(nfi->nfi_flow_uuid, req->nfr_flow_uuid);
	ASSERT(!uuid_is_null(nfi->nfi_flow_uuid));

	nfi->nfi_owner_pid = req->nfr_pid;
	if (req->nfr_epid != -1) {
		nfi->nfi_effective_pid = req->nfr_epid;
		proc_name(req->nfr_epid, nfi->nfi_effective_name,
		    sizeof(nfi->nfi_effective_name));
	} else {
		nfi->nfi_effective_pid = -1;
	}

	proc_name(req->nfr_pid, nfi->nfi_owner_name,
	    sizeof(nfi->nfi_owner_name));
}

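/*
 * Acquire the namespace reservations a flow request needs: a netns port
 * reservation for TCP/UDP (or validation of an externally provided one),
 * a custom IPsec reservation for ESP/AH, and a protocol namespace
 * reservation for other IP protocols.  Reservations already held by the
 * caller are validated against the requesting PID instead of re-acquired.
 */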
static int
flow_req_prepare_namespace(struct nx_flow_req *req)
{
	SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
	int err = 0;

	if (flow_req_needs_netns_reservation(req)) {
		if (!NETNS_TOKEN_VALID(&req->nfr_port_reservation)) {
			union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
			struct ns_flow_info nfi;
			netns_token ns_token;
			flow_set_port_info(&nfi, req);
			err = flow_namespace_create(saddr,
			    req->nfr_ip_protocol, &ns_token,
			    req->nfr_flags, &nfi);
			if (err != 0) {
				SK_ERR("netns for %s.%u failed",
				    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)),
				    sk_sa_get_port(SA(saddr)));
				goto fail;
			}
			req->nfr_port_reservation = ns_token;
			req->nfr_flags &= ~NXFLOWREQF_EXT_PORT_RSV;
		} else {
			/* Validate PID associated with provided reservation */
			struct ns_flow_info nfi = {};
			err = netns_get_flow_info(&req->nfr_port_reservation,
			    &nfi);
			/* flow info could be NULL for socket flow */
			if (!err && (req->nfr_pid != nfi.nfi_owner_pid ||
			    (req->nfr_epid != -1 && nfi.nfi_effective_pid !=
			    req->nfr_epid))) {
				SK_ERR("netns flow info mismatch, "
				    "req_(e)pid %d(%d), nfr_(e)pid %d(%d)",
				    req->nfr_pid, req->nfr_epid,
				    nfi.nfi_owner_pid, nfi.nfi_effective_pid);
				err = EPERM;
				goto fail;
			}
			req->nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
		}
	}

	if (flow_req_needs_ipsec_reservation(req)) {
		union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
		union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
		void *ipsec_token = NULL;
		ASSERT(req->nfr_ipsec_reservation == NULL);
		err = key_reserve_custom_ipsec(&ipsec_token, saddr,
		    daddr, req->nfr_ip_protocol);
		if (err != 0) {
			SK_ERR("custom ipsec %u reserve %s failed",
			    req->nfr_ip_protocol,
			    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
			goto fail;
		}
		req->nfr_ipsec_reservation = ipsec_token;
	}

	if (flow_req_needs_protons_reservation(req)) {
		struct protons_token *ns_token = NULL;
		if (!protons_token_is_valid(req->nfr_proto_reservation)) {
			err = protons_reserve(&ns_token, req->nfr_pid,
			    req->nfr_epid, req->nfr_ip_protocol);
			if (err != 0) {
				SK_ERR("protocol %u namespace failed",
				    req->nfr_ip_protocol);
				goto fail;
			}
			req->nfr_flags &= ~NXFLOWREQF_EXT_PROTO_RSV;
			req->nfr_proto_reservation = ns_token;
		} else {
			/* Validate PID associated with provided reservation */
			if (!protons_token_has_matching_pid(req->nfr_proto_reservation,
			    req->nfr_pid, req->nfr_epid)) {
				SK_ERR("protons token pid mismatch");
				err = EPERM;
				goto fail;
			}
			req->nfr_flags |= NXFLOWREQF_EXT_PROTO_RSV;
		}
	}

	return 0;

fail:
	VERIFY(err != 0);
	SK_ERR("preparation failed (err %d)", err);
	return err;
}

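/*
 * Validate and normalize a flow request before the flow entry is created:
 * sanity-check the address families, lengths and ports, reject multicast,
 * handle IPv6 scope IDs, mark listener flows, validate any demux patterns,
 * run the MAC policy check, resolve the flow route (picking up the default
 * source address if none was given), and take the namespace reservations.
 */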
static int
flow_req_prepare(struct nx_flow_req *req, struct kern_nexus *nx,
    struct flow_mgr *fm, struct ifnet *ifp, flow_route_ctor_fn_t fr_ctor,
    flow_route_resolve_fn_t fr_resolve, void *fr_arg)
{
	int err = 0;
	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;

	sa_family_t saf, daf, xaf, af;

	saf = SA(saddr)->sa_family;
	daf = SA(daddr)->sa_family;
	xaf = saf ^ daf;
	if (xaf != 0 && xaf != saf && xaf != daf) {
		SK_ERR("invalid saddr af %d daddr af %d", saf, daf);
		return EINVAL;
	}
	af = (xaf == 0) ? saf : xaf;

	bool has_saddr = false, has_daddr = false;
	bool has_sport = false, has_dport = false;
	uint16_t sport, dport;
	uint8_t sa_len;
	switch (af) {
	case AF_INET:
		sa_len = sizeof(struct sockaddr_in);
		has_saddr = (SIN(saddr)->sin_addr.s_addr != INADDR_ANY);
		has_daddr = (SIN(daddr)->sin_addr.s_addr != INADDR_ANY);
		sport = SIN(saddr)->sin_port;
		dport = SIN(daddr)->sin_port;
		has_sport = (sport != 0);
		has_dport = (dport != 0);

		if ((has_saddr && SIN(saddr)->sin_len != sa_len) ||
		    (has_daddr && SIN(daddr)->sin_len != sa_len)) {
			SK_ERR("sin_len invalid");
			err = EINVAL;
			goto fail;
		}
		if ((has_saddr && IN_MULTICAST(ntohl(SIN(saddr)->sin_addr.s_addr))) ||
		    (has_daddr && IN_MULTICAST(ntohl(SIN(daddr)->sin_addr.s_addr)))) {
			SK_ERR("multicast flow not yet supported");
			err = EADDRNOTAVAIL;
			goto fail;
		}
		if (__probable(protocol == IPPROTO_TCP)) {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet_dgram_total);
		}
		break;

	case AF_INET6:
		sa_len = sizeof(struct sockaddr_in6);
		has_saddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(saddr)->sin6_addr);
		has_daddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(daddr)->sin6_addr);
		sport = SIN6(saddr)->sin6_port;
		dport = SIN6(daddr)->sin6_port;
		has_sport = (sport != 0);
		has_dport = (dport != 0);
		if ((has_saddr && SIN6(saddr)->sin6_len != sa_len) ||
		    (has_daddr && SIN6(daddr)->sin6_len != sa_len)) {
			SK_ERR("sin6_len invalid");
			err = EINVAL;
			goto fail;
		}
		/* clear embedded scope if link-local src */
		if (has_saddr) {
			flow_mgr_clear_embedded_scope_id(SIN6(saddr));
			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(saddr)->sin6_addr)) {
				SIN6(saddr)->sin6_scope_id = ifp->if_index;
			}
		}
		if (has_daddr) {
			flow_mgr_clear_embedded_scope_id(SIN6(daddr));
			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(daddr)->sin6_addr)) {
				SIN6(daddr)->sin6_scope_id = ifp->if_index;
			}
		}
		if ((has_saddr && IN6_IS_ADDR_MULTICAST(&SIN6(saddr)->sin6_addr)) ||
		    (has_daddr && IN6_IS_ADDR_MULTICAST(&SIN6(daddr)->sin6_addr))) {
			SK_ERR("multicast flow not yet supported");
			err = EADDRNOTAVAIL;
			goto fail;
		}
		if (__probable(protocol == IPPROTO_TCP)) {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet6_dgram_total);
		}
		break;

	default:
		SK_ERR("unknown address families saf %d daf %d", saf, daf);
		err = EINVAL;
		goto fail;
	}

	SA(saddr)->sa_family = SA(daddr)->sa_family = af;
	SA(saddr)->sa_len = SA(daddr)->sa_len = sa_len;

	if (__improbable(has_saddr && !flow_route_laddr_validate(saddr, ifp,
	    &req->nfr_saddr_gencnt))) {
		SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
		SK_ERR("src address %s is not valid",
		    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
		err = EADDRNOTAVAIL;
		goto fail;
	}

	bool is_tcp_udp = (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
	if (!is_tcp_udp) {
		if (has_sport || has_dport) {
			SK_ERR("non-zero port for IP flow");
			return EINVAL;
		}
	} else {
		/* dst:dport as connected, 0:0 as listener, but not partial */
		if (has_daddr != has_dport) {
			err = EINVAL;
			SK_ERR("invalid dst/dport for TCP/UDP (err %d)", err);
			goto fail;
		}
	}

	if (!has_daddr && !has_dport) {
		req->nfr_flags |= NXFLOWREQF_LISTENER;
	}

	if (req->nfr_transport_protocol == 0) {
		req->nfr_transport_protocol = req->nfr_ip_protocol;
	}

	bool is_child_flow = !uuid_is_null(req->nfr_parent_flow_uuid);
	if ((is_child_flow && req->nfr_flow_demux_count == 0) ||
	    (!is_child_flow && req->nfr_flow_demux_count > 0)) {
		err = EINVAL;
		SK_ERR("invalid flow demux count");
		goto fail;
	}

	if (req->nfr_flow_demux_count > 0) {
		if (req->nfr_ip_protocol != IPPROTO_UDP) {
			err = EINVAL;
			SK_ERR("invalid ip protocol(%u) for flow demux",
			    req->nfr_ip_protocol);
			goto fail;
		}

		for (int i = 0; i < req->nfr_flow_demux_count; i++) {
			if (req->nfr_flow_demux_patterns[i].fdp_len > FLOW_DEMUX_MAX_LEN ||
			    req->nfr_flow_demux_patterns[i].fdp_len == 0) {
				err = EINVAL;
				SK_ERR("invalid flow demux pattern len %u",
				    req->nfr_flow_demux_patterns[i].fdp_len);
				goto fail;
			}
			if (req->nfr_flow_demux_patterns[i].fdp_offset +
			    req->nfr_flow_demux_patterns[i].fdp_len > MAX_PKT_DEMUX_LIMIT) {
				err = EINVAL;
				SK_ERR("invalid demux offset plus length(%u > %d)",
				    req->nfr_flow_demux_patterns[i].fdp_offset +
				    req->nfr_flow_demux_patterns[i].fdp_len, MAX_PKT_DEMUX_LIMIT);
				goto fail;
			}
		}
	}

	req->nfr_ifp = ifp;

#if CONFIG_MACF
	err = flow_req_check_mac_allowed(req);
	if (err != 0) {
		SK_ERR("flow req failed MAC check");
		goto fail;
	}
#endif /* CONFIG_MACF */

	/* setup flow route and prepare saddr if needed */
	if (__probable(has_daddr || has_dport)) {
		struct flow_route *fr = NULL;
		err = flow_route_find(nx, fm, ifp, req, fr_ctor,
		    fr_resolve, fr_arg, &fr);
		if (__improbable(err != 0)) {
			SK_ERR("flow route lookup failed");
			ASSERT(fr == NULL);
			goto fail;
		}
		ASSERT(fr != NULL);
		/* Pick up the default source address from flow route. */
		if (!has_saddr) {
			*saddr = fr->fr_laddr;
			SIN(saddr)->sin_port = sport;
		}
		req->nfr_route = fr;
		fr = NULL;
	}

	/* child flows do not hold namespace references */
	if (__probable(uuid_is_null(req->nfr_parent_flow_uuid))) {
		err = flow_req_prepare_namespace(req);
		if (err != 0) {
			goto fail;
		}
	}

	return 0;

fail:
	VERIFY(err != 0);
	if (req->nfr_route != NULL) {
		flow_route_release(req->nfr_route);
		req->nfr_route = NULL;
	}
	SK_ERR("preparation failed (err %d)", err);
	return err;
}

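/*
 * Release any reservations acquired by flow_req_prepare_namespace(),
 * skipping port and protocol reservations that were provided externally
 * by the caller.
 */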
static void
flow_req_cleanup(struct nx_flow_req *req)
{
	if (NETNS_TOKEN_VALID(&req->nfr_port_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV)) {
		netns_release(&req->nfr_port_reservation);
	}

	if (protons_token_is_valid(req->nfr_proto_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PROTO_RSV)) {
		protons_release(&req->nfr_proto_reservation);
	}

	if (key_custom_ipsec_token_is_valid(req->nfr_ipsec_reservation)) {
		key_release_custom_ipsec(&req->nfr_ipsec_reservation);
	}
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
flow_req_dump(char *desc, struct nx_flow_req *req)
{
	if (!(sk_verbose & SK_VERB_FLOW)) {
		return;
	}

	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;
	char src_s[MAX_IPv6_STR_LEN];
	char dst_s[MAX_IPv6_STR_LEN];
	uint8_t sipver = 0, dipver = 0;
	uint16_t sport = 0, dport = 0;
	uuid_string_t uuid_s;

	// unsanitized req, treat source and destination AF separately
	if (saddr->sa.sa_family == AF_INET) {
		sipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(saddr)->sin_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin.sin_port);
	} else if (saddr->sa.sa_family == AF_INET6) {
		sipver = IPV6_VERSION;
		(void) inet_ntop(AF_INET6, &SIN6(saddr)->sin6_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin6.sin6_port);
	} else {
		sipver = 0;
		strlcpy(src_s, "INV", sizeof(src_s));
	}
	if (daddr->sa.sa_family == AF_INET) {
		dipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(daddr)->sin_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin.sin_port);
	} else if (daddr->sa.sa_family == AF_INET6) {
		dipver = IPV6_VERSION;
		(void) inet_ntop(AF_INET6, &SIN6(daddr)->sin6_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin6.sin6_port);
	} else {
		dipver = 0;
		strlcpy(dst_s, "INV", sizeof(dst_s));
	}

	SK_DF(SK_VERB_FLOW,
	    "%s %s sipver=%u,dipver=%u,src=%s,dst=%s,proto=%d,sport=%u,dport=%u"
	    " nx_port=%u,flags 0x%b", desc, sk_uuid_unparse(req->nfr_flow_uuid,
	    uuid_s), sipver, dipver, src_s, dst_s, protocol, sport, dport,
	    req->nfr_nx_port, req->nfr_flags, NXFLOWREQF_BITS);
}
#else
#define flow_req_dump(str, req) do { ((void)0); } while (0)
#endif /* SK_LOG */

/*
 * Add a flow entry for the given request on behalf of the flow owner;
 * called with the owner's flow owner bucket lock held.  Returns 0 on
 * success.
 */
int
flow_mgr_flow_add(struct kern_nexus *nx, struct flow_mgr *fm,
    struct flow_owner *fo, struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *fr_arg)
{
	struct flow_entry *fe;
	int err = 0;

	ASSERT(ifp != NULL);
	ASSERT(fr_ctor != NULL && fr_resolve != NULL);
	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	flow_req_dump("req", req);

	if (!(req->nfr_flags & NXFLOWREQF_ASIS)) {
		err = flow_req_prepare(req, nx, fm, ifp, fr_ctor, fr_resolve, fr_arg);
		if (err != 0) {
			SK_ERR("flow req preparation failure (err %d)", err);
			return err;
		}
	}

	/*
	 * Add entry in flowswitch table; upon success, flow entry adds a
	 * retain count on the flow route (we'll always need to release the
	 * refcnt from flow_route_find), and the local address:port of the
	 * flow entry will be set.
	 */
	fe = flow_entry_alloc(fo, req, &err);
	if (__improbable(fe == NULL)) {
		ASSERT(err != 0);
		goto fail;
	}

	VERIFY(NETNS_TOKEN_VALID(&fe->fe_port_reservation) ||
	    !(fe->fe_key.fk_mask & FKMASK_SPORT) ||
	    req->nfr_flags & NXFLOWREQF_ASIS ||
	    (fe->fe_flags & FLOWENTF_CHILD));
	VERIFY((req->nfr_flags & NXFLOWREQF_FLOWADV) ^
	    (req->nfr_flowadv_idx == FLOWADV_IDX_NONE));
	req->nfr_flowadv_idx = fe->fe_adv_idx;

	flow_req_dump("added ", req);

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	if (req->nfr_saddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_saddr.sin6.sin6_addr)) {
		req->nfr_saddr.sin6.sin6_scope_id = ifnet_index(fsw->fsw_ifp);
	}
	if (req->nfr_daddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_daddr.sin6.sin6_addr)) {
		req->nfr_daddr.sin6.sin6_scope_id = ifnet_index(fsw->fsw_ifp);
	}

	return 0;

fail:
	VERIFY(err != 0);
	flow_req_cleanup(req);

	return err;
}

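/*
 * Map a PID to its flow owner bucket by simple modulo over the owner
 * bucket count.
 */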
struct flow_owner_bucket *
flow_mgr_get_fob_by_pid(struct flow_mgr *fm, pid_t pid)
{
	return flow_mgr_get_fob_at_idx(fm,
	    (pid % fm->fm_owner_buckets_cnt));
}

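/*
 * Walk every flow owner bucket looking for a flow entry with the given
 * UUID; returns the matching flow entry, or NULL if none is found.
 */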
struct flow_entry *
flow_mgr_get_fe_by_uuid_rlock(struct flow_mgr *fm, uuid_t uuid)
{
	uint32_t i;
	struct flow_owner_bucket *fob;
	struct flow_owner *fo;
	struct flow_entry *fe;

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		fob = flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK_SPIN(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			fe = flow_entry_find_by_uuid(fo, uuid);
			if (fe != NULL) {
				FOB_LOCK_CONVERT(fob);
				FOB_UNLOCK(fob);
				return fe;
			}
		}
		FOB_UNLOCK(fob);
	}
	return NULL;
}

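/*
 * Hash the destination address (IPv4 or IPv6) into a flow route bucket
 * using a Jenkins-style mix seeded with flow_seed; the result is masked
 * by the (power-of-two) route bucket count.
 */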
struct flow_route_bucket *
flow_mgr_get_frb_by_addr(struct flow_mgr *fm,
    union sockaddr_in_4_6 *daddr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = flow_seed;

	switch (SA(daddr)->sa_family) {
	case AF_INET: {
		uint8_t *p = (uint8_t *)&SIN(daddr)->sin_addr.s_addr;
		b += ((uint32_t)p[3]);
		a += ((uint32_t)p[2]) << 24;
		a += ((uint32_t)p[1]) << 16;
		a += ((uint32_t)p[0]) << 8;
		break;
	}

	case AF_INET6: {
		b += SIN6(daddr)->sin6_addr.s6_addr32[3];
		a += SIN6(daddr)->sin6_addr.s6_addr32[2];
		a += SIN6(daddr)->sin6_addr.s6_addr32[1];
		a += SIN6(daddr)->sin6_addr.s6_addr32[0];
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

	/* mix */
	a -= b; a -= c; a ^= (c >> 13);
	b -= c; b -= a; b ^= (a << 8);
	c -= a; c -= b; c ^= (b >> 13);
	a -= b; a -= c; a ^= (c >> 12);
	b -= c; b -= a; b ^= (a << 16);
	c -= a; c -= b; c ^= (b >> 5);
	a -= b; a -= c; a ^= (c >> 3);
	b -= c; b -= a; b ^= (a << 10);
	c -= a; c -= b; c ^= (b >> 15);

	c &= (fm->fm_route_buckets_cnt - 1);

	return flow_mgr_get_frb_at_idx(fm, c);
}

struct flow_route_id_bucket *
flow_mgr_get_frib_by_uuid(struct flow_mgr *fm, uuid_t fr_uuid)
{
	union {
		uuid_t uuid __sk_aligned(8);
		uint64_t u64[2];
	} u;
	uint64_t key;

	_CASSERT(sizeof(u.uuid) == sizeof(u.u64));
	uuid_copy(u.uuid, fr_uuid);

	/* XOR fold UUID down to 4-bytes */
	key = (u.u64[0] ^ u.u64[1]);
	key = ((key >> 32) ^ (key & 0xffffffff));

	/* add some offset to get more entropy */
	return flow_mgr_get_frib_at_idx(fm,
	    ((uint32_t)key % fm->fm_route_id_buckets_cnt));
}

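/*
 * Adjust the per-mask flow count by v (+1 on add, -1 on delete) for the
 * given hash mask; lookups skip masks whose count is zero.
 */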
static int
flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask, int32_t v)
{
	for (uint32_t i = 0; i < FKMASK_IDX_MAX; i++) {
		if (fm->fm_flow_hash_masks[i] == mask) {
			os_atomic_add(&fm->fm_flow_hash_count[i], v, relaxed);
			return 0;
		}
	}
	SK_ERR("unknown hash mask 0x%x", mask);
	return ENOTSUP;
}

int
flow_mgr_flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, 1);
}

int
flow_mgr_flow_hash_mask_del(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, -1);
}

#if SK_LOG
SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_prelog(struct flow_key *key)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "key %s",
	    fk_as_string(key, dbgbuf, sizeof(dbgbuf)));
}

SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_epilog(struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	if (fe != NULL) {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe 0x%llx \"%s\"",
		    SK_KVA(fe), fe_as_string(fe, dbgbuf, sizeof(dbgbuf)));
	} else {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe not found");
	}
}
#else
#define __flow_mgr_find_fe_by_key_prelog(key) do { ((void)0); } while (0)
#define __flow_mgr_find_fe_by_key_epilog(fe) do { ((void)0); } while (0)
#endif /* SK_LOG */

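/*
 * Look up a flow entry matching the given key.  The key is probed against
 * each active hash mask in order (5-tuple first, down to 1-tuple), skipping
 * masks with no flows; the first match wins, except that a v4-only listener
 * entry is not allowed to match a v6 lookup key.  key->fk_mask is restored
 * before returning.
 */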
struct flow_entry *
flow_mgr_find_fe_by_key(struct flow_mgr *fm, struct flow_key *key)
{
	struct cuckoo_node *node = NULL;
	struct flow_entry *fe = NULL;
	uint32_t hash = 0;
	uint16_t saved_mask = key->fk_mask;

	__flow_mgr_find_fe_by_key_prelog(key);

	for (int i = 0; i < FKMASK_IDX_MAX; i++) {
		size_t count = fm->fm_flow_hash_count[i];
		uint16_t mask = fm->fm_flow_hash_masks[i];
		if (count == 0 || mask == 0) {
			SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
			    "[%d] mask=%08x count=%zu skipped",
			    i, mask, count);
			continue;
		}
		key->fk_mask = mask;
		hash = flow_key_hash(key);
		node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
		    "[%d] mask=%08x hash %08x node 0x%llx", i, mask, hash,
		    SK_KVA(node));
		if (node != NULL) {
			fe = container_of(node, struct flow_entry, fe_cnode);
			/* v4 only listener fe shouldn't get v6 connection */
			if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
			    fe->fe_key.fk_ipver == IPVERSION &&
			    key->fk_ipver == IPV6_VERSION)) {
				flow_entry_release(&fe);
				ASSERT(fe == NULL);
				SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
				    "\tskip v4 only fe");
				continue;
			}
			break;
		}
	}

	key->fk_mask = saved_mask;

	__flow_mgr_find_fe_by_key_epilog(fe);

	return fe;
}

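/*
 * Exact-match lookup for a flow entry with the same key/mask, used to
 * detect conflicting flows; listener flow conflicts are instead caught by
 * the netns port reservation.
 */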
struct flow_entry *
flow_mgr_find_conflicting_fe(struct flow_mgr *fm, struct flow_key *key)
{
	struct cuckoo_node *node = NULL;
	struct flow_entry *fe = NULL;
	uint32_t hash = 0;

	hash = flow_key_hash(key);
	node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
	if (node != NULL) {
		fe = container_of(node, struct flow_entry, fe_cnode);
		return fe;
	}

	/* listener flow conflicts are checked at netns reservation time */
	return fe;
}

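/*
 * Invoke the given handler block on every flow entry in the flow table;
 * for parent flows, also invoke it on each child flow while holding the
 * parent's child list lock as reader.
 */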
void
flow_mgr_foreach_flow(struct flow_mgr *fm,
    void (^flow_handler)(struct flow_entry *fe))
{
	cuckoo_hashtable_foreach(fm->fm_flow_table,
	    ^(struct cuckoo_node *node, uint32_t hv) {
	#pragma unused(hv)
		struct flow_entry *fe;
		fe = container_of(node, struct flow_entry, fe_cnode);
		flow_handler(fe);

		if (fe->fe_flags & FLOWENTF_PARENT) {
			struct flow_entry *child_fe;
			lck_rw_lock_shared(&fe->fe_child_list_lock);
			TAILQ_FOREACH(child_fe, &fe->fe_child_list, fe_child_link) {
				flow_handler(child_fe);
			}
			lck_rw_unlock_shared(&fe->fe_child_list_lock);
		}
	}
	);
}

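/*
 * Check whether a received UDP packet matches one of the demux patterns
 * of a child flow.  Pattern bytes are normally compared in place in the
 * packet buflet; if the pattern extends past the contiguous payload, the
 * bytes are copied out of the attached mbuf first.  Returns true on the
 * first pattern that matches under its byte mask.
 */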
bool
rx_flow_demux_match(struct nx_flowswitch *fsw, struct flow_entry *fe, struct __kern_packet *pkt)
{
	struct udphdr *uh;
	uint8_t *pkt_buf;
	uint32_t bdlen, bdlim, bdoff, pkt_payload_len;
	uint8_t *demux_data;

	ASSERT(fe->fe_flags & FLOWENTF_CHILD);
	ASSERT(fe->fe_demux_pattern_count > 0);

	if (fe->fe_flags & (FLOWENTF_TORN_DOWN | FLOWENTF_NONVIABLE)) {
		return false;
	}

	/*
	 * Demux only supported for UDP packets with payload
	 */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		return false;
	}

	uh = (struct udphdr *)pkt->pkt_flow_udp_hdr;
	if (__improbable(uh == NULL || pkt->pkt_flow_ulen == 0)) {
		return false;
	}

	int udp_payload_offset = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen + sizeof(*uh);

	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	pkt_payload_len = bdlim - bdoff;
	pkt_payload_len = MIN(pkt_payload_len, pkt->pkt_length);
	pkt_payload_len -= udp_payload_offset;

	for (int index = 0; index < fe->fe_demux_pattern_count; index++) {
		struct flow_demux_pattern *demux_pattern = &fe->fe_demux_patterns[index].fdp_demux_pattern;
		ASSERT(demux_pattern->fdp_len > 0);

		if (pkt->pkt_flow_ulen >= demux_pattern->fdp_offset + demux_pattern->fdp_len) {
			if (__probable(pkt_payload_len >= demux_pattern->fdp_offset + demux_pattern->fdp_len)) {
				demux_data = (uint8_t *)(uh + 1) + demux_pattern->fdp_offset;
			} else {
				if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
					m_copydata(pkt->pkt_mbuf, udp_payload_offset + demux_pattern->fdp_offset,
					    demux_pattern->fdp_len, fe->fe_demux_pkt_data);
					demux_data = fe->fe_demux_pkt_data;
				} else {
					FSW_STATS_INC(FSW_STATS_RX_DEMUX_SHORT_ERR);
					return false;
				}
			}

			int result = -1;
			if (fe->fe_demux_patterns[index].fdp_memcmp_mask != NULL) {
				result = fe->fe_demux_patterns[index].fdp_memcmp_mask(demux_data,
				    demux_pattern->fdp_value, demux_pattern->fdp_mask);
			} else {
				result = sk_memcmp_mask(demux_data, demux_pattern->fdp_value,
				    demux_pattern->fdp_mask, demux_pattern->fdp_len);
			}

			if (result == 0) {
				return true;
			}
		}
	}

	return false;
}

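/*
 * On the receive path, find the child flow of a parent (demuxed) flow
 * whose pattern matches the packet; returns the child with a reference
 * held, or NULL if no child matches.
 */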
struct flow_entry *
rx_lookup_child_flow(struct nx_flowswitch *fsw, struct flow_entry *parent_fe,
    struct __kern_packet *pkt)
{
	struct flow_entry *child_fe;

	/*
	 * Demux only supported for UDP packets with payload
	 */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		return NULL;
	}

	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);

	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
		if (rx_flow_demux_match(fsw, child_fe, pkt)) {
			flow_entry_retain(child_fe);
			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
			return child_fe;
		}
	}

	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
	return NULL;
}

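/*
 * On the transmit path, find the child flow of a parent flow by flow UUID;
 * returns the child with a reference held, or NULL if no child matches.
 */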
struct flow_entry *
tx_lookup_child_flow(struct flow_entry *parent_fe, uuid_t flow_id)
{
	struct flow_entry *child_fe;

	ASSERT(parent_fe->fe_flags & FLOWENTF_PARENT);

	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);
	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
		if (_UUID_MATCH(flow_id, child_fe->fe_uuid)) {
			flow_entry_retain(child_fe);
			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
			return child_fe;
		}
	}

	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
	return NULL;
}