/*
 * Copyright (c) 2015-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <skywalk/os_skywalk_private.h>
#include <skywalk/os_skywalk.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet6/ip6_var.h>
#include <netkey/key.h>
#include <netinet/udp.h>

#include <skywalk/nexus/flowswitch/flow/flow_var.h>

#if CONFIG_MACF
#include <security/mac_framework.h>
#endif /* CONFIG_MACF */

#include <net/net_api_stats.h>

#define SKMEM_TAG_FSW_FLOW_MGR "com.apple.skywalk.fsw.flow_mgr"
static SKMEM_TAG_DEFINE(skmem_tag_fsw_flow_mgr, SKMEM_TAG_FSW_FLOW_MGR);

static LCK_GRP_DECLARE(flow_mgr_lock_group, "sk_flow_mgr_lock");
static LCK_RW_DECLARE(flow_mgr_lock, &flow_mgr_lock_group);

static int fm_cmp(const struct flow_mgr *,
    const struct flow_mgr *);

RB_HEAD(flow_mgr_tree, flow_mgr);
RB_PROTOTYPE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);
RB_GENERATE_PREV(flow_mgr_tree, flow_mgr, fm_link, fm_cmp);

/* protected by the global lock flow_mgr_lock */
static struct flow_mgr_tree flow_mgr_head;

static int __flow_mgr_inited = 0;

void
flow_mgr_init(void)
{
	ASSERT(!__flow_mgr_inited);

	RB_INIT(&flow_mgr_head);
	__flow_mgr_inited = 1;
}

void
flow_mgr_fini(void)
{
	if (__flow_mgr_inited) {
		VERIFY(RB_EMPTY(&flow_mgr_head));

		__flow_mgr_inited = 0;
	}
}

static int
__fe_cuckoo_cmp(struct cuckoo_node *node, void *key0)
{
	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
	struct flow_key *key = key0;
	const struct flow_key *mask;

	/*
	 * This can probably be made more efficient by having "mask" be
	 * set by the original caller at the time the key is initialized,
	 * though that needs to be done carefully to ensure there is no
	 * mismatch between fk_mask value and "mask" itself.
	 */
	switch (key->fk_mask) {
	case FKMASK_5TUPLE:
		mask = &fk_mask_5tuple;
		break;
	case FKMASK_4TUPLE:
		mask = &fk_mask_4tuple;
		break;
	case FKMASK_3TUPLE:
		mask = &fk_mask_3tuple;
		break;
	case FKMASK_2TUPLE:
		mask = &fk_mask_2tuple;
		break;
	case FKMASK_IPFLOW3:
		mask = &fk_mask_ipflow3;
		break;
	case FKMASK_IPFLOW2:
		mask = &fk_mask_ipflow2;
		break;
	case FKMASK_IPFLOW1:
		mask = &fk_mask_ipflow1;
		break;
	default:
		return flow_key_cmp(&fe->fe_key, key);
	}

	return flow_key_cmp_mask(&fe->fe_key, key, mask);
}
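
/*
 * Example (informal): a lookup key carrying fk_mask == FKMASK_5TUPLE is
 * compared against fe_key under the fk_mask_5tuple template, so only the
 * 5-tuple fields (IP version/protocol, source and destination address
 * and port) participate in the comparison; all other key fields are
 * ignored.  The same idea applies to the narrower masks above.
 */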

static void
__fe_cuckoo_retain(struct cuckoo_node *node)
{
	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
	flow_entry_retain(fe);
}

static void
__fe_cuckoo_release(struct cuckoo_node *node)
{
	struct flow_entry *fe = container_of(node, struct flow_entry, fe_cnode);
	flow_entry_release(&fe);
}

struct flow_mgr *
flow_mgr_create(size_t fe_cnt, size_t fob_cnt,
    size_t frb_cnt, size_t frib_cnt)
{
	struct flow_mgr *fm = NULL;
	size_t fob_sz, frb_sz, frib_sz;
	size_t fob_tot_sz, frb_tot_sz, frib_tot_sz;
	uint32_t i;

	/* caller needs to ensure frb_cnt is a power of two */
	ASSERT(frb_cnt != 0 && ((frb_cnt & (frb_cnt - 1)) == 0));
	ASSERT(fob_cnt != 0);
	ASSERT(frib_cnt != 0);
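	/*
	 * The power-of-two requirement exists because the route bucket
	 * index is derived by masking a hash with (fm_route_buckets_cnt - 1)
	 * in flow_mgr_get_frb_by_addr() below; the owner and route-id
	 * buckets are indexed by modulo and only need a nonzero count.
	 */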

	fm = sk_alloc_type(struct flow_mgr, Z_WAITOK | Z_NOFAIL, skmem_tag_fsw_flow_mgr);

	struct cuckoo_hashtable_params p = {
		.cht_capacity = fe_cnt,
		.cht_obj_cmp = __fe_cuckoo_cmp,
		.cht_obj_retain = __fe_cuckoo_retain,
		.cht_obj_release = __fe_cuckoo_release,
	};
	fm->fm_flow_table = cuckoo_hashtable_create(&p);
	if (fm->fm_flow_table == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}

	/*
	 * flow_owner_bucket cache-aligned objects.
	 */
	fm->fm_owner_buckets = flow_owner_buckets_alloc(fob_cnt, &fob_sz, &fob_tot_sz);
	if (fm->fm_owner_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = fob_cnt;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_sz = fob_sz;
	*(size_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = fob_tot_sz;
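	/*
	 * These casts intentionally shed the const qualifier: the bucket
	 * geometry fields are declared const in struct flow_mgr so they
	 * stay immutable everywhere else, and the create/destroy paths
	 * are the only places allowed to write them.
	 */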

	/*
	 * flow_route_bucket cache-aligned objects.
	 */
	fm->fm_route_buckets = flow_route_buckets_alloc(frb_cnt, &frb_sz, &frb_tot_sz);
	if (fm->fm_route_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_buckets_cnt = frb_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_sz = frb_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = frb_tot_sz;

	/*
	 * flow_route_id_bucket cache-aligned objects.
	 */
	fm->fm_route_id_buckets =
	    flow_route_id_buckets_alloc(frib_cnt, &frib_sz, &frib_tot_sz);
	if (fm->fm_route_id_buckets == NULL) {
		flow_mgr_destroy(fm);
		return NULL;
	}
	/* const overrides */
	*(size_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = frib_cnt;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = frib_sz;
	*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = frib_tot_sz;

	/* construct flow_owner_buckets */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob = flow_mgr_get_fob_at_idx(fm, i);
		flow_owner_bucket_init(fob);
		/* const override */
		*(size_t *)(uintptr_t)&fob->fob_idx = i;
	}

	/* construct flow_route_buckets */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		flow_route_bucket_init(frb);
		/* const override */
		*(size_t *)(uintptr_t)&frb->frb_idx = i;
	}

	/* construct flow_route_id_buckets */
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		struct flow_route_id_bucket *frib =
		    flow_mgr_get_frib_at_idx(fm, i);
		flow_route_id_bucket_init(frib);
		/* const override */
		*(size_t *)(uintptr_t)&frib->frib_idx = i;
	}

	uuid_generate_random(fm->fm_uuid);

	lck_rw_lock_exclusive(&flow_mgr_lock);
	RB_INSERT(flow_mgr_tree, &flow_mgr_head, fm);
#if DEBUG
	struct flow_mgr find;
	uuid_copy(find.fm_uuid, fm->fm_uuid);
	/* make sure our tree compare routine is sane */
	ASSERT(RB_FIND(flow_mgr_tree,
	    &flow_mgr_head, &find) == fm);
#endif /* DEBUG */
	lck_rw_done(&flow_mgr_lock);

	fm->fm_flow_hash_masks[0] = FKMASK_5TUPLE;
	fm->fm_flow_hash_masks[1] = FKMASK_4TUPLE;
	fm->fm_flow_hash_masks[2] = FKMASK_3TUPLE;
	fm->fm_flow_hash_masks[3] = FKMASK_2TUPLE;
	fm->fm_flow_hash_masks[4] = FKMASK_IPFLOW3;
	fm->fm_flow_hash_masks[5] = FKMASK_IPFLOW2;
	fm->fm_flow_hash_masks[6] = FKMASK_IPFLOW1;

	memset(&fm->fm_flow_hash_count, 0, sizeof(fm->fm_flow_hash_count));

	return fm;
}

void
flow_mgr_destroy(struct flow_mgr *fm)
{
	uint32_t i;

	lck_rw_lock_exclusive(&flow_mgr_lock);
	ASSERT(!uuid_is_null(fm->fm_uuid));

	if (fm->fm_flow_table != NULL) {
		cuckoo_hashtable_free(fm->fm_flow_table);
	}

	if (fm->fm_owner_buckets != NULL) {
		for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
			struct flow_owner_bucket *fob =
			    flow_mgr_get_fob_at_idx(fm, i);
			ASSERT(fob->fob_idx == i);
			flow_owner_bucket_destroy(fob);
		}
		flow_owner_buckets_free(fm->fm_owner_buckets,
		    fm->fm_owner_bucket_tot_sz);
		fm->fm_owner_buckets = NULL;
		*(size_t *)(uintptr_t)&fm->fm_owner_buckets_cnt = 0;
		*(size_t *)(uintptr_t)&fm->fm_owner_bucket_sz = 0;
		*(size_t *)(uintptr_t)&fm->fm_owner_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_owner_buckets_cnt == 0);
	ASSERT(fm->fm_owner_bucket_sz == 0);
	ASSERT(fm->fm_owner_bucket_tot_sz == 0);

	if (fm->fm_route_buckets != NULL) {
		for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
			struct flow_route_bucket *frb =
			    flow_mgr_get_frb_at_idx(fm, i);
			ASSERT(frb->frb_idx == i);
			flow_route_bucket_destroy(frb);
		}
		flow_route_buckets_free(fm->fm_route_buckets,
		    fm->fm_route_bucket_tot_sz);
		fm->fm_route_buckets = NULL;
		*(size_t *)(uintptr_t)&fm->fm_route_buckets_cnt = 0;
		*(size_t *)(uintptr_t)&fm->fm_route_bucket_sz = 0;
		*(size_t *)(uintptr_t)&fm->fm_route_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_route_buckets_cnt == 0);
	ASSERT(fm->fm_route_bucket_sz == 0);
	ASSERT(fm->fm_route_bucket_tot_sz == 0);

	if (fm->fm_route_id_buckets != NULL) {
		for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
			struct flow_route_id_bucket *frib =
			    flow_mgr_get_frib_at_idx(fm, i);
			ASSERT(frib->frib_idx == i);
			flow_route_id_bucket_destroy(frib);
		}
		flow_route_id_buckets_free(fm->fm_route_id_buckets,
		    fm->fm_route_id_bucket_tot_sz);
		fm->fm_route_id_buckets = NULL;
		*(size_t *)(uintptr_t)&fm->fm_route_id_buckets_cnt = 0;
		*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_sz = 0;
		*(size_t *)(uintptr_t)&fm->fm_route_id_bucket_tot_sz = 0;
	}
	ASSERT(fm->fm_route_id_buckets_cnt == 0);
	ASSERT(fm->fm_route_id_bucket_sz == 0);
	ASSERT(fm->fm_route_id_bucket_tot_sz == 0);

	uuid_clear(fm->fm_uuid);
	RB_REMOVE(flow_mgr_tree, &flow_mgr_head, fm);
	lck_rw_done(&flow_mgr_lock);

	sk_free_type(struct flow_mgr, fm);
}

void
flow_mgr_terminate(struct flow_mgr *fm)
{
	uint32_t i;

	/*
	 * Purge all flow entries.
	 */
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK(fob);
		fob->fob_busy_flags |= FOBF_DEAD;
	}
	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		struct flow_owner_bucket *fob =
		    flow_mgr_get_fob_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging fob 0x%llx [%u]", SK_KVA(fob), i);
		flow_owner_bucket_purge_all(fob);
	}

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		FOB_UNLOCK(flow_mgr_get_fob_at_idx(fm, i));
	}
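	/*
	 * Ordering note: all owner buckets are locked and marked
	 * FOBF_DEAD before any purging starts, and the locks are only
	 * dropped after every bucket has been purged.  The flow route
	 * buckets below follow the same lock-all/purge/unlock-all
	 * pattern.
	 */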

	/*
	 * Purge all flow routes.
	 */
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		FRB_WLOCK(frb);
	}
	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}

	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb =
		    flow_mgr_get_frb_at_idx(fm, i);
		SK_DF(SK_VERB_FLOW, "purging frb 0x%llx [%u]", SK_KVA(frb), i);
		flow_route_bucket_purge_all(frb);
	}

	for (i = 0; i < fm->fm_route_id_buckets_cnt; i++) {
		FRIB_WUNLOCK(flow_mgr_get_frib_at_idx(fm, i));
	}
	for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		FRB_WUNLOCK(flow_mgr_get_frb_at_idx(fm, i));
	}
}

/*
 * Must be matched with a call to flow_mgr_unlock().  Upon success this
 * returns the flow manager with the specified UUID, with the global
 * flow_mgr_lock held as reader; the caller is then expected to release
 * the lock.
 */
struct flow_mgr *
flow_mgr_find_lock(uuid_t uuid)
{
	struct flow_mgr *fm, find;

	uuid_copy(find.fm_uuid, uuid);

	lck_rw_lock_shared(&flow_mgr_lock);

	fm = RB_FIND(flow_mgr_tree, &flow_mgr_head, &find);
	if (fm == NULL) {
		lck_rw_done(&flow_mgr_lock);
		return NULL;
	}

	/* caller is expected to call flow_mgr_unlock() when done */
	LCK_RW_ASSERT(&flow_mgr_lock, LCK_RW_ASSERT_SHARED);
	return fm;
}

/*
 * Must be matched with a successful call to flow_mgr_find_lock().
 */
void
flow_mgr_unlock(void)
{
	lck_rw_done(&flow_mgr_lock);
}
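
/*
 * Typical usage of the find/unlock pair (sketch):
 *
 *	struct flow_mgr *fm = flow_mgr_find_lock(uuid);
 *	if (fm != NULL) {
 *		...use fm while flow_mgr_lock is held shared...
 *		flow_mgr_unlock();
 *	}
 */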

static inline int
fm_cmp(const struct flow_mgr *a, const struct flow_mgr *b)
{
	return uuid_compare(a->fm_uuid, b->fm_uuid);
}

static void
flow_mgr_clear_embedded_scope_id(struct sockaddr_in6 *addr)
{
	struct in6_addr *in6;
	in6 = &addr->sin6_addr;
	if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(in6)) {
		addr->sin6_scope_id = ntohs(in6->s6_addr16[1]);
		in6->s6_addr16[1] = 0;
	}
}

#if CONFIG_MACF
static int
flow_req_check_mac_allowed(struct nx_flow_req *req)
{
	int socktype;
	switch (req->nfr_ip_protocol) {
	case IPPROTO_TCP:
		socktype = SOCK_STREAM;
		break;

	case IPPROTO_UDP:
		socktype = SOCK_DGRAM;
		break;

	default:
		/* custom IP protocol, treated as an IP datagram type */
		socktype = SOCK_DGRAM;
		return 0;
	}

	if (req->nfr_flags & NXFLOWREQF_LISTENER) {
		return mac_skywalk_flow_check_listen(req->nfr_proc, NULL,
		    SA(&req->nfr_saddr.sa), socktype, req->nfr_ip_protocol);
	} else {
		return mac_skywalk_flow_check_connect(req->nfr_proc, NULL,
		    SA(&req->nfr_daddr.sa), socktype, req->nfr_ip_protocol);
	}
}
#endif /* CONFIG_MACF */

static bool
flow_req_needs_netns_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto == IPPROTO_TCP || proto == IPPROTO_UDP;
}

static bool
flow_req_needs_protons_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto != IPPROTO_TCP && proto != IPPROTO_UDP &&
	    proto != IPPROTO_ESP && proto != IPPROTO_AH;
}

static bool
flow_req_needs_ipsec_reservation(struct nx_flow_req *req)
{
	uint8_t proto = req->nfr_ip_protocol;
	return proto == IPPROTO_ESP || proto == IPPROTO_AH;
}
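
/*
 * Taken together, the three predicates above partition flows by IP
 * protocol: TCP and UDP flows reserve their local port in the network
 * namespace (netns), ESP/AH flows take a custom IPsec reservation, and
 * every other IP protocol reserves the protocol number itself
 * (protons).  flow_req_prepare_namespace() below applies whichever
 * reservation the request calls for.
 */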

static void
flow_set_port_info(struct ns_flow_info *nfi, struct nx_flow_req *req)
{
	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;

	bzero(nfi, sizeof(struct ns_flow_info));

	nfi->nfi_ifp = req->nfr_ifp;

	nfi->nfi_laddr = *saddr;
	nfi->nfi_faddr = *daddr;

	nfi->nfi_protocol = req->nfr_ip_protocol;

	uuid_copy(nfi->nfi_flow_uuid, req->nfr_flow_uuid);
	ASSERT(!uuid_is_null(nfi->nfi_flow_uuid));

	nfi->nfi_owner_pid = req->nfr_pid;
	if (req->nfr_epid != -1) {
		nfi->nfi_effective_pid = req->nfr_epid;
		proc_name(req->nfr_epid, nfi->nfi_effective_name,
		    sizeof(nfi->nfi_effective_name));
	} else {
		nfi->nfi_effective_pid = -1;
	}

	proc_name(req->nfr_pid, nfi->nfi_owner_name,
	    sizeof(nfi->nfi_owner_name));
}

static int
flow_req_prepare_namespace(struct nx_flow_req *req)
{
	SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
	int err = 0;

	if (flow_req_needs_netns_reservation(req)) {
		if (!NETNS_TOKEN_VALID(&req->nfr_port_reservation)) {
			union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
			struct ns_flow_info nfi;
			netns_token ns_token;
			flow_set_port_info(&nfi, req);
			err = flow_namespace_create(saddr,
			    req->nfr_ip_protocol, &ns_token,
			    req->nfr_flags, &nfi);
			if (err != 0) {
				SK_ERR("netns for %s.%u failed",
				    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)),
				    sk_sa_get_port(SA(saddr)));
				goto fail;
			}
			req->nfr_port_reservation = ns_token;
			req->nfr_flags &= ~NXFLOWREQF_EXT_PORT_RSV;
		} else {
			/* Validate PID associated with provided reservation */
			struct ns_flow_info nfi = {};
			err = netns_get_flow_info(&req->nfr_port_reservation,
			    &nfi);
			/* flow info could be NULL for socket flow */
			if (!err && (req->nfr_pid != nfi.nfi_owner_pid ||
			    (req->nfr_epid != -1 && nfi.nfi_effective_pid !=
			    req->nfr_epid))) {
				SK_ERR("netns flow info mismatch, "
				    "req_(e)pid %d(%d), nfr_(e)pid %d(%d)",
				    req->nfr_pid, req->nfr_epid,
				    nfi.nfi_owner_pid, nfi.nfi_effective_pid);
				err = EPERM;
				goto fail;
			}
			req->nfr_flags |= NXFLOWREQF_EXT_PORT_RSV;
		}
	}

	if (flow_req_needs_ipsec_reservation(req)) {
		union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
		union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
		void *ipsec_token = NULL;
		ASSERT(req->nfr_ipsec_reservation == NULL);
		err = key_reserve_custom_ipsec(&ipsec_token, saddr,
		    daddr, req->nfr_ip_protocol);
		if (err != 0) {
			SK_ERR("custom ipsec %u reserve %s failed",
			    req->nfr_ip_protocol,
			    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
			goto fail;
		}
		req->nfr_ipsec_reservation = ipsec_token;
	}

	if (flow_req_needs_protons_reservation(req)) {
		struct protons_token *ns_token = NULL;
		if (!protons_token_is_valid(req->nfr_proto_reservation)) {
			err = protons_reserve(&ns_token, req->nfr_pid,
			    req->nfr_epid, req->nfr_ip_protocol);
			if (err != 0) {
				SK_ERR("protocol %u namespace failed",
				    req->nfr_ip_protocol);
				goto fail;
			}
			req->nfr_flags &= ~NXFLOWREQF_EXT_PROTO_RSV;
			req->nfr_proto_reservation = ns_token;
		} else {
			/* Validate PID associated with provided reservation */
			if (!protons_token_has_matching_pid(req->nfr_proto_reservation,
			    req->nfr_pid, req->nfr_epid)) {
				SK_ERR("protons token pid mismatch");
				err = EPERM;
				goto fail;
			}
			req->nfr_flags |= NXFLOWREQF_EXT_PROTO_RSV;
		}
	}

	return 0;

fail:
	VERIFY(err != 0);
	SK_ERR("preparation failed (err %d)", err);
	return err;
}

static int
flow_req_prepare(struct nx_flow_req *req, struct kern_nexus *nx,
    struct flow_mgr *fm, struct ifnet *ifp, flow_route_ctor_fn_t fr_ctor,
    flow_route_resolve_fn_t fr_resolve, void *fr_arg)
{
	int err = 0;
	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;

	sa_family_t saf, daf, xaf, af;

	saf = SA(saddr)->sa_family;
	daf = SA(daddr)->sa_family;
	xaf = saf ^ daf;
	if (xaf != 0 && xaf != saf && xaf != daf) {
		SK_ERR("invalid saddr af %d daddr af %d", saf, daf);
		return EINVAL;
	}
	af = (xaf == 0) ? saf : xaf;
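
	/*
	 * How the XOR check works: xaf is zero when both families match;
	 * if exactly one side is AF_UNSPEC (0), xaf equals the non-zero
	 * family and the condition above passes, letting "af" adopt it.
	 * Two different non-zero families yield an xaf that matches
	 * neither side, which is rejected as EINVAL.
	 */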

	bool has_saddr = false, has_daddr = false;
	bool has_sport = false, has_dport = false;
	uint16_t sport, dport;
	uint8_t sa_len;
	switch (af) {
	case AF_INET:
		sa_len = sizeof(struct sockaddr_in);
		has_saddr = (SIN(saddr)->sin_addr.s_addr != INADDR_ANY);
		has_daddr = (SIN(daddr)->sin_addr.s_addr != INADDR_ANY);
		sport = SIN(saddr)->sin_port;
		dport = SIN(daddr)->sin_port;
		has_sport = (sport != 0);
		has_dport = (dport != 0);

		if ((has_saddr && SIN(saddr)->sin_len != sa_len) ||
		    (has_daddr && SIN(daddr)->sin_len != sa_len)) {
			SK_ERR("sin_len invalid");
			err = EINVAL;
			goto fail;
		}
		if ((has_saddr && IN_MULTICAST(ntohl(SIN(saddr)->sin_addr.s_addr))) ||
		    (has_daddr && IN_MULTICAST(ntohl(SIN(daddr)->sin_addr.s_addr)))) {
			SK_ERR("multicast flow not yet supported");
			err = EADDRNOTAVAIL;
			goto fail;
		}
		if (__probable(protocol == IPPROTO_TCP)) {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet_dgram_total);
		}
		break;

	case AF_INET6:
		sa_len = sizeof(struct sockaddr_in6);
		has_saddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(saddr)->sin6_addr);
		has_daddr = !IN6_IS_ADDR_UNSPECIFIED(&SIN6(daddr)->sin6_addr);
		sport = SIN6(saddr)->sin6_port;
		dport = SIN6(daddr)->sin6_port;
		has_sport = (sport != 0);
		has_dport = (dport != 0);
		if ((has_saddr && SIN6(saddr)->sin6_len != sa_len) ||
		    (has_daddr && SIN6(daddr)->sin6_len != sa_len)) {
			SK_ERR("sin6_len invalid");
			err = EINVAL;
			goto fail;
		}
		/* clear embedded scope if link-local src */
		if (has_saddr) {
			flow_mgr_clear_embedded_scope_id(SIN6(saddr));
			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(saddr)->sin6_addr)) {
				SIN6(saddr)->sin6_scope_id = ifp->if_index;
			}
		}
		if (has_daddr) {
			flow_mgr_clear_embedded_scope_id(SIN6(daddr));
			if (!in6_embedded_scope && IN6_IS_SCOPE_EMBED(&SIN6(daddr)->sin6_addr)) {
				SIN6(daddr)->sin6_scope_id = ifp->if_index;
			}
		}
		if ((has_saddr && IN6_IS_ADDR_MULTICAST(&SIN6(saddr)->sin6_addr)) ||
		    (has_daddr && IN6_IS_ADDR_MULTICAST(&SIN6(daddr)->sin6_addr))) {
			SK_ERR("multicast flow not yet supported");
			err = EADDRNOTAVAIL;
			goto fail;
		}
		if (__probable(protocol == IPPROTO_TCP)) {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet6_stream_total);
		} else {
			INC_ATOMIC_INT64_LIM(
				net_api_stats.nas_nx_flow_inet6_dgram_total);
		}
		break;

	default:
		SK_ERR("unknown address families saf %d daf %d", saf, daf);
		err = EINVAL;
		goto fail;
	}

	SA(saddr)->sa_family = SA(daddr)->sa_family = af;
	SA(saddr)->sa_len = SA(daddr)->sa_len = sa_len;

	if (__improbable(has_saddr && !flow_route_laddr_validate(saddr, ifp,
	    &req->nfr_saddr_gencnt))) {
		SK_LOG_VAR(char src_s[MAX_IPv6_STR_LEN]);
		SK_ERR("src address %s is not valid",
		    sk_sa_ntop(SA(saddr), src_s, sizeof(src_s)));
		err = EADDRNOTAVAIL;
		goto fail;
	}

	bool is_tcp_udp = (protocol == IPPROTO_TCP || protocol == IPPROTO_UDP);
	if (!is_tcp_udp) {
		if (has_sport || has_dport) {
			SK_ERR("non-zero port for IP flow");
			return EINVAL;
		}
	} else {
		/* dst:dport as connected, 0:0 as listener, but not partial */
		if (has_daddr != has_dport) {
			err = EINVAL;
			SK_ERR("invalid dst/dport for TCP/UDP (err %d)", err);
			goto fail;
		}
	}

	if (!has_daddr && !has_dport) {
		req->nfr_flags |= NXFLOWREQF_LISTENER;
	}

	if (req->nfr_transport_protocol == 0) {
		req->nfr_transport_protocol = req->nfr_ip_protocol;
	}

	bool is_child_flow = !uuid_is_null(req->nfr_parent_flow_uuid);
	if ((is_child_flow && req->nfr_flow_demux_count == 0) ||
	    (!is_child_flow && req->nfr_flow_demux_count > 0)) {
		err = EINVAL;
		SK_ERR("invalid flow demux count");
		goto fail;
	}

	if (req->nfr_flow_demux_count > 0) {
		if (req->nfr_ip_protocol != IPPROTO_UDP) {
			err = EINVAL;
			SK_ERR("invalid ip protocol(%u) for flow demux",
			    req->nfr_ip_protocol);
			goto fail;
		}

		for (int i = 0; i < req->nfr_flow_demux_count; i++) {
			if (req->nfr_flow_demux_patterns[i].fdp_len > FLOW_DEMUX_MAX_LEN ||
			    req->nfr_flow_demux_patterns[i].fdp_len == 0) {
				err = EINVAL;
				SK_ERR("invalid flow demux pattern len %u",
				    req->nfr_flow_demux_patterns[i].fdp_len);
				goto fail;
			}
			if (req->nfr_flow_demux_patterns[i].fdp_offset +
			    req->nfr_flow_demux_patterns[i].fdp_len > MAX_PKT_DEMUX_LIMIT) {
				err = EINVAL;
				SK_ERR("invalid demux offset plus length(%u > %d)",
				    req->nfr_flow_demux_patterns[i].fdp_offset +
				    req->nfr_flow_demux_patterns[i].fdp_len, MAX_PKT_DEMUX_LIMIT);
				goto fail;
			}
		}
	}

	req->nfr_ifp = ifp;

#if CONFIG_MACF
	err = flow_req_check_mac_allowed(req);
	if (err != 0) {
		SK_ERR("flow req failed MAC check");
		goto fail;
	}
#endif /* CONFIG_MACF */

	/* set up the flow route and prepare saddr if needed */
	if (__probable(has_daddr || has_dport)) {
		struct flow_route *fr = NULL;
		err = flow_route_find(nx, fm, ifp, req, fr_ctor,
		    fr_resolve, fr_arg, &fr);
		if (__improbable(err != 0)) {
			SK_ERR("flow route lookup failed");
			ASSERT(fr == NULL);
			goto fail;
		}
		ASSERT(fr != NULL);
		/* Pick up the default source address from the flow route. */
		if (!has_saddr) {
			*saddr = fr->fr_laddr;
			SIN(saddr)->sin_port = sport;
		}
		req->nfr_route = fr;
		fr = NULL;
	}

	/* child flows do not hold namespace references */
	if (__probable(uuid_is_null(req->nfr_parent_flow_uuid))) {
		err = flow_req_prepare_namespace(req);
		if (err != 0) {
			goto fail;
		}
	}

	return 0;

fail:
	VERIFY(err != 0);
	if (req->nfr_route != NULL) {
		flow_route_release(req->nfr_route);
		req->nfr_route = NULL;
	}
	SK_ERR("preparation failed (err %d)", err);
	return err;
}

static void
flow_req_cleanup(struct nx_flow_req *req)
{
	if (NETNS_TOKEN_VALID(&req->nfr_port_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PORT_RSV)) {
		netns_release(&req->nfr_port_reservation);
	}

	if (protons_token_is_valid(req->nfr_proto_reservation) &&
	    !(req->nfr_flags & NXFLOWREQF_EXT_PROTO_RSV)) {
		protons_release(&req->nfr_proto_reservation);
	}

	if (key_custom_ipsec_token_is_valid(req->nfr_ipsec_reservation)) {
		key_release_custom_ipsec(&req->nfr_ipsec_reservation);
	}
}

#if SK_LOG
/* Hoisted out of line to reduce kernel stack footprint */
SK_LOG_ATTRIBUTE
static void
flow_req_dump(char *desc, struct nx_flow_req *req)
{
	if (!(sk_verbose & SK_VERB_FLOW)) {
		return;
	}

	union sockaddr_in_4_6 *saddr = &req->nfr_saddr;
	union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
	uint8_t protocol = req->nfr_ip_protocol;
	char src_s[MAX_IPv6_STR_LEN];
	char dst_s[MAX_IPv6_STR_LEN];
	uint8_t sipver = 0, dipver = 0;
	uint16_t sport = 0, dport = 0;
	uuid_string_t uuid_s;

	// unsanitized req, treat source and destination AF separately
	if (saddr->sa.sa_family == AF_INET) {
		sipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(saddr)->sin_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin.sin_port);
	} else if (saddr->sa.sa_family == AF_INET6) {
		sipver = IPV6_VERSION;
		(void) inet_ntop(AF_INET6, &SIN6(saddr)->sin6_addr, src_s,
		    sizeof(src_s));
		sport = ntohs(saddr->sin6.sin6_port);
	} else {
		sipver = 0;
		strlcpy(src_s, "INV", sizeof(src_s));
	}
	if (daddr->sa.sa_family == AF_INET) {
		dipver = IPVERSION;
		(void) inet_ntop(AF_INET, &SIN(daddr)->sin_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin.sin_port);
	} else if (daddr->sa.sa_family == AF_INET6) {
		dipver = IPV6_VERSION;
		(void) inet_ntop(AF_INET6, &SIN6(daddr)->sin6_addr, dst_s,
		    sizeof(dst_s));
		dport = ntohs(daddr->sin6.sin6_port);
	} else {
		dipver = 0;
		strlcpy(dst_s, "INV", sizeof(dst_s));
	}

	SK_DF(SK_VERB_FLOW,
	    "%s %s sipver=%u,dipver=%u,src=%s,dst=%s,proto=%d,sport=%u,dport=%u"
	    " nx_port=%u,flags 0x%b", desc, sk_uuid_unparse(req->nfr_flow_uuid,
	    uuid_s), sipver, dipver, src_s, dst_s, protocol, sport, dport,
	    req->nfr_nx_port, req->nfr_flags, NXFLOWREQF_BITS);
}
#else
#define flow_req_dump(str, req) do { ((void)0); } while (0)
#endif /* SK_LOG */

/*
 * Add a flow entry to the flowswitch on behalf of the flow owner.
 * Caller must hold the flow owner bucket lock (writer) across the call.
 */
int
flow_mgr_flow_add(struct kern_nexus *nx, struct flow_mgr *fm,
    struct flow_owner *fo, struct ifnet *ifp, struct nx_flow_req *req,
    flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
    void *fr_arg)
{
	struct flow_entry *fe;
	int err = 0;

	ASSERT(ifp != NULL);
	ASSERT(fr_ctor != NULL && fr_resolve != NULL);
	FOB_LOCK_ASSERT_HELD(FO_BUCKET(fo));

	flow_req_dump("req", req);

	if (!(req->nfr_flags & NXFLOWREQF_ASIS)) {
		err = flow_req_prepare(req, nx, fm, ifp, fr_ctor, fr_resolve, fr_arg);
		if (err != 0) {
			SK_ERR("flow req preparation failure (err %d)", err);
			return err;
		}
	}

	/*
	 * Add entry in flowswitch table; upon success, the flow entry adds
	 * a retain count on the flow route (we'll always need to release
	 * the refcnt from flow_route_find), and the local address:port of
	 * the flow entry will be set.
	 */
	fe = flow_entry_alloc(fo, req, &err);
	if (__improbable(fe == NULL)) {
		ASSERT(err != 0);
		goto fail;
	}

	VERIFY(NETNS_TOKEN_VALID(&fe->fe_port_reservation) ||
	    !(fe->fe_key.fk_mask & FKMASK_SPORT) ||
	    req->nfr_flags & NXFLOWREQF_ASIS ||
	    (fe->fe_flags & FLOWENTF_CHILD));
	VERIFY((req->nfr_flags & NXFLOWREQF_FLOWADV) ^
	    (req->nfr_flowadv_idx == FLOWADV_IDX_NONE));
	req->nfr_flowadv_idx = fe->fe_adv_idx;

	flow_req_dump("added ", req);

	if (fe != NULL) {
		flow_entry_release(&fe);
	}

	struct nx_flowswitch *fsw = NX_FSW_PRIVATE(nx);
	if (req->nfr_saddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_saddr.sin6.sin6_addr)) {
		req->nfr_saddr.sin6.sin6_scope_id = ifnet_index(fsw->fsw_ifp);
	}
	if (req->nfr_daddr.sa.sa_family == AF_INET6 &&
	    IN6_IS_SCOPE_EMBED(&req->nfr_daddr.sin6.sin6_addr)) {
		req->nfr_daddr.sin6.sin6_scope_id = ifnet_index(fsw->fsw_ifp);
	}

	return 0;

fail:
	VERIFY(err != 0);
	flow_req_cleanup(req);

	return err;
}

struct flow_owner_bucket *
flow_mgr_get_fob_by_pid(struct flow_mgr *fm, pid_t pid)
{
	return flow_mgr_get_fob_at_idx(fm,
	    (pid % fm->fm_owner_buckets_cnt));
}

struct flow_entry *
flow_mgr_get_fe_by_uuid_rlock(struct flow_mgr *fm, uuid_t uuid)
{
	uint32_t i;
	struct flow_owner_bucket *fob;
	struct flow_owner *fo;
	struct flow_entry *fe;

	for (i = 0; i < fm->fm_owner_buckets_cnt; i++) {
		fob = flow_mgr_get_fob_at_idx(fm, i);
		FOB_LOCK_SPIN(fob);
		RB_FOREACH(fo, flow_owner_tree, &fob->fob_owner_head) {
			fe = flow_entry_find_by_uuid(fo, uuid);
			if (fe != NULL) {
				FOB_LOCK_CONVERT(fob);
				FOB_UNLOCK(fob);
				return fe;
			}
		}
		FOB_UNLOCK(fob);
	}
	return NULL;
}

struct flow_route_bucket *
flow_mgr_get_frb_by_addr(struct flow_mgr *fm,
    union sockaddr_in_4_6 *daddr)
{
	uint32_t a = 0x9e3779b9, b = 0x9e3779b9, c = flow_seed;

	switch (SA(daddr)->sa_family) {
	case AF_INET: {
		uint8_t *p = (uint8_t *)&SIN(daddr)->sin_addr.s_addr;
		b += ((uint32_t)p[3]);
		a += ((uint32_t)p[2]) << 24;
		a += ((uint32_t)p[1]) << 16;
		a += ((uint32_t)p[0]) << 8;
		break;
	}

	case AF_INET6: {
		b += SIN6(daddr)->sin6_addr.s6_addr32[3];
		a += SIN6(daddr)->sin6_addr.s6_addr32[2];
		a += SIN6(daddr)->sin6_addr.s6_addr32[1];
		a += SIN6(daddr)->sin6_addr.s6_addr32[0];
		break;
	}

	default:
		VERIFY(0);
		/* NOTREACHED */
		__builtin_unreachable();
	}

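	/*
	 * What follows appears to be the classic Bob Jenkins 96-bit mix
	 * (0x9e3779b9 above is the golden-ratio constant commonly used
	 * with it).  The final "& (cnt - 1)" reduction is why
	 * flow_mgr_create() insists that frb_cnt be a power of two.
	 */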
	/* mix */
	a -= b; a -= c; a ^= (c >> 13);
	b -= c; b -= a; b ^= (a << 8);
	c -= a; c -= b; c ^= (b >> 13);
	a -= b; a -= c; a ^= (c >> 12);
	b -= c; b -= a; b ^= (a << 16);
	c -= a; c -= b; c ^= (b >> 5);
	a -= b; a -= c; a ^= (c >> 3);
	b -= c; b -= a; b ^= (a << 10);
	c -= a; c -= b; c ^= (b >> 15);

	c &= (fm->fm_route_buckets_cnt - 1);

	return flow_mgr_get_frb_at_idx(fm, c);
}

struct flow_route_id_bucket *
flow_mgr_get_frib_by_uuid(struct flow_mgr *fm, uuid_t fr_uuid)
{
	union {
		uuid_t uuid __sk_aligned(8);
		uint64_t u64[2];
	} u;
	uint64_t key;

	_CASSERT(sizeof(u.uuid) == sizeof(u.u64));
	uuid_copy(u.uuid, fr_uuid);

	/* XOR fold UUID down to 4 bytes */
	key = (u.u64[0] ^ u.u64[1]);
	key = ((key >> 32) ^ (key & 0xffffffff));

	/* reduce modulo the bucket count to pick a bucket */
	return flow_mgr_get_frib_at_idx(fm,
	    ((uint32_t)key % fm->fm_route_id_buckets_cnt));
}

static int
flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask, int32_t v)
{
	for (uint32_t i = 0; i < FKMASK_IDX_MAX; i++) {
		if (fm->fm_flow_hash_masks[i] == mask) {
			os_atomic_add(&fm->fm_flow_hash_count[i], v, relaxed);
			return 0;
		}
	}
	SK_ERR("unknown hash mask 0x%x", mask);
	return ENOTSUP;
}

int
flow_mgr_flow_hash_mask_add(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, 1);
}

int
flow_mgr_flow_hash_mask_del(struct flow_mgr *fm, uint32_t mask)
{
	return flow_hash_mask_add(fm, mask, -1);
}

#if SK_LOG
SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_prelog(struct flow_key *key)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "key %s",
	    fk_as_string(key, dbgbuf, sizeof(dbgbuf)));
}

SK_NO_INLINE_ATTRIBUTE
static void
__flow_mgr_find_fe_by_key_epilog(struct flow_entry *fe)
{
	SK_LOG_VAR(char dbgbuf[FLOWENTRY_DBGBUF_SIZE]);
	if (fe != NULL) {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe 0x%llx \"%s\"",
		    SK_KVA(fe), fe_as_string(fe, dbgbuf, sizeof(dbgbuf)));
	} else {
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP, "fe not found");
	}
}
#else
#define __flow_mgr_find_fe_by_key_prelog(key) do { ((void)0); } while (0)
#define __flow_mgr_find_fe_by_key_epilog(fe) do { ((void)0); } while (0)
#endif /* SK_LOG */

struct flow_entry *
flow_mgr_find_fe_by_key(struct flow_mgr *fm, struct flow_key *key)
{
	struct cuckoo_node *node = NULL;
	struct flow_entry *fe = NULL;
	uint32_t hash = 0;
	uint16_t saved_mask = key->fk_mask;

	__flow_mgr_find_fe_by_key_prelog(key);

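	/*
	 * Try each hash mask in the order it was seeded by
	 * flow_mgr_create(), i.e. from the most specific (5-tuple) down
	 * to the least specific (IP-only) match, rewriting key->fk_mask
	 * for each probe; fm_flow_hash_count[] lets us skip mask classes
	 * that currently have no entries.  The first hit wins.
	 */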
	for (int i = 0; i < FKMASK_IDX_MAX; i++) {
		size_t count = fm->fm_flow_hash_count[i];
		uint16_t mask = fm->fm_flow_hash_masks[i];
		if (count == 0 || mask == 0) {
			SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
			    "[%d] mask=%08x count=%zu skipped",
			    i, mask, count);
			continue;
		}
		key->fk_mask = mask;
		hash = flow_key_hash(key);
		node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
		SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
		    "[%d] mask=%08x hash %08x node 0x%llx", i, mask, hash,
		    SK_KVA(node));
		if (node != NULL) {
			fe = container_of(node, struct flow_entry, fe_cnode);
			/* a v4-only listener fe shouldn't get a v6 connection */
			if (__improbable(fe->fe_key.fk_mask == FKMASK_2TUPLE &&
			    fe->fe_key.fk_ipver == IPVERSION &&
			    key->fk_ipver == IPV6_VERSION)) {
				flow_entry_release(&fe);
				ASSERT(fe == NULL);
				SK_DF(SK_VERB_FLOW | SK_VERB_LOOKUP,
				    "\tskip v4-only fe");
				continue;
			}
			break;
		}
	}

	key->fk_mask = saved_mask;

	__flow_mgr_find_fe_by_key_epilog(fe);

	return fe;
}

struct flow_entry *
flow_mgr_find_conflicting_fe(struct flow_mgr *fm, struct flow_key *key)
{
	struct cuckoo_node *node = NULL;
	struct flow_entry *fe = NULL;
	uint32_t hash = 0;

	hash = flow_key_hash(key);
	node = cuckoo_hashtable_find_with_hash(fm->fm_flow_table, key, hash);
	if (node != NULL) {
		fe = container_of(node, struct flow_entry, fe_cnode);
		return fe;
	}

	/* listener flow conflicts are checked at netns reservation time */
	return fe;
}

void
flow_mgr_foreach_flow(struct flow_mgr *fm,
    void (^flow_handler)(struct flow_entry *fe))
{
	cuckoo_hashtable_foreach(fm->fm_flow_table,
	    ^(struct cuckoo_node *node, uint32_t hv) {
#pragma unused(hv)
		struct flow_entry *fe;
		fe = container_of(node, struct flow_entry, fe_cnode);
		flow_handler(fe);

		if (fe->fe_flags & FLOWENTF_PARENT) {
			struct flow_entry *child_fe;
			lck_rw_lock_shared(&fe->fe_child_list_lock);
			TAILQ_FOREACH(child_fe, &fe->fe_child_list, fe_child_link) {
				flow_handler(child_fe);
			}
			lck_rw_unlock_shared(&fe->fe_child_list_lock);
		}
	});
}

bool
rx_flow_demux_match(struct nx_flowswitch *fsw, struct flow_entry *fe,
    struct __kern_packet *pkt)
{
	struct udphdr *uh;
	uint8_t *pkt_buf;
	uint32_t bdlen, bdlim, bdoff, pkt_payload_len;
	uint8_t *demux_data;

	ASSERT(fe->fe_flags & FLOWENTF_CHILD);
	ASSERT(fe->fe_demux_pattern_count > 0);

	if (fe->fe_flags & (FLOWENTF_TORN_DOWN | FLOWENTF_NONVIABLE)) {
		return false;
	}

	/*
	 * Demux is only supported for UDP packets with a payload.
	 */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		return false;
	}

	uh = (struct udphdr *)pkt->pkt_flow_udp_hdr;
	if (__improbable(uh == NULL || pkt->pkt_flow_ulen == 0)) {
		return false;
	}

	int udp_payload_offset = pkt->pkt_l2_len + pkt->pkt_flow_ip_hlen + sizeof(*uh);

	MD_BUFLET_ADDR_ABS_DLEN(pkt, pkt_buf, bdlen, bdlim, bdoff);
	pkt_payload_len = bdlim - bdoff;
	pkt_payload_len = MIN(pkt_payload_len, pkt->pkt_length);
	pkt_payload_len -= udp_payload_offset;
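
	/*
	 * At this point pkt_payload_len is the number of UDP payload
	 * bytes available contiguously in the first buflet (capped by
	 * the packet length).  Patterns that reach beyond it are copied
	 * out of the attached mbuf below, when one exists.
	 */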

	for (int index = 0; index < fe->fe_demux_pattern_count; index++) {
		struct flow_demux_pattern *demux_pattern =
		    &fe->fe_demux_patterns[index].fdp_demux_pattern;
		ASSERT(demux_pattern->fdp_len > 0);

		if (pkt->pkt_flow_ulen >= demux_pattern->fdp_offset + demux_pattern->fdp_len) {
			if (__probable(pkt_payload_len >= demux_pattern->fdp_offset +
			    demux_pattern->fdp_len)) {
				demux_data = (uint8_t *)(uh + 1) + demux_pattern->fdp_offset;
			} else {
				if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
					m_copydata(pkt->pkt_mbuf,
					    udp_payload_offset + demux_pattern->fdp_offset,
					    demux_pattern->fdp_len, fe->fe_demux_pkt_data);
					demux_data = fe->fe_demux_pkt_data;
				} else {
					FSW_STATS_INC(FSW_STATS_RX_DEMUX_SHORT_ERR);
					return false;
				}
			}

			int result = -1;
			if (fe->fe_demux_patterns[index].fdp_memcmp_mask != NULL) {
				result = fe->fe_demux_patterns[index].fdp_memcmp_mask(demux_data,
				    demux_pattern->fdp_value, demux_pattern->fdp_mask);
			} else {
				result = sk_memcmp_mask(demux_data, demux_pattern->fdp_value,
				    demux_pattern->fdp_mask, demux_pattern->fdp_len);
			}

			if (result == 0) {
				return true;
			}
		}
	}

	return false;
}

struct flow_entry *
rx_lookup_child_flow(struct nx_flowswitch *fsw, struct flow_entry *parent_fe,
    struct __kern_packet *pkt)
{
	struct flow_entry *child_fe;

	/*
	 * Demux is only supported for UDP packets with a payload.
	 */
	if (__improbable(pkt->pkt_flow_ip_proto != IPPROTO_UDP)) {
		return NULL;
	}

	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);

	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
		if (rx_flow_demux_match(fsw, child_fe, pkt)) {
			flow_entry_retain(child_fe);
			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
			return child_fe;
		}
	}

	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
	return NULL;
}

struct flow_entry *
tx_lookup_child_flow(struct flow_entry *parent_fe, uuid_t flow_id)
{
	struct flow_entry *child_fe;

	ASSERT(parent_fe->fe_flags & FLOWENTF_PARENT);

	lck_rw_lock_shared(&parent_fe->fe_child_list_lock);
	TAILQ_FOREACH(child_fe, &parent_fe->fe_child_list, fe_child_link) {
		if (_UUID_MATCH(flow_id, child_fe->fe_uuid)) {
			flow_entry_retain(child_fe);
			lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
			return child_fe;
		}
	}

	lck_rw_unlock_shared(&parent_fe->fe_child_list_lock);
	return NULL;
}