1 | /* |
2 | * Copyright (c) 2017-2021 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | /* |
30 | * Flow Routes. |
31 | * |
32 | * Each (non-listener) flow entry is always associated with a flow route |
33 | * object. Multiple flow entries sharing the same remote address will use |
34 | * the same flow route for that address. The flow route object contains |
35 | * the route information for the remote node. It gets allocated when a |
36 | * flow entry requests to connect, and is garbage-collected when it's no |
37 | * longer referred to after its expiration time has passed. |
38 | * |
 * A flow route also contains the default local address that's used to
 * reach the remote node.  This isn't necessarily the local address used
 * by the flow entry, since the entry may have been explicitly bound to
 * another local address.  In the majority of cases, though, keeping the
 * local address in the flow route lets us avoid doing source address
 * selection each time a connect request happens.
45 | * |
 * When the remote node is reachable via a gateway, the gateway address
 * portion of the flow route holds the gateway's IP address, and the flow
 * route is marked with FLOWRTF_GATEWAY.  We use this to optimize the gateway
49 | * route lookup, since otherwise we'd have to perform an extra lookup |
50 | * each time we need to resolve the route. |
51 | * |
52 | * When the remote node is directly on the link, the FLOWRTF_ONLINK flag |
53 | * is set, and the gateway address isn't used. The target address used |
 * for resolution will be the remote address itself.
55 | * |
56 | * On links with link-layer information, we store the resolved address |
57 | * of the target node (which may be the gateway's) in the flow route, |
58 | * and mark the flow route with FLOWRTF_HAS_LLINFO. |
59 | * |
60 | * Each flow route also registers itself to receive route events when |
61 | * the underlying rtentry is updated or deleted. |
62 | */ |
63 | |
64 | #include <skywalk/os_skywalk_private.h> |
65 | |
66 | #include <skywalk/nexus/flowswitch/nx_flowswitch.h> |
67 | #include <skywalk/nexus/flowswitch/fsw_var.h> |
68 | #include <skywalk/nexus/flowswitch/flow/flow_var.h> |
69 | |
70 | #include <netinet/in.h> |
71 | #include <netinet/in_var.h> |
72 | #include <netinet/in_arp.h> |
73 | #include <netinet6/nd6.h> |
74 | #include <net/route.h> |
75 | |
76 | extern struct rtstat rtstat; |
77 | |
static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock");
79 | static LCK_ATTR_DECLARE(flow_route_lock_attr, 0, 0); |
80 | |
81 | static int fr_cmp(const struct flow_route *, const struct flow_route *); |
82 | static int fr_id_cmp(const struct flow_route *, const struct flow_route *); |
83 | static struct flow_route *fr_alloc(boolean_t); |
84 | static void fr_free(struct flow_route *); |
85 | static uint32_t flow_route_bucket_purge_common(struct flow_route_bucket *, |
86 | uint32_t *, boolean_t, boolean_t); |
87 | static void flow_route_ev_callback(struct eventhandler_entry_arg, |
88 | struct sockaddr *, int, struct sockaddr *, int); |
89 | |
90 | RB_GENERATE_PREV(flow_route_tree, flow_route, fr_link, fr_cmp); |
91 | RB_GENERATE_PREV(flow_route_id_tree, flow_route, fr_id_link, fr_id_cmp); |
92 | |
93 | KALLOC_TYPE_VAR_DEFINE(KT_SK_FRB, struct flow_route_bucket, KT_DEFAULT); |
94 | KALLOC_TYPE_VAR_DEFINE(KT_SK_FRIB, struct flow_route_id_bucket, KT_DEFAULT); |
95 | |
96 | #define FR_ZONE_NAME "flow.route" |
97 | |
98 | static unsigned int flow_route_size; /* size of flow_route */ |
99 | struct skmem_cache *flow_route_cache; /* cache for flow_route */ |
100 | |
101 | static int __flow_route_inited = 0; |
102 | |
103 | #define FLOW_ROUTE_EXPIRE 600 /* seconds */ |
104 | static unsigned int flow_route_expire = FLOW_ROUTE_EXPIRE; |
105 | |
106 | SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_expire, |
    CTLFLAG_RW | CTLFLAG_LOCKED, &flow_route_expire, 0, "");
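
/*
 * The expiration can be tuned at run time, e.g. (assuming the standard
 * sysctl(8) utility from user space):
 *
 *	sysctl kern.skywalk.flowswitch.flow_route_expire=300
 *
 * halves the idle lifetime of unreferenced flow routes.
 */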
108 | |
109 | void |
110 | flow_route_init(void) |
111 | { |
112 | ASSERT(!__flow_route_inited); |
113 | |
114 | flow_route_size = sizeof(struct flow_route); |
115 | flow_route_cache = skmem_cache_create(FR_ZONE_NAME, flow_route_size, |
116 | sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0); |
117 | |
118 | __flow_route_inited = 1; |
119 | } |
120 | |
121 | void |
122 | flow_route_fini(void) |
123 | { |
124 | if (__flow_route_inited) { |
125 | skmem_cache_destroy(flow_route_cache); |
126 | flow_route_cache = NULL; |
127 | |
128 | __flow_route_inited = 0; |
129 | } |
130 | } |
131 | |
132 | struct flow_route_bucket * |
133 | flow_route_buckets_alloc(size_t frb_cnt, size_t *frb_sz, size_t *tot_sz) |
134 | { |
135 | uint32_t cache_sz = skmem_cpu_cache_line_size(); |
136 | struct flow_route_bucket *frb; |
137 | size_t frb_tot_sz; |
138 | |
139 | /* each bucket is CPU cache-aligned */ |
140 | *frb_sz = P2ROUNDUP(sizeof(*frb), cache_sz); |
141 | *tot_sz = frb_tot_sz = frb_cnt * (*frb_sz); |
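	/*
	 * e.g. with 64-byte cache lines, a hypothetical 72-byte bucket
	 * rounds up to a 128-byte slot, so 8 buckets occupy 1KB in total.
	 */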
142 | frb = sk_alloc_type_hash(KT_SK_FRB, frb_tot_sz, Z_WAITOK, |
143 | skmem_tag_fsw_frb_hash); |
144 | if (__improbable(frb == NULL)) { |
145 | return NULL; |
146 | } |
147 | |
148 | #if !KASAN_CLASSIC |
149 | /* |
150 | * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline |
151 | * size alignment if the requested size is a multiple of a cacheline |
152 | * size (this is true for any size that is a power of two from 16 to |
153 | * PAGE_SIZE). |
154 | * |
	 * Because this is only an optimization, it is OK for KASAN_CLASSIC
	 * not to respect this.
157 | */ |
158 | ASSERT(IS_P2ALIGNED(frb, cache_sz)); |
159 | #endif |
160 | |
161 | SK_DF(SK_VERB_MEM, "frb 0x%llx frb_cnt %zu frb_sz %zu " |
162 | "(total %zu bytes) ALLOC" , SK_KVA(frb), frb_cnt, |
163 | *frb_sz, frb_tot_sz); |
164 | |
165 | return frb; |
166 | } |
167 | |
168 | void |
169 | flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz) |
170 | { |
171 | SK_DF(SK_VERB_MEM, "frb 0x%llx FREE" , SK_KVA(frb)); |
172 | sk_free_type_hash(KT_SK_FRB, tot_sz, frb); |
173 | } |
174 | |
175 | void |
176 | flow_route_bucket_init(struct flow_route_bucket *frb) |
177 | { |
178 | #if !KASAN_CLASSIC |
179 | ASSERT(IS_P2ALIGNED(frb, skmem_cpu_cache_line_size())); |
180 | #endif /* !KASAN_CLASSIC */ |
	lck_rw_init(&frb->frb_lock, &flow_route_lock_group,
	    &flow_route_lock_attr);
183 | RB_INIT(&frb->frb_head); |
184 | } |
185 | |
186 | void |
187 | flow_route_bucket_destroy(struct flow_route_bucket *frb) |
188 | { |
189 | ASSERT(RB_EMPTY(&frb->frb_head)); |
	lck_rw_destroy(&frb->frb_lock, &flow_route_lock_group);
191 | } |
192 | |
193 | static struct flow_route * |
194 | flow_route_find_by_addr(struct flow_route_bucket *frb, |
195 | union sockaddr_in_4_6 *dst) |
196 | { |
197 | struct flow_route *fr; |
198 | struct flow_route find; |
199 | |
200 | FRB_LOCK_ASSERT_HELD(frb); |
201 | |
202 | switch (SA(dst)->sa_family) { |
203 | case AF_INET: |
204 | find.fr_af = AF_INET; |
205 | find.fr_addr_len = sizeof(struct in_addr); |
206 | find.fr_addr_key = (void *)&SIN(dst)->sin_addr; |
207 | break; |
208 | |
209 | case AF_INET6: |
210 | find.fr_af = AF_INET6; |
211 | find.fr_addr_len = sizeof(struct in6_addr); |
212 | find.fr_addr_key = (void *)&SIN6(dst)->sin6_addr; |
213 | break; |
214 | |
215 | default: |
216 | VERIFY(0); |
217 | /* NOTREACHED */ |
218 | __builtin_unreachable(); |
219 | } |
220 | |
221 | fr = RB_FIND(flow_route_tree, &frb->frb_head, &find); |
222 | if (fr != NULL) { |
223 | flow_route_retain(fr); /* for the caller */ |
224 | } |
225 | return fr; |
226 | } |
227 | |
228 | struct flow_route_id_bucket * |
229 | flow_route_id_buckets_alloc(size_t frib_cnt, size_t *frib_sz, size_t *tot_sz) |
230 | { |
231 | uint32_t cache_sz = skmem_cpu_cache_line_size(); |
232 | struct flow_route_id_bucket *frib; |
233 | size_t frib_tot_sz; |
234 | |
235 | /* each bucket is CPU cache-aligned */ |
236 | *frib_sz = P2ROUNDUP(sizeof(*frib), cache_sz); |
237 | *tot_sz = frib_tot_sz = frib_cnt * (*frib_sz); |
238 | frib = sk_alloc_type_hash(KT_SK_FRIB, frib_tot_sz, Z_WAITOK, |
239 | skmem_tag_fsw_frib_hash); |
241 | if (__improbable(frib == NULL)) { |
242 | return NULL; |
243 | } |
244 | |
245 | #if !KASAN_CLASSIC |
246 | /* |
247 | * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline |
248 | * size alignment if the requested size is a multiple of a cacheline |
249 | * size (this is true for any size that is a power of two from 16 to |
250 | * PAGE_SIZE). |
251 | * |
	 * Because this is only an optimization, it is OK for KASAN_CLASSIC
	 * not to respect this.
254 | */ |
255 | ASSERT(IS_P2ALIGNED(frib, cache_sz)); |
256 | #endif /* !KASAN_CLASSIC */ |
257 | |
258 | SK_DF(SK_VERB_MEM, "frib 0x%llx frib_cnt %zu frib_sz %zu " |
259 | "(total %zu bytes) ALLOC" , SK_KVA(frib), frib_cnt, |
260 | *frib_sz, frib_tot_sz); |
261 | |
262 | return frib; |
263 | } |
264 | |
265 | void |
266 | flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz) |
267 | { |
268 | SK_DF(SK_VERB_MEM, "frib 0x%llx FREE" , SK_KVA(frib)); |
269 | sk_free_type_hash(KT_SK_FRIB, tot_sz, frib); |
270 | } |
271 | |
272 | void |
273 | flow_route_id_bucket_init(struct flow_route_id_bucket *frib) |
274 | { |
275 | #if !KASAN_CLASSIC |
276 | ASSERT(IS_P2ALIGNED(frib, skmem_cpu_cache_line_size())); |
277 | #endif |
	lck_rw_init(&frib->frib_lock, &flow_route_lock_group,
	    &flow_route_lock_attr);
280 | RB_INIT(&frib->frib_head); |
281 | } |
282 | |
283 | void |
284 | flow_route_id_bucket_destroy(struct flow_route_id_bucket *frib) |
285 | { |
286 | ASSERT(RB_EMPTY(&frib->frib_head)); |
	lck_rw_destroy(&frib->frib_lock, &flow_route_lock_group);
288 | } |
289 | |
290 | static struct flow_route * |
291 | flow_route_find_by_uuid(struct flow_route_id_bucket *frib, uuid_t id) |
292 | { |
293 | struct flow_route *fr; |
294 | struct flow_route find; |
295 | |
296 | FRIB_LOCK_ASSERT_HELD(frib); |
297 | |
	uuid_copy(find.fr_uuid, id);
299 | fr = RB_FIND(flow_route_id_tree, &frib->frib_head, &find); |
300 | if (fr != NULL) { |
301 | flow_route_retain(fr); /* for the caller */ |
302 | } |
303 | return fr; |
304 | } |
305 | |
306 | static struct flow_route * |
307 | fr_alloc(boolean_t cansleep) |
308 | { |
309 | struct flow_route *fr; |
310 | |
311 | if ((fr = skmem_cache_alloc(flow_route_cache, |
312 | (cansleep ? SKMEM_SLEEP : SKMEM_NOSLEEP))) != NULL) { |
		bzero(fr, flow_route_size);
		lck_spin_init(&fr->fr_reflock, &flow_route_lock_group,
		    &flow_route_lock_attr);
		lck_mtx_init(&fr->fr_lock, &flow_route_lock_group,
		    &flow_route_lock_attr);
		uuid_generate_random(fr->fr_uuid);

		SK_DF(SK_VERB_MEM, "allocated fr 0x%llx", SK_KVA(fr));
321 | } |
322 | |
323 | return fr; |
324 | } |
325 | |
326 | static void |
327 | fr_free(struct flow_route *fr) |
328 | { |
329 | SK_DF(SK_VERB_MEM, "freeing fr 0x%llx" , SK_KVA(fr)); |
330 | |
331 | VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED)); |
332 | VERIFY(fr->fr_usecnt == 0); |
333 | |
334 | FR_LOCK(fr); |
335 | /* callee frees route entry */ |
336 | flow_route_cleanup(fr); |
337 | VERIFY(fr->fr_rt_dst == NULL); |
338 | VERIFY(fr->fr_rt_gw == NULL); |
339 | VERIFY(fr->fr_rt_evhdlr_tag == NULL); |
340 | FR_UNLOCK(fr); |
341 | |
	lck_mtx_destroy(&fr->fr_lock, &flow_route_lock_group);
	lck_spin_destroy(&fr->fr_reflock, &flow_route_lock_group);
344 | |
345 | skmem_cache_free(flow_route_cache, fr); |
346 | } |
347 | |
348 | static inline int |
349 | fr_cmp(const struct flow_route *a, const struct flow_route *b) |
350 | { |
351 | int d; |
352 | |
353 | if ((d = (a->fr_af - b->fr_af)) != 0) { |
354 | return d; |
355 | } |
	if ((d = flow_ip_cmp(a->fr_addr_key, b->fr_addr_key,
	    b->fr_addr_len)) != 0) {
358 | return d; |
359 | } |
360 | |
361 | return 0; |
362 | } |
363 | |
364 | static inline int |
365 | fr_id_cmp(const struct flow_route *a, const struct flow_route *b) |
366 | { |
	return uuid_compare(a->fr_uuid, b->fr_uuid);
368 | } |
369 | |
370 | static inline int |
371 | fr_use_stable_address(struct nx_flow_req *req) |
372 | { |
373 | int use_stable_address = ip6_prefer_tempaddr ? 0 : 1; |
374 | if (req != NULL && |
375 | (req->nfr_flags & NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION)) { |
376 | use_stable_address = (req->nfr_flags & NXFLOWREQF_USE_STABLE_ADDRESS) ? 1 : 0; |
377 | } |
378 | return use_stable_address; |
379 | } |
380 | |
381 | int |
382 | flow_route_configure(struct flow_route *fr, struct ifnet *ifp, struct nx_flow_req *req) |
383 | { |
384 | #if SK_LOG |
	char old_s[MAX_IPv6_STR_LEN]; /* old src */
386 | char src_s[MAX_IPv6_STR_LEN]; /* src */ |
387 | char dst_s[MAX_IPv6_STR_LEN]; /* dst */ |
388 | #endif /* SK_LOG */ |
389 | struct rtentry *rt = NULL, *gwrt = NULL; |
390 | int err = 0; |
391 | |
392 | FR_LOCK_ASSERT_HELD(fr); |
393 | |
394 | /* |
395 | * If there is a route entry for the final destination, see if |
396 | * it's no longer valid and perform another routing table lookup. |
397 | * A non-NULL fr_rt_dst is always associated with a route event |
398 | * registration, and the route reference is held there. |
399 | */ |
400 | rt = fr->fr_rt_dst; |
401 | if (rt == NULL || !(rt->rt_flags & RTF_UP) || fr->fr_want_configure) { |
402 | struct eventhandler_entry_arg ee_arg; |
403 | |
404 | /* callee frees route entry */ |
405 | flow_route_cleanup(fr); |
406 | |
407 | /* lookup destination route */ |
408 | ASSERT(err == 0); |
409 | rt = rtalloc1_scoped(SA(&fr->fr_faddr), 1, 0, ifp->if_index); |
410 | if (rt == NULL) { |
411 | err = EHOSTUNREACH; |
412 | SK_ERR("no route to %s on %s (err %d)" , |
413 | sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
414 | sizeof(dst_s)), ifp->if_xname, err); |
415 | } else { |
416 | /* |
417 | * If route points to another interface and the |
418 | * route's gateway isn't link-layer, reject it. |
419 | * We make an exception otherwise, since local |
420 | * interface addresses resolve this way. |
421 | */ |
422 | if (rt->rt_ifp != ifp && rt->rt_ifp != lo_ifp && |
423 | (rt->rt_gateway == NULL || |
424 | SA(rt->rt_gateway)->sa_family != AF_LINK)) { |
425 | err = EHOSTUNREACH; |
426 | SK_ERR("route to %s on %s != %s (err %d)" , |
427 | sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
428 | sizeof(dst_s)), rt->rt_ifp->if_xname, |
429 | ifp->if_xname, err); |
430 | } |
431 | } |
432 | |
433 | if (err != 0) { |
434 | goto done; |
435 | } |
436 | |
437 | ASSERT(fr->fr_mgr != NULL); |
438 | ASSERT(!uuid_is_null(fr->fr_mgr->fm_uuid)); |
439 | ASSERT(!uuid_is_null(fr->fr_uuid)); |
440 | ASSERT(!uuid_is_null(fr->fr_nx_uuid)); |
441 | |
442 | bzero(s: &ee_arg, n: sizeof(ee_arg)); |
443 | uuid_copy(dst: ee_arg.ee_fm_uuid, src: fr->fr_mgr->fm_uuid); |
444 | uuid_copy(dst: ee_arg.ee_fr_uuid, src: fr->fr_uuid); |
445 | |
446 | /* |
447 | * Register for changes on destination route; this covers both |
448 | * cases where the destination is on-link, or if it is off-link |
449 | * and is using a gateway route. This also transfers the refcnt |
450 | * of the route entry to the event handler, released later when |
451 | * it is deregistered. |
452 | */ |
453 | ASSERT(fr->fr_rt_dst == NULL); |
454 | ASSERT(fr->fr_rt_evhdlr_tag == NULL); |
455 | fr->fr_rt_dst = rt; /* move reference to fr */ |
456 | fr->fr_rt_evhdlr_tag = |
457 | EVENTHANDLER_REGISTER(&rt->rt_evhdlr_ctxt, route_event, |
458 | flow_route_ev_callback, ee_arg, EVENTHANDLER_PRI_ANY); |
459 | ASSERT(fr->fr_rt_evhdlr_tag != NULL); |
460 | os_atomic_andnot(&fr->fr_flags, FLOWRTF_DELETED, relaxed); |
461 | |
462 | /* |
463 | * Lookup gateway route (if any); returns locked gwrt |
464 | * with a reference bumped up. |
465 | */ |
466 | err = route_to_gwroute(SA(&fr->fr_faddr), rt, &gwrt); |
467 | if (err != 0) { |
468 | /* |
469 | * Reference held by fr_rt_dst will be taken |
470 | * care of by flow_route_cleanup() below, so |
471 | * make sure we don't do an extra rtfree(). |
472 | */ |
473 | rt = NULL; |
474 | ASSERT(gwrt == NULL); |
475 | SK_ERR("no gw route to %s on %s (err %d)" , |
476 | sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
477 | sizeof(dst_s)), ifp->if_xname, err); |
478 | goto done; |
479 | } |
480 | |
481 | /* if RTF_GATEWAY isn't set, gwrt == rt */ |
482 | ASSERT(gwrt != NULL); |
483 | RT_LOCK_ASSERT_HELD(gwrt); |
484 | |
485 | /* |
486 | * Must have been cleared via cleanup, and that we're |
487 | * single-threaded here for fr by virtue of fr_lock. |
488 | */ |
489 | ASSERT(!(fr->fr_flags & (FLOWRTF_GATEWAY | FLOWRTF_ONLINK))); |
490 | |
491 | if (gwrt != rt && (rt->rt_flags & RTF_GATEWAY) && |
492 | (rt->rt_gateway->sa_family == AF_INET || |
493 | rt->rt_gateway->sa_family == AF_INET6)) { |
494 | struct sockaddr_storage ss; |
495 | |
496 | ASSERT(fr->fr_rt_gw == NULL); |
497 | /* locked via route_to_gwroute() above */ |
498 | fr->fr_rt_gw = gwrt; /* move reference to fr */ |
499 | RT_ADDREF_LOCKED(gwrt); /* for this routine */ |
500 | /* |
501 | * Destination is off-link and is reachable |
502 | * thru an IP gateway route. Save the IP |
503 | * address of the gateway in fr_gaddr. |
504 | */ |
505 | (void) sa_copy(rt->rt_gateway, &ss, NULL); |
506 | _CASSERT(sizeof(fr->fr_gaddr) <= sizeof(ss)); |
			bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr));
508 | os_atomic_or(&fr->fr_flags, FLOWRTF_GATEWAY, relaxed); |
509 | } else if (IS_DIRECT_HOSTROUTE(rt)) { |
510 | /* |
511 | * Destination is on-link. |
512 | */ |
513 | os_atomic_or(&fr->fr_flags, FLOWRTF_ONLINK, relaxed); |
514 | } |
515 | RT_UNLOCK(gwrt); |
516 | } |
517 | RT_ADDREF(rt); /* for this routine */ |
518 | |
519 | /* see if we need to re-select default source address */ |
520 | int use_stable_address = fr_use_stable_address(req); |
521 | if (fr->fr_want_configure || |
522 | fr->fr_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt || |
523 | !(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address) { |
524 | union sockaddr_in_4_6 old = fr->fr_laddr; |
525 | if (use_stable_address) { |
526 | os_atomic_or(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed); |
527 | } else { |
528 | os_atomic_andnot(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed); |
529 | } |
530 | if ((err = flow_route_select_laddr(&fr->fr_laddr, &fr->fr_faddr, |
531 | ifp, rt, &fr->fr_laddr_gencnt, use_stable_address)) != 0) { |
532 | SK_ERR("no usable src address to reach %s on %s " |
533 | "(err %d)" , sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
534 | sizeof(dst_s)), ifp->if_xname, err); |
535 | goto done; |
536 | } |
		if (bcmp(&old, &fr->fr_laddr, SA(&old)->sa_len) != 0) {
			SK_ERR("src address is now %s (was %s) to reach %s "
			    "on %s", sk_sa_ntop(SA(&fr->fr_laddr), src_s,
540 | sizeof(src_s)), sk_sa_ntop(SA(&old), old_s, |
541 | sizeof(old_s)), sk_sa_ntop(SA(&fr->fr_faddr), |
542 | dst_s, sizeof(dst_s)), ifp->if_xname); |
543 | } |
544 | } |
545 | ASSERT(err == 0); |
546 | |
547 | done: |
548 | if (__probable(err == 0)) { |
549 | os_atomic_store(&fr->fr_want_configure, 0, release); |
550 | } else { |
551 | /* callee frees route entry */ |
552 | flow_route_cleanup(fr); |
553 | } |
554 | |
555 | if (gwrt != NULL) { |
556 | ASSERT(rt != NULL); |
557 | if (gwrt == rt) { |
558 | RT_REMREF(gwrt); |
559 | } else { |
560 | rtfree(gwrt); |
561 | } |
562 | gwrt = NULL; |
563 | } |
564 | |
565 | if (rt != NULL) { |
566 | rtfree(rt); |
567 | rt = NULL; |
568 | } |
569 | |
570 | return err; |
571 | } |
572 | |
573 | int |
574 | flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm, |
575 | struct ifnet *ifp, struct nx_flow_req *req, |
576 | flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve, |
577 | void *arg, struct flow_route **frp) |
578 | { |
579 | #if SK_LOG |
	char src_s[MAX_IPv6_STR_LEN]; /* src */
581 | char dst_s[MAX_IPv6_STR_LEN]; /* dst */ |
582 | char gw_s[MAX_IPv6_STR_LEN]; /* gw */ |
583 | #endif /* SK_LOG */ |
584 | union sockaddr_in_4_6 *daddr = &req->nfr_daddr; |
585 | struct flow_route_bucket *frb; |
586 | struct flow_route_id_bucket *frib; |
587 | struct flow_route *fr = NULL; |
588 | int err = 0; |
589 | |
590 | ASSERT(fr_ctor != NULL && fr_resolve != NULL); |
591 | |
592 | ASSERT(frp != NULL); |
593 | *frp = NULL; |
594 | |
595 | frb = flow_mgr_get_frb_by_addr(fm, daddr); |
596 | |
597 | int use_stable_address = fr_use_stable_address(req); |
598 | |
599 | /* see if there is a cached flow route (as reader) */ |
600 | FRB_RLOCK(frb); |
	fr = flow_route_find_by_addr(frb, daddr);
602 | if (fr != NULL) { |
603 | if (__improbable(fr->fr_want_configure || fr->fr_laddr_gencnt != |
604 | ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt) || |
605 | __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) { |
606 | os_atomic_inc(&fr->fr_want_configure, relaxed); |
607 | FR_LOCK(fr); |
608 | err = flow_route_configure(fr, ifp, req); |
609 | if (err != 0) { |
610 | SK_ERR("fr 0x%llx error re-configuring dst %s " |
611 | "on %s (err %d) [R]" , SK_KVA(fr), |
612 | sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
613 | sizeof(dst_s)), ifp->if_xname, err); |
614 | } |
615 | FR_UNLOCK(fr); |
616 | } |
617 | if (err == 0) { |
618 | SK_DF(SK_VERB_FLOW_ROUTE, |
619 | "fr 0x%llx found for dst %s " "on %s [R,%u]" , |
620 | SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
621 | sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt); |
622 | } |
623 | FRB_RUNLOCK(frb); /* reader */ |
624 | goto done; |
625 | } |
626 | |
627 | /* |
628 | * Flow route doesn't exist; become a writer and prepare to |
629 | * allocate one. We could be racing with other threads here, |
630 | * so check first if there is now a cached flow route that |
631 | * got created by the winning thread. |
632 | */ |
633 | if (!FRB_RLOCKTOWLOCK(frb)) { |
634 | FRB_WLOCK(frb); |
635 | } |
636 | |
	fr = flow_route_find_by_addr(frb, daddr);
638 | if (fr != NULL) { |
639 | if (__improbable(fr->fr_want_configure) || |
640 | __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) { |
641 | FR_LOCK(fr); |
642 | err = flow_route_configure(fr, ifp, req); |
643 | if (err != 0) { |
644 | SK_ERR("fr 0x%llx error re-configuring dst %s " |
645 | "on %s (err %d) [W]" , SK_KVA(fr), |
646 | sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
647 | sizeof(dst_s)), ifp->if_xname, err); |
648 | } |
649 | FR_UNLOCK(fr); |
650 | } |
651 | if (err == 0) { |
652 | SK_DF(SK_VERB_FLOW_ROUTE, |
653 | "fr 0x%llx found for dst %s on %s [W,%u]" , |
654 | SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
655 | sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt); |
656 | } |
657 | FRB_WUNLOCK(frb); /* writer */ |
658 | goto done; |
659 | } |
660 | |
661 | /* allocate one */ |
662 | fr = fr_alloc(TRUE); |
663 | fr->fr_faddr = *daddr; /* remote address */ |
664 | |
665 | switch (SA(&fr->fr_faddr)->sa_family) { |
666 | case AF_INET: |
667 | SIN(&fr->fr_faddr)->sin_port = 0; |
668 | fr->fr_addr_len = sizeof(struct in_addr); |
669 | fr->fr_addr_key = &SIN(&fr->fr_faddr)->sin_addr; |
670 | break; |
671 | |
672 | case AF_INET6: |
673 | SIN6(&fr->fr_faddr)->sin6_port = 0; |
674 | fr->fr_addr_len = sizeof(struct in6_addr); |
675 | fr->fr_addr_key = &SIN6(&fr->fr_faddr)->sin6_addr; |
676 | break; |
677 | |
678 | default: |
679 | VERIFY(0); |
680 | /* NOTREACHED */ |
681 | __builtin_unreachable(); |
682 | } |
683 | |
684 | ASSERT(!uuid_is_null(fr->fr_uuid)); |
685 | uuid_copy(dst: fr->fr_nx_uuid, src: nx->nx_uuid); |
686 | *(struct flow_mgr **)(uintptr_t)&fr->fr_mgr = fm; |
687 | |
688 | /* force configure newly-created flow route */ |
689 | os_atomic_inc(&fr->fr_want_configure, relaxed); |
690 | |
691 | FR_LOCK(fr); |
692 | if ((err = flow_route_configure(fr, ifp, req)) != 0) { |
693 | SK_ERR("fr 0x%llx error configuring dst %s on %s (err %d)" , |
694 | SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
695 | sizeof(dst_s)), ifp->if_xname, err); |
696 | FR_UNLOCK(fr); |
697 | FRB_WUNLOCK(frb); /* writer */ |
698 | /* not yet in tree, so free immediately */ |
699 | fr_free(fr); |
700 | fr = NULL; |
701 | goto done; |
702 | } |
703 | |
704 | /* execute nexus-specific constructor */ |
705 | fr_ctor(arg, fr); |
706 | FR_UNLOCK(fr); |
707 | |
708 | frib = flow_mgr_get_frib_by_uuid(fm, fr->fr_uuid); |
709 | FRIB_WLOCK(frib); |
710 | |
711 | *(struct flow_route_bucket **)(uintptr_t)&fr->fr_frb = frb; |
712 | *(struct flow_route_id_bucket **)(uintptr_t)&fr->fr_frib = frib; |
713 | |
714 | FRB_WLOCK_ASSERT_HELD(frb); |
715 | FRIB_WLOCK_ASSERT_HELD(frib); |
716 | |
717 | RB_INSERT(flow_route_tree, &frb->frb_head, fr); |
718 | RB_INSERT(flow_route_id_tree, &frib->frib_head, fr); |
719 | |
720 | os_atomic_or(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed); |
721 | |
722 | #if DEBUG |
723 | /* sanity checks for comparator routines */ |
724 | VERIFY(flow_route_find_by_addr(frb, &fr->fr_faddr) == fr); |
725 | flow_route_release(fr); |
726 | VERIFY(flow_route_find_by_uuid(frib, fr->fr_uuid) == fr); |
727 | flow_route_release(fr); |
728 | #endif /* DEBUG */ |
729 | |
730 | /* for the trees */ |
731 | _CASSERT(FLOW_ROUTE_MINREF == 2); |
732 | flow_route_retain(fr); |
733 | flow_route_retain(fr); |
734 | ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF); |
735 | |
736 | /* for the caller */ |
737 | flow_route_retain(fr); |
738 | |
739 | FRIB_WUNLOCK(frib); /* writer */ |
740 | FRB_WUNLOCK(frb); /* writer */ |
741 | |
742 | /* execute nexus-specific resolver */ |
743 | if (!(fr->fr_flags & FLOWRTF_RESOLVED) && |
744 | (err = fr_resolve(arg, fr, NULL)) != 0) { |
745 | if (fr->fr_flags & FLOWRTF_GATEWAY) { |
746 | SK_ERR("fr 0x%llx resolve %s gw %s on %s (err %d)" , |
747 | SK_KVA(fr), (err == EJUSTRETURN ? "pending" : |
748 | "fail" ), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s, |
749 | sizeof(dst_s)), ifp->if_xname, err); |
750 | } else { |
751 | SK_ERR("fr 0x%llx resolve %s dst %s on %s (err %d)" , |
752 | SK_KVA(fr), (err == EJUSTRETURN ? "pending" : |
753 | "fail" ), sk_sa_ntop(SA(&fr->fr_faddr), dst_s, |
754 | sizeof(dst_s)), ifp->if_xname, err); |
755 | } |
756 | if (err == EJUSTRETURN) { |
757 | err = 0; |
758 | } else { |
759 | goto done; |
760 | } |
761 | } |
762 | ASSERT(err == 0); |
763 | |
764 | #if SK_LOG |
765 | if (fr->fr_flags & FLOWRTF_GATEWAY) { |
766 | SK_DF(SK_VERB_FLOW_ROUTE, |
767 | "add fr 0x%llx %s -> %s via gw %s on %s" , SK_KVA(fr), |
768 | sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)), |
769 | sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), |
770 | sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)), |
771 | ifp->if_xname); |
772 | } else { |
773 | SK_DF(SK_VERB_FLOW_ROUTE, |
774 | "add fr 0x%llx %s -> %s on %s" , SK_KVA(fr), |
775 | sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)), |
776 | sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)), |
777 | ifp->if_xname); |
778 | } |
779 | #endif /* SK_LOG */ |
780 | |
781 | done: |
782 | if (err == 0) { |
783 | ASSERT(fr != NULL); |
784 | *frp = fr; |
785 | } else if (fr != NULL) { |
786 | /* can't directly call fr_free() if it's in the tree */ |
787 | flow_route_release(fr); |
788 | fr = NULL; |
789 | } |
790 | |
791 | return err; |
792 | } |
793 | |
794 | void |
795 | flow_route_retain(struct flow_route *fr) |
796 | { |
	lck_spin_lock(&fr->fr_reflock);
	if (fr->fr_usecnt++ == FLOW_ROUTE_MINREF) {
		fr->fr_expire = 0;
	}
	lck_spin_unlock(&fr->fr_reflock);
802 | } |
803 | |
804 | void |
805 | flow_route_release(struct flow_route *fr) |
806 | { |
807 | bool should_free = false; |
808 | |
	lck_spin_lock(&fr->fr_reflock);
810 | VERIFY(fr->fr_usecnt > 0); |
811 | if (fr->fr_flags & FLOWRTF_ATTACHED) { |
812 | if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1)) { |
813 | fr->fr_expire = _net_uptime + flow_route_expire; |
814 | } |
815 | } else { |
816 | /* |
817 | * fr is no longer in lookup tree, so there shouldn't be |
818 | * further usecnt, if we reach 0 usecnt, then this is the very |
819 | * last reference and is safe to unlock and call fr_free. |
820 | */ |
821 | if (--(fr->fr_usecnt) == 0) { |
822 | should_free = true; |
823 | } |
824 | } |
	lck_spin_unlock(&fr->fr_reflock);
826 | |
827 | if (should_free) { |
828 | fr_free(fr); |
829 | } |
830 | } |
831 | |
832 | static uint32_t |
833 | flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid, |
834 | boolean_t all, boolean_t early_expire) |
835 | { |
836 | #if SK_LOG |
	char ss[MAX_IPv6_STR_LEN]; /* src */
838 | char ds[MAX_IPv6_STR_LEN]; /* dst */ |
839 | char gs[MAX_IPv6_STR_LEN]; /* gw */ |
840 | #endif /* SK_LOG */ |
841 | struct flow_route *fr, *tfr; |
842 | uint64_t now = net_uptime(); |
843 | uint32_t i = 0, tot = 0; |
844 | |
845 | FRB_WLOCK_ASSERT_HELD(frb); |
846 | |
847 | RB_FOREACH_SAFE(fr, flow_route_tree, &frb->frb_head, tfr) { |
848 | struct flow_route_id_bucket *frib = |
849 | __DECONST(struct flow_route_id_bucket *, fr->fr_frib); |
850 | |
851 | ++tot; |
852 | /* |
853 | * We're not holding fr_lock here, since this is a |
854 | * best-effort check. If there's a race and we miss |
855 | * it now, we'll come back again shortly. |
856 | */ |
		lck_spin_lock(&fr->fr_reflock);
858 | if (!all && (fr->fr_usecnt > FLOW_ROUTE_MINREF || |
859 | (fr->fr_expire > now && !early_expire && |
860 | !(fr->fr_flags & FLOWRTF_DELETED)))) { |
			lck_spin_unlock(&fr->fr_reflock);
			SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr 0x%llx "
			    "refcnt %u expire %llu", SK_KVA(fr),
864 | fr->fr_usecnt, fr->fr_expire); |
865 | continue; |
866 | } |
		lck_spin_unlock(&fr->fr_reflock);
868 | |
869 | /* |
870 | * If "all" is set, flow entries must be gone by now, as |
871 | * we must be called by flow_route_bucket_purge_all(). |
872 | * It also means that the caller has acquired writer lock |
873 | * on all flow {route,route_id} buckets, and fr_usecnt |
874 | * must be at its minimum value now. |
875 | */ |
876 | if (!all) { |
877 | FRIB_WLOCK(frib); |
878 | } |
879 | FRIB_WLOCK_ASSERT_HELD(frib); |
880 | |
881 | _CASSERT(FLOW_ROUTE_MINREF == 2); |
882 | ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF); |
883 | |
884 | RB_REMOVE(flow_route_tree, &frb->frb_head, fr); |
885 | RB_REMOVE(flow_route_id_tree, &frib->frib_head, fr); |
886 | |
887 | os_atomic_andnot(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed); |
888 | |
889 | #if SK_LOG |
890 | if (fr->fr_flags & FLOWRTF_GATEWAY) { |
891 | SK_DF(SK_VERB_FLOW_ROUTE, |
892 | "remove fr 0x%llx %s -> %s via gw %s [exp %lld]" , |
893 | SK_KVA(fr), |
894 | sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)), |
895 | sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)), |
896 | sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)), |
897 | (int64_t)(fr->fr_expire - now)); |
898 | } else { |
899 | SK_DF(SK_VERB_FLOW_ROUTE, |
900 | "remove fr 0x%llx %s -> %s [exp %lld]" , SK_KVA(fr), |
901 | sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)), |
902 | sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)), |
903 | (int64_t)(fr->fr_expire - now)); |
904 | } |
905 | #endif /* SK_LOG */ |
906 | |
907 | /* for the trees */ |
908 | flow_route_release(fr); |
909 | flow_route_release(fr); |
910 | ++i; |
911 | |
912 | if (!all) { |
913 | FRIB_WUNLOCK(frib); |
914 | } |
915 | } |
916 | |
917 | if (resid != NULL) { |
918 | *resid = (tot - i); |
919 | } |
920 | |
921 | return i; |
922 | } |
923 | |
924 | void |
925 | flow_route_bucket_purge_all(struct flow_route_bucket *frb) |
926 | { |
927 | (void) flow_route_bucket_purge_common(frb, NULL, TRUE, FALSE); |
928 | } |
929 | |
930 | static uint32_t |
931 | flow_route_bucket_prune(struct flow_route_bucket *frb, struct ifnet *ifp, |
932 | uint32_t *resid) |
933 | { |
934 | uint64_t now = net_uptime(); |
935 | struct flow_route *fr; |
936 | uint32_t i = 0, tot = 0; |
937 | boolean_t ifdown = !(ifp->if_flags & IFF_UP); |
938 | |
939 | FRB_RLOCK(frb); |
940 | RB_FOREACH(fr, flow_route_tree, &frb->frb_head) { |
941 | ++tot; |
942 | /* loose check; do this without holding fr_reflock */ |
943 | if (fr->fr_usecnt > FLOW_ROUTE_MINREF || |
944 | (fr->fr_expire > now && !ifdown && |
945 | !(fr->fr_flags & FLOWRTF_DELETED))) { |
946 | continue; |
947 | } |
948 | ++i; |
949 | } |
950 | |
951 | /* |
952 | * If there's nothing to prune or there's a writer, we're done. |
953 | * Note that if we failed to upgrade to writer, the lock would |
954 | * have been released automatically. |
955 | */ |
956 | if (i == 0 || !FRB_RLOCKTOWLOCK(frb)) { |
957 | if (i == 0) { |
958 | FRB_RUNLOCK(frb); |
959 | } |
960 | if (resid != NULL) { |
961 | *resid = (tot - i); |
962 | } |
963 | return 0; |
964 | } |
965 | |
966 | SK_DF(SK_VERB_FLOW_ROUTE, "purging at least %u idle routes on %s" , |
967 | i, ifp->if_xname); |
968 | |
969 | /* purge idle ones */ |
	i = flow_route_bucket_purge_common(frb, resid, FALSE, ifdown);
971 | FRB_WUNLOCK(frb); |
972 | |
973 | return i; |
974 | } |
975 | |
976 | uint32_t |
977 | flow_route_prune(struct flow_mgr *fm, struct ifnet *ifp, |
978 | uint32_t *tot_resid) |
979 | { |
980 | uint32_t pruned = 0; |
981 | uint32_t resid; |
982 | uint32_t i; |
983 | |
984 | for (i = 0; i < fm->fm_route_buckets_cnt; i++) { |
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		pruned += flow_route_bucket_prune(frb, ifp, &resid);
987 | if (tot_resid != NULL) { |
988 | *tot_resid += resid; |
989 | } |
990 | } |
991 | |
992 | return pruned; |
993 | } |
994 | |
995 | /* |
 * This runs in the context of the eventhandler invocation routine, which
 * loops through all the registered callbacks.  Care must be taken not to
 * call any primitives here that could lead to routing changes in the same
 * context, since that would deadlock in the eventhandler code.
1000 | */ |
1001 | static void |
1002 | flow_route_ev_callback(struct eventhandler_entry_arg ee_arg, |
1003 | struct sockaddr *dst, int route_ev, struct sockaddr *gw_addr, int flags) |
1004 | { |
1005 | #pragma unused(dst, flags) |
1006 | #if SK_LOG |
1007 | char dst_s[MAX_IPv6_STR_LEN]; |
1008 | #endif /* SK_LOG */ |
1009 | struct flow_route_id_bucket *frib = NULL; |
1010 | struct flow_route *fr = NULL; |
1011 | struct flow_mgr *fm; |
1012 | |
1013 | VERIFY(!uuid_is_null(ee_arg.ee_fm_uuid)); |
1014 | VERIFY(!uuid_is_null(ee_arg.ee_fr_uuid)); |
1015 | |
1016 | /* |
1017 | * Upon success, callee will hold flow manager lock as reader, |
1018 | * and we'll need to unlock it below. Otherwise there's no |
1019 | * need to unlock here and just return. |
1020 | */ |
1021 | fm = flow_mgr_find_lock(ee_arg.ee_fm_uuid); |
1022 | if (fm == NULL) { |
1023 | SK_ERR("Event %s for dst %s ignored; flow manager not found" , |
1024 | route_event2str(route_ev), sk_sa_ntop(dst, dst_s, |
1025 | sizeof(dst_s))); |
1026 | return; |
1027 | } |
1028 | |
1029 | SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s event %s" , fm->fm_name, |
1030 | sk_sa_ntop(dst, dst_s, sizeof(dst_s)), route_event2str(route_ev)); |
1031 | |
1032 | do { |
1033 | frib = flow_mgr_get_frib_by_uuid(fm, ee_arg.ee_fr_uuid); |
1034 | |
1035 | FRIB_RLOCK(frib); |
1036 | /* callee returns a reference that we need to release below */ |
		fr = flow_route_find_by_uuid(frib, ee_arg.ee_fr_uuid);
1038 | if (fr == NULL) { |
1039 | SK_ERR("%s: dst %s flow route not found" , fm->fm_name, |
1040 | sk_sa_ntop(dst, dst_s, sizeof(dst_s))); |
1041 | break; |
1042 | } |
1043 | |
1044 | /* |
1045 | * Grab fr_lock to prevent flow route configuration or |
1046 | * resolver from using stale info while we are updating. |
1047 | */ |
1048 | FR_LOCK(fr); |
1049 | |
1050 | switch (route_ev) { |
1051 | case ROUTE_ENTRY_REFRESH: |
1052 | /* |
1053 | * This is the case where the route entry has been |
1054 | * updated (for example through RTM_CHANGE). Some |
1055 | * of it may not warrant a lookup again and some of |
1056 | * it may. For now, mark flow to perform a look-up |
1057 | * again as the gateway may have changed. |
1058 | */ |
1059 | os_atomic_inc(&fr->fr_want_configure, relaxed); |
1060 | os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed); |
1061 | SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route changed" , |
1062 | fm->fm_name, sk_sa_ntop(dst, dst_s, |
1063 | sizeof(dst_s))); |
1064 | break; |
1065 | |
1066 | case ROUTE_ENTRY_DELETED: |
1067 | /* |
1068 | * NOTE: flow_route_cleanup() should not be called |
1069 | * to de-register eventhandler in the context of |
1070 | * eventhandler callback to avoid deadlock in |
1071 | * eventhandler code. Instead, just mark the flow |
1072 | * route un-resolved. When it is being used again |
1073 | * or being deleted the old eventhandler must be |
1074 | * de-registered. |
1075 | */ |
1076 | os_atomic_inc(&fr->fr_want_configure, relaxed); |
1077 | os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed); |
1078 | os_atomic_or(&fr->fr_flags, FLOWRTF_DELETED, relaxed); |
1079 | SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route deleted" , |
1080 | fm->fm_name, sk_sa_ntop(dst, dst_s, |
1081 | sizeof(dst_s))); |
1082 | break; |
1083 | |
1084 | case ROUTE_LLENTRY_STALE: |
1085 | /* |
1086 | * When the route entry is deemed unreliable or old |
1087 | * enough to trigger a route lookup again. Don't |
1088 | * reconfigure the flow route, but simply attempt |
1089 | * to resolve it next time to trigger a probe. |
1090 | */ |
1091 | os_atomic_inc(&fr->fr_want_probe, relaxed); |
1092 | os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed); |
1093 | SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry stale" , |
1094 | fm->fm_name, sk_sa_ntop(dst, dst_s, |
1095 | sizeof(dst_s))); |
1096 | break; |
1097 | |
1098 | case ROUTE_LLENTRY_CHANGED: |
1099 | /* |
1100 | * When the link-layer info has changed; replace |
1101 | * cached llinfo in the flow route (treat this |
1102 | * as ROUTE_LLENTRY_RESOLVED). |
1103 | */ |
1104 | OS_FALLTHROUGH; |
1105 | |
1106 | case ROUTE_LLENTRY_RESOLVED: |
1107 | /* |
1108 | * SDL address length may be 0 for cellular. |
1109 | * If Ethernet, copy into flow route and mark |
1110 | * it as cached. In all cases, mark the flow |
1111 | * route as resolved. |
1112 | */ |
1113 | ASSERT(SDL(gw_addr)->sdl_family == AF_LINK); |
1114 | if (SDL(gw_addr)->sdl_alen == ETHER_ADDR_LEN) { |
1115 | FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(gw_addr))); |
				SK_DF(SK_VERB_FLOW_ROUTE,
				    "%s: dst %s llentry %s", fm->fm_name,
				    sk_sa_ntop(dst, dst_s, sizeof(dst_s)),
				    (!(fr->fr_flags & FLOWRTF_HAS_LLINFO) ?
				    "resolved" : "changed"));
1121 | os_atomic_or(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed); |
1122 | } else { |
1123 | os_atomic_andnot(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed); |
1124 | } |
1125 | os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed); |
1126 | #if SK_LOG |
1127 | if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != |
1128 | 0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) { |
1129 | SK_DF(SK_VERB_FLOW_ROUTE, |
1130 | "%s: fr 0x%llx eth_type 0x%x " |
1131 | "eth_src %x:%x:%x:%x:%x:%x " |
1132 | "eth_dst %x:%x:%x:%x:%x:%x [%s])" , |
1133 | fm->fm_name, SK_KVA(fr), |
1134 | ntohs(fr->fr_eth.ether_type), |
1135 | fr->fr_eth.ether_shost[0], |
1136 | fr->fr_eth.ether_shost[1], |
1137 | fr->fr_eth.ether_shost[2], |
1138 | fr->fr_eth.ether_shost[3], |
1139 | fr->fr_eth.ether_shost[4], |
1140 | fr->fr_eth.ether_shost[5], |
1141 | fr->fr_eth.ether_dhost[0], |
1142 | fr->fr_eth.ether_dhost[1], |
1143 | fr->fr_eth.ether_dhost[2], |
1144 | fr->fr_eth.ether_dhost[3], |
1145 | fr->fr_eth.ether_dhost[4], |
1146 | fr->fr_eth.ether_dhost[5], |
1147 | sk_sa_ntop(dst, dst_s, sizeof(dst_s))); |
1148 | } |
1149 | #endif /* SK_LOG */ |
1150 | break; |
1151 | |
1152 | case ROUTE_LLENTRY_DELETED: |
1153 | /* |
1154 | * If the route entry points to a router and an |
			 * RTM_DELETE has been issued on it, force the
1156 | * flow route to be reconfigured. |
1157 | */ |
1158 | os_atomic_inc(&fr->fr_want_configure, relaxed); |
1159 | os_atomic_andnot(&fr->fr_flags, (FLOWRTF_HAS_LLINFO | FLOWRTF_RESOLVED), relaxed); |
1160 | SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry deleted" , |
1161 | fm->fm_name, sk_sa_ntop(dst, dst_s, |
1162 | sizeof(dst_s))); |
1163 | break; |
1164 | |
1165 | case ROUTE_LLENTRY_PROBED: |
1166 | /* |
1167 | * When the resolver has begun probing the target; |
1168 | * nothing to do here. |
1169 | */ |
1170 | SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry probed" , |
1171 | fm->fm_name, sk_sa_ntop(dst, dst_s, |
1172 | sizeof(dst_s))); |
1173 | break; |
1174 | |
1175 | case ROUTE_LLENTRY_UNREACH: |
1176 | /* |
1177 | * When the route entry is marked with RTF_REJECT |
1178 | * or the probes have timed out, reconfigure. |
1179 | */ |
1180 | os_atomic_inc(&fr->fr_want_configure, relaxed); |
1181 | os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed); |
1182 | SK_ERR("%s: dst %s llentry unreachable" , fm->fm_name, |
1183 | sk_sa_ntop(dst, dst_s, sizeof(dst_s))); |
1184 | break; |
1185 | |
1186 | default: |
1187 | break; |
1188 | } |
1189 | } while (0); |
1190 | |
	if (fr != NULL) {
		/* drop fr_lock before letting go of our reference */
		FR_UNLOCK(fr);
		flow_route_release(fr);
	}
1195 | |
1196 | if (frib != NULL) { |
1197 | FRIB_UNLOCK(frib); |
1198 | } |
1199 | |
1200 | if (fm != NULL) { |
1201 | flow_mgr_unlock(); |
1202 | } |
1203 | } |
1204 | |
1205 | int |
1206 | flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst, |
1207 | struct ifnet *ifp, struct rtentry *rt, uint32_t *ipaddr_gencnt, |
1208 | int use_stable_address) |
1209 | { |
1210 | #if SK_LOG |
1211 | char src_s[MAX_IPv6_STR_LEN]; /* src */ |
1212 | char dst_s[MAX_IPv6_STR_LEN]; /* dst */ |
1213 | #endif /* SK_LOG */ |
1214 | sa_family_t af = SA(dst)->sa_family; |
1215 | struct ifnet *src_ifp = NULL; |
1216 | struct ifaddr *ifa = NULL; |
1217 | int err = 0; |
1218 | |
1219 | /* see comments in flow_route_configure() regarding loopback */ |
1220 | ASSERT(rt->rt_ifp == ifp || rt->rt_ifp == lo_ifp); |
1221 | |
1222 | switch (af) { |
1223 | case AF_INET: { |
1224 | ifnet_lock_shared(ifp); |
1225 | if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) { |
1226 | err = EHOSTUNREACH; |
1227 | SK_ERR("route to %s has src address marked detaching " |
1228 | "(err %d)" , inet_ntop(AF_INET, |
1229 | &SIN(dst)->sin_addr, dst_s, sizeof(dst_s)), err); |
1230 | ifnet_lock_done(ifp); |
1231 | break; |
1232 | } |
1233 | SIN(src)->sin_len = sizeof(struct sockaddr_in); |
1234 | SIN(src)->sin_family = AF_INET; |
1235 | SIN(src)->sin_addr = IA_SIN(rt->rt_ifa)->sin_addr; |
1236 | ASSERT(SIN(src)->sin_addr.s_addr != INADDR_ANY); |
1237 | *ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt; |
1238 | ifnet_lock_done(ifp); |
1239 | break; |
1240 | } |
1241 | |
1242 | case AF_INET6: { |
1243 | struct in6_addr src_storage, *in6; |
1244 | struct route_in6 ro = {}; |
1245 | uint32_t hints = (use_stable_address ? 0 : IPV6_SRCSEL_HINT_PREFER_TMPADDR); |
1246 | ro.ro_rt = rt; |
1247 | |
1248 | if ((in6 = in6_selectsrc_core(SIN6(dst), hints, |
1249 | ifp, 0, &src_storage, &src_ifp, &err, &ifa, &ro, FALSE)) == NULL) { |
1250 | if (err == 0) { |
1251 | err = EADDRNOTAVAIL; |
1252 | } |
1253 | VERIFY(src_ifp == NULL); |
1254 | SK_ERR("src address to dst %s on %s not available " |
1255 | "(err %d)" , inet_ntop(AF_INET6, |
1256 | &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)), |
1257 | ifp->if_xname, err); |
1258 | break; |
1259 | } |
1260 | |
1261 | VERIFY(src_ifp != NULL); |
1262 | VERIFY(ifa != NULL); |
1263 | |
1264 | if (__improbable(src_ifp != ifp)) { |
1265 | if (err == 0) { |
1266 | err = ENETUNREACH; |
1267 | } |
1268 | SK_ERR("dst %s, src %s ifp %s != %s (err %d)" , |
1269 | inet_ntop(AF_INET6, &SIN6(dst)->sin6_addr, |
1270 | dst_s, sizeof(dst_s)), |
1271 | inet_ntop(AF_INET6, &SIN6(src)->sin6_addr, |
1272 | src_s, sizeof(src_s)), |
1273 | src_ifp->if_xname, ifp->if_xname, err); |
1274 | break; |
1275 | } |
1276 | |
1277 | ifnet_lock_shared(ifp); |
1278 | if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) { |
1279 | err = EHOSTUNREACH; |
1280 | SK_ERR("IPv6 address selected is marked to be " |
1281 | "detached (err %d)" , err); |
1282 | ifnet_lock_done(ifp); |
1283 | break; |
1284 | } |
1285 | |
1286 | /* clear embedded scope if link-local src */ |
1287 | if (IN6_IS_SCOPE_EMBED(in6)) { |
1288 | if (in6_embedded_scope) { |
1289 | SIN6(src)->sin6_scope_id = ntohs(in6->s6_addr16[1]); |
1290 | in6->s6_addr16[1] = 0; |
1291 | } else { |
1292 | SIN6(src)->sin6_scope_id = src_ifp->if_index; |
1293 | } |
1294 | } |
1295 | SIN6(src)->sin6_len = sizeof(struct sockaddr_in6); |
1296 | SIN6(src)->sin6_family = AF_INET6; |
1297 | SIN6(src)->sin6_addr = *in6; |
1298 | ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&SIN6(src)->sin6_addr)); |
1299 | *ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt; |
1300 | ifnet_lock_done(ifp); |
1301 | break; |
1302 | } |
1303 | |
1304 | default: |
1305 | VERIFY(0); |
1306 | /* NOTREACHED */ |
1307 | __builtin_unreachable(); |
1308 | } |
1309 | |
1310 | if (ifa != NULL) { |
1311 | ifa_remref(ifa); |
1312 | } |
1313 | |
1314 | if (src_ifp != NULL) { |
		ifnet_release(src_ifp);
1316 | } |
1317 | |
1318 | #if SK_LOG |
1319 | if (err == 0 && __improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0)) { |
1320 | SK_DF(SK_VERB_FLOW_ROUTE, "src %s to dst %s on %s" , |
1321 | sk_sa_ntop(SA(src), src_s, sizeof(src_s)), |
1322 | sk_sa_ntop(SA(dst), dst_s, sizeof(dst_s)), |
1323 | ifp->if_xname); |
1324 | } |
1325 | #endif /* SK_LOG */ |
1326 | |
1327 | return err; |
1328 | } |
1329 | |
1330 | void |
1331 | flow_route_cleanup(struct flow_route *fr) |
1332 | { |
1333 | #if SK_LOG |
	char ss[MAX_IPv6_STR_LEN]; /* src */
1335 | char ds[MAX_IPv6_STR_LEN]; /* dst */ |
1336 | char gs[MAX_IPv6_STR_LEN]; /* gw */ |
1337 | #endif /* SK_LOG */ |
1338 | |
1339 | FR_LOCK_ASSERT_HELD(fr); |
1340 | |
1341 | if (fr->fr_rt_evhdlr_tag != NULL) { |
1342 | ASSERT(fr->fr_rt_dst != NULL); |
1343 | route_event_enqueue_nwk_wq_entry(fr->fr_rt_dst, NULL, |
1344 | ROUTE_EVHDLR_DEREGISTER, fr->fr_rt_evhdlr_tag, FALSE); |
1345 | fr->fr_rt_evhdlr_tag = NULL; |
1346 | fr->fr_rt_dst = NULL; |
1347 | } |
1348 | ASSERT(fr->fr_rt_dst == NULL); |
1349 | if (fr->fr_rt_gw != NULL) { |
1350 | rtfree(fr->fr_rt_gw); |
1351 | fr->fr_rt_gw = NULL; |
1352 | } |
1353 | |
1354 | #if SK_LOG |
1355 | if (fr->fr_flags & FLOWRTF_GATEWAY) { |
1356 | SK_DF(SK_VERB_FLOW_ROUTE, |
1357 | "clean fr 0x%llx %s -> %s via gw %s" , SK_KVA(fr), |
1358 | sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)), |
1359 | sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)), |
1360 | sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs))); |
1361 | } else if (fr->fr_flags & FLOWRTF_ONLINK) { |
1362 | SK_DF(SK_VERB_FLOW_ROUTE, |
1363 | "clean fr 0x%llx %s -> %s" , SK_KVA(fr), |
1364 | sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)), |
1365 | sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds))); |
1366 | } |
1367 | #endif /* SK_LOG */ |
1368 | |
1369 | os_atomic_andnot(&fr->fr_flags, (FLOWRTF_GATEWAY | FLOWRTF_ONLINK), relaxed); |
1370 | } |
1371 | |
1372 | static boolean_t |
1373 | _flow_route_laddr_validate(struct flow_ip_addr *src_ip0, uint8_t ip_v, |
1374 | struct ifnet *ifp, uint32_t *gencnt) |
1375 | { |
1376 | boolean_t address_found = TRUE; |
1377 | struct ifaddr *ifa = NULL; |
1378 | struct flow_ip_addr src_ip = {}; |
1379 | uint32_t scope = ifp->if_index; |
1380 | |
1381 | VERIFY(gencnt != NULL); |
1382 | VERIFY(ip_v == IPVERSION || ip_v == IPV6_VERSION); |
1383 | |
1384 | if (ip_v == IPVERSION) { |
1385 | memcpy(dst: &src_ip._v4, src: &src_ip0->_v4, n: sizeof(src_ip._v4)); |
1386 | |
1387 | ifa = (struct ifaddr *)ifa_foraddr_scoped( |
1388 | src_ip._v4.s_addr, scope); |
1389 | } else { |
1390 | memcpy(dst: &src_ip, src: src_ip0, n: sizeof(*src_ip0)); |
1391 | |
1392 | if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(&src_ip._v6)) { |
1393 | src_ip._v6.s6_addr16[1] = htons((uint16_t)scope); |
1394 | } |
1395 | ifa = (struct ifaddr *)ifa_foraddr6_scoped(&src_ip._v6, |
1396 | scope); |
1397 | } |
1398 | |
1399 | if (__improbable(ifa == NULL)) { |
1400 | address_found = FALSE; |
1401 | goto done; |
1402 | } |
1403 | |
1404 | ifnet_lock_shared(ifp); |
1405 | if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) { |
1406 | address_found = FALSE; |
1407 | ifnet_lock_done(ifp); |
1408 | goto done; |
1409 | } |
1410 | |
1411 | if (ip_v == IPV6_VERSION) { |
1412 | struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ifa; |
1413 | |
1414 | /* |
1415 | * Fail if IPv6 address is not ready or if the address |
1416 | * is reserved * for CLAT46. |
1417 | */ |
1418 | if (__improbable(ia6->ia6_flags & |
1419 | (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) != 0) { |
1420 | address_found = FALSE; |
1421 | ifnet_lock_done(ifp); |
1422 | goto done; |
1423 | } |
1424 | } else { |
1425 | /* |
1426 | * If interface has CLAT46 enabled, fail IPv4 bind. |
1427 | * Since this implies network is NAT64/DNS64, Internet |
1428 | * effectively becomes reachable over IPv6. If on |
1429 | * system IPv4 to IPv6 translation is required, that |
1430 | * should be handled solely through bump in the API. |
1431 | * The in kernel translation is only done for apps |
1432 | * directly using low level networking APIs. |
1433 | */ |
1434 | if (__improbable(IS_INTF_CLAT46(ifp))) { |
1435 | address_found = FALSE; |
1436 | ifnet_lock_done(ifp); |
1437 | goto done; |
1438 | } |
1439 | } |
1440 | |
1441 | *gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt; |
1442 | ifnet_lock_done(ifp); |
1443 | done: |
1444 | if (ifa != NULL) { |
1445 | ifa_remref(ifa); |
1446 | } |
1447 | |
1448 | return address_found; |
1449 | } |
1450 | |
1451 | boolean_t |
1452 | flow_route_laddr_validate(union sockaddr_in_4_6 *saddr, struct ifnet *ifp, |
1453 | uint32_t *gencnt) |
1454 | { |
1455 | VERIFY(saddr->sa.sa_family == AF_INET || |
1456 | saddr->sa.sa_family == AF_INET6); |
1457 | |
1458 | struct flow_ip_addr *ipa; |
1459 | uint8_t ipv; |
1460 | if (saddr->sa.sa_family == AF_INET) { |
1461 | ipv = IPVERSION; |
1462 | ipa = (struct flow_ip_addr *)(void *)&saddr->sin.sin_addr; |
1463 | } else { |
1464 | ipv = IPV6_VERSION; |
1465 | ipa = (struct flow_ip_addr *)(void *)&saddr->sin6.sin6_addr; |
1466 | } |
1467 | |
	return _flow_route_laddr_validate(ipa, ipv, ifp, gencnt);
1469 | } |
1470 | |
1471 | boolean_t |
1472 | flow_route_key_validate(struct flow_key *fk, struct ifnet *ifp, |
1473 | uint32_t *gencnt) |
1474 | { |
1475 | return _flow_route_laddr_validate(src_ip0: &fk->fk_src, ip_v: fk->fk_ipver, ifp, |
1476 | gencnt); |
1477 | } |
1478 | |