1/*
2 * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * Flow Routes.
31 *
32 * Each (non-listener) flow entry is always associated with a flow route
33 * object. Multiple flow entries sharing the same remote address will use
34 * the same flow route for that address. The flow route object contains
35 * the route information for the remote node. It gets allocated when a
36 * flow entry requests to connect, and is garbage-collected when it's no
37 * longer referred to after its expiration time has passed.
38 *
39 * A flow route also contains the default local address that's used to
 * reach the remote node. This isn't necessarily the same local address
 * used by the flow entry, since the entry may have been explicitly
 * bound to another local address. But in the majority of cases, having
 * the local address present in the flow route allows us to avoid doing
 * source address selection each time a connect request happens.
45 *
46 * When the remote node is reachable via a gateway, the gateway address
47 * portion of the flow route contains its IP address and the flow route
48 * is marked with FLOWRTF_GATEWAY. We use this to optimize the gateway
49 * route lookup, since otherwise we'd have to perform an extra lookup
50 * each time we need to resolve the route.
51 *
52 * When the remote node is directly on the link, the FLOWRTF_ONLINK flag
53 * is set, and the gateway address isn't used. The target address used
 * for resolution will be the remote address itself.
55 *
56 * On links with link-layer information, we store the resolved address
57 * of the target node (which may be the gateway's) in the flow route,
58 * and mark the flow route with FLOWRTF_HAS_LLINFO.
59 *
60 * Each flow route also registers itself to receive route events when
61 * the underlying rtentry is updated or deleted.
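 *
 * In terms of lifecycle, flow_route_find() looks up (or creates) the
 * flow route in its per-address bucket, flow_route_configure() validates
 * or refreshes the cached rtentry and the default local address, and the
 * nexus-specific resolver callback fills in the link-layer info.  Callers
 * manage references with flow_route_retain() and flow_route_release();
 * idle entries past their expiration are reclaimed by flow_route_prune().
 *
 * A nexus-side user of this API does roughly the following (illustrative
 * sketch only; my_ctor/my_resolve stand for the caller-supplied ctor and
 * resolver callbacks):
 *
 *	struct flow_route *fr = NULL;
 *
 *	err = flow_route_find(nx, fm, ifp, req, my_ctor, my_resolve,
 *	    arg, &fr);			/* retained fr on success */
 *	...
 *	flow_route_release(fr);		/* arms expiration for the pruner */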
62 */
63
64#include <skywalk/os_skywalk_private.h>
65
66#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
67#include <skywalk/nexus/flowswitch/fsw_var.h>
68#include <skywalk/nexus/flowswitch/flow/flow_var.h>
69
70#include <netinet/in.h>
71#include <netinet/in_var.h>
72#include <netinet/in_arp.h>
73#include <netinet6/nd6.h>
74#include <net/route.h>
75
76extern struct rtstat rtstat;
77
78static LCK_GRP_DECLARE(flow_route_lock_group, "sk_flow_route_lock");
79static LCK_ATTR_DECLARE(flow_route_lock_attr, 0, 0);
80
81static int fr_cmp(const struct flow_route *, const struct flow_route *);
82static int fr_id_cmp(const struct flow_route *, const struct flow_route *);
83static struct flow_route *fr_alloc(boolean_t);
84static void fr_free(struct flow_route *);
85static uint32_t flow_route_bucket_purge_common(struct flow_route_bucket *,
86 uint32_t *, boolean_t, boolean_t);
87static void flow_route_ev_callback(struct eventhandler_entry_arg,
88 struct sockaddr *, int, struct sockaddr *, int);
89
90RB_GENERATE_PREV(flow_route_tree, flow_route, fr_link, fr_cmp);
91RB_GENERATE_PREV(flow_route_id_tree, flow_route, fr_id_link, fr_id_cmp);
92
93KALLOC_TYPE_VAR_DEFINE(KT_SK_FRB, struct flow_route_bucket, KT_DEFAULT);
94KALLOC_TYPE_VAR_DEFINE(KT_SK_FRIB, struct flow_route_id_bucket, KT_DEFAULT);
95
96#define FR_ZONE_NAME "flow.route"
97
98static unsigned int flow_route_size; /* size of flow_route */
99struct skmem_cache *flow_route_cache; /* cache for flow_route */
100
101static int __flow_route_inited = 0;
102
103#define FLOW_ROUTE_EXPIRE 600 /* seconds */
104static unsigned int flow_route_expire = FLOW_ROUTE_EXPIRE;
105
106SYSCTL_UINT(_kern_skywalk_flowswitch, OID_AUTO, flow_route_expire,
107 CTLFLAG_RW | CTLFLAG_LOCKED, &flow_route_expire, 0, "");
108
109void
110flow_route_init(void)
111{
112 ASSERT(!__flow_route_inited);
113
114 flow_route_size = sizeof(struct flow_route);
115 flow_route_cache = skmem_cache_create(FR_ZONE_NAME, flow_route_size,
116 sizeof(uint64_t), NULL, NULL, NULL, NULL, NULL, 0);
117
118 __flow_route_inited = 1;
119}
120
121void
122flow_route_fini(void)
123{
124 if (__flow_route_inited) {
125 skmem_cache_destroy(flow_route_cache);
126 flow_route_cache = NULL;
127
128 __flow_route_inited = 0;
129 }
130}
131
132struct flow_route_bucket *
133flow_route_buckets_alloc(size_t frb_cnt, size_t *frb_sz, size_t *tot_sz)
134{
135 uint32_t cache_sz = skmem_cpu_cache_line_size();
136 struct flow_route_bucket *frb;
137 size_t frb_tot_sz;
138
139 /* each bucket is CPU cache-aligned */
140 *frb_sz = P2ROUNDUP(sizeof(*frb), cache_sz);
141 *tot_sz = frb_tot_sz = frb_cnt * (*frb_sz);
142 frb = sk_alloc_type_hash(KT_SK_FRB, frb_tot_sz, Z_WAITOK,
143 skmem_tag_fsw_frb_hash);
144 if (__improbable(frb == NULL)) {
145 return NULL;
146 }
147
148#if !KASAN_CLASSIC
149 /*
150 * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline
151 * size alignment if the requested size is a multiple of a cacheline
152 * size (this is true for any size that is a power of two from 16 to
153 * PAGE_SIZE).
154 *
	 * Because this is only an optimization, it is OK for KASAN_CLASSIC
	 * not to respect this.
157 */
158 ASSERT(IS_P2ALIGNED(frb, cache_sz));
#endif /* !KASAN_CLASSIC */
160
161 SK_DF(SK_VERB_MEM, "frb 0x%llx frb_cnt %zu frb_sz %zu "
162 "(total %zu bytes) ALLOC", SK_KVA(frb), frb_cnt,
163 *frb_sz, frb_tot_sz);
164
165 return frb;
166}
167
168void
169flow_route_buckets_free(struct flow_route_bucket *frb, size_t tot_sz)
170{
171 SK_DF(SK_VERB_MEM, "frb 0x%llx FREE", SK_KVA(frb));
172 sk_free_type_hash(KT_SK_FRB, tot_sz, frb);
173}
174
175void
176flow_route_bucket_init(struct flow_route_bucket *frb)
177{
178#if !KASAN_CLASSIC
179 ASSERT(IS_P2ALIGNED(frb, skmem_cpu_cache_line_size()));
180#endif /* !KASAN_CLASSIC */
	lck_rw_init(&frb->frb_lock, &flow_route_lock_group,
	    &flow_route_lock_attr);
183 RB_INIT(&frb->frb_head);
184}
185
186void
187flow_route_bucket_destroy(struct flow_route_bucket *frb)
188{
189 ASSERT(RB_EMPTY(&frb->frb_head));
	lck_rw_destroy(&frb->frb_lock, &flow_route_lock_group);
191}
192
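/*
 * Look up a flow route in the address bucket by remote address.  The
 * lookup key is built from the sockaddr's family and address bytes;
 * on a hit, a reference is taken on behalf of the caller.
 */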
193static struct flow_route *
194flow_route_find_by_addr(struct flow_route_bucket *frb,
195 union sockaddr_in_4_6 *dst)
196{
197 struct flow_route *fr;
198 struct flow_route find;
199
200 FRB_LOCK_ASSERT_HELD(frb);
201
202 switch (SA(dst)->sa_family) {
203 case AF_INET:
204 find.fr_af = AF_INET;
205 find.fr_addr_len = sizeof(struct in_addr);
206 find.fr_addr_key = (void *)&SIN(dst)->sin_addr;
207 break;
208
209 case AF_INET6:
210 find.fr_af = AF_INET6;
211 find.fr_addr_len = sizeof(struct in6_addr);
212 find.fr_addr_key = (void *)&SIN6(dst)->sin6_addr;
213 break;
214
215 default:
216 VERIFY(0);
217 /* NOTREACHED */
218 __builtin_unreachable();
219 }
220
221 fr = RB_FIND(flow_route_tree, &frb->frb_head, &find);
222 if (fr != NULL) {
223 flow_route_retain(fr); /* for the caller */
224 }
225 return fr;
226}
227
228struct flow_route_id_bucket *
229flow_route_id_buckets_alloc(size_t frib_cnt, size_t *frib_sz, size_t *tot_sz)
230{
231 uint32_t cache_sz = skmem_cpu_cache_line_size();
232 struct flow_route_id_bucket *frib;
233 size_t frib_tot_sz;
234
235 /* each bucket is CPU cache-aligned */
236 *frib_sz = P2ROUNDUP(sizeof(*frib), cache_sz);
237 *tot_sz = frib_tot_sz = frib_cnt * (*frib_sz);
238 frib = sk_alloc_type_hash(KT_SK_FRIB, frib_tot_sz, Z_WAITOK,
239 skmem_tag_fsw_frib_hash);
241 if (__improbable(frib == NULL)) {
242 return NULL;
243 }
244
245#if !KASAN_CLASSIC
246 /*
247 * except in KASAN_CLASSIC mode, kalloc will always maintain cacheline
248 * size alignment if the requested size is a multiple of a cacheline
249 * size (this is true for any size that is a power of two from 16 to
250 * PAGE_SIZE).
251 *
	 * Because this is only an optimization, it is OK for KASAN_CLASSIC
	 * not to respect this.
254 */
255 ASSERT(IS_P2ALIGNED(frib, cache_sz));
256#endif /* !KASAN_CLASSIC */
257
258 SK_DF(SK_VERB_MEM, "frib 0x%llx frib_cnt %zu frib_sz %zu "
259 "(total %zu bytes) ALLOC", SK_KVA(frib), frib_cnt,
260 *frib_sz, frib_tot_sz);
261
262 return frib;
263}
264
265void
266flow_route_id_buckets_free(struct flow_route_id_bucket *frib, size_t tot_sz)
267{
268 SK_DF(SK_VERB_MEM, "frib 0x%llx FREE", SK_KVA(frib));
269 sk_free_type_hash(KT_SK_FRIB, tot_sz, frib);
270}
271
272void
273flow_route_id_bucket_init(struct flow_route_id_bucket *frib)
274{
275#if !KASAN_CLASSIC
276 ASSERT(IS_P2ALIGNED(frib, skmem_cpu_cache_line_size()));
#endif /* !KASAN_CLASSIC */
	lck_rw_init(&frib->frib_lock, &flow_route_lock_group,
	    &flow_route_lock_attr);
280 RB_INIT(&frib->frib_head);
281}
282
283void
284flow_route_id_bucket_destroy(struct flow_route_id_bucket *frib)
285{
286 ASSERT(RB_EMPTY(&frib->frib_head));
	lck_rw_destroy(&frib->frib_lock, &flow_route_lock_group);
288}
289
290static struct flow_route *
291flow_route_find_by_uuid(struct flow_route_id_bucket *frib, uuid_t id)
292{
293 struct flow_route *fr;
294 struct flow_route find;
295
296 FRIB_LOCK_ASSERT_HELD(frib);
297
	uuid_copy(find.fr_uuid, id);
299 fr = RB_FIND(flow_route_id_tree, &frib->frib_head, &find);
300 if (fr != NULL) {
301 flow_route_retain(fr); /* for the caller */
302 }
303 return fr;
304}
305
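/*
 * Allocate and minimally initialize a flow route: zero the structure,
 * set up its locks, and assign a random UUID.  Everything else is
 * filled in by flow_route_find() and flow_route_configure().
 */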
306static struct flow_route *
307fr_alloc(boolean_t cansleep)
308{
309 struct flow_route *fr;
310
311 if ((fr = skmem_cache_alloc(flow_route_cache,
312 (cansleep ? SKMEM_SLEEP : SKMEM_NOSLEEP))) != NULL) {
		bzero(fr, flow_route_size);
		lck_spin_init(&fr->fr_reflock, &flow_route_lock_group,
		    &flow_route_lock_attr);
		lck_mtx_init(&fr->fr_lock, &flow_route_lock_group,
		    &flow_route_lock_attr);
		uuid_generate_random(fr->fr_uuid);
319
320 SK_DF(SK_VERB_MEM, "allocated fr 0x%llx", SK_KVA(fr));
321 }
322
323 return fr;
324}
325
326static void
327fr_free(struct flow_route *fr)
328{
329 SK_DF(SK_VERB_MEM, "freeing fr 0x%llx", SK_KVA(fr));
330
331 VERIFY(!(fr->fr_flags & FLOWRTF_ATTACHED));
332 VERIFY(fr->fr_usecnt == 0);
333
334 FR_LOCK(fr);
335 /* callee frees route entry */
336 flow_route_cleanup(fr);
337 VERIFY(fr->fr_rt_dst == NULL);
338 VERIFY(fr->fr_rt_gw == NULL);
339 VERIFY(fr->fr_rt_evhdlr_tag == NULL);
340 FR_UNLOCK(fr);
341
	lck_mtx_destroy(&fr->fr_lock, &flow_route_lock_group);
	lck_spin_destroy(&fr->fr_reflock, &flow_route_lock_group);
344
345 skmem_cache_free(flow_route_cache, fr);
346}
347
348static inline int
349fr_cmp(const struct flow_route *a, const struct flow_route *b)
350{
351 int d;
352
353 if ((d = (a->fr_af - b->fr_af)) != 0) {
354 return d;
355 }
	if ((d = flow_ip_cmp(a->fr_addr_key, b->fr_addr_key,
	    b->fr_addr_len)) != 0) {
358 return d;
359 }
360
361 return 0;
362}
363
364static inline int
365fr_id_cmp(const struct flow_route *a, const struct flow_route *b)
366{
	return uuid_compare(a->fr_uuid, b->fr_uuid);
368}
369
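/*
 * Decide whether the stable (non-temporary) IPv6 source address should
 * be preferred.  The system-wide ip6_prefer_tempaddr setting applies
 * unless the flow request explicitly overrides address selection.
 */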
370static inline int
371fr_use_stable_address(struct nx_flow_req *req)
372{
373 int use_stable_address = ip6_prefer_tempaddr ? 0 : 1;
374 if (req != NULL &&
375 (req->nfr_flags & NXFLOWREQF_OVERRIDE_ADDRESS_SELECTION)) {
376 use_stable_address = (req->nfr_flags & NXFLOWREQF_USE_STABLE_ADDRESS) ? 1 : 0;
377 }
378 return use_stable_address;
379}
380
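/*
 * (Re)configure a flow route: make sure there is a usable rtentry for
 * the remote address, register for route events on it, record whether
 * the destination is reachable via a gateway or directly on-link, and
 * re-select the default local address if anything relevant has changed.
 * Called with fr_lock held.
 */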
381int
382flow_route_configure(struct flow_route *fr, struct ifnet *ifp, struct nx_flow_req *req)
383{
384#if SK_LOG
385 char old_s[MAX_IPv6_STR_LEN]; /* src */
386 char src_s[MAX_IPv6_STR_LEN]; /* src */
387 char dst_s[MAX_IPv6_STR_LEN]; /* dst */
388#endif /* SK_LOG */
389 struct rtentry *rt = NULL, *gwrt = NULL;
390 int err = 0;
391
392 FR_LOCK_ASSERT_HELD(fr);
393
394 /*
	 * If there is a route entry for the final destination, check
	 * whether it is still valid; if not, perform another routing
	 * table lookup.
397 * A non-NULL fr_rt_dst is always associated with a route event
398 * registration, and the route reference is held there.
399 */
400 rt = fr->fr_rt_dst;
401 if (rt == NULL || !(rt->rt_flags & RTF_UP) || fr->fr_want_configure) {
402 struct eventhandler_entry_arg ee_arg;
403
404 /* callee frees route entry */
405 flow_route_cleanup(fr);
406
407 /* lookup destination route */
408 ASSERT(err == 0);
409 rt = rtalloc1_scoped(SA(&fr->fr_faddr), 1, 0, ifp->if_index);
410 if (rt == NULL) {
411 err = EHOSTUNREACH;
412 SK_ERR("no route to %s on %s (err %d)",
413 sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
414 sizeof(dst_s)), ifp->if_xname, err);
415 } else {
416 /*
			 * If the route points to another interface and the
418 * route's gateway isn't link-layer, reject it.
419 * We make an exception otherwise, since local
420 * interface addresses resolve this way.
421 */
422 if (rt->rt_ifp != ifp && rt->rt_ifp != lo_ifp &&
423 (rt->rt_gateway == NULL ||
424 SA(rt->rt_gateway)->sa_family != AF_LINK)) {
425 err = EHOSTUNREACH;
426 SK_ERR("route to %s on %s != %s (err %d)",
427 sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
428 sizeof(dst_s)), rt->rt_ifp->if_xname,
429 ifp->if_xname, err);
430 }
431 }
432
433 if (err != 0) {
434 goto done;
435 }
436
437 ASSERT(fr->fr_mgr != NULL);
438 ASSERT(!uuid_is_null(fr->fr_mgr->fm_uuid));
439 ASSERT(!uuid_is_null(fr->fr_uuid));
440 ASSERT(!uuid_is_null(fr->fr_nx_uuid));
441
		bzero(&ee_arg, sizeof(ee_arg));
		uuid_copy(ee_arg.ee_fm_uuid, fr->fr_mgr->fm_uuid);
		uuid_copy(ee_arg.ee_fr_uuid, fr->fr_uuid);
445
446 /*
		 * Register for changes on the destination route; this covers
		 * both the case where the destination is on-link and the case
		 * where it is off-link behind a gateway route.  This also
		 * transfers the refcnt
450 * of the route entry to the event handler, released later when
451 * it is deregistered.
452 */
453 ASSERT(fr->fr_rt_dst == NULL);
454 ASSERT(fr->fr_rt_evhdlr_tag == NULL);
455 fr->fr_rt_dst = rt; /* move reference to fr */
456 fr->fr_rt_evhdlr_tag =
457 EVENTHANDLER_REGISTER(&rt->rt_evhdlr_ctxt, route_event,
458 flow_route_ev_callback, ee_arg, EVENTHANDLER_PRI_ANY);
459 ASSERT(fr->fr_rt_evhdlr_tag != NULL);
460 os_atomic_andnot(&fr->fr_flags, FLOWRTF_DELETED, relaxed);
461
462 /*
463 * Lookup gateway route (if any); returns locked gwrt
464 * with a reference bumped up.
465 */
466 err = route_to_gwroute(SA(&fr->fr_faddr), rt, &gwrt);
467 if (err != 0) {
468 /*
469 * Reference held by fr_rt_dst will be taken
470 * care of by flow_route_cleanup() below, so
471 * make sure we don't do an extra rtfree().
472 */
473 rt = NULL;
474 ASSERT(gwrt == NULL);
475 SK_ERR("no gw route to %s on %s (err %d)",
476 sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
477 sizeof(dst_s)), ifp->if_xname, err);
478 goto done;
479 }
480
481 /* if RTF_GATEWAY isn't set, gwrt == rt */
482 ASSERT(gwrt != NULL);
483 RT_LOCK_ASSERT_HELD(gwrt);
484
485 /*
		 * These must have been cleared via cleanup; note that we're
		 * single-threaded here for fr by virtue of fr_lock.
488 */
489 ASSERT(!(fr->fr_flags & (FLOWRTF_GATEWAY | FLOWRTF_ONLINK)));
490
491 if (gwrt != rt && (rt->rt_flags & RTF_GATEWAY) &&
492 (rt->rt_gateway->sa_family == AF_INET ||
493 rt->rt_gateway->sa_family == AF_INET6)) {
494 struct sockaddr_storage ss;
495
496 ASSERT(fr->fr_rt_gw == NULL);
497 /* locked via route_to_gwroute() above */
498 fr->fr_rt_gw = gwrt; /* move reference to fr */
499 RT_ADDREF_LOCKED(gwrt); /* for this routine */
500 /*
501 * Destination is off-link and is reachable
502 * thru an IP gateway route. Save the IP
503 * address of the gateway in fr_gaddr.
504 */
505 (void) sa_copy(rt->rt_gateway, &ss, NULL);
506 _CASSERT(sizeof(fr->fr_gaddr) <= sizeof(ss));
			bcopy(&ss, &fr->fr_gaddr, sizeof(fr->fr_gaddr));
508 os_atomic_or(&fr->fr_flags, FLOWRTF_GATEWAY, relaxed);
509 } else if (IS_DIRECT_HOSTROUTE(rt)) {
510 /*
511 * Destination is on-link.
512 */
513 os_atomic_or(&fr->fr_flags, FLOWRTF_ONLINK, relaxed);
514 }
515 RT_UNLOCK(gwrt);
516 }
517 RT_ADDREF(rt); /* for this routine */
518
519 /* see if we need to re-select default source address */
520 int use_stable_address = fr_use_stable_address(req);
521 if (fr->fr_want_configure ||
522 fr->fr_laddr_gencnt != ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt ||
523 !(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address) {
524 union sockaddr_in_4_6 old = fr->fr_laddr;
525 if (use_stable_address) {
526 os_atomic_or(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
527 } else {
528 os_atomic_andnot(&fr->fr_flags, FLOWRTF_STABLE_ADDR, relaxed);
529 }
530 if ((err = flow_route_select_laddr(&fr->fr_laddr, &fr->fr_faddr,
531 ifp, rt, &fr->fr_laddr_gencnt, use_stable_address)) != 0) {
532 SK_ERR("no usable src address to reach %s on %s "
533 "(err %d)", sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
534 sizeof(dst_s)), ifp->if_xname, err);
535 goto done;
536 }
		if (bcmp(&old, &fr->fr_laddr, SA(&old)->sa_len) != 0) {
538 SK_ERR("src address is now %s (was %s) to reach %s "
539 "on %s", sk_sa_ntop(SA(&fr->fr_laddr), src_s,
540 sizeof(src_s)), sk_sa_ntop(SA(&old), old_s,
541 sizeof(old_s)), sk_sa_ntop(SA(&fr->fr_faddr),
542 dst_s, sizeof(dst_s)), ifp->if_xname);
543 }
544 }
545 ASSERT(err == 0);
546
547done:
548 if (__probable(err == 0)) {
549 os_atomic_store(&fr->fr_want_configure, 0, release);
550 } else {
551 /* callee frees route entry */
552 flow_route_cleanup(fr);
553 }
554
555 if (gwrt != NULL) {
556 ASSERT(rt != NULL);
557 if (gwrt == rt) {
558 RT_REMREF(gwrt);
559 } else {
560 rtfree(gwrt);
561 }
562 gwrt = NULL;
563 }
564
565 if (rt != NULL) {
566 rtfree(rt);
567 rt = NULL;
568 }
569
570 return err;
571}
572
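/*
 * Look up the flow route for the request's remote address, creating and
 * configuring one if needed.  The fast path is a read-locked bucket
 * lookup; on a miss the lock is upgraded to writer (re-checking for a
 * racing insert) before a new entry is allocated, configured, inserted
 * into both the address and UUID trees, and handed to the resolver.
 */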
573int
574flow_route_find(struct kern_nexus *nx, struct flow_mgr *fm,
575 struct ifnet *ifp, struct nx_flow_req *req,
576 flow_route_ctor_fn_t fr_ctor, flow_route_resolve_fn_t fr_resolve,
577 void *arg, struct flow_route **frp)
578{
579#if SK_LOG
	char src_s[MAX_IPv6_STR_LEN]; /* src */
581 char dst_s[MAX_IPv6_STR_LEN]; /* dst */
582 char gw_s[MAX_IPv6_STR_LEN]; /* gw */
583#endif /* SK_LOG */
584 union sockaddr_in_4_6 *daddr = &req->nfr_daddr;
585 struct flow_route_bucket *frb;
586 struct flow_route_id_bucket *frib;
587 struct flow_route *fr = NULL;
588 int err = 0;
589
590 ASSERT(fr_ctor != NULL && fr_resolve != NULL);
591
592 ASSERT(frp != NULL);
593 *frp = NULL;
594
595 frb = flow_mgr_get_frb_by_addr(fm, daddr);
596
597 int use_stable_address = fr_use_stable_address(req);
598
599 /* see if there is a cached flow route (as reader) */
600 FRB_RLOCK(frb);
	fr = flow_route_find_by_addr(frb, daddr);
602 if (fr != NULL) {
603 if (__improbable(fr->fr_want_configure || fr->fr_laddr_gencnt !=
604 ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt) ||
605 __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
606 os_atomic_inc(&fr->fr_want_configure, relaxed);
607 FR_LOCK(fr);
608 err = flow_route_configure(fr, ifp, req);
609 if (err != 0) {
610 SK_ERR("fr 0x%llx error re-configuring dst %s "
611 "on %s (err %d) [R]", SK_KVA(fr),
612 sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
613 sizeof(dst_s)), ifp->if_xname, err);
614 }
615 FR_UNLOCK(fr);
616 }
617 if (err == 0) {
618 SK_DF(SK_VERB_FLOW_ROUTE,
619 "fr 0x%llx found for dst %s " "on %s [R,%u]",
620 SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
621 sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
622 }
623 FRB_RUNLOCK(frb); /* reader */
624 goto done;
625 }
626
627 /*
628 * Flow route doesn't exist; become a writer and prepare to
629 * allocate one. We could be racing with other threads here,
630 * so check first if there is now a cached flow route that
631 * got created by the winning thread.
632 */
633 if (!FRB_RLOCKTOWLOCK(frb)) {
634 FRB_WLOCK(frb);
635 }
636
	fr = flow_route_find_by_addr(frb, daddr);
638 if (fr != NULL) {
639 if (__improbable(fr->fr_want_configure) ||
640 __improbable(!(fr->fr_flags & FLOWRTF_STABLE_ADDR) != !use_stable_address)) {
641 FR_LOCK(fr);
642 err = flow_route_configure(fr, ifp, req);
643 if (err != 0) {
644 SK_ERR("fr 0x%llx error re-configuring dst %s "
645 "on %s (err %d) [W]", SK_KVA(fr),
646 sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
647 sizeof(dst_s)), ifp->if_xname, err);
648 }
649 FR_UNLOCK(fr);
650 }
651 if (err == 0) {
652 SK_DF(SK_VERB_FLOW_ROUTE,
653 "fr 0x%llx found for dst %s on %s [W,%u]",
654 SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
655 sizeof(dst_s)), ifp->if_xname, fr->fr_usecnt);
656 }
657 FRB_WUNLOCK(frb); /* writer */
658 goto done;
659 }
660
661 /* allocate one */
662 fr = fr_alloc(TRUE);
663 fr->fr_faddr = *daddr; /* remote address */
664
665 switch (SA(&fr->fr_faddr)->sa_family) {
666 case AF_INET:
667 SIN(&fr->fr_faddr)->sin_port = 0;
668 fr->fr_addr_len = sizeof(struct in_addr);
669 fr->fr_addr_key = &SIN(&fr->fr_faddr)->sin_addr;
670 break;
671
672 case AF_INET6:
673 SIN6(&fr->fr_faddr)->sin6_port = 0;
674 fr->fr_addr_len = sizeof(struct in6_addr);
675 fr->fr_addr_key = &SIN6(&fr->fr_faddr)->sin6_addr;
676 break;
677
678 default:
679 VERIFY(0);
680 /* NOTREACHED */
681 __builtin_unreachable();
682 }
683
684 ASSERT(!uuid_is_null(fr->fr_uuid));
	uuid_copy(fr->fr_nx_uuid, nx->nx_uuid);
686 *(struct flow_mgr **)(uintptr_t)&fr->fr_mgr = fm;
687
688 /* force configure newly-created flow route */
689 os_atomic_inc(&fr->fr_want_configure, relaxed);
690
691 FR_LOCK(fr);
692 if ((err = flow_route_configure(fr, ifp, req)) != 0) {
693 SK_ERR("fr 0x%llx error configuring dst %s on %s (err %d)",
694 SK_KVA(fr), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
695 sizeof(dst_s)), ifp->if_xname, err);
696 FR_UNLOCK(fr);
697 FRB_WUNLOCK(frb); /* writer */
698 /* not yet in tree, so free immediately */
699 fr_free(fr);
700 fr = NULL;
701 goto done;
702 }
703
704 /* execute nexus-specific constructor */
705 fr_ctor(arg, fr);
706 FR_UNLOCK(fr);
707
708 frib = flow_mgr_get_frib_by_uuid(fm, fr->fr_uuid);
709 FRIB_WLOCK(frib);
710
711 *(struct flow_route_bucket **)(uintptr_t)&fr->fr_frb = frb;
712 *(struct flow_route_id_bucket **)(uintptr_t)&fr->fr_frib = frib;
713
714 FRB_WLOCK_ASSERT_HELD(frb);
715 FRIB_WLOCK_ASSERT_HELD(frib);
716
717 RB_INSERT(flow_route_tree, &frb->frb_head, fr);
718 RB_INSERT(flow_route_id_tree, &frib->frib_head, fr);
719
720 os_atomic_or(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);
721
722#if DEBUG
723 /* sanity checks for comparator routines */
724 VERIFY(flow_route_find_by_addr(frb, &fr->fr_faddr) == fr);
725 flow_route_release(fr);
726 VERIFY(flow_route_find_by_uuid(frib, fr->fr_uuid) == fr);
727 flow_route_release(fr);
728#endif /* DEBUG */
729
730 /* for the trees */
731 _CASSERT(FLOW_ROUTE_MINREF == 2);
732 flow_route_retain(fr);
733 flow_route_retain(fr);
734 ASSERT(fr->fr_usecnt == FLOW_ROUTE_MINREF);
735
736 /* for the caller */
737 flow_route_retain(fr);
738
739 FRIB_WUNLOCK(frib); /* writer */
740 FRB_WUNLOCK(frb); /* writer */
741
742 /* execute nexus-specific resolver */
743 if (!(fr->fr_flags & FLOWRTF_RESOLVED) &&
744 (err = fr_resolve(arg, fr, NULL)) != 0) {
745 if (fr->fr_flags & FLOWRTF_GATEWAY) {
746 SK_ERR("fr 0x%llx resolve %s gw %s on %s (err %d)",
747 SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
748 "fail"), sk_sa_ntop(SA(&fr->fr_gaddr), dst_s,
749 sizeof(dst_s)), ifp->if_xname, err);
750 } else {
751 SK_ERR("fr 0x%llx resolve %s dst %s on %s (err %d)",
752 SK_KVA(fr), (err == EJUSTRETURN ? "pending" :
753 "fail"), sk_sa_ntop(SA(&fr->fr_faddr), dst_s,
754 sizeof(dst_s)), ifp->if_xname, err);
755 }
756 if (err == EJUSTRETURN) {
757 err = 0;
758 } else {
759 goto done;
760 }
761 }
762 ASSERT(err == 0);
763
764#if SK_LOG
765 if (fr->fr_flags & FLOWRTF_GATEWAY) {
766 SK_DF(SK_VERB_FLOW_ROUTE,
767 "add fr 0x%llx %s -> %s via gw %s on %s", SK_KVA(fr),
768 sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
769 sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
770 sk_sa_ntop(SA(&fr->fr_gaddr), gw_s, sizeof(gw_s)),
771 ifp->if_xname);
772 } else {
773 SK_DF(SK_VERB_FLOW_ROUTE,
774 "add fr 0x%llx %s -> %s on %s", SK_KVA(fr),
775 sk_sa_ntop(SA(&fr->fr_laddr), src_s, sizeof(src_s)),
776 sk_sa_ntop(SA(&fr->fr_faddr), dst_s, sizeof(dst_s)),
777 ifp->if_xname);
778 }
779#endif /* SK_LOG */
780
781done:
782 if (err == 0) {
783 ASSERT(fr != NULL);
784 *frp = fr;
785 } else if (fr != NULL) {
786 /* can't directly call fr_free() if it's in the tree */
787 flow_route_release(fr);
788 fr = NULL;
789 }
790
791 return err;
792}
793
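/*
 * Reference counting: each flow route holds FLOW_ROUTE_MINREF references
 * for its two tree linkages.  When the last caller reference is dropped
 * the expiration timer is armed so the pruner can reclaim the entry;
 * once detached from the trees, dropping the final reference frees it.
 */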
794void
795flow_route_retain(struct flow_route *fr)
796{
	lck_spin_lock(&fr->fr_reflock);
798 if (fr->fr_usecnt++ == FLOW_ROUTE_MINREF) {
799 fr->fr_expire = 0;
800 }
	lck_spin_unlock(&fr->fr_reflock);
802}
803
804void
805flow_route_release(struct flow_route *fr)
806{
807 bool should_free = false;
808
	lck_spin_lock(&fr->fr_reflock);
810 VERIFY(fr->fr_usecnt > 0);
811 if (fr->fr_flags & FLOWRTF_ATTACHED) {
812 if (fr->fr_usecnt-- == (FLOW_ROUTE_MINREF + 1)) {
813 fr->fr_expire = _net_uptime + flow_route_expire;
814 }
815 } else {
816 /*
		 * fr is no longer in the lookup trees, so no new references
		 * can be taken.  Once fr_usecnt reaches 0, this is the very
		 * last reference and it is safe to unlock and call fr_free().
820 */
821 if (--(fr->fr_usecnt) == 0) {
822 should_free = true;
823 }
824 }
	lck_spin_unlock(&fr->fr_reflock);
826
827 if (should_free) {
828 fr_free(fr);
829 }
830}
831
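/*
 * Remove flow routes from a bucket.  With "all" set the caller holds the
 * writer locks on every bucket and everything is detached; otherwise only
 * idle entries (at minimum refcnt) that have expired, been deleted, or
 * are being forced out early are removed.  Returns the number removed,
 * with the count of surviving entries in *resid.
 */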
832static uint32_t
833flow_route_bucket_purge_common(struct flow_route_bucket *frb, uint32_t *resid,
834 boolean_t all, boolean_t early_expire)
835{
836#if SK_LOG
	char ss[MAX_IPv6_STR_LEN]; /* src */
838 char ds[MAX_IPv6_STR_LEN]; /* dst */
839 char gs[MAX_IPv6_STR_LEN]; /* gw */
840#endif /* SK_LOG */
841 struct flow_route *fr, *tfr;
842 uint64_t now = net_uptime();
843 uint32_t i = 0, tot = 0;
844
845 FRB_WLOCK_ASSERT_HELD(frb);
846
847 RB_FOREACH_SAFE(fr, flow_route_tree, &frb->frb_head, tfr) {
848 struct flow_route_id_bucket *frib =
849 __DECONST(struct flow_route_id_bucket *, fr->fr_frib);
850
851 ++tot;
852 /*
853 * We're not holding fr_lock here, since this is a
854 * best-effort check. If there's a race and we miss
855 * it now, we'll come back again shortly.
856 */
		lck_spin_lock(&fr->fr_reflock);
858 if (!all && (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
859 (fr->fr_expire > now && !early_expire &&
860 !(fr->fr_flags & FLOWRTF_DELETED)))) {
			lck_spin_unlock(&fr->fr_reflock);
862 SK_DF(SK_VERB_FLOW_ROUTE, "skipping fr 0x%llx "
863 "refcnt %u expire %llu", SK_KVA(fr),
864 fr->fr_usecnt, fr->fr_expire);
865 continue;
866 }
		lck_spin_unlock(&fr->fr_reflock);
868
869 /*
870 * If "all" is set, flow entries must be gone by now, as
		 * we must have been called by flow_route_bucket_purge_all().
		 * It also means that the caller has acquired the writer lock
		 * on all flow {route,route_id} buckets, and fr_usecnt
874 * must be at its minimum value now.
875 */
876 if (!all) {
877 FRIB_WLOCK(frib);
878 }
879 FRIB_WLOCK_ASSERT_HELD(frib);
880
881 _CASSERT(FLOW_ROUTE_MINREF == 2);
882 ASSERT(fr->fr_usecnt >= FLOW_ROUTE_MINREF);
883
884 RB_REMOVE(flow_route_tree, &frb->frb_head, fr);
885 RB_REMOVE(flow_route_id_tree, &frib->frib_head, fr);
886
887 os_atomic_andnot(&fr->fr_flags, FLOWRTF_ATTACHED, relaxed);
888
889#if SK_LOG
890 if (fr->fr_flags & FLOWRTF_GATEWAY) {
891 SK_DF(SK_VERB_FLOW_ROUTE,
892 "remove fr 0x%llx %s -> %s via gw %s [exp %lld]",
893 SK_KVA(fr),
894 sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
895 sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
896 sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)),
897 (int64_t)(fr->fr_expire - now));
898 } else {
899 SK_DF(SK_VERB_FLOW_ROUTE,
900 "remove fr 0x%llx %s -> %s [exp %lld]", SK_KVA(fr),
901 sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
902 sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
903 (int64_t)(fr->fr_expire - now));
904 }
905#endif /* SK_LOG */
906
907 /* for the trees */
908 flow_route_release(fr);
909 flow_route_release(fr);
910 ++i;
911
912 if (!all) {
913 FRIB_WUNLOCK(frib);
914 }
915 }
916
917 if (resid != NULL) {
918 *resid = (tot - i);
919 }
920
921 return i;
922}
923
924void
925flow_route_bucket_purge_all(struct flow_route_bucket *frb)
926{
927 (void) flow_route_bucket_purge_common(frb, NULL, TRUE, FALSE);
928}
929
930static uint32_t
931flow_route_bucket_prune(struct flow_route_bucket *frb, struct ifnet *ifp,
932 uint32_t *resid)
933{
934 uint64_t now = net_uptime();
935 struct flow_route *fr;
936 uint32_t i = 0, tot = 0;
937 boolean_t ifdown = !(ifp->if_flags & IFF_UP);
938
939 FRB_RLOCK(frb);
940 RB_FOREACH(fr, flow_route_tree, &frb->frb_head) {
941 ++tot;
942 /* loose check; do this without holding fr_reflock */
943 if (fr->fr_usecnt > FLOW_ROUTE_MINREF ||
944 (fr->fr_expire > now && !ifdown &&
945 !(fr->fr_flags & FLOWRTF_DELETED))) {
946 continue;
947 }
948 ++i;
949 }
950
951 /*
952 * If there's nothing to prune or there's a writer, we're done.
953 * Note that if we failed to upgrade to writer, the lock would
954 * have been released automatically.
955 */
956 if (i == 0 || !FRB_RLOCKTOWLOCK(frb)) {
957 if (i == 0) {
958 FRB_RUNLOCK(frb);
959 }
960 if (resid != NULL) {
961 *resid = (tot - i);
962 }
963 return 0;
964 }
965
966 SK_DF(SK_VERB_FLOW_ROUTE, "purging at least %u idle routes on %s",
967 i, ifp->if_xname);
968
969 /* purge idle ones */
	i = flow_route_bucket_purge_common(frb, resid, FALSE, ifdown);
971 FRB_WUNLOCK(frb);
972
973 return i;
974}
975
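/*
 * Walk all flow route buckets of the flow manager and prune idle entries;
 * entries are expired early if the interface is down.  Returns the number
 * pruned and accumulates the count of survivors in *tot_resid.
 */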
976uint32_t
977flow_route_prune(struct flow_mgr *fm, struct ifnet *ifp,
978 uint32_t *tot_resid)
979{
980 uint32_t pruned = 0;
981 uint32_t resid;
982 uint32_t i;
983
984 for (i = 0; i < fm->fm_route_buckets_cnt; i++) {
		struct flow_route_bucket *frb = flow_mgr_get_frb_at_idx(fm, i);
		pruned += flow_route_bucket_prune(frb, ifp, &resid);
987 if (tot_resid != NULL) {
988 *tot_resid += resid;
989 }
990 }
991
992 return pruned;
993}
994
995/*
 * This runs in the context of the eventhandler invocation routine, which
 * loops through all the registered callbacks.  Care must be taken not to
 * call any primitives here that would lead to routing changes in the same
 * context, as that would lead to a deadlock in the eventhandler code.
1000 */
1001static void
1002flow_route_ev_callback(struct eventhandler_entry_arg ee_arg,
1003 struct sockaddr *dst, int route_ev, struct sockaddr *gw_addr, int flags)
1004{
1005#pragma unused(dst, flags)
1006#if SK_LOG
1007 char dst_s[MAX_IPv6_STR_LEN];
1008#endif /* SK_LOG */
1009 struct flow_route_id_bucket *frib = NULL;
1010 struct flow_route *fr = NULL;
1011 struct flow_mgr *fm;
1012
1013 VERIFY(!uuid_is_null(ee_arg.ee_fm_uuid));
1014 VERIFY(!uuid_is_null(ee_arg.ee_fr_uuid));
1015
1016 /*
1017 * Upon success, callee will hold flow manager lock as reader,
	 * and we'll need to unlock it below.  Otherwise there's nothing
	 * to unlock and we simply return.
1020 */
1021 fm = flow_mgr_find_lock(ee_arg.ee_fm_uuid);
1022 if (fm == NULL) {
1023 SK_ERR("Event %s for dst %s ignored; flow manager not found",
1024 route_event2str(route_ev), sk_sa_ntop(dst, dst_s,
1025 sizeof(dst_s)));
1026 return;
1027 }
1028
1029 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s event %s", fm->fm_name,
1030 sk_sa_ntop(dst, dst_s, sizeof(dst_s)), route_event2str(route_ev));
1031
1032 do {
1033 frib = flow_mgr_get_frib_by_uuid(fm, ee_arg.ee_fr_uuid);
1034
1035 FRIB_RLOCK(frib);
1036 /* callee returns a reference that we need to release below */
		fr = flow_route_find_by_uuid(frib, ee_arg.ee_fr_uuid);
1038 if (fr == NULL) {
1039 SK_ERR("%s: dst %s flow route not found", fm->fm_name,
1040 sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1041 break;
1042 }
1043
1044 /*
1045 * Grab fr_lock to prevent flow route configuration or
1046 * resolver from using stale info while we are updating.
1047 */
1048 FR_LOCK(fr);
1049
1050 switch (route_ev) {
1051 case ROUTE_ENTRY_REFRESH:
1052 /*
1053 * This is the case where the route entry has been
1054 * updated (for example through RTM_CHANGE). Some
1055 * of it may not warrant a lookup again and some of
1056 * it may. For now, mark flow to perform a look-up
1057 * again as the gateway may have changed.
1058 */
1059 os_atomic_inc(&fr->fr_want_configure, relaxed);
1060 os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1061 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route changed",
1062 fm->fm_name, sk_sa_ntop(dst, dst_s,
1063 sizeof(dst_s)));
1064 break;
1065
1066 case ROUTE_ENTRY_DELETED:
1067 /*
1068 * NOTE: flow_route_cleanup() should not be called
1069 * to de-register eventhandler in the context of
1070 * eventhandler callback to avoid deadlock in
1071 * eventhandler code. Instead, just mark the flow
1072 * route un-resolved. When it is being used again
1073 * or being deleted the old eventhandler must be
1074 * de-registered.
1075 */
1076 os_atomic_inc(&fr->fr_want_configure, relaxed);
1077 os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1078 os_atomic_or(&fr->fr_flags, FLOWRTF_DELETED, relaxed);
1079 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s route deleted",
1080 fm->fm_name, sk_sa_ntop(dst, dst_s,
1081 sizeof(dst_s)));
1082 break;
1083
1084 case ROUTE_LLENTRY_STALE:
1085 /*
			 * The route entry is deemed unreliable or old
1087 * enough to trigger a route lookup again. Don't
1088 * reconfigure the flow route, but simply attempt
1089 * to resolve it next time to trigger a probe.
1090 */
1091 os_atomic_inc(&fr->fr_want_probe, relaxed);
1092 os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1093 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry stale",
1094 fm->fm_name, sk_sa_ntop(dst, dst_s,
1095 sizeof(dst_s)));
1096 break;
1097
1098 case ROUTE_LLENTRY_CHANGED:
1099 /*
1100 * When the link-layer info has changed; replace
1101 * cached llinfo in the flow route (treat this
1102 * as ROUTE_LLENTRY_RESOLVED).
1103 */
1104 OS_FALLTHROUGH;
1105
1106 case ROUTE_LLENTRY_RESOLVED:
1107 /*
1108 * SDL address length may be 0 for cellular.
1109 * If Ethernet, copy into flow route and mark
1110 * it as cached. In all cases, mark the flow
1111 * route as resolved.
1112 */
1113 ASSERT(SDL(gw_addr)->sdl_family == AF_LINK);
1114 if (SDL(gw_addr)->sdl_alen == ETHER_ADDR_LEN) {
1115 FLOWRT_UPD_ETH_DST(fr, LLADDR(SDL(gw_addr)));
1116 SK_DF(SK_VERB_FLOW_ROUTE,
1117 "%s: dst %s llentry %s", fm->fm_name,
1118 sk_sa_ntop(dst, dst_s, sizeof(dst_s)),
1119 (!(fr->fr_flags & FLOWRTF_HAS_LLINFO) ?
1120 "resolved" : "changed"));
1121 os_atomic_or(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
1122 } else {
1123 os_atomic_andnot(&fr->fr_flags, FLOWRTF_HAS_LLINFO, relaxed);
1124 }
1125 os_atomic_or(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1126#if SK_LOG
1127 if (__improbable((sk_verbose & SK_VERB_FLOW_ROUTE) !=
1128 0) && (fr->fr_flags & FLOWRTF_HAS_LLINFO)) {
1129 SK_DF(SK_VERB_FLOW_ROUTE,
1130 "%s: fr 0x%llx eth_type 0x%x "
1131 "eth_src %x:%x:%x:%x:%x:%x "
1132 "eth_dst %x:%x:%x:%x:%x:%x [%s])",
1133 fm->fm_name, SK_KVA(fr),
1134 ntohs(fr->fr_eth.ether_type),
1135 fr->fr_eth.ether_shost[0],
1136 fr->fr_eth.ether_shost[1],
1137 fr->fr_eth.ether_shost[2],
1138 fr->fr_eth.ether_shost[3],
1139 fr->fr_eth.ether_shost[4],
1140 fr->fr_eth.ether_shost[5],
1141 fr->fr_eth.ether_dhost[0],
1142 fr->fr_eth.ether_dhost[1],
1143 fr->fr_eth.ether_dhost[2],
1144 fr->fr_eth.ether_dhost[3],
1145 fr->fr_eth.ether_dhost[4],
1146 fr->fr_eth.ether_dhost[5],
1147 sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1148 }
1149#endif /* SK_LOG */
1150 break;
1151
1152 case ROUTE_LLENTRY_DELETED:
1153 /*
1154 * If the route entry points to a router and an
			 * RTM_DELETE has been issued on it, force the
1156 * flow route to be reconfigured.
1157 */
1158 os_atomic_inc(&fr->fr_want_configure, relaxed);
1159 os_atomic_andnot(&fr->fr_flags, (FLOWRTF_HAS_LLINFO | FLOWRTF_RESOLVED), relaxed);
1160 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry deleted",
1161 fm->fm_name, sk_sa_ntop(dst, dst_s,
1162 sizeof(dst_s)));
1163 break;
1164
1165 case ROUTE_LLENTRY_PROBED:
1166 /*
			 * The resolver has begun probing the target;
			 * there is nothing to do here.
1169 */
1170 SK_DF(SK_VERB_FLOW_ROUTE, "%s: dst %s llentry probed",
1171 fm->fm_name, sk_sa_ntop(dst, dst_s,
1172 sizeof(dst_s)));
1173 break;
1174
1175 case ROUTE_LLENTRY_UNREACH:
1176 /*
1177 * When the route entry is marked with RTF_REJECT
1178 * or the probes have timed out, reconfigure.
1179 */
1180 os_atomic_inc(&fr->fr_want_configure, relaxed);
1181 os_atomic_andnot(&fr->fr_flags, FLOWRTF_RESOLVED, relaxed);
1182 SK_ERR("%s: dst %s llentry unreachable", fm->fm_name,
1183 sk_sa_ntop(dst, dst_s, sizeof(dst_s)));
1184 break;
1185
1186 default:
1187 break;
1188 }
1189 } while (0);
1190
1191 if (fr != NULL) {
1192 flow_route_release(fr);
1193 FR_UNLOCK(fr);
1194 }
1195
1196 if (frib != NULL) {
1197 FRIB_UNLOCK(frib);
1198 }
1199
1200 if (fm != NULL) {
1201 flow_mgr_unlock();
1202 }
1203}
1204
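/*
 * Select the default local (source) address for reaching "dst" on "ifp".
 * For IPv4 this is the address of the interface address attached to the
 * route; for IPv6 it goes through in6_selectsrc_core(), honoring the
 * stable vs. temporary address preference.  The interface address
 * generation count is recorded so callers can detect staleness.
 */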
1205int
1206flow_route_select_laddr(union sockaddr_in_4_6 *src, union sockaddr_in_4_6 *dst,
1207 struct ifnet *ifp, struct rtentry *rt, uint32_t *ipaddr_gencnt,
1208 int use_stable_address)
1209{
1210#if SK_LOG
1211 char src_s[MAX_IPv6_STR_LEN]; /* src */
1212 char dst_s[MAX_IPv6_STR_LEN]; /* dst */
1213#endif /* SK_LOG */
1214 sa_family_t af = SA(dst)->sa_family;
1215 struct ifnet *src_ifp = NULL;
1216 struct ifaddr *ifa = NULL;
1217 int err = 0;
1218
1219 /* see comments in flow_route_configure() regarding loopback */
1220 ASSERT(rt->rt_ifp == ifp || rt->rt_ifp == lo_ifp);
1221
1222 switch (af) {
1223 case AF_INET: {
1224 ifnet_lock_shared(ifp);
1225 if (__improbable(rt->rt_ifa->ifa_debug & IFD_DETACHING) != 0) {
1226 err = EHOSTUNREACH;
1227 SK_ERR("route to %s has src address marked detaching "
1228 "(err %d)", inet_ntop(AF_INET,
1229 &SIN(dst)->sin_addr, dst_s, sizeof(dst_s)), err);
1230 ifnet_lock_done(ifp);
1231 break;
1232 }
1233 SIN(src)->sin_len = sizeof(struct sockaddr_in);
1234 SIN(src)->sin_family = AF_INET;
1235 SIN(src)->sin_addr = IA_SIN(rt->rt_ifa)->sin_addr;
1236 ASSERT(SIN(src)->sin_addr.s_addr != INADDR_ANY);
1237 *ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1238 ifnet_lock_done(ifp);
1239 break;
1240 }
1241
1242 case AF_INET6: {
1243 struct in6_addr src_storage, *in6;
1244 struct route_in6 ro = {};
1245 uint32_t hints = (use_stable_address ? 0 : IPV6_SRCSEL_HINT_PREFER_TMPADDR);
1246 ro.ro_rt = rt;
1247
1248 if ((in6 = in6_selectsrc_core(SIN6(dst), hints,
1249 ifp, 0, &src_storage, &src_ifp, &err, &ifa, &ro, FALSE)) == NULL) {
1250 if (err == 0) {
1251 err = EADDRNOTAVAIL;
1252 }
1253 VERIFY(src_ifp == NULL);
1254 SK_ERR("src address to dst %s on %s not available "
1255 "(err %d)", inet_ntop(AF_INET6,
1256 &SIN6(dst)->sin6_addr, dst_s, sizeof(dst_s)),
1257 ifp->if_xname, err);
1258 break;
1259 }
1260
1261 VERIFY(src_ifp != NULL);
1262 VERIFY(ifa != NULL);
1263
1264 if (__improbable(src_ifp != ifp)) {
1265 if (err == 0) {
1266 err = ENETUNREACH;
1267 }
1268 SK_ERR("dst %s, src %s ifp %s != %s (err %d)",
1269 inet_ntop(AF_INET6, &SIN6(dst)->sin6_addr,
1270 dst_s, sizeof(dst_s)),
1271 inet_ntop(AF_INET6, &SIN6(src)->sin6_addr,
1272 src_s, sizeof(src_s)),
1273 src_ifp->if_xname, ifp->if_xname, err);
1274 break;
1275 }
1276
1277 ifnet_lock_shared(ifp);
1278 if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
1279 err = EHOSTUNREACH;
1280 SK_ERR("IPv6 address selected is marked to be "
1281 "detached (err %d)", err);
1282 ifnet_lock_done(ifp);
1283 break;
1284 }
1285
1286 /* clear embedded scope if link-local src */
1287 if (IN6_IS_SCOPE_EMBED(in6)) {
1288 if (in6_embedded_scope) {
1289 SIN6(src)->sin6_scope_id = ntohs(in6->s6_addr16[1]);
1290 in6->s6_addr16[1] = 0;
1291 } else {
1292 SIN6(src)->sin6_scope_id = src_ifp->if_index;
1293 }
1294 }
1295 SIN6(src)->sin6_len = sizeof(struct sockaddr_in6);
1296 SIN6(src)->sin6_family = AF_INET6;
1297 SIN6(src)->sin6_addr = *in6;
1298 ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&SIN6(src)->sin6_addr));
1299 *ipaddr_gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1300 ifnet_lock_done(ifp);
1301 break;
1302 }
1303
1304 default:
1305 VERIFY(0);
1306 /* NOTREACHED */
1307 __builtin_unreachable();
1308 }
1309
1310 if (ifa != NULL) {
1311 ifa_remref(ifa);
1312 }
1313
1314 if (src_ifp != NULL) {
		ifnet_release(src_ifp);
1316 }
1317
1318#if SK_LOG
1319 if (err == 0 && __improbable((sk_verbose & SK_VERB_FLOW_ROUTE) != 0)) {
1320 SK_DF(SK_VERB_FLOW_ROUTE, "src %s to dst %s on %s",
1321 sk_sa_ntop(SA(src), src_s, sizeof(src_s)),
1322 sk_sa_ntop(SA(dst), dst_s, sizeof(dst_s)),
1323 ifp->if_xname);
1324 }
1325#endif /* SK_LOG */
1326
1327 return err;
1328}
1329
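/*
 * Detach a flow route from its routing state: deregister the route event
 * handler (which also drops the fr_rt_dst reference), release any gateway
 * route, and clear the gateway/on-link flags.  Called with fr_lock held.
 */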
1330void
1331flow_route_cleanup(struct flow_route *fr)
1332{
1333#if SK_LOG
	char ss[MAX_IPv6_STR_LEN]; /* src */
1335 char ds[MAX_IPv6_STR_LEN]; /* dst */
1336 char gs[MAX_IPv6_STR_LEN]; /* gw */
1337#endif /* SK_LOG */
1338
1339 FR_LOCK_ASSERT_HELD(fr);
1340
1341 if (fr->fr_rt_evhdlr_tag != NULL) {
1342 ASSERT(fr->fr_rt_dst != NULL);
1343 route_event_enqueue_nwk_wq_entry(fr->fr_rt_dst, NULL,
1344 ROUTE_EVHDLR_DEREGISTER, fr->fr_rt_evhdlr_tag, FALSE);
1345 fr->fr_rt_evhdlr_tag = NULL;
1346 fr->fr_rt_dst = NULL;
1347 }
1348 ASSERT(fr->fr_rt_dst == NULL);
1349 if (fr->fr_rt_gw != NULL) {
1350 rtfree(fr->fr_rt_gw);
1351 fr->fr_rt_gw = NULL;
1352 }
1353
1354#if SK_LOG
1355 if (fr->fr_flags & FLOWRTF_GATEWAY) {
1356 SK_DF(SK_VERB_FLOW_ROUTE,
1357 "clean fr 0x%llx %s -> %s via gw %s", SK_KVA(fr),
1358 sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
1359 sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)),
1360 sk_sa_ntop(SA(&fr->fr_gaddr), gs, sizeof(gs)));
1361 } else if (fr->fr_flags & FLOWRTF_ONLINK) {
1362 SK_DF(SK_VERB_FLOW_ROUTE,
1363 "clean fr 0x%llx %s -> %s", SK_KVA(fr),
1364 sk_sa_ntop(SA(&fr->fr_laddr), ss, sizeof(ss)),
1365 sk_sa_ntop(SA(&fr->fr_faddr), ds, sizeof(ds)));
1366 }
1367#endif /* SK_LOG */
1368
1369 os_atomic_andnot(&fr->fr_flags, (FLOWRTF_GATEWAY | FLOWRTF_ONLINK), relaxed);
1370}
1371
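/*
 * Check that the given local IP address is still usable on the interface:
 * it must be assigned there, not detaching, and (for IPv6) ready and not
 * reserved for CLAT46; IPv4 binds are rejected outright on CLAT46
 * interfaces.  On success, the current interface address generation count
 * is returned through *gencnt.
 */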
1372static boolean_t
1373_flow_route_laddr_validate(struct flow_ip_addr *src_ip0, uint8_t ip_v,
1374 struct ifnet *ifp, uint32_t *gencnt)
1375{
1376 boolean_t address_found = TRUE;
1377 struct ifaddr *ifa = NULL;
1378 struct flow_ip_addr src_ip = {};
1379 uint32_t scope = ifp->if_index;
1380
1381 VERIFY(gencnt != NULL);
1382 VERIFY(ip_v == IPVERSION || ip_v == IPV6_VERSION);
1383
1384 if (ip_v == IPVERSION) {
		memcpy(&src_ip._v4, &src_ip0->_v4, sizeof(src_ip._v4));
1386
1387 ifa = (struct ifaddr *)ifa_foraddr_scoped(
1388 src_ip._v4.s_addr, scope);
1389 } else {
		memcpy(&src_ip, src_ip0, sizeof(*src_ip0));
1391
1392 if (in6_embedded_scope && IN6_IS_SCOPE_EMBED(&src_ip._v6)) {
1393 src_ip._v6.s6_addr16[1] = htons((uint16_t)scope);
1394 }
1395 ifa = (struct ifaddr *)ifa_foraddr6_scoped(&src_ip._v6,
1396 scope);
1397 }
1398
1399 if (__improbable(ifa == NULL)) {
1400 address_found = FALSE;
1401 goto done;
1402 }
1403
1404 ifnet_lock_shared(ifp);
1405 if (__improbable(ifa->ifa_debug & IFD_DETACHING) != 0) {
1406 address_found = FALSE;
1407 ifnet_lock_done(ifp);
1408 goto done;
1409 }
1410
1411 if (ip_v == IPV6_VERSION) {
1412 struct in6_ifaddr *ia6 = (struct in6_ifaddr *)ifa;
1413
1414 /*
1415 * Fail if IPv6 address is not ready or if the address
		 * is reserved for CLAT46.
1417 */
1418 if (__improbable(ia6->ia6_flags &
1419 (IN6_IFF_NOTREADY | IN6_IFF_CLAT46)) != 0) {
1420 address_found = FALSE;
1421 ifnet_lock_done(ifp);
1422 goto done;
1423 }
1424 } else {
1425 /*
1426 * If interface has CLAT46 enabled, fail IPv4 bind.
1427 * Since this implies network is NAT64/DNS64, Internet
1428 * effectively becomes reachable over IPv6. If on
1429 * system IPv4 to IPv6 translation is required, that
1430 * should be handled solely through bump in the API.
1431 * The in kernel translation is only done for apps
1432 * directly using low level networking APIs.
1433 */
1434 if (__improbable(IS_INTF_CLAT46(ifp))) {
1435 address_found = FALSE;
1436 ifnet_lock_done(ifp);
1437 goto done;
1438 }
1439 }
1440
1441 *gencnt = ifp->if_nx_flowswitch.if_fsw_ipaddr_gencnt;
1442 ifnet_lock_done(ifp);
1443done:
1444 if (ifa != NULL) {
1445 ifa_remref(ifa);
1446 }
1447
1448 return address_found;
1449}
1450
1451boolean_t
1452flow_route_laddr_validate(union sockaddr_in_4_6 *saddr, struct ifnet *ifp,
1453 uint32_t *gencnt)
1454{
1455 VERIFY(saddr->sa.sa_family == AF_INET ||
1456 saddr->sa.sa_family == AF_INET6);
1457
1458 struct flow_ip_addr *ipa;
1459 uint8_t ipv;
1460 if (saddr->sa.sa_family == AF_INET) {
1461 ipv = IPVERSION;
1462 ipa = (struct flow_ip_addr *)(void *)&saddr->sin.sin_addr;
1463 } else {
1464 ipv = IPV6_VERSION;
1465 ipa = (struct flow_ip_addr *)(void *)&saddr->sin6.sin6_addr;
1466 }
1467
	return _flow_route_laddr_validate(ipa, ipv, ifp, gencnt);
1469}
1470
1471boolean_t
1472flow_route_key_validate(struct flow_key *fk, struct ifnet *ifp,
1473 uint32_t *gencnt)
1474{
	return _flow_route_laddr_validate(&fk->fk_src, fk->fk_ipver, ifp,
	    gencnt);
1477}
1478