/*
 * Copyright (c) 2017-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/* $FreeBSD: src/sys/netinet6/frag6.c,v 1.2.2.5 2001/07/03 11:01:50 ume Exp $ */
/* $KAME: frag6.c,v 1.31 2001/05/17 13:45:34 jinmei Exp $ */

/*
 * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * @file
 * flowswitch IP Reassembly for both v4 and v6
 *
 * Implementation of IP fragment reassembly for flowswitch packets.
 *
 */

#include <sys/domain.h>
#include <netinet/in.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <skywalk/os_skywalk_private.h>
#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
#include <skywalk/nexus/flowswitch/fsw_var.h>

#define IPFM_MAX_FRAGS_PER_QUEUE 128    /* RFC 791 64K/(512 min MTU) */
#define IPFM_MAX_QUEUES 1024            /* same as ip/ip6 */
#define IPFM_FRAG_TTL 60                /* RFC 2460 */
#define IPFM_TIMEOUT_TCALL_INTERVAL 1

static uint32_t ipfm_max_frags_per_queue = IPFM_MAX_FRAGS_PER_QUEUE;
static uint32_t ipfm_frag_ttl = IPFM_FRAG_TTL;
static uint32_t ipfm_timeout_tcall_ival = IPFM_TIMEOUT_TCALL_INTERVAL;

#if (DEVELOPMENT || DEBUG)
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
    ipfm_max_frags_per_queue, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ipfm_max_frags_per_queue, 0, "");
#endif /* !DEVELOPMENT && !DEBUG */

SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO, ipfm_frag_ttl,
    CTLFLAG_RW | CTLFLAG_LOCKED, &ipfm_frag_ttl, 0, "");
SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
    ipfm_timeout_tcall_ival, CTLFLAG_RW | CTLFLAG_LOCKED,
    &ipfm_timeout_tcall_ival, 0, "");
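
/*
 * The knobs above are published under the kern.skywalk.flowswitch sysctl
 * node (assuming the usual skywalk sysctl layout), e.g.:
 *
 *	sysctl kern.skywalk.flowswitch.ipfm_frag_ttl
 *
 * ipfm_max_frags_per_queue is only exposed on DEVELOPMENT/DEBUG kernels.
 */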

static LCK_GRP_DECLARE(fsw_ipfm_lock_group, "sk_fsw_ipfm_lock");
static LCK_ATTR_DECLARE(fsw_ipfm_lock_attr, 0, 0);

/* @internal ip fragment wrapper (chained in an ipfq) for __kern_packet */
struct ipf {
	struct ipf *ipf_down;
	struct ipf *ipf_up;
	struct __kern_packet *ipf_pkt;
	int ipf_len;                    /* fragmentable part length */
	int ipf_off;                    /* fragment offset */
	uint16_t ipf_mff;               /* more fragment bit in frag off */
};

/* @internal ip fragment lookup key */
struct ipf_key {
	uint64_t ipfk_addr[4];          /* src + dst ip addr (v4/v6) */
	uint32_t ipfk_ident;            /* IP identification */
	uint16_t ipfk_len;              /* len of ipfk_addr field */
};

enum {
	IPFK_LEN_V4 = 2 * sizeof(struct in_addr),
	IPFK_LEN_V6 = 2 * sizeof(struct in6_addr),
};
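
/*
 * Key construction (see fsw_ip_frag_reass_v4/v6): for IPv4 the 8 bytes of
 * ip_src + ip_dst are copied into ipfk_addr and ipfk_ident is ip_id; for
 * IPv6 the 32 bytes of ip6_src + ip6_dst are copied and ipfk_ident is
 * ip6f_ident.  ipfk_len records how many ipfk_addr bytes are valid, so a
 * v4 key can never match a v6 key.
 */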

/*
 * @internal
 * IP reassembly queue structure.  Each fragment (struct ipf)
 * being reassembled is attached to one of these structures.
 */
struct ipfq {
	struct ipf *ipfq_down;          /* fragment chain */
	struct ipf *ipfq_up;
	struct ipfq *ipfq_next;         /* queue chain */
	struct ipfq *ipfq_prev;
	uint64_t ipfq_timestamp;        /* time of creation */
	struct ipf_key ipfq_key;        /* ipfq search key */
	uint16_t ipfq_nfrag;            /* # of fragments in queue */
	int ipfq_unfraglen;             /* len of unfragmentable part */
	bool ipfq_is_dirty;             /* q is dirty, don't use */
};
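
/*
 * Illustrative layout of the reassembly state:
 *
 *	mgr->ipfm_q  <->  ipfq A  <->  ipfq B  <-> ...     (ipfq_next/prev)
 *	                    |
 *	                  ipf@0 <-> ipf@1480 <-> ...       (ipf_down/up,
 *	                    |                               offset order)
 *	                  __kern_packet (ipf_pkt)
 *
 * Both lists are circular and doubly linked, with the ipfq itself doubling
 * as the head of its own fragment list.
 */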

/*
 * @internal (externally opaque)
 * flowswitch IP Fragment Manager
 */
struct fsw_ip_frag_mgr {
	struct skoid ipfm_skoid;
	struct ipfq ipfm_q;             /* ip reassembly queues */
	uint32_t ipfm_q_limit;          /* limit # of reass queues */
	uint32_t ipfm_q_count;          /* # of allocated reass queues */
	uint32_t ipfm_f_limit;          /* limit # of ipfs */
	uint32_t ipfm_f_count;          /* current # of allocated ipfs */
	decl_lck_mtx_data(, ipfm_lock); /* guard reass and timeout cleanup */
	thread_call_t ipfm_timeout_tcall; /* frag timeout thread */

	struct ifnet *ipfm_ifp;
	struct fsw_stats *ipfm_stats;   /* indirect stats in fsw */
};
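
/*
 * Locking: ipfm_lock serializes all queue/fragment manipulation, including
 * ipf_process(), the ipfq_timeout() thread call and the drain sysctl.  ICMP
 * errors generated while the lock is held go through
 * sk_async_transmit_protect() (see ipf_icmp6_error_flag() below) so that the
 * driver is never entered directly from this context.
 */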

static int ipf_process(struct fsw_ip_frag_mgr *, struct __kern_packet **,
    struct ipf_key *, uint16_t, uint16_t, uint16_t, uint16_t, uint16_t *,
    uint16_t *);
static int ipf_key_cmp(struct ipf_key *, struct ipf_key *);
static void ipf_enq(struct ipf *, struct ipf *);
static void ipf_deq(struct ipf *);
static void ipfq_insque(struct ipfq *, struct ipfq *);
static void ipfq_remque(struct ipfq *);
static uint32_t ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *,
    void (*)(struct fsw_ip_frag_mgr *, struct ipf *));

static void ipfq_timeout(thread_call_param_t, thread_call_param_t);
static void ipfq_sched_timeout(struct fsw_ip_frag_mgr *, boolean_t);

static struct ipfq *ipfq_alloc(struct fsw_ip_frag_mgr *mgr);
static void ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q);
static uint32_t ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *));
static struct ipf *ipf_alloc(struct fsw_ip_frag_mgr *mgr);
static void ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f);
static void ipf_free_pkt(struct ipf *f);
static void ipfq_drain(struct fsw_ip_frag_mgr *mgr);
static void ipfq_reap(struct fsw_ip_frag_mgr *mgr);
static int ipfq_drain_sysctl SYSCTL_HANDLER_ARGS;
void ipf_icmp_param_err(struct fsw_ip_frag_mgr *, struct __kern_packet *pkt,
    int param);
void ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *, struct ipf *f);

/* Create a flowswitch IP fragment manager. */
struct fsw_ip_frag_mgr *
fsw_ip_frag_mgr_create(struct nx_flowswitch *fsw, struct ifnet *ifp,
    size_t f_limit)
{
	struct fsw_ip_frag_mgr *mgr;

	ASSERT(ifp != NULL);

	mgr = sk_alloc_type(struct fsw_ip_frag_mgr, Z_WAITOK | Z_NOFAIL,
	    skmem_tag_fsw_frag_mgr);

	mgr->ipfm_q.ipfq_next = mgr->ipfm_q.ipfq_prev = &mgr->ipfm_q;
	lck_mtx_init(&mgr->ipfm_lock, &fsw_ipfm_lock_group, &fsw_ipfm_lock_attr);

	mgr->ipfm_timeout_tcall =
	    thread_call_allocate_with_options(ipfq_timeout, mgr,
	    THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
	VERIFY(mgr->ipfm_timeout_tcall != NULL);

	mgr->ipfm_ifp = ifp;
	mgr->ipfm_stats = &fsw->fsw_stats;

	/* Use caller provided limit (caller knows pool size) */
	ASSERT(f_limit >= 2 && f_limit < UINT32_MAX);
	mgr->ipfm_f_limit = (uint32_t)f_limit;
	mgr->ipfm_f_count = 0;
	mgr->ipfm_q_limit = MIN(IPFM_MAX_QUEUES, mgr->ipfm_f_limit / 2);
	mgr->ipfm_q_count = 0;

	skoid_create(&mgr->ipfm_skoid, SKOID_DNODE(fsw->fsw_skoid), "ipfm", 0);
	skoid_add_uint(&mgr->ipfm_skoid, "frag_limit", CTLFLAG_RW,
	    &mgr->ipfm_f_limit);
	skoid_add_uint(&mgr->ipfm_skoid, "frag_count", CTLFLAG_RD,
	    &mgr->ipfm_f_count);
	skoid_add_uint(&mgr->ipfm_skoid, "queue_limit", CTLFLAG_RW,
	    &mgr->ipfm_q_limit);
	skoid_add_uint(&mgr->ipfm_skoid, "queue_count", CTLFLAG_RD,
	    &mgr->ipfm_q_count);
	skoid_add_handler(&mgr->ipfm_skoid, "drain", CTLFLAG_RW,
	    ipfq_drain_sysctl, mgr, 0);

	return mgr;
}
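
/*
 * Typical lifecycle, as an illustrative sketch only (the flowswitch wires
 * this up in its own attach/detach and RX paths):
 *
 *	mgr = fsw_ip_frag_mgr_create(fsw, ifp, frag_pool_limit);
 *	...
 *	err = fsw_ip_frag_reass_v4(mgr, &pkt, ip4, &nfrags, &tlen);
 *	...
 *	fsw_ip_frag_mgr_destroy(mgr);
 */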

/* Free a flowswitch IP fragment manager. */
void
fsw_ip_frag_mgr_destroy(struct fsw_ip_frag_mgr *mgr)
{
	thread_call_t tcall;

	lck_mtx_lock(&mgr->ipfm_lock);
	if ((tcall = mgr->ipfm_timeout_tcall) != NULL) {
		lck_mtx_unlock(&mgr->ipfm_lock);
		(void) thread_call_cancel_wait(tcall);
		(void) thread_call_free(tcall);
		mgr->ipfm_timeout_tcall = NULL;
		lck_mtx_lock(&mgr->ipfm_lock);
	}

	ipfq_drain(mgr);

	lck_mtx_unlock(&mgr->ipfm_lock);
	lck_mtx_destroy(&mgr->ipfm_lock, &fsw_ipfm_lock_group);

	skoid_destroy(&mgr->ipfm_skoid);
	sk_free_type(struct fsw_ip_frag_mgr, mgr);
}

/*
 * Reassemble a received IPv4 fragment.
 *
 * @param mgr
 *   fragment manager
 * @param pkt
 *   received packet (must have IPv4 header validated)
 * @param ip4
 *   pointer to the packet's IPv4 header
 * @param nfrags
 *   number of fragments reassembled
 * @param tlen
 *   incremented by the fragment part length of each chained fragment
 *   (see ipf_process())
 * @return
 *   Successfully processed (not fully reassembled)
 *     ret = 0, *pkt = NULL (ipfm owns it), *nfrags = 0
 *   Successfully reassembled
 *     ret = 0, *pkt = 1st fragment (fragments chained in order by pkt_nextpkt)
 *     *nfrags = number of all fragments (>0)
 *   Error
 *     ret != 0 && *pkt unmodified (caller to decide what to do with *pkt)
 *     *nfrags = 0
 */
int
fsw_ip_frag_reass_v4(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
    struct ip *ip4, uint16_t *nfrags, uint16_t *tlen)
{
	struct ipf_key key;
	uint16_t unfragpartlen, offflag, fragoff, fragpartlen, fragflag;
	int err;

	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V4);

	bcopy((void *)&ip4->ip_src, (void *)key.ipfk_addr, IPFK_LEN_V4);
	key.ipfk_len = IPFK_LEN_V4;
	key.ipfk_ident = (uint32_t)ip4->ip_id;

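	/*
	 * ip_off carries the flags in its top 3 bits and the fragment offset,
	 * in 8-byte units, in the low 13 bits; masking with IP_OFFMASK and
	 * shifting left by 3 yields the offset in bytes.  For example, an
	 * ip_off of 0x2005 (host order) means IP_MF is set and the fragment
	 * starts at byte 40.
	 */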
	unfragpartlen = (uint16_t)(ip4->ip_hl << 2);
	offflag = ntohs(ip4->ip_off);
	fragoff = (uint16_t)((offflag & IP_OFFMASK) << 3);
	fragpartlen = ntohs(ip4->ip_len) - unfragpartlen;
	fragflag = offflag & IP_MF;

	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
	    fragflag, nfrags, tlen);

	/*
	 * If the packet has been reassembled, compute the user data length.
	 */
	if (*pkt != NULL) {
		struct __kern_packet *p = *pkt;
		struct ip *iph = (struct ip *)p->pkt_flow_ip_hdr;

		p->pkt_flow_ulen = ntohs(iph->ip_len) -
		    p->pkt_flow_ip_hlen - p->pkt_flow->flow_l4._l4_hlen;
	}
	return err;
}
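
/*
 * Illustrative caller-side handling of the contract above (a sketch, not
 * the actual flowswitch RX path):
 *
 *	uint16_t nfrags = 0, tlen = 0;
 *	int err = fsw_ip_frag_reass_v4(mgr, &pkt, ip4, &nfrags, &tlen);
 *	if (err != 0) {
 *		// caller still owns pkt; typically count and drop it
 *	} else if (pkt == NULL) {
 *		// fragment absorbed by ipfm; nothing more to do
 *	} else {
 *		// reassembly complete: pkt heads a pkt_nextpkt chain of
 *		// nfrags fragments in offset order
 *	}
 */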

/*
 * Reassemble a received IPv6 fragment.
 *
 * @param mgr
 *   fragment manager
 * @param pkt
 *   received packet (must have IPv6 header validated)
 * @param ip6
 *   pointer to the packet's IPv6 header
 * @param ip6f
 *   pointer to the packet's IPv6 Fragment Header
 * @param nfrags
 *   number of fragments reassembled
 * @param tlen
 *   incremented by the fragment part length of each chained fragment
 *   (see ipf_process())
 * @return
 *   Successfully processed (not fully reassembled)
 *     ret = 0, *pkt = NULL (ipfm owns it), *nfrags = 0
 *   Successfully reassembled
 *     ret = 0, *pkt = 1st fragment (fragments chained in order by pkt_nextpkt)
 *     *nfrags = number of all fragments (>0)
 *   Error
 *     ret != 0 && *pkt unmodified (caller to decide what to do with *pkt)
 *     *nfrags = 0
 */
int
fsw_ip_frag_reass_v6(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt,
    struct ip6_hdr *ip6, struct ip6_frag *ip6f, uint16_t *nfrags,
    uint16_t *tlen)
{
	struct ipf_key key;
	ptrdiff_t ip6f_ptroff = (uintptr_t)ip6f - (uintptr_t)ip6;
	uint16_t ip6f_off, fragoff, fragpartlen, unfragpartlen, fragflag;
	int err;

	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V6);

	/* jumbo payload can't contain a fragment header */
	if (ip6->ip6_plen == 0) {
		*nfrags = 0;
		return ERANGE;
	}

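	/*
	 * ip6f_off is the byte offset of the fragment header within the
	 * packet; everything before it is the unfragmentable part.  The
	 * 13-bit fragment offset sits in the upper bits of ip6f_offlg, so
	 * the masked, byte-swapped value is already the offset in bytes.
	 * For example, with a 1456-byte payload whose fragment header
	 * immediately follows the 40-byte IPv6 header (ip6f_off == 40),
	 * fragpartlen below works out to 1456 - 8 == 1448.
	 */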
	ASSERT(ip6f_ptroff < UINT16_MAX);
	ip6f_off = (uint16_t)ip6f_ptroff;
	fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
	fragpartlen = ntohs(ip6->ip6_plen) -
	    (ip6f_off + sizeof(struct ip6_frag) - sizeof(struct ip6_hdr));
	unfragpartlen = ip6f_off;
	fragflag = ip6f->ip6f_offlg & IP6F_MORE_FRAG;

	/*
	 * RFC 6946: Handle "atomic" fragments (offset and m bit set to 0)
	 * upfront, unrelated to any reassembly.
	 *
	 * The flow classifier should process those as non-fragments, so
	 * ipfm should never see them.
	 */
	ASSERT((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) != 0);

	bcopy((void *)&ip6->ip6_src, (void *)key.ipfk_addr, IPFK_LEN_V6);
	key.ipfk_len = IPFK_LEN_V6;
	key.ipfk_ident = ip6f->ip6f_ident;

	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
	    fragflag, nfrags, tlen);

	/*
	 * If the packet has been reassembled, compute the user data length.
	 */
	if (*pkt != NULL) {
		struct __kern_packet *p = *pkt;
		struct ip6_hdr *ip6h = (struct ip6_hdr *)p->pkt_flow_ip_hdr;

		p->pkt_flow_ulen = ntohs(ip6h->ip6_plen) -
		    p->pkt_flow->flow_l4._l4_hlen;
	}
	return err;
}

static struct mbuf *
ipf_pkt2mbuf(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt)
{
	unsigned int one = 1;
	struct mbuf *m = NULL;
	uint8_t *buf;
	struct ip6_hdr *ip6;
	uint32_t l3t_len;
	int err;

	l3t_len = pkt->pkt_length - pkt->pkt_l2_len;
	if (pkt->pkt_link_flags & PKT_LINKF_ETHFCS) {
		l3t_len -= ETHER_CRC_LEN;
	}

	err = mbuf_allocpacket(MBUF_WAITOK, l3t_len, &one, &m);
	VERIFY(err == 0);
	ASSERT(l3t_len <= mbuf_maxlen(m));

	if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
		bcopy(m_mtod_current(pkt->pkt_mbuf) + pkt->pkt_l2_len,
		    m_mtod_current(m), l3t_len);
	} else {
		MD_BUFLET_ADDR_ABS(pkt, buf);
		buf += (pkt->pkt_headroom + pkt->pkt_l2_len);
		bcopy(buf, m_mtod_current(m), l3t_len);
	}
	m->m_pkthdr.len = m->m_len = l3t_len;

	ip6 = mtod(m, struct ip6_hdr *);
	/* note for casting: IN6_IS_SCOPE_ doesn't need alignment */
	if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)(uintptr_t)&ip6->ip6_src)) {
		if (in6_embedded_scope) {
			ip6->ip6_src.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
		}
		ip6_output_setsrcifscope(m, mgr->ipfm_ifp->if_index, NULL);
	}
	if (IN6_IS_SCOPE_EMBED((struct in6_addr *)(uintptr_t)&ip6->ip6_dst)) {
		if (in6_embedded_scope) {
			ip6->ip6_dst.s6_addr16[1] = htons(mgr->ipfm_ifp->if_index);
		}
		ip6_output_setdstifscope(m, mgr->ipfm_ifp->if_index, NULL);
	}

	return m;
}

/*
 * Since this function can be called while holding fsw_ip_frag_mgr.ipfm_lock,
 * we must not enter the driver directly: a deadlock can occur if this same
 * thread then tries to take the workloop lock.
 */
static void
ipf_icmp6_error_flag(struct mbuf *m, int type, int code, int param, int flags)
{
	sk_protect_t protect = sk_async_transmit_protect();
	icmp6_error_flag(m, type, code, param, flags);
	sk_async_transmit_unprotect(protect);
}

/*
 * @internal IP fragment ICMP parameter problem error handling
 *
 * @param param_offset
 *   offending parameter offset, only applicable to ICMPv6
 */
void
ipf_icmp_param_err(struct fsw_ip_frag_mgr *mgr, struct __kern_packet *pkt,
    int param_offset)
{
	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
		return;
	}

	struct mbuf *m = NULL;
	m = ipf_pkt2mbuf(mgr, pkt);
	if (__probable(m != NULL)) {
		ipf_icmp6_error_flag(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
		    param_offset, 0);
	}

	/* m is freed by icmp6_error_flag() */
}

/* @internal IP fragment ICMP timeout error handling */
void
ipf_icmp_timeout_err(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
{
	struct __kern_packet *pkt = f->ipf_pkt;
	ASSERT(pkt != NULL);

	/* no icmp error packet for ipv4 */
	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
		return;
	}

	/* only for the first fragment */
	if (f->ipf_off != 0) {
		return;
	}

	struct mbuf *m = NULL;
	m = ipf_pkt2mbuf(mgr, pkt);
	if (__probable(m != NULL)) {
		ipf_icmp6_error_flag(m, ICMP6_TIME_EXCEEDED,
		    ICMP6_TIME_EXCEED_REASSEMBLY, 0, 0);
	}

	/* m is freed by icmp6_error_flag() */
}

/* @internal IP fragment processing, v4/v6 agnostic */
int
ipf_process(struct fsw_ip_frag_mgr *mgr, struct __kern_packet **pkt_ptr,
    struct ipf_key *key, uint16_t unfraglen, uint16_t fragoff,
    uint16_t fragpartlen, uint16_t fragflag, uint16_t *nfrags, uint16_t *tlen)
{
	struct __kern_packet *pkt = *pkt_ptr;
	struct __kern_packet *pkt_reassed = NULL;
	struct ipfq *q, *mq = &mgr->ipfm_q;
	struct ipf *f, *f_new, *f_down;
	uint32_t nfrags_freed;
	int next;
	int first_frag = 0;
	int err = 0;
	int local_ipfq_unfraglen;

	*nfrags = 0;

	SK_DF(SK_VERB_IP_FRAG, "id %5d fragoff %5d fragpartlen %5d "
	    "fragflag 0x%x", key->ipfk_ident, fragoff, fragpartlen, fragflag);

	/*
	 * Make sure that all fragments except the last one have a data
	 * length that's a non-zero multiple of 8 bytes.
	 */
	if (fragflag && (fragpartlen == 0 || (fragpartlen & 0x7) != 0)) {
		SK_DF(SK_VERB_IP_FRAG, "frag not multiple of 8 bytes");
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_BAD_LEN);
		ipf_icmp_param_err(mgr, pkt,
		    offsetof(struct ip6_hdr, ip6_plen));
		return ERANGE;
	}

	lck_mtx_lock(&mgr->ipfm_lock);

	/* find ipfq */
	for (q = mq->ipfq_next; q != mq; q = q->ipfq_next) {
		if (ipf_key_cmp(key, &q->ipfq_key) == 0) {
			if (q->ipfq_is_dirty) {
				SK_DF(SK_VERB_IP_FRAG, "found dirty q, skip");
				err = EINVAL;
				goto done;
			}
			break;
		}
	}

	/* not found, create new ipfq */
	if (q == mq) {
		first_frag = 1;

		q = ipfq_alloc(mgr);
		if (q == NULL) {
			STATS_INC(mgr->ipfm_stats,
			    FSW_STATS_RX_FRAG_DROP_NOMEM);
			err = ENOMEM;
			goto done;
		}

		ipfq_insque(q, mq);
		net_update_uptime();

		bcopy(key, &q->ipfq_key, sizeof(struct ipf_key));
		q->ipfq_down = q->ipfq_up = (struct ipf *)q;
		q->ipfq_unfraglen = -1; /* The 1st fragment has not arrived. */
		q->ipfq_nfrag = 0;
		q->ipfq_timestamp = _net_uptime;
	}

	ASSERT(!q->ipfq_is_dirty);

	/* this queue has reached the per-queue frag limit */
	if (q->ipfq_nfrag > ipfm_max_frags_per_queue) {
		nfrags_freed = ipfq_freefq(mgr, q, NULL);
		STATS_ADD(mgr->ipfm_stats,
		    FSW_STATS_RX_FRAG_DROP_PER_QUEUE_LIMIT, nfrags_freed);
		err = ENOMEM;
		goto done;
	}

	local_ipfq_unfraglen = q->ipfq_unfraglen;

	/*
	 * If this is the 1st fragment, record the length of the
	 * unfragmentable part and the next header of the fragment header.
	 * Assume the first fragment to arrive will be correct.
	 * We do not have any duplicate checks here yet, so another packet
	 * with fragoff == 0 could come along and overwrite ipfq_unfraglen
	 * (and, worse, the next header) at any time.
	 */
	if (fragoff == 0 && local_ipfq_unfraglen == -1) {
		local_ipfq_unfraglen = unfraglen;
	}

	/* Check that the reassembled packet would not exceed 65535 bytes. */
	if (local_ipfq_unfraglen + fragoff + fragpartlen > IP_MAXPACKET) {
		SK_DF(SK_VERB_IP_FRAG, "frag too big");
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
		ipf_icmp_param_err(mgr, pkt, sizeof(struct ip6_hdr) +
		    offsetof(struct ip6_frag, ip6f_offlg));
		err = ERANGE;
		goto done;
	}

	/*
	 * If this is the 1st fragment, do the above check for each
	 * fragment already stored in the reassembly queue.
	 * If an error is found, still return 0, since we don't return
	 * ownership of a chain of offending packets back to the caller.
	 */
	if (fragoff == 0) {
		for (f = q->ipfq_down; f != (struct ipf *)q; f = f_down) {
			f_down = f->ipf_down;
			if (local_ipfq_unfraglen + f->ipf_off + f->ipf_len >
			    IP_MAXPACKET) {
				SK_DF(SK_VERB_IP_FRAG, "frag too big");
				STATS_INC(mgr->ipfm_stats,
				    FSW_STATS_RX_FRAG_BAD);
				ipf_deq(f);
				ipf_free_pkt(f);
				ipf_free(mgr, f);
			}
		}
	}

	f_new = ipf_alloc(mgr);
	if (f_new == NULL) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_NOMEM);
		err = ENOMEM;
		goto done;
	}

	f_new->ipf_mff = fragflag;
	f_new->ipf_off = fragoff;
	f_new->ipf_len = fragpartlen;
	f_new->ipf_pkt = pkt;

	if (first_frag) {
		f = (struct ipf *)q;
		goto insert;
	}

	/* Find a segment which begins after this one does. */
	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
		if (f->ipf_off > f_new->ipf_off) {
			break;
		}
	}

	/*
	 * If any of the fragments being reassembled overlap with any
	 * other fragments being reassembled for the same packet,
	 * reassembly of that packet must be abandoned and all the
	 * fragments that have been received for that packet must be
	 * discarded, and no ICMP error messages should be sent.
	 *
	 * It should be noted that fragments may be duplicated in the
	 * network.  Instead of treating these exact duplicate fragments
	 * as overlapping fragments, an implementation may choose to
	 * detect this case and drop exact duplicate fragments while
	 * keeping the other fragments belonging to the same packet.
	 *
	 * https://tools.ietf.org/html/rfc8200#appendix-B
	 *
	 * We apply this rule for both IPv4 and IPv6 here.
	 */
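	/*
	 * At this point f is the first fragment that begins after f_new
	 * (or the queue head if none does), so f->ipf_up is the fragment
	 * that would sit immediately before f_new.  Check f_new against
	 * both of those neighbors for overlap.
	 */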
	if (((f->ipf_up != (struct ipf *)q) && /* prev frag spans into f_new */
	    (f->ipf_up->ipf_off + f->ipf_up->ipf_len - f_new->ipf_off > 0)) ||
	    ((f != (struct ipf *)q) &&         /* f_new spans into next */
	    (f_new->ipf_off + f_new->ipf_len - f->ipf_off > 0))) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
		/* Check for an exact duplicate offset/length */
		if (((f->ipf_up != (struct ipf *)q) &&
		    ((f->ipf_up->ipf_off != f_new->ipf_off) ||
		    (f->ipf_up->ipf_len != f_new->ipf_len))) ||
		    ((f != (struct ipf *)q) &&
		    ((f->ipf_off != f_new->ipf_off) ||
		    (f->ipf_len != f_new->ipf_len)))) {
			SK_DF(SK_VERB_IP_FRAG, "frag overlap");
			ipf_free(mgr, f_new);
			/* give up on the queue with overlapping fragments */
			SK_DF(SK_VERB_IP_FRAG, "free overlapping queue");
			ipfq_freef(mgr, q, NULL);
			q->ipfq_is_dirty = true;
		} else {
			ipf_free(mgr, f_new);
			SK_DF(SK_VERB_IP_FRAG, "frag dup");
		}
		err = ERANGE;
		goto done;
	}

insert:
	q->ipfq_unfraglen = local_ipfq_unfraglen;

	/*
	 * Stick the new segment in its place and check for complete
	 * reassembly.
	 */
	ipf_enq(f_new, f->ipf_up);
	q->ipfq_nfrag++;
	next = 0;
	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
		/* there is a hole */
		if (f->ipf_off != next) {
			goto done;
		}
		next += f->ipf_len;
	}
	/* we haven't received the last fragment yet */
	if (f->ipf_up->ipf_mff) {
		goto done;
	}

	/*
	 * Reassembly is complete; concatenate the fragments.
	 */
	f = q->ipfq_down;
	f_down = f->ipf_down;
	pkt_reassed = f->ipf_pkt;
	*nfrags = 1;
	while (f_down != (struct ipf *)q) {
		/* chain __kern_packet with the pkt_nextpkt ptr */
		f->ipf_pkt->pkt_nextpkt = f_down->ipf_pkt;
		(*nfrags)++;
		(*tlen) += f_down->ipf_len;
		f_down = f->ipf_down;
		ipf_deq(f);
		ipf_free(mgr, f);
		f = f_down;
		f_down = f->ipf_down;
	}
	ipf_free(mgr, f);

	err = 0;
	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_REASSED);
	ipfq_remque(q);
	ipfq_free(mgr, q);

done:
	/* ipfm takes ownership, or returns the assembled packet, on success */
	if (err == 0) {
		/* reass'ed packet if done; NULL otherwise */
		*pkt_ptr = pkt_reassed;
	}
	ipfq_sched_timeout(mgr, FALSE);
	lck_mtx_unlock(&mgr->ipfm_lock);
	return err;
}

static int
ipf_key_cmp(struct ipf_key *a, struct ipf_key *b)
{
	int d;

	if ((d = (a->ipfk_len - b->ipfk_len)) != 0) {
		return d;
	}

	if ((d = (a->ipfk_ident - b->ipfk_ident)) != 0) {
		return d;
	}

	return memcmp(a->ipfk_addr, b->ipfk_addr, a->ipfk_len);
}

/*
 * Put an ip fragment on a reassembly chain.
 * Like insque, but pointers in middle of structure.
 */
static void
ipf_enq(struct ipf *f, struct ipf *up6)
{
	f->ipf_up = up6;
	f->ipf_down = up6->ipf_down;
	up6->ipf_down->ipf_up = f;
	up6->ipf_down = f;
}

/*
 * To ipf_enq as remque is to insque.
 */
static void
ipf_deq(struct ipf *f)
{
	f->ipf_up->ipf_down = f->ipf_down;
	f->ipf_down->ipf_up = f->ipf_up;
}

static void
ipfq_insque(struct ipfq *new, struct ipfq *old)
{
	new->ipfq_prev = old;
	new->ipfq_next = old->ipfq_next;
	old->ipfq_next->ipfq_prev = new;
	old->ipfq_next = new;
}

static void
ipfq_remque(struct ipfq *p6)
{
	p6->ipfq_prev->ipfq_next = p6->ipfq_next;
	p6->ipfq_next->ipfq_prev = p6->ipfq_prev;
}

/*
 * @internal drain reassembly queues until reaching the target queue count.
 */
static void
_ipfq_reap(struct fsw_ip_frag_mgr *mgr, uint32_t target_q_count,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
	uint32_t n_freed = 0;

	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);

	SK_DF(SK_VERB_IP_FRAG, "draining (frag %d/%d queue %d/%d)",
	    mgr->ipfm_f_count, mgr->ipfm_f_limit, mgr->ipfm_q_count,
	    mgr->ipfm_q_limit);

	while (mgr->ipfm_q.ipfq_next != &mgr->ipfm_q &&
	    mgr->ipfm_q_count > target_q_count) {
		n_freed += ipfq_freefq(mgr, mgr->ipfm_q.ipfq_prev,
		    mgr->ipfm_q.ipfq_prev->ipfq_is_dirty ? NULL : ipf_cb);
	}

	STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_REAPED, n_freed);
}

/*
 * @internal reap half of the reassembly queues to make room for newer
 * fragment reassembly.
 */
static void
ipfq_reap(struct fsw_ip_frag_mgr *mgr)
{
	_ipfq_reap(mgr, mgr->ipfm_q_count / 2, ipf_icmp_timeout_err);
}

/*
 * @internal reap all reassembly queues, for shutdown etc.
 */
static void
ipfq_drain(struct fsw_ip_frag_mgr *mgr)
{
	_ipfq_reap(mgr, 0, NULL);
}

static void
ipfq_timeout(thread_call_param_t arg0, thread_call_param_t arg1)
{
#pragma unused(arg1)
	struct fsw_ip_frag_mgr *mgr = arg0;
	struct ipfq *q;
	uint64_t now, elapsed;
	uint32_t n_freed = 0;

	net_update_uptime();
	now = _net_uptime;

	SK_DF(SK_VERB_IP_FRAG, "run");
	lck_mtx_lock(&mgr->ipfm_lock);
	q = mgr->ipfm_q.ipfq_next;
	if (q) {
		while (q != &mgr->ipfm_q) {
			q = q->ipfq_next;
			elapsed = now - q->ipfq_prev->ipfq_timestamp;
			if (elapsed > ipfm_frag_ttl) {
				SK_DF(SK_VERB_IP_FRAG, "timing out q id %5d",
				    q->ipfq_prev->ipfq_key.ipfk_ident);
				n_freed += ipfq_freefq(mgr, q->ipfq_prev,
				    q->ipfq_prev->ipfq_is_dirty ? NULL :
				    ipf_icmp_timeout_err);
			}
		}
	}
	STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_TIMEOUT, n_freed);

	/* If running out of resources, drain ipfm queues (oldest ones first) */
	if (mgr->ipfm_f_count >= mgr->ipfm_f_limit ||
	    mgr->ipfm_q_count >= mgr->ipfm_q_limit) {
		ipfq_reap(mgr);
	}

	/* re-arm the purge timer if there's work to do */
	if (mgr->ipfm_q_count > 0) {
		ipfq_sched_timeout(mgr, TRUE);
	}
	lck_mtx_unlock(&mgr->ipfm_lock);
}

static void
ipfq_sched_timeout(struct fsw_ip_frag_mgr *mgr, boolean_t in_tcall)
{
	uint32_t delay = MAX(1, ipfm_timeout_tcall_ival); /* seconds */
	thread_call_t tcall = mgr->ipfm_timeout_tcall;
	uint64_t now = mach_absolute_time();
	uint64_t ival, deadline = now;

	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);

	ASSERT(tcall != NULL);
	if (mgr->ipfm_q_count > 0 &&
	    (!thread_call_isactive(tcall) || in_tcall)) {
		nanoseconds_to_absolutetime(delay * NSEC_PER_SEC, &ival);
		clock_deadline_for_periodic_event(ival, now, &deadline);
		(void) thread_call_enter_delayed(tcall, deadline);
	}
}
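
/*
 * With the default ipfm_timeout_tcall_ival of 1, the thread call is re-armed
 * roughly once per second for as long as any reassembly queue exists;
 * ipfq_timeout() then expires queues whose age exceeds ipfm_frag_ttl seconds
 * and reaps further if the fragment or queue limits have been hit.
 */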

static int
ipfq_drain_sysctl SYSCTL_HANDLER_ARGS
{
#pragma unused(oidp, arg2)
	struct fsw_ip_frag_mgr *mgr = arg1;

	SKOID_PROC_CALL_GUARD;

	lck_mtx_lock(&mgr->ipfm_lock);
	ipfq_drain(mgr);
	lck_mtx_unlock(&mgr->ipfm_lock);

	return 0;
}

static struct ipfq *
ipfq_alloc(struct fsw_ip_frag_mgr *mgr)
{
	struct ipfq *q;

	if (mgr->ipfm_q_count > mgr->ipfm_q_limit) {
		ipfq_reap(mgr);
	}
	ASSERT(mgr->ipfm_q_count <= mgr->ipfm_q_limit);

	q = kalloc_type(struct ipfq, Z_WAITOK | Z_ZERO);
	if (q != NULL) {
		mgr->ipfm_q_count++;
		q->ipfq_is_dirty = false;
	}
	return q;
}

/* free q */
static void
ipfq_free(struct fsw_ip_frag_mgr *mgr, struct ipfq *q)
{
	kfree_type(struct ipfq, q);
	mgr->ipfm_q_count--;
}

/*
 * Free all fragments, but keep the q itself.
 * @return: number of frags freed
 */
static uint32_t
ipfq_freef(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
	struct ipf *f, *down6;
	uint32_t nfrags = 0;

	for (f = q->ipfq_down; f != (struct ipf *)q; f = down6) {
		nfrags++;
		down6 = f->ipf_down;
		ipf_deq(f);
		if (ipf_cb != NULL) {
			(*ipf_cb)(mgr, f);
		}
		ipf_free_pkt(f);
		ipf_free(mgr, f);
	}

	return nfrags;
}

/*
 * Free all fragments as well as the q itself.
 * @return: number of frags freed
 */
static uint32_t
ipfq_freefq(struct fsw_ip_frag_mgr *mgr, struct ipfq *q,
    void (*ipf_cb)(struct fsw_ip_frag_mgr *, struct ipf *))
{
	uint32_t freed_count;
	freed_count = ipfq_freef(mgr, q, ipf_cb);
	ipfq_remque(q);
	ipfq_free(mgr, q);
	return freed_count;
}

static struct ipf *
ipf_alloc(struct fsw_ip_frag_mgr *mgr)
{
	struct ipf *f;

	if (mgr->ipfm_f_count > mgr->ipfm_f_limit) {
		STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_FRAG_LIMIT);
		return NULL;
	}

	f = kalloc_type(struct ipf, Z_WAITOK | Z_ZERO);
	if (f != NULL) {
		mgr->ipfm_f_count++;
	}
	return f;
}

static void
ipf_free_pkt(struct ipf *f)
{
	struct __kern_packet *pkt = f->ipf_pkt;
	ASSERT(pkt != NULL);
	pp_free_packet(__DECONST(struct kern_pbufpool *, pkt->pkt_qum.qum_pp),
	    SK_PTR_ADDR(pkt));
}

static void
ipf_free(struct fsw_ip_frag_mgr *mgr, struct ipf *f)
{
	kfree_type(struct ipf, f);
	mgr->ipfm_f_count--;
}