fsw_ip_frag.c source code [xnu/bsd/skywalk/nexus/flowswitch/fsw_ip_frag.c]

1	/*
2	* Copyright (c) 2017-2023 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29	/ $FreeBSD: src/sys/netinet6/frag6.c,v 1.2.2.5 2001/07/03 11:01:50 ume Exp $ /
30	/ $KAME: frag6.c,v 1.31 2001/05/17 13:45:34 jinmei Exp $ /
31
32	/*
33	* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
34	* All rights reserved.
35	*
36	* Redistribution and use in source and binary forms, with or without
37	* modification, are permitted provided that the following conditions
38	* are met:
39	* 1. Redistributions of source code must retain the above copyright
40	* notice, this list of conditions and the following disclaimer.
41	* 2. Redistributions in binary form must reproduce the above copyright
42	* notice, this list of conditions and the following disclaimer in the
43	* documentation and/or other materials provided with the distribution.
44	* 3. Neither the name of the project nor the names of its contributors
45	* may be used to endorse or promote products derived from this software
46	* without specific prior written permission.
47	*
48	* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51	* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58	* SUCH DAMAGE.
59	*/
60
61	/*
62	* @file
63	* flowswitch IP Reassembly for both v4 and v6
64	*
65	* Implementation of IP packet fragmentation and reassembly.
66	*
67	*/
68
69	#include <sys/domain.h>
70	#include <netinet/in.h>
71	#include <netinet/ip6.h>
72	#include <netinet/icmp6.h>
73	#include <skywalk/os_skywalk_private.h>
74	#include <skywalk/nexus/flowswitch/nx_flowswitch.h>
75	#include <skywalk/nexus/flowswitch/fsw_var.h>
76
77	#define IPFM_MAX_FRAGS_PER_QUEUE 128 /* RFC 791 64K/(512 min MTU) */
78	#define IPFM_MAX_QUEUES 1024 /* same as ip/ip6 */
79	#define IPFM_FRAG_TTL 60 /* RFC 2460 */
80	#define IPFM_TIMEOUT_TCALL_INTERVAL 1
81
82	static uint32_t ipfm_max_frags_per_queue = IPFM_MAX_FRAGS_PER_QUEUE;
83	static uint32_t ipfm_frag_ttl = IPFM_FRAG_TTL;
84	static uint32_t ipfm_timeout_tcall_ival = IPFM_TIMEOUT_TCALL_INTERVAL;
85
86	#if (DEVELOPMENT \|\| DEBUG)
87	SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
88	ipfm_max_frags_per_queue, CTLFLAG_RW \| CTLFLAG_LOCKED,
89	&ipfm_max_frags_per_queue, `0`, "");
90	#endif /* !DEVELOPMENT && !DEBUG */
91
92	SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO, ipfm_frag_ttl,
93	CTLFLAG_RW \| CTLFLAG_LOCKED, &ipfm_frag_ttl, `0`, "");
94	SYSCTL_INT(_kern_skywalk_flowswitch, OID_AUTO,
95	ipfm_timeout_tcall_ival, CTLFLAG_RW \| CTLFLAG_LOCKED,
96	&ipfm_timeout_tcall_ival, `0`, "");
97
98	static LCK_GRP_DECLARE(fsw_ipfm_lock_group, "sk_fsw_ipfm_lock");
99	static LCK_ATTR_DECLARE(fsw_ipfm_lock_attr, `0`, `0`);
100
101	/ @internal ip fragment wrapper (chained in an ipfq) for __kern_packet /
102	struct ipf {
103	struct ipf *ipf_down;
104	struct ipf *ipf_up;
105	struct __kern_packet *ipf_pkt;
106	int ipf_len; / fragmentable part length /
107	int ipf_off; / fragment offset /
108	uint16_t ipf_mff; / more fragment bit in frag off /
109	};
110
111	/ @internal ip fragment lookup key /
112	struct ipf_key {
113	uint64_t ipfk_addr[`4`]; / src + dst ip addr (v4/v6) /
114	uint32_t ipfk_ident; / IP identification /
115	uint16_t ipfk_len; / len of ipfk_addr field /
116	};
117
118	enum {
119	IPFK_LEN_V4 = `2` * sizeof(struct in_addr),
120	IPFK_LEN_V6 = `2` * sizeof(struct in6_addr),
121	};
122
123	/*
124	* @internal
125	* IP reassembly queue structure. Each fragment (struct ipf)
126	* being reassembled is attached to one of these structures.
127	*/
128	struct ipfq {
129	struct ipf ipfq_down; /* fragment chain /
130	struct ipf *ipfq_up;
131	struct ipfq ipfq_next; /* queue chain /
132	struct ipfq *ipfq_prev;
133	uint64_t ipfq_timestamp; / time of creation /
134	struct ipf_key ipfq_key; / ipfq search key /
135	uint16_t ipfq_nfrag; / # of fragments in queue /
136	int ipfq_unfraglen; / len of unfragmentable part /
137	bool ipfq_is_dirty; / q is dirty, don't use /
138	};
139
140	/*
141	* @internal (externally opaque)
142	* flowswitch IP Fragment Manager
143	*/
144	struct fsw_ip_frag_mgr {
145	struct skoid ipfm_skoid;
146	struct ipfq ipfm_q; / ip reassembly queues /
147	uint32_t ipfm_q_limit; / limit # of reass queues /
148	uint32_t ipfm_q_count; / # of allocated reass queues /
149	uint32_t ipfm_f_limit; / limit # of ipfs /
150	uint32_t ipfm_f_count; / current # of allocated ipfs /
151	decl_lck_mtx_data(, ipfm_lock); / guard reass and timeout cleanup /
152	thread_call_t ipfm_timeout_tcall; / frag timeout thread /
153
154	struct ifnet *ipfm_ifp;
155	struct fsw_stats ipfm_stats; /* indirect stats in fsw /
156	};
157
158	static int ipf_process(struct fsw_ip_frag_mgr , struct* __kern_packet **,
159	struct ipf_key , uint16_t, uint16_t, uint16_t, uint16_t, uint16_t ,
160	uint16_t *);
161	static int ipf_key_cmp(struct ipf_key , struct* ipf_key *);
162	static void ipf_enq(struct ipf , struct* ipf *);
163	static void ipf_deq(struct ipf *);
164	static void ipfq_insque(struct ipfq , struct* ipfq *);
165	static void ipfq_remque(struct ipfq *);
166	static uint32_t ipfq_freef(struct fsw_ip_frag_mgr mgr, struct* ipfq *,
167	void ()(struct* fsw_ip_frag_mgr , struct* ipf *));
168
169	static void ipfq_timeout(thread_call_param_t, thread_call_param_t);
170	static void ipfq_sched_timeout(struct fsw_ip_frag_mgr *, boolean_t);
171
172	static struct ipfq ipfq_alloc(struct* fsw_ip_frag_mgr *mgr);
173	static void ipfq_free(struct fsw_ip_frag_mgr mgr, struct* ipfq *q);
174	static uint32_t ipfq_freefq(struct fsw_ip_frag_mgr mgr, struct* ipfq *q,
175	void (ipf_cb)(struct* fsw_ip_frag_mgr , struct* ipf *));
176	static struct ipf ipf_alloc(struct* fsw_ip_frag_mgr *mgr);
177	static void ipf_free(struct fsw_ip_frag_mgr mgr, struct* ipf *f);
178	static void ipf_free_pkt(struct ipf *f);
179	static void ipfq_drain(struct fsw_ip_frag_mgr *mgr);
180	static void ipfq_reap(struct fsw_ip_frag_mgr *mgr);
181	static int ipfq_drain_sysctl SYSCTL_HANDLER_ARGS;
182	void ipf_icmp_param_err(struct fsw_ip_frag_mgr , struct* __kern_packet *pkt,
183	int param);
184	void ipf_icmp_timeout_err(struct fsw_ip_frag_mgr , struct* ipf *f);
185
186	/ Create a flowswitch IP fragment manager. /
187	struct fsw_ip_frag_mgr *
188	fsw_ip_frag_mgr_create(struct nx_flowswitch fsw, struct* ifnet *ifp,
189	size_t f_limit)
190	{
191	struct fsw_ip_frag_mgr *mgr;
192
193	ASSERT(ifp != NULL);
194
195	mgr = sk_alloc_type(struct fsw_ip_frag_mgr, Z_WAITOK \| Z_NOFAIL,
196	skmem_tag_fsw_frag_mgr);
197
198	mgr->ipfm_q.ipfq_next = mgr->ipfm_q.ipfq_prev = &mgr->ipfm_q;
199	lck_mtx_init(lck: &mgr->ipfm_lock, grp: &fsw_ipfm_lock_group, attr: &fsw_ipfm_lock_attr);
200
201	mgr->ipfm_timeout_tcall =
202	thread_call_allocate_with_options(func: ipfq_timeout, param0: mgr,
203	pri: THREAD_CALL_PRIORITY_KERNEL, options: THREAD_CALL_OPTIONS_ONCE);
204	VERIFY(mgr->ipfm_timeout_tcall != NULL);
205
206	mgr->ipfm_ifp = ifp;
207	mgr->ipfm_stats = &fsw->fsw_stats;
208
209	/ Use caller provided limit (caller knows pool size) /
210	ASSERT(f_limit >= `2` && f_limit < UINT32_MAX);
211	mgr->ipfm_f_limit = (uint32_t)f_limit;
212	mgr->ipfm_f_count = `0`;
213	mgr->ipfm_q_limit = MIN(IPFM_MAX_QUEUES, mgr->ipfm_f_limit / `2`);
214	mgr->ipfm_q_count = `0`;
215
216	skoid_create(skoid: &mgr->ipfm_skoid, SKOID_DNODE(fsw->fsw_skoid), name: "ipfm", kind: `0`);
217	skoid_add_uint(skoid: &mgr->ipfm_skoid, name: "frag_limit", CTLFLAG_RW,
218	ptr: &mgr->ipfm_f_limit);
219	skoid_add_uint(skoid: &mgr->ipfm_skoid, name: "frag_count", CTLFLAG_RD,
220	ptr: &mgr->ipfm_f_count);
221	skoid_add_uint(skoid: &mgr->ipfm_skoid, name: "queue_limit", CTLFLAG_RW,
222	ptr: &mgr->ipfm_q_limit);
223	skoid_add_uint(skoid: &mgr->ipfm_skoid, name: "queue_count", CTLFLAG_RD,
224	ptr: &mgr->ipfm_q_count);
225	skoid_add_handler(skoid: &mgr->ipfm_skoid, name: "drain", CTLFLAG_RW,
226	handler: ipfq_drain_sysctl, arg1: mgr, arg2: `0`);
227
228	return mgr;
229	}
230
231	/ Free a flowswitch IP fragment manager. /
232	void
233	fsw_ip_frag_mgr_destroy(struct fsw_ip_frag_mgr *mgr)
234	{
235	thread_call_t tcall;
236
237	lck_mtx_lock(lck: &mgr->ipfm_lock);
238	if ((tcall = mgr->ipfm_timeout_tcall) != NULL) {
239	lck_mtx_unlock(lck: &mgr->ipfm_lock);
240	(void) thread_call_cancel_wait(call: tcall);
241	(void) thread_call_free(call: tcall);
242	mgr->ipfm_timeout_tcall = NULL;
243	lck_mtx_lock(lck: &mgr->ipfm_lock);
244	}
245
246	ipfq_drain(mgr);
247
248	lck_mtx_unlock(lck: &mgr->ipfm_lock);
249	lck_mtx_destroy(lck: &mgr->ipfm_lock, grp: &fsw_ipfm_lock_group);
250
251	skoid_destroy(skoid: &mgr->ipfm_skoid);
252	sk_free_type(struct fsw_ip_frag_mgr, mgr);
253	}
254
255	/*
256	* Reassemble a received IPv4 fragment.
257	*
258	* @param mgr
259	* fragment manager
260	* @param pkt
261	* received packet (must have ipv4 header validated)
262	* @param ip4
263	* pointer to the packet's IPv4 header
264	* @param nfrags
265	* number of fragments reassembled
266	* @return
267	* Successfully processed (not fully reassembled)
268	* ret = 0, pkt = NULL(ipfm owns it), nfrags=0
269	* Successfully reassembled
270	* ret = 0, *pkt = 1st fragment(fragments chained in order by pkt_nextpkt)
271	* *nfrags = number of all fragments (>0)
272	* Error
273	* ret != 0 && pkt unmodified (caller to decide what to do with pkt)
274	* *nfrags = 0
275	*/
276	int
277	fsw_ip_frag_reass_v4(struct fsw_ip_frag_mgr mgr, struct* __kern_packet **pkt,
278	struct ip ip4, uint16_t nfrags, uint16_t *tlen)
279	{
280	struct ipf_key key;
281	uint16_t unfragpartlen, offflag, fragoff, fragpartlen, fragflag;
282	int err;
283
284	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V4);
285
286	bcopy(src: (void )&ip4->ip_src, dst: (void* *)key.ipfk_addr, n: IPFK_LEN_V4);
287	key.ipfk_len = IPFK_LEN_V4;
288	key.ipfk_ident = (uint32_t)ip4->ip_id;
289
290	unfragpartlen = (uint16_t)(ip4->ip_hl << `2`);
291	offflag = ntohs(ip4->ip_off);
292	fragoff = (uint16_t)(offflag << `3`);
293	fragpartlen = ntohs(ip4->ip_len) - (uint16_t)(ip4->ip_hl << `2`);
294	fragflag = offflag & IP_MF;
295
296	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
297	fragflag, nfrags, tlen);
298
299	/*
300	* If packet has been reassembled compute the user data length.
301	*/
302	if (*pkt != NULL) {
303	struct __kern_packet p = pkt;
304	struct ip iph = (struct* ip *)p->pkt_flow_ip_hdr;
305
306	p->pkt_flow_ulen = ntohs(iph->ip_len) -
307	p->pkt_flow_ip_hlen - p->pkt_flow->flow_l4._l4_hlen;
308	}
309	return err;
310	}
311
312	/*
313	* Reassemble a received IPv6 fragment.
314	*
315	* @param mgr
316	* fragment manager
317	* @param pkt
318	* received packet (must have ipv6 header validated)
319	* @param ip6
320	* pointer to the packet's IPv6 header
321	* @param ip6f
322	* pointer to the packet's IPv6 Fragment Header
323	* @param nfrags
324	* number of fragments reassembled
325	* @return
326	* Successfully processed (not fully reassembled)
327	* ret = 0, pkt = NULL(ipfm owns it), nfrags=0
328	* Successfully reassembled
329	* ret = 0, *pkt = 1st fragment(fragments chained in ordrer by pkt_nextpkt)
330	* *nfrags = number of all fragments (>0)
331	* Error
332	* ret != 0 && pkt unmodified (caller to decide what to do with pkt)
333	* *nfrags = 0
334	*/
335	int
336	fsw_ip_frag_reass_v6(struct fsw_ip_frag_mgr mgr, struct* __kern_packet **pkt,
337	struct ip6_hdr ip6, struct* ip6_frag ip6f, uint16_t nfrags,
338	uint16_t *tlen)
339	{
340	struct ipf_key key;
341	ptrdiff_t ip6f_ptroff = (uintptr_t)ip6f - (uintptr_t)ip6;
342	uint16_t ip6f_off, fragoff, fragpartlen, unfragpartlen, fragflag;
343	int err;
344
345	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_V6);
346
347	/ jumbo payload can't contain a fragment header /
348	if (ip6->ip6_plen == `0`) {
349	*nfrags = `0`;
350	return ERANGE;
351	}
352
353	ASSERT(ip6f_ptroff < UINT16_MAX);
354	ip6f_off = (uint16_t)ip6f_ptroff;
355	fragoff = ntohs(ip6f->ip6f_offlg & IP6F_OFF_MASK);
356	fragpartlen = ntohs(ip6->ip6_plen) -
357	(ip6f_off + sizeof(struct ip6_frag) - sizeof(struct ip6_hdr));
358	unfragpartlen = ip6f_off;
359	fragflag = ip6f->ip6f_offlg & IP6F_MORE_FRAG;
360
361	/*
362	* RFC 6946: Handle "atomic" fragments (offset and m bit set to 0)
363	* upfront, unrelated to any reassembly.
364	*
365	* Flow classifier should process those as non-frag, ipfm shouldn't see
366	* them.
367	*/
368	ASSERT((ip6f->ip6f_offlg & ~IP6F_RESERVED_MASK) != `0`);
369
370	bcopy(src: (void )&ip6->ip6_src, dst: (void* *)key.ipfk_addr, n: IPFK_LEN_V6);
371	key.ipfk_len = IPFK_LEN_V6;
372	key.ipfk_ident = ip6f->ip6f_ident;
373
374	err = ipf_process(mgr, pkt, &key, unfragpartlen, fragoff, fragpartlen,
375	fragflag, nfrags, tlen);
376
377	/*
378	* If packet has been reassembled compute the user data length.
379	*/
380	if (*pkt != NULL) {
381	struct __kern_packet p = pkt;
382	struct ip6_hdr ip6h = (struct* ip6_hdr *)p->pkt_flow_ip_hdr;
383
384	p->pkt_flow_ulen = ntohs(ip6h->ip6_plen) -
385	p->pkt_flow->flow_l4._l4_hlen;
386	}
387	return err;
388	}
389
390	static struct mbuf *
391	ipf_pkt2mbuf(struct fsw_ip_frag_mgr mgr, struct* __kern_packet *pkt)
392	{
393	unsigned int one = `1`;
394	struct mbuf *m = NULL;
395	uint8_t *buf;
396	struct ip6_hdr *ip6;
397	uint32_t l3t_len;
398	int err;
399
400	l3t_len = pkt->pkt_length - pkt->pkt_l2_len;
401	if (pkt->pkt_link_flags & PKT_LINKF_ETHFCS) {
402	l3t_len -= ETHER_CRC_LEN;
403	}
404
405	err = mbuf_allocpacket(how: MBUF_WAITOK, packetlen: l3t_len, maxchunks: &one, mbuf: &m);
406	VERIFY(err == `0`);
407	ASSERT(l3t_len <= mbuf_maxlen(m));
408
409	if (pkt->pkt_pflags & PKT_F_MBUF_DATA) {
410	bcopy(src: m_mtod_current(m: pkt->pkt_mbuf) + pkt->pkt_l2_len,
411	dst: m_mtod_current(m), n: l3t_len);
412	} else {
413	MD_BUFLET_ADDR_ABS(pkt, buf);
414	buf += (pkt->pkt_headroom + pkt->pkt_l2_len);
415	bcopy(src: buf, dst: m_mtod_current(m), n: l3t_len);
416	}
417	m->m_pkthdr.len = m->m_len = l3t_len;
418
419	ip6 = mtod(m, struct ip6_hdr *);
420	/ note for casting: IN6_IS_SCOPE_ doesn't need alignment /
421	if (IN6_IS_SCOPE_LINKLOCAL((struct in6_addr *)(uintptr_t)&ip6->ip6_src)) {
422	if (in6_embedded_scope) {
423	ip6->ip6_src.s6_addr16[`1`] = htons(mgr->ipfm_ifp->if_index);
424	}
425	ip6_output_setsrcifscope(m, mgr->ipfm_ifp->if_index, NULL);
426	}
427	if (IN6_IS_SCOPE_EMBED((struct in6_addr *)(uintptr_t)&ip6->ip6_dst)) {
428	if (in6_embedded_scope) {
429	ip6->ip6_dst.s6_addr16[`1`] = htons(mgr->ipfm_ifp->if_index);
430	}
431	ip6_output_setdstifscope(m, mgr->ipfm_ifp->if_index, NULL);
432	}
433
434	return m;
435	}
436
437	/*
438	* Since this function can be called while holding fsw_ip_frag_mgr.ipfm_lock,
439	* we need to ensure we don't enter the driver directly because a deadlock
440	* can happen if this same thread tries to get the workloop lock.
441	*/
442	static void
443	ipf_icmp6_error_flag(struct mbuf m, int* type, int code, int param, int flags)
444	{
445	sk_protect_t protect = sk_async_transmit_protect();
446	icmp6_error_flag(m, type, code, param, flags);
447	sk_async_transmit_unprotect(protect);
448	}
449
450	/*
451	* @internal IP fragment ICMP parameter problem error handling
452	*
453	* @param param
454	* offending parameter offset, only applicable to ICMPv6
455	*/
456	void
457	ipf_icmp_param_err(struct fsw_ip_frag_mgr mgr, struct* __kern_packet *pkt,
458	int param_offset)
459	{
460	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
461	return;
462	}
463
464	struct mbuf *m = NULL;
465	m = ipf_pkt2mbuf(mgr, pkt);
466	if (__probable(m != NULL)) {
467	ipf_icmp6_error_flag(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
468	param: param_offset, flags: `0`);
469	}
470
471	/ m would be free by icmp6_error_flag function /
472	}
473
474	/ @internal IP fragment ICMP timeout error handling /
475	void
476	ipf_icmp_timeout_err(struct fsw_ip_frag_mgr mgr, struct* ipf *f)
477	{
478	struct __kern_packet *pkt = f->ipf_pkt;
479	ASSERT(pkt != NULL);
480
481	/ no icmp error packet for ipv4 /
482	if (pkt->pkt_flow_ip_ver != IPV6_VERSION) {
483	return;
484	}
485
486	/ only for the first fragment /
487	if (f->ipf_off != `0`) {
488	return;
489	}
490
491	struct mbuf *m = NULL;
492	m = ipf_pkt2mbuf(mgr, pkt);
493	if (__probable(m != NULL)) {
494	ipf_icmp6_error_flag(m, ICMP6_TIME_EXCEEDED,
495	ICMP6_TIME_EXCEED_REASSEMBLY, param: `0`, flags: `0`);
496	}
497
498	/ m would be free by icmp6_error_flag function /
499	}
500
501	/ @internal IP fragment processing, v4/v6 agonistic /
502	int
503	ipf_process(struct fsw_ip_frag_mgr mgr, struct* __kern_packet **pkt_ptr,
504	struct ipf_key *key, uint16_t unfraglen, uint16_t fragoff,
505	uint16_t fragpartlen, uint16_t fragflag, uint16_t nfrags, uint16_t tlen)
506	{
507	struct __kern_packet pkt = pkt_ptr;
508	struct __kern_packet *pkt_reassed = NULL;
509	struct ipfq q, mq = &mgr->ipfm_q;
510	struct ipf f, f_new, *f_down;
511	uint32_t nfrags_freed;
512	int next;
513	int first_frag = `0`;
514	int err = `0`;
515	int local_ipfq_unfraglen;
516
517	*nfrags = `0`;
518
519	SK_DF(SK_VERB_IP_FRAG, "id %5d fragoff %5d fragpartlen %5d "
520	"fragflag 0x%x", key->ipfk_ident, fragoff, fragpartlen, fragflag);
521
522	/*
523	* Make sure that all fragments except last one have a data length
524	* that's a non-zero multiple of 8 bytes.
525	*/
526	if (fragflag && (fragpartlen == `0` \|\| (fragpartlen & `0x7`) != `0`)) {
527	SK_DF(SK_VERB_IP_FRAG, "frag not multiple of 8 bytes");
528	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_BAD_LEN);
529	ipf_icmp_param_err(mgr, pkt,
530	offsetof(struct ip6_hdr, ip6_plen));
531	return ERANGE;
532	}
533
534	lck_mtx_lock(lck: &mgr->ipfm_lock);
535
536	/ find ipfq /
537	for (q = mq->ipfq_next; q != mq; q = q->ipfq_next) {
538	if (ipf_key_cmp(key, &q->ipfq_key) == `0`) {
539	if (q->ipfq_is_dirty) {
540	SK_DF(SK_VERB_IP_FRAG, "found dirty q, skip");
541	err = EINVAL;
542	goto done;
543	}
544	break;
545	}
546	}
547
548	/ not found, create new ipfq /
549	if (q == mq) {
550	first_frag = `1`;
551
552	q = ipfq_alloc(mgr);
553	if (q == NULL) {
554	STATS_INC(mgr->ipfm_stats,
555	FSW_STATS_RX_FRAG_DROP_NOMEM);
556	err = ENOMEM;
557	goto done;
558	}
559
560	ipfq_insque(q, mq);
561	net_update_uptime();
562
563	bcopy(src: key, dst: &q->ipfq_key, n: sizeof(struct ipf_key));
564	q->ipfq_down = q->ipfq_up = (struct ipf *)q;
565	q->ipfq_unfraglen = -`1`; / The 1st fragment has not arrived. /
566	q->ipfq_nfrag = `0`;
567	q->ipfq_timestamp = _net_uptime;
568	}
569
570	ASSERT(!q->ipfq_is_dirty);
571
572	/ this queue has reached per queue frag limit /
573	if (q->ipfq_nfrag > ipfm_max_frags_per_queue) {
574	nfrags_freed = ipfq_freefq(mgr, q, NULL);
575	STATS_ADD(mgr->ipfm_stats,
576	FSW_STATS_RX_FRAG_DROP_PER_QUEUE_LIMIT, nfrags_freed);
577	err = ENOMEM;
578	goto done;
579	}
580
581	local_ipfq_unfraglen = q->ipfq_unfraglen;
582
583	/*
584	* If it's the 1st fragment, record the length of the
585	* unfragmentable part and the next header of the fragment header.
586	* Assume the first fragement to arrive will be correct.
587	* We do not have any duplicate checks here yet so another packet
588	* with fragoff == 0 could come and overwrite the ipfq_unfraglen
589	* and worse, the next header, at any time.
590	*/
591	if (fragoff == `0` && local_ipfq_unfraglen == -`1`) {
592	local_ipfq_unfraglen = unfraglen;
593	}
594
595	/ Check that the reassembled packet would not exceed 65535 bytes. /
596	if (local_ipfq_unfraglen + fragoff + fragpartlen > IP_MAXPACKET) {
597	SK_DF(SK_VERB_IP_FRAG, "frag too big");
598	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
599	ipf_icmp_param_err(mgr, pkt, param_offset: sizeof(struct ip6_hdr) +
600	offsetof(struct ip6_frag, ip6f_offlg));
601	err = ERANGE;
602	goto done;
603	}
604
605	/*
606	* If it's the 1st fragment, do the above check for each
607	* fragment already stored in the reassembly queue.
608	* If an error is found, still return 0, since we don't return
609	* ownership of a chain of offending packets back to caller.
610	*/
611	if (fragoff == `0`) {
612	for (f = q->ipfq_down; f != (struct ipf *)q; f = f_down) {
613	f_down = f->ipf_down;
614	if (local_ipfq_unfraglen + f->ipf_off + f->ipf_len >
615	IP_MAXPACKET) {
616	SK_DF(SK_VERB_IP_FRAG, "frag too big");
617	STATS_INC(mgr->ipfm_stats,
618	FSW_STATS_RX_FRAG_BAD);
619	ipf_deq(f);
620	ipf_free_pkt(f);
621	ipf_free(mgr, f);
622	}
623	}
624	}
625
626	f_new = ipf_alloc(mgr);
627	if (f_new == NULL) {
628	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_NOMEM);
629	err = ENOMEM;
630	goto done;
631	}
632
633	f_new->ipf_mff = fragflag;
634	f_new->ipf_off = fragoff;
635	f_new->ipf_len = fragpartlen;
636	f_new->ipf_pkt = pkt;
637
638	if (first_frag) {
639	f = (struct ipf *)q;
640	goto insert;
641	}
642
643	/ Find a segment which begins after this one does. /
644	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
645	if (f->ipf_off > f_new->ipf_off) {
646	break;
647	}
648	}
649
650	/*
651	* If any of the fragments being reassembled overlap with any
652	* other fragments being reassembled for the same packet,
653	* reassembly of that packet must be abandoned and all the
654	* fragments that have been received for that packet must be
655	* discarded, and no ICMP error messages should be sent.
656	*
657	* It should be noted that fragments may be duplicated in the
658	* network. Instead of treating these exact duplicate fragments
659	* as overlapping fragments, an implementation may choose to
660	* detect this case and drop exact duplicate fragments while
661	* keeping the other fragments belonging to the same packet.
662	*
663	* https://tools.ietf.org/html/rfc8200#appendix-B
664	*
665	* We apply this rule for both for IPv4 and IPv6 here.
666	*/
667	if (((f->ipf_up != (struct ipf )q) && /* prev frag spans into f_new /
668	(f->ipf_up->ipf_off + f->ipf_up->ipf_len - f_new->ipf_off > `0`)) \|\|
669	((f != (struct ipf )q) && /* f_new spans into next /
670	(f_new->ipf_off + f_new->ipf_len - f->ipf_off > `0`))) {
671	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_BAD);
672	/ Check for exact duplicate offset/length /
673	if (((f->ipf_up != (struct ipf *)q) &&
674	((f->ipf_up->ipf_off != f_new->ipf_off) \|\|
675	(f->ipf_up->ipf_len != f_new->ipf_len))) \|\|
676	((f != (struct ipf *)q) &&
677	((f->ipf_off != f_new->ipf_off) \|\|
678	(f->ipf_len != f_new->ipf_len)))) {
679	SK_DF(SK_VERB_IP_FRAG, "frag overlap");
680	ipf_free(mgr, f: f_new);
681	/ give up over-lapping fragments queue /
682	SK_DF(SK_VERB_IP_FRAG, "free overlapping queue");
683	ipfq_freef(mgr, q, NULL);
684	q->ipfq_is_dirty = true;
685	} else {
686	ipf_free(mgr, f: f_new);
687	SK_DF(SK_VERB_IP_FRAG, "frag dup");
688	}
689	err = ERANGE;
690	goto done;
691	}
692
693	insert:
694	q->ipfq_unfraglen = local_ipfq_unfraglen;
695
696	/*
697	* Stick new segment in its place;
698	* check for complete reassembly.
699	* Move to front of packet queue, as we are
700	* the most recently active fragmented packet.
701	*/
702	ipf_enq(f_new, f->ipf_up);
703	q->ipfq_nfrag++;
704	next = `0`;
705	for (f = q->ipfq_down; f != (struct ipf *)q; f = f->ipf_down) {
706	/ there is a hole /
707	if (f->ipf_off != next) {
708	goto done;
709	}
710	next += f->ipf_len;
711	}
712	/ we haven't got last frag yet /
713	if (f->ipf_up->ipf_mff) {
714	goto done;
715	}
716
717	/*
718	* Reassembly is complete; concatenate fragments.
719	*/
720	f = q->ipfq_down;
721	f_down = f->ipf_down;
722	pkt_reassed = f->ipf_pkt;
723	*nfrags = `1`;
724	while (f_down != (struct ipf *)q) {
725	/ chain __kern_packet with pkt_nextpkt ptr /
726	f->ipf_pkt->pkt_nextpkt = f_down->ipf_pkt;
727	(*nfrags)++;
728	(*tlen) += f_down->ipf_len;
729	f_down = f->ipf_down;
730	ipf_deq(f);
731	ipf_free(mgr, f);
732	f = f_down;
733	f_down = f->ipf_down;
734	}
735	ipf_free(mgr, f);
736
737	err = `0`;
738	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_REASSED);
739	ipfq_remque(q);
740	ipfq_free(mgr, q);
741
742	done:
743	/ ipfm take ownership of, or return assembled packet, if no error /
744	if (err == `0`) {
745	/ reass'ed packet if done; NULL otherwise /
746	*pkt_ptr = pkt_reassed;
747	}
748	ipfq_sched_timeout(mgr, FALSE);
749	lck_mtx_unlock(lck: &mgr->ipfm_lock);
750	return err;
751	}
752
753	static int
754	ipf_key_cmp(struct ipf_key a, struct* ipf_key *b)
755	{
756	int d;
757
758	if ((d = (a->ipfk_len - b->ipfk_len)) != `0`) {
759	return d;
760	}
761
762	if ((d = (a->ipfk_ident - b->ipfk_ident)) != `0`) {
763	return d;
764	}
765
766	return memcmp(s1: a->ipfk_addr, s2: b->ipfk_addr, n: a->ipfk_len);
767	}
768
769	/*
770	* Put an ip fragment on a reassembly chain.
771	* Like insque, but pointers in middle of structure.
772	*/
773	static void
774	ipf_enq(struct ipf f, struct* ipf *up6)
775	{
776	f->ipf_up = up6;
777	f->ipf_down = up6->ipf_down;
778	up6->ipf_down->ipf_up = f;
779	up6->ipf_down = f;
780	}
781
782	/*
783	* To ipf_enq as remque is to insque.
784	*/
785	static void
786	ipf_deq(struct ipf *f)
787	{
788	f->ipf_up->ipf_down = f->ipf_down;
789	f->ipf_down->ipf_up = f->ipf_up;
790	}
791
792	static void
793	ipfq_insque(struct ipfq new, struct* ipfq *old)
794	{
795	new->ipfq_prev = old;
796	new->ipfq_next = old->ipfq_next;
797	old->ipfq_next->ipfq_prev = new;
798	old->ipfq_next = new;
799	}
800
801	static void
802	ipfq_remque(struct ipfq *p6)
803	{
804	p6->ipfq_prev->ipfq_next = p6->ipfq_next;
805	p6->ipfq_next->ipfq_prev = p6->ipfq_prev;
806	}
807
808	/*
809	* @internal drain reassembly queue till reaching target q count.
810	*/
811	static void
812	_ipfq_reap(struct fsw_ip_frag_mgr *mgr, uint32_t target_q_count,
813	void (ipf_cb)(struct* fsw_ip_frag_mgr , struct* ipf *))
814	{
815	uint32_t n_freed = `0`;
816
817	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
818
819	SK_DF(SK_VERB_IP_FRAG, "draining (frag %d/%d queue %d/%d)",
820	mgr->ipfm_f_count, mgr->ipfm_f_limit, mgr->ipfm_q_count,
821	mgr->ipfm_q_limit);
822
823	while (mgr->ipfm_q.ipfq_next != &mgr->ipfm_q &&
824	mgr->ipfm_q_count > target_q_count) {
825	n_freed += ipfq_freefq(mgr, q: mgr->ipfm_q.ipfq_prev,
826	ipf_cb: mgr->ipfm_q.ipfq_prev->ipfq_is_dirty ? NULL : ipf_cb);
827	}
828
829	STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_REAPED, n_freed);
830	}
831
832	/*
833	* @internal reap half reassembly queues to allow newer fragment assembly.
834	*/
835	static void
836	ipfq_reap(struct fsw_ip_frag_mgr *mgr)
837	{
838	_ipfq_reap(mgr, target_q_count: mgr->ipfm_q_count / `2`, ipf_cb: ipf_icmp_timeout_err);
839	}
840
841	/*
842	* @internal reap all reassembly queues, for shutdown etc.
843	*/
844	static void
845	ipfq_drain(struct fsw_ip_frag_mgr *mgr)
846	{
847	_ipfq_reap(mgr, target_q_count: `0`, NULL);
848	}
849
850	static void
851	ipfq_timeout(thread_call_param_t arg0, thread_call_param_t arg1)
852	{
853	#pragma unused(arg1)
854	struct fsw_ip_frag_mgr *mgr = arg0;
855	struct ipfq *q;
856	uint64_t now, elapsed;
857	uint32_t n_freed = `0`;
858
859	net_update_uptime();
860	now = _net_uptime;
861
862	SK_DF(SK_VERB_IP_FRAG, "run");
863	lck_mtx_lock(lck: &mgr->ipfm_lock);
864	q = mgr->ipfm_q.ipfq_next;
865	if (q) {
866	while (q != &mgr->ipfm_q) {
867	q = q->ipfq_next;
868	elapsed = now - q->ipfq_prev->ipfq_timestamp;
869	if (elapsed > ipfm_frag_ttl) {
870	SK_DF(SK_VERB_IP_FRAG, "timing out q id %5d",
871	q->ipfq_prev->ipfq_key.ipfk_ident);
872	n_freed = ipfq_freefq(mgr, q: q->ipfq_prev,
873	ipf_cb: q->ipfq_is_dirty ? NULL :
874	ipf_icmp_timeout_err);
875	}
876	}
877	}
878	STATS_ADD(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_TIMEOUT, n_freed);
879
880	/ If running out of resources, drain ipfm queues (oldest one first) /
881	if (mgr->ipfm_f_count >= mgr->ipfm_f_limit \|\|
882	mgr->ipfm_q_count >= mgr->ipfm_q_limit) {
883	ipfq_reap(mgr);
884	}
885
886	/ re-arm the purge timer if there's work to do /
887	if (mgr->ipfm_q_count > `0`) {
888	ipfq_sched_timeout(mgr, TRUE);
889	}
890	lck_mtx_unlock(lck: &mgr->ipfm_lock);
891	}
892
893	static void
894	ipfq_sched_timeout(struct fsw_ip_frag_mgr *mgr, boolean_t in_tcall)
895	{
896	uint32_t delay = MAX(`1`, ipfm_timeout_tcall_ival); / seconds /
897	thread_call_t tcall = mgr->ipfm_timeout_tcall;
898	uint64_t now = mach_absolute_time();
899	uint64_t ival, deadline = now;
900
901	LCK_MTX_ASSERT(&mgr->ipfm_lock, LCK_MTX_ASSERT_OWNED);
902
903	ASSERT(tcall != NULL);
904	if (mgr->ipfm_q_count > `0` &&
905	(!thread_call_isactive(call: tcall) \|\| in_tcall)) {
906	nanoseconds_to_absolutetime(nanoseconds: delay * NSEC_PER_SEC, result: &ival);
907	clock_deadline_for_periodic_event(interval: ival, abstime: now, deadline: &deadline);
908	(void) thread_call_enter_delayed(call: tcall, deadline);
909	}
910	}
911
912	static int
913	ipfq_drain_sysctl SYSCTL_HANDLER_ARGS
914	{
915	#pragma unused(oidp, arg2)
916	struct fsw_ip_frag_mgr *mgr = arg1;
917
918	SKOID_PROC_CALL_GUARD;
919
920	lck_mtx_lock(lck: &mgr->ipfm_lock);
921	ipfq_drain(mgr);
922	lck_mtx_unlock(lck: &mgr->ipfm_lock);
923
924	return `0`;
925	}
926
927	static struct ipfq *
928	ipfq_alloc(struct fsw_ip_frag_mgr *mgr)
929	{
930	struct ipfq *q;
931
932	if (mgr->ipfm_q_count > mgr->ipfm_q_limit) {
933	ipfq_reap(mgr);
934	}
935	ASSERT(mgr->ipfm_q_count <= mgr->ipfm_q_limit);
936
937	q = kalloc_type(struct ipfq, Z_WAITOK \| Z_ZERO);
938	if (q != NULL) {
939	mgr->ipfm_q_count++;
940	q->ipfq_is_dirty = false;
941	}
942	return q;
943	}
944
945	/ free q /
946	static void
947	ipfq_free(struct fsw_ip_frag_mgr mgr, struct* ipfq *q)
948	{
949	kfree_type(struct ipfq, q);
950	mgr->ipfm_q_count--;
951	}
952
953	/*
954	* Free all fragments, keep q.
955	* @return: number of frags freed
956	*/
957	static uint32_t
958	ipfq_freef(struct fsw_ip_frag_mgr mgr, struct* ipfq *q,
959	void (ipf_cb)(struct* fsw_ip_frag_mgr , struct* ipf *))
960	{
961	struct ipf f, down6;
962	uint32_t nfrags = `0`;
963
964	for (f = q->ipfq_down; f != (struct ipf *)q; f = down6) {
965	nfrags++;
966	down6 = f->ipf_down;
967	ipf_deq(f);
968	if (ipf_cb != NULL) {
969	(*ipf_cb)(mgr, f);
970	}
971	ipf_free_pkt(f);
972	ipf_free(mgr, f);
973	}
974
975	return nfrags;
976	}
977
978	/ Free both all fragments and q*
979	* @return: number of frags freed
980	*/
981	static uint32_t
982	ipfq_freefq(struct fsw_ip_frag_mgr mgr, struct* ipfq *q,
983	void (ipf_cb)(struct* fsw_ip_frag_mgr , struct* ipf *))
984	{
985	uint32_t freed_count;
986	freed_count = ipfq_freef(mgr, q, ipf_cb);
987	ipfq_remque(p6: q);
988	ipfq_free(mgr, q);
989	return freed_count;
990	}
991
992	static struct ipf *
993	ipf_alloc(struct fsw_ip_frag_mgr *mgr)
994	{
995	struct ipf *f;
996
997	if (mgr->ipfm_f_count > mgr->ipfm_f_limit) {
998	STATS_INC(mgr->ipfm_stats, FSW_STATS_RX_FRAG_DROP_FRAG_LIMIT);
999	return NULL;
1000	}
1001
1002	f = kalloc_type(struct ipf, Z_WAITOK \| Z_ZERO);
1003	if (f != NULL) {
1004	mgr->ipfm_f_count++;
1005	}
1006	return f;
1007	}
1008
1009	static void
1010	ipf_free_pkt(struct ipf *f)
1011	{
1012	struct __kern_packet *pkt = f->ipf_pkt;
1013	ASSERT(pkt != NULL);
1014	pp_free_packet(__DECONST(struct kern_pbufpool *, pkt->pkt_qum.qum_pp),
1015	SK_PTR_ADDR(pkt));
1016	}
1017
1018	static void
1019	ipf_free(struct fsw_ip_frag_mgr mgr, struct* ipf *f)
1020	{
1021	kfree_type(struct ipf, f);
1022	mgr->ipfm_f_count--;
1023	}
1024

Browse the source code of xnu/bsd/skywalk/nexus/flowswitch/fsw_ip_frag.c