ip_input.c source code [xnu/bsd/netinet/ip_input.c]

1	/*
2	* Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* Copyright (c) 1982, 1986, 1988, 1993
30	* The Regents of the University of California. All rights reserved.
31	*
32	* Redistribution and use in source and binary forms, with or without
33	* modification, are permitted provided that the following conditions
34	* are met:
35	* 1. Redistributions of source code must retain the above copyright
36	* notice, this list of conditions and the following disclaimer.
37	* 2. Redistributions in binary form must reproduce the above copyright
38	* notice, this list of conditions and the following disclaimer in the
39	* documentation and/or other materials provided with the distribution.
40	* 3. All advertising materials mentioning features or use of this software
41	* must display the following acknowledgement:
42	* This product includes software developed by the University of
43	* California, Berkeley and its contributors.
44	* 4. Neither the name of the University nor the names of its contributors
45	* may be used to endorse or promote products derived from this software
46	* without specific prior written permission.
47	*
48	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58	* SUCH DAMAGE.
59	*
60	* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
61	*/
62	/*
63	* NOTICE: This file was modified by SPARTA, Inc. in 2007 to introduce
64	* support for mandatory and extensible security protections. This notice
65	* is included in support of clause 2.2 (b) of the Apple Public License,
66	* Version 2.0.
67	*/
68
69	#define _IP_VHL
70
71	#include <sys/param.h>
72	#include <sys/systm.h>
73	#include <sys/mbuf.h>
74	#include <sys/malloc.h>
75	#include <sys/domain.h>
76	#include <sys/protosw.h>
77	#include <sys/socket.h>
78	#include <sys/time.h>
79	#include <sys/kernel.h>
80	#include <sys/syslog.h>
81	#include <sys/sysctl.h>
82	#include <sys/mcache.h>
83	#include <sys/socketvar.h>
84	#include <sys/kdebug.h>
85	#include <mach/mach_time.h>
86	#include <mach/sdt.h>
87
88	#include <machine/endian.h>
89	#include <dev/random/randomdev.h>
90
91	#include <kern/queue.h>
92	#include <kern/locks.h>
93	#include <libkern/OSAtomic.h>
94
95	#include <pexpert/pexpert.h>
96
97	#include <net/if.h>
98	#include <net/if_var.h>
99	#include <net/if_dl.h>
100	#include <net/route.h>
101	#include <net/kpi_protocol.h>
102	#include <net/ntstat.h>
103	#include <net/dlil.h>
104	#include <net/classq/classq.h>
105	#include <net/net_perf.h>
106	#include <net/init.h>
107	#if PF
108	#include <net/pfvar.h>
109	#endif /* PF */
110	#include <net/if_ports_used.h>
111
112	#include <netinet/in.h>
113	#include <netinet/in_systm.h>
114	#include <netinet/in_var.h>
115	#include <netinet/in_arp.h>
116	#include <netinet/ip.h>
117	#include <netinet/in_pcb.h>
118	#include <netinet/ip_var.h>
119	#include <netinet/ip_icmp.h>
120	#include <netinet/kpi_ipfilter_var.h>
121	#include <netinet/udp.h>
122	#include <netinet/udp_var.h>
123	#include <netinet/bootp.h>
124
125	#if DUMMYNET
126	#include <netinet/ip_dummynet.h>
127	#endif /* DUMMYNET */
128
129	#if IPSEC
130	#include <netinet6/ipsec.h>
131	#include <netkey/key.h>
132	#endif /* IPSEC */
133
134	#include <net/sockaddr_utils.h>
135
136	#include <os/log.h>
137
138	#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 0)
139	#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 2)
140	#define DBG_FNC_IP_INPUT NETDBG_CODE(DBG_NETIP, (2 << 8))
141
142	#if IPSEC
143	extern int ipsec_bypass;
144	#endif /* IPSEC */
145
146	MBUFQ_HEAD(fq_head);
147
148	static int frag_timeout_run; / frag timer is scheduled to run /
149	static void frag_timeout(void *);
150	static void frag_sched_timeout(void);
151
152	static struct ipq ipq_alloc(void*);
153	static void ipq_free(struct ipq *);
154	static void ipq_updateparams(void);
155	static void ip_input_second_pass(struct mbuf , struct* ifnet *,
156	int, int, struct ip_fw_in_args *);
157
158	static LCK_GRP_DECLARE(ipqlock_grp, "ipqlock");
159	static LCK_MTX_DECLARE(ipqlock, &ipqlock_grp);
160
161
/*
 * Packet reassembly stuff.
 *
 * IPREASS_HASH(x, y) builds an 8-bit value from the low nibble of x and
 * the low nibble of (x >> 8), XORs it with y, and masks the result down
 * to a bucket index in [0, IPREASS_NHASH).
 */
#define IPREASS_NHASH_LOG2      6
#define IPREASS_NHASH           (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK           (IPREASS_NHASH - 1)
#define IPREASS_HASH(x, y) \
	(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
168
169	/ IP fragment reassembly queues (protected by ipqlock) /
170	static TAILQ_HEAD(ipqhead, ipq) ipq[IPREASS_NHASH]; / ip reassembly queues /
171	static int maxnipq; / max packets in reass queues /
172	static u_int32_t maxfragsperpacket; / max frags/packet in reass queues /
173	static u_int32_t nipq; / # of packets in reass queues /
174	static u_int32_t ipq_limit; / ipq allocation limit /
175	static u_int32_t ipq_count; / current # of allocated ipq's /
176
177	static int sysctl_ipforwarding SYSCTL_HANDLER_ARGS;
178	static int sysctl_maxnipq SYSCTL_HANDLER_ARGS;
179	static int sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS;
180
181	#if (DEBUG \|\| DEVELOPMENT)
182	static int sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS;
183	static int sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS;
184	static int sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS;
185	#endif /* (DEBUG \|\| DEVELOPMENT) */
186
187	int ipforwarding = `0`;
188	SYSCTL_PROC(_net_inet_ip, IPCTL_FORWARDING, forwarding,
189	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, &ipforwarding, `0`,
190	sysctl_ipforwarding, "I", "Enable IP forwarding between interfaces");
191
192	static int ipsendredirects = `1`; / XXX /
193	SYSCTL_INT(_net_inet_ip, IPCTL_SENDREDIRECTS, redirect,
194	CTLFLAG_RW \| CTLFLAG_LOCKED, &ipsendredirects, `0`,
195	"Enable sending IP redirects");
196
197	int ip_defttl = IPDEFTTL;
198	SYSCTL_INT(_net_inet_ip, IPCTL_DEFTTL, ttl, CTLFLAG_RW \| CTLFLAG_LOCKED,
199	&ip_defttl, `0`, "Maximum TTL on IP packets");
200
201	static int ip_dosourceroute = `0`;
202	SYSCTL_INT(_net_inet_ip, IPCTL_SOURCEROUTE, sourceroute,
203	CTLFLAG_RW \| CTLFLAG_LOCKED, &ip_dosourceroute, `0`,
204	"Enable forwarding source routed IP packets");
205
206	static int ip_acceptsourceroute = `0`;
207	SYSCTL_INT(_net_inet_ip, IPCTL_ACCEPTSOURCEROUTE, accept_sourceroute,
208	CTLFLAG_RW \| CTLFLAG_LOCKED, &ip_acceptsourceroute, `0`,
209	"Enable accepting source routed IP packets");
210
211	static int ip_sendsourcequench = `0`;
212	SYSCTL_INT(_net_inet_ip, OID_AUTO, sendsourcequench,
213	CTLFLAG_RW \| CTLFLAG_LOCKED, &ip_sendsourcequench, `0`,
214	"Enable the transmission of source quench packets");
215
216	SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets,
217	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, &maxnipq, `0`, sysctl_maxnipq,
218	"I", "Maximum number of IPv4 fragment reassembly queue entries");
219
220	SYSCTL_UINT(_net_inet_ip, OID_AUTO, fragpackets, CTLFLAG_RD \| CTLFLAG_LOCKED,
221	&nipq, `0`, "Current number of IPv4 fragment reassembly queue entries");
222
223	SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragsperpacket,
224	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED, &maxfragsperpacket, `0`,
225	sysctl_maxfragsperpacket, "I",
226	"Maximum number of IPv4 fragments allowed per packet");
227
228	static uint32_t ip_adj_clear_hwcksum = `0`;
229	SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_clear_hwcksum,
230	CTLFLAG_RW \| CTLFLAG_LOCKED, &ip_adj_clear_hwcksum, `0`,
231	"Invalidate hwcksum info when adjusting length");
232
233	static uint32_t ip_adj_partial_sum = `1`;
234	SYSCTL_UINT(_net_inet_ip, OID_AUTO, adj_partial_sum,
235	CTLFLAG_RW \| CTLFLAG_LOCKED, &ip_adj_partial_sum, `0`,
236	"Perform partial sum adjustment of trailing bytes at IP layer");
237
/*
 * ip_checkinterface controls the receive side of the models for multihoming
 * that are discussed in RFC 1122.
 *
 * ip_checkinterface values are:
 *  IP_CHECKINTERFACE_WEAK_ES:
 *	This corresponds to the Weak End-System model where incoming packets from
 *	any interface are accepted provided the destination address of the incoming packet
 *	is assigned to some interface.
 *
 *  IP_CHECKINTERFACE_HYBRID_ES:
 *	The Hybrid End-System model uses the Strong End-System model for tunnel
 *	interfaces (ipsec and utun) and the Weak End-System model for other
 *	interface families.
 *	This prevents a rogue middle box from probing for signs of TCP connections
 *	that use the tunnel interface.
 *
 *  IP_CHECKINTERFACE_STRONG_ES:
 *	The Strong End-System model requires that the packet arrive on an interface
 *	that is assigned the destination address of the packet.
 *
 * Since the routing table and transmit implementation do not implement the Strong ES model,
 * setting this to a value different from IP_CHECKINTERFACE_WEAK_ES may lead to unexpected results.
 *
 * When forwarding is enabled, the system reverts to the Weak ES model as a router
 * is expected by design to receive packets from several interfaces to the same address.
 *
 * XXX - ip_checkinterface currently must be set to IP_CHECKINTERFACE_WEAK_ES if you use ipnat
 * to translate the destination address to another local interface.
 *
 * XXX - ip_checkinterface must be set to IP_CHECKINTERFACE_WEAK_ES if you add IP aliases
 * to the loopback interface instead of the interface where the
 * packets for those addresses are received.
 */
271	#define IP_CHECKINTERFACE_WEAK_ES 0
272	#define IP_CHECKINTERFACE_HYBRID_ES 1
273	#define IP_CHECKINTERFACE_STRONG_ES 2
274
275	static int ip_checkinterface = IP_CHECKINTERFACE_HYBRID_ES;
276
277	static int sysctl_ip_checkinterface SYSCTL_HANDLER_ARGS;
278	SYSCTL_PROC(_net_inet_ip, OID_AUTO, check_interface,
279	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED,
280	`0`, `0`, sysctl_ip_checkinterface, "I", "Verify packet arrives on correct interface");
281
282	#if (DEBUG \|\| DEVELOPMENT)
283	#define IP_CHECK_IF_DEBUG 1
284	#else
285	#define IP_CHECK_IF_DEBUG 0
286	#endif /* (DEBUG \|\| DEVELOPMENT) */
287	static int ip_checkinterface_debug = IP_CHECK_IF_DEBUG;
288	SYSCTL_INT(_net_inet_ip, OID_AUTO, checkinterface_debug, CTLFLAG_RW \| CTLFLAG_LOCKED,
289	&ip_checkinterface_debug, IP_CHECK_IF_DEBUG, "");
290
291	static int ip_chaining = `1`;
292	SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chaining, CTLFLAG_RW \| CTLFLAG_LOCKED,
293	&ip_chaining, `1`, "Do receive side ip address based chaining");
294
295	static int ip_chainsz = `6`;
296	SYSCTL_INT(_net_inet_ip, OID_AUTO, rx_chainsz, CTLFLAG_RW \| CTLFLAG_LOCKED,
297	&ip_chainsz, `1`, "IP receive side max chaining");
298
299	#if (DEBUG \|\| DEVELOPMENT)
300	static int ip_input_measure = `0`;
301	SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf,
302	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED,
303	&ip_input_measure, `0`, sysctl_reset_ip_input_stats, "I", "Do time measurement");
304
305	static uint64_t ip_input_measure_bins = `0`;
306	SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_bins,
307	CTLTYPE_QUAD \| CTLFLAG_RW \| CTLFLAG_LOCKED, &ip_input_measure_bins, `0`,
308	sysctl_ip_input_measure_bins, "I",
309	"bins for chaining performance data histogram");
310
311	static net_perf_t net_perf;
312	SYSCTL_PROC(_net_inet_ip, OID_AUTO, input_perf_data,
313	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
314	`0`, `0`, sysctl_ip_input_getperf, "S,net_perf",
315	"IP input performance data (struct net_perf, net/net_perf.h)");
316	#endif /* (DEBUG \|\| DEVELOPMENT) */
317
318	#if DIAGNOSTIC
319	static int ipprintfs = `0`;
320	#endif
321
322	struct protosw *ip_protox[IPPROTO_MAX];
323
324	static LCK_GRP_DECLARE(in_ifaddr_rwlock_grp, "in_ifaddr_rwlock");
325	LCK_RW_DECLARE(in_ifaddr_rwlock, &in_ifaddr_rwlock_grp);
326
327	/ Protected by in_ifaddr_rwlock /
328	struct in_ifaddrhead in_ifaddrhead; / first inet address /
329	struct in_ifaddrhashhead in_ifaddrhashtbl; /* inet addr hash table /
330
331	#define INADDR_NHASH 61
332	static uint32_t inaddr_nhash; / hash table size /
333	static uint32_t inaddr_hashp; / next largest prime /
334
335	static int ip_getstat SYSCTL_HANDLER_ARGS;
336	struct ipstat ipstat;
337	SYSCTL_PROC(_net_inet_ip, IPCTL_STATS, stats,
338	CTLTYPE_STRUCT \| CTLFLAG_RD \| CTLFLAG_LOCKED,
339	`0`, `0`, ip_getstat, "S,ipstat",
340	"IP statistics (struct ipstat, netinet/ip_var.h)");
341
342	#if IPCTL_DEFMTU
343	SYSCTL_INT(_net_inet_ip, IPCTL_DEFMTU, mtu, CTLFLAG_RW \| CTLFLAG_LOCKED,
344	&ip_mtu, `0`, "Default MTU");
345	#endif /* IPCTL_DEFMTU */
346
347	#if IPSTEALTH
348	static int ipstealth = `0`;
349	SYSCTL_INT(_net_inet_ip, OID_AUTO, stealth, CTLFLAG_RW \| CTLFLAG_LOCKED,
350	&ipstealth, `0`, "");
351	#endif /* IPSTEALTH */
352
353	#if DUMMYNET
354	ip_dn_io_t *ip_dn_io_ptr;
355	#endif /* DUMMYNET */
356
357	SYSCTL_NODE(_net_inet_ip, OID_AUTO, linklocal,
358	CTLFLAG_RW \| CTLFLAG_LOCKED, `0`, "link local");
359
360	struct ip_linklocal_stat ip_linklocal_stat;
361	SYSCTL_STRUCT(_net_inet_ip_linklocal, OID_AUTO, stat,
362	CTLFLAG_RD \| CTLFLAG_LOCKED, &ip_linklocal_stat, ip_linklocal_stat,
363	"Number of link local packets with TTL less than 255");
364
365	SYSCTL_NODE(_net_inet_ip_linklocal, OID_AUTO, in,
366	CTLFLAG_RW \| CTLFLAG_LOCKED, `0`, "link local input");
367
368	int ip_linklocal_in_allowbadttl = `1`;
369	SYSCTL_INT(_net_inet_ip_linklocal_in, OID_AUTO, allowbadttl,
370	CTLFLAG_RW \| CTLFLAG_LOCKED, &ip_linklocal_in_allowbadttl, `0`,
371	"Allow incoming link local packets with TTL less than 255");
372
373
374	/*
375	* We need to save the IP options in case a protocol wants to respond
376	* to an incoming packet over the same route if the packet got here
377	* using IP source routing. This allows connection establishment and
378	* maintenance when the remote end is on a network that is not known
379	* to us.
380	*/
381	static int ip_nhops = `0`;
382	static struct ip_srcrt {
383	struct in_addr dst; / final destination /
384	char nop; / one NOP to align /
385	char srcopt[IPOPT_OFFSET + `1`]; / OPTVAL, OLEN and OFFSET /
386	struct in_addr route[MAX_IPOPTLEN / sizeof(struct in_addr)];
387	} ip_srcrt;
388
389	static void in_ifaddrhashtbl_init(void);
390	static void save_rte(u_char , struct* in_addr);
391	static int ip_dooptions(struct mbuf , int, struct* sockaddr_in *);
392	static void ip_forward(struct mbuf , int, struct* sockaddr_in *);
393	static void frag_freef(struct ipqhead , struct* ipq *);
394	static struct mbuf ip_reass(struct* mbuf *);
395	static void ip_fwd_route_copyout(struct ifnet , struct* route *);
396	static void ip_fwd_route_copyin(struct ifnet , struct* route *);
397	static inline u_short ip_cksum(struct mbuf , int*);
398
/*
 * On platforms which require strict alignment (currently for anything but
 * i386 or x86_64 or arm64), check if the IP header pointer is 32-bit aligned; if not,
 * copy the contents of the mbuf chain into a new chain, and free the original
 * one.  Create some head room in the first mbuf of the new chain, in case
 * it's needed later on.
 *
 * _m      mbuf pointer lvalue; replaced by the new chain on success and
 *         set to NULL (after the original is freed) on allocation failure.
 * _ifp    interface whose if_alignerrs counter is bumped on a misaligned header.
 * _action statement executed only on the allocation-failure path.
 *
 * On i386/x86_64/arm64 the macro expands to nothing and none of its
 * arguments are evaluated.
 */
#if defined(__i386__) || defined(__x86_64__) || defined(__arm64__)
#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do { } while (0)
#else /* !__i386__ && !__x86_64__ && !__arm64__ */
#define IP_HDR_ALIGNMENT_FIXUP(_m, _ifp, _action) do {                  \
	if (!IP_HDR_ALIGNED_P(mtod(_m, caddr_t))) {                     \
	        struct mbuf *_n;                                        \
	        struct ifnet *__ifp = (_ifp);                           \
	        os_atomic_inc(&(__ifp)->if_alignerrs, relaxed);         \
	        /* drop the cached header pointer before copying */    \
	        if (((_m)->m_flags & M_PKTHDR) &&                       \
	            (_m)->m_pkthdr.pkt_hdr != NULL)                     \
	                (_m)->m_pkthdr.pkt_hdr = NULL;                  \
	        _n = m_defrag_offset(_m, max_linkhdr, M_NOWAIT);        \
	        if (_n == NULL) {                                       \
	                os_atomic_inc(&ipstat.ips_toosmall, relaxed);   \
	                m_freem(_m);                                    \
	                (_m) = NULL;                                    \
	                _action;                                        \
	        } else {                                                \
	                VERIFY(_n != (_m));                             \
	                (_m) = _n;                                      \
	        }                                                       \
	}                                                               \
} while (0)
#endif /* !__i386__ && !__x86_64__ && !__arm64__ */
430
431
/* Result codes returned by ip_input_check_interface(). */
typedef enum ip_check_if_result {
	IP_CHECK_IF_NONE = 0,
	IP_CHECK_IF_OURS = 1,
	IP_CHECK_IF_DROP = 2,
	IP_CHECK_IF_FORWARD = 3
} ip_check_if_result_t;
438
439	static ip_check_if_result_t ip_input_check_interface(struct mbuf , struct** ip , struct* ifnet *);
440
441	/*
442	* GRE input handler function, settable via ip_gre_register_input() for PPTP.
443	*/
444	static gre_input_func_t gre_input_func;
445
446	static void
447	ip_init_delayed(void)
448	{
449	struct ifreq ifr;
450	int error;
451	struct sockaddr_in *sin;
452
453	bzero(s: &ifr, n: sizeof(ifr));
454	strlcpy(dst: ifr.ifr_name, src: "lo0", n: sizeof(ifr.ifr_name));
455	sin = SIN(&ifr.ifr_addr);
456	sin->sin_len = sizeof(struct sockaddr_in);
457	sin->sin_family = AF_INET;
458	sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
459	error = in_control(NULL, SIOCSIFADDR, (caddr_t)&ifr, lo_ifp, kernproc);
460	if (error) {
461	printf("%s: failed to initialise lo0's address, error=%d\n",
462	__func__, error);
463	}
464	}
465
466	/*
467	* IP initialization: fill in IP protocol switch table.
468	* All protocols not implemented in kernel go to raw IP protocol handler.
469	*/
470	void
471	ip_init(struct protosw pp, struct* domain *dp)
472	{
473	static int ip_initialized = `0`;
474	struct protosw *pr;
475	struct timeval tv;
476	int i;
477
478	domain_proto_mtx_lock_assert_held();
479	VERIFY((pp->pr_flags & (PR_INITIALIZED \| PR_ATTACHED)) == PR_ATTACHED);
480
481	/*
482	* Some ioctls (e.g. SIOCAIFADDR) use ifaliasreq struct, which is
483	* interchangeable with in_aliasreq; they must have the same size.
484	*/
485	_CASSERT(sizeof(struct ifaliasreq) == sizeof(struct in_aliasreq));
486
487	if (ip_initialized) {
488	return;
489	}
490	ip_initialized = `1`;
491
492	TAILQ_INIT(&in_ifaddrhead);
493	in_ifaddrhashtbl_init();
494
495	ip_moptions_init();
496
497	pr = pffindproto_locked(PF_INET, IPPROTO_RAW, SOCK_RAW);
498	if (pr == NULL) {
499	panic("%s: Unable to find [PF_INET,IPPROTO_RAW,SOCK_RAW]",
500	__func__);
501	/ NOTREACHED /
502	}
503
504	/ Initialize the entire ip_protox[] array to IPPROTO_RAW. /
505	for (i = `0`; i < IPPROTO_MAX; i++) {
506	ip_protox[i] = pr;
507	}
508	/*
509	* Cycle through IP protocols and put them into the appropriate place
510	* in ip_protox[], skipping protocols IPPROTO_{IP,RAW}.
511	*/
512	VERIFY(dp == inetdomain && dp->dom_family == PF_INET);
513	TAILQ_FOREACH(pr, &dp->dom_protosw, pr_entry) {
514	VERIFY(pr->pr_domain == dp);
515	if (pr->pr_protocol != `0` && pr->pr_protocol != IPPROTO_RAW) {
516	/ Be careful to only index valid IP protocols. /
517	if (pr->pr_protocol < IPPROTO_MAX) {
518	ip_protox[pr->pr_protocol] = pr;
519	}
520	}
521	}
522
523	lck_mtx_lock(lck: &ipqlock);
524	/ Initialize IP reassembly queue. /
525	for (i = `0`; i < IPREASS_NHASH; i++) {
526	TAILQ_INIT(&ipq[i]);
527	}
528
529	maxnipq = nmbclusters / `32`;
530	maxfragsperpacket = `128`; / enough for 64k in 512 byte fragments /
531	ipq_updateparams();
532	lck_mtx_unlock(lck: &ipqlock);
533
534	getmicrotime(&tv);
535	ip_id = (u_short)(RandomULong() ^ tv.tv_usec);
536
537	PE_parse_boot_argn(arg_string: "ip_checkinterface", arg_ptr: &i, max_arg: sizeof(i));
538	switch (i) {
539	case IP_CHECKINTERFACE_WEAK_ES:
540	case IP_CHECKINTERFACE_HYBRID_ES:
541	case IP_CHECKINTERFACE_STRONG_ES:
542	ip_checkinterface = i;
543	break;
544	default:
545	break;
546	}
547
548	arp_init();
549	net_init_add(init_func: ip_init_delayed);
550	}
551
552	/*
553	* Initialize IPv4 source address hash table.
554	*/
555	static void
556	in_ifaddrhashtbl_init(void)
557	{
558	int i, k, p;
559
560	if (in_ifaddrhashtbl != NULL) {
561	return;
562	}
563
564	PE_parse_boot_argn(arg_string: "inaddr_nhash", arg_ptr: &inaddr_nhash,
565	max_arg: sizeof(inaddr_nhash));
566	if (inaddr_nhash == `0`) {
567	inaddr_nhash = INADDR_NHASH;
568	}
569
570	in_ifaddrhashtbl = zalloc_permanent(
571	inaddr_nhash * sizeof(*in_ifaddrhashtbl),
572	ZALIGN_PTR);
573
574	/*
575	* Generate the next largest prime greater than inaddr_nhash.
576	*/
577	k = (inaddr_nhash % `2` == `0`) ? inaddr_nhash + `1` : inaddr_nhash + `2`;
578	for (;;) {
579	p = `1`;
580	for (i = `3`; i * i <= k; i += `2`) {
581	if (k % i == `0`) {
582	p = `0`;
583	}
584	}
585	if (p == `1`) {
586	break;
587	}
588	k += `2`;
589	}
590	inaddr_hashp = k;
591	}
592
593	uint32_t
594	inaddr_hashval(uint32_t key)
595	{
596	/*
597	* The hash index is the computed prime times the key modulo
598	* the hash size, as documented in "Introduction to Algorithms"
599	* (Cormen, Leiserson, Rivest).
600	*/
601	if (inaddr_nhash > `1`) {
602	return (key * inaddr_hashp) % inaddr_nhash;
603	} else {
604	return `0`;
605	}
606	}
607
608	struct in_ifaddrhashhead *
609	inaddr_hashlookup(uint32_t key)
610	{
611	return &in_ifaddrhashtbl[inaddr_hashval(key)];
612	}
613
614	__private_extern__ void
615	ip_proto_dispatch_in(struct mbuf m, int* hlen, u_int8_t proto,
616	ipfilter_t inject_ipfref)
617	{
618	struct ipfilter *filter;
619	int seen = (inject_ipfref == NULL);
620	int changed_header = `0`;
621	struct ip *ip;
622	void (pr_input)(struct* mbuf , int* len);
623
624	if (!TAILQ_EMPTY(&ipv4_filters)) {
625	ipf_ref();
626	TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
627	if (seen == `0`) {
628	if ((struct ipfilter *)inject_ipfref == filter) {
629	seen = `1`;
630	}
631	} else if (filter->ipf_filter.ipf_input) {
632	errno_t result;
633
634	if (changed_header == `0`) {
635	/*
636	* Perform IP header alignment fixup,
637	* if needed, before passing packet
638	* into filter(s).
639	*/
640	IP_HDR_ALIGNMENT_FIXUP(m,
641	m->m_pkthdr.rcvif, ipf_unref());
642
643	/ ipf_unref() already called /
644	if (m == NULL) {
645	return;
646	}
647
648	changed_header = `1`;
649	ip = mtod(m, struct ip *);
650	ip->ip_len = htons(ip->ip_len + (uint16_t)hlen);
651	ip->ip_off = htons(ip->ip_off);
652	ip->ip_sum = `0`;
653	ip->ip_sum = ip_cksum_hdr_in(m, hlen);
654	}
655	result = filter->ipf_filter.ipf_input(
656	filter->ipf_filter.cookie, (mbuf_t *)&m,
657	hlen, proto);
658	if (result == EJUSTRETURN) {
659	ipf_unref();
660	return;
661	}
662	if (result != `0`) {
663	ipf_unref();
664	m_freem(m);
665	return;
666	}
667	}
668	}
669	ipf_unref();
670	}
671
672	/ Perform IP header alignment fixup (post-filters), if needed /
673	IP_HDR_ALIGNMENT_FIXUP(m, m->m_pkthdr.rcvif, return );
674
675	ip = mtod(m, struct ip *);
676
677	if (changed_header) {
678	ip->ip_len = ntohs(ip->ip_len) - (u_short)hlen;
679	ip->ip_off = ntohs(ip->ip_off);
680	}
681
682	/*
683	* If there isn't a specific lock for the protocol
684	* we're about to call, use the generic lock for AF_INET.
685	* otherwise let the protocol deal with its own locking
686	*/
687	if ((pr_input = ip_protox[ip->ip_p]->pr_input) == NULL) {
688	m_freem(m);
689	} else if (!(ip_protox[ip->ip_p]->pr_flags & PR_PROTOLOCK)) {
690	lck_mtx_lock(lck: inet_domain_mutex);
691	pr_input(m, hlen);
692	lck_mtx_unlock(lck: inet_domain_mutex);
693	} else {
694	pr_input(m, hlen);
695	}
696	}
697
/*
 * One slot of the receive-side chaining table: a packet chain plus the
 * (source, destination, protocol) triple its members share and running
 * totals for the chain.  Filled in by ip_chain_insert().
 */
struct pktchain_elm {
	struct mbuf     *pkte_head;     /* first packet of the chain */
	struct mbuf     *pkte_tail;     /* last packet of the chain */
	struct in_addr  pkte_saddr;     /* source address of the chain */
	struct in_addr  pkte_daddr;     /* destination address of the chain */
	uint16_t        pkte_npkts;     /* number of packets chained */
	uint16_t        pkte_proto;     /* IP protocol of the chain */
	uint32_t        pkte_nbytes;    /* sum of m_pkthdr.len over the chain */
};

typedef struct pktchain_elm pktchain_elm_t;

/* Store up to PKTTBL_SZ unique flows on the stack */
#define PKTTBL_SZ       7
712
713	static struct mbuf *
714	ip_chain_insert(struct mbuf packet, pktchain_elm_t tbl)
715	{
716	struct ip* ip;
717	int pkttbl_idx = `0`;
718
719	ip = mtod(packet, struct ip*);
720
721	/ reusing the hash function from inaddr_hashval /
722	pkttbl_idx = inaddr_hashval(ntohl(ip->ip_src.s_addr)) % PKTTBL_SZ;
723	if (tbl[pkttbl_idx].pkte_head == NULL) {
724	tbl[pkttbl_idx].pkte_head = packet;
725	tbl[pkttbl_idx].pkte_saddr.s_addr = ip->ip_src.s_addr;
726	tbl[pkttbl_idx].pkte_daddr.s_addr = ip->ip_dst.s_addr;
727	tbl[pkttbl_idx].pkte_proto = ip->ip_p;
728	} else {
729	if ((ip->ip_dst.s_addr == tbl[pkttbl_idx].pkte_daddr.s_addr) &&
730	(ip->ip_src.s_addr == tbl[pkttbl_idx].pkte_saddr.s_addr) &&
731	(ip->ip_p == tbl[pkttbl_idx].pkte_proto)) {
732	} else {
733	return packet;
734	}
735	}
736	if (tbl[pkttbl_idx].pkte_tail != NULL) {
737	mbuf_setnextpkt(mbuf: tbl[pkttbl_idx].pkte_tail, nextpkt: packet);
738	}
739
740	tbl[pkttbl_idx].pkte_tail = packet;
741	tbl[pkttbl_idx].pkte_npkts += `1`;
742	tbl[pkttbl_idx].pkte_nbytes += packet->m_pkthdr.len;
743	return NULL;
744	}
745
746	/ args is a dummy variable here for backward compatibility /
747	static void
748	ip_input_second_pass_loop_tbl(pktchain_elm_t tbl, struct* ip_fw_in_args *args)
749	{
750	int i = `0`;
751
752	for (i = `0`; i < PKTTBL_SZ; i++) {
753	if (tbl[i].pkte_head != NULL) {
754	struct mbuf *m = tbl[i].pkte_head;
755	ip_input_second_pass(m, m->m_pkthdr.rcvif,
756	tbl[i].pkte_npkts, tbl[i].pkte_nbytes, args);
757
758	if (tbl[i].pkte_npkts > `2`) {
759	ipstat.ips_rxc_chainsz_gt2++;
760	}
761	if (tbl[i].pkte_npkts > `4`) {
762	ipstat.ips_rxc_chainsz_gt4++;
763	}
764	#if (DEBUG \|\| DEVELOPMENT)
765	if (ip_input_measure) {
766	net_perf_histogram(&net_perf, tbl[i].pkte_npkts);
767	}
768	#endif /* (DEBUG \|\| DEVELOPMENT) */
769	tbl[i].pkte_head = tbl[i].pkte_tail = NULL;
770	tbl[i].pkte_npkts = `0`;
771	tbl[i].pkte_nbytes = `0`;
772	/ no need to initialize address and protocol in tbl /
773	}
774	}
775	}
776
777	static void
778	ip_input_cpout_args(struct ip_fw_in_args args, struct* ip_fw_args *args1,
779	boolean_t *done_init)
780	{
781	if (*done_init == FALSE) {
782	bzero(s: args1, n: sizeof(struct ip_fw_args));
783	*done_init = TRUE;
784	}
785	args1->fwa_pf_rule = args->fwai_pf_rule;
786	}
787
788	static void
789	ip_input_cpin_args(struct ip_fw_args args1, struct* ip_fw_in_args *args)
790	{
791	args->fwai_pf_rule = args1->fwa_pf_rule;
792	}
793
/* Return codes of ip_input_first_pass(); values are consecutive from 0. */
typedef enum {
	IPINPUT_DOCHAIN = 0,
	IPINPUT_DONTCHAIN,
	IPINPUT_FREED,
	IPINPUT_DONE
} ipinput_chain_ret_t;
800
801	static void
802	ip_input_update_nstat(struct ifnet ifp, struct* in_addr src_ip,
803	u_int32_t packets, u_int32_t bytes)
804	{
805	if (nstat_collect) {
806	struct rtentry *rt = ifnet_cached_rtlookup_inet(ifp,
807	src_ip);
808	if (rt != NULL) {
809	nstat_route_rx(rte: rt, packets, bytes, flags: `0`);
810	rtfree(rt);
811	}
812	}
813	}
814
815	static void
816	ip_input_dispatch_chain(struct mbuf *m)
817	{
818	struct mbuf *tmp_mbuf = m;
819	struct mbuf *nxt_mbuf = NULL;
820	struct ip *ip = NULL;
821	unsigned int hlen;
822
823	ip = mtod(tmp_mbuf, struct ip *);
824	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
825	while (tmp_mbuf != NULL) {
826	nxt_mbuf = mbuf_nextpkt(mbuf: tmp_mbuf);
827	mbuf_setnextpkt(mbuf: tmp_mbuf, NULL);
828	ip_proto_dispatch_in(m: tmp_mbuf, hlen, proto: ip->ip_p, inject_ipfref: `0`);
829	tmp_mbuf = nxt_mbuf;
830	if (tmp_mbuf) {
831	ip = mtod(tmp_mbuf, struct ip *);
832	/ first mbuf of chain already has adjusted ip_len /
833	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
834	ip->ip_len -= hlen;
835	}
836	}
837	}
838
839	static void
840	ip_input_setdst_chain(struct mbuf m, uint16_t ifindex, struct* in_ifaddr *ia)
841	{
842	struct mbuf *tmp_mbuf = m;
843
844	while (tmp_mbuf != NULL) {
845	ip_setdstifaddr_info(tmp_mbuf, ifindex, ia);
846	tmp_mbuf = mbuf_nextpkt(mbuf: tmp_mbuf);
847	}
848	}
849
850	static void
851	ip_input_adjust(struct mbuf m, struct* ip ip, struct* ifnet *inifp)
852	{
853	boolean_t adjust = TRUE;
854
855	ASSERT(m_pktlen(m) > ip->ip_len);
856
857	/*
858	* Invalidate hardware checksum info if ip_adj_clear_hwcksum
859	* is set; useful to handle buggy drivers. Note that this
860	* should not be enabled by default, as we may get here due
861	* to link-layer padding.
862	*/
863	if (ip_adj_clear_hwcksum &&
864	(m->m_pkthdr.csum_flags & CSUM_DATA_VALID) &&
865	!(inifp->if_flags & IFF_LOOPBACK) &&
866	!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
867	m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
868	m->m_pkthdr.csum_data = `0`;
869	ipstat.ips_adj_hwcsum_clr++;
870	}
871
872	/*
873	* If partial checksum information is available, subtract
874	* out the partial sum of postpended extraneous bytes, and
875	* update the checksum metadata accordingly. By doing it
876	* here, the upper layer transport only needs to adjust any
877	* prepended extraneous bytes (else it will do both.)
878	*/
879	if (ip_adj_partial_sum &&
880	(m->m_pkthdr.csum_flags & (CSUM_DATA_VALID \| CSUM_PARTIAL)) ==
881	(CSUM_DATA_VALID \| CSUM_PARTIAL)) {
882	m->m_pkthdr.csum_rx_val = m_adj_sum16(m,
883	m->m_pkthdr.csum_rx_start, m->m_pkthdr.csum_rx_start,
884	(ip->ip_len - m->m_pkthdr.csum_rx_start),
885	m->m_pkthdr.csum_rx_val);
886	} else if ((m->m_pkthdr.csum_flags &
887	(CSUM_DATA_VALID \| CSUM_PARTIAL)) ==
888	(CSUM_DATA_VALID \| CSUM_PARTIAL)) {
889	/*
890	* If packet has partial checksum info and we decided not
891	* to subtract the partial sum of postpended extraneous
892	* bytes here (not the default case), leave that work to
893	* be handled by the other layers. For now, only TCP, UDP
894	* layers are capable of dealing with this. For all other
895	* protocols (including fragments), trim and ditch the
896	* partial sum as those layers might not implement partial
897	* checksumming (or adjustment) at all.
898	*/
899	if ((ip->ip_off & (IP_MF \| IP_OFFMASK)) == `0` &&
900	(ip->ip_p == IPPROTO_TCP \|\| ip->ip_p == IPPROTO_UDP)) {
901	adjust = FALSE;
902	} else {
903	m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
904	m->m_pkthdr.csum_data = `0`;
905	ipstat.ips_adj_hwcsum_clr++;
906	}
907	}
908
909	if (adjust) {
910	ipstat.ips_adj++;
911	if (m->m_len == m->m_pkthdr.len) {
912	m->m_len = ip->ip_len;
913	m->m_pkthdr.len = ip->ip_len;
914	} else {
915	m_adj(m, ip->ip_len - m->m_pkthdr.len);
916	}
917	}
918	}
919
920	/*
921	* First pass does all essential packet validation and places on a per flow
922	* queue for doing operations that have same outcome for all packets of a flow.
923	*/
924	static ipinput_chain_ret_t
925	ip_input_first_pass(struct mbuf m, struct* ip_fw_in_args args, struct* mbuf **modm)
926	{
927	struct ip *ip;
928	struct ifnet *inifp;
929	unsigned int hlen;
930	int retval = IPINPUT_DOCHAIN;
931	int len = `0`;
932	struct in_addr src_ip;
933	#if DUMMYNET
934	struct m_tag *copy;
935	struct m_tag *p;
936	boolean_t delete = FALSE;
937	struct ip_fw_args args1;
938	boolean_t init = FALSE;
939	#endif /* DUMMYNET */
940	ipfilter_t inject_filter_ref = NULL;
941
942	/ Check if the mbuf is still valid after interface filter processing /
943	MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
944	inifp = mbuf_pkthdr_rcvif(mbuf: m);
945	VERIFY(inifp != NULL);
946
947	/ Perform IP header alignment fixup, if needed /
948	IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);
949
950	m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
951
952	#if DUMMYNET
953	/*
954	* Don't bother searching for tag(s) if there's none.
955	*/
956	if (SLIST_EMPTY(&m->m_pkthdr.tags)) {
957	goto ipfw_tags_done;
958	}
959
960	/ Grab info from mtags prepended to the chain /
961	p = m_tag_first(m);
962	while (p) {
963	if (p->m_tag_id == KERNEL_MODULE_TAG_ID) {
964	if (p->m_tag_type == KERNEL_TAG_TYPE_DUMMYNET) {
965	struct dn_pkt_tag *dn_tag;
966
967	dn_tag = (struct dn_pkt_tag *)(p->m_tag_data);
968	args->fwai_pf_rule = dn_tag->dn_pf_rule;
969	delete = TRUE;
970	}
971
972	if (delete) {
973	copy = p;
974	p = m_tag_next(m, p);
975	m_tag_delete(m, copy);
976	} else {
977	p = m_tag_next(m, p);
978	}
979	} else {
980	p = m_tag_next(m, p);
981	}
982	}
983
984	#if DIAGNOSTIC
985	if (m == NULL \|\| !(m->m_flags & M_PKTHDR)) {
986	panic("ip_input no HDR");
987	}
988	#endif
989
990	if (args->fwai_pf_rule) {
991	/ dummynet already filtered us /
992	ip = mtod(m, struct ip *);
993	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
994	inject_filter_ref = ipf_get_inject_filter(m);
995	if (args->fwai_pf_rule) {
996	goto check_with_pf;
997	}
998	}
999	ipfw_tags_done:
1000	#endif /* DUMMYNET */
1001
1002	/*
1003	* No need to process packet twice if we've already seen it.
1004	*/
1005	if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1006	inject_filter_ref = ipf_get_inject_filter(m);
1007	}
1008	if (inject_filter_ref != NULL) {
1009	ip = mtod(m, struct ip *);
1010	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
1011
1012	DTRACE_IP6(receive, struct mbuf , m, struct* inpcb *, NULL,
1013	struct ip , ip, struct* ifnet *, inifp,
1014	struct ip , ip, struct* ip6_hdr *, NULL);
1015
1016	ip->ip_len = ntohs(ip->ip_len) - (u_short)hlen;
1017	ip->ip_off = ntohs(ip->ip_off);
1018	ip_proto_dispatch_in(m, hlen, proto: ip->ip_p, inject_ipfref: inject_filter_ref);
1019	return IPINPUT_DONE;
1020	}
1021
1022	if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
1023	if_ports_used_match_mbuf(ifp: inifp, PF_INET, m);
1024	}
1025
1026	if (m->m_pkthdr.len < sizeof(struct ip)) {
1027	OSAddAtomic(`1`, &ipstat.ips_total);
1028	OSAddAtomic(`1`, &ipstat.ips_tooshort);
1029	m_freem(m);
1030	return IPINPUT_FREED;
1031	}
1032
1033	if (m->m_len < sizeof(struct ip) &&
1034	(m = m_pullup(m, sizeof(struct ip))) == NULL) {
1035	OSAddAtomic(`1`, &ipstat.ips_total);
1036	OSAddAtomic(`1`, &ipstat.ips_toosmall);
1037	return IPINPUT_FREED;
1038	}
1039
1040	ip = mtod(m, struct ip *);
1041	*modm = m;
1042
1043	KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
1044	ip->ip_p, ip->ip_off, ip->ip_len);
1045
1046	if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
1047	OSAddAtomic(`1`, &ipstat.ips_total);
1048	OSAddAtomic(`1`, &ipstat.ips_badvers);
1049	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1050	m_freem(m);
1051	return IPINPUT_FREED;
1052	}
1053
1054	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
1055	if (hlen < sizeof(struct ip)) {
1056	OSAddAtomic(`1`, &ipstat.ips_total);
1057	OSAddAtomic(`1`, &ipstat.ips_badhlen);
1058	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1059	m_freem(m);
1060	return IPINPUT_FREED;
1061	}
1062
1063	if (hlen > m->m_len) {
1064	if ((m = m_pullup(m, hlen)) == NULL) {
1065	OSAddAtomic(`1`, &ipstat.ips_total);
1066	OSAddAtomic(`1`, &ipstat.ips_badhlen);
1067	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1068	return IPINPUT_FREED;
1069	}
1070	ip = mtod(m, struct ip *);
1071	*modm = m;
1072	}
1073
1074	if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1) {
1075	m->m_pkthdr.pkt_ext_flags \|= PKTF_EXT_L4S;
1076	}
1077
1078	/ 127/8 must not appear on wire - RFC1122 /
1079	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET \|\|
1080	(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
1081	/*
1082	* Allow for the following exceptions:
1083	*
1084	* 1. If the packet was sent to loopback (i.e. rcvif
1085	* would have been set earlier at output time.)
1086	*
1087	* 2. If the packet was sent out on loopback from a local
1088	* source address which belongs to a non-loopback
1089	* interface (i.e. rcvif may not necessarily be a
1090	* loopback interface, hence the test for PKTF_LOOP.)
1091	* Unlike IPv6, there is no interface scope ID, and
1092	* therefore we don't care so much about PKTF_IFINFO.
1093	*/
1094	if (!(inifp->if_flags & IFF_LOOPBACK) &&
1095	!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
1096	OSAddAtomic(`1`, &ipstat.ips_total);
1097	OSAddAtomic(`1`, &ipstat.ips_badaddr);
1098	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1099	m_freem(m);
1100	return IPINPUT_FREED;
1101	}
1102	}
1103
1104	/ IPv4 Link-Local Addresses as defined in RFC3927 /
1105	if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) \|\|
1106	IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
1107	ip_linklocal_stat.iplls_in_total++;
1108	if (ip->ip_ttl != MAXTTL) {
1109	OSAddAtomic(`1`, &ip_linklocal_stat.iplls_in_badttl);
1110	/ Silently drop link local traffic with bad TTL /
1111	if (!ip_linklocal_in_allowbadttl) {
1112	OSAddAtomic(`1`, &ipstat.ips_total);
1113	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1114	m_freem(m);
1115	return IPINPUT_FREED;
1116	}
1117	}
1118	}
1119
1120	if (ip_cksum(m, hlen)) {
1121	OSAddAtomic(`1`, &ipstat.ips_total);
1122	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1123	m_freem(m);
1124	return IPINPUT_FREED;
1125	}
1126
1127	DTRACE_IP6(receive, struct mbuf , m, struct* inpcb *, NULL,
1128	struct ip , ip, struct* ifnet *, inifp,
1129	struct ip , ip, struct* ip6_hdr *, NULL);
1130
1131	/*
1132	* Convert fields to host representation.
1133	*/
1134	#if BYTE_ORDER != BIG_ENDIAN
1135	NTOHS(ip->ip_len);
1136	#endif
1137
1138	if (ip->ip_len < hlen) {
1139	OSAddAtomic(`1`, &ipstat.ips_total);
1140	OSAddAtomic(`1`, &ipstat.ips_badlen);
1141	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1142	m_freem(m);
1143	return IPINPUT_FREED;
1144	}
1145
1146	#if BYTE_ORDER != BIG_ENDIAN
1147	NTOHS(ip->ip_off);
1148	#endif
1149
1150	/*
1151	* Check that the amount of data in the buffers
1152	* is as at least much as the IP header would have us expect.
1153	* Trim mbufs if longer than we expect.
1154	* Drop packet if shorter than we expect.
1155	*/
1156	if (m->m_pkthdr.len < ip->ip_len) {
1157	OSAddAtomic(`1`, &ipstat.ips_total);
1158	OSAddAtomic(`1`, &ipstat.ips_tooshort);
1159	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1160	m_freem(m);
1161	return IPINPUT_FREED;
1162	}
1163
1164	if (m->m_pkthdr.len > ip->ip_len) {
1165	ip_input_adjust(m, ip, inifp);
1166	}
1167
1168	/ for netstat route statistics /
1169	src_ip = ip->ip_src;
1170	len = m->m_pkthdr.len;
1171
1172	#if DUMMYNET
1173	check_with_pf:
1174	#endif /* DUMMYNET */
1175	#if PF
1176	/ Invoke inbound packet filter /
1177	if (PF_IS_ENABLED) {
1178	int error;
1179	ip_input_cpout_args(args, args1: &args1, done_init: &init);
1180	ip = mtod(m, struct ip *);
1181	src_ip = ip->ip_src;
1182
1183	#if DUMMYNET
1184	error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args1);
1185	#else
1186	error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
1187	#endif /* DUMMYNET */
1188	if (error != `0` \|\| m == NULL) {
1189	if (m != NULL) {
1190	panic("%s: unexpected packet %p",
1191	__func__, m);
1192	/ NOTREACHED /
1193	}
1194	/ Already freed by callee /
1195	ip_input_update_nstat(ifp: inifp, src_ip, packets: `1`, bytes: len);
1196	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1197	OSAddAtomic(`1`, &ipstat.ips_total);
1198	return IPINPUT_FREED;
1199	}
1200	ip = mtod(m, struct ip *);
1201	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
1202	*modm = m;
1203	ip_input_cpin_args(args1: &args1, args);
1204	}
1205	#endif /* PF */
1206
1207	#if IPSEC
1208	if (ipsec_bypass == `0` && ipsec_get_history_count(m)) {
1209	retval = IPINPUT_DONTCHAIN; / XXX scope for chaining here? /
1210	goto pass;
1211	}
1212	#endif
1213
1214	#if IPSEC
1215	pass:
1216	#endif
1217	/*
1218	* Process options and, if not destined for us,
1219	* ship it on. ip_dooptions returns 1 when an
1220	* error was detected (causing an icmp message
1221	* to be sent and the original packet to be freed).
1222	*/
1223	ip_nhops = `0`; / for source routed packets /
1224	if (hlen > sizeof(struct ip) && ip_dooptions(m, `0`, NULL)) {
1225	src_ip = ip->ip_src;
1226	ip_input_update_nstat(ifp: inifp, src_ip, packets: `1`, bytes: len);
1227	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1228	OSAddAtomic(`1`, &ipstat.ips_total);
1229	return IPINPUT_FREED;
1230	}
1231
1232	/*
1233	* Don't chain fragmented packets
1234	*/
1235	if (ip->ip_off & ~(IP_DF \| IP_RF)) {
1236	return IPINPUT_DONTCHAIN;
1237	}
1238
1239	/ Allow DHCP/BootP responses through /
1240	if ((inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
1241	hlen == sizeof(struct ip) && ip->ip_p == IPPROTO_UDP) {
1242	struct udpiphdr *ui;
1243
1244	if (m->m_len < sizeof(struct udpiphdr) &&
1245	(m = m_pullup(m, sizeof(struct udpiphdr))) == NULL) {
1246	OSAddAtomic(`1`, &udpstat.udps_hdrops);
1247	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1248	OSAddAtomic(`1`, &ipstat.ips_total);
1249	return IPINPUT_FREED;
1250	}
1251	*modm = m;
1252	ui = mtod(m, struct udpiphdr *);
1253	if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
1254	ip_setdstifaddr_info(m, inifp->if_index, NULL);
1255	return IPINPUT_DONTCHAIN;
1256	}
1257	}
1258
1259	/ Avoid chaining raw sockets as ipsec checks occur later for them /
1260	if (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR) {
1261	return IPINPUT_DONTCHAIN;
1262	}
1263
1264	return retval;
1265	#if !defined(__i386__) && !defined(__x86_64__) && !defined(__arm64__)
1266	bad:
1267	m_freem(m);
1268	return IPINPUT_FREED;
1269	#endif
1270	}
1271
1272	/*
1273	* Because the call to m_pullup() may freem the mbuf, the function frees the mbuf packet
1274	* chain before it return IP_CHECK_IF_DROP
1275	*/
1276	static ip_check_if_result_t
1277	ip_input_check_interface(struct mbuf mp, struct** ip ip, struct* ifnet *inifp)
1278	{
1279	struct mbuf m = mp;
1280	struct in_ifaddr *ia = NULL;
1281	struct in_ifaddr *best_ia = NULL;
1282	struct ifnet *match_ifp = NULL;
1283	ip_check_if_result_t result = IP_CHECK_IF_NONE;
1284
1285	/*
1286	* Host broadcast and all network broadcast addresses are always a match
1287	*/
1288	if (ip->ip_dst.s_addr == (u_int32_t)INADDR_BROADCAST \|\|
1289	ip->ip_dst.s_addr == INADDR_ANY) {
1290	ip_input_setdst_chain(m, ifindex: inifp->if_index, NULL);
1291	return IP_CHECK_IF_OURS;
1292	}
1293
1294	/*
1295	* Check for a match in the hash bucket.
1296	*/
1297	lck_rw_lock_shared(lck: &in_ifaddr_rwlock);
1298	TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_dst.s_addr), ia_hash) {
1299	if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_dst.s_addr) {
1300	best_ia = ia;
1301	match_ifp = best_ia->ia_ifp;
1302
1303	if (ia->ia_ifp == inifp \|\| (inifp->if_flags & IFF_LOOPBACK) \|\|
1304	(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
1305	/*
1306	* A locally originated packet or packet from the loopback
1307	* interface is always an exact interface address match
1308	*/
1309	match_ifp = inifp;
1310	break;
1311	}
1312	/*
1313	* Continue the loop in case there's a exact match with another
1314	* interface
1315	*/
1316	}
1317	}
1318	if (best_ia != NULL) {
1319	if (match_ifp != inifp && ipforwarding == `0` &&
1320	((ip_checkinterface == IP_CHECKINTERFACE_HYBRID_ES &&
1321	(match_ifp->if_family == IFNET_FAMILY_IPSEC \|\|
1322	match_ifp->if_family == IFNET_FAMILY_UTUN)) \|\|
1323	ip_checkinterface == IP_CHECKINTERFACE_STRONG_ES)) {
1324	/*
1325	* Drop when interface address check is strict and forwarding
1326	* is disabled
1327	*/
1328	result = IP_CHECK_IF_DROP;
1329	} else {
1330	result = IP_CHECK_IF_OURS;
1331	ip_input_setdst_chain(m, ifindex: `0`, ia: best_ia);
1332	}
1333	}
1334	lck_rw_done(lck: &in_ifaddr_rwlock);
1335
1336	if (result == IP_CHECK_IF_NONE && (inifp->if_flags & IFF_BROADCAST)) {
1337	/*
1338	* Check for broadcast addresses.
1339	*
1340	* Only accept broadcast packets that arrive via the matching
1341	* interface. Reception of forwarded directed broadcasts would be
1342	* handled via ip_forward() and ether_frameout() with the loopback
1343	* into the stack for SIMPLEX interfaces handled by ether_frameout().
1344	*/
1345	struct ifaddr *ifa;
1346
1347	ifnet_lock_shared(ifp: inifp);
1348	TAILQ_FOREACH(ifa, &inifp->if_addrhead, ifa_link) {
1349	if (ifa->ifa_addr->sa_family != AF_INET) {
1350	continue;
1351	}
1352	ia = ifatoia(ifa);
1353	if (satosin(&ia->ia_broadaddr)->sin_addr.s_addr == ip->ip_dst.s_addr \|\|
1354	ia->ia_netbroadcast.s_addr == ip->ip_dst.s_addr) {
1355	ip_input_setdst_chain(m, ifindex: `0`, ia);
1356	result = IP_CHECK_IF_OURS;
1357	match_ifp = inifp;
1358	break;
1359	}
1360	}
1361	ifnet_lock_done(ifp: inifp);
1362	}
1363
1364	/ Allow DHCP/BootP responses through /
1365	if (result == IP_CHECK_IF_NONE && (inifp->if_eflags & IFEF_AUTOCONFIGURING) &&
1366	ip->ip_p == IPPROTO_UDP && (IP_VHL_HL(ip->ip_vhl) << `2`) == sizeof(struct ip)) {
1367	struct udpiphdr *ui;
1368
1369	if (m->m_len < sizeof(struct udpiphdr)) {
1370	if ((m = m_pullup(m, sizeof(struct udpiphdr))) == NULL) {
1371	OSAddAtomic(`1`, &udpstat.udps_hdrops);
1372	*mp = NULL;
1373	return IP_CHECK_IF_DROP;
1374	}
1375	/*
1376	* m_pullup can return a different mbuf
1377	*/
1378	*mp = m;
1379	ip = mtod(m, struct ip *);
1380	}
1381	ui = mtod(m, struct udpiphdr *);
1382	if (ntohs(ui->ui_dport) == IPPORT_BOOTPC) {
1383	ip_input_setdst_chain(m, ifindex: inifp->if_index, NULL);
1384	result = IP_CHECK_IF_OURS;
1385	match_ifp = inifp;
1386	}
1387	}
1388
1389	if (result == IP_CHECK_IF_NONE) {
1390	if (ipforwarding == `0`) {
1391	result = IP_CHECK_IF_DROP;
1392	} else {
1393	result = IP_CHECK_IF_FORWARD;
1394	ip_input_setdst_chain(m, ifindex: inifp->if_index, NULL);
1395	}
1396	}
1397
1398	if (result == IP_CHECK_IF_OURS && match_ifp != inifp) {
1399	ipstat.ips_rcv_if_weak_match++;
1400
1401	/ Logging is too noisy when forwarding is enabled /
1402	if (ip_checkinterface_debug != `0` && ipforwarding == `0`) {
1403	char src_str[MAX_IPv4_STR_LEN];
1404	char dst_str[MAX_IPv4_STR_LEN];
1405
1406	inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str));
1407	inet_ntop(AF_INET, &ip->ip_dst, dst_str, sizeof(dst_str));
1408	os_log_info(OS_LOG_DEFAULT,
1409	"%s: weak ES interface match to %s for packet from %s to %s proto %u received via %s",
1410	__func__, best_ia->ia_ifp->if_xname, src_str, dst_str, ip->ip_p, inifp->if_xname);
1411	}
1412	} else if (result == IP_CHECK_IF_DROP) {
1413	if (ip_checkinterface_debug > `0`) {
1414	char src_str[MAX_IPv4_STR_LEN];
1415	char dst_str[MAX_IPv4_STR_LEN];
1416
1417	inet_ntop(AF_INET, &ip->ip_src, src_str, sizeof(src_str));
1418	inet_ntop(AF_INET, &ip->ip_dst, dst_str, sizeof(dst_str));
1419	os_log(OS_LOG_DEFAULT,
1420	"%s: no interface match for packet from %s to %s proto %u received via %s",
1421	__func__, src_str, dst_str, ip->ip_p, inifp->if_xname);
1422	}
1423	struct mbuf *tmp_mbuf = m;
1424	while (tmp_mbuf != NULL) {
1425	ipstat.ips_rcv_if_no_match++;
1426	tmp_mbuf = tmp_mbuf->m_nextpkt;
1427	}
1428	m_freem_list(m);
1429	*mp = NULL;
1430	}
1431
1432	return result;
1433	}
1434
1435	static void
1436	ip_input_second_pass(struct mbuf m, struct* ifnet *inifp,
1437	int npkts_in_chain, int bytes_in_chain, struct ip_fw_in_args *args)
1438	{
1439	struct mbuf *tmp_mbuf = NULL;
1440	unsigned int hlen;
1441
1442	#pragma unused (args)
1443
1444	struct ip ip = mtod(m, struct* ip *);
1445	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
1446
1447	OSAddAtomic(npkts_in_chain, &ipstat.ips_total);
1448
1449	/*
1450	* Naively assume we can attribute inbound data to the route we would
1451	* use to send to this destination. Asymmetric routing breaks this
1452	* assumption, but it still allows us to account for traffic from
1453	* a remote node in the routing table.
1454	* this has a very significant performance impact so we bypass
1455	* if nstat_collect is disabled. We may also bypass if the
1456	* protocol is tcp in the future because tcp will have a route that
1457	* we can use to attribute the data to. That does mean we would not
1458	* account for forwarded tcp traffic.
1459	*/
1460	ip_input_update_nstat(ifp: inifp, src_ip: ip->ip_src, packets: npkts_in_chain,
1461	bytes: bytes_in_chain);
1462
1463	/*
1464	* Check our list of addresses, to see if the packet is for us.
1465	* If we don't have any addresses, assume any unicast packet
1466	* we receive might be for us (and let the upper layers deal
1467	* with it).
1468	*/
1469	tmp_mbuf = m;
1470	if (TAILQ_EMPTY(&in_ifaddrhead)) {
1471	while (tmp_mbuf != NULL) {
1472	if (!(tmp_mbuf->m_flags & (M_MCAST \| M_BCAST))) {
1473	ip_setdstifaddr_info(tmp_mbuf, inifp->if_index,
1474	NULL);
1475	}
1476	tmp_mbuf = mbuf_nextpkt(mbuf: tmp_mbuf);
1477	}
1478	goto ours;
1479	}
1480
1481	/*
1482	* Enable a consistency check between the destination address
1483	* and the arrival interface for a unicast packet (the RFC 1122
1484	* strong ES model) if IP forwarding is disabled and the packet
1485	* is not locally generated
1486	*
1487	* XXX - Checking also should be disabled if the destination
1488	* address is ipnat'ed to a different interface.
1489	*
1490	* XXX - Checking is incompatible with IP aliases added
1491	* to the loopback interface instead of the interface where
1492	* the packets are received.
1493	*/
1494	if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
1495	ip_check_if_result_t ip_check_if_result = IP_CHECK_IF_NONE;
1496
1497	ip_check_if_result = ip_input_check_interface(mp: &m, ip, inifp);
1498	ASSERT(ip_check_if_result != IP_CHECK_IF_NONE);
1499	if (ip_check_if_result == IP_CHECK_IF_OURS) {
1500	goto ours;
1501	} else if (ip_check_if_result == IP_CHECK_IF_DROP) {
1502	return;
1503	}
1504	} else {
1505	struct in_multi *inm;
1506	/*
1507	* See if we belong to the destination multicast group on the
1508	* arrival interface.
1509	*/
1510	in_multihead_lock_shared();
1511	IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
1512	in_multihead_lock_done();
1513	if (inm == NULL) {
1514	OSAddAtomic(npkts_in_chain, &ipstat.ips_notmember);
1515	m_freem_list(m);
1516	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1517	return;
1518	}
1519	ip_input_setdst_chain(m, ifindex: inifp->if_index, NULL);
1520	INM_REMREF(inm);
1521	goto ours;
1522	}
1523
1524	tmp_mbuf = m;
1525	struct mbuf *nxt_mbuf = NULL;
1526	while (tmp_mbuf != NULL) {
1527	nxt_mbuf = mbuf_nextpkt(mbuf: tmp_mbuf);
1528	/*
1529	* Not for us; forward if possible and desirable.
1530	*/
1531	mbuf_setnextpkt(mbuf: tmp_mbuf, NULL);
1532	if (ipforwarding == `0`) {
1533	OSAddAtomic(`1`, &ipstat.ips_cantforward);
1534	m_freem(tmp_mbuf);
1535	} else {
1536	ip_forward(tmp_mbuf, `0`, NULL);
1537	}
1538	tmp_mbuf = nxt_mbuf;
1539	}
1540	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1541	return;
1542	ours:
1543	ip = mtod(m, struct ip ); /* in case it changed /
1544	/*
1545	* If offset is set, must reassemble.
1546	*/
1547	if (ip->ip_off & ~(IP_DF \| IP_RF)) {
1548	VERIFY(npkts_in_chain == `1`);
1549	m = ip_reass(m);
1550	if (m == NULL) {
1551	return;
1552	}
1553	ip = mtod(m, struct ip *);
1554	/ Get the header length of the reassembled packet /
1555	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
1556	}
1557
1558	/*
1559	* Further protocols expect the packet length to be w/o the
1560	* IP header.
1561	*/
1562	ip->ip_len -= hlen;
1563
1564	#if IPSEC
1565	/*
1566	* enforce IPsec policy checking if we are seeing last header.
1567	* note that we do not visit this with protocols with pcb layer
1568	* code - like udp/tcp/raw ip.
1569	*/
1570	if (ipsec_bypass == `0` && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
1571	VERIFY(npkts_in_chain == `1`);
1572	if (ipsec4_in_reject(m, NULL)) {
1573	IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
1574	goto bad;
1575	}
1576	}
1577	#endif /* IPSEC */
1578
1579	/*
1580	* Switch out to protocol's input routine.
1581	*/
1582	OSAddAtomic(npkts_in_chain, &ipstat.ips_delivered);
1583
1584	ip_input_dispatch_chain(m);
1585
1586	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1587	return;
1588	bad:
1589	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
1590	m_freem(m);
1591	}
1592
1593	void
1594	ip_input_process_list(struct mbuf *packet_list)
1595	{
1596	pktchain_elm_t pktchain_tbl[PKTTBL_SZ];
1597
1598	struct mbuf *packet = NULL;
1599	struct mbuf modm = NULL; /* modified mbuf /
1600	int retval = `0`;
1601	#if (DEBUG \|\| DEVELOPMENT)
1602	struct timeval start_tv;
1603	#endif /* (DEBUG \|\| DEVELOPMENT) */
1604	int num_pkts = `0`;
1605	int chain = `0`;
1606	struct ip_fw_in_args args;
1607
1608	if (ip_chaining == `0`) {
1609	struct mbuf *m = packet_list;
1610	#if (DEBUG \|\| DEVELOPMENT)
1611	if (ip_input_measure) {
1612	net_perf_start_time(&net_perf, &start_tv);
1613	}
1614	#endif /* (DEBUG \|\| DEVELOPMENT) */
1615
1616	while (m) {
1617	packet_list = mbuf_nextpkt(mbuf: m);
1618	mbuf_setnextpkt(mbuf: m, NULL);
1619	ip_input(m);
1620	m = packet_list;
1621	num_pkts++;
1622	}
1623	#if (DEBUG \|\| DEVELOPMENT)
1624	if (ip_input_measure) {
1625	net_perf_measure_time(&net_perf, &start_tv, num_pkts);
1626	}
1627	#endif /* (DEBUG \|\| DEVELOPMENT) */
1628	return;
1629	}
1630	#if (DEBUG \|\| DEVELOPMENT)
1631	if (ip_input_measure) {
1632	net_perf_start_time(&net_perf, &start_tv);
1633	}
1634	#endif /* (DEBUG \|\| DEVELOPMENT) */
1635
1636	bzero(s: &pktchain_tbl, n: sizeof(pktchain_tbl));
1637	restart_list_process:
1638	chain = `0`;
1639	for (packet = packet_list; packet; packet = packet_list) {
1640	m_add_crumb(packet, PKT_CRUMB_IP_INPUT);
1641
1642	packet_list = mbuf_nextpkt(mbuf: packet);
1643	mbuf_setnextpkt(mbuf: packet, NULL);
1644
1645	num_pkts++;
1646	modm = NULL;
1647	bzero(s: &args, n: sizeof(args));
1648
1649	retval = ip_input_first_pass(m: packet, args: &args, modm: &modm);
1650
1651	if (retval == IPINPUT_DOCHAIN) {
1652	if (modm) {
1653	packet = modm;
1654	}
1655	packet = ip_chain_insert(packet, tbl: &pktchain_tbl[`0`]);
1656	if (packet == NULL) {
1657	ipstat.ips_rxc_chained++;
1658	chain++;
1659	if (chain > ip_chainsz) {
1660	break;
1661	}
1662	} else {
1663	ipstat.ips_rxc_collisions++;
1664	break;
1665	}
1666	} else if (retval == IPINPUT_DONTCHAIN) {
1667	/ in order to preserve order, exit from chaining /
1668	if (modm) {
1669	packet = modm;
1670	}
1671	ipstat.ips_rxc_notchain++;
1672	break;
1673	} else {
1674	/ packet was freed or delivered, do nothing. /
1675	}
1676	}
1677
1678	/ do second pass here for pktchain_tbl /
1679	if (chain) {
1680	ip_input_second_pass_loop_tbl(tbl: &pktchain_tbl[`0`], args: &args);
1681	}
1682
1683	if (packet) {
1684	/*
1685	* equivalent update in chaining case if performed in
1686	* ip_input_second_pass_loop_tbl().
1687	*/
1688	#if (DEBUG \|\| DEVELOPMENT)
1689	if (ip_input_measure) {
1690	net_perf_histogram(&net_perf, `1`);
1691	}
1692	#endif /* (DEBUG \|\| DEVELOPMENT) */
1693	ip_input_second_pass(m: packet, inifp: packet->m_pkthdr.rcvif,
1694	npkts_in_chain: `1`, bytes_in_chain: packet->m_pkthdr.len, args: &args);
1695	}
1696
1697	if (packet_list) {
1698	goto restart_list_process;
1699	}
1700
1701	#if (DEBUG \|\| DEVELOPMENT)
1702	if (ip_input_measure) {
1703	net_perf_measure_time(&net_perf, &start_tv, num_pkts);
1704	}
1705	#endif /* (DEBUG \|\| DEVELOPMENT) */
1706	}
1707	/*
1708	* Ip input routine. Checksum and byte swap header. If fragmented
1709	* try to reassemble. Process options. Pass to next level.
1710	*/
1711	void
1712	ip_input(struct mbuf *m)
1713	{
1714	struct ip *ip;
1715	unsigned int hlen;
1716	u_short sum = `0`;
1717	#if DUMMYNET
1718	struct ip_fw_args args;
1719	struct m_tag *tag;
1720	#endif
1721	ipfilter_t inject_filter_ref = NULL;
1722	struct ifnet *inifp;
1723
1724	/ Check if the mbuf is still valid after interface filter processing /
1725	MBUF_INPUT_CHECK(m, m->m_pkthdr.rcvif);
1726	inifp = m->m_pkthdr.rcvif;
1727	VERIFY(inifp != NULL);
1728
1729	m_add_crumb(m, PKT_CRUMB_IP_INPUT);
1730
1731	ipstat.ips_rxc_notlist++;
1732
1733	/ Perform IP header alignment fixup, if needed /
1734	IP_HDR_ALIGNMENT_FIXUP(m, inifp, goto bad);
1735
1736	m->m_pkthdr.pkt_flags &= ~PKTF_FORWARDED;
1737
1738	#if DUMMYNET
1739	bzero(s: &args, n: sizeof(struct ip_fw_args));
1740
1741	/*
1742	* Don't bother searching for tag(s) if there's none.
1743	*/
1744	if (SLIST_EMPTY(&m->m_pkthdr.tags)) {
1745	goto ipfw_tags_done;
1746	}
1747
1748	/ Grab info from mtags prepended to the chain /
1749	if ((tag = m_tag_locate(m, KERNEL_MODULE_TAG_ID,
1750	KERNEL_TAG_TYPE_DUMMYNET)) != NULL) {
1751	struct dn_pkt_tag *dn_tag;
1752
1753	dn_tag = (struct dn_pkt_tag *)(tag->m_tag_data);
1754	args.fwa_pf_rule = dn_tag->dn_pf_rule;
1755
1756	m_tag_delete(m, tag);
1757	}
1758
1759	#if DIAGNOSTIC
1760	if (m == NULL \|\| !(m->m_flags & M_PKTHDR)) {
1761	panic("ip_input no HDR");
1762	}
1763	#endif
1764
1765	if (args.fwa_pf_rule) {
1766	/ dummynet already filtered us /
1767	ip = mtod(m, struct ip *);
1768	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
1769	inject_filter_ref = ipf_get_inject_filter(m);
1770	if (args.fwa_pf_rule) {
1771	goto check_with_pf;
1772	}
1773	}
1774	ipfw_tags_done:
1775	#endif /* DUMMYNET */
1776
1777	/*
1778	* No need to process packet twice if we've already seen it.
1779	*/
1780	if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
1781	inject_filter_ref = ipf_get_inject_filter(m);
1782	}
1783	if (inject_filter_ref != NULL) {
1784	ip = mtod(m, struct ip *);
1785	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
1786
1787	DTRACE_IP6(receive, struct mbuf , m, struct* inpcb *, NULL,
1788	struct ip , ip, struct* ifnet *, inifp,
1789	struct ip , ip, struct* ip6_hdr *, NULL);
1790
1791	ip->ip_len = ntohs(ip->ip_len) - (u_short)hlen;
1792	ip->ip_off = ntohs(ip->ip_off);
1793	ip_proto_dispatch_in(m, hlen, proto: ip->ip_p, inject_ipfref: inject_filter_ref);
1794	return;
1795	}
1796
1797	if (__improbable(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
1798	if_ports_used_match_mbuf(ifp: inifp, PF_INET, m);
1799	}
1800
1801	OSAddAtomic(`1`, &ipstat.ips_total);
1802	if (m->m_pkthdr.len < sizeof(struct ip)) {
1803	goto tooshort;
1804	}
1805
1806	if (m->m_len < sizeof(struct ip) &&
1807	(m = m_pullup(m, sizeof(struct ip))) == NULL) {
1808	OSAddAtomic(`1`, &ipstat.ips_toosmall);
1809	return;
1810	}
1811	ip = mtod(m, struct ip *);
1812
1813	KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
1814	ip->ip_p, ip->ip_off, ip->ip_len);
1815
1816	if (IP_VHL_V(ip->ip_vhl) != IPVERSION) {
1817	OSAddAtomic(`1`, &ipstat.ips_badvers);
1818	goto bad;
1819	}
1820
1821	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
1822	if (hlen < sizeof(struct ip)) { / minimum header length /
1823	OSAddAtomic(`1`, &ipstat.ips_badhlen);
1824	goto bad;
1825	}
1826	if (hlen > m->m_len) {
1827	if ((m = m_pullup(m, hlen)) == NULL) {
1828	OSAddAtomic(`1`, &ipstat.ips_badhlen);
1829	return;
1830	}
1831	ip = mtod(m, struct ip *);
1832	}
1833
1834	/ 127/8 must not appear on wire - RFC1122 /
1835	if ((ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET \|\|
1836	(ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) {
1837	/*
1838	* Allow for the following exceptions:
1839	*
1840	* 1. If the packet was sent to loopback (i.e. rcvif
1841	* would have been set earlier at output time.)
1842	*
1843	* 2. If the packet was sent out on loopback from a local
1844	* source address which belongs to a non-loopback
1845	* interface (i.e. rcvif may not necessarily be a
1846	* loopback interface, hence the test for PKTF_LOOP.)
1847	* Unlike IPv6, there is no interface scope ID, and
1848	* therefore we don't care so much about PKTF_IFINFO.
1849	*/
1850	if (!(inifp->if_flags & IFF_LOOPBACK) &&
1851	!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
1852	OSAddAtomic(`1`, &ipstat.ips_badaddr);
1853	goto bad;
1854	}
1855	}
1856
1857	/ IPv4 Link-Local Addresses as defined in RFC3927 /
1858	if ((IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr)) \|\|
1859	IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)))) {
1860	ip_linklocal_stat.iplls_in_total++;
1861	if (ip->ip_ttl != MAXTTL) {
1862	OSAddAtomic(`1`, &ip_linklocal_stat.iplls_in_badttl);
1863	/ Silently drop link local traffic with bad TTL /
1864	if (!ip_linklocal_in_allowbadttl) {
1865	goto bad;
1866	}
1867	}
1868	}
1869
1870	sum = ip_cksum(m, hlen);
1871	if (sum) {
1872	goto bad;
1873	}
1874
1875	DTRACE_IP6(receive, struct mbuf , m, struct* inpcb *, NULL,
1876	struct ip , ip, struct* ifnet *, inifp,
1877	struct ip , ip, struct* ip6_hdr *, NULL);
1878
1879	/*
1880	* Naively assume we can attribute inbound data to the route we would
1881	* use to send to this destination. Asymmetric routing breaks this
1882	* assumption, but it still allows us to account for traffic from
1883	* a remote node in the routing table.
1884	* this has a very significant performance impact so we bypass
1885	* if nstat_collect is disabled. We may also bypass if the
1886	* protocol is tcp in the future because tcp will have a route that
1887	* we can use to attribute the data to. That does mean we would not
1888	* account for forwarded tcp traffic.
1889	*/
1890	if (nstat_collect) {
1891	struct rtentry *rt =
1892	ifnet_cached_rtlookup_inet(inifp, ip->ip_src);
1893	if (rt != NULL) {
1894	nstat_route_rx(rte: rt, packets: `1`, bytes: m->m_pkthdr.len, flags: `0`);
1895	rtfree(rt);
1896	}
1897	}
1898
1899	/*
1900	* Convert fields to host representation.
1901	*/
1902	#if BYTE_ORDER != BIG_ENDIAN
1903	NTOHS(ip->ip_len);
1904	#endif
1905
1906	if (ip->ip_len < hlen) {
1907	OSAddAtomic(`1`, &ipstat.ips_badlen);
1908	goto bad;
1909	}
1910
1911	#if BYTE_ORDER != BIG_ENDIAN
1912	NTOHS(ip->ip_off);
1913	#endif
1914	/*
1915	* Check that the amount of data in the buffers
1916	* is as at least much as the IP header would have us expect.
1917	* Trim mbufs if longer than we expect.
1918	* Drop packet if shorter than we expect.
1919	*/
1920	if (m->m_pkthdr.len < ip->ip_len) {
1921	tooshort:
1922	OSAddAtomic(`1`, &ipstat.ips_tooshort);
1923	goto bad;
1924	}
1925	if (m->m_pkthdr.len > ip->ip_len) {
1926	ip_input_adjust(m, ip, inifp);
1927	}
1928
1929	#if DUMMYNET
1930	check_with_pf:
1931	#endif
1932	#if PF
1933	/ Invoke inbound packet filter /
1934	if (PF_IS_ENABLED) {
1935	int error;
1936	#if DUMMYNET
1937	error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, &args);
1938	#else
1939	error = pf_af_hook(inifp, NULL, &m, AF_INET, TRUE, NULL);
1940	#endif /* DUMMYNET */
1941	if (error != `0` \|\| m == NULL) {
1942	if (m != NULL) {
1943	panic("%s: unexpected packet %p",
1944	__func__, m);
1945	/ NOTREACHED /
1946	}
1947	/ Already freed by callee /
1948	return;
1949	}
1950	ip = mtod(m, struct ip *);
1951	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
1952	}
1953	#endif /* PF */
1954
1955	#if IPSEC
1956	if (ipsec_bypass == `0` && ipsec_get_history_count(m)) {
1957	goto pass;
1958	}
1959	#endif
1960
1961	pass:
1962	/*
1963	* Process options and, if not destined for us,
1964	* ship it on. ip_dooptions returns 1 when an
1965	* error was detected (causing an icmp message
1966	* to be sent and the original packet to be freed).
1967	*/
1968	ip_nhops = `0`; / for source routed packets /
1969	if (hlen > sizeof(struct ip) && ip_dooptions(m, `0`, NULL)) {
1970	return;
1971	}
1972
1973	/*
1974	* Check our list of addresses, to see if the packet is for us.
1975	* If we don't have any addresses, assume any unicast packet
1976	* we receive might be for us (and let the upper layers deal
1977	* with it).
1978	*/
1979	if (TAILQ_EMPTY(&in_ifaddrhead) && !(m->m_flags & (M_MCAST \| M_BCAST))) {
1980	ip_setdstifaddr_info(m, inifp->if_index, NULL);
1981	goto ours;
1982	}
1983
1984	/*
1985	* Enable a consistency check between the destination address
1986	* and the arrival interface for a unicast packet (the RFC 1122
1987	* strong ES model) if IP forwarding is disabled and the packet
1988	* is not locally generated and the packet is not subject to
1989	* 'ipfw fwd'.
1990	*
1991	* XXX - Checking also should be disabled if the destination
1992	* address is ipnat'ed to a different interface.
1993	*
1994	* XXX - Checking is incompatible with IP aliases added
1995	* to the loopback interface instead of the interface where
1996	* the packets are received.
1997	*/
1998	if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
1999	ip_check_if_result_t check_if_result = IP_CHECK_IF_NONE;
2000
2001	check_if_result = ip_input_check_interface(mp: &m, ip, inifp);
2002	ASSERT(check_if_result != IP_CHECK_IF_NONE);
2003	if (check_if_result == IP_CHECK_IF_OURS) {
2004	goto ours;
2005	} else if (check_if_result == IP_CHECK_IF_DROP) {
2006	return;
2007	}
2008	} else {
2009	struct in_multi *inm;
2010	/*
2011	* See if we belong to the destination multicast group on the
2012	* arrival interface.
2013	*/
2014	in_multihead_lock_shared();
2015	IN_LOOKUP_MULTI(&ip->ip_dst, inifp, inm);
2016	in_multihead_lock_done();
2017	if (inm == NULL) {
2018	OSAddAtomic(`1`, &ipstat.ips_notmember);
2019	m_freem(m);
2020	return;
2021	}
2022	ip_setdstifaddr_info(m, inifp->if_index, NULL);
2023	INM_REMREF(inm);
2024	goto ours;
2025	}
2026
2027	/*
2028	* Not for us; forward if possible and desirable.
2029	*/
2030	if (ipforwarding == `0`) {
2031	OSAddAtomic(`1`, &ipstat.ips_cantforward);
2032	m_freem(m);
2033	} else {
2034	ip_forward(m, `0`, NULL);
2035	}
2036	return;
2037
2038	ours:
2039	/*
2040	* If offset or IP_MF are set, must reassemble.
2041	*/
2042	if (ip->ip_off & ~(IP_DF \| IP_RF)) {
2043	m = ip_reass(m);
2044	if (m == NULL) {
2045	return;
2046	}
2047	ip = mtod(m, struct ip *);
2048	/ Get the header length of the reassembled packet /
2049	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
2050	}
2051
2052	/*
2053	* Further protocols expect the packet length to be w/o the
2054	* IP header.
2055	*/
2056	ip->ip_len -= hlen;
2057
2058
2059	#if IPSEC
2060	/*
2061	* enforce IPsec policy checking if we are seeing last header.
2062	* note that we do not visit this with protocols with pcb layer
2063	* code - like udp/tcp/raw ip.
2064	*/
2065	if (ipsec_bypass == `0` && (ip_protox[ip->ip_p]->pr_flags & PR_LASTHDR)) {
2066	if (ipsec4_in_reject(m, NULL)) {
2067	IPSEC_STAT_INCREMENT(ipsecstat.in_polvio);
2068	goto bad;
2069	}
2070	}
2071	#endif /* IPSEC */
2072
2073	/*
2074	* Switch out to protocol's input routine.
2075	*/
2076	OSAddAtomic(`1`, &ipstat.ips_delivered);
2077
2078	ip_proto_dispatch_in(m, hlen, proto: ip->ip_p, inject_ipfref: `0`);
2079	return;
2080
2081	bad:
2082	KERNEL_DEBUG(DBG_LAYER_END, `0`, `0`, `0`, `0`, `0`);
2083	m_freem(m);
2084	}
2085
2086	static void
2087	ipq_updateparams(void)
2088	{
2089	LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
2090	/*
2091	* -1 for unlimited allocation.
2092	*/
2093	if (maxnipq < `0`) {
2094	ipq_limit = `0`;
2095	}
2096	/*
2097	* Positive number for specific bound.
2098	*/
2099	if (maxnipq > `0`) {
2100	ipq_limit = maxnipq;
2101	}
2102	/*
2103	* Zero specifies no further fragment queue allocation -- set the
2104	* bound very low, but rely on implementation elsewhere to actually
2105	* prevent allocation and reclaim current queues.
2106	*/
2107	if (maxnipq == `0`) {
2108	ipq_limit = `1`;
2109	}
2110	/*
2111	* Arm the purge timer if not already and if there's work to do
2112	*/
2113	frag_sched_timeout();
2114	}
2115
2116	static int
2117	sysctl_maxnipq SYSCTL_HANDLER_ARGS
2118	{
2119	#pragma unused(arg1, arg2)
2120	int error, i;
2121
2122	lck_mtx_lock(lck: &ipqlock);
2123	i = maxnipq;
2124	error = sysctl_handle_int(oidp, arg1: &i, arg2: `0`, req);
2125	if (error \|\| req->newptr == USER_ADDR_NULL) {
2126	goto done;
2127	}
2128	/ impose bounds /
2129	if (i < -`1` \|\| i > (nmbclusters / `4`)) {
2130	error = EINVAL;
2131	goto done;
2132	}
2133	maxnipq = i;
2134	ipq_updateparams();
2135	done:
2136	lck_mtx_unlock(lck: &ipqlock);
2137	return error;
2138	}
2139
2140	static int
2141	sysctl_maxfragsperpacket SYSCTL_HANDLER_ARGS
2142	{
2143	#pragma unused(arg1, arg2)
2144	int error, i;
2145
2146	lck_mtx_lock(lck: &ipqlock);
2147	i = maxfragsperpacket;
2148	error = sysctl_handle_int(oidp, arg1: &i, arg2: `0`, req);
2149	if (error \|\| req->newptr == USER_ADDR_NULL) {
2150	goto done;
2151	}
2152	maxfragsperpacket = i;
2153	ipq_updateparams(); / see if we need to arm timer /
2154	done:
2155	lck_mtx_unlock(lck: &ipqlock);
2156	return error;
2157	}
2158
2159	/*
2160	* Take incoming datagram fragment and try to reassemble it into
2161	* whole datagram. If a chain for reassembly of this datagram already
2162	* exists, then it is given as fp; otherwise have to make a chain.
2163	*
2164	* The IP header is NOT adjusted out of iplen (but in host byte order).
2165	*/
2166	static struct mbuf *
2167	ip_reass(struct mbuf *m)
2168	{
2169	struct ip *ip;
2170	struct mbuf p, q, nq, t;
2171	struct ipq *fp = NULL;
2172	struct ipqhead *head;
2173	int i, hlen, next;
2174	u_int8_t ecn, ecn0;
2175	uint32_t csum, csum_flags;
2176	uint16_t hash;
2177	struct fq_head dfq;
2178
2179	MBUFQ_INIT(&dfq); / for deferred frees /
2180
2181	/ If maxnipq or maxfragsperpacket is 0, never accept fragments. /
2182	if (maxnipq == `0` \|\| maxfragsperpacket == `0`) {
2183	ipstat.ips_fragments++;
2184	ipstat.ips_fragdropped++;
2185	m_freem(m);
2186	if (nipq > `0`) {
2187	lck_mtx_lock(lck: &ipqlock);
2188	frag_sched_timeout(); / purge stale fragments /
2189	lck_mtx_unlock(lck: &ipqlock);
2190	}
2191	return NULL;
2192	}
2193
2194	ip = mtod(m, struct ip *);
2195	hlen = IP_VHL_HL(ip->ip_vhl) << `2`;
2196
2197	lck_mtx_lock(lck: &ipqlock);
2198
2199	hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
2200	head = &ipq[hash];
2201
2202	/*
2203	* Look for queue of fragments
2204	* of this datagram.
2205	*/
2206	TAILQ_FOREACH(fp, head, ipq_list) {
2207	if (ip->ip_id == fp->ipq_id &&
2208	ip->ip_src.s_addr == fp->ipq_src.s_addr &&
2209	ip->ip_dst.s_addr == fp->ipq_dst.s_addr &&
2210	ip->ip_p == fp->ipq_p) {
2211	goto found;
2212	}
2213	}
2214
2215	fp = NULL;
2216
2217	/*
2218	* Attempt to trim the number of allocated fragment queues if it
2219	* exceeds the administrative limit.
2220	*/
2221	if ((nipq > (unsigned)maxnipq) && (maxnipq > `0`)) {
2222	/*
2223	* drop something from the tail of the current queue
2224	* before proceeding further
2225	*/
2226	struct ipq *fq = TAILQ_LAST(head, ipqhead);
2227	if (fq == NULL) { / gak /
2228	for (i = `0`; i < IPREASS_NHASH; i++) {
2229	struct ipq *r = TAILQ_LAST(&ipq[i], ipqhead);
2230	if (r) {
2231	ipstat.ips_fragtimeout += r->ipq_nfrags;
2232	frag_freef(&ipq[i], r);
2233	break;
2234	}
2235	}
2236	} else {
2237	ipstat.ips_fragtimeout += fq->ipq_nfrags;
2238	frag_freef(head, fq);
2239	}
2240	}
2241
2242	found:
2243	/*
2244	* Leverage partial checksum offload for IP fragments. Narrow down
2245	* the scope to cover only UDP without IP options, as that is the
2246	* most common case.
2247	*
2248	* Perform 1's complement adjustment of octets that got included/
2249	* excluded in the hardware-calculated checksum value. Ignore cases
2250	* where the value includes the entire IPv4 header span, as the sum
2251	* for those octets would already be 0 by the time we get here; IP
2252	* has already performed its header checksum validation. Also take
2253	* care of any trailing bytes and subtract out their partial sum.
2254	*/
2255	if (ip->ip_p == IPPROTO_UDP && hlen == sizeof(struct ip) &&
2256	(m->m_pkthdr.csum_flags &
2257	(CSUM_DATA_VALID \| CSUM_PARTIAL \| CSUM_PSEUDO_HDR)) ==
2258	(CSUM_DATA_VALID \| CSUM_PARTIAL)) {
2259	uint32_t start = m->m_pkthdr.csum_rx_start;
2260	int32_t trailer = (m_pktlen(m) - ip->ip_len);
2261	uint32_t swbytes = (uint32_t)trailer;
2262
2263	csum = m->m_pkthdr.csum_rx_val;
2264
2265	ASSERT(trailer >= `0`);
2266	if ((start != `0` && start != hlen) \|\| trailer != `0`) {
2267	uint32_t datalen = ip->ip_len - hlen;
2268
2269	#if BYTE_ORDER != BIG_ENDIAN
2270	if (start < hlen) {
2271	HTONS(ip->ip_len);
2272	HTONS(ip->ip_off);
2273	}
2274	#endif /* BYTE_ORDER != BIG_ENDIAN */
2275	/ callee folds in sum /
2276	csum = m_adj_sum16(m, start, hlen, datalen, csum);
2277	if (hlen > start) {
2278	swbytes += (hlen - start);
2279	} else {
2280	swbytes += (start - hlen);
2281	}
2282	#if BYTE_ORDER != BIG_ENDIAN
2283	if (start < hlen) {
2284	NTOHS(ip->ip_off);
2285	NTOHS(ip->ip_len);
2286	}
2287	#endif /* BYTE_ORDER != BIG_ENDIAN */
2288	}
2289	csum_flags = m->m_pkthdr.csum_flags;
2290
2291	if (swbytes != `0`) {
2292	udp_in_cksum_stats(swbytes);
2293	}
2294	if (trailer != `0`) {
2295	m_adj(m, -trailer);
2296	}
2297	} else {
2298	csum = `0`;
2299	csum_flags = `0`;
2300	}
2301
2302	/ Invalidate checksum /
2303	m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
2304
2305	ipstat.ips_fragments++;
2306
2307	/*
2308	* Adjust ip_len to not reflect header,
2309	* convert offset of this to bytes.
2310	*/
2311	ip->ip_len -= hlen;
2312	if (ip->ip_off & IP_MF) {
2313	/*
2314	* Make sure that fragments have a data length
2315	* that's a non-zero multiple of 8 bytes.
2316	*/
2317	if (ip->ip_len == `0` \|\| (ip->ip_len & `0x7`) != `0`) {
2318	OSAddAtomic(`1`, &ipstat.ips_toosmall);
2319	/*
2320	* Reassembly queue may have been found if previous
2321	* fragments were valid; given that this one is bad,
2322	* we need to drop it. Make sure to set fp to NULL
2323	* if not already, since we don't want to decrement
2324	* ipq_nfrags as it doesn't include this packet.
2325	*/
2326	fp = NULL;
2327	goto dropfrag;
2328	}
2329	m->m_flags \|= M_FRAG;
2330	} else {
2331	/ Clear the flag in case packet comes from loopback /
2332	m->m_flags &= ~M_FRAG;
2333	}
2334	ip->ip_off = (u_short)(ip->ip_off << `3`);
2335
2336	m->m_pkthdr.pkt_hdr = ip;
2337
2338	/ Previous ip_reass() started here. /
2339	/*
2340	* Presence of header sizes in mbufs
2341	* would confuse code below.
2342	*/
2343	m->m_data += hlen;
2344	m->m_len -= hlen;
2345
2346	/*
2347	* If first fragment to arrive, create a reassembly queue.
2348	*/
2349	if (fp == NULL) {
2350	fp = ipq_alloc();
2351	if (fp == NULL) {
2352	goto dropfrag;
2353	}
2354	TAILQ_INSERT_HEAD(head, fp, ipq_list);
2355	nipq++;
2356	fp->ipq_nfrags = `1`;
2357	fp->ipq_ttl = IPFRAGTTL;
2358	fp->ipq_p = ip->ip_p;
2359	fp->ipq_id = ip->ip_id;
2360	fp->ipq_src = ip->ip_src;
2361	fp->ipq_dst = ip->ip_dst;
2362	fp->ipq_frags = m;
2363	m->m_nextpkt = NULL;
2364	/*
2365	* If the first fragment has valid checksum offload
2366	* info, the rest of fragments are eligible as well.
2367	*/
2368	if (csum_flags != `0`) {
2369	fp->ipq_csum = csum;
2370	fp->ipq_csum_flags = csum_flags;
2371	}
2372	m = NULL; / nothing to return /
2373	goto done;
2374	} else {
2375	fp->ipq_nfrags++;
2376	}
2377
2378	#define GETIP(m) ((struct ip *)((m)->m_pkthdr.pkt_hdr))
2379
2380	/*
2381	* Handle ECN by comparing this segment with the first one;
2382	* if CE is set, do not lose CE.
2383	* drop if CE and not-ECT are mixed for the same packet.
2384	*/
2385	ecn = ip->ip_tos & IPTOS_ECN_MASK;
2386	ecn0 = GETIP(fp->ipq_frags)->ip_tos & IPTOS_ECN_MASK;
2387	if (ecn == IPTOS_ECN_CE) {
2388	if (ecn0 == IPTOS_ECN_NOTECT) {
2389	goto dropfrag;
2390	}
2391	if (ecn0 != IPTOS_ECN_CE) {
2392	GETIP(fp->ipq_frags)->ip_tos \|= IPTOS_ECN_CE;
2393	}
2394	}
2395	if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) {
2396	goto dropfrag;
2397	}
2398
2399	/*
2400	* Find a segment which begins after this one does.
2401	*/
2402	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
2403	if (GETIP(q)->ip_off > ip->ip_off) {
2404	break;
2405	}
2406	}
2407
2408	/*
2409	* If there is a preceding segment, it may provide some of
2410	* our data already. If so, drop the data from the incoming
2411	* segment. If it provides all of our data, drop us, otherwise
2412	* stick new segment in the proper place.
2413	*
2414	* If some of the data is dropped from the preceding
2415	* segment, then it's checksum is invalidated.
2416	*/
2417	if (p) {
2418	i = GETIP(p)->ip_off + GETIP(p)->ip_len - ip->ip_off;
2419	if (i > `0`) {
2420	if (i >= ip->ip_len) {
2421	goto dropfrag;
2422	}
2423	m_adj(m, i);
2424	fp->ipq_csum_flags = `0`;
2425	ip->ip_off += i;
2426	ip->ip_len -= i;
2427	}
2428	m->m_nextpkt = p->m_nextpkt;
2429	p->m_nextpkt = m;
2430	} else {
2431	m->m_nextpkt = fp->ipq_frags;
2432	fp->ipq_frags = m;
2433	}
2434
2435	/*
2436	* While we overlap succeeding segments trim them or,
2437	* if they are completely covered, dequeue them.
2438	*/
2439	for (; q != NULL && ip->ip_off + ip->ip_len > GETIP(q)->ip_off;
2440	q = nq) {
2441	i = (ip->ip_off + ip->ip_len) - GETIP(q)->ip_off;
2442	if (i < GETIP(q)->ip_len) {
2443	GETIP(q)->ip_len -= i;
2444	GETIP(q)->ip_off += i;
2445	m_adj(q, i);
2446	fp->ipq_csum_flags = `0`;
2447	break;
2448	}
2449	nq = q->m_nextpkt;
2450	m->m_nextpkt = nq;
2451	ipstat.ips_fragdropped++;
2452	fp->ipq_nfrags--;
2453	/ defer freeing until after lock is dropped /
2454	MBUFQ_ENQUEUE(&dfq, q);
2455	}
2456
2457	/*
2458	* If this fragment contains similar checksum offload info
2459	* as that of the existing ones, accumulate checksum. Otherwise,
2460	* invalidate checksum offload info for the entire datagram.
2461	*/
2462	if (csum_flags != `0` && csum_flags == fp->ipq_csum_flags) {
2463	fp->ipq_csum += csum;
2464	} else if (fp->ipq_csum_flags != `0`) {
2465	fp->ipq_csum_flags = `0`;
2466	}
2467
2468
2469	/*
2470	* Check for complete reassembly and perform frag per packet
2471	* limiting.
2472	*
2473	* Frag limiting is performed here so that the nth frag has
2474	* a chance to complete the packet before we drop the packet.
2475	* As a result, n+1 frags are actually allowed per packet, but
2476	* only n will ever be stored. (n = maxfragsperpacket.)
2477	*
2478	*/
2479	next = `0`;
2480	for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
2481	if (GETIP(q)->ip_off != next) {
2482	if (fp->ipq_nfrags > maxfragsperpacket) {
2483	ipstat.ips_fragdropped += fp->ipq_nfrags;
2484	frag_freef(head, fp);
2485	}
2486	m = NULL; / nothing to return /
2487	goto done;
2488	}
2489	next += GETIP(q)->ip_len;
2490	}
2491	/ Make sure the last packet didn't have the IP_MF flag /
2492	if (p->m_flags & M_FRAG) {
2493	if (fp->ipq_nfrags > maxfragsperpacket) {
2494	ipstat.ips_fragdropped += fp->ipq_nfrags;
2495	frag_freef(head, fp);
2496	}
2497	m = NULL; / nothing to return /
2498	goto done;
2499	}
2500
2501	/*
2502	* Reassembly is complete. Make sure the packet is a sane size.
2503	*/
2504	q = fp->ipq_frags;
2505	ip = GETIP(q);
2506	if (next + (IP_VHL_HL(ip->ip_vhl) << `2`) > IP_MAXPACKET) {
2507	ipstat.ips_toolong++;
2508	ipstat.ips_fragdropped += fp->ipq_nfrags;
2509	frag_freef(head, fp);
2510	m = NULL; / nothing to return /
2511	goto done;
2512	}
2513
2514	/*
2515	* Concatenate fragments.
2516	*/
2517	m = q;
2518	t = m->m_next;
2519	m->m_next = NULL;
2520	m_cat(m, t);
2521	nq = q->m_nextpkt;
2522	q->m_nextpkt = NULL;
2523	for (q = nq; q != NULL; q = nq) {
2524	nq = q->m_nextpkt;
2525	q->m_nextpkt = NULL;
2526	m_cat(m, q);
2527	}
2528
2529	/*
2530	* Store partial hardware checksum info from the fragment queue;
2531	* the receive start offset is set to 20 bytes (see code at the
2532	* top of this routine.)
2533	*/
2534	if (fp->ipq_csum_flags != `0`) {
2535	csum = fp->ipq_csum;
2536
2537	ADDCARRY(csum);
2538
2539	m->m_pkthdr.csum_rx_val = (uint16_t)csum;
2540	m->m_pkthdr.csum_rx_start = sizeof(struct ip);
2541	m->m_pkthdr.csum_flags = fp->ipq_csum_flags;
2542	} else if ((m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) \|\|
2543	(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
2544	/ loopback checksums are always OK /
2545	m->m_pkthdr.csum_data = `0xffff`;
2546	m->m_pkthdr.csum_flags =
2547	CSUM_DATA_VALID \| CSUM_PSEUDO_HDR \|
2548	CSUM_IP_CHECKED \| CSUM_IP_VALID;
2549	}
2550
2551	/*
2552	* Create header for new ip packet by modifying header of first
2553	* packet; dequeue and discard fragment reassembly header.
2554	* Make header visible.
2555	*/
2556	ip->ip_len = (u_short)((IP_VHL_HL(ip->ip_vhl) << `2`) + next);
2557	ip->ip_src = fp->ipq_src;
2558	ip->ip_dst = fp->ipq_dst;
2559
2560	fp->ipq_frags = NULL; / return to caller as 'm' /
2561	frag_freef(head, fp);
2562	fp = NULL;
2563
2564	m->m_len += (IP_VHL_HL(ip->ip_vhl) << `2`);
2565	m->m_data -= (IP_VHL_HL(ip->ip_vhl) << `2`);
2566	/ some debugging cruft by sklower, below, will go away soon /
2567	if (m->m_flags & M_PKTHDR) { / XXX this should be done elsewhere /
2568	m_fixhdr(m);
2569	}
2570	ipstat.ips_reassembled++;
2571
2572	/ arm the purge timer if not already and if there's work to do /
2573	frag_sched_timeout();
2574	lck_mtx_unlock(lck: &ipqlock);
2575	/ perform deferred free (if needed) now that lock is dropped /
2576	if (!MBUFQ_EMPTY(&dfq)) {
2577	MBUFQ_DRAIN(&dfq);
2578	}
2579	VERIFY(MBUFQ_EMPTY(&dfq));
2580	return m;
2581
2582	done:
2583	VERIFY(m == NULL);
2584	/ arm the purge timer if not already and if there's work to do /
2585	frag_sched_timeout();
2586	lck_mtx_unlock(lck: &ipqlock);
2587	/ perform deferred free (if needed) /
2588	if (!MBUFQ_EMPTY(&dfq)) {
2589	MBUFQ_DRAIN(&dfq);
2590	}
2591	VERIFY(MBUFQ_EMPTY(&dfq));
2592	return NULL;
2593
2594	dropfrag:
2595	ipstat.ips_fragdropped++;
2596	if (fp != NULL) {
2597	fp->ipq_nfrags--;
2598	}
2599	/ arm the purge timer if not already and if there's work to do /
2600	frag_sched_timeout();
2601	lck_mtx_unlock(lck: &ipqlock);
2602	m_freem(m);
2603	/ perform deferred free (if needed) /
2604	if (!MBUFQ_EMPTY(&dfq)) {
2605	MBUFQ_DRAIN(&dfq);
2606	}
2607	VERIFY(MBUFQ_EMPTY(&dfq));
2608	return NULL;
2609	#undef GETIP
2610	}
2611
2612	/*
2613	* Free a fragment reassembly header and all
2614	* associated datagrams.
2615	*/
2616	static void
2617	frag_freef(struct ipqhead fhp, struct* ipq *fp)
2618	{
2619	LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
2620
2621	fp->ipq_nfrags = `0`;
2622	if (fp->ipq_frags != NULL) {
2623	m_freem_list(fp->ipq_frags);
2624	fp->ipq_frags = NULL;
2625	}
2626	TAILQ_REMOVE(fhp, fp, ipq_list);
2627	nipq--;
2628	ipq_free(fp);
2629	}
2630
2631	/*
2632	* IP reassembly timer processing
2633	*/
2634	static void
2635	frag_timeout(void *arg)
2636	{
2637	#pragma unused(arg)
2638	struct ipq *fp;
2639	int i;
2640
2641	/*
2642	* Update coarse-grained networking timestamp (in sec.); the idea
2643	* is to piggy-back on the timeout callout to update the counter
2644	* returnable via net_uptime().
2645	*/
2646	net_update_uptime();
2647
2648	lck_mtx_lock(lck: &ipqlock);
2649	for (i = `0`; i < IPREASS_NHASH; i++) {
2650	for (fp = TAILQ_FIRST(&ipq[i]); fp;) {
2651	struct ipq *fpp;
2652
2653	fpp = fp;
2654	fp = TAILQ_NEXT(fp, ipq_list);
2655	if (--fpp->ipq_ttl == `0`) {
2656	ipstat.ips_fragtimeout += fpp->ipq_nfrags;
2657	frag_freef(fhp: &ipq[i], fp: fpp);
2658	}
2659	}
2660	}
2661	/*
2662	* If we are over the maximum number of fragments
2663	* (due to the limit being lowered), drain off
2664	* enough to get down to the new limit.
2665	*/
2666	if (maxnipq >= `0` && nipq > (unsigned)maxnipq) {
2667	for (i = `0`; i < IPREASS_NHASH; i++) {
2668	while (nipq > (unsigned)maxnipq &&
2669	!TAILQ_EMPTY(&ipq[i])) {
2670	ipstat.ips_fragdropped +=
2671	TAILQ_FIRST(&ipq[i])->ipq_nfrags;
2672	frag_freef(fhp: &ipq[i], TAILQ_FIRST(&ipq[i]));
2673	}
2674	}
2675	}
2676	/ re-arm the purge timer if there's work to do /
2677	frag_timeout_run = `0`;
2678	frag_sched_timeout();
2679	lck_mtx_unlock(lck: &ipqlock);
2680	}
2681
2682	static void
2683	frag_sched_timeout(void)
2684	{
2685	LCK_MTX_ASSERT(&ipqlock, LCK_MTX_ASSERT_OWNED);
2686
2687	if (!frag_timeout_run && nipq > `0`) {
2688	frag_timeout_run = `1`;
2689	timeout(frag_timeout, NULL, ticks: hz);
2690	}
2691	}
2692
2693	/*
2694	* Drain off all datagram fragments.
2695	*/
2696	static void
2697	frag_drain(void)
2698	{
2699	int i;
2700
2701	lck_mtx_lock(lck: &ipqlock);
2702	for (i = `0`; i < IPREASS_NHASH; i++) {
2703	while (!TAILQ_EMPTY(&ipq[i])) {
2704	ipstat.ips_fragdropped +=
2705	TAILQ_FIRST(&ipq[i])->ipq_nfrags;
2706	frag_freef(fhp: &ipq[i], TAILQ_FIRST(&ipq[i]));
2707	}
2708	}
2709	lck_mtx_unlock(lck: &ipqlock);
2710	}
2711
2712	static struct ipq *
2713	ipq_alloc(void)
2714	{
2715	struct ipq *fp;
2716
2717	/*
2718	* See comments in ipq_updateparams(). Keep the count separate
2719	* from nipq since the latter represents the elements already
2720	* in the reassembly queues.
2721	*/
2722	if (ipq_limit > `0` && ipq_count > ipq_limit) {
2723	return NULL;
2724	}
2725
2726	fp = kalloc_type(struct ipq, Z_NOWAIT \| Z_ZERO);
2727	if (fp != NULL) {
2728	os_atomic_inc(&ipq_count, relaxed);
2729	}
2730	return fp;
2731	}
2732
2733	static void
2734	ipq_free(struct ipq *fp)
2735	{
2736	kfree_type(struct ipq, fp);
2737	os_atomic_dec(&ipq_count, relaxed);
2738	}
2739
2740	/*
2741	* Drain callback
2742	*/
2743	void
2744	ip_drain(void)
2745	{
2746	frag_drain(); / fragments /
2747	in_rtqdrain(); / protocol cloned routes /
2748	in_arpdrain(NULL); / cloned routes: ARP /
2749	}
2750
2751	/*
2752	* Do option processing on a datagram,
2753	* possibly discarding it if bad options are encountered,
2754	* or forwarding it if source-routed.
2755	* The pass argument is used when operating in the IPSTEALTH
2756	* mode to tell what options to process:
2757	* [LS]SRR (pass 0) or the others (pass 1).
2758	* The reason for as many as two passes is that when doing IPSTEALTH,
2759	* non-routing options should be processed only if the packet is for us.
2760	* Returns 1 if packet has been forwarded/freed,
2761	* 0 if the packet should be processed further.
2762	*/
2763	static int
2764	ip_dooptions(struct mbuf m, int* pass, struct sockaddr_in *next_hop)
2765	{
2766	#pragma unused(pass)
2767	struct ip ip = mtod(m, struct* ip *);
2768	u_char *cp;
2769	struct ip_timestamp *ipt;
2770	struct in_ifaddr *ia;
2771	int opt, optlen, cnt, off, type = ICMP_PARAMPROB, forward = `0`;
2772	uint8_t code = `0`;
2773	struct in_addr *sin, dst;
2774	u_int32_t ntime;
2775	struct sockaddr_in ipaddr = {
2776	.sin_len = sizeof(ipaddr),
2777	.sin_family = AF_INET,
2778	.sin_port = `0`,
2779	.sin_addr = { .s_addr = `0` },
2780	.sin_zero = { `0`, }
2781	};
2782
2783	/ Expect 32-bit aligned data pointer on strict-align platforms /
2784	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
2785
2786	dst = ip->ip_dst;
2787	cp = (u_char *)(ip + `1`);
2788	cnt = (IP_VHL_HL(ip->ip_vhl) << `2`) - sizeof(struct ip);
2789	for (; cnt > `0`; cnt -= optlen, cp += optlen) {
2790	opt = cp[IPOPT_OPTVAL];
2791	if (opt == IPOPT_EOL) {
2792	break;
2793	}
2794	if (opt == IPOPT_NOP) {
2795	optlen = `1`;
2796	} else {
2797	if (cnt < IPOPT_OLEN + sizeof(*cp)) {
2798	code = (uint8_t)(&cp[IPOPT_OLEN] - (u_char *)ip);
2799	goto bad;
2800	}
2801	optlen = cp[IPOPT_OLEN];
2802	if (optlen < IPOPT_OLEN + sizeof(*cp) \|\|
2803	optlen > cnt) {
2804	code = (uint8_t)(&cp[IPOPT_OLEN] - (u_char *)ip);
2805	goto bad;
2806	}
2807	}
2808	switch (opt) {
2809	default:
2810	break;
2811
2812	/*
2813	* Source routing with record.
2814	* Find interface with current destination address.
2815	* If none on this machine then drop if strictly routed,
2816	* or do nothing if loosely routed.
2817	* Record interface address and bring up next address
2818	* component. If strictly routed make sure next
2819	* address is on directly accessible net.
2820	*/
2821	case IPOPT_LSRR:
2822	case IPOPT_SSRR:
2823	if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
2824	code = (uint8_t)(&cp[IPOPT_OLEN] - (u_char *)ip);
2825	goto bad;
2826	}
2827	if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
2828	code = (uint8_t)(&cp[IPOPT_OFFSET] - (u_char *)ip);
2829	goto bad;
2830	}
2831	ipaddr.sin_addr = ip->ip_dst;
2832	ia = (struct in_ifaddr *)ifa_ifwithaddr(SA(&ipaddr));
2833	if (ia == NULL) {
2834	if (opt == IPOPT_SSRR) {
2835	type = ICMP_UNREACH;
2836	code = ICMP_UNREACH_SRCFAIL;
2837	goto bad;
2838	}
2839	if (!ip_dosourceroute) {
2840	goto nosourcerouting;
2841	}
2842	/*
2843	* Loose routing, and not at next destination
2844	* yet; nothing to do except forward.
2845	*/
2846	break;
2847	} else {
2848	ifa_remref(ifa: &ia->ia_ifa);
2849	ia = NULL;
2850	}
2851	off--; / 0 origin /
2852	if (off > optlen - (int)sizeof(struct in_addr)) {
2853	/*
2854	* End of source route. Should be for us.
2855	*/
2856	if (!ip_acceptsourceroute) {
2857	goto nosourcerouting;
2858	}
2859	save_rte(cp, ip->ip_src);
2860	break;
2861	}
2862
2863	if (!ip_dosourceroute) {
2864	if (ipforwarding) {
2865	char buf[MAX_IPv4_STR_LEN];
2866	char buf2[MAX_IPv4_STR_LEN];
2867	/*
2868	* Acting as a router, so generate ICMP
2869	*/
2870	nosourcerouting:
2871	log(LOG_WARNING,
2872	"attempted source route from %s "
2873	"to %s\n",
2874	inet_ntop(AF_INET, &ip->ip_src,
2875	buf, sizeof(buf)),
2876	inet_ntop(AF_INET, &ip->ip_dst,
2877	buf2, sizeof(buf2)));
2878	type = ICMP_UNREACH;
2879	code = ICMP_UNREACH_SRCFAIL;
2880	goto bad;
2881	} else {
2882	/*
2883	* Not acting as a router,
2884	* so silently drop.
2885	*/
2886	OSAddAtomic(`1`, &ipstat.ips_cantforward);
2887	m_freem(m);
2888	return `1`;
2889	}
2890	}
2891
2892	/*
2893	* locate outgoing interface
2894	*/
2895	(void) memcpy(dst: &ipaddr.sin_addr, src: cp + off,
2896	n: sizeof(ipaddr.sin_addr));
2897
2898	if (opt == IPOPT_SSRR) {
2899	#define INA struct in_ifaddr *
2900	if ((ia = (INA)ifa_ifwithdstaddr(
2901	SA(&ipaddr))) == NULL) {
2902	ia = (INA)ifa_ifwithnet(SA(&ipaddr));
2903	}
2904	} else {
2905	ia = ip_rtaddr(ipaddr.sin_addr);
2906	}
2907	if (ia == NULL) {
2908	type = ICMP_UNREACH;
2909	code = ICMP_UNREACH_SRCFAIL;
2910	goto bad;
2911	}
2912	ip->ip_dst = ipaddr.sin_addr;
2913	IFA_LOCK(&ia->ia_ifa);
2914	(void) memcpy(dst: cp + off, src: &(IA_SIN(ia)->sin_addr),
2915	n: sizeof(struct in_addr));
2916	IFA_UNLOCK(&ia->ia_ifa);
2917	ifa_remref(ifa: &ia->ia_ifa);
2918	ia = NULL;
2919	cp[IPOPT_OFFSET] += sizeof(struct in_addr);
2920	/*
2921	* Let ip_intr's mcast routing check handle mcast pkts
2922	*/
2923	forward = !IN_MULTICAST(ntohl(ip->ip_dst.s_addr));
2924	break;
2925
2926	case IPOPT_RR:
2927	if (optlen < IPOPT_OFFSET + sizeof(*cp)) {
2928	code = (uint8_t)(&cp[IPOPT_OFFSET] - (u_char *)ip);
2929	goto bad;
2930	}
2931	if ((off = cp[IPOPT_OFFSET]) < IPOPT_MINOFF) {
2932	code = (uint8_t)(&cp[IPOPT_OFFSET] - (u_char *)ip);
2933	goto bad;
2934	}
2935	/*
2936	* If no space remains, ignore.
2937	*/
2938	off--; / 0 origin /
2939	if (off > optlen - (int)sizeof(struct in_addr)) {
2940	break;
2941	}
2942	(void) memcpy(dst: &ipaddr.sin_addr, src: &ip->ip_dst,
2943	n: sizeof(ipaddr.sin_addr));
2944	/*
2945	* locate outgoing interface; if we're the destination,
2946	* use the incoming interface (should be same).
2947	*/
2948	if ((ia = (INA)ifa_ifwithaddr(SA(&ipaddr))) == NULL) {
2949	if ((ia = ip_rtaddr(ipaddr.sin_addr)) == NULL) {
2950	type = ICMP_UNREACH;
2951	code = ICMP_UNREACH_HOST;
2952	goto bad;
2953	}
2954	}
2955	IFA_LOCK(&ia->ia_ifa);
2956	(void) memcpy(dst: cp + off, src: &(IA_SIN(ia)->sin_addr),
2957	n: sizeof(struct in_addr));
2958	IFA_UNLOCK(&ia->ia_ifa);
2959	ifa_remref(ifa: &ia->ia_ifa);
2960	ia = NULL;
2961	cp[IPOPT_OFFSET] += sizeof(struct in_addr);
2962	break;
2963
2964	case IPOPT_TS:
2965	code = (uint8_t)(cp - (u_char *)ip);
2966	ipt = (struct ip_timestamp )(void* *)cp;
2967	if (ipt->ipt_len < `4` \|\| ipt->ipt_len > `40`) {
2968	code = (uint8_t)((u_char *)&ipt->ipt_len -
2969	(u_char *)ip);
2970	goto bad;
2971	}
2972	if (ipt->ipt_ptr < `5`) {
2973	code = (uint8_t)((u_char *)&ipt->ipt_ptr -
2974	(u_char *)ip);
2975	goto bad;
2976	}
2977	if (ipt->ipt_ptr >
2978	ipt->ipt_len - (int)sizeof(int32_t)) {
2979	if (++ipt->ipt_oflw == `0`) {
2980	code = (uint8_t)((u_char *)&ipt->ipt_ptr -
2981	(u_char *)ip);
2982	goto bad;
2983	}
2984	break;
2985	}
2986	sin = (struct in_addr )(void* *)(cp + ipt->ipt_ptr - `1`);
2987	switch (ipt->ipt_flg) {
2988	case IPOPT_TS_TSONLY:
2989	break;
2990
2991	case IPOPT_TS_TSANDADDR:
2992	if (ipt->ipt_ptr - `1` + sizeof(n_time) +
2993	sizeof(struct in_addr) > ipt->ipt_len) {
2994	code = (uint8_t)((u_char *)&ipt->ipt_ptr -
2995	(u_char *)ip);
2996	goto bad;
2997	}
2998	ipaddr.sin_addr = dst;
2999	ia = (INA)ifaof_ifpforaddr(SA(&ipaddr),
3000	m->m_pkthdr.rcvif);
3001	if (ia == NULL) {
3002	continue;
3003	}
3004	IFA_LOCK(&ia->ia_ifa);
3005	(void) memcpy(dst: sin, src: &IA_SIN(ia)->sin_addr,
3006	n: sizeof(struct in_addr));
3007	IFA_UNLOCK(&ia->ia_ifa);
3008	ipt->ipt_ptr += sizeof(struct in_addr);
3009	ifa_remref(ifa: &ia->ia_ifa);
3010	ia = NULL;
3011	break;
3012
3013	case IPOPT_TS_PRESPEC:
3014	if (ipt->ipt_ptr - `1` + sizeof(n_time) +
3015	sizeof(struct in_addr) > ipt->ipt_len) {
3016	code = (uint8_t)((u_char *)&ipt->ipt_ptr -
3017	(u_char *)ip);
3018	goto bad;
3019	}
3020	(void) memcpy(dst: &ipaddr.sin_addr, src: sin,
3021	n: sizeof(struct in_addr));
3022	if ((ia = (struct in_ifaddr *)ifa_ifwithaddr(
3023	SA(&ipaddr))) == NULL) {
3024	continue;
3025	}
3026	ifa_remref(ifa: &ia->ia_ifa);
3027	ia = NULL;
3028	ipt->ipt_ptr += sizeof(struct in_addr);
3029	break;
3030
3031	default:
3032	/ XXX can't take &ipt->ipt_flg /
3033	code = (uint8_t)((u_char *)&ipt->ipt_ptr -
3034	(u_char *)ip + `1`);
3035	goto bad;
3036	}
3037	ntime = iptime();
3038	(void) memcpy(dst: cp + ipt->ipt_ptr - `1`, src: &ntime,
3039	n: sizeof(n_time));
3040	ipt->ipt_ptr += sizeof(n_time);
3041	}
3042	}
3043	if (forward && ipforwarding) {
3044	ip_forward(m, `1`, next_hop);
3045	return `1`;
3046	}
3047	return `0`;
3048	bad:
3049	icmp_error(m, type, code, `0`, `0`);
3050	OSAddAtomic(`1`, &ipstat.ips_badoptions);
3051	return `1`;
3052	}
3053
3054	/*
3055	* Check for the presence of the IP Router Alert option [RFC2113]
3056	* in the header of an IPv4 datagram.
3057	*
3058	* This call is not intended for use from the forwarding path; it is here
3059	* so that protocol domains may check for the presence of the option.
3060	* Given how FreeBSD's IPv4 stack is currently structured, the Router Alert
3061	* option does not have much relevance to the implementation, though this
3062	* may change in future.
3063	* Router alert options SHOULD be passed if running in IPSTEALTH mode and
3064	* we are not the endpoint.
3065	* Length checks on individual options should already have been peformed
3066	* by ip_dooptions() therefore they are folded under DIAGNOSTIC here.
3067	*
3068	* Return zero if not present or options are invalid, non-zero if present.
3069	*/
3070	int
3071	ip_checkrouteralert(struct mbuf *m)
3072	{
3073	struct ip ip = mtod(m, struct* ip *);
3074	u_char *cp;
3075	int opt, optlen, cnt, found_ra;
3076
3077	found_ra = `0`;
3078	cp = (u_char *)(ip + `1`);
3079	cnt = (IP_VHL_HL(ip->ip_vhl) << `2`) - sizeof(struct ip);
3080	for (; cnt > `0`; cnt -= optlen, cp += optlen) {
3081	opt = cp[IPOPT_OPTVAL];
3082	if (opt == IPOPT_EOL) {
3083	break;
3084	}
3085	if (opt == IPOPT_NOP) {
3086	optlen = `1`;
3087	} else {
3088	#ifdef DIAGNOSTIC
3089	if (cnt < IPOPT_OLEN + sizeof(*cp)) {
3090	break;
3091	}
3092	#endif
3093	optlen = cp[IPOPT_OLEN];
3094	#ifdef DIAGNOSTIC
3095	if (optlen < IPOPT_OLEN + sizeof(*cp) \|\| optlen > cnt) {
3096	break;
3097	}
3098	#endif
3099	}
3100	switch (opt) {
3101	case IPOPT_RA:
3102	#ifdef DIAGNOSTIC
3103	if (optlen != IPOPT_OFFSET + sizeof(uint16_t) \|\|
3104	(((uint16_t )(void *)&cp[IPOPT_OFFSET]) != `0`)) {
3105	break;
3106	} else
3107	#endif
3108	found_ra = `1`;
3109	break;
3110	default:
3111	break;
3112	}
3113	}
3114
3115	return found_ra;
3116	}
3117
3118	/*
3119	* Given address of next destination (final or next hop),
3120	* return internet address info of interface to be used to get there.
3121	*/
3122	struct in_ifaddr *
3123	ip_rtaddr(struct in_addr dst)
3124	{
3125	struct sockaddr_in *sin;
3126	struct ifaddr *rt_ifa;
3127	struct route ro;
3128
3129	bzero(s: &ro, n: sizeof(ro));
3130	sin = SIN(&ro.ro_dst);
3131	sin->sin_family = AF_INET;
3132	sin->sin_len = sizeof(*sin);
3133	sin->sin_addr = dst;
3134
3135	rtalloc_ign(&ro, RTF_PRCLONING);
3136	if (ro.ro_rt == NULL) {
3137	ROUTE_RELEASE(&ro);
3138	return NULL;
3139	}
3140
3141	RT_LOCK(ro.ro_rt);
3142	if ((rt_ifa = ro.ro_rt->rt_ifa) != NULL) {
3143	ifa_addref(ifa: rt_ifa);
3144	}
3145	RT_UNLOCK(ro.ro_rt);
3146	ROUTE_RELEASE(&ro);
3147
3148	return (struct in_ifaddr *)rt_ifa;
3149	}
3150
3151	/*
3152	* Save incoming source route for use in replies,
3153	* to be picked up later by ip_srcroute if the receiver is interested.
3154	*/
3155	void
3156	save_rte(u_char option, struct* in_addr dst)
3157	{
3158	unsigned olen;
3159
3160	olen = option[IPOPT_OLEN];
3161	#if DIAGNOSTIC
3162	if (ipprintfs) {
3163	printf("save_rte: olen %d\n", olen);
3164	}
3165	#endif
3166	if (olen > sizeof(ip_srcrt) - (`1` + sizeof(dst))) {
3167	return;
3168	}
3169	bcopy(src: option, dst: ip_srcrt.srcopt, n: olen);
3170	ip_nhops = (olen - IPOPT_OFFSET - `1`) / sizeof(struct in_addr);
3171	ip_srcrt.dst = dst;
3172	}
3173
3174	/*
3175	* Retrieve incoming source route for use in replies,
3176	* in the same form used by setsockopt.
3177	* The first hop is placed before the options, will be removed later.
3178	*/
3179	struct mbuf *
3180	ip_srcroute(void)
3181	{
3182	struct in_addr p, q;
3183	struct mbuf *m;
3184
3185	if (ip_nhops == `0`) {
3186	return NULL;
3187	}
3188
3189	m = m_get(M_DONTWAIT, MT_HEADER);
3190	if (m == NULL) {
3191	return NULL;
3192	}
3193
3194	#define OPTSIZ (sizeof (ip_srcrt.nop) + sizeof (ip_srcrt.srcopt))
3195
3196	/ length is (nhops+1)sizeof(addr) + sizeof(nop + srcrt header) /*
3197	m->m_len = ip_nhops * sizeof(struct in_addr) +
3198	sizeof(struct in_addr) + OPTSIZ;
3199	#if DIAGNOSTIC
3200	if (ipprintfs) {
3201	printf("ip_srcroute: nhops %d mlen %d", ip_nhops, m->m_len);
3202	}
3203	#endif
3204
3205	/*
3206	* First save first hop for return route
3207	*/
3208	p = &ip_srcrt.route[ip_nhops - `1`];
3209	(mtod(m, struct* in_addr )) = p--;
3210	#if DIAGNOSTIC
3211	if (ipprintfs) {
3212	printf(" hops %lx",
3213	(u_int32_t)ntohl(mtod(m, struct in_addr *)->s_addr));
3214	}
3215	#endif
3216
3217	/*
3218	* Copy option fields and padding (nop) to mbuf.
3219	*/
3220	ip_srcrt.nop = IPOPT_NOP;
3221	ip_srcrt.srcopt[IPOPT_OFFSET] = IPOPT_MINOFF;
3222	(void) __nochk_memcpy(mtod(m, caddr_t) + sizeof(struct in_addr),
3223	src: &ip_srcrt.nop, OPTSIZ);
3224	q = (struct in_addr )(void* *)(mtod(m, caddr_t) +
3225	sizeof(struct in_addr) + OPTSIZ);
3226	#undef OPTSIZ
3227	/*
3228	* Record return path as an IP source route,
3229	* reversing the path (pointers are now aligned).
3230	*/
3231	while (p >= ip_srcrt.route) {
3232	#if DIAGNOSTIC
3233	if (ipprintfs) {
3234	printf(" %lx", (u_int32_t)ntohl(q->s_addr));
3235	}
3236	#endif
3237	q++ = p--;
3238	}
3239	/*
3240	* Last hop goes to final destination.
3241	*/
3242	*q = ip_srcrt.dst;
3243	#if DIAGNOSTIC
3244	if (ipprintfs) {
3245	printf(" %lx\n", (u_int32_t)ntohl(q->s_addr));
3246	}
3247	#endif
3248	return m;
3249	}
3250
3251	/*
3252	* Strip out IP options, at higher level protocol in the kernel.
3253	*/
3254	void
3255	ip_stripoptions(struct mbuf *m)
3256	{
3257	int i;
3258	struct ip ip = mtod(m, struct* ip *);
3259	caddr_t opts;
3260	int olen;
3261
3262	/ Expect 32-bit aligned data pointer on strict-align platforms /
3263	MBUF_STRICT_DATA_ALIGNMENT_CHECK_32(m);
3264
3265	/ use bcopy() since it supports overlapping range /
3266	olen = (IP_VHL_HL(ip->ip_vhl) << `2`) - sizeof(struct ip);
3267	opts = (caddr_t)(ip + `1`);
3268	i = m->m_len - (sizeof(struct ip) + olen);
3269	bcopy(src: opts + olen, dst: opts, n: (unsigned)i);
3270	m->m_len -= olen;
3271	if (m->m_flags & M_PKTHDR) {
3272	m->m_pkthdr.len -= olen;
3273	}
3274	ip->ip_vhl = IP_MAKE_VHL(IPVERSION, sizeof(struct ip) >> `2`);
3275
3276	/*
3277	* We expect ip_{off,len} to be in host order by now, and
3278	* that the original IP header length has been subtracted
3279	* out from ip_len. Temporarily adjust ip_len for checksum
3280	* recalculation, and restore it afterwards.
3281	*/
3282	ip->ip_len += sizeof(struct ip);
3283
3284	/ recompute checksum now that IP header is smaller /
3285	#if BYTE_ORDER != BIG_ENDIAN
3286	HTONS(ip->ip_len);
3287	HTONS(ip->ip_off);
3288	#endif /* BYTE_ORDER != BIG_ENDIAN */
3289	ip->ip_sum = in_cksum_hdr(ip);
3290	#if BYTE_ORDER != BIG_ENDIAN
3291	NTOHS(ip->ip_off);
3292	NTOHS(ip->ip_len);
3293	#endif /* BYTE_ORDER != BIG_ENDIAN */
3294
3295	ip->ip_len -= sizeof(struct ip);
3296
3297	/*
3298	* Given that we've just stripped IP options from the header,
3299	* we need to adjust the start offset accordingly if this
3300	* packet had gone thru partial checksum offload.
3301	*/
3302	if ((m->m_pkthdr.csum_flags & (CSUM_DATA_VALID \| CSUM_PARTIAL)) ==
3303	(CSUM_DATA_VALID \| CSUM_PARTIAL)) {
3304	if (m->m_pkthdr.csum_rx_start >= (sizeof(struct ip) + olen)) {
3305	/ most common case /
3306	m->m_pkthdr.csum_rx_start -= olen;
3307	} else {
3308	/ compute checksum in software instead /
3309	m->m_pkthdr.csum_flags &= ~CSUM_DATA_VALID;
3310	m->m_pkthdr.csum_data = `0`;
3311	ipstat.ips_adj_hwcsum_clr++;
3312	}
3313	}
3314	}
3315
/*
 * Table of errno values with PRC_NCMDS entries; presumably indexed by
 * PRC_* control command code, with 0 meaning that no error is reported
 * for that command (confirm against the PRC_* definitions and users).
 */
u_char inetctlerrmap[PRC_NCMDS] = {
	0, 0, 0, 0,
	0, EMSGSIZE, EHOSTDOWN, EHOSTUNREACH,
	ENETUNREACH, EHOSTUNREACH, ECONNREFUSED, ECONNREFUSED,
	EMSGSIZE, EHOSTUNREACH, 0, 0,
	0, 0, EHOSTUNREACH, 0,
	ENOPROTOOPT, ECONNREFUSED
};
3324
3325	static int
3326	sysctl_ipforwarding SYSCTL_HANDLER_ARGS
3327	{
3328	#pragma unused(arg1, arg2)
3329	int i, was_ipforwarding = ipforwarding;
3330
3331	i = sysctl_handle_int(oidp, arg1: oidp->oid_arg1, arg2: oidp->oid_arg2, req);
3332	if (i != `0` \|\| req->newptr == USER_ADDR_NULL) {
3333	return i;
3334	}
3335
3336	if (was_ipforwarding && !ipforwarding) {
3337	/ clean up IPv4 forwarding cached routes /
3338	ifnet_head_lock_shared();
3339	for (i = `0`; i <= if_index; i++) {
3340	struct ifnet *ifp = ifindex2ifnet[i];
3341	if (ifp != NULL) {
3342	lck_mtx_lock(lck: &ifp->if_cached_route_lock);
3343	ROUTE_RELEASE(&ifp->if_fwd_route);
3344	bzero(s: &ifp->if_fwd_route,
3345	n: sizeof(ifp->if_fwd_route));
3346	lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
3347	}
3348	}
3349	ifnet_head_done();
3350	}
3351
3352	return `0`;
3353	}
3354
3355	/*
3356	* Similar to inp_route_{copyout,copyin} routines except that these copy
3357	* out the cached IPv4 forwarding route from struct ifnet instead of the
3358	* inpcb. See comments for those routines for explanations.
3359	*/
3360	static void
3361	ip_fwd_route_copyout(struct ifnet ifp, struct* route *dst)
3362	{
3363	struct route *src = &ifp->if_fwd_route;
3364
3365	lck_mtx_lock_spin(lck: &ifp->if_cached_route_lock);
3366	lck_mtx_convert_spin(lck: &ifp->if_cached_route_lock);
3367
3368	/ Minor sanity check /
3369	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
3370	panic("%s: wrong or corrupted route: %p", __func__, src);
3371	}
3372
3373	route_copyout(dst, src, sizeof(*dst));
3374
3375	lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
3376	}
3377
3378	static void
3379	ip_fwd_route_copyin(struct ifnet ifp, struct* route *src)
3380	{
3381	struct route *dst = &ifp->if_fwd_route;
3382
3383	lck_mtx_lock_spin(lck: &ifp->if_cached_route_lock);
3384	lck_mtx_convert_spin(lck: &ifp->if_cached_route_lock);
3385
3386	/ Minor sanity check /
3387	if (src->ro_rt != NULL && rt_key(src->ro_rt)->sa_family != AF_INET) {
3388	panic("%s: wrong or corrupted route: %p", __func__, src);
3389	}
3390
3391	if (ifp->if_fwd_cacheok) {
3392	route_copyin(src, dst, sizeof(*src));
3393	}
3394
3395	lck_mtx_unlock(lck: &ifp->if_cached_route_lock);
3396	}
3397
3398	/*
3399	* Forward a packet. If some error occurs return the sender
3400	* an icmp packet. Note we can't always generate a meaningful
3401	* icmp message because icmp doesn't have a large enough repertoire
3402	* of codes and types.
3403	*
3404	* If not forwarding, just drop the packet. This could be confusing
3405	* if ipforwarding was zero but some routing protocol was advancing
3406	* us as a gateway to somewhere. However, we must let the routing
3407	* protocol deal with that.
3408	*
3409	* The srcrt parameter indicates whether the packet is being forwarded
3410	* via a source route.
3411	*/
3412	static void
3413	ip_forward(struct mbuf m, int* srcrt, struct sockaddr_in *next_hop)
3414	{
3415	#pragma unused(next_hop)
3416	struct ip ip = mtod(m, struct* ip *);
3417	struct sockaddr_in *sin;
3418	struct rtentry *rt;
3419	struct route fwd_rt;
3420	int error, type = `0`, code = `0`;
3421	struct mbuf *mcopy;
3422	n_long dest;
3423	struct in_addr pkt_dst;
3424	u_int32_t nextmtu = `0`, len;
3425	struct ip_out_args ipoa;
3426	struct ifnet *rcvifp = m->m_pkthdr.rcvif;
3427
3428	bzero(s: &ipoa, n: sizeof(ipoa));
3429	ipoa.ipoa_boundif = IFSCOPE_NONE;
3430	ipoa.ipoa_sotc = SO_TC_UNSPEC;
3431	ipoa.ipoa_netsvctype = _NET_SERVICE_TYPE_UNSPEC;
3432
3433	#if IPSEC
3434	struct secpolicy *sp = NULL;
3435	int ipsecerror;
3436	#endif /* IPSEC */
3437	#if PF
3438	struct pf_mtag *pf_mtag;
3439	#endif /* PF */
3440
3441	dest = `0`;
3442	pkt_dst = ip->ip_dst;
3443
3444	#if DIAGNOSTIC
3445	if (ipprintfs) {
3446	printf("forward: src %lx dst %lx ttl %x\n",
3447	(u_int32_t)ip->ip_src.s_addr, (u_int32_t)pkt_dst.s_addr,
3448	ip->ip_ttl);
3449	}
3450	#endif
3451
3452	if (m->m_flags & (M_BCAST \| M_MCAST) \|\| !in_canforward(pkt_dst)) {
3453	OSAddAtomic(`1`, &ipstat.ips_cantforward);
3454	m_freem(m);
3455	return;
3456	}
3457	#if IPSTEALTH
3458	if (!ipstealth) {
3459	#endif /* IPSTEALTH */
3460	if (ip->ip_ttl <= IPTTLDEC) {
3461	icmp_error(m, ICMP_TIMXCEED, ICMP_TIMXCEED_INTRANS,
3462	dest, `0`);
3463	return;
3464	}
3465	#if IPSTEALTH
3466	}
3467	#endif /* IPSTEALTH */
3468
3469	#if PF
3470	pf_mtag = pf_find_mtag(m);
3471	if (pf_mtag != NULL && pf_mtag->pftag_rtableid != IFSCOPE_NONE) {
3472	ipoa.ipoa_boundif = pf_mtag->pftag_rtableid;
3473	ipoa.ipoa_flags \|= IPOAF_BOUND_IF;
3474	}
3475	#endif /* PF */
3476
3477	ip_fwd_route_copyout(ifp: rcvifp, dst: &fwd_rt);
3478
3479	sin = SIN(&fwd_rt.ro_dst);
3480	if (ROUTE_UNUSABLE(&fwd_rt) \|\| pkt_dst.s_addr != sin->sin_addr.s_addr) {
3481	ROUTE_RELEASE(&fwd_rt);
3482
3483	sin->sin_family = AF_INET;
3484	sin->sin_len = sizeof(*sin);
3485	sin->sin_addr = pkt_dst;
3486
3487	rtalloc_scoped_ign(&fwd_rt, RTF_PRCLONING, ipoa.ipoa_boundif);
3488	if (fwd_rt.ro_rt == NULL) {
3489	icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_HOST, dest, `0`);
3490	goto done;
3491	}
3492	}
3493	rt = fwd_rt.ro_rt;
3494
3495	/*
3496	* Save the IP header and at most 8 bytes of the payload,
3497	* in case we need to generate an ICMP message to the src.
3498	*
3499	* We don't use m_copy() because it might return a reference
3500	* to a shared cluster. Both this function and ip_output()
3501	* assume exclusive access to the IP header in `m', so any
3502	* data in a cluster may change before we reach icmp_error().
3503	*/
3504	MGET(mcopy, M_DONTWAIT, m->m_type);
3505	if (mcopy != NULL && m_dup_pkthdr(mcopy, m, M_DONTWAIT) == `0`) {
3506	mcopy->m_len = imin(a: (IP_VHL_HL(ip->ip_vhl) << `2`) + `8`,
3507	b: (int)ip->ip_len);
3508	m_copydata(m, `0`, mcopy->m_len, mtod(mcopy, caddr_t));
3509	}
3510
3511	#if IPSTEALTH
3512	if (!ipstealth) {
3513	#endif /* IPSTEALTH */
3514	ip->ip_ttl -= IPTTLDEC;
3515	#if IPSTEALTH
3516	}
3517	#endif /* IPSTEALTH */
3518
3519	/*
3520	* If forwarding packet using same interface that it came in on,
3521	* perhaps should send a redirect to sender to shortcut a hop.
3522	* Only send redirect if source is sending directly to us,
3523	* and if packet was not source routed (or has any options).
3524	* Also, don't send redirect if forwarding using a default route
3525	* or a route modified by a redirect.
3526	*/
3527	RT_LOCK_SPIN(rt);
3528	if (rt->rt_ifp == m->m_pkthdr.rcvif &&
3529	!(rt->rt_flags & (RTF_DYNAMIC \| RTF_MODIFIED)) &&
3530	satosin(rt_key(rt))->sin_addr.s_addr != INADDR_ANY &&
3531	ipsendredirects && !srcrt && rt->rt_ifa != NULL) {
3532	struct in_ifaddr ia = (struct* in_ifaddr *)rt->rt_ifa;
3533	u_int32_t src = ntohl(ip->ip_src.s_addr);
3534
3535	/ Become a regular mutex /
3536	RT_CONVERT_LOCK(rt);
3537	IFA_LOCK_SPIN(&ia->ia_ifa);
3538	if ((src & ia->ia_subnetmask) == ia->ia_subnet) {
3539	if (rt->rt_flags & RTF_GATEWAY) {
3540	dest = satosin(rt->rt_gateway)->sin_addr.s_addr;
3541	} else {
3542	dest = pkt_dst.s_addr;
3543	}
3544	/*
3545	* Router requirements says to only send
3546	* host redirects.
3547	*/
3548	type = ICMP_REDIRECT;
3549	code = ICMP_REDIRECT_HOST;
3550	#if DIAGNOSTIC
3551	if (ipprintfs) {
3552	printf("redirect (%d) to %lx\n", code,
3553	(u_int32_t)dest);
3554	}
3555	#endif
3556	}
3557	IFA_UNLOCK(&ia->ia_ifa);
3558	}
3559	RT_UNLOCK(rt);
3560
3561
3562	/ Mark this packet as being forwarded from another interface /
3563	m->m_pkthdr.pkt_flags \|= PKTF_FORWARDED;
3564	len = m_pktlen(m);
3565
3566	error = ip_output(m, NULL, &fwd_rt, IP_FORWARDING \| IP_OUTARGS,
3567	NULL, &ipoa);
3568
3569	/ Refresh rt since the route could have changed while in IP /
3570	rt = fwd_rt.ro_rt;
3571
3572	if (error != `0`) {
3573	OSAddAtomic(`1`, &ipstat.ips_cantforward);
3574	} else {
3575	/*
3576	* Increment stats on the source interface; the ones
3577	* for destination interface has been taken care of
3578	* during output above by virtue of PKTF_FORWARDED.
3579	*/
3580	rcvifp->if_fpackets++;
3581	rcvifp->if_fbytes += len;
3582
3583	OSAddAtomic(`1`, &ipstat.ips_forward);
3584	if (type != `0`) {
3585	OSAddAtomic(`1`, &ipstat.ips_redirectsent);
3586	} else {
3587	if (mcopy != NULL) {
3588	/*
3589	* If we didn't have to go thru ipflow and
3590	* the packet was successfully consumed by
3591	* ip_output, the mcopy is rather a waste;
3592	* this could be further optimized.
3593	*/
3594	m_freem(mcopy);
3595	}
3596	goto done;
3597	}
3598	}
3599	if (mcopy == NULL) {
3600	goto done;
3601	}
3602
3603	switch (error) {
3604	case `0`: / forwarded, but need redirect /
3605	/ type, code set above /
3606	break;
3607
3608	case ENETUNREACH: / shouldn't happen, checked above /
3609	case EHOSTUNREACH:
3610	case ENETDOWN:
3611	case EHOSTDOWN:
3612	default:
3613	type = ICMP_UNREACH;
3614	code = ICMP_UNREACH_HOST;
3615	break;
3616
3617	case EMSGSIZE:
3618	type = ICMP_UNREACH;
3619	code = ICMP_UNREACH_NEEDFRAG;
3620
3621	if (rt == NULL) {
3622	break;
3623	} else {
3624	RT_LOCK_SPIN(rt);
3625	if (rt->rt_ifp != NULL) {
3626	nextmtu = rt->rt_ifp->if_mtu;
3627	}
3628	RT_UNLOCK(rt);
3629	}
3630	#ifdef IPSEC
3631	if (ipsec_bypass) {
3632	break;
3633	}
3634
3635	/*
3636	* If the packet is routed over IPsec tunnel, tell the
3637	* originator the tunnel MTU.
3638	* tunnel MTU = if MTU - sizeof(IP) - ESP/AH hdrsiz
3639	* XXX quickhack!!!
3640	*/
3641	sp = ipsec4_getpolicybyaddr(mcopy, IPSEC_DIR_OUTBOUND,
3642	IP_FORWARDING, &ipsecerror);
3643
3644	if (sp == NULL) {
3645	break;
3646	}
3647
3648	/*
3649	* find the correct route for outer IPv4
3650	* header, compute tunnel MTU.
3651	*/
3652	nextmtu = `0`;
3653
3654	if (sp->req != NULL &&
3655	sp->req->saidx.mode == IPSEC_MODE_TUNNEL) {
3656	struct secasindex saidx;
3657	struct secasvar *sav;
3658	struct route *ro;
3659	struct ip *ipm;
3660	size_t ipsechdr;
3661
3662	/ count IPsec header size /
3663	ipsechdr = ipsec_hdrsiz(sp);
3664
3665	ipm = mtod(mcopy, struct ip *);
3666	bcopy(src: &sp->req->saidx, dst: &saidx, n: sizeof(saidx));
3667	saidx.mode = sp->req->saidx.mode;
3668	saidx.reqid = sp->req->saidx.reqid;
3669	sin = SIN(&saidx.src);
3670	if (sin->sin_len == `0`) {
3671	sin->sin_len = sizeof(*sin);
3672	sin->sin_family = AF_INET;
3673	sin->sin_port = IPSEC_PORT_ANY;
3674	bcopy(src: &ipm->ip_src, dst: &sin->sin_addr,
3675	n: sizeof(sin->sin_addr));
3676	}
3677	sin = SIN(&saidx.dst);
3678	if (sin->sin_len == `0`) {
3679	sin->sin_len = sizeof(*sin);
3680	sin->sin_family = AF_INET;
3681	sin->sin_port = IPSEC_PORT_ANY;
3682	bcopy(src: &ipm->ip_dst, dst: &sin->sin_addr,
3683	n: sizeof(sin->sin_addr));
3684	}
3685	sav = key_allocsa_policy(&saidx);
3686	if (sav != NULL) {
3687	lck_mtx_lock(sadb_mutex);
3688	if (sav->sah != NULL) {
3689	ro = (struct route *)&sav->sah->sa_route;
3690	if (ro->ro_rt != NULL) {
3691	RT_LOCK(ro->ro_rt);
3692	if (ro->ro_rt->rt_ifp != NULL) {
3693	nextmtu = ro->ro_rt->
3694	rt_ifp->if_mtu;
3695	nextmtu -= ipsechdr;
3696	}
3697	RT_UNLOCK(ro->ro_rt);
3698	}
3699	}
3700	key_freesav(sav, KEY_SADB_LOCKED);
3701	lck_mtx_unlock(sadb_mutex);
3702	}
3703	}
3704	key_freesp(sp, KEY_SADB_UNLOCKED);
3705	#endif /* IPSEC */
3706	break;
3707
3708	case ENOBUFS:
3709	/*
3710	* A router should not generate ICMP_SOURCEQUENCH as
3711	* required in RFC1812 Requirements for IP Version 4 Routers.
3712	* Source quench could be a big problem under DoS attacks,
3713	* or if the underlying interface is rate-limited.
3714	* Those who need source quench packets may re-enable them
3715	* via the net.inet.ip.sendsourcequench sysctl.
3716	*/
3717	if (ip_sendsourcequench == `0`) {
3718	m_freem(mcopy);
3719	goto done;
3720	} else {
3721	type = ICMP_SOURCEQUENCH;
3722	code = `0`;
3723	}
3724	break;
3725
3726	case EACCES:
3727	m_freem(mcopy);
3728	goto done;
3729	}
3730
3731	if (type == ICMP_UNREACH && code == ICMP_UNREACH_NEEDFRAG) {
3732	OSAddAtomic(`1`, &ipstat.ips_cantfrag);
3733	}
3734
3735	icmp_error(mcopy, type, code, dest, nextmtu);
3736	done:
3737	ip_fwd_route_copyin(ifp: rcvifp, src: &fwd_rt);
3738	}
3739
3740	int
3741	ip_savecontrol(struct inpcb inp, struct* mbuf mp, struct** ip *ip,
3742	struct mbuf *m)
3743	{
3744	*mp = NULL;
3745	if (inp->inp_socket->so_options & SO_TIMESTAMP) {
3746	struct timeval tv;
3747
3748	getmicrotime(&tv);
3749	mp = sbcreatecontrol_mbuf(p: (caddr_t)&tv, size: sizeof(tv),
3750	SCM_TIMESTAMP, SOL_SOCKET, m: mp);
3751	if (*mp == NULL) {
3752	goto no_mbufs;
3753	}
3754	}
3755	if (inp->inp_socket->so_options & SO_TIMESTAMP_MONOTONIC) {
3756	uint64_t time;
3757
3758	time = mach_absolute_time();
3759	mp = sbcreatecontrol_mbuf(p: (caddr_t)&time, size: sizeof(time),
3760	SCM_TIMESTAMP_MONOTONIC, SOL_SOCKET, m: mp);
3761	if (*mp == NULL) {
3762	goto no_mbufs;
3763	}
3764	}
3765	if (inp->inp_socket->so_options & SO_TIMESTAMP_CONTINUOUS) {
3766	uint64_t time;
3767
3768	time = mach_continuous_time();
3769	mp = sbcreatecontrol_mbuf(p: (caddr_t)&time, size: sizeof(time),
3770	SCM_TIMESTAMP_CONTINUOUS, SOL_SOCKET, m: mp);
3771	if (*mp == NULL) {
3772	goto no_mbufs;
3773	}
3774	}
3775	if (inp->inp_socket->so_flags & SOF_RECV_TRAFFIC_CLASS) {
3776	int tc = m_get_traffic_class(m);
3777
3778	mp = sbcreatecontrol_mbuf(p: (caddr_t)&tc, size: sizeof(tc),
3779	SO_TRAFFIC_CLASS, SOL_SOCKET, m: mp);
3780	if (*mp == NULL) {
3781	goto no_mbufs;
3782	}
3783	}
3784	if ((inp->inp_socket->so_flags & SOF_RECV_WAKE_PKT) &&
3785	(m->m_pkthdr.pkt_flags & PKTF_WAKE_PKT)) {
3786	int flag = `1`;
3787
3788	mp = sbcreatecontrol_mbuf(p: (caddr_t)&flag, size: sizeof(flag),
3789	SO_RECV_WAKE_PKT, SOL_SOCKET, m: mp);
3790	if (*mp == NULL) {
3791	goto no_mbufs;
3792	}
3793	}
3794
3795	if (inp->inp_flags & INP_RECVDSTADDR \|\| SOFLOW_ENABLED(inp->inp_socket)) {
3796	mp = sbcreatecontrol_mbuf(p: (caddr_t)&ip->ip_dst,
3797	size: sizeof(struct in_addr), IP_RECVDSTADDR, IPPROTO_IP, m: mp);
3798	if (*mp == NULL) {
3799	goto no_mbufs;
3800	}
3801	}
3802	#ifdef notyet
3803	/*
3804	* XXX
3805	* Moving these out of udp_input() made them even more broken
3806	* than they already were.
3807	*/
3808	/ options were tossed already /
3809	if (inp->inp_flags & INP_RECVOPTS) {
3810	mp = sbcreatecontrol_mbuf((caddr_t)opts_deleted_above,
3811	sizeof(struct in_addr), IP_RECVOPTS, IPPROTO_IP, mp);
3812	if (*mp == NULL) {
3813	goto no_mbufs;
3814	}
3815	}
3816	/ ip_srcroute doesn't do what we want here, need to fix /
3817	if (inp->inp_flags & INP_RECVRETOPTS) {
3818	mp = sbcreatecontrol_mbuf((caddr_t)ip_srcroute(),
3819	sizeof(struct in_addr), IP_RECVRETOPTS, IPPROTO_IP, mp);
3820	if (*mp == NULL) {
3821	goto no_mbufs;
3822	}
3823	}
3824	#endif /* notyet */
3825	if (inp->inp_flags & INP_RECVIF) {
3826	struct ifnet *ifp;
3827	uint8_t sdlbuf[SOCK_MAXADDRLEN + `1`];
3828	struct sockaddr_dl *sdl2 = SDL(sdlbuf);
3829
3830	/*
3831	* Make sure to accomodate the largest possible
3832	* size of SA(if_lladdr)->sa_len.
3833	*/
3834	_CASSERT(sizeof(sdlbuf) == (SOCK_MAXADDRLEN + `1`));
3835
3836	ifnet_head_lock_shared();
3837	if ((ifp = m->m_pkthdr.rcvif) != NULL &&
3838	ifp->if_index && IF_INDEX_IN_RANGE(ifp->if_index)) {
3839	struct ifaddr *ifa = ifnet_addrs[ifp->if_index - `1`];
3840	struct sockaddr_dl *sdp;
3841
3842	if (!ifa \|\| !ifa->ifa_addr) {
3843	goto makedummy;
3844	}
3845
3846	IFA_LOCK_SPIN(ifa);
3847	sdp = SDL(ifa->ifa_addr);
3848	/*
3849	* Change our mind and don't try copy.
3850	*/
3851	if (sdp->sdl_family != AF_LINK) {
3852	IFA_UNLOCK(ifa);
3853	goto makedummy;
3854	}
3855	/ the above _CASSERT ensures sdl_len fits in sdlbuf /
3856	SOCKADDR_COPY(sdp, sdl2, sdp->sdl_len);
3857	IFA_UNLOCK(ifa);
3858	} else {
3859	makedummy:
3860	sdl2->sdl_len =
3861	offsetof(struct sockaddr_dl, sdl_data[`0`]);
3862	sdl2->sdl_family = AF_LINK;
3863	sdl2->sdl_index = `0`;
3864	sdl2->sdl_nlen = sdl2->sdl_alen = sdl2->sdl_slen = `0`;
3865	}
3866	ifnet_head_done();
3867	mp = sbcreatecontrol_mbuf(p: (caddr_t)sdl2, size: sdl2->sdl_len,
3868	IP_RECVIF, IPPROTO_IP, m: mp);
3869	if (*mp == NULL) {
3870	goto no_mbufs;
3871	}
3872	}
3873	if (inp->inp_flags & INP_RECVTTL) {
3874	mp = sbcreatecontrol_mbuf(p: (caddr_t)&ip->ip_ttl,
3875	size: sizeof(ip->ip_ttl), IP_RECVTTL, IPPROTO_IP, m: mp);
3876	if (*mp == NULL) {
3877	goto no_mbufs;
3878	}
3879	}
3880	if (inp->inp_flags & INP_PKTINFO) {
3881	struct in_pktinfo pi;
3882
3883	bzero(s: &pi, n: sizeof(struct in_pktinfo));
3884	bcopy(src: &ip->ip_dst, dst: &pi.ipi_addr, n: sizeof(struct in_addr));
3885	pi.ipi_ifindex = (m != NULL && m->m_pkthdr.rcvif != NULL) ?
3886	m->m_pkthdr.rcvif->if_index : `0`;
3887
3888	mp = sbcreatecontrol_mbuf(p: (caddr_t)&pi,
3889	size: sizeof(struct in_pktinfo), IP_RECVPKTINFO, IPPROTO_IP, m: mp);
3890	if (*mp == NULL) {
3891	goto no_mbufs;
3892	}
3893	}
3894	if (inp->inp_flags & INP_RECVTOS) {
3895	mp = sbcreatecontrol_mbuf(p: (caddr_t)&ip->ip_tos,
3896	size: sizeof(u_char), IP_RECVTOS, IPPROTO_IP, m: mp);
3897	if (*mp == NULL) {
3898	goto no_mbufs;
3899	}
3900	}
3901	return `0`;
3902
3903	no_mbufs:
3904	ipstat.ips_pktdropcntrl++;
3905	return ENOBUFS;
3906	}
3907
3908	static inline u_short
3909	ip_cksum(struct mbuf m, int* hlen)
3910	{
3911	u_short sum;
3912
3913	if (m->m_pkthdr.csum_flags & CSUM_IP_CHECKED) {
3914	sum = !(m->m_pkthdr.csum_flags & CSUM_IP_VALID);
3915	} else if (!(m->m_pkthdr.rcvif->if_flags & IFF_LOOPBACK) &&
3916	!(m->m_pkthdr.pkt_flags & PKTF_LOOP)) {
3917	/*
3918	* The packet arrived on an interface which isn't capable
3919	* of performing IP header checksum; compute it now.
3920	*/
3921	sum = ip_cksum_hdr_in(m, hlen);
3922	} else {
3923	sum = `0`;
3924	m->m_pkthdr.csum_flags \|= (CSUM_DATA_VALID \| CSUM_PSEUDO_HDR \|
3925	CSUM_IP_CHECKED \| CSUM_IP_VALID);
3926	m->m_pkthdr.csum_data = `0xffff`;
3927	}
3928
3929	if (sum != `0`) {
3930	OSAddAtomic(`1`, &ipstat.ips_badsum);
3931	}
3932
3933	return sum;
3934	}
3935
3936	static int
3937	ip_getstat SYSCTL_HANDLER_ARGS
3938	{
3939	#pragma unused(oidp, arg1, arg2)
3940	if (req->oldptr == USER_ADDR_NULL) {
3941	req->oldlen = (size_t)sizeof(struct ipstat);
3942	}
3943
3944	return SYSCTL_OUT(req, &ipstat, MIN(sizeof(ipstat), req->oldlen));
3945	}
3946
3947	void
3948	ip_setsrcifaddr_info(struct mbuf m, uint16_t src_idx, struct* in_ifaddr *ia)
3949	{
3950	VERIFY(m->m_flags & M_PKTHDR);
3951
3952	/*
3953	* If the source ifaddr is specified, pick up the information
3954	* from there; otherwise just grab the passed-in ifindex as the
3955	* caller may not have the ifaddr available.
3956	*/
3957	if (ia != NULL) {
3958	m->m_pkthdr.pkt_flags \|= PKTF_IFAINFO;
3959	m->m_pkthdr.src_ifindex = ia->ia_ifp->if_index;
3960	} else {
3961	m->m_pkthdr.src_ifindex = src_idx;
3962	if (src_idx != `0`) {
3963	m->m_pkthdr.pkt_flags \|= PKTF_IFAINFO;
3964	}
3965	}
3966	}
3967
3968	void
3969	ip_setdstifaddr_info(struct mbuf m, uint16_t dst_idx, struct* in_ifaddr *ia)
3970	{
3971	VERIFY(m->m_flags & M_PKTHDR);
3972
3973	/*
3974	* If the destination ifaddr is specified, pick up the information
3975	* from there; otherwise just grab the passed-in ifindex as the
3976	* caller may not have the ifaddr available.
3977	*/
3978	if (ia != NULL) {
3979	m->m_pkthdr.pkt_flags \|= PKTF_IFAINFO;
3980	m->m_pkthdr.dst_ifindex = ia->ia_ifp->if_index;
3981	} else {
3982	m->m_pkthdr.dst_ifindex = dst_idx;
3983	if (dst_idx != `0`) {
3984	m->m_pkthdr.pkt_flags \|= PKTF_IFAINFO;
3985	}
3986	}
3987	}
3988
3989	int
3990	ip_getsrcifaddr_info(struct mbuf m, uint32_t src_idx, uint32_t *iaf)
3991	{
3992	VERIFY(m->m_flags & M_PKTHDR);
3993
3994	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
3995	return -`1`;
3996	}
3997
3998	if (src_idx != NULL) {
3999	*src_idx = m->m_pkthdr.src_ifindex;
4000	}
4001
4002	if (iaf != NULL) {
4003	*iaf = `0`;
4004	}
4005
4006	return `0`;
4007	}
4008
4009	int
4010	ip_getdstifaddr_info(struct mbuf m, uint32_t dst_idx, uint32_t *iaf)
4011	{
4012	VERIFY(m->m_flags & M_PKTHDR);
4013
4014	if (!(m->m_pkthdr.pkt_flags & PKTF_IFAINFO)) {
4015	return -`1`;
4016	}
4017
4018	if (dst_idx != NULL) {
4019	*dst_idx = m->m_pkthdr.dst_ifindex;
4020	}
4021
4022	if (iaf != NULL) {
4023	*iaf = `0`;
4024	}
4025
4026	return `0`;
4027	}
4028
4029	/*
4030	* Protocol input handler for IPPROTO_GRE.
4031	*/
4032	void
4033	gre_input(struct mbuf m, int* off)
4034	{
4035	gre_input_func_t fn = gre_input_func;
4036
4037	/*
4038	* If there is a registered GRE input handler, pass mbuf to it.
4039	*/
4040	if (fn != NULL) {
4041	lck_mtx_unlock(lck: inet_domain_mutex);
4042	m = fn(m, off, (mtod(m, struct ip *))->ip_p);
4043	lck_mtx_lock(lck: inet_domain_mutex);
4044	}
4045
4046	/*
4047	* If no matching tunnel that is up is found, we inject
4048	* the mbuf to raw ip socket to see if anyone picks it up.
4049	*/
4050	if (m != NULL) {
4051	rip_input(m, off);
4052	}
4053	}
4054
4055	/*
4056	* Private KPI for PPP/PPTP.
4057	*/
4058	int
4059	ip_gre_register_input(gre_input_func_t fn)
4060	{
4061	lck_mtx_lock(lck: inet_domain_mutex);
4062	gre_input_func = fn;
4063	lck_mtx_unlock(lck: inet_domain_mutex);
4064
4065	return `0`;
4066	}
4067
4068	#if (DEBUG \|\| DEVELOPMENT)
4069	static int
4070	sysctl_reset_ip_input_stats SYSCTL_HANDLER_ARGS
4071	{
4072	#pragma unused(arg1, arg2)
4073	int error, i;
4074
4075	i = ip_input_measure;
4076	error = sysctl_handle_int(oidp, &i, `0`, req);
4077	if (error \|\| req->newptr == USER_ADDR_NULL) {
4078	goto done;
4079	}
4080	/ impose bounds /
4081	if (i < `0` \|\| i > `1`) {
4082	error = EINVAL;
4083	goto done;
4084	}
4085	if (ip_input_measure != i && i == `1`) {
4086	net_perf_initialize(&net_perf, ip_input_measure_bins);
4087	}
4088	ip_input_measure = i;
4089	done:
4090	return error;
4091	}
4092
4093	static int
4094	sysctl_ip_input_measure_bins SYSCTL_HANDLER_ARGS
4095	{
4096	#pragma unused(arg1, arg2)
4097	int error;
4098	uint64_t i;
4099
4100	i = ip_input_measure_bins;
4101	error = sysctl_handle_quad(oidp, &i, `0`, req);
4102	if (error \|\| req->newptr == USER_ADDR_NULL) {
4103	goto done;
4104	}
4105	/ validate data /
4106	if (!net_perf_validate_bins(i)) {
4107	error = EINVAL;
4108	goto done;
4109	}
4110	ip_input_measure_bins = i;
4111	done:
4112	return error;
4113	}
4114
4115	static int
4116	sysctl_ip_input_getperf SYSCTL_HANDLER_ARGS
4117	{
4118	#pragma unused(oidp, arg1, arg2)
4119	if (req->oldptr == USER_ADDR_NULL) {
4120	req->oldlen = (size_t)sizeof(struct ipstat);
4121	}
4122
4123	return SYSCTL_OUT(req, &net_perf, MIN(sizeof(net_perf), req->oldlen));
4124	}
4125	#endif /* (DEBUG \|\| DEVELOPMENT) */
4126
4127	static int
4128	sysctl_ip_checkinterface SYSCTL_HANDLER_ARGS
4129	{
4130	#pragma unused(arg1, arg2)
4131	int error, i;
4132
4133	i = ip_checkinterface;
4134	error = sysctl_handle_int(oidp, arg1: &i, arg2: `0`, req);
4135	if (error != `0` \|\| req->newptr == USER_ADDR_NULL) {
4136	return error;
4137	}
4138
4139	switch (i) {
4140	case IP_CHECKINTERFACE_WEAK_ES:
4141	case IP_CHECKINTERFACE_HYBRID_ES:
4142	case IP_CHECKINTERFACE_STRONG_ES:
4143	if (ip_checkinterface != i) {
4144	ip_checkinterface = i;
4145	os_log(OS_LOG_DEFAULT, "%s: ip_checkinterface is now %d\n",
4146	__func__, ip_checkinterface);
4147	}
4148	break;
4149	default:
4150	error = EINVAL;
4151	break;
4152	}
4153	return error;
4154	}
4155

Browse the source code of xnu/bsd/netinet/ip_input.c