1/*
2 * Copyright (c) 2000-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
61 */
62/*
63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64 * support for mandatory and extensible security protections. This notice
65 * is included in support of clause 2.2 (b) of the Apple Public License,
66 * Version 2.0.
67 */
68
69#define _IP_VHL
70
71#include <sys/param.h>
72#include <sys/systm.h>
73#include <sys/kernel.h>
74#include <sys/malloc.h>
75#include <sys/mbuf.h>
76#include <sys/protosw.h>
77#include <sys/socket.h>
78#include <sys/socketvar.h>
79#include <kern/locks.h>
80#include <sys/sysctl.h>
81#include <sys/mcache.h>
82#include <sys/kdebug.h>
83
84#include <machine/endian.h>
85#include <pexpert/pexpert.h>
86#include <mach/sdt.h>
87
88#include <libkern/OSAtomic.h>
89#include <libkern/OSByteOrder.h>
90
91#include <net/if.h>
92#include <net/if_dl.h>
93#include <net/if_types.h>
94#include <net/route.h>
95#include <net/ntstat.h>
96#include <net/net_osdep.h>
97#include <net/dlil.h>
98#include <net/net_perf.h>
99
100#include <netinet/in.h>
101#include <netinet/in_systm.h>
102#include <netinet/ip.h>
103#include <netinet/in_pcb.h>
104#include <netinet/in_var.h>
105#include <netinet/ip_var.h>
106#include <netinet/kpi_ipfilter_var.h>
107#include <netinet/in_tclass.h>
108#include <netinet/udp.h>
109
110#include <netinet6/nd6.h>
111
112#if CONFIG_MACF_NET
113#include <security/mac_framework.h>
114#endif /* CONFIG_MACF_NET */
115
116#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1)
117#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3)
118#define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
119#define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
120
121#if IPSEC
122#include <netinet6/ipsec.h>
123#include <netkey/key.h>
124#if IPSEC_DEBUG
125#include <netkey/key_debug.h>
126#else
127#define KEYDEBUG(lev, arg)
128#endif
129#endif /* IPSEC */
130
131#if NECP
132#include <net/necp.h>
133#endif /* NECP */
134
135#if IPFIREWALL
136#include <netinet/ip_fw.h>
137#if IPDIVERT
138#include <netinet/ip_divert.h>
139#endif /* IPDIVERT */
140#endif /* IPFIREWALL */
141
142#if DUMMYNET
143#include <netinet/ip_dummynet.h>
144#endif
145
146#if PF
147#include <net/pfvar.h>
148#endif /* PF */
149
150#if IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG
/*
 * Debug helper: print the IPv4 address stored in a.s_addr (converted
 * from network byte order with ntohl()) in dotted-quad form.
 *
 * Each octet is an unsigned 32-bit quantity after the shift and mask, so
 * it is cast to unsigned int and printed with %u; the previous %ld
 * conversions did not match the argument type where long is 64 bits.
 * The macro argument is parenthesized so any expression yielding an
 * object with an s_addr member may be passed.  Note that the argument
 * is evaluated four times.
 *
 * The expansion deliberately still ends in a semicolon, exactly as it
 * always has, so existing call sites compile unchanged.
 */
#define print_ip(a) \
	printf("%u.%u.%u.%u", \
	    (unsigned int)((ntohl((a).s_addr) >> 24) & 0xFF), \
	    (unsigned int)((ntohl((a).s_addr) >> 16) & 0xFF), \
	    (unsigned int)((ntohl((a).s_addr) >> 8) & 0xFF), \
	    (unsigned int)(ntohl((a).s_addr) & 0xFF));
156#endif /* IPFIREWALL_FORWARD && IPFIREWALL_FORWARD_DEBUG */
157
158u_short ip_id;
159
160static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS;
161static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS;
162static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS;
163static void ip_out_cksum_stats(int, u_int32_t);
164static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
165static int ip_optcopy(struct ip *, struct ip *);
166static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
167static void imo_trace(struct ip_moptions *, int);
168static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *,
169 struct sockaddr_in *, int);
170static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);
171
172extern struct ip_linklocal_stat ip_linklocal_stat;
173
174/* temporary: for testing */
175#if IPSEC
176extern int ipsec_bypass;
177#endif
178
179static int ip_maxchainsent = 0;
180SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent,
181 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0,
182 "use dlil_output_list");
183#if DEBUG
184static int forge_ce = 0;
185SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce,
186 CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0,
187 "Forge ECN CE");
188#endif /* DEBUG */
189
190static int ip_select_srcif_debug = 0;
191SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug,
192 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0,
193 "log source interface selection debug info");
194
195static int ip_output_measure = 0;
196SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf,
197 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
198 &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I",
199 "Do time measurement");
200
201static uint64_t ip_output_measure_bins = 0;
202SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins,
203 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0,
204 sysctl_ip_output_measure_bins, "I",
205 "bins for chaining performance data histogram");
206
207static net_perf_t net_perf;
208SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data,
209 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
210 0, 0, sysctl_ip_output_getperf, "S,net_perf",
211 "IP output performance data (struct net_perf, net/net_perf.h)");
212
213__private_extern__ int rfc6864 = 1;
214SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW | CTLFLAG_LOCKED,
215 &rfc6864, 0, "updated ip id field behavior");
216
217#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */
218
219/* For gdb */
220__private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
221
222struct ip_moptions_dbg {
223 struct ip_moptions imo; /* ip_moptions */
224 u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */
225 u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */
226 /*
227 * Alloc and free callers.
228 */
229 ctrace_t imo_alloc;
230 ctrace_t imo_free;
231 /*
232 * Circular lists of IMO_ADDREF and IMO_REMREF callers.
233 */
234 ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE];
235 ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE];
236};
237
238#if DEBUG
239static unsigned int imo_debug = 1; /* debugging (enabled) */
240#else
241static unsigned int imo_debug; /* debugging (disabled) */
242#endif /* !DEBUG */
243static unsigned int imo_size; /* size of zone element */
244static struct zone *imo_zone; /* zone for ip_moptions */
245
246#define IMO_ZONE_MAX 64 /* maximum elements in zone */
247#define IMO_ZONE_NAME "ip_moptions" /* zone name */
248
/*
 * Single-packet entry point for IP output.
 *
 * m0 is an mbuf chain carrying a skeletal IP header (len, off, ttl,
 * proto, tos, src and dst filled in).  The chain is freed by the output
 * path; opt, when supplied, is not freed.
 *
 * The work is delegated to ip_output_list() with a packetchain argument
 * of 0; every other argument is forwarded as given and the callee's
 * return value is handed back unchanged.
 */
int
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct ip_out_args *ipoa)
{
	int error;

	error = ip_output_list(m0, 0, opt, ro, flags, imo, ipoa);

	return (error);
}
261
262/*
263 * IP output. The packet in mbuf chain m contains a skeletal IP
264 * header (with len, off, ttl, proto, tos, src, dst).
265 * The mbuf chain containing the packet will be freed.
266 * The mbuf opt, if present, will not be freed.
267 *
268 * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be
269 * skipped and ro->ro_rt would be used. Otherwise the result of route
270 * lookup is stored in ro->ro_rt.
271 *
272 * In the IP forwarding case, the packet will arrive with options already
273 * inserted, so must have a NULL opt pointer.
274 */
275int
276ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
277 struct route *ro, int flags, struct ip_moptions *imo,
278 struct ip_out_args *ipoa)
279{
280 struct ip *ip;
281 struct ifnet *ifp = NULL; /* not refcnt'd */
282 struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt;
283 int hlen = sizeof (struct ip);
284 int len = 0, error = 0;
285 struct sockaddr_in *dst = NULL;
286 struct in_ifaddr *ia = NULL, *src_ia = NULL;
287 struct in_addr pkt_dst;
288 struct ipf_pktopts *ippo = NULL;
289 ipfilter_t inject_filter_ref = NULL;
290 struct mbuf *packetlist;
291 uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0;
292 uint32_t packets_processed = 0;
293 unsigned int ifscope = IFSCOPE_NONE;
294 struct flowadv *adv = NULL;
295 struct timeval start_tv;
296#if IPSEC
297 struct socket *so = NULL;
298 struct secpolicy *sp = NULL;
299#endif /* IPSEC */
300#if NECP
301 necp_kernel_policy_result necp_result = 0;
302 necp_kernel_policy_result_parameter necp_result_parameter;
303 necp_kernel_policy_id necp_matched_policy_id = 0;
304#endif /* NECP */
305#if IPFIREWALL
306 int ipfwoff;
307 struct sockaddr_in *next_hop_from_ipfwd_tag = NULL;
308#endif /* IPFIREWALL */
309#if IPFIREWALL || DUMMYNET
310 struct m_tag *tag;
311#endif /* IPFIREWALL || DUMMYNET */
312#if DUMMYNET
313 struct ip_out_args saved_ipoa;
314 struct sockaddr_in dst_buf;
315#endif /* DUMMYNET */
316 struct {
317#if IPSEC
318 struct ipsec_output_state ipsec_state;
319#endif /* IPSEC */
320#if NECP
321 struct route necp_route;
322#endif /* NECP */
323#if IPFIREWALL || DUMMYNET
324 struct ip_fw_args args;
325#endif /* IPFIREWALL || DUMMYNET */
326#if IPFIREWALL_FORWARD
327 struct route sro_fwd;
328#endif /* IPFIREWALL_FORWARD */
329#if DUMMYNET
330 struct route saved_route;
331#endif /* DUMMYNET */
332 struct ipf_pktopts ipf_pktopts;
333 } ipobz;
334#define ipsec_state ipobz.ipsec_state
335#define necp_route ipobz.necp_route
336#define args ipobz.args
337#define sro_fwd ipobz.sro_fwd
338#define saved_route ipobz.saved_route
339#define ipf_pktopts ipobz.ipf_pktopts
340 union {
341 struct {
342 boolean_t select_srcif : 1; /* set once */
343 boolean_t srcbound : 1; /* set once */
344 boolean_t nocell : 1; /* set once */
345 boolean_t isbroadcast : 1;
346 boolean_t didfilter : 1;
347 boolean_t noexpensive : 1; /* set once */
348 boolean_t awdl_unrestricted : 1; /* set once */
349#if IPFIREWALL_FORWARD
350 boolean_t fwd_rewrite_src : 1;
351#endif /* IPFIREWALL_FORWARD */
352 };
353 uint32_t raw;
354 } ipobf = { .raw = 0 };
355
356 int interface_mtu = 0;
357
358/*
359 * Here we check for restrictions when sending frames.
360 * N.B.: IPv4 over internal co-processor interfaces is not allowed.
361 */
362#define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \
363 (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \
364 ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \
365 (IFNET_IS_INTCOPROC(_ifp)) || \
366 (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
367
368 if (ip_output_measure)
369 net_perf_start_time(&net_perf, &start_tv);
370 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
371
372 VERIFY(m0->m_flags & M_PKTHDR);
373 packetlist = m0;
374
	/*
	 * zero out {ipsec_state, necp_route, args, sro_fwd, saved_route,
	 * ipf_pktopts}
	 */
376 bzero(&ipobz, sizeof (ipobz));
377 ippo = &ipf_pktopts;
378
379#if IPFIREWALL || DUMMYNET
380 if (SLIST_EMPTY(&m0->m_pkthdr.tags))
381 goto ipfw_tags_done;
382
383 /* Grab info from mtags prepended to the chain */
384#if DUMMYNET
385 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
386 KERNEL_TAG_TYPE_DUMMYNET, NULL)) != NULL) {
387 struct dn_pkt_tag *dn_tag;
388
389 dn_tag = (struct dn_pkt_tag *)(tag+1);
390 args.fwa_ipfw_rule = dn_tag->dn_ipfw_rule;
391 args.fwa_pf_rule = dn_tag->dn_pf_rule;
392 opt = NULL;
393 saved_route = dn_tag->dn_ro;
394 ro = &saved_route;
395
396 imo = NULL;
397 bcopy(&dn_tag->dn_dst, &dst_buf, sizeof (dst_buf));
398 dst = &dst_buf;
399 ifp = dn_tag->dn_ifp;
400 flags = dn_tag->dn_flags;
401 if ((dn_tag->dn_flags & IP_OUTARGS)) {
402 saved_ipoa = dn_tag->dn_ipoa;
403 ipoa = &saved_ipoa;
404 }
405
406 m_tag_delete(m0, tag);
407 }
408#endif /* DUMMYNET */
409
410#if IPDIVERT
411 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
412 KERNEL_TAG_TYPE_DIVERT, NULL)) != NULL) {
413 struct divert_tag *div_tag;
414
415 div_tag = (struct divert_tag *)(tag+1);
416 args.fwa_divert_rule = div_tag->cookie;
417
418 m_tag_delete(m0, tag);
419 }
420#endif /* IPDIVERT */
421
422#if IPFIREWALL
423 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
424 KERNEL_TAG_TYPE_IPFORWARD, NULL)) != NULL) {
425 struct ip_fwd_tag *ipfwd_tag;
426
427 ipfwd_tag = (struct ip_fwd_tag *)(tag+1);
428 next_hop_from_ipfwd_tag = ipfwd_tag->next_hop;
429
430 m_tag_delete(m0, tag);
431 }
432#endif /* IPFIREWALL */
433
434ipfw_tags_done:
435#endif /* IPFIREWALL || DUMMYNET */
436
437 m = m0;
438 m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP|PKTF_IFAINFO);
439
440#if IPSEC
441 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
442 /* If packet is bound to an interface, check bound policies */
443 if ((flags & IP_OUTARGS) && (ipoa != NULL) &&
444 (ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
445 ipoa->ipoa_boundif != IFSCOPE_NONE) {
446 if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND,
447 &flags, ipoa, &sp) != 0)
448 goto bad;
449 }
450 }
451#endif /* IPSEC */
452
453 VERIFY(ro != NULL);
454
455 if (flags & IP_OUTARGS) {
456 /*
457 * In the forwarding case, only the ifscope value is used,
458 * as source interface selection doesn't take place.
459 */
460 if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) &&
461 (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) {
462 ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF;
463 }
464
465 if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
466 ipoa->ipoa_boundif != IFSCOPE_NONE) {
467 ifscope = ipoa->ipoa_boundif;
468 ipf_pktopts.ippo_flags |=
469 (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE));
470 }
471
472 /* double negation needed for bool bit field */
473 ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR);
474 if (ipobf.srcbound)
475 ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR;
476 } else {
477 ipobf.select_srcif = FALSE;
478 ipobf.srcbound = FALSE;
479 ifscope = IFSCOPE_NONE;
480 if (flags & IP_OUTARGS) {
481 ipoa->ipoa_boundif = IFSCOPE_NONE;
482 ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF |
483 IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR);
484 }
485 }
486
487 if (flags & IP_OUTARGS) {
488 if (ipoa->ipoa_flags & IPOAF_NO_CELLULAR) {
489 ipobf.nocell = TRUE;
490 ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
491 }
492 if (ipoa->ipoa_flags & IPOAF_NO_EXPENSIVE) {
493 ipobf.noexpensive = TRUE;
494 ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE;
495 }
496 if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED)
497 ipobf.awdl_unrestricted = TRUE;
498 adv = &ipoa->ipoa_flowadv;
499 adv->code = FADV_SUCCESS;
500 ipoa->ipoa_retflags = 0;
501 }
502
503#if IPSEC
504 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
505 so = ipsec_getsocket(m);
506 if (so != NULL) {
507 (void) ipsec_setsocket(m, NULL);
508 }
509 }
510#endif /* IPSEC */
511
512#if DUMMYNET
513 if (args.fwa_ipfw_rule != NULL || args.fwa_pf_rule != NULL) {
514 /* dummynet already saw us */
515 ip = mtod(m, struct ip *);
516 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
517 pkt_dst = ip->ip_dst;
518 if (ro->ro_rt != NULL) {
519 RT_LOCK_SPIN(ro->ro_rt);
520 ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
521 if (ia) {
522 /* Become a regular mutex */
523 RT_CONVERT_LOCK(ro->ro_rt);
524 IFA_ADDREF(&ia->ia_ifa);
525 }
526 RT_UNLOCK(ro->ro_rt);
527 }
528
529#if IPFIREWALL
530 if (args.fwa_ipfw_rule != NULL)
531 goto skip_ipsec;
532#endif /* IPFIREWALL */
533 if (args.fwa_pf_rule != NULL)
534 goto sendit;
535 }
536#endif /* DUMMYNET */
537
538loopit:
539 packets_processed++;
540 ipobf.isbroadcast = FALSE;
541 ipobf.didfilter = FALSE;
542#if IPFIREWALL_FORWARD
543 ipobf.fwd_rewrite_src = FALSE;
544#endif /* IPFIREWALL_FORWARD */
545
546 VERIFY(m->m_flags & M_PKTHDR);
547 /*
	 * No need to process packet twice if we've already seen it.
549 */
550 if (!SLIST_EMPTY(&m->m_pkthdr.tags))
551 inject_filter_ref = ipf_get_inject_filter(m);
552 else
553 inject_filter_ref = NULL;
554
555 if (opt) {
556 m = ip_insertoptions(m, opt, &len);
557 hlen = len;
558 /* Update the chain */
559 if (m != m0) {
560 if (m0 == packetlist)
561 packetlist = m;
562 m0 = m;
563 }
564 }
565 ip = mtod(m, struct ip *);
566
567#if IPFIREWALL
568 /*
569 * rdar://8542331
570 *
571 * When dealing with a packet chain, we need to reset "next_hop"
572 * because "dst" may have been changed to the gateway address below
573 * for the previous packet of the chain. This could cause the route
	 * to be inadvertently changed to the route to the gateway address
575 * (instead of the route to the destination).
576 */
577 args.fwa_next_hop = next_hop_from_ipfwd_tag;
578 pkt_dst = args.fwa_next_hop ? args.fwa_next_hop->sin_addr : ip->ip_dst;
579#else /* !IPFIREWALL */
580 pkt_dst = ip->ip_dst;
581#endif /* !IPFIREWALL */
582
583 /*
584 * We must not send if the packet is destined to network zero.
585 * RFC1122 3.2.1.3 (a) and (b).
586 */
587 if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
588 error = EHOSTUNREACH;
589 goto bad;
590 }
591
592 /*
593 * Fill in IP header.
594 */
595 if (!(flags & (IP_FORWARDING|IP_RAWOUTPUT))) {
596 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
597 ip->ip_off &= IP_DF;
598 if (rfc6864 && IP_OFF_IS_ATOMIC(ip->ip_off)) {
599 // Per RFC6864, value of ip_id is undefined for atomic ip packets
600 ip->ip_id = 0;
601 } else {
602 ip->ip_id = ip_randomid();
603 }
604 OSAddAtomic(1, &ipstat.ips_localout);
605 } else {
606 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
607 }
608
609#if DEBUG
610 /* For debugging, we let the stack forge congestion */
611 if (forge_ce != 0 &&
612 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
613 (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
614 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
615 forge_ce--;
616 }
617#endif /* DEBUG */
618
619 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
620 ip->ip_p, ip->ip_off, ip->ip_len);
621
622 dst = SIN(&ro->ro_dst);
623
624 /*
625 * If there is a cached route,
626 * check that it is to the same destination
627 * and is still up. If not, free it and try again.
628 * The address family should also be checked in case of sharing the
629 * cache with IPv6.
630 */
631
632 if (ro->ro_rt != NULL) {
633 if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY &&
634 !(flags & (IP_ROUTETOIF | IP_FORWARDING))) {
635 src_ia = ifa_foraddr(ip->ip_src.s_addr);
636 if (src_ia == NULL) {
637 error = EADDRNOTAVAIL;
638 goto bad;
639 }
640 IFA_REMREF(&src_ia->ia_ifa);
641 src_ia = NULL;
642 }
643 /*
644 * Test rt_flags without holding rt_lock for performance
645 * reasons; if the route is down it will hopefully be
646 * caught by the layer below (since it uses this route
647 * as a hint) or during the next transmit.
648 */
649 if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET ||
650 dst->sin_addr.s_addr != pkt_dst.s_addr)
651 ROUTE_RELEASE(ro);
652
653 /*
654 * If we're doing source interface selection, we may not
655 * want to use this route; only synch up the generation
656 * count otherwise.
657 */
658 if (!ipobf.select_srcif && ro->ro_rt != NULL &&
659 RT_GENID_OUTOFSYNC(ro->ro_rt))
660 RT_GENID_SYNC(ro->ro_rt);
661 }
662 if (ro->ro_rt == NULL) {
663 bzero(dst, sizeof (*dst));
664 dst->sin_family = AF_INET;
665 dst->sin_len = sizeof (*dst);
666 dst->sin_addr = pkt_dst;
667 }
668 /*
669 * If routing to interface only,
670 * short circuit routing lookup.
671 */
672 if (flags & IP_ROUTETOIF) {
673 if (ia != NULL)
674 IFA_REMREF(&ia->ia_ifa);
675 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
676 ia = ifatoia(ifa_ifwithnet(sintosa(dst)));
677 if (ia == NULL) {
678 OSAddAtomic(1, &ipstat.ips_noroute);
679 error = ENETUNREACH;
680 /* XXX IPv6 APN fallback notification?? */
681 goto bad;
682 }
683 }
684 ifp = ia->ia_ifp;
685 ip->ip_ttl = 1;
686 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
687 /*
688 * For consistency with other cases below. Loopback
689 * multicast case is handled separately by ip_mloopback().
690 */
691 if ((ifp->if_flags & IFF_LOOPBACK) &&
692 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
693 m->m_pkthdr.rcvif = ifp;
694 ip_setsrcifaddr_info(m, ifp->if_index, NULL);
695 ip_setdstifaddr_info(m, ifp->if_index, NULL);
696 }
697 } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
698 imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
699 /*
700 * Bypass the normal routing lookup for multicast
701 * packets if the interface is specified.
702 */
703 ipobf.isbroadcast = FALSE;
704 if (ia != NULL)
705 IFA_REMREF(&ia->ia_ifa);
706
707 /* Macro takes reference on ia */
708 IFP_TO_IA(ifp, ia);
709 } else {
710 struct ifaddr *ia0 = NULL;
711 boolean_t cloneok = FALSE;
712 /*
713 * Perform source interface selection; the source IP address
714 * must belong to one of the addresses of the interface used
715 * by the route. For performance reasons, do this only if
716 * there is no route, or if the routing table has changed,
717 * or if we haven't done source interface selection on this
718 * route (for this PCB instance) before.
719 */
720 if (ipobf.select_srcif &&
721 ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) ||
722 !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
723 /* Find the source interface */
724 ia0 = in_selectsrcif(ip, ro, ifscope);
725
726 /*
727 * If the source address belongs to a restricted
728 * interface and the caller forbids our using
729 * interfaces of such type, pretend that there is no
730 * route.
731 */
732 if (ia0 != NULL &&
733 IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) {
734 IFA_REMREF(ia0);
735 ia0 = NULL;
736 error = EHOSTUNREACH;
737 if (flags & IP_OUTARGS)
738 ipoa->ipoa_retflags |= IPOARF_IFDENIED;
739 goto bad;
740 }
741
742 /*
743 * If the source address is spoofed (in the case of
744 * IP_RAWOUTPUT on an unbounded socket), or if this
745 * is destined for local/loopback, just let it go out
746 * using the interface of the route. Otherwise,
747 * there's no interface having such an address,
748 * so bail out.
749 */
750 if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) ||
751 ipobf.srcbound) && ifscope != lo_ifp->if_index) {
752 error = EADDRNOTAVAIL;
753 goto bad;
754 }
755
756 /*
757 * If the caller didn't explicitly specify the scope,
758 * pick it up from the source interface. If the cached
759 * route was wrong and was blown away as part of source
760 * interface selection, don't mask out RTF_PRCLONING
761 * since that route may have been allocated by the ULP,
762 * unless the IP header was created by the caller or
763 * the destination is IPv4 LLA. The check for the
764 * latter is needed because IPv4 LLAs are never scoped
765 * in the current implementation, and we don't want to
766 * replace the resolved IPv4 LLA route with one whose
767 * gateway points to that of the default gateway on
768 * the primary interface of the system.
769 */
770 if (ia0 != NULL) {
771 if (ifscope == IFSCOPE_NONE)
772 ifscope = ia0->ifa_ifp->if_index;
773 cloneok = (!(flags & IP_RAWOUTPUT) &&
774 !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
775 }
776 }
777
778 /*
779 * If this is the case, we probably don't want to allocate
780 * a protocol-cloned route since we didn't get one from the
781 * ULP. This lets TCP do its thing, while not burdening
782 * forwarding or ICMP with the overhead of cloning a route.
783 * Of course, we still want to do any cloning requested by
784 * the link layer, as this is probably required in all cases
785 * for correct operation (as it is for ARP).
786 */
787 if (ro->ro_rt == NULL) {
788 unsigned long ign = RTF_PRCLONING;
789 /*
790 * We make an exception here: if the destination
791 * address is INADDR_BROADCAST, allocate a protocol-
792 * cloned host route so that we end up with a route
793 * marked with the RTF_BROADCAST flag. Otherwise,
794 * we would end up referring to the default route,
795 * instead of creating a cloned host route entry.
796 * That would introduce inconsistencies between ULPs
797 * that allocate a route and those that don't. The
798 * RTF_BROADCAST route is important since we'd want
799 * to send out undirected IP broadcast packets using
800 * link-level broadcast address. Another exception
801 * is for ULP-created routes that got blown away by
802 * source interface selection (see above).
803 *
804 * These exceptions will no longer be necessary when
805 * the RTF_PRCLONING scheme is no longer present.
806 */
807 if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST)
808 ign &= ~RTF_PRCLONING;
809
810 /*
811 * Loosen the route lookup criteria if the ifscope
812 * corresponds to the loopback interface; this is
813 * needed to support Application Layer Gateways
814 * listening on loopback, in conjunction with packet
815 * filter redirection rules. The final source IP
816 * address will be rewritten by the packet filter
817 * prior to the RFC1122 loopback check below.
818 */
819 if (ifscope == lo_ifp->if_index)
820 rtalloc_ign(ro, ign);
821 else
822 rtalloc_scoped_ign(ro, ign, ifscope);
823
824 /*
825 * If the route points to a cellular/expensive interface
826 * and the caller forbids our using interfaces of such type,
827 * pretend that there is no route.
828 */
829 if (ro->ro_rt != NULL) {
830 RT_LOCK_SPIN(ro->ro_rt);
831 if (IP_CHECK_RESTRICTIONS(ro->ro_rt->rt_ifp,
832 ipobf)) {
833 RT_UNLOCK(ro->ro_rt);
834 ROUTE_RELEASE(ro);
835 if (flags & IP_OUTARGS) {
836 ipoa->ipoa_retflags |=
837 IPOARF_IFDENIED;
838 }
839 } else {
840 RT_UNLOCK(ro->ro_rt);
841 }
842 }
843 }
844
845 if (ro->ro_rt == NULL) {
846 OSAddAtomic(1, &ipstat.ips_noroute);
847 error = EHOSTUNREACH;
848 if (ia0 != NULL) {
849 IFA_REMREF(ia0);
850 ia0 = NULL;
851 }
852 goto bad;
853 }
854
855 if (ia != NULL)
856 IFA_REMREF(&ia->ia_ifa);
857 RT_LOCK_SPIN(ro->ro_rt);
858 ia = ifatoia(ro->ro_rt->rt_ifa);
859 if (ia != NULL) {
860 /* Become a regular mutex */
861 RT_CONVERT_LOCK(ro->ro_rt);
862 IFA_ADDREF(&ia->ia_ifa);
863 }
864 /*
865 * Note: ia_ifp may not be the same as rt_ifp; the latter
866 * is what we use for determining outbound i/f, mtu, etc.
867 */
868 ifp = ro->ro_rt->rt_ifp;
869 ro->ro_rt->rt_use++;
870 if (ro->ro_rt->rt_flags & RTF_GATEWAY) {
871 dst = SIN(ro->ro_rt->rt_gateway);
872 }
873 if (ro->ro_rt->rt_flags & RTF_HOST) {
874 /* double negation needed for bool bit field */
875 ipobf.isbroadcast =
876 !!(ro->ro_rt->rt_flags & RTF_BROADCAST);
877 } else {
878 /* Become a regular mutex */
879 RT_CONVERT_LOCK(ro->ro_rt);
880 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
881 }
882 /*
883 * For consistency with IPv6, as well as to ensure that
884 * IP_RECVIF is set correctly for packets that are sent
885 * to one of the local addresses. ia (rt_ifa) would have
886 * been fixed up by rt_setif for local routes. This
887 * would make it appear as if the packet arrives on the
888 * interface which owns the local address. Loopback
889 * multicast case is handled separately by ip_mloopback().
890 */
891 if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) &&
892 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
893 uint32_t srcidx;
894
895 m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp;
896
897 if (ia0 != NULL)
898 srcidx = ia0->ifa_ifp->if_index;
899 else if ((ro->ro_flags & ROF_SRCIF_SELECTED) &&
900 ro->ro_srcia != NULL)
901 srcidx = ro->ro_srcia->ifa_ifp->if_index;
902 else
903 srcidx = 0;
904
905 ip_setsrcifaddr_info(m, srcidx, NULL);
906 ip_setdstifaddr_info(m, 0, ia);
907 }
908 RT_UNLOCK(ro->ro_rt);
909 if (ia0 != NULL) {
910 IFA_REMREF(ia0);
911 ia0 = NULL;
912 }
913 }
914
915 if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
916 struct ifnet *srcifp = NULL;
917 struct in_multi *inm;
918 u_int32_t vif = 0;
919 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
920 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
921
922 m->m_flags |= M_MCAST;
923 /*
924 * IP destination address is multicast. Make sure "dst"
925 * still points to the address in "ro". (It may have been
926 * changed to point to a gateway address, above.)
927 */
928 dst = SIN(&ro->ro_dst);
929 /*
930 * See if the caller provided any multicast options
931 */
932 if (imo != NULL) {
933 IMO_LOCK(imo);
934 vif = imo->imo_multicast_vif;
935 ttl = imo->imo_multicast_ttl;
936 loop = imo->imo_multicast_loop;
937 if (!(flags & IP_RAWOUTPUT))
938 ip->ip_ttl = ttl;
939 if (imo->imo_multicast_ifp != NULL)
940 ifp = imo->imo_multicast_ifp;
941 IMO_UNLOCK(imo);
942 } else if (!(flags & IP_RAWOUTPUT)) {
943 vif = -1;
944 ip->ip_ttl = ttl;
945 }
946 /*
947 * Confirm that the outgoing interface supports multicast.
948 */
949 if (imo == NULL || vif == -1) {
950 if (!(ifp->if_flags & IFF_MULTICAST)) {
951 OSAddAtomic(1, &ipstat.ips_noroute);
952 error = ENETUNREACH;
953 goto bad;
954 }
955 }
956 /*
957 * If source address not specified yet, use address
958 * of outgoing interface.
959 */
960 if (ip->ip_src.s_addr == INADDR_ANY) {
961 struct in_ifaddr *ia1;
962 lck_rw_lock_shared(in_ifaddr_rwlock);
963 TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
964 IFA_LOCK_SPIN(&ia1->ia_ifa);
965 if (ia1->ia_ifp == ifp) {
966 ip->ip_src = IA_SIN(ia1)->sin_addr;
967 srcifp = ifp;
968 IFA_UNLOCK(&ia1->ia_ifa);
969 break;
970 }
971 IFA_UNLOCK(&ia1->ia_ifa);
972 }
973 lck_rw_done(in_ifaddr_rwlock);
974 if (ip->ip_src.s_addr == INADDR_ANY) {
975 error = ENETUNREACH;
976 goto bad;
977 }
978 }
979
980 in_multihead_lock_shared();
981 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
982 in_multihead_lock_done();
983 if (inm != NULL && (imo == NULL || loop)) {
984 /*
985 * If we belong to the destination multicast group
986 * on the outgoing interface, and the caller did not
987 * forbid loopback, loop back a copy.
988 */
989 if (!TAILQ_EMPTY(&ipv4_filters)) {
990 struct ipfilter *filter;
991 int seen = (inject_filter_ref == NULL);
992
993 if (imo != NULL) {
994 ipf_pktopts.ippo_flags |=
995 IPPOF_MCAST_OPTS;
996 ipf_pktopts.ippo_mcast_ifnet = ifp;
997 ipf_pktopts.ippo_mcast_ttl = ttl;
998 ipf_pktopts.ippo_mcast_loop = loop;
999 }
1000
1001 ipf_ref();
1002
1003 /*
1004 * 4135317 - always pass network byte
1005 * order to filter
1006 */
1007#if BYTE_ORDER != BIG_ENDIAN
1008 HTONS(ip->ip_len);
1009 HTONS(ip->ip_off);
1010#endif
1011 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1012 if (seen == 0) {
1013 if ((struct ipfilter *)
1014 inject_filter_ref == filter)
1015 seen = 1;
1016 } else if (filter->ipf_filter.
1017 ipf_output != NULL) {
1018 errno_t result;
1019 result = filter->ipf_filter.
1020 ipf_output(filter->
1021 ipf_filter.cookie,
1022 (mbuf_t *)&m, ippo);
1023 if (result == EJUSTRETURN) {
1024 ipf_unref();
1025 INM_REMREF(inm);
1026 goto done;
1027 }
1028 if (result != 0) {
1029 ipf_unref();
1030 INM_REMREF(inm);
1031 goto bad;
1032 }
1033 }
1034 }
1035
1036 /* set back to host byte order */
1037 ip = mtod(m, struct ip *);
1038#if BYTE_ORDER != BIG_ENDIAN
1039 NTOHS(ip->ip_len);
1040 NTOHS(ip->ip_off);
1041#endif
1042 ipf_unref();
1043 ipobf.didfilter = TRUE;
1044 }
1045 ip_mloopback(srcifp, ifp, m, dst, hlen);
1046 }
1047 if (inm != NULL)
1048 INM_REMREF(inm);
1049 /*
1050 * Multicasts with a time-to-live of zero may be looped-
1051 * back, above, but must not be transmitted on a network.
1052 * Also, multicasts addressed to the loopback interface
1053 * are not sent -- the above call to ip_mloopback() will
1054 * loop back a copy if this host actually belongs to the
1055 * destination group on the loopback interface.
1056 */
1057 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
1058 m_freem(m);
1059 goto done;
1060 }
1061
1062 goto sendit;
1063 }
1064 /*
1065 * If source address not specified yet, use address
1066 * of outgoing interface.
1067 */
1068 if (ip->ip_src.s_addr == INADDR_ANY) {
1069 IFA_LOCK_SPIN(&ia->ia_ifa);
1070 ip->ip_src = IA_SIN(ia)->sin_addr;
1071 IFA_UNLOCK(&ia->ia_ifa);
1072#if IPFIREWALL_FORWARD
1073 /*
1074 * Keep note that we did this - if the firewall changes
1075 * the next-hop, our interface may change, changing the
1076 * default source IP. It's a shame so much effort happens
1077 * twice. Oh well.
1078 */
1079 ipobf.fwd_rewrite_src = TRUE;
1080#endif /* IPFIREWALL_FORWARD */
1081 }
1082
1083 /*
1084 * Look for broadcast address and
1085 * verify user is allowed to send
1086 * such a packet.
1087 */
1088 if (ipobf.isbroadcast) {
1089 if (!(ifp->if_flags & IFF_BROADCAST)) {
1090 error = EADDRNOTAVAIL;
1091 goto bad;
1092 }
1093 if (!(flags & IP_ALLOWBROADCAST)) {
1094 error = EACCES;
1095 goto bad;
1096 }
1097 /* don't allow broadcast messages to be fragmented */
1098 if ((u_short)ip->ip_len > ifp->if_mtu) {
1099 error = EMSGSIZE;
1100 goto bad;
1101 }
1102 m->m_flags |= M_BCAST;
1103 } else {
1104 m->m_flags &= ~M_BCAST;
1105 }
1106
1107sendit:
1108#if PF
1109 /* Invoke outbound packet filter */
1110 if (PF_IS_ENABLED) {
1111 int rc;
1112
1113 m0 = m; /* Save for later */
1114#if DUMMYNET
1115 args.fwa_m = m;
1116 args.fwa_next_hop = dst;
1117 args.fwa_oif = ifp;
1118 args.fwa_ro = ro;
1119 args.fwa_dst = dst;
1120 args.fwa_oflags = flags;
1121 if (flags & IP_OUTARGS)
1122 args.fwa_ipoa = ipoa;
1123 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, &args);
1124#else /* DUMMYNET */
1125 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL);
1126#endif /* DUMMYNET */
1127 if (rc != 0 || m == NULL) {
1128 /* Move to the next packet */
1129 m = *mppn;
1130
1131 /* Skip ahead if first packet in list got dropped */
1132 if (packetlist == m0)
1133 packetlist = m;
1134
1135 if (m != NULL) {
1136 m0 = m;
1137 /* Next packet in the chain */
1138 goto loopit;
1139 } else if (packetlist != NULL) {
1140 /* No more packet; send down the chain */
1141 goto sendchain;
1142 }
1143 /* Nothing left; we're done */
1144 goto done;
1145 }
1146 m0 = m;
1147 ip = mtod(m, struct ip *);
1148 pkt_dst = ip->ip_dst;
1149 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1150 }
1151#endif /* PF */
1152 /*
1153 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1154 */
1155 if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
1156 IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1157 ip_linklocal_stat.iplls_out_total++;
1158 if (ip->ip_ttl != MAXTTL) {
1159 ip_linklocal_stat.iplls_out_badttl++;
1160 ip->ip_ttl = MAXTTL;
1161 }
1162 }
1163
1164 if (!ipobf.didfilter && !TAILQ_EMPTY(&ipv4_filters)) {
1165 struct ipfilter *filter;
1166 int seen = (inject_filter_ref == NULL);
1167 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1168
1169 /*
1170 * Check that a TSO frame isn't passed to a filter.
1171 * This could happen if a filter is inserted while
1172 * TCP is sending the TSO packet.
1173 */
1174 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1175 error = EMSGSIZE;
1176 goto bad;
1177 }
1178
1179 ipf_ref();
1180
1181 /* 4135317 - always pass network byte order to filter */
1182#if BYTE_ORDER != BIG_ENDIAN
1183 HTONS(ip->ip_len);
1184 HTONS(ip->ip_off);
1185#endif
1186 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1187 if (seen == 0) {
1188 if ((struct ipfilter *)inject_filter_ref ==
1189 filter)
1190 seen = 1;
1191 } else if (filter->ipf_filter.ipf_output) {
1192 errno_t result;
1193 result = filter->ipf_filter.
1194 ipf_output(filter->ipf_filter.cookie,
1195 (mbuf_t *)&m, ippo);
1196 if (result == EJUSTRETURN) {
1197 ipf_unref();
1198 goto done;
1199 }
1200 if (result != 0) {
1201 ipf_unref();
1202 goto bad;
1203 }
1204 }
1205 }
1206 /* set back to host byte order */
1207 ip = mtod(m, struct ip *);
1208#if BYTE_ORDER != BIG_ENDIAN
1209 NTOHS(ip->ip_len);
1210 NTOHS(ip->ip_off);
1211#endif
1212 ipf_unref();
1213 }
1214
1215#if NECP
1216 /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */
1217 necp_matched_policy_id = necp_ip_output_find_policy_match (m,
1218 flags, (flags & IP_OUTARGS) ? ipoa : NULL, &necp_result, &necp_result_parameter);
1219 if (necp_matched_policy_id) {
1220 necp_mark_packet_from_ip(m, necp_matched_policy_id);
1221 switch (necp_result) {
1222 case NECP_KERNEL_POLICY_RESULT_PASS:
1223 /* Check if the interface is allowed */
1224 if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1225 error = EHOSTUNREACH;
1226 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1227 goto bad;
1228 }
1229 goto skip_ipsec;
1230 case NECP_KERNEL_POLICY_RESULT_DROP:
1231 case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT:
1232 /* Flow divert packets should be blocked at the IP layer */
1233 error = EHOSTUNREACH;
1234 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1235 goto bad;
1236 case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: {
1237 /* Verify that the packet is being routed to the tunnel */
1238 struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(&necp_result_parameter);
1239 if (policy_ifp == ifp) {
1240 /* Check if the interface is allowed */
1241 if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1242 error = EHOSTUNREACH;
1243 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1244 goto bad;
1245 }
1246 goto skip_ipsec;
1247 } else {
1248 if (necp_packet_can_rebind_to_ifnet(m, policy_ifp, &necp_route, AF_INET)) {
1249 /* Check if the interface is allowed */
1250 if (!necp_packet_is_allowed_over_interface(m, policy_ifp)) {
1251 error = EHOSTUNREACH;
1252 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1253 goto bad;
1254 }
1255
1256 /* Set ifp to the tunnel interface, since it is compatible with the packet */
1257 ifp = policy_ifp;
1258 ro = &necp_route;
1259 goto skip_ipsec;
1260 } else {
1261 error = ENETUNREACH;
1262 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1263 goto bad;
1264 }
1265 }
1266 }
1267 default:
1268 break;
1269 }
1270 }
1271 /* Catch-all to check if the interface is allowed */
1272 if (!necp_packet_is_allowed_over_interface(m, ifp)) {
1273 error = EHOSTUNREACH;
1274 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1275 goto bad;
1276 }
1277#endif /* NECP */
1278
1279#if IPSEC
1280 if (ipsec_bypass != 0 || (flags & IP_NOIPSEC))
1281 goto skip_ipsec;
1282
1283 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1284
1285 if (sp == NULL) {
1286 /* get SP for this packet */
1287 if (so != NULL) {
1288 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND,
1289 so, &error);
1290 } else {
1291 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
1292 flags, &error);
1293 }
1294 if (sp == NULL) {
1295 IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1296 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1297 0, 0, 0, 0, 0);
1298 goto bad;
1299 }
1300 }
1301
1302 error = 0;
1303
1304 /* check policy */
1305 switch (sp->policy) {
1306 case IPSEC_POLICY_DISCARD:
1307 case IPSEC_POLICY_GENERATE:
1308 /*
1309 * This packet is just discarded.
1310 */
1311 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1312 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1313 1, 0, 0, 0, 0);
1314 goto bad;
1315
1316 case IPSEC_POLICY_BYPASS:
1317 case IPSEC_POLICY_NONE:
1318 /* no need to do IPsec. */
1319 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1320 2, 0, 0, 0, 0);
1321 goto skip_ipsec;
1322
1323 case IPSEC_POLICY_IPSEC:
1324 if (sp->req == NULL) {
1325 /* acquire a policy */
1326 error = key_spdacquire(sp);
1327 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1328 3, 0, 0, 0, 0);
1329 goto bad;
1330 }
1331 if (sp->ipsec_if) {
1332 /* Verify the redirect to ipsec interface */
1333 if (sp->ipsec_if == ifp) {
1334 goto skip_ipsec;
1335 }
1336 goto bad;
1337 }
1338 break;
1339
1340 case IPSEC_POLICY_ENTRUST:
1341 default:
1342 printf("ip_output: Invalid policy found. %d\n", sp->policy);
1343 }
1344 {
1345 ipsec_state.m = m;
1346 if (flags & IP_ROUTETOIF) {
1347 bzero(&ipsec_state.ro, sizeof (ipsec_state.ro));
1348 } else {
1349 route_copyout((struct route *)&ipsec_state.ro, ro, sizeof (struct route));
1350 }
1351 ipsec_state.dst = SA(dst);
1352
1353 ip->ip_sum = 0;
1354
1355 /*
1356 * XXX
1357 * delayed checksums are not currently compatible with IPsec
1358 */
1359 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
1360 in_delayed_cksum(m);
1361
1362#if BYTE_ORDER != BIG_ENDIAN
1363 HTONS(ip->ip_len);
1364 HTONS(ip->ip_off);
1365#endif
1366
1367 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1368 struct ip *, ip, struct ifnet *, ifp,
1369 struct ip *, ip, struct ip6_hdr *, NULL);
1370
1371 error = ipsec4_output(&ipsec_state, sp, flags);
1372 if (ipsec_state.tunneled == 6) {
1373 m0 = m = NULL;
1374 error = 0;
1375 goto bad;
1376 }
1377
1378 m0 = m = ipsec_state.m;
1379
1380#if DUMMYNET
1381 /*
1382 * If we're about to use the route in ipsec_state
1383 * and this came from dummynet, clean up now.
1384 */
1385 if (ro == &saved_route &&
1386 (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled))
1387 ROUTE_RELEASE(ro);
1388#endif /* DUMMYNET */
1389
1390 if (flags & IP_ROUTETOIF) {
1391 /*
1392 * if we have tunnel mode SA, we may need to ignore
1393 * IP_ROUTETOIF.
1394 */
1395 if (ipsec_state.tunneled) {
1396 flags &= ~IP_ROUTETOIF;
1397 ro = (struct route *)&ipsec_state.ro;
1398 }
1399 } else {
1400 ro = (struct route *)&ipsec_state.ro;
1401 }
1402 dst = SIN(ipsec_state.dst);
1403 if (error) {
1404 /* mbuf is already reclaimed in ipsec4_output. */
1405 m0 = NULL;
1406 switch (error) {
1407 case EHOSTUNREACH:
1408 case ENETUNREACH:
1409 case EMSGSIZE:
1410 case ENOBUFS:
1411 case ENOMEM:
1412 break;
1413 default:
1414 printf("ip4_output (ipsec): error code %d\n", error);
1415 /* FALLTHRU */
1416 case ENOENT:
1417 /* don't show these error codes to the user */
1418 error = 0;
1419 break;
1420 }
1421 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1422 4, 0, 0, 0, 0);
1423 goto bad;
1424 }
1425 }
1426
1427 /* be sure to update variables that are affected by ipsec4_output() */
1428 ip = mtod(m, struct ip *);
1429
1430#ifdef _IP_VHL
1431 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1432#else /* !_IP_VHL */
1433 hlen = ip->ip_hl << 2;
1434#endif /* !_IP_VHL */
1435 /* Check that there wasn't a route change and src is still valid */
1436 if (ROUTE_UNUSABLE(ro)) {
1437 ROUTE_RELEASE(ro);
1438 VERIFY(src_ia == NULL);
1439 if (ip->ip_src.s_addr != INADDR_ANY &&
1440 !(flags & (IP_ROUTETOIF | IP_FORWARDING)) &&
1441 (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) {
1442 error = EADDRNOTAVAIL;
1443 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1444 5, 0, 0, 0, 0);
1445 goto bad;
1446 }
1447 if (src_ia != NULL) {
1448 IFA_REMREF(&src_ia->ia_ifa);
1449 src_ia = NULL;
1450 }
1451 }
1452
1453 if (ro->ro_rt == NULL) {
1454 if (!(flags & IP_ROUTETOIF)) {
1455 printf("%s: can't update route after "
1456 "IPsec processing\n", __func__);
1457 error = EHOSTUNREACH; /* XXX */
1458 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1459 6, 0, 0, 0, 0);
1460 goto bad;
1461 }
1462 } else {
1463 if (ia != NULL)
1464 IFA_REMREF(&ia->ia_ifa);
1465 RT_LOCK_SPIN(ro->ro_rt);
1466 ia = ifatoia(ro->ro_rt->rt_ifa);
1467 if (ia != NULL) {
1468 /* Become a regular mutex */
1469 RT_CONVERT_LOCK(ro->ro_rt);
1470 IFA_ADDREF(&ia->ia_ifa);
1471 }
1472 ifp = ro->ro_rt->rt_ifp;
1473 RT_UNLOCK(ro->ro_rt);
1474 }
1475
1476 /* make it flipped, again. */
1477#if BYTE_ORDER != BIG_ENDIAN
1478 NTOHS(ip->ip_len);
1479 NTOHS(ip->ip_off);
1480#endif
1481 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1482 7, 0xff, 0xff, 0xff, 0xff);
1483
1484 /* Pass to filters again */
1485 if (!TAILQ_EMPTY(&ipv4_filters)) {
1486 struct ipfilter *filter;
1487
1488 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1489
1490 /*
1491 * Check that a TSO frame isn't passed to a filter.
1492 * This could happen if a filter is inserted while
1493 * TCP is sending the TSO packet.
1494 */
1495 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1496 error = EMSGSIZE;
1497 goto bad;
1498 }
1499
1500 ipf_ref();
1501
1502 /* 4135317 - always pass network byte order to filter */
1503#if BYTE_ORDER != BIG_ENDIAN
1504 HTONS(ip->ip_len);
1505 HTONS(ip->ip_off);
1506#endif
1507 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1508 if (filter->ipf_filter.ipf_output) {
1509 errno_t result;
1510 result = filter->ipf_filter.
1511 ipf_output(filter->ipf_filter.cookie,
1512 (mbuf_t *)&m, ippo);
1513 if (result == EJUSTRETURN) {
1514 ipf_unref();
1515 goto done;
1516 }
1517 if (result != 0) {
1518 ipf_unref();
1519 goto bad;
1520 }
1521 }
1522 }
1523 /* set back to host byte order */
1524 ip = mtod(m, struct ip *);
1525#if BYTE_ORDER != BIG_ENDIAN
1526 NTOHS(ip->ip_len);
1527 NTOHS(ip->ip_off);
1528#endif
1529 ipf_unref();
1530 }
1531skip_ipsec:
1532#endif /* IPSEC */
1533
1534#if IPFIREWALL
1535 /*
1536 * Check with the firewall...
1537 * but not if we are already being fwd'd from a firewall.
1538 */
1539 if (fw_enable && IPFW_LOADED && !args.fwa_next_hop) {
1540 struct sockaddr_in *old = dst;
1541
1542 args.fwa_m = m;
1543 args.fwa_next_hop = dst;
1544 args.fwa_oif = ifp;
1545 ipfwoff = ip_fw_chk_ptr(&args);
1546 m = args.fwa_m;
1547 dst = args.fwa_next_hop;
1548
1549 /*
1550 * On return we must do the following:
1551 * IP_FW_PORT_DENY_FLAG -> drop the pkt (XXX new)
1552 * 1<=off<= 0xffff -> DIVERT
1553 * (off & IP_FW_PORT_DYNT_FLAG) -> send to a DUMMYNET pipe
1554 * (off & IP_FW_PORT_TEE_FLAG) -> TEE the packet
1555 * dst != old -> IPFIREWALL_FORWARD
1556 * off==0, dst==old -> accept
1557 * If some of the above modules are not compiled in, then
1558 * we shouldn't have to check the corresponding condition
1559 * (because the ipfw control socket should not accept
1560 * unsupported rules), but better play safe and drop
1561 * packets in case of doubt.
1562 */
1563 m0 = m;
1564 if ((ipfwoff & IP_FW_PORT_DENY_FLAG) || m == NULL) {
1565 if (m)
1566 m_freem(m);
1567 error = EACCES;
1568 goto done;
1569 }
1570 ip = mtod(m, struct ip *);
1571
1572 if (ipfwoff == 0 && dst == old) { /* common case */
1573 goto pass;
1574 }
1575#if DUMMYNET
1576 if (DUMMYNET_LOADED && (ipfwoff & IP_FW_PORT_DYNT_FLAG) != 0) {
1577 /*
1578 * pass the pkt to dummynet. Need to include
1579 * pipe number, m, ifp, ro, dst because these are
1580 * not recomputed in the next pass.
1581 * All other parameters have been already used and
1582 * so they are not needed anymore.
1583 * XXX note: if the ifp or ro entry are deleted
1584 * while a pkt is in dummynet, we are in trouble!
1585 */
1586 args.fwa_ro = ro;
1587 args.fwa_dst = dst;
1588 args.fwa_oflags = flags;
1589 if (flags & IP_OUTARGS)
1590 args.fwa_ipoa = ipoa;
1591
1592 error = ip_dn_io_ptr(m, ipfwoff & 0xffff, DN_TO_IP_OUT,
1593 &args, DN_CLIENT_IPFW);
1594 goto done;
1595 }
1596#endif /* DUMMYNET */
1597#if IPDIVERT
1598 if (ipfwoff != 0 && (ipfwoff & IP_FW_PORT_DYNT_FLAG) == 0) {
1599 struct mbuf *clone = NULL;
1600
1601 /* Clone packet if we're doing a 'tee' */
1602 if ((ipfwoff & IP_FW_PORT_TEE_FLAG) != 0)
1603 clone = m_dup(m, M_DONTWAIT);
1604 /*
1605 * XXX
1606 * delayed checksums are not currently compatible
1607 * with divert sockets.
1608 */
1609 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA)
1610 in_delayed_cksum(m);
1611
1612 /* Restore packet header fields to original values */
1613
1614#if BYTE_ORDER != BIG_ENDIAN
1615 HTONS(ip->ip_len);
1616 HTONS(ip->ip_off);
1617#endif
1618
1619 /* Deliver packet to divert input routine */
1620 divert_packet(m, 0, ipfwoff & 0xffff,
1621 args.fwa_divert_rule);
1622
1623 /* If 'tee', continue with original packet */
1624 if (clone != NULL) {
1625 m0 = m = clone;
1626 ip = mtod(m, struct ip *);
1627 goto pass;
1628 }
1629 goto done;
1630 }
1631#endif /* IPDIVERT */
1632#if IPFIREWALL_FORWARD
1633 /*
1634 * Here we check dst to make sure it's directly reachable on
1635 * the interface we previously thought it was.
1636 * If it isn't (which may be likely in some situations) we have
1637 * to re-route it (ie, find a route for the next-hop and the
1638 * associated interface) and set them here. This is nested
1639 * forwarding which in most cases is undesirable, except where
1640 * such control is nigh impossible. So we do it here.
1641 * And I'm babbling.
1642 */
1643 if (ipfwoff == 0 && old != dst) {
1644 struct in_ifaddr *ia_fw;
1645 struct route *ro_fwd = &sro_fwd;
1646
1647#if IPFIREWALL_FORWARD_DEBUG
1648 printf("IPFIREWALL_FORWARD: New dst ip: ");
1649 print_ip(dst->sin_addr);
1650 printf("\n");
1651#endif /* IPFIREWALL_FORWARD_DEBUG */
1652 /*
1653 * We need to figure out if we have been forwarded
1654 * to a local socket. If so then we should somehow
1655 * "loop back" to ip_input, and get directed to the
1656 * PCB as if we had received this packet. This is
1657 * because it may be difficult to identify the packets
1658 * you want to forward until they are being output
1659 * and have selected an interface. (e.g. locally
1660 * initiated packets) If we used the loopback interface,
1661 * we would not be able to control what happens
1662 * as the packet runs through ip_input() as
1663 * it is done through an ISR.
1664 */
1665 lck_rw_lock_shared(in_ifaddr_rwlock);
1666 TAILQ_FOREACH(ia_fw, &in_ifaddrhead, ia_link) {
1667 /*
1668 * If the addr to forward to is one
1669 * of ours, we pretend to
1670 * be the destination for this packet.
1671 */
1672 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1673 if (IA_SIN(ia_fw)->sin_addr.s_addr ==
1674 dst->sin_addr.s_addr) {
1675 IFA_UNLOCK(&ia_fw->ia_ifa);
1676 break;
1677 }
1678 IFA_UNLOCK(&ia_fw->ia_ifa);
1679 }
1680 lck_rw_done(in_ifaddr_rwlock);
1681 if (ia_fw) {
1682 /* tell ip_input "dont filter" */
1683 struct m_tag *fwd_tag;
1684 struct ip_fwd_tag *ipfwd_tag;
1685
1686 fwd_tag = m_tag_create(KERNEL_MODULE_TAG_ID,
1687 KERNEL_TAG_TYPE_IPFORWARD,
1688 sizeof (*ipfwd_tag), M_NOWAIT, m);
1689 if (fwd_tag == NULL) {
1690 error = ENOBUFS;
1691 goto bad;
1692 }
1693
1694 ipfwd_tag = (struct ip_fwd_tag *)(fwd_tag+1);
1695 ipfwd_tag->next_hop = args.fwa_next_hop;
1696
1697 m_tag_prepend(m, fwd_tag);
1698
1699 if (m->m_pkthdr.rcvif == NULL)
1700 m->m_pkthdr.rcvif = lo_ifp;
1701
1702#if BYTE_ORDER != BIG_ENDIAN
1703 HTONS(ip->ip_len);
1704 HTONS(ip->ip_off);
1705#endif
1706 mbuf_outbound_finalize(m, PF_INET, 0);
1707
1708 /*
1709 * we need to call dlil_output to run filters
1710 * and resync to avoid recursion loops.
1711 */
1712 if (lo_ifp) {
1713 dlil_output(lo_ifp, PF_INET, m, NULL,
1714 SA(dst), 0, adv);
1715 } else {
1716 printf("%s: no loopback ifp for "
1717 "forwarding!!!\n", __func__);
1718 }
1719 goto done;
1720 }
1721 /*
1722 * Some of the logic for this was nicked from above.
1723 *
1724 * This rewrites the cached route in a local PCB.
1725 * Is this what we want to do?
1726 */
1727 ROUTE_RELEASE(ro_fwd);
1728 bcopy(dst, &ro_fwd->ro_dst, sizeof (*dst));
1729
1730 rtalloc_ign(ro_fwd, RTF_PRCLONING, false);
1731
1732 if (ro_fwd->ro_rt == NULL) {
1733 OSAddAtomic(1, &ipstat.ips_noroute);
1734 error = EHOSTUNREACH;
1735 goto bad;
1736 }
1737
1738 RT_LOCK_SPIN(ro_fwd->ro_rt);
1739 ia_fw = ifatoia(ro_fwd->ro_rt->rt_ifa);
1740 if (ia_fw != NULL) {
1741 /* Become a regular mutex */
1742 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1743 IFA_ADDREF(&ia_fw->ia_ifa);
1744 }
1745 ifp = ro_fwd->ro_rt->rt_ifp;
1746 ro_fwd->ro_rt->rt_use++;
1747 if (ro_fwd->ro_rt->rt_flags & RTF_GATEWAY)
1748 dst = SIN(ro_fwd->ro_rt->rt_gateway);
1749 if (ro_fwd->ro_rt->rt_flags & RTF_HOST) {
1750 /* double negation needed for bool bit field */
1751 ipobf.isbroadcast =
1752 !!(ro_fwd->ro_rt->rt_flags & RTF_BROADCAST);
1753 } else {
1754 /* Become a regular mutex */
1755 RT_CONVERT_LOCK(ro_fwd->ro_rt);
1756 ipobf.isbroadcast =
1757 in_broadcast(dst->sin_addr, ifp);
1758 }
1759 RT_UNLOCK(ro_fwd->ro_rt);
1760 ROUTE_RELEASE(ro);
1761 ro->ro_rt = ro_fwd->ro_rt;
1762 ro_fwd->ro_rt = NULL;
1763 dst = SIN(&ro_fwd->ro_dst);
1764
1765 /*
1766 * If we added a default src ip earlier,
1767 * which would have been gotten from the-then
1768 * interface, do it again, from the new one.
1769 */
1770 if (ia_fw != NULL) {
1771 if (ipobf.fwd_rewrite_src) {
1772 IFA_LOCK_SPIN(&ia_fw->ia_ifa);
1773 ip->ip_src = IA_SIN(ia_fw)->sin_addr;
1774 IFA_UNLOCK(&ia_fw->ia_ifa);
1775 }
1776 IFA_REMREF(&ia_fw->ia_ifa);
1777 }
1778 goto pass;
1779 }
1780#endif /* IPFIREWALL_FORWARD */
1781 /*
1782 * if we get here, none of the above matches, and
1783 * we have to drop the pkt
1784 */
1785 m_freem(m);
1786 error = EACCES; /* not sure this is the right error msg */
1787 goto done;
1788 }
1789
1790pass:
1791#endif /* IPFIREWALL */
1792
1793 /* 127/8 must not appear on wire - RFC1122 */
1794 if (!(ifp->if_flags & IFF_LOOPBACK) &&
1795 ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1796 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1797 OSAddAtomic(1, &ipstat.ips_badaddr);
1798 error = EADDRNOTAVAIL;
1799 goto bad;
1800 }
1801
1802 if (ipoa != NULL) {
1803 u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
1804
1805 error = set_packet_qos(m, ifp,
1806 ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE,
1807 ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp);
1808 if (error == 0) {
1809 ip->ip_tos &= IPTOS_ECN_MASK;
1810 ip->ip_tos |= dscp << IPTOS_DSCP_SHIFT;
1811 } else {
1812 printf("%s if_dscp_for_mbuf() error %d\n", __func__, error);
1813 error = 0;
1814 }
1815 }
1816
1817 /*
1818 * Some Wi-Fi AP implementations do not correctly handle multicast IP
1819 * packets with DSCP bits set -- see radr://9331522 -- so as a
1820 * workaround we clear the DSCP bits and set the service class to BE
1821 */
1822 if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) && IFNET_IS_WIFI_INFRA(ifp)) {
1823 ip->ip_tos &= IPTOS_ECN_MASK;
1824 mbuf_set_service_class(m, MBUF_SC_BE);
1825 }
1826
1827 ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
1828 ip->ip_len, &sw_csum);
1829
1830 interface_mtu = ifp->if_mtu;
1831
1832 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
1833 interface_mtu = IN6_LINKMTU(ifp);
1834 /* Further adjust the size for CLAT46 expansion */
1835 interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
1836 }
1837
1838 /*
1839 * If small enough for interface, or the interface will take
1840 * care of the fragmentation for us, can just send directly.
1841 */
1842 if ((u_short)ip->ip_len <= interface_mtu || TSO_IPV4_OK(ifp, m) ||
1843 (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) {
1844#if BYTE_ORDER != BIG_ENDIAN
1845 HTONS(ip->ip_len);
1846 HTONS(ip->ip_off);
1847#endif
1848
1849 ip->ip_sum = 0;
1850 if (sw_csum & CSUM_DELAY_IP) {
1851 ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1852 sw_csum &= ~CSUM_DELAY_IP;
1853 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1854 }
1855
1856#if IPSEC
1857 /* clean ipsec history once it goes out of the node */
1858 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC))
1859 ipsec_delaux(m);
1860#endif /* IPSEC */
1861 if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) &&
1862 (m->m_pkthdr.tso_segsz > 0))
1863 scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
1864 else
1865 scnt++;
1866
1867 if (packetchain == 0) {
1868 if (ro->ro_rt != NULL && nstat_collect)
1869 nstat_route_tx(ro->ro_rt, scnt,
1870 m->m_pkthdr.len, 0);
1871
1872 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1873 SA(dst), 0, adv);
1874 if (dlil_verbose && error) {
1875 printf("dlil_output error on interface %s: %d\n",
1876 ifp->if_xname, error);
1877 }
1878 scnt = 0;
1879 goto done;
1880 } else {
1881 /*
1882 * packet chaining allows us to reuse the
1883 * route for all packets
1884 */
1885 bytecnt += m->m_pkthdr.len;
1886 mppn = &m->m_nextpkt;
1887 m = m->m_nextpkt;
1888 if (m == NULL) {
1889#if PF
1890sendchain:
1891#endif /* PF */
1892 if (pktcnt > ip_maxchainsent)
1893 ip_maxchainsent = pktcnt;
1894 if (ro->ro_rt != NULL && nstat_collect)
1895 nstat_route_tx(ro->ro_rt, scnt,
1896 bytecnt, 0);
1897
1898 error = dlil_output(ifp, PF_INET, packetlist,
1899 ro->ro_rt, SA(dst), 0, adv);
1900 if (dlil_verbose && error) {
1901 printf("dlil_output error on interface %s: %d\n",
1902 ifp->if_xname, error);
1903 }
1904 pktcnt = 0;
1905 scnt = 0;
1906 bytecnt = 0;
1907 goto done;
1908
1909 }
1910 m0 = m;
1911 pktcnt++;
1912 goto loopit;
1913 }
1914 }
1915
1916 VERIFY(interface_mtu != 0);
1917 /*
1918 * Too large for interface; fragment if possible.
1919 * Must be able to put at least 8 bytes per fragment.
1920 * Balk when DF bit is set or the interface didn't support TSO.
1921 */
1922 if ((ip->ip_off & IP_DF) || pktcnt > 0 ||
1923 (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) {
1924 error = EMSGSIZE;
1925 /*
1926 * This case can happen if the user changed the MTU
1927 * of an interface after enabling IP on it. Because
1928 * most netifs don't keep track of routes pointing to
1929 * them, there is no way for one to update all its
1930 * routes when the MTU is changed.
1931 */
1932 if (ro->ro_rt) {
1933 RT_LOCK_SPIN(ro->ro_rt);
1934 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1935 !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
1936 (ro->ro_rt->rt_rmx.rmx_mtu > interface_mtu)) {
1937 ro->ro_rt->rt_rmx.rmx_mtu = interface_mtu;
1938 }
1939 RT_UNLOCK(ro->ro_rt);
1940 }
1941 if (pktcnt > 0) {
1942 m0 = packetlist;
1943 }
1944 OSAddAtomic(1, &ipstat.ips_cantfrag);
1945 goto bad;
1946 }
1947
1948 /*
1949 * XXX Only TCP seems to be passing a list of packets here.
1950 * The following issue is limited to UDP datagrams with 0 checksum.
1951 * For now limit it to the case when single packet is passed down.
1952 */
1953 if (packetchain == 0 && IS_INTF_CLAT46(ifp)) {
1954 /*
1955 * If it is a UDP packet that has checksum set to 0
1956 * and is also not being offloaded, compute a full checksum
1957 * and update the UDP checksum.
1958 */
1959 if (ip->ip_p == IPPROTO_UDP &&
1960 !(m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_PARTIAL))) {
1961 struct udphdr *uh = NULL;
1962
1963 if (m->m_len < hlen + sizeof (struct udphdr)) {
1964 m = m_pullup(m, hlen + sizeof (struct udphdr));
1965 if (m == NULL) {
1966 error = ENOBUFS;
1967 m0 = m;
1968 goto bad;
1969 }
1970 m0 = m;
1971 ip = mtod(m, struct ip *);
1972 }
1973 /*
1974 * Get UDP header and if checksum is 0, then compute the full
1975 * checksum.
1976 */
1977 uh = (struct udphdr *)(void *)((caddr_t)ip + hlen);
1978 if (uh->uh_sum == 0) {
1979 uh->uh_sum = inet_cksum(m, IPPROTO_UDP, hlen,
1980 ip->ip_len - hlen);
1981 if (uh->uh_sum == 0)
1982 uh->uh_sum = 0xffff;
1983 }
1984 }
1985 }
1986
1987 error = ip_fragment(m, ifp, interface_mtu, sw_csum);
1988 if (error != 0) {
1989 m0 = m = NULL;
1990 goto bad;
1991 }
1992
1993 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1994 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1995
1996 for (m = m0; m; m = m0) {
1997 m0 = m->m_nextpkt;
1998 m->m_nextpkt = 0;
1999#if IPSEC
2000 /* clean ipsec history once it goes out of the node */
2001 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC))
2002 ipsec_delaux(m);
2003#endif /* IPSEC */
2004 if (error == 0) {
2005 if ((packetchain != 0) && (pktcnt > 0)) {
2006 panic("%s: mix of packet in packetlist is "
2007 "wrong=%p", __func__, packetlist);
2008 /* NOTREACHED */
2009 }
2010 if (ro->ro_rt != NULL && nstat_collect) {
2011 nstat_route_tx(ro->ro_rt, 1,
2012 m->m_pkthdr.len, 0);
2013 }
2014 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
2015 SA(dst), 0, adv);
2016 if (dlil_verbose && error) {
2017 printf("dlil_output error on interface %s: %d\n",
2018 ifp->if_xname, error);
2019 }
2020 } else {
2021 m_freem(m);
2022 }
2023 }
2024
2025 if (error == 0)
2026 OSAddAtomic(1, &ipstat.ips_fragmented);
2027
2028done:
2029 if (ia != NULL) {
2030 IFA_REMREF(&ia->ia_ifa);
2031 ia = NULL;
2032 }
2033#if IPSEC
2034 ROUTE_RELEASE(&ipsec_state.ro);
2035 if (sp != NULL) {
2036 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
2037 printf("DP ip_output call free SP:%x\n", sp));
2038 key_freesp(sp, KEY_SADB_UNLOCKED);
2039 }
2040#endif /* IPSEC */
2041#if NECP
2042 ROUTE_RELEASE(&necp_route);
2043#endif /* NECP */
2044#if DUMMYNET
2045 ROUTE_RELEASE(&saved_route);
2046#endif /* DUMMYNET */
2047#if IPFIREWALL_FORWARD
2048 ROUTE_RELEASE(&sro_fwd);
2049#endif /* IPFIREWALL_FORWARD */
2050
2051 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0);
2052 if (ip_output_measure) {
2053 net_perf_measure_time(&net_perf, &start_tv, packets_processed);
2054 net_perf_histogram(&net_perf, packets_processed);
2055 }
2056 return (error);
2057bad:
2058 if (pktcnt > 0)
2059 m0 = packetlist;
2060 m_freem_list(m0);
2061 goto done;
2062
2063#undef ipsec_state
2064#undef args
2065#undef sro_fwd
2066#undef saved_route
2067#undef ipf_pktopts
2068#undef IP_CHECK_RESTRICTIONS
2069}
2070
2071int
2072ip_fragment(struct mbuf *m, struct ifnet *ifp, unsigned long mtu, int sw_csum)
2073{
2074 struct ip *ip, *mhip;
2075 int len, hlen, mhlen, firstlen, off, error = 0;
2076 struct mbuf **mnext = &m->m_nextpkt, *m0;
2077 int nfrags = 1;
2078
2079 ip = mtod(m, struct ip *);
2080#ifdef _IP_VHL
2081 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
2082#else /* !_IP_VHL */
2083 hlen = ip->ip_hl << 2;
2084#endif /* !_IP_VHL */
2085
2086#ifdef INET6
2087 /*
2088 * We need to adjust the fragment sizes to account
2089 * for IPv6 fragment header if it needs to be translated
2090 * from IPv4 to IPv6.
2091 */
2092 if (IS_INTF_CLAT46(ifp))
2093 mtu -= sizeof(struct ip6_frag);
2094
2095#endif
2096 firstlen = len = (mtu - hlen) &~ 7;
2097 if (len < 8) {
2098 m_freem(m);
2099 return (EMSGSIZE);
2100 }
2101
2102 /*
2103 * if the interface will not calculate checksums on
2104 * fragmented packets, then do it here.
2105 */
2106 if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) &&
2107 !(ifp->if_hwassist & CSUM_IP_FRAGS))
2108 in_delayed_cksum(m);
2109
2110 /*
2111 * Loop through length of segment after first fragment,
2112 * make new header and copy data of each part and link onto chain.
2113 */
2114 m0 = m;
2115 mhlen = sizeof (struct ip);
2116 for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
2117 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
2118 if (m == NULL) {
2119 error = ENOBUFS;
2120 OSAddAtomic(1, &ipstat.ips_odropped);
2121 goto sendorfree;
2122 }
2123 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
2124 m->m_data += max_linkhdr;
2125 mhip = mtod(m, struct ip *);
2126 *mhip = *ip;
2127 if (hlen > sizeof (struct ip)) {
2128 mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
2129 mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
2130 }
2131 m->m_len = mhlen;
2132 mhip->ip_off = ((off - hlen) >> 3) + (ip->ip_off & ~IP_MF);
2133 if (ip->ip_off & IP_MF)
2134 mhip->ip_off |= IP_MF;
2135 if (off + len >= (u_short)ip->ip_len)
2136 len = (u_short)ip->ip_len - off;
2137 else
2138 mhip->ip_off |= IP_MF;
2139 mhip->ip_len = htons((u_short)(len + mhlen));
2140 m->m_next = m_copy(m0, off, len);
2141 if (m->m_next == NULL) {
2142 (void) m_free(m);
2143 error = ENOBUFS; /* ??? */
2144 OSAddAtomic(1, &ipstat.ips_odropped);
2145 goto sendorfree;
2146 }
2147 m->m_pkthdr.len = mhlen + len;
2148 m->m_pkthdr.rcvif = NULL;
2149 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
2150
2151 M_COPY_CLASSIFIER(m, m0);
2152 M_COPY_PFTAG(m, m0);
2153
2154#if CONFIG_MACF_NET
2155 mac_netinet_fragment(m0, m);
2156#endif /* CONFIG_MACF_NET */
2157
2158#if BYTE_ORDER != BIG_ENDIAN
2159 HTONS(mhip->ip_off);
2160#endif
2161
2162 mhip->ip_sum = 0;
2163 if (sw_csum & CSUM_DELAY_IP) {
2164 mhip->ip_sum = ip_cksum_hdr_out(m, mhlen);
2165 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
2166 }
2167 *mnext = m;
2168 mnext = &m->m_nextpkt;
2169 nfrags++;
2170 }
2171 OSAddAtomic(nfrags, &ipstat.ips_ofragments);
2172
2173 /* set first/last markers for fragment chain */
2174 m->m_flags |= M_LASTFRAG;
2175 m0->m_flags |= M_FIRSTFRAG | M_FRAG;
2176 m0->m_pkthdr.csum_data = nfrags;
2177
2178 /*
2179 * Update first fragment by trimming what's been copied out
2180 * and updating header, then send each fragment (in order).
2181 */
2182 m = m0;
2183 m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
2184 m->m_pkthdr.len = hlen + firstlen;
2185 ip->ip_len = htons((u_short)m->m_pkthdr.len);
2186 ip->ip_off |= IP_MF;
2187
2188#if BYTE_ORDER != BIG_ENDIAN
2189 HTONS(ip->ip_off);
2190#endif
2191
2192 ip->ip_sum = 0;
2193 if (sw_csum & CSUM_DELAY_IP) {
2194 ip->ip_sum = ip_cksum_hdr_out(m, hlen);
2195 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
2196 }
2197sendorfree:
2198 if (error)
2199 m_freem_list(m0);
2200
2201 return (error);
2202}
2203
2204static void
2205ip_out_cksum_stats(int proto, u_int32_t len)
2206{
2207 switch (proto) {
2208 case IPPROTO_TCP:
2209 tcp_out_cksum_stats(len);
2210 break;
2211 case IPPROTO_UDP:
2212 udp_out_cksum_stats(len);
2213 break;
2214 default:
2215 /* keep only TCP or UDP stats for now */
2216 break;
2217 }
2218}
2219
2220/*
2221 * Process a delayed payload checksum calculation (outbound path.)
2222 *
2223 * hoff is the number of bytes beyond the mbuf data pointer which
2224 * points to the IP header.
2225 *
2226 * Returns a bitmask representing all the work done in software.
2227 */
2228uint32_t
2229in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags)
2230{
2231 unsigned char buf[15 << 2] __attribute__((aligned(8)));
2232 struct ip *ip;
2233 uint32_t offset, _hlen, mlen, hlen, len, sw_csum;
2234 uint16_t csum, ip_len;
2235
2236 _CASSERT(sizeof (csum) == sizeof (uint16_t));
2237 VERIFY(m->m_flags & M_PKTHDR);
2238
2239 sw_csum = (csum_flags & m->m_pkthdr.csum_flags);
2240
2241 if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0)
2242 goto done;
2243
2244 mlen = m->m_pkthdr.len; /* total mbuf len */
2245
2246 /* sanity check (need at least simple IP header) */
2247 if (mlen < (hoff + sizeof (*ip))) {
2248 panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr "
2249 "(%u+%u)\n", __func__, m, mlen, hoff,
2250 (uint32_t)sizeof (*ip));
2251 /* NOTREACHED */
2252 }
2253
2254 /*
2255 * In case the IP header is not contiguous, or not 32-bit aligned,
2256 * or if we're computing the IP header checksum, copy it to a local
2257 * buffer. Copy only the simple IP header here (IP options case
2258 * is handled below.)
2259 */
2260 if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof (*ip)) > m->m_len ||
2261 !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) {
2262 m_copydata(m, hoff, sizeof (*ip), (caddr_t)buf);
2263 ip = (struct ip *)(void *)buf;
2264 _hlen = sizeof (*ip);
2265 } else {
2266 ip = (struct ip *)(void *)(m->m_data + hoff);
2267 _hlen = 0;
2268 }
2269
2270 hlen = IP_VHL_HL(ip->ip_vhl) << 2; /* IP header len */
2271
2272 /* sanity check */
2273 if (mlen < (hoff + hlen)) {
2274 panic("%s: mbuf %p pkt too short (%d) for IP header (%u), "
2275 "hoff %u", __func__, m, mlen, hlen, hoff);
2276 /* NOTREACHED */
2277 }
2278
2279 /*
2280 * We could be in the context of an IP or interface filter; in the
2281 * former case, ip_len would be in host (correct) order while for
2282 * the latter it would be in network order. Because of this, we
2283 * attempt to interpret the length field by comparing it against
2284 * the actual packet length. If the comparison fails, byte swap
2285 * the length and check again. If it still fails, use the actual
2286 * packet length. This also covers the trailing bytes case.
2287 */
2288 ip_len = ip->ip_len;
2289 if (ip_len != (mlen - hoff)) {
2290 ip_len = OSSwapInt16(ip_len);
2291 if (ip_len != (mlen - hoff)) {
2292 printf("%s: mbuf 0x%llx proto %d IP len %d (%x) "
2293 "[swapped %d (%x)] doesn't match actual packet "
2294 "length; %d is used instead\n", __func__,
2295 (uint64_t)VM_KERNEL_ADDRPERM(m), ip->ip_p,
2296 ip->ip_len, ip->ip_len, ip_len, ip_len,
2297 (mlen - hoff));
2298 ip_len = mlen - hoff;
2299 }
2300 }
2301
2302 len = ip_len - hlen; /* csum span */
2303
2304 if (sw_csum & CSUM_DELAY_DATA) {
2305 uint16_t ulpoff;
2306
2307 /*
2308 * offset is added to the lower 16-bit value of csum_data,
2309 * which is expected to contain the ULP offset; therefore
2310 * CSUM_PARTIAL offset adjustment must be undone.
2311 */
2312 if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL|CSUM_DATA_VALID)) ==
2313 (CSUM_PARTIAL|CSUM_DATA_VALID)) {
2314 /*
2315 * Get back the original ULP offset (this will
2316 * undo the CSUM_PARTIAL logic in ip_output.)
2317 */
2318 m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff -
2319 m->m_pkthdr.csum_tx_start);
2320 }
2321
2322 ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */
2323 offset = hoff + hlen; /* ULP header */
2324
2325 if (mlen < (ulpoff + sizeof (csum))) {
2326 panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP "
2327 "cksum offset (%u) cksum flags 0x%x\n", __func__,
2328 m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags);
2329 /* NOTREACHED */
2330 }
2331
2332 csum = inet_cksum(m, 0, offset, len);
2333
2334 /* Update stats */
2335 ip_out_cksum_stats(ip->ip_p, len);
2336
2337 /* RFC1122 4.1.3.4 */
2338 if (csum == 0 &&
2339 (m->m_pkthdr.csum_flags & (CSUM_UDP|CSUM_ZERO_INVERT)))
2340 csum = 0xffff;
2341
2342 /* Insert the checksum in the ULP csum field */
2343 offset += ulpoff;
2344 if (offset + sizeof (csum) > m->m_len) {
2345 m_copyback(m, offset, sizeof (csum), &csum);
2346 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2347 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2348 } else {
2349 bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum));
2350 }
2351 m->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DATA_VALID |
2352 CSUM_PARTIAL | CSUM_ZERO_INVERT);
2353 }
2354
2355 if (sw_csum & CSUM_DELAY_IP) {
2356 /* IP header must be in the local buffer */
2357 VERIFY(_hlen == sizeof (*ip));
2358 if (_hlen != hlen) {
2359 VERIFY(hlen <= sizeof (buf));
2360 m_copydata(m, hoff, hlen, (caddr_t)buf);
2361 ip = (struct ip *)(void *)buf;
2362 _hlen = hlen;
2363 }
2364
2365 /*
2366 * Compute the IP header checksum as if the IP length
2367 * is the length which we believe is "correct"; see
2368 * how ip_len gets calculated above. Note that this
2369 * is done on the local copy and not on the real one.
2370 */
2371 ip->ip_len = htons(ip_len);
2372 ip->ip_sum = 0;
2373 csum = in_cksum_hdr_opt(ip);
2374
2375 /* Update stats */
2376 ipstat.ips_snd_swcsum++;
2377 ipstat.ips_snd_swcsum_bytes += hlen;
2378
2379 /*
2380 * Insert only the checksum in the existing IP header
2381 * csum field; all other fields are left unchanged.
2382 */
2383 offset = hoff + offsetof(struct ip, ip_sum);
2384 if (offset + sizeof (csum) > m->m_len) {
2385 m_copyback(m, offset, sizeof (csum), &csum);
2386 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2387 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2388 } else {
2389 bcopy(&csum, (mtod(m, char *) + offset), sizeof (csum));
2390 }
2391 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
2392 }
2393
2394done:
2395 return (sw_csum);
2396}
2397
2398/*
2399 * Insert IP options into preformed packet.
2400 * Adjust IP destination as required for IP source routing,
2401 * as indicated by a non-zero in_addr at the start of the options.
2402 *
2403 * XXX This routine assumes that the packet has no options in place.
2404 */
2405static struct mbuf *
2406ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
2407{
2408 struct ipoption *p = mtod(opt, struct ipoption *);
2409 struct mbuf *n;
2410 struct ip *ip = mtod(m, struct ip *);
2411 unsigned optlen;
2412
2413 optlen = opt->m_len - sizeof (p->ipopt_dst);
2414 if (optlen + (u_short)ip->ip_len > IP_MAXPACKET)
2415 return (m); /* XXX should fail */
2416 if (p->ipopt_dst.s_addr)
2417 ip->ip_dst = p->ipopt_dst;
2418 if (m->m_flags & M_EXT || m->m_data - optlen < m->m_pktdat) {
2419 MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */
2420 if (n == NULL)
2421 return (m);
2422 n->m_pkthdr.rcvif = 0;
2423#if CONFIG_MACF_NET
2424 mac_mbuf_label_copy(m, n);
2425#endif /* CONFIG_MACF_NET */
2426 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
2427 m->m_len -= sizeof (struct ip);
2428 m->m_data += sizeof (struct ip);
2429 n->m_next = m;
2430 m = n;
2431 m->m_len = optlen + sizeof (struct ip);
2432 m->m_data += max_linkhdr;
2433 (void) memcpy(mtod(m, void *), ip, sizeof (struct ip));
2434 } else {
2435 m->m_data -= optlen;
2436 m->m_len += optlen;
2437 m->m_pkthdr.len += optlen;
2438 ovbcopy((caddr_t)ip, mtod(m, caddr_t), sizeof (struct ip));
2439 }
2440 ip = mtod(m, struct ip *);
2441 bcopy(p->ipopt_list, ip + 1, optlen);
2442 *phlen = sizeof (struct ip) + optlen;
2443 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
2444 ip->ip_len += optlen;
2445 return (m);
2446}
2447
2448/*
2449 * Copy options from ip to jp,
2450 * omitting those not copied during fragmentation.
2451 */
2452static int
2453ip_optcopy(struct ip *ip, struct ip *jp)
2454{
2455 u_char *cp, *dp;
2456 int opt, optlen, cnt;
2457
2458 cp = (u_char *)(ip + 1);
2459 dp = (u_char *)(jp + 1);
2460 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof (struct ip);
2461 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2462 opt = cp[0];
2463 if (opt == IPOPT_EOL)
2464 break;
2465 if (opt == IPOPT_NOP) {
2466 /* Preserve for IP mcast tunnel's LSRR alignment. */
2467 *dp++ = IPOPT_NOP;
2468 optlen = 1;
2469 continue;
2470 }
2471#if DIAGNOSTIC
2472 if (cnt < IPOPT_OLEN + sizeof (*cp)) {
2473 panic("malformed IPv4 option passed to ip_optcopy");
2474 /* NOTREACHED */
2475 }
2476#endif
2477 optlen = cp[IPOPT_OLEN];
2478#if DIAGNOSTIC
2479 if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt) {
2480 panic("malformed IPv4 option passed to ip_optcopy");
2481 /* NOTREACHED */
2482 }
2483#endif
2484 /* bogus lengths should have been caught by ip_dooptions */
2485 if (optlen > cnt)
2486 optlen = cnt;
2487 if (IPOPT_COPIED(opt)) {
2488 bcopy(cp, dp, optlen);
2489 dp += optlen;
2490 }
2491 }
2492 for (optlen = dp - (u_char *)(jp+1); optlen & 0x3; optlen++)
2493 *dp++ = IPOPT_EOL;
2494 return (optlen);
2495}
2496
2497/*
2498 * IP socket option processing.
2499 */
2500int
2501ip_ctloutput(struct socket *so, struct sockopt *sopt)
2502{
2503 struct inpcb *inp = sotoinpcb(so);
2504 int error, optval;
2505
2506 error = optval = 0;
2507 if (sopt->sopt_level != IPPROTO_IP)
2508 return (EINVAL);
2509
2510 switch (sopt->sopt_dir) {
2511 case SOPT_SET:
2512 switch (sopt->sopt_name) {
2513#ifdef notyet
2514 case IP_RETOPTS:
2515#endif
2516 case IP_OPTIONS: {
2517 struct mbuf *m;
2518
2519 if (sopt->sopt_valsize > MLEN) {
2520 error = EMSGSIZE;
2521 break;
2522 }
2523 MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
2524 MT_HEADER);
2525 if (m == NULL) {
2526 error = ENOBUFS;
2527 break;
2528 }
2529 m->m_len = sopt->sopt_valsize;
2530 error = sooptcopyin(sopt, mtod(m, char *),
2531 m->m_len, m->m_len);
2532 if (error) {
2533 m_freem(m);
2534 break;
2535 }
2536
2537 return (ip_pcbopts(sopt->sopt_name,
2538 &inp->inp_options, m));
2539 }
2540
2541 case IP_TOS:
2542 case IP_TTL:
2543 case IP_RECVOPTS:
2544 case IP_RECVRETOPTS:
2545 case IP_RECVDSTADDR:
2546 case IP_RECVIF:
2547 case IP_RECVTTL:
2548 case IP_RECVPKTINFO:
2549 case IP_RECVTOS:
2550 error = sooptcopyin(sopt, &optval, sizeof (optval),
2551 sizeof (optval));
2552 if (error)
2553 break;
2554
2555 switch (sopt->sopt_name) {
2556 case IP_TOS:
2557 inp->inp_ip_tos = optval;
2558 break;
2559
2560 case IP_TTL:
2561 inp->inp_ip_ttl = optval;
2562 break;
2563#define OPTSET(bit) \
2564 if (optval) \
2565 inp->inp_flags |= bit; \
2566 else \
2567 inp->inp_flags &= ~bit;
2568
2569 case IP_RECVOPTS:
2570 OPTSET(INP_RECVOPTS);
2571 break;
2572
2573 case IP_RECVRETOPTS:
2574 OPTSET(INP_RECVRETOPTS);
2575 break;
2576
2577 case IP_RECVDSTADDR:
2578 OPTSET(INP_RECVDSTADDR);
2579 break;
2580
2581 case IP_RECVIF:
2582 OPTSET(INP_RECVIF);
2583 break;
2584
2585 case IP_RECVTTL:
2586 OPTSET(INP_RECVTTL);
2587 break;
2588
2589 case IP_RECVPKTINFO:
2590 OPTSET(INP_PKTINFO);
2591 break;
2592
2593 case IP_RECVTOS:
2594 OPTSET(INP_RECVTOS);
2595 break;
2596 #undef OPTSET
2597 }
2598 break;
2599 /*
2600 * Multicast socket options are processed by the in_mcast
2601 * module.
2602 */
2603 case IP_MULTICAST_IF:
2604 case IP_MULTICAST_IFINDEX:
2605 case IP_MULTICAST_VIF:
2606 case IP_MULTICAST_TTL:
2607 case IP_MULTICAST_LOOP:
2608 case IP_ADD_MEMBERSHIP:
2609 case IP_DROP_MEMBERSHIP:
2610 case IP_ADD_SOURCE_MEMBERSHIP:
2611 case IP_DROP_SOURCE_MEMBERSHIP:
2612 case IP_BLOCK_SOURCE:
2613 case IP_UNBLOCK_SOURCE:
2614 case IP_MSFILTER:
2615 case MCAST_JOIN_GROUP:
2616 case MCAST_LEAVE_GROUP:
2617 case MCAST_JOIN_SOURCE_GROUP:
2618 case MCAST_LEAVE_SOURCE_GROUP:
2619 case MCAST_BLOCK_SOURCE:
2620 case MCAST_UNBLOCK_SOURCE:
2621 error = inp_setmoptions(inp, sopt);
2622 break;
2623
2624 case IP_PORTRANGE:
2625 error = sooptcopyin(sopt, &optval, sizeof (optval),
2626 sizeof (optval));
2627 if (error)
2628 break;
2629
2630 switch (optval) {
2631 case IP_PORTRANGE_DEFAULT:
2632 inp->inp_flags &= ~(INP_LOWPORT);
2633 inp->inp_flags &= ~(INP_HIGHPORT);
2634 break;
2635
2636 case IP_PORTRANGE_HIGH:
2637 inp->inp_flags &= ~(INP_LOWPORT);
2638 inp->inp_flags |= INP_HIGHPORT;
2639 break;
2640
2641 case IP_PORTRANGE_LOW:
2642 inp->inp_flags &= ~(INP_HIGHPORT);
2643 inp->inp_flags |= INP_LOWPORT;
2644 break;
2645
2646 default:
2647 error = EINVAL;
2648 break;
2649 }
2650 break;
2651
2652#if IPSEC
2653 case IP_IPSEC_POLICY: {
2654 caddr_t req = NULL;
2655 size_t len = 0;
2656 int priv;
2657 struct mbuf *m;
2658 int optname;
2659
2660 if ((error = soopt_getm(sopt, &m)) != 0) /* XXX */
2661 break;
2662 if ((error = soopt_mcopyin(sopt, m)) != 0) /* XXX */
2663 break;
2664 priv = (proc_suser(sopt->sopt_p) == 0);
2665 if (m) {
2666 req = mtod(m, caddr_t);
2667 len = m->m_len;
2668 }
2669 optname = sopt->sopt_name;
2670 error = ipsec4_set_policy(inp, optname, req, len, priv);
2671 m_freem(m);
2672 break;
2673 }
2674#endif /* IPSEC */
2675
2676#if TRAFFIC_MGT
2677 case IP_TRAFFIC_MGT_BACKGROUND: {
2678 unsigned background = 0;
2679
2680 error = sooptcopyin(sopt, &background,
2681 sizeof (background), sizeof (background));
2682 if (error)
2683 break;
2684
2685 if (background) {
2686 socket_set_traffic_mgt_flags_locked(so,
2687 TRAFFIC_MGT_SO_BACKGROUND);
2688 } else {
2689 socket_clear_traffic_mgt_flags_locked(so,
2690 TRAFFIC_MGT_SO_BACKGROUND);
2691 }
2692
2693 break;
2694 }
2695#endif /* TRAFFIC_MGT */
2696
2697 /*
2698 * On a multihomed system, scoped routing can be used to
2699 * restrict the source interface used for sending packets.
2700 * The socket option IP_BOUND_IF binds a particular AF_INET
2701 * socket to an interface such that data sent on the socket
2702 * is restricted to that interface. This is unlike the
2703 * SO_DONTROUTE option where the routing table is bypassed;
2704 * therefore it allows for a greater flexibility and control
2705 * over the system behavior, and does not place any restriction
2706 * on the destination address type (e.g. unicast, multicast,
2707 * or broadcast if applicable) or whether or not the host is
2708 * directly reachable. Note that in the multicast transmit
2709 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2710 * IP_BOUND_IF, since the former practically bypasses the
2711 * routing table; in this case, IP_BOUND_IF sets the default
2712 * interface used for sending multicast packets in the absence
2713 * of an explicit multicast transmit interface.
2714 */
2715 case IP_BOUND_IF:
2716 /* This option is settable only for IPv4 */
2717 if (!(inp->inp_vflag & INP_IPV4)) {
2718 error = EINVAL;
2719 break;
2720 }
2721
2722 error = sooptcopyin(sopt, &optval, sizeof (optval),
2723 sizeof (optval));
2724
2725 if (error)
2726 break;
2727
2728 error = inp_bindif(inp, optval, NULL);
2729 break;
2730
2731 case IP_NO_IFT_CELLULAR:
2732 /* This option is settable only for IPv4 */
2733 if (!(inp->inp_vflag & INP_IPV4)) {
2734 error = EINVAL;
2735 break;
2736 }
2737
2738 error = sooptcopyin(sopt, &optval, sizeof (optval),
2739 sizeof (optval));
2740
2741 if (error)
2742 break;
2743
2744 /* once set, it cannot be unset */
2745 if (!optval && INP_NO_CELLULAR(inp)) {
2746 error = EINVAL;
2747 break;
2748 }
2749
2750 error = so_set_restrictions(so,
2751 SO_RESTRICT_DENY_CELLULAR);
2752 break;
2753
2754 case IP_OUT_IF:
2755 /* This option is not settable */
2756 error = EINVAL;
2757 break;
2758
2759 default:
2760 error = ENOPROTOOPT;
2761 break;
2762 }
2763 break;
2764
2765 case SOPT_GET:
2766 switch (sopt->sopt_name) {
2767 case IP_OPTIONS:
2768 case IP_RETOPTS:
2769 if (inp->inp_options) {
2770 error = sooptcopyout(sopt,
2771 mtod(inp->inp_options, char *),
2772 inp->inp_options->m_len);
2773 } else {
2774 sopt->sopt_valsize = 0;
2775 }
2776 break;
2777
2778 case IP_TOS:
2779 case IP_TTL:
2780 case IP_RECVOPTS:
2781 case IP_RECVRETOPTS:
2782 case IP_RECVDSTADDR:
2783 case IP_RECVIF:
2784 case IP_RECVTTL:
2785 case IP_PORTRANGE:
2786 case IP_RECVPKTINFO:
2787 case IP_RECVTOS:
2788 switch (sopt->sopt_name) {
2789 case IP_TOS:
2790 optval = inp->inp_ip_tos;
2791 break;
2792
2793 case IP_TTL:
2794 optval = inp->inp_ip_ttl;
2795 break;
2796
2797#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
2798
2799 case IP_RECVOPTS:
2800 optval = OPTBIT(INP_RECVOPTS);
2801 break;
2802
2803 case IP_RECVRETOPTS:
2804 optval = OPTBIT(INP_RECVRETOPTS);
2805 break;
2806
2807 case IP_RECVDSTADDR:
2808 optval = OPTBIT(INP_RECVDSTADDR);
2809 break;
2810
2811 case IP_RECVIF:
2812 optval = OPTBIT(INP_RECVIF);
2813 break;
2814
2815 case IP_RECVTTL:
2816 optval = OPTBIT(INP_RECVTTL);
2817 break;
2818
2819 case IP_PORTRANGE:
2820 if (inp->inp_flags & INP_HIGHPORT)
2821 optval = IP_PORTRANGE_HIGH;
2822 else if (inp->inp_flags & INP_LOWPORT)
2823 optval = IP_PORTRANGE_LOW;
2824 else
2825 optval = 0;
2826 break;
2827
2828 case IP_RECVPKTINFO:
2829 optval = OPTBIT(INP_PKTINFO);
2830 break;
2831
2832 case IP_RECVTOS:
2833 optval = OPTBIT(INP_RECVTOS);
2834 break;
2835 }
2836 error = sooptcopyout(sopt, &optval, sizeof (optval));
2837 break;
2838
2839 case IP_MULTICAST_IF:
2840 case IP_MULTICAST_IFINDEX:
2841 case IP_MULTICAST_VIF:
2842 case IP_MULTICAST_TTL:
2843 case IP_MULTICAST_LOOP:
2844 case IP_MSFILTER:
2845 error = inp_getmoptions(inp, sopt);
2846 break;
2847
2848#if IPSEC
2849 case IP_IPSEC_POLICY: {
2850 error = 0; /* This option is no longer supported */
2851 break;
2852 }
2853#endif /* IPSEC */
2854
2855#if TRAFFIC_MGT
2856 case IP_TRAFFIC_MGT_BACKGROUND: {
2857 unsigned background = (so->so_flags1 &
2858 SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
2859 return (sooptcopyout(sopt, &background,
2860 sizeof (background)));
2861 }
2862#endif /* TRAFFIC_MGT */
2863
2864 case IP_BOUND_IF:
2865 if (inp->inp_flags & INP_BOUND_IF)
2866 optval = inp->inp_boundifp->if_index;
2867 error = sooptcopyout(sopt, &optval, sizeof (optval));
2868 break;
2869
2870 case IP_NO_IFT_CELLULAR:
2871 optval = INP_NO_CELLULAR(inp) ? 1 : 0;
2872 error = sooptcopyout(sopt, &optval, sizeof (optval));
2873 break;
2874
2875 case IP_OUT_IF:
2876 optval = (inp->inp_last_outifp != NULL) ?
2877 inp->inp_last_outifp->if_index : 0;
2878 error = sooptcopyout(sopt, &optval, sizeof (optval));
2879 break;
2880
2881 default:
2882 error = ENOPROTOOPT;
2883 break;
2884 }
2885 break;
2886 }
2887 return (error);
2888}
2889
2890/*
2891 * Set up IP options in pcb for insertion in output packets.
2892 * Store in mbuf with pointer in pcbopt, adding pseudo-option
2893 * with destination address if source routed.
2894 */
2895static int
2896ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m)
2897{
2898#pragma unused(optname)
2899 int cnt, optlen;
2900 u_char *cp;
2901 u_char opt;
2902
2903 /* turn off any old options */
2904 if (*pcbopt)
2905 (void) m_free(*pcbopt);
2906 *pcbopt = 0;
2907 if (m == (struct mbuf *)0 || m->m_len == 0) {
2908 /*
2909 * Only turning off any previous options.
2910 */
2911 if (m)
2912 (void) m_free(m);
2913 return (0);
2914 }
2915
2916 if (m->m_len % sizeof (int32_t))
2917 goto bad;
2918
2919 /*
2920 * IP first-hop destination address will be stored before
2921 * actual options; move other options back
2922 * and clear it when none present.
2923 */
2924 if (m->m_data + m->m_len + sizeof (struct in_addr) >= &m->m_dat[MLEN])
2925 goto bad;
2926 cnt = m->m_len;
2927 m->m_len += sizeof (struct in_addr);
2928 cp = mtod(m, u_char *) + sizeof (struct in_addr);
2929 ovbcopy(mtod(m, caddr_t), (caddr_t)cp, (unsigned)cnt);
2930 bzero(mtod(m, caddr_t), sizeof (struct in_addr));
2931
2932 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2933 opt = cp[IPOPT_OPTVAL];
2934 if (opt == IPOPT_EOL)
2935 break;
2936 if (opt == IPOPT_NOP)
2937 optlen = 1;
2938 else {
2939 if (cnt < IPOPT_OLEN + sizeof (*cp))
2940 goto bad;
2941 optlen = cp[IPOPT_OLEN];
2942 if (optlen < IPOPT_OLEN + sizeof (*cp) || optlen > cnt)
2943 goto bad;
2944 }
2945 switch (opt) {
2946
2947 default:
2948 break;
2949
2950 case IPOPT_LSRR:
2951 case IPOPT_SSRR:
2952 /*
2953 * user process specifies route as:
2954 * ->A->B->C->D
2955 * D must be our final destination (but we can't
2956 * check that since we may not have connected yet).
2957 * A is first hop destination, which doesn't appear in
2958 * actual IP option, but is stored before the options.
2959 */
2960 if (optlen < IPOPT_MINOFF - 1 + sizeof (struct in_addr))
2961 goto bad;
2962 m->m_len -= sizeof (struct in_addr);
2963 cnt -= sizeof (struct in_addr);
2964 optlen -= sizeof (struct in_addr);
2965 cp[IPOPT_OLEN] = optlen;
2966 /*
2967 * Move first hop before start of options.
2968 */
2969 bcopy((caddr_t)&cp[IPOPT_OFFSET+1], mtod(m, caddr_t),
2970 sizeof (struct in_addr));
2971 /*
2972 * Then copy rest of options back
2973 * to close up the deleted entry.
2974 */
2975 ovbcopy((caddr_t)(&cp[IPOPT_OFFSET+1] +
2976 sizeof (struct in_addr)),
2977 (caddr_t)&cp[IPOPT_OFFSET+1],
2978 (unsigned)cnt + sizeof (struct in_addr));
2979 break;
2980 }
2981 }
2982 if (m->m_len > MAX_IPOPTLEN + sizeof (struct in_addr))
2983 goto bad;
2984 *pcbopt = m;
2985 return (0);
2986
2987bad:
2988 (void) m_free(m);
2989 return (EINVAL);
2990}
2991
2992void
2993ip_moptions_init(void)
2994{
2995 PE_parse_boot_argn("ifa_debug", &imo_debug, sizeof (imo_debug));
2996
2997 imo_size = (imo_debug == 0) ? sizeof (struct ip_moptions) :
2998 sizeof (struct ip_moptions_dbg);
2999
3000 imo_zone = zinit(imo_size, IMO_ZONE_MAX * imo_size, 0,
3001 IMO_ZONE_NAME);
3002 if (imo_zone == NULL) {
3003 panic("%s: failed allocating %s", __func__, IMO_ZONE_NAME);
3004 /* NOTREACHED */
3005 }
3006 zone_change(imo_zone, Z_EXPAND, TRUE);
3007}
3008
3009void
3010imo_addref(struct ip_moptions *imo, int locked)
3011{
3012 if (!locked)
3013 IMO_LOCK(imo);
3014 else
3015 IMO_LOCK_ASSERT_HELD(imo);
3016
3017 if (++imo->imo_refcnt == 0) {
3018 panic("%s: imo %p wraparound refcnt\n", __func__, imo);
3019 /* NOTREACHED */
3020 } else if (imo->imo_trace != NULL) {
3021 (*imo->imo_trace)(imo, TRUE);
3022 }
3023
3024 if (!locked)
3025 IMO_UNLOCK(imo);
3026}
3027
3028void
3029imo_remref(struct ip_moptions *imo)
3030{
3031 int i;
3032
3033 IMO_LOCK(imo);
3034 if (imo->imo_refcnt == 0) {
3035 panic("%s: imo %p negative refcnt", __func__, imo);
3036 /* NOTREACHED */
3037 } else if (imo->imo_trace != NULL) {
3038 (*imo->imo_trace)(imo, FALSE);
3039 }
3040
3041 --imo->imo_refcnt;
3042 if (imo->imo_refcnt > 0) {
3043 IMO_UNLOCK(imo);
3044 return;
3045 }
3046
3047 for (i = 0; i < imo->imo_num_memberships; ++i) {
3048 struct in_mfilter *imf;
3049
3050 imf = imo->imo_mfilters ? &imo->imo_mfilters[i] : NULL;
3051 if (imf != NULL)
3052 imf_leave(imf);
3053
3054 (void) in_leavegroup(imo->imo_membership[i], imf);
3055
3056 if (imf != NULL)
3057 imf_purge(imf);
3058
3059 INM_REMREF(imo->imo_membership[i]);
3060 imo->imo_membership[i] = NULL;
3061 }
3062 imo->imo_num_memberships = 0;
3063 if (imo->imo_mfilters != NULL) {
3064 FREE(imo->imo_mfilters, M_INMFILTER);
3065 imo->imo_mfilters = NULL;
3066 }
3067 if (imo->imo_membership != NULL) {
3068 FREE(imo->imo_membership, M_IPMOPTS);
3069 imo->imo_membership = NULL;
3070 }
3071 IMO_UNLOCK(imo);
3072
3073 lck_mtx_destroy(&imo->imo_lock, ifa_mtx_grp);
3074
3075 if (!(imo->imo_debug & IFD_ALLOC)) {
3076 panic("%s: imo %p cannot be freed", __func__, imo);
3077 /* NOTREACHED */
3078 }
3079 zfree(imo_zone, imo);
3080}
3081
3082static void
3083imo_trace(struct ip_moptions *imo, int refhold)
3084{
3085 struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
3086 ctrace_t *tr;
3087 u_int32_t idx;
3088 u_int16_t *cnt;
3089
3090 if (!(imo->imo_debug & IFD_DEBUG)) {
3091 panic("%s: imo %p has no debug structure", __func__, imo);
3092 /* NOTREACHED */
3093 }
3094 if (refhold) {
3095 cnt = &imo_dbg->imo_refhold_cnt;
3096 tr = imo_dbg->imo_refhold;
3097 } else {
3098 cnt = &imo_dbg->imo_refrele_cnt;
3099 tr = imo_dbg->imo_refrele;
3100 }
3101
3102 idx = atomic_add_16_ov(cnt, 1) % IMO_TRACE_HIST_SIZE;
3103 ctrace_record(&tr[idx]);
3104}
3105
3106struct ip_moptions *
3107ip_allocmoptions(int how)
3108{
3109 struct ip_moptions *imo;
3110
3111 imo = (how == M_WAITOK) ? zalloc(imo_zone) : zalloc_noblock(imo_zone);
3112 if (imo != NULL) {
3113 bzero(imo, imo_size);
3114 lck_mtx_init(&imo->imo_lock, ifa_mtx_grp, ifa_mtx_attr);
3115 imo->imo_debug |= IFD_ALLOC;
3116 if (imo_debug != 0) {
3117 imo->imo_debug |= IFD_DEBUG;
3118 imo->imo_trace = imo_trace;
3119 }
3120 IMO_ADDREF(imo);
3121 }
3122
3123 return (imo);
3124}
3125
3126/*
3127 * Routine called from ip_output() to loop back a copy of an IP multicast
3128 * packet to the input queue of a specified interface. Note that this
3129 * calls the output routine of the loopback "driver", but with an interface
3130 * pointer that might NOT be a loopback interface -- evil, but easier than
3131 * replicating that code here.
3132 */
3133static void
3134ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m,
3135 struct sockaddr_in *dst, int hlen)
3136{
3137 struct mbuf *copym;
3138 struct ip *ip;
3139
3140 if (lo_ifp == NULL)
3141 return;
3142
3143 /*
3144 * Copy the packet header as it's needed for the checksum
3145 * Make sure to deep-copy IP header portion in case the data
3146 * is in an mbuf cluster, so that we can safely override the IP
3147 * header portion later.
3148 */
3149 copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, M_COPYM_COPY_HDR);
3150 if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen))
3151 copym = m_pullup(copym, hlen);
3152
3153 if (copym == NULL)
3154 return;
3155
3156 /*
3157 * We don't bother to fragment if the IP length is greater
3158 * than the interface's MTU. Can this possibly matter?
3159 */
3160 ip = mtod(copym, struct ip *);
3161#if BYTE_ORDER != BIG_ENDIAN
3162 HTONS(ip->ip_len);
3163 HTONS(ip->ip_off);
3164#endif
3165 ip->ip_sum = 0;
3166 ip->ip_sum = ip_cksum_hdr_out(copym, hlen);
3167
3168 /*
3169 * Mark checksum as valid unless receive checksum offload is
3170 * disabled; if so, compute checksum in software. If the
3171 * interface itself is lo0, this will be overridden by if_loop.
3172 */
3173 if (hwcksum_rx) {
3174 copym->m_pkthdr.csum_flags &= ~(CSUM_PARTIAL|CSUM_ZERO_INVERT);
3175 copym->m_pkthdr.csum_flags |=
3176 CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
3177 copym->m_pkthdr.csum_data = 0xffff;
3178 } else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
3179#if BYTE_ORDER != BIG_ENDIAN
3180 NTOHS(ip->ip_len);
3181#endif
3182 in_delayed_cksum(copym);
3183#if BYTE_ORDER != BIG_ENDIAN
3184 HTONS(ip->ip_len);
3185#endif
3186 }
3187
3188 /*
3189 * Stuff the 'real' ifp into the pkthdr, to be used in matching
3190 * in ip_input(); we need the loopback ifp/dl_tag passed as args
3191 * to make the loopback driver compliant with the data link
3192 * requirements.
3193 */
3194 copym->m_pkthdr.rcvif = origifp;
3195
3196 /*
3197 * Also record the source interface (which owns the source address).
3198 * This is basically a stripped down version of ifa_foraddr().
3199 */
3200 if (srcifp == NULL) {
3201 struct in_ifaddr *ia;
3202
3203 lck_rw_lock_shared(in_ifaddr_rwlock);
3204 TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) {
3205 IFA_LOCK_SPIN(&ia->ia_ifa);
3206 if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) {
3207 srcifp = ia->ia_ifp;
3208 IFA_UNLOCK(&ia->ia_ifa);
3209 break;
3210 }
3211 IFA_UNLOCK(&ia->ia_ifa);
3212 }
3213 lck_rw_done(in_ifaddr_rwlock);
3214 }
3215 if (srcifp != NULL)
3216 ip_setsrcifaddr_info(copym, srcifp->if_index, NULL);
3217 ip_setdstifaddr_info(copym, origifp->if_index, NULL);
3218
3219 dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL);
3220}
3221
3222/*
3223 * Given a source IP address (and route, if available), determine the best
3224 * interface to send the packet from. Checking for (and updating) the
3225 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3226 * without any locks based on the assumption that ip_output() is single-
3227 * threaded per-pcb, i.e. for any given pcb there can only be one thread
3228 * performing output at the IP layer.
3229 *
3230 * This routine is analogous to in6_selectroute() for IPv6.
3231 */
3232static struct ifaddr *
3233in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3234{
3235 struct ifaddr *ifa = NULL;
3236 struct in_addr src = ip->ip_src;
3237 struct in_addr dst = ip->ip_dst;
3238 struct ifnet *rt_ifp;
3239 char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3240
3241 VERIFY(src.s_addr != INADDR_ANY);
3242
3243 if (ip_select_srcif_debug) {
3244 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof (s_src));
3245 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof (s_dst));
3246 }
3247
3248 if (ro->ro_rt != NULL)
3249 RT_LOCK(ro->ro_rt);
3250
3251 rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3252
3253 /*
3254 * Given the source IP address, find a suitable source interface
3255 * to use for transmission; if the caller has specified a scope,
3256 * optimize the search by looking at the addresses only for that
3257 * interface. This is still suboptimal, however, as we need to
3258 * traverse the per-interface list.
3259 */
3260 if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3261 unsigned int scope = ifscope;
3262
3263 /*
3264 * If no scope is specified and the route is stale (pointing
3265 * to a defunct interface) use the current primary interface;
3266 * this happens when switching between interfaces configured
3267 * with the same IP address. Otherwise pick up the scope
3268 * information from the route; the ULP may have looked up a
3269 * correct route and we just need to verify it here and mark
3270 * it with the ROF_SRCIF_SELECTED flag below.
3271 */
3272 if (scope == IFSCOPE_NONE) {
3273 scope = rt_ifp->if_index;
3274 if (scope != get_primary_ifscope(AF_INET) &&
3275 ROUTE_UNUSABLE(ro))
3276 scope = get_primary_ifscope(AF_INET);
3277 }
3278
3279 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3280
3281 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3282 ip->ip_p != IPPROTO_TCP && ipforwarding) {
3283 /*
3284 * If forwarding is enabled, and if the packet isn't
3285 * TCP or UDP, check if the source address belongs
3286 * to one of our own interfaces; if so, demote the
3287 * interface scope and do a route lookup right below.
3288 */
3289 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3290 if (ifa != NULL) {
3291 IFA_REMREF(ifa);
3292 ifa = NULL;
3293 ifscope = IFSCOPE_NONE;
3294 }
3295 }
3296
3297 if (ip_select_srcif_debug && ifa != NULL) {
3298 if (ro->ro_rt != NULL) {
3299 printf("%s->%s ifscope %d->%d ifa_if %s "
3300 "ro_if %s\n", s_src, s_dst, ifscope,
3301 scope, if_name(ifa->ifa_ifp),
3302 if_name(rt_ifp));
3303 } else {
3304 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3305 s_src, s_dst, ifscope, scope,
3306 if_name(ifa->ifa_ifp));
3307 }
3308 }
3309 }
3310
3311 /*
3312 * Slow path; search for an interface having the corresponding source
3313 * IP address if the scope was not specified by the caller, and:
3314 *
3315 * 1) There currently isn't any route, or,
3316 * 2) The interface used by the route does not own that source
3317 * IP address; in this case, the route will get blown away
3318 * and we'll do a more specific scoped search using the newly
3319 * found interface.
3320 */
3321 if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3322 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3323
3324 /*
3325 * If we have the IP address, but not the route, we don't
3326 * really know whether or not it belongs to the correct
3327 * interface (it could be shared across multiple interfaces.)
3328 * The only way to find out is to do a route lookup.
3329 */
3330 if (ifa != NULL && ro->ro_rt == NULL) {
3331 struct rtentry *rt;
3332 struct sockaddr_in sin;
3333 struct ifaddr *oifa = NULL;
3334
3335 bzero(&sin, sizeof (sin));
3336 sin.sin_family = AF_INET;
3337 sin.sin_len = sizeof (sin);
3338 sin.sin_addr = dst;
3339
3340 lck_mtx_lock(rnh_lock);
3341 if ((rt = rt_lookup(TRUE, SA(&sin), NULL,
3342 rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3343 RT_LOCK(rt);
3344 /*
3345 * If the route uses a different interface,
3346 * use that one instead. The IP address of
3347 * the ifaddr that we pick up here is not
3348 * relevant.
3349 */
3350 if (ifa->ifa_ifp != rt->rt_ifp) {
3351 oifa = ifa;
3352 ifa = rt->rt_ifa;
3353 IFA_ADDREF(ifa);
3354 RT_UNLOCK(rt);
3355 } else {
3356 RT_UNLOCK(rt);
3357 }
3358 rtfree_locked(rt);
3359 }
3360 lck_mtx_unlock(rnh_lock);
3361
3362 if (oifa != NULL) {
3363 struct ifaddr *iifa;
3364
3365 /*
3366 * See if the interface pointed to by the
3367 * route is configured with the source IP
3368 * address of the packet.
3369 */
3370 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3371 src.s_addr, ifa->ifa_ifp->if_index);
3372
3373 if (iifa != NULL) {
3374 /*
3375 * Found it; drop the original one
3376 * as well as the route interface
3377 * address, and use this instead.
3378 */
3379 IFA_REMREF(oifa);
3380 IFA_REMREF(ifa);
3381 ifa = iifa;
3382 } else if (!ipforwarding ||
3383 (rt->rt_flags & RTF_GATEWAY)) {
3384 /*
3385 * This interface doesn't have that
3386 * source IP address; drop the route
3387 * interface address and just use the
3388 * original one, and let the caller
3389 * do a scoped route lookup.
3390 */
3391 IFA_REMREF(ifa);
3392 ifa = oifa;
3393 } else {
3394 /*
3395 * Forwarding is enabled and the source
3396 * address belongs to one of our own
3397 * interfaces which isn't the outgoing
3398 * interface, and we have a route, and
3399 * the destination is on a network that
3400 * is directly attached (onlink); drop
3401 * the original one and use the route
3402 * interface address instead.
3403 */
3404 IFA_REMREF(oifa);
3405 }
3406 }
3407 } else if (ifa != NULL && ro->ro_rt != NULL &&
3408 !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3409 ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3410 /*
3411 * Forwarding is enabled and the source address belongs
3412 * to one of our own interfaces which isn't the same
3413 * as the interface used by the known route; drop the
3414 * original one and use the route interface address.
3415 */
3416 IFA_REMREF(ifa);
3417 ifa = ro->ro_rt->rt_ifa;
3418 IFA_ADDREF(ifa);
3419 }
3420
3421 if (ip_select_srcif_debug && ifa != NULL) {
3422 printf("%s->%s ifscope %d ifa_if %s\n",
3423 s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3424 }
3425 }
3426
3427 if (ro->ro_rt != NULL)
3428 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3429 /*
3430 * If there is a non-loopback route with the wrong interface, or if
3431 * there is no interface configured with such an address, blow it
3432 * away. Except for local/loopback, we look for one with a matching
3433 * interface scope/index.
3434 */
3435 if (ro->ro_rt != NULL &&
3436 (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3437 !(ro->ro_rt->rt_flags & RTF_UP))) {
3438 if (ip_select_srcif_debug) {
3439 if (ifa != NULL) {
3440 printf("%s->%s ifscope %d ro_if %s != "
3441 "ifa_if %s (cached route cleared)\n",
3442 s_src, s_dst, ifscope, if_name(rt_ifp),
3443 if_name(ifa->ifa_ifp));
3444 } else {
3445 printf("%s->%s ifscope %d ro_if %s "
3446 "(no ifa_if found)\n",
3447 s_src, s_dst, ifscope, if_name(rt_ifp));
3448 }
3449 }
3450
3451 RT_UNLOCK(ro->ro_rt);
3452 ROUTE_RELEASE(ro);
3453
3454 /*
3455 * If the destination is IPv4 LLA and the route's interface
3456 * doesn't match the source interface, then the source IP
3457 * address is wrong; it most likely belongs to the primary
3458 * interface associated with the IPv4 LL subnet. Drop the
3459 * packet rather than letting it go out and return an error
3460 * to the ULP. This actually applies not only to IPv4 LL
3461 * but other shared subnets; for now we explicitly test only
3462 * for the former case and save the latter for future.
3463 */
3464 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3465 !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3466 IFA_REMREF(ifa);
3467 ifa = NULL;
3468 }
3469 }
3470
3471 if (ip_select_srcif_debug && ifa == NULL) {
3472 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3473 s_src, s_dst, ifscope);
3474 }
3475
3476 /*
3477 * If there is a route, mark it accordingly. If there isn't one,
3478 * we'll get here again during the next transmit (possibly with a
3479 * route) and the flag will get set at that point. For IPv4 LLA
3480 * destination, mark it only if the route has been fully resolved;
3481 * otherwise we want to come back here again when the route points
3482 * to the interface over which the ARP reply arrives on.
3483 */
3484 if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3485 (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3486 SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3487 if (ifa != NULL)
3488 IFA_ADDREF(ifa); /* for route */
3489 if (ro->ro_srcia != NULL)
3490 IFA_REMREF(ro->ro_srcia);
3491 ro->ro_srcia = ifa;
3492 ro->ro_flags |= ROF_SRCIF_SELECTED;
3493 RT_GENID_SYNC(ro->ro_rt);
3494 }
3495
3496 if (ro->ro_rt != NULL)
3497 RT_UNLOCK(ro->ro_rt);
3498
3499 return (ifa);
3500}
3501
3502/*
3503 * @brief Given outgoing interface it determines what checksum needs
3504 * to be computed in software and what needs to be offloaded to the
3505 * interface.
3506 *
3507 * @param ifp Pointer to the outgoing interface
3508 * @param m Pointer to the packet
3509 * @param hlen IP header length
3510 * @param ip_len Total packet size i.e. headers + data payload
3511 * @param sw_csum Pointer to a software checksum flag set
3512 *
3513 * @return void
3514 */
3515void
3516ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
3517 uint32_t *sw_csum)
3518{
3519 int tso = TSO_IPV4_OK(ifp, m);
3520 uint32_t hwcap = ifp->if_hwassist;
3521
3522 m->m_pkthdr.csum_flags |= CSUM_IP;
3523
3524 if (!hwcksum_tx) {
3525 /* do all in software; hardware checksum offload is disabled */
3526 *sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3527 m->m_pkthdr.csum_flags;
3528 } else {
3529 /* do in software what the hardware cannot */
3530 *sw_csum = m->m_pkthdr.csum_flags &
3531 ~IF_HWASSIST_CSUM_FLAGS(hwcap);
3532 }
3533
3534 if (hlen != sizeof (struct ip)) {
3535 *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3536 m->m_pkthdr.csum_flags);
3537 } else if (!(*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) {
3538 int interface_mtu = ifp->if_mtu;
3539
3540 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
3541 interface_mtu = IN6_LINKMTU(ifp);
3542 /* Further adjust the size for CLAT46 expansion */
3543 interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
3544 }
3545
3546 /*
3547 * Partial checksum offload, if non-IP fragment, and TCP only
3548 * (no UDP support, as the hardware may not be able to convert
3549 * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
3550 * supports "invert zero" capability.)
3551 */
3552 if (hwcksum_tx && !tso &&
3553 ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
3554 ((hwcap & CSUM_ZERO_INVERT) &&
3555 (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
3556 ip_len <= interface_mtu) {
3557 uint16_t start = sizeof (struct ip);
3558 uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
3559 m->m_pkthdr.csum_flags |=
3560 (CSUM_DATA_VALID | CSUM_PARTIAL);
3561 m->m_pkthdr.csum_tx_stuff = (ulpoff + start);
3562 m->m_pkthdr.csum_tx_start = start;
3563 /* do IP hdr chksum in software */
3564 *sw_csum = CSUM_DELAY_IP;
3565 } else {
3566 *sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
3567 }
3568 }
3569
3570 if (*sw_csum & CSUM_DELAY_DATA) {
3571 in_delayed_cksum(m);
3572 *sw_csum &= ~CSUM_DELAY_DATA;
3573 }
3574
3575 if (hwcksum_tx) {
3576 /*
3577 * Drop off bits that aren't supported by hardware;
3578 * also make sure to preserve non-checksum related bits.
3579 */
3580 m->m_pkthdr.csum_flags =
3581 ((m->m_pkthdr.csum_flags &
3582 (IF_HWASSIST_CSUM_FLAGS(hwcap) | CSUM_DATA_VALID)) |
3583 (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK));
3584 } else {
3585 /* drop all bits; hardware checksum offload is disabled */
3586 m->m_pkthdr.csum_flags = 0;
3587 }
3588}
3589
3590/*
3591 * GRE protocol output for PPP/PPTP
3592 */
3593int
3594ip_gre_output(struct mbuf *m)
3595{
3596 struct route ro;
3597 int error;
3598
3599 bzero(&ro, sizeof (ro));
3600
3601 error = ip_output(m, NULL, &ro, 0, NULL, NULL);
3602
3603 ROUTE_RELEASE(&ro);
3604
3605 return (error);
3606}
3607
3608static int
3609sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
3610{
3611#pragma unused(arg1, arg2)
3612 int error, i;
3613
3614 i = ip_output_measure;
3615 error = sysctl_handle_int(oidp, &i, 0, req);
3616 if (error || req->newptr == USER_ADDR_NULL)
3617 goto done;
3618 /* impose bounds */
3619 if (i < 0 || i > 1) {
3620 error = EINVAL;
3621 goto done;
3622 }
3623 if (ip_output_measure != i && i == 1) {
3624 net_perf_initialize(&net_perf, ip_output_measure_bins);
3625 }
3626 ip_output_measure = i;
3627done:
3628 return (error);
3629}
3630
3631static int
3632sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
3633{
3634#pragma unused(arg1, arg2)
3635 int error;
3636 uint64_t i;
3637
3638 i = ip_output_measure_bins;
3639 error = sysctl_handle_quad(oidp, &i, 0, req);
3640 if (error || req->newptr == USER_ADDR_NULL)
3641 goto done;
3642 /* validate data */
3643 if (!net_perf_validate_bins(i)) {
3644 error = EINVAL;
3645 goto done;
3646 }
3647 ip_output_measure_bins = i;
3648done:
3649 return (error);
3650}
3651
3652static int
3653sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
3654{
3655#pragma unused(oidp, arg1, arg2)
3656 if (req->oldptr == USER_ADDR_NULL)
3657 req->oldlen = (size_t)sizeof (struct ipstat);
3658
3659 return (SYSCTL_OUT(req, &net_perf, MIN(sizeof (net_perf), req->oldlen)));
3660}
3661