1/*
2 * Copyright (c) 2000-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * Copyright (c) 1982, 1986, 1988, 1990, 1993
30 * The Regents of the University of California. All rights reserved.
31 *
32 * Redistribution and use in source and binary forms, with or without
33 * modification, are permitted provided that the following conditions
34 * are met:
35 * 1. Redistributions of source code must retain the above copyright
36 * notice, this list of conditions and the following disclaimer.
37 * 2. Redistributions in binary form must reproduce the above copyright
38 * notice, this list of conditions and the following disclaimer in the
39 * documentation and/or other materials provided with the distribution.
40 * 3. All advertising materials mentioning features or use of this software
41 * must display the following acknowledgement:
42 * This product includes software developed by the University of
43 * California, Berkeley and its contributors.
44 * 4. Neither the name of the University nor the names of its contributors
45 * may be used to endorse or promote products derived from this software
46 * without specific prior written permission.
47 *
48 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
49 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
50 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
51 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
52 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
53 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
54 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
55 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
56 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
57 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
58 * SUCH DAMAGE.
59 *
60 * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
61 */
62/*
63 * NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
64 * support for mandatory and extensible security protections. This notice
65 * is included in support of clause 2.2 (b) of the Apple Public License,
66 * Version 2.0.
67 */
68
69#define _IP_VHL
70
71#include <sys/param.h>
72#include <sys/systm.h>
73#include <sys/kernel.h>
74#include <sys/malloc.h>
75#include <sys/mbuf.h>
76#include <sys/protosw.h>
77#include <sys/socket.h>
78#include <sys/socketvar.h>
79#include <kern/locks.h>
80#include <sys/sysctl.h>
81#include <sys/mcache.h>
82#include <sys/kdebug.h>
83
84#include <machine/endian.h>
85#include <pexpert/pexpert.h>
86#include <mach/sdt.h>
87
88#include <libkern/OSAtomic.h>
89#include <libkern/OSByteOrder.h>
90
91#include <net/if.h>
92#include <net/if_dl.h>
93#include <net/if_types.h>
94#include <net/route.h>
95#include <net/ntstat.h>
96#include <net/net_osdep.h>
97#include <net/dlil.h>
98#include <net/net_perf.h>
99
100#include <netinet/in.h>
101#include <netinet/in_systm.h>
102#include <netinet/ip.h>
103#include <netinet/in_pcb.h>
104#include <netinet/in_var.h>
105#include <netinet/ip_var.h>
106#include <netinet/kpi_ipfilter_var.h>
107#include <netinet/in_tclass.h>
108#include <netinet/udp.h>
109
110#include <netinet6/nd6.h>
111
112#define DBG_LAYER_BEG NETDBG_CODE(DBG_NETIP, 1)
113#define DBG_LAYER_END NETDBG_CODE(DBG_NETIP, 3)
114#define DBG_FNC_IP_OUTPUT NETDBG_CODE(DBG_NETIP, (1 << 8) | 1)
115#define DBG_FNC_IPSEC4_OUTPUT NETDBG_CODE(DBG_NETIP, (2 << 8) | 1)
116
117#if IPSEC
118#include <netinet6/ipsec.h>
119#include <netkey/key.h>
120#if IPSEC_DEBUG
121#include <netkey/key_debug.h>
122#else
123#define KEYDEBUG(lev, arg)
124#endif
125#endif /* IPSEC */
126
127#if NECP
128#include <net/necp.h>
129#endif /* NECP */
130
131
132#if DUMMYNET
133#include <netinet/ip_dummynet.h>
134#endif
135
136#if PF
137#include <net/pfvar.h>
138#endif /* PF */
139
140#include <net/sockaddr_utils.h>
141
142u_short ip_id;
143
144static int sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS;
145static int sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS;
146static int sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS;
147static void ip_out_cksum_stats(int, u_int32_t);
148static struct mbuf *ip_insertoptions(struct mbuf *, struct mbuf *, int *);
149static int ip_optcopy(struct ip *, struct ip *);
150static int ip_pcbopts(int, struct mbuf **, struct mbuf *);
151static void imo_trace(struct ip_moptions *, int);
152static void ip_mloopback(struct ifnet *, struct ifnet *, struct mbuf *,
153 struct sockaddr_in *, int);
154static struct ifaddr *in_selectsrcif(struct ip *, struct route *, unsigned int);
155
156extern struct ip_linklocal_stat ip_linklocal_stat;
157
158/* temporary: for testing */
159#if IPSEC
160extern int ipsec_bypass;
161#endif
162
163static int force_ipsum = 0;
164static int ip_maxchainsent = 0;
165SYSCTL_INT(_net_inet_ip, OID_AUTO, maxchainsent,
166 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_maxchainsent, 0,
167 "use dlil_output_list");
168
169SYSCTL_INT(_net_inet_ip, OID_AUTO, force_ipsum,
170 CTLFLAG_RW | CTLFLAG_LOCKED, &force_ipsum, 0,
171 "force IP checksum");
172#if DEBUG
173static int forge_ce = 0;
174SYSCTL_INT(_net_inet_ip, OID_AUTO, forge_ce,
175 CTLFLAG_RW | CTLFLAG_LOCKED, &forge_ce, 0,
176 "Forge ECN CE");
177#endif /* DEBUG */
178
179static int ip_select_srcif_debug = 0;
180SYSCTL_INT(_net_inet_ip, OID_AUTO, select_srcif_debug,
181 CTLFLAG_RW | CTLFLAG_LOCKED, &ip_select_srcif_debug, 0,
182 "log source interface selection debug info");
183
184static int ip_output_measure = 0;
185SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf,
186 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
187 &ip_output_measure, 0, sysctl_reset_ip_output_stats, "I",
188 "Do time measurement");
189
190static uint64_t ip_output_measure_bins = 0;
191SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_bins,
192 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED, &ip_output_measure_bins, 0,
193 sysctl_ip_output_measure_bins, "I",
194 "bins for chaining performance data histogram");
195
196static net_perf_t net_perf;
197SYSCTL_PROC(_net_inet_ip, OID_AUTO, output_perf_data,
198 CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_LOCKED,
199 0, 0, sysctl_ip_output_getperf, "S,net_perf",
200 "IP output performance data (struct net_perf, net/net_perf.h)");
201
202__private_extern__ int rfc6864 = 1;
203SYSCTL_INT(_net_inet_ip, OID_AUTO, rfc6864, CTLFLAG_RW | CTLFLAG_LOCKED,
204 &rfc6864, 0, "updated ip id field behavior");
205
206#define IMO_TRACE_HIST_SIZE 32 /* size of trace history */
207
208/* For gdb */
209__private_extern__ unsigned int imo_trace_hist_size = IMO_TRACE_HIST_SIZE;
210
211struct ip_moptions_dbg {
212 struct ip_moptions imo; /* ip_moptions */
213 u_int16_t imo_refhold_cnt; /* # of IMO_ADDREF */
214 u_int16_t imo_refrele_cnt; /* # of IMO_REMREF */
215 /*
216 * Alloc and free callers.
217 */
218 ctrace_t imo_alloc;
219 ctrace_t imo_free;
220 /*
221 * Circular lists of IMO_ADDREF and IMO_REMREF callers.
222 */
223 ctrace_t imo_refhold[IMO_TRACE_HIST_SIZE];
224 ctrace_t imo_refrele[IMO_TRACE_HIST_SIZE];
225};
226
227#if DEBUG
228static unsigned int imo_debug = 1; /* debugging (enabled) */
229#else
230static unsigned int imo_debug; /* debugging (disabled) */
231#endif /* !DEBUG */
232
233static struct zone *imo_zone; /* zone for ip_moptions */
234#define IMO_ZONE_NAME "ip_moptions" /* zone name */
235
236#if PF
237__attribute__((noinline))
238static int
239ip_output_pf_dn_hook(struct ifnet *ifp, struct mbuf **mppn, struct mbuf **mp,
240 struct pf_rule *dn_pf_rule, struct route *ro, struct sockaddr_in *dst, int flags,
241 struct ip_out_args *ipoa)
242{
243 int rc;
244 struct ip_fw_args args = {};
245
246 args.fwa_pf_rule = dn_pf_rule;
247 args.fwa_oif = ifp;
248 args.fwa_ro = ro;
249 args.fwa_dst = dst;
250 args.fwa_oflags = flags;
251 if (flags & IP_OUTARGS) {
252 args.fwa_ipoa = ipoa;
253 }
254 rc = pf_af_hook(ifp, mppn, mp, AF_INET, FALSE, &args);
255
256 return rc;
257}
258
259#endif /* PF */
260
261
/*
 * IP output, single-packet entry point.
 *
 * Forwards to ip_output_list() with a packetchain argument of 0; all
 * other arguments are passed through untouched and the result of
 * ip_output_list() is returned as-is.
 *
 * The packet in mbuf chain m0 contains a skeletal IP header (with len,
 * off, ttl, proto, tos, src, dst).  The mbuf chain containing the packet
 * will be freed.  The mbuf opt, if present, will not be freed.
 */
int
ip_output(struct mbuf *m0, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct ip_out_args *ipoa)
{
	const int packetchain = 0;

	return ip_output_list(m0, packetchain, opt, ro, flags, imo, ipoa);
}
274
275/*
276 * IP output. The packet in mbuf chain m contains a skeletal IP
277 * header (with len, off, ttl, proto, tos, src, dst).
278 * The mbuf chain containing the packet will be freed.
279 * The mbuf opt, if present, will not be freed.
280 *
281 * Route ro MUST be non-NULL; if ro->ro_rt is valid, route lookup would be
282 * skipped and ro->ro_rt would be used. Otherwise the result of route
283 * lookup is stored in ro->ro_rt.
284 *
285 * In the IP forwarding case, the packet will arrive with options already
286 * inserted, so must have a NULL opt pointer.
287 */
288int
289ip_output_list(struct mbuf *m0, int packetchain, struct mbuf *opt,
290 struct route *ro, int flags, struct ip_moptions *imo,
291 struct ip_out_args *ipoa)
292{
293 struct ip *ip;
294 struct ifnet *ifp = NULL; /* not refcnt'd */
295 struct mbuf *m = m0, *prevnxt = NULL, **mppn = &prevnxt;
296 int hlen = sizeof(struct ip);
297 int len = 0, error = 0;
298 struct sockaddr_in *dst = NULL;
299 struct in_ifaddr *ia = NULL, *src_ia = NULL;
300 struct in_addr pkt_dst;
301 struct ipf_pktopts *ippo = NULL;
302 ipfilter_t inject_filter_ref = NULL;
303 struct mbuf *packetlist;
304 uint32_t sw_csum, pktcnt = 0, scnt = 0, bytecnt = 0;
305 uint32_t packets_processed = 0;
306 unsigned int ifscope = IFSCOPE_NONE;
307 struct flowadv *adv = NULL;
308 struct timeval start_tv;
309#if IPSEC
310 struct socket *so = NULL;
311 struct secpolicy *sp = NULL;
312#endif /* IPSEC */
313#if NECP
314 necp_kernel_policy_result necp_result = 0;
315 necp_kernel_policy_result_parameter necp_result_parameter;
316 necp_kernel_policy_id necp_matched_policy_id = 0;
317#endif /* NECP */
318#if DUMMYNET
319 struct m_tag *tag;
320 struct ip_out_args saved_ipoa;
321 struct sockaddr_in dst_buf;
322#endif /* DUMMYNET */
323 struct {
324#if IPSEC
325 struct ipsec_output_state ipsec_state;
326#endif /* IPSEC */
327#if NECP
328 struct route necp_route;
329#endif /* NECP */
330#if DUMMYNET
331 struct route saved_route;
332#endif /* DUMMYNET */
333 struct ipf_pktopts ipf_pktopts;
334 } ipobz;
335#define ipsec_state ipobz.ipsec_state
336#define necp_route ipobz.necp_route
337#define sro_fwd ipobz.sro_fwd
338#define saved_route ipobz.saved_route
339#define ipf_pktopts ipobz.ipf_pktopts
340 union {
341 struct {
342 boolean_t select_srcif : 1; /* set once */
343 boolean_t srcbound : 1; /* set once */
344 boolean_t nocell : 1; /* set once */
345 boolean_t isbroadcast : 1;
346 boolean_t didfilter : 1;
347 boolean_t noexpensive : 1; /* set once */
348 boolean_t noconstrained : 1; /* set once */
349 boolean_t awdl_unrestricted : 1; /* set once */
350 boolean_t management_allowed : 1; /* set once */
351 };
352 uint32_t raw;
353 } ipobf = { .raw = 0 };
354
355 int interface_mtu = 0;
356 struct pf_rule *dn_pf_rule = NULL;
357/*
358 * Here we check for restrictions when sending frames.
359 * N.B.: IPv4 over internal co-processor interfaces is not allowed.
360 */
361#define IP_CHECK_RESTRICTIONS(_ifp, _ipobf) \
362 (((_ipobf).nocell && IFNET_IS_CELLULAR(_ifp)) || \
363 ((_ipobf).noexpensive && IFNET_IS_EXPENSIVE(_ifp)) || \
364 ((_ipobf).noconstrained && IFNET_IS_CONSTRAINED(_ifp)) || \
365 (IFNET_IS_INTCOPROC(_ifp)) || \
366 (!(_ipobf).management_allowed && IFNET_IS_MANAGEMENT(_ifp)) || \
367 (!(_ipobf).awdl_unrestricted && IFNET_IS_AWDL_RESTRICTED(_ifp)))
368
369 if (ip_output_measure) {
370 net_perf_start_time(npp: &net_perf, tv: &start_tv);
371 }
372 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
373
374 VERIFY(m0->m_flags & M_PKTHDR);
375 packetlist = m0;
376
	/* zero out {ipsec_state, necp_route, saved_route, ipf_pktopts} */
378 bzero(s: &ipobz, n: sizeof(ipobz));
379 ippo = &ipf_pktopts;
380
381#if DUMMYNET
382 if (SLIST_EMPTY(&m0->m_pkthdr.tags)) {
383 goto ipfw_tags_done;
384 }
385
386 /* Grab info from mtags prepended to the chain */
387 if ((tag = m_tag_locate(m0, KERNEL_MODULE_TAG_ID,
388 KERNEL_TAG_TYPE_DUMMYNET)) != NULL) {
389 struct dn_pkt_tag *dn_tag;
390
391 dn_tag = (struct dn_pkt_tag *)(tag->m_tag_data);
392 dn_pf_rule = dn_tag->dn_pf_rule;
393 opt = NULL;
394 saved_route = dn_tag->dn_ro;
395 ro = &saved_route;
396
397 imo = NULL;
398 SOCKADDR_COPY(&dn_tag->dn_dst, &dst_buf, sizeof(dst_buf));
399 dst = &dst_buf;
400 ifp = dn_tag->dn_ifp;
401 flags = dn_tag->dn_flags;
402 if ((dn_tag->dn_flags & IP_OUTARGS)) {
403 saved_ipoa = dn_tag->dn_ipoa;
404 ipoa = &saved_ipoa;
405 }
406
407 m_tag_delete(m0, tag);
408 }
409ipfw_tags_done:
410#endif /* DUMMYNET */
411
412 m = m0;
413 m->m_pkthdr.pkt_flags &= ~(PKTF_LOOP | PKTF_IFAINFO);
414
415#if IPSEC
416 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
417 /* If packet is bound to an interface, check bound policies */
418 if ((flags & IP_OUTARGS) && (ipoa != NULL) &&
419 (ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
420 ipoa->ipoa_boundif != IFSCOPE_NONE) {
421 if (ipsec4_getpolicybyinterface(m, IPSEC_DIR_OUTBOUND,
422 &flags, ipoa, &sp) != 0) {
423 goto bad;
424 }
425 }
426 }
427#endif /* IPSEC */
428
429 VERIFY(ro != NULL);
430
431 if (flags & IP_OUTARGS) {
432 /*
433 * In the forwarding case, only the ifscope value is used,
434 * as source interface selection doesn't take place.
435 */
436 if ((ipobf.select_srcif = (!(flags & IP_FORWARDING) &&
437 (ipoa->ipoa_flags & IPOAF_SELECT_SRCIF)))) {
438 ipf_pktopts.ippo_flags |= IPPOF_SELECT_SRCIF;
439 }
440
441 if ((ipoa->ipoa_flags & IPOAF_BOUND_IF) &&
442 ipoa->ipoa_boundif != IFSCOPE_NONE) {
443 ifscope = ipoa->ipoa_boundif;
444 ipf_pktopts.ippo_flags |=
445 (IPPOF_BOUND_IF | (ifscope << IPPOF_SHIFT_IFSCOPE));
446 }
447
448 /* double negation needed for bool bit field */
449 ipobf.srcbound = !!(ipoa->ipoa_flags & IPOAF_BOUND_SRCADDR);
450 if (ipobf.srcbound) {
451 ipf_pktopts.ippo_flags |= IPPOF_BOUND_SRCADDR;
452 }
453 } else {
454 ipobf.select_srcif = FALSE;
455 ipobf.srcbound = FALSE;
456 ifscope = IFSCOPE_NONE;
457 if (flags & IP_OUTARGS) {
458 ipoa->ipoa_boundif = IFSCOPE_NONE;
459 ipoa->ipoa_flags &= ~(IPOAF_SELECT_SRCIF |
460 IPOAF_BOUND_IF | IPOAF_BOUND_SRCADDR);
461 }
462 }
463
464 if (flags & IP_OUTARGS) {
465 if (ipoa->ipoa_flags & IPOAF_NO_CELLULAR) {
466 ipobf.nocell = true;
467 ipf_pktopts.ippo_flags |= IPPOF_NO_IFT_CELLULAR;
468 }
469 if (ipoa->ipoa_flags & IPOAF_NO_EXPENSIVE) {
470 ipobf.noexpensive = true;
471 ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_EXPENSIVE;
472 }
473 if (ipoa->ipoa_flags & IPOAF_NO_CONSTRAINED) {
474 ipobf.noconstrained = true;
475 ipf_pktopts.ippo_flags |= IPPOF_NO_IFF_CONSTRAINED;
476 }
477 if (ipoa->ipoa_flags & IPOAF_AWDL_UNRESTRICTED) {
478 ipobf.awdl_unrestricted = true;
479 }
480 if (ipoa->ipoa_flags & IPOAF_MANAGEMENT_ALLOWED) {
481 ipobf.management_allowed = true;
482 }
483 adv = &ipoa->ipoa_flowadv;
484 adv->code = FADV_SUCCESS;
485 ipoa->ipoa_flags &= ~IPOAF_RET_MASK;
486 }
487
488#if IPSEC
489 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
490 so = ipsec_getsocket(m);
491 if (so != NULL) {
492 (void) ipsec_setsocket(m, NULL);
493 }
494 }
495#endif /* IPSEC */
496
497#if DUMMYNET
498 if (dn_pf_rule != NULL) {
499 /* dummynet already saw us */
500 ip = mtod(m, struct ip *);
501 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
502 pkt_dst = ip->ip_dst;
503 if (ro->ro_rt != NULL) {
504 RT_LOCK_SPIN(ro->ro_rt);
505 ia = (struct in_ifaddr *)ro->ro_rt->rt_ifa;
506 if (ia) {
507 /* Become a regular mutex */
508 RT_CONVERT_LOCK(ro->ro_rt);
509 ifa_addref(ifa: &ia->ia_ifa);
510 }
511 RT_UNLOCK(ro->ro_rt);
512 }
513
514 goto sendit;
515 }
516#endif /* DUMMYNET */
517
518loopit:
519 packets_processed++;
520 ipobf.isbroadcast = FALSE;
521 ipobf.didfilter = FALSE;
522
523 VERIFY(m->m_flags & M_PKTHDR);
524 /*
	 * No need to process packet twice if we've already seen it.
526 */
527 if (!SLIST_EMPTY(&m->m_pkthdr.tags)) {
528 inject_filter_ref = ipf_get_inject_filter(m);
529 } else {
530 inject_filter_ref = NULL;
531 }
532
533 if (opt) {
534 m = ip_insertoptions(m, opt, &len);
535 hlen = len;
536 /* Update the chain */
537 if (m != m0) {
538 if (m0 == packetlist) {
539 packetlist = m;
540 }
541 m0 = m;
542 }
543 }
544 ip = mtod(m, struct ip *);
545
546 pkt_dst = ip->ip_dst;
547
548 /*
549 * We must not send if the packet is destined to network zero.
550 * RFC1122 3.2.1.3 (a) and (b).
551 */
552 if (IN_ZERONET(ntohl(pkt_dst.s_addr))) {
553 error = EHOSTUNREACH;
554 goto bad;
555 }
556
557 /*
558 * Fill in IP header.
559 */
560 if (!(flags & (IP_FORWARDING | IP_RAWOUTPUT))) {
561 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, hlen >> 2);
562 ip->ip_off &= IP_DF;
563 if (rfc6864 && IP_OFF_IS_ATOMIC(ip->ip_off)) {
564 // Per RFC6864, value of ip_id is undefined for atomic ip packets
565 ip->ip_id = 0;
566 } else {
567 ip->ip_id = ip_randomid((uint64_t)m);
568 }
569 OSAddAtomic(1, &ipstat.ips_localout);
570 } else {
571 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
572 }
573
574#if DEBUG
575 /* For debugging, we let the stack forge congestion */
576 if (forge_ce != 0 &&
577 ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1 ||
578 (ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT0)) {
579 ip->ip_tos = (ip->ip_tos & ~IPTOS_ECN_MASK) | IPTOS_ECN_CE;
580 forge_ce--;
581 }
582#endif /* DEBUG */
583
584 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_ECT1) {
585 m->m_pkthdr.pkt_ext_flags |= PKTF_EXT_L4S;
586 }
587
588 KERNEL_DEBUG(DBG_LAYER_BEG, ip->ip_dst.s_addr, ip->ip_src.s_addr,
589 ip->ip_p, ip->ip_off, ip->ip_len);
590
591 dst = SIN(&ro->ro_dst);
592
593 /*
594 * If there is a cached route,
595 * check that it is to the same destination
596 * and is still up. If not, free it and try again.
597 * The address family should also be checked in case of sharing the
598 * cache with IPv6.
599 */
600
601 if (ro->ro_rt != NULL) {
602 if (ROUTE_UNUSABLE(ro) && ip->ip_src.s_addr != INADDR_ANY &&
603 !(flags & (IP_ROUTETOIF | IP_FORWARDING))) {
604 src_ia = ifa_foraddr(ip->ip_src.s_addr);
605 if (src_ia == NULL) {
606 error = EADDRNOTAVAIL;
607 goto bad;
608 }
609 ifa_remref(ifa: &src_ia->ia_ifa);
610 src_ia = NULL;
611 }
612 /*
613 * Test rt_flags without holding rt_lock for performance
614 * reasons; if the route is down it will hopefully be
615 * caught by the layer below (since it uses this route
616 * as a hint) or during the next transmit.
617 */
618 if (ROUTE_UNUSABLE(ro) || dst->sin_family != AF_INET ||
619 dst->sin_addr.s_addr != pkt_dst.s_addr) {
620 ROUTE_RELEASE(ro);
621 }
622
623 /*
624 * If we're doing source interface selection, we may not
625 * want to use this route; only synch up the generation
626 * count otherwise.
627 */
628 if (!ipobf.select_srcif && ro->ro_rt != NULL &&
629 RT_GENID_OUTOFSYNC(ro->ro_rt)) {
630 RT_GENID_SYNC(ro->ro_rt);
631 }
632 }
633 if (ro->ro_rt == NULL) {
634 SOCKADDR_ZERO(dst, sizeof(*dst));
635 dst->sin_family = AF_INET;
636 dst->sin_len = sizeof(*dst);
637 dst->sin_addr = pkt_dst;
638 }
639 /*
640 * If routing to interface only,
641 * short circuit routing lookup.
642 */
643 if (flags & IP_ROUTETOIF) {
644 if (ia != NULL) {
645 ifa_remref(ifa: &ia->ia_ifa);
646 }
647 if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst)))) == NULL) {
648 ia = ifatoia(ifa_ifwithnet(sintosa(dst)));
649 if (ia == NULL) {
650 OSAddAtomic(1, &ipstat.ips_noroute);
651 error = ENETUNREACH;
652 /* XXX IPv6 APN fallback notification?? */
653 goto bad;
654 }
655 }
656 ifp = ia->ia_ifp;
657 ip->ip_ttl = 1;
658 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
659 /*
660 * For consistency with other cases below. Loopback
661 * multicast case is handled separately by ip_mloopback().
662 */
663 if ((ifp->if_flags & IFF_LOOPBACK) &&
664 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
665 m->m_pkthdr.rcvif = ifp;
666 ip_setsrcifaddr_info(m, ifp->if_index, NULL);
667 ip_setdstifaddr_info(m, ifp->if_index, NULL);
668 }
669 } else if (IN_MULTICAST(ntohl(pkt_dst.s_addr)) &&
670 imo != NULL && (ifp = imo->imo_multicast_ifp) != NULL) {
671 /*
672 * Bypass the normal routing lookup for multicast
673 * packets if the interface is specified.
674 */
675 ipobf.isbroadcast = FALSE;
676 if (ia != NULL) {
677 ifa_remref(ifa: &ia->ia_ifa);
678 }
679
680 /* Macro takes reference on ia */
681 IFP_TO_IA(ifp, ia);
682 } else {
683 struct ifaddr *ia0 = NULL;
684 boolean_t cloneok = FALSE;
685 /*
686 * Perform source interface selection; the source IP address
687 * must belong to one of the addresses of the interface used
688 * by the route. For performance reasons, do this only if
689 * there is no route, or if the routing table has changed,
690 * or if we haven't done source interface selection on this
691 * route (for this PCB instance) before.
692 */
693 if (ipobf.select_srcif &&
694 ip->ip_src.s_addr != INADDR_ANY && (ROUTE_UNUSABLE(ro) ||
695 !(ro->ro_flags & ROF_SRCIF_SELECTED))) {
696 /* Find the source interface */
697 ia0 = in_selectsrcif(ip, ro, ifscope);
698
699 /*
700 * If the source address belongs to a restricted
701 * interface and the caller forbids our using
702 * interfaces of such type, pretend that there is no
703 * route.
704 */
705 if (ia0 != NULL &&
706 IP_CHECK_RESTRICTIONS(ia0->ifa_ifp, ipobf)) {
707 ifa_remref(ifa: ia0);
708 ia0 = NULL;
709 error = EHOSTUNREACH;
710 if (flags & IP_OUTARGS) {
711 ipoa->ipoa_flags |= IPOAF_R_IFDENIED;
712 }
713 goto bad;
714 }
715
716 /*
717 * If the source address is spoofed (in the case of
718 * IP_RAWOUTPUT on an unbounded socket), or if this
719 * is destined for local/loopback, just let it go out
720 * using the interface of the route. Otherwise,
721 * there's no interface having such an address,
722 * so bail out.
723 */
724 if (ia0 == NULL && (!(flags & IP_RAWOUTPUT) ||
725 ipobf.srcbound) && ifscope != lo_ifp->if_index) {
726 error = EADDRNOTAVAIL;
727 goto bad;
728 }
729
730 /*
731 * If the caller didn't explicitly specify the scope,
732 * pick it up from the source interface. If the cached
733 * route was wrong and was blown away as part of source
734 * interface selection, don't mask out RTF_PRCLONING
735 * since that route may have been allocated by the ULP,
736 * unless the IP header was created by the caller or
737 * the destination is IPv4 LLA. The check for the
738 * latter is needed because IPv4 LLAs are never scoped
739 * in the current implementation, and we don't want to
740 * replace the resolved IPv4 LLA route with one whose
741 * gateway points to that of the default gateway on
742 * the primary interface of the system.
743 */
744 if (ia0 != NULL) {
745 if (ifscope == IFSCOPE_NONE) {
746 ifscope = ia0->ifa_ifp->if_index;
747 }
748 cloneok = (!(flags & IP_RAWOUTPUT) &&
749 !(IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))));
750 }
751 }
752
753 /*
754 * If this is the case, we probably don't want to allocate
755 * a protocol-cloned route since we didn't get one from the
756 * ULP. This lets TCP do its thing, while not burdening
757 * forwarding or ICMP with the overhead of cloning a route.
758 * Of course, we still want to do any cloning requested by
759 * the link layer, as this is probably required in all cases
760 * for correct operation (as it is for ARP).
761 */
762 if (ro->ro_rt == NULL) {
763 uint32_t ign = RTF_PRCLONING;
764 /*
765 * We make an exception here: if the destination
766 * address is INADDR_BROADCAST, allocate a protocol-
767 * cloned host route so that we end up with a route
768 * marked with the RTF_BROADCAST flag. Otherwise,
769 * we would end up referring to the default route,
770 * instead of creating a cloned host route entry.
771 * That would introduce inconsistencies between ULPs
772 * that allocate a route and those that don't. The
773 * RTF_BROADCAST route is important since we'd want
774 * to send out undirected IP broadcast packets using
775 * link-level broadcast address. Another exception
776 * is for ULP-created routes that got blown away by
777 * source interface selection (see above).
778 *
779 * These exceptions will no longer be necessary when
780 * the RTF_PRCLONING scheme is no longer present.
781 */
782 if (cloneok || dst->sin_addr.s_addr == INADDR_BROADCAST) {
783 ign &= ~RTF_PRCLONING;
784 }
785
786 /*
787 * Loosen the route lookup criteria if the ifscope
788 * corresponds to the loopback interface; this is
789 * needed to support Application Layer Gateways
790 * listening on loopback, in conjunction with packet
791 * filter redirection rules. The final source IP
792 * address will be rewritten by the packet filter
793 * prior to the RFC1122 loopback check below.
794 */
795 if (ifscope == lo_ifp->if_index) {
796 rtalloc_ign(ro, ign);
797 } else {
798 rtalloc_scoped_ign(ro, ign, ifscope);
799 }
800
801 /*
802 * If the route points to a cellular/expensive interface
803 * and the caller forbids our using interfaces of such type,
804 * pretend that there is no route.
805 */
806 if (ro->ro_rt != NULL) {
807 RT_LOCK_SPIN(ro->ro_rt);
808 if (IP_CHECK_RESTRICTIONS(ro->ro_rt->rt_ifp,
809 ipobf)) {
810 RT_UNLOCK(ro->ro_rt);
811 ROUTE_RELEASE(ro);
812 if (flags & IP_OUTARGS) {
813 ipoa->ipoa_flags |=
814 IPOAF_R_IFDENIED;
815 }
816 } else {
817 RT_UNLOCK(ro->ro_rt);
818 }
819 }
820 }
821
822 if (ro->ro_rt == NULL) {
823 OSAddAtomic(1, &ipstat.ips_noroute);
824 error = EHOSTUNREACH;
825 if (ia0 != NULL) {
826 ifa_remref(ifa: ia0);
827 ia0 = NULL;
828 }
829 goto bad;
830 }
831
832 if (ia != NULL) {
833 ifa_remref(ifa: &ia->ia_ifa);
834 }
835 RT_LOCK_SPIN(ro->ro_rt);
836 ia = ifatoia(ro->ro_rt->rt_ifa);
837 if (ia != NULL) {
838 /* Become a regular mutex */
839 RT_CONVERT_LOCK(ro->ro_rt);
840 ifa_addref(ifa: &ia->ia_ifa);
841 }
842 /*
843 * Note: ia_ifp may not be the same as rt_ifp; the latter
844 * is what we use for determining outbound i/f, mtu, etc.
845 */
846 ifp = ro->ro_rt->rt_ifp;
847 ro->ro_rt->rt_use++;
848 if (ro->ro_rt->rt_flags & RTF_GATEWAY) {
849 dst = SIN(ro->ro_rt->rt_gateway);
850 }
851 if (ro->ro_rt->rt_flags & RTF_HOST) {
852 /* double negation needed for bool bit field */
853 ipobf.isbroadcast =
854 !!(ro->ro_rt->rt_flags & RTF_BROADCAST);
855 } else {
856 /* Become a regular mutex */
857 RT_CONVERT_LOCK(ro->ro_rt);
858 ipobf.isbroadcast = in_broadcast(dst->sin_addr, ifp);
859 }
860 /*
861 * For consistency with IPv6, as well as to ensure that
862 * IP_RECVIF is set correctly for packets that are sent
863 * to one of the local addresses. ia (rt_ifa) would have
864 * been fixed up by rt_setif for local routes. This
865 * would make it appear as if the packet arrives on the
866 * interface which owns the local address. Loopback
867 * multicast case is handled separately by ip_mloopback().
868 */
869 if (ia != NULL && (ifp->if_flags & IFF_LOOPBACK) &&
870 !IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
871 uint16_t srcidx;
872
873 m->m_pkthdr.rcvif = ia->ia_ifa.ifa_ifp;
874
875 if (ia0 != NULL) {
876 srcidx = ia0->ifa_ifp->if_index;
877 } else if ((ro->ro_flags & ROF_SRCIF_SELECTED) &&
878 ro->ro_srcia != NULL) {
879 srcidx = ro->ro_srcia->ifa_ifp->if_index;
880 } else {
881 srcidx = 0;
882 }
883
884 ip_setsrcifaddr_info(m, srcidx, NULL);
885 ip_setdstifaddr_info(m, 0, ia);
886 }
887 RT_UNLOCK(ro->ro_rt);
888 if (ia0 != NULL) {
889 ifa_remref(ifa: ia0);
890 ia0 = NULL;
891 }
892 }
893
894 if (IN_MULTICAST(ntohl(pkt_dst.s_addr))) {
895 struct ifnet *srcifp = NULL;
896 struct in_multi *inm;
897 u_int32_t vif = 0;
898 u_int8_t ttl = IP_DEFAULT_MULTICAST_TTL;
899 u_int8_t loop = IP_DEFAULT_MULTICAST_LOOP;
900
901 m->m_flags |= M_MCAST;
902 /*
903 * IP destination address is multicast. Make sure "dst"
904 * still points to the address in "ro". (It may have been
905 * changed to point to a gateway address, above.)
906 */
907 dst = SIN(&ro->ro_dst);
908 /*
909 * See if the caller provided any multicast options
910 */
911 if (imo != NULL) {
912 IMO_LOCK(imo);
913 vif = imo->imo_multicast_vif;
914 ttl = imo->imo_multicast_ttl;
915 loop = imo->imo_multicast_loop;
916 if (!(flags & IP_RAWOUTPUT)) {
917 ip->ip_ttl = ttl;
918 }
919 if (imo->imo_multicast_ifp != NULL) {
920 ifp = imo->imo_multicast_ifp;
921 }
922 IMO_UNLOCK(imo);
923 } else if (!(flags & IP_RAWOUTPUT)) {
924 vif = -1;
925 ip->ip_ttl = ttl;
926 }
927 /*
928 * Confirm that the outgoing interface supports multicast.
929 */
930 if (imo == NULL || vif == -1) {
931 if (!(ifp->if_flags & IFF_MULTICAST)) {
932 OSAddAtomic(1, &ipstat.ips_noroute);
933 error = ENETUNREACH;
934 goto bad;
935 }
936 }
937 /*
938 * If source address not specified yet, use address
939 * of outgoing interface.
940 */
941 if (ip->ip_src.s_addr == INADDR_ANY) {
942 struct in_ifaddr *ia1;
943 lck_rw_lock_shared(lck: &in_ifaddr_rwlock);
944 TAILQ_FOREACH(ia1, &in_ifaddrhead, ia_link) {
945 IFA_LOCK_SPIN(&ia1->ia_ifa);
946 if (ia1->ia_ifp == ifp) {
947 ip->ip_src = IA_SIN(ia1)->sin_addr;
948 srcifp = ifp;
949 IFA_UNLOCK(&ia1->ia_ifa);
950 break;
951 }
952 IFA_UNLOCK(&ia1->ia_ifa);
953 }
954 lck_rw_done(lck: &in_ifaddr_rwlock);
955 if (ip->ip_src.s_addr == INADDR_ANY) {
956 error = ENETUNREACH;
957 goto bad;
958 }
959 }
960
961 in_multihead_lock_shared();
962 IN_LOOKUP_MULTI(&pkt_dst, ifp, inm);
963 in_multihead_lock_done();
964 if (inm != NULL && (imo == NULL || loop)) {
965 /*
966 * If we belong to the destination multicast group
967 * on the outgoing interface, and the caller did not
968 * forbid loopback, loop back a copy.
969 */
970 if (!TAILQ_EMPTY(&ipv4_filters)
971#if NECP
972 && !necp_packet_should_skip_filters(packet: m)
973#endif // NECP
974 ) {
975 struct ipfilter *filter;
976 int seen = (inject_filter_ref == NULL);
977
978 if (imo != NULL) {
979 ipf_pktopts.ippo_flags |=
980 IPPOF_MCAST_OPTS;
981 ipf_pktopts.ippo_mcast_ifnet = ifp;
982 ipf_pktopts.ippo_mcast_ttl = ttl;
983 ipf_pktopts.ippo_mcast_loop = loop;
984 }
985
986 ipf_ref();
987
988 /*
989 * 4135317 - always pass network byte
990 * order to filter
991 */
992#if BYTE_ORDER != BIG_ENDIAN
993 HTONS(ip->ip_len);
994 HTONS(ip->ip_off);
995#endif
996 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
997 if (seen == 0) {
998 if ((struct ipfilter *)
999 inject_filter_ref == filter) {
1000 seen = 1;
1001 }
1002 } else if (filter->ipf_filter.
1003 ipf_output != NULL) {
1004 errno_t result;
1005 result = filter->ipf_filter.
1006 ipf_output(filter->
1007 ipf_filter.cookie,
1008 (mbuf_t *)&m, ippo);
1009 if (result == EJUSTRETURN) {
1010 ipf_unref();
1011 INM_REMREF(inm);
1012 goto done;
1013 }
1014 if (result != 0) {
1015 ipf_unref();
1016 INM_REMREF(inm);
1017 goto bad;
1018 }
1019 }
1020 }
1021
1022 /* set back to host byte order */
1023 ip = mtod(m, struct ip *);
1024#if BYTE_ORDER != BIG_ENDIAN
1025 NTOHS(ip->ip_len);
1026 NTOHS(ip->ip_off);
1027#endif
1028 ipf_unref();
1029 ipobf.didfilter = true;
1030 }
1031 ip_mloopback(srcifp, ifp, m, dst, hlen);
1032 }
1033 if (inm != NULL) {
1034 INM_REMREF(inm);
1035 }
1036 /*
1037 * Multicasts with a time-to-live of zero may be looped-
1038 * back, above, but must not be transmitted on a network.
1039 * Also, multicasts addressed to the loopback interface
1040 * are not sent -- the above call to ip_mloopback() will
1041 * loop back a copy if this host actually belongs to the
1042 * destination group on the loopback interface.
1043 */
1044 if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
1045 m_freem(m);
1046 goto done;
1047 }
1048
1049 goto sendit;
1050 }
1051 /*
1052 * If source address not specified yet, use address
1053 * of outgoing interface.
1054 */
1055 if (ip->ip_src.s_addr == INADDR_ANY) {
1056 IFA_LOCK_SPIN(&ia->ia_ifa);
1057 ip->ip_src = IA_SIN(ia)->sin_addr;
1058 IFA_UNLOCK(&ia->ia_ifa);
1059 }
1060
1061 /*
1062 * Look for broadcast address and
1063 * and verify user is allowed to send
1064 * such a packet.
1065 */
1066 if (ipobf.isbroadcast) {
1067 if (!(ifp->if_flags & IFF_BROADCAST)) {
1068 error = EADDRNOTAVAIL;
1069 goto bad;
1070 }
1071 if (!(flags & IP_ALLOWBROADCAST)) {
1072 error = EACCES;
1073 goto bad;
1074 }
1075 /* don't allow broadcast messages to be fragmented */
1076 if ((u_short)ip->ip_len > ifp->if_mtu) {
1077 error = EMSGSIZE;
1078 goto bad;
1079 }
1080 m->m_flags |= M_BCAST;
1081 } else {
1082 m->m_flags &= ~M_BCAST;
1083 }
1084
1085sendit:
1086#if PF
1087 /* Invoke outbound packet filter */
1088 if (PF_IS_ENABLED) {
1089 int rc;
1090
1091 m0 = m; /* Save for later */
1092#if DUMMYNET
1093 rc = ip_output_pf_dn_hook(ifp, mppn, mp: &m, dn_pf_rule, ro, dst, flags, ipoa);
1094#else /* DUMMYNET */
1095 rc = pf_af_hook(ifp, mppn, &m, AF_INET, FALSE, NULL);
1096#endif /* DUMMYNET */
1097 if (rc != 0 || m == NULL) {
1098 /* Move to the next packet */
1099 m = *mppn;
1100
1101 /* Skip ahead if first packet in list got dropped */
1102 if (packetlist == m0) {
1103 packetlist = m;
1104 }
1105
1106 if (m != NULL) {
1107 m0 = m;
1108 /* Next packet in the chain */
1109 goto loopit;
1110 } else if (packetlist != NULL) {
1111 /* No more packet; send down the chain */
1112 goto sendchain;
1113 }
1114 /* Nothing left; we're done */
1115 goto done;
1116 }
1117 m0 = m;
1118 ip = mtod(m, struct ip *);
1119 pkt_dst = ip->ip_dst;
1120 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1121 }
1122#endif /* PF */
1123 /*
1124 * Force IP TTL to 255 following draft-ietf-zeroconf-ipv4-linklocal.txt
1125 */
1126 if (IN_LINKLOCAL(ntohl(ip->ip_src.s_addr)) ||
1127 IN_LINKLOCAL(ntohl(ip->ip_dst.s_addr))) {
1128 ip_linklocal_stat.iplls_out_total++;
1129 if (ip->ip_ttl != MAXTTL) {
1130 ip_linklocal_stat.iplls_out_badttl++;
1131 ip->ip_ttl = MAXTTL;
1132 }
1133 }
1134
1135 if (!ipobf.didfilter &&
1136 !TAILQ_EMPTY(&ipv4_filters)
1137#if NECP
1138 && !necp_packet_should_skip_filters(packet: m)
1139#endif // NECP
1140 ) {
1141 struct ipfilter *filter;
1142 int seen = (inject_filter_ref == NULL);
1143 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1144
1145 /*
1146 * Check that a TSO frame isn't passed to a filter.
1147 * This could happen if a filter is inserted while
1148 * TCP is sending the TSO packet.
1149 */
1150 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1151 error = EMSGSIZE;
1152 goto bad;
1153 }
1154
1155 ipf_ref();
1156
1157 /* 4135317 - always pass network byte order to filter */
1158#if BYTE_ORDER != BIG_ENDIAN
1159 HTONS(ip->ip_len);
1160 HTONS(ip->ip_off);
1161#endif
1162 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1163 if (seen == 0) {
1164 if ((struct ipfilter *)inject_filter_ref ==
1165 filter) {
1166 seen = 1;
1167 }
1168 } else if (filter->ipf_filter.ipf_output) {
1169 errno_t result;
1170 result = filter->ipf_filter.
1171 ipf_output(filter->ipf_filter.cookie,
1172 (mbuf_t *)&m, ippo);
1173 if (result == EJUSTRETURN) {
1174 ipf_unref();
1175 goto done;
1176 }
1177 if (result != 0) {
1178 ipf_unref();
1179 goto bad;
1180 }
1181 }
1182 }
1183 /* set back to host byte order */
1184 ip = mtod(m, struct ip *);
1185#if BYTE_ORDER != BIG_ENDIAN
1186 NTOHS(ip->ip_len);
1187 NTOHS(ip->ip_off);
1188#endif
1189 ipf_unref();
1190 }
1191
1192#if NECP
1193 /* Process Network Extension Policy. Will Pass, Drop, or Rebind packet. */
1194 necp_matched_policy_id = necp_ip_output_find_policy_match(packet: m,
1195 flags, ipoa: (flags & IP_OUTARGS) ? ipoa : NULL, rt: ro ? ro->ro_rt : NULL, result: &necp_result, result_parameter: &necp_result_parameter);
1196 if (necp_matched_policy_id) {
1197 necp_mark_packet_from_ip(packet: m, policy_id: necp_matched_policy_id);
1198 switch (necp_result) {
1199 case NECP_KERNEL_POLICY_RESULT_PASS:
1200 if (necp_result_parameter.pass_flags & NECP_KERNEL_POLICY_PASS_NO_SKIP_IPSEC) {
1201 break;
1202 }
1203 /* Check if the interface is allowed */
1204 if (!necp_packet_is_allowed_over_interface(packet: m, interface: ifp)) {
1205 error = EHOSTUNREACH;
1206 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1207 goto bad;
1208 }
1209 goto skip_ipsec;
1210 case NECP_KERNEL_POLICY_RESULT_DROP:
1211 case NECP_KERNEL_POLICY_RESULT_SOCKET_DIVERT:
1212 /* Flow divert packets should be blocked at the IP layer */
1213 error = EHOSTUNREACH;
1214 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1215 goto bad;
1216 case NECP_KERNEL_POLICY_RESULT_IP_TUNNEL: {
1217 /* Verify that the packet is being routed to the tunnel */
1218 struct ifnet *policy_ifp = necp_get_ifnet_from_result_parameter(result_parameter: &necp_result_parameter);
1219 if (policy_ifp == ifp) {
1220 /* Check if the interface is allowed */
1221 if (!necp_packet_is_allowed_over_interface(packet: m, interface: ifp)) {
1222 error = EHOSTUNREACH;
1223 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1224 goto bad;
1225 }
1226 goto skip_ipsec;
1227 } else {
1228 if (necp_packet_can_rebind_to_ifnet(packet: m, interface: policy_ifp, new_route: &necp_route, AF_INET)) {
1229 /* Check if the interface is allowed */
1230 if (!necp_packet_is_allowed_over_interface(packet: m, interface: policy_ifp)) {
1231 error = EHOSTUNREACH;
1232 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1233 goto bad;
1234 }
1235
1236 /*
1237 * Update the QOS marking policy if
1238 * 1. up layer asks it to do so
1239 * 2. net_qos_policy_restricted is not set
1240 * 3. qos_marking_gencount doesn't match necp_kernel_socket_policies_gencount (checked in necp_lookup_current_qos_marking)
1241 */
1242 if (ipoa != NULL &&
1243 (ipoa->ipoa_flags & IPOAF_REDO_QOSMARKING_POLICY) &&
1244 net_qos_policy_restricted != 0) {
1245 bool qos_marking = (ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED) ? TRUE : FALSE;
1246 qos_marking = necp_lookup_current_qos_marking(qos_marking_gencount: &ipoa->qos_marking_gencount, NULL, interface: policy_ifp, route_rule_id: necp_result_parameter.route_rule_id, old_qos_marking: qos_marking);
1247 if (qos_marking) {
1248 ipoa->ipoa_flags |= IPOAF_QOSMARKING_ALLOWED;
1249 } else {
1250 ipoa->ipoa_flags &= ~IPOAF_QOSMARKING_ALLOWED;
1251 }
1252 }
1253
1254 /* Set ifp to the tunnel interface, since it is compatible with the packet */
1255 ifp = policy_ifp;
1256 ro = &necp_route;
1257 goto skip_ipsec;
1258 } else {
1259 error = ENETUNREACH;
1260 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1261 goto bad;
1262 }
1263 }
1264 }
1265 default:
1266 break;
1267 }
1268 }
1269 /* Catch-all to check if the interface is allowed */
1270 if (!necp_packet_is_allowed_over_interface(packet: m, interface: ifp)) {
1271 error = EHOSTUNREACH;
1272 OSAddAtomic(1, &ipstat.ips_necp_policy_drop);
1273 goto bad;
1274 }
1275#endif /* NECP */
1276
1277#if IPSEC
1278 if (ipsec_bypass != 0 || (flags & IP_NOIPSEC)) {
1279 goto skip_ipsec;
1280 }
1281
1282 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_START, 0, 0, 0, 0, 0);
1283
1284 if (sp == NULL) {
1285 /* get SP for this packet */
1286 if (so != NULL) {
1287 sp = ipsec4_getpolicybysock(m, IPSEC_DIR_OUTBOUND,
1288 so, &error);
1289 } else {
1290 sp = ipsec4_getpolicybyaddr(m, IPSEC_DIR_OUTBOUND,
1291 flags, &error);
1292 }
1293 if (sp == NULL) {
1294 IPSEC_STAT_INCREMENT(ipsecstat.out_inval);
1295 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1296 0, 0, 0, 0, 0);
1297 goto bad;
1298 }
1299 }
1300
1301 error = 0;
1302
1303 /* check policy */
1304 switch (sp->policy) {
1305 case IPSEC_POLICY_DISCARD:
1306 case IPSEC_POLICY_GENERATE:
1307 /*
1308 * This packet is just discarded.
1309 */
1310 IPSEC_STAT_INCREMENT(ipsecstat.out_polvio);
1311 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1312 1, 0, 0, 0, 0);
1313 goto bad;
1314
1315 case IPSEC_POLICY_BYPASS:
1316 case IPSEC_POLICY_NONE:
1317 /* no need to do IPsec. */
1318 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1319 2, 0, 0, 0, 0);
1320 goto skip_ipsec;
1321
1322 case IPSEC_POLICY_IPSEC:
1323 if (sp->req == NULL) {
1324 /* acquire a policy */
1325 error = key_spdacquire(sp);
1326 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1327 3, 0, 0, 0, 0);
1328 goto bad;
1329 }
1330 if (sp->ipsec_if) {
1331 /* Verify the redirect to ipsec interface */
1332 if (sp->ipsec_if == ifp) {
1333 goto skip_ipsec;
1334 }
1335 goto bad;
1336 }
1337 break;
1338
1339 case IPSEC_POLICY_ENTRUST:
1340 default:
1341 printf("ip_output: Invalid policy found. %d\n", sp->policy);
1342 }
1343 {
1344 ipsec_state.m = m;
1345 if (flags & IP_ROUTETOIF) {
1346 bzero(s: &ipsec_state.ro, n: sizeof(ipsec_state.ro));
1347 } else {
1348 route_copyout((struct route *)&ipsec_state.ro, ro, sizeof(struct route));
1349 }
1350 ipsec_state.dst = SA(dst);
1351
1352 ip->ip_sum = 0;
1353
1354 /*
1355 * XXX
1356 * delayed checksums are not currently compatible with IPsec
1357 */
1358 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
1359 in_delayed_cksum(m);
1360 }
1361
1362#if BYTE_ORDER != BIG_ENDIAN
1363 HTONS(ip->ip_len);
1364 HTONS(ip->ip_off);
1365#endif
1366
1367 DTRACE_IP6(send, struct mbuf *, m, struct inpcb *, NULL,
1368 struct ip *, ip, struct ifnet *, ifp,
1369 struct ip *, ip, struct ip6_hdr *, NULL);
1370
1371 error = ipsec4_output(&ipsec_state, sp, flags);
1372 if (ipsec_state.tunneled == 6) {
1373 m0 = m = NULL;
1374 error = 0;
1375 goto bad;
1376 }
1377
1378 m0 = m = ipsec_state.m;
1379
1380#if DUMMYNET
1381 /*
1382 * If we're about to use the route in ipsec_state
1383 * and this came from dummynet, cleaup now.
1384 */
1385 if (ro == &saved_route &&
1386 (!(flags & IP_ROUTETOIF) || ipsec_state.tunneled)) {
1387 ROUTE_RELEASE(ro);
1388 }
1389#endif /* DUMMYNET */
1390
1391 if (flags & IP_ROUTETOIF) {
1392 /*
1393 * if we have tunnel mode SA, we may need to ignore
1394 * IP_ROUTETOIF.
1395 */
1396 if (ipsec_state.tunneled) {
1397 flags &= ~IP_ROUTETOIF;
1398 ro = (struct route *)&ipsec_state.ro;
1399 }
1400 } else {
1401 ro = (struct route *)&ipsec_state.ro;
1402 }
1403 dst = SIN(ipsec_state.dst);
1404 if (error) {
1405 /* mbuf is already reclaimed in ipsec4_output. */
1406 m0 = NULL;
1407 switch (error) {
1408 case EHOSTUNREACH:
1409 case ENETUNREACH:
1410 case EMSGSIZE:
1411 case ENOBUFS:
1412 case ENOMEM:
1413 break;
1414 default:
1415 printf("ip4_output (ipsec): error code %d\n", error);
1416 OS_FALLTHROUGH;
1417 case ENOENT:
1418 /* don't show these error codes to the user */
1419 error = 0;
1420 break;
1421 }
1422 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1423 4, 0, 0, 0, 0);
1424 goto bad;
1425 }
1426 }
1427
1428 /* be sure to update variables that are affected by ipsec4_output() */
1429 ip = mtod(m, struct ip *);
1430
1431#ifdef _IP_VHL
1432 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1433#else /* !_IP_VHL */
1434 hlen = ip->ip_hl << 2;
1435#endif /* !_IP_VHL */
1436 /* Check that there wasn't a route change and src is still valid */
1437 if (ROUTE_UNUSABLE(ro)) {
1438 ROUTE_RELEASE(ro);
1439 VERIFY(src_ia == NULL);
1440 if (ip->ip_src.s_addr != INADDR_ANY &&
1441 !(flags & (IP_ROUTETOIF | IP_FORWARDING)) &&
1442 (src_ia = ifa_foraddr(ip->ip_src.s_addr)) == NULL) {
1443 error = EADDRNOTAVAIL;
1444 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1445 5, 0, 0, 0, 0);
1446 goto bad;
1447 }
1448 if (src_ia != NULL) {
1449 ifa_remref(ifa: &src_ia->ia_ifa);
1450 src_ia = NULL;
1451 }
1452 }
1453
1454 if (ro->ro_rt == NULL) {
1455 if (!(flags & IP_ROUTETOIF)) {
1456 printf("%s: can't update route after "
1457 "IPsec processing\n", __func__);
1458 error = EHOSTUNREACH; /* XXX */
1459 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1460 6, 0, 0, 0, 0);
1461 goto bad;
1462 }
1463 } else {
1464 if (ia != NULL) {
1465 ifa_remref(ifa: &ia->ia_ifa);
1466 }
1467 RT_LOCK_SPIN(ro->ro_rt);
1468 ia = ifatoia(ro->ro_rt->rt_ifa);
1469 if (ia != NULL) {
1470 /* Become a regular mutex */
1471 RT_CONVERT_LOCK(ro->ro_rt);
1472 ifa_addref(ifa: &ia->ia_ifa);
1473 }
1474 ifp = ro->ro_rt->rt_ifp;
1475 RT_UNLOCK(ro->ro_rt);
1476 }
1477
1478 /* make it flipped, again. */
1479#if BYTE_ORDER != BIG_ENDIAN
1480 NTOHS(ip->ip_len);
1481 NTOHS(ip->ip_off);
1482#endif
1483 KERNEL_DEBUG(DBG_FNC_IPSEC4_OUTPUT | DBG_FUNC_END,
1484 7, 0xff, 0xff, 0xff, 0xff);
1485
1486 /* Pass to filters again */
1487 if (!TAILQ_EMPTY(&ipv4_filters)
1488#if NECP
1489 && !necp_packet_should_skip_filters(packet: m)
1490#endif // NECP
1491 ) {
1492 struct ipfilter *filter;
1493
1494 ipf_pktopts.ippo_flags &= ~IPPOF_MCAST_OPTS;
1495
1496 /*
1497 * Check that a TSO frame isn't passed to a filter.
1498 * This could happen if a filter is inserted while
1499 * TCP is sending the TSO packet.
1500 */
1501 if (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) {
1502 error = EMSGSIZE;
1503 goto bad;
1504 }
1505
1506 ipf_ref();
1507
1508 /* 4135317 - always pass network byte order to filter */
1509#if BYTE_ORDER != BIG_ENDIAN
1510 HTONS(ip->ip_len);
1511 HTONS(ip->ip_off);
1512#endif
1513 TAILQ_FOREACH(filter, &ipv4_filters, ipf_link) {
1514 if (filter->ipf_filter.ipf_output) {
1515 errno_t result;
1516 result = filter->ipf_filter.
1517 ipf_output(filter->ipf_filter.cookie,
1518 (mbuf_t *)&m, ippo);
1519 if (result == EJUSTRETURN) {
1520 ipf_unref();
1521 goto done;
1522 }
1523 if (result != 0) {
1524 ipf_unref();
1525 goto bad;
1526 }
1527 }
1528 }
1529 /* set back to host byte order */
1530 ip = mtod(m, struct ip *);
1531#if BYTE_ORDER != BIG_ENDIAN
1532 NTOHS(ip->ip_len);
1533 NTOHS(ip->ip_off);
1534#endif
1535 ipf_unref();
1536 }
1537skip_ipsec:
1538#endif /* IPSEC */
1539
1540
1541 /* 127/8 must not appear on wire - RFC1122 */
1542 if (!(ifp->if_flags & IFF_LOOPBACK) &&
1543 ((ntohl(ip->ip_src.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET ||
1544 (ntohl(ip->ip_dst.s_addr) >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET)) {
1545 OSAddAtomic(1, &ipstat.ips_badaddr);
1546 error = EADDRNOTAVAIL;
1547 goto bad;
1548 }
1549
1550 if (ipoa != NULL) {
1551 u_int8_t dscp = ip->ip_tos >> IPTOS_DSCP_SHIFT;
1552
1553 error = set_packet_qos(m, ifp,
1554 ipoa->ipoa_flags & IPOAF_QOSMARKING_ALLOWED ? TRUE : FALSE,
1555 ipoa->ipoa_sotc, ipoa->ipoa_netsvctype, &dscp);
1556 if (error == 0) {
1557 ip->ip_tos &= IPTOS_ECN_MASK;
1558 ip->ip_tos |= (u_char)(dscp << IPTOS_DSCP_SHIFT);
1559 } else {
1560 printf("%s if_dscp_for_mbuf() error %d\n", __func__, error);
1561 error = 0;
1562 }
1563 }
1564
1565 ip_output_checksum(ifp, m, (IP_VHL_HL(ip->ip_vhl) << 2),
1566 ip->ip_len, &sw_csum);
1567
1568 interface_mtu = ifp->if_mtu;
1569
1570 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
1571 interface_mtu = IN6_LINKMTU(ifp);
1572 /* Further adjust the size for CLAT46 expansion */
1573 interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
1574 }
1575
1576 /*
1577 * If small enough for interface, or the interface will take
1578 * care of the fragmentation for us, can just send directly.
1579 */
1580 if ((u_short)ip->ip_len <= interface_mtu || TSO_IPV4_OK(ifp, m) ||
1581 (!(ip->ip_off & IP_DF) && (ifp->if_hwassist & CSUM_FRAGMENT))) {
1582#if BYTE_ORDER != BIG_ENDIAN
1583 HTONS(ip->ip_len);
1584 HTONS(ip->ip_off);
1585#endif
1586
1587 ip->ip_sum = 0;
1588 if ((sw_csum & CSUM_DELAY_IP) || __improbable(force_ipsum != 0)) {
1589 ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1590 sw_csum &= ~CSUM_DELAY_IP;
1591 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1592 }
1593
1594#if IPSEC
1595 /* clean ipsec history once it goes out of the node */
1596 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
1597 ipsec_delaux(m);
1598 }
1599#endif /* IPSEC */
1600 if ((m->m_pkthdr.csum_flags & CSUM_TSO_IPV4) &&
1601 (m->m_pkthdr.tso_segsz > 0)) {
1602 scnt += m->m_pkthdr.len / m->m_pkthdr.tso_segsz;
1603 } else {
1604 scnt++;
1605 }
1606
1607 if (packetchain == 0) {
1608 if (ro->ro_rt != NULL && nstat_collect) {
1609 nstat_route_tx(rte: ro->ro_rt, packets: scnt,
1610 bytes: m->m_pkthdr.len, flags: 0);
1611 }
1612
1613 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1614 SA(dst), 0, adv);
1615 if (dlil_verbose && error) {
1616 printf("dlil_output error on interface %s: %d\n",
1617 ifp->if_xname, error);
1618 }
1619 scnt = 0;
1620 goto done;
1621 } else {
1622 /*
1623 * packet chaining allows us to reuse the
1624 * route for all packets
1625 */
1626 bytecnt += m->m_pkthdr.len;
1627 mppn = &m->m_nextpkt;
1628 m = m->m_nextpkt;
1629 if (m == NULL) {
1630#if PF
1631sendchain:
1632#endif /* PF */
1633 if (pktcnt > ip_maxchainsent) {
1634 ip_maxchainsent = pktcnt;
1635 }
1636 if (ro->ro_rt != NULL && nstat_collect) {
1637 nstat_route_tx(rte: ro->ro_rt, packets: scnt,
1638 bytes: bytecnt, flags: 0);
1639 }
1640
1641 error = dlil_output(ifp, PF_INET, packetlist,
1642 ro->ro_rt, SA(dst), 0, adv);
1643 if (dlil_verbose && error) {
1644 printf("dlil_output error on interface %s: %d\n",
1645 ifp->if_xname, error);
1646 }
1647 pktcnt = 0;
1648 scnt = 0;
1649 bytecnt = 0;
1650 goto done;
1651 }
1652 m0 = m;
1653 pktcnt++;
1654 goto loopit;
1655 }
1656 }
1657
1658 VERIFY(interface_mtu != 0);
1659 /*
1660 * Too large for interface; fragment if possible.
1661 * Must be able to put at least 8 bytes per fragment.
1662 * Balk when DF bit is set or the interface didn't support TSO.
1663 */
1664 if ((ip->ip_off & IP_DF) || pktcnt > 0 ||
1665 (m->m_pkthdr.csum_flags & CSUM_TSO_IPV4)) {
1666 error = EMSGSIZE;
1667 /*
1668 * This case can happen if the user changed the MTU
1669 * of an interface after enabling IP on it. Because
1670 * most netifs don't keep track of routes pointing to
1671 * them, there is no way for one to update all its
1672 * routes when the MTU is changed.
1673 */
1674 if (ro->ro_rt) {
1675 RT_LOCK_SPIN(ro->ro_rt);
1676 if ((ro->ro_rt->rt_flags & (RTF_UP | RTF_HOST)) &&
1677 !(ro->ro_rt->rt_rmx.rmx_locks & RTV_MTU) &&
1678 (ro->ro_rt->rt_rmx.rmx_mtu > interface_mtu)) {
1679 ro->ro_rt->rt_rmx.rmx_mtu = interface_mtu;
1680 }
1681 RT_UNLOCK(ro->ro_rt);
1682 }
1683 if (pktcnt > 0) {
1684 m0 = packetlist;
1685 }
1686 OSAddAtomic(1, &ipstat.ips_cantfrag);
1687 goto bad;
1688 }
1689
1690 /*
1691 * XXX Only TCP seems to be passing a list of packets here.
1692 * The following issue is limited to UDP datagrams with 0 checksum.
1693 * For now limit it to the case when single packet is passed down.
1694 */
1695 if (packetchain == 0 && IS_INTF_CLAT46(ifp)) {
1696 /*
1697 * If it is a UDP packet that has checksum set to 0
1698 * and is also not being offloaded, compute a full checksum
1699 * and update the UDP checksum.
1700 */
1701 if (ip->ip_p == IPPROTO_UDP &&
1702 !(m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_PARTIAL))) {
1703 struct udphdr *uh = NULL;
1704
1705 if (m->m_len < hlen + sizeof(struct udphdr)) {
1706 m = m_pullup(m, hlen + sizeof(struct udphdr));
1707 if (m == NULL) {
1708 error = ENOBUFS;
1709 m0 = m;
1710 goto bad;
1711 }
1712 m0 = m;
1713 ip = mtod(m, struct ip *);
1714 }
1715 /*
1716 * Get UDP header and if checksum is 0, then compute the full
1717 * checksum.
1718 */
1719 uh = (struct udphdr *)(void *)((caddr_t)ip + hlen);
1720 if (uh->uh_sum == 0) {
1721 uh->uh_sum = inet_cksum(m, IPPROTO_UDP, hlen,
1722 ip->ip_len - hlen);
1723 if (uh->uh_sum == 0) {
1724 uh->uh_sum = 0xffff;
1725 }
1726 }
1727 }
1728 }
1729
1730 error = ip_fragment(m, ifp, interface_mtu, sw_csum);
1731 if (error != 0) {
1732 m0 = m = NULL;
1733 goto bad;
1734 }
1735
1736 KERNEL_DEBUG(DBG_LAYER_END, ip->ip_dst.s_addr,
1737 ip->ip_src.s_addr, ip->ip_p, ip->ip_off, ip->ip_len);
1738
1739 for (m = m0; m; m = m0) {
1740 m0 = m->m_nextpkt;
1741 m->m_nextpkt = 0;
1742#if IPSEC
1743 /* clean ipsec history once it goes out of the node */
1744 if (ipsec_bypass == 0 && !(flags & IP_NOIPSEC)) {
1745 ipsec_delaux(m);
1746 }
1747#endif /* IPSEC */
1748 if (error == 0) {
1749 if ((packetchain != 0) && (pktcnt > 0)) {
1750 panic("%s: mix of packet in packetlist is "
1751 "wrong=%p", __func__, packetlist);
1752 /* NOTREACHED */
1753 }
1754 if (ro->ro_rt != NULL && nstat_collect) {
1755 nstat_route_tx(rte: ro->ro_rt, packets: 1,
1756 bytes: m->m_pkthdr.len, flags: 0);
1757 }
1758 error = dlil_output(ifp, PF_INET, m, ro->ro_rt,
1759 SA(dst), 0, adv);
1760 if (dlil_verbose && error) {
1761 printf("dlil_output error on interface %s: %d\n",
1762 ifp->if_xname, error);
1763 }
1764 } else {
1765 m_freem(m);
1766 }
1767 }
1768
1769 if (error == 0) {
1770 OSAddAtomic(1, &ipstat.ips_fragmented);
1771 }
1772
1773done:
1774 if (ia != NULL) {
1775 ifa_remref(ifa: &ia->ia_ifa);
1776 ia = NULL;
1777 }
1778#if IPSEC
1779 ROUTE_RELEASE(&ipsec_state.ro);
1780 if (sp != NULL) {
1781 KEYDEBUG(KEYDEBUG_IPSEC_STAMP,
1782 printf("DP ip_output call free SP:%x\n", sp));
1783 key_freesp(sp, KEY_SADB_UNLOCKED);
1784 }
1785#endif /* IPSEC */
1786#if NECP
1787 ROUTE_RELEASE(&necp_route);
1788#endif /* NECP */
1789#if DUMMYNET
1790 ROUTE_RELEASE(&saved_route);
1791#endif /* DUMMYNET */
1792
1793 KERNEL_DEBUG(DBG_FNC_IP_OUTPUT | DBG_FUNC_END, error, 0, 0, 0, 0);
1794 if (ip_output_measure) {
1795 net_perf_measure_time(npp: &net_perf, start: &start_tv, num_pkts: packets_processed);
1796 net_perf_histogram(npp: &net_perf, num_pkts: packets_processed);
1797 }
1798 return error;
1799bad:
1800 if (pktcnt > 0) {
1801 m0 = packetlist;
1802 }
1803 m_freem_list(m0);
1804 goto done;
1805
1806#undef ipsec_state
1807#undef args
1808#undef sro_fwd
1809#undef saved_route
1810#undef ipf_pktopts
1811#undef IP_CHECK_RESTRICTIONS
1812}
1813
1814int
1815ip_fragment(struct mbuf *m, struct ifnet *ifp, uint32_t mtu, int sw_csum)
1816{
1817 struct ip *ip, *mhip;
1818 int len, hlen, mhlen, firstlen, off, error = 0;
1819 struct mbuf **mnext = &m->m_nextpkt, *m0;
1820 int nfrags = 1;
1821
1822 ip = mtod(m, struct ip *);
1823#ifdef _IP_VHL
1824 hlen = IP_VHL_HL(ip->ip_vhl) << 2;
1825#else /* !_IP_VHL */
1826 hlen = ip->ip_hl << 2;
1827#endif /* !_IP_VHL */
1828
1829 /*
1830 * We need to adjust the fragment sizes to account
1831 * for IPv6 fragment header if it needs to be translated
1832 * from IPv4 to IPv6.
1833 */
1834 if (IS_INTF_CLAT46(ifp)) {
1835 mtu -= sizeof(struct ip6_frag);
1836 }
1837
1838 firstlen = len = (mtu - hlen) & ~7;
1839 if (len < 8) {
1840 m_freem(m);
1841 return EMSGSIZE;
1842 }
1843
1844 /*
1845 * if the interface will not calculate checksums on
1846 * fragmented packets, then do it here.
1847 */
1848 if ((m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) &&
1849 !(ifp->if_hwassist & CSUM_IP_FRAGS)) {
1850 in_delayed_cksum(m);
1851 }
1852
1853 /*
1854 * Loop through length of segment after first fragment,
1855 * make new header and copy data of each part and link onto chain.
1856 */
1857 m0 = m;
1858 mhlen = sizeof(struct ip);
1859 for (off = hlen + len; off < (u_short)ip->ip_len; off += len) {
1860 MGETHDR(m, M_DONTWAIT, MT_HEADER); /* MAC-OK */
1861 if (m == NULL) {
1862 error = ENOBUFS;
1863 OSAddAtomic(1, &ipstat.ips_odropped);
1864 goto sendorfree;
1865 }
1866 m->m_flags |= (m0->m_flags & M_MCAST) | M_FRAG;
1867 m->m_data += max_linkhdr;
1868 mhip = mtod(m, struct ip *);
1869 *mhip = *ip;
1870 if (hlen > sizeof(struct ip)) {
1871 mhlen = ip_optcopy(ip, mhip) + sizeof(struct ip);
1872 mhip->ip_vhl = IP_MAKE_VHL(IPVERSION, mhlen >> 2);
1873 }
1874 m->m_len = mhlen;
1875 mhip->ip_off = (u_short)(((off - hlen) >> 3) + (ip->ip_off & ~IP_MF));
1876 if (ip->ip_off & IP_MF) {
1877 mhip->ip_off |= IP_MF;
1878 }
1879 if (off + len >= (u_short)ip->ip_len) {
1880 len = (u_short)ip->ip_len - off;
1881 } else {
1882 mhip->ip_off |= IP_MF;
1883 }
1884 mhip->ip_len = htons((u_short)(len + mhlen));
1885 m->m_next = m_copy(m0, off, len);
1886 if (m->m_next == NULL) {
1887 (void) m_free(m);
1888 error = ENOBUFS; /* ??? */
1889 OSAddAtomic(1, &ipstat.ips_odropped);
1890 goto sendorfree;
1891 }
1892 m->m_pkthdr.len = mhlen + len;
1893 m->m_pkthdr.rcvif = NULL;
1894 m->m_pkthdr.csum_flags = m0->m_pkthdr.csum_flags;
1895
1896 M_COPY_CLASSIFIER(m, m0);
1897 M_COPY_PFTAG(m, m0);
1898 M_COPY_NECPTAG(m, m0);
1899
1900#if BYTE_ORDER != BIG_ENDIAN
1901 HTONS(mhip->ip_off);
1902#endif
1903
1904 mhip->ip_sum = 0;
1905 if (sw_csum & CSUM_DELAY_IP) {
1906 mhip->ip_sum = ip_cksum_hdr_out(m, mhlen);
1907 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1908 }
1909 *mnext = m;
1910 mnext = &m->m_nextpkt;
1911 nfrags++;
1912 }
1913 OSAddAtomic(nfrags, &ipstat.ips_ofragments);
1914
1915 /* set first/last markers for fragment chain */
1916 m->m_flags |= M_LASTFRAG;
1917 m0->m_flags |= M_FIRSTFRAG | M_FRAG;
1918 m0->m_pkthdr.csum_data = nfrags;
1919
1920 /*
1921 * Update first fragment by trimming what's been copied out
1922 * and updating header, then send each fragment (in order).
1923 */
1924 m = m0;
1925 m_adj(m, hlen + firstlen - (u_short)ip->ip_len);
1926 m->m_pkthdr.len = hlen + firstlen;
1927 ip->ip_len = htons((u_short)m->m_pkthdr.len);
1928 ip->ip_off |= IP_MF;
1929
1930#if BYTE_ORDER != BIG_ENDIAN
1931 HTONS(ip->ip_off);
1932#endif
1933
1934 ip->ip_sum = 0;
1935 if (sw_csum & CSUM_DELAY_IP) {
1936 ip->ip_sum = ip_cksum_hdr_out(m, hlen);
1937 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
1938 }
1939sendorfree:
1940 if (error) {
1941 m_freem_list(m0);
1942 }
1943
1944 return error;
1945}
1946
/*
 * Account for `len' bytes of outbound payload checksummed in software,
 * dispatching to the per-protocol counter routine.  Only TCP and UDP
 * are tracked for now; any other protocol is ignored.
 */
static void
ip_out_cksum_stats(int proto, u_int32_t len)
{
	if (proto == IPPROTO_TCP) {
		tcp_out_cksum_stats(len);
	} else if (proto == IPPROTO_UDP) {
		udp_out_cksum_stats(len);
	}
}
1962
1963/*
1964 * Process a delayed payload checksum calculation (outbound path.)
1965 *
1966 * hoff is the number of bytes beyond the mbuf data pointer which
1967 * points to the IP header.
1968 *
1969 * Returns a bitmask representing all the work done in software.
1970 */
1971uint32_t
1972in_finalize_cksum(struct mbuf *m, uint32_t hoff, uint32_t csum_flags)
1973{
1974 unsigned char buf[15 << 2] __attribute__((aligned(8)));
1975 struct ip *ip;
1976 uint32_t offset, _hlen, mlen, hlen, len, sw_csum;
1977 uint16_t csum, ip_len;
1978
1979 _CASSERT(sizeof(csum) == sizeof(uint16_t));
1980 VERIFY(m->m_flags & M_PKTHDR);
1981
1982 sw_csum = (csum_flags & m->m_pkthdr.csum_flags);
1983
1984 if ((sw_csum &= (CSUM_DELAY_IP | CSUM_DELAY_DATA)) == 0) {
1985 goto done;
1986 }
1987
1988 mlen = m->m_pkthdr.len; /* total mbuf len */
1989
1990 /* sanity check (need at least simple IP header) */
1991 if (mlen < (hoff + sizeof(*ip))) {
1992 panic("%s: mbuf %p pkt len (%u) < hoff+ip_hdr "
1993 "(%u+%u)\n", __func__, m, mlen, hoff,
1994 (uint32_t)sizeof(*ip));
1995 /* NOTREACHED */
1996 }
1997
1998 /*
1999 * In case the IP header is not contiguous, or not 32-bit aligned,
2000 * or if we're computing the IP header checksum, copy it to a local
2001 * buffer. Copy only the simple IP header here (IP options case
2002 * is handled below.)
2003 */
2004 if ((sw_csum & CSUM_DELAY_IP) || (hoff + sizeof(*ip)) > m->m_len ||
2005 !IP_HDR_ALIGNED_P(mtod(m, caddr_t) + hoff)) {
2006 m_copydata(m, hoff, sizeof(*ip), (caddr_t)buf);
2007 ip = (struct ip *)(void *)buf;
2008 _hlen = sizeof(*ip);
2009 } else {
2010 ip = (struct ip *)(void *)(m->m_data + hoff);
2011 _hlen = 0;
2012 }
2013
2014 hlen = IP_VHL_HL(ip->ip_vhl) << 2; /* IP header len */
2015
2016 /* sanity check */
2017 if (mlen < (hoff + hlen)) {
2018 panic("%s: mbuf %p pkt too short (%d) for IP header (%u), "
2019 "hoff %u", __func__, m, mlen, hlen, hoff);
2020 /* NOTREACHED */
2021 }
2022
2023 /*
2024 * We could be in the context of an IP or interface filter; in the
2025 * former case, ip_len would be in host (correct) order while for
2026 * the latter it would be in network order. Because of this, we
2027 * attempt to interpret the length field by comparing it against
2028 * the actual packet length. If the comparison fails, byte swap
2029 * the length and check again. If it still fails, use the actual
2030 * packet length. This also covers the trailing bytes case.
2031 */
2032 ip_len = ip->ip_len;
2033 if (ip_len != (mlen - hoff)) {
2034 ip_len = OSSwapInt16(ip_len);
2035 if (ip_len != (mlen - hoff)) {
2036 printf("%s: mbuf 0x%llx proto %d IP len %d (%x) "
2037 "[swapped %d (%x)] doesn't match actual packet "
2038 "length; %d is used instead\n", __func__,
2039 (uint64_t)VM_KERNEL_ADDRHASH(m), ip->ip_p,
2040 ip->ip_len, ip->ip_len, ip_len, ip_len,
2041 (mlen - hoff));
2042 if (mlen - hoff > UINT16_MAX) {
2043 panic("%s: mlen %u - hoff %u > 65535",
2044 __func__, mlen, hoff);
2045 }
2046 ip_len = (uint16_t)(mlen - hoff);
2047 }
2048 }
2049
2050 len = ip_len - hlen; /* csum span */
2051
2052 if (sw_csum & CSUM_DELAY_DATA) {
2053 uint16_t ulpoff;
2054
2055 /*
2056 * offset is added to the lower 16-bit value of csum_data,
2057 * which is expected to contain the ULP offset; therefore
2058 * CSUM_PARTIAL offset adjustment must be undone.
2059 */
2060 if ((m->m_pkthdr.csum_flags & (CSUM_PARTIAL | CSUM_DATA_VALID)) ==
2061 (CSUM_PARTIAL | CSUM_DATA_VALID)) {
2062 /*
2063 * Get back the original ULP offset (this will
2064 * undo the CSUM_PARTIAL logic in ip_output.)
2065 */
2066 m->m_pkthdr.csum_data = (m->m_pkthdr.csum_tx_stuff -
2067 m->m_pkthdr.csum_tx_start);
2068 }
2069
2070 ulpoff = (m->m_pkthdr.csum_data & 0xffff); /* ULP csum offset */
2071 offset = hoff + hlen; /* ULP header */
2072
2073 if (mlen < (ulpoff + sizeof(csum))) {
2074 panic("%s: mbuf %p pkt len (%u) proto %d invalid ULP "
2075 "cksum offset (%u) cksum flags 0x%x\n", __func__,
2076 m, mlen, ip->ip_p, ulpoff, m->m_pkthdr.csum_flags);
2077 /* NOTREACHED */
2078 }
2079
2080 csum = inet_cksum(m, 0, offset, len);
2081
2082 /* Update stats */
2083 ip_out_cksum_stats(proto: ip->ip_p, len);
2084
2085 /* RFC1122 4.1.3.4 */
2086 if (csum == 0 &&
2087 (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_ZERO_INVERT))) {
2088 csum = 0xffff;
2089 }
2090
2091 /* Insert the checksum in the ULP csum field */
2092 offset += ulpoff;
2093 if (offset + sizeof(csum) > m->m_len) {
2094 m_copyback(m, offset, sizeof(csum), &csum);
2095 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2096 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2097 } else {
2098 bcopy(src: &csum, dst: (mtod(m, char *) + offset), n: sizeof(csum));
2099 }
2100 m->m_pkthdr.csum_flags &= ~(CSUM_DELAY_DATA | CSUM_DATA_VALID |
2101 CSUM_PARTIAL | CSUM_ZERO_INVERT);
2102 }
2103
2104 if (sw_csum & CSUM_DELAY_IP) {
2105 /* IP header must be in the local buffer */
2106 VERIFY(_hlen == sizeof(*ip));
2107 if (_hlen != hlen) {
2108 VERIFY(hlen <= sizeof(buf));
2109 m_copydata(m, hoff, hlen, (caddr_t)buf);
2110 ip = (struct ip *)(void *)buf;
2111 _hlen = hlen;
2112 }
2113
2114 /*
2115 * Compute the IP header checksum as if the IP length
2116 * is the length which we believe is "correct"; see
2117 * how ip_len gets calculated above. Note that this
2118 * is done on the local copy and not on the real one.
2119 */
2120 ip->ip_len = htons(ip_len);
2121 ip->ip_sum = 0;
2122 csum = in_cksum_hdr_opt(ip);
2123
2124 /* Update stats */
2125 ipstat.ips_snd_swcsum++;
2126 ipstat.ips_snd_swcsum_bytes += hlen;
2127
2128 /*
2129 * Insert only the checksum in the existing IP header
2130 * csum field; all other fields are left unchanged.
2131 */
2132 offset = hoff + offsetof(struct ip, ip_sum);
2133 if (offset + sizeof(csum) > m->m_len) {
2134 m_copyback(m, offset, sizeof(csum), &csum);
2135 } else if (IP_HDR_ALIGNED_P(mtod(m, char *) + hoff)) {
2136 *(uint16_t *)(void *)(mtod(m, char *) + offset) = csum;
2137 } else {
2138 bcopy(src: &csum, dst: (mtod(m, char *) + offset), n: sizeof(csum));
2139 }
2140 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_IP;
2141 }
2142
2143done:
2144 return sw_csum;
2145}
2146
2147/*
2148 * Insert IP options into preformed packet.
2149 * Adjust IP destination as required for IP source routing,
2150 * as indicated by a non-zero in_addr at the start of the options.
2151 *
2152 * XXX This routine assumes that the packet has no options in place.
2153 */
2154static struct mbuf *
2155ip_insertoptions(struct mbuf *m, struct mbuf *opt, int *phlen)
2156{
2157 struct ipoption *p = mtod(opt, struct ipoption *);
2158 struct mbuf *n;
2159 struct ip *ip = mtod(m, struct ip *);
2160 unsigned optlen;
2161
2162 optlen = opt->m_len - sizeof(p->ipopt_dst);
2163 if (optlen + (u_short)ip->ip_len > IP_MAXPACKET) {
2164 return m; /* XXX should fail */
2165 }
2166 if (p->ipopt_dst.s_addr) {
2167 ip->ip_dst = p->ipopt_dst;
2168 }
2169 if (m->m_flags & M_EXT || m_mtod_current(m) - optlen < m->m_pktdat) {
2170 MGETHDR(n, M_DONTWAIT, MT_HEADER); /* MAC-OK */
2171 if (n == NULL) {
2172 return m;
2173 }
2174 n->m_pkthdr.rcvif = 0;
2175 n->m_pkthdr.len = m->m_pkthdr.len + optlen;
2176 m->m_len -= sizeof(struct ip);
2177 m->m_data += sizeof(struct ip);
2178 n->m_next = m;
2179 m = n;
2180 m->m_len = optlen + sizeof(struct ip);
2181 m->m_data += max_linkhdr;
2182 (void) memcpy(mtod(m, void *), src: ip, n: sizeof(struct ip));
2183 } else {
2184 m->m_data -= optlen;
2185 m->m_len += optlen;
2186 m->m_pkthdr.len += optlen;
2187 ovbcopy(from: (caddr_t)ip, mtod(m, caddr_t), len: sizeof(struct ip));
2188 }
2189 ip = mtod(m, struct ip *);
2190 bcopy(src: p->ipopt_list, dst: ip + 1, n: optlen);
2191 *phlen = sizeof(struct ip) + optlen;
2192 ip->ip_vhl = IP_MAKE_VHL(IPVERSION, *phlen >> 2);
2193 ip->ip_len += optlen;
2194 return m;
2195}
2196
2197/*
2198 * Copy options from ip to jp,
2199 * omitting those not copied during fragmentation.
2200 */
2201static int
2202ip_optcopy(struct ip *ip, struct ip *jp)
2203{
2204 u_char *cp, *dp;
2205 int opt, optlen, cnt;
2206
2207 cp = (u_char *)(ip + 1);
2208 dp = (u_char *)(jp + 1);
2209 cnt = (IP_VHL_HL(ip->ip_vhl) << 2) - sizeof(struct ip);
2210 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2211 opt = cp[0];
2212 if (opt == IPOPT_EOL) {
2213 break;
2214 }
2215 if (opt == IPOPT_NOP) {
2216 /* Preserve for IP mcast tunnel's LSRR alignment. */
2217 *dp++ = IPOPT_NOP;
2218 optlen = 1;
2219 continue;
2220 }
2221#if DIAGNOSTIC
2222 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
2223 panic("malformed IPv4 option passed to ip_optcopy");
2224 /* NOTREACHED */
2225 }
2226#endif
2227 optlen = cp[IPOPT_OLEN];
2228#if DIAGNOSTIC
2229 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
2230 panic("malformed IPv4 option passed to ip_optcopy");
2231 /* NOTREACHED */
2232 }
2233#endif
2234 /* bogus lengths should have been caught by ip_dooptions */
2235 if (optlen > cnt) {
2236 optlen = cnt;
2237 }
2238 if (IPOPT_COPIED(opt)) {
2239 bcopy(src: cp, dst: dp, n: optlen);
2240 dp += optlen;
2241 }
2242 }
2243 for (optlen = (int)(dp - (u_char *)(jp + 1)); optlen & 0x3; optlen++) {
2244 *dp++ = IPOPT_EOL;
2245 }
2246 return optlen;
2247}
2248
2249/*
2250 * IP socket option processing.
2251 */
2252int
2253ip_ctloutput(struct socket *so, struct sockopt *sopt)
2254{
2255 struct inpcb *inp = sotoinpcb(so);
2256 int error, optval;
2257 lck_mtx_t *mutex_held = NULL;
2258
2259 error = optval = 0;
2260 if (sopt->sopt_level != IPPROTO_IP) {
2261 return EINVAL;
2262 }
2263
2264 switch (sopt->sopt_dir) {
2265 case SOPT_SET:
2266 mutex_held = socket_getlock(so, PR_F_WILLUNLOCK);
2267 /*
2268 * Wait if we are in the middle of ip_output
2269 * as we unlocked the socket there and don't
2270 * want to overwrite the IP options
2271 */
2272 if (inp->inp_sndinprog_cnt > 0) {
2273 inp->inp_sndingprog_waiters++;
2274
2275 while (inp->inp_sndinprog_cnt > 0) {
2276 msleep(chan: &inp->inp_sndinprog_cnt, mtx: mutex_held,
2277 PSOCK | PCATCH, wmesg: "inp_sndinprog_cnt", NULL);
2278 }
2279 inp->inp_sndingprog_waiters--;
2280 }
2281 switch (sopt->sopt_name) {
2282#ifdef notyet
2283 case IP_RETOPTS:
2284#endif
2285 case IP_OPTIONS: {
2286 struct mbuf *m;
2287
2288 if (sopt->sopt_valsize > MLEN) {
2289 error = EMSGSIZE;
2290 break;
2291 }
2292 MGET(m, sopt->sopt_p != kernproc ? M_WAIT : M_DONTWAIT,
2293 MT_HEADER);
2294 if (m == NULL) {
2295 error = ENOBUFS;
2296 break;
2297 }
2298 m->m_len = (int32_t)sopt->sopt_valsize;
2299 error = sooptcopyin(sopt, mtod(m, char *),
2300 len: m->m_len, minlen: m->m_len);
2301 if (error) {
2302 m_freem(m);
2303 break;
2304 }
2305
2306 return ip_pcbopts(sopt->sopt_name,
2307 &inp->inp_options, m);
2308 }
2309
2310 case IP_TOS:
2311 case IP_TTL:
2312 case IP_RECVOPTS:
2313 case IP_RECVRETOPTS:
2314 case IP_RECVDSTADDR:
2315 case IP_RECVIF:
2316 case IP_RECVTTL:
2317 case IP_RECVPKTINFO:
2318 case IP_RECVTOS:
2319 case IP_DONTFRAG:
2320 error = sooptcopyin(sopt, &optval, len: sizeof(optval),
2321 minlen: sizeof(optval));
2322 if (error) {
2323 break;
2324 }
2325
2326 switch (sopt->sopt_name) {
2327 case IP_TOS:
2328 if (optval > UINT8_MAX) {
2329 error = EINVAL;
2330 break;
2331 }
2332 inp->inp_ip_tos = (uint8_t)optval;
2333 break;
2334
2335 case IP_TTL:
2336 if (optval > UINT8_MAX) {
2337 error = EINVAL;
2338 break;
2339 }
2340 inp->inp_ip_ttl = (uint8_t)optval;
2341 break;
2342#define OPTSET(bit) do { \
2343 if (optval) { \
2344 inp->inp_flags |= bit; \
2345 } else { \
2346 inp->inp_flags &= ~bit; \
2347 } \
2348} while (0)
2349
2350#define OPTSET2(bit) do { \
2351 if (optval) { \
2352 inp->inp_flags2 |= bit; \
2353 } else { \
2354 inp->inp_flags2 &= ~bit; \
2355 } \
2356} while (0)
2357
2358 case IP_RECVOPTS:
2359 OPTSET(INP_RECVOPTS);
2360 break;
2361
2362 case IP_RECVRETOPTS:
2363 OPTSET(INP_RECVRETOPTS);
2364 break;
2365
2366 case IP_RECVDSTADDR:
2367 OPTSET(INP_RECVDSTADDR);
2368 break;
2369
2370 case IP_RECVIF:
2371 OPTSET(INP_RECVIF);
2372 break;
2373
2374 case IP_RECVTTL:
2375 OPTSET(INP_RECVTTL);
2376 break;
2377
2378 case IP_RECVPKTINFO:
2379 OPTSET(INP_PKTINFO);
2380 break;
2381
2382 case IP_RECVTOS:
2383 OPTSET(INP_RECVTOS);
2384 break;
2385
2386 case IP_DONTFRAG:
2387 /* This option is settable only for IPv4 */
2388 if (!(inp->inp_vflag & INP_IPV4)) {
2389 error = EINVAL;
2390 break;
2391 }
2392 OPTSET2(INP2_DONTFRAG);
2393 break;
2394#undef OPTSET
2395#undef OPTSET2
2396 }
2397 break;
2398 /*
2399 * Multicast socket options are processed by the in_mcast
2400 * module.
2401 */
2402 case IP_MULTICAST_IF:
2403 case IP_MULTICAST_IFINDEX:
2404 case IP_MULTICAST_VIF:
2405 case IP_MULTICAST_TTL:
2406 case IP_MULTICAST_LOOP:
2407 case IP_ADD_MEMBERSHIP:
2408 case IP_DROP_MEMBERSHIP:
2409 case IP_ADD_SOURCE_MEMBERSHIP:
2410 case IP_DROP_SOURCE_MEMBERSHIP:
2411 case IP_BLOCK_SOURCE:
2412 case IP_UNBLOCK_SOURCE:
2413 case IP_MSFILTER:
2414 case MCAST_JOIN_GROUP:
2415 case MCAST_LEAVE_GROUP:
2416 case MCAST_JOIN_SOURCE_GROUP:
2417 case MCAST_LEAVE_SOURCE_GROUP:
2418 case MCAST_BLOCK_SOURCE:
2419 case MCAST_UNBLOCK_SOURCE:
2420 error = inp_setmoptions(inp, sopt);
2421 break;
2422
2423 case IP_PORTRANGE:
2424 error = sooptcopyin(sopt, &optval, len: sizeof(optval),
2425 minlen: sizeof(optval));
2426 if (error) {
2427 break;
2428 }
2429
2430 switch (optval) {
2431 case IP_PORTRANGE_DEFAULT:
2432 inp->inp_flags &= ~(INP_LOWPORT);
2433 inp->inp_flags &= ~(INP_HIGHPORT);
2434 break;
2435
2436 case IP_PORTRANGE_HIGH:
2437 inp->inp_flags &= ~(INP_LOWPORT);
2438 inp->inp_flags |= INP_HIGHPORT;
2439 break;
2440
2441 case IP_PORTRANGE_LOW:
2442 inp->inp_flags &= ~(INP_HIGHPORT);
2443 inp->inp_flags |= INP_LOWPORT;
2444 break;
2445
2446 default:
2447 error = EINVAL;
2448 break;
2449 }
2450 break;
2451
2452#if IPSEC
2453 case IP_IPSEC_POLICY: {
2454 caddr_t req = NULL;
2455 size_t len = 0;
2456 int priv;
2457 struct mbuf *m;
2458 int optname;
2459
2460 if ((error = soopt_getm(sopt, mp: &m)) != 0) { /* XXX */
2461 break;
2462 }
2463 if ((error = soopt_mcopyin(sopt, m)) != 0) { /* XXX */
2464 break;
2465 }
2466 priv = (proc_suser(p: sopt->sopt_p) == 0);
2467 if (m) {
2468 req = mtod(m, caddr_t);
2469 len = m->m_len;
2470 }
2471 optname = sopt->sopt_name;
2472 error = ipsec4_set_policy(inp, optname, request: req, len, priv);
2473 m_freem(m);
2474 break;
2475 }
2476#endif /* IPSEC */
2477
2478#if TRAFFIC_MGT
2479 case IP_TRAFFIC_MGT_BACKGROUND: {
2480 unsigned background = 0;
2481
2482 error = sooptcopyin(sopt, &background,
2483 len: sizeof(background), minlen: sizeof(background));
2484 if (error) {
2485 break;
2486 }
2487
2488 if (background) {
2489 socket_set_traffic_mgt_flags_locked(so,
2490 TRAFFIC_MGT_SO_BACKGROUND);
2491 } else {
2492 socket_clear_traffic_mgt_flags_locked(so,
2493 TRAFFIC_MGT_SO_BACKGROUND);
2494 }
2495
2496 break;
2497 }
2498#endif /* TRAFFIC_MGT */
2499
2500 /*
2501 * On a multihomed system, scoped routing can be used to
2502 * restrict the source interface used for sending packets.
2503 * The socket option IP_BOUND_IF binds a particular AF_INET
2504 * socket to an interface such that data sent on the socket
2505 * is restricted to that interface. This is unlike the
2506 * SO_DONTROUTE option where the routing table is bypassed;
2507 * therefore it allows for a greater flexibility and control
2508 * over the system behavior, and does not place any restriction
2509 * on the destination address type (e.g. unicast, multicast,
2510 * or broadcast if applicable) or whether or not the host is
2511 * directly reachable. Note that in the multicast transmit
2512 * case, IP_MULTICAST_{IF,IFINDEX} takes precedence over
2513 * IP_BOUND_IF, since the former practically bypasses the
2514 * routing table; in this case, IP_BOUND_IF sets the default
2515 * interface used for sending multicast packets in the absence
2516 * of an explicit multicast transmit interface.
2517 */
2518 case IP_BOUND_IF:
2519 /* This option is settable only for IPv4 */
2520 if (!(inp->inp_vflag & INP_IPV4)) {
2521 error = EINVAL;
2522 break;
2523 }
2524
2525 error = sooptcopyin(sopt, &optval, len: sizeof(optval),
2526 minlen: sizeof(optval));
2527
2528 if (error) {
2529 break;
2530 }
2531
2532 error = inp_bindif(inp, optval, NULL);
2533 break;
2534
2535 case IP_NO_IFT_CELLULAR:
2536 /* This option is settable only for IPv4 */
2537 if (!(inp->inp_vflag & INP_IPV4)) {
2538 error = EINVAL;
2539 break;
2540 }
2541
2542 error = sooptcopyin(sopt, &optval, len: sizeof(optval),
2543 minlen: sizeof(optval));
2544
2545 if (error) {
2546 break;
2547 }
2548
2549 /* once set, it cannot be unset */
2550 if (!optval && INP_NO_CELLULAR(inp)) {
2551 error = EINVAL;
2552 break;
2553 }
2554
2555 error = so_set_restrictions(so,
2556 SO_RESTRICT_DENY_CELLULAR);
2557 break;
2558
2559 case IP_OUT_IF:
2560 /* This option is not settable */
2561 error = EINVAL;
2562 break;
2563
2564 default:
2565 error = ENOPROTOOPT;
2566 break;
2567 }
2568 break;
2569
2570 case SOPT_GET:
2571 switch (sopt->sopt_name) {
2572 case IP_OPTIONS:
2573 case IP_RETOPTS:
2574 if (inp->inp_options) {
2575 error = sooptcopyout(sopt,
2576 mtod(inp->inp_options, char *),
2577 len: inp->inp_options->m_len);
2578 } else {
2579 sopt->sopt_valsize = 0;
2580 }
2581 break;
2582
2583 case IP_TOS:
2584 case IP_TTL:
2585 case IP_RECVOPTS:
2586 case IP_RECVRETOPTS:
2587 case IP_RECVDSTADDR:
2588 case IP_RECVIF:
2589 case IP_RECVTTL:
2590 case IP_PORTRANGE:
2591 case IP_RECVPKTINFO:
2592 case IP_RECVTOS:
2593 case IP_DONTFRAG:
2594 switch (sopt->sopt_name) {
2595 case IP_TOS:
2596 optval = inp->inp_ip_tos;
2597 break;
2598
2599 case IP_TTL:
2600 optval = inp->inp_ip_ttl;
2601 break;
2602
2603#define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
2604#define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0)
2605 case IP_RECVOPTS:
2606 optval = OPTBIT(INP_RECVOPTS);
2607 break;
2608
2609 case IP_RECVRETOPTS:
2610 optval = OPTBIT(INP_RECVRETOPTS);
2611 break;
2612
2613 case IP_RECVDSTADDR:
2614 optval = OPTBIT(INP_RECVDSTADDR);
2615 break;
2616
2617 case IP_RECVIF:
2618 optval = OPTBIT(INP_RECVIF);
2619 break;
2620
2621 case IP_RECVTTL:
2622 optval = OPTBIT(INP_RECVTTL);
2623 break;
2624
2625 case IP_PORTRANGE:
2626 if (inp->inp_flags & INP_HIGHPORT) {
2627 optval = IP_PORTRANGE_HIGH;
2628 } else if (inp->inp_flags & INP_LOWPORT) {
2629 optval = IP_PORTRANGE_LOW;
2630 } else {
2631 optval = 0;
2632 }
2633 break;
2634
2635 case IP_RECVPKTINFO:
2636 optval = OPTBIT(INP_PKTINFO);
2637 break;
2638
2639 case IP_RECVTOS:
2640 optval = OPTBIT(INP_RECVTOS);
2641 break;
2642 case IP_DONTFRAG:
2643 optval = OPTBIT2(INP2_DONTFRAG);
2644 break;
2645 }
2646 error = sooptcopyout(sopt, data: &optval, len: sizeof(optval));
2647 break;
2648
2649 case IP_MULTICAST_IF:
2650 case IP_MULTICAST_IFINDEX:
2651 case IP_MULTICAST_VIF:
2652 case IP_MULTICAST_TTL:
2653 case IP_MULTICAST_LOOP:
2654 case IP_MSFILTER:
2655 error = inp_getmoptions(inp, sopt);
2656 break;
2657
2658#if IPSEC
2659 case IP_IPSEC_POLICY: {
2660 error = 0; /* This option is no longer supported */
2661 break;
2662 }
2663#endif /* IPSEC */
2664
2665#if TRAFFIC_MGT
2666 case IP_TRAFFIC_MGT_BACKGROUND: {
2667 unsigned background = (so->so_flags1 &
2668 SOF1_TRAFFIC_MGT_SO_BACKGROUND) ? 1 : 0;
2669 return sooptcopyout(sopt, data: &background,
2670 len: sizeof(background));
2671 }
2672#endif /* TRAFFIC_MGT */
2673
2674 case IP_BOUND_IF:
2675 if (inp->inp_flags & INP_BOUND_IF) {
2676 optval = inp->inp_boundifp->if_index;
2677 }
2678 error = sooptcopyout(sopt, data: &optval, len: sizeof(optval));
2679 break;
2680
2681 case IP_NO_IFT_CELLULAR:
2682 optval = INP_NO_CELLULAR(inp) ? 1 : 0;
2683 error = sooptcopyout(sopt, data: &optval, len: sizeof(optval));
2684 break;
2685
2686 case IP_OUT_IF:
2687 optval = (inp->inp_last_outifp != NULL) ?
2688 inp->inp_last_outifp->if_index : 0;
2689 error = sooptcopyout(sopt, data: &optval, len: sizeof(optval));
2690 break;
2691
2692 default:
2693 error = ENOPROTOOPT;
2694 break;
2695 }
2696 break;
2697 }
2698 return error;
2699}
2700
2701/*
2702 * Set up IP options in pcb for insertion in output packets.
2703 * Store in mbuf with pointer in pcbopt, adding pseudo-option
2704 * with destination address if source routed.
2705 */
2706static int
2707ip_pcbopts(int optname, struct mbuf **pcbopt, struct mbuf *m)
2708{
2709#pragma unused(optname)
2710 int cnt, optlen;
2711 u_char *cp;
2712 u_char opt;
2713
2714 /* turn off any old options */
2715 if (*pcbopt) {
2716 (void) m_free(*pcbopt);
2717 }
2718 *pcbopt = 0;
2719 if (m == (struct mbuf *)0 || m->m_len == 0) {
2720 /*
2721 * Only turning off any previous options.
2722 */
2723 if (m) {
2724 (void) m_free(m);
2725 }
2726 return 0;
2727 }
2728
2729 if (m->m_len % sizeof(int32_t)) {
2730 goto bad;
2731 }
2732
2733 /*
2734 * IP first-hop destination address will be stored before
2735 * actual options; move other options back
2736 * and clear it when none present.
2737 */
2738 if (m_mtod_upper_bound(m) - m_mtod_end(m) < sizeof(struct in_addr)) {
2739 goto bad;
2740 }
2741 cnt = m->m_len;
2742 m->m_len += sizeof(struct in_addr);
2743 cp = mtod(m, u_char *) + sizeof(struct in_addr);
2744 ovbcopy(mtod(m, caddr_t), to: (caddr_t)cp, len: (unsigned)cnt);
2745 bzero(mtod(m, caddr_t), n: sizeof(struct in_addr));
2746
2747 for (; cnt > 0; cnt -= optlen, cp += optlen) {
2748 opt = cp[IPOPT_OPTVAL];
2749 if (opt == IPOPT_EOL) {
2750 break;
2751 }
2752 if (opt == IPOPT_NOP) {
2753 optlen = 1;
2754 } else {
2755 if (cnt < IPOPT_OLEN + sizeof(*cp)) {
2756 goto bad;
2757 }
2758 optlen = cp[IPOPT_OLEN];
2759 if (optlen < IPOPT_OLEN + sizeof(*cp) || optlen > cnt) {
2760 goto bad;
2761 }
2762 }
2763 switch (opt) {
2764 default:
2765 break;
2766
2767 case IPOPT_LSRR:
2768 case IPOPT_SSRR:
2769 /*
2770 * user process specifies route as:
2771 * ->A->B->C->D
2772 * D must be our final destination (but we can't
2773 * check that since we may not have connected yet).
2774 * A is first hop destination, which doesn't appear in
2775 * actual IP option, but is stored before the options.
2776 */
2777 if (optlen < IPOPT_MINOFF - 1 + sizeof(struct in_addr)) {
2778 goto bad;
2779 }
2780 if (optlen > UINT8_MAX) {
2781 goto bad;
2782 }
2783 m->m_len -= sizeof(struct in_addr);
2784 cnt -= sizeof(struct in_addr);
2785 optlen -= sizeof(struct in_addr);
2786 cp[IPOPT_OLEN] = (uint8_t)optlen;
2787 /*
2788 * Move first hop before start of options.
2789 */
2790 bcopy(src: (caddr_t)&cp[IPOPT_OFFSET + 1], mtod(m, caddr_t),
2791 n: sizeof(struct in_addr));
2792 /*
2793 * Then copy rest of options back
2794 * to close up the deleted entry.
2795 */
2796 ovbcopy(from: (caddr_t)(&cp[IPOPT_OFFSET + 1] +
2797 sizeof(struct in_addr)),
2798 to: (caddr_t)&cp[IPOPT_OFFSET + 1],
2799 len: (unsigned)cnt - (IPOPT_MINOFF - 1));
2800 break;
2801 }
2802 }
2803 if (m->m_len > MAX_IPOPTLEN + sizeof(struct in_addr)) {
2804 goto bad;
2805 }
2806 *pcbopt = m;
2807 return 0;
2808
2809bad:
2810 (void) m_free(m);
2811 return EINVAL;
2812}
2813
2814void
2815ip_moptions_init(void)
2816{
2817 PE_parse_boot_argn(arg_string: "ifa_debug", arg_ptr: &imo_debug, max_arg: sizeof(imo_debug));
2818
2819 vm_size_t imo_size = (imo_debug == 0) ? sizeof(struct ip_moptions) :
2820 sizeof(struct ip_moptions_dbg);
2821
2822 imo_zone = zone_create(IMO_ZONE_NAME, size: imo_size, flags: ZC_ZFREE_CLEARMEM);
2823}
2824
2825void
2826imo_addref(struct ip_moptions *imo, int locked)
2827{
2828 if (!locked) {
2829 IMO_LOCK(imo);
2830 } else {
2831 IMO_LOCK_ASSERT_HELD(imo);
2832 }
2833
2834 if (++imo->imo_refcnt == 0) {
2835 panic("%s: imo %p wraparound refcnt", __func__, imo);
2836 /* NOTREACHED */
2837 } else if (imo->imo_trace != NULL) {
2838 (*imo->imo_trace)(imo, TRUE);
2839 }
2840
2841 if (!locked) {
2842 IMO_UNLOCK(imo);
2843 }
2844}
2845
2846void
2847imo_remref(struct ip_moptions *imo)
2848{
2849 IMO_LOCK(imo);
2850 if (imo->imo_refcnt == 0) {
2851 panic("%s: imo %p negative refcnt", __func__, imo);
2852 /* NOTREACHED */
2853 } else if (imo->imo_trace != NULL) {
2854 (*imo->imo_trace)(imo, FALSE);
2855 }
2856
2857 --imo->imo_refcnt;
2858 if (imo->imo_refcnt > 0) {
2859 IMO_UNLOCK(imo);
2860 return;
2861 }
2862
2863 IMO_PURGE_LOCKED(imo);
2864
2865 IMO_UNLOCK(imo);
2866
2867 kfree_type(struct in_multi *, imo->imo_max_memberships, imo->imo_membership);
2868 kfree_type(struct in_mfilter, imo->imo_max_memberships, imo->imo_mfilters);
2869 lck_mtx_destroy(lck: &imo->imo_lock, grp: &ifa_mtx_grp);
2870
2871 if (!(imo->imo_debug & IFD_ALLOC)) {
2872 panic("%s: imo %p cannot be freed", __func__, imo);
2873 /* NOTREACHED */
2874 }
2875 zfree(imo_zone, imo);
2876}
2877
2878static void
2879imo_trace(struct ip_moptions *imo, int refhold)
2880{
2881 struct ip_moptions_dbg *imo_dbg = (struct ip_moptions_dbg *)imo;
2882 ctrace_t *tr;
2883 u_int32_t idx;
2884 u_int16_t *cnt;
2885
2886 if (!(imo->imo_debug & IFD_DEBUG)) {
2887 panic("%s: imo %p has no debug structure", __func__, imo);
2888 /* NOTREACHED */
2889 }
2890 if (refhold) {
2891 cnt = &imo_dbg->imo_refhold_cnt;
2892 tr = imo_dbg->imo_refhold;
2893 } else {
2894 cnt = &imo_dbg->imo_refrele_cnt;
2895 tr = imo_dbg->imo_refrele;
2896 }
2897
2898 idx = os_atomic_inc_orig(cnt, relaxed) % IMO_TRACE_HIST_SIZE;
2899 ctrace_record(&tr[idx]);
2900}
2901
2902struct ip_moptions *
2903ip_allocmoptions(zalloc_flags_t how)
2904{
2905 struct ip_moptions *imo;
2906
2907 imo = zalloc_flags(imo_zone, how | Z_ZERO);
2908 if (imo != NULL) {
2909 lck_mtx_init(lck: &imo->imo_lock, grp: &ifa_mtx_grp, attr: &ifa_mtx_attr);
2910 imo->imo_debug |= IFD_ALLOC;
2911 if (imo_debug != 0) {
2912 imo->imo_debug |= IFD_DEBUG;
2913 imo->imo_trace = imo_trace;
2914 }
2915 IMO_ADDREF(imo);
2916 }
2917
2918 return imo;
2919}
2920
2921/*
2922 * Routine called from ip_output() to loop back a copy of an IP multicast
2923 * packet to the input queue of a specified interface. Note that this
2924 * calls the output routine of the loopback "driver", but with an interface
2925 * pointer that might NOT be a loopback interface -- evil, but easier than
2926 * replicating that code here.
2927 */
2928static void
2929ip_mloopback(struct ifnet *srcifp, struct ifnet *origifp, struct mbuf *m,
2930 struct sockaddr_in *dst, int hlen)
2931{
2932 struct mbuf *copym;
2933 struct ip *ip;
2934
2935 if (lo_ifp == NULL) {
2936 return;
2937 }
2938
2939 /*
2940 * Copy the packet header as it's needed for the checksum
2941 * Make sure to deep-copy IP header portion in case the data
2942 * is in an mbuf cluster, so that we can safely override the IP
2943 * header portion later.
2944 */
2945 copym = m_copym_mode(m, 0, M_COPYALL, M_DONTWAIT, NULL, NULL, M_COPYM_COPY_HDR);
2946 if (copym != NULL && ((copym->m_flags & M_EXT) || copym->m_len < hlen)) {
2947 copym = m_pullup(copym, hlen);
2948 }
2949
2950 if (copym == NULL) {
2951 return;
2952 }
2953
2954 /*
2955 * We don't bother to fragment if the IP length is greater
2956 * than the interface's MTU. Can this possibly matter?
2957 */
2958 ip = mtod(copym, struct ip *);
2959#if BYTE_ORDER != BIG_ENDIAN
2960 HTONS(ip->ip_len);
2961 HTONS(ip->ip_off);
2962#endif
2963 ip->ip_sum = 0;
2964 ip->ip_sum = ip_cksum_hdr_out(copym, hlen);
2965
2966 /*
2967 * Mark checksum as valid unless receive checksum offload is
2968 * disabled; if so, compute checksum in software. If the
2969 * interface itself is lo0, this will be overridden by if_loop.
2970 */
2971 if (hwcksum_rx) {
2972 copym->m_pkthdr.csum_flags &= ~(CSUM_PARTIAL | CSUM_ZERO_INVERT);
2973 copym->m_pkthdr.csum_flags |=
2974 CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
2975 copym->m_pkthdr.csum_data = 0xffff;
2976 } else if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
2977#if BYTE_ORDER != BIG_ENDIAN
2978 NTOHS(ip->ip_len);
2979#endif
2980 in_delayed_cksum(copym);
2981#if BYTE_ORDER != BIG_ENDIAN
2982 HTONS(ip->ip_len);
2983#endif
2984 }
2985
2986 /*
2987 * Stuff the 'real' ifp into the pkthdr, to be used in matching
2988 * in ip_input(); we need the loopback ifp/dl_tag passed as args
2989 * to make the loopback driver compliant with the data link
2990 * requirements.
2991 */
2992 copym->m_pkthdr.rcvif = origifp;
2993
2994 /*
2995 * Also record the source interface (which owns the source address).
2996 * This is basically a stripped down version of ifa_foraddr().
2997 */
2998 if (srcifp == NULL) {
2999 struct in_ifaddr *ia;
3000
3001 lck_rw_lock_shared(lck: &in_ifaddr_rwlock);
3002 TAILQ_FOREACH(ia, INADDR_HASH(ip->ip_src.s_addr), ia_hash) {
3003 IFA_LOCK_SPIN(&ia->ia_ifa);
3004 if (IA_SIN(ia)->sin_addr.s_addr == ip->ip_src.s_addr) {
3005 srcifp = ia->ia_ifp;
3006 IFA_UNLOCK(&ia->ia_ifa);
3007 break;
3008 }
3009 IFA_UNLOCK(&ia->ia_ifa);
3010 }
3011 lck_rw_done(lck: &in_ifaddr_rwlock);
3012 }
3013 if (srcifp != NULL) {
3014 ip_setsrcifaddr_info(copym, srcifp->if_index, NULL);
3015 }
3016 ip_setdstifaddr_info(copym, origifp->if_index, NULL);
3017
3018 dlil_output(lo_ifp, PF_INET, copym, NULL, SA(dst), 0, NULL);
3019}
3020
3021/*
3022 * Given a source IP address (and route, if available), determine the best
3023 * interface to send the packet from. Checking for (and updating) the
3024 * ROF_SRCIF_SELECTED flag in the pcb-supplied route placeholder is done
3025 * without any locks based on the assumption that ip_output() is single-
3026 * threaded per-pcb, i.e. for any given pcb there can only be one thread
3027 * performing output at the IP layer.
3028 *
3029 * This routine is analogous to in6_selectroute() for IPv6.
3030 */
3031static struct ifaddr *
3032in_selectsrcif(struct ip *ip, struct route *ro, unsigned int ifscope)
3033{
3034 struct ifaddr *ifa = NULL;
3035 struct in_addr src = ip->ip_src;
3036 struct in_addr dst = ip->ip_dst;
3037 struct ifnet *rt_ifp;
3038 char s_src[MAX_IPv4_STR_LEN], s_dst[MAX_IPv4_STR_LEN];
3039
3040 VERIFY(src.s_addr != INADDR_ANY);
3041
3042 if (ip_select_srcif_debug) {
3043 (void) inet_ntop(AF_INET, &src.s_addr, s_src, sizeof(s_src));
3044 (void) inet_ntop(AF_INET, &dst.s_addr, s_dst, sizeof(s_dst));
3045 }
3046
3047 if (ro->ro_rt != NULL) {
3048 RT_LOCK(ro->ro_rt);
3049 }
3050
3051 rt_ifp = (ro->ro_rt != NULL) ? ro->ro_rt->rt_ifp : NULL;
3052
3053 /*
3054 * Given the source IP address, find a suitable source interface
3055 * to use for transmission; if the caller has specified a scope,
3056 * optimize the search by looking at the addresses only for that
3057 * interface. This is still suboptimal, however, as we need to
3058 * traverse the per-interface list.
3059 */
3060 if (ifscope != IFSCOPE_NONE || ro->ro_rt != NULL) {
3061 unsigned int scope = ifscope;
3062
3063 /*
3064 * If no scope is specified and the route is stale (pointing
3065 * to a defunct interface) use the current primary interface;
3066 * this happens when switching between interfaces configured
3067 * with the same IP address. Otherwise pick up the scope
3068 * information from the route; the ULP may have looked up a
3069 * correct route and we just need to verify it here and mark
3070 * it with the ROF_SRCIF_SELECTED flag below.
3071 */
3072 if (scope == IFSCOPE_NONE) {
3073 scope = rt_ifp->if_index;
3074 if (scope != get_primary_ifscope(AF_INET) &&
3075 ROUTE_UNUSABLE(ro)) {
3076 scope = get_primary_ifscope(AF_INET);
3077 }
3078 }
3079
3080 ifa = (struct ifaddr *)ifa_foraddr_scoped(src.s_addr, scope);
3081
3082 if (ifa == NULL && ip->ip_p != IPPROTO_UDP &&
3083 ip->ip_p != IPPROTO_TCP && ipforwarding) {
3084 /*
3085 * If forwarding is enabled, and if the packet isn't
3086 * TCP or UDP, check if the source address belongs
3087 * to one of our own interfaces; if so, demote the
3088 * interface scope and do a route lookup right below.
3089 */
3090 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3091 if (ifa != NULL) {
3092 ifa_remref(ifa);
3093 ifa = NULL;
3094 ifscope = IFSCOPE_NONE;
3095 }
3096 }
3097
3098 if (ip_select_srcif_debug && ifa != NULL) {
3099 if (ro->ro_rt != NULL) {
3100 printf("%s->%s ifscope %d->%d ifa_if %s "
3101 "ro_if %s\n", s_src, s_dst, ifscope,
3102 scope, if_name(ifa->ifa_ifp),
3103 if_name(rt_ifp));
3104 } else {
3105 printf("%s->%s ifscope %d->%d ifa_if %s\n",
3106 s_src, s_dst, ifscope, scope,
3107 if_name(ifa->ifa_ifp));
3108 }
3109 }
3110 }
3111
3112 /*
3113 * Slow path; search for an interface having the corresponding source
3114 * IP address if the scope was not specified by the caller, and:
3115 *
3116 * 1) There currently isn't any route, or,
3117 * 2) The interface used by the route does not own that source
3118 * IP address; in this case, the route will get blown away
3119 * and we'll do a more specific scoped search using the newly
3120 * found interface.
3121 */
3122 if (ifa == NULL && ifscope == IFSCOPE_NONE) {
3123 ifa = (struct ifaddr *)ifa_foraddr(src.s_addr);
3124
3125 /*
3126 * If we have the IP address, but not the route, we don't
3127 * really know whether or not it belongs to the correct
3128 * interface (it could be shared across multiple interfaces.)
3129 * The only way to find out is to do a route lookup.
3130 */
3131 if (ifa != NULL && ro->ro_rt == NULL) {
3132 struct rtentry *rt;
3133 struct sockaddr_in sin;
3134 struct ifaddr *oifa = NULL;
3135
3136 SOCKADDR_ZERO(&sin, sizeof(sin));
3137 sin.sin_family = AF_INET;
3138 sin.sin_len = sizeof(sin);
3139 sin.sin_addr = dst;
3140
3141 lck_mtx_lock(rnh_lock);
3142 if ((rt = rt_lookup(TRUE, SA(&sin), NULL,
3143 rt_tables[AF_INET], IFSCOPE_NONE)) != NULL) {
3144 RT_LOCK(rt);
3145 /*
3146 * If the route uses a different interface,
3147 * use that one instead. The IP address of
3148 * the ifaddr that we pick up here is not
3149 * relevant.
3150 */
3151 if (ifa->ifa_ifp != rt->rt_ifp) {
3152 oifa = ifa;
3153 ifa = rt->rt_ifa;
3154 ifa_addref(ifa);
3155 RT_UNLOCK(rt);
3156 } else {
3157 RT_UNLOCK(rt);
3158 }
3159 rtfree_locked(rt);
3160 }
3161 lck_mtx_unlock(rnh_lock);
3162
3163 if (oifa != NULL) {
3164 struct ifaddr *iifa;
3165
3166 /*
3167 * See if the interface pointed to by the
3168 * route is configured with the source IP
3169 * address of the packet.
3170 */
3171 iifa = (struct ifaddr *)ifa_foraddr_scoped(
3172 src.s_addr, ifa->ifa_ifp->if_index);
3173
3174 if (iifa != NULL) {
3175 /*
3176 * Found it; drop the original one
3177 * as well as the route interface
3178 * address, and use this instead.
3179 */
3180 ifa_remref(ifa: oifa);
3181 ifa_remref(ifa);
3182 ifa = iifa;
3183 } else if (!ipforwarding ||
3184 (rt->rt_flags & RTF_GATEWAY)) {
3185 /*
3186 * This interface doesn't have that
3187 * source IP address; drop the route
3188 * interface address and just use the
3189 * original one, and let the caller
3190 * do a scoped route lookup.
3191 */
3192 ifa_remref(ifa);
3193 ifa = oifa;
3194 } else {
3195 /*
3196 * Forwarding is enabled and the source
3197 * address belongs to one of our own
3198 * interfaces which isn't the outgoing
3199 * interface, and we have a route, and
3200 * the destination is on a network that
3201 * is directly attached (onlink); drop
3202 * the original one and use the route
3203 * interface address instead.
3204 */
3205 ifa_remref(ifa: oifa);
3206 }
3207 }
3208 } else if (ifa != NULL && ro->ro_rt != NULL &&
3209 !(ro->ro_rt->rt_flags & RTF_GATEWAY) &&
3210 ifa->ifa_ifp != ro->ro_rt->rt_ifp && ipforwarding) {
3211 /*
3212 * Forwarding is enabled and the source address belongs
3213 * to one of our own interfaces which isn't the same
3214 * as the interface used by the known route; drop the
3215 * original one and use the route interface address.
3216 */
3217 ifa_remref(ifa);
3218 ifa = ro->ro_rt->rt_ifa;
3219 ifa_addref(ifa);
3220 }
3221
3222 if (ip_select_srcif_debug && ifa != NULL) {
3223 printf("%s->%s ifscope %d ifa_if %s\n",
3224 s_src, s_dst, ifscope, if_name(ifa->ifa_ifp));
3225 }
3226 }
3227
3228 if (ro->ro_rt != NULL) {
3229 RT_LOCK_ASSERT_HELD(ro->ro_rt);
3230 }
3231 /*
3232 * If there is a non-loopback route with the wrong interface, or if
3233 * there is no interface configured with such an address, blow it
3234 * away. Except for local/loopback, we look for one with a matching
3235 * interface scope/index.
3236 */
3237 if (ro->ro_rt != NULL &&
3238 (ifa == NULL || (ifa->ifa_ifp != rt_ifp && rt_ifp != lo_ifp) ||
3239 !(ro->ro_rt->rt_flags & RTF_UP))) {
3240 if (ip_select_srcif_debug) {
3241 if (ifa != NULL) {
3242 printf("%s->%s ifscope %d ro_if %s != "
3243 "ifa_if %s (cached route cleared)\n",
3244 s_src, s_dst, ifscope, if_name(rt_ifp),
3245 if_name(ifa->ifa_ifp));
3246 } else {
3247 printf("%s->%s ifscope %d ro_if %s "
3248 "(no ifa_if found)\n",
3249 s_src, s_dst, ifscope, if_name(rt_ifp));
3250 }
3251 }
3252
3253 RT_UNLOCK(ro->ro_rt);
3254 ROUTE_RELEASE(ro);
3255
3256 /*
3257 * If the destination is IPv4 LLA and the route's interface
3258 * doesn't match the source interface, then the source IP
3259 * address is wrong; it most likely belongs to the primary
3260 * interface associated with the IPv4 LL subnet. Drop the
3261 * packet rather than letting it go out and return an error
3262 * to the ULP. This actually applies not only to IPv4 LL
3263 * but other shared subnets; for now we explicitly test only
3264 * for the former case and save the latter for future.
3265 */
3266 if (IN_LINKLOCAL(ntohl(dst.s_addr)) &&
3267 !IN_LINKLOCAL(ntohl(src.s_addr)) && ifa != NULL) {
3268 ifa_remref(ifa);
3269 ifa = NULL;
3270 }
3271 }
3272
3273 if (ip_select_srcif_debug && ifa == NULL) {
3274 printf("%s->%s ifscope %d (neither ro_if/ifa_if found)\n",
3275 s_src, s_dst, ifscope);
3276 }
3277
3278 /*
3279 * If there is a route, mark it accordingly. If there isn't one,
3280 * we'll get here again during the next transmit (possibly with a
3281 * route) and the flag will get set at that point. For IPv4 LLA
3282 * destination, mark it only if the route has been fully resolved;
3283 * otherwise we want to come back here again when the route points
3284 * to the interface over which the ARP reply arrives on.
3285 */
3286 if (ro->ro_rt != NULL && (!IN_LINKLOCAL(ntohl(dst.s_addr)) ||
3287 (ro->ro_rt->rt_gateway->sa_family == AF_LINK &&
3288 SDL(ro->ro_rt->rt_gateway)->sdl_alen != 0))) {
3289 if (ifa != NULL) {
3290 ifa_addref(ifa); /* for route */
3291 }
3292 if (ro->ro_srcia != NULL) {
3293 ifa_remref(ifa: ro->ro_srcia);
3294 }
3295 ro->ro_srcia = ifa;
3296 ro->ro_flags |= ROF_SRCIF_SELECTED;
3297 RT_GENID_SYNC(ro->ro_rt);
3298 }
3299
3300 if (ro->ro_rt != NULL) {
3301 RT_UNLOCK(ro->ro_rt);
3302 }
3303
3304 return ifa;
3305}
3306
3307/*
3308 * @brief Given outgoing interface it determines what checksum needs
3309 * to be computed in software and what needs to be offloaded to the
3310 * interface.
3311 *
3312 * @param ifp Pointer to the outgoing interface
3313 * @param m Pointer to the packet
3314 * @param hlen IP header length
3315 * @param ip_len Total packet size i.e. headers + data payload
3316 * @param sw_csum Pointer to a software checksum flag set
3317 *
3318 * @return void
3319 */
3320void
3321ip_output_checksum(struct ifnet *ifp, struct mbuf *m, int hlen, int ip_len,
3322 uint32_t *sw_csum)
3323{
3324 uint32_t hwcap = ifp->if_hwassist;
3325
3326 m->m_pkthdr.csum_flags |= CSUM_IP;
3327
3328 if (!hwcksum_tx) {
3329 /* do all in software; hardware checksum offload is disabled */
3330 *sw_csum = (CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3331 m->m_pkthdr.csum_flags;
3332 } else {
3333 /* do in software what the hardware cannot */
3334 *sw_csum = m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_FLAGS(hwcap);
3335 }
3336
3337 if (hlen != sizeof(struct ip)) {
3338 *sw_csum |= ((CSUM_DELAY_DATA | CSUM_DELAY_IP) &
3339 m->m_pkthdr.csum_flags);
3340 } else if ((*sw_csum & CSUM_DELAY_DATA) && (hwcap & CSUM_PARTIAL)) {
3341 /*
3342 * If the explicitly required data csum offload is not supported by hardware,
3343 * do it by partial checksum. Here we assume TSO implies support for IP
3344 * and data sum.
3345 */
3346 int interface_mtu = ifp->if_mtu;
3347
3348 if (INTF_ADJUST_MTU_FOR_CLAT46(ifp)) {
3349 interface_mtu = IN6_LINKMTU(ifp);
3350 /* Further adjust the size for CLAT46 expansion */
3351 interface_mtu -= CLAT46_HDR_EXPANSION_OVERHD;
3352 }
3353
3354 /*
3355 * Partial checksum offload, if non-IP fragment, and TCP only
3356 * (no UDP support, as the hardware may not be able to convert
3357 * +0 to -0 (0xffff) per RFC1122 4.1.3.4. unless the interface
3358 * supports "invert zero" capability.)
3359 */
3360 if (hwcksum_tx &&
3361 ((m->m_pkthdr.csum_flags & CSUM_TCP) ||
3362 ((hwcap & CSUM_ZERO_INVERT) &&
3363 (m->m_pkthdr.csum_flags & CSUM_ZERO_INVERT))) &&
3364 ip_len <= interface_mtu) {
3365 uint16_t start = sizeof(struct ip);
3366 uint16_t ulpoff = m->m_pkthdr.csum_data & 0xffff;
3367 m->m_pkthdr.csum_flags |=
3368 (CSUM_DATA_VALID | CSUM_PARTIAL);
3369 m->m_pkthdr.csum_tx_stuff = (ulpoff + start);
3370 m->m_pkthdr.csum_tx_start = start;
3371 /* do IP hdr chksum in software */
3372 *sw_csum = CSUM_DELAY_IP;
3373 } else {
3374 *sw_csum |= (CSUM_DELAY_DATA & m->m_pkthdr.csum_flags);
3375 }
3376 }
3377
3378 if (*sw_csum & CSUM_DELAY_DATA) {
3379 in_delayed_cksum(m);
3380 *sw_csum &= ~CSUM_DELAY_DATA;
3381 }
3382
3383 if (hwcksum_tx) {
3384 uint32_t delay_data = m->m_pkthdr.csum_flags & CSUM_DELAY_DATA;
3385 uint32_t hw_csum = IF_HWASSIST_CSUM_FLAGS(hwcap);
3386
3387 /*
3388 * Drop off bits that aren't supported by hardware;
3389 * also make sure to preserve non-checksum related bits.
3390 */
3391 m->m_pkthdr.csum_flags =
3392 ((m->m_pkthdr.csum_flags & (hw_csum | CSUM_DATA_VALID)) |
3393 (m->m_pkthdr.csum_flags & ~IF_HWASSIST_CSUM_MASK));
3394
3395 /*
3396 * If hardware supports partial checksum but not delay_data,
3397 * add back delay_data.
3398 */
3399 if ((hw_csum & CSUM_PARTIAL) != 0 &&
3400 (hw_csum & delay_data) == 0) {
3401 m->m_pkthdr.csum_flags |= delay_data;
3402 }
3403 } else {
3404 /* drop all bits; hardware checksum offload is disabled */
3405 m->m_pkthdr.csum_flags = 0;
3406 }
3407}
3408
3409/*
3410 * GRE protocol output for PPP/PPTP
3411 */
3412int
3413ip_gre_output(struct mbuf *m)
3414{
3415 struct route ro;
3416 int error;
3417
3418 bzero(s: &ro, n: sizeof(ro));
3419
3420 error = ip_output(m0: m, NULL, ro: &ro, flags: 0, NULL, NULL);
3421
3422 ROUTE_RELEASE(&ro);
3423
3424 return error;
3425}
3426
3427static int
3428sysctl_reset_ip_output_stats SYSCTL_HANDLER_ARGS
3429{
3430#pragma unused(arg1, arg2)
3431 int error, i;
3432
3433 i = ip_output_measure;
3434 error = sysctl_handle_int(oidp, arg1: &i, arg2: 0, req);
3435 if (error || req->newptr == USER_ADDR_NULL) {
3436 goto done;
3437 }
3438 /* impose bounds */
3439 if (i < 0 || i > 1) {
3440 error = EINVAL;
3441 goto done;
3442 }
3443 if (ip_output_measure != i && i == 1) {
3444 net_perf_initialize(npp: &net_perf, bins: ip_output_measure_bins);
3445 }
3446 ip_output_measure = i;
3447done:
3448 return error;
3449}
3450
3451static int
3452sysctl_ip_output_measure_bins SYSCTL_HANDLER_ARGS
3453{
3454#pragma unused(arg1, arg2)
3455 int error;
3456 uint64_t i;
3457
3458 i = ip_output_measure_bins;
3459 error = sysctl_handle_quad(oidp, arg1: &i, arg2: 0, req);
3460 if (error || req->newptr == USER_ADDR_NULL) {
3461 goto done;
3462 }
3463 /* validate data */
3464 if (!net_perf_validate_bins(bins: i)) {
3465 error = EINVAL;
3466 goto done;
3467 }
3468 ip_output_measure_bins = i;
3469done:
3470 return error;
3471}
3472
3473static int
3474sysctl_ip_output_getperf SYSCTL_HANDLER_ARGS
3475{
3476#pragma unused(oidp, arg1, arg2)
3477 if (req->oldptr == USER_ADDR_NULL) {
3478 req->oldlen = (size_t)sizeof(struct ipstat);
3479 }
3480
3481 return SYSCTL_OUT(req, &net_perf, MIN(sizeof(net_perf), req->oldlen));
3482}
3483