kern_exec.c source code [xnu/bsd/kern/kern_exec.c]

1	/*
2	* Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/* Copyright (c) 1995 NeXT Computer, Inc. All Rights Reserved */
29	/*
30	* Mach Operating System
31	* Copyright (c) 1987 Carnegie-Mellon University
32	* All rights reserved. The CMU software License Agreement specifies
33	* the terms and conditions for use and redistribution.
34	*/
35
36	/*-
37	* Copyright (c) 1982, 1986, 1991, 1993
38	* The Regents of the University of California. All rights reserved.
39	* (c) UNIX System Laboratories, Inc.
40	* All or some portions of this file are derived from material licensed
41	* to the University of California by American Telephone and Telegraph
42	* Co. or Unix System Laboratories, Inc. and are reproduced herein with
43	* the permission of UNIX System Laboratories, Inc.
44	*
45	* Redistribution and use in source and binary forms, with or without
46	* modification, are permitted provided that the following conditions
47	* are met:
48	* 1. Redistributions of source code must retain the above copyright
49	* notice, this list of conditions and the following disclaimer.
50	* 2. Redistributions in binary form must reproduce the above copyright
51	* notice, this list of conditions and the following disclaimer in the
52	* documentation and/or other materials provided with the distribution.
53	* 3. All advertising materials mentioning features or use of this software
54	* must display the following acknowledgement:
55	* This product includes software developed by the University of
56	* California, Berkeley and its contributors.
57	* 4. Neither the name of the University nor the names of its contributors
58	* may be used to endorse or promote products derived from this software
59	* without specific prior written permission.
60	*
61	* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
62	* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64	* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
65	* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
66	* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
67	* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
68	* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
69	* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
70	* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
71	* SUCH DAMAGE.
72	*
73	* from: @(#)kern_exec.c 8.1 (Berkeley) 6/10/93
74	*/
75	/*
76	* NOTICE: This file was modified by SPARTA, Inc. in 2005 to introduce
77	* support for mandatory and extensible security protections. This notice
78	* is included in support of clause 2.2 (b) of the Apple Public License,
79	* Version 2.0.
80	*/
81	#include <machine/reg.h>
82	#include <machine/cpu_capabilities.h>
83
84	#include <sys/cdefs.h>
85	#include <sys/param.h>
86	#include <sys/systm.h>
87	#include <sys/filedesc.h>
88	#include <sys/kernel.h>
89	#include <sys/proc_internal.h>
90	#include <sys/kauth.h>
91	#include <sys/user.h>
92	#include <sys/socketvar.h>
93	#include <sys/malloc.h>
94	#include <sys/namei.h>
95	#include <sys/mount_internal.h>
96	#include <sys/vnode_internal.h>
97	#include <sys/file_internal.h>
98	#include <sys/stat.h>
99	#include <sys/uio_internal.h>
100	#include <sys/acct.h>
101	#include <sys/exec.h>
102	#include <sys/kdebug.h>
103	#include <sys/signal.h>
104	#include <sys/aio_kern.h>
105	#include <sys/sysproto.h>
106	#include <sys/sysctl.h>
107	#include <sys/persona.h>
108	#include <sys/reason.h>
109	#if SYSV_SHM
110	#include <sys/shm_internal.h> /* shmexec() */
111	#endif
112	#include <sys/ubc_internal.h> /* ubc_map() */
113	#include <sys/spawn.h>
114	#include <sys/spawn_internal.h>
115	#include <sys/process_policy.h>
116	#include <sys/codesign.h>
117	#include <sys/random.h>
118	#include <crypto/sha1.h>
119
120	#include <libkern/libkern.h>
121	#include <libkern/crypto/sha2.h>
122	#include <security/audit/audit.h>
123
124	#include <ipc/ipc_types.h>
125
126	#include <mach/mach_param.h>
127	#include <mach/mach_types.h>
128	#include <mach/port.h>
129	#include <mach/task.h>
130	#include <mach/task_access.h>
131	#include <mach/thread_act.h>
132	#include <mach/vm_map.h>
133	#include <mach/mach_vm.h>
134	#include <mach/vm_param.h>
135	#include <mach_debug/mach_debug_types.h>
136
137	#include <kern/sched_prim.h> /* thread_wakeup() */
138	#include <kern/affinity.h>
139	#include <kern/assert.h>
140	#include <kern/task.h>
141	#include <kern/thread.h>
142	#include <kern/coalition.h>
143	#include <kern/policy_internal.h>
144	#include <kern/kalloc.h>
145	#include <kern/zalloc.h> /* zone_userspace_reboot_checks() */
146
147	#include <os/log.h>
148
149	#if CONFIG_MACF
150	#include <security/mac_framework.h>
151	#include <security/mac_mach_internal.h>
152	#endif
153
154	#if CONFIG_AUDIT
155	#include <bsm/audit_kevents.h>
156	#endif
157
158	#if CONFIG_ARCADE
159	#include <kern/arcade.h>
160	#endif
161
162	#include <vm/vm_map.h>
163	#include <vm/vm_kern.h>
164	#include <vm/vm_protos.h>
165	#include <vm/vm_kern.h>
166	#include <vm/vm_fault.h>
167	#include <vm/vm_pageout.h>
168	#include <vm/pmap.h>
169	#include <vm/vm_reclaim_internal.h>
170
171	#include <kdp/kdp_dyld.h>
172
173	#include <machine/machine_routines.h>
174	#include <machine/pal_routines.h>
175
176	#include <pexpert/pexpert.h>
177
178	#if CONFIG_MEMORYSTATUS
179	#include <sys/kern_memorystatus.h>
180	#endif
181
182	#include <IOKit/IOBSD.h>
183
184	#include "kern_exec_internal.h"
185
186	#include <CoreEntitlements/CoreEntitlements.h>
187
188	#include <mach/exclaves.h>
189
190	extern boolean_t vm_darkwake_mode;
191
192	/ enable crash reports on various exec failures /
193	static TUNABLE(bool, bootarg_execfailurereports, "execfailurecrashes", false);
194
195	#if XNU_TARGET_OS_OSX
196	#if __has_feature(ptrauth_calls)
197	static TUNABLE(bool, bootarg_arm64e_preview_abi, "-arm64e_preview_abi", false);
198	#endif /* __has_feature(ptrauth_calls) */
199
200	#if DEBUG \|\| DEVELOPMENT
201	static TUNABLE(bool, unentitled_ios_sim_launch, "unentitled_ios_sim_launch", false);
202	#endif /* DEBUG \|\| DEVELOPMENT */
203	#endif /* XNU_TARGET_OS_OSX */
204
#if CONFIG_DTRACE
/* Do not include dtrace.h, it redefines kmem_[alloc/free] */
extern void dtrace_proc_exec(proc_t);
extern void (*dtrace_proc_waitfor_exec_ptr)(proc_t);

/*
 * Since dtrace_proc_waitfor_exec_ptr can be added/removed in dtrace_subr.c,
 * we will store its value before actually calling it.
 */
static void (*dtrace_proc_waitfor_hook)(proc_t) = NULL;

#include <sys/dtrace_ptss.h>
#endif
218
219	#if __has_feature(ptrauth_calls)
220	static TUNABLE_DEV_WRITEABLE(int, vm_shared_region_per_team_id,
221	"vm_shared_region_per_team_id", `1`);
222	static TUNABLE_DEV_WRITEABLE(int, vm_shared_region_by_entitlement,
223	"vm_shared_region_by_entitlement", `1`);
224
225	/ Upon userland request, reslide the shared cache. /
226	static TUNABLE_DEV_WRITEABLE(int, vm_shared_region_reslide_aslr,
227	"vm_shared_region_reslide_aslr",
228	#if CONFIG_RESLIDE_SHARED_CACHE
229	`1`
230	#else
231	`0`
232	#endif /* CONFIG_RESLIDE_SHARED_CACHE */
233	);
234
235	/*
236	* Flag to control what processes should get shared cache randomize resliding
237	* after a fault in the shared cache region:
238	*
239	* 0 - all processes get a new randomized slide
240	* 1 - only platform processes get a new randomized slide
241	*/
242	TUNABLE_DEV_WRITEABLE(int, vm_shared_region_reslide_restrict,
243	"vm_shared_region_reslide_restrict", `1`);
244
245	#if DEVELOPMENT \|\| DEBUG
246	SYSCTL_INT(_vm, OID_AUTO, vm_shared_region_per_team_id,
247	CTLFLAG_RW, &vm_shared_region_per_team_id, `0`, "");
248	SYSCTL_INT(_vm, OID_AUTO, vm_shared_region_by_entitlement,
249	CTLFLAG_RW, &vm_shared_region_by_entitlement, `0`, "");
250	SYSCTL_INT(_vm, OID_AUTO, vm_shared_region_reslide_restrict,
251	CTLFLAG_RW, &vm_shared_region_reslide_restrict, `0`, "");
252	SYSCTL_INT(_vm, OID_AUTO, vm_shared_region_reslide_aslr,
253	CTLFLAG_RW, &vm_shared_region_reslide_aslr, `0`, "");
254	#endif
255	#endif /* __has_feature(ptrauth_calls) */
256
257	#if DEVELOPMENT \|\| DEBUG
258	static TUNABLE(bool, enable_dext_coredumps_on_panic, "dext_panic_coredump", true);
259	#else
260	static TUNABLE(bool, enable_dext_coredumps_on_panic, "dext_panic_coredump", false);
261	#endif
262	extern kern_return_t kern_register_userspace_coredump(task_t task, const char * name);
263	#define USERSPACE_COREDUMP_PANIC_ENTITLEMENT "com.apple.private.enable-coredump-on-panic"
264	#define USERSPACE_COREDUMP_PANIC_SEED_ENTITLEMENT \
265	"com.apple.private.enable-coredump-on-panic-seed-privacy-approved"
266
267	extern void proc_apply_task_networkbg_internal(proc_t, thread_t);
268	extern void task_set_did_exec_flag(task_t task);
269	extern void task_clear_exec_copy_flag(task_t task);
270	proc_t proc_exec_switch_task(proc_t old_proc, proc_t new_proc, task_t old_task,
271	task_t new_task, struct image_params imgp, void* **inherit);
272	boolean_t task_is_active(task_t);
273	boolean_t thread_is_active(thread_t thread);
274	void thread_copy_resource_info(thread_t dst_thread, thread_t src_thread);
275	void *ipc_importance_exec_switch_task(task_t old_task, task_t new_task);
276	extern void ipc_importance_release(void *elem);
277	extern boolean_t task_has_watchports(task_t task);
278	extern void task_set_no_smt(task_t task);
279	#if defined(HAS_APPLE_PAC)
280	char task_get_vm_shared_region_id_and_jop_pid(task_t task, uint64_t jop_pid);
281	#endif
282	task_t convert_port_to_task(ipc_port_t port);
283
284	#if CONFIG_EXCLAVES
285	int task_add_conclave(task_t task, void vnode, int64_t off, const* char *task_conclave_id);
286	kern_return_t task_inherit_conclave(task_t old_task, task_t new_task, void *vnode, int64_t off);
287	#endif /* CONFIG_EXCLAVES */
288
289
290	/*
291	* Mach things for which prototypes are unavailable from Mach headers
292	*/
293	#define IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND 0x1
294	void ipc_task_enable(
295	task_t task);
296	void ipc_task_reset(
297	task_t task);
298	void ipc_thread_reset(
299	thread_t thread);
300	kern_return_t ipc_object_copyin(
301	ipc_space_t space,
302	mach_port_name_t name,
303	mach_msg_type_name_t msgt_name,
304	ipc_object_t *objectp,
305	mach_port_context_t context,
306	mach_msg_guard_flags_t *guard_flags,
307	uint32_t kmsg_flags);
308	void ipc_port_release_send(ipc_port_t);
309
310	#if DEVELOPMENT \|\| DEBUG
311	void task_importance_update_owner_info(task_t);
312	#endif
313
314	extern struct savearea *get_user_regs(thread_t);
315
316	__attribute__((noinline)) int __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__(mach_port_t task_access_port, int32_t new_pid);
317
318	#include <kern/thread.h>
319	#include <kern/task.h>
320	#include <kern/ast.h>
321	#include <kern/mach_loader.h>
322	#include <kern/mach_fat.h>
323	#include <mach-o/fat.h>
324	#include <mach-o/loader.h>
325	#include <machine/vmparam.h>
326	#include <sys/imgact.h>
327
328	#include <sys/sdt.h>
329
330
/*
 * EAI_ITERLIMIT		The maximum number of times to iterate an image
 *				activator in exec_activate_image() before treating
 *				it as malformed/corrupt.
 */
#define EAI_ITERLIMIT           3
337
/*
 * For #! interpreter parsing
 *
 * IS_WHITESPACE(ch)	true for a space or a tab
 * IS_EOL(ch)		true for a comment character ('#') or a newline
 *
 * The argument is parenthesized so that an expression argument (for
 * example a conditional expression) is compared as a whole; it is still
 * evaluated twice, so it must not have side effects.
 */
#define IS_WHITESPACE(ch) (((ch) == ' ') || ((ch) == '\t'))
#define IS_EOL(ch) (((ch) == '#') || ((ch) == '\n'))
343
344	extern vm_map_t bsd_pageable_map;
345	extern const struct fileops vnops;
346	extern int nextpidversion;
347
348
/*
 * USER_ADDR_ALIGN(addr, val)
 *
 * Round addr (converted to user_addr_t) up to the next multiple of val.
 * The mask arithmetic is only a correct round-up when val is a power of two.
 */
#define USER_ADDR_ALIGN(addr, val) \
	( ( (user_addr_t)(addr) + (val) - 1) \
	        & ~((val) - 1) )
352
/*
 * For subsystem root support
 */
#define SPAWN_SUBSYSTEM_ROOT_ENTITLEMENT "com.apple.private.spawn-subsystem-root"

/*
 * Allow setting p_crash_behavior to trigger panic on crash
 */
#define SPAWN_SET_PANIC_CRASH_BEHAVIOR "com.apple.private.spawn-panic-crash-behavior"
362
363	/ Platform Code Exec Logging /
364	static int platform_exec_logging = `0`;
365
366	SYSCTL_DECL(_security_mac);
367
368	SYSCTL_INT(_security_mac, OID_AUTO, platform_exec_logging, CTLFLAG_RW, &platform_exec_logging, `0`,
369	"log cdhashes for all platform binary executions");
370
371	static os_log_t peLog = OS_LOG_DEFAULT;
372
373	struct exception_port_action_t {
374	ipc_port_t port;
375	_ps_port_action_t *port_action;
376	};
377
378	struct exec_port_actions {
379	uint32_t exception_port_count;
380	uint32_t portwatch_count;
381	uint32_t registered_count;
382	struct exception_port_action_t *excport_array;
383	ipc_port_t *portwatch_array;
384	ipc_port_t *registered_array;
385	};
386
387	struct image_params; / Forward /
388	static int exec_activate_image(struct image_params *imgp);
389	static int exec_copyout_strings(struct image_params imgp, user_addr_t stackp);
390	static int load_return_to_errno(load_return_t lrtn);
391	static int execargs_alloc(struct image_params *imgp);
392	static int execargs_free(struct image_params *imgp);
393	static int exec_check_permissions(struct image_params *imgp);
394	static int exec_extract_strings(struct image_params *imgp);
395	static int exec_add_apple_strings(struct image_params imgp, const* load_result_t *load_result);
396	static int exec_handle_sugid(struct image_params *imgp);
397	static int sugid_scripts = `0`;
398	SYSCTL_INT(_kern, OID_AUTO, sugid_scripts, CTLFLAG_RW \| CTLFLAG_LOCKED, &sugid_scripts, `0`, "");
399	static kern_return_t create_unix_stack(vm_map_t map, load_result_t* load_result, proc_t p);
400	static int copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size);
401	static void exec_resettextvp(proc_t, struct image_params *);
402	static int process_signature(proc_t, struct image_params *);
403	static void exec_prefault_data(proc_t, struct image_params , load_result_t );
404	static errno_t exec_handle_port_actions(struct image_params *imgp,
405	struct exec_port_actions *port_actions);
406	static errno_t exec_handle_exception_port_actions(const struct image_params *imgp,
407	const struct exec_port_actions *port_actions);
408	static errno_t exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_apptype, uint64_t psa_qos_clamp,
409	task_role_t psa_darwin_role, struct exec_port_actions *port_actions);
410	static void exec_port_actions_destroy(struct exec_port_actions *port_actions);
411
412	/*
413	* exec_add_user_string
414	*
415	* Add the requested string to the string space area.
416	*
417	* Parameters; struct image_params * image parameter block
418	* user_addr_t string to add to strings area
419	* int segment from which string comes
420	* boolean_t TRUE if string contributes to NCARGS
421	*
422	* Returns: 0 Success
423	* !0 Failure errno from copyinstr()
424	*
425	* Implicit returns:
426	* (imgp->ip_strendp) updated location of next add, if any
427	* (imgp->ip_strspace) updated byte count of space remaining
428	* (imgp->ip_argspace) updated byte count of space in NCARGS
429	*/
430	__attribute__((noinline))
431	static int
432	exec_add_user_string(struct image_params imgp, user_addr_t str, int* seg, boolean_t is_ncargs)
433	{
434	int error = `0`;
435
436	do {
437	size_t len = `0`;
438	int space;
439
440	if (is_ncargs) {
441	space = imgp->ip_argspace; / by definition smaller than ip_strspace /
442	} else {
443	space = imgp->ip_strspace;
444	}
445
446	if (space <= `0`) {
447	error = E2BIG;
448	break;
449	}
450
451	if (!UIO_SEG_IS_USER_SPACE(seg)) {
452	char kstr = CAST_DOWN(char* , str); /* SAFE /
453	error = copystr(kfaddr: kstr, kdaddr: imgp->ip_strendp, len: space, done: &len);
454	} else {
455	error = copyinstr(uaddr: str, kaddr: imgp->ip_strendp, len: space, done: &len);
456	}
457
458	imgp->ip_strendp += len;
459	imgp->ip_strspace -= len;
460	if (is_ncargs) {
461	imgp->ip_argspace -= len;
462	}
463	} while (error == ENAMETOOLONG);
464
465	return error;
466	}
467
/*
 * dyld is now passed the executable path as a getenv-like variable
 * in the same fashion as the stack_guard and malloc_entropy keys.
 */
#define EXECUTABLE_KEY "executable_path="
473
474	/*
475	* exec_save_path
476	*
477	* To support new app package launching for Mac OS X, the dyld needs the
478	* first argument to execve() stored on the user stack.
479	*
480	* Save the executable path name at the bottom of the strings area and set
481	* the argument vector pointer to the location following that to indicate
482	* the start of the argument and environment tuples, setting the remaining
483	* string space count to the size of the string area minus the path length.
484	*
485	* Parameters; struct image_params * image parameter block
486	* char * path used to invoke program
487	* int segment from which path comes
488	*
489	* Returns: int 0 Success
490	* EFAULT Bad address
491	* copy[in]str:EFAULT Bad address
492	* copy[in]str:ENAMETOOLONG Filename too long
493	*
494	* Implicit returns:
495	* (imgp->ip_strings) saved path
496	* (imgp->ip_strspace) space remaining in ip_strings
497	* (imgp->ip_strendp) start of remaining copy area
498	* (imgp->ip_argspace) space remaining of NCARGS
499	* (imgp->ip_applec) Initial applev[0]
500	*
501	* Note: We have to do this before the initial namei() since in the
502	* path contains symbolic links, namei() will overwrite the
503	* original path buffer contents. If the last symbolic link
504	* resolved was a relative pathname, we would lose the original
505	* "path", which could be an absolute pathname. This might be
506	* unacceptable for dyld.
507	*/
508	static int
509	exec_save_path(struct image_params imgp, user_addr_t path, int* seg, const char **excpath)
510	{
511	int error;
512	size_t len;
513	char *kpath;
514
515	// imgp->ip_strings can come out of a cache, so we need to obliterate the
516	// old path.
517	memset(s: imgp->ip_strings, c: `'\0'`, n: strlen(EXECUTABLE_KEY) + MAXPATHLEN);
518
519	len = MIN(MAXPATHLEN, imgp->ip_strspace);
520
521	switch (seg) {
522	case UIO_USERSPACE32:
523	case UIO_USERSPACE64: / Same for copyin()... /
524	error = copyinstr(uaddr: path, kaddr: imgp->ip_strings + strlen(EXECUTABLE_KEY), len, done: &len);
525	break;
526	case UIO_SYSSPACE:
527	kpath = CAST_DOWN(char , path); /* SAFE /
528	error = copystr(kfaddr: kpath, kdaddr: imgp->ip_strings + strlen(EXECUTABLE_KEY), len, done: &len);
529	break;
530	default:
531	error = EFAULT;
532	break;
533	}
534
535	if (!error) {
536	bcopy(EXECUTABLE_KEY, dst: imgp->ip_strings, n: strlen(EXECUTABLE_KEY));
537	len += strlen(EXECUTABLE_KEY);
538
539	imgp->ip_strendp += len;
540	imgp->ip_strspace -= len;
541
542	if (excpath) {
543	*excpath = imgp->ip_strings + strlen(EXECUTABLE_KEY);
544	}
545	}
546
547	return error;
548	}
549
550	/*
551	* exec_reset_save_path
552	*
553	* If we detect a shell script, we need to reset the string area
554	* state so that the interpreter can be saved onto the stack.
555	*
556	* Parameters; struct image_params * image parameter block
557	*
558	* Returns: int 0 Success
559	*
560	* Implicit returns:
561	* (imgp->ip_strings) saved path
562	* (imgp->ip_strspace) space remaining in ip_strings
563	* (imgp->ip_strendp) start of remaining copy area
564	* (imgp->ip_argspace) space remaining of NCARGS
565	*
566	*/
567	static int
568	exec_reset_save_path(struct image_params *imgp)
569	{
570	imgp->ip_strendp = imgp->ip_strings;
571	imgp->ip_argspace = NCARGS;
572	imgp->ip_strspace = (NCARGS + PAGE_SIZE);
573
574	return `0`;
575	}
576
577	/*
578	* exec_shell_imgact
579	*
580	* Image activator for interpreter scripts. If the image begins with
581	* the characters "#!", then it is an interpreter script. Verify the
582	* length of the script line indicating the interpreter is not in
583	* excess of the maximum allowed size. If this is the case, then
584	* break out the arguments, if any, which are separated by white
585	* space, and copy them into the argument save area as if they were
586	* provided on the command line before all other arguments. The line
587	* ends when we encounter a comment character ('#') or newline.
588	*
589	* Parameters; struct image_params * image parameter block
590	*
591	* Returns: -1 not an interpreter (keep looking)
592	* -3 Success: interpreter: relookup
593	* >0 Failure: interpreter: error number
594	*
595	* A return value other than -1 indicates subsequent image activators should
596	* not be given the opportunity to attempt to activate the image.
597	*/
598	static int
599	exec_shell_imgact(struct image_params *imgp)
600	{
601	char *vdata = imgp->ip_vdata;
602	char *ihp;
603	char line_startp, line_endp;
604	char *interp;
605
606	/*
607	* Make sure it's a shell script. If we've already redirected
608	* from an interpreted file once, don't do it again.
609	*/
610	if (vdata[`0`] != `'#'` \|\|
611	vdata[`1`] != `'!'` \|\|
612	(imgp->ip_flags & IMGPF_INTERPRET) != `0`) {
613	return -`1`;
614	}
615
616	if (imgp->ip_origcputype != `0`) {
617	/ Fat header previously matched, don't allow shell script inside /
618	return -`1`;
619	}
620
621	imgp->ip_flags \|= IMGPF_INTERPRET;
622	imgp->ip_interp_sugid_fd = -`1`;
623	imgp->ip_interp_buffer[`0`] = `'\0'`;
624
625	/ Check to see if SUGID scripts are permitted. If they aren't then*
626	* clear the SUGID bits.
627	* imgp->ip_vattr is known to be valid.
628	*/
629	if (sugid_scripts == `0`) {
630	imgp->ip_origvattr->va_mode &= ~(VSUID \| VSGID);
631	}
632
633	/ Try to find the first non-whitespace character /
634	for (ihp = &vdata[`2`]; ihp < &vdata[IMG_SHSIZE]; ihp++) {
635	if (IS_EOL(*ihp)) {
636	/ Did not find interpreter, "#!\n" /
637	return ENOEXEC;
638	} else if (IS_WHITESPACE(*ihp)) {
639	/ Whitespace, like "#! /bin/sh\n", keep going. /
640	} else {
641	/ Found start of interpreter /
642	break;
643	}
644	}
645
646	if (ihp == &vdata[IMG_SHSIZE]) {
647	/ All whitespace, like "#! " /
648	return ENOEXEC;
649	}
650
651	line_startp = ihp;
652
653	/ Try to find the end of the interpreter+args string /
654	for (; ihp < &vdata[IMG_SHSIZE]; ihp++) {
655	if (IS_EOL(*ihp)) {
656	/ Got it /
657	break;
658	} else {
659	/ Still part of interpreter or args /
660	}
661	}
662
663	if (ihp == &vdata[IMG_SHSIZE]) {
664	/ A long line, like "#! blah blah blah" without end /
665	return ENOEXEC;
666	}
667
668	/ Backtrack until we find the last non-whitespace /
669	while (IS_EOL(ihp) \|\| IS_WHITESPACE(ihp)) {
670	ihp--;
671	}
672
673	/ The character after the last non-whitespace is our logical end of line /
674	line_endp = ihp + `1`;
675
676	/*
677	* Now we have pointers to the usable part of:
678	*
679	* "#! /usr/bin/int first second third \n"
680	* ^ line_startp ^ line_endp
681	*/
682
683	/ copy the interpreter name /
684	interp = imgp->ip_interp_buffer;
685	for (ihp = line_startp; (ihp < line_endp) && !IS_WHITESPACE(*ihp); ihp++) {
686	interp++ = ihp;
687	}
688	*interp = `'\0'`;
689
690	exec_reset_save_path(imgp);
691	exec_save_path(imgp, CAST_USER_ADDR_T(imgp->ip_interp_buffer),
692	seg: UIO_SYSSPACE, NULL);
693
694	/ Copy the entire interpreter + args for later processing into argv[] /
695	interp = imgp->ip_interp_buffer;
696	for (ihp = line_startp; (ihp < line_endp); ihp++) {
697	interp++ = ihp;
698	}
699	*interp = `'\0'`;
700
701	#if CONFIG_SETUID
702	/*
703	* If we have an SUID or SGID script, create a file descriptor
704	* from the vnode and pass /dev/fd/%d instead of the actual
705	* path name so that the script does not get opened twice
706	*/
707	if (imgp->ip_origvattr->va_mode & (VSUID \| VSGID)) {
708	proc_t p;
709	struct fileproc *fp;
710	int fd;
711	int error;
712
713	p = vfs_context_proc(ctx: imgp->ip_vfs_context);
714	error = falloc_exec(p, imgp->ip_vfs_context, &fp, &fd);
715	if (error) {
716	return error;
717	}
718
719	fp->fp_glob->fg_flag = FREAD;
720	fp->fp_glob->fg_ops = &vnops;
721	fp_set_data(fp, fg_data: imgp->ip_vp);
722
723	proc_fdlock(p);
724	procfdtbl_releasefd(p, fd, NULL);
725	fp_drop(p, fd, fp, locked: `1`);
726	proc_fdunlock(p);
727	vnode_ref(vp: imgp->ip_vp);
728
729	imgp->ip_interp_sugid_fd = fd;
730	}
731	#endif /* CONFIG_SETUID */
732
733	return -`3`;
734	}
735
736
737
738	/*
739	* exec_fat_imgact
740	*
741	* Image activator for fat 1.0 binaries. If the binary is fat, then we
742	* need to select an image from it internally, and make that the image
743	* we are going to attempt to execute. At present, this consists of
744	* reloading the first page for the image with a first page from the
745	* offset location indicated by the fat header.
746	*
747	* Parameters; struct image_params * image parameter block
748	*
749	* Returns: -1 not a fat binary (keep looking)
750	* -2 Success: encapsulated binary: reread
751	* >0 Failure: error number
752	*
753	* Important: This image activator is byte order neutral.
754	*
755	* Note: A return value other than -1 indicates subsequent image
756	* activators should not be given the opportunity to attempt
757	* to activate the image.
758	*
759	* If we find an encapsulated binary, we make no assertions
760	* about its validity; instead, we leave that up to a rescan
761	* for an activator to claim it, and, if it is claimed by one,
762	* that activator is responsible for determining validity.
763	*/
764	static int
765	exec_fat_imgact(struct image_params *imgp)
766	{
767	proc_t p = vfs_context_proc(ctx: imgp->ip_vfs_context);
768	kauth_cred_t cred = kauth_cred_proc_ref(procp: p);
769	struct fat_header fat_header = (struct* fat_header *)imgp->ip_vdata;
770	struct _posix_spawnattr *psa = NULL;
771	struct fat_arch fat_arch;
772	int resid, error;
773	load_return_t lret;
774
775	if (imgp->ip_origcputype != `0`) {
776	/ Fat header previously matched, don't allow another fat file inside /
777	error = -`1`; / not claimed /
778	goto bad;
779	}
780
781	/ Make sure it's a fat binary /
782	if (OSSwapBigToHostInt32(fat_header->magic) != FAT_MAGIC) {
783	error = -`1`; / not claimed /
784	goto bad;
785	}
786
787	/ imgp->ip_vdata has PAGE_SIZE, zerofilled if the file is smaller /
788	lret = fatfile_validate_fatarches(data_ptr: (vm_offset_t)fat_header, PAGE_SIZE,
789	file_size: (off_t)imgp->ip_vattr->va_data_size);
790	if (lret != LOAD_SUCCESS) {
791	error = load_return_to_errno(lrtn: lret);
792	goto bad;
793	}
794
795	/ If posix_spawn binprefs exist, respect those prefs. /
796	psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
797	if (psa != NULL && psa->psa_binprefs[`0`] != `0`) {
798	uint32_t pr = `0`;
799
800	/ Check each preference listed against all arches in header /
801	for (pr = `0`; pr < NBINPREFS; pr++) {
802	cpu_type_t pref = psa->psa_binprefs[pr];
803	cpu_type_t subpref = psa->psa_subcpuprefs[pr];
804
805	if (pref == `0`) {
806	/ No suitable arch in the pref list /
807	error = EBADARCH;
808	goto bad;
809	}
810
811	if (pref == CPU_TYPE_ANY) {
812	/ Fall through to regular grading /
813	goto regular_grading;
814	}
815
816	lret = fatfile_getbestarch_for_cputype(cputype: pref,
817	cpusubtype: subpref,
818	data_ptr: (vm_offset_t)fat_header,
819	PAGE_SIZE,
820	imgp,
821	archret: &fat_arch);
822	if (lret == LOAD_SUCCESS) {
823	goto use_arch;
824	}
825	}
826
827	/ Requested binary preference was not honored /
828	error = EBADEXEC;
829	goto bad;
830	}
831
832	regular_grading:
833	/ Look up our preferred architecture in the fat file. /
834	lret = fatfile_getbestarch(data_ptr: (vm_offset_t)fat_header,
835	PAGE_SIZE,
836	imgp,
837	archret: &fat_arch,
838	affinity: (p->p_flag & P_AFFINITY) != `0`);
839	if (lret != LOAD_SUCCESS) {
840	error = load_return_to_errno(lrtn: lret);
841	goto bad;
842	}
843
844	use_arch:
845	/ Read the Mach-O header out of fat_arch /
846	error = vn_rdwr(rw: UIO_READ, vp: imgp->ip_vp, base: imgp->ip_vdata,
847	PAGE_SIZE, offset: fat_arch.offset,
848	segflg: UIO_SYSSPACE, ioflg: (IO_UNIT \| IO_NODELOCKED),
849	cred, aresid: &resid, p);
850	if (error) {
851	goto bad;
852	}
853
854	if (resid) {
855	memset(s: imgp->ip_vdata + (PAGE_SIZE - resid), c: `0x0`, n: resid);
856	}
857
858	/ Success. Indicate we have identified an encapsulated binary /
859	error = -`2`;
860	imgp->ip_arch_offset = (user_size_t)fat_arch.offset;
861	imgp->ip_arch_size = (user_size_t)fat_arch.size;
862	imgp->ip_origcputype = fat_arch.cputype;
863	imgp->ip_origcpusubtype = fat_arch.cpusubtype;
864
865	bad:
866	kauth_cred_unref(&cred);
867	return error;
868	}
869
870	static int
871	activate_exec_state(task_t task, proc_t p, thread_t thread, load_result_t *result)
872	{
873	int ret;
874
875	(void)task_set_dyld_info(task, MACH_VM_MIN_ADDRESS, size: `0`);
876	task_set_64bit(task, is_64bit: result->is_64bit_addr, is_64bit_data: result->is_64bit_data);
877	if (result->is_64bit_addr) {
878	OSBitOrAtomic(P_LP64, &p->p_flag);
879	get_bsdthread_info(thread)->uu_flag \|= UT_LP64;
880	} else {
881	OSBitAndAtomic(~((uint32_t)P_LP64), &p->p_flag);
882	get_bsdthread_info(thread)->uu_flag &= ~UT_LP64;
883	}
884	task_set_mach_header_address(task, addr: result->mach_header);
885
886	ret = thread_state_initialize(thread);
887	if (ret != KERN_SUCCESS) {
888	return ret;
889	}
890
891	if (result->threadstate) {
892	uint32_t *ts = result->threadstate;
893	uint32_t total_size = (uint32_t)result->threadstate_sz;
894
895	while (total_size > `0`) {
896	uint32_t flavor = *ts++;
897	uint32_t size = *ts++;
898
899	ret = thread_setstatus(thread, flavor, tstate: (thread_state_t)ts, count: size);
900	if (ret) {
901	return ret;
902	}
903	ts += size;
904	total_size -= (size + `2`) * sizeof(uint32_t);
905	}
906	}
907
908	thread_setentrypoint(thread, entry: result->entry_point);
909
910	return KERN_SUCCESS;
911	}
912
#if (DEVELOPMENT || DEBUG)
/*
 * Process names, and the flags enabling them, that set_proc_name()
 * compares against p_comm to turn on the panic-on-crash/exit/spawn-fail
 * behaviors.
 */
extern char panic_on_proc_crash[];
extern int use_panic_on_proc_crash;

extern char panic_on_proc_exit[];
extern int use_panic_on_proc_exit;

extern char panic_on_proc_spawn_fail[];
extern int use_panic_on_proc_spawn_fail;
#endif
923
924	void
925	set_proc_name(struct image_params *imgp, proc_t p)
926	{
927	int p_name_len = sizeof(p->p_name) - `1`;
928
929	if (imgp->ip_ndp->ni_cnd.cn_namelen > p_name_len) {
930	imgp->ip_ndp->ni_cnd.cn_namelen = p_name_len;
931	}
932
933	bcopy(src: (caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, dst: (caddr_t)p->p_name,
934	n: (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
935	p->p_name[imgp->ip_ndp->ni_cnd.cn_namelen] = `'\0'`;
936
937	if (imgp->ip_ndp->ni_cnd.cn_namelen > MAXCOMLEN) {
938	imgp->ip_ndp->ni_cnd.cn_namelen = MAXCOMLEN;
939	}
940
941	bcopy(src: (caddr_t)imgp->ip_ndp->ni_cnd.cn_nameptr, dst: (caddr_t)p->p_comm,
942	n: (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen);
943	p->p_comm[imgp->ip_ndp->ni_cnd.cn_namelen] = `'\0'`;
944
945	#if DEVELOPMENT \|\| DEBUG
946	/*
947	* This happens during image activation, so the crash behavior flags from
948	* posix_spawn will have already been set. So we don't have to worry about
949	* this being overridden.
950	*/
951	if (use_panic_on_proc_crash && strcmp(p->p_comm, panic_on_proc_crash) == `0`) {
952	printf("will panic on proc crash: %s\n", p->p_comm);
953	p->p_crash_behavior \|= POSIX_SPAWN_PANIC_ON_CRASH;
954	}
955
956	if (use_panic_on_proc_exit && strcmp(p->p_comm, panic_on_proc_exit) == `0`) {
957	printf("will panic on proc exit: %s\n", p->p_comm);
958	p->p_crash_behavior \|= POSIX_SPAWN_PANIC_ON_EXIT;
959	}
960
961	if (use_panic_on_proc_spawn_fail && strcmp(p->p_comm, panic_on_proc_spawn_fail) == `0`) {
962	printf("will panic on proc spawn fail: %s\n", p->p_comm);
963	p->p_crash_behavior \|= POSIX_SPAWN_PANIC_ON_SPAWN_FAIL;
964	}
965	#endif
966	}
967
968	#if __has_feature(ptrauth_calls)
969	/**
970	* Returns a team ID string that may be used to assign a shared region.
971	*
972	* Platform binaries do not have team IDs and will return NULL. Non-platform
973	* binaries without a team ID will be assigned an artificial team ID of ""
974	* (empty string) so that they will not be assigned to the default shared
975	* region.
976	*
977	* @param imgp image parameter block
978	* @return NULL if this is a platform binary, or an appropriate team ID string
979	* otherwise
980	*/
981	static inline const char *
982	get_teamid_for_shared_region(struct image_params *imgp)
983	{
984	assert(imgp->ip_vp != NULL);
985
986	const char *ret = csvnode_get_teamid(imgp->ip_vp, imgp->ip_arch_offset);
987	if (ret) {
988	return ret;
989	}
990
991	struct cs_blob *blob = csvnode_get_blob(imgp->ip_vp, imgp->ip_arch_offset);
992	if (csblob_get_platform_binary(blob)) {
993	return NULL;
994	} else {
995	static const char *NO_TEAM_ID = "";
996	return NO_TEAM_ID;
997	}
998	}
999
1000	/**
1001	* Determines whether ptrauth should be enabled for the provided arm64 CPU subtype.
1002	*
1003	* @param cpusubtype Mach-O style CPU subtype
1004	* @return whether the CPU subtype matches arm64e with the current ptrauth ABI
1005	*/
1006	static inline bool
1007	arm64_cpusubtype_uses_ptrauth(cpu_subtype_t cpusubtype)
1008	{
1009	return (cpusubtype & ~CPU_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E &&
1010	CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(cpusubtype) == CPU_SUBTYPE_ARM64_PTR_AUTH_CURRENT_VERSION;
1011	}
1012
1013	#endif /* __has_feature(ptrauth_calls) */
1014
1015	/**
1016	* Returns whether a type/subtype slice matches the requested
1017	* type/subtype.
1018	*
1019	* @param mask Bits to mask from the requested/tested cpu type
1020	* @param req_cpu Requested cpu type
1021	* @param req_subcpu Requested cpu subtype
1022	* @param test_cpu Tested slice cpu type
1023	* @param test_subcpu Tested slice cpu subtype
1024	*/
1025	boolean_t
1026	binary_match(cpu_type_t mask, cpu_type_t req_cpu,
1027	cpu_subtype_t req_subcpu, cpu_type_t test_cpu,
1028	cpu_subtype_t test_subcpu)
1029	{
1030	if ((test_cpu & ~mask) != (req_cpu & ~mask)) {
1031	return FALSE;
1032	}
1033
1034	test_subcpu &= ~CPU_SUBTYPE_MASK;
1035	req_subcpu &= ~CPU_SUBTYPE_MASK;
1036
1037	if (test_subcpu != req_subcpu && req_subcpu != (CPU_SUBTYPE_ANY & ~CPU_SUBTYPE_MASK)) {
1038	return FALSE;
1039	}
1040
1041	return TRUE;
1042	}
1043
1044
1045	#define MIN_IOS_TPRO_SDK_VERSION 0x00100000
1046	#define MIN_OSX_TPRO_SDK_VERSION 0x000D0000
1047	#define MIN_TVOS_TPRO_SDK_VERSION 0x000D0000
1048	#define MIN_WATCHOS_TPRO_SDK_VERSION 0x00090000
1049	#define MIN_DRIVERKIT_TPRO_SDK_VERSION 0x00600000
1050
1051	static void
1052	exec_setup_tpro(struct image_params imgp, load_result_t load_result)
1053	{
1054	extern boolean_t xprr_tpro_enabled;
1055	extern boolean_t enable_user_modifiable_perms;
1056	uint32_t min_sdk_version = `0`;
1057
1058	/ x86-64 translated code cannot take advantage of TPRO /
1059	if (imgp->ip_flags & IMGPF_ROSETTA) {
1060	return;
1061	}
1062
1063	/ Do not enable on 32-bit VA targets /
1064	if (!(imgp->ip_flags & IMGPF_IS_64BIT_ADDR)) {
1065	return;
1066	}
1067
1068	switch (load_result->ip_platform) {
1069	case PLATFORM_IOS:
1070	case PLATFORM_IOSSIMULATOR:
1071	case PLATFORM_MACCATALYST:
1072	min_sdk_version = MIN_IOS_TPRO_SDK_VERSION;
1073	break;
1074	case PLATFORM_MACOS:
1075	min_sdk_version = MIN_OSX_TPRO_SDK_VERSION;
1076	break;
1077	case PLATFORM_TVOS:
1078	case PLATFORM_TVOSSIMULATOR:
1079	min_sdk_version = MIN_TVOS_TPRO_SDK_VERSION;
1080	break;
1081	case PLATFORM_WATCHOS:
1082	case PLATFORM_WATCHOSSIMULATOR:
1083	min_sdk_version = MIN_WATCHOS_TPRO_SDK_VERSION;
1084	break;
1085	case PLATFORM_DRIVERKIT:
1086	min_sdk_version = MIN_DRIVERKIT_TPRO_SDK_VERSION;
1087	break;
1088	default:
1089	/ TPRO is on by default for newer platforms /
1090	break;
1091	}
1092
1093	}
1094
1095	/*
1096	* If the passed in executable's vnode should use the RSR
1097	* shared region, then this should return TRUE, otherwise, return FALSE.
1098	*/
1099	static uint32_t rsr_current_version = `0`;
1100	boolean_t (rsr_check_vnode)(void* *vnode) = NULL;
1101
1102	boolean_t
1103	vnode_is_rsr(vnode_t vp)
1104	{
1105	if (!(vnode_isreg(vp) && vnode_tag(vp) == VT_APFS)) {
1106	return FALSE;
1107	}
1108
1109	if (rsr_check_vnode != NULL && rsr_check_vnode((void *)vp)) {
1110	return TRUE;
1111	}
1112	return FALSE;
1113	}
1114
1115
1116	static inline void
1117	encode_HR_entitlement(const char *entitlement, HR_flags_t mask,
1118	const struct image_params imgp, load_result_t load_result)
1119	{
1120	if (IOVnodeHasEntitlement(vnode: imgp->ip_vp, off: (int64_t)imgp->ip_arch_offset, entitlement)) {
1121	load_result->hardened_runtime_binary \|= mask;
1122	}
1123	}
1124
1125	uint32_t
1126	rsr_get_version(void)
1127	{
1128	return os_atomic_load(&rsr_current_version, relaxed);
1129	}
1130
1131	void
1132	rsr_bump_version(void)
1133	{
1134	os_atomic_inc(&rsr_current_version, relaxed);
1135	}
1136
1137	#if XNU_TARGET_OS_OSX
1138	static int
1139	rsr_version_sysctl SYSCTL_HANDLER_ARGS
1140	{
1141	#pragma unused(arg1, arg2, oidp)
1142	int value = rsr_get_version();
1143	int error = SYSCTL_OUT(req, &value, sizeof(int));
1144	if (error) {
1145	return error;
1146	}
1147
1148	if (!req->newptr) {
1149	return `0`;
1150	}
1151
1152	error = SYSCTL_IN(req, &value, sizeof(int));
1153	if (error) {
1154	return error;
1155	}
1156	if (value != `0`) {
1157	rsr_bump_version();
1158	}
1159	return `0`;
1160	}
1161
1162
1163	SYSCTL_PROC(_vm, OID_AUTO, shared_region_control,
1164	CTLTYPE_INT \| CTLFLAG_RW \| CTLFLAG_LOCKED \| CTLFLAG_MASKED,
1165	`0`, `0`, rsr_version_sysctl, "I", "");
1166	#endif /* XNU_TARGET_OS_OSX */
1167
1168	/*
1169	* exec_mach_imgact
1170	*
1171	* Image activator for mach-o 1.0 binaries.
1172	*
1173	* Parameters; struct image_params * image parameter block
1174	*
1175	* Returns: -1 not a fat binary (keep looking)
1176	* -2 Success: encapsulated binary: reread
1177	* >0 Failure: error number
1178	* EBADARCH Mach-o binary, but with an unrecognized
1179	* architecture
1180	* ENOMEM No memory for child process after -
1181	* can only happen after vfork()
1182	*
1183	* Important: This image activator is NOT byte order neutral.
1184	*
1185	* Note: A return value other than -1 indicates subsequent image
1186	* activators should not be given the opportunity to attempt
1187	* to activate the image.
1188	*/
1189	static int
1190	exec_mach_imgact(struct image_params *imgp)
1191	{
1192	struct mach_header mach_header = (struct* mach_header *)imgp->ip_vdata;
1193	proc_t p = vfs_context_proc(ctx: imgp->ip_vfs_context);
1194	int error = `0`;
1195	task_t task;
1196	task_t new_task = NULL; / protected by vfexec /
1197	thread_t thread;
1198	struct uthread *uthread;
1199	vm_map_t old_map = VM_MAP_NULL;
1200	vm_map_t map = VM_MAP_NULL;
1201	load_return_t lret;
1202	load_result_t load_result = {};
1203	struct _posix_spawnattr *psa = NULL;
1204	int spawn = (imgp->ip_flags & IMGPF_SPAWN);
1205	const int vfexec = `0`;
1206	int exec = (imgp->ip_flags & IMGPF_EXEC);
1207	os_reason_t exec_failure_reason = OS_REASON_NULL;
1208	boolean_t reslide = FALSE;
1209	char * userspace_coredump_name = NULL;
1210
1211	/*
1212	* make sure it's a Mach-O 1.0 or Mach-O 2.0 binary; the difference
1213	* is a reserved field on the end, so for the most part, we can
1214	* treat them as if they were identical. Reverse-endian Mach-O
1215	* binaries are recognized but not compatible.
1216	*/
1217	if ((mach_header->magic == MH_CIGAM) \|\|
1218	(mach_header->magic == MH_CIGAM_64)) {
1219	error = EBADARCH;
1220	goto bad;
1221	}
1222
1223	if ((mach_header->magic != MH_MAGIC) &&
1224	(mach_header->magic != MH_MAGIC_64)) {
1225	error = -`1`;
1226	goto bad;
1227	}
1228
1229	if (mach_header->filetype != MH_EXECUTE) {
1230	error = -`1`;
1231	goto bad;
1232	}
1233
1234	if (imgp->ip_origcputype != `0`) {
1235	/ Fat header previously had an idea about this thin file /
1236	if (imgp->ip_origcputype != mach_header->cputype \|\|
1237	imgp->ip_origcpusubtype != mach_header->cpusubtype) {
1238	error = EBADARCH;
1239	goto bad;
1240	}
1241	} else {
1242	imgp->ip_origcputype = mach_header->cputype;
1243	imgp->ip_origcpusubtype = mach_header->cpusubtype;
1244	}
1245
1246	task = current_task();
1247	thread = current_thread();
1248	uthread = get_bsdthread_info(thread);
1249
1250	if ((mach_header->cputype & CPU_ARCH_ABI64) == CPU_ARCH_ABI64) {
1251	imgp->ip_flags \|= IMGPF_IS_64BIT_ADDR \| IMGPF_IS_64BIT_DATA;
1252	}
1253
1254
1255	/ If posix_spawn binprefs exist, respect those prefs. /
1256	psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
1257	if (psa != NULL && psa->psa_binprefs[`0`] != `0`) {
1258	int pr = `0`;
1259	for (pr = `0`; pr < NBINPREFS; pr++) {
1260	cpu_type_t pref = psa->psa_binprefs[pr];
1261	cpu_subtype_t subpref = psa->psa_subcpuprefs[pr];
1262
1263	if (pref == `0`) {
1264	/ No suitable arch in the pref list /
1265	error = EBADARCH;
1266	goto bad;
1267	}
1268
1269	if (pref == CPU_TYPE_ANY) {
1270	/ Jump to regular grading /
1271	goto grade;
1272	}
1273
1274	if (binary_match(CPU_ARCH_MASK, req_cpu: pref, req_subcpu: subpref,
1275	test_cpu: imgp->ip_origcputype, test_subcpu: imgp->ip_origcpusubtype)) {
1276	goto grade;
1277	}
1278	}
1279	error = EBADARCH;
1280	goto bad;
1281	}
1282	grade:
1283	if (!grade_binary(imgp->ip_origcputype, imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK,
1284	imgp->ip_origcpusubtype & CPU_SUBTYPE_MASK, TRUE)) {
1285	error = EBADARCH;
1286	goto bad;
1287	}
1288
1289	if (validate_potential_simulator_binary(exectype: imgp->ip_origcputype, imgp,
1290	file_offset: imgp->ip_arch_offset, macho_size: imgp->ip_arch_size) != LOAD_SUCCESS) {
1291	#if __x86_64__
1292	const char *excpath;
1293	error = exec_save_path(imgp, imgp->ip_user_fname, imgp->ip_seg, &excpath);
1294	os_log_error(OS_LOG_DEFAULT, "Unsupported 32-bit executable: \"%s\"", (error) ? imgp->ip_vp->v_name : excpath);
1295	#endif
1296	error = EBADARCH;
1297	goto bad;
1298	}
1299
1300	#if defined(HAS_APPLE_PAC)
1301	assert(mach_header->cputype == CPU_TYPE_ARM64
1302	);
1303
1304	if ((mach_header->cputype == CPU_TYPE_ARM64 &&
1305	arm64_cpusubtype_uses_ptrauth(mach_header->cpusubtype))
1306	) {
1307	imgp->ip_flags &= ~IMGPF_NOJOP;
1308	} else {
1309	imgp->ip_flags \|= IMGPF_NOJOP;
1310	}
1311	#endif
1312
1313	/ Copy in arguments/environment from the old process /
1314	error = exec_extract_strings(imgp);
1315	if (error) {
1316	goto bad;
1317	}
1318
1319	AUDIT_ARG(argv, imgp->ip_startargv, imgp->ip_argc,
1320	imgp->ip_endargv - imgp->ip_startargv);
1321	AUDIT_ARG(envv, imgp->ip_endargv, imgp->ip_envc,
1322	imgp->ip_endenvv - imgp->ip_endargv);
1323
1324
1325
1326	/ reset local idea of thread, uthread, task /
1327	thread = imgp->ip_new_thread;
1328	uthread = get_bsdthread_info(thread);
1329	task = new_task = get_threadtask(thread);
1330
1331	/*
1332	* Load the Mach-O file.
1333	*
1334	* NOTE: An error after this point indicates we have potentially
1335	* destroyed or overwritten some process state while attempting an
1336	* execve() following a vfork(), which is an unrecoverable condition.
1337	* We send the new process an immediate SIGKILL to avoid it executing
1338	* any instructions in the mutated address space. For true spawns,
1339	* this is not the case, and "too late" is still not too late to
1340	* return an error code to the parent process.
1341	*/
1342
1343	/*
1344	* Actually load the image file we previously decided to load.
1345	*/
1346	lret = load_machfile(imgp, header: mach_header, thread, mapp: &map, result: &load_result);
1347	if (lret != LOAD_SUCCESS) {
1348	error = load_return_to_errno(lrtn: lret);
1349
1350	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1351	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO, `0`, `0`);
1352	if (lret == LOAD_BADMACHO_UPX) {
1353	set_proc_name(imgp, p);
1354	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_UPX);
1355	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1356	} else {
1357	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO);
1358
1359	if (bootarg_execfailurereports) {
1360	set_proc_name(imgp, p);
1361	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1362	}
1363	}
1364
1365	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_CONSISTENT_FAILURE;
1366
1367	goto badtoolate;
1368	}
1369
1370	/*
1371	* ERROR RECOVERY
1372	*
1373	* load_machfile() returned the new VM map ("map") but we haven't
1374	* committed to it yet.
1375	* Any error path between here and the point where we commit to using
1376	* the new "map" (with swap_task_map()) should deallocate "map".
1377	*/
1378
1379	#ifndef KASAN
1380	/*
1381	* Security: zone sanity checks on fresh boot or initproc re-exec.
1382	* launchd by design does not tear down its own service port on USR (rdar://72797967),
1383	* which means here is the earliest point we can assert on empty service port label zone,
1384	* after load_machfile() above terminates old launchd's IPC space.
1385	*
1386	* Disable on KASAN builds since zone_size_allocated() accounts for elements
1387	* under quarantine.
1388	*/
1389	if (task_pid(task) == `1`) {
1390	zone_userspace_reboot_checks();
1391	}
1392	#endif
1393
1394	proc_lock(p);
1395	p->p_cputype = imgp->ip_origcputype;
1396	p->p_cpusubtype = imgp->ip_origcpusubtype;
1397	proc_setplatformdata(p, load_result.ip_platform, load_result.lr_min_sdk, load_result.lr_sdk);
1398	exec_setup_tpro(imgp, load_result: &load_result);
1399
1400	vm_map_set_size_limit(map, limit: proc_limitgetcur(p, RLIMIT_AS));
1401	vm_map_set_data_limit(map, limit: proc_limitgetcur(p, RLIMIT_DATA));
1402	vm_map_set_user_wire_limit(map, limit: (vm_size_t)proc_limitgetcur(p, RLIMIT_MEMLOCK));
1403	#if XNU_TARGET_OS_OSX
1404	if (proc_platform(p) == PLATFORM_IOS) {
1405	assert(vm_map_is_alien(map));
1406	} else {
1407	assert(!vm_map_is_alien(map));
1408	}
1409	#endif /* XNU_TARGET_OS_OSX */
1410	proc_unlock(p);
1411
1412	/*
1413	* Set TPRO flags if enabled
1414	*/
1415
1416	/*
1417	* Set code-signing flags if this binary is signed, or if parent has
1418	* requested them on exec.
1419	*/
1420	if (load_result.csflags & CS_VALID) {
1421	imgp->ip_csflags \|= load_result.csflags &
1422	(CS_VALID \| CS_SIGNED \| CS_DEV_CODE \| CS_LINKER_SIGNED \|
1423	CS_HARD \| CS_KILL \| CS_RESTRICT \| CS_ENFORCEMENT \| CS_REQUIRE_LV \|
1424	CS_FORCED_LV \| CS_ENTITLEMENTS_VALIDATED \| CS_NO_UNTRUSTED_HELPERS \| CS_RUNTIME \|
1425	CS_ENTITLEMENT_FLAGS \|
1426	CS_EXEC_SET_HARD \| CS_EXEC_SET_KILL \| CS_EXEC_SET_ENFORCEMENT);
1427	} else {
1428	imgp->ip_csflags &= ~CS_VALID;
1429	}
1430
1431	if (proc_getcsflags(p) & CS_EXEC_SET_HARD) {
1432	imgp->ip_csflags \|= CS_HARD;
1433	}
1434	if (proc_getcsflags(p) & CS_EXEC_SET_KILL) {
1435	imgp->ip_csflags \|= CS_KILL;
1436	}
1437	if (proc_getcsflags(p) & CS_EXEC_SET_ENFORCEMENT) {
1438	imgp->ip_csflags \|= CS_ENFORCEMENT;
1439	}
1440	if (proc_getcsflags(p) & CS_EXEC_INHERIT_SIP) {
1441	if (proc_getcsflags(p) & CS_INSTALLER) {
1442	imgp->ip_csflags \|= CS_INSTALLER;
1443	}
1444	if (proc_getcsflags(p) & CS_DATAVAULT_CONTROLLER) {
1445	imgp->ip_csflags \|= CS_DATAVAULT_CONTROLLER;
1446	}
1447	if (proc_getcsflags(p) & CS_NVRAM_UNRESTRICTED) {
1448	imgp->ip_csflags \|= CS_NVRAM_UNRESTRICTED;
1449	}
1450	}
1451
1452	#if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1453	/*
1454	* ptrauth version 0 is a preview ABI. Developers can opt into running
1455	* their own arm64e binaries for local testing, with the understanding
1456	* that future OSes may break ABI.
1457	*/
1458	if ((imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E &&
1459	CPU_SUBTYPE_ARM64_PTR_AUTH_VERSION(imgp->ip_origcpusubtype) == `0` &&
1460	!load_result.platform_binary &&
1461	!bootarg_arm64e_preview_abi) {
1462	static bool logged_once = false;
1463	set_proc_name(imgp, p);
1464
1465	printf("%s: not running binary \"%s\" built against preview arm64e ABI\n", __func__, p->p_name);
1466	if (!os_atomic_xchg(&logged_once, true, relaxed)) {
1467	printf("%s: (to allow this, add \"-arm64e_preview_abi\" to boot-args)\n", __func__);
1468	}
1469
1470	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO);
1471	if (bootarg_execfailurereports) {
1472	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1473	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_CONSISTENT_FAILURE;
1474	}
1475
1476	/ release new address space since we won't use it /
1477	imgp->ip_free_map = map;
1478	map = VM_MAP_NULL;
1479	goto badtoolate;
1480	}
1481
1482	if ((imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK) != CPU_SUBTYPE_ARM64E &&
1483	imgp->ip_origcputype == CPU_TYPE_ARM64 &&
1484	load_result.platform_binary &&
1485	(imgp->ip_flags & IMGPF_DRIVER) != `0`) {
1486	set_proc_name(imgp, p);
1487	printf("%s: disallowing arm64 platform driverkit binary \"%s\", should be arm64e\n", __func__, p->p_name);
1488	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_MACHO);
1489	if (bootarg_execfailurereports) {
1490	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1491	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_CONSISTENT_FAILURE;
1492	}
1493
1494	/ release new address space since we won't use it /
1495	imgp->ip_free_map = map;
1496	map = VM_MAP_NULL;
1497	goto badtoolate;
1498	}
1499	#endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1500
1501
1502	load_result.hardened_runtime_binary = `0`;
1503	// Propogate which hardened runtime entitlements are active to the apple array
1504	encode_HR_entitlement(kCSWebBrowserHostEntitlement, mask: BrowserHostEntitlementMask, imgp, load_result: &load_result);
1505	encode_HR_entitlement(kCSWebBrowserGPUEntitlement, mask: BrowserGPUEntitlementMask, imgp, load_result: &load_result);
1506	encode_HR_entitlement(kCSWebBrowserNetworkEntitlement, mask: BrowserNetworkEntitlementMask, imgp, load_result: &load_result);
1507	encode_HR_entitlement(kCSWebBrowserWebContentEntitlement, mask: BrowserWebContentEntitlementMask, imgp, load_result: &load_result);
1508
1509	/*
1510	* Set up the shared cache region in the new process.
1511	*
1512	* Normally there is a single shared region per architecture.
1513	* However on systems with Pointer Authentication, we can create
1514	* multiple shared caches with the amount of sharing determined
1515	* by team-id or entitlement. Inherited shared region IDs are used
1516	* for system processes that need to match and be able to inspect
1517	* a pre-existing task.
1518	*/
1519	int cpu_subtype = `0`; / all cpu_subtypes use the same shared region /
1520	#if __has_feature(ptrauth_calls)
1521	char *shared_region_id = NULL;
1522	size_t len;
1523	char *base;
1524	const char *cbase;
1525	#define HARDENED_RUNTIME_CONTENT_ID "C-"
1526	#define TEAM_ID_PREFIX "T-"
1527	#define ENTITLE_PREFIX "E-"
1528	#define SR_PREFIX_LEN 2
1529	#define SR_ENTITLEMENT "com.apple.pac.shared_region_id"
1530
1531	if (cpu_type() == CPU_TYPE_ARM64 &&
1532	arm64_cpusubtype_uses_ptrauth(p->p_cpusubtype) &&
1533	(imgp->ip_flags & IMGPF_NOJOP) == `0`) {
1534	assertf(p->p_cputype == CPU_TYPE_ARM64,
1535	"p %p cpu_type() 0x%x p->p_cputype 0x%x p->p_cpusubtype 0x%x",
1536	p, cpu_type(), p->p_cputype, p->p_cpusubtype);
1537
1538	/*
1539	* arm64e uses pointer authentication, so request a separate
1540	* shared region for this CPU subtype.
1541	*/
1542	cpu_subtype = p->p_cpusubtype & ~CPU_SUBTYPE_MASK;
1543
1544	/*
1545	* Determine which shared cache to select based on being told,
1546	* matching a team-id or matching an entitlement.
1547	*/
1548	if (load_result.hardened_runtime_binary & BrowserWebContentEntitlementMask) {
1549	len = sizeof(HARDENED_RUNTIME_CONTENT_ID);
1550	shared_region_id = kalloc_data(len, Z_WAITOK \| Z_NOFAIL);
1551	strlcpy(shared_region_id, HARDENED_RUNTIME_CONTENT_ID, len);
1552	} else if (imgp->ip_inherited_shared_region_id) {
1553	len = strlen(imgp->ip_inherited_shared_region_id);
1554	shared_region_id = kalloc_data(len + `1`, Z_WAITOK \| Z_NOFAIL);
1555	memcpy(shared_region_id, imgp->ip_inherited_shared_region_id, len + `1`);
1556	} else if ((cbase = get_teamid_for_shared_region(imgp)) != NULL) {
1557	len = strlen(cbase);
1558	if (vm_shared_region_per_team_id) {
1559	shared_region_id = kalloc_data(len + SR_PREFIX_LEN + `1`,
1560	Z_WAITOK \| Z_NOFAIL);
1561	memcpy(shared_region_id, TEAM_ID_PREFIX, SR_PREFIX_LEN);
1562	memcpy(shared_region_id + SR_PREFIX_LEN, cbase, len + `1`);
1563	}
1564	} else if ((base = IOVnodeGetEntitlement(imgp->ip_vp,
1565	(int64_t)imgp->ip_arch_offset, SR_ENTITLEMENT)) != NULL) {
1566	len = strlen(base);
1567	if (vm_shared_region_by_entitlement) {
1568	shared_region_id = kalloc_data(len + SR_PREFIX_LEN + `1`,
1569	Z_WAITOK \| Z_NOFAIL);
1570	memcpy(shared_region_id, ENTITLE_PREFIX, SR_PREFIX_LEN);
1571	memcpy(shared_region_id + SR_PREFIX_LEN, base, len + `1`);
1572	}
1573	/ Discard the copy of the entitlement /
1574	kfree_data(base, len + `1`);
1575	}
1576	}
1577
1578	if (imgp->ip_flags & IMGPF_RESLIDE) {
1579	reslide = TRUE;
1580	}
1581
1582	/ use "" as the default shared_region_id /
1583	if (shared_region_id == NULL) {
1584	shared_region_id = kalloc_data(`1`, Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
1585	}
1586
1587	/ ensure there's a unique pointer signing key for this shared_region_id /
1588	shared_region_key_alloc(shared_region_id,
1589	imgp->ip_inherited_shared_region_id != NULL, imgp->ip_inherited_jop_pid);
1590	task_set_shared_region_id(task, shared_region_id);
1591	shared_region_id = NULL;
1592	#endif /* __has_feature(ptrauth_calls) */
1593
1594	#if CONFIG_ROSETTA
1595	if (imgp->ip_flags & IMGPF_ROSETTA) {
1596	OSBitOrAtomic(P_TRANSLATED, &p->p_flag);
1597	} else if (p->p_flag & P_TRANSLATED) {
1598	OSBitAndAtomic(~P_TRANSLATED, &p->p_flag);
1599	}
1600	#endif
1601
1602	int cputype = cpu_type();
1603
1604	uint32_t rsr_version = `0`;
1605	#if XNU_TARGET_OS_OSX
1606	if (vnode_is_rsr(vp: imgp->ip_vp)) {
1607	rsr_version = rsr_get_version();
1608	os_atomic_or(&p->p_ladvflag, P_RSR, relaxed);
1609	os_atomic_or(&p->p_vfs_iopolicy, P_VFS_IOPOLICY_ALTLINK, relaxed);
1610	}
1611	#endif /* XNU_TARGET_OS_OSX */
1612
1613	vm_map_exec(new_map: map, task, is64bit: load_result.is_64bit_addr,
1614	fsroot: (void *)p->p_fd.fd_rdir, cpu: cputype, cpu_subtype, reslide,
1615	is_driverkit: (imgp->ip_flags & IMGPF_DRIVER) != `0`,
1616	rsr_version);
1617
1618	/*
1619	* Close file descriptors which specify close-on-exec.
1620	*/
1621	fdt_exec(p, p_cred: vfs_context_ucred(ctx: imgp->ip_vfs_context),
1622	posix_spawn_flags: psa != NULL ? psa->psa_flags : `0`, thread: imgp->ip_new_thread, in_exec: exec);
1623
1624	/*
1625	* deal with set[ug]id.
1626	*/
1627	error = exec_handle_sugid(imgp);
1628	if (error) {
1629	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1630	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_SUGID_FAILURE, `0`, `0`);
1631
1632	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SUGID_FAILURE);
1633	if (bootarg_execfailurereports) {
1634	set_proc_name(imgp, p);
1635	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1636	}
1637
1638	/ release new address space since we won't use it /
1639	imgp->ip_free_map = map;
1640	map = VM_MAP_NULL;
1641	goto badtoolate;
1642	}
1643
1644	/*
1645	* Commit to new map.
1646	*
1647	* Swap the new map for the old for target task, which consumes
1648	* our new map reference but each leaves us responsible for the
1649	* old_map reference. That lets us get off the pmap associated
1650	* with it, and then we can release it.
1651	*
1652	* The map needs to be set on the target task which is different
1653	* than current task, thus swap_task_map is used instead of
1654	* vm_map_switch.
1655	*/
1656	old_map = swap_task_map(task, thread, map);
1657	#if MACH_ASSERT
1658	/*
1659	* Reset the pmap's process info to prevent ledger checks
1660	* which might fail due to the ledgers being shared between
1661	* the old and new pmaps.
1662	*/
1663	vm_map_pmap_set_process(old_map, -`1`, "<old_map>");
1664	#endif /* MACH_ASSERT */
1665	imgp->ip_free_map = old_map;
1666	old_map = NULL;
1667
1668	lret = activate_exec_state(task, p, thread, result: &load_result);
1669	if (lret != KERN_SUCCESS) {
1670	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1671	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_ACTV_THREADSTATE, `0`, `0`);
1672
1673	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_ACTV_THREADSTATE);
1674	if (bootarg_execfailurereports) {
1675	set_proc_name(imgp, p);
1676	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1677	}
1678
1679	goto badtoolate;
1680	}
1681
1682	/*
1683	* deal with voucher on exec-calling thread.
1684	*/
1685	if (imgp->ip_new_thread == NULL) {
1686	thread_set_mach_voucher(thr_act: current_thread(), IPC_VOUCHER_NULL);
1687	}
1688
1689	/ Make sure we won't interrupt ourself signalling a partial process /
1690	if (!vfexec && !spawn && (p->p_lflag & P_LTRACED)) {
1691	psignal(p, SIGTRAP);
1692	}
1693
1694	if (load_result.unixproc &&
1695	create_unix_stack(map: get_task_map(task),
1696	load_result: &load_result,
1697	p) != KERN_SUCCESS) {
1698	error = load_return_to_errno(LOAD_NOSPACE);
1699
1700	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1701	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_STACK_ALLOC, `0`, `0`);
1702
1703	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_STACK_ALLOC);
1704	if (bootarg_execfailurereports) {
1705	set_proc_name(imgp, p);
1706	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1707	}
1708
1709	goto badtoolate;
1710	}
1711
1712	if (load_result.hardened_runtime_binary) {
1713	if (cs_debug) {
1714	printf("setting hardened runtime with entitlement mask= "
1715	"0x%x on task: pid = %d\n",
1716	load_result.hardened_runtime_binary,
1717	proc_getpid(p));
1718	}
1719	task_set_hardened_runtime(task, true);
1720	}
1721
1722	/*
1723	* The load result will have already been munged by AMFI to include the
1724	* platform binary flag if boot-args dictated it (AMFI will mark anything
1725	* that doesn't go through the upcall path as a platform binary if its
1726	* enforcement is disabled).
1727	*/
1728	if (load_result.platform_binary) {
1729	if (cs_debug) {
1730	printf("setting platform binary on task: pid = %d\n", proc_getpid(p));
1731	}
1732
1733	/*
1734	* We must use 'task' here because the proc's task has not yet been
1735	* switched to the new one.
1736	*/
1737	task_set_platform_binary(task, TRUE);
1738	} else {
1739	if (cs_debug) {
1740	printf("clearing platform binary on task: pid = %d\n", proc_getpid(p));
1741	}
1742
1743	task_set_platform_binary(task, FALSE);
1744	}
1745
1746	#if XNU_TARGET_OS_OSX
1747	/ Disable mach hardening for all 1P tasks which load 3P plugins /
1748	if (imgp->ip_flags & IMGPF_3P_PLUGINS) {
1749	if (cs_debug) {
1750	printf("Disabling some mach hardening on task due to 3P plugins: pid = %d\n", proc_getpid(p));
1751	}
1752	task_disable_mach_hardening(task);
1753	}
1754	#if DEVELOPMENT \|\| DEBUG
1755	/ Disable mach hardening for all tasks if amfi_get_out_of_my_way is set.*
1756	* Customers will have to turn SIP off to use this boot-arg, and so this is
1757	* only needed internally since we disable this feature when SIP is off. */
1758	if (AMFI_bootarg_disable_mach_hardening) {
1759	if (cs_debug) {
1760	printf("Disabling some mach hardening on task due to AMFI boot-args: pid = %d\n", proc_getpid(p));
1761	}
1762	task_disable_mach_hardening(task);
1763	}
1764	#endif /* DEVELOPMENT \|\| DEBUG */
1765	#endif /* XNU_TARGET_OS_OSX */
1766
1767	/*
1768	* Set starting EXC_GUARD and control port behavior for task now that
1769	* platform and hardened runtime is set. Use the name directly from imgp since we haven't
1770	* set_proc_name() yet. Also make control port for the task and main thread
1771	* immovable/pinned based on task's option.
1772	*
1773	* Must happen before main thread port copyout in exc_add_apple_strings.
1774	*/
1775	task_set_exc_guard_ctrl_port_default(task, main_thread: thread,
1776	name: imgp->ip_ndp->ni_cnd.cn_nameptr,
1777	namelen: (unsigned)imgp->ip_ndp->ni_cnd.cn_namelen,
1778	is_simulated: proc_is_simulated(p),
1779	platform: load_result.ip_platform,
1780	sdk: load_result.lr_sdk);
1781
1782	error = exec_add_apple_strings(imgp, load_result: &load_result); / copies out main thread port /
1783
1784	if (error) {
1785	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1786	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_APPLE_STRING_INIT, `0`, `0`);
1787
1788	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_APPLE_STRING_INIT);
1789	if (bootarg_execfailurereports) {
1790	set_proc_name(imgp, p);
1791	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1792	}
1793	goto badtoolate;
1794	}
1795
1796	/ Switch to target task's map to copy out strings /
1797	old_map = vm_map_switch(map: get_task_map(task));
1798
1799	if (load_result.unixproc) {
1800	user_addr_t ap;
1801
1802	/*
1803	* Copy the strings area out into the new process address
1804	* space.
1805	*/
1806	ap = p->user_stack;
1807	error = exec_copyout_strings(imgp, stackp: &ap);
1808	if (error) {
1809	vm_map_switch(map: old_map);
1810
1811	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1812	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_STRINGS, `0`, `0`);
1813
1814	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_STRINGS);
1815	if (bootarg_execfailurereports) {
1816	set_proc_name(imgp, p);
1817	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1818	}
1819	goto badtoolate;
1820	}
1821	/ Set the stack /
1822	thread_setuserstack(thread, user_stack: ap);
1823	}
1824
1825	if (load_result.dynlinker \|\| load_result.is_rosetta) {
1826	user_addr_t ap;
1827	int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? `8` : `4`;
1828
1829	/ Adjust the stack /
1830	ap = thread_adjuserstack(thread, adjust: -new_ptr_size);
1831	error = copyoutptr(ua: load_result.mach_header, ptr: ap, ptr_size: new_ptr_size);
1832
1833	if (error) {
1834	vm_map_switch(map: old_map);
1835
1836	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1837	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_DYNLINKER, `0`, `0`);
1838
1839	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_DYNLINKER);
1840	if (bootarg_execfailurereports) {
1841	set_proc_name(imgp, p);
1842	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1843	}
1844	goto badtoolate;
1845	}
1846	error = task_set_dyld_info(task, addr: load_result.all_image_info_addr,
1847	size: load_result.all_image_info_size);
1848	if (error) {
1849	vm_map_switch(map: old_map);
1850
1851	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1852	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_SET_DYLD_INFO, `0`, `0`);
1853
1854	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SET_DYLD_INFO);
1855	if (bootarg_execfailurereports) {
1856	set_proc_name(imgp, p);
1857	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1858	}
1859	error = EINVAL;
1860	goto badtoolate;
1861	}
1862	}
1863
1864	#if CONFIG_ROSETTA
1865	if (load_result.is_rosetta) {
1866	// Add an fd for the executable file for Rosetta's use
1867	int main_binary_fd;
1868	struct fileproc *fp;
1869
1870	error = falloc_exec(p, imgp->ip_vfs_context, &fp, &main_binary_fd);
1871	if (error) {
1872	vm_map_switch(old_map);
1873
1874	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1875	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_MAIN_FD_ALLOC, `0`, `0`);
1876
1877	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_MAIN_FD_ALLOC);
1878	if (bootarg_execfailurereports) {
1879	set_proc_name(imgp, p);
1880	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1881	}
1882	goto badtoolate;
1883	}
1884
1885	error = VNOP_OPEN(imgp->ip_vp, FREAD, imgp->ip_vfs_context);
1886	if (error) {
1887	vm_map_switch(old_map);
1888
1889	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1890	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_MAIN_FD_ALLOC, `0`, `0`);
1891
1892	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_MAIN_FD_ALLOC);
1893	if (bootarg_execfailurereports) {
1894	set_proc_name(imgp, p);
1895	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1896	}
1897	goto cleanup_rosetta_fp;
1898	}
1899
1900	fp->fp_glob->fg_flag = FREAD;
1901	fp->fp_glob->fg_ops = &vnops;
1902	fp_set_data(fp, imgp->ip_vp);
1903
1904	proc_fdlock(p);
1905	procfdtbl_releasefd(p, main_binary_fd, NULL);
1906	fp_drop(p, main_binary_fd, fp, `1`);
1907	proc_fdunlock(p);
1908
1909	vnode_ref(imgp->ip_vp);
1910
1911	// Pass the dyld load address, main binary fd, and dyld fd on the stack
1912	uint64_t ap = thread_adjuserstack(thread, -`24`);
1913
1914	error = copyoutptr((user_addr_t)load_result.dynlinker_fd, ap, `8`);
1915	if (error) {
1916	vm_map_switch(old_map);
1917
1918	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1919	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_ROSETTA, `0`, `0`);
1920
1921	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_ROSETTA);
1922	if (bootarg_execfailurereports) {
1923	set_proc_name(imgp, p);
1924	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1925	}
1926	goto cleanup_rosetta_fp;
1927	}
1928
1929	error = copyoutptr(load_result.dynlinker_mach_header, ap + `8`, `8`);
1930	if (error) {
1931	vm_map_switch(old_map);
1932
1933	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1934	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_ROSETTA, `0`, `0`);
1935
1936	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_ROSETTA);
1937	if (bootarg_execfailurereports) {
1938	set_proc_name(imgp, p);
1939	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1940	}
1941	goto cleanup_rosetta_fp;
1942	}
1943
1944	error = copyoutptr((user_addr_t)main_binary_fd, ap + `16`, `8`);
1945	if (error) {
1946	vm_map_switch(old_map);
1947
1948	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
1949	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_ROSETTA, `0`, `0`);
1950
1951	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_COPYOUT_ROSETTA);
1952	if (bootarg_execfailurereports) {
1953	set_proc_name(imgp, p);
1954	exec_failure_reason->osr_flags \|= OS_REASON_FLAG_GENERATE_CRASH_REPORT;
1955	}
1956	goto cleanup_rosetta_fp;
1957	}
1958
1959	cleanup_rosetta_fp:
1960	if (error) {
1961	fp_free(p, load_result.dynlinker_fd, load_result.dynlinker_fp);
1962	fp_free(p, main_binary_fd, fp);
1963	goto badtoolate;
1964	}
1965	}
1966
1967	#endif
1968
1969	/ Avoid immediate VM faults back into kernel /
1970	exec_prefault_data(p, imgp, &load_result);
1971
1972	vm_map_switch(map: old_map);
1973
1974	/*
1975	* Reset signal state.
1976	*/
1977	execsigs(p, thread);
1978
1979	/*
1980	* need to cancel async IO requests that can be cancelled and wait for those
1981	* already active. MAY BLOCK!
1982	*/
1983	_aio_exec( p );
1984
1985	#if SYSV_SHM
1986	/ FIXME: Till vmspace inherit is fixed: /
1987	if (!vfexec && p->vm_shm) {
1988	shmexec(p);
1989	}
1990	#endif
1991	#if SYSV_SEM
1992	/ Clean up the semaphores /
1993	semexit(p);
1994	#endif
1995
1996	/*
1997	* Remember file name for accounting.
1998	*/
1999	p->p_acflag &= ~AFORK;
2000
2001	set_proc_name(imgp, p);
2002
2003	#if CONFIG_SECLUDED_MEMORY
2004	if (secluded_for_apps &&
2005	load_result.platform_binary) {
2006	if (strncmp(p->p_name,
2007	"Camera",
2008	sizeof(p->p_name)) == `0`) {
2009	task_set_could_use_secluded_mem(task, TRUE);
2010	} else {
2011	task_set_could_use_secluded_mem(task, FALSE);
2012	}
2013	if (strncmp(p->p_name,
2014	"mediaserverd",
2015	sizeof(p->p_name)) == `0`) {
2016	task_set_could_also_use_secluded_mem(task, TRUE);
2017	}
2018	}
2019	#endif /* CONFIG_SECLUDED_MEMORY */
2020
2021	#if __arm64__
2022	if (load_result.legacy_footprint) {
2023	task_set_legacy_footprint(task);
2024	}
2025	#endif /* __arm64__ */
2026
2027	pal_dbg_set_task_name(task);
2028
2029	#if DEVELOPMENT \|\| DEBUG
2030	/*
2031	* Update the pid an proc name for importance base if any
2032	*/
2033	task_importance_update_owner_info(task);
2034	#endif
2035
2036	proc_setexecutableuuid(p, &load_result.uuid[`0`]);
2037
2038	#if CONFIG_DTRACE
2039	dtrace_proc_exec(p);
2040	#endif
2041
2042	if (kdebug_enable) {
2043	long args[`4`] = {};
2044
2045	uintptr_t fsid = `0`, fileid = `0`;
2046	if (imgp->ip_vattr) {
2047	uint64_t fsid64 = vnode_get_va_fsid(vap: imgp->ip_vattr);
2048	fsid = (uintptr_t)fsid64;
2049	fileid = (uintptr_t)imgp->ip_vattr->va_fileid;
2050	// check for (unexpected) overflow and trace zero in that case
2051	if (fsid != fsid64 \|\| fileid != imgp->ip_vattr->va_fileid) {
2052	fsid = fileid = `0`;
2053	}
2054	}
2055	KERNEL_DEBUG_CONSTANT_IST1(TRACE_DATA_EXEC, proc_getpid(p), fsid, fileid, `0`,
2056	(uintptr_t)thread_tid(thread));
2057
2058	extern void kdebug_proc_name_args(struct proc proc, long* args[static `4`]);
2059	kdebug_proc_name_args(proc: p, args);
2060	KERNEL_DEBUG_CONSTANT_IST1(TRACE_STRING_EXEC, args[`0`], args[`1`],
2061	args[`2`], args[`3`], (uintptr_t)thread_tid(thread));
2062	}
2063
2064
2065	/*
2066	* If posix_spawned with the START_SUSPENDED flag, stop the
2067	* process before it runs.
2068	*/
2069	if (imgp->ip_px_sa != NULL) {
2070	psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
2071	if (psa->psa_flags & POSIX_SPAWN_START_SUSPENDED) {
2072	proc_lock(p);
2073	p->p_stat = SSTOP;
2074	proc_unlock(p);
2075	(void) task_suspend_internal(task);
2076	}
2077	}
2078
2079	/*
2080	* mark as execed
2081	*/
2082	OSBitOrAtomic(P_EXEC, &p->p_flag);
2083	proc_resetregister(p);
2084	if (p->p_pptr && (p->p_lflag & P_LPPWAIT)) {
2085	proc_lock(p);
2086	p->p_lflag &= ~P_LPPWAIT;
2087	proc_unlock(p);
2088	wakeup(chan: (caddr_t)p->p_pptr);
2089	}
2090
2091	/*
2092	* Set up dext coredumps on kernel panic.
2093	* This requires the following:
2094	* - dext_panic_coredump=1 boot-arg (enabled by default on DEVELOPMENT, DEBUG and certain Seed builds)
2095	* - process must be a driver
2096	* - process must have the com.apple.private.enable-coredump-on-panic entitlement, and the
2097	* entitlement has a string value.
2098	* - process must have the com.apple.private.enable-coredump-on-panic-seed-privacy-approved
2099	* entitlement (Seed builds only).
2100	*
2101	* The core dump file name is formatted with the entitlement string value, followed by a hyphen
2102	* and the process PID.
2103	*/
2104	if (enable_dext_coredumps_on_panic &&
2105	(imgp->ip_flags & IMGPF_DRIVER) != `0` &&
2106	(userspace_coredump_name = IOVnodeGetEntitlement(vnode: imgp->ip_vp,
2107	offset: (int64_t)imgp->ip_arch_offset, USERSPACE_COREDUMP_PANIC_ENTITLEMENT)) != NULL) {
2108	size_t userspace_coredump_name_len = strlen(s: userspace_coredump_name);
2109
2110	char core_name[MACH_CORE_FILEHEADER_NAMELEN];
2111	/ 16 - NULL char - strlen("-") - maximum of 5 digits for pid /
2112	snprintf(core_name, MACH_CORE_FILEHEADER_NAMELEN, "%.9s-%d", userspace_coredump_name, proc_getpid(p));
2113
2114	kern_register_userspace_coredump(task, name: core_name);
2115
2116	/ Discard the copy of the entitlement /
2117	kfree_data(userspace_coredump_name, userspace_coredump_name_len + `1`);
2118	userspace_coredump_name = NULL;
2119	}
2120
2121	goto done;
2122
2123	badtoolate:
2124	/ Don't allow child process to execute any instructions /
2125	if (!spawn) {
2126	{
2127	assert(exec_failure_reason != OS_REASON_NULL);
2128	if (bootarg_execfailurereports) {
2129	set_proc_name(imgp, p: current_proc());
2130	}
2131	psignal_with_reason(p: current_proc(), SIGKILL, signal_reason: exec_failure_reason);
2132	exec_failure_reason = OS_REASON_NULL;
2133
2134	if (exec) {
2135	/ Terminate the exec copy task /
2136	task_terminate_internal(task);
2137	}
2138	}
2139
2140	/ We can't stop this system call at this point, so just pretend we succeeded /
2141	error = `0`;
2142	} else {
2143	os_reason_free(cur_reason: exec_failure_reason);
2144	exec_failure_reason = OS_REASON_NULL;
2145	}
2146
2147	done:
2148	if (load_result.threadstate) {
2149	kfree_data(load_result.threadstate, load_result.threadstate_sz);
2150	load_result.threadstate = NULL;
2151	}
2152
2153	bad:
2154	/ If we hit this, we likely would have leaked an exit reason /
2155	assert(exec_failure_reason == OS_REASON_NULL);
2156	return error;
2157	}
2158
2159
2160
2161
2162	/*
2163	* Our image activator table; this is the table of the image types we are
2164	* capable of loading. We list them in order of preference to ensure the
2165	* fastest image load speed.
2166	*
2167	* XXX hardcoded, for now; should use linker sets
2168	*/
2169	struct execsw {
2170	int(*const ex_imgact)(struct image_params *);
2171	const char *ex_name;
2172	}const execsw[] = {
2173	{ exec_mach_imgact, "Mach-o Binary" },
2174	{ .ex_imgact: exec_fat_imgact, .ex_name: "Fat Binary" },
2175	{ .ex_imgact: exec_shell_imgact, .ex_name: "Interpreter Script" },
2176	{ NULL, NULL}
2177	};
2178
2179
2180	/*
2181	* exec_activate_image
2182	*
2183	* Description: Iterate through the available image activators, and activate
2184	* the image associated with the imgp structure. We start with
2185	* the activator for Mach-o binaries followed by that for Fat binaries
2186	* for Interpreter scripts.
2187	*
2188	* Parameters: struct image_params * Image parameter block
2189	*
2190	* Returns: 0 Success
2191	* ENOEXEC No activator for image.
2192	* EBADEXEC The executable is corrupt/unknown
2193	* execargs_alloc:EINVAL Invalid argument
2194	* execargs_alloc:EACCES Permission denied
2195	* execargs_alloc:EINTR Interrupted function
2196	* execargs_alloc:ENOMEM Not enough space
2197	* exec_save_path:EFAULT Bad address
2198	* exec_save_path:ENAMETOOLONG Filename too long
2199	* exec_check_permissions:EACCES Permission denied
2200	* exec_check_permissions:ENOEXEC Executable file format error
2201	* exec_check_permissions:ETXTBSY Text file busy [misuse of error code]
2202	* exec_check_permissions:???
2203	* namei:???
2204	* vn_rdwr:??? [anything vn_rdwr can return]
2205	* <ex_imgact>:??? [anything an imgact can return]
2206	* EDEADLK Process is being terminated
2207	*/
2208	static int
2209	exec_activate_image(struct image_params *imgp)
2210	{
2211	struct nameidata *ndp = NULL;
2212	const char *excpath;
2213	int error;
2214	int resid;
2215	int once = `1`; / save SGUID-ness for interpreted files /
2216	int i;
2217	int itercount = `0`;
2218	proc_t p = vfs_context_proc(ctx: imgp->ip_vfs_context);
2219
2220	/*
2221	* For exec, the translock needs to be taken on old proc and not
2222	* on new shadow proc.
2223	*/
2224	if (imgp->ip_flags & IMGPF_EXEC) {
2225	p = current_proc();
2226	}
2227
2228	error = execargs_alloc(imgp);
2229	if (error) {
2230	goto bad_notrans;
2231	}
2232
2233	error = exec_save_path(imgp, path: imgp->ip_user_fname, seg: imgp->ip_seg, excpath: &excpath);
2234	if (error) {
2235	goto bad_notrans;
2236	}
2237
2238	/ Use excpath, which contains the copyin-ed exec path /
2239	DTRACE_PROC1(exec, uintptr_t, excpath);
2240
2241	ndp = kalloc_type(struct nameidata, Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
2242
2243	NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW \| LOCKLEAF \| AUDITVNPATH1,
2244	UIO_SYSSPACE, CAST_USER_ADDR_T(excpath), imgp->ip_vfs_context);
2245
2246	again:
2247	error = namei(ndp);
2248	if (error) {
2249	if (error == ERESTART) {
2250	error = EINTR;
2251	}
2252	goto bad_notrans;
2253	}
2254	imgp->ip_ndp = ndp; / successful namei(); call nameidone() later /
2255	imgp->ip_vp = ndp->ni_vp; / if set, need to vnode_put() at some point /
2256
2257	/*
2258	* Before we start the transition from binary A to binary B, make
2259	* sure another thread hasn't started exiting the process. We grab
2260	* the proc lock to check p_lflag initially, and the transition
2261	* mechanism ensures that the value doesn't change after we release
2262	* the lock.
2263	*/
2264	proc_lock(p);
2265	if (p->p_lflag & P_LEXIT) {
2266	error = EDEADLK;
2267	proc_unlock(p);
2268	goto bad_notrans;
2269	}
2270	error = proc_transstart(p, locked: `1`, non_blocking: `0`);
2271	proc_unlock(p);
2272	if (error) {
2273	goto bad_notrans;
2274	}
2275
2276	error = exec_check_permissions(imgp);
2277	if (error) {
2278	goto bad;
2279	}
2280
2281	/ Copy; avoid invocation of an interpreter overwriting the original /
2282	if (once) {
2283	once = `0`;
2284	imgp->ip_origvattr = imgp->ip_vattr;
2285	}
2286
2287	error = vn_rdwr(rw: UIO_READ, vp: imgp->ip_vp, base: imgp->ip_vdata, PAGE_SIZE, offset: `0`,
2288	segflg: UIO_SYSSPACE, IO_NODELOCKED,
2289	cred: vfs_context_ucred(ctx: imgp->ip_vfs_context),
2290	aresid: &resid, p: vfs_context_proc(ctx: imgp->ip_vfs_context));
2291	if (error) {
2292	goto bad;
2293	}
2294
2295	if (resid) {
2296	memset(s: imgp->ip_vdata + (PAGE_SIZE - resid), c: `0x0`, n: resid);
2297	}
2298
2299	encapsulated_binary:
2300	/ Limit the number of iterations we will attempt on each binary /
2301	if (++itercount > EAI_ITERLIMIT) {
2302	error = EBADEXEC;
2303	goto bad;
2304	}
2305	error = -`1`;
2306	for (i = `0`; error == -`1` && execsw[i].ex_imgact != NULL; i++) {
2307	error = (*execsw[i].ex_imgact)(imgp);
2308
2309	switch (error) {
2310	/ case -1: not claimed: continue /
2311	case -`2`: / Encapsulated binary, imgp->ip_XXX set for next iteration /
2312	goto encapsulated_binary;
2313
2314	case -`3`: / Interpreter /
2315	#if CONFIG_MACF
2316	/*
2317	* Copy the script label for later use. Note that
2318	* the label can be different when the script is
2319	* actually read by the interpreter.
2320	*/
2321	if (imgp->ip_scriptlabelp) {
2322	mac_vnode_label_free(label: imgp->ip_scriptlabelp);
2323	imgp->ip_scriptlabelp = NULL;
2324	}
2325	imgp->ip_scriptlabelp = mac_vnode_label_alloc(NULL);
2326	if (imgp->ip_scriptlabelp == NULL) {
2327	error = ENOMEM;
2328	break;
2329	}
2330	mac_vnode_label_copy(l1: mac_vnode_label(vp: imgp->ip_vp),
2331	l2: imgp->ip_scriptlabelp);
2332
2333	/*
2334	* Take a ref of the script vnode for later use.
2335	*/
2336	if (imgp->ip_scriptvp) {
2337	vnode_put(vp: imgp->ip_scriptvp);
2338	imgp->ip_scriptvp = NULLVP;
2339	}
2340	if (vnode_getwithref(vp: imgp->ip_vp) == `0`) {
2341	imgp->ip_scriptvp = imgp->ip_vp;
2342	}
2343	#endif
2344
2345	nameidone(ndp);
2346
2347	vnode_put(vp: imgp->ip_vp);
2348	imgp->ip_vp = NULL; / already put /
2349	imgp->ip_ndp = NULL; / already nameidone /
2350
2351	/ Use excpath, which exec_shell_imgact reset to the interpreter /
2352	NDINIT(ndp, LOOKUP, OP_LOOKUP, FOLLOW \| LOCKLEAF,
2353	UIO_SYSSPACE, CAST_USER_ADDR_T(excpath), imgp->ip_vfs_context);
2354
2355	proc_transend(p, locked: `0`);
2356	goto again;
2357
2358	default:
2359	break;
2360	}
2361	}
2362
2363	if (error == -`1`) {
2364	error = ENOEXEC;
2365	} else if (error == `0`) {
2366	if (imgp->ip_flags & IMGPF_INTERPRET && ndp->ni_vp) {
2367	AUDIT_ARG(vnpath, ndp->ni_vp, ARG_VNODE2);
2368	}
2369
2370	/*
2371	* Call out to allow 3rd party notification of exec.
2372	* Ignore result of kauth_authorize_fileop call.
2373	*/
2374	if (kauth_authorize_fileop_has_listeners()) {
2375	kauth_authorize_fileop(credential: vfs_context_ucred(ctx: imgp->ip_vfs_context),
2376	KAUTH_FILEOP_EXEC,
2377	arg0: (uintptr_t)ndp->ni_vp, arg1: `0`);
2378	}
2379	}
2380	bad:
2381	proc_transend(p, locked: `0`);
2382
2383	bad_notrans:
2384	if (imgp->ip_strings) {
2385	execargs_free(imgp);
2386	}
2387	if (imgp->ip_ndp) {
2388	nameidone(imgp->ip_ndp);
2389	}
2390	kfree_type(struct nameidata, ndp);
2391
2392	return error;
2393	}
2394
2395	/*
2396	* exec_validate_spawnattr_policy
2397	*
2398	* Description: Validates the entitlements required to set the apptype.
2399	*
2400	* Parameters: int psa_apptype posix spawn attribute apptype
2401	*
2402	* Returns: 0 Success
2403	* EPERM Failure
2404	*/
2405	static errno_t
2406	exec_validate_spawnattr_policy(int psa_apptype)
2407	{
2408	if ((psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK) != `0`) {
2409	int proctype = psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK;
2410	if (proctype == POSIX_SPAWN_PROC_TYPE_DRIVER) {
2411	if (!IOCurrentTaskHasEntitlement(POSIX_SPAWN_ENTITLEMENT_DRIVER)) {
2412	return EPERM;
2413	}
2414	}
2415	}
2416
2417	return `0`;
2418	}
2419
2420	/*
2421	* exec_handle_spawnattr_policy
2422	*
2423	* Description: Decode and apply the posix_spawn apptype, qos clamp, and watchport ports to the task.
2424	*
2425	* Parameters: proc_t p process to apply attributes to
2426	* int psa_apptype posix spawn attribute apptype
2427	*
2428	* Returns: 0 Success
2429	*/
2430	static errno_t
2431	exec_handle_spawnattr_policy(proc_t p, thread_t thread, int psa_apptype, uint64_t psa_qos_clamp,
2432	task_role_t psa_darwin_role, struct exec_port_actions *port_actions)
2433	{
2434	int apptype = TASK_APPTYPE_NONE;
2435	int qos_clamp = THREAD_QOS_UNSPECIFIED;
2436	task_role_t role = TASK_UNSPECIFIED;
2437
2438	if ((psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK) != `0`) {
2439	int proctype = psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK;
2440
2441	switch (proctype) {
2442	case POSIX_SPAWN_PROC_TYPE_DAEMON_INTERACTIVE:
2443	apptype = TASK_APPTYPE_DAEMON_INTERACTIVE;
2444	break;
2445	case POSIX_SPAWN_PROC_TYPE_DAEMON_STANDARD:
2446	apptype = TASK_APPTYPE_DAEMON_STANDARD;
2447	break;
2448	case POSIX_SPAWN_PROC_TYPE_DAEMON_ADAPTIVE:
2449	apptype = TASK_APPTYPE_DAEMON_ADAPTIVE;
2450	break;
2451	case POSIX_SPAWN_PROC_TYPE_DAEMON_BACKGROUND:
2452	apptype = TASK_APPTYPE_DAEMON_BACKGROUND;
2453	break;
2454	case POSIX_SPAWN_PROC_TYPE_APP_DEFAULT:
2455	apptype = TASK_APPTYPE_APP_DEFAULT;
2456	break;
2457	case POSIX_SPAWN_PROC_TYPE_DRIVER:
2458	apptype = TASK_APPTYPE_DRIVER;
2459	break;
2460	default:
2461	apptype = TASK_APPTYPE_NONE;
2462	/ TODO: Should an invalid value here fail the spawn? /
2463	break;
2464	}
2465	}
2466
2467	if (psa_qos_clamp != POSIX_SPAWN_PROC_CLAMP_NONE) {
2468	switch (psa_qos_clamp) {
2469	case POSIX_SPAWN_PROC_CLAMP_UTILITY:
2470	qos_clamp = THREAD_QOS_UTILITY;
2471	break;
2472	case POSIX_SPAWN_PROC_CLAMP_BACKGROUND:
2473	qos_clamp = THREAD_QOS_BACKGROUND;
2474	break;
2475	case POSIX_SPAWN_PROC_CLAMP_MAINTENANCE:
2476	qos_clamp = THREAD_QOS_MAINTENANCE;
2477	break;
2478	default:
2479	qos_clamp = THREAD_QOS_UNSPECIFIED;
2480	/ TODO: Should an invalid value here fail the spawn? /
2481	break;
2482	}
2483	}
2484
2485	if (psa_darwin_role != PRIO_DARWIN_ROLE_DEFAULT) {
2486	proc_darwin_role_to_task_role(darwin_role: psa_darwin_role, task_role: &role);
2487	}
2488
2489	if (apptype != TASK_APPTYPE_NONE \|\|
2490	qos_clamp != THREAD_QOS_UNSPECIFIED \|\|
2491	role != TASK_UNSPECIFIED \|\|
2492	port_actions->portwatch_count) {
2493	proc_set_task_spawnpolicy(task: proc_task(p), thread, apptype, qos_clamp, role,
2494	portwatch_ports: port_actions->portwatch_array, portwatch_count: port_actions->portwatch_count);
2495	}
2496
2497	if (port_actions->registered_count) {
2498	if (mach_ports_register(target_task: proc_task(p), init_port_set: port_actions->registered_array,
2499	init_port_setCnt: port_actions->registered_count)) {
2500	return EINVAL;
2501	}
2502	/ mach_ports_register() consumed the array /
2503	port_actions->registered_array = NULL;
2504	port_actions->registered_count = `0`;
2505	}
2506
2507	return `0`;
2508	}
2509
2510	static void
2511	exec_port_actions_destroy(struct exec_port_actions *port_actions)
2512	{
2513	if (port_actions->excport_array) {
2514	for (uint32_t i = `0`; i < port_actions->exception_port_count; i++) {
2515	ipc_port_t port = NULL;
2516	if ((port = port_actions->excport_array[i].port) != NULL) {
2517	ipc_port_release_send(port);
2518	}
2519	}
2520	kfree_type(struct exception_port_action_t, port_actions->exception_port_count,
2521	port_actions->excport_array);
2522	}
2523
2524	if (port_actions->portwatch_array) {
2525	for (uint32_t i = `0`; i < port_actions->portwatch_count; i++) {
2526	ipc_port_t port = NULL;
2527	if ((port = port_actions->portwatch_array[i]) != NULL) {
2528	ipc_port_release_send(port);
2529	}
2530	}
2531	kfree_type(ipc_port_t, port_actions->portwatch_count,
2532	port_actions->portwatch_array);
2533	}
2534
2535	if (port_actions->registered_array) {
2536	for (uint32_t i = `0`; i < port_actions->registered_count; i++) {
2537	ipc_port_t port = NULL;
2538	if ((port = port_actions->registered_array[i]) != NULL) {
2539	ipc_port_release_send(port);
2540	}
2541	}
2542	kfree_type(ipc_port_t, port_actions->registered_count,
2543	port_actions->registered_array);
2544	}
2545	}
2546
2547	/*
2548	* exec_handle_port_actions
2549	*
2550	* Description: Go through the _posix_port_actions_t contents,
2551	* calling task_set_special_port, task_set_exception_ports
2552	* and/or audit_session_spawnjoin for the current task.
2553	*
2554	* Parameters: struct image_params * Image parameter block
2555	*
2556	* Returns: 0 Success
2557	* EINVAL Failure
2558	* ENOTSUP Illegal posix_spawn attr flag was set
2559	*/
2560	static errno_t
2561	exec_handle_port_actions(struct image_params *imgp,
2562	struct exec_port_actions *actions)
2563	{
2564	_posix_spawn_port_actions_t pacts = imgp->ip_px_spa;
2565	#if CONFIG_AUDIT
2566	proc_t p = vfs_context_proc(ctx: imgp->ip_vfs_context);
2567	#endif
2568	_ps_port_action_t *act = NULL;
2569	task_t task = get_threadtask(imgp->ip_new_thread);
2570	ipc_port_t port = NULL;
2571	errno_t ret = `0`;
2572	int i = `0`, portwatch_i = `0`, registered_i = `0`, excport_i = `0`;
2573	kern_return_t kr;
2574	boolean_t task_has_watchport_boost = task_has_watchports(task: current_task());
2575	boolean_t in_exec = (imgp->ip_flags & IMGPF_EXEC);
2576	int ptrauth_task_port_count = `0`;
2577
2578	for (i = `0`; i < pacts->pspa_count; i++) {
2579	act = &pacts->pspa_actions[i];
2580
2581	switch (act->port_type) {
2582	case PSPA_SPECIAL:
2583	#if CONFIG_AUDIT
2584	case PSPA_AU_SESSION:
2585	#endif
2586	break;
2587	case PSPA_EXCEPTION:
2588	if (++actions->exception_port_count > TASK_MAX_EXCEPTION_PORT_COUNT) {
2589	ret = EINVAL;
2590	goto done;
2591	}
2592	break;
2593	case PSPA_IMP_WATCHPORTS:
2594	if (++actions->portwatch_count > TASK_MAX_WATCHPORT_COUNT) {
2595	ret = EINVAL;
2596	goto done;
2597	}
2598	break;
2599	case PSPA_REGISTERED_PORTS:
2600	if (++actions->registered_count > TASK_PORT_REGISTER_MAX) {
2601	ret = EINVAL;
2602	goto done;
2603	}
2604	break;
2605	case PSPA_PTRAUTH_TASK_PORT:
2606	if (++ptrauth_task_port_count > `1`) {
2607	ret = EINVAL;
2608	goto done;
2609	}
2610	break;
2611	default:
2612	ret = EINVAL;
2613	goto done;
2614	}
2615	}
2616
2617	if (actions->exception_port_count) {
2618	actions->excport_array = kalloc_type(struct exception_port_action_t,
2619	actions->exception_port_count, Z_WAITOK \| Z_ZERO);
2620
2621	if (actions->excport_array == NULL) {
2622	ret = ENOMEM;
2623	goto done;
2624	}
2625	}
2626	if (actions->portwatch_count) {
2627	if (in_exec && task_has_watchport_boost) {
2628	ret = EINVAL;
2629	goto done;
2630	}
2631	actions->portwatch_array = kalloc_type(ipc_port_t,
2632	actions->portwatch_count, Z_WAITOK \| Z_ZERO);
2633	if (actions->portwatch_array == NULL) {
2634	ret = ENOMEM;
2635	goto done;
2636	}
2637	}
2638
2639	if (actions->registered_count) {
2640	actions->registered_array = kalloc_type(ipc_port_t,
2641	actions->registered_count, Z_WAITOK \| Z_ZERO);
2642	if (actions->registered_array == NULL) {
2643	ret = ENOMEM;
2644	goto done;
2645	}
2646	}
2647
2648	for (i = `0`; i < pacts->pspa_count; i++) {
2649	act = &pacts->pspa_actions[i];
2650
2651	if (MACH_PORT_VALID(act->new_port)) {
2652	kr = ipc_object_copyin(space: get_task_ipcspace(t: current_task()),
2653	name: act->new_port, MACH_MSG_TYPE_COPY_SEND,
2654	objectp: (ipc_object_t *) &port, context: `0`, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND);
2655
2656	if (kr != KERN_SUCCESS) {
2657	ret = EINVAL;
2658	goto done;
2659	}
2660	} else {
2661	/ it's NULL or DEAD /
2662	port = CAST_MACH_NAME_TO_PORT(act->new_port);
2663	}
2664
2665	switch (act->port_type) {
2666	case PSPA_SPECIAL:
2667	kr = task_set_special_port(task, which_port: act->which, special_port: port);
2668
2669	if (kr != KERN_SUCCESS) {
2670	ret = EINVAL;
2671	}
2672	break;
2673
2674	#if CONFIG_AUDIT
2675	case PSPA_AU_SESSION:
2676	ret = audit_session_spawnjoin(p, port);
2677	if (ret) {
2678	/ audit_session_spawnjoin() has already dropped the reference in case of error. /
2679	goto done;
2680	}
2681
2682	break;
2683	#endif
2684	case PSPA_EXCEPTION:
2685	assert(excport_i < actions->exception_port_count);
2686	/ hold on to this till end of spawn /
2687	actions->excport_array[excport_i].port_action = act;
2688	actions->excport_array[excport_i].port = port;
2689	excport_i++;
2690	break;
2691	case PSPA_IMP_WATCHPORTS:
2692	assert(portwatch_i < actions->portwatch_count);
2693	/ hold on to this till end of spawn /
2694	actions->portwatch_array[portwatch_i++] = port;
2695	break;
2696	case PSPA_REGISTERED_PORTS:
2697	assert(registered_i < actions->registered_count);
2698	/ hold on to this till end of spawn /
2699	actions->registered_array[registered_i++] = port;
2700	break;
2701
2702	case PSPA_PTRAUTH_TASK_PORT:
2703	#if (DEVELOPMENT \|\| DEBUG)
2704	#if defined(HAS_APPLE_PAC)
2705	{
2706	task_t ptr_auth_task = convert_port_to_task(port);
2707
2708	if (ptr_auth_task == TASK_NULL) {
2709	ret = EINVAL;
2710	break;
2711	}
2712
2713	imgp->ip_inherited_shared_region_id =
2714	task_get_vm_shared_region_id_and_jop_pid(ptr_auth_task,
2715	&imgp->ip_inherited_jop_pid);
2716
2717	/ Deallocate task ref returned by convert_port_to_task /
2718	task_deallocate(ptr_auth_task);
2719	}
2720	#endif /* HAS_APPLE_PAC */
2721	#endif /* (DEVELOPMENT \|\| DEBUG) */
2722
2723	/ consume the port right in case of success /
2724	ipc_port_release_send(port);
2725	break;
2726	default:
2727	ret = EINVAL;
2728	break;
2729	}
2730
2731	if (ret) {
2732	/ action failed, so release port resources /
2733	ipc_port_release_send(port);
2734	break;
2735	}
2736	}
2737
2738	done:
2739	if (`0` != ret) {
2740	DTRACE_PROC1(spawn__port__failure, mach_port_name_t, act->new_port);
2741	}
2742	return ret;
2743	}
2744
2745
2746	/*
2747	* exec_handle_exception_port_actions
2748	*
2749	* Description: Go through the saved exception ports in exec_port_actions,
2750	* calling task_set_exception_ports for the current Task.
2751	* This must happen after image activation, and after exec_resettextvp()
2752	* because task_set_exception_ports checks the `TF_PLATFORM` bit and entitlements.
2753	*
2754	* Parameters: struct image_params * Image parameter block
2755	* struct exec_port_actions * Saved Port Actions
2756	*
2757	* Returns: 0 Success
2758	* EINVAL task_set_exception_ports failed
2759	*/
2760	static errno_t
2761	exec_handle_exception_port_actions(const struct image_params *imgp,
2762	const struct exec_port_actions *actions)
2763	{
2764	task_t task = get_threadtask(imgp->ip_new_thread);
2765
2766	for (int i = `0`; i < actions->exception_port_count; i++) {
2767	ipc_port_t port = actions->excport_array[i].port;
2768	_ps_port_action_t *act = actions->excport_array[i].port_action;
2769	assert(act != NULL);
2770	kern_return_t kr = task_set_exception_ports(task, exception_mask: act->mask, new_port: port,
2771	behavior: act->behavior, new_flavor: act->flavor);
2772	if (kr != KERN_SUCCESS) {
2773	DTRACE_PROC1(spawn__exception__port__failure, mach_port_name_t, act->new_port);
2774	return EINVAL;
2775	}
2776	actions->excport_array[i].port = NULL;
2777	}
2778
2779	return `0`;
2780	}
2781
2782
2783	/*
2784	* exec_handle_file_actions
2785	*
2786	* Description: Go through the _posix_file_actions_t contents applying the
2787	* open, close, and dup2 operations to the open file table for
2788	* the current process.
2789	*
2790	* Parameters: struct image_params * Image parameter block
2791	*
2792	* Returns: 0 Success
2793	* ???
2794	*
2795	* Note: Actions are applied in the order specified, with the credential
2796	* of the parent process. This is done to permit the parent
2797	* process to utilize POSIX_SPAWN_RESETIDS to drop privilege in
2798	* the child following operations the child may in fact not be
2799	* normally permitted to perform.
2800	*/
2801	static int
2802	exec_handle_file_actions(struct image_params imgp, short* psa_flags)
2803	{
2804	int error = `0`;
2805	int action;
2806	proc_t p = vfs_context_proc(ctx: imgp->ip_vfs_context);
2807	kauth_cred_t p_cred = vfs_context_ucred(ctx: imgp->ip_vfs_context);
2808	_posix_spawn_file_actions_t px_sfap = imgp->ip_px_sfa;
2809	int ival[`2`]; / dummy retval for system calls) /
2810	#if CONFIG_AUDIT
2811	struct uthread *uthread = current_uthread();
2812	#endif
2813
2814	for (action = `0`; action < px_sfap->psfa_act_count; action++) {
2815	_psfa_action_t *psfa = &px_sfap->psfa_act_acts[action];
2816
2817	switch (psfa->psfaa_type) {
2818	case PSFA_OPEN: {
2819	/*
2820	* Open is different, in that it requires the use of
2821	* a path argument, which is normally copied in from
2822	* user space; because of this, we have to support an
2823	* open from kernel space that passes an address space
2824	* context of UIO_SYSSPACE, and casts the address
2825	* argument to a user_addr_t.
2826	*/
2827	struct vnode_attr *vap;
2828	struct nameidata *ndp;
2829	int mode = psfa->psfaa_openargs.psfao_mode;
2830	int origfd;
2831	struct {
2832	struct vnode_attr va;
2833	struct nameidata nd;
2834	} *__open_data;
2835
2836	__open_data = kalloc_type(typeof(*__open_data), Z_WAITOK \| Z_ZERO);
2837	if (__open_data == NULL) {
2838	error = ENOMEM;
2839	break;
2840	}
2841
2842	vap = &__open_data->va;
2843	ndp = &__open_data->nd;
2844
2845	VATTR_INIT(vap);
2846	/ Mask off all but regular access permissions /
2847	mode = ((mode & ~p->p_fd.fd_cmask) & ALLPERMS) & ~S_ISTXT;
2848	VATTR_SET(vap, va_mode, mode & ACCESSPERMS);
2849
2850	AUDIT_SUBCALL_ENTER(OPEN, p, uthread);
2851
2852	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW \| AUDITVNPATH1, UIO_SYSSPACE,
2853	CAST_USER_ADDR_T(psfa->psfaa_openargs.psfao_path),
2854	imgp->ip_vfs_context);
2855
2856	error = open1(ctx: imgp->ip_vfs_context, ndp,
2857	uflags: psfa->psfaa_openargs.psfao_oflag,
2858	vap, NULL, NULL, retval: &origfd, AUTH_OPEN_NOAUTHFD);
2859
2860	kfree_type(typeof(*__open_data), __open_data);
2861
2862	AUDIT_SUBCALL_EXIT(uthread, error);
2863
2864	/*
2865	* If there's an error, or we get the right fd by
2866	* accident, then drop out here. This is easier than
2867	* reworking all the open code to preallocate fd
2868	* slots, and internally taking one as an argument.
2869	*/
2870	if (error \|\| origfd == psfa->psfaa_filedes) {
2871	break;
2872	}
2873
2874	/*
2875	* If we didn't fall out from an error, we ended up
2876	* with the wrong fd; so now we've got to try to dup2
2877	* it to the right one.
2878	*/
2879	AUDIT_SUBCALL_ENTER(DUP2, p, uthread);
2880	error = dup2(p, p_cred, from: origfd, to: psfa->psfaa_filedes, fd: ival);
2881	AUDIT_SUBCALL_EXIT(uthread, error);
2882	if (error) {
2883	break;
2884	}
2885
2886	/*
2887	* Finally, close the original fd.
2888	*/
2889	AUDIT_SUBCALL_ENTER(CLOSE, p, uthread);
2890	error = close_nocancel(p, p_cred, fd: origfd);
2891	AUDIT_SUBCALL_EXIT(uthread, error);
2892	}
2893	break;
2894
2895	case PSFA_DUP2: {
2896	AUDIT_SUBCALL_ENTER(DUP2, p, uthread);
2897	error = dup2(p, p_cred, from: psfa->psfaa_filedes,
2898	to: psfa->psfaa_dup2args.psfad_newfiledes, fd: ival);
2899	AUDIT_SUBCALL_EXIT(uthread, error);
2900	}
2901	break;
2902
2903	case PSFA_FILEPORT_DUP2: {
2904	ipc_port_t port;
2905	kern_return_t kr;
2906	int origfd;
2907
2908	if (!MACH_PORT_VALID(psfa->psfaa_fileport)) {
2909	error = EINVAL;
2910	break;
2911	}
2912
2913	kr = ipc_object_copyin(space: get_task_ipcspace(t: current_task()),
2914	name: psfa->psfaa_fileport, MACH_MSG_TYPE_COPY_SEND,
2915	objectp: (ipc_object_t *) &port, context: `0`, NULL, IPC_OBJECT_COPYIN_FLAGS_ALLOW_IMMOVABLE_SEND);
2916
2917	if (kr != KERN_SUCCESS) {
2918	error = EINVAL;
2919	break;
2920	}
2921
2922	error = fileport_makefd(p, port, fp_flags: `0`, fd: &origfd);
2923
2924	if (IPC_PORT_NULL != port) {
2925	ipc_port_release_send(port);
2926	}
2927
2928	if (error \|\| origfd == psfa->psfaa_dup2args.psfad_newfiledes) {
2929	break;
2930	}
2931
2932	AUDIT_SUBCALL_ENTER(DUP2, p, uthread);
2933	error = dup2(p, p_cred, from: origfd,
2934	to: psfa->psfaa_dup2args.psfad_newfiledes, fd: ival);
2935	AUDIT_SUBCALL_EXIT(uthread, error);
2936	if (error) {
2937	break;
2938	}
2939
2940	AUDIT_SUBCALL_ENTER(CLOSE, p, uthread);
2941	error = close_nocancel(p, p_cred, fd: origfd);
2942	AUDIT_SUBCALL_EXIT(uthread, error);
2943	}
2944	break;
2945
2946	case PSFA_CLOSE: {
2947	AUDIT_SUBCALL_ENTER(CLOSE, p, uthread);
2948	error = close_nocancel(p, p_cred, fd: psfa->psfaa_filedes);
2949	AUDIT_SUBCALL_EXIT(uthread, error);
2950	}
2951	break;
2952
2953	case PSFA_INHERIT: {
2954	struct fileproc *fp;
2955
2956	/*
2957	* Check to see if the descriptor exists, and
2958	* ensure it's -not- marked as close-on-exec.
2959	*
2960	* Attempting to "inherit" a guarded fd will
2961	* result in a error.
2962	*/
2963
2964	proc_fdlock(p);
2965	if ((fp = fp_get_noref_locked(p, fd: psfa->psfaa_filedes)) == NULL) {
2966	error = EBADF;
2967	} else if (fp->fp_guard_attrs) {
2968	error = fp_guard_exception(p, fd: psfa->psfaa_filedes,
2969	fp, attribs: kGUARD_EXC_NOCLOEXEC);
2970	} else {
2971	fp->fp_flags &= ~FP_CLOEXEC;
2972	error = `0`;
2973	}
2974	proc_fdunlock(p);
2975	}
2976	break;
2977
2978	case PSFA_CHDIR: {
2979	/*
2980	* Chdir is different, in that it requires the use of
2981	* a path argument, which is normally copied in from
2982	* user space; because of this, we have to support a
2983	* chdir from kernel space that passes an address space
2984	* context of UIO_SYSSPACE, and casts the address
2985	* argument to a user_addr_t.
2986	*/
2987	struct nameidata *nd;
2988	nd = kalloc_type(struct nameidata,
2989	Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
2990
2991	AUDIT_SUBCALL_ENTER(CHDIR, p, uthread);
2992	NDINIT(nd, LOOKUP, OP_CHDIR, FOLLOW \| AUDITVNPATH1, UIO_SYSSPACE,
2993	CAST_USER_ADDR_T(psfa->psfaa_chdirargs.psfac_path),
2994	imgp->ip_vfs_context);
2995
2996	error = chdir_internal(p, ctx: imgp->ip_vfs_context, ndp: nd, per_thread: `0`);
2997	kfree_type(struct nameidata, nd);
2998	AUDIT_SUBCALL_EXIT(uthread, error);
2999	}
3000	break;
3001
3002	case PSFA_FCHDIR: {
3003	AUDIT_SUBCALL_ENTER(FCHDIR, p, uthread);
3004	error = fchdir(p, ctx: imgp->ip_vfs_context,
3005	fd: psfa->psfaa_filedes, false);
3006	AUDIT_SUBCALL_EXIT(uthread, error);
3007	}
3008	break;
3009
3010	default:
3011	error = EINVAL;
3012	break;
3013	}
3014
3015	/ All file actions failures are considered fatal, per POSIX /
3016
3017	if (error) {
3018	if (PSFA_OPEN == psfa->psfaa_type) {
3019	DTRACE_PROC1(spawn__open__failure, uintptr_t,
3020	psfa->psfaa_openargs.psfao_path);
3021	} else {
3022	DTRACE_PROC1(spawn__fd__failure, int, psfa->psfaa_filedes);
3023	}
3024	break;
3025	}
3026	}
3027
3028	if (error != `0` \|\| (psa_flags & POSIX_SPAWN_CLOEXEC_DEFAULT) == `0`) {
3029	return error;
3030	}
3031
3032	/*
3033	* If POSIX_SPAWN_CLOEXEC_DEFAULT is set, behave (during
3034	* this spawn only) as if "close on exec" is the default
3035	* disposition of all pre-existing file descriptors. In this case,
3036	* the list of file descriptors mentioned in the file actions
3037	* are the only ones that can be inherited, so mark them now.
3038	*
3039	* The actual closing part comes later, in fdt_exec().
3040	*/
3041	proc_fdlock(p);
3042	for (action = `0`; action < px_sfap->psfa_act_count; action++) {
3043	_psfa_action_t *psfa = &px_sfap->psfa_act_acts[action];
3044	int fd = psfa->psfaa_filedes;
3045
3046	switch (psfa->psfaa_type) {
3047	case PSFA_DUP2:
3048	case PSFA_FILEPORT_DUP2:
3049	fd = psfa->psfaa_dup2args.psfad_newfiledes;
3050	OS_FALLTHROUGH;
3051	case PSFA_OPEN:
3052	case PSFA_INHERIT:
3053	*fdflags(p, fd) \|= UF_INHERIT;
3054	break;
3055
3056	case PSFA_CLOSE:
3057	case PSFA_CHDIR:
3058	case PSFA_FCHDIR:
3059	/*
3060	* Although PSFA_FCHDIR does have a file descriptor, it is not
3061	* creating one, thus we do not automatically mark it for
3062	* inheritance under POSIX_SPAWN_CLOEXEC_DEFAULT. A client that
3063	* wishes it to be inherited should use the PSFA_INHERIT action
3064	* explicitly.
3065	*/
3066	break;
3067	}
3068	}
3069	proc_fdunlock(p);
3070
3071	return `0`;
3072	}
3073
3074	#if CONFIG_MACF
3075	/*
3076	* Check that the extension's data is within the bounds of the
3077	* allocation storing all extensions' data
3078	*/
3079	static inline errno_t
3080	exec_spawnattr_validate_policyext_data(const struct ip_px_smpx_s *px_s,
3081	const _ps_mac_policy_extension_t *ext)
3082	{
3083	uint64_t dataend;
3084
3085	if (__improbable(os_add_overflow(ext->dataoff, ext->datalen, &dataend))) {
3086	return EOVERFLOW;
3087	}
3088	if (__improbable(dataend > px_s->datalen)) {
3089	return EINVAL;
3090	}
3091
3092	return `0`;
3093	}
3094
3095	/*
3096	* exec_spawnattr_getmacpolicyinfo
3097	*/
3098	void *
3099	exec_spawnattr_getmacpolicyinfo(const void macextensions, const* char policyname, size_t lenp)
3100	{
3101	const struct ip_px_smpx_s *px_s = macextensions;
3102	const struct _posix_spawn_mac_policy_extensions *psmx = NULL;
3103	int i;
3104
3105	if (px_s == NULL) {
3106	return NULL;
3107	}
3108
3109	psmx = px_s->array;
3110	if (psmx == NULL) {
3111	return NULL;
3112	}
3113
3114	for (i = `0`; i < psmx->psmx_count; i++) {
3115	const _ps_mac_policy_extension_t *extension = &psmx->psmx_extensions[i];
3116	if (strncmp(s1: extension->policyname, s2: policyname, n: sizeof(extension->policyname)) == `0`) {
3117	if (__improbable(exec_spawnattr_validate_policyext_data(px_s, extension))) {
3118	panic("invalid mac policy extension data");
3119	}
3120	if (lenp != NULL) {
3121	*lenp = (size_t)extension->datalen;
3122	}
3123	return (void *)((uintptr_t)px_s->data + extension->dataoff);
3124	}
3125	}
3126
3127	if (lenp != NULL) {
3128	*lenp = `0`;
3129	}
3130	return NULL;
3131	}
3132
3133	static int
3134	spawn_copyin_macpolicyinfo(const struct user__posix_spawn_args_desc *px_args,
3135	struct ip_px_smpx_s *pxsp)
3136	{
3137	_posix_spawn_mac_policy_extensions_t psmx = NULL;
3138	uint8_t *data = NULL;
3139	uint64_t datalen = `0`;
3140	uint64_t dataoff = `0`;
3141	int error = `0`;
3142
3143	bzero(s: pxsp, n: sizeof(*pxsp));
3144
3145	if (px_args->mac_extensions_size < PS_MAC_EXTENSIONS_SIZE(`1`) \|\|
3146	px_args->mac_extensions_size > PAGE_SIZE) {
3147	error = EINVAL;
3148	goto bad;
3149	}
3150
3151	psmx = kalloc_data(px_args->mac_extensions_size, Z_WAITOK);
3152	if (psmx == NULL) {
3153	error = ENOMEM;
3154	goto bad;
3155	}
3156
3157	error = copyin(px_args->mac_extensions, psmx, px_args->mac_extensions_size);
3158	if (error) {
3159	goto bad;
3160	}
3161
3162	size_t extsize = PS_MAC_EXTENSIONS_SIZE(psmx->psmx_count);
3163	if (extsize == `0` \|\| extsize > px_args->mac_extensions_size) {
3164	error = EINVAL;
3165	goto bad;
3166	}
3167
3168	for (int i = `0`; i < psmx->psmx_count; i++) {
3169	_ps_mac_policy_extension_t *extension = &psmx->psmx_extensions[i];
3170	if (extension->datalen == `0` \|\| extension->datalen > PAGE_SIZE) {
3171	error = EINVAL;
3172	goto bad;
3173	}
3174	if (__improbable(os_add_overflow(datalen, extension->datalen, &datalen))) {
3175	error = ENOMEM;
3176	goto bad;
3177	}
3178	}
3179
3180	data = kalloc_data((vm_size_t)datalen, Z_WAITOK);
3181	if (data == NULL) {
3182	error = ENOMEM;
3183	goto bad;
3184	}
3185
3186	for (int i = `0`; i < psmx->psmx_count; i++) {
3187	_ps_mac_policy_extension_t *extension = &psmx->psmx_extensions[i];
3188
3189	#if !__LP64__
3190	if (extension->data > UINT32_MAX) {
3191	goto bad;
3192	}
3193	#endif
3194	error = copyin((user_addr_t)extension->data, &data[dataoff], (size_t)extension->datalen);
3195	if (error) {
3196	error = ENOMEM;
3197	goto bad;
3198	}
3199	extension->dataoff = dataoff;
3200	dataoff += extension->datalen;
3201	}
3202
3203	pxsp->array = psmx;
3204	pxsp->data = data;
3205	pxsp->datalen = datalen;
3206	return `0`;
3207
3208	bad:
3209	kfree_data(psmx, px_args->mac_extensions_size);
3210	kfree_data(data, (vm_size_t)datalen);
3211	return error;
3212	}
3213	#endif /* CONFIG_MACF */
3214
3215	#if CONFIG_COALITIONS
3216	static inline void
3217	spawn_coalitions_release_all(coalition_t coal[COALITION_NUM_TYPES])
3218	{
3219	for (int c = `0`; c < COALITION_NUM_TYPES; c++) {
3220	if (coal[c]) {
3221	coalition_remove_active(coal: coal[c]);
3222	coalition_release(coal: coal[c]);
3223	}
3224	}
3225	}
3226	#endif
3227
3228	#if CONFIG_PERSONAS
3229	static int
3230	spawn_validate_persona(struct _posix_spawn_persona_info *px_persona)
3231	{
3232	int error = `0`;
3233	struct persona *persona = NULL;
3234
3235	if (!IOCurrentTaskHasEntitlement( PERSONA_MGMT_ENTITLEMENT)) {
3236	return EPERM;
3237	}
3238
3239	if (px_persona->pspi_flags & POSIX_SPAWN_PERSONA_GROUPS) {
3240	if (px_persona->pspi_ngroups > NGROUPS_MAX) {
3241	return EINVAL;
3242	}
3243	}
3244
3245	persona = persona_lookup(id: px_persona->pspi_id);
3246	if (!persona) {
3247	error = ESRCH;
3248	goto out;
3249	}
3250
3251	out:
3252	if (persona) {
3253	persona_put(persona);
3254	}
3255
3256	return error;
3257	}
3258
3259	static bool
3260	kauth_cred_model_setpersona(
3261	kauth_cred_t model,
3262	struct _posix_spawn_persona_info *px_persona)
3263	{
3264	bool updated = false;
3265
3266	if (px_persona->pspi_flags & POSIX_SPAWN_PERSONA_UID) {
3267	updated \|= kauth_cred_model_setresuid(model,
3268	ruid: px_persona->pspi_uid,
3269	euid: px_persona->pspi_uid,
3270	svuid: px_persona->pspi_uid,
3271	KAUTH_UID_NONE);
3272	}
3273
3274	if (px_persona->pspi_flags & POSIX_SPAWN_PERSONA_GID) {
3275	updated \|= kauth_cred_model_setresgid(model,
3276	rgid: px_persona->pspi_gid,
3277	egid: px_persona->pspi_gid,
3278	svgid: px_persona->pspi_gid);
3279	}
3280
3281	if (px_persona->pspi_flags & POSIX_SPAWN_PERSONA_GROUPS) {
3282	updated \|= kauth_cred_model_setgroups(model,
3283	groups: px_persona->pspi_groups,
3284	groupcount: px_persona->pspi_ngroups,
3285	gmuid: px_persona->pspi_gmuid);
3286	}
3287
3288	return updated;
3289	}
3290
3291	static int
3292	spawn_persona_adopt(proc_t p, struct _posix_spawn_persona_info *px_persona)
3293	{
3294	struct persona *persona = NULL;
3295
3296	/*
3297	* we want to spawn into the given persona, but we want to override
3298	* the kauth with a different UID/GID combo
3299	*/
3300	persona = persona_lookup(id: px_persona->pspi_id);
3301	if (!persona) {
3302	return ESRCH;
3303	}
3304
3305	return persona_proc_adopt(p, persona,
3306	fn: ^bool (kauth_cred_t parent __unused, kauth_cred_t model) {
3307	return kauth_cred_model_setpersona(model, px_persona);
3308	});
3309	}
3310	#endif
3311
3312	#if __arm64__
3313	#if DEVELOPMENT \|\| DEBUG
3314	TUNABLE(int, legacy_footprint_entitlement_mode, "legacy_footprint_entitlement_mode",
3315	LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE);
3316
3317	__startup_func
3318	static void
3319	legacy_footprint_entitlement_mode_init(void)
3320	{
3321	/*
3322	* legacy_footprint_entitlement_mode specifies the behavior we want associated
3323	* with the entitlement. The supported modes are:
3324	*
3325	* LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE:
3326	* Indicates that we want every process to have the memory accounting
3327	* that is available in iOS 12.0 and beyond.
3328	*
3329	* LEGACY_FOOTPRINT_ENTITLEMENT_IOS11_ACCT:
3330	* Indicates that for every process that has the 'legacy footprint entitlement',
3331	* we want to give it the old iOS 11.0 accounting behavior which accounted some
3332	* of the process's memory to the kernel.
3333	*
3334	* LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE:
3335	* Indicates that for every process that has the 'legacy footprint entitlement',
3336	* we want it to have a higher memory limit which will help them acclimate to the
3337	* iOS 12.0 (& beyond) accounting behavior that does the right accounting.
3338	* The bonus added to the system-wide task limit to calculate this higher memory limit
3339	* is available in legacy_footprint_bonus_mb.
3340	*/
3341
3342	if (legacy_footprint_entitlement_mode < LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE \|\|
3343	legacy_footprint_entitlement_mode > LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE) {
3344	legacy_footprint_entitlement_mode = LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE;
3345	}
3346	}
3347	STARTUP(TUNABLES, STARTUP_RANK_MIDDLE, legacy_footprint_entitlement_mode_init);
3348	#else
3349	const int legacy_footprint_entitlement_mode = LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE;
3350	#endif
3351
3352	static inline void
3353	proc_legacy_footprint_entitled(proc_t p, task_t task)
3354	{
3355	#pragma unused(p)
3356	boolean_t legacy_footprint_entitled;
3357
3358	switch (legacy_footprint_entitlement_mode) {
3359	case LEGACY_FOOTPRINT_ENTITLEMENT_IGNORE:
3360	/ the entitlement is ignored /
3361	break;
3362	case LEGACY_FOOTPRINT_ENTITLEMENT_IOS11_ACCT:
3363	/ the entitlement grants iOS11 legacy accounting /
3364	legacy_footprint_entitled = memorystatus_task_has_legacy_footprint_entitlement(task: proc_task(p));
3365	if (legacy_footprint_entitled) {
3366	task_set_legacy_footprint(task);
3367	}
3368	break;
3369	case LEGACY_FOOTPRINT_ENTITLEMENT_LIMIT_INCREASE:
3370	/ the entitlement grants a footprint limit increase /
3371	legacy_footprint_entitled = memorystatus_task_has_legacy_footprint_entitlement(task: proc_task(p));
3372	if (legacy_footprint_entitled) {
3373	task_set_extra_footprint_limit(task);
3374	}
3375	break;
3376	default:
3377	break;
3378	}
3379	}
3380
3381	static inline void
3382	proc_ios13extended_footprint_entitled(proc_t p, task_t task)
3383	{
3384	#pragma unused(p)
3385	boolean_t ios13extended_footprint_entitled;
3386
3387	/ the entitlement grants a footprint limit increase /
3388	ios13extended_footprint_entitled = memorystatus_task_has_ios13extended_footprint_limit(task: proc_task(p));
3389	if (ios13extended_footprint_entitled) {
3390	task_set_ios13extended_footprint_limit(task);
3391	}
3392	}
3393
3394	static inline void
3395	proc_increased_memory_limit_entitled(proc_t p, task_t task)
3396	{
3397	bool entitled = memorystatus_task_has_increased_memory_limit_entitlement(task);
3398
3399	if (entitled) {
3400	memorystatus_act_on_entitled_task_limit(p);
3401	}
3402	}
3403
3404	/*
3405	* Check for any of the various entitlements that permit a higher
3406	* task footprint limit or alternate accounting and apply them.
3407	*/
3408	static inline void
3409	proc_footprint_entitlement_hacks(proc_t p, task_t task)
3410	{
3411	proc_legacy_footprint_entitled(p, task);
3412	proc_ios13extended_footprint_entitled(p, task);
3413	proc_increased_memory_limit_entitled(p, task);
3414	}
3415	#endif /* __arm64__ */
3416
3417	/*
3418	* Processes with certain entitlements are granted a jumbo-size VM map.
3419	*/
3420	static inline void
3421	proc_apply_jit_and_vm_policies(struct image_params *imgp, proc_t p, task_t task)
3422	{
3423	#if CONFIG_MACF
3424	bool jit_entitled = false;
3425	#endif /* CONFIG_MACF */
3426	bool needs_jumbo_va = false;
3427	struct _posix_spawnattr *psa = imgp->ip_px_sa;
3428
3429	#if CONFIG_MACF
3430	jit_entitled = (mac_proc_check_map_anon(proc: p, cred: proc_ucred_unsafe(p),
3431	u_addr: `0`, u_size: `0`, prot: `0`, MAP_JIT, NULL) == `0`);
3432	needs_jumbo_va = jit_entitled \|\| IOTaskHasEntitlement(task,
3433	entitlement: "com.apple.developer.kernel.extended-virtual-addressing") \|\|
3434	memorystatus_task_has_increased_memory_limit_entitlement(task);
3435	#else
3436	#pragma unused(p)
3437	#endif /* CONFIG_MACF */
3438
3439
3440	if (needs_jumbo_va) {
3441	vm_map_set_jumbo(map: get_task_map(task));
3442	}
3443
3444	if (psa && psa->psa_max_addr) {
3445	vm_map_set_max_addr(map: get_task_map(task), new_max_offset: psa->psa_max_addr);
3446	}
3447
3448	#if CONFIG_MAP_RANGES
3449	if (task_is_hardened_binary(task) && !proc_is_simulated(p)) {
3450	/*
3451	* This must be done last as it needs to observe
3452	* any kind of VA space growth that was requested.
3453	* This is used by the secure allocator, so
3454	* must be applied to all hardened binaries
3455	*/
3456	vm_map_range_configure(get_task_map(task));
3457	}
3458	#endif /* CONFIG_MAP_RANGES */
3459
3460	#if CONFIG_MACF
3461	if (jit_entitled) {
3462	vm_map_set_jit_entitled(map: get_task_map(task));
3463
3464	}
3465	#endif /* CONFIG_MACF */
3466
3467	#if XNU_TARGET_OS_OSX
3468	/ TPRO cannot be enforced on binaries that load 3P plugins on macos - rdar://107420220 /
3469	const bool task_loads_3P_plugins = imgp->ip_flags & IMGPF_3P_PLUGINS;
3470	#endif /* XNU_TARGET_OS_OSX */
3471
3472	if (task_is_hardened_binary(task)
3473	#if XNU_TARGET_OS_OSX
3474	&& !task_loads_3P_plugins
3475	#endif /* XNU_TARGET_OS_OSX */
3476	) {
3477	/*
3478	* Pre-emptively disable TPRO remapping for
3479	* hardened binaries (which do not load 3P plugins)
3480	*/
3481	vm_map_set_tpro_enforcement(map: get_task_map(task));
3482	}
3483	}
3484
3485	static int
3486	spawn_posix_cred_adopt(proc_t p,
3487	struct _posix_spawn_posix_cred_info *px_pcred_info)
3488	{
3489	int error = `0`;
3490
3491	if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_GID) {
3492	struct setgid_args args = {
3493	.gid = px_pcred_info->pspci_gid,
3494	};
3495	error = setgid(p, &args, NULL);
3496	if (error) {
3497	return error;
3498	}
3499	}
3500
3501	if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_GROUPS) {
3502	error = setgroups_internal(p,
3503	gidsetsize: px_pcred_info->pspci_ngroups,
3504	gidset: px_pcred_info->pspci_groups,
3505	gmuid: px_pcred_info->pspci_gmuid);
3506	if (error) {
3507	return error;
3508	}
3509	}
3510
3511	if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_UID) {
3512	struct setuid_args args = {
3513	.uid = px_pcred_info->pspci_uid,
3514	};
3515	error = setuid(p, &args, NULL);
3516	if (error) {
3517	return error;
3518	}
3519	}
3520	return `0`;
3521	}
3522
3523	/*
3524	* posix_spawn
3525	*
3526	* Parameters: uap->pid Pointer to pid return area
3527	* uap->fname File name to exec
3528	* uap->argp Argument list
3529	* uap->envp Environment list
3530	*
3531	* Returns: 0 Success
3532	* EINVAL Invalid argument
3533	* ENOTSUP Not supported
3534	* ENOEXEC Executable file format error
3535	* exec_activate_image:EINVAL Invalid argument
3536	* exec_activate_image:EACCES Permission denied
3537	* exec_activate_image:EINTR Interrupted function
3538	* exec_activate_image:ENOMEM Not enough space
3539	* exec_activate_image:EFAULT Bad address
3540	* exec_activate_image:ENAMETOOLONG Filename too long
3541	* exec_activate_image:ENOEXEC Executable file format error
3542	* exec_activate_image:ETXTBSY Text file busy [misuse of error code]
3543	* exec_activate_image:EAUTH Image decryption failed
3544	* exec_activate_image:EBADEXEC The executable is corrupt/unknown
3545	* exec_activate_image:???
3546	* mac_execve_enter:???
3547	*
3548	* TODO: Expect to need __mac_posix_spawn() at some point...
3549	* Handle posix_spawnattr_t
3550	* Handle posix_spawn_file_actions_t
3551	*/
3552	int
3553	posix_spawn(proc_t ap, struct posix_spawn_args uap, int32_t retval)
3554	{
3555	proc_t p = ap;
3556	user_addr_t pid = uap->pid;
3557	int ival[`2`]; / dummy retval for setpgid() /
3558	char *subsystem_root_path = NULL;
3559	struct image_params *imgp = NULL;
3560	struct vnode_attr *vap = NULL;
3561	struct vnode_attr *origvap = NULL;
3562	struct uthread uthread = `0`; /* compiler complains if not set to 0/
3563	int error, sig;
3564	int is_64 = IS_64BIT_PROCESS(p);
3565	struct vfs_context context;
3566	struct user__posix_spawn_args_desc px_args = {};
3567	struct _posix_spawnattr px_sa = {};
3568	_posix_spawn_file_actions_t px_sfap = NULL;
3569	_posix_spawn_port_actions_t px_spap = NULL;
3570	struct __kern_sigaction vec;
3571	boolean_t spawn_no_exec = FALSE;
3572	boolean_t proc_transit_set = TRUE;
3573	boolean_t proc_signal_set = TRUE;
3574	boolean_t exec_done = FALSE;
3575	os_reason_t exec_failure_reason = NULL;
3576
3577	struct exec_port_actions port_actions = { };
3578	vm_size_t px_sa_offset = offsetof(struct _posix_spawnattr, psa_ports);
3579	task_t old_task = current_task();
3580	task_t new_task = NULL;
3581	boolean_t should_release_proc_ref = FALSE;
3582	void *inherit = NULL;
3583	uint8_t crash_behavior = `0`;
3584	uint64_t crash_behavior_deadline = `0`;
3585	#if CONFIG_EXCLAVES
3586	char *task_conclave_id = NULL;
3587	#endif
3588	#if CONFIG_PERSONAS
3589	struct _posix_spawn_persona_info *px_persona = NULL;
3590	#endif
3591	struct _posix_spawn_posix_cred_info *px_pcred_info = NULL;
3592	struct {
3593	struct image_params imgp;
3594	struct vnode_attr va;
3595	struct vnode_attr origva;
3596	} *__spawn_data;
3597
3598	/*
3599	* Allocate a big chunk for locals instead of using stack since these
3600	* structures are pretty big.
3601	*/
3602	__spawn_data = kalloc_type(typeof(*__spawn_data), Z_WAITOK \| Z_ZERO);
3603	if (__spawn_data == NULL) {
3604	error = ENOMEM;
3605	goto bad;
3606	}
3607	imgp = &__spawn_data->imgp;
3608	vap = &__spawn_data->va;
3609	origvap = &__spawn_data->origva;
3610
3611	/ Initialize the common data in the image_params structure /
3612	imgp->ip_user_fname = uap->path;
3613	imgp->ip_user_argv = uap->argv;
3614	imgp->ip_user_envv = uap->envp;
3615	imgp->ip_vattr = vap;
3616	imgp->ip_origvattr = origvap;
3617	imgp->ip_vfs_context = &context;
3618	imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT_ADDR : IMGPF_NONE);
3619	imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
3620	imgp->ip_mac_return = `0`;
3621	imgp->ip_px_persona = NULL;
3622	imgp->ip_px_pcred_info = NULL;
3623	imgp->ip_cs_error = OS_REASON_NULL;
3624	imgp->ip_simulator_binary = IMGPF_SB_DEFAULT;
3625	imgp->ip_subsystem_root_path = NULL;
3626	imgp->ip_inherited_shared_region_id = NULL;
3627	imgp->ip_inherited_jop_pid = `0`;
3628	uthread_set_exec_data(uth: current_uthread(), imgp);
3629
3630	if (uap->adesc != USER_ADDR_NULL) {
3631	if (is_64) {
3632	error = copyin(uap->adesc, &px_args, sizeof(px_args));
3633	} else {
3634	struct user32__posix_spawn_args_desc px_args32;
3635
3636	error = copyin(uap->adesc, &px_args32, sizeof(px_args32));
3637
3638	/*
3639	* Convert arguments descriptor from external 32 bit
3640	* representation to internal 64 bit representation
3641	*/
3642	px_args.attr_size = px_args32.attr_size;
3643	px_args.attrp = CAST_USER_ADDR_T(px_args32.attrp);
3644	px_args.file_actions_size = px_args32.file_actions_size;
3645	px_args.file_actions = CAST_USER_ADDR_T(px_args32.file_actions);
3646	px_args.port_actions_size = px_args32.port_actions_size;
3647	px_args.port_actions = CAST_USER_ADDR_T(px_args32.port_actions);
3648	px_args.mac_extensions_size = px_args32.mac_extensions_size;
3649	px_args.mac_extensions = CAST_USER_ADDR_T(px_args32.mac_extensions);
3650	px_args.coal_info_size = px_args32.coal_info_size;
3651	px_args.coal_info = CAST_USER_ADDR_T(px_args32.coal_info);
3652	px_args.persona_info_size = px_args32.persona_info_size;
3653	px_args.persona_info = CAST_USER_ADDR_T(px_args32.persona_info);
3654	px_args.posix_cred_info_size = px_args32.posix_cred_info_size;
3655	px_args.posix_cred_info = CAST_USER_ADDR_T(px_args32.posix_cred_info);
3656	px_args.subsystem_root_path_size = px_args32.subsystem_root_path_size;
3657	px_args.subsystem_root_path = CAST_USER_ADDR_T(px_args32.subsystem_root_path);
3658	px_args.conclave_id_size = px_args32.conclave_id_size;
3659	px_args.conclave_id = CAST_USER_ADDR_T(px_args32.conclave_id);
3660	}
3661	if (error) {
3662	goto bad;
3663	}
3664
3665	if (px_args.attr_size != `0`) {
3666	/*
3667	* We are not copying the port_actions pointer,
3668	* because we already have it from px_args.
3669	* This is a bit fragile: <rdar://problem/16427422>
3670	*/
3671
3672	if ((error = copyin(px_args.attrp, &px_sa, px_sa_offset)) != `0`) {
3673	goto bad;
3674	}
3675
3676	imgp->ip_px_sa = &px_sa;
3677	}
3678	if (px_args.file_actions_size != `0`) {
3679	/ Limit file_actions to allowed number of open files /
3680	size_t maxfa_size = PSF_ACTIONS_SIZE(proc_limitgetcur_nofile(p));
3681
3682	if (px_args.file_actions_size < PSF_ACTIONS_SIZE(`1`) \|\|
3683	maxfa_size == `0` \|\| px_args.file_actions_size > maxfa_size) {
3684	error = EINVAL;
3685	goto bad;
3686	}
3687
3688	px_sfap = kalloc_data(px_args.file_actions_size, Z_WAITOK);
3689	if (px_sfap == NULL) {
3690	error = ENOMEM;
3691	goto bad;
3692	}
3693	imgp->ip_px_sfa = px_sfap;
3694
3695	if ((error = copyin(px_args.file_actions, px_sfap,
3696	px_args.file_actions_size)) != `0`) {
3697	goto bad;
3698	}
3699
3700	/ Verify that the action count matches the struct size /
3701	size_t psfsize = PSF_ACTIONS_SIZE(px_sfap->psfa_act_count);
3702	if (psfsize == `0` \|\| psfsize != px_args.file_actions_size) {
3703	error = EINVAL;
3704	goto bad;
3705	}
3706	}
3707	if (px_args.port_actions_size != `0`) {
3708	/ Limit port_actions to one page of data /
3709	if (px_args.port_actions_size < PS_PORT_ACTIONS_SIZE(`1`) \|\|
3710	px_args.port_actions_size > PAGE_SIZE) {
3711	error = EINVAL;
3712	goto bad;
3713	}
3714
3715	px_spap = kalloc_data(px_args.port_actions_size, Z_WAITOK);
3716	if (px_spap == NULL) {
3717	error = ENOMEM;
3718	goto bad;
3719	}
3720	imgp->ip_px_spa = px_spap;
3721
3722	if ((error = copyin(px_args.port_actions, px_spap,
3723	px_args.port_actions_size)) != `0`) {
3724	goto bad;
3725	}
3726
3727	/ Verify that the action count matches the struct size /
3728	size_t pasize = PS_PORT_ACTIONS_SIZE(px_spap->pspa_count);
3729	if (pasize == `0` \|\| pasize != px_args.port_actions_size) {
3730	error = EINVAL;
3731	goto bad;
3732	}
3733	}
3734	#if CONFIG_PERSONAS
3735	/ copy in the persona info /
3736	if (px_args.persona_info_size != `0` && px_args.persona_info != `0`) {
3737	/ for now, we need the exact same struct in user space /
3738	if (px_args.persona_info_size != sizeof(*px_persona)) {
3739	error = ERANGE;
3740	goto bad;
3741	}
3742
3743	px_persona = kalloc_data(px_args.persona_info_size, Z_WAITOK);
3744	if (px_persona == NULL) {
3745	error = ENOMEM;
3746	goto bad;
3747	}
3748	imgp->ip_px_persona = px_persona;
3749
3750	if ((error = copyin(px_args.persona_info, px_persona,
3751	px_args.persona_info_size)) != `0`) {
3752	goto bad;
3753	}
3754	if ((error = spawn_validate_persona(px_persona)) != `0`) {
3755	goto bad;
3756	}
3757	}
3758	#endif
3759	/ copy in the posix cred info /
3760	if (px_args.posix_cred_info_size != `0` && px_args.posix_cred_info != `0`) {
3761	/ for now, we need the exact same struct in user space /
3762	if (px_args.posix_cred_info_size != sizeof(*px_pcred_info)) {
3763	error = ERANGE;
3764	goto bad;
3765	}
3766
3767	if (!kauth_cred_issuser(cred: kauth_cred_get())) {
3768	error = EPERM;
3769	goto bad;
3770	}
3771
3772	px_pcred_info = kalloc_data(px_args.posix_cred_info_size, Z_WAITOK);
3773	if (px_pcred_info == NULL) {
3774	error = ENOMEM;
3775	goto bad;
3776	}
3777	imgp->ip_px_pcred_info = px_pcred_info;
3778
3779	if ((error = copyin(px_args.posix_cred_info, px_pcred_info,
3780	px_args.posix_cred_info_size)) != `0`) {
3781	goto bad;
3782	}
3783
3784	if (px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_GROUPS) {
3785	if (px_pcred_info->pspci_ngroups > NGROUPS_MAX) {
3786	error = EINVAL;
3787	goto bad;
3788	}
3789	}
3790	}
3791	#if CONFIG_MACF
3792	if (px_args.mac_extensions_size != `0`) {
3793	if ((error = spawn_copyin_macpolicyinfo(px_args: &px_args, pxsp: (struct ip_px_smpx_s *)&imgp->ip_px_smpx)) != `0`) {
3794	goto bad;
3795	}
3796	}
3797	#endif /* CONFIG_MACF */
3798	if ((px_args.subsystem_root_path_size > `0`) && (px_args.subsystem_root_path_size <= MAXPATHLEN)) {
3799	/*
3800	* If a valid-looking subsystem root has been
3801	* specified...
3802	*/
3803	if (IOTaskHasEntitlement(task: old_task, SPAWN_SUBSYSTEM_ROOT_ENTITLEMENT)) {
3804	/*
3805	* ...AND the parent has the entitlement, copy
3806	* the subsystem root path in.
3807	*/
3808	subsystem_root_path = zalloc_flags(ZV_NAMEI,
3809	Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
3810
3811	if ((error = copyin(px_args.subsystem_root_path, subsystem_root_path, px_args.subsystem_root_path_size))) {
3812	goto bad;
3813	}
3814
3815	/ Paranoia /
3816	subsystem_root_path[px_args.subsystem_root_path_size - `1`] = `0`;
3817	}
3818	}
3819	#if CONFIG_EXCLAVES
3820	if ((px_args.conclave_id_size > `0`) && (px_args.conclave_id_size <= MAXCONCLAVENAME) &&
3821	(exclaves_get_status() == EXCLAVES_STATUS_AVAILABLE)) {
3822	if (px_args.conclave_id) {
3823	if (imgp->ip_px_sa != NULL && (px_sa.psa_flags & POSIX_SPAWN_SETEXEC)) {
3824	/ Conclave id could be set only for true spawn /
3825	error = EINVAL;
3826	goto bad;
3827	}
3828	task_conclave_id = kalloc_data(MAXCONCLAVENAME,
3829	Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
3830	if ((error = copyin(px_args.conclave_id, task_conclave_id, MAXCONCLAVENAME))) {
3831	goto bad;
3832	}
3833	task_conclave_id[MAXCONCLAVENAME - `1`] = `0`;
3834	}
3835	}
3836	#endif
3837	}
3838
3839	if (IOTaskHasEntitlement(task: old_task, SPAWN_SET_PANIC_CRASH_BEHAVIOR)) {
3840	/ Truncate to uint8_t since we only support 2 flags for now /
3841	crash_behavior = (uint8_t)px_sa.psa_crash_behavior;
3842	crash_behavior_deadline = px_sa.psa_crash_behavior_deadline;
3843	}
3844
3845	/ set uthread to parent /
3846	uthread = current_uthread();
3847
3848	/*
3849	* <rdar://6640530>; this does not result in a behaviour change
3850	* relative to Leopard, so there should not be any existing code
3851	* which depends on it.
3852	*/
3853
3854	if (imgp->ip_px_sa != NULL) {
3855	struct _posix_spawnattr psa = (struct* _posix_spawnattr *) imgp->ip_px_sa;
3856	if ((psa->psa_options & PSA_OPTION_PLUGIN_HOST_DISABLE_A_KEYS) == PSA_OPTION_PLUGIN_HOST_DISABLE_A_KEYS) {
3857	imgp->ip_flags \|= IMGPF_PLUGIN_HOST_DISABLE_A_KEYS;
3858	}
3859	#if (DEVELOPMENT \|\| DEBUG)
3860	if ((psa->psa_options & PSA_OPTION_ALT_ROSETTA) == PSA_OPTION_ALT_ROSETTA) {
3861	imgp->ip_flags \|= (IMGPF_ROSETTA \| IMGPF_ALT_ROSETTA);
3862	}
3863	#endif
3864
3865	if ((error = exec_validate_spawnattr_policy(psa_apptype: psa->psa_apptype)) != `0`) {
3866	goto bad;
3867	}
3868	}
3869
3870	/*
3871	* If we don't have the extension flag that turns "posix_spawn()"
3872	* into "execve() with options", then we will be creating a new
3873	* process which does not inherit memory from the parent process,
3874	* which is one of the most expensive things about using fork()
3875	* and execve().
3876	*/
3877	if (imgp->ip_px_sa == NULL \|\| !(px_sa.psa_flags & POSIX_SPAWN_SETEXEC)) {
3878	/ Set the new task's coalition, if it is requested. /
3879	coalition_t coal[COALITION_NUM_TYPES] = { COALITION_NULL };
3880	#if CONFIG_COALITIONS
3881	int i, ncoals;
3882	kern_return_t kr = KERN_SUCCESS;
3883	struct _posix_spawn_coalition_info coal_info;
3884	int coal_role[COALITION_NUM_TYPES];
3885
3886	if (imgp->ip_px_sa == NULL \|\| !px_args.coal_info) {
3887	goto do_fork1;
3888	}
3889
3890	memset(s: &coal_info, c: `0`, n: sizeof(coal_info));
3891
3892	if (px_args.coal_info_size > sizeof(coal_info)) {
3893	px_args.coal_info_size = sizeof(coal_info);
3894	}
3895	error = copyin(px_args.coal_info,
3896	&coal_info, px_args.coal_info_size);
3897	if (error != `0`) {
3898	goto bad;
3899	}
3900
3901	ncoals = `0`;
3902	for (i = `0`; i < COALITION_NUM_TYPES; i++) {
3903	uint64_t cid = coal_info.psci_info[i].psci_id;
3904	if (cid != `0`) {
3905	/*
3906	* don't allow tasks which are not in a
3907	* privileged coalition to spawn processes
3908	* into coalitions other than their own
3909	*/
3910	if (!task_is_in_privileged_coalition(task: proc_task(p), type: i) &&
3911	!IOTaskHasEntitlement(task: proc_task(p), COALITION_SPAWN_ENTITLEMENT)) {
3912	coal_dbg("ERROR: %d not in privilegd "
3913	"coalition of type %d",
3914	proc_getpid(p), i);
3915	spawn_coalitions_release_all(coal);
3916	error = EPERM;
3917	goto bad;
3918	}
3919
3920	coal_dbg("searching for coalition id:%llu", cid);
3921	/*
3922	* take a reference and activation on the
3923	* coalition to guard against free-while-spawn
3924	* races
3925	*/
3926	coal[i] = coalition_find_and_activate_by_id(coal_id: cid);
3927	if (coal[i] == COALITION_NULL) {
3928	coal_dbg("could not find coalition id:%llu "
3929	"(perhaps it has been terminated or reaped)", cid);
3930	/*
3931	* release any other coalitions we
3932	* may have a reference to
3933	*/
3934	spawn_coalitions_release_all(coal);
3935	error = ESRCH;
3936	goto bad;
3937	}
3938	if (coalition_type(coal: coal[i]) != i) {
3939	coal_dbg("coalition with id:%lld is not of type:%d"
3940	" (it's type:%d)", cid, i, coalition_type(coal[i]));
3941	spawn_coalitions_release_all(coal);
3942	error = ESRCH;
3943	goto bad;
3944	}
3945	coal_role[i] = coal_info.psci_info[i].psci_role;
3946	ncoals++;
3947	}
3948	}
3949	if (ncoals < COALITION_NUM_TYPES) {
3950	/*
3951	* If the user is attempting to spawn into a subset of
3952	* the known coalition types, then make sure they have
3953	* _at_least_ specified a resource coalition. If not,
3954	* the following fork1() call will implicitly force an
3955	* inheritance from 'p' and won't actually spawn the
3956	* new task into the coalitions the user specified.
3957	* (also the call to coalitions_set_roles will panic)
3958	*/
3959	if (coal[COALITION_TYPE_RESOURCE] == COALITION_NULL) {
3960	spawn_coalitions_release_all(coal);
3961	error = EINVAL;
3962	goto bad;
3963	}
3964	}
3965	do_fork1:
3966	#endif /* CONFIG_COALITIONS */
3967
3968	/*
3969	* note that this will implicitly inherit the
3970	* caller's persona (if it exists)
3971	*/
3972	error = fork1(p, &imgp->ip_new_thread, PROC_CREATE_SPAWN, coal);
3973	/ returns a thread and task reference /
3974
3975	if (error == `0`) {
3976	new_task = get_threadtask(imgp->ip_new_thread);
3977	}
3978	#if CONFIG_COALITIONS
3979	/ set the roles of this task within each given coalition /
3980	if (error == `0`) {
3981	kr = coalitions_set_roles(coalitions: coal, task: new_task, roles: coal_role);
3982	if (kr != KERN_SUCCESS) {
3983	error = EINVAL;
3984	}
3985	if (kdebug_debugid_enabled(MACHDBG_CODE(DBG_MACH_COALITION,
3986	MACH_COALITION_ADOPT))) {
3987	for (i = `0`; i < COALITION_NUM_TYPES; i++) {
3988	if (coal[i] != COALITION_NULL) {
3989	/*
3990	* On 32-bit targets, uniqueid
3991	* will get truncated to 32 bits
3992	*/
3993	KDBG_RELEASE(MACHDBG_CODE(
3994	DBG_MACH_COALITION,
3995	MACH_COALITION_ADOPT),
3996	coalition_id(coal[i]),
3997	get_task_uniqueid(new_task));
3998	}
3999	}
4000	}
4001	}
4002
4003	/ drop our references and activations - fork1() now holds them /
4004	spawn_coalitions_release_all(coal);
4005	#endif /* CONFIG_COALITIONS */
4006	if (error != `0`) {
4007	goto bad;
4008	}
4009	imgp->ip_flags \|= IMGPF_SPAWN; / spawn w/o exec /
4010	spawn_no_exec = TRUE; / used in later tests /
4011	} else {
4012	/ Adjust the user proc count /
4013	(void)chgproccnt(uid: kauth_getruid(), diff: `1`);
4014	/*
4015	* For execve case, create a new proc, task and thread
4016	* but don't make the proc visible to userland. After
4017	* image activation, the new proc would take place of
4018	* the old proc in pid hash and other lists that make
4019	* the proc visible to the system.
4020	*/
4021	imgp->ip_new_thread = cloneproc(old_task, NULL, p, CLONEPROC_EXEC);
4022
4023	/ task and thread ref returned by cloneproc /
4024	if (imgp->ip_new_thread == NULL) {
4025	(void)chgproccnt(uid: kauth_getruid(), diff: -`1`);
4026	error = ENOMEM;
4027	goto bad;
4028	}
4029
4030	new_task = get_threadtask(imgp->ip_new_thread);
4031	imgp->ip_flags \|= IMGPF_EXEC;
4032	}
4033
4034	p = (proc_t)get_bsdthreadtask_info(imgp->ip_new_thread);
4035
4036	if (spawn_no_exec) {
4037	/*
4038	* We had to wait until this point before firing the
4039	* proc:::create probe, otherwise p would not point to the
4040	* child process.
4041	*/
4042	DTRACE_PROC1(create, proc_t, p);
4043	}
4044	assert(p != NULL);
4045
4046	if (subsystem_root_path) {
4047	/ If a subsystem root was specified, swap it in /
4048	char * old_subsystem_root_path = p->p_subsystem_root_path;
4049	p->p_subsystem_root_path = subsystem_root_path;
4050	subsystem_root_path = old_subsystem_root_path;
4051	}
4052
4053	p->p_crash_behavior = crash_behavior;
4054	p->p_crash_behavior_deadline = crash_behavior_deadline;
4055
4056	p->p_crash_count = px_sa.psa_crash_count;
4057	p->p_throttle_timeout = px_sa.psa_throttle_timeout;
4058
4059	/ We'll need the subsystem root for setting up Apple strings /
4060	imgp->ip_subsystem_root_path = p->p_subsystem_root_path;
4061
4062	context.vc_thread = imgp->ip_new_thread;
4063	context.vc_ucred = proc_ucred_unsafe(p); / in init /
4064
4065	/*
4066	* Post fdt_fork(), pre exec_handle_sugid() - this is where we want
4067	* to handle the file_actions.
4068	*/
4069
4070	/ Has spawn file actions? /
4071	if (imgp->ip_px_sfa != NULL) {
4072	/*
4073	* The POSIX_SPAWN_CLOEXEC_DEFAULT flag
4074	* is handled in exec_handle_file_actions().
4075	*/
4076	#if CONFIG_AUDIT
4077	/*
4078	* The file actions auditing can overwrite the upath of
4079	* AUE_POSIX_SPAWN audit record. Save the audit record.
4080	*/
4081	struct kaudit_record *save_uu_ar = uthread->uu_ar;
4082	uthread->uu_ar = NULL;
4083	#endif
4084	error = exec_handle_file_actions(imgp,
4085	psa_flags: imgp->ip_px_sa != NULL ? px_sa.psa_flags : `0`);
4086	#if CONFIG_AUDIT
4087	/ Restore the AUE_POSIX_SPAWN audit record. /
4088	uthread->uu_ar = save_uu_ar;
4089	#endif
4090	if (error != `0`) {
4091	goto bad;
4092	}
4093	}
4094
4095	/ Has spawn port actions? /
4096	if (imgp->ip_px_spa != NULL) {
4097	#if CONFIG_AUDIT
4098	/*
4099	* Do the same for the port actions as we did for the file
4100	* actions. Save the AUE_POSIX_SPAWN audit record.
4101	*/
4102	struct kaudit_record *save_uu_ar = uthread->uu_ar;
4103	uthread->uu_ar = NULL;
4104	#endif
4105	error = exec_handle_port_actions(imgp, actions: &port_actions);
4106	#if CONFIG_AUDIT
4107	/ Restore the AUE_POSIX_SPAWN audit record. /
4108	uthread->uu_ar = save_uu_ar;
4109	#endif
4110	if (error != `0`) {
4111	goto bad;
4112	}
4113	}
4114
4115	/ Has spawn attr? /
4116	if (imgp->ip_px_sa != NULL) {
4117	/*
4118	* Reset UID/GID to parent's RUID/RGID; This works only
4119	* because the operation occurs before the call
4120	* to exec_handle_sugid() by the image activator called
4121	* from exec_activate_image().
4122	*
4123	* POSIX requires that any setuid/setgid bits on the process
4124	* image will take precedence over the spawn attributes
4125	* (re)setting them.
4126	*
4127	* Modifications to p_ucred must be guarded using the
4128	* proc's ucred lock. This prevents others from accessing
4129	* a garbage credential.
4130	*/
4131	if (px_sa.psa_flags & POSIX_SPAWN_RESETIDS) {
4132	kauth_cred_proc_update(p, action: PROC_SETTOKEN_NONE,
4133	fn: ^bool (kauth_cred_t parent __unused, kauth_cred_t model){
4134	return kauth_cred_model_setuidgid(model,
4135	uid: kauth_cred_getruid(cred: parent),
4136	gid: kauth_cred_getrgid(cred: parent));
4137	});
4138	}
4139
4140	if (imgp->ip_px_pcred_info) {
4141	if (!spawn_no_exec) {
4142	error = ENOTSUP;
4143	goto bad;
4144	}
4145
4146	error = spawn_posix_cred_adopt(p, px_pcred_info: imgp->ip_px_pcred_info);
4147	if (error != `0`) {
4148	goto bad;
4149	}
4150	}
4151
4152	#if CONFIG_PERSONAS
4153	if (imgp->ip_px_persona != NULL) {
4154	if (!spawn_no_exec) {
4155	error = ENOTSUP;
4156	goto bad;
4157	}
4158
4159	/*
4160	* If we were asked to spawn a process into a new persona,
4161	* do the credential switch now (which may override the UID/GID
4162	* inherit done just above). It's important to do this switch
4163	* before image activation both for reasons stated above, and
4164	* to ensure that the new persona has access to the image/file
4165	* being executed.
4166	*/
4167	error = spawn_persona_adopt(p, px_persona: imgp->ip_px_persona);
4168	if (error != `0`) {
4169	goto bad;
4170	}
4171	}
4172	#endif /* CONFIG_PERSONAS */
4173	#if !SECURE_KERNEL
4174	/*
4175	* Disable ASLR for the spawned process.
4176	*
4177	* But only do so if we are not embedded + RELEASE.
4178	* While embedded allows for a boot-arg (-disable_aslr)
4179	* to deal with this (which itself is only honored on
4180	* DEVELOPMENT or DEBUG builds of xnu), it is often
4181	* useful or necessary to disable ASLR on a per-process
4182	* basis for unit testing and debugging.
4183	*/
4184	if (px_sa.psa_flags & _POSIX_SPAWN_DISABLE_ASLR) {
4185	OSBitOrAtomic(P_DISABLE_ASLR, &p->p_flag);
4186	}
4187	#endif /* !SECURE_KERNEL */
4188
4189	/ Randomize high bits of ASLR slide /
4190	if (px_sa.psa_flags & _POSIX_SPAWN_HIGH_BITS_ASLR) {
4191	imgp->ip_flags \|= IMGPF_HIGH_BITS_ASLR;
4192	}
4193
4194	#if !SECURE_KERNEL
4195	/*
4196	* Forcibly disallow execution from data pages for the spawned process
4197	* even if it would otherwise be permitted by the architecture default.
4198	*/
4199	if (px_sa.psa_flags & _POSIX_SPAWN_ALLOW_DATA_EXEC) {
4200	imgp->ip_flags \|= IMGPF_ALLOW_DATA_EXEC;
4201	}
4202	#endif /* !SECURE_KERNEL */
4203
4204	#if __has_feature(ptrauth_calls)
4205	if (vm_shared_region_reslide_aslr && is_64 && (px_sa.psa_flags & _POSIX_SPAWN_RESLIDE)) {
4206	imgp->ip_flags \|= IMGPF_RESLIDE;
4207	}
4208	#endif /* __has_feature(ptrauth_calls) */
4209
4210	if ((px_sa.psa_apptype & POSIX_SPAWN_PROC_TYPE_MASK) ==
4211	POSIX_SPAWN_PROC_TYPE_DRIVER) {
4212	imgp->ip_flags \|= IMGPF_DRIVER;
4213	}
4214	}
4215
4216	/*
4217	* Disable ASLR during image activation. This occurs either if the
4218	* _POSIX_SPAWN_DISABLE_ASLR attribute was found above or if
4219	* P_DISABLE_ASLR was inherited from the parent process.
4220	*/
4221	if (p->p_flag & P_DISABLE_ASLR) {
4222	imgp->ip_flags \|= IMGPF_DISABLE_ASLR;
4223	}
4224
4225	/*
4226	* Clear transition flag so we won't hang if exec_activate_image() causes
4227	* an automount (and launchd does a proc sysctl to service it).
4228	*
4229	* <rdar://problem/6848672>, <rdar://problem/5959568>.
4230	*/
4231	proc_transend(p, locked: `0`);
4232	proc_transit_set = `0`;
4233
4234	if (!spawn_no_exec) {
4235	/*
4236	* Clear the signal lock in case of exec, since
4237	* image activation uses psignal on child process.
4238	*/
4239	proc_signalend(p, locked: `0`);
4240	proc_signal_set = `0`;
4241	}
4242
4243	#if MAC_SPAWN /* XXX */
4244	if (uap->mac_p != USER_ADDR_NULL) {
4245	error = mac_execve_enter(uap->mac_p, imgp);
4246	if (error) {
4247	goto bad;
4248	}
4249	}
4250	#endif
4251
4252
4253	/*
4254	* Activate the image.
4255	* Warning: If activation failed after point of no return, it returns error
4256	* as 0 and pretends the call succeeded.
4257	*/
4258	error = exec_activate_image(imgp);
4259	#if defined(HAS_APPLE_PAC)
4260	const uint8_t disable_user_jop = imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE;
4261	ml_task_set_jop_pid_from_shared_region(task: new_task, disable_user_jop);
4262	ml_task_set_disable_user_jop(task: new_task, disable_user_jop);
4263	ml_thread_set_disable_user_jop(thread: imgp->ip_new_thread, disable_user_jop);
4264	ml_thread_set_jop_pid(thread: imgp->ip_new_thread, task: new_task);
4265	#endif
4266
4267
4268	/*
4269	* If you've come here to add support for some new HW feature or some per-process or per-vmmap
4270	* or per-pmap flag that needs to be set before the process runs, or are in general lost, here
4271	* is some help. This summary was accurate as of Jul 2022. Use git log as needed. This comment
4272	* is here to prevent a recurrence of rdar://96307913
4273	*
4274	* In posix_spawn, following is what happens:
4275	* 1. Lots of prep and checking work
4276	* 2. Image activation via exec_activate_image(). The new task will get a new pmap here
4277	* 3. More prep work. (YOU ARE HERE)
4278	* 4. exec_resettextvp() is called
4279	* 5. At this point it is safe to check entitlements and code signatures
4280	* 6. task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_INITIAL_WAIT);
4281	* The new thread is allowed to run in kernel. It cannot yet get to userland
4282	* 7. More things done here. This is your chance to affect the task before it runs in
4283	* userspace
4284	* 8. task_clear_return_wait(get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_FINAL_WAIT);
4285	* The new thread is allowed to run in userland
4286	*/
4287
4288	if (error == `0` && !spawn_no_exec) {
4289	p = proc_exec_switch_task(old_proc: current_proc(), new_proc: p, old_task, new_task, imgp, inherit: &inherit);
4290	/ proc ref returned /
4291	should_release_proc_ref = TRUE;
4292	}
4293
4294	if (error == `0`) {
4295	/ process completed the exec, but may have failed after point of no return /
4296	exec_done = TRUE;
4297	}
4298
4299	#if CONFIG_EXCLAVES
4300	if (!error && task_conclave_id != NULL) {
4301	kern_return_t kr;
4302	kr = task_add_conclave(new_task, imgp->ip_vp, (int64_t)imgp->ip_arch_offset,
4303	task_conclave_id);
4304	if (kr != KERN_SUCCESS) {
4305	error = EINVAL;
4306	goto bad;
4307	}
4308	}
4309	#endif
4310
4311	if (!error && imgp->ip_px_sa != NULL) {
4312	thread_t child_thread = imgp->ip_new_thread;
4313	uthread_t child_uthread = get_bsdthread_info(child_thread);
4314
4315	/*
4316	* Because of POSIX_SPAWN_SETEXEC, we need to handle this after image
4317	* activation, else when image activation fails (before the point of no
4318	* return) would leave the parent process in a modified state.
4319	*/
4320	if (px_sa.psa_flags & POSIX_SPAWN_SETPGROUP) {
4321	struct setpgid_args spga;
4322	spga.pid = proc_getpid(p);
4323	spga.pgid = px_sa.psa_pgroup;
4324	/*
4325	* Effectively, call setpgid() system call; works
4326	* because there are no pointer arguments.
4327	*/
4328	if ((error = setpgid(p, &spga, ival)) != `0`) {
4329	goto bad_px_sa;
4330	}
4331	}
4332
4333	if (px_sa.psa_flags & POSIX_SPAWN_SETSID) {
4334	error = setsid_internal(p);
4335	if (error != `0`) {
4336	goto bad_px_sa;
4337	}
4338	}
4339
4340	/*
4341	* If we have a spawn attr, and it contains signal related flags,
4342	* then we need to process them in the "context" of the new child
4343	* process, so we have to process it following image activation,
4344	* prior to making the thread runnable in user space. This is
4345	* necessitated by some signal information being per-thread rather
4346	* than per-process, and we don't have the new allocation in hand
4347	* until after the image is activated.
4348	*/
4349
4350	/*
4351	* Mask a list of signals, instead of them being unmasked, if
4352	* they were unmasked in the parent; note that some signals
4353	* are not maskable.
4354	*/
4355	if (px_sa.psa_flags & POSIX_SPAWN_SETSIGMASK) {
4356	child_uthread->uu_sigmask = (px_sa.psa_sigmask & ~sigcantmask);
4357	}
4358	/*
4359	* Default a list of signals instead of ignoring them, if
4360	* they were ignored in the parent. Note that we pass
4361	* spawn_no_exec to setsigvec() to indicate that we called
4362	* fork1() and therefore do not need to call proc_signalstart()
4363	* internally.
4364	*/
4365	if (px_sa.psa_flags & POSIX_SPAWN_SETSIGDEF) {
4366	vec.sa_handler = SIG_DFL;
4367	vec.sa_tramp = `0`;
4368	vec.sa_mask = `0`;
4369	vec.sa_flags = `0`;
4370	for (sig = `1`; sig < NSIG; sig++) {
4371	if (px_sa.psa_sigdefault & (`1` << (sig - `1`))) {
4372	error = setsigvec(p, child_thread, signum: sig, &vec, in_sigstart: spawn_no_exec);
4373	}
4374	}
4375	}
4376
4377	/*
4378	* Activate the CPU usage monitor, if requested. This is done via a task-wide, per-thread CPU
4379	* usage limit, which will generate a resource exceeded exception if any one thread exceeds the
4380	* limit.
4381	*
4382	* Userland gives us interval in seconds, and the kernel SPI expects nanoseconds.
4383	*/
4384	if ((px_sa.psa_cpumonitor_percent != `0`) && (px_sa.psa_cpumonitor_percent < UINT8_MAX)) {
4385	/*
4386	* Always treat a CPU monitor activation coming from spawn as entitled. Requiring
4387	* an entitlement to configure the monitor a certain way seems silly, since
4388	* whoever is turning it on could just as easily choose not to do so.
4389	*/
4390	error = proc_set_task_ruse_cpu(task: proc_task(p),
4391	TASK_POLICY_RESOURCE_ATTRIBUTE_NOTIFY_EXC,
4392	percentage: (uint8_t)px_sa.psa_cpumonitor_percent,
4393	interval: px_sa.psa_cpumonitor_interval * NSEC_PER_SEC,
4394	deadline: `0`, TRUE);
4395	}
4396
4397
4398	if (px_pcred_info &&
4399	(px_pcred_info->pspci_flags & POSIX_SPAWN_POSIX_CRED_LOGIN)) {
4400	/*
4401	* setlogin() must happen after setsid()
4402	*/
4403	setlogin_internal(p, login: px_pcred_info->pspci_login);
4404	}
4405
4406	bad_px_sa:
4407	if (error != `0`) {
4408	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
4409	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_PSATTR, `0`, `0`);
4410	exec_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_BAD_PSATTR);
4411	}
4412	}
4413
4414	bad:
4415
4416	if (error == `0`) {
4417	/ reset delay idle sleep status if set /
4418	#if CONFIG_DELAY_IDLE_SLEEP
4419	if ((p->p_flag & P_DELAYIDLESLEEP) == P_DELAYIDLESLEEP) {
4420	OSBitAndAtomic(~((uint32_t)P_DELAYIDLESLEEP), &p->p_flag);
4421	}
4422	#endif /* CONFIG_DELAY_IDLE_SLEEP */
4423	/ upon successful spawn, re/set the proc control state /
4424	if (imgp->ip_px_sa != NULL) {
4425	switch (px_sa.psa_pcontrol) {
4426	case POSIX_SPAWN_PCONTROL_THROTTLE:
4427	p->p_pcaction = P_PCTHROTTLE;
4428	break;
4429	case POSIX_SPAWN_PCONTROL_SUSPEND:
4430	p->p_pcaction = P_PCSUSP;
4431	break;
4432	case POSIX_SPAWN_PCONTROL_KILL:
4433	p->p_pcaction = P_PCKILL;
4434	break;
4435	case POSIX_SPAWN_PCONTROL_NONE:
4436	default:
4437	p->p_pcaction = `0`;
4438	break;
4439	}
4440	;
4441	}
4442	exec_resettextvp(p, imgp);
4443
4444	/*
4445	* Enable new task IPC access if exec_activate_image() returned an
4446	* active task. (Checks active bit in ipc_task_enable() under lock).
4447	* Must enable after resettextvp so that task port policies are not evaluated
4448	* until the csblob in the textvp is accurately reflected.
4449	*/
4450	ipc_task_enable(task: new_task);
4451
4452	/ Set task exception ports now that we can check entitlements /
4453	if (imgp->ip_px_spa != NULL) {
4454	error = exec_handle_exception_port_actions(imgp, actions: &port_actions);
4455	}
4456
4457	#if CONFIG_MEMORYSTATUS
4458	/ Set jetsam priority for DriverKit processes /
4459	if (px_sa.psa_apptype == POSIX_SPAWN_PROC_TYPE_DRIVER) {
4460	px_sa.psa_priority = JETSAM_PRIORITY_DRIVER_APPLE;
4461	}
4462
4463	/ Has jetsam attributes? /
4464	if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_SET)) {
4465	/*
4466	* With 2-level high-water-mark support, POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND is no
4467	* longer relevant, as background limits are described via the inactive limit slots.
4468	*
4469	* That said, however, if the POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND is passed in,
4470	* we attempt to mimic previous behavior by forcing the BG limit data into the
4471	* inactive/non-fatal mode and force the active slots to hold system_wide/fatal mode.
4472	*/
4473
4474	if (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_HIWATER_BACKGROUND) {
4475	memorystatus_update(p, priority: px_sa.psa_priority, user_data: `0`, FALSE, / assertion priority /
4476	effective: (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY),
4477	TRUE,
4478	memlimit_active: -`1`, TRUE,
4479	memlimit_inactive: px_sa.psa_memlimit_inactive, FALSE);
4480	} else {
4481	memorystatus_update(p, priority: px_sa.psa_priority, user_data: `0`, FALSE, / assertion priority /
4482	effective: (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_USE_EFFECTIVE_PRIORITY),
4483	TRUE,
4484	memlimit_active: px_sa.psa_memlimit_active,
4485	memlimit_active_is_fatal: (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_MEMLIMIT_ACTIVE_FATAL),
4486	memlimit_inactive: px_sa.psa_memlimit_inactive,
4487	memlimit_inactive_is_fatal: (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_MEMLIMIT_INACTIVE_FATAL));
4488	}
4489	}
4490
4491	/ Has jetsam relaunch behavior? /
4492	if (imgp->ip_px_sa != NULL && (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MASK)) {
4493	/*
4494	* Launchd has passed in data indicating the behavior of this process in response to jetsam.
4495	* This data would be used by the jetsam subsystem to determine the position and protection
4496	* offered to this process on dirty -> clean transitions.
4497	*/
4498	int relaunch_flags = P_MEMSTAT_RELAUNCH_UNKNOWN;
4499	switch (px_sa.psa_jetsam_flags & POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MASK) {
4500	case POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_LOW:
4501	relaunch_flags = P_MEMSTAT_RELAUNCH_LOW;
4502	break;
4503	case POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_MED:
4504	relaunch_flags = P_MEMSTAT_RELAUNCH_MED;
4505	break;
4506	case POSIX_SPAWN_JETSAM_RELAUNCH_BEHAVIOR_HIGH:
4507	relaunch_flags = P_MEMSTAT_RELAUNCH_HIGH;
4508	break;
4509	default:
4510	break;
4511	}
4512	memorystatus_relaunch_flags_update(p, relaunch_flags);
4513	}
4514
4515	#endif /* CONFIG_MEMORYSTATUS */
4516	if (imgp->ip_px_sa != NULL && px_sa.psa_thread_limit > `0`) {
4517	task_set_thread_limit(task: new_task, thread_limit: (uint16_t)px_sa.psa_thread_limit);
4518	}
4519
4520	#if CONFIG_PROC_RESOURCE_LIMITS
4521	if (imgp->ip_px_sa != NULL && (px_sa.psa_port_soft_limit > `0` \|\| px_sa.psa_port_hard_limit > `0`)) {
4522	task_set_port_space_limits(new_task, (uint32_t)px_sa.psa_port_soft_limit,
4523	(uint32_t)px_sa.psa_port_hard_limit);
4524	}
4525
4526	if (imgp->ip_px_sa != NULL && (px_sa.psa_filedesc_soft_limit > `0` \|\| px_sa.psa_filedesc_hard_limit > `0`)) {
4527	proc_set_filedesc_limits(p, (int)px_sa.psa_filedesc_soft_limit,
4528	(int)px_sa.psa_filedesc_hard_limit);
4529	}
4530	if (imgp->ip_px_sa != NULL && (px_sa.psa_kqworkloop_soft_limit > `0` \|\| px_sa.psa_kqworkloop_hard_limit > `0`)) {
4531	proc_set_kqworkloop_limits(p, (int)px_sa.psa_kqworkloop_soft_limit,
4532	(int)px_sa.psa_kqworkloop_hard_limit);
4533	}
4534	#endif /* CONFIG_PROC_RESOURCE_LIMITS */
4535
4536	/ Disable wakeup monitoring for DriverKit processes /
4537	if (px_sa.psa_apptype == POSIX_SPAWN_PROC_TYPE_DRIVER) {
4538	uint32_t flags = WAKEMON_DISABLE;
4539	task_wakeups_monitor_ctl(task: new_task, rate_hz: &flags, NULL);
4540	}
4541	}
4542
4543
4544	/*
4545	* If we successfully called fork1() or cloneproc, we always need
4546	* to do this. This is because we come back from that call with
4547	* signals blocked in the child, and we have to unblock them, for exec
4548	* case they are unblocked before activation, but for true spawn case
4549	* we want to wait until after we've performed any spawn actions.
4550	* This has to happen before process_signature(), which uses psignal.
4551	*/
4552	if (proc_transit_set) {
4553	proc_transend(p, locked: `0`);
4554	}
4555
4556	/*
4557	* Drop the signal lock on the child which was taken on our
4558	* behalf by forkproc()/cloneproc() to prevent signals being
4559	* received by the child in a partially constructed state.
4560	*/
4561	if (proc_signal_set) {
4562	proc_signalend(p, locked: `0`);
4563	}
4564
4565	if (error == `0`) {
4566	/*
4567	* We need to initialize the bank context behind the protection of
4568	* the proc_trans lock to prevent a race with exit. We can't do this during
4569	* exec_activate_image because task_bank_init checks entitlements that
4570	* aren't loaded until subsequent calls (including exec_resettextvp).
4571	*/
4572	error = proc_transstart(p, locked: `0`, non_blocking: `0`);
4573
4574	if (error == `0`) {
4575	task_bank_init(task: new_task);
4576	proc_transend(p, locked: `0`);
4577	}
4578
4579	#if __arm64__
4580	proc_footprint_entitlement_hacks(p, task: new_task);
4581	#endif /* __arm64__ */
4582
4583	#if XNU_TARGET_OS_OSX
4584	#define SINGLE_JIT_ENTITLEMENT "com.apple.security.cs.single-jit"
4585	if (IOTaskHasEntitlement(task: new_task, SINGLE_JIT_ENTITLEMENT)) {
4586	vm_map_single_jit(map: get_task_map(new_task));
4587	}
4588	#endif /* XNU_TARGET_OS_OSX */
4589
4590	#if __has_feature(ptrauth_calls)
4591	task_set_pac_exception_fatal_flag(new_task);
4592	#endif /* __has_feature(ptrauth_calls) */
4593	task_set_jit_exception_fatal_flag(task: new_task);
4594	}
4595
4596	/ Inherit task role from old task to new task for exec /
4597	if (error == `0` && !spawn_no_exec) {
4598	proc_inherit_task_role(new_task, old_task);
4599	}
4600
4601	#if CONFIG_ARCADE
4602	if (error == `0`) {
4603	/*
4604	* Check to see if we need to trigger an arcade upcall AST now
4605	* that the vnode has been reset on the task.
4606	*/
4607	arcade_prepare(task: new_task, thread: imgp->ip_new_thread);
4608	}
4609	#endif /* CONFIG_ARCADE */
4610
4611	if (error == `0`) {
4612	proc_apply_jit_and_vm_policies(imgp, p, task: new_task);
4613	}
4614
4615	/ Clear the initial wait on the thread before handling spawn policy /
4616	if (imgp && imgp->ip_new_thread) {
4617	task_clear_return_wait(task: get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_INITIAL_WAIT);
4618	}
4619
4620	/*
4621	* Apply the spawnattr policy, apptype (which primes the task for importance donation),
4622	* and bind any portwatch ports to the new task.
4623	* This must be done after the exec so that the child's thread is ready,
4624	* and after the in transit state has been released, because priority is
4625	* dropped here so we need to be prepared for a potentially long preemption interval
4626	*
4627	* TODO: Consider splitting this up into separate phases
4628	*/
4629	if (error == `0` && imgp->ip_px_sa != NULL) {
4630	struct _posix_spawnattr psa = (struct* _posix_spawnattr *) imgp->ip_px_sa;
4631
4632	error = exec_handle_spawnattr_policy(p, thread: imgp->ip_new_thread, psa_apptype: psa->psa_apptype, psa_qos_clamp: psa->psa_qos_clamp,
4633	psa_darwin_role: psa->psa_darwin_role, port_actions: &port_actions);
4634	}
4635
4636	/ Transfer the turnstile watchport boost to new task if in exec /
4637	if (error == `0` && !spawn_no_exec) {
4638	task_transfer_turnstile_watchports(old_task, new_task, new_thread: imgp->ip_new_thread);
4639	}
4640
4641	if (error == `0` && imgp->ip_px_sa != NULL) {
4642	struct _posix_spawnattr psa = (struct* _posix_spawnattr *) imgp->ip_px_sa;
4643
4644	if (psa->psa_no_smt) {
4645	task_set_no_smt(task: new_task);
4646	}
4647	if (psa->psa_tecs) {
4648	task_set_tecs(task: new_task);
4649	}
4650	}
4651
4652	if (error == `0` && imgp->ip_px_sa != NULL) {
4653	struct _posix_spawnattr psa = (struct* _posix_spawnattr *) imgp->ip_px_sa;
4654
4655	if (psa->psa_options & PSA_OPTION_DATALESS_IOPOLICY) {
4656	struct _iopol_param_t iop_param = {
4657	.iop_scope = IOPOL_SCOPE_PROCESS,
4658	.iop_iotype = IOPOL_TYPE_VFS_MATERIALIZE_DATALESS_FILES,
4659	.iop_policy = psa->psa_dataless_iopolicy,
4660	};
4661	error = iopolicysys_vfs_materialize_dataless_files(p, IOPOL_CMD_SET, scope: iop_param.iop_scope,
4662	policy: iop_param.iop_policy, iop_param: &iop_param);
4663	}
4664	}
4665
4666	if (error == `0`) {
4667	/ Apply the main thread qos /
4668	thread_t main_thread = imgp->ip_new_thread;
4669	task_set_main_thread_qos(task: new_task, main_thread);
4670	}
4671
4672	/*
4673	* Release any ports we kept around for binding to the new task
4674	* We need to release the rights even if the posix_spawn has failed.
4675	*/
4676	if (imgp->ip_px_spa != NULL) {
4677	exec_port_actions_destroy(port_actions: &port_actions);
4678	}
4679
4680	/*
4681	* We have to delay operations which might throw a signal until after
4682	* the signals have been unblocked; however, we want that to happen
4683	* after exec_resettextvp() so that the textvp is correct when they
4684	* fire.
4685	*/
4686	if (error == `0`) {
4687	error = process_signature(p, imgp);
4688
4689	/*
4690	* Pay for our earlier safety; deliver the delayed signals from
4691	* the incomplete spawn process now that it's complete.
4692	*/
4693	if (imgp != NULL && spawn_no_exec && (p->p_lflag & P_LTRACED)) {
4694	psignal_vfork(p, new_task: proc_task(p), thread: imgp->ip_new_thread, SIGTRAP);
4695	}
4696
4697	if (error == `0` && !spawn_no_exec) {
4698	KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXEC),
4699	proc_getpid(p));
4700	}
4701	}
4702
4703	if (spawn_no_exec) {
4704	/ flag the 'fork' has occurred /
4705	proc_knote(p: p->p_pptr, NOTE_FORK \| proc_getpid(p));
4706	}
4707
4708	/ flag exec has occurred, notify only if it has not failed due to FP Key error /
4709	if (!error && ((p->p_lflag & P_LTERM_DECRYPTFAIL) == `0`)) {
4710	proc_knote(p, NOTE_EXEC);
4711	}
4712
4713	if (imgp != NULL) {
4714	uthread_set_exec_data(uth: current_uthread(), NULL);
4715	if (imgp->ip_vp) {
4716	vnode_put(vp: imgp->ip_vp);
4717	}
4718	if (imgp->ip_scriptvp) {
4719	vnode_put(vp: imgp->ip_scriptvp);
4720	}
4721	if (imgp->ip_strings) {
4722	execargs_free(imgp);
4723	}
4724	if (imgp->ip_free_map) {
4725	/ Free the map after dropping iocount on vnode to avoid deadlock /
4726	vm_map_deallocate(map: imgp->ip_free_map);
4727	}
4728	kfree_data(imgp->ip_px_sfa,
4729	px_args.file_actions_size);
4730	kfree_data(imgp->ip_px_spa,
4731	px_args.port_actions_size);
4732	#if CONFIG_PERSONAS
4733	kfree_data(imgp->ip_px_persona,
4734	px_args.persona_info_size);
4735	#endif
4736	kfree_data(imgp->ip_px_pcred_info,
4737	px_args.posix_cred_info_size);
4738
4739	if (subsystem_root_path != NULL) {
4740	zfree(ZV_NAMEI, subsystem_root_path);
4741	}
4742	#if CONFIG_MACF
4743	struct ip_px_smpx_s *px_s = &imgp->ip_px_smpx;
4744	kfree_data(px_s->array, px_args.mac_extensions_size);
4745	kfree_data(px_s->data, (vm_size_t)px_s->datalen);
4746
4747	if (imgp->ip_execlabelp) {
4748	mac_cred_label_free(label: imgp->ip_execlabelp);
4749	imgp->ip_execlabelp = NULL;
4750	}
4751	if (imgp->ip_scriptlabelp) {
4752	mac_vnode_label_free(label: imgp->ip_scriptlabelp);
4753	imgp->ip_scriptlabelp = NULL;
4754	}
4755	if (imgp->ip_cs_error != OS_REASON_NULL) {
4756	os_reason_free(cur_reason: imgp->ip_cs_error);
4757	imgp->ip_cs_error = OS_REASON_NULL;
4758	}
4759	if (imgp->ip_inherited_shared_region_id != NULL) {
4760	kfree_data(imgp->ip_inherited_shared_region_id,
4761	strlen(imgp->ip_inherited_shared_region_id) + `1`);
4762	imgp->ip_inherited_shared_region_id = NULL;
4763	}
4764	#endif
4765	}
4766
4767	#if CONFIG_DTRACE
4768	if (spawn_no_exec) {
4769	/*
4770	* In the original DTrace reference implementation,
4771	* posix_spawn() was a libc routine that just
4772	* did vfork(2) then exec(2). Thus the proc::: probes
4773	* are very fork/exec oriented. The details of this
4774	* in-kernel implementation of posix_spawn() is different
4775	* (while producing the same process-observable effects)
4776	* particularly w.r.t. errors, and which thread/process
4777	* is constructing what on behalf of whom.
4778	*/
4779	if (error) {
4780	DTRACE_PROC1(spawn__failure, int, error);
4781	} else {
4782	DTRACE_PROC(spawn__success);
4783	/*
4784	* Some DTrace scripts, e.g. newproc.d in
4785	* /usr/bin, rely on the 'exec-success'
4786	* probe being fired in the child after the
4787	* new process image has been constructed
4788	* in order to determine the associated pid.
4789	*
4790	* So, even though the parent built the image
4791	* here, for compatibility, mark the new thread
4792	* so 'exec-success' fires on it as it leaves
4793	* the kernel.
4794	*/
4795	dtrace_thread_didexec(imgp->ip_new_thread);
4796	}
4797	} else {
4798	if (error) {
4799	DTRACE_PROC1(exec__failure, int, error);
4800	} else {
4801	dtrace_thread_didexec(imgp->ip_new_thread);
4802	}
4803	}
4804
4805	if ((dtrace_proc_waitfor_hook = dtrace_proc_waitfor_exec_ptr) != NULL) {
4806	(*dtrace_proc_waitfor_hook)(p);
4807	}
4808	#endif
4809
4810	#if CONFIG_AUDIT
4811	if (!error && AUDIT_ENABLED() && p) {
4812	/ Add the CDHash of the new process to the audit record /
4813	uint8_t *cdhash = cs_get_cdhash(p);
4814	if (cdhash) {
4815	AUDIT_ARG(data, cdhash, sizeof(uint8_t), CS_CDHASH_LEN);
4816	}
4817	}
4818	#endif
4819
4820	/ terminate the new task if exec failed /
4821	if (new_task != NULL && task_is_exec_copy(new_task)) {
4822	task_terminate_internal(task: new_task);
4823	}
4824
4825	if (exec_failure_reason && !spawn_no_exec) {
4826	psignal_with_reason(p, SIGKILL, signal_reason: exec_failure_reason);
4827	exec_failure_reason = NULL;
4828	}
4829
4830	/ Return to both the parent and the child? /
4831	if (imgp != NULL && spawn_no_exec) {
4832	/*
4833	* If the parent wants the pid, copy it out
4834	*/
4835	if (error == `0` && pid != USER_ADDR_NULL) {
4836	_Static_assert(sizeof(pid_t) == `4`, "posix_spawn() assumes a 32-bit pid_t");
4837	bool aligned = (pid & `3`) == `0`;
4838	if (aligned) {
4839	(void)copyout_atomic32(u32: proc_getpid(p), user_addr: pid);
4840	} else {
4841	(void)suword(addr: pid, word: proc_getpid(p));
4842	}
4843	}
4844	retval[`0`] = error;
4845
4846	/*
4847	* If we had an error, perform an internal reap ; this is
4848	* entirely safe, as we have a real process backing us.
4849	*/
4850	if (error) {
4851	proc_list_lock();
4852	p->p_listflag \|= P_LIST_DEADPARENT;
4853	proc_list_unlock();
4854	proc_lock(p);
4855	/ make sure no one else has killed it off... /
4856	if (p->p_stat != SZOMB && p->exit_thread == NULL) {
4857	p->exit_thread = current_thread();
4858	p->p_posix_spawn_failed = true;
4859	proc_unlock(p);
4860	exit1(p, `1`, (int *)NULL);
4861	} else {
4862	/ someone is doing it for us; just skip it /
4863	proc_unlock(p);
4864	}
4865	}
4866	}
4867
4868	/*
4869	* Do not terminate the current task, if proc_exec_switch_task did not
4870	* switch the tasks, terminating the current task without the switch would
4871	* result in loosing the SIGKILL status.
4872	*/
4873	if (task_did_exec(task: old_task)) {
4874	/ Terminate the current task, since exec will start in new task /
4875	task_terminate_internal(task: old_task);
4876	}
4877
4878	/ Release the thread ref returned by cloneproc/fork1 /
4879	if (imgp != NULL && imgp->ip_new_thread) {
4880	/ clear the exec complete flag if there is an error before point of no-return /
4881	uint32_t clearwait_flags = TCRW_CLEAR_FINAL_WAIT;
4882	if (!spawn_no_exec && !exec_done && error != `0`) {
4883	clearwait_flags \|= TCRW_CLEAR_EXEC_COMPLETE;
4884	}
4885	/ wake up the new thread /
4886	task_clear_return_wait(task: get_threadtask(imgp->ip_new_thread), flags: clearwait_flags);
4887	thread_deallocate(thread: imgp->ip_new_thread);
4888	imgp->ip_new_thread = NULL;
4889	}
4890
4891	/ Release the ref returned by cloneproc/fork1 /
4892	if (new_task) {
4893	task_deallocate(new_task);
4894	new_task = NULL;
4895	}
4896
4897	if (should_release_proc_ref) {
4898	proc_rele(p);
4899	}
4900
4901	kfree_type(typeof(*__spawn_data), __spawn_data);
4902
4903	if (inherit != NULL) {
4904	ipc_importance_release(elem: inherit);
4905	}
4906
4907	#if CONFIG_EXCLAVES
4908	if (task_conclave_id != NULL) {
4909	kfree_data(task_conclave_id, MAXCONCLAVENAME);
4910	}
4911	#endif
4912
4913	assert(spawn_no_exec \|\| exec_failure_reason == NULL);
4914	return error;
4915	}
4916
4917	/*
4918	* proc_exec_switch_task
4919	*
4920	* Parameters: old_proc proc before exec
4921	* new_proc proc after exec
4922	* old_task task before exec
4923	* new_task task after exec
4924	* imgp image params
4925	* inherit resulting importance linkage
4926	*
4927	* Returns: proc.
4928	*
4929	* Note: The function will switch proc in pid hash from old proc to new proc.
4930	* The switch needs to happen after draining all proc refs and inside
4931	* a proc list lock. In the case of failure to switch the proc, which
4932	* might happen if the process received a SIGKILL or jetsam killed it,
4933	* it will make sure that the new task terminates. User proc ref returned
4934	* to caller.
4935	*
4936	* This function is called after the point of no return; in the case
4937	* of failure to switch, it will terminate the new task, swallow the
4938	* error, and let the terminated process complete exec and die.
4939	*/
4940	proc_t
4941	proc_exec_switch_task(proc_t old_proc, proc_t new_proc, task_t old_task, task_t new_task, struct image_params imgp, void* **inherit)
4942	{
	/*
	 * Overview of the body below:
	 *  - The calling thread gets an exec promotion and
	 *    proc_refdrain_will_exec() is called on both procs.
	 *  - The switch is attempted only when old_proc is not exiting
	 *    (P_LEXIT clear), new_task is active and the calling thread is
	 *    active; otherwise new_task is terminated and nothing is
	 *    transferred.
	 *  - *inherit is written only on the switch path; it is left
	 *    untouched when the switch is skipped.
	 *  - reparent_traced_child is set when the two procs have different
	 *    p_pptr; new_proc then carries P_LTRACE_WAIT until it has been
	 *    reparented at the end of the function.
	 *  - With CONFIG_EXCLAVES, a task_inherit_conclave() failure
	 *    terminates new_task after the switch has already been made.
	 *  - new_proc is returned on both paths.
	 */
4943	boolean_t task_active;
4944	boolean_t proc_active;
4945	boolean_t thread_active;
4946	boolean_t reparent_traced_child = FALSE;
4947	thread_t old_thread = current_thread();
4948	thread_t new_thread = imgp->ip_new_thread;
4949
4950	thread_set_exec_promotion(thread: old_thread);
4951	old_proc = proc_refdrain_will_exec(p: old_proc);
4952
4953	new_proc = proc_refdrain_will_exec(p: new_proc);
4954	/ extra proc ref returned to the caller /
4955
4956	assert(get_threadtask(new_thread) == new_task);
4957	task_active = task_is_active(new_task);
4958	proc_active = !(old_proc->p_lflag & P_LEXIT);
4959
4960	/ Check if the current thread is not aborted due to SIGKILL /
4961	thread_active = thread_is_active(thread: old_thread);
4962
4963	/*
4964	* Do not switch the proc if the new task or proc is already terminated
4965	* as a result of error in exec past point of no return
4966	*/
4967	if (proc_active && task_active && thread_active) {
4968	uthread_t new_uthread = get_bsdthread_info(new_thread);
4969	uthread_t old_uthread = current_uthread();
4970
4971	/ Clear dispatchqueue and workloop ast offset /
4972	new_proc->p_dispatchqueue_offset = `0`;
4973	new_proc->p_dispatchqueue_serialno_offset = `0`;
4974	new_proc->p_dispatchqueue_label_offset = `0`;
4975	new_proc->p_return_to_kernel_offset = `0`;
4976	new_proc->p_pthread_wq_quantum_offset = `0`;
4977
4978	/ If old_proc is session leader, change the leader to new proc /
4979	session_replace_leader(old_proc, new_proc);
4980
	/*
	 * old_proc stays locked from here until the proc_unlock(old_proc)
	 * that follows the ptrace flag transfer further down.
	 */
4981	proc_lock(old_proc);
4982
4983	/ Copy the signal state, dtrace state and set bsd ast on new thread /
4984	act_set_astbsd(new_thread);
4985	new_uthread->uu_siglist \|= old_uthread->uu_siglist;
4986	new_uthread->uu_siglist \|= old_proc->p_siglist;
4987	new_uthread->uu_sigwait = old_uthread->uu_sigwait;
4988	new_uthread->uu_sigmask = old_uthread->uu_sigmask;
4989	new_uthread->uu_oldmask = old_uthread->uu_oldmask;
4990	new_uthread->uu_exit_reason = old_uthread->uu_exit_reason;
4991	#if CONFIG_DTRACE
4992	new_uthread->t_dtrace_sig = old_uthread->t_dtrace_sig;
4993	new_uthread->t_dtrace_stop = old_uthread->t_dtrace_stop;
4994	new_uthread->t_dtrace_resumepid = old_uthread->t_dtrace_resumepid;
4995	assert(new_uthread->t_dtrace_scratch == NULL);
4996	new_uthread->t_dtrace_scratch = old_uthread->t_dtrace_scratch;
4997
	/* Zero the old uthread copies now that the new uthread holds them. */
4998	old_uthread->t_dtrace_sig = `0`;
4999	old_uthread->t_dtrace_stop = `0`;
5000	old_uthread->t_dtrace_resumepid = `0`;
5001	old_uthread->t_dtrace_scratch = NULL;
5002	#endif
5003
5004	#if CONFIG_PROC_UDATA_STORAGE
5005	new_proc->p_user_data = old_proc->p_user_data;
5006	#endif /* CONFIG_PROC_UDATA_STORAGE */
5007
5008	/ Copy the resource accounting info /
5009	thread_copy_resource_info(dst_thread: new_thread, src_thread: current_thread());
5010
5011	/ Clear the exit reason and signal state on old thread /
5012	old_uthread->uu_exit_reason = NULL;
5013	old_uthread->uu_siglist = `0`;
5014
5015	task_set_did_exec_flag(task: old_task);
5016	task_clear_exec_copy_flag(task: new_task);
5017
5018	task_copy_fields_for_exec(dst_task: new_task, src_task: old_task);
5019
5020	/*
5021	* Need to transfer pending watch port boosts to the new task
5022	* while still making sure that the old task remains in the
5023	* importance linkage. Create an importance linkage from old task
5024	* to new task, then switch the task importance base of old task
5025	* and new task. After the switch the port watch boost will be
5026	* boosting the new task and new task will be donating importance
5027	* to old task.
5028	*/
5029	*inherit = ipc_importance_exec_switch_task(old_task, new_task);
5030
5031	/ Transfer parent's ptrace state to child /
5032	new_proc->p_lflag &= ~(P_LTRACED \| P_LSIGEXC \| P_LNOATTACH);
5033	new_proc->p_lflag \|= (old_proc->p_lflag & (P_LTRACED \| P_LSIGEXC \| P_LNOATTACH));
5034	new_proc->p_oppid = old_proc->p_oppid;
5035
5036	if (old_proc->p_pptr != new_proc->p_pptr) {
5037	reparent_traced_child = TRUE;
5038	new_proc->p_lflag \|= P_LTRACE_WAIT;
5039	}
5040
5041	proc_unlock(old_proc);
5042
5043	/ Update the list of proc knotes /
5044	proc_transfer_knotes(old_proc, new_proc);
5045
5046	/ Update the proc interval timers /
5047	proc_inherit_itimers(old_proc, new_proc);
5048
5049	proc_list_lock();
5050
5051	/ Insert the new proc in child list of parent proc /
5052	p_reparentallchildren(old_proc, new_proc);
5053
5054	/ Switch proc in pid hash /
5055	phash_replace_locked(old_proc, new_proc);
5056
5057	/ Transfer the shadow flag to old proc /
5058	os_atomic_andnot(&new_proc->p_refcount, P_REF_SHADOW, relaxed);
5059	os_atomic_or(&old_proc->p_refcount, P_REF_SHADOW, relaxed);
5060
5061	/ Change init proc if launchd exec /
5062	if (old_proc == initproc) {
5063	/ Take the ref on new proc after proc_refwake_did_exec /
5064	initproc = new_proc;
5065	/ Drop the proc ref on old proc /
5066	proc_rele(p: old_proc);
5067	}
5068
5069	proc_list_unlock();
5070	#if CONFIG_EXCLAVES
5071	if (task_inherit_conclave(old_task, new_task, imgp->ip_vp,
5072	(int64_t)imgp->ip_arch_offset) != KERN_SUCCESS) {
5073	task_terminate_internal(new_task);
5074	}
5075	#endif
5076	} else {
5077	task_terminate_internal(task: new_task);
5078	}
5079
5080	proc_refwake_did_exec(p: new_proc);
5081	proc_refwake_did_exec(p: old_proc);
5082
5083	/ Take a ref on initproc if it changed /
5084	if (new_proc == initproc) {
5085	initproc = proc_ref(p: new_proc, false);
5086	assert(initproc != PROC_NULL);
5087	}
5088
5089	thread_clear_exec_promotion(thread: old_thread);
5090	proc_rele(p: old_proc);
5091
5092	if (reparent_traced_child) {
5093	proc_t pp = proc_parent(old_proc);
5094	assert(pp != PROC_NULL);
5095
5096	proc_reparentlocked(child: new_proc, newparent: pp, cansignal: `1`, locked: `0`);
5097	proc_rele(p: pp);
5098
5099	proc_lock(new_proc);
5100	new_proc->p_lflag &= ~P_LTRACE_WAIT;
5101	proc_unlock(new_proc);
5102	}
5103
5104	return new_proc;
5105	}
5106
5107	/*
5108	* execve
5109	*
5110	* Parameters: uap->fname File name to exec
5111	* uap->argp Argument list
5112	* uap->envp Environment list
5113	*
5114	* Returns: 0 Success
5115	* __mac_execve:EINVAL Invalid argument
5116	* __mac_execve:ENOTSUP Not supported
5117	* __mac_execve:EACCES Permission denied
5118	* __mac_execve:EINTR Interrupted function
5119	* __mac_execve:ENOMEM Not enough space
5120	* __mac_execve:EFAULT Bad address
5121	* __mac_execve:ENAMETOOLONG Filename too long
5122	* __mac_execve:ENOEXEC Executable file format error
5123	* __mac_execve:ETXTBSY Text file busy [misuse of error code]
5124	* __mac_execve:???
5125	*
5126	* TODO: Dynamic linker header address on stack is copied via suword()
5127	*/
5128	/ ARGSUSED /
5129	int
5130	execve(proc_t p, struct execve_args uap, int32_t retval)
5131	{
	/*
	 * Thin wrapper: copies fname, argp and envp from uap into a
	 * __mac_execve_args, sets mac_p to USER_ADDR_NULL, and returns
	 * whatever __mac_execve() returns.
	 */
5132	struct __mac_execve_args muap;
5133	int err;
5134
5135	memoryshot(VM_EXECVE, DBG_FUNC_NONE);
5136
5137	muap.fname = uap->fname;
5138	muap.argp = uap->argp;
5139	muap.envp = uap->envp;
5140	muap.mac_p = USER_ADDR_NULL;
5141	err = __mac_execve(p, &muap, retval);
5142
5143	return err;
5144	}
5145
5146	/*
5147	* __mac_execve
5148	*
5149	* Parameters: uap->fname File name to exec
5150	* uap->argp Argument list
5151	* uap->envp Environment list
5152	* uap->mac_p MAC label supplied by caller
5153	*
5154	* Returns: 0 Success
5155	* EINVAL Invalid argument
5156	* ENOTSUP Not supported
5157	* ENOEXEC Executable file format error
5158	* exec_activate_image:EINVAL Invalid argument
5159	* exec_activate_image:EACCES Permission denied
5160	* exec_activate_image:EINTR Interrupted function
5161	* exec_activate_image:ENOMEM Not enough space
5162	* exec_activate_image:EFAULT Bad address
5163	* exec_activate_image:ENAMETOOLONG Filename too long
5164	* exec_activate_image:ENOEXEC Executable file format error
5165	* exec_activate_image:ETXTBSY Text file busy [misuse of error code]
5166	* exec_activate_image:EBADEXEC The executable is corrupt/unknown
5167	* exec_activate_image:???
5168	* mac_execve_enter:???
5169	*
5170	* TODO: Dynamic linker header address on stack is copied via suword()
5171	*/
5172	int
5173	__mac_execve(proc_t p, struct __mac_execve_args uap, int32_t retval __unused)
5174	{
	/*
	 * Flow of this function as written below:
	 *  1. Allocate one zeroed __execve_data holding the image_params and
	 *     both vnode_attr structures, and fill in the image_params.
	 *  2. With CONFIG_MACF, call mac_execve_enter() when uap->mac_p is
	 *     not USER_ADDR_NULL.
	 *  3. cloneproc(..., CLONEPROC_EXEC) a new thread and re-point p at
	 *     the proc obtained from that thread.
	 *  4. exec_activate_image(); on success proc_exec_switch_task(),
	 *     exec_resettextvp(), ipc_task_enable() and process_signature().
	 *  5. Release vnodes, map, strings, labels and the cs error reason.
	 *  6. On success, task_bank_init() between proc_transstart() and
	 *     proc_transend(), followed by the remaining per-task setup.
	 *  7. exit_with_error: common cleanup for every path, then the
	 *     single return of error.
	 */
5175	struct image_params *imgp = NULL;
5176	struct vnode_attr *vap = NULL;
5177	struct vnode_attr *origvap = NULL;
5178	int error;
5179	int is_64 = IS_64BIT_PROCESS(p);
5180	struct vfs_context context;
5181	struct uthread *uthread = NULL;
5182	task_t old_task = current_task();
5183	task_t new_task = NULL;
5184	boolean_t should_release_proc_ref = FALSE;
5185	boolean_t exec_done = FALSE;
5186	void *inherit = NULL;
5187	struct {
5188	struct image_params imgp;
5189	struct vnode_attr va;
5190	struct vnode_attr origva;
5191	} *__execve_data;
5192
5193	/ Allocate a big chunk for locals instead of using stack since these*
5194	* structures a pretty big.
5195	*/
5196	__execve_data = kalloc_type(typeof(*__execve_data), Z_WAITOK \| Z_ZERO);
5197	if (__execve_data == NULL) {
	/*
	 * imgp is still NULL here, so the cleanup at exit_with_error skips
	 * its imgp-dependent section.
	 */
5198	error = ENOMEM;
5199	goto exit_with_error;
5200	}
5201	imgp = &__execve_data->imgp;
5202	vap = &__execve_data->va;
5203	origvap = &__execve_data->origva;
5204
5205	/ Initialize the common data in the image_params structure /
5206	imgp->ip_user_fname = uap->fname;
5207	imgp->ip_user_argv = uap->argp;
5208	imgp->ip_user_envv = uap->envp;
5209	imgp->ip_vattr = vap;
5210	imgp->ip_origvattr = origvap;
5211	imgp->ip_vfs_context = &context;
5212	imgp->ip_flags = (is_64 ? IMGPF_WAS_64BIT_ADDR : IMGPF_NONE) \| ((p->p_flag & P_DISABLE_ASLR) ? IMGPF_DISABLE_ASLR : IMGPF_NONE);
5213	imgp->ip_seg = (is_64 ? UIO_USERSPACE64 : UIO_USERSPACE32);
5214	imgp->ip_mac_return = `0`;
5215	imgp->ip_cs_error = OS_REASON_NULL;
5216	imgp->ip_simulator_binary = IMGPF_SB_DEFAULT;
5217	imgp->ip_subsystem_root_path = NULL;
5218	uthread_set_exec_data(uth: current_uthread(), imgp);
5219
5220	#if CONFIG_MACF
5221	if (uap->mac_p != USER_ADDR_NULL) {
5222	error = mac_execve_enter(mac_p: uap->mac_p, imgp);
5223	if (error) {
5224	goto exit_with_error;
5225	}
5226	}
5227	#endif
	/*
	 * NOTE(review): uthread is assigned here but is not read anywhere
	 * else in this function.
	 */
5228	uthread = current_uthread();
5229	{
5230	imgp->ip_flags \|= IMGPF_EXEC;
5231
5232	/ Adjust the user proc count /
5233	(void)chgproccnt(uid: kauth_getruid(), diff: `1`);
5234	/*
5235	* For execve case, create a new proc, task and thread
5236	* but don't make the proc visible to userland. After
5237	* image activation, the new proc would take place of
5238	* the old proc in pid hash and other lists that make
5239	* the proc visible to the system.
5240	*/
5241	imgp->ip_new_thread = cloneproc(old_task, NULL, p, CLONEPROC_EXEC);
5242	/ task and thread ref returned by cloneproc /
5243	if (imgp->ip_new_thread == NULL) {
5244	(void)chgproccnt(uid: kauth_getruid(), diff: -`1`);
5245	error = ENOMEM;
5246	goto exit_with_error;
5247	}
5248
5249	new_task = get_threadtask(imgp->ip_new_thread);
5250	}
5251
	/*
	 * From here on p is the proc obtained from the new thread, not the
	 * proc passed in by the caller.
	 */
5252	p = (proc_t)get_bsdthreadtask_info(imgp->ip_new_thread);
5253
5254	context.vc_thread = imgp->ip_new_thread;
5255	context.vc_ucred = kauth_cred_proc_ref(procp: p); / XXX must NOT be kauth_cred_get() /
5256
5257	imgp->ip_subsystem_root_path = p->p_subsystem_root_path;
5258
5259	proc_transend(p, locked: `0`);
5260	proc_signalend(p, locked: `0`);
5261
5262
5263	/*
5264	* Activate the image.
5265	* Warning: If activation failed after point of no return, it returns error
5266	* as 0 and pretends the call succeeded.
5267	*/
5268	error = exec_activate_image(imgp);
5269	/ thread and task ref returned for vfexec case /
5270
5271	if (imgp->ip_new_thread != NULL) {
5272	/*
5273	* task reference might be returned by exec_activate_image
5274	* for vfexec.
5275	*/
5276	new_task = get_threadtask(imgp->ip_new_thread);
5277	#if defined(HAS_APPLE_PAC)
5278	ml_task_set_disable_user_jop(task: new_task, disable_user_jop: imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE);
5279	ml_thread_set_disable_user_jop(thread: imgp->ip_new_thread, disable_user_jop: imgp->ip_flags & IMGPF_NOJOP ? TRUE : FALSE);
5280	#endif
5281	}
5282
5283	if (!error) {
	/*
	 * The proc being replaced is taken from current_proc(), since p was
	 * re-pointed at the new proc above.
	 */
5284	p = proc_exec_switch_task(old_proc: current_proc(), new_proc: p, old_task, new_task, imgp, inherit: &inherit);
5285	/ proc ref returned /
5286	should_release_proc_ref = TRUE;
5287	}
5288
5289	kauth_cred_unref(&context.vc_ucred);
5290
5291	if (!error) {
5292	exec_done = TRUE;
5293	assert(imgp->ip_new_thread != NULL);
5294
5295	exec_resettextvp(p, imgp);
5296	/*
5297	* Enable new task IPC access if exec_activate_image() returned an
5298	* active task. (Checks active bit in ipc_task_enable() under lock).
5299	* Must enable after resettextvp so that task port policies are not evaluated
5300	* until the csblob in the textvp is accurately reflected.
5301	*/
5302	ipc_task_enable(task: new_task);
	/*
	 * process_signature() may set error after exec_done is already TRUE;
	 * in that case the cleanup below does not add
	 * TCRW_CLEAR_EXEC_COMPLETE to the clear-wait flags.
	 */
5303	error = process_signature(p, imgp);
5304	}
5305
5306	#if defined(HAS_APPLE_PAC)
5307	if (imgp->ip_new_thread && !error) {
5308	ml_task_set_jop_pid_from_shared_region(task: new_task, disable_user_jop: imgp->ip_flags & IMGPF_NOJOP);
5309	ml_thread_set_jop_pid(thread: imgp->ip_new_thread, task: new_task);
5310	}
5311	#endif /* defined(HAS_APPLE_PAC) */
5312
5313	/ flag exec has occurred, notify only if it has not failed due to FP Key error /
5314	if (exec_done && ((p->p_lflag & P_LTERM_DECRYPTFAIL) == `0`)) {
5315	proc_knote(p, NOTE_EXEC);
5316	}
5317
5318	if (imgp->ip_vp != NULLVP) {
5319	vnode_put(vp: imgp->ip_vp);
5320	}
5321	if (imgp->ip_scriptvp != NULLVP) {
5322	vnode_put(vp: imgp->ip_scriptvp);
5323	}
5324	if (imgp->ip_free_map) {
5325	/ Free the map after dropping iocount on vnode to avoid deadlock /
5326	vm_map_deallocate(map: imgp->ip_free_map);
5327	}
5328	if (imgp->ip_strings) {
5329	execargs_free(imgp);
5330	}
5331	#if CONFIG_MACF
5332	if (imgp->ip_execlabelp) {
5333	mac_cred_label_free(label: imgp->ip_execlabelp);
5334	imgp->ip_execlabelp = NULL;
5335	}
5336	if (imgp->ip_scriptlabelp) {
5337	mac_vnode_label_free(label: imgp->ip_scriptlabelp);
5338	imgp->ip_scriptlabelp = NULL;
5339	}
5340	#endif
5341	if (imgp->ip_cs_error != OS_REASON_NULL) {
5342	os_reason_free(cur_reason: imgp->ip_cs_error);
5343	imgp->ip_cs_error = OS_REASON_NULL;
5344	}
5345
5346	if (!error) {
5347	/*
5348	* We need to initialize the bank context behind the protection of
5349	* the proc_trans lock to prevent a race with exit. We can't do this during
5350	* exec_activate_image because task_bank_init checks entitlements that
5351	* aren't loaded until subsequent calls (including exec_resettextvp).
5352	*/
5353	error = proc_transstart(p, locked: `0`, non_blocking: `0`);
5354	}
5355
5356	if (!error) {
5357	task_bank_init(task: new_task);
5358	proc_transend(p, locked: `0`);
5359
5360	// Don't inherit crash behavior across exec
5361	p->p_crash_behavior = `0`;
5362	p->p_crash_behavior_deadline = `0`;
5363
5364	#if __arm64__
5365	proc_footprint_entitlement_hacks(p, task: new_task);
5366	#endif /* __arm64__ */
5367
5368	#if XNU_TARGET_OS_OSX
5369	if (IOTaskHasEntitlement(task: new_task, SINGLE_JIT_ENTITLEMENT)) {
5370	vm_map_single_jit(map: get_task_map(new_task));
5371	}
5372	#endif /* XNU_TARGET_OS_OSX */
5373
5374	/ Sever any extant thread affinity /
5375	thread_affinity_exec(thread: current_thread());
5376
5377	/ Inherit task role from old task to new task for exec /
5378	proc_inherit_task_role(new_task, old_task);
5379
5380	thread_t main_thread = imgp->ip_new_thread;
5381
5382	task_set_main_thread_qos(task: new_task, main_thread);
5383
5384	#if __has_feature(ptrauth_calls)
5385	task_set_pac_exception_fatal_flag(new_task);
5386	#endif /* __has_feature(ptrauth_calls) */
5387	task_set_jit_exception_fatal_flag(task: new_task);
5388
5389	#if CONFIG_ARCADE
5390	/*
5391	* Check to see if we need to trigger an arcade upcall AST now
5392	* that the vnode has been reset on the task.
5393	*/
5394	arcade_prepare(task: new_task, thread: imgp->ip_new_thread);
5395	#endif /* CONFIG_ARCADE */
5396
5397	proc_apply_jit_and_vm_policies(imgp, p, task: new_task);
5398
5399	if (vm_darkwake_mode == TRUE) {
5400	/*
5401	* This process is being launched when the system
5402	* is in darkwake. So mark it specially. This will
5403	* cause all its pages to be entered in the background Q.
5404	*/
5405	task_set_darkwake_mode(new_task, vm_darkwake_mode);
5406	}
5407
5408	#if CONFIG_DTRACE
5409	dtrace_thread_didexec(imgp->ip_new_thread);
5410
5411	if ((dtrace_proc_waitfor_hook = dtrace_proc_waitfor_exec_ptr) != NULL) {
5412	(*dtrace_proc_waitfor_hook)(p);
5413	}
5414	#endif
5415
5416	#if CONFIG_AUDIT
	/*
	 * NOTE(review): the !error test below is redundant; this code is
	 * already inside the if (!error) branch and error is not assigned
	 * in between.
	 */
5417	if (!error && AUDIT_ENABLED() && p) {
5418	/ Add the CDHash of the new process to the audit record /
5419	uint8_t *cdhash = cs_get_cdhash(p);
5420	if (cdhash) {
5421	AUDIT_ARG(data, cdhash, sizeof(uint8_t), CS_CDHASH_LEN);
5422	}
5423	}
5424	#endif
5425	} else {
5426	DTRACE_PROC1(exec__failure, int, error);
5427	}
5428
	/*
	 * Common cleanup: reached by falling through from above as well as
	 * by every goto in this function.
	 */
5429	exit_with_error:
5430
5431	/ terminate the new task it if exec failed /
5432	if (new_task != NULL && task_is_exec_copy(new_task)) {
5433	task_terminate_internal(task: new_task);
5434	}
5435
5436	if (imgp != NULL) {
5437	/ Clear the initial wait on the thread transferring watchports /
5438	if (imgp->ip_new_thread) {
5439	task_clear_return_wait(task: get_threadtask(imgp->ip_new_thread), TCRW_CLEAR_INITIAL_WAIT);
5440	}
5441
5442	/ Transfer the watchport boost to new task /
5443	if (!error) {
5444	task_transfer_turnstile_watchports(old_task,
5445	new_task, new_thread: imgp->ip_new_thread);
5446	}
5447	/*
5448	* Do not terminate the current task, if proc_exec_switch_task did not
5449	* switch the tasks, terminating the current task without the switch would
5450	* result in losing the SIGKILL status.
5451	*/
5452	if (task_did_exec(task: old_task)) {
5453	/ Terminate the current task, since exec will start in new task /
5454	task_terminate_internal(task: old_task);
5455	}
5456
5457	/ Release the thread ref returned by cloneproc /
5458	if (imgp->ip_new_thread) {
5459	/ clear the exec complete flag if there is an error before point of no-return /
5460	uint32_t clearwait_flags = TCRW_CLEAR_FINAL_WAIT;
5461	if (!exec_done && error != `0`) {
5462	clearwait_flags \|= TCRW_CLEAR_EXEC_COMPLETE;
5463	}
5464	/ wake up the new exec thread /
5465	task_clear_return_wait(task: get_threadtask(imgp->ip_new_thread), flags: clearwait_flags);
5466	thread_deallocate(thread: imgp->ip_new_thread);
5467	imgp->ip_new_thread = NULL;
5468	}
5469	}
5470
5471	/ Release the ref returned by fork_create_child /
5472	if (new_task) {
5473	task_deallocate(new_task);
5474	new_task = NULL;
5475	}
5476
5477	if (should_release_proc_ref) {
5478	proc_rele(p);
5479	}
5480
5481	uthread_set_exec_data(uth: current_uthread(), NULL);
5482	kfree_type(typeof(*__execve_data), __execve_data);
5483
5484	if (inherit != NULL) {
5485	ipc_importance_release(elem: inherit);
5486	}
5487
5488	return error;
5489	}
5490
5491
5492	/*
5493	* copyinptr
5494	*
5495	* Description: Copy a pointer in from user space to a user_addr_t in kernel
5496	* space, based on 32/64 bitness of the user space
5497	*
5498	* Parameters: froma User space address
5499	* toptr Address of kernel space user_addr_t
5500	* ptr_size 4/8, based on 'froma' address space
5501	*
5502	* Returns: 0 Success
5503	* EFAULT Bad 'froma'
5504	*
5505	* Implicit returns:
5506	* *toptr Modified
5507	*/
5508	static int
5509	copyinptr(user_addr_t froma, user_addr_t toptr, int* ptr_size)
5510	{
	/*
	 * Copies 4 bytes from froma when ptr_size is 4 and 8 bytes for any
	 * other ptr_size. In the 4-byte case the value is read into a
	 * zero-initialised unsigned int and converted with CAST_USER_ADDR_T;
	 * that assignment happens whether or not copyin() reported an error.
	 * The copyin() result is returned unchanged.
	 */
5511	int error;
5512
5513	if (ptr_size == `4`) {
5514	/ 64 bit value containing 32 bit address /
5515	unsigned int i = `0`;
5516
5517	error = copyin(froma, &i, `4`);
5518	toptr = CAST_USER_ADDR_T(i); /* SAFE /
5519	} else {
5520	error = copyin(froma, toptr, `8`);
5521	}
5522	return error;
5523	}
5524
5525
5526	/*
5527	* copyoutptr
5528	*
5529	* Description: Copy a pointer out from a user_addr_t in kernel space to
5530	* user space, based on 32/64 bitness of the user space
5531	*
5532	* Parameters: ua Value (a user_addr_t) to copy out
5533	* ptr User space address to copy to
5534	* ptr_size 4/8, based on 'ptr' address space
5535	*
5536	* Returns: 0 Success
5537	* EFAULT Bad 'ptr'
5538	*
5539	*/
5540	static int
5541	copyoutptr(user_addr_t ua, user_addr_t ptr, int ptr_size)
5542	{
	/*
	 * Writes the value ua to the destination ptr via copyout(): 4 bytes
	 * (after CAST_DOWN_EXPLICIT to unsigned int) when ptr_size is 4,
	 * 8 bytes for any other ptr_size. The copyout() result is returned
	 * unchanged.
	 */
5543	int error;
5544
5545	if (ptr_size == `4`) {
5546	/ 64 bit value containing 32 bit address /
5547	unsigned int i = CAST_DOWN_EXPLICIT(unsigned int, ua); / SAFE /
5548
5549	error = copyout(&i, ptr, `4`);
5550	} else {
5551	error = copyout(&ua, ptr, `8`);
5552	}
5553	return error;
5554	}
5555
5556
5557	/*
5558	* exec_copyout_strings
5559	*
5560	* Copy out the strings segment to user space. The strings segment is put
5561	* on a preinitialized stack frame.
5562	*
5563	* Parameters: struct image_params * the image parameter block
5564	* int * a pointer to the stack offset variable
5565	*
5566	* Returns: 0 Success
5567	* !0 Failure: errno
5568	*
5569	* Implicit returns:
5570	* (*stackp) The stack offset, modified
5571	*
5572	* Note: The strings segment layout is backward, from the beginning
5573	* of the top of the stack to consume the minimal amount of
5574	* space possible; the returned stack pointer points to the
5575	* end of the area consumed (stacks grow downward).
5576	*
5577	* argc is an int; arg[i] are pointers; env[i] are pointers;
5578	* the 0's are (void *)NULL's
5579	*
5580	* The stack frame layout is:
5581	*
5582	* +-------------+ <- p->user_stack
5583	* \| 16b \|
5584	* +-------------+
5585	* \| STRING AREA \|
5586	* \| : \|
5587	* \| : \|
5588	* \| : \|
5589	* +- -- -- -- --+
5590	* \| PATH AREA \|
5591	* +-------------+
5592	* \| 0 \|
5593	* +-------------+
5594	* \| applev[n] \|
5595	* +-------------+
5596	* :
5597	* :
5598	* +-------------+
5599	* \| applev[1] \|
5600	* +-------------+
5601	* \| exec_path / \|
5602	* \| applev[0] \|
5603	* +-------------+
5604	* \| 0 \|
5605	* +-------------+
5606	* \| env[n] \|
5607	* +-------------+
5608	* :
5609	* :
5610	* +-------------+
5611	* \| env[0] \|
5612	* +-------------+
5613	* \| 0 \|
5614	* +-------------+
5615	* \| arg[argc-1] \|
5616	* +-------------+
5617	* :
5618	* :
5619	* +-------------+
5620	* \| arg[0] \|
5621	* +-------------+
5622	* \| argc \|
5623	* sp-> +-------------+
5624	*
5625	* Although technically a part of the STRING AREA, we treat the PATH AREA as
5626	* a separate entity. This allows us to align the beginning of the PATH AREA
5627	* to a pointer boundary so that the exec_path, env[i], and argv[i] pointers
5628	* which precede it on the stack are properly aligned.
5629	*/
5630	__attribute__((noinline))
5631	static int
5632	exec_copyout_strings(struct image_params imgp, user_addr_t stackp)
5633	{
	/*
	 * Steps performed below:
	 *  - carve the string area, the pointer area and the argc slot out of
	 *    the stack, moving downward from the incoming *stackp;
	 *  - record p_argc and p_argslen on the proc and store the new stack
	 *    address in *stackp;
	 *  - copy out the strings, then build the pointer arrays in the
	 *    buffer starting at ip_strendp and copy them out in one call;
	 *  - copy out argc into its ptr_size-wide slot.
	 * Note that *stackp is updated before any copyout() is attempted, so
	 * it is modified even when the function returns an error (other than
	 * the early EINVAL alignment check).
	 */
5634	proc_t p = vfs_context_proc(ctx: imgp->ip_vfs_context);
5635	int ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? `8` : `4`;
5636	int ptr_area_size;
5637	void ptr_buffer_start, ptr_buffer;
5638	size_t string_size;
5639
5640	user_addr_t string_area; / argv[], env[] /
5641	user_addr_t ptr_area; / argv[], env[], applev[] /
5642	user_addr_t argc_area; / argc /
5643	user_addr_t stack;
5644	int error;
5645
5646	unsigned i;
5647	struct copyout_desc {
5648	char *start_string;
5649	int count;
5650	#if CONFIG_DTRACE
5651	user_addr_t *dtrace_cookie;
5652	#endif
5653	boolean_t null_term;
5654	} descriptors[] = {
5655	{
5656	.start_string = imgp->ip_startargv,
5657	.count = imgp->ip_argc,
5658	#if CONFIG_DTRACE
5659	.dtrace_cookie = &p->p_dtrace_argv,
5660	#endif
5661	.null_term = TRUE
5662	},
5663	{
5664	.start_string = imgp->ip_endargv,
5665	.count = imgp->ip_envc,
5666	#if CONFIG_DTRACE
5667	.dtrace_cookie = &p->p_dtrace_envp,
5668	#endif
5669	.null_term = TRUE
5670	},
5671	{
	/*
	 * One pointer to the start of the strings buffer, with no NULL
	 * terminator, so the entries of the next descriptor follow it
	 * directly in the pointer array.
	 */
5672	.start_string = imgp->ip_strings,
5673	.count = `1`,
5674	#if CONFIG_DTRACE
5675	.dtrace_cookie = NULL,
5676	#endif
5677	.null_term = FALSE
5678	},
5679	{
5680	.start_string = imgp->ip_endenvv,
5681	.count = imgp->ip_applec - `1`, / exec_path handled above /
5682	#if CONFIG_DTRACE
5683	.dtrace_cookie = NULL,
5684	#endif
5685	.null_term = TRUE
5686	}
5687	};
5688
5689	stack = *stackp;
5690
5691	/*
5692	* All previous contributors to the string area
5693	* should have aligned their sub-area
5694	*/
5695	if (imgp->ip_strspace % ptr_size != `0`) {
5696	error = EINVAL;
5697	goto bad;
5698	}
5699
5700	/ Grow the stack down for the strings we've been building up /
5701	string_size = imgp->ip_strendp - imgp->ip_strings;
5702	stack -= string_size;
5703	string_area = stack;
5704
5705	/*
5706	* Need room for one pointer for each string, plus
5707	* one for the NULLs terminating the argv, envv, and apple areas.
5708	*/
5709	ptr_area_size = (imgp->ip_argc + imgp->ip_envc + imgp->ip_applec + `3`) * ptr_size;
5710	stack -= ptr_area_size;
5711	ptr_area = stack;
5712
5713	/ We'll construct all the pointer arrays in our string buffer,*
5714	* which we already know is aligned properly, and ip_argspace
5715	* was used to verify we have enough space.
5716	*/
5717	ptr_buffer_start = ptr_buffer = (void *)imgp->ip_strendp;
5718
5719	/*
5720	* Need room for pointer-aligned argc slot.
5721	*/
5722	stack -= ptr_size;
5723	argc_area = stack;
5724
5725	/*
5726	* Record the size of the arguments area so that sysctl_procargs()
5727	* can return the argument area without having to parse the arguments.
5728	*/
5729	proc_lock(p);
5730	p->p_argc = imgp->ip_argc;
	/*
	 * string_area was derived from the incoming *stackp, so this
	 * difference equals string_size; it is read before *stackp is
	 * overwritten below.
	 */
5731	p->p_argslen = (int)(*stackp - string_area);
5732	proc_unlock(p);
5733
5734	/ Return the initial stack address: the location of argc /
5735	*stackp = stack;
5736
5737	/*
5738	* Copy out the entire strings area.
5739	*/
5740	error = copyout(imgp->ip_strings, string_area,
5741	string_size);
5742	if (error) {
5743	goto bad;
5744	}
5745
5746	for (i = `0`; i < sizeof(descriptors) / sizeof(descriptors[`0`]); i++) {
5747	char *cur_string = descriptors[i].start_string;
5748	int j;
5749
5750	#if CONFIG_DTRACE
5751	if (descriptors[i].dtrace_cookie) {
5752	proc_lock(p);
5753	descriptors[i].dtrace_cookie = ptr_area + ((uintptr_t)ptr_buffer - (uintptr_t)ptr_buffer_start); /* dtrace convenience /
5754	proc_unlock(p);
5755	}
5756	#endif /* CONFIG_DTRACE */
5757
5758	/*
5759	* For each segment (argv, envv, applev), copy as many pointers as requested
5760	* to our pointer buffer.
5761	*/
5762	for (j = `0`; j < descriptors[i].count; j++) {
	/*
	 * Translate the kernel buffer position of cur_string into its user
	 * address inside string_area.
	 */
5763	user_addr_t cur_address = string_area + (cur_string - imgp->ip_strings);
5764
5765	/ Copy out the pointer to the current string. Alignment has been verified /
5766	if (ptr_size == `8`) {
5767	(uint64_t )ptr_buffer = (uint64_t)cur_address;
5768	} else {
5769	(uint32_t )ptr_buffer = (uint32_t)cur_address;
5770	}
5771
5772	ptr_buffer = (void *)((uintptr_t)ptr_buffer + ptr_size);
5773	cur_string += strlen(s: cur_string) + `1`; / Only a NUL between strings in the same area /
5774	}
5775
5776	if (descriptors[i].null_term) {
5777	if (ptr_size == `8`) {
5778	(uint64_t )ptr_buffer = `0ULL`;
5779	} else {
5780	(uint32_t )ptr_buffer = `0`;
5781	}
5782
5783	ptr_buffer = (void *)((uintptr_t)ptr_buffer + ptr_size);
5784	}
5785	}
5786
5787	/*
5788	* Copy out all our pointer arrays in bulk.
5789	*/
5790	error = copyout(ptr_buffer_start, ptr_area,
5791	ptr_area_size);
5792	if (error) {
5793	goto bad;
5794	}
5795
5796	/ argc (int32, stored in a ptr_size area) /
5797	error = copyoutptr(ua: (user_addr_t)imgp->ip_argc, ptr: argc_area, ptr_size);
5798	if (error) {
5799	goto bad;
5800	}
5801
	/*
	 * Reached on both success and failure; error is 0 when every step
	 * above succeeded.
	 */
5802	bad:
5803	return error;
5804	}
5805
5806
5807	/*
5808	* exec_extract_strings
5809	*
5810	* Copy arguments and environment from user space into work area; we may
5811	* have already copied some early arguments into the work area, and if
5812	* so, any arguments copied in are appended to those already there.
5813	* This function is the primary manipulator of ip_argspace, since
5814	* these are the arguments the client of execve(2) knows about. After
5815	* each argv[]/envv[] string is copied, we charge the string length
5816	* and argv[]/envv[] pointer slot to ip_argspace, so that we can
5817	* full preflight the arg list size.
5818	*
5819	* Parameters: struct image_params * the image parameter block
5820	*
5821	* Returns: 0 Success
5822	* !0 Failure: errno
5823	*
5824	* Implicit returns;
5825	* (imgp->ip_argc) Count of arguments, updated
5826	* (imgp->ip_envc) Count of environment strings, updated
5827	* (imgp->ip_argspace) Count of remaining of NCARGS
5828	* (imgp->ip_interp_buffer) Interpreter and args (mutated in place)
5829	*
5830	*
5831	* Note: The argument and environment vectors are user space pointers
5832	* to arrays of user space pointers.
5833	*/
5834	__attribute__((noinline))
5835	static int
5836	exec_extract_strings(struct image_params *imgp)
5837	{
5838	int error = `0`;
5839	int ptr_size = (imgp->ip_flags & IMGPF_WAS_64BIT_ADDR) ? `8` : `4`;
5840	int new_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? `8` : `4`;
5841	user_addr_t argv = imgp->ip_user_argv;
5842	user_addr_t envv = imgp->ip_user_envv;
5843
5844	/*
5845	* Adjust space reserved for the path name by however much padding it
5846	* needs. Doing this here since we didn't know if this would be a 32-
5847	* or 64-bit process back in exec_save_path.
5848	*/
5849	while (imgp->ip_strspace % new_ptr_size != `0`) {
5850	*imgp->ip_strendp++ = `'\0'`;
5851	imgp->ip_strspace--;
5852	/ imgp->ip_argspace--; not counted towards exec args total /
5853	}
5854
5855	/*
5856	* From now on, we start attributing string space to ip_argspace
5857	*/
5858	imgp->ip_startargv = imgp->ip_strendp;
5859	imgp->ip_argc = `0`;
5860
5861	if ((imgp->ip_flags & IMGPF_INTERPRET) != `0`) {
5862	user_addr_t arg;
5863	char argstart, ch;
5864
5865	/ First, the arguments in the "#!" string are tokenized and extracted. /
5866	argstart = imgp->ip_interp_buffer;
5867	while (argstart) {
5868	ch = argstart;
5869	while (ch && !IS_WHITESPACE(ch)) {
5870	ch++;
5871	}
5872
5873	if (*ch == `'\0'`) {
5874	/ last argument, no need to NUL-terminate /
5875	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(argstart), seg: UIO_SYSSPACE, TRUE);
5876	argstart = NULL;
5877	} else {
5878	/ NUL-terminate /
5879	*ch = `'\0'`;
5880	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(argstart), seg: UIO_SYSSPACE, TRUE);
5881
5882	/*
5883	* Find the next string. We know spaces at the end of the string have already
5884	* been stripped.
5885	*/
5886	argstart = ch + `1`;
5887	while (IS_WHITESPACE(*argstart)) {
5888	argstart++;
5889	}
5890	}
5891
5892	/ Error-check, regardless of whether this is the last interpreter arg or not /
5893	if (error) {
5894	goto bad;
5895	}
5896	if (imgp->ip_argspace < new_ptr_size) {
5897	error = E2BIG;
5898	goto bad;
5899	}
5900	imgp->ip_argspace -= new_ptr_size; / to hold argv[] entry /
5901	imgp->ip_argc++;
5902	}
5903
5904	if (argv != `0LL`) {
5905	/*
5906	* If we are running an interpreter, replace the av[0] that was
5907	* passed to execve() with the path name that was
5908	* passed to execve() for interpreters which do not use the PATH
5909	* to locate their script arguments.
5910	*/
5911	error = copyinptr(froma: argv, toptr: &arg, ptr_size);
5912	if (error) {
5913	goto bad;
5914	}
5915	if (arg != `0LL`) {
5916	argv += ptr_size; / consume without using /
5917	}
5918	}
5919
5920	if (imgp->ip_interp_sugid_fd != -`1`) {
5921	char temp[`19`]; / "/dev/fd/" + 10 digits + NUL /
5922	snprintf(temp, count: sizeof(temp), "/dev/fd/%d", imgp->ip_interp_sugid_fd);
5923	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(temp), seg: UIO_SYSSPACE, TRUE);
5924	} else {
5925	error = exec_add_user_string(imgp, str: imgp->ip_user_fname, seg: imgp->ip_seg, TRUE);
5926	}
5927
5928	if (error) {
5929	goto bad;
5930	}
5931	if (imgp->ip_argspace < new_ptr_size) {
5932	error = E2BIG;
5933	goto bad;
5934	}
5935	imgp->ip_argspace -= new_ptr_size; / to hold argv[] entry /
5936	imgp->ip_argc++;
5937	}
5938
5939	while (argv != `0LL`) {
5940	user_addr_t arg;
5941
5942	error = copyinptr(froma: argv, toptr: &arg, ptr_size);
5943	if (error) {
5944	goto bad;
5945	}
5946
5947	if (arg == `0LL`) {
5948	break;
5949	}
5950
5951	argv += ptr_size;
5952
5953	/*
5954	* av[n...] = arg[n]
5955	*/
5956	error = exec_add_user_string(imgp, str: arg, seg: imgp->ip_seg, TRUE);
5957	if (error) {
5958	goto bad;
5959	}
5960	if (imgp->ip_argspace < new_ptr_size) {
5961	error = E2BIG;
5962	goto bad;
5963	}
5964	imgp->ip_argspace -= new_ptr_size; / to hold argv[] entry /
5965	imgp->ip_argc++;
5966	}
5967
5968	/ Save space for argv[] NULL terminator /
5969	if (imgp->ip_argspace < new_ptr_size) {
5970	error = E2BIG;
5971	goto bad;
5972	}
5973	imgp->ip_argspace -= new_ptr_size;
5974
5975	/ Note where the args ends and env begins. /
5976	imgp->ip_endargv = imgp->ip_strendp;
5977	imgp->ip_envc = `0`;
5978
5979	/ Now, get the environment /
5980	while (envv != `0LL`) {
5981	user_addr_t env;
5982
5983	error = copyinptr(froma: envv, toptr: &env, ptr_size);
5984	if (error) {
5985	goto bad;
5986	}
5987
5988	envv += ptr_size;
5989	if (env == `0LL`) {
5990	break;
5991	}
5992	/*
5993	* av[n...] = env[n]
5994	*/
5995	error = exec_add_user_string(imgp, str: env, seg: imgp->ip_seg, TRUE);
5996	if (error) {
5997	goto bad;
5998	}
5999	if (imgp->ip_argspace < new_ptr_size) {
6000	error = E2BIG;
6001	goto bad;
6002	}
6003	imgp->ip_argspace -= new_ptr_size; / to hold envv[] entry /
6004	imgp->ip_envc++;
6005	}
6006
6007	/ Save space for envv[] NULL terminator /
6008	if (imgp->ip_argspace < new_ptr_size) {
6009	error = E2BIG;
6010	goto bad;
6011	}
6012	imgp->ip_argspace -= new_ptr_size;
6013
6014	/ Align the tail of the combined argv+envv area /
6015	while (imgp->ip_strspace % new_ptr_size != `0`) {
6016	if (imgp->ip_argspace < `1`) {
6017	error = E2BIG;
6018	goto bad;
6019	}
6020	*imgp->ip_strendp++ = `'\0'`;
6021	imgp->ip_strspace--;
6022	imgp->ip_argspace--;
6023	}
6024
6025	/ Note where the envv ends and applev begins. /
6026	imgp->ip_endenvv = imgp->ip_strendp;
6027
6028	/*
6029	* From now on, we are no longer charging argument
6030	* space to ip_argspace.
6031	*/
6032
6033	bad:
6034	return error;
6035	}
6036
/*
 * Keys for the apple[] strings built by exec_add_apple_strings().  For the
 * entropy-style keys the matching *_VALUES macro is the number of random
 * 64-bit values that exec_add_entropy_key() appends after the key.
 */

/*
 * Libc has an 8-element array set up for stack guard values.  It only fills
 * in one of those entries, and both gcc and llvm seem to use only a single
 * 8-byte guard.  Until somebody needs more than an 8-byte guard value, don't
 * do the work to construct them.
 */
#define GUARD_VALUES 1
#define GUARD_KEY "stack_guard="

/*
 * System malloc needs some entropy when it is initialized.
 */
#define ENTROPY_VALUES 2
#define ENTROPY_KEY "malloc_entropy="

/*
 * libplatform needs a random pointer-obfuscation value when it is initialized.
 */
#define PTR_MUNGE_VALUES 1
#define PTR_MUNGE_KEY "ptr_munge="

/*
 * System malloc engages nanozone for UIAPP.
 * (Emitted verbatim when the spawn attributes carry
 * _POSIX_SPAWN_NANO_ALLOCATOR.)
 */
#define NANO_ENGAGE_KEY "MallocNanoZone=1"

/*
 * Used to pass experiment flags up to libmalloc.
 */
#define LIBMALLOC_EXPERIMENT_FACTORS_KEY "MallocExperiment="

/*
 * Passes information about hardened runtime entitlements to libsystem/libmalloc
 */
#define HARDENED_RUNTIME_KEY "HardenedRuntime="

/*
 * Key for the commpage text location; the value is
 * commpage_text64_location or commpage_text32_location in hex, chosen by
 * the pointer size of the new image.
 */
#define PFZ_KEY "pfz="
extern user32_addr_t commpage_text32_location;
extern user64_addr_t commpage_text64_location;

/* Hashed together with the salt and the cdhash into "executable_boothash=". */
extern uuid_string_t bootsessionuuid_string;
6078	static TUNABLE(uint32_t, exe_boothash_salt, "exe_boothash_salt", `0`);
6079
6080	__startup_func
6081	static void
6082	exe_boothash_salt_generate(void)
6083	{
6084	if (!PE_parse_boot_argn(arg_string: "exe_boothash_salt", NULL, max_arg: `0`)) {
6085	read_random(buffer: &exe_boothash_salt, numBytes: sizeof(exe_boothash_salt));
6086	}
6087	}
6088	STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, exe_boothash_salt_generate);
6089
6090
/*
 * "main_stack=" carries MAIN_STACK_VALUES comma-separated hex values:
 * user_stack, user_stack_size, user_stack_alloc, user_stack_alloc_size.
 */
#define MAIN_STACK_VALUES 4
#define MAIN_STACK_KEY "main_stack="

/* Remaining apple[] string keys emitted by exec_add_apple_strings(). */
#define FSID_KEY "executable_file="
#define DYLD_FSID_KEY "dyld_file="
#define CDHASH_KEY "executable_cdhash="
#define DYLD_FLAGS_KEY "dyld_flags="
#define SUBSYSTEM_ROOT_PATH_KEY "subsystem_root_path="
#define APP_BOOT_SESSION_KEY "executable_boothash="
#if __has_feature(ptrauth_calls)
#define PTRAUTH_DISABLED_FLAG "ptrauth_disabled=1"
#define DYLD_ARM64E_ABI_KEY "arm64e_abi="
#endif /* __has_feature(ptrauth_calls) */
#define MAIN_TH_PORT_KEY "th_port="

/* Used only via strlen() to size buffers holding "<fsid>,<fileid>" in hex. */
#define FSID_MAX_STRING "0x1234567890abcdef,0x1234567890abcdef"

/* Buffer-sizing lengths for one hex-formatted value, excluding the NUL. */
#define HEX_STR_LEN 18 // 64-bit hex value "0x0123456701234567"
#define HEX_STR_LEN32 10 // 32-bit hex value "0x01234567"

#if XNU_TARGET_OS_OSX && _POSIX_SPAWN_FORCE_4K_PAGES && PMAP_CREATE_FORCE_4K_PAGES
#define VM_FORCE_4K_PAGES_KEY "vm_force_4k_pages=1"
#endif /* XNU_TARGET_OS_OSX && _POSIX_SPAWN_FORCE_4K_PAGES && PMAP_CREATE_FORCE_4K_PAGES */
6114
6115	static int
6116	exec_add_entropy_key(struct image_params *imgp,
6117	const char *key,
6118	int values,
6119	boolean_t embedNUL)
6120	{
6121	const int limit = `8`;
6122	uint64_t entropy[limit];
6123	char str[strlen(s: key) + (HEX_STR_LEN + `1`) * limit + `1`];
6124	if (values > limit) {
6125	values = limit;
6126	}
6127
6128	read_random(buffer: entropy, numBytes: sizeof(entropy[`0`]) * values);
6129
6130	if (embedNUL) {
6131	entropy[`0`] &= ~(`0xffull` << `8`);
6132	}
6133
6134	int len = scnprintf(str, count: sizeof(str), "%s0x%llx", key, entropy[`0`]);
6135	size_t remaining = sizeof(str) - len;
6136	for (int i = `1`; i < values && remaining > `0`; ++i) {
6137	size_t start = sizeof(str) - remaining;
6138	len = scnprintf(&str[start], count: remaining, ",0x%llx", entropy[i]);
6139	remaining -= len;
6140	}
6141
6142	return exec_add_user_string(imgp, CAST_USER_ADDR_T(str), seg: UIO_SYSSPACE, FALSE);
6143	}
6144
/*
 * Build up the contents of the apple[] string vector
 */
#if (DEVELOPMENT || DEBUG)
/* Defined elsewhere; emitted as "dyld_flags=0x..." when non-zero. */
extern uint64_t dyld_flags;
#endif
6151
6152	#if __has_feature(ptrauth_calls)
6153	static inline bool
6154	is_arm64e_running_as_arm64(const struct image_params *imgp)
6155	{
6156	return (imgp->ip_origcpusubtype & ~CPU_SUBTYPE_MASK) == CPU_SUBTYPE_ARM64E &&
6157	(imgp->ip_flags & IMGPF_NOJOP);
6158	}
6159	#endif /* __has_feature(ptrauth_calls) */
6160
6161	_Atomic uint64_t libmalloc_experiment_factors = `0`;
6162
6163	static int
6164	exec_add_apple_strings(struct image_params *imgp,
6165	const load_result_t *load_result)
6166	{
6167	int error;
6168	int img_ptr_size = (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) ? `8` : `4`;
6169	thread_t new_thread;
6170	ipc_port_t sright;
6171	uint64_t local_experiment_factors = `0`;
6172
6173	/ exec_save_path stored the first string /
6174	imgp->ip_applec = `1`;
6175
6176	/ adding the pfz string /
6177	{
6178	char pfz_string[strlen(PFZ_KEY) + HEX_STR_LEN + `1`];
6179
6180	if (img_ptr_size == `8`) {
6181	__assert_only size_t ret = snprintf(pfz_string, count: sizeof(pfz_string), PFZ_KEY "0x%llx", commpage_text64_location);
6182	assert(ret < sizeof(pfz_string));
6183	} else {
6184	snprintf(pfz_string, count: sizeof(pfz_string), PFZ_KEY "0x%x", commpage_text32_location);
6185	}
6186	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(pfz_string), seg: UIO_SYSSPACE, FALSE);
6187	if (error) {
6188	printf("Failed to add the pfz string with error %d\n", error);
6189	goto bad;
6190	}
6191	imgp->ip_applec++;
6192	}
6193
6194	/ adding the NANO_ENGAGE_KEY key /
6195	if (imgp->ip_px_sa) {
6196	struct _posix_spawnattr* psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
6197	int proc_flags = psa->psa_flags;
6198
6199	if ((proc_flags & _POSIX_SPAWN_NANO_ALLOCATOR) == _POSIX_SPAWN_NANO_ALLOCATOR) {
6200	const char *nano_string = NANO_ENGAGE_KEY;
6201	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(nano_string), seg: UIO_SYSSPACE, FALSE);
6202	if (error) {
6203	goto bad;
6204	}
6205	imgp->ip_applec++;
6206	}
6207	}
6208
6209	/*
6210	* Supply libc with a collection of random values to use when
6211	* implementing -fstack-protector.
6212	*
6213	* (The first random string always contains an embedded NUL so that
6214	* __stack_chk_guard also protects against C string vulnerabilities)
6215	*/
6216	error = exec_add_entropy_key(imgp, GUARD_KEY, GUARD_VALUES, TRUE);
6217	if (error) {
6218	goto bad;
6219	}
6220	imgp->ip_applec++;
6221
6222	/*
6223	* Supply libc with entropy for system malloc.
6224	*/
6225	error = exec_add_entropy_key(imgp, ENTROPY_KEY, ENTROPY_VALUES, FALSE);
6226	if (error) {
6227	goto bad;
6228	}
6229	imgp->ip_applec++;
6230
6231	/*
6232	* Supply libpthread & libplatform with a random value to use for pointer
6233	* obfuscation.
6234	*/
6235	error = exec_add_entropy_key(imgp, PTR_MUNGE_KEY, PTR_MUNGE_VALUES, FALSE);
6236	if (error) {
6237	goto bad;
6238	}
6239	imgp->ip_applec++;
6240
6241	/*
6242	* Add MAIN_STACK_KEY: Supplies the address and size of the main thread's
6243	* stack if it was allocated by the kernel.
6244	*
6245	* The guard page is not included in this stack size as libpthread
6246	* expects to add it back in after receiving this value.
6247	*/
6248	if (load_result->unixproc) {
6249	char stack_string[strlen(MAIN_STACK_KEY) + (HEX_STR_LEN + `1`) * MAIN_STACK_VALUES + `1`];
6250	snprintf(stack_string, count: sizeof(stack_string),
6251	MAIN_STACK_KEY "0x%llx,0x%llx,0x%llx,0x%llx",
6252	(uint64_t)load_result->user_stack,
6253	(uint64_t)load_result->user_stack_size,
6254	(uint64_t)load_result->user_stack_alloc,
6255	(uint64_t)load_result->user_stack_alloc_size);
6256	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(stack_string), seg: UIO_SYSSPACE, FALSE);
6257	if (error) {
6258	goto bad;
6259	}
6260	imgp->ip_applec++;
6261	}
6262
6263	if (imgp->ip_vattr) {
6264	uint64_t fsid = vnode_get_va_fsid(vap: imgp->ip_vattr);
6265	uint64_t fsobjid = imgp->ip_vattr->va_fileid;
6266
6267	char fsid_string[strlen(FSID_KEY) + strlen(FSID_MAX_STRING) + `1`];
6268	snprintf(fsid_string, count: sizeof(fsid_string),
6269	FSID_KEY "0x%llx,0x%llx", fsid, fsobjid);
6270	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(fsid_string), seg: UIO_SYSSPACE, FALSE);
6271	if (error) {
6272	goto bad;
6273	}
6274	imgp->ip_applec++;
6275	}
6276
6277	if (imgp->ip_dyld_fsid \|\| imgp->ip_dyld_fsobjid) {
6278	char fsid_string[strlen(DYLD_FSID_KEY) + strlen(FSID_MAX_STRING) + `1`];
6279	snprintf(fsid_string, count: sizeof(fsid_string),
6280	DYLD_FSID_KEY "0x%llx,0x%llx", imgp->ip_dyld_fsid, imgp->ip_dyld_fsobjid);
6281	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(fsid_string), seg: UIO_SYSSPACE, FALSE);
6282	if (error) {
6283	goto bad;
6284	}
6285	imgp->ip_applec++;
6286	}
6287
6288	uint8_t cdhash[SHA1_RESULTLEN];
6289	int cdhash_errror = ubc_cs_getcdhash(imgp->ip_vp, imgp->ip_arch_offset, cdhash);
6290	if (cdhash_errror == `0`) {
6291	char hash_string[strlen(CDHASH_KEY) + `2` * SHA1_RESULTLEN + `1`];
6292	strncpy(hash_string, CDHASH_KEY, sizeof(hash_string));
6293	char p = hash_string + sizeof*(CDHASH_KEY) - `1`;
6294	for (int i = `0`; i < SHA1_RESULTLEN; i++) {
6295	snprintf(p, count: `3`, "%02x", (int) cdhash[i]);
6296	p += `2`;
6297	}
6298	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(hash_string), seg: UIO_SYSSPACE, FALSE);
6299	if (error) {
6300	goto bad;
6301	}
6302	imgp->ip_applec++;
6303
6304	/ hash together cd-hash and boot-session-uuid /
6305	uint8_t sha_digest[SHA256_DIGEST_LENGTH];
6306	SHA256_CTX sha_ctx;
6307	SHA256_Init(ctx: &sha_ctx);
6308	SHA256_Update(ctx: &sha_ctx, data: &exe_boothash_salt, len: sizeof(exe_boothash_salt));
6309	SHA256_Update(ctx: &sha_ctx, data: bootsessionuuid_string, len: sizeof(bootsessionuuid_string));
6310	SHA256_Update(ctx: &sha_ctx, data: cdhash, len: sizeof(cdhash));
6311	SHA256_Final(digest: sha_digest, ctx: &sha_ctx);
6312	char app_boot_string[strlen(APP_BOOT_SESSION_KEY) + `2` * SHA1_RESULTLEN + `1`];
6313	strncpy(app_boot_string, APP_BOOT_SESSION_KEY, sizeof(app_boot_string));
6314	char s = app_boot_string + sizeof*(APP_BOOT_SESSION_KEY) - `1`;
6315	for (int i = `0`; i < SHA1_RESULTLEN; i++) {
6316	snprintf(s, count: `3`, "%02x", (int) sha_digest[i]);
6317	s += `2`;
6318	}
6319	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(app_boot_string), seg: UIO_SYSSPACE, FALSE);
6320	if (error) {
6321	goto bad;
6322	}
6323	imgp->ip_applec++;
6324	}
6325	#if (DEVELOPMENT \|\| DEBUG)
6326	if (dyld_flags) {
6327	char dyld_flags_string[strlen(DYLD_FLAGS_KEY) + HEX_STR_LEN + `1`];
6328	snprintf(dyld_flags_string, sizeof(dyld_flags_string), DYLD_FLAGS_KEY "0x%llx", dyld_flags);
6329	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(dyld_flags_string), UIO_SYSSPACE, FALSE);
6330	if (error) {
6331	goto bad;
6332	}
6333	imgp->ip_applec++;
6334	}
6335	#endif
6336	if (imgp->ip_subsystem_root_path) {
6337	size_t buffer_len = MAXPATHLEN + strlen(SUBSYSTEM_ROOT_PATH_KEY);
6338	char subsystem_root_path_string[buffer_len];
6339	int required_len = snprintf(subsystem_root_path_string, count: buffer_len, SUBSYSTEM_ROOT_PATH_KEY "%s", imgp->ip_subsystem_root_path);
6340
6341	if (((size_t)required_len >= buffer_len) \|\| (required_len < `0`)) {
6342	error = ENAMETOOLONG;
6343	goto bad;
6344	}
6345
6346	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(subsystem_root_path_string), seg: UIO_SYSSPACE, FALSE);
6347	if (error) {
6348	goto bad;
6349	}
6350
6351	imgp->ip_applec++;
6352	}
6353	#if __has_feature(ptrauth_calls)
6354	if (is_arm64e_running_as_arm64(imgp)) {
6355	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(PTRAUTH_DISABLED_FLAG), UIO_SYSSPACE, FALSE);
6356	if (error) {
6357	goto bad;
6358	}
6359
6360	imgp->ip_applec++;
6361	}
6362	#endif /* __has_feature(ptrauth_calls) */
6363
6364
6365	#if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
6366	{
6367	char dyld_abi_string[strlen(DYLD_ARM64E_ABI_KEY) + `8`];
6368	strlcpy(dyld_abi_string, DYLD_ARM64E_ABI_KEY, sizeof(dyld_abi_string));
6369	bool allowAll = bootarg_arm64e_preview_abi;
6370	strlcat(dyld_abi_string, (allowAll ? "all" : "os"), sizeof(dyld_abi_string));
6371	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(dyld_abi_string), UIO_SYSSPACE, FALSE);
6372	if (error) {
6373	goto bad;
6374	}
6375
6376	imgp->ip_applec++;
6377	}
6378	#endif
6379	/*
6380	* Add main thread mach port name
6381	* +1 uref on main thread port, this ref will be extracted by libpthread in __pthread_init
6382	* and consumed in _bsdthread_terminate. Leaking the main thread port name if not linked
6383	* against libpthread.
6384	*/
6385	if ((new_thread = imgp->ip_new_thread) != THREAD_NULL) {
6386	thread_reference(thread: new_thread);
6387	sright = convert_thread_to_port_pinned(new_thread);
6388	task_t new_task = get_threadtask(new_thread);
6389	mach_port_name_t name = ipc_port_copyout_send(sright, space: get_task_ipcspace(t: new_task));
6390	char port_name_hex_str[strlen(MAIN_TH_PORT_KEY) + HEX_STR_LEN32 + `1`];
6391	snprintf(port_name_hex_str, count: sizeof(port_name_hex_str), MAIN_TH_PORT_KEY "0x%x", name);
6392
6393	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(port_name_hex_str), seg: UIO_SYSSPACE, FALSE);
6394	if (error) {
6395	goto bad;
6396	}
6397	imgp->ip_applec++;
6398	}
6399
6400	#if XNU_TARGET_OS_OSX && _POSIX_SPAWN_FORCE_4K_PAGES && PMAP_CREATE_FORCE_4K_PAGES
6401	if (imgp->ip_px_sa != NULL) {
6402	struct _posix_spawnattr* psa = (struct _posix_spawnattr *) imgp->ip_px_sa;
6403	if (psa->psa_flags & _POSIX_SPAWN_FORCE_4K_PAGES) {
6404	const char *vm_force_4k_string = VM_FORCE_4K_PAGES_KEY;
6405	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(vm_force_4k_string), UIO_SYSSPACE, FALSE);
6406	if (error) {
6407	goto bad;
6408	}
6409	imgp->ip_applec++;
6410	}
6411	}
6412	#endif /* XNU_TARGET_OS_OSX && _POSIX_SPAWN_FORCE_4K_PAGES && PMAP_CREATE_FORCE_4K_PAGES */
6413
6414	/ adding the libmalloc experiment string /
6415	local_experiment_factors = os_atomic_load_wide(&libmalloc_experiment_factors, relaxed);
6416	if (__improbable(local_experiment_factors != `0`)) {
6417	char libmalloc_experiment_factors_string[strlen(LIBMALLOC_EXPERIMENT_FACTORS_KEY) + HEX_STR_LEN + `1`];
6418
6419	snprintf(
6420	libmalloc_experiment_factors_string,
6421	count: sizeof(libmalloc_experiment_factors_string),
6422	LIBMALLOC_EXPERIMENT_FACTORS_KEY "0x%llx",
6423	local_experiment_factors);
6424	error = exec_add_user_string(
6425	imgp,
6426	CAST_USER_ADDR_T(libmalloc_experiment_factors_string),
6427	seg: UIO_SYSSPACE,
6428	FALSE);
6429	if (error) {
6430	printf("Failed to add the libmalloc experiment factors string with error %d\n", error);
6431	goto bad;
6432	}
6433	imgp->ip_applec++;
6434	}
6435
6436
6437	/ tell dyld that it can leverage hardware for its read-only/read-write trusted path /
6438	if (imgp->ip_flags & IMGPF_HW_TPRO) {
6439	const char *dyld_hw_tpro = "dyld_hw_tpro=1";
6440	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(dyld_hw_tpro), seg: UIO_SYSSPACE, FALSE);
6441	if (error) {
6442	printf("Failed to add dyld hw tpro setting with error %d\n", error);
6443	goto bad;
6444	}
6445
6446	imgp->ip_applec++;
6447
6448	}
6449
6450	if (load_result->hardened_runtime_binary) {
6451	const size_t HR_STRING_SIZE = sizeof(HARDENED_RUNTIME_KEY) + HR_FLAGS_NUM_NIBBLES + `2` + `1`;
6452	char hardened_runtime[HR_STRING_SIZE];
6453	snprintf(hardened_runtime, count: HR_STRING_SIZE, HARDENED_RUNTIME_KEY"0x%x", load_result->hardened_runtime_binary);
6454	error = exec_add_user_string(imgp, CAST_USER_ADDR_T(hardened_runtime), seg: UIO_SYSSPACE, FALSE);
6455	if (error) {
6456	printf("Failed to add hardened runtime flag with error %d\n", error);
6457	goto bad;
6458	}
6459	imgp->ip_applec++;
6460	}
6461	/ Align the tail of the combined applev area /
6462	while (imgp->ip_strspace % img_ptr_size != `0`) {
6463	*imgp->ip_strendp++ = `'\0'`;
6464	imgp->ip_strspace--;
6465	}
6466
6467	bad:
6468	return error;
6469	}
6470
6471	/*
6472	* exec_check_permissions
6473	*
6474	* Description: Verify that the file that is being attempted to be executed
6475	* is in fact allowed to be executed based on it POSIX file
6476	* permissions and other access control criteria
6477	*
6478	* Parameters: struct image_params * the image parameter block
6479	*
6480	* Returns: 0 Success
6481	* EACCES Permission denied
6482	* ENOEXEC Executable file format error
6483	* ETXTBSY Text file busy [misuse of error code]
6484	* vnode_getattr:???
6485	* vnode_authorize:???
6486	*/
6487	static int
6488	exec_check_permissions(struct image_params *imgp)
6489	{
6490	struct vnode *vp = imgp->ip_vp;
6491	struct vnode_attr *vap = imgp->ip_vattr;
6492	proc_t p = vfs_context_proc(ctx: imgp->ip_vfs_context);
6493	int error;
6494	kauth_action_t action;
6495
6496	/ Only allow execution of regular files /
6497	if (!vnode_isreg(vp)) {
6498	return EACCES;
6499	}
6500
6501	/ Get the file attributes that we will be using here and elsewhere /
6502	VATTR_INIT(vap);
6503	VATTR_WANTED(vap, va_uid);
6504	VATTR_WANTED(vap, va_gid);
6505	VATTR_WANTED(vap, va_mode);
6506	VATTR_WANTED(vap, va_fsid);
6507	VATTR_WANTED(vap, va_fsid64);
6508	VATTR_WANTED(vap, va_fileid);
6509	VATTR_WANTED(vap, va_data_size);
6510	if ((error = vnode_getattr(vp, vap, ctx: imgp->ip_vfs_context)) != `0`) {
6511	return error;
6512	}
6513
6514	/*
6515	* Ensure that at least one execute bit is on - otherwise root
6516	* will always succeed, and we don't want to happen unless the
6517	* file really is executable.
6518	*/
6519	if (!vfs_authopaque(mp: vnode_mount(vp)) && ((vap->va_mode & (S_IXUSR \| S_IXGRP \| S_IXOTH)) == `0`)) {
6520	return EACCES;
6521	}
6522
6523	/ Disallow zero length files /
6524	if (vap->va_data_size == `0`) {
6525	return ENOEXEC;
6526	}
6527
6528	imgp->ip_arch_offset = (user_size_t)`0`;
6529	#if __LP64__
6530	imgp->ip_arch_size = vap->va_data_size;
6531	#else
6532	if (vap->va_data_size > UINT32_MAX) {
6533	return ENOEXEC;
6534	}
6535	imgp->ip_arch_size = (user_size_t)vap->va_data_size;
6536	#endif
6537
6538	/ Disable setuid-ness for traced programs or if MNT_NOSUID /
6539	if ((vp->v_mount->mnt_flag & MNT_NOSUID) \|\| (p->p_lflag & P_LTRACED)) {
6540	vap->va_mode &= ~(VSUID \| VSGID);
6541	}
6542
6543	/*
6544	* Disable _POSIX_SPAWN_ALLOW_DATA_EXEC and _POSIX_SPAWN_DISABLE_ASLR
6545	* flags for setuid/setgid binaries.
6546	*/
6547	if (vap->va_mode & (VSUID \| VSGID)) {
6548	imgp->ip_flags &= ~(IMGPF_ALLOW_DATA_EXEC \| IMGPF_DISABLE_ASLR);
6549	}
6550
6551	#if CONFIG_MACF
6552	error = mac_vnode_check_exec(ctx: imgp->ip_vfs_context, vp, imgp);
6553	if (error) {
6554	return error;
6555	}
6556	#endif
6557
6558	/ Check for execute permission /
6559	action = KAUTH_VNODE_EXECUTE;
6560	/ Traced images must also be readable /
6561	if (p->p_lflag & P_LTRACED) {
6562	action \|= KAUTH_VNODE_READ_DATA;
6563	}
6564	if ((error = vnode_authorize(vp, NULL, action, ctx: imgp->ip_vfs_context)) != `0`) {
6565	return error;
6566	}
6567
6568	#if 0
6569	/ Don't let it run if anyone had it open for writing /
6570	vnode_lock(vp);
6571	if (vp->v_writecount) {
6572	panic("going to return ETXTBSY %x", vp);
6573	vnode_unlock(vp);
6574	return ETXTBSY;
6575	}
6576	vnode_unlock(vp);
6577	#endif
6578
6579	/ XXX May want to indicate to underlying FS that vnode is open /
6580
6581	return error;
6582	}
6583
6584
6585	/*
6586	* exec_handle_sugid
6587	*
6588	* Initially clear the P_SUGID in the process flags; if an SUGID process is
6589	* exec'ing a non-SUGID image, then this is the point of no return.
6590	*
6591	* If the image being activated is SUGID, then replace the credential with a
6592	* copy, disable tracing (unless the tracing process is root), reset the
6593	* mach task port to revoke it, set the P_SUGID bit,
6594	*
6595	* If the saved user and group ID will be changing, then make sure it happens
6596	* to a new credential, rather than a shared one.
6597	*
6598	* Set the security token (this is probably obsolete, given that the token
6599	* should not technically be separate from the credential itself).
6600	*
6601	* Parameters: struct image_params * the image parameter block
6602	*
6603	* Returns: void No failure indication
6604	*
6605	* Implicit returns:
6606	* <process credential> Potentially modified/replaced
6607	* <task port> Potentially revoked
6608	* <process flags> P_SUGID bit potentially modified
6609	* <security token> Potentially modified
6610	*/
6611	__attribute__((noinline))
6612	static int
6613	exec_handle_sugid(struct image_params *imgp)
6614	{
6615	proc_t p = vfs_context_proc(ctx: imgp->ip_vfs_context);
6616	kauth_cred_t cred = vfs_context_ucred(ctx: imgp->ip_vfs_context);
6617	int i;
6618	int leave_sugid_clear = `0`;
6619	int mac_reset_ipc = `0`;
6620	int error = `0`;
6621	#if CONFIG_MACF
6622	int mac_transition, disjoint_cred = `0`;
6623	int label_update_return = `0`;
6624
6625	/*
6626	* Determine whether a call to update the MAC label will result in the
6627	* credential changing.
6628	*
6629	* Note: MAC policies which do not actually end up modifying
6630	* the label subsequently are strongly encouraged to
6631	* return 0 for this check, since a non-zero answer will
6632	* slow down the exec fast path for normal binaries.
6633	*/
6634	mac_transition = mac_cred_check_label_update_execve(
6635	ctx: imgp->ip_vfs_context,
6636	vp: imgp->ip_vp,
6637	offset: imgp->ip_arch_offset,
6638	scriptvp: imgp->ip_scriptvp,
6639	scriptvnodelabel: imgp->ip_scriptlabelp,
6640	execlabel: imgp->ip_execlabelp,
6641	proc: p,
6642	macextensions: &imgp->ip_px_smpx);
6643	#endif
6644
6645	OSBitAndAtomic(~((uint32_t)P_SUGID), &p->p_flag);
6646
6647	/*
6648	* Order of the following is important; group checks must go last,
6649	* as we use the success of the 'ismember' check combined with the
6650	* failure of the explicit match to indicate that we will be setting
6651	* the egid of the process even though the new process did not
6652	* require VSUID/VSGID bits in order for it to set the new group as
6653	* its egid.
6654	*
6655	* Note: Technically, by this we are implying a call to
6656	* setegid() in the new process, rather than implying
6657	* it used its VSGID bit to set the effective group,
6658	* even though there is no code in that process to make
6659	* such a call.
6660	*/
6661	if (((imgp->ip_origvattr->va_mode & VSUID) != `0` &&
6662	kauth_cred_getuid(cred: cred) != imgp->ip_origvattr->va_uid) \|\|
6663	((imgp->ip_origvattr->va_mode & VSGID) != `0` &&
6664	((kauth_cred_ismember_gid(cred: cred, gid: imgp->ip_origvattr->va_gid, resultp: &leave_sugid_clear) \|\| !leave_sugid_clear) \|\|
6665	(kauth_cred_getgid(cred: cred) != imgp->ip_origvattr->va_gid)))) {
6666	#if CONFIG_MACF
6667	/ label for MAC transition and neither VSUID nor VSGID /
6668	handle_mac_transition:
6669	#endif
6670
6671	#if CONFIG_SETUID
6672	/*
6673	* Replace the credential with a copy of itself if euid or
6674	* egid change.
6675	*
6676	* Note: setuid binaries will automatically opt out of
6677	* group resolver participation as a side effect
6678	* of this operation. This is an intentional
6679	* part of the security model, which requires a
6680	* participating credential be established by
6681	* escalating privilege, setting up all other
6682	* aspects of the credential including whether
6683	* or not to participate in external group
6684	* membership resolution, then dropping their
6685	* effective privilege to that of the desired
6686	* final credential state.
6687	*
6688	* Modifications to p_ucred must be guarded using the
6689	* proc's ucred lock. This prevents others from accessing
6690	* a garbage credential.
6691	*/
6692
6693	if (imgp->ip_origvattr->va_mode & VSUID) {
6694	kauth_cred_proc_update(p, action: PROC_SETTOKEN_NONE,
6695	fn: ^bool (kauth_cred_t parent __unused, kauth_cred_t model) {
6696	return kauth_cred_model_setresuid(model,
6697	KAUTH_UID_NONE,
6698	euid: imgp->ip_origvattr->va_uid,
6699	svuid: imgp->ip_origvattr->va_uid,
6700	KAUTH_UID_NONE);
6701	});
6702	}
6703
6704	if (imgp->ip_origvattr->va_mode & VSGID) {
6705	kauth_cred_proc_update(p, action: PROC_SETTOKEN_NONE,
6706	fn: ^bool (kauth_cred_t parent __unused, kauth_cred_t model) {
6707	return kauth_cred_model_setresgid(model,
6708	KAUTH_GID_NONE,
6709	egid: imgp->ip_origvattr->va_gid,
6710	svgid: imgp->ip_origvattr->va_gid);
6711	});
6712	}
6713	#endif /* CONFIG_SETUID */
6714
6715	#if CONFIG_MACF
6716	/*
6717	* If a policy has indicated that it will transition the label,
6718	* before making the call into the MAC policies, get a new
6719	* duplicate credential, so they can modify it without
6720	* modifying any others sharing it.
6721	*/
6722	if (mac_transition) {
6723	/*
6724	* This hook may generate upcalls that require
6725	* importance donation from the kernel.
6726	* (23925818)
6727	*/
6728	thread_t thread = current_thread();
6729	thread_enable_send_importance(thread, TRUE);
6730	kauth_proc_label_update_execve(p,
6731	ctx: imgp->ip_vfs_context,
6732	vp: imgp->ip_vp,
6733	offset: imgp->ip_arch_offset,
6734	scriptvp: imgp->ip_scriptvp,
6735	scriptlabel: imgp->ip_scriptlabelp,
6736	execlabel: imgp->ip_execlabelp,
6737	csflags: &imgp->ip_csflags,
6738	psattr: &imgp->ip_px_smpx,
6739	disjoint: &disjoint_cred, / will be non zero if disjoint /
6740	update_return: &label_update_return);
6741	thread_enable_send_importance(thread, FALSE);
6742
6743	if (disjoint_cred) {
6744	/*
6745	* If updating the MAC label resulted in a
6746	* disjoint credential, flag that we need to
6747	* set the P_SUGID bit. This protects
6748	* against debuggers being attached by an
6749	* insufficiently privileged process onto the
6750	* result of a transition to a more privileged
6751	* credential.
6752	*/
6753	leave_sugid_clear = `0`;
6754	}
6755
6756	imgp->ip_mac_return = label_update_return;
6757	}
6758
6759	mac_reset_ipc = mac_proc_check_inherit_ipc_ports(p, cur_vp: p->p_textvp, cur_offset: p->p_textoff, img_vp: imgp->ip_vp, img_offset: imgp->ip_arch_offset, scriptvp: imgp->ip_scriptvp);
6760
6761	#endif /* CONFIG_MACF */
6762
6763	/*
6764	* If 'leave_sugid_clear' is non-zero, then we passed the
6765	* VSUID and MACF checks, and successfully determined that
6766	* the previous cred was a member of the VSGID group, but
6767	* that it was not the default at the time of the execve,
6768	* and that the post-labelling credential was not disjoint.
6769	* So we don't set the P_SUGID or reset mach ports and fds
6770	* on the basis of simply running this code.
6771	*/
6772	if (mac_reset_ipc \|\| !leave_sugid_clear) {
6773	/*
6774	* Have mach reset the task and thread ports.
6775	* We don't want anyone who had the ports before
6776	* a setuid exec to be able to access/control the
6777	* task/thread after.
6778	*/
6779	ipc_task_reset(task: (imgp->ip_new_thread != NULL) ?
6780	get_threadtask(imgp->ip_new_thread) : proc_task(p));
6781	ipc_thread_reset(thread: (imgp->ip_new_thread != NULL) ?
6782	imgp->ip_new_thread : current_thread());
6783	}
6784
6785	if (!leave_sugid_clear) {
6786	/*
6787	* Flag the process as setuid.
6788	*/
6789	OSBitOrAtomic(P_SUGID, &p->p_flag);
6790
6791	/*
6792	* Radar 2261856; setuid security hole fix
6793	* XXX For setuid processes, attempt to ensure that
6794	* stdin, stdout, and stderr are already allocated.
6795	* We do not want userland to accidentally allocate
6796	* descriptors in this range which has implied meaning
6797	* to libc.
6798	*/
6799	for (i = `0`; i < `3`; i++) {
6800	if (fp_get_noref_locked(p, fd: i) != NULL) {
6801	continue;
6802	}
6803
6804	/*
6805	* Do the kernel equivalent of
6806	*
6807	* if i == 0
6808	* (void) open("/dev/null", O_RDONLY);
6809	* else
6810	* (void) open("/dev/null", O_WRONLY);
6811	*/
6812
6813	struct fileproc *fp;
6814	int indx;
6815	int flag;
6816	struct nameidata *ndp = NULL;
6817
6818	if (i == `0`) {
6819	flag = FREAD;
6820	} else {
6821	flag = FWRITE;
6822	}
6823
6824	if ((error = falloc_exec(p, imgp->ip_vfs_context,
6825	&fp, &indx)) != `0`) {
6826	continue;
6827	}
6828
6829	ndp = kalloc_type(struct nameidata,
6830	Z_WAITOK \| Z_ZERO \| Z_NOFAIL);
6831
6832	NDINIT(ndp, LOOKUP, OP_OPEN, FOLLOW, UIO_SYSSPACE,
6833	CAST_USER_ADDR_T("/dev/null"),
6834	imgp->ip_vfs_context);
6835
6836	if ((error = vn_open(ndp, fmode: flag, cmode: `0`)) != `0`) {
6837	fp_free(p, fd: indx, fp);
6838	kfree_type(struct nameidata, ndp);
6839	break;
6840	}
6841
6842	struct fileglob *fg = fp->fp_glob;
6843
6844	fg->fg_flag = flag;
6845	fg->fg_ops = &vnops;
6846	fp_set_data(fp, fg_data: ndp->ni_vp);
6847
6848	vnode_put(vp: ndp->ni_vp);
6849
6850	proc_fdlock(p);
6851	procfdtbl_releasefd(p, fd: indx, NULL);
6852	fp_drop(p, fd: indx, fp, locked: `1`);
6853	proc_fdunlock(p);
6854
6855	kfree_type(struct nameidata, ndp);
6856	}
6857	}
6858	}
6859	#if CONFIG_MACF
6860	else {
6861	/*
6862	* We are here because we were told that the MAC label will
6863	* be transitioned, and the binary is not VSUID or VSGID; to
6864	* deal with this case, we could either duplicate a lot of
6865	* code, or we can indicate we want to default the P_SUGID
6866	* bit clear and jump back up.
6867	*/
6868	if (mac_transition) {
6869	leave_sugid_clear = `1`;
6870	goto handle_mac_transition;
6871	}
6872	}
6873
6874	#endif /* CONFIG_MACF */
6875
6876	/ Update the process' identity version and set the security token /
6877	proc_setpidversion(p, OSIncrementAtomic(&nextpidversion));
6878	task_set_uniqueid(task: proc_task(p));
6879
6880	/*
6881	* Implement the semantic where the effective user and group become
6882	* the saved user and group in exec'ed programs.
6883	*/
6884	kauth_cred_proc_update(p, action: PROC_SETTOKEN_ALWAYS,
6885	fn: ^bool (kauth_cred_t parent __unused, kauth_cred_t model) {
6886	posix_cred_t pcred = posix_cred_get(cred: model);
6887
6888	if (pcred->cr_svuid == pcred->cr_uid &&
6889	pcred->cr_svgid == pcred->cr_gid) {
6890	return false;
6891	}
6892
6893	pcred->cr_svuid = pcred->cr_uid;
6894	pcred->cr_svgid = pcred->cr_gid;
6895	return true;
6896	});
6897
6898	return error;
6899	}
6900
6901
6902	/*
6903	* create_unix_stack
6904	*
6905	* Description: Set the user stack address for the process to the provided
6906	* address. If a custom stack was not set as a result of the
6907	* load process (i.e. as specified by the image file for the
6908	* executable), then allocate the stack in the provided map and
6909	* set up appropriate guard pages for enforcing administrative
6910	* limits on stack growth, if they end up being needed.
6911	*
6912	* Parameters: p Process to set stack on
6913	* load_result Information from mach-o load commands
6914	* map Address map in which to allocate the new stack
6915	*
6916	* Returns: KERN_SUCCESS Stack successfully created
6917	* !KERN_SUCCESS Mach failure code
6918	*/
6919	__attribute__((noinline))
6920	static kern_return_t
6921	create_unix_stack(vm_map_t map, load_result_t* load_result,
6922	proc_t p)
6923	{
6924	mach_vm_size_t size, prot_size;
6925	mach_vm_offset_t addr, prot_addr;
6926	kern_return_t kr;
6927
6928	mach_vm_address_t user_stack = load_result->user_stack;
6929
6930	proc_lock(p);
6931	p->user_stack = (uintptr_t)user_stack;
6932	if (load_result->custom_stack) {
6933	p->p_lflag \|= P_LCUSTOM_STACK;
6934	}
6935	proc_unlock(p);
6936	if (vm_map_page_shift(map) < (int)PAGE_SHIFT) {
6937	DEBUG4K_LOAD("map %p user_stack 0x%llx custom %d user_stack_alloc_size 0x%llx\n", map, user_stack, load_result->custom_stack, load_result->user_stack_alloc_size);
6938	}
6939
6940	if (load_result->user_stack_alloc_size > `0`) {
6941	/*
6942	* Allocate enough space for the maximum stack size we
6943	* will ever authorize and an extra page to act as
6944	* a guard page for stack overflows. For default stacks,
6945	* vm_initial_limit_stack takes care of the extra guard page.
6946	* Otherwise we must allocate it ourselves.
6947	*/
6948	if (mach_vm_round_page_overflow(in: load_result->user_stack_alloc_size, out: &size)) {
6949	return KERN_INVALID_ARGUMENT;
6950	}
6951	addr = vm_map_trunc_page(load_result->user_stack - size,
6952	vm_map_page_mask(map));
6953	kr = mach_vm_allocate_kernel(map, addr: &addr, size,
6954	VM_FLAGS_FIXED, VM_MEMORY_STACK);
6955	if (kr != KERN_SUCCESS) {
6956	// Can't allocate at default location, try anywhere
6957	addr = `0`;
6958	kr = mach_vm_allocate_kernel(map, addr: &addr, size,
6959	VM_FLAGS_ANYWHERE, VM_MEMORY_STACK);
6960	if (kr != KERN_SUCCESS) {
6961	return kr;
6962	}
6963
6964	user_stack = addr + size;
6965	load_result->user_stack = (user_addr_t)user_stack;
6966
6967	proc_lock(p);
6968	p->user_stack = (uintptr_t)user_stack;
6969	proc_unlock(p);
6970	}
6971
6972	load_result->user_stack_alloc = (user_addr_t)addr;
6973
6974	/*
6975	* And prevent access to what's above the current stack
6976	* size limit for this process.
6977	*/
6978	if (load_result->user_stack_size == `0`) {
6979	load_result->user_stack_size = proc_limitgetcur(p, RLIMIT_STACK);
6980	prot_size = vm_map_trunc_page(size - load_result->user_stack_size, vm_map_page_mask(map));
6981	} else {
6982	prot_size = PAGE_SIZE;
6983	}
6984
6985	prot_addr = addr;
6986	kr = mach_vm_protect(target_task: map,
6987	address: prot_addr,
6988	size: prot_size,
6989	FALSE,
6990	VM_PROT_NONE);
6991	if (kr != KERN_SUCCESS) {
6992	(void)mach_vm_deallocate(target: map, address: addr, size);
6993	return kr;
6994	}
6995	}
6996
6997	return KERN_SUCCESS;
6998	}
6999
7000	#include <sys/reboot.h>
7001
7002	/*
7003	* load_init_program_at_path
7004	*
7005	* Description: Load the "init" program; in most cases, this will be "launchd"
7006	*
7007	* Parameters: p Process to call execve() to create
7008	* the "init" program
7009	* scratch_addr Page in p, scratch space
7010	* path NULL terminated path
7011	*
7012	* Returns: KERN_SUCCESS Success
7013	* !KERN_SUCCESS See execve/mac_execve for error codes
7014	*
7015	* Notes: The process that is passed in is the first manufactured
7016	* process on the system, and gets here via bsd_ast() firing
7017	* for the first time. This is done to ensure that bsd_init()
7018	* has run to completion.
7019	*
7020	* The address map of the first manufactured process matches the
7021	* word width of the kernel. Once the self-exec completes, the
7022	* initproc might be different.
7023	*/
7024	static int
7025	load_init_program_at_path(proc_t p, user_addr_t scratch_addr, const char* path)
7026	{
7027	int retval[`2`];
7028	int error;
7029	struct execve_args init_exec_args;
7030	user_addr_t argv0 = USER_ADDR_NULL, argv1 = USER_ADDR_NULL;
7031
7032	/*
7033	* Validate inputs and pre-conditions
7034	*/
7035	assert(p);
7036	assert(scratch_addr);
7037	assert(path);
7038
7039	/*
7040	* Copy out program name.
7041	*/
7042	size_t path_length = strlen(s: path) + `1`;
7043	argv0 = scratch_addr;
7044	error = copyout(path, argv0, path_length);
7045	if (error) {
7046	return error;
7047	}
7048
7049	scratch_addr = USER_ADDR_ALIGN(scratch_addr + path_length, sizeof(user_addr_t));
7050
7051	/*
7052	* Put out first (and only) argument, similarly.
7053	* Assumes everything fits in a page as allocated above.
7054	*/
7055	if (boothowto & RB_SINGLE) {
7056	const char *init_args = "-s";
7057	size_t init_args_length = strlen(s: init_args) + `1`;
7058
7059	argv1 = scratch_addr;
7060	error = copyout(init_args, argv1, init_args_length);
7061	if (error) {
7062	return error;
7063	}
7064
7065	scratch_addr = USER_ADDR_ALIGN(scratch_addr + init_args_length, sizeof(user_addr_t));
7066	}
7067
7068	if (proc_is64bit(p)) {
7069	user64_addr_t argv64bit[`3`] = {};
7070
7071	argv64bit[`0`] = argv0;
7072	argv64bit[`1`] = argv1;
7073	argv64bit[`2`] = USER_ADDR_NULL;
7074
7075	error = copyout(argv64bit, scratch_addr, sizeof(argv64bit));
7076	if (error) {
7077	return error;
7078	}
7079	} else {
7080	user32_addr_t argv32bit[`3`] = {};
7081
7082	argv32bit[`0`] = (user32_addr_t)argv0;
7083	argv32bit[`1`] = (user32_addr_t)argv1;
7084	argv32bit[`2`] = USER_ADDR_NULL;
7085
7086	error = copyout(argv32bit, scratch_addr, sizeof(argv32bit));
7087	if (error) {
7088	return error;
7089	}
7090	}
7091
7092	/*
7093	* Set up argument block for fake call to execve.
7094	*/
7095	init_exec_args.fname = argv0;
7096	init_exec_args.argp = scratch_addr;
7097	init_exec_args.envp = USER_ADDR_NULL;
7098
7099	/*
7100	* So that init task is set with uid,gid 0 token
7101	*
7102	* The access to the cred is safe:
7103	* the proc isn't running yet, it's stable.
7104	*/
7105	set_security_token(p, cred: proc_ucred_unsafe(p));
7106
7107	return execve(p, uap: &init_exec_args, retval);
7108	}
7109
/*
 * Candidate paths for the init program, listed in the order in which
 * load_init_program() attempts them.  The internal variants are only
 * compiled in for DEBUG and/or DEVELOPMENT builds; "/sbin/launchd" is
 * always the last entry.
 */
static const char * init_programs[] = {
#if DEBUG
	"/usr/appleinternal/sbin/launchd.debug",
#endif
#if DEVELOPMENT || DEBUG
	"/usr/appleinternal/sbin/launchd.development",
#endif
	"/sbin/launchd",
};
7119
7120	/*
7121	* load_init_program
7122	*
7123	* Description: Load the "init" program; in most cases, this will be "launchd"
7124	*
7125	* Parameters: p Process to call execve() to create
7126	* the "init" program
7127	*
7128	* Returns: (void)
7129	*
7130	* Notes: The process that is passed in is the first manufactured
7131	* process on the system, and gets here via bsd_ast() firing
7132	* for the first time. This is done to ensure that bsd_init()
7133	* has run to completion.
7134	*
7135	* In DEBUG & DEVELOPMENT builds, the launchdsuffix boot-arg
7136	* may be used to select a specific launchd executable. As with
7137	* the kcsuffix boot-arg, setting launchdsuffix to "" or "release"
7138	* will force /sbin/launchd to be selected.
7139	*
7140	* Search order by build:
7141	*
7142	* DEBUG DEVELOPMENT RELEASE PATH
7143	* ----------------------------------------------------------------------------------
7144	* 1 1 NA /usr/appleinternal/sbin/launchd.$LAUNCHDSUFFIX
7145	* 2 NA NA /usr/appleinternal/sbin/launchd.debug
7146	* 3 2 NA /usr/appleinternal/sbin/launchd.development
7147	* 4 3 1 /sbin/launchd
7148	*/
7149	void
7150	load_init_program(proc_t p)
7151	{
7152	uint32_t i;
7153	int error;
7154	vm_map_t map = current_map();
7155	mach_vm_offset_t scratch_addr = `0`;
7156	mach_vm_size_t map_page_size = vm_map_page_size(map);
7157
7158	(void) mach_vm_allocate_kernel(map, addr: &scratch_addr, size: map_page_size, VM_FLAGS_ANYWHERE, VM_KERN_MEMORY_NONE);
7159	#if CONFIG_MEMORYSTATUS
7160	(void) memorystatus_init_at_boot_snapshot();
7161	#endif /* CONFIG_MEMORYSTATUS */
7162
7163	#if DEBUG \|\| DEVELOPMENT
7164	/ Check for boot-arg suffix first /
7165	char launchd_suffix[`64`];
7166	if (PE_parse_boot_argn("launchdsuffix", launchd_suffix, sizeof(launchd_suffix))) {
7167	char launchd_path[`128`];
7168	boolean_t is_release_suffix = ((launchd_suffix[`0`] == `0`) \|\|
7169	(strcmp(launchd_suffix, "release") == `0`));
7170
7171	if (is_release_suffix) {
7172	printf("load_init_program: attempting to load /sbin/launchd\n");
7173	error = load_init_program_at_path(p, (user_addr_t)scratch_addr, "/sbin/launchd");
7174	if (!error) {
7175	return;
7176	}
7177
7178	panic("Process 1 exec of launchd.release failed, errno %d", error);
7179	} else {
7180	strlcpy(launchd_path, "/usr/appleinternal/sbin/launchd.", sizeof(launchd_path));
7181	strlcat(launchd_path, launchd_suffix, sizeof(launchd_path));
7182
7183	printf("load_init_program: attempting to load %s\n", launchd_path);
7184	error = load_init_program_at_path(p, (user_addr_t)scratch_addr, launchd_path);
7185	if (!error) {
7186	return;
7187	} else if (error != ENOENT) {
7188	printf("load_init_program: failed loading %s: errno %d\n", launchd_path, error);
7189	}
7190	}
7191	}
7192	#endif
7193
7194	error = ENOENT;
7195	for (i = `0`; i < sizeof(init_programs) / sizeof(init_programs[`0`]); i++) {
7196	printf("load_init_program: attempting to load %s\n", init_programs[i]);
7197	error = load_init_program_at_path(p, scratch_addr: (user_addr_t)scratch_addr, path: init_programs[i]);
7198	if (!error) {
7199	return;
7200	} else if (error != ENOENT) {
7201	printf("load_init_program: failed loading %s: errno %d\n", init_programs[i], error);
7202	}
7203	}
7204
7205	panic("Process 1 exec of %s failed, errno %d", ((i == `0`) ? "<null>" : init_programs[i - `1`]), error);
7206	}
7207
7208	/*
7209	* load_return_to_errno
7210	*
7211	* Description: Convert a load_return_t (Mach error) to an errno (BSD error)
7212	*
7213	* Parameters: lrtn Mach error number
7214	*
7215	* Returns: (int) BSD error number
7216	* 0 Success
7217	* EBADARCH Bad architecture
7218	* EBADMACHO Bad Mach object file
7219	* ESHLIBVERS Bad shared library version
7220	* ENOMEM Out of memory/resource shortage
7221	* EACCES Access denied
7222	* ENOENT Entry not found (usually "file does
7223	* does not exist")
7224	* EIO An I/O error occurred
7225	* EBADEXEC The executable is corrupt/unknown
7226	*/
7227	static int
7228	load_return_to_errno(load_return_t lrtn)
7229	{
7230	switch (lrtn) {
7231	case LOAD_SUCCESS:
7232	return `0`;
7233	case LOAD_BADARCH:
7234	return EBADARCH;
7235	case LOAD_BADMACHO:
7236	case LOAD_BADMACHO_UPX:
7237	return EBADMACHO;
7238	case LOAD_SHLIB:
7239	return ESHLIBVERS;
7240	case LOAD_NOSPACE:
7241	case LOAD_RESOURCE:
7242	return ENOMEM;
7243	case LOAD_PROTECT:
7244	return EACCES;
7245	case LOAD_ENOENT:
7246	return ENOENT;
7247	case LOAD_IOERROR:
7248	return EIO;
7249	case LOAD_DECRYPTFAIL:
7250	return EAUTH;
7251	case LOAD_FAILURE:
7252	default:
7253	return EBADEXEC;
7254	}
7255	}
7256
7257	#include <mach/mach_types.h>
7258	#include <mach/vm_prot.h>
7259	#include <mach/semaphore.h>
7260	#include <mach/sync_policy.h>
7261	#include <kern/clock.h>
7262	#include <mach/kern_return.h>
7263
/*
 * execargs_alloc
 *
 * Description:	Allocate the block of memory used by the execve arguments.
 *		At the same time, we allocate a page so that we can read in
 *		the first page of the image.
 *
 * Parameters:	struct image_params *	the image parameter block
 *
 * Returns:	0			Success
 *		EINVAL			Invalid argument
 *		EACCES			Permission denied
 *		EINTR			Interrupted function
 *		ENOMEM			Not enough space
 *
 * Notes:	This is a temporary allocation into the kernel address space
 *		to enable us to copy arguments in from user space.  This is
 *		necessitated by not mapping the process calling execve() into
 *		the kernel address space during the execve() system call.
 *
 *		We assemble the argument and environment, etc., into this
 *		region before copying it as a single block into the child
 *		process address space (at the top or bottom of the stack,
 *		depending on which way the stack grows; see the function
 *		exec_copyout_strings() for details).
 *
 *		This ends up with a second (possibly unnecessary) copy compared
 *		with assembling the data directly into the child address space.
 *		However, since we cannot be guaranteed that the parent has
 *		not modified its environment, we cannot be sure that the data
 *		exists as a single block there either.
 */
7296
7297
7298	static int execargs_waiters = `0`;
7299	static LCK_MTX_DECLARE_ATTR(execargs_cache_lock, &proc_lck_grp, &proc_lck_attr);
7300
7301	static void
7302	execargs_lock_lock(void)
7303	{
7304	lck_mtx_lock_spin(lck: &execargs_cache_lock);
7305	}
7306
7307	static void
7308	execargs_lock_unlock(void)
7309	{
7310	lck_mtx_unlock(lck: &execargs_cache_lock);
7311	}
7312
7313	static wait_result_t
7314	execargs_lock_sleep(void)
7315	{
7316	return lck_mtx_sleep(lck: &execargs_cache_lock, lck_sleep_action: LCK_SLEEP_DEFAULT, event: &execargs_free_count, THREAD_INTERRUPTIBLE);
7317	}
7318
7319	static kern_return_t
7320	execargs_purgeable_allocate(char **execarg_address)
7321	{
7322	mach_vm_offset_t addr = `0`;
7323	kern_return_t kr = mach_vm_allocate_kernel(map: bsd_pageable_map, addr: &addr,
7324	BSD_PAGEABLE_SIZE_PER_EXEC, VM_FLAGS_ANYWHERE \| VM_FLAGS_PURGABLE,
7325	VM_KERN_MEMORY_NONE);
7326	execarg_address = (char* *)addr;
7327	assert(kr == KERN_SUCCESS);
7328	return kr;
7329	}
7330
7331	static kern_return_t
7332	execargs_purgeable_reference(void *execarg_address)
7333	{
7334	int state = VM_PURGABLE_NONVOLATILE;
7335	kern_return_t kr = vm_purgable_control(target_task: bsd_pageable_map, address: (vm_offset_t) execarg_address, VM_PURGABLE_SET_STATE, state: &state);
7336
7337	assert(kr == KERN_SUCCESS);
7338	return kr;
7339	}
7340
7341	static kern_return_t
7342	execargs_purgeable_volatilize(void *execarg_address)
7343	{
7344	int state = VM_PURGABLE_VOLATILE \| VM_PURGABLE_ORDERING_OBSOLETE;
7345	kern_return_t kr;
7346	kr = vm_purgable_control(target_task: bsd_pageable_map, address: (vm_offset_t) execarg_address, VM_PURGABLE_SET_STATE, state: &state);
7347
7348	assert(kr == KERN_SUCCESS);
7349
7350	return kr;
7351	}
7352
7353	static void
7354	execargs_wakeup_waiters(void)
7355	{
7356	thread_wakeup(&execargs_free_count);
7357	}
7358
7359	static int
7360	execargs_alloc(struct image_params *imgp)
7361	{
7362	kern_return_t kret;
7363	wait_result_t res;
7364	int i, cache_index = -`1`;
7365
7366	execargs_lock_lock();
7367
7368	while (execargs_free_count == `0`) {
7369	execargs_waiters++;
7370	res = execargs_lock_sleep();
7371	execargs_waiters--;
7372	if (res != THREAD_AWAKENED) {
7373	execargs_lock_unlock();
7374	return EINTR;
7375	}
7376	}
7377
7378	execargs_free_count--;
7379
7380	for (i = `0`; i < execargs_cache_size; i++) {
7381	vm_offset_t element = execargs_cache[i];
7382	if (element) {
7383	cache_index = i;
7384	imgp->ip_strings = (char *)(execargs_cache[i]);
7385	execargs_cache[i] = `0`;
7386	break;
7387	}
7388	}
7389
7390	assert(execargs_free_count >= `0`);
7391
7392	execargs_lock_unlock();
7393
7394	if (cache_index == -`1`) {
7395	kret = execargs_purgeable_allocate(execarg_address: &imgp->ip_strings);
7396	} else {
7397	kret = execargs_purgeable_reference(execarg_address: imgp->ip_strings);
7398	}
7399
7400	assert(kret == KERN_SUCCESS);
7401	if (kret != KERN_SUCCESS) {
7402	return ENOMEM;
7403	}
7404
7405	/ last page used to read in file headers /
7406	imgp->ip_vdata = imgp->ip_strings + (NCARGS + PAGE_SIZE);
7407	imgp->ip_strendp = imgp->ip_strings;
7408	imgp->ip_argspace = NCARGS;
7409	imgp->ip_strspace = (NCARGS + PAGE_SIZE);
7410
7411	return `0`;
7412	}
7413
7414	/*
7415	* execargs_free
7416	*
7417	* Description: Free the block of memory used by the execve arguments and the
7418	* first page of the executable by a previous call to the function
7419	* execargs_alloc().
7420	*
7421	* Parameters: struct image_params * the image parameter block
7422	*
7423	* Returns: 0 Success
7424	* EINVAL Invalid argument
7425	* EINTR Oeration interrupted
7426	*/
7427	static int
7428	execargs_free(struct image_params *imgp)
7429	{
7430	kern_return_t kret;
7431	int i;
7432	boolean_t needs_wakeup = FALSE;
7433
7434	kret = execargs_purgeable_volatilize(execarg_address: imgp->ip_strings);
7435
7436	execargs_lock_lock();
7437	execargs_free_count++;
7438
7439	for (i = `0`; i < execargs_cache_size; i++) {
7440	vm_offset_t element = execargs_cache[i];
7441	if (element == `0`) {
7442	execargs_cache[i] = (vm_offset_t) imgp->ip_strings;
7443	imgp->ip_strings = NULL;
7444	break;
7445	}
7446	}
7447
7448	assert(imgp->ip_strings == NULL);
7449
7450	if (execargs_waiters > `0`) {
7451	needs_wakeup = TRUE;
7452	}
7453
7454	execargs_lock_unlock();
7455
7456	if (needs_wakeup == TRUE) {
7457	execargs_wakeup_waiters();
7458	}
7459
7460	return kret == KERN_SUCCESS ? `0` : EINVAL;
7461	}
7462
7463	void
7464	uthread_set_exec_data(struct uthread uth, struct* image_params *imgp)
7465	{
7466	uth->uu_save.uus_exec_data.imgp = imgp;
7467	}
7468
7469	size_t
7470	thread_get_current_exec_path(char *path, size_t size)
7471	{
7472	struct uthread *uth = current_uthread();
7473	struct image_params *imgp = uth->uu_save.uus_exec_data.imgp;
7474	size_t string_size = `0`;
7475	char *exec_path;
7476
7477	if (path == NULL \|\| imgp == NULL \|\| imgp->ip_strings == NULL) {
7478	return `0`;
7479	}
7480
7481	exec_path = imgp->ip_strings + strlen(EXECUTABLE_KEY);
7482	string_size = imgp->ip_strendp - exec_path;
7483	string_size = MIN(MAXPATHLEN, string_size);
7484	string_size = MIN(size, string_size);
7485
7486	string_size = strlcpy(dst: path, src: exec_path, n: string_size);
7487	return string_size;
7488	}
7489	static void
7490	exec_resettextvp(proc_t p, struct image_params *imgp)
7491	{
7492	vnode_t vp;
7493	off_t offset;
7494	vnode_t tvp = p->p_textvp;
7495	int ret;
7496
7497	vp = imgp->ip_vp;
7498	offset = imgp->ip_arch_offset;
7499
7500	if (vp == NULLVP) {
7501	panic("exec_resettextvp: expected valid vp");
7502	}
7503
7504	ret = vnode_ref(vp);
7505	proc_lock(p);
7506	if (ret == `0`) {
7507	p->p_textvp = vp;
7508	p->p_textoff = offset;
7509	} else {
7510	p->p_textvp = NULLVP; / this is paranoia /
7511	p->p_textoff = `0`;
7512	}
7513	proc_unlock(p);
7514
7515	if (tvp != NULLVP) {
7516	if (vnode_getwithref(vp: tvp) == `0`) {
7517	vnode_rele(vp: tvp);
7518	vnode_put(vp: tvp);
7519	}
7520	}
7521	}
7522
7523	// Includes the 0-byte (therefore "SIZE" instead of "LEN").
7524	static const size_t CS_CDHASH_STRING_SIZE = CS_CDHASH_LEN * `2` + `1`;
7525
7526	static void
7527	cdhash_to_string(char str[CS_CDHASH_STRING_SIZE], uint8_t const * const cdhash)
7528	{
7529	static char const nibble[] = "0123456789abcdef";
7530
7531	/ Apparently still the safest way to get a hex representation*
7532	* of binary data.
7533	* xnu's printf routines have %*D/%20D in theory, but "not really", see:
7534	* <rdar://problem/33328859> confusion around %*D/%nD in printf
7535	*/
7536	for (int i = `0`; i < CS_CDHASH_LEN; ++i) {
7537	str[i * `2`] = nibble[(cdhash[i] & `0xf0`) >> `4`];
7538	str[i * `2` + `1`] = nibble[cdhash[i] & `0x0f`];
7539	}
7540	str[CS_CDHASH_STRING_SIZE - `1`] = `0`;
7541	}
7542
7543	/*
7544	* __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__
7545	*
7546	* Description: Waits for the userspace daemon to respond to the request
7547	* we made. Function declared non inline to be visible in
7548	* stackshots and spindumps as well as debugging.
7549	*/
7550	__attribute__((noinline)) int
7551	__EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__(mach_port_t task_access_port, int32_t new_pid)
7552	{
7553	return find_code_signature(task_access_port, new_pid);
7554	}
7555
7556	/*
7557	* Update signature dependent process state, called by
7558	* process_signature.
7559	*/
7560	static int
7561	proc_process_signature(proc_t p, os_reason_t *signature_failure_reason)
7562	{
7563	int error = `0`;
7564	char const *error_msg = NULL;
7565
7566	kern_return_t kr = machine_task_process_signature(task: proc_get_task_raw(proc: p), platform: proc_platform(p), sdk: proc_sdk(p), error_msg: &error_msg);
7567
7568	if (kr != KERN_SUCCESS) {
7569	error = EINVAL;
7570
7571	if (error_msg != NULL) {
7572	uint32_t error_msg_len = (uint32_t)strlen(s: error_msg) + `1`;
7573	mach_vm_address_t data_addr = `0`;
7574	int reason_error = `0`;
7575	int kcdata_error = `0`;
7576
7577	os_reason_t reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SECURITY_POLICY);
7578	reason->osr_flags = OS_REASON_FLAG_GENERATE_CRASH_REPORT \| OS_REASON_FLAG_CONSISTENT_FAILURE;
7579
7580	if ((reason_error = os_reason_alloc_buffer_noblock(cur_reason: reason,
7581	osr_bufsize: kcdata_estimate_required_buffer_size(num_items: `1`, payload_size: error_msg_len))) == `0` &&
7582	(kcdata_error = kcdata_get_memory_addr(data: &reason->osr_kcd_descriptor,
7583	EXIT_REASON_USER_DESC, size: error_msg_len,
7584	user_addr: &data_addr)) == KERN_SUCCESS) {
7585	kern_return_t mc_error = kcdata_memcpy(data: &reason->osr_kcd_descriptor, dst_addr: (mach_vm_address_t)data_addr,
7586	src_addr: error_msg, size: error_msg_len);
7587
7588	if (mc_error != KERN_SUCCESS) {
7589	printf("process_signature: failed to copy reason string (kcdata_memcpy error: %d)\n",
7590	mc_error);
7591	}
7592	} else {
7593	printf("failed to allocate space for reason string (os_reason_alloc_buffer error: %d, kcdata error: %d, length: %u)\n",
7594	reason_error, kcdata_error, error_msg_len);
7595	}
7596
7597	assert(signature_failure_reason == NULL); // shouldn't have gotten so far*
7598	*signature_failure_reason = reason;
7599	}
7600	}
7601	return error;
7602	}
7603
7604	static int
7605	process_signature(proc_t p, struct image_params *imgp)
7606	{
7607	mach_port_t port = IPC_PORT_NULL;
7608	kern_return_t kr = KERN_FAILURE;
7609	int error = EACCES;
7610	boolean_t unexpected_failure = FALSE;
7611	struct cs_blob *csb;
7612	boolean_t require_success = FALSE;
7613	int spawn = (imgp->ip_flags & IMGPF_SPAWN);
7614	const int vfexec = `0`;
7615	os_reason_t signature_failure_reason = OS_REASON_NULL;
7616
7617	/*
7618	* Override inherited code signing flags with the
7619	* ones for the process that is being successfully
7620	* loaded
7621	*/
7622	proc_lock(p);
7623	proc_csflags_update(p, imgp->ip_csflags);
7624	proc_unlock(p);
7625
7626	/ Set the switch_protect flag on the map /
7627	if (proc_getcsflags(p) & (CS_HARD \| CS_KILL)) {
7628	vm_map_switch_protect(map: get_task_map(proc_task(p)), TRUE);
7629	}
7630	/ set the cs_enforced flags in the map /
7631	if (proc_getcsflags(p) & CS_ENFORCEMENT) {
7632	vm_map_cs_enforcement_set(map: get_task_map(proc_task(p)), TRUE);
7633	} else {
7634	vm_map_cs_enforcement_set(map: get_task_map(proc_task(p)), FALSE);
7635	}
7636
7637	/*
7638	* image activation may be failed due to policy
7639	* which is unexpected but security framework does not
7640	* approve of exec, kill and return immediately.
7641	*/
7642	if (imgp->ip_mac_return != `0`) {
7643	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
7644	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_SECURITY_POLICY, `0`, `0`);
7645	signature_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_SECURITY_POLICY);
7646	error = imgp->ip_mac_return;
7647	unexpected_failure = TRUE;
7648	goto done;
7649	}
7650
7651	if (imgp->ip_cs_error != OS_REASON_NULL) {
7652	signature_failure_reason = imgp->ip_cs_error;
7653	imgp->ip_cs_error = OS_REASON_NULL;
7654	error = EACCES;
7655	goto done;
7656	}
7657
7658	/ call the launch constraints hook /
7659	os_reason_t launch_constraint_reason;
7660	if ((error = mac_proc_check_launch_constraints(curp: p, imgp, reasonp: &launch_constraint_reason)) != `0`) {
7661	signature_failure_reason = launch_constraint_reason;
7662	goto done;
7663	}
7664
7665	#if XNU_TARGET_OS_OSX
7666	/ Check for platform passed in spawn attr if iOS binary is being spawned /
7667	if (proc_platform(p) == PLATFORM_IOS) {
7668	struct _posix_spawnattr psa = (struct* _posix_spawnattr *) imgp->ip_px_sa;
7669	if (psa == NULL \|\| psa->psa_platform == `0`) {
7670	boolean_t no_sandbox_entitled = FALSE;
7671	#if DEBUG \|\| DEVELOPMENT
7672	/*
7673	* Allow iOS binaries to spawn on internal systems
7674	* if no-sandbox entitlement is present of unentitled_ios_sim_launch
7675	* boot-arg set to true
7676	*/
7677	if (unentitled_ios_sim_launch) {
7678	no_sandbox_entitled = TRUE;
7679	} else {
7680	no_sandbox_entitled = IOVnodeHasEntitlement(imgp->ip_vp,
7681	(int64_t)imgp->ip_arch_offset, "com.apple.private.security.no-sandbox");
7682	}
7683	#endif /* DEBUG \|\| DEVELOPMENT */
7684	if (!no_sandbox_entitled) {
7685	signature_failure_reason = os_reason_create(OS_REASON_EXEC,
7686	EXEC_EXIT_REASON_WRONG_PLATFORM);
7687	error = EACCES;
7688	goto done;
7689	}
7690	printf("Allowing spawn of iOS binary %s since it has "
7691	"com.apple.private.security.no-sandbox entitlement or unentitled_ios_sim_launch "
7692	"boot-arg set to true\n", p->p_name);
7693	} else if (psa->psa_platform != PLATFORM_IOS) {
7694	/ Simulator binary spawned with wrong platform /
7695	signature_failure_reason = os_reason_create(OS_REASON_EXEC,
7696	EXEC_EXIT_REASON_WRONG_PLATFORM);
7697	error = EACCES;
7698	goto done;
7699	} else {
7700	printf("Allowing spawn of iOS binary %s since correct platform was passed in spawn\n",
7701	p->p_name);
7702	}
7703	}
7704	#endif /* XNU_TARGET_OS_OSX */
7705
7706	/ If the code signature came through the image activation path, we skip the*
7707	* taskgated / externally attached path. */
7708	if (imgp->ip_csflags & CS_SIGNED) {
7709	error = `0`;
7710	goto done;
7711	}
7712
7713	/ The rest of the code is for signatures that either already have been externally*
7714	* attached (likely, but not necessarily by a previous run through the taskgated
7715	* path), or that will now be attached by taskgated. */
7716
7717	kr = task_get_task_access_port(proc_task(p), &port);
7718	if (KERN_SUCCESS != kr \|\| !IPC_PORT_VALID(port)) {
7719	error = `0`;
7720	if (require_success) {
7721	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
7722	proc_getpid(p), OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_TASK_ACCESS_PORT, `0`, `0`);
7723	signature_failure_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_TASK_ACCESS_PORT);
7724	error = EACCES;
7725	}
7726	goto done;
7727	}
7728
7729	/*
7730	* taskgated returns KERN_SUCCESS if it has completed its work
7731	* and the exec should continue, KERN_FAILURE if the exec should
7732	* fail, or it may error out with different error code in an
7733	* event of mig failure (e.g. process was signalled during the
7734	* rpc call, taskgated died, mig server died etc.).
7735	*/
7736
7737	kr = __EXEC_WAITING_ON_TASKGATED_CODE_SIGNATURE_UPCALL__(task_access_port: port, new_pid: proc_getpid(p));
7738	switch (kr) {
7739	case KERN_SUCCESS:
7740	error = `0`;
7741	break;
7742	case KERN_FAILURE:
7743	error = EACCES;
7744
7745	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
7746	proc_getpid(p), OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_TASKGATED_INVALID_SIG, `0`, `0`);
7747	signature_failure_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_TASKGATED_INVALID_SIG);
7748	goto done;
7749	default:
7750	error = EACCES;
7751
7752	KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
7753	proc_getpid(p), OS_REASON_EXEC, EXEC_EXIT_REASON_TASKGATED_OTHER, `0`, `0`);
7754	signature_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_TASKGATED_OTHER);
7755	unexpected_failure = TRUE;
7756	goto done;
7757	}
7758
7759	/ Only do this if exec_resettextvp() did not fail /
7760	if (p->p_textvp != NULLVP) {
7761	csb = ubc_cs_blob_get(p->p_textvp, -`1`, -`1`, p->p_textoff);
7762
7763	if (csb != NULL) {
7764	/ As the enforcement we can do here is very limited, we only allow things that*
7765	* are the only reason why this code path still exists:
7766	* Adhoc signed non-platform binaries without special cs_flags and without any
7767	* entitlements (unrestricted ones still pass AMFI). */
7768	if (
7769	/ Revalidate the blob if necessary through bumped generation count. /
7770	(ubc_cs_generation_check(p->p_textvp) == `0` \|\|
7771	ubc_cs_blob_revalidate(p->p_textvp, csb, imgp, `0`, proc_platform(p)) == `0`) &&
7772	/ Only CS_ADHOC, no CS_KILL, CS_HARD etc. /
7773	(csb->csb_flags & CS_ALLOWED_MACHO) == CS_ADHOC &&
7774	/ If it has a CMS blob, it's not adhoc. The CS_ADHOC flag can lie. /
7775	csblob_find_blob_bytes((const uint8_t *)csb->csb_mem_kaddr, csb->csb_mem_size,
7776	CSSLOT_SIGNATURESLOT,
7777	CSMAGIC_BLOBWRAPPER) == NULL &&
7778	/ It could still be in a trust cache (unlikely with CS_ADHOC), or a magic path. /
7779	csb->csb_platform_binary == `0` &&
7780	/ No entitlements, not even unrestricted ones. /
7781	csb->csb_entitlements_blob == NULL &&
7782	csb->csb_der_entitlements_blob == NULL) {
7783	proc_lock(p);
7784	proc_csflags_set(p, CS_SIGNED \| CS_VALID);
7785	proc_unlock(p);
7786	} else {
7787	uint8_t cdhash[CS_CDHASH_LEN];
7788	char cdhash_string[CS_CDHASH_STRING_SIZE];
7789	proc_getcdhash(p, cdhash);
7790	cdhash_to_string(str: cdhash_string, cdhash);
7791	printf("ignoring detached code signature on '%s' with cdhash '%s' "
7792	"because it is invalid, or not a simple adhoc signature.\n",
7793	p->p_name, cdhash_string);
7794	}
7795	}
7796	}
7797
7798	done:
7799	if (`0` == error) {
7800	/*
7801	* Update the new process's signature-dependent process
7802	* state.
7803	*/
7804
7805	error = proc_process_signature(p, signature_failure_reason: &signature_failure_reason);
7806	}
7807
7808	if (`0` == error) {
7809	/*
7810	* Update the new main thread's signature-dependent thread
7811	* state. This was also called when the thread was created,
7812	* but for the main thread the signature was not yet attached
7813	* at that time.
7814	*/
7815	kr = thread_process_signature(thread: imgp->ip_new_thread, task: proc_get_task_raw(proc: p));
7816
7817	if (kr != KERN_SUCCESS) {
7818	error = EINVAL;
7819	signature_failure_reason = os_reason_create(OS_REASON_EXEC, EXEC_EXIT_REASON_MACHINE_THREAD);
7820	}
7821	}
7822
7823	if (`0` == error) {
7824	/ The process's code signature related properties are*
7825	* fully set up, so this is an opportune moment to log
7826	* platform binary execution, if desired. */
7827	if (platform_exec_logging != `0` && csproc_get_platform_binary(p)) {
7828	uint8_t cdhash[CS_CDHASH_LEN];
7829	char cdhash_string[CS_CDHASH_STRING_SIZE];
7830	proc_getcdhash(p, cdhash);
7831	cdhash_to_string(str: cdhash_string, cdhash);
7832
7833	os_log(peLog, "CS Platform Exec Logging: Executing platform signed binary "
7834	"'%s' with cdhash %s\n", p->p_name, cdhash_string);
7835	}
7836	} else {
7837	if (!unexpected_failure) {
7838	proc_csflags_set(p, CS_KILLED);
7839	}
7840	/ make very sure execution fails /
7841	if (vfexec \|\| spawn) {
7842	assert(signature_failure_reason != OS_REASON_NULL);
7843	psignal_vfork_with_reason(p, new_task: proc_task(p), thread: imgp->ip_new_thread,
7844	SIGKILL, signal_reason: signature_failure_reason);
7845	signature_failure_reason = OS_REASON_NULL;
7846	error = `0`;
7847	} else {
7848	assert(signature_failure_reason != OS_REASON_NULL);
7849	psignal_with_reason(p, SIGKILL, signal_reason: signature_failure_reason);
7850	signature_failure_reason = OS_REASON_NULL;
7851	}
7852	}
7853
7854	if (port != IPC_PORT_NULL) {
7855	ipc_port_release_send(port);
7856	}
7857
7858	/ If we hit this, we likely would have leaked an exit reason /
7859	assert(signature_failure_reason == OS_REASON_NULL);
7860	return error;
7861	}
7862
7863	/*
7864	* Typically as soon as we start executing this process, the
7865	* first instruction will trigger a VM fault to bring the text
7866	* pages (as executable) into the address space, followed soon
7867	* thereafter by dyld data structures (for dynamic executable).
7868	* To optimize this, as well as improve support for hardware
7869	* debuggers that can only access resident pages present
7870	* in the process' page tables, we prefault some pages if
7871	* possible. Errors are non-fatal.
7872	*/
7873	#ifndef PREVENT_CALLER_STACK_USE
7874	#define PREVENT_CALLER_STACK_USE __attribute__((noinline))
7875	#endif
7876	static void PREVENT_CALLER_STACK_USE
7877	exec_prefault_data(proc_t p __unused, struct image_params imgp, load_result_t load_result)
7878	{
7879	int ret;
7880	size_t expected_all_image_infos_size;
7881	kern_return_t kr;
7882
7883	/*
7884	* Prefault executable or dyld entry point.
7885	*/
7886	if (vm_map_page_shift(map: current_map()) < (int)PAGE_SHIFT) {
7887	DEBUG4K_LOAD("entry_point 0x%llx\n", (uint64_t)load_result->entry_point);
7888	}
7889	kr = vm_fault(map: current_map(),
7890	vm_map_trunc_page(load_result->entry_point,
7891	vm_map_page_mask(current_map())),
7892	VM_PROT_READ \| VM_PROT_EXECUTE,
7893	FALSE, VM_KERN_MEMORY_NONE,
7894	THREAD_UNINT, NULL, pmap_addr: `0`);
7895	if (kr != KERN_SUCCESS) {
7896	DEBUG4K_ERROR("map %p va 0x%llx -> 0x%x\n", current_map(), (uint64_t)vm_map_trunc_page(load_result->entry_point, vm_map_page_mask(current_map())), kr);
7897	}
7898
7899	if (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) {
7900	expected_all_image_infos_size = sizeof(struct user64_dyld_all_image_infos);
7901	} else {
7902	expected_all_image_infos_size = sizeof(struct user32_dyld_all_image_infos);
7903	}
7904
7905	/ Decode dyld anchor structure from <mach-o/dyld_images.h> /
7906	if (load_result->dynlinker &&
7907	load_result->all_image_info_addr &&
7908	load_result->all_image_info_size >= expected_all_image_infos_size) {
7909	union {
7910	struct user64_dyld_all_image_infos infos64;
7911	struct user32_dyld_all_image_infos infos32;
7912	} all_image_infos;
7913
7914	/*
7915	* Pre-fault to avoid copyin() going through the trap handler
7916	* and recovery path.
7917	*/
7918	if (vm_map_page_shift(map: current_map()) < (int)PAGE_SHIFT) {
7919	DEBUG4K_LOAD("all_image_info_addr 0x%llx\n", load_result->all_image_info_addr);
7920	}
7921	kr = vm_fault(map: current_map(),
7922	vm_map_trunc_page(load_result->all_image_info_addr,
7923	vm_map_page_mask(current_map())),
7924	VM_PROT_READ \| VM_PROT_WRITE,
7925	FALSE, VM_KERN_MEMORY_NONE,
7926	THREAD_UNINT, NULL, pmap_addr: `0`);
7927	if (kr != KERN_SUCCESS) {
7928	// printf("%s:%d map %p va 0x%llx -> 0x%x\n", __FUNCTION__, __LINE__, current_map(), vm_map_trunc_page(load_result->all_image_info_addr, vm_map_page_mask(current_map())), kr);
7929	}
7930	if ((load_result->all_image_info_addr & PAGE_MASK) + expected_all_image_infos_size > PAGE_SIZE) {
7931	/ all_image_infos straddles a page /
7932	kr = vm_fault(map: current_map(),
7933	vm_map_trunc_page(load_result->all_image_info_addr + expected_all_image_infos_size - `1`,
7934	vm_map_page_mask(current_map())),
7935	VM_PROT_READ \| VM_PROT_WRITE,
7936	FALSE, VM_KERN_MEMORY_NONE,
7937	THREAD_UNINT, NULL, pmap_addr: `0`);
7938	if (kr != KERN_SUCCESS) {
7939	// printf("%s:%d map %p va 0x%llx -> 0x%x\n", __FUNCTION__, __LINE__, current_map(), vm_map_trunc_page(load_result->all_image_info_addr + expected_all_image_infos_size -1, vm_map_page_mask(current_map())), kr);
7940	}
7941	}
7942
7943	if (vm_map_page_shift(map: current_map()) < (int)PAGE_SHIFT) {
7944	DEBUG4K_LOAD("copyin(0x%llx, 0x%lx)\n", load_result->all_image_info_addr, expected_all_image_infos_size);
7945	}
7946	ret = copyin((user_addr_t)load_result->all_image_info_addr,
7947	&all_image_infos,
7948	expected_all_image_infos_size);
7949	if (ret == `0` && all_image_infos.infos32.version >= DYLD_ALL_IMAGE_INFOS_ADDRESS_MINIMUM_VERSION) {
7950	user_addr_t notification_address;
7951	user_addr_t dyld_image_address;
7952	user_addr_t dyld_version_address;
7953	user_addr_t dyld_all_image_infos_address;
7954	user_addr_t dyld_slide_amount;
7955
7956	if (imgp->ip_flags & IMGPF_IS_64BIT_ADDR) {
7957	notification_address = (user_addr_t)all_image_infos.infos64.notification;
7958	dyld_image_address = (user_addr_t)all_image_infos.infos64.dyldImageLoadAddress;
7959	dyld_version_address = (user_addr_t)all_image_infos.infos64.dyldVersion;
7960	dyld_all_image_infos_address = (user_addr_t)all_image_infos.infos64.dyldAllImageInfosAddress;
7961	} else {
7962	notification_address = all_image_infos.infos32.notification;
7963	dyld_image_address = all_image_infos.infos32.dyldImageLoadAddress;
7964	dyld_version_address = all_image_infos.infos32.dyldVersion;
7965	dyld_all_image_infos_address = all_image_infos.infos32.dyldAllImageInfosAddress;
7966	}
7967
7968	/*
7969	* dyld statically sets up the all_image_infos in its Mach-O
7970	* binary at static link time, with pointers relative to its default
7971	* load address. Since ASLR might slide dyld before its first
7972	* instruction is executed, "dyld_slide_amount" tells us how far
7973	* dyld was loaded compared to its default expected load address.
7974	* All other pointers into dyld's image should be adjusted by this
7975	* amount. At some point later, dyld will fix up pointers to take
7976	* into account the slide, at which point the all_image_infos_address
7977	* field in the structure will match the runtime load address, and
7978	* "dyld_slide_amount" will be 0, if we were to consult it again.
7979	*/
7980
7981	dyld_slide_amount = (user_addr_t)load_result->all_image_info_addr - dyld_all_image_infos_address;
7982
7983	#if 0
7984	kprintf("exec_prefault: 0x%016llx 0x%08x 0x%016llx 0x%016llx 0x%016llx 0x%016llx\n",
7985	(uint64_t)load_result->all_image_info_addr,
7986	all_image_infos.infos32.version,
7987	(uint64_t)notification_address,
7988	(uint64_t)dyld_image_address,
7989	(uint64_t)dyld_version_address,
7990	(uint64_t)dyld_all_image_infos_address);
7991	#endif
7992
7993	if (vm_map_page_shift(map: current_map()) < (int)PAGE_SHIFT) {
7994	DEBUG4K_LOAD("notification_address 0x%llx dyld_slide_amount 0x%llx\n", (uint64_t)notification_address, (uint64_t)dyld_slide_amount);
7995	}
7996	kr = vm_fault(map: current_map(),
7997	vm_map_trunc_page(notification_address + dyld_slide_amount,
7998	vm_map_page_mask(current_map())),
7999	VM_PROT_READ \| VM_PROT_EXECUTE,
8000	FALSE, VM_KERN_MEMORY_NONE,
8001	THREAD_UNINT, NULL, pmap_addr: `0`);
8002	if (kr != KERN_SUCCESS) {
8003	// printf("%s:%d map %p va 0x%llx -> 0x%x\n", __FUNCTION__, __LINE__, current_map(), vm_map_trunc_page(notification_address + dyld_slide_amount, vm_map_page_mask(current_map())), kr);
8004	}
8005	if (vm_map_page_shift(map: current_map()) < (int)PAGE_SHIFT) {
8006	DEBUG4K_LOAD("dyld_image_address 0x%llx dyld_slide_amount 0x%llx\n", (uint64_t)dyld_image_address, (uint64_t)dyld_slide_amount);
8007	}
8008	kr = vm_fault(map: current_map(),
8009	vm_map_trunc_page(dyld_image_address + dyld_slide_amount,
8010	vm_map_page_mask(current_map())),
8011	VM_PROT_READ \| VM_PROT_EXECUTE,
8012	FALSE, VM_KERN_MEMORY_NONE,
8013	THREAD_UNINT, NULL, pmap_addr: `0`);
8014	if (kr != KERN_SUCCESS) {
8015	// printf("%s:%d map %p va 0x%llx -> 0x%x\n", __FUNCTION__, __LINE__, current_map(), vm_map_trunc_page(dyld_image_address + dyld_slide_amount, vm_map_page_mask(current_map())), kr);
8016	}
8017	if (vm_map_page_shift(map: current_map()) < (int)PAGE_SHIFT) {
8018	DEBUG4K_LOAD("dyld_version_address 0x%llx dyld_slide_amount 0x%llx\n", (uint64_t)dyld_version_address, (uint64_t)dyld_slide_amount);
8019	}
8020	kr = vm_fault(map: current_map(),
8021	vm_map_trunc_page(dyld_version_address + dyld_slide_amount,
8022	vm_map_page_mask(current_map())),
8023	VM_PROT_READ,
8024	FALSE, VM_KERN_MEMORY_NONE,
8025	THREAD_UNINT, NULL, pmap_addr: `0`);
8026	if (kr != KERN_SUCCESS) {
8027	// printf("%s:%d map %p va 0x%llx -> 0x%x\n", __FUNCTION__, __LINE__, current_map(), vm_map_trunc_page(dyld_version_address + dyld_slide_amount, vm_map_page_mask(current_map())), kr);
8028	}
8029	if (vm_map_page_shift(map: current_map()) < (int)PAGE_SHIFT) {
8030	DEBUG4K_LOAD("dyld_all_image_infos_address 0x%llx dyld_slide_amount 0x%llx\n", (uint64_t)dyld_version_address, (uint64_t)dyld_slide_amount);
8031	}
8032	kr = vm_fault(map: current_map(),
8033	vm_map_trunc_page(dyld_all_image_infos_address + dyld_slide_amount,
8034	vm_map_page_mask(current_map())),
8035	VM_PROT_READ \| VM_PROT_WRITE,
8036	FALSE, VM_KERN_MEMORY_NONE,
8037	THREAD_UNINT, NULL, pmap_addr: `0`);
8038	if (kr != KERN_SUCCESS) {
8039	// printf("%s:%d map %p va 0x%llx -> 0x%x\n", __FUNCTION__, __LINE__, current_map(), vm_map_trunc_page(dyld_all_image_infos_address + dyld_slide_amount, vm_map_page_mask(current_map())), kr);
8040	}
8041	}
8042	}
8043	}
8044
8045	static int
8046	sysctl_libmalloc_experiments SYSCTL_HANDLER_ARGS
8047	{
8048	#pragma unused(oidp, arg2, req)
8049	int changed;
8050	errno_t error;
8051	uint64_t value = os_atomic_load_wide(&libmalloc_experiment_factors, relaxed);
8052
8053	error = sysctl_io_number(req, bigValue: value, valueSize: sizeof(value), pValue: &value, changed: &changed);
8054	if (error) {
8055	return error;
8056	}
8057
8058	if (changed) {
8059	os_atomic_store_wide(&libmalloc_experiment_factors, value, relaxed);
8060	}
8061
8062	return `0`;
8063	}
8064
8065	EXPERIMENT_FACTOR_PROC(_kern, libmalloc_experiments, CTLTYPE_QUAD \| CTLFLAG_RW, `0`, `0`, &sysctl_libmalloc_experiments, "A", "");
8066

Browse the source code of xnu/bsd/kern/kern_exec.c