vm_fault.c source code [xnu/osfmk/vm/vm_fault.c]

1	/*
2	* Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28	/*
29	* @OSF_COPYRIGHT@
30	*/
31	/*
32	* Mach Operating System
33	* Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34	* All Rights Reserved.
35	*
36	* Permission to use, copy, modify and distribute this software and its
37	* documentation is hereby granted, provided that both the copyright
38	* notice and this permission notice appear in all copies of the
39	* software, derivative works or modified versions, and any portions
40	* thereof, and that both notices appear in supporting documentation.
41	*
42	* CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43	* CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44	* ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45	*
46	* Carnegie Mellon requests users of this software to return to
47	*
48	* Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49	* School of Computer Science
50	* Carnegie Mellon University
51	* Pittsburgh PA 15213-3890
52	*
53	* any improvements or extensions that they make and grant Carnegie Mellon
54	* the rights to redistribute these changes.
55	*/
56	/*
57	*/
58	/*
59	* File: vm_fault.c
60	* Author: Avadis Tevanian, Jr., Michael Wayne Young
61	*
62	* Page fault handling module.
63	*/
64
65	#include <libkern/OSAtomic.h>
66
67	#include <mach/mach_types.h>
68	#include <mach/kern_return.h>
69	#include <mach/message.h> /* for error codes */
70	#include <mach/vm_param.h>
71	#include <mach/vm_behavior.h>
72	#include <mach/memory_object.h>
/* For memory_object_data_{request,unlock} */
74	#include <mach/sdt.h>
75
76	#include <kern/kern_types.h>
77	#include <kern/host_statistics.h>
78	#include <kern/counter.h>
79	#include <kern/task.h>
80	#include <kern/thread.h>
81	#include <kern/sched_prim.h>
82	#include <kern/host.h>
83	#include <kern/mach_param.h>
84	#include <kern/macro_help.h>
85	#include <kern/zalloc_internal.h>
86	#include <kern/misc_protos.h>
87	#include <kern/policy_internal.h>
88
89	#include <vm/vm_compressor.h>
90	#include <vm/vm_compressor_pager.h>
91	#include <vm/vm_fault.h>
92	#include <vm/vm_map.h>
93	#include <vm/vm_object.h>
94	#include <vm/vm_page.h>
95	#include <vm/vm_kern.h>
96	#include <vm/pmap.h>
97	#include <vm/vm_pageout.h>
98	#include <vm/vm_protos.h>
99	#include <vm/vm_external.h>
100	#include <vm/memory_object.h>
101	#include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
102	#include <vm/vm_shared_region.h>
103
104	#include <sys/codesign.h>
105	#include <sys/code_signing.h>
106	#include <sys/reason.h>
107	#include <sys/signalvar.h>
108
109	#include <sys/kdebug_triage.h>
110
111	#include <san/kasan.h>
112	#include <libkern/coreanalytics/coreanalytics.h>
113
114	#define VM_FAULT_CLASSIFY 0
115
116	#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
117
/*
 * Defaults to 1.  vm_fault_init() lets the
 * "vm_protect_privileged_from_untrusted" boot-arg override this value.
 */
int vm_protect_privileged_from_untrusted = 1;

/* Pagein throttle value (default 16); not consumed in this part of the file. */
unsigned int vm_object_pagein_throttle = 16;
121
122	/*
123	* We apply a hard throttle to the demand zero rate of tasks that we believe are running out of control which
124	* kicks in when swap space runs out. 64-bit programs have massive address spaces and can leak enormous amounts
125	* of memory if they're buggy and can run the system completely out of swap space. If this happens, we
126	* impose a hard throttle on them to prevent them from taking the last bit of memory left. This helps
127	* keep the UI active so that the user has a chance to kill the offending task before the system
128	* completely hangs.
129	*
130	* The hard throttle is only applied when the system is nearly completely out of swap space and is only applied
131	* to tasks that appear to be bloated. When swap runs out, any task using more than vm_hard_throttle_threshold
132	* will be throttled. The throttling is done by giving the thread that's trying to demand zero a page a
133	* delay of HARD_THROTTLE_DELAY microseconds before being allowed to try the page fault again.
134	*/
135
extern void throttle_lowpri_io(int);

extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);

/* Computed once from sane_size in vm_fault_init(). */
uint64_t vm_hard_throttle_threshold;

#if DEBUG || DEVELOPMENT
/* Set from the "text_corruption_panic" boot-arg in vm_fault_init(). */
static bool vmtc_panic_instead = false;
/* Set from the "panic_object_not_alive" boot-arg in vm_fault_init(). */
int panic_object_not_alive = 1;
#endif /* DEBUG || DEVELOPMENT */
146
147	OS_ALWAYS_INLINE
148	boolean_t
149	NEED_TO_HARD_THROTTLE_THIS_TASK(void)
150	{
151	return vm_wants_task_throttled(current_task()) \|\|
152	((vm_page_free_count < vm_page_throttle_limit \|\|
153	HARD_THROTTLE_LIMIT_REACHED()) &&
154	proc_get_effective_thread_policy(thread: current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED);
155	}
156
/* Delays handed back by vm_page_throttled(), in microseconds. */
#define HARD_THROTTLE_DELAY     10000   /* 10000 us == 10 ms */
#define SOFT_THROTTLE_DELAY     200     /* 200 us == .2 ms */

/* Page-creation rate limit: observation window and permitted rate. */
#define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS   6
#define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC  20000


/* Bump both the global and the per-thread decompression counters. */
#define VM_STAT_DECOMPRESSIONS()        \
MACRO_BEGIN                             \
	counter_inc(&vm_statistics_decompressions); \
	current_thread()->decompressions++; \
MACRO_END
169
170	boolean_t current_thread_aborted(void);
171
172	/ Forward declarations of internal routines. /
173	static kern_return_t vm_fault_wire_fast(
174	vm_map_t map,
175	vm_map_offset_t va,
176	vm_prot_t prot,
177	vm_tag_t wire_tag,
178	vm_map_entry_t entry,
179	pmap_t pmap,
180	vm_map_offset_t pmap_addr,
181	ppnum_t *physpage_p);
182
183	static kern_return_t vm_fault_internal(
184	vm_map_t map,
185	vm_map_offset_t vaddr,
186	vm_prot_t caller_prot,
187	boolean_t change_wiring,
188	vm_tag_t wire_tag,
189	int interruptible,
190	pmap_t pmap,
191	vm_map_offset_t pmap_addr,
192	ppnum_t *physpage_p);
193
194	static void vm_fault_copy_cleanup(
195	vm_page_t page,
196	vm_page_t top_page);
197
198	static void vm_fault_copy_dst_cleanup(
199	vm_page_t page);
200
201	#if VM_FAULT_CLASSIFY
202	extern void vm_fault_classify(vm_object_t object,
203	vm_object_offset_t offset,
204	vm_prot_t fault_type);
205
206	extern void vm_fault_classify_init(void);
207	#endif
208
/* pmap-enter counters; all start at zero. */
unsigned long vm_pmap_enter_blocked = 0;
unsigned long vm_pmap_enter_retried = 0;

/* Code-signing counters; all start at zero. */
unsigned long vm_cs_validates = 0;
unsigned long vm_cs_revalidates = 0;
unsigned long vm_cs_query_modified = 0;
unsigned long vm_cs_validated_dirtied = 0;
unsigned long vm_cs_bitmap_validated = 0;

#if CODE_SIGNING_MONITOR
uint64_t vm_cs_defer_to_csm = 0;
uint64_t vm_cs_defer_to_csm_not = 0;
#endif /* CODE_SIGNING_MONITOR */
222
223	void vm_pre_fault(vm_map_offset_t, vm_prot_t);
224
225	extern char *kdp_compressor_decompressed_page;
226	extern addr64_t kdp_compressor_decompressed_page_paddr;
227	extern ppnum_t kdp_compressor_decompressed_page_ppnum;
228
229	struct vmrtfr {
230	int vmrtfr_maxi;
231	int vmrtfr_curi;
232	int64_t vmrtf_total;
233	vm_rtfault_record_t *vm_rtf_records;
234	} vmrtfrs;
235	#define VMRTF_DEFAULT_BUFSIZE (4096)
236	#define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
237	TUNABLE(int, vmrtf_num_records, "vm_rtfault_records", VMRTF_NUM_RECORDS_DEFAULT);
238
239	static void vm_rtfrecord_lock(void);
240	static void vm_rtfrecord_unlock(void);
241	static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
242
243	extern lck_grp_t vm_page_lck_grp_bucket;
244	extern lck_attr_t vm_page_lck_attr;
245	LCK_SPIN_DECLARE_ATTR(vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
246
247	#if DEVELOPMENT \|\| DEBUG
248	extern int madvise_free_debug;
249	extern int madvise_free_debug_sometimes;
250	#endif /* DEVELOPMENT \|\| DEBUG */
251
252	extern int vm_pageout_protect_realtime;
253
254	#if CONFIG_FREEZE
255	#endif /* CONFIG_FREEZE */
256
257	/*
258	* Routine: vm_fault_init
259	* Purpose:
260	* Initialize our private data structures.
261	*/
262	__startup_func
263	void
264	vm_fault_init(void)
265	{
266	int i, vm_compressor_temp;
267	boolean_t need_default_val = TRUE;
268	/*
269	* Choose a value for the hard throttle threshold based on the amount of ram. The threshold is
270	* computed as a percentage of available memory, and the percentage used is scaled inversely with
271	* the amount of memory. The percentage runs between 10% and 35%. We use 35% for small memory systems
272	* and reduce the value down to 10% for very large memory configurations. This helps give us a
273	* definition of a memory hog that makes more sense relative to the amount of ram in the machine.
274	* The formula here simply uses the number of gigabytes of ram to adjust the percentage.
275	*/
276
277	vm_hard_throttle_threshold = sane_size * (`35` - MIN((int)(sane_size / (`1024` * `1024` * `1024`)), `25`)) / `100`;
278
279	/*
280	* Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
281	*/
282
283	if (PE_parse_boot_argn(arg_string: "vm_compressor", arg_ptr: &vm_compressor_temp, max_arg: sizeof(vm_compressor_temp))) {
284	for (i = `0`; i < VM_PAGER_MAX_MODES; i++) {
285	if (((vm_compressor_temp & (`1` << i)) == vm_compressor_temp)) {
286	need_default_val = FALSE;
287	vm_compressor_mode = vm_compressor_temp;
288	break;
289	}
290	}
291	if (need_default_val) {
292	printf(format: "Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
293	}
294	}
295	#if CONFIG_FREEZE
296	if (need_default_val) {
297	if (osenvironment_is_diagnostics()) {
298	printf("osenvironment == \"diagnostics\". Setting \"vm_compressor_mode\" to in-core compressor only\n");
299	vm_compressor_mode = VM_PAGER_COMPRESSOR_NO_SWAP;
300	need_default_val = false;
301	}
302	}
303	#endif /* CONFIG_FREEZE */
304	if (need_default_val) {
305	/ If no boot arg or incorrect boot arg, try device tree. /
306	PE_get_default(property_name: "kern.vm_compressor", property_ptr: &vm_compressor_mode, max_property: sizeof(vm_compressor_mode));
307	}
308	printf(format: "\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
309	vm_config_init();
310
311	PE_parse_boot_argn(arg_string: "vm_protect_privileged_from_untrusted",
312	arg_ptr: &vm_protect_privileged_from_untrusted,
313	max_arg: sizeof(vm_protect_privileged_from_untrusted));
314
315	#if DEBUG \|\| DEVELOPMENT
316	(void)PE_parse_boot_argn("text_corruption_panic", &vmtc_panic_instead, sizeof(vmtc_panic_instead));
317
318	if (kern_feature_override(KF_MADVISE_FREE_DEBUG_OVRD)) {
319	madvise_free_debug = `0`;
320	madvise_free_debug_sometimes = `0`;
321	}
322
323	PE_parse_boot_argn("panic_object_not_alive", &panic_object_not_alive, sizeof(panic_object_not_alive));
324	#endif /* DEBUG \|\| DEVELOPMENT */
325	}
326
327	__startup_func
328	static void
329	vm_rtfault_record_init(void)
330	{
331	size_t size;
332
333	vmrtf_num_records = MAX(vmrtf_num_records, `1`);
334	size = vmrtf_num_records * sizeof(vm_rtfault_record_t);
335	vmrtfrs.vm_rtf_records = zalloc_permanent_tag(size,
336	ZALIGN(vm_rtfault_record_t), VM_KERN_MEMORY_DIAG);
337	vmrtfrs.vmrtfr_maxi = vmrtf_num_records - `1`;
338	}
339	STARTUP(ZALLOC, STARTUP_RANK_MIDDLE, vm_rtfault_record_init);
340
341	/*
342	* Routine: vm_fault_cleanup
343	* Purpose:
344	* Clean up the result of vm_fault_page.
345	* Results:
346	* The paging reference for "object" is released.
347	* "object" is unlocked.
348	* If "top_page" is not null, "top_page" is
349	* freed and the paging reference for the object
350	* containing it is released.
351	*
352	* In/out conditions:
353	* "object" must be locked.
354	*/
355	void
356	vm_fault_cleanup(
357	vm_object_t object,
358	vm_page_t top_page)
359	{
360	vm_object_paging_end(object);
361	vm_object_unlock(object);
362
363	if (top_page != VM_PAGE_NULL) {
364	object = VM_PAGE_OBJECT(top_page);
365
366	vm_object_lock(object);
367	VM_PAGE_FREE(top_page);
368	vm_object_paging_end(object);
369	vm_object_unlock(object);
370	}
371	}
372
373	#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
374
375
376	boolean_t vm_page_deactivate_behind = TRUE;
377	/*
378	* default sizes given VM_BEHAVIOR_DEFAULT reference behavior
379	*/
380	#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128
381	#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */
382	/ we use it to size an array on the stack /
383
384	int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
385
386	#define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
387
388	/*
389	* vm_page_is_sequential
390	*
391	* Determine if sequential access is in progress
392	* in accordance with the behavior specified.
393	* Update state to indicate current access pattern.
394	*
395	* object must have at least the shared lock held
396	*/
397	static
398	void
399	vm_fault_is_sequential(
400	vm_object_t object,
401	vm_object_offset_t offset,
402	vm_behavior_t behavior)
403	{
404	vm_object_offset_t last_alloc;
405	int sequential;
406	int orig_sequential;
407
408	last_alloc = object->last_alloc;
409	sequential = object->sequential;
410	orig_sequential = sequential;
411
412	offset = vm_object_trunc_page(offset);
413	if (offset == last_alloc && behavior != VM_BEHAVIOR_RANDOM) {
414	/ re-faulting in the same page: no change in behavior /
415	return;
416	}
417
418	switch (behavior) {
419	case VM_BEHAVIOR_RANDOM:
420	/*
421	* reset indicator of sequential behavior
422	*/
423	sequential = `0`;
424	break;
425
426	case VM_BEHAVIOR_SEQUENTIAL:
427	if (offset && last_alloc == offset - PAGE_SIZE_64) {
428	/*
429	* advance indicator of sequential behavior
430	*/
431	if (sequential < MAX_SEQUENTIAL_RUN) {
432	sequential += PAGE_SIZE;
433	}
434	} else {
435	/*
436	* reset indicator of sequential behavior
437	*/
438	sequential = `0`;
439	}
440	break;
441
442	case VM_BEHAVIOR_RSEQNTL:
443	if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
444	/*
445	* advance indicator of sequential behavior
446	*/
447	if (sequential > -MAX_SEQUENTIAL_RUN) {
448	sequential -= PAGE_SIZE;
449	}
450	} else {
451	/*
452	* reset indicator of sequential behavior
453	*/
454	sequential = `0`;
455	}
456	break;
457
458	case VM_BEHAVIOR_DEFAULT:
459	default:
460	if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
461	/*
462	* advance indicator of sequential behavior
463	*/
464	if (sequential < `0`) {
465	sequential = `0`;
466	}
467	if (sequential < MAX_SEQUENTIAL_RUN) {
468	sequential += PAGE_SIZE;
469	}
470	} else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
471	/*
472	* advance indicator of sequential behavior
473	*/
474	if (sequential > `0`) {
475	sequential = `0`;
476	}
477	if (sequential > -MAX_SEQUENTIAL_RUN) {
478	sequential -= PAGE_SIZE;
479	}
480	} else {
481	/*
482	* reset indicator of sequential behavior
483	*/
484	sequential = `0`;
485	}
486	break;
487	}
488	if (sequential != orig_sequential) {
489	if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
490	/*
491	* if someone else has already updated object->sequential
492	* don't bother trying to update it or object->last_alloc
493	*/
494	return;
495	}
496	}
497	/*
498	* I'd like to do this with a OSCompareAndSwap64, but that
499	* doesn't exist for PPC... however, it shouldn't matter
500	* that much... last_alloc is maintained so that we can determine
501	* if a sequential access pattern is taking place... if only
502	* one thread is banging on this object, no problem with the unprotected
503	* update... if 2 or more threads are banging away, we run the risk of
504	* someone seeing a mangled update... however, in the face of multiple
505	* accesses, no sequential access pattern can develop anyway, so we
506	* haven't lost any real info.
507	*/
508	object->last_alloc = offset;
509	}
510
511	#if DEVELOPMENT \|\| DEBUG
512	uint64_t vm_page_deactivate_behind_count = `0`;
513	#endif /* DEVELOPMENT \|\| DEBUG */
514
515	/*
516	* vm_page_deactivate_behind
517	*
518	* Determine if sequential access is in progress
519	* in accordance with the behavior specified. If
520	* so, compute a potential page to deactivate and
521	* deactivate it.
522	*
523	* object must be locked.
524	*
525	* return TRUE if we actually deactivate a page
526	*/
527	static
528	boolean_t
529	vm_fault_deactivate_behind(
530	vm_object_t object,
531	vm_object_offset_t offset,
532	vm_behavior_t behavior)
533	{
534	int n;
535	int pages_in_run = `0`;
536	int max_pages_in_run = `0`;
537	int sequential_run;
538	int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
539	vm_object_offset_t run_offset = `0`;
540	vm_object_offset_t pg_offset = `0`;
541	vm_page_t m;
542	vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
543
544	pages_in_run = `0`;
545	#if TRACEFAULTPAGE
546	dbgTrace(`0xBEEF0018`, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); / (TEST/DEBUG) /
547	#endif
548	if (is_kernel_object(object) \|\| vm_page_deactivate_behind == FALSE \|\| (vm_object_trunc_page(offset) != offset)) {
549	/*
550	* Do not deactivate pages from the kernel object: they
551	* are not intended to become pageable.
552	* or we've disabled the deactivate behind mechanism
553	* or we are dealing with an offset that is not aligned to
554	* the system's PAGE_SIZE because in that case we will
555	* handle the deactivation on the aligned offset and, thus,
556	* the full PAGE_SIZE page once. This helps us avoid the redundant
557	* deactivates and the extra faults.
558	*/
559	return FALSE;
560	}
561	if ((sequential_run = object->sequential)) {
562	if (sequential_run < `0`) {
563	sequential_behavior = VM_BEHAVIOR_RSEQNTL;
564	sequential_run = `0` - sequential_run;
565	} else {
566	sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
567	}
568	}
569	switch (behavior) {
570	case VM_BEHAVIOR_RANDOM:
571	break;
572	case VM_BEHAVIOR_SEQUENTIAL:
573	if (sequential_run >= (int)PAGE_SIZE) {
574	run_offset = `0` - PAGE_SIZE_64;
575	max_pages_in_run = `1`;
576	}
577	break;
578	case VM_BEHAVIOR_RSEQNTL:
579	if (sequential_run >= (int)PAGE_SIZE) {
580	run_offset = PAGE_SIZE_64;
581	max_pages_in_run = `1`;
582	}
583	break;
584	case VM_BEHAVIOR_DEFAULT:
585	default:
586	{ vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
587
588	/*
589	* determine if the run of sequential accesss has been
590	* long enough on an object with default access behavior
591	* to consider it for deactivation
592	*/
593	if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == `0`) {
594	/*
595	* the comparisons between offset and behind are done
596	* in this kind of odd fashion in order to prevent wrap around
597	* at the end points
598	*/
599	if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
600	if (offset >= behind) {
601	run_offset = `0` - behind;
602	pg_offset = PAGE_SIZE_64;
603	max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
604	}
605	} else {
606	if (offset < -behind) {
607	run_offset = behind;
608	pg_offset = `0` - PAGE_SIZE_64;
609	max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
610	}
611	}
612	}
613	break;}
614	}
615	for (n = `0`; n < max_pages_in_run; n++) {
616	m = vm_page_lookup(object, offset: offset + run_offset + (n * pg_offset));
617
618	if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
619	page_run[pages_in_run++] = m;
620
621	/*
622	* by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
623	*
624	* a TLB flush isn't really needed here since at worst we'll miss the reference bit being
625	* updated in the PTE if a remote processor still has this mapping cached in its TLB when the
626	* new reference happens. If no futher references happen on the page after that remote TLB flushes
627	* we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
628	* by pageout_scan, which is just fine since the last reference would have happened quite far
629	* in the past (TLB caches don't hang around for very long), and of course could just as easily
630	* have happened before we did the deactivate_behind.
631	*/
632	pmap_clear_refmod_options(pn: VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
633	}
634	}
635	if (pages_in_run) {
636	vm_page_lockspin_queues();
637
638	for (n = `0`; n < pages_in_run; n++) {
639	m = page_run[n];
640
641	vm_page_deactivate_internal(page: m, FALSE);
642
643	#if DEVELOPMENT \|\| DEBUG
644	vm_page_deactivate_behind_count++;
645	#endif /* DEVELOPMENT \|\| DEBUG */
646
647	#if TRACEFAULTPAGE
648	dbgTrace(`0xBEEF0019`, (unsigned int) object, (unsigned int) m); / (TEST/DEBUG) /
649	#endif
650	}
651	vm_page_unlock_queues();
652
653	return TRUE;
654	}
655	return FALSE;
656	}
657
658
659	#if (DEVELOPMENT \|\| DEBUG)
660	uint32_t vm_page_creation_throttled_hard = `0`;
661	uint32_t vm_page_creation_throttled_soft = `0`;
662	uint64_t vm_page_creation_throttle_avoided = `0`;
663	#endif /* DEVELOPMENT \|\| DEBUG */
664
665	static int
666	vm_page_throttled(boolean_t page_kept)
667	{
668	clock_sec_t elapsed_sec;
669	clock_sec_t tv_sec;
670	clock_usec_t tv_usec;
671	task_t curtask = current_task_early();
672
673	thread_t thread = current_thread();
674
675	if (thread->options & TH_OPT_VMPRIV) {
676	return `0`;
677	}
678
679	if (curtask && !curtask->active) {
680	return `0`;
681	}
682
683	if (thread->t_page_creation_throttled) {
684	thread->t_page_creation_throttled = `0`;
685
686	if (page_kept == FALSE) {
687	goto no_throttle;
688	}
689	}
690	if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
691	#if (DEVELOPMENT \|\| DEBUG)
692	thread->t_page_creation_throttled_hard++;
693	OSAddAtomic(`1`, &vm_page_creation_throttled_hard);
694	#endif /* DEVELOPMENT \|\| DEBUG */
695	return HARD_THROTTLE_DELAY;
696	}
697
698	if ((vm_page_free_count < vm_page_throttle_limit \|\| (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
699	thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
700	if (vm_page_free_wanted == `0` && vm_page_free_wanted_privileged == `0`) {
701	#if (DEVELOPMENT \|\| DEBUG)
702	OSAddAtomic64(`1`, &vm_page_creation_throttle_avoided);
703	#endif
704	goto no_throttle;
705	}
706	clock_get_system_microtime(secs: &tv_sec, microsecs: &tv_usec);
707
708	elapsed_sec = tv_sec - thread->t_page_creation_time;
709
710	if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS \|\|
711	(thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
712	if (elapsed_sec >= (`3` * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
713	/*
714	* we'll reset our stats to give a well behaved app
715	* that was unlucky enough to accumulate a bunch of pages
716	* over a long period of time a chance to get out of
717	* the throttled state... we reset the counter and timestamp
718	* so that if it stays under the rate limit for the next second
719	* it will be back in our good graces... if it exceeds it, it
720	* will remain in the throttled state
721	*/
722	thread->t_page_creation_time = tv_sec;
723	thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - `1`);
724	}
725	VM_PAGEOUT_DEBUG(vm_page_throttle_count, `1`);
726
727	thread->t_page_creation_throttled = `1`;
728
729	if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
730	#if (DEVELOPMENT \|\| DEBUG)
731	thread->t_page_creation_throttled_hard++;
732	OSAddAtomic(`1`, &vm_page_creation_throttled_hard);
733	#endif /* DEVELOPMENT \|\| DEBUG */
734	return HARD_THROTTLE_DELAY;
735	} else {
736	#if (DEVELOPMENT \|\| DEBUG)
737	thread->t_page_creation_throttled_soft++;
738	OSAddAtomic(`1`, &vm_page_creation_throttled_soft);
739	#endif /* DEVELOPMENT \|\| DEBUG */
740	return SOFT_THROTTLE_DELAY;
741	}
742	}
743	thread->t_page_creation_time = tv_sec;
744	thread->t_page_creation_count = `0`;
745	}
746	no_throttle:
747	thread->t_page_creation_count++;
748
749	return `0`;
750	}
751
752	extern boolean_t vm_pageout_running;
753	static __attribute__((noinline, not_tail_called)) void
754	__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(
755	int throttle_delay)
756	{
757	/ make sure vm_pageout_scan() gets to work while we're throttled /
758	if (!vm_pageout_running) {
759	thread_wakeup((event_t)&vm_page_free_wanted);
760	}
761	delay(usec: throttle_delay);
762	}
763
764
765	/*
766	* check for various conditions that would
767	* prevent us from creating a ZF page...
768	* cleanup is based on being called from vm_fault_page
769	*
770	* object must be locked
771	* object == m->vmp_object
772	*/
773	static vm_fault_return_t
774	vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
775	{
776	int throttle_delay;
777
778	if (object->shadow_severed \|\|
779	VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
780	/*
781	* Either:
782	* 1. the shadow chain was severed,
783	* 2. the purgeable object is volatile or empty and is marked
784	* to fault on access while volatile.
785	* Just have to return an error at this point
786	*/
787	if (m != VM_PAGE_NULL) {
788	VM_PAGE_FREE(m);
789	}
790	vm_fault_cleanup(object, top_page: first_m);
791
792	thread_interrupt_level(interruptible: interruptible_state);
793
794	if (VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
795	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), arg: `0` / arg /);
796	}
797
798	if (object->shadow_severed) {
799	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), arg: `0` / arg /);
800	}
801	return VM_FAULT_MEMORY_ERROR;
802	}
803	if (page_throttle == TRUE) {
804	if ((throttle_delay = vm_page_throttled(FALSE))) {
805	/*
806	* we're throttling zero-fills...
807	* treat this as if we couldn't grab a page
808	*/
809	if (m != VM_PAGE_NULL) {
810	VM_PAGE_FREE(m);
811	}
812	vm_fault_cleanup(object, top_page: first_m);
813
814	VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, `0`, `0`, `0`);
815
816	__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
817
818	if (current_thread_aborted()) {
819	thread_interrupt_level(interruptible: interruptible_state);
820	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), arg: `0` / arg /);
821	return VM_FAULT_INTERRUPTED;
822	}
823	thread_interrupt_level(interruptible: interruptible_state);
824
825	return VM_FAULT_MEMORY_SHORTAGE;
826	}
827	}
828	return VM_FAULT_SUCCESS;
829	}
830
831	/*
832	* Clear the code signing bits on the given page_t
833	*/
834	static void
835	vm_fault_cs_clear(vm_page_t m)
836	{
837	m->vmp_cs_validated = VMP_CS_ALL_FALSE;
838	m->vmp_cs_tainted = VMP_CS_ALL_FALSE;
839	m->vmp_cs_nx = VMP_CS_ALL_FALSE;
840	}
841
842	/*
843	* Enqueues the given page on the throttled queue.
844	* The caller must hold the vm_page_queue_lock and it will be held on return.
845	*/
846	static void
847	vm_fault_enqueue_throttled_locked(vm_page_t m)
848	{
849	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
850	assert(!VM_PAGE_WIRED(m));
851
852	/*
853	* can't be on the pageout queue since we don't
854	* have a pager to try and clean to
855	*/
856	vm_page_queues_remove(mem: m, TRUE);
857	vm_page_check_pageable_safe(page: m);
858	vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
859	m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
860	vm_page_throttled_count++;
861	}
862
863	/*
864	* do the work to zero fill a page and
865	* inject it into the correct paging queue
866	*
867	* m->vmp_object must be locked
868	* page queue lock must NOT be held
869	*/
870	static int
871	vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
872	{
873	int my_fault = DBG_ZERO_FILL_FAULT;
874	vm_object_t object;
875
876	object = VM_PAGE_OBJECT(m);
877
878	/*
879	* This is is a zero-fill page fault...
880	*
881	* Checking the page lock is a waste of
882	* time; this page was absent, so
883	* it can't be page locked by a pager.
884	*
885	* we also consider it undefined
886	* with respect to instruction
887	* execution. i.e. it is the responsibility
888	* of higher layers to call for an instruction
889	* sync after changing the contents and before
890	* sending a program into this area. We
891	* choose this approach for performance
892	*/
893	vm_fault_cs_clear(m);
894	m->vmp_pmapped = TRUE;
895
896	if (no_zero_fill == TRUE) {
897	my_fault = DBG_NZF_PAGE_FAULT;
898
899	if (m->vmp_absent && m->vmp_busy) {
900	return my_fault;
901	}
902	} else {
903	vm_page_zero_fill(page: m);
904
905	counter_inc(&vm_statistics_zero_fill_count);
906	DTRACE_VM2(zfod, int, `1`, (uint64_t *), NULL);
907	}
908	assert(!m->vmp_laundry);
909	assert(!is_kernel_object(object));
910	//assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
911	if (!VM_DYNAMIC_PAGING_ENABLED() &&
912	(object->purgable == VM_PURGABLE_DENY \|\|
913	object->purgable == VM_PURGABLE_NONVOLATILE \|\|
914	object->purgable == VM_PURGABLE_VOLATILE)) {
915	vm_page_lockspin_queues();
916	if (!VM_DYNAMIC_PAGING_ENABLED()) {
917	vm_fault_enqueue_throttled_locked(m);
918	}
919	vm_page_unlock_queues();
920	}
921	return my_fault;
922	}
923
924
925	/*
926	* Routine: vm_fault_page
927	* Purpose:
928	* Find the resident page for the virtual memory
929	* specified by the given virtual memory object
930	* and offset.
931	* Additional arguments:
932	* The required permissions for the page is given
933	* in "fault_type". Desired permissions are included
934	* in "protection".
935	* fault_info is passed along to determine pagein cluster
936	* limits... it contains the expected reference pattern,
937	* cluster size if available, etc...
938	*
939	* If the desired page is known to be resident (for
940	* example, because it was previously wired down), asserting
941	* the "unwiring" parameter will speed the search.
942	*
943	* If the operation can be interrupted (by thread_abort
944	* or thread_terminate), then the "interruptible"
945	* parameter should be asserted.
946	*
947	* Results:
948	* The page containing the proper data is returned
949	* in "result_page".
950	*
951	* In/out conditions:
952	* The source object must be locked and referenced,
953	* and must donate one paging reference. The reference
954	* is not affected. The paging reference and lock are
955	* consumed.
956	*
957	* If the call succeeds, the object in which "result_page"
958	* resides is left locked and holding a paging reference.
959	* If this is not the original object, a busy page in the
960	* original object is returned in "top_page", to prevent other
961	* callers from pursuing this same data, along with a paging
962	* reference for the original object. The "top_page" should
963	* be destroyed when this guarantee is no longer required.
964	* The "result_page" is also left busy. It is not removed
965	* from the pageout queues.
966	* Special Case:
967	* A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
968	* fault succeeded but there's no VM page (i.e. the VM object
969	* does not actually hold VM pages, but device memory or
970	* large pages). The object is still locked and we still hold a
971	* paging_in_progress reference.
972	*/
973	unsigned int vm_fault_page_blocked_access = `0`; /* bumped in vm_fault_page() each time it had to sleep until object->blocked_access cleared */
974	unsigned int vm_fault_page_forced_retry = `0`; /* bumped in vm_fault_page() each time the first_object placeholder page is freed and a retry is forced (object->vo_copy == first_object) */
975
976	vm_fault_return_t
977	vm_fault_page(
978	/ Arguments: /
979	vm_object_t first_object, / Object to begin search /
980	vm_object_offset_t first_offset, / Offset into object /
981	vm_prot_t fault_type, / What access is requested /
982	boolean_t must_be_resident,/ Must page be resident? /
983	boolean_t caller_lookup, / caller looked up page /
984	/ Modifies in place: /
985	vm_prot_t protection, /* Protection for mapping /
986	vm_page_t result_page, /* Page found, if successful /
987	/ Returns: /
988	vm_page_t top_page, /* Page in top object, if*
989	* not result_page. */
990	int type_of_fault, /* if non-null, fill in with type of fault*
991	* COW, zero-fill, etc... returned in trace point */
992	/ More arguments: /
993	kern_return_t error_code, /* code if page is in error /
994	boolean_t no_zero_fill, / don't zero fill absent pages /
995	vm_object_fault_info_t fault_info)
996	{
997	vm_page_t m;
998	vm_object_t object;
999	vm_object_offset_t offset;
1000	vm_page_t first_m;
1001	vm_object_t next_object;
1002	vm_object_t copy_object;
1003	boolean_t look_for_page;
1004	boolean_t force_fault_retry = FALSE;
1005	vm_prot_t access_required = fault_type;
1006	vm_prot_t wants_copy_flag;
1007	kern_return_t wait_result;
1008	wait_interrupt_t interruptible_state;
1009	boolean_t data_already_requested = FALSE;
1010	vm_behavior_t orig_behavior;
1011	vm_size_t orig_cluster_size;
1012	vm_fault_return_t error;
1013	int my_fault;
1014	uint32_t try_failed_count;
1015	int interruptible; / how may fault be interrupted? /
1016	int external_state = VM_EXTERNAL_STATE_UNKNOWN;
1017	memory_object_t pager;
1018	vm_fault_return_t retval;
1019	int grab_options;
1020	bool clear_absent_on_error = false;
1021
1022	/*
1023	* MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
1024	* marked as paged out in the compressor pager or the pager doesn't exist.
1025	* Note also that if the pager for an internal object
1026	* has not been created, the pager is not invoked regardless of the value
1027	* of MUST_ASK_PAGER().
1028	*
1029	* PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
1030	* is marked as paged out in the compressor pager.
1031	* PAGED_OUT() is used to determine if a page has already been pushed
1032	* into a copy object in order to avoid a redundant page out operation.
1033	*/
1034	#define MUST_ASK_PAGER(o, f, s) \
1035	((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
1036
1037	#define PAGED_OUT(o, f) \
1038	(VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
1039
1040	/*
1041	* Recovery actions
1042	*/
1043	#define RELEASE_PAGE(m) \
1044	MACRO_BEGIN \
1045	PAGE_WAKEUP_DONE(m); \
1046	if ( !VM_PAGE_PAGEABLE(m)) { \
1047	vm_page_lockspin_queues(); \
1048	if (clear_absent_on_error && m->vmp_absent) {\
1049	vm_page_zero_fill(m); \
1050	counter_inc(&vm_statistics_zero_fill_count);\
1051	DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);\
1052	m->vmp_absent = false; \
1053	} \
1054	if ( !VM_PAGE_PAGEABLE(m)) { \
1055	if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \
1056	vm_page_deactivate(m); \
1057	else \
1058	vm_page_activate(m); \
1059	} \
1060	vm_page_unlock_queues(); \
1061	} \
1062	clear_absent_on_error = false; \
1063	MACRO_END
1064
1065	#if TRACEFAULTPAGE
1066	dbgTrace(`0xBEEF0002`, (unsigned int) first_object, (unsigned int) first_offset); / (TEST/DEBUG) /
1067	#endif
1068
1069	interruptible = fault_info->interruptible;
1070	interruptible_state = thread_interrupt_level(interruptible);
1071
1072	/*
1073	* INVARIANTS (through entire routine):
1074	*
1075	* 1) At all times, we must either have the object
1076	* lock or a busy page in some object to prevent
1077	* some other thread from trying to bring in
1078	* the same page.
1079	*
1080	* Note that we cannot hold any locks during the
1081	* pager access or when waiting for memory, so
1082	* we use a busy page then.
1083	*
1084	* 2) To prevent another thread from racing us down the
1085	* shadow chain and entering a new page in the top
1086	* object before we do, we must keep a busy page in
1087	* the top object while following the shadow chain.
1088	*
1089	* 3) We must increment paging_in_progress on any object
1090	* for which we have a busy page before dropping
1091	* the object lock
1092	*
1093	* 4) We leave busy pages on the pageout queues.
1094	* If the pageout daemon comes across a busy page,
1095	* it will remove the page from the pageout queues.
1096	*/
1097
1098	object = first_object;
1099	offset = first_offset;
1100	first_m = VM_PAGE_NULL;
1101	access_required = fault_type;
1102
1103	/*
1104	* default type of fault
1105	*/
1106	my_fault = DBG_CACHE_HIT_FAULT;
1107	thread_pri_floor_t token;
1108	bool drop_floor = false;
1109
1110	while (TRUE) {
1111	#if TRACEFAULTPAGE
1112	dbgTrace(`0xBEEF0003`, (unsigned int) `0`, (unsigned int) `0`); / (TEST/DEBUG) /
1113	#endif
1114
1115	grab_options = `0`;
1116	#if CONFIG_SECLUDED_MEMORY
1117	if (object->can_grab_secluded) {
1118	grab_options \|= VM_PAGE_GRAB_SECLUDED;
1119	}
1120	#endif /* CONFIG_SECLUDED_MEMORY */
1121
1122	if (!object->alive) {
1123	/*
1124	* object is no longer valid
1125	* clean up and return error
1126	*/
1127	#if DEVELOPMENT \|\| DEBUG
1128	printf("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->vo_copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1129	if (panic_object_not_alive) {
1130	panic("FBDP rdar://93769854 %s:%d object %p internal %d pager %p (%s) copy %p shadow %p alive %d terminating %d named %d ref %d shadow_severed %d\n", __FUNCTION__, __LINE__, object, object->internal, object->pager, object->pager ? object->pager->mo_pager_ops->memory_object_pager_name : "?", object->vo_copy, object->shadow, object->alive, object->terminating, object->named, object->ref_count, object->shadow_severed);
1131	}
1132	#endif /* DEVELOPMENT \|\| DEBUG */
1133	vm_fault_cleanup(object, top_page: first_m);
1134	thread_interrupt_level(interruptible: interruptible_state);
1135
1136	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_NOT_ALIVE), arg: `0` / arg /);
1137	return VM_FAULT_MEMORY_ERROR;
1138	}
1139
1140	if (!object->pager_created && object->phys_contiguous) {
1141	/*
1142	* A physically-contiguous object without a pager:
1143	* must be a "large page" object. We do not deal
1144	* with VM pages for this object.
1145	*/
1146	caller_lookup = FALSE;
1147	m = VM_PAGE_NULL;
1148	goto phys_contig_object;
1149	}
1150
1151	if (object->blocked_access) {
1152	/*
1153	* Access to this VM object has been blocked.
1154	* Replace our "paging_in_progress" reference with
1155	* a "activity_in_progress" reference and wait for
1156	* access to be unblocked.
1157	*/
1158	caller_lookup = FALSE; / no longer valid after sleep /
1159	vm_object_activity_begin(object);
1160	vm_object_paging_end(object);
1161	while (object->blocked_access) {
1162	vm_object_sleep(object,
1163	VM_OBJECT_EVENT_UNBLOCKED,
1164	THREAD_UNINT);
1165	}
1166	vm_fault_page_blocked_access++;
1167	vm_object_paging_begin(object);
1168	vm_object_activity_end(object);
1169	}
1170
1171	/*
1172	* See whether the page at 'offset' is resident
1173	*/
1174	if (caller_lookup == TRUE) {
1175	/*
1176	* The caller has already looked up the page
1177	* and gave us the result in "result_page".
1178	* We can use this for the first lookup but
1179	* it loses its validity as soon as we unlock
1180	* the object.
1181	*/
1182	m = *result_page;
1183	caller_lookup = FALSE; / no longer valid after that /
1184	} else {
1185	m = vm_page_lookup(object, vm_object_trunc_page(offset));
1186	}
1187	#if TRACEFAULTPAGE
1188	dbgTrace(`0xBEEF0004`, (unsigned int) m, (unsigned int) object); / (TEST/DEBUG) /
1189	#endif
1190	if (m != VM_PAGE_NULL) {
1191	if (m->vmp_busy) {
1192	/*
1193	* The page is being brought in,
1194	* wait for it and then retry.
1195	*/
1196	#if TRACEFAULTPAGE
1197	dbgTrace(`0xBEEF0005`, (unsigned int) m, (unsigned int) `0`); / (TEST/DEBUG) /
1198	#endif
1199	wait_result = PAGE_SLEEP(object, m, interruptible);
1200
1201	if (wait_result != THREAD_AWAKENED) {
1202	vm_fault_cleanup(object, top_page: first_m);
1203	thread_interrupt_level(interruptible: interruptible_state);
1204
1205	if (wait_result == THREAD_RESTART) {
1206	return VM_FAULT_RETRY;
1207	} else {
1208	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), arg: `0` / arg /);
1209	return VM_FAULT_INTERRUPTED;
1210	}
1211	}
1212	continue;
1213	}
1214	if (m->vmp_laundry) {
1215	m->vmp_free_when_done = FALSE;
1216
1217	if (!m->vmp_cleaning) {
1218	vm_pageout_steal_laundry(page: m, FALSE);
1219	}
1220	}
1221	vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
1222	if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1223	/*
1224	* Guard page: off limits !
1225	*/
1226	if (fault_type == VM_PROT_NONE) {
1227	/*
1228	* The fault is not requesting any
1229	* access to the guard page, so it must
1230	* be just to wire or unwire it.
1231	* Let's pretend it succeeded...
1232	*/
1233	m->vmp_busy = TRUE;
1234	*result_page = m;
1235	assert(first_m == VM_PAGE_NULL);
1236	*top_page = first_m;
1237	if (type_of_fault) {
1238	*type_of_fault = DBG_GUARD_FAULT;
1239	}
1240	thread_interrupt_level(interruptible: interruptible_state);
1241	return VM_FAULT_SUCCESS;
1242	} else {
1243	/*
1244	* The fault requests access to the
1245	* guard page: let's deny that !
1246	*/
1247	vm_fault_cleanup(object, top_page: first_m);
1248	thread_interrupt_level(interruptible: interruptible_state);
1249	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_GUARDPAGE_FAULT), arg: `0` / arg /);
1250	return VM_FAULT_MEMORY_ERROR;
1251	}
1252	}
1253
1254
1255	if (m->vmp_error) {
1256	/*
1257	* The page is in error, give up now.
1258	*/
1259	#if TRACEFAULTPAGE
1260	dbgTrace(`0xBEEF0006`, (unsigned int) m, (unsigned int) error_code); / (TEST/DEBUG) /
1261	#endif
1262	if (error_code) {
1263	*error_code = KERN_MEMORY_ERROR;
1264	}
1265	VM_PAGE_FREE(m);
1266
1267	vm_fault_cleanup(object, top_page: first_m);
1268	thread_interrupt_level(interruptible: interruptible_state);
1269
1270	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_ERROR), arg: `0` / arg /);
1271	return VM_FAULT_MEMORY_ERROR;
1272	}
1273	if (m->vmp_restart) {
1274	/*
1275	* The pager wants us to restart
1276	* at the top of the chain,
1277	* typically because it has moved the
1278	* page to another pager, then do so.
1279	*/
1280	#if TRACEFAULTPAGE
1281	dbgTrace(`0xBEEF0007`, (unsigned int) m, (unsigned int) `0`); / (TEST/DEBUG) /
1282	#endif
1283	VM_PAGE_FREE(m);
1284
1285	vm_fault_cleanup(object, top_page: first_m);
1286	thread_interrupt_level(interruptible: interruptible_state);
1287
1288	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PAGE_HAS_RESTART), arg: `0` / arg /);
1289	return VM_FAULT_RETRY;
1290	}
1291	if (m->vmp_absent) {
1292	/*
1293	* The page isn't busy, but is absent,
1294	* therefore it's deemed "unavailable".
1295	*
1296	* Remove the non-existent page (unless it's
1297	* in the top object) and move on down to the
1298	* next object (if there is one).
1299	*/
1300	#if TRACEFAULTPAGE
1301	dbgTrace(`0xBEEF0008`, (unsigned int) m, (unsigned int) object->shadow); / (TEST/DEBUG) /
1302	#endif
1303	next_object = object->shadow;
1304
1305	if (next_object == VM_OBJECT_NULL) {
1306	/*
1307	* Absent page at bottom of shadow
1308	* chain; zero fill the page we left
1309	* busy in the first object, and free
1310	* the absent page.
1311	*/
1312	assert(!must_be_resident);
1313
1314	/*
1315	* check for any conditions that prevent
1316	* us from creating a new zero-fill page
1317	* vm_fault_check will do all of the
1318	* fault cleanup in the case of an error condition
1319	* including resetting the thread_interrupt_level
1320	*/
1321	error = vm_fault_check(object, m, first_m, interruptible_state, page_throttle: (type_of_fault == NULL) ? TRUE : FALSE);
1322
1323	if (error != VM_FAULT_SUCCESS) {
1324	return error;
1325	}
1326
1327	if (object != first_object) {
1328	/*
1329	* free the absent page we just found
1330	*/
1331	VM_PAGE_FREE(m);
1332
1333	/*
1334	* drop reference and lock on current object
1335	*/
1336	vm_object_paging_end(object);
1337	vm_object_unlock(object);
1338
1339	/*
1340	* grab the original page we
1341	* 'soldered' in place and
1342	* retake lock on 'first_object'
1343	*/
1344	m = first_m;
1345	first_m = VM_PAGE_NULL;
1346
1347	object = first_object;
1348	offset = first_offset;
1349
1350	vm_object_lock(object);
1351	} else {
1352	/*
1353	* we're going to use the absent page we just found
1354	* so convert it to a 'busy' page
1355	*/
1356	m->vmp_absent = FALSE;
1357	m->vmp_busy = TRUE;
1358	}
1359	if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
1360	m->vmp_absent = TRUE;
1361	clear_absent_on_error = true;
1362	}
1363	/*
1364	* zero-fill the page and put it on
1365	* the correct paging queue
1366	*/
1367	my_fault = vm_fault_zero_page(m, no_zero_fill);
1368
1369	break;
1370	} else {
1371	if (must_be_resident) {
1372	vm_object_paging_end(object);
1373	} else if (object != first_object) {
1374	vm_object_paging_end(object);
1375	VM_PAGE_FREE(m);
1376	} else {
1377	first_m = m;
1378	m->vmp_absent = FALSE;
1379	m->vmp_busy = TRUE;
1380
1381	vm_page_lockspin_queues();
1382	vm_page_queues_remove(mem: m, FALSE);
1383	vm_page_unlock_queues();
1384	}
1385
1386	offset += object->vo_shadow_offset;
1387	fault_info->lo_offset += object->vo_shadow_offset;
1388	fault_info->hi_offset += object->vo_shadow_offset;
1389	access_required = VM_PROT_READ;
1390
1391	vm_object_lock(next_object);
1392	vm_object_unlock(object);
1393	object = next_object;
1394	vm_object_paging_begin(object);
1395
1396	/*
1397	* reset to default type of fault
1398	*/
1399	my_fault = DBG_CACHE_HIT_FAULT;
1400
1401	continue;
1402	}
1403	}
1404	if ((m->vmp_cleaning)
1405	&& ((object != first_object) \|\| (object->vo_copy != VM_OBJECT_NULL))
1406	&& (fault_type & VM_PROT_WRITE)) {
1407	/*
1408	* This is a copy-on-write fault that will
1409	* cause us to revoke access to this page, but
1410	* this page is in the process of being cleaned
1411	* in a clustered pageout. We must wait until
1412	* the cleaning operation completes before
1413	* revoking access to the original page,
1414	* otherwise we might attempt to remove a
1415	* wired mapping.
1416	*/
1417	#if TRACEFAULTPAGE
1418	dbgTrace(`0xBEEF0009`, (unsigned int) m, (unsigned int) offset); / (TEST/DEBUG) /
1419	#endif
1420	/*
1421	* take an extra ref so that object won't die
1422	*/
1423	vm_object_reference_locked(object);
1424
1425	vm_fault_cleanup(object, top_page: first_m);
1426
1427	vm_object_lock(object);
1428	assert(object->ref_count > `0`);
1429
1430	m = vm_page_lookup(object, vm_object_trunc_page(offset));
1431
1432	if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1433	PAGE_ASSERT_WAIT(m, interruptible);
1434
1435	vm_object_unlock(object);
1436	wait_result = thread_block(THREAD_CONTINUE_NULL);
1437	vm_object_deallocate(object);
1438
1439	goto backoff;
1440	} else {
1441	vm_object_unlock(object);
1442
1443	vm_object_deallocate(object);
1444	thread_interrupt_level(interruptible: interruptible_state);
1445
1446	return VM_FAULT_RETRY;
1447	}
1448	}
1449	if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1450	!(fault_info != NULL && fault_info->stealth)) {
1451	/*
1452	* If we were passed a non-NULL pointer for
1453	* "type_of_fault", than we came from
1454	* vm_fault... we'll let it deal with
1455	* this condition, since it
1456	* needs to see m->vmp_speculative to correctly
1457	* account the pageins, otherwise...
1458	* take it off the speculative queue, we'll
1459	* let the caller of vm_fault_page deal
1460	* with getting it onto the correct queue
1461	*
1462	* If the caller specified in fault_info that
1463	* it wants a "stealth" fault, we also leave
1464	* the page in the speculative queue.
1465	*/
1466	vm_page_lockspin_queues();
1467	if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
1468	vm_page_queues_remove(mem: m, FALSE);
1469	}
1470	vm_page_unlock_queues();
1471	}
1472	assert(object == VM_PAGE_OBJECT(m));
1473
1474	if (object->code_signed) {
1475	/*
1476	* CODE SIGNING:
1477	* We just paged in a page from a signed
1478	* memory object but we don't need to
1479	* validate it now. We'll validate it if
1480	* when it gets mapped into a user address
1481	* space for the first time or when the page
1482	* gets copied to another object as a result
1483	* of a copy-on-write.
1484	*/
1485	}
1486
1487	/*
1488	* We mark the page busy and leave it on
1489	* the pageout queues. If the pageout
1490	* deamon comes across it, then it will
1491	* remove the page from the queue, but not the object
1492	*/
1493	#if TRACEFAULTPAGE
1494	dbgTrace(`0xBEEF000B`, (unsigned int) m, (unsigned int) `0`); / (TEST/DEBUG) /
1495	#endif
1496	assert(!m->vmp_busy);
1497	assert(!m->vmp_absent);
1498
1499	m->vmp_busy = TRUE;
1500	break;
1501	}
1502
1503	/*
1504	* we get here when there is no page present in the object at
1505	* the offset we're interested in... we'll allocate a page
1506	* at this point if the pager associated with
1507	* this object can provide the data or we're the top object...
1508	* object is locked; m == NULL
1509	*/
1510
1511	if (must_be_resident) {
1512	if (fault_type == VM_PROT_NONE &&
1513	is_kernel_object(object)) {
1514	/*
1515	* We've been called from vm_fault_unwire()
1516	* while removing a map entry that was allocated
1517	* with KMA_KOBJECT and KMA_VAONLY. This page
1518	* is not present and there's nothing more to
1519	* do here (nothing to unwire).
1520	*/
1521	vm_fault_cleanup(object, top_page: first_m);
1522	thread_interrupt_level(interruptible: interruptible_state);
1523
1524	return VM_FAULT_MEMORY_ERROR;
1525	}
1526
1527	goto dont_look_for_page;
1528	}
1529
1530	/ Don't expect to fault pages into the kernel object. /
1531	assert(!is_kernel_object(object));
1532
1533	look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE));
1534
1535	#if TRACEFAULTPAGE
1536	dbgTrace(`0xBEEF000C`, (unsigned int) look_for_page, (unsigned int) object); / (TEST/DEBUG) /
1537	#endif
1538	if (!look_for_page && object == first_object && !object->phys_contiguous) {
1539	/*
1540	* Allocate a new page for this object/offset pair as a placeholder
1541	*/
1542	m = vm_page_grab_options(flags: grab_options);
1543	#if TRACEFAULTPAGE
1544	dbgTrace(`0xBEEF000D`, (unsigned int) m, (unsigned int) object); / (TEST/DEBUG) /
1545	#endif
1546	if (m == VM_PAGE_NULL) {
1547	vm_fault_cleanup(object, top_page: first_m);
1548	thread_interrupt_level(interruptible: interruptible_state);
1549
1550	return VM_FAULT_MEMORY_SHORTAGE;
1551	}
1552
1553	if (fault_info && fault_info->batch_pmap_op == TRUE) {
1554	vm_page_insert_internal(page: m, object,
1555	vm_object_trunc_page(offset),
1556	VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1557	} else {
1558	vm_page_insert(page: m, object, vm_object_trunc_page(offset));
1559	}
1560	}
1561	if (look_for_page) {
1562	kern_return_t rc;
1563	int my_fault_type;
1564
1565	/*
1566	* If the memory manager is not ready, we
1567	* cannot make requests.
1568	*/
1569	if (!object->pager_ready) {
1570	#if TRACEFAULTPAGE
1571	dbgTrace(`0xBEEF000E`, (unsigned int) `0`, (unsigned int) `0`); / (TEST/DEBUG) /
1572	#endif
1573	if (m != VM_PAGE_NULL) {
1574	VM_PAGE_FREE(m);
1575	}
1576
1577	/*
1578	* take an extra ref so object won't die
1579	*/
1580	vm_object_reference_locked(object);
1581	vm_fault_cleanup(object, top_page: first_m);
1582
1583	vm_object_lock(object);
1584	assert(object->ref_count > `0`);
1585
1586	if (!object->pager_ready) {
1587	wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1588
1589	vm_object_unlock(object);
1590	if (wait_result == THREAD_WAITING) {
1591	wait_result = thread_block(THREAD_CONTINUE_NULL);
1592	}
1593	vm_object_deallocate(object);
1594
1595	goto backoff;
1596	} else {
1597	vm_object_unlock(object);
1598	vm_object_deallocate(object);
1599	thread_interrupt_level(interruptible: interruptible_state);
1600
1601	return VM_FAULT_RETRY;
1602	}
1603	}
1604	if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1605	/*
1606	* If there are too many outstanding page
1607	* requests pending on this external object, we
1608	* wait for them to be resolved now.
1609	*/
1610	#if TRACEFAULTPAGE
1611	dbgTrace(`0xBEEF0010`, (unsigned int) m, (unsigned int) `0`); / (TEST/DEBUG) /
1612	#endif
1613	if (m != VM_PAGE_NULL) {
1614	VM_PAGE_FREE(m);
1615	}
1616	/*
1617	* take an extra ref so object won't die
1618	*/
1619	vm_object_reference_locked(object);
1620
1621	vm_fault_cleanup(object, top_page: first_m);
1622
1623	vm_object_lock(object);
1624	assert(object->ref_count > `0`);
1625
1626	if (object->paging_in_progress >= vm_object_pagein_throttle) {
1627	vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1628
1629	vm_object_unlock(object);
1630	wait_result = thread_block(THREAD_CONTINUE_NULL);
1631	vm_object_deallocate(object);
1632
1633	goto backoff;
1634	} else {
1635	vm_object_unlock(object);
1636	vm_object_deallocate(object);
1637	thread_interrupt_level(interruptible: interruptible_state);
1638
1639	return VM_FAULT_RETRY;
1640	}
1641	}
1642	if (object->internal) {
1643	int compressed_count_delta;
1644
1645	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1646
1647	if (m == VM_PAGE_NULL) {
1648	/*
1649	* Allocate a new page for this object/offset pair as a placeholder
1650	*/
1651	m = vm_page_grab_options(flags: grab_options);
1652	#if TRACEFAULTPAGE
1653	dbgTrace(`0xBEEF000D`, (unsigned int) m, (unsigned int) object); / (TEST/DEBUG) /
1654	#endif
1655	if (m == VM_PAGE_NULL) {
1656	vm_fault_cleanup(object, top_page: first_m);
1657	thread_interrupt_level(interruptible: interruptible_state);
1658
1659	return VM_FAULT_MEMORY_SHORTAGE;
1660	}
1661
1662	m->vmp_absent = TRUE;
1663	if (fault_info && fault_info->batch_pmap_op == TRUE) {
1664	vm_page_insert_internal(page: m, object, vm_object_trunc_page(offset), VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1665	} else {
1666	vm_page_insert(page: m, object, vm_object_trunc_page(offset));
1667	}
1668	}
1669	assert(m->vmp_busy);
1670
1671	m->vmp_absent = TRUE;
1672	pager = object->pager;
1673
1674	assert(object->paging_in_progress > `0`);
1675	vm_object_unlock(object);
1676
1677	rc = vm_compressor_pager_get(
1678	mem_obj: pager,
1679	offset: offset + object->paging_offset,
1680	ppnum: VM_PAGE_GET_PHYS_PAGE(m),
1681	my_fault_type: &my_fault_type,
1682	flags: `0`,
1683	compressed_count_delta_p: &compressed_count_delta);
1684
1685	if (type_of_fault == NULL) {
1686	int throttle_delay;
1687
1688	/*
1689	* we weren't called from vm_fault, so we
1690	* need to apply page creation throttling
1691	* do it before we re-acquire any locks
1692	*/
1693	if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1694	if ((throttle_delay = vm_page_throttled(TRUE))) {
1695	VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, `0`, `1`, `0`);
1696	__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
1697	}
1698	}
1699	}
1700	vm_object_lock(object);
1701	assert(object->paging_in_progress > `0`);
1702
1703	vm_compressor_pager_count(
1704	mem_obj: pager,
1705	compressed_count_delta,
1706	FALSE, / shared_lock /
1707	object);
1708
1709	switch (rc) {
1710	case KERN_SUCCESS:
1711	m->vmp_absent = FALSE;
1712	m->vmp_dirty = TRUE;
1713	if ((object->wimg_bits &
1714	VM_WIMG_MASK) !=
1715	VM_WIMG_USE_DEFAULT) {
1716	/*
1717	* If the page is not cacheable,
1718	* we can't let its contents
1719	* linger in the data cache
1720	* after the decompression.
1721	*/
1722	pmap_sync_page_attributes_phys(
1723	pa: VM_PAGE_GET_PHYS_PAGE(m));
1724	} else {
1725	m->vmp_written_by_kernel = TRUE;
1726	}
1727	#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
1728	if ((fault_type & VM_PROT_WRITE) == `0`) {
1729	vm_object_lock_assert_exclusive(object);
1730	vm_page_lockspin_queues();
1731	m->vmp_unmodified_ro = true;
1732	vm_page_unlock_queues();
1733	os_atomic_inc(&compressor_ro_uncompressed, relaxed);
1734	*protection &= ~VM_PROT_WRITE;
1735	}
1736	#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
1737
1738	/*
1739	* If the object is purgeable, its
1740	* owner's purgeable ledgers have been
1741	* updated in vm_page_insert() but the
1742	* page was also accounted for in a
1743	* "compressed purgeable" ledger, so
1744	* update that now.
1745	*/
1746	if (((object->purgable !=
1747	VM_PURGABLE_DENY) \|\|
1748	object->vo_ledger_tag) &&
1749	(object->vo_owner !=
1750	NULL)) {
1751	/*
1752	* One less compressed
1753	* purgeable/tagged page.
1754	*/
1755	if (compressed_count_delta) {
1756	vm_object_owner_compressed_update(
1757	object,
1758	delta: -`1`);
1759	}
1760	}
1761
1762	break;
1763	case KERN_MEMORY_FAILURE:
1764	m->vmp_unusual = TRUE;
1765	m->vmp_error = TRUE;
1766	m->vmp_absent = FALSE;
1767	break;
1768	case KERN_MEMORY_ERROR:
1769	assert(m->vmp_absent);
1770	break;
1771	default:
1772	panic("vm_fault_page(): unexpected "
1773	"error %d from "
1774	"vm_compressor_pager_get()\n",
1775	rc);
1776	}
1777	PAGE_WAKEUP_DONE(m);
1778
1779	rc = KERN_SUCCESS;
1780	goto data_requested;
1781	}
1782	my_fault_type = DBG_PAGEIN_FAULT;
1783
1784	if (m != VM_PAGE_NULL) {
1785	VM_PAGE_FREE(m);
1786	m = VM_PAGE_NULL;
1787	}
1788
1789	#if TRACEFAULTPAGE
1790	dbgTrace(`0xBEEF0012`, (unsigned int) object, (unsigned int) `0`); / (TEST/DEBUG) /
1791	#endif
1792
1793	/*
1794	* It's possible someone called vm_object_destroy while we weren't
1795	* holding the object lock. If that has happened, then bail out
1796	* here.
1797	*/
1798
1799	pager = object->pager;
1800
1801	if (pager == MEMORY_OBJECT_NULL) {
1802	vm_fault_cleanup(object, top_page: first_m);
1803	thread_interrupt_level(interruptible: interruptible_state);
1804
1805	static const enum vm_subsys_error_codes object_destroy_errors[VM_OBJECT_DESTROY_MAX + `1`] = {
1806	[VM_OBJECT_DESTROY_UNKNOWN_REASON] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER,
1807	[VM_OBJECT_DESTROY_FORCED_UNMOUNT] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_FORCED_UNMOUNT,
1808	[VM_OBJECT_DESTROY_UNGRAFT] = KDBG_TRIAGE_VM_OBJECT_NO_PAGER_UNGRAFT,
1809	};
1810	enum vm_subsys_error_codes kdbg_code = object_destroy_errors[(vm_object_destroy_reason_t)object->no_pager_reason];
1811	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, kdbg_code), arg: `0` / arg /);
1812	return VM_FAULT_MEMORY_ERROR;
1813	}
1814
1815	/*
1816	* We have an absent page in place for the faulting offset,
1817	* so we can release the object lock.
1818	*/
1819
1820	if (object->object_is_shared_cache) {
1821	token = thread_priority_floor_start();
1822	/*
1823	* A non-native shared cache object might
1824	* be getting set up in parallel with this
1825	* fault and so we can't assume that this
1826	* check will be valid after we drop the
1827	* object lock below.
1828	*/
1829	drop_floor = true;
1830	}
1831
1832	vm_object_unlock(object);
1833
1834	/*
1835	* If this object uses a copy_call strategy,
1836	* and we are interested in a copy of this object
1837	* (having gotten here only by following a
1838	* shadow chain), then tell the memory manager
1839	* via a flag added to the desired_access
1840	* parameter, so that it can detect a race
1841	* between our walking down the shadow chain
1842	* and its pushing pages up into a copy of
1843	* the object that it manages.
1844	*/
1845	if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object) {
1846	wants_copy_flag = VM_PROT_WANTS_COPY;
1847	} else {
1848	wants_copy_flag = VM_PROT_NONE;
1849	}
1850
1851	if (object->vo_copy == first_object) {
1852	/*
1853	* if we issue the memory_object_data_request in
1854	* this state, we are subject to a deadlock with
1855	* the underlying filesystem if it is trying to
1856	* shrink the file resulting in a push of pages
1857	* into the copy object... that push will stall
1858	* on the placeholder page, and if the pushing thread
1859	* is holding a lock that is required on the pagein
1860	* path (such as a truncate lock), we'll deadlock...
1861	* to avoid this potential deadlock, we throw away
1862	* our placeholder page before calling memory_object_data_request
1863	* and force this thread to retry the vm_fault_page after
1864	* we have issued the I/O. the second time through this path
1865	* we will find the page already in the cache (presumably still
1866	* busy waiting for the I/O to complete) and then complete
1867	* the fault w/o having to go through memory_object_data_request again
1868	*/
1869	assert(first_m != VM_PAGE_NULL);
1870	assert(VM_PAGE_OBJECT(first_m) == first_object);
1871
1872	vm_object_lock(first_object);
1873	VM_PAGE_FREE(first_m);
1874	vm_object_paging_end(first_object);
1875	vm_object_unlock(first_object);
1876
1877	first_m = VM_PAGE_NULL;
1878	force_fault_retry = TRUE;
1879
1880	vm_fault_page_forced_retry++;
1881	}
1882
1883	if (data_already_requested == TRUE) {
1884	orig_behavior = fault_info->behavior;
1885	orig_cluster_size = fault_info->cluster_size;
1886
1887	fault_info->behavior = VM_BEHAVIOR_RANDOM;
1888	fault_info->cluster_size = PAGE_SIZE;
1889	}
1890	/*
1891	* Call the memory manager to retrieve the data.
1892	*/
1893	rc = memory_object_data_request(
1894	memory_object: pager,
1895	vm_object_trunc_page(offset) + object->paging_offset,
1896	PAGE_SIZE,
1897	desired_access: access_required \| wants_copy_flag,
1898	fault_info: (memory_object_fault_info_t)fault_info);
1899
1900	if (data_already_requested == TRUE) {
1901	fault_info->behavior = orig_behavior;
1902	fault_info->cluster_size = orig_cluster_size;
1903	} else {
1904	data_already_requested = TRUE;
1905	}
1906
1907	DTRACE_VM2(maj_fault, int, `1`, (uint64_t *), NULL);
1908	#if TRACEFAULTPAGE
1909	dbgTrace(`0xBEEF0013`, (unsigned int) object, (unsigned int) rc); / (TEST/DEBUG) /
1910	#endif
1911	vm_object_lock(object);
1912
1913	if (drop_floor && object->object_is_shared_cache) {
1914	thread_priority_floor_end(token: &token);
1915	drop_floor = false;
1916	}
1917
1918	data_requested:
1919	if (rc != KERN_SUCCESS) {
1920	vm_fault_cleanup(object, top_page: first_m);
1921	thread_interrupt_level(interruptible: interruptible_state);
1922
1923	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NO_DATA), arg: `0` / arg /);
1924
1925	return (rc == MACH_SEND_INTERRUPTED) ?
1926	VM_FAULT_INTERRUPTED :
1927	VM_FAULT_MEMORY_ERROR;
1928	} else {
1929	clock_sec_t tv_sec;
1930	clock_usec_t tv_usec;
1931
1932	if (my_fault_type == DBG_PAGEIN_FAULT) {
1933	clock_get_system_microtime(secs: &tv_sec, microsecs: &tv_usec);
1934	current_thread()->t_page_creation_time = tv_sec;
1935	current_thread()->t_page_creation_count = `0`;
1936	}
1937	}
1938	if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1939	vm_fault_cleanup(object, top_page: first_m);
1940	thread_interrupt_level(interruptible: interruptible_state);
1941
1942	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), arg: `0` / arg /);
1943	return VM_FAULT_INTERRUPTED;
1944	}
1945	if (force_fault_retry == TRUE) {
1946	vm_fault_cleanup(object, top_page: first_m);
1947	thread_interrupt_level(interruptible: interruptible_state);
1948
1949	return VM_FAULT_RETRY;
1950	}
1951	if (m == VM_PAGE_NULL && object->phys_contiguous) {
1952	/*
1953	* No page here means that the object we
1954	* initially looked up was "physically
1955	* contiguous" (i.e. device memory). However,
1956	* with Virtual VRAM, the object might not
1957	* be backed by that device memory anymore,
1958	* so we're done here only if the object is
1959	* still "phys_contiguous".
1960	* Otherwise, if the object is no longer
1961	* "phys_contiguous", we need to retry the
1962	* page fault against the object's new backing
1963	* store (different memory object).
1964	*/
1965	phys_contig_object:
1966	assert(object->copy_strategy == MEMORY_OBJECT_COPY_NONE);
1967	assert(object == first_object);
1968	goto done;
1969	}
1970	/*
1971	* potentially a pagein fault
1972	* if we make it through the state checks
1973	* above, then we'll count it as such
1974	*/
1975	my_fault = my_fault_type;
1976
1977	/*
1978	* Retry with same object/offset, since new data may
1979	* be in a different page (i.e., m is meaningless at
1980	* this point).
1981	*/
1982	continue;
1983	}
1984	dont_look_for_page:
1985	/*
1986	* We get here if the object has no pager, or an existence map
1987	* exists and indicates the page isn't present on the pager
1988	* or we're unwiring a page. If a pager exists, but there
1989	* is no existence map, then the m->vmp_absent case above handles
1990	* the ZF case when the pager can't provide the page
1991	*/
1992	#if TRACEFAULTPAGE
1993	dbgTrace(`0xBEEF0014`, (unsigned int) object, (unsigned int) m); / (TEST/DEBUG) /
1994	#endif
1995	if (object == first_object) {
1996	first_m = m;
1997	} else {
1998	assert(m == VM_PAGE_NULL);
1999	}
2000
2001	next_object = object->shadow;
2002
2003	if (next_object == VM_OBJECT_NULL) {
2004	/*
2005	* we've hit the bottom of the shadow chain,
2006	* fill the page in the top object with zeros.
2007	*/
2008	assert(!must_be_resident);
2009
2010	if (object != first_object) {
2011	vm_object_paging_end(object);
2012	vm_object_unlock(object);
2013
2014	object = first_object;
2015	offset = first_offset;
2016	vm_object_lock(object);
2017	}
2018	m = first_m;
2019	assert(VM_PAGE_OBJECT(m) == object);
2020	first_m = VM_PAGE_NULL;
2021
2022	/*
2023	* check for any conditions that prevent
2024	* us from creating a new zero-fill page
2025	* vm_fault_check will do all of the
2026	* fault cleanup in the case of an error condition
2027	* including resetting the thread_interrupt_level
2028	*/
2029	error = vm_fault_check(object, m, first_m, interruptible_state, page_throttle: (type_of_fault == NULL) ? TRUE : FALSE);
2030
2031	if (error != VM_FAULT_SUCCESS) {
2032	return error;
2033	}
2034
2035	if (m == VM_PAGE_NULL) {
2036	m = vm_page_grab_options(flags: grab_options);
2037
2038	if (m == VM_PAGE_NULL) {
2039	vm_fault_cleanup(object, VM_PAGE_NULL);
2040	thread_interrupt_level(interruptible: interruptible_state);
2041
2042	return VM_FAULT_MEMORY_SHORTAGE;
2043	}
2044	vm_page_insert(page: m, object, vm_object_trunc_page(offset));
2045	}
2046	if (fault_info->mark_zf_absent && no_zero_fill == TRUE) {
2047	m->vmp_absent = TRUE;
2048	clear_absent_on_error = true;
2049	}
2050
2051	my_fault = vm_fault_zero_page(m, no_zero_fill);
2052
2053	break;
2054	} else {
2055	/*
2056	* Move on to the next object. Lock the next
2057	* object before unlocking the current one.
2058	*/
2059	if ((object != first_object) \|\| must_be_resident) {
2060	vm_object_paging_end(object);
2061	}
2062
2063	offset += object->vo_shadow_offset;
2064	fault_info->lo_offset += object->vo_shadow_offset;
2065	fault_info->hi_offset += object->vo_shadow_offset;
2066	access_required = VM_PROT_READ;
2067
2068	vm_object_lock(next_object);
2069	vm_object_unlock(object);
2070
2071	object = next_object;
2072	vm_object_paging_begin(object);
2073	}
2074	}
2075
2076	/*
2077	* PAGE HAS BEEN FOUND.
2078	*
2079	* This page (m) is:
2080	* busy, so that we can play with it;
2081	* not absent, so that nobody else will fill it;
2082	* possibly eligible for pageout;
2083	*
2084	* The top-level page (first_m) is:
2085	* VM_PAGE_NULL if the page was found in the
2086	* top-level object;
2087	* busy, not absent, and ineligible for pageout.
2088	*
2089	* The current object (object) is locked. A paging
2090	* reference is held for the current and top-level
2091	* objects.
2092	*/
2093
2094	#if TRACEFAULTPAGE
2095	dbgTrace(`0xBEEF0015`, (unsigned int) object, (unsigned int) m); / (TEST/DEBUG) /
2096	#endif
2097	#if EXTRA_ASSERTIONS
2098	assert(m->vmp_busy && !m->vmp_absent);
2099	assert((first_m == VM_PAGE_NULL) \|\|
2100	(first_m->vmp_busy && !first_m->vmp_absent &&
2101	!first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
2102	#endif /* EXTRA_ASSERTIONS */
2103
2104	/*
2105	* If the page is being written, but isn't
2106	* already owned by the top-level object,
2107	* we have to copy it into a new page owned
2108	* by the top-level object.
2109	*/
2110	if (object != first_object) {
2111	#if TRACEFAULTPAGE
2112	dbgTrace(`0xBEEF0016`, (unsigned int) object, (unsigned int) fault_type); / (TEST/DEBUG) /
2113	#endif
2114	if (fault_type & VM_PROT_WRITE) {
2115	vm_page_t copy_m;
2116
2117	/*
2118	* We only really need to copy if we
2119	* want to write it.
2120	*/
2121	assert(!must_be_resident);
2122
2123	/*
2124	* If we try to collapse first_object at this
2125	* point, we may deadlock when we try to get
2126	* the lock on an intermediate object (since we
2127	* have the bottom object locked). We can't
2128	* unlock the bottom object, because the page
2129	* we found may move (by collapse) if we do.
2130	*
2131	* Instead, we first copy the page. Then, when
2132	* we have no more use for the bottom object,
2133	* we unlock it and try to collapse.
2134	*
2135	* Note that we copy the page even if we didn't
2136	* need to... that's the breaks.
2137	*/
2138
2139	/*
2140	* Allocate a page for the copy
2141	*/
2142	copy_m = vm_page_grab_options(flags: grab_options);
2143
2144	if (copy_m == VM_PAGE_NULL) {
2145	RELEASE_PAGE(m);
2146
2147	vm_fault_cleanup(object, top_page: first_m);
2148	thread_interrupt_level(interruptible: interruptible_state);
2149
2150	return VM_FAULT_MEMORY_SHORTAGE;
2151	}
2152
2153	vm_page_copy(src_page: m, dest_page: copy_m);
2154
2155	/*
2156	* If another map is truly sharing this
2157	* page with us, we have to flush all
2158	* uses of the original page, since we
2159	* can't distinguish those which want the
2160	* original from those which need the
2161	* new copy.
2162	*
2163	* XXXO If we know that only one map has
2164	* access to this page, then we could
2165	* avoid the pmap_disconnect() call.
2166	*/
2167	if (m->vmp_pmapped) {
2168	pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m));
2169	}
2170
2171	if (m->vmp_clustered) {
2172	VM_PAGE_COUNT_AS_PAGEIN(m);
2173	VM_PAGE_CONSUME_CLUSTERED(m);
2174	}
2175	assert(!m->vmp_cleaning);
2176
2177	/*
2178	* We no longer need the old page or object.
2179	*/
2180	RELEASE_PAGE(m);
2181
2182	/*
2183	* This check helps with marking the object as having a sequential pattern
2184	* Normally we'll miss doing this below because this fault is about COW to
2185	* the first_object i.e. bring page in from disk, push to object above but
2186	* don't update the file object's sequential pattern.
2187	*/
2188	if (object->internal == FALSE) {
2189	vm_fault_is_sequential(object, offset, behavior: fault_info->behavior);
2190	}
2191
2192	vm_object_paging_end(object);
2193	vm_object_unlock(object);
2194
2195	my_fault = DBG_COW_FAULT;
2196	counter_inc(&vm_statistics_cow_faults);
2197	DTRACE_VM2(cow_fault, int, `1`, (uint64_t *), NULL);
2198	counter_inc(&current_task()->cow_faults);
2199
2200	object = first_object;
2201	offset = first_offset;
2202
2203	vm_object_lock(object);
2204	/*
2205	* get rid of the place holder
2206	* page that we soldered in earlier
2207	*/
2208	VM_PAGE_FREE(first_m);
2209	first_m = VM_PAGE_NULL;
2210
2211	/*
2212	* and replace it with the
2213	* page we just copied into
2214	*/
2215	assert(copy_m->vmp_busy);
2216	vm_page_insert(page: copy_m, object, vm_object_trunc_page(offset));
2217	SET_PAGE_DIRTY(copy_m, TRUE);
2218
2219	m = copy_m;
2220	/*
2221	* Now that we've gotten the copy out of the
2222	* way, let's try to collapse the top object.
2223	* But we have to play ugly games with
2224	* paging_in_progress to do that...
2225	*/
2226	vm_object_paging_end(object);
2227	vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
2228	vm_object_paging_begin(object);
2229	} else {
2230	*protection &= (~VM_PROT_WRITE);
2231	}
2232	}
2233	/*
2234	* Now check whether the page needs to be pushed into the
2235	* copy object. The use of asymmetric copy on write for
2236	* shared temporary objects means that we may do two copies to
2237	* satisfy the fault; one above to get the page from a
2238	* shadowed object, and one here to push it into the copy.
2239	*/
2240	try_failed_count = `0`;
2241
2242	while ((copy_object = first_object->vo_copy) != VM_OBJECT_NULL) {
2243	vm_object_offset_t copy_offset;
2244	vm_page_t copy_m;
2245
2246	#if TRACEFAULTPAGE
2247	dbgTrace(`0xBEEF0017`, (unsigned int) copy_object, (unsigned int) fault_type); / (TEST/DEBUG) /
2248	#endif
2249	/*
2250	* If the page is being written, but hasn't been
2251	* copied to the copy-object, we have to copy it there.
2252	*/
2253	if ((fault_type & VM_PROT_WRITE) == `0`) {
2254	*protection &= ~VM_PROT_WRITE;
2255	break;
2256	}
2257
2258	/*
2259	* If the page was guaranteed to be resident,
2260	* we must have already performed the copy.
2261	*/
2262	if (must_be_resident) {
2263	break;
2264	}
2265
2266	/*
2267	* Try to get the lock on the copy_object.
2268	*/
2269	if (!vm_object_lock_try(copy_object)) {
2270	vm_object_unlock(object);
2271	try_failed_count++;
2272
2273	mutex_pause(try_failed_count); / wait a bit /
2274	vm_object_lock(object);
2275
2276	continue;
2277	}
2278	try_failed_count = `0`;
2279
2280	/*
2281	* Make another reference to the copy-object,
2282	* to keep it from disappearing during the
2283	* copy.
2284	*/
2285	vm_object_reference_locked(copy_object);
2286
2287	/*
2288	* Does the page exist in the copy?
2289	*/
2290	copy_offset = first_offset - copy_object->vo_shadow_offset;
2291	copy_offset = vm_object_trunc_page(copy_offset);
2292
2293	if (copy_object->vo_size <= copy_offset) {
2294	/*
2295	* Copy object doesn't cover this page -- do nothing.
2296	*/
2297	;
2298	} else if ((copy_m = vm_page_lookup(object: copy_object, offset: copy_offset)) != VM_PAGE_NULL) {
2299	/*
2300	* Page currently exists in the copy object
2301	*/
2302	if (copy_m->vmp_busy) {
2303	/*
2304	* If the page is being brought
2305	* in, wait for it and then retry.
2306	*/
2307	RELEASE_PAGE(m);
2308
2309	/*
2310	* take an extra ref so object won't die
2311	*/
2312	vm_object_reference_locked(copy_object);
2313	vm_object_unlock(copy_object);
2314	vm_fault_cleanup(object, top_page: first_m);
2315
2316	vm_object_lock(copy_object);
2317	assert(copy_object->ref_count > `0`);
2318	vm_object_lock_assert_exclusive(copy_object);
2319	copy_object->ref_count--;
2320	assert(copy_object->ref_count > `0`);
2321	copy_m = vm_page_lookup(object: copy_object, offset: copy_offset);
2322
2323	if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2324	PAGE_ASSERT_WAIT(copy_m, interruptible);
2325
2326	vm_object_unlock(copy_object);
2327	wait_result = thread_block(THREAD_CONTINUE_NULL);
2328	vm_object_deallocate(object: copy_object);
2329
2330	goto backoff;
2331	} else {
2332	vm_object_unlock(copy_object);
2333	vm_object_deallocate(object: copy_object);
2334	thread_interrupt_level(interruptible: interruptible_state);
2335
2336	return VM_FAULT_RETRY;
2337	}
2338	}
2339	} else if (!PAGED_OUT(copy_object, copy_offset)) {
2340	/*
2341	* If PAGED_OUT is TRUE, then the page used to exist
2342	* in the copy-object, and has already been paged out.
2343	* We don't need to repeat this. If PAGED_OUT is
2344	* FALSE, then either we don't know (!pager_created,
2345	* for example) or it hasn't been paged out.
2346	* (VM_EXTERNAL_STATE_UNKNOWN\|\|VM_EXTERNAL_STATE_ABSENT)
2347	* We must copy the page to the copy object.
2348	*
2349	* Allocate a page for the copy
2350	*/
2351	copy_m = vm_page_alloc(object: copy_object, offset: copy_offset);
2352
2353	if (copy_m == VM_PAGE_NULL) {
2354	RELEASE_PAGE(m);
2355
2356	vm_object_lock_assert_exclusive(copy_object);
2357	copy_object->ref_count--;
2358	assert(copy_object->ref_count > `0`);
2359
2360	vm_object_unlock(copy_object);
2361	vm_fault_cleanup(object, top_page: first_m);
2362	thread_interrupt_level(interruptible: interruptible_state);
2363
2364	return VM_FAULT_MEMORY_SHORTAGE;
2365	}
2366	/*
2367	* Must copy page into copy-object.
2368	*/
2369	vm_page_copy(src_page: m, dest_page: copy_m);
2370
2371	/*
2372	* If the old page was in use by any users
2373	* of the copy-object, it must be removed
2374	* from all pmaps. (We can't know which
2375	* pmaps use it.)
2376	*/
2377	if (m->vmp_pmapped) {
2378	pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m));
2379	}
2380
2381	if (m->vmp_clustered) {
2382	VM_PAGE_COUNT_AS_PAGEIN(m);
2383	VM_PAGE_CONSUME_CLUSTERED(m);
2384	}
2385	/*
2386	* If there's a pager, then immediately
2387	* page out this page, using the "initialize"
2388	* option. Else, we use the copy.
2389	*/
2390	if ((!copy_object->pager_ready)
2391	\|\| VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2392	) {
2393	vm_page_lockspin_queues();
2394	assert(!m->vmp_cleaning);
2395	vm_page_activate(page: copy_m);
2396	vm_page_unlock_queues();
2397
2398	SET_PAGE_DIRTY(copy_m, TRUE);
2399	PAGE_WAKEUP_DONE(copy_m);
2400	} else {
2401	assert(copy_m->vmp_busy == TRUE);
2402	assert(!m->vmp_cleaning);
2403
2404	/*
2405	* dirty is protected by the object lock
2406	*/
2407	SET_PAGE_DIRTY(copy_m, TRUE);
2408
2409	/*
2410	* The page is already ready for pageout:
2411	* not on pageout queues and busy.
2412	* Unlock everything except the
2413	* copy_object itself.
2414	*/
2415	vm_object_unlock(object);
2416
2417	/*
2418	* Write the page to the copy-object,
2419	* flushing it from the kernel.
2420	*/
2421	vm_pageout_initialize_page(m: copy_m);
2422
2423	/*
2424	* Since the pageout may have
2425	* temporarily dropped the
2426	* copy_object's lock, we
2427	* check whether we'll have
2428	* to deallocate the hard way.
2429	*/
2430	if ((copy_object->shadow != object) \|\| (copy_object->ref_count == `1`)) {
2431	vm_object_unlock(copy_object);
2432	vm_object_deallocate(object: copy_object);
2433	vm_object_lock(object);
2434
2435	continue;
2436	}
2437	/*
2438	* Pick back up the old object's
2439	* lock. [It is safe to do so,
2440	* since it must be deeper in the
2441	* object tree.]
2442	*/
2443	vm_object_lock(object);
2444	}
2445
2446	/*
2447	* Because we're pushing a page upward
2448	* in the object tree, we must restart
2449	* any faults that are waiting here.
2450	* [Note that this is an expansion of
2451	* PAGE_WAKEUP that uses the THREAD_RESTART
2452	* wait result]. Can't turn off the page's
2453	* busy bit because we're not done with it.
2454	*/
2455	if (m->vmp_wanted) {
2456	m->vmp_wanted = FALSE;
2457	thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2458	}
2459	}
2460	/*
2461	* The reference count on copy_object must be
2462	* at least 2: one for our extra reference,
2463	* and at least one from the outside world
2464	* (we checked that when we last locked
2465	* copy_object).
2466	*/
2467	vm_object_lock_assert_exclusive(copy_object);
2468	copy_object->ref_count--;
2469	assert(copy_object->ref_count > `0`);
2470
2471	vm_object_unlock(copy_object);
2472
2473	break;
2474	}
2475
2476	done:
2477	*result_page = m;
2478	*top_page = first_m;
2479
2480	if (m != VM_PAGE_NULL) {
2481	assert(VM_PAGE_OBJECT(m) == object);
2482
2483	retval = VM_FAULT_SUCCESS;
2484
2485	if (my_fault == DBG_PAGEIN_FAULT) {
2486	VM_PAGE_COUNT_AS_PAGEIN(m);
2487
2488	if (object->internal) {
2489	my_fault = DBG_PAGEIND_FAULT;
2490	} else {
2491	my_fault = DBG_PAGEINV_FAULT;
2492	}
2493
2494	/*
2495	* evaluate access pattern and update state
2496	* vm_fault_deactivate_behind depends on the
2497	* state being up to date
2498	*/
2499	vm_fault_is_sequential(object, offset, behavior: fault_info->behavior);
2500	vm_fault_deactivate_behind(object, offset, behavior: fault_info->behavior);
2501	} else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2502	/*
2503	* we weren't called from vm_fault, so handle the
2504	* accounting here for hits in the cache
2505	*/
2506	if (m->vmp_clustered) {
2507	VM_PAGE_COUNT_AS_PAGEIN(m);
2508	VM_PAGE_CONSUME_CLUSTERED(m);
2509	}
2510	vm_fault_is_sequential(object, offset, behavior: fault_info->behavior);
2511	vm_fault_deactivate_behind(object, offset, behavior: fault_info->behavior);
2512	} else if (my_fault == DBG_COMPRESSOR_FAULT \|\| my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2513	VM_STAT_DECOMPRESSIONS();
2514	}
2515	if (type_of_fault) {
2516	*type_of_fault = my_fault;
2517	}
2518	} else {
2519	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUCCESS_NO_PAGE), arg: `0` / arg /);
2520	retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2521	assert(first_m == VM_PAGE_NULL);
2522	assert(object == first_object);
2523	}
2524
2525	thread_interrupt_level(interruptible: interruptible_state);
2526
2527	#if TRACEFAULTPAGE
2528	dbgTrace(`0xBEEF001A`, (unsigned int) VM_FAULT_SUCCESS, `0`); / (TEST/DEBUG) /
2529	#endif
2530	return retval;
2531
2532	backoff:
2533	thread_interrupt_level(interruptible: interruptible_state);
2534
2535	if (wait_result == THREAD_INTERRUPTED) {
2536	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), arg: `0` / arg /);
2537	return VM_FAULT_INTERRUPTED;
2538	}
2539	return VM_FAULT_RETRY;
2540
2541	#undef RELEASE_PAGE
2542	}
2543
2544	#if MACH_ASSERT && (XNU_PLATFORM_WatchOS \|\| __x86_64__)
2545	#define PANIC_ON_CS_KILLED_DEFAULT true
2546	#else
2547	#define PANIC_ON_CS_KILLED_DEFAULT false
2548	#endif
2549	static TUNABLE(bool, panic_on_cs_killed, "panic_on_cs_killed",
2550	PANIC_ON_CS_KILLED_DEFAULT);
2551
2552	extern int proc_selfpid(void);
2553	extern char proc_name_address(struct* proc *p);
2554	extern char proc_best_name(struct* proc *);
2555	unsigned long cs_enter_tainted_rejected = `0`;
2556	unsigned long cs_enter_tainted_accepted = `0`;
2557
2558	/*
2559	* CODE SIGNING:
2560	* When soft faulting a page, we have to validate the page if:
2561	* 1. the page is being mapped in user space
2562	* 2. the page hasn't already been found to be "tainted"
2563	* 3. the page belongs to a code-signed object
2564	* 4. the page has not been validated yet or has been mapped for write.
2565	*/
2566	static bool
2567	vm_fault_cs_need_validation(
2568	pmap_t pmap,
2569	vm_page_t page,
2570	vm_object_t page_obj,
2571	vm_map_size_t fault_page_size,
2572	vm_map_offset_t fault_phys_offset)
2573	{
2574	if (pmap == kernel_pmap) {
2575	/ 1 - not user space /
2576	return false;
2577	}
2578	if (!page_obj->code_signed) {
2579	/ 3 - page does not belong to a code-signed object /
2580	return false;
2581	}
2582	if (fault_page_size == PAGE_SIZE) {
2583	/ looking at the whole page /
2584	assertf(fault_phys_offset == `0`,
2585	"fault_page_size 0x%llx fault_phys_offset 0x%llx\n",
2586	(uint64_t)fault_page_size,
2587	(uint64_t)fault_phys_offset);
2588	if (page->vmp_cs_tainted == VMP_CS_ALL_TRUE) {
2589	/ 2 - page is all tainted /
2590	return false;
2591	}
2592	if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
2593	!page->vmp_wpmapped) {
2594	/ 4 - already fully validated and never mapped writable /
2595	return false;
2596	}
2597	} else {
2598	/ looking at a specific sub-page /
2599	if (VMP_CS_TAINTED(p: page, fault_page_size, fault_phys_offset)) {
2600	/ 2 - sub-page was already marked as tainted /
2601	return false;
2602	}
2603	if (VMP_CS_VALIDATED(p: page, fault_page_size, fault_phys_offset) &&
2604	!page->vmp_wpmapped) {
2605	/ 4 - already validated and never mapped writable /
2606	return false;
2607	}
2608	}
2609	/ page needs to be validated /
2610	return true;
2611	}
2612
2613
2614	static bool
2615	vm_fault_cs_page_immutable(
2616	vm_page_t m,
2617	vm_map_size_t fault_page_size,
2618	vm_map_offset_t fault_phys_offset,
2619	vm_prot_t prot __unused)
2620	{
2621	if (VMP_CS_VALIDATED(p: m, fault_page_size, fault_phys_offset)
2622	/&& ((prot) & VM_PROT_EXECUTE)/) {
2623	return true;
2624	}
2625	return false;
2626	}
2627
2628	static bool
2629	vm_fault_cs_page_nx(
2630	vm_page_t m,
2631	vm_map_size_t fault_page_size,
2632	vm_map_offset_t fault_phys_offset)
2633	{
2634	return VMP_CS_NX(p: m, fault_page_size, fault_phys_offset);
2635	}
2636
2637	/*
2638	* Check if the page being entered into the pmap violates code signing.
2639	*/
2640	static kern_return_t
2641	vm_fault_cs_check_violation(
2642	bool cs_bypass,
2643	vm_object_t object,
2644	vm_page_t m,
2645	pmap_t pmap,
2646	vm_prot_t prot,
2647	vm_prot_t caller_prot,
2648	vm_map_size_t fault_page_size,
2649	vm_map_offset_t fault_phys_offset,
2650	vm_object_fault_info_t fault_info,
2651	bool map_is_switched,
2652	bool map_is_switch_protected,
2653	bool *cs_violation)
2654	{
2655	#if !CODE_SIGNING_MONITOR
2656	#pragma unused(caller_prot)
2657	#pragma unused(fault_info)
2658	#endif /* !CODE_SIGNING_MONITOR */
2659
2660	int cs_enforcement_enabled;
2661	if (!cs_bypass &&
2662	vm_fault_cs_need_validation(pmap, page: m, page_obj: object,
2663	fault_page_size, fault_phys_offset)) {
2664	vm_object_lock_assert_exclusive(object);
2665
2666	if (VMP_CS_VALIDATED(p: m, fault_page_size, fault_phys_offset)) {
2667	vm_cs_revalidates++;
2668	}
2669
2670	/ VM map is locked, so 1 ref will remain on VM object -*
2671	* so no harm if vm_page_validate_cs drops the object lock */
2672
2673	#if CODE_SIGNING_MONITOR
2674	if (fault_info->csm_associated &&
2675	csm_enabled() &&
2676	!VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset) &&
2677	!VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset) &&
2678	!VMP_CS_NX(m, fault_page_size, fault_phys_offset) &&
2679	(prot & VM_PROT_EXECUTE) &&
2680	(caller_prot & VM_PROT_EXECUTE)) {
2681	/*
2682	* When we have a code signing monitor, the monitor will evaluate the code signature
2683	* for any executable page mapping. No need for the VM to also validate the page.
2684	* In the code signing monitor we trust :)
2685	*/
2686	vm_cs_defer_to_csm++;
2687	} else {
2688	vm_cs_defer_to_csm_not++;
2689	vm_page_validate_cs(m, fault_page_size, fault_phys_offset);
2690	}
2691	#else /* CODE_SIGNING_MONITOR */
2692	vm_page_validate_cs(page: m, fault_page_size, fault_phys_offset);
2693	#endif /* CODE_SIGNING_MONITOR */
2694	}
2695
2696	/ If the map is switched, and is switch-protected, we must protect*
2697	* some pages from being write-faulted: immutable pages because by
2698	* definition they may not be written, and executable pages because that
2699	* would provide a way to inject unsigned code.
2700	* If the page is immutable, we can simply return. However, we can't
2701	* immediately determine whether a page is executable anywhere. But,
2702	* we can disconnect it everywhere and remove the executable protection
2703	* from the current map. We do that below right before we do the
2704	* PMAP_ENTER.
2705	*/
2706	if (pmap == kernel_pmap) {
2707	/ kernel fault: cs_enforcement does not apply /
2708	cs_enforcement_enabled = `0`;
2709	} else {
2710	cs_enforcement_enabled = pmap_get_vm_map_cs_enforced(pmap);
2711	}
2712
2713	if (cs_enforcement_enabled && map_is_switched &&
2714	map_is_switch_protected &&
2715	vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2716	(prot & VM_PROT_WRITE)) {
2717	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_IMMUTABLE_PAGE_WRITE), arg: `0` / arg /);
2718	return KERN_CODESIGN_ERROR;
2719	}
2720
2721	if (cs_enforcement_enabled &&
2722	vm_fault_cs_page_nx(m, fault_page_size, fault_phys_offset) &&
2723	(prot & VM_PROT_EXECUTE)) {
2724	if (cs_debug) {
2725	printf(format: "page marked to be NX, not letting it be mapped EXEC\n");
2726	}
2727	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAILED_NX_PAGE_EXEC_MAPPING), arg: `0` / arg /);
2728	return KERN_CODESIGN_ERROR;
2729	}
2730
2731	/ A page could be tainted, or pose a risk of being tainted later.*
2732	* Check whether the receiving process wants it, and make it feel
2733	* the consequences (that hapens in cs_invalid_page()).
2734	* For CS Enforcement, two other conditions will
2735	* cause that page to be tainted as well:
2736	* - pmapping an unsigned page executable - this means unsigned code;
2737	* - writeable mapping of a validated page - the content of that page
2738	* can be changed without the kernel noticing, therefore unsigned
2739	* code can be created
2740	*/
2741	if (cs_bypass) {
2742	/ code-signing is bypassed /
2743	*cs_violation = FALSE;
2744	} else if (VMP_CS_TAINTED(p: m, fault_page_size, fault_phys_offset)) {
2745	/ tainted page /
2746	*cs_violation = TRUE;
2747	} else if (!cs_enforcement_enabled) {
2748	/ no further code-signing enforcement /
2749	*cs_violation = FALSE;
2750	} else if (vm_fault_cs_page_immutable(m, fault_page_size, fault_phys_offset, prot) &&
2751	((prot & VM_PROT_WRITE) \|\|
2752	m->vmp_wpmapped)) {
2753	/*
2754	* The page should be immutable, but is in danger of being
2755	* modified.
2756	* This is the case where we want policy from the code
2757	* directory - is the page immutable or not? For now we have
2758	* to assume that code pages will be immutable, data pages not.
2759	* We'll assume a page is a code page if it has a code directory
2760	* and we fault for execution.
2761	* That is good enough since if we faulted the code page for
2762	* writing in another map before, it is wpmapped; if we fault
2763	* it for writing in this map later it will also be faulted for
2764	* executing at the same time; and if we fault for writing in
2765	* another map later, we will disconnect it from this pmap so
2766	* we'll notice the change.
2767	*/
2768	*cs_violation = TRUE;
2769	} else if (!VMP_CS_VALIDATED(p: m, fault_page_size, fault_phys_offset) &&
2770	(prot & VM_PROT_EXECUTE)
2771	#if CODE_SIGNING_MONITOR
2772	/*
2773	* Executable pages will be validated by the code signing monitor. If the
2774	* code signing monitor is turned off, then this is a code-signing violation.
2775	*/
2776	&& !csm_enabled()
2777	#endif /* CODE_SIGNING_MONITOR */
2778	) {
2779	*cs_violation = TRUE;
2780	} else {
2781	*cs_violation = FALSE;
2782	}
2783	return KERN_SUCCESS;
2784	}
2785
2786	/*
2787	* Handles a code signing violation by either rejecting the page or forcing a disconnect.
2788	* @param must_disconnect This value will be set to true if the caller must disconnect
2789	* this page.
2790	* @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
2791	*/
2792	static kern_return_t
2793	vm_fault_cs_handle_violation(
2794	vm_object_t object,
2795	vm_page_t m,
2796	pmap_t pmap,
2797	vm_prot_t prot,
2798	vm_map_offset_t vaddr,
2799	vm_map_size_t fault_page_size,
2800	vm_map_offset_t fault_phys_offset,
2801	bool map_is_switched,
2802	bool map_is_switch_protected,
2803	bool *must_disconnect)
2804	{
2805	#if !MACH_ASSERT
2806	#pragma unused(pmap)
2807	#pragma unused(map_is_switch_protected)
2808	#endif /* !MACH_ASSERT */
2809	/*
2810	* We will have a tainted page. Have to handle the special case
2811	* of a switched map now. If the map is not switched, standard
2812	* procedure applies - call cs_invalid_page().
2813	* If the map is switched, the real owner is invalid already.
2814	* There is no point in invalidating the switching process since
2815	* it will not be executing from the map. So we don't call
2816	* cs_invalid_page() in that case.
2817	*/
2818	boolean_t reject_page, cs_killed;
2819	kern_return_t kr;
2820	if (map_is_switched) {
2821	assert(pmap == vm_map_pmap(current_thread()->map));
2822	assert(!(prot & VM_PROT_WRITE) \|\| (map_is_switch_protected == FALSE));
2823	reject_page = FALSE;
2824	} else {
2825	if (cs_debug > `5`) {
2826	printf(format: "vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2827	object->code_signed ? "yes" : "no",
2828	VMP_CS_VALIDATED(p: m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2829	VMP_CS_TAINTED(p: m, fault_page_size, fault_phys_offset) ? "yes" : "no",
2830	m->vmp_wpmapped ? "yes" : "no",
2831	(int)prot);
2832	}
2833	reject_page = cs_invalid_page(vaddr: (addr64_t) vaddr, cs_killed: &cs_killed);
2834	}
2835
2836	if (reject_page) {
2837	/ reject the invalid page: abort the page fault /
2838	int pid;
2839	const char *procname;
2840	task_t task;
2841	vm_object_t file_object, shadow;
2842	vm_object_offset_t file_offset;
2843	char pathname, filename;
2844	vm_size_t pathname_len, filename_len;
2845	boolean_t truncated_path;
2846	#define __PATH_MAX 1024
2847	struct timespec mtime, cs_mtime;
2848	int shadow_depth;
2849	os_reason_t codesigning_exit_reason = OS_REASON_NULL;
2850
2851	kr = KERN_CODESIGN_ERROR;
2852	cs_enter_tainted_rejected++;
2853
2854	/ get process name and pid /
2855	procname = "?";
2856	task = current_task();
2857	pid = proc_selfpid();
2858	if (get_bsdtask_info(task) != NULL) {
2859	procname = proc_name_address(p: get_bsdtask_info(task));
2860	}
2861
2862	/ get file's VM object /
2863	file_object = object;
2864	file_offset = m->vmp_offset;
2865	for (shadow = file_object->shadow,
2866	shadow_depth = `0`;
2867	shadow != VM_OBJECT_NULL;
2868	shadow = file_object->shadow,
2869	shadow_depth++) {
2870	vm_object_lock_shared(shadow);
2871	if (file_object != object) {
2872	vm_object_unlock(file_object);
2873	}
2874	file_offset += file_object->vo_shadow_offset;
2875	file_object = shadow;
2876	}
2877
2878	mtime.tv_sec = `0`;
2879	mtime.tv_nsec = `0`;
2880	cs_mtime.tv_sec = `0`;
2881	cs_mtime.tv_nsec = `0`;
2882
2883	/ get file's pathname and/or filename /
2884	pathname = NULL;
2885	filename = NULL;
2886	pathname_len = `0`;
2887	filename_len = `0`;
2888	truncated_path = FALSE;
2889	/ no pager -> no file -> no pathname, use "<nil>" in that case /
2890	if (file_object->pager != NULL) {
2891	pathname = kalloc_data(__PATH_MAX * `2`, Z_WAITOK);
2892	if (pathname) {
2893	pathname[`0`] = `'\0'`;
2894	pathname_len = __PATH_MAX;
2895	filename = pathname + pathname_len;
2896	filename_len = __PATH_MAX;
2897
2898	if (vnode_pager_get_object_name(mem_obj: file_object->pager,
2899	pathname,
2900	pathname_len,
2901	filename,
2902	filename_len,
2903	truncated_path_p: &truncated_path) == KERN_SUCCESS) {
2904	/ safety first... /
2905	pathname[__PATH_MAX - `1`] = `'\0'`;
2906	filename[__PATH_MAX - `1`] = `'\0'`;
2907
2908	vnode_pager_get_object_mtime(mem_obj: file_object->pager,
2909	mtime: &mtime,
2910	cs_mtime: &cs_mtime);
2911	} else {
2912	kfree_data(pathname, __PATH_MAX * `2`);
2913	pathname = NULL;
2914	filename = NULL;
2915	pathname_len = `0`;
2916	filename_len = `0`;
2917	truncated_path = FALSE;
2918	}
2919	}
2920	}
2921	printf(format: "CODE SIGNING: process %d[%s]: "
2922	"rejecting invalid page at address 0x%llx "
2923	"from offset 0x%llx in file \"%s%s%s\" "
2924	"(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2925	"(signed:%d validated:%d tainted:%d nx:%d "
2926	"wpmapped:%d dirty:%d depth:%d)\n",
2927	pid, procname, (addr64_t) vaddr,
2928	file_offset,
2929	(pathname ? pathname : "<nil>"),
2930	(truncated_path ? "/.../" : ""),
2931	(truncated_path ? filename : ""),
2932	cs_mtime.tv_sec, cs_mtime.tv_nsec,
2933	((cs_mtime.tv_sec == mtime.tv_sec &&
2934	cs_mtime.tv_nsec == mtime.tv_nsec)
2935	? "=="
2936	: "!="),
2937	mtime.tv_sec, mtime.tv_nsec,
2938	object->code_signed,
2939	VMP_CS_VALIDATED(p: m, fault_page_size, fault_phys_offset),
2940	VMP_CS_TAINTED(p: m, fault_page_size, fault_phys_offset),
2941	VMP_CS_NX(p: m, fault_page_size, fault_phys_offset),
2942	m->vmp_wpmapped,
2943	m->vmp_dirty,
2944	shadow_depth);
2945
2946	/*
2947	* We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2948	* did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2949	* process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2950	* will deal with the segmentation fault.
2951	*/
2952	if (cs_killed) {
2953	KDBG(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) \| DBG_FUNC_NONE,
2954	pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2955
2956	codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2957	if (codesigning_exit_reason == NULL) {
2958	printf(format: "vm_fault_enter: failed to allocate codesigning exit reason\n");
2959	} else {
2960	mach_vm_address_t data_addr = `0`;
2961	struct codesigning_exit_reason_info *ceri = NULL;
2962	uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(num_items: `1`, payload_size: sizeof(*ceri));
2963
2964	if (os_reason_alloc_buffer_noblock(cur_reason: codesigning_exit_reason, osr_bufsize: reason_buffer_size_estimate)) {
2965	printf(format: "vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2966	} else {
2967	if (KERN_SUCCESS == kcdata_get_memory_addr(data: &codesigning_exit_reason->osr_kcd_descriptor,
2968	EXIT_REASON_CODESIGNING_INFO, size: sizeof(*ceri), user_addr: &data_addr)) {
2969	ceri = (struct codesigning_exit_reason_info *)data_addr;
2970	static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2971
2972	ceri->ceri_virt_addr = vaddr;
2973	ceri->ceri_file_offset = file_offset;
2974	if (pathname) {
2975	strncpy((char )&ceri->ceri_pathname, pathname, sizeof*(ceri->ceri_pathname));
2976	} else {
2977	ceri->ceri_pathname[`0`] = `'\0'`;
2978	}
2979	if (filename) {
2980	strncpy((char )&ceri->ceri_filename, filename, sizeof*(ceri->ceri_filename));
2981	} else {
2982	ceri->ceri_filename[`0`] = `'\0'`;
2983	}
2984	ceri->ceri_path_truncated = (truncated_path ? `1` : `0`);
2985	ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2986	ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2987	ceri->ceri_page_modtime_secs = mtime.tv_sec;
2988	ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2989	ceri->ceri_object_codesigned = (object->code_signed);
2990	ceri->ceri_page_codesig_validated = VMP_CS_VALIDATED(p: m, fault_page_size, fault_phys_offset);
2991	ceri->ceri_page_codesig_tainted = VMP_CS_TAINTED(p: m, fault_page_size, fault_phys_offset);
2992	ceri->ceri_page_codesig_nx = VMP_CS_NX(p: m, fault_page_size, fault_phys_offset);
2993	ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2994	ceri->ceri_page_slid = `0`;
2995	ceri->ceri_page_dirty = (m->vmp_dirty);
2996	ceri->ceri_page_shadow_depth = shadow_depth;
2997	} else {
2998	#if DEBUG \|\| DEVELOPMENT
2999	panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
3000	#else
3001	printf(format: "vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
3002	#endif /* DEBUG \|\| DEVELOPMENT */
3003	/ Free the buffer /
3004	os_reason_alloc_buffer_noblock(cur_reason: codesigning_exit_reason, osr_bufsize: `0`);
3005	}
3006	}
3007	}
3008
3009	set_thread_exit_reason(th: current_thread(), reason: codesigning_exit_reason, FALSE);
3010	}
3011	if (panic_on_cs_killed &&
3012	object->object_is_shared_cache) {
3013	char *tainted_contents;
3014	vm_map_offset_t src_vaddr;
3015	src_vaddr = (vm_map_offset_t) phystokv(pa: (pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m) << PAGE_SHIFT);
3016	tainted_contents = kalloc_data(PAGE_SIZE, Z_WAITOK);
3017	bcopy(src: (const char *)src_vaddr, dst: tainted_contents, PAGE_SIZE);
3018	printf(format: "CODE SIGNING: tainted page %p phys 0x%x phystokv 0x%llx copied to %p\n", m, VM_PAGE_GET_PHYS_PAGE(m), (uint64_t)src_vaddr, tainted_contents);
3019	panic("CODE SIGNING: process %d[%s]: "
3020	"rejecting invalid page (phys#0x%x) at address 0x%llx "
3021	"from offset 0x%llx in file \"%s%s%s\" "
3022	"(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
3023	"(signed:%d validated:%d tainted:%d nx:%d"
3024	"wpmapped:%d dirty:%d depth:%d)\n",
3025	pid, procname,
3026	VM_PAGE_GET_PHYS_PAGE(m),
3027	(addr64_t) vaddr,
3028	file_offset,
3029	(pathname ? pathname : "<nil>"),
3030	(truncated_path ? "/.../" : ""),
3031	(truncated_path ? filename : ""),
3032	cs_mtime.tv_sec, cs_mtime.tv_nsec,
3033	((cs_mtime.tv_sec == mtime.tv_sec &&
3034	cs_mtime.tv_nsec == mtime.tv_nsec)
3035	? "=="
3036	: "!="),
3037	mtime.tv_sec, mtime.tv_nsec,
3038	object->code_signed,
3039	VMP_CS_VALIDATED(m, fault_page_size, fault_phys_offset),
3040	VMP_CS_TAINTED(m, fault_page_size, fault_phys_offset),
3041	VMP_CS_NX(m, fault_page_size, fault_phys_offset),
3042	m->vmp_wpmapped,
3043	m->vmp_dirty,
3044	shadow_depth);
3045	}
3046
3047	if (file_object != object) {
3048	vm_object_unlock(file_object);
3049	}
3050	if (pathname_len != `0`) {
3051	kfree_data(pathname, __PATH_MAX * `2`);
3052	pathname = NULL;
3053	filename = NULL;
3054	}
3055	} else {
3056	/ proceed with the invalid page /
3057	kr = KERN_SUCCESS;
3058	if (!VMP_CS_VALIDATED(p: m, fault_page_size, fault_phys_offset) &&
3059	!object->code_signed) {
3060	/*
3061	* This page has not been (fully) validated but
3062	* does not belong to a code-signed object
3063	* so it should not be forcefully considered
3064	* as tainted.
3065	* We're just concerned about it here because
3066	* we've been asked to "execute" it but that
3067	* does not mean that it should cause other
3068	* accesses to fail.
3069	* This happens when a debugger sets a
3070	* breakpoint and we then execute code in
3071	* that page. Marking the page as "tainted"
3072	* would cause any inspection tool ("leaks",
3073	* "vmmap", "CrashReporter", ...) to get killed
3074	* due to code-signing violation on that page,
3075	* even though they're just reading it and not
3076	* executing from it.
3077	*/
3078	} else {
3079	/*
3080	* Page might have been tainted before or not;
3081	* now it definitively is. If the page wasn't
3082	* tainted, we must disconnect it from all
3083	* pmaps later, to force existing mappings
3084	* through that code path for re-consideration
3085	* of the validity of that page.
3086	*/
3087	if (!VMP_CS_TAINTED(p: m, fault_page_size, fault_phys_offset)) {
3088	*must_disconnect = TRUE;
3089	VMP_CS_SET_TAINTED(p: m, fault_page_size, fault_phys_offset, TRUE);
3090	}
3091	}
3092	cs_enter_tainted_accepted++;
3093	}
3094	if (kr != KERN_SUCCESS) {
3095	if (cs_debug) {
3096	printf(format: "CODESIGNING: vm_fault_enter(0x%llx): "
3097	"* INVALID PAGE *\n",
3098	(long long)vaddr);
3099	}
3100	#if !SECURE_KERNEL
3101	if (cs_enforcement_panic) {
3102	panic("CODESIGNING: panicking on invalid page");
3103	}
3104	#endif
3105	}
3106	return kr;
3107	}
3108
3109	/*
3110	* Check that the code signature is valid for the given page being inserted into
3111	* the pmap.
3112	*
3113	* @param must_disconnect This value will be set to true if the caller must disconnect
3114	* this page.
3115	* @return If this function does not return KERN_SUCCESS, the caller must abort the page fault.
3116	*/
3117	static kern_return_t
3118	vm_fault_validate_cs(
3119	bool cs_bypass,
3120	vm_object_t object,
3121	vm_page_t m,
3122	pmap_t pmap,
3123	vm_map_offset_t vaddr,
3124	vm_prot_t prot,
3125	vm_prot_t caller_prot,
3126	vm_map_size_t fault_page_size,
3127	vm_map_offset_t fault_phys_offset,
3128	vm_object_fault_info_t fault_info,
3129	bool *must_disconnect)
3130	{
3131	bool map_is_switched, map_is_switch_protected, cs_violation;
3132	kern_return_t kr;
3133	/ Validate code signature if necessary. /
3134	map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
3135	(pmap == vm_map_pmap(current_thread()->map)));
3136	map_is_switch_protected = current_thread()->map->switch_protect;
3137	kr = vm_fault_cs_check_violation(cs_bypass, object, m, pmap,
3138	prot, caller_prot, fault_page_size, fault_phys_offset, fault_info,
3139	map_is_switched, map_is_switch_protected, cs_violation: &cs_violation);
3140	if (kr != KERN_SUCCESS) {
3141	return kr;
3142	}
3143	if (cs_violation) {
3144	kr = vm_fault_cs_handle_violation(object, m, pmap, prot, vaddr,
3145	fault_page_size, fault_phys_offset,
3146	map_is_switched, map_is_switch_protected, must_disconnect);
3147	}
3148	return kr;
3149	}
3150
3151	/*
3152	* Enqueue the page on the appropriate paging queue.
3153	*/
3154	static void
3155	vm_fault_enqueue_page(
3156	vm_object_t object,
3157	vm_page_t m,
3158	bool wired,
3159	bool change_wiring,
3160	vm_tag_t wire_tag,
3161	bool no_cache,
3162	int *type_of_fault,
3163	kern_return_t kr)
3164	{
3165	assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) \|\| object != compressor_object);
3166	boolean_t page_queues_locked = FALSE;
3167	boolean_t previously_pmapped = m->vmp_pmapped;
3168	#define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \
3169	MACRO_BEGIN \
3170	if (! page_queues_locked) { \
3171	page_queues_locked = TRUE; \
3172	vm_page_lockspin_queues(); \
3173	} \
3174	MACRO_END
3175	#define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \
3176	MACRO_BEGIN \
3177	if (page_queues_locked) { \
3178	page_queues_locked = FALSE; \
3179	vm_page_unlock_queues(); \
3180	} \
3181	MACRO_END
3182
3183	vm_page_update_special_state(mem: m);
3184	if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
3185	/*
3186	* Compressor pages are neither wired
3187	* nor pageable and should never change.
3188	*/
3189	assert(object == compressor_object);
3190	} else if (change_wiring) {
3191	__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3192
3193	if (wired) {
3194	if (kr == KERN_SUCCESS) {
3195	vm_page_wire(page: m, tag: wire_tag, TRUE);
3196	}
3197	} else {
3198	vm_page_unwire(page: m, TRUE);
3199	}
3200	/ we keep the page queues lock, if we need it later /
3201	} else {
3202	if (object->internal == TRUE) {
3203	/*
3204	* don't allow anonymous pages on
3205	* the speculative queues
3206	*/
3207	no_cache = FALSE;
3208	}
3209	if (kr != KERN_SUCCESS) {
3210	__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3211	vm_page_deactivate(page: m);
3212	/ we keep the page queues lock, if we need it later /
3213	} else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) \|\|
3214	(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) \|\|
3215	(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) \|\|
3216	((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3217	!VM_PAGE_WIRED(m)) {
3218	if (vm_page_local_q &&
3219	(*type_of_fault == DBG_COW_FAULT \|\|
3220	*type_of_fault == DBG_ZERO_FILL_FAULT)) {
3221	struct vpl *lq;
3222	uint32_t lid;
3223
3224	assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3225
3226	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3227	vm_object_lock_assert_exclusive(object);
3228
3229	/*
3230	* we got a local queue to stuff this
3231	* new page on...
3232	* its safe to manipulate local and
3233	* local_id at this point since we're
3234	* behind an exclusive object lock and
3235	* the page is not on any global queue.
3236	*
3237	* we'll use the current cpu number to
3238	* select the queue note that we don't
3239	* need to disable preemption... we're
3240	* going to be behind the local queue's
3241	* lock to do the real work
3242	*/
3243	lid = cpu_number();
3244
3245	lq = zpercpu_get_cpu(vm_page_local_q, lid);
3246
3247	VPL_LOCK(&lq->vpl_lock);
3248
3249	vm_page_check_pageable_safe(page: m);
3250	vm_page_queue_enter(&lq->vpl_queue, m, vmp_pageq);
3251	m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3252	m->vmp_local_id = lid;
3253	lq->vpl_count++;
3254
3255	if (object->internal) {
3256	lq->vpl_internal_count++;
3257	} else {
3258	lq->vpl_external_count++;
3259	}
3260
3261	VPL_UNLOCK(&lq->vpl_lock);
3262
3263	if (lq->vpl_count > vm_page_local_q_soft_limit) {
3264	/*
3265	* we're beyond the soft limit
3266	* for the local queue
3267	* vm_page_reactivate_local will
3268	* 'try' to take the global page
3269	* queue lock... if it can't
3270	* that's ok... we'll let the
3271	* queue continue to grow up
3272	* to the hard limit... at that
3273	* point we'll wait for the
3274	* lock... once we've got the
3275	* lock, we'll transfer all of
3276	* the pages from the local
3277	* queue to the global active
3278	* queue
3279	*/
3280	vm_page_reactivate_local(lid, FALSE, FALSE);
3281	}
3282	} else {
3283	__VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3284
3285	/*
3286	* test again now that we hold the
3287	* page queue lock
3288	*/
3289	if (!VM_PAGE_WIRED(m)) {
3290	if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3291	vm_page_queues_remove(mem: m, FALSE);
3292
3293	VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, `1`);
3294	VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, `1`);
3295	}
3296
3297	if (!VM_PAGE_ACTIVE_OR_INACTIVE(m) \|\|
3298	no_cache) {
3299	/*
3300	* If this is a no_cache mapping
3301	* and the page has never been
3302	* mapped before or was
3303	* previously a no_cache page,
3304	* then we want to leave pages
3305	* in the speculative state so
3306	* that they can be readily
3307	* recycled if free memory runs
3308	* low. Otherwise the page is
3309	* activated as normal.
3310	*/
3311
3312	if (no_cache &&
3313	(!previously_pmapped \|\|
3314	m->vmp_no_cache)) {
3315	m->vmp_no_cache = TRUE;
3316
3317	if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q) {
3318	vm_page_speculate(page: m, FALSE);
3319	}
3320	} else if (!VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3321	vm_page_activate(page: m);
3322	}
3323	}
3324	}
3325	/ we keep the page queues lock, if we need it later /
3326	}
3327	}
3328	}
3329	/ we're done with the page queues lock, if we ever took it /
3330	__VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3331	}
3332
3333	/*
3334	* Sets the pmmpped, xpmapped, and wpmapped bits on the vm_page_t and updates accounting.
3335	* @return true if the page needs to be sync'ed via pmap_sync-page_data_physo
3336	* before being inserted into the pmap.
3337	*/
3338	static bool
3339	vm_fault_enter_set_mapped(
3340	vm_object_t object,
3341	vm_page_t m,
3342	vm_prot_t prot,
3343	vm_prot_t fault_type)
3344	{
3345	bool page_needs_sync = false;
3346	/*
3347	* NOTE: we may only hold the vm_object lock SHARED
3348	* at this point, so we need the phys_page lock to
3349	* properly serialize updating the pmapped and
3350	* xpmapped bits
3351	*/
3352	if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3353	ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3354
3355	pmap_lock_phys_page(pn: phys_page);
3356	m->vmp_pmapped = TRUE;
3357
3358	if (!m->vmp_xpmapped) {
3359	m->vmp_xpmapped = TRUE;
3360
3361	pmap_unlock_phys_page(pn: phys_page);
3362
3363	if (!object->internal) {
3364	OSAddAtomic(`1`, &vm_page_xpmapped_external_count);
3365	}
3366
3367	#if defined(__arm64__)
3368	page_needs_sync = true;
3369	#else
3370	if (object->internal &&
3371	object->pager != NULL) {
3372	/*
3373	* This page could have been
3374	* uncompressed by the
3375	* compressor pager and its
3376	* contents might be only in
3377	* the data cache.
3378	* Since it's being mapped for
3379	* "execute" for the fist time,
3380	* make sure the icache is in
3381	* sync.
3382	*/
3383	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3384	page_needs_sync = true;
3385	}
3386	#endif
3387	} else {
3388	pmap_unlock_phys_page(pn: phys_page);
3389	}
3390	} else {
3391	if (m->vmp_pmapped == FALSE) {
3392	ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3393
3394	pmap_lock_phys_page(pn: phys_page);
3395	m->vmp_pmapped = TRUE;
3396	pmap_unlock_phys_page(pn: phys_page);
3397	}
3398	}
3399
3400	if (fault_type & VM_PROT_WRITE) {
3401	if (m->vmp_wpmapped == FALSE) {
3402	vm_object_lock_assert_exclusive(object);
3403	if (!object->internal && object->pager) {
3404	task_update_logical_writes(task: current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vp: vnode_pager_lookup_vnode(object->pager));
3405	}
3406	m->vmp_wpmapped = TRUE;
3407	}
3408	}
3409	return page_needs_sync;
3410	}
3411
3412	/*
3413	* wrapper for pmap_enter_options()
3414	*/
3415	static kern_return_t
3416	pmap_enter_options_check(
3417	pmap_t pmap,
3418	vm_map_address_t virtual_address,
3419	vm_map_offset_t fault_phys_offset,
3420	vm_page_t page,
3421	vm_prot_t protection,
3422	vm_prot_t fault_type,
3423	unsigned int flags,
3424	boolean_t wired,
3425	unsigned int options)
3426	{
3427	int extra_options = `0`;
3428	vm_object_t obj;
3429
3430	if (page->vmp_error) {
3431	return KERN_MEMORY_FAILURE;
3432	}
3433	obj = VM_PAGE_OBJECT(page);
3434	if (obj->internal) {
3435	extra_options \|= PMAP_OPTIONS_INTERNAL;
3436	}
3437	if (page->vmp_reusable \|\| obj->all_reusable) {
3438	extra_options \|= PMAP_OPTIONS_REUSABLE;
3439	}
3440	return pmap_enter_options_addr(pmap,
3441	v: virtual_address,
3442	pa: (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + fault_phys_offset,
3443	prot: protection,
3444	fault_type,
3445	flags,
3446	wired,
3447	options: options \| extra_options,
3448	NULL,
3449	mapping_type: PMAP_MAPPING_TYPE_INFER);
3450	}
3451
3452	/*
3453	* Try to enter the given page into the pmap.
3454	* Will retry without execute permission if the code signing monitor is enabled and
3455	* we encounter a codesigning failure on a non-execute fault.
3456	*/
3457	static kern_return_t
3458	vm_fault_attempt_pmap_enter(
3459	pmap_t pmap,
3460	vm_map_offset_t vaddr,
3461	vm_map_size_t fault_page_size,
3462	vm_map_offset_t fault_phys_offset,
3463	vm_page_t m,
3464	vm_prot_t *prot,
3465	vm_prot_t caller_prot,
3466	vm_prot_t fault_type,
3467	bool wired,
3468	int pmap_options)
3469	{
3470	#if !CODE_SIGNING_MONITOR
3471	#pragma unused(caller_prot)
3472	#endif /* !CODE_SIGNING_MONITOR */
3473
3474	kern_return_t kr;
3475	if (fault_page_size != PAGE_SIZE) {
3476	DEBUG4K_FAULT("pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x fault_type 0x%x\n", pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, *prot, fault_type);
3477	assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
3478	fault_phys_offset < PAGE_SIZE),
3479	"0x%llx\n", (uint64_t)fault_phys_offset);
3480	} else {
3481	assertf(fault_phys_offset == `0`,
3482	"0x%llx\n", (uint64_t)fault_phys_offset);
3483	}
3484
3485	kr = pmap_enter_options_check(pmap, virtual_address: vaddr,
3486	fault_phys_offset,
3487	page: m, protection: *prot, fault_type, flags: `0`,
3488	wired,
3489	options: pmap_options);
3490
3491	#if CODE_SIGNING_MONITOR
3492	/*
3493	* Retry without execute permission if we encountered a codesigning
3494	* failure on a non-execute fault. This allows applications which
3495	* don't actually need to execute code to still map it for read access.
3496	*/
3497	if (kr == KERN_CODESIGN_ERROR &&
3498	csm_enabled() &&
3499	(*prot & VM_PROT_EXECUTE) &&
3500	!(caller_prot & VM_PROT_EXECUTE)) {
3501	*prot &= ~VM_PROT_EXECUTE;
3502	kr = pmap_enter_options_check(pmap, vaddr,
3503	fault_phys_offset,
3504	m, *prot, fault_type, `0`,
3505	wired,
3506	pmap_options);
3507	}
3508	#endif /* CODE_SIGNING_MONITOR */
3509
3510	return kr;
3511	}
3512
3513	/*
3514	* Enter the given page into the pmap.
3515	* The map must be locked shared.
3516	* The vm object must NOT be locked.
3517	*
3518	* @param need_retry if not null, avoid making a (potentially) blocking call into
3519	* the pmap layer. When such a call would be necessary, return true in this boolean instead.
3520	*/
3521	static kern_return_t
3522	vm_fault_pmap_enter(
3523	pmap_t pmap,
3524	vm_map_offset_t vaddr,
3525	vm_map_size_t fault_page_size,
3526	vm_map_offset_t fault_phys_offset,
3527	vm_page_t m,
3528	vm_prot_t *prot,
3529	vm_prot_t caller_prot,
3530	vm_prot_t fault_type,
3531	bool wired,
3532	int pmap_options,
3533	boolean_t *need_retry)
3534	{
3535	kern_return_t kr;
3536	if (need_retry != NULL) {
3537	/*
3538	* Although we don't hold a lock on this object, we hold a lock
3539	* on the top object in the chain. To prevent a deadlock, we
3540	* can't allow the pmap layer to block.
3541	*/
3542	pmap_options \|= PMAP_OPTIONS_NOWAIT;
3543	}
3544	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3545	fault_page_size, fault_phys_offset,
3546	m, prot, caller_prot, fault_type, wired, pmap_options);
3547	if (kr == KERN_RESOURCE_SHORTAGE) {
3548	if (need_retry) {
3549	/*
3550	* There's nothing we can do here since we hold the
3551	* lock on the top object in the chain. The caller
3552	* will need to deal with this by dropping that lock and retrying.
3553	*/
3554	*need_retry = TRUE;
3555	vm_pmap_enter_retried++;
3556	}
3557	}
3558	return kr;
3559	}
3560
3561	/*
3562	* Enter the given page into the pmap.
3563	* The vm map must be locked shared.
3564	* The vm object must be locked exclusive, unless this is a soft fault.
3565	* For a soft fault, the object must be locked shared or exclusive.
3566	*
3567	* @param need_retry if not null, avoid making a (potentially) blocking call into
3568	* the pmap layer. When such a call would be necessary, return true in this boolean instead.
3569	*/
3570	static kern_return_t
3571	vm_fault_pmap_enter_with_object_lock(
3572	vm_object_t object,
3573	pmap_t pmap,
3574	vm_map_offset_t vaddr,
3575	vm_map_size_t fault_page_size,
3576	vm_map_offset_t fault_phys_offset,
3577	vm_page_t m,
3578	vm_prot_t *prot,
3579	vm_prot_t caller_prot,
3580	vm_prot_t fault_type,
3581	bool wired,
3582	int pmap_options,
3583	boolean_t *need_retry,
3584	uint8_t *object_lock_type)
3585	{
3586	kern_return_t kr;
3587	/*
3588	* Prevent a deadlock by not
3589	* holding the object lock if we need to wait for a page in
3590	* pmap_enter() - <rdar://problem/7138958>
3591	*/
3592	kr = vm_fault_attempt_pmap_enter(pmap, vaddr,
3593	fault_page_size, fault_phys_offset,
3594	m, prot, caller_prot, fault_type, wired, pmap_options: pmap_options \| PMAP_OPTIONS_NOWAIT);
3595	#if __x86_64__
3596	if (kr == KERN_INVALID_ARGUMENT &&
3597	pmap == PMAP_NULL &&
3598	wired) {
3599	/*
3600	* Wiring a page in a pmap-less VM map:
3601	* VMware's "vmmon" kernel extension does this
3602	* to grab pages.
3603	* Let it proceed even though the PMAP_ENTER() failed.
3604	*/
3605	kr = KERN_SUCCESS;
3606	}
3607	#endif /* __x86_64__ */
3608
3609	if (kr == KERN_RESOURCE_SHORTAGE) {
3610	if (need_retry) {
3611	/*
3612	* this will be non-null in the case where we hold the lock
3613	* on the top-object in this chain... we can't just drop
3614	* the lock on the object we're inserting the page into
3615	* and recall the PMAP_ENTER since we can still cause
3616	* a deadlock if one of the critical paths tries to
3617	* acquire the lock on the top-object and we're blocked
3618	* in PMAP_ENTER waiting for memory... our only recourse
3619	* is to deal with it at a higher level where we can
3620	* drop both locks.
3621	*/
3622	*need_retry = TRUE;
3623	vm_pmap_enter_retried++;
3624	goto done;
3625	}
3626	/*
3627	* The nonblocking version of pmap_enter did not succeed.
3628	* and we don't need to drop other locks and retry
3629	* at the level above us, so
3630	* use the blocking version instead. Requires marking
3631	* the page busy and unlocking the object
3632	*/
3633	boolean_t was_busy = m->vmp_busy;
3634
3635	vm_object_lock_assert_exclusive(object);
3636
3637	m->vmp_busy = TRUE;
3638	vm_object_unlock(object);
3639
3640	kr = pmap_enter_options_check(pmap, virtual_address: vaddr,
3641	fault_phys_offset,
3642	page: m, protection: *prot, fault_type,
3643	flags: `0`, wired,
3644	options: pmap_options);
3645
3646	assert(VM_PAGE_OBJECT(m) == object);
3647
3648	/ Take the object lock again. /
3649	vm_object_lock(object);
3650
3651	/ If the page was busy, someone else will wake it up.*
3652	* Otherwise, we have to do it now. */
3653	assert(m->vmp_busy);
3654	if (!was_busy) {
3655	PAGE_WAKEUP_DONE(m);
3656	}
3657	vm_pmap_enter_blocked++;
3658	}
3659
3660	#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
3661	if ((*prot & VM_PROT_WRITE) && m->vmp_unmodified_ro) {
3662	if (*object_lock_type == OBJECT_LOCK_SHARED) {
3663	boolean_t was_busy = m->vmp_busy;
3664	m->vmp_busy = TRUE;
3665
3666	*object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3667
3668	if (vm_object_lock_upgrade(object) == FALSE) {
3669	vm_object_lock(object);
3670	}
3671
3672	if (!was_busy) {
3673	PAGE_WAKEUP_DONE(m);
3674	}
3675	}
3676	vm_object_lock_assert_exclusive(object);
3677	vm_page_lockspin_queues();
3678	m->vmp_unmodified_ro = false;
3679	vm_page_unlock_queues();
3680	os_atomic_dec(&compressor_ro_uncompressed, relaxed);
3681
3682	VM_COMPRESSOR_PAGER_STATE_CLR(VM_PAGE_OBJECT(m), m->vmp_offset);
3683	}
3684	#else /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
3685	#pragma unused(object_lock_type)
3686	#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
3687
3688	done:
3689	return kr;
3690	}
3691
3692	/*
3693	* Prepare to enter a page into the pmap by checking CS, protection bits,
3694	* and setting mapped bits on the page_t.
3695	* Does not modify the page's paging queue.
3696	*
3697	* page queue lock must NOT be held
3698	* m->vmp_object must be locked
3699	*
3700	* NOTE: m->vmp_object could be locked "shared" only if we are called
3701	* from vm_fault() as part of a soft fault.
3702	*/
3703	static kern_return_t
3704	vm_fault_enter_prepare(
3705	vm_page_t m,
3706	pmap_t pmap,
3707	vm_map_offset_t vaddr,
3708	vm_prot_t *prot,
3709	vm_prot_t caller_prot,
3710	vm_map_size_t fault_page_size,
3711	vm_map_offset_t fault_phys_offset,
3712	boolean_t change_wiring,
3713	vm_prot_t fault_type,
3714	vm_object_fault_info_t fault_info,
3715	int *type_of_fault,
3716	bool *page_needs_data_sync)
3717	{
3718	kern_return_t kr;
3719	bool is_tainted = false;
3720	vm_object_t object;
3721	boolean_t cs_bypass = fault_info->cs_bypass;
3722
3723	object = VM_PAGE_OBJECT(m);
3724
3725	vm_object_lock_assert_held(object);
3726
3727	#if KASAN
3728	if (pmap == kernel_pmap) {
3729	kasan_notify_address(vaddr, PAGE_SIZE);
3730	}
3731	#endif
3732
3733	#if CODE_SIGNING_MONITOR
3734	if (csm_address_space_exempt(pmap) == KERN_SUCCESS) {
3735	cs_bypass = TRUE;
3736	}
3737	#endif
3738
3739	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
3740
3741	if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
3742	vm_object_lock_assert_exclusive(object);
3743	} else if ((fault_type & VM_PROT_WRITE) == `0` &&
3744	!change_wiring &&
3745	(!m->vmp_wpmapped
3746	#if VM_OBJECT_ACCESS_TRACKING
3747	\|\| object->access_tracking
3748	#endif /* VM_OBJECT_ACCESS_TRACKING */
3749	)) {
3750	/*
3751	* This is not a "write" fault, so we
3752	* might not have taken the object lock
3753	* exclusively and we might not be able
3754	* to update the "wpmapped" bit in
3755	* vm_fault_enter().
3756	* Let's just grant read access to
3757	* the page for now and we'll
3758	* soft-fault again if we need write
3759	* access later...
3760	*/
3761
3762	/ This had better not be a JIT page. /
3763	if (pmap_has_prot_policy(pmap, translated_allow_execute: fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot: *prot)) {
3764	/*
3765	* This pmap enforces extra constraints for this set of
3766	* protections, so we can't modify them.
3767	*/
3768	if (!cs_bypass) {
3769	panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x !cs_bypass",
3770	__FUNCTION__, pmap, (uint64_t)vaddr,
3771	*prot, fault_info->pmap_options);
3772	}
3773	} else {
3774	*prot &= ~VM_PROT_WRITE;
3775	}
3776	}
3777	if (m->vmp_pmapped == FALSE) {
3778	if (m->vmp_clustered) {
3779	if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
3780	/*
3781	* found it in the cache, but this
3782	* is the first fault-in of the page (m->vmp_pmapped == FALSE)
3783	* so it must have come in as part of
3784	* a cluster... account 1 pagein against it
3785	*/
3786	if (object->internal) {
3787	*type_of_fault = DBG_PAGEIND_FAULT;
3788	} else {
3789	*type_of_fault = DBG_PAGEINV_FAULT;
3790	}
3791
3792	VM_PAGE_COUNT_AS_PAGEIN(m);
3793	}
3794	VM_PAGE_CONSUME_CLUSTERED(m);
3795	}
3796	}
3797
3798	if (*type_of_fault != DBG_COW_FAULT) {
3799	DTRACE_VM2(as_fault, int, `1`, (uint64_t *), NULL);
3800
3801	if (pmap == kernel_pmap) {
3802	DTRACE_VM2(kernel_asflt, int, `1`, (uint64_t *), NULL);
3803	}
3804	}
3805
3806	kr = vm_fault_validate_cs(cs_bypass, object, m, pmap, vaddr,
3807	prot: *prot, caller_prot, fault_page_size, fault_phys_offset,
3808	fault_info, must_disconnect: &is_tainted);
3809	if (kr == KERN_SUCCESS) {
3810	/*
3811	* We either have a good page, or a tainted page that has been accepted by the process.
3812	* In both cases the page will be entered into the pmap.
3813	*/
3814	page_needs_data_sync = vm_fault_enter_set_mapped(object, m, prot: prot, fault_type);
3815	if ((fault_type & VM_PROT_WRITE) && is_tainted) {
3816	/*
3817	* This page is tainted but we're inserting it anyways.
3818	* Since it's writeable, we need to disconnect it from other pmaps
3819	* now so those processes can take note.
3820	*/
3821
3822	/*
3823	* We can only get here
3824	* because of the CSE logic
3825	*/
3826	assert(pmap_get_vm_map_cs_enforced(pmap));
3827	pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m));
3828	/*
3829	* If we are faulting for a write, we can clear
3830	* the execute bit - that will ensure the page is
3831	* checked again before being executable, which
3832	* protects against a map switch.
3833	* This only happens the first time the page
3834	* gets tainted, so we won't get stuck here
3835	* to make an already writeable page executable.
3836	*/
3837	if (!cs_bypass) {
3838	if (pmap_has_prot_policy(pmap, translated_allow_execute: fault_info->pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot: *prot)) {
3839	/*
3840	* This pmap enforces extra constraints
3841	* for this set of protections, so we
3842	* can't change the protections.
3843	*/
3844	panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
3845	__FUNCTION__, pmap,
3846	(uint64_t)vaddr, *prot,
3847	fault_info->pmap_options);
3848	}
3849	*prot &= ~VM_PROT_EXECUTE;
3850	}
3851	}
3852	assert(VM_PAGE_OBJECT(m) == object);
3853
3854	#if VM_OBJECT_ACCESS_TRACKING
3855	if (object->access_tracking) {
3856	DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3857	if (fault_type & VM_PROT_WRITE) {
3858	object->access_tracking_writes++;
3859	vm_object_access_tracking_writes++;
3860	} else {
3861	object->access_tracking_reads++;
3862	vm_object_access_tracking_reads++;
3863	}
3864	}
3865	#endif /* VM_OBJECT_ACCESS_TRACKING */
3866	}
3867
3868	return kr;
3869	}
3870
3871	/*
3872	* page queue lock must NOT be held
3873	* m->vmp_object must be locked
3874	*
3875	* NOTE: m->vmp_object could be locked "shared" only if we are called
3876	* from vm_fault() as part of a soft fault. If so, we must be
3877	* careful not to modify the VM object in any way that is not
3878	* legal under a shared lock...
3879	*/
3880	kern_return_t
3881	vm_fault_enter(
3882	vm_page_t m,
3883	pmap_t pmap,
3884	vm_map_offset_t vaddr,
3885	vm_map_size_t fault_page_size,
3886	vm_map_offset_t fault_phys_offset,
3887	vm_prot_t prot,
3888	vm_prot_t caller_prot,
3889	boolean_t wired,
3890	boolean_t change_wiring,
3891	vm_tag_t wire_tag,
3892	vm_object_fault_info_t fault_info,
3893	boolean_t *need_retry,
3894	int *type_of_fault,
3895	uint8_t *object_lock_type)
3896	{
3897	kern_return_t kr;
3898	vm_object_t object;
3899	bool page_needs_data_sync;
3900	vm_prot_t fault_type;
3901	int pmap_options = fault_info->pmap_options;
3902
3903	if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3904	assert(m->vmp_fictitious);
3905	return KERN_SUCCESS;
3906	}
3907
3908	fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
3909
3910	assertf(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL, "m=%p", m);
3911	kr = vm_fault_enter_prepare(m, pmap, vaddr, prot: &prot, caller_prot,
3912	fault_page_size, fault_phys_offset, change_wiring, fault_type,
3913	fault_info, type_of_fault, page_needs_data_sync: &page_needs_data_sync);
3914	object = VM_PAGE_OBJECT(m);
3915
3916	vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, no_cache: fault_info->no_cache, type_of_fault, kr);
3917
3918	if (kr == KERN_SUCCESS) {
3919	if (page_needs_data_sync) {
3920	pmap_sync_page_data_phys(pa: VM_PAGE_GET_PHYS_PAGE(m));
3921	}
3922
3923	if (fault_info->fi_xnu_user_debug && !object->code_signed) {
3924	pmap_options \|= PMAP_OPTIONS_XNU_USER_DEBUG;
3925	}
3926
3927
3928	kr = vm_fault_pmap_enter_with_object_lock(object, pmap, vaddr,
3929	fault_page_size, fault_phys_offset, m,
3930	prot: &prot, caller_prot, fault_type, wired, pmap_options, need_retry, object_lock_type);
3931	}
3932
3933	return kr;
3934	}
3935
3936	void
3937	vm_pre_fault(vm_map_offset_t vaddr, vm_prot_t prot)
3938	{
3939	if (pmap_find_phys(current_map()->pmap, va: vaddr) == `0`) {
3940	vm_fault(current_map(), / map /
3941	vaddr, / vaddr /
3942	fault_type: prot, / fault_type /
3943	FALSE, / change_wiring /
3944	VM_KERN_MEMORY_NONE, / tag - not wiring /
3945	THREAD_UNINT, / interruptible /
3946	NULL, / caller_pmap /
3947	pmap_addr: `0` / caller_pmap_addr /);
3948	}
3949	}
3950
3951
3952	/*
3953	* Routine: vm_fault
3954	* Purpose:
3955	* Handle page faults, including pseudo-faults
3956	* used to change the wiring status of pages.
3957	* Returns:
3958	* Explicit continuations have been removed.
3959	* Implementation:
3960	* vm_fault and vm_fault_page save mucho state
3961	* in the moral equivalent of a closure. The state
3962	* structure is allocated when first entering vm_fault
3963	* and deallocated when leaving vm_fault.
3964	*/
3965
/* Used by vm_fault_complete() to tag its trace events with the caller's pid. */
extern uint64_t get_current_unique_pid(void);

/*
 * NOTE(review): presumably statistics about object collapse attempts made
 * from the fault path; nothing in this part of the file updates them --
 * confirm against the rest of the file.
 */
unsigned long vm_fault_collapse_total = 0;
unsigned long vm_fault_collapse_skipped = 0;
3970
3971
3972	kern_return_t
3973	vm_fault_external(
3974	vm_map_t map,
3975	vm_map_offset_t vaddr,
3976	vm_prot_t fault_type,
3977	boolean_t change_wiring,
3978	int interruptible,
3979	pmap_t caller_pmap,
3980	vm_map_offset_t caller_pmap_addr)
3981	{
3982	return vm_fault_internal(map, vaddr, caller_prot: fault_type, change_wiring,
3983	wire_tag: change_wiring ? vm_tag_bt() : VM_KERN_MEMORY_NONE,
3984	interruptible, pmap: caller_pmap, pmap_addr: caller_pmap_addr,
3985	NULL);
3986	}
3987
3988	kern_return_t
3989	vm_fault(
3990	vm_map_t map,
3991	vm_map_offset_t vaddr,
3992	vm_prot_t fault_type,
3993	boolean_t change_wiring,
3994	vm_tag_t wire_tag, / if wiring must pass tag != VM_KERN_MEMORY_NONE /
3995	int interruptible,
3996	pmap_t caller_pmap,
3997	vm_map_offset_t caller_pmap_addr)
3998	{
3999	return vm_fault_internal(map, vaddr, caller_prot: fault_type, change_wiring, wire_tag,
4000	interruptible, pmap: caller_pmap, pmap_addr: caller_pmap_addr,
4001	NULL);
4002	}
4003
4004	static boolean_t
4005	current_proc_is_privileged(void)
4006	{
4007	return csproc_get_platform_binary(current_proc());
4008	}
4009
/*
 * Number of faults turned into copy-on-read: bumped by vm_fault_internal()
 * each time it restarts a fault with "need_copy" set because a privileged
 * process reached a page of an external, untrusted, non-code-signed object
 * further down the shadow chain.
 */
uint64_t vm_copied_on_read = 0;
4011
4012	/*
4013	* Cleanup after a vm_fault_enter.
4014	* At this point, the fault should either have failed (kr != KERN_SUCCESS)
4015	* or the page should be in the pmap and on the correct paging queue.
4016	*
4017	* Precondition:
4018	* map must be locked shared.
4019	* m_object must be locked.
4020	* If top_object != VM_OBJECT_NULL, it must be locked.
4021	* real_map must be locked.
4022	*
4023	* Postcondition:
4024	* map will be unlocked
4025	* m_object will be unlocked
4026	* top_object will be unlocked
4027	* If real_map != map, it will be unlocked
4028	*/
4029	static void
4030	vm_fault_complete(
4031	vm_map_t map,
4032	vm_map_t real_map,
4033	vm_object_t object,
4034	vm_object_t m_object,
4035	vm_page_t m,
4036	vm_map_offset_t offset,
4037	vm_map_offset_t trace_real_vaddr,
4038	vm_object_fault_info_t fault_info,
4039	vm_prot_t caller_prot,
4040	#if CONFIG_DTRACE
4041	vm_map_offset_t real_vaddr,
4042	#else
4043	__unused vm_map_offset_t real_vaddr,
4044	#endif /* CONFIG_DTRACE */
4045	int type_of_fault,
4046	boolean_t need_retry,
4047	kern_return_t kr,
4048	ppnum_t *physpage_p,
4049	vm_prot_t prot,
4050	vm_object_t top_object,
4051	boolean_t need_collapse,
4052	vm_map_offset_t cur_offset,
4053	vm_prot_t fault_type,
4054	vm_object_t *written_on_object,
4055	memory_object_t *written_on_pager,
4056	vm_object_offset_t *written_on_offset)
4057	{
4058	int event_code = `0`;
4059	vm_map_lock_assert_shared(map);
4060	vm_object_lock_assert_held(m_object);
4061	if (top_object != VM_OBJECT_NULL) {
4062	vm_object_lock_assert_held(top_object);
4063	}
4064	vm_map_lock_assert_held(real_map);
4065
4066	if (m_object->internal) {
4067	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4068	} else if (m_object->object_is_shared_cache) {
4069	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4070	} else {
4071	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4072	}
4073	KDBG_RELEASE(event_code \| DBG_FUNC_NONE, trace_real_vaddr, (fault_info->user_tag << `16`) \| (caller_prot << `8`) \| type_of_fault, m->vmp_offset, get_current_unique_pid());
4074	if (need_retry == FALSE) {
4075	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_FAST), get_current_unique_pid());
4076	}
4077	DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info->user_tag);
4078	if (kr == KERN_SUCCESS &&
4079	physpage_p != NULL) {
4080	/ for vm_map_wire_and_extract() /
4081	*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4082	if (prot & VM_PROT_WRITE) {
4083	vm_object_lock_assert_exclusive(m_object);
4084	m->vmp_dirty = TRUE;
4085	}
4086	}
4087
4088	if (top_object != VM_OBJECT_NULL) {
4089	/*
4090	* It's safe to drop the top object
4091	* now that we've done our
4092	* vm_fault_enter(). Any other fault
4093	* in progress for that virtual
4094	* address will either find our page
4095	* and translation or put in a new page
4096	* and translation.
4097	*/
4098	vm_object_unlock(top_object);
4099	top_object = VM_OBJECT_NULL;
4100	}
4101
4102	if (need_collapse == TRUE) {
4103	vm_object_collapse(object, vm_object_trunc_page(offset), TRUE);
4104	}
4105
4106	if (need_retry == FALSE &&
4107	(type_of_fault == DBG_PAGEIND_FAULT \|\| type_of_fault == DBG_PAGEINV_FAULT \|\| type_of_fault == DBG_CACHE_HIT_FAULT)) {
4108	/*
4109	* evaluate access pattern and update state
4110	* vm_fault_deactivate_behind depends on the
4111	* state being up to date
4112	*/
4113	vm_fault_is_sequential(object: m_object, offset: cur_offset, behavior: fault_info->behavior);
4114
4115	vm_fault_deactivate_behind(object: m_object, offset: cur_offset, behavior: fault_info->behavior);
4116	}
4117	/*
4118	* That's it, clean up and return.
4119	*/
4120	if (m->vmp_busy) {
4121	vm_object_lock_assert_exclusive(m_object);
4122	PAGE_WAKEUP_DONE(m);
4123	}
4124
4125	if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
4126	vm_object_paging_begin(m_object);
4127
4128	assert(*written_on_object == VM_OBJECT_NULL);
4129	*written_on_object = m_object;
4130	*written_on_pager = m_object->pager;
4131	*written_on_offset = m_object->paging_offset + m->vmp_offset;
4132	}
4133	vm_object_unlock(object);
4134
4135	vm_map_unlock_read(map);
4136	if (real_map != map) {
4137	vm_map_unlock(real_map);
4138	}
4139	}
4140
4141	static inline int
4142	vm_fault_type_for_tracing(boolean_t need_copy_on_read, int type_of_fault)
4143	{
4144	if (need_copy_on_read && type_of_fault == DBG_COW_FAULT) {
4145	return DBG_COR_FAULT;
4146	}
4147	return type_of_fault;
4148	}
4149
/*
 * Statistics for the "resilient media" recovery done by vm_fault_internal().
 *
 *	retry	bumped each time a fault is restarted in resilient-media
 *		retry mode;
 *	proceed	bumped when, on such a retry, the mapping still points at the
 *		object/offset that failed and is still "resilient_media", so
 *		recovery goes ahead;
 *	abort1	bumped when that check fails and recovery is abandoned.
 *
 * NOTE(review): "initiate", "release" and "abort2" are updated outside the
 * part of the file reviewed here; their exact meaning should be confirmed
 * there.
 */
uint64_t vm_fault_resilient_media_initiate = 0;
uint64_t vm_fault_resilient_media_retry = 0;
uint64_t vm_fault_resilient_media_proceed = 0;
uint64_t vm_fault_resilient_media_release = 0;
uint64_t vm_fault_resilient_media_abort1 = 0;
uint64_t vm_fault_resilient_media_abort2 = 0;

#if MACH_ASSERT
/*
 * Error injection for the recovery path (MACH_ASSERT kernels only).
 *
 * When inject_error1_rate is non-zero, vm_fault_internal() increments
 * inject_error1 on each recovery check that would otherwise succeed and
 * forces the "abort1" path whenever the counter is a multiple of the rate.
 *
 * NOTE(review): the error2/error3 pairs presumably work the same way at
 * other injection points -- confirm where they are used.
 */
int vm_fault_resilient_media_inject_error1_rate = 0;
int vm_fault_resilient_media_inject_error1 = 0;
int vm_fault_resilient_media_inject_error2_rate = 0;
int vm_fault_resilient_media_inject_error2 = 0;
int vm_fault_resilient_media_inject_error3_rate = 0;
int vm_fault_resilient_media_inject_error3 = 0;
#endif /* MACH_ASSERT */
4165
4166	kern_return_t
4167	vm_fault_internal(
4168	vm_map_t map,
4169	vm_map_offset_t vaddr,
4170	vm_prot_t caller_prot,
4171	boolean_t change_wiring,
4172	vm_tag_t wire_tag, / if wiring must pass tag != VM_KERN_MEMORY_NONE /
4173	int interruptible,
4174	pmap_t caller_pmap,
4175	vm_map_offset_t caller_pmap_addr,
4176	ppnum_t *physpage_p)
4177	{
4178	vm_map_version_t version; / Map version for verificiation /
4179	boolean_t wired; / Should mapping be wired down? /
4180	vm_object_t object; / Top-level object /
4181	vm_object_offset_t offset; / Top-level offset /
4182	vm_prot_t prot; / Protection for mapping /
4183	vm_object_t old_copy_object; / Saved copy object /
4184	uint32_t old_copy_version;
4185	vm_page_t result_page; / Result of vm_fault_page /
4186	vm_page_t top_page; / Placeholder page /
4187	kern_return_t kr;
4188
4189	vm_page_t m; / Fast access to result_page /
4190	kern_return_t error_code;
4191	vm_object_t cur_object;
4192	vm_object_t m_object = NULL;
4193	vm_object_offset_t cur_offset;
4194	vm_page_t cur_m;
4195	vm_object_t new_object;
4196	int type_of_fault;
4197	pmap_t pmap;
4198	wait_interrupt_t interruptible_state;
4199	vm_map_t real_map = map;
4200	vm_map_t original_map = map;
4201	bool object_locks_dropped = FALSE;
4202	vm_prot_t fault_type;
4203	vm_prot_t original_fault_type;
4204	struct vm_object_fault_info fault_info = {};
4205	bool need_collapse = FALSE;
4206	boolean_t need_retry = FALSE;
4207	boolean_t *need_retry_ptr = NULL;
4208	uint8_t object_lock_type = `0`;
4209	uint8_t cur_object_lock_type;
4210	vm_object_t top_object = VM_OBJECT_NULL;
4211	vm_object_t written_on_object = VM_OBJECT_NULL;
4212	memory_object_t written_on_pager = NULL;
4213	vm_object_offset_t written_on_offset = `0`;
4214	int throttle_delay;
4215	int compressed_count_delta;
4216	uint8_t grab_options;
4217	bool need_copy;
4218	bool need_copy_on_read;
4219	vm_map_offset_t trace_vaddr;
4220	vm_map_offset_t trace_real_vaddr;
4221	vm_map_size_t fault_page_size;
4222	vm_map_size_t fault_page_mask;
4223	int fault_page_shift;
4224	vm_map_offset_t fault_phys_offset;
4225	vm_map_offset_t real_vaddr;
4226	bool resilient_media_retry = false;
4227	bool resilient_media_ref_transfer = false;
4228	vm_object_t resilient_media_object = VM_OBJECT_NULL;
4229	vm_object_offset_t resilient_media_offset = (vm_object_offset_t)-`1`;
4230	bool page_needs_data_sync = false;
4231	/*
4232	* Was the VM object contended when vm_map_lookup_and_lock_object locked it?
4233	* If so, the zero fill path will drop the lock
4234	* NB: Ideally we would always drop the lock rather than rely on
4235	* this heuristic, but vm_object_unlock currently takes > 30 cycles.
4236	*/
4237	bool object_is_contended = false;
4238
4239
4240	real_vaddr = vaddr;
4241	trace_real_vaddr = vaddr;
4242
4243	/*
4244	* Some (kernel) submaps are marked with "should never fault".
4245	*
4246	* We do this for two reasons:
4247	* - PGZ which is inside the zone map range can't go down the normal
4248	* lookup path (vm_map_lookup_entry() would panic).
4249	*
4250	* - we want for guard pages to not have to use fictitious pages at all
4251	* to prevent from ZFOD pages to be made.
4252	*
4253	* We also want capture the fault address easily so that the zone
4254	* allocator might present an enhanced panic log.
4255	*/
4256	if (map->never_faults \|\| (pgz_owned(vaddr) && map->pmap == kernel_pmap)) {
4257	assert(map->pmap == kernel_pmap);
4258	return KERN_INVALID_ADDRESS;
4259	}
4260
4261	if (VM_MAP_PAGE_SIZE(original_map) < PAGE_SIZE) {
4262	fault_phys_offset = (vm_map_offset_t)-`1`;
4263	fault_page_size = VM_MAP_PAGE_SIZE(original_map);
4264	fault_page_mask = VM_MAP_PAGE_MASK(original_map);
4265	fault_page_shift = VM_MAP_PAGE_SHIFT(map: original_map);
4266	if (fault_page_size < PAGE_SIZE) {
4267	DEBUG4K_FAULT("map %p vaddr 0x%llx caller_prot 0x%x\n", map, (uint64_t)trace_real_vaddr, caller_prot);
4268	vaddr = vm_map_trunc_page(vaddr, fault_page_mask);
4269	}
4270	} else {
4271	fault_phys_offset = `0`;
4272	fault_page_size = PAGE_SIZE;
4273	fault_page_mask = PAGE_MASK;
4274	fault_page_shift = PAGE_SHIFT;
4275	vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
4276	}
4277
4278	if (map == kernel_map) {
4279	trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
4280	trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
4281	} else {
4282	trace_vaddr = vaddr;
4283	}
4284
4285	KDBG_RELEASE(
4286	(MACHDBG_CODE(DBG_MACH_VM, `2`)) \| DBG_FUNC_START,
4287	((uint64_t)trace_vaddr >> `32`),
4288	trace_vaddr,
4289	(map == kernel_map));
4290
4291	if (get_preemption_level() != `0`) {
4292	KDBG_RELEASE(
4293	(MACHDBG_CODE(DBG_MACH_VM, `2`)) \| DBG_FUNC_END,
4294	((uint64_t)trace_vaddr >> `32`),
4295	trace_vaddr,
4296	KERN_FAILURE);
4297
4298	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_NONZERO_PREEMPTION_LEVEL), arg: `0` / arg /);
4299	return KERN_FAILURE;
4300	}
4301
4302	thread_t cthread = current_thread();
4303	bool rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
4304	uint64_t fstart = `0`;
4305
4306	if (rtfault) {
4307	fstart = mach_continuous_time();
4308	}
4309
4310	interruptible_state = thread_interrupt_level(interruptible);
4311
4312	fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
4313
4314	counter_inc(&vm_statistics_faults);
4315	counter_inc(&current_task()->faults);
4316	original_fault_type = fault_type;
4317
4318	need_copy = FALSE;
4319	if (fault_type & VM_PROT_WRITE) {
4320	need_copy = TRUE;
4321	}
4322
4323	if (need_copy \|\| change_wiring) {
4324	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4325	} else {
4326	object_lock_type = OBJECT_LOCK_SHARED;
4327	}
4328
4329	cur_object_lock_type = OBJECT_LOCK_SHARED;
4330
4331	if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
4332	if (compressor_map) {
4333	if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
4334	panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void ) vaddr, caller_prot, (void* ) vm_map_min(compressor_map), (void* *) vm_map_max(compressor_map));
4335	}
4336	}
4337	}
4338	RetryFault:
4339	assert(written_on_object == VM_OBJECT_NULL);
4340
4341	/*
4342	* assume we will hit a page in the cache
4343	* otherwise, explicitly override with
4344	* the real fault type once we determine it
4345	*/
4346	type_of_fault = DBG_CACHE_HIT_FAULT;
4347
4348	/*
4349	* Find the backing store object and offset into
4350	* it to begin the search.
4351	*/
4352	fault_type = original_fault_type;
4353	map = original_map;
4354	vm_map_lock_read(map);
4355
4356	if (resilient_media_retry) {
4357	/*
4358	* If we have to insert a fake zero-filled page to hide
4359	* a media failure to provide the real page, we need to
4360	* resolve any pending copy-on-write on this mapping.
4361	* VM_PROT_COPY tells vm_map_lookup_and_lock_object() to deal
4362	* with that even if this is not a "write" fault.
4363	*/
4364	need_copy = TRUE;
4365	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4366	vm_fault_resilient_media_retry++;
4367	}
4368
4369	kr = vm_map_lookup_and_lock_object(var_map: &map, vaddr,
4370	fault_type: (fault_type \| (need_copy ? VM_PROT_COPY : `0`)),
4371	object_lock_type, out_version: &version,
4372	object: &object, offset: &offset, out_prot: &prot, wired: &wired,
4373	fault_info: &fault_info,
4374	real_map: &real_map,
4375	contended: &object_is_contended);
4376	object_is_contended = false; / avoid unsafe optimization /
4377
4378	if (kr != KERN_SUCCESS) {
4379	vm_map_unlock_read(map);
4380	/*
4381	* This can be seen in a crash report if indeed the
4382	* thread is crashing due to an invalid access in a non-existent
4383	* range.
4384	* Turning this OFF for now because it is noisy and not always fatal
4385	* eg prefaulting.
4386	*
4387	* if (kr == KERN_INVALID_ADDRESS) {
4388	* ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), 0);
4389	* }
4390	*/
4391	goto done;
4392	}
4393
4394
4395	pmap = real_map->pmap;
4396	fault_info.interruptible = interruptible;
4397	fault_info.stealth = FALSE;
4398	fault_info.io_sync = FALSE;
4399	fault_info.mark_zf_absent = FALSE;
4400	fault_info.batch_pmap_op = FALSE;
4401
4402	if (resilient_media_retry) {
4403	/*
4404	* We're retrying this fault after having detected a media
4405	* failure from a "resilient_media" mapping.
4406	* Check that the mapping is still pointing at the object
4407	* that just failed to provide a page.
4408	*/
4409	assert(resilient_media_object != VM_OBJECT_NULL);
4410	assert(resilient_media_offset != (vm_object_offset_t)-`1`);
4411	if ((object != VM_OBJECT_NULL &&
4412	object == resilient_media_object &&
4413	offset == resilient_media_offset &&
4414	fault_info.resilient_media)
4415	#if MACH_ASSERT
4416	&& (vm_fault_resilient_media_inject_error1_rate == `0` \|\|
4417	(++vm_fault_resilient_media_inject_error1 % vm_fault_resilient_media_inject_error1_rate) != `0`)
4418	#endif /* MACH_ASSERT */
4419	) {
4420	/*
4421	* This mapping still points at the same object
4422	* and is still "resilient_media": proceed in
4423	* "recovery-from-media-failure" mode, where we'll
4424	* insert a zero-filled page in the top object.
4425	*/
4426	// printf("RESILIENT_MEDIA %s:%d recovering for object %p offset 0x%llx\n", __FUNCTION__, __LINE__, object, offset);
4427	vm_fault_resilient_media_proceed++;
4428	} else {
4429	/ not recovering: reset state and retry fault /
4430	// printf("RESILIENT_MEDIA %s:%d no recovery resilient %d object %p/%p offset 0x%llx/0x%llx\n", __FUNCTION__, __LINE__, fault_info.resilient_media, object, resilient_media_object, offset, resilient_media_offset);
4431	vm_object_unlock(object);
4432	if (real_map != map) {
4433	vm_map_unlock(real_map);
4434	}
4435	vm_map_unlock_read(map);
4436	/ release our extra reference on failed object /
4437	// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
4438	vm_object_deallocate(object: resilient_media_object);
4439	resilient_media_object = VM_OBJECT_NULL;
4440	resilient_media_offset = (vm_object_offset_t)-`1`;
4441	resilient_media_retry = false;
4442	vm_fault_resilient_media_abort1++;
4443	goto RetryFault;
4444	}
4445	} else {
4446	assert(resilient_media_object == VM_OBJECT_NULL);
4447	resilient_media_offset = (vm_object_offset_t)-`1`;
4448	}
4449
4450	/*
4451	* If the page is wired, we must fault for the current protection
4452	* value, to avoid further faults.
4453	*/
4454	if (wired) {
4455	fault_type = prot \| VM_PROT_WRITE;
4456	}
4457	if (wired \|\| need_copy) {
4458	/*
4459	* since we're treating this fault as a 'write'
4460	* we must hold the top object lock exclusively
4461	*/
4462	if (object_lock_type == OBJECT_LOCK_SHARED) {
4463	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4464
4465	if (vm_object_lock_upgrade(object) == FALSE) {
4466	/*
4467	* couldn't upgrade, so explictly
4468	* take the lock exclusively
4469	*/
4470	vm_object_lock(object);
4471	}
4472	}
4473	}
4474
4475	#if VM_FAULT_CLASSIFY
4476	/*
4477	* Temporary data gathering code
4478	*/
4479	vm_fault_classify(object, offset, fault_type);
4480	#endif
4481	/*
4482	* Fast fault code. The basic idea is to do as much as
4483	* possible while holding the map lock and object locks.
4484	* Busy pages are not used until the object lock has to
4485	* be dropped to do something (copy, zero fill, pmap enter).
4486	* Similarly, paging references aren't acquired until that
4487	* point, and object references aren't used.
4488	*
4489	* If we can figure out what to do
4490	* (zero fill, copy on write, pmap enter) while holding
4491	* the locks, then it gets done. Otherwise, we give up,
4492	* and use the original fault path (which doesn't hold
4493	* the map lock, and relies on busy pages).
4494	* The give up cases include:
4495	* - Have to talk to pager.
4496	* - Page is busy, absent or in error.
4497	* - Pager has locked out desired access.
4498	* - Fault needs to be restarted.
4499	* - Have to push page into copy object.
4500	*
4501	* The code is an infinite loop that moves one level down
4502	* the shadow chain each time. cur_object and cur_offset
4503	* refer to the current object being examined. object and offset
4504	* are the original object from the map. The loop is at the
4505	* top level if and only if object and cur_object are the same.
4506	*
4507	* Invariants: Map lock is held throughout. Lock is held on
4508	* original object and cur_object (if different) when
4509	* continuing or exiting loop.
4510	*
4511	*/
4512
4513	#if defined(__arm64__)
4514	/*
4515	* Fail if reading an execute-only page in a
4516	* pmap that enforces execute-only protection.
4517	*/
4518	if (fault_type == VM_PROT_READ &&
4519	(prot & VM_PROT_EXECUTE) &&
4520	!(prot & VM_PROT_READ) &&
4521	pmap_enforces_execute_only(pmap)) {
4522	vm_object_unlock(object);
4523	vm_map_unlock_read(map);
4524	if (real_map != map) {
4525	vm_map_unlock(real_map);
4526	}
4527	kr = KERN_PROTECTION_FAILURE;
4528	goto done;
4529	}
4530	#endif
4531
4532	fault_phys_offset = (vm_map_offset_t)offset - vm_map_trunc_page((vm_map_offset_t)offset, PAGE_MASK);
4533
4534	/*
4535	* If this page is to be inserted in a copy delay object
4536	* for writing, and if the object has a copy, then the
4537	* copy delay strategy is implemented in the slow fault page.
4538	*/
4539	if ((object->copy_strategy == MEMORY_OBJECT_COPY_DELAY \|\|
4540	object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK) &&
4541	object->vo_copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE)) {
4542	goto handle_copy_delay;
4543	}
4544
4545	cur_object = object;
4546	cur_offset = offset;
4547
4548	grab_options = `0`;
4549	#if CONFIG_SECLUDED_MEMORY
4550	if (object->can_grab_secluded) {
4551	grab_options \|= VM_PAGE_GRAB_SECLUDED;
4552	}
4553	#endif /* CONFIG_SECLUDED_MEMORY */
4554
4555	while (TRUE) {
4556	if (!cur_object->pager_created &&
4557	cur_object->phys_contiguous) { / superpage /
4558	break;
4559	}
4560
4561	if (cur_object->blocked_access) {
4562	/*
4563	* Access to this VM object has been blocked.
4564	* Let the slow path handle it.
4565	*/
4566	break;
4567	}
4568
4569	m = vm_page_lookup(object: cur_object, vm_object_trunc_page(cur_offset));
4570	m_object = NULL;
4571
4572	if (m != VM_PAGE_NULL) {
4573	m_object = cur_object;
4574
4575	if (m->vmp_busy) {
4576	wait_result_t result;
4577
4578	/*
4579	* in order to do the PAGE_ASSERT_WAIT, we must
4580	* have object that 'm' belongs to locked exclusively
4581	*/
4582	if (object != cur_object) {
4583	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4584	cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4585
4586	if (vm_object_lock_upgrade(cur_object) == FALSE) {
4587	/*
4588	* couldn't upgrade so go do a full retry
4589	* immediately since we can no longer be
4590	* certain about cur_object (since we
4591	* don't hold a reference on it)...
4592	* first drop the top object lock
4593	*/
4594	vm_object_unlock(object);
4595
4596	vm_map_unlock_read(map);
4597	if (real_map != map) {
4598	vm_map_unlock(real_map);
4599	}
4600
4601	goto RetryFault;
4602	}
4603	}
4604	} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4605	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4606
4607	if (vm_object_lock_upgrade(object) == FALSE) {
4608	/*
4609	* couldn't upgrade, so explictly take the lock
4610	* exclusively and go relookup the page since we
4611	* will have dropped the object lock and
4612	* a different thread could have inserted
4613	* a page at this offset
4614	* no need for a full retry since we're
4615	* at the top level of the object chain
4616	*/
4617	vm_object_lock(object);
4618
4619	continue;
4620	}
4621	}
4622	if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
4623	/*
4624	* m->vmp_busy == TRUE and the object is locked exclusively
4625	* if m->pageout_queue == TRUE after we acquire the
4626	* queues lock, we are guaranteed that it is stable on
4627	* the pageout queue and therefore reclaimable
4628	*
4629	* NOTE: this is only true for the internal pageout queue
4630	* in the compressor world
4631	*/
4632	assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
4633
4634	vm_page_lock_queues();
4635
4636	if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
4637	vm_pageout_throttle_up(page: m);
4638	vm_page_unlock_queues();
4639
4640	PAGE_WAKEUP_DONE(m);
4641	goto reclaimed_from_pageout;
4642	}
4643	vm_page_unlock_queues();
4644	}
4645	if (object != cur_object) {
4646	vm_object_unlock(object);
4647	}
4648
4649	vm_map_unlock_read(map);
4650	if (real_map != map) {
4651	vm_map_unlock(real_map);
4652	}
4653
4654	result = PAGE_ASSERT_WAIT(m, interruptible);
4655
4656	vm_object_unlock(cur_object);
4657
4658	if (result == THREAD_WAITING) {
4659	result = thread_block(THREAD_CONTINUE_NULL);
4660	}
4661	if (result == THREAD_AWAKENED \|\| result == THREAD_RESTART) {
4662	goto RetryFault;
4663	}
4664
4665	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_BUSYPAGE_WAIT_INTERRUPTED), arg: `0` / arg /);
4666	kr = KERN_ABORTED;
4667	goto done;
4668	}
4669	reclaimed_from_pageout:
4670	if (m->vmp_laundry) {
4671	if (object != cur_object) {
4672	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4673	cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4674
4675	vm_object_unlock(object);
4676	vm_object_unlock(cur_object);
4677
4678	vm_map_unlock_read(map);
4679	if (real_map != map) {
4680	vm_map_unlock(real_map);
4681	}
4682
4683	goto RetryFault;
4684	}
4685	} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4686	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4687
4688	if (vm_object_lock_upgrade(object) == FALSE) {
4689	/*
4690	* couldn't upgrade, so explictly take the lock
4691	* exclusively and go relookup the page since we
4692	* will have dropped the object lock and
4693	* a different thread could have inserted
4694	* a page at this offset
4695	* no need for a full retry since we're
4696	* at the top level of the object chain
4697	*/
4698	vm_object_lock(object);
4699
4700	continue;
4701	}
4702	}
4703	vm_object_lock_assert_exclusive(VM_PAGE_OBJECT(m));
4704	vm_pageout_steal_laundry(page: m, FALSE);
4705	}
4706
4707
4708	if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
4709	/*
4710	* Guard page: let the slow path deal with it
4711	*/
4712	break;
4713	}
4714	if (m->vmp_unusual && (m->vmp_error \|\| m->vmp_restart \|\| m->vmp_private \|\| m->vmp_absent)) {
4715	/*
4716	* Unusual case... let the slow path deal with it
4717	*/
4718	break;
4719	}
4720	if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
4721	if (object != cur_object) {
4722	vm_object_unlock(object);
4723	}
4724	vm_map_unlock_read(map);
4725	if (real_map != map) {
4726	vm_map_unlock(real_map);
4727	}
4728	vm_object_unlock(cur_object);
4729	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), arg: `0` / arg /);
4730	kr = KERN_MEMORY_ERROR;
4731	goto done;
4732	}
4733	assert(m_object == VM_PAGE_OBJECT(m));
4734
4735	if (vm_fault_cs_need_validation(pmap: map->pmap, page: m, page_obj: m_object,
4736	PAGE_SIZE, fault_phys_offset: `0`) \|\|
4737	(physpage_p != NULL && (prot & VM_PROT_WRITE))) {
4738	upgrade_lock_and_retry:
4739	/*
4740	* We might need to validate this page
4741	* against its code signature, so we
4742	* want to hold the VM object exclusively.
4743	*/
4744	if (object != cur_object) {
4745	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4746	vm_object_unlock(object);
4747	vm_object_unlock(cur_object);
4748
4749	cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4750
4751	vm_map_unlock_read(map);
4752	if (real_map != map) {
4753	vm_map_unlock(real_map);
4754	}
4755
4756	goto RetryFault;
4757	}
4758	} else if (object_lock_type == OBJECT_LOCK_SHARED) {
4759	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4760
4761	if (vm_object_lock_upgrade(object) == FALSE) {
4762	/*
4763	* couldn't upgrade, so explictly take the lock
4764	* exclusively and go relookup the page since we
4765	* will have dropped the object lock and
4766	* a different thread could have inserted
4767	* a page at this offset
4768	* no need for a full retry since we're
4769	* at the top level of the object chain
4770	*/
4771	vm_object_lock(object);
4772
4773	continue;
4774	}
4775	}
4776	}
4777	/*
4778	* Two cases of map in faults:
4779	* - At top level w/o copy object.
4780	* - Read fault anywhere.
4781	* --> must disallow write.
4782	*/
4783
4784	if (object == cur_object && object->vo_copy == VM_OBJECT_NULL) {
4785	#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4786	if ((fault_type & VM_PROT_WRITE) && m->vmp_unmodified_ro) {
4787	assert(cur_object == VM_PAGE_OBJECT(m));
4788	assert(cur_object->internal);
4789	vm_object_lock_assert_exclusive(cur_object);
4790	vm_page_lockspin_queues();
4791	m->vmp_unmodified_ro = false;
4792	vm_page_unlock_queues();
4793	os_atomic_dec(&compressor_ro_uncompressed, relaxed);
4794	VM_COMPRESSOR_PAGER_STATE_CLR(cur_object, m->vmp_offset);
4795	}
4796	#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4797	goto FastPmapEnter;
4798	}
4799
4800	if (!need_copy &&
4801	!fault_info.no_copy_on_read &&
4802	cur_object != object &&
4803	!cur_object->internal &&
4804	!cur_object->pager_trusted &&
4805	vm_protect_privileged_from_untrusted &&
4806	!cur_object->code_signed &&
4807	current_proc_is_privileged()) {
4808	/*
4809	* We're faulting on a page in "object" and
4810	* went down the shadow chain to "cur_object"
4811	* to find out that "cur_object"'s pager
4812	* is not "trusted", i.e. we can not trust it
4813	* to always return the same contents.
4814	* Since the target is a "privileged" process,
4815	* let's treat this as a copy-on-read fault, as
4816	* if it was a copy-on-write fault.
4817	* Once "object" gets a copy of this page, it
4818	* won't have to rely on "cur_object" to
4819	* provide the contents again.
4820	*
4821	* This is done by setting "need_copy" and
4822	* retrying the fault from the top with the
4823	* appropriate locking.
4824	*
4825	* Special case: if the mapping is executable
4826	* and the untrusted object is code-signed and
4827	* the process is "cs_enforced", we do not
4828	* copy-on-read because that would break
4829	* code-signing enforcement expectations (an
4830	* executable page must belong to a code-signed
4831	* object) and we can rely on code-signing
4832	* to re-validate the page if it gets evicted
4833	* and paged back in.
4834	*/
4835	// printf("COPY-ON-READ %s:%d map %p va 0x%llx page %p object %p offset 0x%llx UNTRUSTED: need copy-on-read!\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, m, VM_PAGE_OBJECT(m), m->vmp_offset);
4836	vm_copied_on_read++;
4837	need_copy = TRUE;
4838
4839	vm_object_unlock(object);
4840	vm_object_unlock(cur_object);
4841	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4842	vm_map_unlock_read(map);
4843	if (real_map != map) {
4844	vm_map_unlock(real_map);
4845	}
4846	goto RetryFault;
4847	}
4848
4849	if (!(fault_type & VM_PROT_WRITE) && !need_copy) {
4850	if (pmap_has_prot_policy(pmap, translated_allow_execute: fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
4851	/*
4852	* For a protection that the pmap cares
4853	* about, we must hand over the full
4854	* set of protections (so that the pmap
4855	* layer can apply any desired policy).
4856	* This means that cs_bypass must be
4857	* set, as this can force us to pass
4858	* RWX.
4859	*/
4860	if (!fault_info.cs_bypass) {
4861	panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
4862	__FUNCTION__, pmap,
4863	(uint64_t)vaddr, prot,
4864	fault_info.pmap_options);
4865	}
4866	} else {
4867	prot &= ~VM_PROT_WRITE;
4868	}
4869
4870	if (object != cur_object) {
4871	/*
4872	* We still need to hold the top object
4873	* lock here to prevent a race between
4874	* a read fault (taking only "shared"
4875	* locks) and a write fault (taking
4876	* an "exclusive" lock on the top
4877	* object).
4878	* Otherwise, as soon as we release the
4879	* top lock, the write fault could
4880	* proceed and actually complete before
4881	* the read fault, and the copied page's
4882	* translation could then be overwritten
4883	* by the read fault's translation for
4884	* the original page.
4885	*
4886	* Let's just record what the top object
4887	* is and we'll release it later.
4888	*/
4889	top_object = object;
4890
4891	/*
4892	* switch to the object that has the new page
4893	*/
4894	object = cur_object;
4895	object_lock_type = cur_object_lock_type;
4896	}
4897	FastPmapEnter:
4898	assert(m_object == VM_PAGE_OBJECT(m));
4899
4900	/*
4901	* prepare for the pmap_enter...
4902	* object and map are both locked
4903	* m contains valid data
4904	* object == m->vmp_object
4905	* cur_object == NULL or it's been unlocked
4906	* no paging references on either object or cur_object
4907	*/
4908	if (top_object != VM_OBJECT_NULL \|\| object_lock_type != OBJECT_LOCK_EXCLUSIVE) {
4909	need_retry_ptr = &need_retry;
4910	} else {
4911	need_retry_ptr = NULL;
4912	}
4913
4914	if (fault_page_size < PAGE_SIZE) {
4915	DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx caller pmap %p va 0x%llx pa 0x%llx (0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, caller_pmap, (uint64_t)caller_pmap_addr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
4916	assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
4917	fault_phys_offset < PAGE_SIZE),
4918	"0x%llx\n", (uint64_t)fault_phys_offset);
4919	} else {
4920	assertf(fault_phys_offset == `0`,
4921	"0x%llx\n", (uint64_t)fault_phys_offset);
4922	}
4923
4924	if (__improbable(rtfault &&
4925	!m->vmp_realtime &&
4926	vm_pageout_protect_realtime)) {
4927	vm_page_lock_queues();
4928	if (!m->vmp_realtime) {
4929	m->vmp_realtime = true;
4930	vm_page_realtime_count++;
4931	}
4932	vm_page_unlock_queues();
4933	}
4934	assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p object=%p", m, m_object, object);
4935	assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
4936	if (caller_pmap) {
4937	kr = vm_fault_enter(m,
4938	pmap: caller_pmap,
4939	vaddr: caller_pmap_addr,
4940	fault_page_size,
4941	fault_phys_offset,
4942	prot,
4943	caller_prot,
4944	wired,
4945	change_wiring,
4946	wire_tag,
4947	fault_info: &fault_info,
4948	need_retry: need_retry_ptr,
4949	type_of_fault: &type_of_fault,
4950	object_lock_type: &object_lock_type);
4951	} else {
4952	kr = vm_fault_enter(m,
4953	pmap,
4954	vaddr,
4955	fault_page_size,
4956	fault_phys_offset,
4957	prot,
4958	caller_prot,
4959	wired,
4960	change_wiring,
4961	wire_tag,
4962	fault_info: &fault_info,
4963	need_retry: need_retry_ptr,
4964	type_of_fault: &type_of_fault,
4965	object_lock_type: &object_lock_type);
4966	}
4967
4968	vm_fault_complete(
4969	map,
4970	real_map,
4971	object,
4972	m_object,
4973	m,
4974	offset,
4975	trace_real_vaddr,
4976	fault_info: &fault_info,
4977	caller_prot,
4978	real_vaddr,
4979	type_of_fault: vm_fault_type_for_tracing(need_copy_on_read, type_of_fault),
4980	need_retry,
4981	kr,
4982	physpage_p,
4983	prot,
4984	top_object,
4985	need_collapse,
4986	cur_offset,
4987	fault_type,
4988	written_on_object: &written_on_object,
4989	written_on_pager: &written_on_pager,
4990	written_on_offset: &written_on_offset);
4991	top_object = VM_OBJECT_NULL;
4992	if (need_retry == TRUE) {
4993	/*
4994	* vm_fault_enter couldn't complete the PMAP_ENTER...
4995	* at this point we don't hold any locks so it's safe
4996	* to ask the pmap layer to expand the page table to
4997	* accommodate this mapping... once expanded, we'll
4998	* re-drive the fault which should result in vm_fault_enter
4999	* being able to successfully enter the mapping this time around
5000	*/
5001	(void)pmap_enter_options(
5002	pmap, v: vaddr, pn: `0`, prot: `0`, fault_type: `0`, flags: `0`, wired: `0`,
5003	PMAP_OPTIONS_NOENTER, NULL, mapping_type: PMAP_MAPPING_TYPE_INFER);
5004
5005	need_retry = FALSE;
5006	goto RetryFault;
5007	}
5008	goto done;
5009	}
5010	/*
5011	* COPY ON WRITE FAULT
5012	*/
5013	assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
5014
5015	/*
5016	* If objects match, then
5017	* object->vo_copy must not be NULL (else control
5018	* would be in previous code block), and we
5019	* have a potential push into the copy object
5020	* with which we can't cope here.
5021	*/
5022	if (cur_object == object) {
5023	/*
5024	* must take the slow path to
5025	* deal with the copy push
5026	*/
5027	break;
5028	}
5029
5030	/*
5031	* This is now a shadow based copy on write
5032	* fault -- it requires a copy up the shadow
5033	* chain.
5034	*/
5035	assert(m_object == VM_PAGE_OBJECT(m));
5036
5037	if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
5038	vm_fault_cs_need_validation(NULL, page: m, page_obj: m_object,
5039	PAGE_SIZE, fault_phys_offset: `0`)) {
5040	goto upgrade_lock_and_retry;
5041	}
5042
5043	#if MACH_ASSERT
5044	if (resilient_media_retry &&
5045	vm_fault_resilient_media_inject_error2_rate != `0` &&
5046	(++vm_fault_resilient_media_inject_error2 % vm_fault_resilient_media_inject_error2_rate) == `0`) {
5047	/* inject an error */
5048	cur_m = m;
5049	m = VM_PAGE_NULL;
5050	m_object = VM_OBJECT_NULL;
5051	break;
5052	}
5053	#endif /* MACH_ASSERT */
5054	/*
5055	* Allocate a page in the original top level
5056	* object. Give up if allocate fails. Also
5057	* need to remember current page, as it's the
5058	* source of the copy.
5059	*
5060	* at this point we hold locks on both
5061	* object and cur_object... no need to take
5062	* paging refs or mark pages BUSY since
5063	* we don't drop either object lock until
5064	* the page has been copied and inserted
5065	*/
5066	cur_m = m;
5067	m = vm_page_grab_options(flags: grab_options);
5068	m_object = NULL;
5069
5070	if (m == VM_PAGE_NULL) {
5071	/*
5072	* no free page currently available...
5073	* must take the slow path
5074	*/
5075	break;
5076	}
5077
5078	/*
5079	* Now do the copy. Mark the source page busy...
5080	*
5081	* NOTE: This code holds the map lock across
5082	* the page copy.
5083	*/
5084	vm_page_copy(src_page: cur_m, dest_page: m);
5085	vm_page_insert(page: m, object, vm_object_trunc_page(offset));
5086	if (VM_MAP_PAGE_MASK(map) != PAGE_MASK) {
5087	DEBUG4K_FAULT("map %p vaddr 0x%llx page %p [%p 0x%llx] copied to %p [%p 0x%llx]\n", map, (uint64_t)vaddr, cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
5088	}
5089	m_object = object;
5090	SET_PAGE_DIRTY(m, FALSE);
5091
5092	/*
5093	* Now cope with the source page and object
5094	*/
5095	if (object->ref_count > `1` && cur_m->vmp_pmapped) {
5096	pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m: cur_m));
5097	} else if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
5098	/*
5099	* We've copied the full 16K page but we're
5100	* about to call vm_fault_enter() only for
5101	* the 4K chunk we're faulting on. The other
5102	* three 4K chunks in that page could still
5103	* be pmapped in this pmap.
5104	* Since the VM object layer thinks that the
5105	* entire page has been dealt with and the
5106	* original page might no longer be needed,
5107	* it might collapse/bypass the original VM
5108	* object and free its pages, which would be
5109	* bad (and would trigger pmap_verify_free()
5110	* assertions) if the other 4K chunks are still
5111	* pmapped.
5112	*/
5113	/*
5114	* XXX FBDP TODO4K: to be revisited
5115	* Technically, we need to pmap_disconnect()
5116	* only the target pmap's mappings for the 4K
5117	* chunks of this 16K VM page. If other pmaps
5118	* have PTEs on these chunks, that means that
5119	* the associated VM map must have a reference
5120	* on the VM object, so no need to worry about
5121	* those.
5122	* pmap_protect() for each 4K chunk would be
5123	* better but we'd have to check which chunks
5124	* are actually mapped before and after this
5125	* one.
5126	* A full-blown pmap_disconnect() is easier
5127	* for now but not efficient.
5128	*/
5129	DEBUG4K_FAULT("pmap_disconnect() page %p object %p offset 0x%llx phys 0x%x\n", cur_m, VM_PAGE_OBJECT(cur_m), cur_m->vmp_offset, VM_PAGE_GET_PHYS_PAGE(cur_m));
5130	pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m: cur_m));
5131	}
5132
5133	if (cur_m->vmp_clustered) {
5134	VM_PAGE_COUNT_AS_PAGEIN(cur_m);
5135	VM_PAGE_CONSUME_CLUSTERED(cur_m);
5136	vm_fault_is_sequential(object: cur_object, offset: cur_offset, behavior: fault_info.behavior);
5137	}
5138	need_collapse = TRUE;
5139
5140	if (!cur_object->internal &&
5141	cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
5142	/*
5143	* The object from which we've just
5144	* copied a page is most probably backed
5145	* by a vnode. We don't want to waste too
5146	* much time trying to collapse the VM objects
5147	* and create a bottleneck when several tasks
5148	* map the same file.
5149	*/
5150	if (cur_object->vo_copy == object) {
5151	/*
5152	* Shared mapping or no COW yet.
5153	* We can never collapse a copy
5154	* object into its backing object.
5155	*/
5156	need_collapse = FALSE;
5157	} else if (cur_object->vo_copy == object->shadow &&
5158	object->shadow->resident_page_count == `0`) {
5159	/*
5160	* Shared mapping after a COW occurred.
5161	*/
5162	need_collapse = FALSE;
5163	}
5164	}
5165	vm_object_unlock(cur_object);
5166
5167	if (need_collapse == FALSE) {
5168	vm_fault_collapse_skipped++;
5169	}
5170	vm_fault_collapse_total++;
5171
5172	type_of_fault = DBG_COW_FAULT;
5173	counter_inc(&vm_statistics_cow_faults);
5174	DTRACE_VM2(cow_fault, int, `1`, (uint64_t *), NULL);
5175	counter_inc(&current_task()->cow_faults);
5176
5177	goto FastPmapEnter;
5178	} else {
5179	/*
5180	* No page at cur_object, cur_offset... m == NULL
5181	*/
5182	if (cur_object->pager_created) {
5183	vm_external_state_t compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
5184
5185	if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
5186	int my_fault_type;
5187	vm_compressor_options_t c_flags = C_DONT_BLOCK;
5188	bool insert_cur_object = FALSE;
5189
5190	/*
5191	* May have to talk to a pager...
5192	* if so, take the slow path by
5193	* doing a 'break' from the while (TRUE) loop
5194	*
5195	* external_state will only be set to VM_EXTERNAL_STATE_EXISTS
5196	* if the compressor is active and the page exists there
5197	*/
5198	if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS) {
5199	break;
5200	}
5201
5202	if (map == kernel_map \|\| real_map == kernel_map) {
5203	/*
5204	* can't call into the compressor with the kernel_map
5205	* lock held, since the compressor may try to operate
5206	* on the kernel map in order to return an empty c_segment
5207	*/
5208	break;
5209	}
5210	if (object != cur_object) {
5211	if (fault_type & VM_PROT_WRITE) {
5212	c_flags \|= C_KEEP;
5213	} else {
5214	insert_cur_object = TRUE;
5215	}
5216	}
5217	if (insert_cur_object == TRUE) {
5218	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5219	cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5220
5221	if (vm_object_lock_upgrade(cur_object) == FALSE) {
5222	/*
5223	* couldn't upgrade so go do a full retry
5224	* immediately since we can no longer be
5225	* certain about cur_object (since we
5226	* don't hold a reference on it)...
5227	* first drop the top object lock
5228	*/
5229	vm_object_unlock(object);
5230
5231	vm_map_unlock_read(map);
5232	if (real_map != map) {
5233	vm_map_unlock(real_map);
5234	}
5235
5236	goto RetryFault;
5237	}
5238	}
5239	} else if (object_lock_type == OBJECT_LOCK_SHARED) {
5240	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5241
5242	if (object != cur_object) {
5243	/*
5244	* we can't go for the upgrade on the top
5245	* lock since the upgrade may block waiting
5246	* for readers to drain... since we hold
5247	* cur_object locked at this point, waiting
5248	* for the readers to drain would represent
5249	* a lock order inversion since the lock order
5250	* for objects is the reference order in the
5251	* shadow chain
5252	*/
5253	vm_object_unlock(object);
5254	vm_object_unlock(cur_object);
5255
5256	vm_map_unlock_read(map);
5257	if (real_map != map) {
5258	vm_map_unlock(real_map);
5259	}
5260
5261	goto RetryFault;
5262	}
5263	if (vm_object_lock_upgrade(object) == FALSE) {
5264	/*
5265	* couldn't upgrade, so explicitly take the lock
5266	* exclusively and go relookup the page since we
5267	* will have dropped the object lock and
5268	* a different thread could have inserted
5269	* a page at this offset
5270	* no need for a full retry since we're
5271	* at the top level of the object chain
5272	*/
5273	vm_object_lock(object);
5274
5275	continue;
5276	}
5277	}
5278	m = vm_page_grab_options(flags: grab_options);
5279	m_object = NULL;
5280
5281	if (m == VM_PAGE_NULL) {
5282	/*
5283	* no free page currently available...
5284	* must take the slow path
5285	*/
5286	break;
5287	}
5288
5289	/*
5290	* The object is and remains locked
5291	* so no need to take a
5292	* "paging_in_progress" reference.
5293	*/
5294	bool shared_lock;
5295	if ((object == cur_object &&
5296	object_lock_type == OBJECT_LOCK_EXCLUSIVE) \|\|
5297	(object != cur_object &&
5298	cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
5299	shared_lock = FALSE;
5300	} else {
5301	shared_lock = TRUE;
5302	}
5303
5304	kr = vm_compressor_pager_get(
5305	mem_obj: cur_object->pager,
5306	offset: (vm_object_trunc_page(cur_offset)
5307	+ cur_object->paging_offset),
5308	ppnum: VM_PAGE_GET_PHYS_PAGE(m),
5309	my_fault_type: &my_fault_type,
5310	flags: c_flags,
5311	compressed_count_delta_p: &compressed_count_delta);
5312
5313	vm_compressor_pager_count(
5314	mem_obj: cur_object->pager,
5315	compressed_count_delta,
5316	shared_lock,
5317	object: cur_object);
5318
5319	if (kr != KERN_SUCCESS) {
5320	vm_page_release(page: m, FALSE);
5321	m = VM_PAGE_NULL;
5322	}
5323	/*
5324	* If vm_compressor_pager_get() returns
5325	* KERN_MEMORY_FAILURE, then the
5326	* compressed data is permanently lost,
5327	* so return this error immediately.
5328	*/
5329	if (kr == KERN_MEMORY_FAILURE) {
5330	if (object != cur_object) {
5331	vm_object_unlock(cur_object);
5332	}
5333	vm_object_unlock(object);
5334	vm_map_unlock_read(map);
5335	if (real_map != map) {
5336	vm_map_unlock(real_map);
5337	}
5338
5339	goto done;
5340	} else if (kr != KERN_SUCCESS) {
5341	break;
5342	}
5343	m->vmp_dirty = TRUE;
5344	#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
5345	if ((fault_type & VM_PROT_WRITE) == `0`) {
5346	prot &= ~VM_PROT_WRITE;
5347	/*
5348	* The page, m, has yet to be inserted
5349	* into an object. So we are fine with
5350	* the object/cur_object lock being held
5351	* shared.
5352	*/
5353	vm_page_lockspin_queues();
5354	m->vmp_unmodified_ro = true;
5355	vm_page_unlock_queues();
5356	os_atomic_inc(&compressor_ro_uncompressed, relaxed);
5357	}
5358	#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
5359
5360	/*
5361	* If the object is purgeable, its
5362	* owner's purgeable ledgers will be
5363	* updated in vm_page_insert() but the
5364	* page was also accounted for in a
5365	* "compressed purgeable" ledger, so
5366	* update that now.
5367	*/
5368	if (object != cur_object &&
5369	!insert_cur_object) {
5370	/*
5371	* We're not going to insert
5372	* the decompressed page into
5373	* the object it came from.
5374	*
5375	* We're dealing with a
5376	* copy-on-write fault on
5377	* "object".
5378	* We're going to decompress
5379	* the page directly into the
5380	* target "object" while
5381	* keeping the compressed
5382	* page for "cur_object", so
5383	* no ledger update in that
5384	* case.
5385	*/
5386	} else if (((cur_object->purgable ==
5387	VM_PURGABLE_DENY) &&
5388	(!cur_object->vo_ledger_tag)) \|\|
5389	(cur_object->vo_owner ==
5390	NULL)) {
5391	/*
5392	* "cur_object" is not purgeable
5393	* and is not ledger-tagged, or
5394	* there's no owner for it,
5395	* so no owner's ledgers to
5396	* update.
5397	*/
5398	} else {
5399	/*
5400	* One less compressed
5401	* purgeable/tagged page for
5402	* cur_object's owner.
5403	*/
5404	if (compressed_count_delta) {
5405	vm_object_owner_compressed_update(
5406	object: cur_object,
5407	delta: -`1`);
5408	}
5409	}
5410
5411	if (insert_cur_object) {
5412	vm_page_insert(page: m, object: cur_object, vm_object_trunc_page(cur_offset));
5413	m_object = cur_object;
5414	} else {
5415	vm_page_insert(page: m, object, vm_object_trunc_page(offset));
5416	m_object = object;
5417	}
5418
5419	if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
5420	/*
5421	* If the page is not cacheable,
5422	* we can't let its contents
5423	* linger in the data cache
5424	* after the decompression.
5425	*/
5426	pmap_sync_page_attributes_phys(pa: VM_PAGE_GET_PHYS_PAGE(m));
5427	}
5428
5429	type_of_fault = my_fault_type;
5430
5431	VM_STAT_DECOMPRESSIONS();
5432
5433	if (cur_object != object) {
5434	if (insert_cur_object) {
5435	top_object = object;
5436	/*
5437	* switch to the object that has the new page
5438	*/
5439	object = cur_object;
5440	object_lock_type = cur_object_lock_type;
5441	} else {
5442	vm_object_unlock(cur_object);
5443	cur_object = object;
5444	}
5445	}
5446	goto FastPmapEnter;
5447	}
5448	/*
5449	* existence map present and indicates
5450	* that the pager doesn't have this page
5451	*/
5452	}
5453	if (cur_object->shadow == VM_OBJECT_NULL \|\|
5454	resilient_media_retry) {
5455	/*
5456	* Zero fill fault. Page gets
5457	* inserted into the original object.
5458	*/
5459	if (cur_object->shadow_severed \|\|
5460	VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) \|\|
5461	cur_object == compressor_object \|\|
5462	is_kernel_object(cur_object)) {
5463	if (object != cur_object) {
5464	vm_object_unlock(cur_object);
5465	}
5466	vm_object_unlock(object);
5467
5468	vm_map_unlock_read(map);
5469	if (real_map != map) {
5470	vm_map_unlock(real_map);
5471	}
5472	if (VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object)) {
5473	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PURGEABLE_FAULT_ERROR), arg: `0` / arg /);
5474	}
5475
5476	if (cur_object->shadow_severed) {
5477	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_OBJECT_SHADOW_SEVERED), arg: `0` / arg /);
5478	}
5479
5480	kr = KERN_MEMORY_ERROR;
5481	goto done;
5482	}
5483	if (cur_object != object) {
5484	vm_object_unlock(cur_object);
5485
5486	cur_object = object;
5487	}
5488	if (object_lock_type == OBJECT_LOCK_SHARED) {
5489	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5490
5491	if (vm_object_lock_upgrade(object) == FALSE) {
5492	/*
5493	* couldn't upgrade so do a full retry on the fault
5494	* since we dropped the object lock which
5495	* could allow another thread to insert
5496	* a page at this offset
5497	*/
5498	vm_map_unlock_read(map);
5499	if (real_map != map) {
5500	vm_map_unlock(real_map);
5501	}
5502
5503	goto RetryFault;
5504	}
5505	}
5506	if (!object->internal) {
5507	panic("%s:%d should not zero-fill page at offset 0x%llx in external object %p", __FUNCTION__, __LINE__, (uint64_t)offset, object);
5508	}
5509	#if MACH_ASSERT
5510	if (resilient_media_retry &&
5511	vm_fault_resilient_media_inject_error3_rate != `0` &&
5512	(++vm_fault_resilient_media_inject_error3 % vm_fault_resilient_media_inject_error3_rate) == `0`) {
5513	/* inject an error */
5514	m_object = NULL;
5515	break;
5516	}
5517	#endif /* MACH_ASSERT */
5518	m = vm_page_alloc(object, vm_object_trunc_page(offset));
5519	m_object = NULL;
5520
5521	if (m == VM_PAGE_NULL) {
5522	/*
5523	* no free page currently available...
5524	* must take the slow path
5525	*/
5526	break;
5527	}
5528	m_object = object;
5529
5530	if ((prot & VM_PROT_WRITE) &&
5531	!(fault_type & VM_PROT_WRITE) &&
5532	object->vo_copy != VM_OBJECT_NULL) {
5533	/*
5534	* This is not a write fault and
5535	* we might have a copy-on-write
5536	* obligation to honor (copy object or
5537	* "needs_copy" map entry), so do not
5538	* give write access yet.
5539	* We'll need to catch the first write
5540	* to resolve the copy-on-write by
5541	* pushing this page to a copy object
5542	* or making a shadow object.
5543	*/
5544	if (pmap_has_prot_policy(pmap, translated_allow_execute: fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
5545	/*
5546	* This pmap enforces extra
5547	* constraints for this set of
5548	* protections, so we can't
5549	* change the protections.
5550	* We would expect code-signing
5551	* to be bypassed in this case.
5552	*/
5553	if (!fault_info.cs_bypass) {
5554	panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x",
5555	__FUNCTION__,
5556	pmap,
5557	(uint64_t)vaddr,
5558	prot,
5559	fault_info.pmap_options);
5560	}
5561	} else {
5562	prot &= ~VM_PROT_WRITE;
5563	}
5564	}
5565	assertf(!((fault_type & VM_PROT_WRITE) && object->vo_copy),
5566	"map %p va 0x%llx wrong path for write fault (fault_type 0x%x) on object %p with copy %p\n",
5567	map, (uint64_t)vaddr, fault_type, object, object->vo_copy);
5568
5569	vm_object_t saved_copy_object;
5570	uint32_t saved_copy_version;
5571	saved_copy_object = object->vo_copy;
5572	saved_copy_version = object->vo_copy_version;
5573
5574	/*
5575	* Zeroing the page and entering it into the pmap
5576	* represents a significant amount of the zero fill fault handler's work.
5577	*
5578	* To improve fault scalability, we'll drop the object lock, if it appears contended,
5579	* now that we've inserted the page into the vm object.
5580	* Before dropping the lock, we need to check protection bits and set the
5581	* mapped bits on the page. Then we can mark the page busy, drop the lock,
5582	* zero it, and do the pmap enter. We'll need to reacquire the lock
5583	* to clear the busy bit and wake up any waiters.
5584	*/
5585	vm_fault_cs_clear(m);
5586	m->vmp_pmapped = TRUE;
5587	if (map->no_zero_fill) {
5588	type_of_fault = DBG_NZF_PAGE_FAULT;
5589	} else {
5590	type_of_fault = DBG_ZERO_FILL_FAULT;
5591	}
5592	{
5593	pmap_t destination_pmap;
5594	vm_map_offset_t destination_pmap_vaddr;
5595	vm_prot_t enter_fault_type;
5596	if (caller_pmap) {
5597	destination_pmap = caller_pmap;
5598	destination_pmap_vaddr = caller_pmap_addr;
5599	} else {
5600	destination_pmap = pmap;
5601	destination_pmap_vaddr = vaddr;
5602	}
5603	if (change_wiring) {
5604	enter_fault_type = VM_PROT_NONE;
5605	} else {
5606	enter_fault_type = caller_prot;
5607	}
5608	assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
5609	kr = vm_fault_enter_prepare(m,
5610	pmap: destination_pmap,
5611	vaddr: destination_pmap_vaddr,
5612	prot: &prot,
5613	caller_prot,
5614	fault_page_size,
5615	fault_phys_offset,
5616	change_wiring,
5617	fault_type: enter_fault_type,
5618	fault_info: &fault_info,
5619	type_of_fault: &type_of_fault,
5620	page_needs_data_sync: &page_needs_data_sync);
5621	if (kr != KERN_SUCCESS) {
5622	goto zero_fill_cleanup;
5623	}
5624
5625	if (object_is_contended) {
5626	/*
5627	* At this point the page is in the vm object, but not on a paging queue.
5628	* Since it's accessible to another thread but its contents are invalid
5629	* (it hasn't been zeroed) mark it busy before dropping the object lock.
5630	*/
5631	m->vmp_busy = TRUE;
5632	vm_object_paging_begin(object); / keep object alive /
5633	vm_object_unlock(object);
5634	}
5635	if (type_of_fault == DBG_ZERO_FILL_FAULT) {
5636	/*
5637	* Now zero fill page...
5638	* the page is probably going to
5639	* be written soon, so don't bother
5640	* to clear the modified bit
5641	*
5642	* NOTE: This code holds the map
5643	* lock across the zero fill.
5644	*/
5645	vm_page_zero_fill(page: m);
5646	counter_inc(&vm_statistics_zero_fill_count);
5647	DTRACE_VM2(zfod, int, `1`, (uint64_t *), NULL);
5648	}
5649
5650	if (object_is_contended) {
5651	/*
5652	* It's not safe to do the pmap_enter() without holding
5653	* the object lock because its "vo_copy" could change.
5654	*/
5655	object_is_contended = false; / get out of that code path /
5656
5657	vm_object_lock(object);
5658	vm_object_paging_end(object);
5659	if (object->vo_copy != saved_copy_object \|\|
5660	object->vo_copy_version != saved_copy_version) {
5661	/*
5662	* The COPY_DELAY copy-on-write situation for
5663	* this VM object has changed while it was
5664	* unlocked, so do not grant write access to
5665	* this page.
5666	* The write access will fault again and we'll
5667	* resolve the copy-on-write then.
5668	*/
5669	if (pmap_has_prot_policy(pmap,
5670	translated_allow_execute: fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE,
5671	prot)) {
5672	/* we should not do CoW on pmap_has_prot_policy mappings */
5673	panic("%s: map %p va 0x%llx obj %p,%u saved %p,%u: unexpected CoW",
5674	__FUNCTION__,
5675	map, (uint64_t)vaddr,
5676	object, object->vo_copy_version,
5677	saved_copy_object, saved_copy_version);
5678	} else {
5679	/* the pmap layer is OK with changing the PTE's prot */
5680	prot &= ~VM_PROT_WRITE;
5681	}
5682	}
5683	}
5684
5685	if (page_needs_data_sync) {
5686	pmap_sync_page_data_phys(pa: VM_PAGE_GET_PHYS_PAGE(m));
5687	}
5688
5689	if (top_object != VM_OBJECT_NULL) {
5690	need_retry_ptr = &need_retry;
5691	} else {
5692	need_retry_ptr = NULL;
5693	}
5694	if (fault_info.fi_xnu_user_debug &&
5695	!object->code_signed) {
5696	fault_info.pmap_options \|= PMAP_OPTIONS_XNU_USER_DEBUG;
5697	}
5698	if (object_is_contended) {
5699	panic("object_is_contended");
5700	kr = vm_fault_pmap_enter(pmap: destination_pmap, vaddr: destination_pmap_vaddr,
5701	fault_page_size, fault_phys_offset,
5702	m, prot: &prot, caller_prot, fault_type: enter_fault_type, wired,
5703	pmap_options: fault_info.pmap_options, need_retry: need_retry_ptr);
5704	vm_object_lock(object);
5705	assertf(!((prot & VM_PROT_WRITE) && object->vo_copy),
5706	"prot 0x%x object %p copy %p\n",
5707	prot, object, object->vo_copy);
5708	} else {
5709	kr = vm_fault_pmap_enter_with_object_lock(object, pmap: destination_pmap, vaddr: destination_pmap_vaddr,
5710	fault_page_size, fault_phys_offset,
5711	m, prot: &prot, caller_prot, fault_type: enter_fault_type, wired,
5712	pmap_options: fault_info.pmap_options, need_retry: need_retry_ptr, object_lock_type: &object_lock_type);
5713	}
5714	}
5715	zero_fill_cleanup:
5716	if (!VM_DYNAMIC_PAGING_ENABLED() &&
5717	(object->purgable == VM_PURGABLE_DENY \|\|
5718	object->purgable == VM_PURGABLE_NONVOLATILE \|\|
5719	object->purgable == VM_PURGABLE_VOLATILE)) {
5720	vm_page_lockspin_queues();
5721	if (!VM_DYNAMIC_PAGING_ENABLED()) {
5722	vm_fault_enqueue_throttled_locked(m);
5723	}
5724	vm_page_unlock_queues();
5725	}
5726	vm_fault_enqueue_page(object, m, wired, change_wiring, wire_tag, no_cache: fault_info.no_cache, type_of_fault: &type_of_fault, kr);
5727
5728	if (__improbable(rtfault &&
5729	!m->vmp_realtime &&
5730	vm_pageout_protect_realtime)) {
5731	vm_page_lock_queues();
5732	if (!m->vmp_realtime) {
5733	m->vmp_realtime = true;
5734	vm_page_realtime_count++;
5735	}
5736	vm_page_unlock_queues();
5737	}
5738	vm_fault_complete(
5739	map,
5740	real_map,
5741	object,
5742	m_object,
5743	m,
5744	offset,
5745	trace_real_vaddr,
5746	fault_info: &fault_info,
5747	caller_prot,
5748	real_vaddr,
5749	type_of_fault,
5750	need_retry,
5751	kr,
5752	physpage_p,
5753	prot,
5754	top_object,
5755	need_collapse,
5756	cur_offset,
5757	fault_type,
5758	written_on_object: &written_on_object,
5759	written_on_pager: &written_on_pager,
5760	written_on_offset: &written_on_offset);
5761	top_object = VM_OBJECT_NULL;
5762	if (need_retry == TRUE) {
5763	/*
5764	* vm_fault_enter couldn't complete the PMAP_ENTER...
5765	* at this point we don't hold any locks so it's safe
5766	* to ask the pmap layer to expand the page table to
5767	* accommodate this mapping... once expanded, we'll
5768	* re-drive the fault which should result in vm_fault_enter
5769	* being able to successfully enter the mapping this time around
5770	*/
5771	(void)pmap_enter_options(
5772	pmap, v: vaddr, pn: `0`, prot: `0`, fault_type: `0`, flags: `0`, wired: `0`,
5773	PMAP_OPTIONS_NOENTER, NULL, mapping_type: PMAP_MAPPING_TYPE_INFER);
5774
5775	need_retry = FALSE;
5776	goto RetryFault;
5777	}
5778	goto done;
5779	}
5780	/*
5781	* On to the next level in the shadow chain
5782	*/
5783	cur_offset += cur_object->vo_shadow_offset;
5784	new_object = cur_object->shadow;
5785	fault_phys_offset = cur_offset - vm_object_trunc_page(cur_offset);
5786
5787	/*
5788	* take the new_object's lock with the indicated state
5789	*/
5790	if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
5791	vm_object_lock_shared(new_object);
5792	} else {
5793	vm_object_lock(new_object);
5794	}
5795
5796	if (cur_object != object) {
5797	vm_object_unlock(cur_object);
5798	}
5799
5800	cur_object = new_object;
5801
5802	continue;
5803	}
5804	}
5805	/*
5806	* Cleanup from fast fault failure. Drop any object
5807	* lock other than original and drop map lock.
5808	*/
5809	if (object != cur_object) {
5810	vm_object_unlock(cur_object);
5811	}
5812
5813	/*
5814	* must own the object lock exclusively at this point
5815	*/
5816	if (object_lock_type == OBJECT_LOCK_SHARED) {
5817	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5818
5819	if (vm_object_lock_upgrade(object) == FALSE) {
5820	/*
5821	* couldn't upgrade, so explicitly
5822	* take the lock exclusively
5823	* no need to retry the fault at this
5824	* point since "vm_fault_page" will
5825	* completely re-evaluate the state
5826	*/
5827	vm_object_lock(object);
5828	}
5829	}
5830
5831	handle_copy_delay:
5832	vm_map_unlock_read(map);
5833	if (real_map != map) {
5834	vm_map_unlock(real_map);
5835	}
5836
5837	if (__improbable(object == compressor_object \|\|
5838	is_kernel_object(object))) {
5839	/*
5840	* These objects are explicitly managed and populated by the
5841	* kernel. The virtual ranges backed by these objects should
5842	* either have wired pages or "holes" that are not supposed to
5843	* be accessed at all until they get explicitly populated.
5844	* We should never have to resolve a fault on a mapping backed
5845	* by one of these VM objects and providing a zero-filled page
5846	* would be wrong here, so let's fail the fault and let the
5847	* caller crash or recover.
5848	*/
5849	vm_object_unlock(object);
5850	kr = KERN_MEMORY_ERROR;
5851	goto done;
5852	}
5853
5854	resilient_media_ref_transfer = false;
5855	if (resilient_media_retry) {
5856	/*
5857	* We could get here if we failed to get a free page
5858	* to zero-fill and had to take the slow path again.
5859	* Reset our "recovery-from-failed-media" state.
5860	*/
5861	assert(resilient_media_object != VM_OBJECT_NULL);
5862	assert(resilient_media_offset != (vm_object_offset_t)-`1`);
5863	/* release our extra reference on failed object */
5864	// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
5865	if (object == resilient_media_object) {
5866	/*
5867	* We're holding "object"'s lock, so we can't release
5868	* our extra reference at this point.
5869	* We need an extra reference on "object" anyway
5870	* (see below), so let's just transfer this reference.
5871	*/
5872	resilient_media_ref_transfer = true;
5873	} else {
5874	vm_object_deallocate(object: resilient_media_object);
5875	}
5876	resilient_media_object = VM_OBJECT_NULL;
5877	resilient_media_offset = (vm_object_offset_t)-`1`;
5878	resilient_media_retry = false;
5879	vm_fault_resilient_media_abort2++;
5880	}
5881
5882	/*
5883	* Make a reference to this object to
5884	* prevent its disposal while we are messing with
5885	* it. Once we have the reference, the map is free
5886	* to be diddled. Since objects reference their
5887	* shadows (and copies), they will stay around as well.
5888	*/
5889	if (resilient_media_ref_transfer) {
5890	/ we already have an extra reference on this object /
5891	resilient_media_ref_transfer = false;
5892	} else {
5893	vm_object_reference_locked(object);
5894	}
5895	vm_object_paging_begin(object);
5896
5897	set_thread_pagein_error(cthread, `0`);
5898	error_code = `0`;
5899
5900	result_page = VM_PAGE_NULL;
5901	kr = vm_fault_page(first_object: object, first_offset: offset, fault_type,
5902	must_be_resident: (change_wiring && !wired),
5903	FALSE, / page not looked up /
5904	protection: &prot, result_page: &result_page, top_page: &top_page,
5905	type_of_fault: &type_of_fault,
5906	error_code: &error_code, no_zero_fill: map->no_zero_fill,
5907	fault_info: &fault_info);
5908
5909	/*
5910	* if kr != VM_FAULT_SUCCESS, then the paging reference
5911	* has been dropped and the object unlocked... the ref_count
5912	* is still held
5913	*
5914	* if kr == VM_FAULT_SUCCESS, then the paging reference
5915	* is still held along with the ref_count on the original object
5916	*
5917	* the object is returned locked with a paging reference
5918	*
5919	* if top_page != NULL, then it's BUSY and the
5920	* object it belongs to has a paging reference
5921	* but is returned unlocked
5922	*/
5923	if (kr != VM_FAULT_SUCCESS &&
5924	kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
5925	if (kr == VM_FAULT_MEMORY_ERROR &&
5926	fault_info.resilient_media) {
5927	assertf(object->internal, "object %p", object);
5928	/*
5929	* This fault failed but the mapping was
5930	* "media resilient", so we'll retry the fault in
5931	* recovery mode to get a zero-filled page in the
5932	* top object.
5933	* Keep the reference on the failing object so
5934	* that we can check that the mapping is still
5935	* pointing to it when we retry the fault.
5936	*/
5937	// printf("RESILIENT_MEDIA %s:%d: object %p offset 0x%llx recover from media error 0x%x kr 0x%x top_page %p result_page %p\n", __FUNCTION__, __LINE__, object, offset, error_code, kr, top_page, result_page);
5938	assert(!resilient_media_retry); / no double retry /
5939	assert(resilient_media_object == VM_OBJECT_NULL);
5940	assert(resilient_media_offset == (vm_object_offset_t)-`1`);
5941	resilient_media_retry = true;
5942	resilient_media_object = object;
5943	resilient_media_offset = offset;
5944	// printf("FBDP %s:%d resilient_media_object %p offset 0x%llx kept reference\n", __FUNCTION__, __LINE__, resilient_media_object, resilient_mmedia_offset);
5945	vm_fault_resilient_media_initiate++;
5946	goto RetryFault;
5947	} else {
5948	/*
5949	* we didn't succeed, lose the object reference
5950	* immediately.
5951	*/
5952	vm_object_deallocate(object);
5953	object = VM_OBJECT_NULL; / no longer valid /
5954	}
5955
5956	/*
5957	* See why we failed, and take corrective action.
5958	*/
5959	switch (kr) {
5960	case VM_FAULT_MEMORY_SHORTAGE:
5961	if (vm_page_wait(interruptible: (change_wiring) ?
5962	THREAD_UNINT :
5963	THREAD_ABORTSAFE)) {
5964	goto RetryFault;
5965	}
5966	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_MEMORY_SHORTAGE), arg: `0` / arg /);
5967	OS_FALLTHROUGH;
5968	case VM_FAULT_INTERRUPTED:
5969	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_INTERRUPTED), arg: `0` / arg /);
5970	kr = KERN_ABORTED;
5971	goto done;
5972	case VM_FAULT_RETRY:
5973	goto RetryFault;
5974	case VM_FAULT_MEMORY_ERROR:
5975	if (error_code) {
5976	kr = error_code;
5977	} else {
5978	kr = KERN_MEMORY_ERROR;
5979	}
5980	goto done;
5981	default:
5982	panic("vm_fault: unexpected error 0x%x from "
5983	"vm_fault_page()\n", kr);
5984	}
5985	}
5986	m = result_page;
5987	m_object = NULL;
5988
5989	if (m != VM_PAGE_NULL) {
5990	m_object = VM_PAGE_OBJECT(m);
5991	assert((change_wiring && !wired) ?
5992	(top_page == VM_PAGE_NULL) :
5993	((top_page == VM_PAGE_NULL) == (m_object == object)));
5994	}
5995
5996	/*
5997	* What to do with the resulting page from vm_fault_page
5998	* if it doesn't get entered into the physical map:
5999	*/
6000	#define RELEASE_PAGE(m) \
6001	MACRO_BEGIN \
6002	PAGE_WAKEUP_DONE(m); \
6003	if ( !VM_PAGE_PAGEABLE(m)) { \
6004	vm_page_lockspin_queues(); \
6005	if ( !VM_PAGE_PAGEABLE(m)) \
6006	vm_page_activate(m); \
6007	vm_page_unlock_queues(); \
6008	} \
6009	MACRO_END
6010
6011
6012	object_locks_dropped = FALSE;
6013	/*
6014	* We must verify that the maps have not changed
6015	* since our last lookup. vm_map_verify() needs the
6016	* map lock (shared) but we are holding object locks.
6017	* So we do a try_lock() first and, if that fails, we
6018	* drop the object locks and go in for the map lock again.
6019	*/
6020	if (m != VM_PAGE_NULL) {
6021	old_copy_object = m_object->vo_copy;
6022	old_copy_version = m_object->vo_copy_version;
6023	} else {
6024	old_copy_object = VM_OBJECT_NULL;
6025	old_copy_version = `0`;
6026	}
6027	if (!vm_map_try_lock_read(map: original_map)) {
6028	if (m != VM_PAGE_NULL) {
6029	vm_object_unlock(m_object);
6030	} else {
6031	vm_object_unlock(object);
6032	}
6033
6034	object_locks_dropped = TRUE;
6035
6036	vm_map_lock_read(original_map);
6037	}
6038
6039	if ((map != original_map) \|\| !vm_map_verify(map, version: &version)) {
6040	if (object_locks_dropped == FALSE) {
6041	if (m != VM_PAGE_NULL) {
6042	vm_object_unlock(m_object);
6043	} else {
6044	vm_object_unlock(object);
6045	}
6046
6047	object_locks_dropped = TRUE;
6048	}
6049
6050	/*
6051	* no object locks are held at this point
6052	*/
6053	vm_object_t retry_object;
6054	vm_object_offset_t retry_offset;
6055	vm_prot_t retry_prot;
6056
6057	/*
6058	* To avoid trying to write_lock the map while another
6059	* thread has it read_locked (in vm_map_pageable), we
6060	* do not try for write permission. If the page is
6061	* still writable, we will get write permission. If it
6062	* is not, or has been marked needs_copy, we enter the
6063	* mapping without write permission, and will merely
6064	* take another fault.
6065	*/
6066	map = original_map;
6067
6068	kr = vm_map_lookup_and_lock_object(var_map: &map, vaddr,
6069	fault_type: fault_type & ~VM_PROT_WRITE,
6070	OBJECT_LOCK_EXCLUSIVE, out_version: &version,
6071	object: &retry_object, offset: &retry_offset, out_prot: &retry_prot,
6072	wired: &wired,
6073	fault_info: &fault_info,
6074	real_map: &real_map,
6075	NULL);
6076	pmap = real_map->pmap;
6077
6078	if (kr != KERN_SUCCESS) {
6079	vm_map_unlock_read(map);
6080
6081	if (m != VM_PAGE_NULL) {
6082	assert(VM_PAGE_OBJECT(m) == m_object);
6083
6084	/*
6085	* retake the lock so that
6086	* we can drop the paging reference
6087	* in vm_fault_cleanup and do the
6088	* PAGE_WAKEUP_DONE in RELEASE_PAGE
6089	*/
6090	vm_object_lock(m_object);
6091
6092	RELEASE_PAGE(m);
6093
6094	vm_fault_cleanup(object: m_object, top_page);
6095	} else {
6096	/*
6097	* retake the lock so that
6098	* we can drop the paging reference
6099	* in vm_fault_cleanup
6100	*/
6101	vm_object_lock(object);
6102
6103	vm_fault_cleanup(object, top_page);
6104	}
6105	vm_object_deallocate(object);
6106
6107	if (kr == KERN_INVALID_ADDRESS) {
6108	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_ADDRESS_NOT_FOUND), arg: `0` / arg /);
6109	}
6110	goto done;
6111	}
6112	vm_object_unlock(retry_object);
6113
6114	if ((retry_object != object) \|\| (retry_offset != offset)) {
6115	vm_map_unlock_read(map);
6116	if (real_map != map) {
6117	vm_map_unlock(real_map);
6118	}
6119
6120	if (m != VM_PAGE_NULL) {
6121	assert(VM_PAGE_OBJECT(m) == m_object);
6122
6123	/*
6124	* retake the lock so that
6125	* we can drop the paging reference
6126	* in vm_fault_cleanup and do the
6127	* PAGE_WAKEUP_DONE in RELEASE_PAGE
6128	*/
6129	vm_object_lock(m_object);
6130
6131	RELEASE_PAGE(m);
6132
6133	vm_fault_cleanup(object: m_object, top_page);
6134	} else {
6135	/*
6136	* retake the lock so that
6137	* we can drop the paging reference
6138	* in vm_fault_cleanup
6139	*/
6140	vm_object_lock(object);
6141
6142	vm_fault_cleanup(object, top_page);
6143	}
6144	vm_object_deallocate(object);
6145
6146	goto RetryFault;
6147	}
6148	/*
6149	* Check whether the protection has changed or the object
6150	* has been copied while we left the map unlocked.
6151	*/
6152	if (pmap_has_prot_policy(pmap, translated_allow_execute: fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot: retry_prot)) {
6153	/ If the pmap layer cares, pass the full set. /
6154	prot = retry_prot;
6155	} else {
6156	prot &= retry_prot;
6157	}
6158	}
6159
6160	if (object_locks_dropped == TRUE) {
6161	if (m != VM_PAGE_NULL) {
6162	assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
6163	assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6164	vm_object_lock(m_object);
6165	} else {
6166	vm_object_lock(object);
6167	}
6168
6169	object_locks_dropped = FALSE;
6170	}
6171
6172	if ((prot & VM_PROT_WRITE) &&
6173	m != VM_PAGE_NULL &&
6174	(m_object->vo_copy != old_copy_object \|\|
6175	m_object->vo_copy_version != old_copy_version)) {
6176	/*
6177	* The copy object changed while the top-level object
6178	* was unlocked, so take away write permission.
6179	*/
6180	if (pmap_has_prot_policy(pmap, translated_allow_execute: fault_info.pmap_options & PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE, prot)) {
6181	/*
6182	* This pmap enforces extra constraints for this set
6183	* of protections, so we can't change the protections.
6184	* This mapping should have been setup to avoid
6185	* copy-on-write since that requires removing write
6186	* access.
6187	*/
6188	panic("%s: pmap %p vaddr 0x%llx prot 0x%x options 0x%x m%p obj %p copyobj %p",
6189	__FUNCTION__, pmap, (uint64_t)vaddr, prot,
6190	fault_info.pmap_options,
6191	m, m_object, m_object->vo_copy);
6192	}
6193	prot &= ~VM_PROT_WRITE;
6194	}
6195
6196	if (!need_copy &&
6197	!fault_info.no_copy_on_read &&
6198	m != VM_PAGE_NULL &&
6199	VM_PAGE_OBJECT(m) != object &&
6200	!VM_PAGE_OBJECT(m)->pager_trusted &&
6201	vm_protect_privileged_from_untrusted &&
6202	!VM_PAGE_OBJECT(m)->code_signed &&
6203	current_proc_is_privileged()) {
6204	/*
6205	* We found the page we want in an "untrusted" VM object
6206	* down the shadow chain. Since the target is "privileged"
6207	* we want to perform a copy-on-read of that page, so that the
6208	* mapped object gets a stable copy and does not have to
6209	* rely on the "untrusted" object to provide the same
6210	* contents if the page gets reclaimed and has to be paged
6211	* in again later on.
6212	*
6213	* Special case: if the mapping is executable and the untrusted
6214	* object is code-signed and the process is "cs_enforced", we
6215	* do not copy-on-read because that would break code-signing
6216	* enforcement expectations (an executable page must belong
6217	* to a code-signed object) and we can rely on code-signing
6218	* to re-validate the page if it gets evicted and paged back in.
6219	*/
6220	// printf("COPY-ON-READ %s:%d map %p vaddr 0x%llx obj %p offset 0x%llx found page %p (obj %p offset 0x%llx) UNTRUSTED -> need copy-on-read\n", __FUNCTION__, __LINE__, map, (uint64_t)vaddr, object, offset, m, VM_PAGE_OBJECT(m), m->vmp_offset);
6221	vm_copied_on_read++;
6222	need_copy_on_read = TRUE;
6223	need_copy = TRUE;
6224	} else {
6225	need_copy_on_read = FALSE;
6226	}
6227
6228	/*
6229	* If we want to wire down this page, but no longer have
6230	* adequate permissions, we must start all over.
6231	* If we decided to copy-on-read, we must also start all over.
6232	*/
6233	if ((wired && (fault_type != (prot \| VM_PROT_WRITE))) \|\|
6234	need_copy_on_read) {
6235	vm_map_unlock_read(map);
6236	if (real_map != map) {
6237	vm_map_unlock(real_map);
6238	}
6239
6240	if (m != VM_PAGE_NULL) {
6241	assert(VM_PAGE_OBJECT(m) == m_object);
6242
6243	RELEASE_PAGE(m);
6244
6245	vm_fault_cleanup(object: m_object, top_page);
6246	} else {
6247	vm_fault_cleanup(object, top_page);
6248	}
6249
6250	vm_object_deallocate(object);
6251
6252	goto RetryFault;
6253	}
6254	if (m != VM_PAGE_NULL) {
6255	/*
6256	* Put this page into the physical map.
6257	* We had to do the unlock above because pmap_enter
6258	* may cause other faults. The page may be on
6259	* the pageout queues. If the pageout daemon comes
6260	* across the page, it will remove it from the queues.
6261	*/
6262	if (fault_page_size < PAGE_SIZE) {
6263	DEBUG4K_FAULT("map %p original %p pmap %p va 0x%llx pa 0x%llx(0x%llx+0x%llx) prot 0x%x caller_prot 0x%x\n", map, original_map, pmap, (uint64_t)vaddr, (uint64_t)((((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT) + fault_phys_offset), (uint64_t)(((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(m)) << PAGE_SHIFT), (uint64_t)fault_phys_offset, prot, caller_prot);
6264	assertf((!(fault_phys_offset & FOURK_PAGE_MASK) &&
6265	fault_phys_offset < PAGE_SIZE),
6266	"0x%llx\n", (uint64_t)fault_phys_offset);
6267	} else {
6268	assertf(fault_phys_offset == `0`,
6269	"0x%llx\n", (uint64_t)fault_phys_offset);
6270	}
6271	assertf(VM_PAGE_OBJECT(m) == m_object, "m=%p m_object=%p", m, m_object);
6272	assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
6273	if (caller_pmap) {
6274	kr = vm_fault_enter(m,
6275	pmap: caller_pmap,
6276	vaddr: caller_pmap_addr,
6277	fault_page_size,
6278	fault_phys_offset,
6279	prot,
6280	caller_prot,
6281	wired,
6282	change_wiring,
6283	wire_tag,
6284	fault_info: &fault_info,
6285	NULL,
6286	type_of_fault: &type_of_fault,
6287	object_lock_type: &object_lock_type);
6288	} else {
6289	kr = vm_fault_enter(m,
6290	pmap,
6291	vaddr,
6292	fault_page_size,
6293	fault_phys_offset,
6294	prot,
6295	caller_prot,
6296	wired,
6297	change_wiring,
6298	wire_tag,
6299	fault_info: &fault_info,
6300	NULL,
6301	type_of_fault: &type_of_fault,
6302	object_lock_type: &object_lock_type);
6303	}
6304	assert(VM_PAGE_OBJECT(m) == m_object);
6305
6306	{
6307	int event_code = `0`;
6308
6309	if (m_object->internal) {
6310	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
6311	} else if (m_object->object_is_shared_cache) {
6312	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
6313	} else {
6314	event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
6315	}
6316
6317	KDBG_RELEASE(event_code \| DBG_FUNC_NONE, trace_real_vaddr, (fault_info.user_tag << `16`) \| (caller_prot << `8`) \| vm_fault_type_for_tracing(need_copy_on_read, type_of_fault), m->vmp_offset, get_current_unique_pid());
6318	KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_SLOW), get_current_unique_pid());
6319
6320	DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
6321	}
6322	if (kr != KERN_SUCCESS) {
6323	/ abort this page fault /
6324	vm_map_unlock_read(map);
6325	if (real_map != map) {
6326	vm_map_unlock(real_map);
6327	}
6328	PAGE_WAKEUP_DONE(m);
6329	vm_fault_cleanup(object: m_object, top_page);
6330	vm_object_deallocate(object);
6331	goto done;
6332	}
6333	if (physpage_p != NULL) {
6334	/ for vm_map_wire_and_extract() /
6335	*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
6336	if (prot & VM_PROT_WRITE) {
6337	vm_object_lock_assert_exclusive(m_object);
6338	m->vmp_dirty = TRUE;
6339	}
6340	}
6341	} else {
6342	vm_map_entry_t entry;
6343	vm_map_offset_t laddr;
6344	vm_map_offset_t ldelta, hdelta;
6345
6346	/*
6347	* do a pmap block mapping from the physical address
6348	* in the object
6349	*/
6350
6351	if (real_map != map) {
6352	vm_map_unlock(real_map);
6353	}
6354
6355	if (original_map != map) {
6356	vm_map_unlock_read(map);
6357	vm_map_lock_read(original_map);
6358	map = original_map;
6359	}
6360	real_map = map;
6361
6362	laddr = vaddr;
6363	hdelta = ldelta = (vm_map_offset_t)`0xFFFFFFFFFFFFF000ULL`;
6364
6365	while (vm_map_lookup_entry(map, address: laddr, entry: &entry)) {
6366	if (ldelta > (laddr - entry->vme_start)) {
6367	ldelta = laddr - entry->vme_start;
6368	}
6369	if (hdelta > (entry->vme_end - laddr)) {
6370	hdelta = entry->vme_end - laddr;
6371	}
6372	if (entry->is_sub_map) {
6373	laddr = ((laddr - entry->vme_start)
6374	+ VME_OFFSET(entry));
6375	vm_map_lock_read(VME_SUBMAP(entry));
6376
6377	if (map != real_map) {
6378	vm_map_unlock_read(map);
6379	}
6380	if (entry->use_pmap) {
6381	vm_map_unlock_read(real_map);
6382	real_map = VME_SUBMAP(entry);
6383	}
6384	map = VME_SUBMAP(entry);
6385	} else {
6386	break;
6387	}
6388	}
6389
6390	if (vm_map_lookup_entry(map, address: laddr, entry: &entry) &&
6391	(!entry->is_sub_map) &&
6392	(object != VM_OBJECT_NULL) &&
6393	(VME_OBJECT(entry) == object)) {
6394	uint16_t superpage;
6395
6396	if (!object->pager_created &&
6397	object->phys_contiguous &&
6398	VME_OFFSET(entry) == `0` &&
6399	(entry->vme_end - entry->vme_start == object->vo_size) &&
6400	VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size - `1`))) {
6401	superpage = VM_MEM_SUPERPAGE;
6402	} else {
6403	superpage = `0`;
6404	}
6405
6406	if (superpage && physpage_p) {
6407	/ for vm_map_wire_and_extract() /
6408	*physpage_p = (ppnum_t)
6409	((((vm_map_offset_t)
6410	object->vo_shadow_offset)
6411	+ VME_OFFSET(entry)
6412	+ (laddr - entry->vme_start))
6413	>> PAGE_SHIFT);
6414	}
6415
6416	if (caller_pmap) {
6417	/*
6418	* Set up a block mapped area
6419	*/
6420	assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6421	kr = pmap_map_block_addr(pmap: caller_pmap,
6422	va: (addr64_t)(caller_pmap_addr - ldelta),
6423	pa: (pmap_paddr_t)(((vm_map_offset_t) (object->vo_shadow_offset)) +
6424	VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6425	size: (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6426	attr: (VM_WIMG_MASK & (int)object->wimg_bits) \| superpage, flags: `0`);
6427
6428	if (kr != KERN_SUCCESS) {
6429	goto cleanup;
6430	}
6431	} else {
6432	/*
6433	* Set up a block mapped area
6434	*/
6435	assert((uint32_t)((ldelta + hdelta) >> fault_page_shift) == ((ldelta + hdelta) >> fault_page_shift));
6436	kr = pmap_map_block_addr(pmap: real_map->pmap,
6437	va: (addr64_t)(vaddr - ldelta),
6438	pa: (pmap_paddr_t)(((vm_map_offset_t)(object->vo_shadow_offset)) +
6439	VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta),
6440	size: (uint32_t)((ldelta + hdelta) >> fault_page_shift), prot,
6441	attr: (VM_WIMG_MASK & (int)object->wimg_bits) \| superpage, flags: `0`);
6442
6443	if (kr != KERN_SUCCESS) {
6444	goto cleanup;
6445	}
6446	}
6447	}
6448	}
6449
6450	/*
6451	* Success
6452	*/
6453	kr = KERN_SUCCESS;
6454
6455	/*
6456	* TODO: could most of the done cases just use cleanup?
6457	*/
6458	cleanup:
6459	/*
6460	* Unlock everything, and return
6461	*/
6462	vm_map_unlock_read(map);
6463	if (real_map != map) {
6464	vm_map_unlock(real_map);
6465	}
6466
6467	if (m != VM_PAGE_NULL) {
6468	if (__improbable(rtfault &&
6469	!m->vmp_realtime &&
6470	vm_pageout_protect_realtime)) {
6471	vm_page_lock_queues();
6472	if (!m->vmp_realtime) {
6473	m->vmp_realtime = true;
6474	vm_page_realtime_count++;
6475	}
6476	vm_page_unlock_queues();
6477	}
6478	assert(VM_PAGE_OBJECT(m) == m_object);
6479
6480	if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
6481	vm_object_paging_begin(m_object);
6482
6483	assert(written_on_object == VM_OBJECT_NULL);
6484	written_on_object = m_object;
6485	written_on_pager = m_object->pager;
6486	written_on_offset = m_object->paging_offset + m->vmp_offset;
6487	}
6488	PAGE_WAKEUP_DONE(m);
6489
6490	vm_fault_cleanup(object: m_object, top_page);
6491	} else {
6492	vm_fault_cleanup(object, top_page);
6493	}
6494
6495	vm_object_deallocate(object);
6496
6497	#undef RELEASE_PAGE
6498
6499	done:
6500	thread_interrupt_level(interruptible: interruptible_state);
6501
6502	if (resilient_media_object != VM_OBJECT_NULL) {
6503	assert(resilient_media_retry);
6504	assert(resilient_media_offset != (vm_object_offset_t)-`1`);
6505	/ release extra reference on failed object /
6506	// printf("FBDP %s:%d resilient_media_object %p deallocate\n", __FUNCTION__, __LINE__, resilient_media_object);
6507	vm_object_deallocate(object: resilient_media_object);
6508	resilient_media_object = VM_OBJECT_NULL;
6509	resilient_media_offset = (vm_object_offset_t)-`1`;
6510	resilient_media_retry = false;
6511	vm_fault_resilient_media_release++;
6512	}
6513	assert(!resilient_media_retry);
6514
6515	/*
6516	* Only I/O throttle on faults which cause a pagein/swapin.
6517	*/
6518	if ((type_of_fault == DBG_PAGEIND_FAULT) \|\| (type_of_fault == DBG_PAGEINV_FAULT) \|\| (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
6519	throttle_lowpri_io(`1`);
6520	} else {
6521	if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
6522	if ((throttle_delay = vm_page_throttled(TRUE))) {
6523	if (vm_debug_events) {
6524	if (type_of_fault == DBG_COMPRESSOR_FAULT) {
6525	VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, `0`, `0`, `0`);
6526	} else if (type_of_fault == DBG_COW_FAULT) {
6527	VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, `0`, `0`, `0`);
6528	} else {
6529	VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, `0`, `0`, `0`);
6530	}
6531	}
6532	__VM_FAULT_THROTTLE_FOR_PAGEOUT_SCAN__(throttle_delay);
6533	}
6534	}
6535	}
6536
6537	if (written_on_object) {
6538	vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
6539
6540	vm_object_lock(written_on_object);
6541	vm_object_paging_end(written_on_object);
6542	vm_object_unlock(written_on_object);
6543
6544	written_on_object = VM_OBJECT_NULL;
6545	}
6546
6547	if (rtfault) {
6548	vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
6549	}
6550
6551	KDBG_RELEASE(
6552	(MACHDBG_CODE(DBG_MACH_VM, `2`)) \| DBG_FUNC_END,
6553	((uint64_t)trace_vaddr >> `32`),
6554	trace_vaddr,
6555	kr,
6556	vm_fault_type_for_tracing(need_copy_on_read, type_of_fault));
6557
6558	if (fault_page_size < PAGE_SIZE && kr != KERN_SUCCESS) {
6559	DEBUG4K_FAULT("map %p original %p vaddr 0x%llx -> 0x%x\n", map, original_map, (uint64_t)trace_real_vaddr, kr);
6560	}
6561
6562	return kr;
6563	}
6564
6565	/*
6566	* vm_fault_wire:
6567	*
6568	* Wire down a range of virtual addresses in a map.
6569	*/
6570	kern_return_t
6571	vm_fault_wire(
6572	vm_map_t map,
6573	vm_map_entry_t entry,
6574	vm_prot_t prot,
6575	vm_tag_t wire_tag,
6576	pmap_t pmap,
6577	vm_map_offset_t pmap_addr,
6578	ppnum_t *physpage_p)
6579	{
6580	vm_map_offset_t va;
6581	vm_map_offset_t end_addr = entry->vme_end;
6582	kern_return_t rc;
6583	vm_map_size_t effective_page_size;
6584
6585	assert(entry->in_transition);
6586
6587	if (!entry->is_sub_map &&
6588	VME_OBJECT(entry) != VM_OBJECT_NULL &&
6589	VME_OBJECT(entry)->phys_contiguous) {
6590	return KERN_SUCCESS;
6591	}
6592
6593	/*
6594	* Inform the physical mapping system that the
6595	* range of addresses may not fault, so that
6596	* page tables and such can be locked down as well.
6597	*/
6598
6599	pmap_pageable(pmap, pmap_addr,
6600	pmap_addr + (end_addr - entry->vme_start), FALSE);
6601
6602	/*
6603	* We simulate a fault to get the page and enter it
6604	* in the physical map.
6605	*/
6606
6607	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6608	for (va = entry->vme_start;
6609	va < end_addr;
6610	va += effective_page_size) {
6611	rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
6612	pmap_addr: pmap_addr + (va - entry->vme_start),
6613	physpage_p);
6614	if (rc != KERN_SUCCESS) {
6615	rc = vm_fault_internal(map, vaddr: va, caller_prot: prot, TRUE, wire_tag,
6616	interruptible: ((pmap == kernel_pmap)
6617	? THREAD_UNINT
6618	: THREAD_ABORTSAFE),
6619	caller_pmap: pmap,
6620	caller_pmap_addr: (pmap_addr +
6621	(va - entry->vme_start)),
6622	physpage_p);
6623	DTRACE_VM2(softlock, int, `1`, (uint64_t *), NULL);
6624	}
6625
6626	if (rc != KERN_SUCCESS) {
6627	struct vm_map_entry tmp_entry = *entry;
6628
6629	/ unwire wired pages /
6630	tmp_entry.vme_end = va;
6631	vm_fault_unwire(map, entry: &tmp_entry, FALSE,
6632	pmap, pmap_addr, end_addr: tmp_entry.vme_end);
6633
6634	return rc;
6635	}
6636	}
6637	return KERN_SUCCESS;
6638	}
6639
6640	/*
6641	* vm_fault_unwire:
6642	*
6643	* Unwire a range of virtual addresses in a map.
6644	*/
6645	void
6646	vm_fault_unwire(
6647	vm_map_t map,
6648	vm_map_entry_t entry,
6649	boolean_t deallocate,
6650	pmap_t pmap,
6651	vm_map_offset_t pmap_addr,
6652	vm_map_offset_t end_addr)
6653	{
6654	vm_map_offset_t va;
6655	vm_object_t object;
6656	struct vm_object_fault_info fault_info = {};
6657	unsigned int unwired_pages;
6658	vm_map_size_t effective_page_size;
6659
6660	object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
6661
6662	/*
6663	* If it's marked phys_contiguous, then vm_fault_wire() didn't actually
6664	* do anything since such memory is wired by default. So we don't have
6665	* anything to undo here.
6666	*/
6667
6668	if (object != VM_OBJECT_NULL && object->phys_contiguous) {
6669	return;
6670	}
6671
6672	fault_info.interruptible = THREAD_UNINT;
6673	fault_info.behavior = entry->behavior;
6674	fault_info.user_tag = VME_ALIAS(entry);
6675	if (entry->iokit_acct \|\|
6676	(!entry->is_sub_map && !entry->use_pmap)) {
6677	fault_info.pmap_options \|= PMAP_OPTIONS_ALT_ACCT;
6678	}
6679	fault_info.lo_offset = VME_OFFSET(entry);
6680	fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
6681	fault_info.no_cache = entry->no_cache;
6682	fault_info.stealth = TRUE;
6683	if (entry->vme_xnu_user_debug) {
6684	/*
6685	* Modified code-signed executable region: wired pages must
6686	* have been copied, so they should be XNU_USER_DEBUG rather
6687	* than XNU_USER_EXEC.
6688	*/
6689	fault_info.pmap_options \|= PMAP_OPTIONS_XNU_USER_DEBUG;
6690	}
6691
6692	unwired_pages = `0`;
6693
6694	/*
6695	* Since the pages are wired down, we must be able to
6696	* get their mappings from the physical map system.
6697	*/
6698
6699	effective_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
6700	for (va = entry->vme_start;
6701	va < end_addr;
6702	va += effective_page_size) {
6703	if (object == VM_OBJECT_NULL) {
6704	if (pmap) {
6705	pmap_change_wiring(pmap,
6706	va: pmap_addr + (va - entry->vme_start), FALSE);
6707	}
6708	(void) vm_fault(map, vaddr: va, VM_PROT_NONE,
6709	TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, caller_pmap: pmap, caller_pmap_addr: pmap_addr);
6710	} else {
6711	vm_prot_t prot;
6712	vm_page_t result_page;
6713	vm_page_t top_page;
6714	vm_object_t result_object;
6715	vm_fault_return_t result;
6716
6717	/ cap cluster size at maximum UPL size /
6718	upl_size_t cluster_size;
6719	if (os_sub_overflow(end_addr, va, &cluster_size)) {
6720	cluster_size = `0` - (upl_size_t)PAGE_SIZE;
6721	}
6722	fault_info.cluster_size = cluster_size;
6723
6724	do {
6725	prot = VM_PROT_NONE;
6726
6727	vm_object_lock(object);
6728	vm_object_paging_begin(object);
6729	result_page = VM_PAGE_NULL;
6730	result = vm_fault_page(
6731	first_object: object,
6732	first_offset: (VME_OFFSET(entry) +
6733	(va - entry->vme_start)),
6734	VM_PROT_NONE, TRUE,
6735	FALSE, / page not looked up /
6736	protection: &prot, result_page: &result_page, top_page: &top_page,
6737	type_of_fault: (int *)`0`,
6738	NULL, no_zero_fill: map->no_zero_fill,
6739	fault_info: &fault_info);
6740	} while (result == VM_FAULT_RETRY);
6741
6742	/*
6743	* If this was a mapping to a file on a device that has been forcibly
6744	* unmounted, then we won't get a page back from vm_fault_page(). Just
6745	* move on to the next one in case the remaining pages are mapped from
6746	* different objects. During a forced unmount, the object is terminated
6747	* so the alive flag will be false if this happens. A forced unmount will
6748	* will occur when an external disk is unplugged before the user does an
6749	* eject, so we don't want to panic in that situation.
6750	*/
6751
6752	if (result == VM_FAULT_MEMORY_ERROR) {
6753	if (!object->alive) {
6754	continue;
6755	}
6756	if (!object->internal && object->pager == NULL) {
6757	continue;
6758	}
6759	}
6760
6761	if (result == VM_FAULT_MEMORY_ERROR &&
6762	is_kernel_object(object)) {
6763	/*
6764	* This must have been allocated with
6765	* KMA_KOBJECT and KMA_VAONLY and there's
6766	* no physical page at this offset.
6767	* We're done (no page to free).
6768	*/
6769	assert(deallocate);
6770	continue;
6771	}
6772
6773	if (result != VM_FAULT_SUCCESS) {
6774	panic("vm_fault_unwire: failure");
6775	}
6776
6777	result_object = VM_PAGE_OBJECT(result_page);
6778
6779	if (deallocate) {
6780	assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
6781	vm_page_fictitious_addr);
6782	pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m: result_page));
6783	if (VM_PAGE_WIRED(result_page)) {
6784	unwired_pages++;
6785	}
6786	VM_PAGE_FREE(result_page);
6787	} else {
6788	if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(m: result_page) != vm_page_guard_addr)) {
6789	pmap_change_wiring(pmap,
6790	va: pmap_addr + (va - entry->vme_start), FALSE);
6791	}
6792
6793
6794	if (VM_PAGE_WIRED(result_page)) {
6795	vm_page_lockspin_queues();
6796	vm_page_unwire(page: result_page, TRUE);
6797	vm_page_unlock_queues();
6798	unwired_pages++;
6799	}
6800	if (entry->zero_wired_pages) {
6801	pmap_zero_page(pn: VM_PAGE_GET_PHYS_PAGE(m: result_page));
6802	entry->zero_wired_pages = FALSE;
6803	}
6804
6805	PAGE_WAKEUP_DONE(result_page);
6806	}
6807	vm_fault_cleanup(object: result_object, top_page);
6808	}
6809	}
6810
6811	/*
6812	* Inform the physical mapping system that the range
6813	* of addresses may fault, so that page tables and
6814	* such may be unwired themselves.
6815	*/
6816
6817	pmap_pageable(pmap, pmap_addr,
6818	pmap_addr + (end_addr - entry->vme_start), TRUE);
6819
6820	if (is_kernel_object(object)) {
6821	/*
6822	* Would like to make user_tag in vm_object_fault_info
6823	* vm_tag_t (unsigned short) but user_tag derives its value from
6824	* VME_ALIAS(entry) at a few places and VME_ALIAS, in turn, casts
6825	* to an _unsigned int_ which is used by non-fault_info paths throughout the
6826	* code at many places.
6827	*
6828	* So, for now, an explicit truncation to unsigned short (vm_tag_t).
6829	*/
6830	assertf((fault_info.user_tag & VME_ALIAS_MASK) == fault_info.user_tag,
6831	"VM Tag truncated from 0x%x to 0x%x\n", fault_info.user_tag, (fault_info.user_tag & VME_ALIAS_MASK));
6832	vm_tag_update_size(tag: (vm_tag_t) fault_info.user_tag, size: -ptoa_64(unwired_pages), NULL);
6833	}
6834	}
6835
6836	/*
6837	* vm_fault_wire_fast:
6838	*
6839	* Handle common case of a wire down page fault at the given address.
6840	* If successful, the page is inserted into the associated physical map.
6841	* The map entry is passed in to avoid the overhead of a map lookup.
6842	*
6843	* NOTE: the given address should be truncated to the
6844	* proper page address.
6845	*
6846	* KERN_SUCCESS is returned if the page fault is handled; otherwise,
6847	* a standard error specifying why the fault is fatal is returned.
6848	*
6849	* The map in question must be referenced, and remains so.
6850	* Caller has a read lock on the map.
6851	*
6852	* This is a stripped version of vm_fault() for wiring pages. Anything
6853	* other than the common case will return KERN_FAILURE, and the caller
6854	* is expected to call vm_fault().
6855	*/
6856	static kern_return_t
6857	vm_fault_wire_fast(
6858	__unused vm_map_t map,
6859	vm_map_offset_t va,
6860	__unused vm_prot_t caller_prot,
6861	vm_tag_t wire_tag,
6862	vm_map_entry_t entry,
6863	pmap_t pmap,
6864	vm_map_offset_t pmap_addr,
6865	ppnum_t *physpage_p)
6866	{
6867	vm_object_t object;
6868	vm_object_offset_t offset;
6869	vm_page_t m;
6870	vm_prot_t prot;
6871	thread_t thread = current_thread();
6872	int type_of_fault;
6873	kern_return_t kr;
6874	vm_map_size_t fault_page_size;
6875	vm_map_offset_t fault_phys_offset;
6876	struct vm_object_fault_info fault_info = {};
6877	uint8_t object_lock_type = `0`;
6878
6879	counter_inc(&vm_statistics_faults);
6880
6881	if (thread != THREAD_NULL) {
6882	counter_inc(&get_threadtask(thread)->faults);
6883	}
6884
6885	/*
6886	* Recovery actions
6887	*/
6888
6889	#undef RELEASE_PAGE
6890	#define RELEASE_PAGE(m) { \
6891	PAGE_WAKEUP_DONE(m); \
6892	vm_page_lockspin_queues(); \
6893	vm_page_unwire(m, TRUE); \
6894	vm_page_unlock_queues(); \
6895	}
6896
6897
6898	#undef UNLOCK_THINGS
6899	#define UNLOCK_THINGS { \
6900	vm_object_paging_end(object); \
6901	vm_object_unlock(object); \
6902	}
6903
6904	#undef UNLOCK_AND_DEALLOCATE
6905	#define UNLOCK_AND_DEALLOCATE { \
6906	UNLOCK_THINGS; \
6907	vm_object_deallocate(object); \
6908	}
6909	/*
6910	* Give up and have caller do things the hard way.
6911	*/
6912
6913	#define GIVE_UP { \
6914	UNLOCK_AND_DEALLOCATE; \
6915	return(KERN_FAILURE); \
6916	}
6917
6918
6919	/*
6920	* If this entry is not directly to a vm_object, bail out.
6921	*/
6922	if (entry->is_sub_map) {
6923	assert(physpage_p == NULL);
6924	return KERN_FAILURE;
6925	}
6926
6927	/*
6928	* Find the backing store object and offset into it.
6929	*/
6930
6931	object = VME_OBJECT(entry);
6932	offset = (va - entry->vme_start) + VME_OFFSET(entry);
6933	prot = entry->protection;
6934
6935	/*
6936	* Make a reference to this object to prevent its
6937	* disposal while we are messing with it.
6938	*/
6939
6940	object_lock_type = OBJECT_LOCK_EXCLUSIVE;
6941	vm_object_lock(object);
6942	vm_object_reference_locked(object);
6943	vm_object_paging_begin(object);
6944
6945	/*
6946	* INVARIANTS (through entire routine):
6947	*
6948	* 1) At all times, we must either have the object
6949	* lock or a busy page in some object to prevent
6950	* some other thread from trying to bring in
6951	* the same page.
6952	*
6953	* 2) Once we have a busy page, we must remove it from
6954	* the pageout queues, so that the pageout daemon
6955	* will not grab it away.
6956	*
6957	*/
6958
6959	/*
6960	* Look for page in top-level object. If it's not there or
6961	* there's something going on, give up.
6962	*/
6963	m = vm_page_lookup(object, vm_object_trunc_page(offset));
6964	if ((m == VM_PAGE_NULL) \|\| (m->vmp_busy) \|\|
6965	(m->vmp_unusual && (m->vmp_error \|\| m->vmp_restart \|\| m->vmp_absent))) {
6966	GIVE_UP;
6967	}
6968	if (m->vmp_fictitious &&
6969	VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
6970	/*
6971	* Guard pages are fictitious pages and are never
6972	* entered into a pmap, so let's say it's been wired...
6973	*/
6974	kr = KERN_SUCCESS;
6975	goto done;
6976	}
6977
6978	/*
6979	* Wire the page down now. All bail outs beyond this
6980	* point must unwire the page.
6981	*/
6982
6983	vm_page_lockspin_queues();
6984	vm_page_wire(page: m, tag: wire_tag, TRUE);
6985	vm_page_unlock_queues();
6986
6987	/*
6988	* Mark page busy for other threads.
6989	*/
6990	assert(!m->vmp_busy);
6991	m->vmp_busy = TRUE;
6992	assert(!m->vmp_absent);
6993
6994	/*
6995	* Give up if the page is being written and there's a copy object
6996	*/
6997	if ((object->vo_copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
6998	RELEASE_PAGE(m);
6999	GIVE_UP;
7000	}
7001
7002	fault_info.user_tag = VME_ALIAS(entry);
7003	fault_info.pmap_options = `0`;
7004	if (entry->iokit_acct \|\|
7005	(!entry->is_sub_map && !entry->use_pmap)) {
7006	fault_info.pmap_options \|= PMAP_OPTIONS_ALT_ACCT;
7007	}
7008	if (entry->vme_xnu_user_debug) {
7009	/*
7010	* Modified code-signed executable region: wiring will
7011	* copy the pages, so they should be XNU_USER_DEBUG rather
7012	* than XNU_USER_EXEC.
7013	*/
7014	fault_info.pmap_options \|= PMAP_OPTIONS_XNU_USER_DEBUG;
7015	}
7016
7017	fault_page_size = MIN(VM_MAP_PAGE_SIZE(map), PAGE_SIZE);
7018	fault_phys_offset = offset - vm_object_trunc_page(offset);
7019
7020	/*
7021	* Put this page into the physical map.
7022	*/
7023	type_of_fault = DBG_CACHE_HIT_FAULT;
7024	assertf(VM_PAGE_OBJECT(m) == object, "m=%p object=%p", m, object);
7025	assert(VM_PAGE_OBJECT(m) != VM_OBJECT_NULL);
7026	kr = vm_fault_enter(m,
7027	pmap,
7028	vaddr: pmap_addr,
7029	fault_page_size,
7030	fault_phys_offset,
7031	prot,
7032	caller_prot: prot,
7033	TRUE, / wired /
7034	FALSE, / change_wiring /
7035	wire_tag,
7036	fault_info: &fault_info,
7037	NULL,
7038	type_of_fault: &type_of_fault,
7039	object_lock_type: &object_lock_type); / Exclusive lock mode. Will remain unchanged./
7040	if (kr != KERN_SUCCESS) {
7041	RELEASE_PAGE(m);
7042	GIVE_UP;
7043	}
7044
7045	done:
7046	/*
7047	* Unlock everything, and return
7048	*/
7049
7050	if (physpage_p) {
7051	/ for vm_map_wire_and_extract() /
7052	if (kr == KERN_SUCCESS) {
7053	assert(object == VM_PAGE_OBJECT(m));
7054	*physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
7055	if (prot & VM_PROT_WRITE) {
7056	vm_object_lock_assert_exclusive(object);
7057	m->vmp_dirty = TRUE;
7058	}
7059	} else {
7060	*physpage_p = `0`;
7061	}
7062	}
7063
7064	PAGE_WAKEUP_DONE(m);
7065	UNLOCK_AND_DEALLOCATE;
7066
7067	return kr;
7068	}
7069
7070	/*
7071	* Routine: vm_fault_copy_cleanup
7072	* Purpose:
7073	* Release a page used by vm_fault_copy.
7074	*/
7075
7076	static void
7077	vm_fault_copy_cleanup(
7078	vm_page_t page,
7079	vm_page_t top_page)
7080	{
7081	vm_object_t object = VM_PAGE_OBJECT(page);
7082
7083	vm_object_lock(object);
7084	PAGE_WAKEUP_DONE(page);
7085	if (!VM_PAGE_PAGEABLE(page)) {
7086	vm_page_lockspin_queues();
7087	if (!VM_PAGE_PAGEABLE(page)) {
7088	vm_page_activate(page);
7089	}
7090	vm_page_unlock_queues();
7091	}
7092	vm_fault_cleanup(object, top_page);
7093	}
7094
7095	static void
7096	vm_fault_copy_dst_cleanup(
7097	vm_page_t page)
7098	{
7099	vm_object_t object;
7100
7101	if (page != VM_PAGE_NULL) {
7102	object = VM_PAGE_OBJECT(page);
7103	vm_object_lock(object);
7104	vm_page_lockspin_queues();
7105	vm_page_unwire(page, TRUE);
7106	vm_page_unlock_queues();
7107	vm_object_paging_end(object);
7108	vm_object_unlock(object);
7109	}
7110	}
7111
7112	/*
7113	* Routine: vm_fault_copy
7114	*
7115	* Purpose:
7116	* Copy pages from one virtual memory object to another --
7117	* neither the source nor destination pages need be resident.
7118	*
7119	* Before actually copying a page, the version associated with
7120	* the destination address map wil be verified.
7121	*
7122	* In/out conditions:
7123	* The caller must hold a reference, but not a lock, to
7124	* each of the source and destination objects and to the
7125	* destination map.
7126	*
7127	* Results:
7128	* Returns KERN_SUCCESS if no errors were encountered in
7129	* reading or writing the data. Returns KERN_INTERRUPTED if
7130	* the operation was interrupted (only possible if the
7131	* "interruptible" argument is asserted). Other return values
7132	* indicate a permanent error in copying the data.
7133	*
7134	* The actual amount of data copied will be returned in the
7135	* "copy_size" argument. In the event that the destination map
7136	* verification failed, this amount may be less than the amount
7137	* requested.
7138	*/
7139	kern_return_t
7140	vm_fault_copy(
7141	vm_object_t src_object,
7142	vm_object_offset_t src_offset,
7143	vm_map_size_t copy_size, /* INOUT /
7144	vm_object_t dst_object,
7145	vm_object_offset_t dst_offset,
7146	vm_map_t dst_map,
7147	vm_map_version_t *dst_version,
7148	int interruptible)
7149	{
7150	vm_page_t result_page;
7151
7152	vm_page_t src_page;
7153	vm_page_t src_top_page;
7154	vm_prot_t src_prot;
7155
7156	vm_page_t dst_page;
7157	vm_page_t dst_top_page;
7158	vm_prot_t dst_prot;
7159
7160	vm_map_size_t amount_left;
7161	vm_object_t old_copy_object;
7162	uint32_t old_copy_version;
7163	vm_object_t result_page_object = NULL;
7164	kern_return_t error = `0`;
7165	vm_fault_return_t result;
7166
7167	vm_map_size_t part_size;
7168	struct vm_object_fault_info fault_info_src = {};
7169	struct vm_object_fault_info fault_info_dst = {};
7170
7171	/*
7172	* In order not to confuse the clustered pageins, align
7173	* the different offsets on a page boundary.
7174	*/
7175
7176	#define RETURN(x) \
7177	MACRO_BEGIN \
7178	*copy_size -= amount_left; \
7179	MACRO_RETURN(x); \
7180	MACRO_END
7181
7182	amount_left = *copy_size;
7183
7184	fault_info_src.interruptible = interruptible;
7185	fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
7186	fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
7187	fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
7188	fault_info_src.stealth = TRUE;
7189
7190	fault_info_dst.interruptible = interruptible;
7191	fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
7192	fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
7193	fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
7194	fault_info_dst.stealth = TRUE;
7195
7196	do { / while (amount_left > 0) /
7197	/*
7198	* There may be a deadlock if both source and destination
7199	* pages are the same. To avoid this deadlock, the copy must
7200	* start by getting the destination page in order to apply
7201	* COW semantics if any.
7202	*/
7203
7204	RetryDestinationFault:;
7205
7206	dst_prot = VM_PROT_WRITE \| VM_PROT_READ;
7207
7208	vm_object_lock(dst_object);
7209	vm_object_paging_begin(dst_object);
7210
7211	/ cap cluster size at maximum UPL size /
7212	upl_size_t cluster_size;
7213	if (os_convert_overflow(amount_left, &cluster_size)) {
7214	cluster_size = `0` - (upl_size_t)PAGE_SIZE;
7215	}
7216	fault_info_dst.cluster_size = cluster_size;
7217
7218	dst_page = VM_PAGE_NULL;
7219	result = vm_fault_page(first_object: dst_object,
7220	vm_object_trunc_page(dst_offset),
7221	VM_PROT_WRITE \| VM_PROT_READ,
7222	FALSE,
7223	FALSE, / page not looked up /
7224	protection: &dst_prot, result_page: &dst_page, top_page: &dst_top_page,
7225	type_of_fault: (int *)`0`,
7226	error_code: &error,
7227	no_zero_fill: dst_map->no_zero_fill,
7228	fault_info: &fault_info_dst);
7229	switch (result) {
7230	case VM_FAULT_SUCCESS:
7231	break;
7232	case VM_FAULT_RETRY:
7233	goto RetryDestinationFault;
7234	case VM_FAULT_MEMORY_SHORTAGE:
7235	if (vm_page_wait(interruptible)) {
7236	goto RetryDestinationFault;
7237	}
7238	ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_COPY_MEMORY_SHORTAGE), arg: `0` / arg /);
7239	OS_FALLTHROUGH;
7240	case VM_FAULT_INTERRUPTED:
7241	RETURN(MACH_SEND_INTERRUPTED);
7242	case VM_FAULT_SUCCESS_NO_VM_PAGE:
7243	/ success but no VM page: fail the copy /
7244	vm_object_paging_end(dst_object);
7245	vm_object_unlock(dst_object);
7246	OS_FALLTHROUGH;
7247	case VM_FAULT_MEMORY_ERROR:
7248	if (error) {
7249	return error;
7250	} else {
7251	return KERN_MEMORY_ERROR;
7252	}
7253	default:
7254	panic("vm_fault_copy: unexpected error 0x%x from "
7255	"vm_fault_page()\n", result);
7256	}
7257	assert((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
7258
7259	assert(dst_object == VM_PAGE_OBJECT(dst_page));
7260	old_copy_object = dst_object->vo_copy;
7261	old_copy_version = dst_object->vo_copy_version;
7262
7263	/*
7264	* There exists the possiblity that the source and
7265	* destination page are the same. But we can't
7266	* easily determine that now. If they are the
7267	* same, the call to vm_fault_page() for the
7268	* destination page will deadlock. To prevent this we
7269	* wire the page so we can drop busy without having
7270	* the page daemon steal the page. We clean up the
7271	* top page but keep the paging reference on the object
7272	* holding the dest page so it doesn't go away.
7273	*/
7274
7275	vm_page_lockspin_queues();
7276	vm_page_wire(page: dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
7277	vm_page_unlock_queues();
7278	PAGE_WAKEUP_DONE(dst_page);
7279	vm_object_unlock(dst_object);
7280
7281	if (dst_top_page != VM_PAGE_NULL) {
7282	vm_object_lock(dst_object);
7283	VM_PAGE_FREE(dst_top_page);
7284	vm_object_paging_end(dst_object);
7285	vm_object_unlock(dst_object);
7286	}
7287
7288	RetrySourceFault:;
7289
7290	if (src_object == VM_OBJECT_NULL) {
7291	/*
7292	* No source object. We will just
7293	* zero-fill the page in dst_object.
7294	*/
7295	src_page = VM_PAGE_NULL;
7296	result_page = VM_PAGE_NULL;
7297	} else {
7298	vm_object_lock(src_object);
7299	src_page = vm_page_lookup(object: src_object,
7300	vm_object_trunc_page(src_offset));
7301	if (src_page == dst_page) {
7302	src_prot = dst_prot;
7303	result_page = VM_PAGE_NULL;
7304	} else {
7305	src_prot = VM_PROT_READ;
7306	vm_object_paging_begin(src_object);
7307
7308	/ cap cluster size at maximum UPL size /
7309	if (os_convert_overflow(amount_left, &cluster_size)) {
7310	cluster_size = `0` - (upl_size_t)PAGE_SIZE;
7311	}
7312	fault_info_src.cluster_size = cluster_size;
7313
7314	result_page = VM_PAGE_NULL;
7315	result = vm_fault_page(
7316	first_object: src_object,
7317	vm_object_trunc_page(src_offset),
7318	VM_PROT_READ, FALSE,
7319	FALSE, / page not looked up /
7320	protection: &src_prot,
7321	result_page: &result_page, top_page: &src_top_page,
7322	type_of_fault: (int *)`0`, error_code: &error, FALSE,
7323	fault_info: &fault_info_src);
7324
7325	switch (result) {
7326	case VM_FAULT_SUCCESS:
7327	break;
7328	case VM_FAULT_RETRY:
7329	goto RetrySourceFault;
7330	case VM_FAULT_MEMORY_SHORTAGE:
7331	if (vm_page_wait(interruptible)) {
7332	goto RetrySourceFault;
7333	}
7334	OS_FALLTHROUGH;
7335	case VM_FAULT_INTERRUPTED:
7336	vm_fault_copy_dst_cleanup(page: dst_page);
7337	RETURN(MACH_SEND_INTERRUPTED);
7338	case VM_FAULT_SUCCESS_NO_VM_PAGE:
7339	/ success but no VM page: fail /
7340	vm_object_paging_end(src_object);
7341	vm_object_unlock(src_object);
7342	OS_FALLTHROUGH;
7343	case VM_FAULT_MEMORY_ERROR:
7344	vm_fault_copy_dst_cleanup(page: dst_page);
7345	if (error) {
7346	return error;
7347	} else {
7348	return KERN_MEMORY_ERROR;
7349	}
7350	default:
7351	panic("vm_fault_copy(2): unexpected "
7352	"error 0x%x from "
7353	"vm_fault_page()\n", result);
7354	}
7355
7356	result_page_object = VM_PAGE_OBJECT(result_page);
7357	assert((src_top_page == VM_PAGE_NULL) ==
7358	(result_page_object == src_object));
7359	}
7360	assert((src_prot & VM_PROT_READ) != VM_PROT_NONE);
7361	vm_object_unlock(result_page_object);
7362	}
7363
7364	vm_map_lock_read(dst_map);
7365
7366	if (!vm_map_verify(map: dst_map, version: dst_version)) {
7367	vm_map_unlock_read(dst_map);
7368	if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7369	vm_fault_copy_cleanup(page: result_page, top_page: src_top_page);
7370	}
7371	vm_fault_copy_dst_cleanup(page: dst_page);
7372	break;
7373	}
7374	assert(dst_object == VM_PAGE_OBJECT(dst_page));
7375
7376	vm_object_lock(dst_object);
7377
7378	if ((dst_object->vo_copy != old_copy_object \|\|
7379	dst_object->vo_copy_version != old_copy_version)) {
7380	vm_object_unlock(dst_object);
7381	vm_map_unlock_read(dst_map);
7382	if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7383	vm_fault_copy_cleanup(page: result_page, top_page: src_top_page);
7384	}
7385	vm_fault_copy_dst_cleanup(page: dst_page);
7386	break;
7387	}
7388	vm_object_unlock(dst_object);
7389
7390	/*
7391	* Copy the page, and note that it is dirty
7392	* immediately.
7393	*/
7394
7395	if (!page_aligned(src_offset) \|\|
7396	!page_aligned(dst_offset) \|\|
7397	!page_aligned(amount_left)) {
7398	vm_object_offset_t src_po,
7399	dst_po;
7400
7401	src_po = src_offset - vm_object_trunc_page(src_offset);
7402	dst_po = dst_offset - vm_object_trunc_page(dst_offset);
7403
7404	if (dst_po > src_po) {
7405	part_size = PAGE_SIZE - dst_po;
7406	} else {
7407	part_size = PAGE_SIZE - src_po;
7408	}
7409	if (part_size > (amount_left)) {
7410	part_size = amount_left;
7411	}
7412
7413	if (result_page == VM_PAGE_NULL) {
7414	assert((vm_offset_t) dst_po == dst_po);
7415	assert((vm_size_t) part_size == part_size);
7416	vm_page_part_zero_fill(m: dst_page,
7417	m_pa: (vm_offset_t) dst_po,
7418	len: (vm_size_t) part_size);
7419	} else {
7420	assert((vm_offset_t) src_po == src_po);
7421	assert((vm_offset_t) dst_po == dst_po);
7422	assert((vm_size_t) part_size == part_size);
7423	vm_page_part_copy(src_m: result_page,
7424	src_pa: (vm_offset_t) src_po,
7425	dst_m: dst_page,
7426	dst_pa: (vm_offset_t) dst_po,
7427	len: (vm_size_t)part_size);
7428	if (!dst_page->vmp_dirty) {
7429	vm_object_lock(dst_object);
7430	SET_PAGE_DIRTY(dst_page, TRUE);
7431	vm_object_unlock(dst_object);
7432	}
7433	}
7434	} else {
7435	part_size = PAGE_SIZE;
7436
7437	if (result_page == VM_PAGE_NULL) {
7438	vm_page_zero_fill(page: dst_page);
7439	} else {
7440	vm_object_lock(result_page_object);
7441	vm_page_copy(src_page: result_page, dest_page: dst_page);
7442	vm_object_unlock(result_page_object);
7443
7444	if (!dst_page->vmp_dirty) {
7445	vm_object_lock(dst_object);
7446	SET_PAGE_DIRTY(dst_page, TRUE);
7447	vm_object_unlock(dst_object);
7448	}
7449	}
7450	}
7451
7452	/*
7453	* Unlock everything, and return
7454	*/
7455
7456	vm_map_unlock_read(dst_map);
7457
7458	if (result_page != VM_PAGE_NULL && src_page != dst_page) {
7459	vm_fault_copy_cleanup(page: result_page, top_page: src_top_page);
7460	}
7461	vm_fault_copy_dst_cleanup(page: dst_page);
7462
7463	amount_left -= part_size;
7464	src_offset += part_size;
7465	dst_offset += part_size;
7466	} while (amount_left > `0`);
7467
7468	RETURN(KERN_SUCCESS);
7469	#undef RETURN
7470
7471	/NOTREACHED/
7472	}
7473
#if     VM_FAULT_CLASSIFY
/*
 *	Temporary statistics gathering support.
 */

/*
 *	Statistics arrays: vm_fault_stats[type][level], where "type" is one
 *	of the VM_FAULT_TYPE_* values below and "level" is the depth in the
 *	shadow chain at which the fault was classified.  Valid indices are
 *	0 .. VM_FAULT_TYPES_MAX - 1 and 0 .. VM_FAULT_LEVEL_MAX - 1.
 */
#define VM_FAULT_TYPES_MAX      5
#define VM_FAULT_LEVEL_MAX      8

int     vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];

#define VM_FAULT_TYPE_ZERO_FILL 0
#define VM_FAULT_TYPE_MAP_IN    1
#define VM_FAULT_TYPE_PAGER     2
#define VM_FAULT_TYPE_COPY      3
#define VM_FAULT_TYPE_OTHER     4


/*
 *	vm_fault_classify:
 *
 *	Walk the shadow chain starting at (object, offset) and bump the
 *	vm_fault_stats counter matching what a fault of "fault_type"
 *	would find:
 *	  - a resident page that is busy/error/restart/absent: OTHER
 *	  - a resident page, for a non-write fault, or at the top level
 *	    of an object without a copy object:                 MAP_IN
 *	  - any other resident page:                            COPY
 *	  - no page, but the object has a pager:                PAGER
 *	  - no page, no pager and no shadow:                    ZERO_FILL
 *
 *	Chains deeper than the table are accounted in its last column.
 */
void
vm_fault_classify(vm_object_t           object,
    vm_object_offset_t    offset,
    vm_prot_t             fault_type)
{
    int             type, level = 0;
    vm_page_t       m;

    while (TRUE) {
        m = vm_page_lookup(object, offset);
        if (m != VM_PAGE_NULL) {
            if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
                type = VM_FAULT_TYPE_OTHER;
                break;
            }
            if (((fault_type & VM_PROT_WRITE) == 0) ||
                ((level == 0) && object->vo_copy == VM_OBJECT_NULL)) {
                type = VM_FAULT_TYPE_MAP_IN;
                break;
            }
            type = VM_FAULT_TYPE_COPY;
            break;
        } else {
            if (object->pager_created) {
                type = VM_FAULT_TYPE_PAGER;
                break;
            }
            if (object->shadow == VM_OBJECT_NULL) {
                type = VM_FAULT_TYPE_ZERO_FILL;
                break;
            }

            /* Nothing here: descend to the shadow object. */
            offset += object->vo_shadow_offset;
            object = object->shadow;
            level++;
            continue;
        }
    }

    /*
     * The last valid column is VM_FAULT_LEVEL_MAX - 1; clamping to
     * VM_FAULT_LEVEL_MAX itself would index one past the end of the
     * row (and, for the last row, past the end of the array).
     */
    if (level >= VM_FAULT_LEVEL_MAX) {
        level = VM_FAULT_LEVEL_MAX - 1;
    }

    vm_fault_stats[type][level] += 1;

    return;
}

/*
 *	vm_fault_classify_init:
 *
 *	Cleanup routine to call from the debugger: resets every
 *	vm_fault_stats counter to zero.
 */

void
vm_fault_classify_init(void)
{
    int type, level;

    for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
        for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
            vm_fault_stats[type][level] = 0;
        }
    }

    return;
}
#endif  /* VM_FAULT_CLASSIFY */
7558
7559	vm_offset_t
7560	kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
7561	{
7562	vm_map_entry_t entry;
7563	vm_object_t object;
7564	vm_offset_t object_offset;
7565	vm_page_t m;
7566	int compressor_external_state, compressed_count_delta;
7567	vm_compressor_options_t compressor_flags = (C_DONT_BLOCK \| C_KEEP \| C_KDP);
7568	int my_fault_type = VM_PROT_READ;
7569	kern_return_t kr;
7570	int effective_page_mask, effective_page_size;
7571
7572	if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
7573	effective_page_mask = VM_MAP_PAGE_MASK(map);
7574	effective_page_size = VM_MAP_PAGE_SIZE(map);
7575	} else {
7576	effective_page_mask = PAGE_MASK;
7577	effective_page_size = PAGE_SIZE;
7578	}
7579
7580	if (not_in_kdp) {
7581	panic("kdp_lightweight_fault called from outside of debugger context");
7582	}
7583
7584	assert(map != VM_MAP_NULL);
7585
7586	assert((cur_target_addr & effective_page_mask) == `0`);
7587	if ((cur_target_addr & effective_page_mask) != `0`) {
7588	return `0`;
7589	}
7590
7591	if (kdp_lck_rw_lock_is_acquired_exclusive(lck: &map->lock)) {
7592	return `0`;
7593	}
7594
7595	if (!vm_map_lookup_entry(map, address: cur_target_addr, entry: &entry)) {
7596	return `0`;
7597	}
7598
7599	if (entry->is_sub_map) {
7600	return `0`;
7601	}
7602
7603	object = VME_OBJECT(entry);
7604	if (object == VM_OBJECT_NULL) {
7605	return `0`;
7606	}
7607
7608	object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
7609
7610	while (TRUE) {
7611	if (kdp_lck_rw_lock_is_acquired_exclusive(lck: &object->Lock)) {
7612	return `0`;
7613	}
7614
7615	if (object->pager_created && (object->paging_in_progress \|\|
7616	object->activity_in_progress)) {
7617	return `0`;
7618	}
7619
7620	m = kdp_vm_page_lookup(object, vm_object_trunc_page(object_offset));
7621
7622	if (m != VM_PAGE_NULL) {
7623	if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
7624	return `0`;
7625	}
7626
7627	if (m->vmp_laundry \|\| m->vmp_busy \|\| m->vmp_free_when_done \|\| m->vmp_absent \|\| VMP_ERROR_GET(m) \|\| m->vmp_cleaning \|\|
7628	m->vmp_overwriting \|\| m->vmp_restart \|\| m->vmp_unusual) {
7629	return `0`;
7630	}
7631
7632	assert(!m->vmp_private);
7633	if (m->vmp_private) {
7634	return `0`;
7635	}
7636
7637	assert(!m->vmp_fictitious);
7638	if (m->vmp_fictitious) {
7639	return `0`;
7640	}
7641
7642	assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7643	if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7644	return `0`;
7645	}
7646
7647	return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
7648	}
7649
7650	compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
7651
7652	if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
7653	if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
7654	kr = vm_compressor_pager_get(mem_obj: object->pager,
7655	vm_object_trunc_page(object_offset + object->paging_offset),
7656	ppnum: kdp_compressor_decompressed_page_ppnum, my_fault_type: &my_fault_type,
7657	flags: compressor_flags, compressed_count_delta_p: &compressed_count_delta);
7658	if (kr == KERN_SUCCESS) {
7659	return kdp_compressor_decompressed_page_paddr;
7660	} else {
7661	return `0`;
7662	}
7663	}
7664	}
7665
7666	if (object->shadow == VM_OBJECT_NULL) {
7667	return `0`;
7668	}
7669
7670	object_offset += object->vo_shadow_offset;
7671	object = object->shadow;
7672	}
7673	}
7674
7675	/*
7676	* vm_page_validate_cs_fast():
7677	* Performs a few quick checks to determine if the page's code signature
7678	* really needs to be fully validated. It could:
7679	* 1. have been modified (i.e. automatically tainted),
7680	* 2. have already been validated,
7681	* 3. have already been found to be tainted,
7682	* 4. no longer have a backing store.
7683	* Returns FALSE if the page needs to be fully validated.
7684	*/
7685	static boolean_t
7686	vm_page_validate_cs_fast(
7687	vm_page_t page,
7688	vm_map_size_t fault_page_size,
7689	vm_map_offset_t fault_phys_offset)
7690	{
7691	vm_object_t object;
7692
7693	object = VM_PAGE_OBJECT(page);
7694	vm_object_lock_assert_held(object);
7695
7696	if (page->vmp_wpmapped &&
7697	!VMP_CS_TAINTED(p: page, fault_page_size, fault_phys_offset)) {
7698	/*
7699	* This page was mapped for "write" access sometime in the
7700	* past and could still be modifiable in the future.
7701	* Consider it tainted.
7702	* [ If the page was already found to be "tainted", no
7703	* need to re-validate. ]
7704	*/
7705	vm_object_lock_assert_exclusive(object);
7706	VMP_CS_SET_VALIDATED(p: page, fault_page_size, fault_phys_offset, TRUE);
7707	VMP_CS_SET_TAINTED(p: page, fault_page_size, fault_phys_offset, TRUE);
7708	if (cs_debug) {
7709	printf(format: "CODESIGNING: %s: "
7710	"page %p obj %p off 0x%llx "
7711	"was modified\n",
7712	__FUNCTION__,
7713	page, object, page->vmp_offset);
7714	}
7715	vm_cs_validated_dirtied++;
7716	}
7717
7718	if (VMP_CS_VALIDATED(p: page, fault_page_size, fault_phys_offset) \|\|
7719	VMP_CS_TAINTED(p: page, fault_page_size, fault_phys_offset)) {
7720	return TRUE;
7721	}
7722	vm_object_lock_assert_exclusive(object);
7723
7724	#if CHECK_CS_VALIDATION_BITMAP
7725	kern_return_t kr;
7726
7727	kr = vnode_pager_cs_check_validation_bitmap(
7728	object->pager,
7729	page->vmp_offset + object->paging_offset,
7730	CS_BITMAP_CHECK);
7731	if (kr == KERN_SUCCESS) {
7732	page->vmp_cs_validated = VMP_CS_ALL_TRUE;
7733	page->vmp_cs_tainted = VMP_CS_ALL_FALSE;
7734	vm_cs_bitmap_validated++;
7735	return TRUE;
7736	}
7737	#endif /* CHECK_CS_VALIDATION_BITMAP */
7738
7739	if (!object->alive \|\| object->terminating \|\| object->pager == NULL) {
7740	/*
7741	* The object is terminating and we don't have its pager
7742	* so we can't validate the data...
7743	*/
7744	return TRUE;
7745	}
7746
7747	/ we need to really validate this page /
7748	vm_object_lock_assert_exclusive(object);
7749	return FALSE;
7750	}
7751
7752	void
7753	vm_page_validate_cs_mapped_slow(
7754	vm_page_t page,
7755	const void *kaddr)
7756	{
7757	vm_object_t object;
7758	memory_object_offset_t mo_offset;
7759	memory_object_t pager;
7760	struct vnode *vnode;
7761	int validated, tainted, nx;
7762
7763	assert(page->vmp_busy);
7764	object = VM_PAGE_OBJECT(page);
7765	vm_object_lock_assert_exclusive(object);
7766
7767	vm_cs_validates++;
7768
7769	/*
7770	* Since we get here to validate a page that was brought in by
7771	* the pager, we know that this pager is all setup and ready
7772	* by now.
7773	*/
7774	assert(object->code_signed);
7775	assert(!object->internal);
7776	assert(object->pager != NULL);
7777	assert(object->pager_ready);
7778
7779	pager = object->pager;
7780	assert(object->paging_in_progress);
7781	vnode = vnode_pager_lookup_vnode(pager);
7782	mo_offset = page->vmp_offset + object->paging_offset;
7783
7784	/ verify the SHA1 hash for this page /
7785	validated = `0`;
7786	tainted = `0`;
7787	nx = `0`;
7788	cs_validate_page(vp: vnode,
7789	pager,
7790	offset: mo_offset,
7791	data: (const void )((const* char *)kaddr),
7792	validated_p: &validated,
7793	tainted_p: &tainted,
7794	nx_p: &nx);
7795
7796	page->vmp_cs_validated \|= validated;
7797	page->vmp_cs_tainted \|= tainted;
7798	page->vmp_cs_nx \|= nx;
7799
7800	#if CHECK_CS_VALIDATION_BITMAP
7801	if (page->vmp_cs_validated == VMP_CS_ALL_TRUE &&
7802	page->vmp_cs_tainted == VMP_CS_ALL_FALSE) {
7803	vnode_pager_cs_check_validation_bitmap(object->pager,
7804	mo_offset,
7805	CS_BITMAP_SET);
7806	}
7807	#endif /* CHECK_CS_VALIDATION_BITMAP */
7808	}
7809
7810	void
7811	vm_page_validate_cs_mapped(
7812	vm_page_t page,
7813	vm_map_size_t fault_page_size,
7814	vm_map_offset_t fault_phys_offset,
7815	const void *kaddr)
7816	{
7817	if (!vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7818	vm_page_validate_cs_mapped_slow(page, kaddr);
7819	}
7820	}
7821
7822	static void
7823	vm_page_map_and_validate_cs(
7824	vm_object_t object,
7825	vm_page_t page)
7826	{
7827	vm_object_offset_t offset;
7828	vm_map_offset_t koffset;
7829	vm_map_size_t ksize;
7830	vm_offset_t kaddr;
7831	kern_return_t kr;
7832	boolean_t busy_page;
7833	boolean_t need_unmap;
7834
7835	vm_object_lock_assert_exclusive(object);
7836
7837	assert(object->code_signed);
7838	offset = page->vmp_offset;
7839
7840	busy_page = page->vmp_busy;
7841	if (!busy_page) {
7842	/ keep page busy while we map (and unlock) the VM object /
7843	page->vmp_busy = TRUE;
7844	}
7845
7846	/*
7847	* Take a paging reference on the VM object
7848	* to protect it from collapse or bypass,
7849	* and keep it from disappearing too.
7850	*/
7851	vm_object_paging_begin(object);
7852
7853	/ map the page in the kernel address space /
7854	ksize = PAGE_SIZE_64;
7855	koffset = `0`;
7856	need_unmap = FALSE;
7857	kr = vm_paging_map_object(page,
7858	object,
7859	offset,
7860	VM_PROT_READ,
7861	FALSE, / can't unlock object ! /
7862	size: &ksize,
7863	address: &koffset,
7864	need_unmap: &need_unmap);
7865	if (kr != KERN_SUCCESS) {
7866	panic("%s: could not map page: 0x%x", __FUNCTION__, kr);
7867	}
7868	kaddr = CAST_DOWN(vm_offset_t, koffset);
7869
7870	/ validate the mapped page /
7871	vm_page_validate_cs_mapped_slow(page, kaddr: (const void *) kaddr);
7872
7873	assert(page->vmp_busy);
7874	assert(object == VM_PAGE_OBJECT(page));
7875	vm_object_lock_assert_exclusive(object);
7876
7877	if (!busy_page) {
7878	PAGE_WAKEUP_DONE(page);
7879	}
7880	if (need_unmap) {
7881	/ unmap the map from the kernel address space /
7882	vm_paging_unmap_object(object, start: koffset, end: koffset + ksize);
7883	koffset = `0`;
7884	ksize = `0`;
7885	kaddr = `0`;
7886	}
7887	vm_object_paging_end(object);
7888	}
7889
7890	void
7891	vm_page_validate_cs(
7892	vm_page_t page,
7893	vm_map_size_t fault_page_size,
7894	vm_map_offset_t fault_phys_offset)
7895	{
7896	vm_object_t object;
7897
7898	object = VM_PAGE_OBJECT(page);
7899	vm_object_lock_assert_held(object);
7900
7901	if (vm_page_validate_cs_fast(page, fault_page_size, fault_phys_offset)) {
7902	return;
7903	}
7904	vm_page_map_and_validate_cs(object, page);
7905	}
7906
7907	void
7908	vm_page_validate_cs_mapped_chunk(
7909	vm_page_t page,
7910	const void *kaddr,
7911	vm_offset_t chunk_offset,
7912	vm_size_t chunk_size,
7913	boolean_t *validated_p,
7914	unsigned *tainted_p)
7915	{
7916	vm_object_t object;
7917	vm_object_offset_t offset, offset_in_page;
7918	memory_object_t pager;
7919	struct vnode *vnode;
7920	boolean_t validated;
7921	unsigned tainted;
7922
7923	*validated_p = FALSE;
7924	*tainted_p = `0`;
7925
7926	assert(page->vmp_busy);
7927	object = VM_PAGE_OBJECT(page);
7928	vm_object_lock_assert_exclusive(object);
7929
7930	assert(object->code_signed);
7931	offset = page->vmp_offset;
7932
7933	if (!object->alive \|\| object->terminating \|\| object->pager == NULL) {
7934	/*
7935	* The object is terminating and we don't have its pager
7936	* so we can't validate the data...
7937	*/
7938	return;
7939	}
7940	/*
7941	* Since we get here to validate a page that was brought in by
7942	* the pager, we know that this pager is all setup and ready
7943	* by now.
7944	*/
7945	assert(!object->internal);
7946	assert(object->pager != NULL);
7947	assert(object->pager_ready);
7948
7949	pager = object->pager;
7950	assert(object->paging_in_progress);
7951	vnode = vnode_pager_lookup_vnode(pager);
7952
7953	/ verify the signature for this chunk /
7954	offset_in_page = chunk_offset;
7955	assert(offset_in_page < PAGE_SIZE);
7956
7957	tainted = `0`;
7958	validated = cs_validate_range(vp: vnode,
7959	pager,
7960	offset: (object->paging_offset +
7961	offset +
7962	offset_in_page),
7963	data: (const void )((const* char *)kaddr
7964	+ offset_in_page),
7965	size: chunk_size,
7966	result: &tainted);
7967	if (validated) {
7968	*validated_p = TRUE;
7969	}
7970	if (tainted) {
7971	*tainted_p = tainted;
7972	}
7973	}
7974
7975	static void
7976	vm_rtfrecord_lock(void)
7977	{
7978	lck_spin_lock(lck: &vm_rtfr_slock);
7979	}
7980
7981	static void
7982	vm_rtfrecord_unlock(void)
7983	{
7984	lck_spin_unlock(lck: &vm_rtfr_slock);
7985	}
7986
7987	unsigned int
7988	vmrtfaultinfo_bufsz(void)
7989	{
7990	return vmrtf_num_records * sizeof(vm_rtfault_record_t);
7991	}
7992
7993	#include <kern/backtrace.h>
7994
7995	__attribute__((noinline))
7996	static void
7997	vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
7998	{
7999	uint64_t fend = mach_continuous_time();
8000
8001	uint64_t cfpc = `0`;
8002	uint64_t ctid = cthread->thread_id;
8003	uint64_t cupid = get_current_unique_pid();
8004
8005	uintptr_t bpc = `0`;
8006	errno_t btr = `0`;
8007
8008	/*
8009	* Capture a single-frame backtrace. This extracts just the program
8010	* counter at the point of the fault, and should not use copyin to get
8011	* Rosetta save state.
8012	*/
8013	struct backtrace_control ctl = {
8014	.btc_user_thread = cthread,
8015	.btc_user_copy = backtrace_user_copy_error,
8016	};
8017	unsigned int bfrs = backtrace_user(bt: &bpc, btlen: `1U`, ctl: &ctl, NULL);
8018	if ((btr == `0`) && (bfrs > `0`)) {
8019	cfpc = bpc;
8020	}
8021
8022	assert((fstart != `0`) && fend >= fstart);
8023	vm_rtfrecord_lock();
8024	assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
8025
8026	vmrtfrs.vmrtf_total++;
8027	vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
8028
8029	cvmr->rtfabstime = fstart;
8030	cvmr->rtfduration = fend - fstart;
8031	cvmr->rtfaddr = fault_vaddr;
8032	cvmr->rtfpc = cfpc;
8033	cvmr->rtftype = type_of_fault;
8034	cvmr->rtfupid = cupid;
8035	cvmr->rtftid = ctid;
8036
8037	if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
8038	vmrtfrs.vmrtfr_curi = `0`;
8039	}
8040
8041	vm_rtfrecord_unlock();
8042	}
8043
8044	int
8045	vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, unsigned long vrecordsz, void vrecords, unsigned* long *vmrtfrv)
8046	{
8047	vm_rtfault_record_t *cvmrd = vrecords;
8048	size_t residue = vrecordsz;
8049	size_t numextracted = `0`;
8050	boolean_t early_exit = FALSE;
8051
8052	vm_rtfrecord_lock();
8053
8054	for (int vmfi = `0`; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
8055	if (residue < sizeof(vm_rtfault_record_t)) {
8056	early_exit = TRUE;
8057	break;
8058	}
8059
8060	if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
8061	#if DEVELOPMENT \|\| DEBUG
8062	if (isroot == FALSE) {
8063	continue;
8064	}
8065	#else
8066	continue;
8067	#endif /* DEVDEBUG */
8068	}
8069
8070	*cvmrd = vmrtfrs.vm_rtf_records[vmfi];
8071	cvmrd++;
8072	residue -= sizeof(vm_rtfault_record_t);
8073	numextracted++;
8074	}
8075
8076	vm_rtfrecord_unlock();
8077
8078	*vmrtfrv = numextracted;
8079	return early_exit;
8080	}
8081
8082	/*
8083	* Only allow one diagnosis to be in flight at a time, to avoid
8084	* creating too much additional memory usage.
8085	*/
8086	static volatile uint_t vmtc_diagnosing;
8087	unsigned int vmtc_total = `0`;
8088
8089	/*
8090	* Type used to update telemetry for the diagnosis counts.
8091	*/
8092	CA_EVENT(vmtc_telemetry,
8093	CA_INT, vmtc_num_byte, / number of corrupt bytes found /
8094	CA_BOOL, vmtc_undiagnosed, / undiagnosed because more than 1 at a time /
8095	CA_BOOL, vmtc_not_eligible, / the page didn't qualify /
8096	CA_BOOL, vmtc_copyin_fail, / unable to copy in the page /
8097	CA_BOOL, vmtc_not_found, / no corruption found even though CS failed /
8098	CA_BOOL, vmtc_one_bit_flip, / single bit flip /
8099	CA_BOOL, vmtc_testing); / caused on purpose by testing /
8100
8101	#if DEVELOPMENT \|\| DEBUG
8102	/*
8103	* Buffers used to compare before/after page contents.
8104	* Stashed to aid when debugging crashes.
8105	*/
8106	static size_t vmtc_last_buffer_size = `0`;
8107	static uint64_t *vmtc_last_before_buffer = NULL;
8108	static uint64_t *vmtc_last_after_buffer = NULL;
8109
8110	/*
8111	* Needed to record corruptions due to testing.
8112	*/
8113	static uintptr_t corruption_test_va = `0`;
8114	#endif /* DEVELOPMENT \|\| DEBUG */
8115
8116	/*
8117	* Stash a copy of data from a possibly corrupt page.
8118	*/
8119	static uint64_t *
8120	vmtc_get_page_data(
8121	vm_map_offset_t code_addr,
8122	vm_page_t page)
8123	{
8124	uint64_t *buffer = NULL;
8125	addr64_t buffer_paddr;
8126	addr64_t page_paddr;
8127	extern void bcopy_phys(addr64_t from, addr64_t to, vm_size_t bytes);
8128	uint_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
8129
8130	/*
8131	* Need an aligned buffer to do a physical copy.
8132	*/
8133	if (kernel_memory_allocate(map: kernel_map, addrp: (vm_offset_t *)&buffer,
8134	size, mask: size - `1`, flags: KMA_KOBJECT, VM_KERN_MEMORY_DIAG) != KERN_SUCCESS) {
8135	return NULL;
8136	}
8137	buffer_paddr = kvtophys(va: (vm_offset_t)buffer);
8138	page_paddr = ptoa(VM_PAGE_GET_PHYS_PAGE(page));
8139
8140	/ adjust the page start address if we need only 4K of a 16K page /
8141	if (size < PAGE_SIZE) {
8142	uint_t subpage_start = ((code_addr & (PAGE_SIZE - `1`)) & ~(size - `1`));
8143	page_paddr += subpage_start;
8144	}
8145
8146	bcopy_phys(from: page_paddr, to: buffer_paddr, bytes: size);
8147	return buffer;
8148	}
8149
8150	/*
8151	* Set things up so we can diagnose a potential text page corruption.
8152	*/
8153	static uint64_t *
8154	vmtc_text_page_diagnose_setup(
8155	vm_map_offset_t code_addr,
8156	vm_page_t page,
8157	CA_EVENT_TYPE(vmtc_telemetry) *event)
8158	{
8159	uint64_t *buffer = NULL;
8160
8161	/*
8162	* If another is being diagnosed, skip this one.
8163	*/
8164	if (!OSCompareAndSwap(`0`, `1`, &vmtc_diagnosing)) {
8165	event->vmtc_undiagnosed = true;
8166	return NULL;
8167	}
8168
8169	/*
8170	* Get the contents of the corrupt page.
8171	*/
8172	buffer = vmtc_get_page_data(code_addr, page);
8173	if (buffer == NULL) {
8174	event->vmtc_copyin_fail = true;
8175	if (!OSCompareAndSwap(`1`, `0`, &vmtc_diagnosing)) {
8176	panic("Bad compare and swap in setup!");
8177	}
8178	return NULL;
8179	}
8180	return buffer;
8181	}
8182
8183	/*
8184	* Diagnose the text page by comparing its contents with
8185	* the one we've previously saved.
8186	*/
8187	static void
8188	vmtc_text_page_diagnose(
8189	vm_map_offset_t code_addr,
8190	uint64_t *old_code_buffer,
8191	CA_EVENT_TYPE(vmtc_telemetry) *event)
8192	{
8193	uint64_t *new_code_buffer;
8194	size_t size = MIN(vm_map_page_size(current_map()), PAGE_SIZE);
8195	uint_t count = (uint_t)size / sizeof(uint64_t);
8196	uint_t diff_count = `0`;
8197	bool bit_flip = false;
8198	uint_t b;
8199	uint64_t *new;
8200	uint64_t *old;
8201
8202	new_code_buffer = kalloc_data(size, Z_WAITOK);
8203	assert(new_code_buffer != NULL);
8204	if (copyin((user_addr_t)vm_map_trunc_page(code_addr, size - `1`), new_code_buffer, size) != `0`) {
8205	/ copyin error, so undo things /
8206	event->vmtc_copyin_fail = true;
8207	goto done;
8208	}
8209
8210	new = new_code_buffer;
8211	old = old_code_buffer;
8212	for (; count-- > `0`; ++new, ++old) {
8213	if (new == old) {
8214	continue;
8215	}
8216
8217	/*
8218	* On first diff, check for a single bit flip
8219	*/
8220	if (diff_count == `0`) {
8221	uint64_t x = (new ^ old);
8222	assert(x != `0`);
8223	if ((x & (x - `1`)) == `0`) {
8224	bit_flip = true;
8225	++diff_count;
8226	continue;
8227	}
8228	}
8229
8230	/*
8231	* count up the number of different bytes.
8232	*/
8233	for (b = `0`; b < sizeof(uint64_t); ++b) {
8234	char n = (char* *)new;
8235	char o = (char* *)old;
8236	if (n[b] != o[b]) {
8237	++diff_count;
8238	}
8239	}
8240	}
8241
8242	if (diff_count > `1`) {
8243	bit_flip = false;
8244	}
8245
8246	if (diff_count == `0`) {
8247	event->vmtc_not_found = true;
8248	} else {
8249	event->vmtc_num_byte = diff_count;
8250	}
8251	if (bit_flip) {
8252	event->vmtc_one_bit_flip = true;
8253	}
8254
8255	done:
8256	/*
8257	* Free up the code copy buffers, but save the last
8258	* set on development / debug kernels in case they
8259	* can provide evidence for debugging memory stomps.
8260	*/
8261	#if DEVELOPMENT \|\| DEBUG
8262	if (vmtc_last_before_buffer != NULL) {
8263	kmem_free(kernel_map, (vm_offset_t)vmtc_last_before_buffer, vmtc_last_buffer_size);
8264	}
8265	if (vmtc_last_after_buffer != NULL) {
8266	kfree_data(vmtc_last_after_buffer, vmtc_last_buffer_size);
8267	}
8268	vmtc_last_before_buffer = old_code_buffer;
8269	vmtc_last_after_buffer = new_code_buffer;
8270	vmtc_last_buffer_size = size;
8271	#else /* DEVELOPMENT \|\| DEBUG */
8272	kfree_data(new_code_buffer, size);
8273	kmem_free(map: kernel_map, addr: (vm_offset_t)old_code_buffer, size);
8274	#endif /* DEVELOPMENT \|\| DEBUG */
8275
8276	/*
8277	* We're finished, so clear the diagnosing flag.
8278	*/
8279	if (!OSCompareAndSwap(`1`, `0`, &vmtc_diagnosing)) {
8280	panic("Bad compare and swap in diagnose!");
8281	}
8282	}
8283
8284	/*
8285	* For the given map, virt address, find the object, offset, and page.
8286	* This has to lookup the map entry, verify protections, walk any shadow chains.
8287	* If found, returns with the object locked.
8288	*/
8289	static kern_return_t
8290	vmtc_revalidate_lookup(
8291	vm_map_t map,
8292	vm_map_offset_t vaddr,
8293	vm_object_t *ret_object,
8294	vm_object_offset_t *ret_offset,
8295	vm_page_t *ret_page,
8296	vm_prot_t *ret_prot)
8297	{
8298	vm_object_t object;
8299	vm_object_offset_t offset;
8300	vm_page_t page;
8301	kern_return_t kr = KERN_SUCCESS;
8302	uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
8303	vm_map_version_t version;
8304	boolean_t wired;
8305	struct vm_object_fault_info fault_info = {};
8306	vm_map_t real_map = NULL;
8307	vm_prot_t prot;
8308	vm_object_t shadow;
8309
8310	/*
8311	* Find the object/offset for the given location/map.
8312	* Note this returns with the object locked.
8313	*/
8314	restart:
8315	vm_map_lock_read(map);
8316	object = VM_OBJECT_NULL; / in case we come around the restart path /
8317	kr = vm_map_lookup_and_lock_object(var_map: &map, vaddr, VM_PROT_READ,
8318	object_lock_type, out_version: &version, object: &object, offset: &offset, out_prot: &prot, wired: &wired,
8319	fault_info: &fault_info, real_map: &real_map, NULL);
8320	vm_map_unlock_read(map);
8321	if (real_map != NULL && real_map != map) {
8322	vm_map_unlock(real_map);
8323	}
8324
8325	/*
8326	* If there's no page here, fail.
8327	*/
8328	if (kr != KERN_SUCCESS \|\| object == NULL) {
8329	kr = KERN_FAILURE;
8330	goto done;
8331	}
8332
8333	/*
8334	* Chase down any shadow chains to find the actual page.
8335	*/
8336	for (;;) {
8337	/*
8338	* See if the page is on the current object.
8339	*/
8340	page = vm_page_lookup(object, vm_object_trunc_page(offset));
8341	if (page != NULL) {
8342	/ restart the lookup /
8343	if (page->vmp_restart) {
8344	vm_object_unlock(object);
8345	goto restart;
8346	}
8347
8348	/*
8349	* If this page is busy, we need to wait for it.
8350	*/
8351	if (page->vmp_busy) {
8352	PAGE_SLEEP(object, page, TRUE);
8353	vm_object_unlock(object);
8354	goto restart;
8355	}
8356	break;
8357	}
8358
8359	/*
8360	* If the object doesn't have the page and
8361	* has no shadow, then we can quit.
8362	*/
8363	shadow = object->shadow;
8364	if (shadow == NULL) {
8365	kr = KERN_FAILURE;
8366	goto done;
8367	}
8368
8369	/*
8370	* Move to the next object
8371	*/
8372	offset += object->vo_shadow_offset;
8373	vm_object_lock(shadow);
8374	vm_object_unlock(object);
8375	object = shadow;
8376	shadow = VM_OBJECT_NULL;
8377	}
8378	*ret_object = object;
8379	*ret_offset = vm_object_trunc_page(offset);
8380	*ret_page = page;
8381	*ret_prot = prot;
8382
8383	done:
8384	if (kr != KERN_SUCCESS && object != NULL) {
8385	vm_object_unlock(object);
8386	}
8387	return kr;
8388	}
8389
8390	/*
8391	* Check if a page is wired, needs extra locking.
8392	*/
8393	static bool
8394	is_page_wired(vm_page_t page)
8395	{
8396	bool result;
8397	vm_page_lock_queues();
8398	result = VM_PAGE_WIRED(page);
8399	vm_page_unlock_queues();
8400	return result;
8401	}
8402
8403	/*
8404	* A fatal process error has occurred in the given task.
8405	* Recheck the code signing of the text page at the given
8406	* address to check for a text page corruption.
8407	*
8408	* Returns KERN_FAILURE if a page was found to be corrupt
8409	* by failing to match its code signature. KERN_SUCCESS
8410	* means the page is either valid or we don't have the
8411	* information to say it's corrupt.
8412	*/
8413	kern_return_t
8414	revalidate_text_page(task_t task, vm_map_offset_t code_addr)
8415	{
8416	kern_return_t kr;
8417	vm_map_t map;
8418	vm_object_t object = NULL;
8419	vm_object_offset_t offset;
8420	vm_page_t page = NULL;
8421	struct vnode *vnode;
8422	uint64_t *diagnose_buffer = NULL;
8423	CA_EVENT_TYPE(vmtc_telemetry) * event = NULL;
8424	ca_event_t ca_event = NULL;
8425	vm_prot_t prot;
8426
8427	map = task->map;
8428	if (task->map == NULL) {
8429	return KERN_SUCCESS;
8430	}
8431
8432	kr = vmtc_revalidate_lookup(map, vaddr: code_addr, ret_object: &object, ret_offset: &offset, ret_page: &page, ret_prot: &prot);
8433	if (kr != KERN_SUCCESS) {
8434	goto done;
8435	}
8436
8437	/*
8438	* The page must be executable.
8439	*/
8440	if (!(prot & VM_PROT_EXECUTE)) {
8441	goto done;
8442	}
8443
8444	/*
8445	* The object needs to have a pager.
8446	*/
8447	if (object->pager == NULL) {
8448	goto done;
8449	}
8450
8451	/*
8452	* Needs to be a vnode backed page to have a signature.
8453	*/
8454	vnode = vnode_pager_lookup_vnode(object->pager);
8455	if (vnode == NULL) {
8456	goto done;
8457	}
8458
8459	/*
8460	* Object checks to see if we should proceed.
8461	*/
8462	if (!object->code_signed \|\| / no code signature to check /
8463	object->internal \|\| / internal objects aren't signed /
8464	object->terminating \|\| / the object and its pages are already going away /
8465	!object->pager_ready) { / this should happen, but check shouldn't hurt /
8466	goto done;
8467	}
8468
8469
8470	/*
8471	* Check the code signature of the page in question.
8472	*/
8473	vm_page_map_and_validate_cs(object, page);
8474
8475	/*
8476	* At this point:
8477	* vmp_cs_validated \|= validated (set if a code signature exists)
8478	* vmp_cs_tainted \|= tainted (set if code signature violation)
8479	* vmp_cs_nx \|= nx; ??
8480	*
8481	* if vmp_pmapped then have to pmap_disconnect..
8482	* other flags to check on object or page?
8483	*/
8484	if (page->vmp_cs_tainted != VMP_CS_ALL_FALSE) {
8485	#if DEBUG \|\| DEVELOPMENT
8486	/*
8487	* On development builds, a boot-arg can be used to cause
8488	* a panic, instead of a quiet repair.
8489	*/
8490	if (vmtc_panic_instead) {
8491	panic("Text page corruption detected: vm_page_t 0x%llx", (long long)(uintptr_t)page);
8492	}
8493	#endif /* DEBUG \|\| DEVELOPMENT */
8494
8495	/*
8496	* We're going to invalidate this page. Grab a copy of it for comparison.
8497	*/
8498	ca_event = CA_EVENT_ALLOCATE(vmtc_telemetry);
8499	event = ca_event->data;
8500	diagnose_buffer = vmtc_text_page_diagnose_setup(code_addr, page, event);
8501
8502	/*
8503	* Invalidate, i.e. toss, the corrupted page.
8504	*/
8505	if (!page->vmp_cleaning &&
8506	!page->vmp_laundry &&
8507	!page->vmp_fictitious &&
8508	!page->vmp_precious &&
8509	!page->vmp_absent &&
8510	!VMP_ERROR_GET(page) &&
8511	!page->vmp_dirty &&
8512	!is_page_wired(page)) {
8513	if (page->vmp_pmapped) {
8514	int refmod = pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m: page));
8515	if (refmod & VM_MEM_MODIFIED) {
8516	SET_PAGE_DIRTY(page, FALSE);
8517	}
8518	if (refmod & VM_MEM_REFERENCED) {
8519	page->vmp_reference = TRUE;
8520	}
8521	}
8522	/ If the page seems intentionally modified, don't trash it. /
8523	if (!page->vmp_dirty) {
8524	VM_PAGE_FREE(page);
8525	} else {
8526	event->vmtc_not_eligible = true;
8527	}
8528	} else {
8529	event->vmtc_not_eligible = true;
8530	}
8531	vm_object_unlock(object);
8532	object = VM_OBJECT_NULL;
8533
8534	/*
8535	* Now try to diagnose the type of failure by faulting
8536	* in a new copy and diff'ing it with what we saved.
8537	*/
8538	if (diagnose_buffer != NULL) {
8539	vmtc_text_page_diagnose(code_addr, old_code_buffer: diagnose_buffer, event);
8540	}
8541	#if DEBUG \|\| DEVELOPMENT
8542	if (corruption_test_va != `0`) {
8543	corruption_test_va = `0`;
8544	event->vmtc_testing = true;
8545	}
8546	#endif /* DEBUG \|\| DEVELOPMENT */
8547	ktriage_record(thread_id: thread_tid(thread: current_thread()),
8548	KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_TEXT_CORRUPTION),
8549	arg: `0` / arg /);
8550	CA_EVENT_SEND(ca_event);
8551	printf(format: "Text page corruption detected for pid %d\n", proc_selfpid());
8552	++vmtc_total;
8553	return KERN_FAILURE; / failure means we definitely found a corrupt page /
8554	}
8555	done:
8556	if (object != NULL) {
8557	vm_object_unlock(object);
8558	}
8559	return KERN_SUCCESS;
8560	}
8561
#if DEBUG || DEVELOPMENT
/*
 * For implementing unit tests - ask the pmap to corrupt a text page.
 * We have to find the page, to get the physical address, then invoke
 * the pmap.
 *
 *   va - virtual address, in the current task's map, of the text to
 *        corrupt.
 *
 * Returns KERN_SUCCESS if pmap_test_text_corruption() succeeded (and
 * records va in corruption_test_va); otherwise the lookup/pmap error or
 * KERN_FAILURE.  The object lock taken by the lookup is dropped on
 * every return path.
 */
extern kern_return_t vm_corrupt_text_addr(uintptr_t);

kern_return_t
vm_corrupt_text_addr(uintptr_t va)
{
	task_t                 task = current_task();
	vm_map_t               map;
	kern_return_t          kr = KERN_SUCCESS;
	vm_object_t            object = VM_OBJECT_NULL;
	vm_object_offset_t     offset;
	vm_page_t              page = NULL;
	pmap_paddr_t           pa;
	vm_prot_t              prot;

	map = task->map;
	if (task->map == NULL) {
		printf("corrupt_text_addr: no map\n");
		return KERN_FAILURE;
	}

	/* On success the object comes back locked; on failure nothing is held. */
	kr = vmtc_revalidate_lookup(map, (vm_map_offset_t)va, &object, &offset, &page, &prot);
	if (kr != KERN_SUCCESS) {
		printf("corrupt_text_addr: page lookup failed\n");
		return kr;
	}
	if (!(prot & VM_PROT_EXECUTE)) {
		printf("corrupt_text_addr: page not executable\n");
		/* The lookup succeeded, so the object lock must be released here. */
		vm_object_unlock(object);
		return KERN_FAILURE;
	}

	/* get the physical address to use */
	pa = ptoa(VM_PAGE_GET_PHYS_PAGE(page)) + (va - vm_object_trunc_page(va));

	/*
	 * Check we have something we can work with.
	 * Due to racing with pageout as we enter the sysctl,
	 * it's theoretically possible to have the page disappear, just
	 * before the lookup.
	 *
	 * Radar 72857482 tracks bubbling the error here up to the sysctl
	 * result so that the test does not FAIL in that case.
	 */
	if (page->vmp_busy) {
		printf("corrupt_text_addr: vmp_busy\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_cleaning) {
		printf("corrupt_text_addr: vmp_cleaning\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_laundry) {
		printf("corrupt_text_addr: vmp_laundry\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_fictitious) {
		printf("corrupt_text_addr: vmp_fictitious\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_precious) {
		printf("corrupt_text_addr: vmp_precious\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_absent) {
		printf("corrupt_text_addr: vmp_absent\n");
		kr = KERN_FAILURE;
	}
	if (VMP_ERROR_GET(page)) {
		printf("corrupt_text_addr: vmp_error\n");
		kr = KERN_FAILURE;
	}
	if (page->vmp_dirty) {
		printf("corrupt_text_addr: vmp_dirty\n");
		kr = KERN_FAILURE;
	}
	if (is_page_wired(page)) {
		printf("corrupt_text_addr: wired\n");
		kr = KERN_FAILURE;
	}
	if (!page->vmp_pmapped) {
		printf("corrupt_text_addr: !vmp_pmapped\n");
		kr = KERN_FAILURE;
	}

	if (kr == KERN_SUCCESS) {
		printf("corrupt_text_addr: using physaddr 0x%llx\n", (long long)pa);
		kr = pmap_test_text_corruption(pa);
		if (kr != KERN_SUCCESS) {
			printf("corrupt_text_addr: pmap error %d\n", kr);
		} else {
			/* Lets revalidate_text_page() tag the resulting event as a test. */
			corruption_test_va = va;
		}
	} else {
		/* Dump everything we know to help explain the rejected page. */
		printf("corrupt_text_addr: object %p\n", object);
		printf("corrupt_text_addr: offset 0x%llx\n", (uint64_t)offset);
		printf("corrupt_text_addr: va 0x%llx\n", (uint64_t)va);
		printf("corrupt_text_addr: vm_object_trunc_page(va) 0x%llx\n", (uint64_t)vm_object_trunc_page(va));
		printf("corrupt_text_addr: vm_page_t %p\n", page);
		printf("corrupt_text_addr: ptoa(PHYS_PAGE) 0x%llx\n", (uint64_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page)));
		printf("corrupt_text_addr: using physaddr 0x%llx\n", (uint64_t)pa);
	}

	if (object != VM_OBJECT_NULL) {
		vm_object_unlock(object);
	}
	return kr;
}

#endif /* DEBUG || DEVELOPMENT */
8677

Browse the source code of xnu/osfmk/vm/vm_fault.c