1/*
2 * Copyright (c) 2000-2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: vm_fault.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 *
62 * Page fault handling module.
63 */
64
65#include <mach_cluster_stats.h>
66#include <mach_pagemap.h>
67#include <libkern/OSAtomic.h>
68
69#include <mach/mach_types.h>
70#include <mach/kern_return.h>
71#include <mach/message.h> /* for error codes */
72#include <mach/vm_param.h>
73#include <mach/vm_behavior.h>
74#include <mach/memory_object.h>
75 /* For memory_object_data_{request,unlock} */
76#include <mach/sdt.h>
77
78#include <kern/kern_types.h>
79#include <kern/host_statistics.h>
80#include <kern/counters.h>
81#include <kern/task.h>
82#include <kern/thread.h>
83#include <kern/sched_prim.h>
84#include <kern/host.h>
85#include <kern/xpr.h>
86#include <kern/mach_param.h>
87#include <kern/macro_help.h>
88#include <kern/zalloc.h>
89#include <kern/misc_protos.h>
90#include <kern/policy_internal.h>
91
92#include <vm/vm_compressor.h>
93#include <vm/vm_compressor_pager.h>
94#include <vm/vm_fault.h>
95#include <vm/vm_map.h>
96#include <vm/vm_object.h>
97#include <vm/vm_page.h>
98#include <vm/vm_kern.h>
99#include <vm/pmap.h>
100#include <vm/vm_pageout.h>
101#include <vm/vm_protos.h>
102#include <vm/vm_external.h>
103#include <vm/memory_object.h>
104#include <vm/vm_purgeable_internal.h> /* Needed by some vm_page.h macros */
105#include <vm/vm_shared_region.h>
106
107#include <sys/codesign.h>
108#include <sys/reason.h>
109#include <sys/signalvar.h>
110
111#include <san/kasan.h>
112
113#define VM_FAULT_CLASSIFY 0
114
115#define TRACEFAULTPAGE 0 /* (TEST/DEBUG) */
116
117unsigned int vm_object_pagein_throttle = 16;
118
/*
 * We apply a hard throttle to the demand-zero rate of tasks that appear to be running out of control;
 * the throttle kicks in when swap space runs out.  64-bit programs have massive address spaces and, if
 * buggy, can leak enormous amounts of memory and run the system completely out of swap space.  If this
 * happens, we impose a hard throttle on them to prevent them from taking the last bit of memory left.
 * This helps keep the UI active so that the user has a chance to kill the offending task before the
 * system completely hangs.
 *
 * The hard throttle is only applied when the system is nearly out of swap space and only to tasks that
 * appear to be bloated.  When swap runs out, any task using more than vm_hard_throttle_threshold will be
 * throttled.  The throttling is done by giving the thread that's trying to demand-zero a page a delay of
 * HARD_THROTTLE_DELAY microseconds before it is allowed to retry the page fault.
 */
132
133extern void throttle_lowpri_io(int);
134
135extern struct vnode *vnode_pager_lookup_vnode(memory_object_t);
136
137uint64_t vm_hard_throttle_threshold;
138
139
140
141#define NEED_TO_HARD_THROTTLE_THIS_TASK() (vm_wants_task_throttled(current_task()) || \
142 ((vm_page_free_count < vm_page_throttle_limit || \
143 HARD_THROTTLE_LIMIT_REACHED()) && \
144 proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO) >= THROTTLE_LEVEL_THROTTLED))
145
146
147#define HARD_THROTTLE_DELAY 10000 /* 10000 us == 10 ms */
148#define SOFT_THROTTLE_DELAY 200 /* 200 us == .2 ms */
149
150#define VM_PAGE_CREATION_THROTTLE_PERIOD_SECS 6
151#define VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC 20000
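
/*
 * Illustrative sketch (not compiled): how the delays above are typically
 * consumed by a fault path.  vm_page_throttled() (defined later in this
 * file) returns 0, SOFT_THROTTLE_DELAY or HARD_THROTTLE_DELAY, and the
 * faulting thread simply sleeps for that many microseconds before it is
 * allowed to retry; this mirrors the pattern used in vm_fault_check().
 */
#if 0
static void
vm_fault_throttle_sketch(void)
{
	int throttle_delay;

	if ((throttle_delay = vm_page_throttled(FALSE))) {
		/* 10 ms for the hard throttle, 0.2 ms for the soft throttle */
		delay(throttle_delay);
	}
}
#endif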
152
153
154boolean_t current_thread_aborted(void);
155
156/* Forward declarations of internal routines. */
157static kern_return_t vm_fault_wire_fast(
158 vm_map_t map,
159 vm_map_offset_t va,
160 vm_prot_t prot,
161 vm_tag_t wire_tag,
162 vm_map_entry_t entry,
163 pmap_t pmap,
164 vm_map_offset_t pmap_addr,
165 ppnum_t *physpage_p);
166
167static kern_return_t vm_fault_internal(
168 vm_map_t map,
169 vm_map_offset_t vaddr,
170 vm_prot_t caller_prot,
171 boolean_t change_wiring,
172 vm_tag_t wire_tag,
173 int interruptible,
174 pmap_t pmap,
175 vm_map_offset_t pmap_addr,
176 ppnum_t *physpage_p);
177
178static void vm_fault_copy_cleanup(
179 vm_page_t page,
180 vm_page_t top_page);
181
182static void vm_fault_copy_dst_cleanup(
183 vm_page_t page);
184
185#if VM_FAULT_CLASSIFY
186extern void vm_fault_classify(vm_object_t object,
187 vm_object_offset_t offset,
188 vm_prot_t fault_type);
189
190extern void vm_fault_classify_init(void);
191#endif
192
193unsigned long vm_pmap_enter_blocked = 0;
194unsigned long vm_pmap_enter_retried = 0;
195
196unsigned long vm_cs_validates = 0;
197unsigned long vm_cs_revalidates = 0;
198unsigned long vm_cs_query_modified = 0;
199unsigned long vm_cs_validated_dirtied = 0;
200unsigned long vm_cs_bitmap_validated = 0;
201#if PMAP_CS
202uint64_t vm_cs_defer_to_pmap_cs = 0;
203uint64_t vm_cs_defer_to_pmap_cs_not = 0;
204#endif /* PMAP_CS */
205
206void vm_pre_fault(vm_map_offset_t);
207
208extern char *kdp_compressor_decompressed_page;
209extern addr64_t kdp_compressor_decompressed_page_paddr;
210extern ppnum_t kdp_compressor_decompressed_page_ppnum;
211
212struct vmrtfr {
213 int vmrtfr_maxi;
214 int vmrtfr_curi;
215 int64_t vmrtf_total;
216 vm_rtfault_record_t *vm_rtf_records;
217} vmrtfrs;
218#define VMRTF_DEFAULT_BUFSIZE (4096)
219#define VMRTF_NUM_RECORDS_DEFAULT (VMRTF_DEFAULT_BUFSIZE / sizeof(vm_rtfault_record_t))
220int vmrtf_num_records = VMRTF_NUM_RECORDS_DEFAULT;
221
222static void vm_rtfrecord_lock(void);
223static void vm_rtfrecord_unlock(void);
224static void vm_record_rtfault(thread_t, uint64_t, vm_map_offset_t, int);
225
226lck_spin_t vm_rtfr_slock;
227extern lck_grp_t vm_page_lck_grp_bucket;
228extern lck_attr_t vm_page_lck_attr;
229
230/*
231 * Routine: vm_fault_init
232 * Purpose:
233 * Initialize our private data structures.
234 */
235void
236vm_fault_init(void)
237{
238 int i, vm_compressor_temp;
239 boolean_t need_default_val = TRUE;
	/*
	 * Choose a value for the hard throttle threshold based on the amount of RAM.  The threshold is
	 * computed as a percentage of available memory, and the percentage used is scaled inversely with
	 * the amount of memory.  The percentage runs between 10% and 35%.  We use 35% for small memory systems
	 * and reduce the value down to 10% for very large memory configurations.  This helps give us a
	 * definition of a memory hog that makes more sense relative to the amount of RAM in the machine.
	 * The formula here simply uses the number of gigabytes of RAM to adjust the percentage.
	 */

	vm_hard_throttle_threshold = sane_size * (35 - MIN((int)(sane_size / (1024*1024*1024)), 25)) / 100;
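	/*
	 * Worked example (illustrative): with 8 GB of RAM the percentage is
	 * 35 - MIN(8, 25) = 27, so the threshold is 27% of sane_size, i.e.
	 * a bit over 2 GB; with 25 GB or more the percentage bottoms out at
	 * 35 - 25 = 10.
	 */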
250
251 /*
252 * Configure compressed pager behavior. A boot arg takes precedence over a device tree entry.
253 */
254
255 if (PE_parse_boot_argn("vm_compressor", &vm_compressor_temp, sizeof (vm_compressor_temp))) {
256 for ( i = 0; i < VM_PAGER_MAX_MODES; i++) {
257 if (vm_compressor_temp > 0 &&
258 ((vm_compressor_temp & ( 1 << i)) == vm_compressor_temp)) {
259 need_default_val = FALSE;
260 vm_compressor_mode = vm_compressor_temp;
261 break;
262 }
263 }
264 if (need_default_val)
265 printf("Ignoring \"vm_compressor\" boot arg %d\n", vm_compressor_temp);
266 }
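	/*
	 * Note (illustrative): a boot-arg value is accepted only when it has
	 * exactly one bit set within the first VM_PAGER_MAX_MODES bits; e.g.
	 * a value of 4 (one bit set) would pass the check above, while 6
	 * (two bits set) would be rejected.
	 */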
267 if (need_default_val) {
268 /* If no boot arg or incorrect boot arg, try device tree. */
269 PE_get_default("kern.vm_compressor", &vm_compressor_mode, sizeof(vm_compressor_mode));
270 }
271 printf("\"vm_compressor_mode\" is %d\n", vm_compressor_mode);
272}
273
274void vm_rtfault_record_init(void) {
275 PE_parse_boot_argn("vm_rtfault_records", &vmrtf_num_records, sizeof(vmrtf_num_records));
276
277 assert(vmrtf_num_records >= 1);
278 vmrtf_num_records = MAX(vmrtf_num_records, 1);
279 size_t kallocsz = vmrtf_num_records * sizeof(vm_rtfault_record_t);
280 vmrtfrs.vm_rtf_records = kalloc(kallocsz);
281 bzero(vmrtfrs.vm_rtf_records, kallocsz);
282 vmrtfrs.vmrtfr_maxi = vmrtf_num_records - 1;
283 lck_spin_init(&vm_rtfr_slock, &vm_page_lck_grp_bucket, &vm_page_lck_attr);
284}
285/*
286 * Routine: vm_fault_cleanup
287 * Purpose:
288 * Clean up the result of vm_fault_page.
289 * Results:
290 * The paging reference for "object" is released.
291 * "object" is unlocked.
292 * If "top_page" is not null, "top_page" is
293 * freed and the paging reference for the object
294 * containing it is released.
295 *
296 * In/out conditions:
297 * "object" must be locked.
298 */
299void
300vm_fault_cleanup(
301 vm_object_t object,
302 vm_page_t top_page)
303{
304 vm_object_paging_end(object);
305 vm_object_unlock(object);
306
307 if (top_page != VM_PAGE_NULL) {
308 object = VM_PAGE_OBJECT(top_page);
309
310 vm_object_lock(object);
311 VM_PAGE_FREE(top_page);
312 vm_object_paging_end(object);
313 vm_object_unlock(object);
314 }
315}
316
317#define ALIGNED(x) (((x) & (PAGE_SIZE_64 - 1)) == 0)
318
319
320boolean_t vm_page_deactivate_behind = TRUE;
321/*
322 * default sizes given VM_BEHAVIOR_DEFAULT reference behavior
323 */
324#define VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW 128
325#define VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER 16 /* don't make this too big... */
326 /* we use it to size an array on the stack */
327
328int vm_default_behind = VM_DEFAULT_DEACTIVATE_BEHIND_WINDOW;
329
330#define MAX_SEQUENTIAL_RUN (1024 * 1024 * 1024)
331
/*
 * vm_fault_is_sequential
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified.
 * Update state to indicate current access pattern.
 *
 * object must have at least the shared lock held
 */
341static
342void
343vm_fault_is_sequential(
344 vm_object_t object,
345 vm_object_offset_t offset,
346 vm_behavior_t behavior)
347{
348 vm_object_offset_t last_alloc;
349 int sequential;
350 int orig_sequential;
351
352 last_alloc = object->last_alloc;
353 sequential = object->sequential;
354 orig_sequential = sequential;
355
356 switch (behavior) {
357 case VM_BEHAVIOR_RANDOM:
358 /*
359 * reset indicator of sequential behavior
360 */
361 sequential = 0;
362 break;
363
364 case VM_BEHAVIOR_SEQUENTIAL:
365 if (offset && last_alloc == offset - PAGE_SIZE_64) {
366 /*
367 * advance indicator of sequential behavior
368 */
369 if (sequential < MAX_SEQUENTIAL_RUN)
370 sequential += PAGE_SIZE;
371 } else {
372 /*
373 * reset indicator of sequential behavior
374 */
375 sequential = 0;
376 }
377 break;
378
379 case VM_BEHAVIOR_RSEQNTL:
380 if (last_alloc && last_alloc == offset + PAGE_SIZE_64) {
381 /*
382 * advance indicator of sequential behavior
383 */
384 if (sequential > -MAX_SEQUENTIAL_RUN)
385 sequential -= PAGE_SIZE;
386 } else {
387 /*
388 * reset indicator of sequential behavior
389 */
390 sequential = 0;
391 }
392 break;
393
394 case VM_BEHAVIOR_DEFAULT:
395 default:
396 if (offset && last_alloc == (offset - PAGE_SIZE_64)) {
397 /*
398 * advance indicator of sequential behavior
399 */
400 if (sequential < 0)
401 sequential = 0;
402 if (sequential < MAX_SEQUENTIAL_RUN)
403 sequential += PAGE_SIZE;
404
405 } else if (last_alloc && last_alloc == (offset + PAGE_SIZE_64)) {
406 /*
407 * advance indicator of sequential behavior
408 */
409 if (sequential > 0)
410 sequential = 0;
411 if (sequential > -MAX_SEQUENTIAL_RUN)
412 sequential -= PAGE_SIZE;
413 } else {
414 /*
415 * reset indicator of sequential behavior
416 */
417 sequential = 0;
418 }
419 break;
420 }
421 if (sequential != orig_sequential) {
422 if (!OSCompareAndSwap(orig_sequential, sequential, (UInt32 *)&object->sequential)) {
423 /*
424 * if someone else has already updated object->sequential
425 * don't bother trying to update it or object->last_alloc
426 */
427 return;
428 }
429 }
	/*
	 * I'd like to do this with an OSCompareAndSwap64, but that
	 * doesn't exist for PPC...  however, it shouldn't matter
	 * that much... last_alloc is maintained so that we can determine
	 * if a sequential access pattern is taking place... if only
	 * one thread is banging on this object, no problem with the unprotected
	 * update... if 2 or more threads are banging away, we run the risk of
	 * someone seeing a mangled update... however, in the face of multiple
	 * accesses, no sequential access pattern can develop anyway, so we
	 * haven't lost any real info.
	 */
441 object->last_alloc = offset;
442}
443
444
445int vm_page_deactivate_behind_count = 0;
446
/*
 * vm_fault_deactivate_behind
 *
 * Determine if sequential access is in progress
 * in accordance with the behavior specified. If
 * so, compute a potential page to deactivate and
 * deactivate it.
 *
 * object must be locked.
 *
 * return TRUE if we actually deactivate a page
 */
459static
460boolean_t
461vm_fault_deactivate_behind(
462 vm_object_t object,
463 vm_object_offset_t offset,
464 vm_behavior_t behavior)
465{
466 int n;
467 int pages_in_run = 0;
468 int max_pages_in_run = 0;
469 int sequential_run;
470 int sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
471 vm_object_offset_t run_offset = 0;
472 vm_object_offset_t pg_offset = 0;
473 vm_page_t m;
474 vm_page_t page_run[VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER];
475
476 pages_in_run = 0;
477#if TRACEFAULTPAGE
478 dbgTrace(0xBEEF0018, (unsigned int) object, (unsigned int) vm_fault_deactivate_behind); /* (TEST/DEBUG) */
479#endif
480
	if (object == kernel_object || vm_page_deactivate_behind == FALSE) {
		/*
		 * Do not deactivate pages from the kernel object: they
		 * are not intended to become pageable...
		 * or the deactivate-behind mechanism has been disabled.
		 */
		return FALSE;
	}
489 if ((sequential_run = object->sequential)) {
490 if (sequential_run < 0) {
491 sequential_behavior = VM_BEHAVIOR_RSEQNTL;
492 sequential_run = 0 - sequential_run;
493 } else {
494 sequential_behavior = VM_BEHAVIOR_SEQUENTIAL;
495 }
496 }
497 switch (behavior) {
498 case VM_BEHAVIOR_RANDOM:
499 break;
500 case VM_BEHAVIOR_SEQUENTIAL:
501 if (sequential_run >= (int)PAGE_SIZE) {
502 run_offset = 0 - PAGE_SIZE_64;
503 max_pages_in_run = 1;
504 }
505 break;
506 case VM_BEHAVIOR_RSEQNTL:
507 if (sequential_run >= (int)PAGE_SIZE) {
508 run_offset = PAGE_SIZE_64;
509 max_pages_in_run = 1;
510 }
511 break;
512 case VM_BEHAVIOR_DEFAULT:
513 default:
514 { vm_object_offset_t behind = vm_default_behind * PAGE_SIZE_64;
515
		/*
		 * determine if the run of sequential access has been
		 * long enough on an object with default access behavior
		 * to consider it for deactivation
		 */
521 if ((uint64_t)sequential_run >= behind && (sequential_run % (VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER * PAGE_SIZE)) == 0) {
522 /*
523 * the comparisons between offset and behind are done
524 * in this kind of odd fashion in order to prevent wrap around
525 * at the end points
526 */
527 if (sequential_behavior == VM_BEHAVIOR_SEQUENTIAL) {
528 if (offset >= behind) {
529 run_offset = 0 - behind;
530 pg_offset = PAGE_SIZE_64;
531 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
532 }
533 } else {
534 if (offset < -behind) {
535 run_offset = behind;
536 pg_offset = 0 - PAGE_SIZE_64;
537 max_pages_in_run = VM_DEFAULT_DEACTIVATE_BEHIND_CLUSTER;
538 }
539 }
540 }
541 break;
542 }
543 }
544 for (n = 0; n < max_pages_in_run; n++) {
545 m = vm_page_lookup(object, offset + run_offset + (n * pg_offset));
546
547 if (m && !m->vmp_laundry && !m->vmp_busy && !m->vmp_no_cache && (m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && !m->vmp_fictitious && !m->vmp_absent) {
548 page_run[pages_in_run++] = m;
549
			/*
			 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
			 *
			 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
			 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
			 * new reference happens.  If no further references happen on the page after that remote TLB entry
			 * is flushed, we'll see a clean, non-referenced page when it eventually gets pulled out of the
			 * inactive queue by pageout_scan, which is just fine since the last reference would have happened
			 * quite far in the past (TLB caches don't hang around for very long), and of course could just as
			 * easily have happened before we did the deactivate_behind.
			 */
560 */
561 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
562 }
563 }
564 if (pages_in_run) {
565 vm_page_lockspin_queues();
566
567 for (n = 0; n < pages_in_run; n++) {
568
569 m = page_run[n];
570
571 vm_page_deactivate_internal(m, FALSE);
572
573 vm_page_deactivate_behind_count++;
574#if TRACEFAULTPAGE
575 dbgTrace(0xBEEF0019, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
576#endif
577 }
578 vm_page_unlock_queues();
579
580 return TRUE;
581 }
582 return FALSE;
583}
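
/*
 * Illustrative sketch (not compiled): vm_fault_is_sequential() and
 * vm_fault_deactivate_behind() are meant to be used together by the fault
 * path.  The fault handler first records the current access, which updates
 * object->sequential and object->last_alloc, and then gives deactivate-behind
 * a chance to retire pages we have already streamed past.  The snippet below
 * is a sketch of that ordering, not a copy of an actual call site.
 */
#if 0
	vm_fault_is_sequential(object, offset, fault_info->behavior);
	vm_fault_deactivate_behind(object, offset, fault_info->behavior);
#endif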
584
585
586#if (DEVELOPMENT || DEBUG)
587uint32_t vm_page_creation_throttled_hard = 0;
588uint32_t vm_page_creation_throttled_soft = 0;
589uint64_t vm_page_creation_throttle_avoided = 0;
590#endif /* DEVELOPMENT || DEBUG */
591
592static int
593vm_page_throttled(boolean_t page_kept)
594{
595 clock_sec_t elapsed_sec;
596 clock_sec_t tv_sec;
597 clock_usec_t tv_usec;
598
599 thread_t thread = current_thread();
600
601 if (thread->options & TH_OPT_VMPRIV)
602 return (0);
603
604 if (thread->t_page_creation_throttled) {
605 thread->t_page_creation_throttled = 0;
606
607 if (page_kept == FALSE)
608 goto no_throttle;
609 }
610 if (NEED_TO_HARD_THROTTLE_THIS_TASK()) {
611#if (DEVELOPMENT || DEBUG)
612 thread->t_page_creation_throttled_hard++;
613 OSAddAtomic(1, &vm_page_creation_throttled_hard);
614#endif /* DEVELOPMENT || DEBUG */
615 return (HARD_THROTTLE_DELAY);
616 }
617
618 if ((vm_page_free_count < vm_page_throttle_limit || (VM_CONFIG_COMPRESSOR_IS_PRESENT && SWAPPER_NEEDS_TO_UNTHROTTLE())) &&
619 thread->t_page_creation_count > (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS * VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC)) {
620
621 if (vm_page_free_wanted == 0 && vm_page_free_wanted_privileged == 0) {
622#if (DEVELOPMENT || DEBUG)
623 OSAddAtomic64(1, &vm_page_creation_throttle_avoided);
624#endif
625 goto no_throttle;
626 }
627 clock_get_system_microtime(&tv_sec, &tv_usec);
628
629 elapsed_sec = tv_sec - thread->t_page_creation_time;
630
631 if (elapsed_sec <= VM_PAGE_CREATION_THROTTLE_PERIOD_SECS ||
632 (thread->t_page_creation_count / elapsed_sec) >= VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC) {
633
634 if (elapsed_sec >= (3 * VM_PAGE_CREATION_THROTTLE_PERIOD_SECS)) {
635 /*
636 * we'll reset our stats to give a well behaved app
637 * that was unlucky enough to accumulate a bunch of pages
638 * over a long period of time a chance to get out of
639 * the throttled state... we reset the counter and timestamp
640 * so that if it stays under the rate limit for the next second
641 * it will be back in our good graces... if it exceeds it, it
642 * will remain in the throttled state
643 */
644 thread->t_page_creation_time = tv_sec;
645 thread->t_page_creation_count = VM_PAGE_CREATION_THROTTLE_RATE_PER_SEC * (VM_PAGE_CREATION_THROTTLE_PERIOD_SECS - 1);
646 }
647 VM_PAGEOUT_DEBUG(vm_page_throttle_count, 1);
648
649 thread->t_page_creation_throttled = 1;
650
651 if (VM_CONFIG_COMPRESSOR_IS_PRESENT && HARD_THROTTLE_LIMIT_REACHED()) {
652#if (DEVELOPMENT || DEBUG)
653 thread->t_page_creation_throttled_hard++;
654 OSAddAtomic(1, &vm_page_creation_throttled_hard);
655#endif /* DEVELOPMENT || DEBUG */
656 return (HARD_THROTTLE_DELAY);
657 } else {
658#if (DEVELOPMENT || DEBUG)
659 thread->t_page_creation_throttled_soft++;
660 OSAddAtomic(1, &vm_page_creation_throttled_soft);
661#endif /* DEVELOPMENT || DEBUG */
662 return (SOFT_THROTTLE_DELAY);
663 }
664 }
665 thread->t_page_creation_time = tv_sec;
666 thread->t_page_creation_count = 0;
667 }
668no_throttle:
669 thread->t_page_creation_count++;
670
671 return (0);
672}
673
674
675/*
676 * check for various conditions that would
677 * prevent us from creating a ZF page...
678 * cleanup is based on being called from vm_fault_page
679 *
680 * object must be locked
681 * object == m->vmp_object
682 */
683static vm_fault_return_t
684vm_fault_check(vm_object_t object, vm_page_t m, vm_page_t first_m, wait_interrupt_t interruptible_state, boolean_t page_throttle)
685{
686 int throttle_delay;
687
688 if (object->shadow_severed ||
689 VM_OBJECT_PURGEABLE_FAULT_ERROR(object)) {
690 /*
691 * Either:
692 * 1. the shadow chain was severed,
693 * 2. the purgeable object is volatile or empty and is marked
694 * to fault on access while volatile.
695 * Just have to return an error at this point
696 */
697 if (m != VM_PAGE_NULL)
698 VM_PAGE_FREE(m);
699 vm_fault_cleanup(object, first_m);
700
701 thread_interrupt_level(interruptible_state);
702
703 return (VM_FAULT_MEMORY_ERROR);
704 }
705 if (page_throttle == TRUE) {
706 if ((throttle_delay = vm_page_throttled(FALSE))) {
707 /*
708 * we're throttling zero-fills...
709 * treat this as if we couldn't grab a page
710 */
711 if (m != VM_PAGE_NULL)
712 VM_PAGE_FREE(m);
713 vm_fault_cleanup(object, first_m);
714
715 VM_DEBUG_EVENT(vmf_check_zfdelay, VMF_CHECK_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
716
717 delay(throttle_delay);
718
719 if (current_thread_aborted()) {
720 thread_interrupt_level(interruptible_state);
721 return VM_FAULT_INTERRUPTED;
722 }
723 thread_interrupt_level(interruptible_state);
724
725 return (VM_FAULT_MEMORY_SHORTAGE);
726 }
727 }
728 return (VM_FAULT_SUCCESS);
729}
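
/*
 * Illustrative sketch (not compiled): how callers are expected to use
 * vm_fault_check().  On any non-success return the routine has already
 * freed "m" (if any), run vm_fault_cleanup() and restored the interrupt
 * level, so the caller can simply return the error; this mirrors the way
 * vm_fault_page() uses it below.
 */
#if 0
	error = vm_fault_check(object, m, first_m, interruptible_state,
	    (type_of_fault == NULL) ? TRUE : FALSE);
	if (error != VM_FAULT_SUCCESS)
		return (error);
#endif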
730
731
732/*
733 * do the work to zero fill a page and
734 * inject it into the correct paging queue
735 *
736 * m->vmp_object must be locked
737 * page queue lock must NOT be held
738 */
739static int
740vm_fault_zero_page(vm_page_t m, boolean_t no_zero_fill)
741{
742 int my_fault = DBG_ZERO_FILL_FAULT;
743 vm_object_t object;
744
745 object = VM_PAGE_OBJECT(m);
746
	/*
	 * This is a zero-fill page fault...
	 *
	 * Checking the page lock is a waste of
	 * time; this page was absent, so
	 * it can't be page locked by a pager.
	 *
	 * we also consider it undefined
	 * with respect to instruction
	 * execution. i.e. it is the responsibility
	 * of higher layers to call for an instruction
	 * sync after changing the contents and before
	 * sending a program into this area.  We
	 * choose this approach for performance.
	 */
762 m->vmp_pmapped = TRUE;
763
764 m->vmp_cs_validated = FALSE;
765 m->vmp_cs_tainted = FALSE;
766 m->vmp_cs_nx = FALSE;
767
768 if (no_zero_fill == TRUE) {
769 my_fault = DBG_NZF_PAGE_FAULT;
770
771 if (m->vmp_absent && m->vmp_busy)
772 return (my_fault);
773 } else {
774 vm_page_zero_fill(m);
775
776 VM_STAT_INCR(zero_fill_count);
777 DTRACE_VM2(zfod, int, 1, (uint64_t *), NULL);
778 }
779 assert(!m->vmp_laundry);
780 assert(object != kernel_object);
781 //assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
782
783 if (!VM_DYNAMIC_PAGING_ENABLED() &&
784 (object->purgable == VM_PURGABLE_DENY ||
785 object->purgable == VM_PURGABLE_NONVOLATILE ||
786 object->purgable == VM_PURGABLE_VOLATILE )) {
787
788 vm_page_lockspin_queues();
789
790 if (!VM_DYNAMIC_PAGING_ENABLED()) {
791 assert(!VM_PAGE_WIRED(m));
792
793 /*
794 * can't be on the pageout queue since we don't
795 * have a pager to try and clean to
796 */
797 vm_page_queues_remove(m, TRUE);
798 vm_page_check_pageable_safe(m);
799 vm_page_queue_enter(&vm_page_queue_throttled, m, vm_page_t, vmp_pageq);
800 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
801 vm_page_throttled_count++;
802 }
803 vm_page_unlock_queues();
804 }
805 return (my_fault);
806}
807
808
809/*
810 * Routine: vm_fault_page
811 * Purpose:
812 * Find the resident page for the virtual memory
813 * specified by the given virtual memory object
814 * and offset.
815 * Additional arguments:
816 * The required permissions for the page is given
817 * in "fault_type". Desired permissions are included
818 * in "protection".
819 * fault_info is passed along to determine pagein cluster
820 * limits... it contains the expected reference pattern,
821 * cluster size if available, etc...
822 *
823 * If the desired page is known to be resident (for
824 * example, because it was previously wired down), asserting
825 * the "unwiring" parameter will speed the search.
826 *
827 * If the operation can be interrupted (by thread_abort
828 * or thread_terminate), then the "interruptible"
829 * parameter should be asserted.
830 *
831 * Results:
832 * The page containing the proper data is returned
833 * in "result_page".
834 *
835 * In/out conditions:
836 * The source object must be locked and referenced,
837 * and must donate one paging reference. The reference
838 * is not affected. The paging reference and lock are
839 * consumed.
840 *
841 * If the call succeeds, the object in which "result_page"
842 * resides is left locked and holding a paging reference.
843 * If this is not the original object, a busy page in the
844 * original object is returned in "top_page", to prevent other
845 * callers from pursuing this same data, along with a paging
846 * reference for the original object. The "top_page" should
847 * be destroyed when this guarantee is no longer required.
848 * The "result_page" is also left busy. It is not removed
849 * from the pageout queues.
850 * Special Case:
851 * A return value of VM_FAULT_SUCCESS_NO_PAGE means that the
852 * fault succeeded but there's no VM page (i.e. the VM object
853 * does not actually hold VM pages, but device memory or
854 * large pages). The object is still locked and we still hold a
855 * paging_in_progress reference.
856 */
857unsigned int vm_fault_page_blocked_access = 0;
858unsigned int vm_fault_page_forced_retry = 0;
859
860vm_fault_return_t
861vm_fault_page(
862 /* Arguments: */
863 vm_object_t first_object, /* Object to begin search */
864 vm_object_offset_t first_offset, /* Offset into object */
865 vm_prot_t fault_type, /* What access is requested */
866 boolean_t must_be_resident,/* Must page be resident? */
867 boolean_t caller_lookup, /* caller looked up page */
868 /* Modifies in place: */
869 vm_prot_t *protection, /* Protection for mapping */
870 vm_page_t *result_page, /* Page found, if successful */
871 /* Returns: */
872 vm_page_t *top_page, /* Page in top object, if
873 * not result_page. */
874 int *type_of_fault, /* if non-null, fill in with type of fault
875 * COW, zero-fill, etc... returned in trace point */
876 /* More arguments: */
877 kern_return_t *error_code, /* code if page is in error */
878 boolean_t no_zero_fill, /* don't zero fill absent pages */
879 boolean_t data_supply, /* treat as data_supply if
880 * it is a write fault and a full
881 * page is provided */
882 vm_object_fault_info_t fault_info)
883{
884 vm_page_t m;
885 vm_object_t object;
886 vm_object_offset_t offset;
887 vm_page_t first_m;
888 vm_object_t next_object;
889 vm_object_t copy_object;
890 boolean_t look_for_page;
891 boolean_t force_fault_retry = FALSE;
892 vm_prot_t access_required = fault_type;
893 vm_prot_t wants_copy_flag;
894 kern_return_t wait_result;
895 wait_interrupt_t interruptible_state;
896 boolean_t data_already_requested = FALSE;
897 vm_behavior_t orig_behavior;
898 vm_size_t orig_cluster_size;
899 vm_fault_return_t error;
900 int my_fault;
901 uint32_t try_failed_count;
902 int interruptible; /* how may fault be interrupted? */
903 int external_state = VM_EXTERNAL_STATE_UNKNOWN;
904 memory_object_t pager;
905 vm_fault_return_t retval;
906 int grab_options;
907
908/*
909 * MUST_ASK_PAGER() evaluates to TRUE if the page specified by object/offset is
910 * marked as paged out in the compressor pager or the pager doesn't exist.
911 * Note also that if the pager for an internal object
912 * has not been created, the pager is not invoked regardless of the value
913 * of MUST_ASK_PAGER().
914 *
915 * PAGED_OUT() evaluates to TRUE if the page specified by the object/offset
916 * is marked as paged out in the compressor pager.
917 * PAGED_OUT() is used to determine if a page has already been pushed
918 * into a copy object in order to avoid a redundant page out operation.
919 */
920#define MUST_ASK_PAGER(o, f, s) \
921 ((s = VM_COMPRESSOR_PAGER_STATE_GET((o), (f))) != VM_EXTERNAL_STATE_ABSENT)
922
923#define PAGED_OUT(o, f) \
924 (VM_COMPRESSOR_PAGER_STATE_GET((o), (f)) == VM_EXTERNAL_STATE_EXISTS)
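
/*
 * Illustrative sketch (not compiled): the two macros above are combined
 * with the pager state to decide whether we need to go to the pager at
 * all; this mirrors the "look_for_page" computation further down in this
 * routine.
 */
#if 0
	look_for_page = (object->pager_created &&
	    (MUST_ASK_PAGER(object, offset, external_state) == TRUE) &&
	    !data_supply);
#endif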
925
926/*
927 * Recovery actions
928 */
929#define RELEASE_PAGE(m) \
930 MACRO_BEGIN \
931 PAGE_WAKEUP_DONE(m); \
932 if ( !VM_PAGE_PAGEABLE(m)) { \
933 vm_page_lockspin_queues(); \
934 if ( !VM_PAGE_PAGEABLE(m)) { \
935 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) \
936 vm_page_deactivate(m); \
937 else \
938 vm_page_activate(m); \
939 } \
940 vm_page_unlock_queues(); \
941 } \
942 MACRO_END
943
944#if TRACEFAULTPAGE
945 dbgTrace(0xBEEF0002, (unsigned int) first_object, (unsigned int) first_offset); /* (TEST/DEBUG) */
946#endif
947
948 interruptible = fault_info->interruptible;
949 interruptible_state = thread_interrupt_level(interruptible);
950
951 /*
952 * INVARIANTS (through entire routine):
953 *
954 * 1) At all times, we must either have the object
955 * lock or a busy page in some object to prevent
956 * some other thread from trying to bring in
957 * the same page.
958 *
959 * Note that we cannot hold any locks during the
960 * pager access or when waiting for memory, so
961 * we use a busy page then.
962 *
963 * 2) To prevent another thread from racing us down the
964 * shadow chain and entering a new page in the top
965 * object before we do, we must keep a busy page in
966 * the top object while following the shadow chain.
967 *
968 * 3) We must increment paging_in_progress on any object
969 * for which we have a busy page before dropping
970 * the object lock
971 *
972 * 4) We leave busy pages on the pageout queues.
973 * If the pageout daemon comes across a busy page,
974 * it will remove the page from the pageout queues.
975 */
976
977 object = first_object;
978 offset = first_offset;
979 first_m = VM_PAGE_NULL;
980 access_required = fault_type;
981
982
983 XPR(XPR_VM_FAULT,
984 "vm_f_page: obj 0x%X, offset 0x%X, type %d, prot %d\n",
985 object, offset, fault_type, *protection, 0);
986
987 /*
988 * default type of fault
989 */
990 my_fault = DBG_CACHE_HIT_FAULT;
991
992 while (TRUE) {
993#if TRACEFAULTPAGE
994 dbgTrace(0xBEEF0003, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
995#endif
996
997 grab_options = 0;
998#if CONFIG_SECLUDED_MEMORY
999 if (object->can_grab_secluded) {
1000 grab_options |= VM_PAGE_GRAB_SECLUDED;
1001 }
1002#endif /* CONFIG_SECLUDED_MEMORY */
1003
1004 if (!object->alive) {
1005 /*
1006 * object is no longer valid
1007 * clean up and return error
1008 */
1009 vm_fault_cleanup(object, first_m);
1010 thread_interrupt_level(interruptible_state);
1011
1012 return (VM_FAULT_MEMORY_ERROR);
1013 }
1014
1015 if (!object->pager_created && object->phys_contiguous) {
1016 /*
1017 * A physically-contiguous object without a pager:
1018 * must be a "large page" object. We do not deal
1019 * with VM pages for this object.
1020 */
1021 caller_lookup = FALSE;
1022 m = VM_PAGE_NULL;
1023 goto phys_contig_object;
1024 }
1025
1026 if (object->blocked_access) {
1027 /*
1028 * Access to this VM object has been blocked.
1029 * Replace our "paging_in_progress" reference with
1030 * a "activity_in_progress" reference and wait for
1031 * access to be unblocked.
1032 */
1033 caller_lookup = FALSE; /* no longer valid after sleep */
1034 vm_object_activity_begin(object);
1035 vm_object_paging_end(object);
1036 while (object->blocked_access) {
1037 vm_object_sleep(object,
1038 VM_OBJECT_EVENT_UNBLOCKED,
1039 THREAD_UNINT);
1040 }
1041 vm_fault_page_blocked_access++;
1042 vm_object_paging_begin(object);
1043 vm_object_activity_end(object);
1044 }
1045
1046 /*
1047 * See whether the page at 'offset' is resident
1048 */
1049 if (caller_lookup == TRUE) {
1050 /*
1051 * The caller has already looked up the page
1052 * and gave us the result in "result_page".
1053 * We can use this for the first lookup but
1054 * it loses its validity as soon as we unlock
1055 * the object.
1056 */
1057 m = *result_page;
1058 caller_lookup = FALSE; /* no longer valid after that */
1059 } else {
1060 m = vm_page_lookup(object, offset);
1061 }
1062#if TRACEFAULTPAGE
1063 dbgTrace(0xBEEF0004, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1064#endif
1065 if (m != VM_PAGE_NULL) {
1066
1067 if (m->vmp_busy) {
1068 /*
1069 * The page is being brought in,
1070 * wait for it and then retry.
1071 */
1072#if TRACEFAULTPAGE
1073 dbgTrace(0xBEEF0005, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1074#endif
1075 wait_result = PAGE_SLEEP(object, m, interruptible);
1076
1077 XPR(XPR_VM_FAULT,
1078 "vm_f_page: block busy obj 0x%X, offset 0x%X, page 0x%X\n",
1079 object, offset,
1080 m, 0, 0);
1081 counter(c_vm_fault_page_block_busy_kernel++);
1082
1083 if (wait_result != THREAD_AWAKENED) {
1084 vm_fault_cleanup(object, first_m);
1085 thread_interrupt_level(interruptible_state);
1086
1087 if (wait_result == THREAD_RESTART)
1088 return (VM_FAULT_RETRY);
1089 else
1090 return (VM_FAULT_INTERRUPTED);
1091 }
1092 continue;
1093 }
1094 if (m->vmp_laundry) {
1095 m->vmp_free_when_done = FALSE;
1096
1097 if (!m->vmp_cleaning)
1098 vm_pageout_steal_laundry(m, FALSE);
1099 }
1100 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
1101 /*
1102 * Guard page: off limits !
1103 */
1104 if (fault_type == VM_PROT_NONE) {
1105 /*
1106 * The fault is not requesting any
1107 * access to the guard page, so it must
1108 * be just to wire or unwire it.
1109 * Let's pretend it succeeded...
1110 */
1111 m->vmp_busy = TRUE;
1112 *result_page = m;
1113 assert(first_m == VM_PAGE_NULL);
1114 *top_page = first_m;
1115 if (type_of_fault)
1116 *type_of_fault = DBG_GUARD_FAULT;
1117 thread_interrupt_level(interruptible_state);
1118 return VM_FAULT_SUCCESS;
1119 } else {
1120 /*
1121 * The fault requests access to the
1122 * guard page: let's deny that !
1123 */
1124 vm_fault_cleanup(object, first_m);
1125 thread_interrupt_level(interruptible_state);
1126 return VM_FAULT_MEMORY_ERROR;
1127 }
1128 }
1129
1130 if (m->vmp_error) {
1131 /*
1132 * The page is in error, give up now.
1133 */
1134#if TRACEFAULTPAGE
1135 dbgTrace(0xBEEF0006, (unsigned int) m, (unsigned int) error_code); /* (TEST/DEBUG) */
1136#endif
1137 if (error_code)
1138 *error_code = KERN_MEMORY_ERROR;
1139 VM_PAGE_FREE(m);
1140
1141 vm_fault_cleanup(object, first_m);
1142 thread_interrupt_level(interruptible_state);
1143
1144 return (VM_FAULT_MEMORY_ERROR);
1145 }
1146 if (m->vmp_restart) {
1147 /*
1148 * The pager wants us to restart
1149 * at the top of the chain,
1150 * typically because it has moved the
1151 * page to another pager, then do so.
1152 */
1153#if TRACEFAULTPAGE
1154 dbgTrace(0xBEEF0007, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1155#endif
1156 VM_PAGE_FREE(m);
1157
1158 vm_fault_cleanup(object, first_m);
1159 thread_interrupt_level(interruptible_state);
1160
1161 return (VM_FAULT_RETRY);
1162 }
1163 if (m->vmp_absent) {
1164 /*
1165 * The page isn't busy, but is absent,
1166 * therefore it's deemed "unavailable".
1167 *
1168 * Remove the non-existent page (unless it's
1169 * in the top object) and move on down to the
1170 * next object (if there is one).
1171 */
1172#if TRACEFAULTPAGE
1173 dbgTrace(0xBEEF0008, (unsigned int) m, (unsigned int) object->shadow); /* (TEST/DEBUG) */
1174#endif
1175 next_object = object->shadow;
1176
1177 if (next_object == VM_OBJECT_NULL) {
1178 /*
1179 * Absent page at bottom of shadow
1180 * chain; zero fill the page we left
1181 * busy in the first object, and free
1182 * the absent page.
1183 */
1184 assert(!must_be_resident);
1185
1186 /*
1187 * check for any conditions that prevent
1188 * us from creating a new zero-fill page
1189 * vm_fault_check will do all of the
1190 * fault cleanup in the case of an error condition
1191 * including resetting the thread_interrupt_level
1192 */
1193 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1194
1195 if (error != VM_FAULT_SUCCESS)
1196 return (error);
1197
1198 XPR(XPR_VM_FAULT,
1199 "vm_f_page: zero obj 0x%X, off 0x%X, page 0x%X, first_obj 0x%X\n",
1200 object, offset,
1201 m,
1202 first_object, 0);
1203
1204 if (object != first_object) {
1205 /*
1206 * free the absent page we just found
1207 */
1208 VM_PAGE_FREE(m);
1209
1210 /*
1211 * drop reference and lock on current object
1212 */
1213 vm_object_paging_end(object);
1214 vm_object_unlock(object);
1215
1216 /*
1217 * grab the original page we
1218 * 'soldered' in place and
1219 * retake lock on 'first_object'
1220 */
1221 m = first_m;
1222 first_m = VM_PAGE_NULL;
1223
1224 object = first_object;
1225 offset = first_offset;
1226
1227 vm_object_lock(object);
1228 } else {
1229 /*
1230 * we're going to use the absent page we just found
1231 * so convert it to a 'busy' page
1232 */
1233 m->vmp_absent = FALSE;
1234 m->vmp_busy = TRUE;
1235 }
1236 if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1237 m->vmp_absent = TRUE;
1238 /*
1239 * zero-fill the page and put it on
1240 * the correct paging queue
1241 */
1242 my_fault = vm_fault_zero_page(m, no_zero_fill);
1243
1244 break;
1245 } else {
1246 if (must_be_resident)
1247 vm_object_paging_end(object);
1248 else if (object != first_object) {
1249 vm_object_paging_end(object);
1250 VM_PAGE_FREE(m);
1251 } else {
1252 first_m = m;
1253 m->vmp_absent = FALSE;
1254 m->vmp_busy = TRUE;
1255
1256 vm_page_lockspin_queues();
1257 vm_page_queues_remove(m, FALSE);
1258 vm_page_unlock_queues();
1259 }
1260 XPR(XPR_VM_FAULT,
1261 "vm_f_page: unavail obj 0x%X, off 0x%X, next_obj 0x%X, newoff 0x%X\n",
1262 object, offset,
1263 next_object,
1264 offset+object->vo_shadow_offset,0);
1265
1266 offset += object->vo_shadow_offset;
1267 fault_info->lo_offset += object->vo_shadow_offset;
1268 fault_info->hi_offset += object->vo_shadow_offset;
1269 access_required = VM_PROT_READ;
1270
1271 vm_object_lock(next_object);
1272 vm_object_unlock(object);
1273 object = next_object;
1274 vm_object_paging_begin(object);
1275
1276 /*
1277 * reset to default type of fault
1278 */
1279 my_fault = DBG_CACHE_HIT_FAULT;
1280
1281 continue;
1282 }
1283 }
1284 if ((m->vmp_cleaning)
1285 && ((object != first_object) || (object->copy != VM_OBJECT_NULL))
1286 && (fault_type & VM_PROT_WRITE)) {
1287 /*
1288 * This is a copy-on-write fault that will
1289 * cause us to revoke access to this page, but
1290 * this page is in the process of being cleaned
1291 * in a clustered pageout. We must wait until
1292 * the cleaning operation completes before
1293 * revoking access to the original page,
1294 * otherwise we might attempt to remove a
1295 * wired mapping.
1296 */
1297#if TRACEFAULTPAGE
1298 dbgTrace(0xBEEF0009, (unsigned int) m, (unsigned int) offset); /* (TEST/DEBUG) */
1299#endif
1300 XPR(XPR_VM_FAULT,
1301 "vm_f_page: cleaning obj 0x%X, offset 0x%X, page 0x%X\n",
1302 object, offset,
1303 m, 0, 0);
1304 /*
1305 * take an extra ref so that object won't die
1306 */
1307 vm_object_reference_locked(object);
1308
1309 vm_fault_cleanup(object, first_m);
1310
1311 counter(c_vm_fault_page_block_backoff_kernel++);
1312 vm_object_lock(object);
1313 assert(object->ref_count > 0);
1314
1315 m = vm_page_lookup(object, offset);
1316
1317 if (m != VM_PAGE_NULL && m->vmp_cleaning) {
1318 PAGE_ASSERT_WAIT(m, interruptible);
1319
1320 vm_object_unlock(object);
1321 wait_result = thread_block(THREAD_CONTINUE_NULL);
1322 vm_object_deallocate(object);
1323
1324 goto backoff;
1325 } else {
1326 vm_object_unlock(object);
1327
1328 vm_object_deallocate(object);
1329 thread_interrupt_level(interruptible_state);
1330
1331 return (VM_FAULT_RETRY);
1332 }
1333 }
1334 if (type_of_fault == NULL && (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) &&
1335 !(fault_info != NULL && fault_info->stealth)) {
			/*
			 * If we were passed a non-NULL pointer for
			 * "type_of_fault", then we came from
			 * vm_fault... we'll let it deal with
			 * this condition, since it
			 * needs to see m->vmp_speculative to correctly
			 * account the pageins, otherwise...
			 * take it off the speculative queue, we'll
			 * let the caller of vm_fault_page deal
			 * with getting it onto the correct queue
			 *
			 * If the caller specified in fault_info that
			 * it wants a "stealth" fault, we also leave
			 * the page in the speculative queue.
			 */
1351 vm_page_lockspin_queues();
1352 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q)
1353 vm_page_queues_remove(m, FALSE);
1354 vm_page_unlock_queues();
1355 }
1356 assert(object == VM_PAGE_OBJECT(m));
1357
1358 if (object->code_signed) {
1359 /*
1360 * CODE SIGNING:
1361 * We just paged in a page from a signed
1362 * memory object but we don't need to
1363 * validate it now. We'll validate it if
1364 * when it gets mapped into a user address
1365 * space for the first time or when the page
1366 * gets copied to another object as a result
1367 * of a copy-on-write.
1368 */
1369 }
1370
		/*
		 * We mark the page busy and leave it on
		 * the pageout queues.  If the pageout
		 * daemon comes across it, then it will
		 * remove the page from the queue, but not the object
		 */
1377#if TRACEFAULTPAGE
1378 dbgTrace(0xBEEF000B, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1379#endif
1380 XPR(XPR_VM_FAULT,
1381 "vm_f_page: found page obj 0x%X, offset 0x%X, page 0x%X\n",
1382 object, offset, m, 0, 0);
1383 assert(!m->vmp_busy);
1384 assert(!m->vmp_absent);
1385
1386 m->vmp_busy = TRUE;
1387 break;
1388 }
1389
1390
1391 /*
1392 * we get here when there is no page present in the object at
1393 * the offset we're interested in... we'll allocate a page
1394 * at this point if the pager associated with
1395 * this object can provide the data or we're the top object...
1396 * object is locked; m == NULL
1397 */
1398
1399 if (must_be_resident) {
1400 if (fault_type == VM_PROT_NONE &&
1401 object == kernel_object) {
1402 /*
1403 * We've been called from vm_fault_unwire()
1404 * while removing a map entry that was allocated
1405 * with KMA_KOBJECT and KMA_VAONLY. This page
1406 * is not present and there's nothing more to
1407 * do here (nothing to unwire).
1408 */
1409 vm_fault_cleanup(object, first_m);
1410 thread_interrupt_level(interruptible_state);
1411
1412 return VM_FAULT_MEMORY_ERROR;
1413 }
1414
1415 goto dont_look_for_page;
1416 }
1417
1418 /* Don't expect to fault pages into the kernel object. */
1419 assert(object != kernel_object);
1420
1421 data_supply = FALSE;
1422
1423 look_for_page = (object->pager_created && (MUST_ASK_PAGER(object, offset, external_state) == TRUE) && !data_supply);
1424
1425#if TRACEFAULTPAGE
1426 dbgTrace(0xBEEF000C, (unsigned int) look_for_page, (unsigned int) object); /* (TEST/DEBUG) */
1427#endif
1428 if (!look_for_page && object == first_object && !object->phys_contiguous) {
1429 /*
1430 * Allocate a new page for this object/offset pair as a placeholder
1431 */
1432 m = vm_page_grab_options(grab_options);
1433#if TRACEFAULTPAGE
1434 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1435#endif
1436 if (m == VM_PAGE_NULL) {
1437
1438 vm_fault_cleanup(object, first_m);
1439 thread_interrupt_level(interruptible_state);
1440
1441 return (VM_FAULT_MEMORY_SHORTAGE);
1442 }
1443
1444 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1445 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1446 } else {
1447 vm_page_insert(m, object, offset);
1448 }
1449 }
1450 if (look_for_page) {
1451 kern_return_t rc;
1452 int my_fault_type;
1453
1454 /*
1455 * If the memory manager is not ready, we
1456 * cannot make requests.
1457 */
1458 if (!object->pager_ready) {
1459#if TRACEFAULTPAGE
1460 dbgTrace(0xBEEF000E, (unsigned int) 0, (unsigned int) 0); /* (TEST/DEBUG) */
1461#endif
1462 if (m != VM_PAGE_NULL)
1463 VM_PAGE_FREE(m);
1464
1465 XPR(XPR_VM_FAULT,
1466 "vm_f_page: ready wait obj 0x%X, offset 0x%X\n",
1467 object, offset, 0, 0, 0);
1468
1469 /*
1470 * take an extra ref so object won't die
1471 */
1472 vm_object_reference_locked(object);
1473 vm_fault_cleanup(object, first_m);
1474 counter(c_vm_fault_page_block_backoff_kernel++);
1475
1476 vm_object_lock(object);
1477 assert(object->ref_count > 0);
1478
1479 if (!object->pager_ready) {
1480 wait_result = vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGER_READY, interruptible);
1481
1482 vm_object_unlock(object);
1483 if (wait_result == THREAD_WAITING)
1484 wait_result = thread_block(THREAD_CONTINUE_NULL);
1485 vm_object_deallocate(object);
1486
1487 goto backoff;
1488 } else {
1489 vm_object_unlock(object);
1490 vm_object_deallocate(object);
1491 thread_interrupt_level(interruptible_state);
1492
1493 return (VM_FAULT_RETRY);
1494 }
1495 }
1496 if (!object->internal && !object->phys_contiguous && object->paging_in_progress > vm_object_pagein_throttle) {
1497 /*
1498 * If there are too many outstanding page
1499 * requests pending on this external object, we
1500 * wait for them to be resolved now.
1501 */
1502#if TRACEFAULTPAGE
1503 dbgTrace(0xBEEF0010, (unsigned int) m, (unsigned int) 0); /* (TEST/DEBUG) */
1504#endif
1505 if (m != VM_PAGE_NULL)
1506 VM_PAGE_FREE(m);
1507 /*
1508 * take an extra ref so object won't die
1509 */
1510 vm_object_reference_locked(object);
1511
1512 vm_fault_cleanup(object, first_m);
1513
1514 counter(c_vm_fault_page_block_backoff_kernel++);
1515
1516 vm_object_lock(object);
1517 assert(object->ref_count > 0);
1518
1519 if (object->paging_in_progress >= vm_object_pagein_throttle) {
1520 vm_object_assert_wait(object, VM_OBJECT_EVENT_PAGING_ONLY_IN_PROGRESS, interruptible);
1521
1522 vm_object_unlock(object);
1523 wait_result = thread_block(THREAD_CONTINUE_NULL);
1524 vm_object_deallocate(object);
1525
1526 goto backoff;
1527 } else {
1528 vm_object_unlock(object);
1529 vm_object_deallocate(object);
1530 thread_interrupt_level(interruptible_state);
1531
1532 return (VM_FAULT_RETRY);
1533 }
1534 }
1535 if (object->internal) {
1536 int compressed_count_delta;
1537
1538 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
1539
1540 if (m == VM_PAGE_NULL) {
1541 /*
1542 * Allocate a new page for this object/offset pair as a placeholder
1543 */
1544 m = vm_page_grab_options(grab_options);
1545#if TRACEFAULTPAGE
1546 dbgTrace(0xBEEF000D, (unsigned int) m, (unsigned int) object); /* (TEST/DEBUG) */
1547#endif
1548 if (m == VM_PAGE_NULL) {
1549
1550 vm_fault_cleanup(object, first_m);
1551 thread_interrupt_level(interruptible_state);
1552
1553 return (VM_FAULT_MEMORY_SHORTAGE);
1554 }
1555
1556 m->vmp_absent = TRUE;
1557 if (fault_info && fault_info->batch_pmap_op == TRUE) {
1558 vm_page_insert_internal(m, object, offset, VM_KERN_MEMORY_NONE, FALSE, TRUE, TRUE, FALSE, NULL);
1559 } else {
1560 vm_page_insert(m, object, offset);
1561 }
1562 }
1563 assert(m->vmp_busy);
1564
1565 m->vmp_absent = TRUE;
1566 pager = object->pager;
1567
1568 assert(object->paging_in_progress > 0);
1569 vm_object_unlock(object);
1570
1571 rc = vm_compressor_pager_get(
1572 pager,
1573 offset + object->paging_offset,
1574 VM_PAGE_GET_PHYS_PAGE(m),
1575 &my_fault_type,
1576 0,
1577 &compressed_count_delta);
1578
1579 if (type_of_fault == NULL) {
1580 int throttle_delay;
1581
1582 /*
1583 * we weren't called from vm_fault, so we
1584 * need to apply page creation throttling
1585 * do it before we re-acquire any locks
1586 */
1587 if (my_fault_type == DBG_COMPRESSOR_FAULT) {
1588 if ((throttle_delay = vm_page_throttled(TRUE))) {
1589 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 1, 0);
1590 delay(throttle_delay);
1591 }
1592 }
1593 }
1594 vm_object_lock(object);
1595 assert(object->paging_in_progress > 0);
1596
1597 vm_compressor_pager_count(
1598 pager,
1599 compressed_count_delta,
1600 FALSE, /* shared_lock */
1601 object);
1602
1603 switch (rc) {
1604 case KERN_SUCCESS:
1605 m->vmp_absent = FALSE;
1606 m->vmp_dirty = TRUE;
1607 if ((object->wimg_bits &
1608 VM_WIMG_MASK) !=
1609 VM_WIMG_USE_DEFAULT) {
1610 /*
1611 * If the page is not cacheable,
1612 * we can't let its contents
1613 * linger in the data cache
1614 * after the decompression.
1615 */
1616 pmap_sync_page_attributes_phys(
1617 VM_PAGE_GET_PHYS_PAGE(m));
1618 } else {
1619 m->vmp_written_by_kernel = TRUE;
1620 }
1621
1622 /*
1623 * If the object is purgeable, its
1624 * owner's purgeable ledgers have been
1625 * updated in vm_page_insert() but the
1626 * page was also accounted for in a
1627 * "compressed purgeable" ledger, so
1628 * update that now.
1629 */
1630 if (((object->purgable !=
1631 VM_PURGABLE_DENY) ||
1632 object->vo_ledger_tag) &&
1633 (object->vo_owner !=
1634 NULL)) {
1635 /*
1636 * One less compressed
1637 * purgeable/tagged page.
1638 */
1639 vm_object_owner_compressed_update(
1640 object,
1641 -1);
1642 }
1643
1644 break;
1645 case KERN_MEMORY_FAILURE:
1646 m->vmp_unusual = TRUE;
1647 m->vmp_error = TRUE;
1648 m->vmp_absent = FALSE;
1649 break;
1650 case KERN_MEMORY_ERROR:
1651 assert(m->vmp_absent);
1652 break;
1653 default:
1654 panic("vm_fault_page(): unexpected "
1655 "error %d from "
1656 "vm_compressor_pager_get()\n",
1657 rc);
1658 }
1659 PAGE_WAKEUP_DONE(m);
1660
1661 rc = KERN_SUCCESS;
1662 goto data_requested;
1663 }
1664 my_fault_type = DBG_PAGEIN_FAULT;
1665
1666 if (m != VM_PAGE_NULL) {
1667 VM_PAGE_FREE(m);
1668 m = VM_PAGE_NULL;
1669 }
1670
1671#if TRACEFAULTPAGE
1672 dbgTrace(0xBEEF0012, (unsigned int) object, (unsigned int) 0); /* (TEST/DEBUG) */
1673#endif
1674
1675 /*
1676 * It's possible someone called vm_object_destroy while we weren't
1677 * holding the object lock. If that has happened, then bail out
1678 * here.
1679 */
1680
1681 pager = object->pager;
1682
1683 if (pager == MEMORY_OBJECT_NULL) {
1684 vm_fault_cleanup(object, first_m);
1685 thread_interrupt_level(interruptible_state);
1686 return VM_FAULT_MEMORY_ERROR;
1687 }
1688
1689 /*
1690 * We have an absent page in place for the faulting offset,
1691 * so we can release the object lock.
1692 */
1693
1694 if (object->object_is_shared_cache) {
1695 set_thread_rwlock_boost();
1696 }
1697
1698 vm_object_unlock(object);
1699
1700 /*
1701 * If this object uses a copy_call strategy,
1702 * and we are interested in a copy of this object
1703 * (having gotten here only by following a
1704 * shadow chain), then tell the memory manager
1705 * via a flag added to the desired_access
1706 * parameter, so that it can detect a race
1707 * between our walking down the shadow chain
1708 * and its pushing pages up into a copy of
1709 * the object that it manages.
1710 */
1711 if (object->copy_strategy == MEMORY_OBJECT_COPY_CALL && object != first_object)
1712 wants_copy_flag = VM_PROT_WANTS_COPY;
1713 else
1714 wants_copy_flag = VM_PROT_NONE;
1715
1716 XPR(XPR_VM_FAULT,
1717 "vm_f_page: data_req obj 0x%X, offset 0x%X, page 0x%X, acc %d\n",
1718 object, offset, m,
1719 access_required | wants_copy_flag, 0);
1720
1721 if (object->copy == first_object) {
1722 /*
1723 * if we issue the memory_object_data_request in
1724 * this state, we are subject to a deadlock with
1725 * the underlying filesystem if it is trying to
1726 * shrink the file resulting in a push of pages
1727 * into the copy object... that push will stall
1728 * on the placeholder page, and if the pushing thread
1729 * is holding a lock that is required on the pagein
1730 * path (such as a truncate lock), we'll deadlock...
1731 * to avoid this potential deadlock, we throw away
1732 * our placeholder page before calling memory_object_data_request
1733 * and force this thread to retry the vm_fault_page after
1734 * we have issued the I/O. the second time through this path
1735 * we will find the page already in the cache (presumably still
1736 * busy waiting for the I/O to complete) and then complete
1737 * the fault w/o having to go through memory_object_data_request again
1738 */
1739 assert(first_m != VM_PAGE_NULL);
1740 assert(VM_PAGE_OBJECT(first_m) == first_object);
1741
1742 vm_object_lock(first_object);
1743 VM_PAGE_FREE(first_m);
1744 vm_object_paging_end(first_object);
1745 vm_object_unlock(first_object);
1746
1747 first_m = VM_PAGE_NULL;
1748 force_fault_retry = TRUE;
1749
1750 vm_fault_page_forced_retry++;
1751 }
1752
1753 if (data_already_requested == TRUE) {
1754 orig_behavior = fault_info->behavior;
1755 orig_cluster_size = fault_info->cluster_size;
1756
1757 fault_info->behavior = VM_BEHAVIOR_RANDOM;
1758 fault_info->cluster_size = PAGE_SIZE;
1759 }
1760 /*
1761 * Call the memory manager to retrieve the data.
1762 */
1763 rc = memory_object_data_request(
1764 pager,
1765 offset + object->paging_offset,
1766 PAGE_SIZE,
1767 access_required | wants_copy_flag,
1768 (memory_object_fault_info_t)fault_info);
1769
1770 if (data_already_requested == TRUE) {
1771 fault_info->behavior = orig_behavior;
1772 fault_info->cluster_size = orig_cluster_size;
1773 } else
1774 data_already_requested = TRUE;
1775
1776 DTRACE_VM2(maj_fault, int, 1, (uint64_t *), NULL);
1777#if TRACEFAULTPAGE
1778 dbgTrace(0xBEEF0013, (unsigned int) object, (unsigned int) rc); /* (TEST/DEBUG) */
1779#endif
1780 vm_object_lock(object);
1781
1782 if (object->object_is_shared_cache) {
1783 clear_thread_rwlock_boost();
1784 }
1785
1786 data_requested:
1787 if (rc != KERN_SUCCESS) {
1788
1789 vm_fault_cleanup(object, first_m);
1790 thread_interrupt_level(interruptible_state);
1791
1792 return ((rc == MACH_SEND_INTERRUPTED) ?
1793 VM_FAULT_INTERRUPTED :
1794 VM_FAULT_MEMORY_ERROR);
1795 } else {
1796 clock_sec_t tv_sec;
1797 clock_usec_t tv_usec;
1798
1799 if (my_fault_type == DBG_PAGEIN_FAULT) {
1800 clock_get_system_microtime(&tv_sec, &tv_usec);
1801 current_thread()->t_page_creation_time = tv_sec;
1802 current_thread()->t_page_creation_count = 0;
1803 }
1804 }
1805 if ((interruptible != THREAD_UNINT) && (current_thread()->sched_flags & TH_SFLAG_ABORT)) {
1806
1807 vm_fault_cleanup(object, first_m);
1808 thread_interrupt_level(interruptible_state);
1809
1810 return (VM_FAULT_INTERRUPTED);
1811 }
1812 if (force_fault_retry == TRUE) {
1813
1814 vm_fault_cleanup(object, first_m);
1815 thread_interrupt_level(interruptible_state);
1816
1817 return (VM_FAULT_RETRY);
1818 }
1819 if (m == VM_PAGE_NULL && object->phys_contiguous) {
1820 /*
1821 * No page here means that the object we
1822 * initially looked up was "physically
1823 * contiguous" (i.e. device memory). However,
1824 * with Virtual VRAM, the object might not
1825 * be backed by that device memory anymore,
1826 * so we're done here only if the object is
1827 * still "phys_contiguous".
1828 * Otherwise, if the object is no longer
1829 * "phys_contiguous", we need to retry the
1830 * page fault against the object's new backing
1831 * store (different memory object).
1832 */
1833 phys_contig_object:
1834 goto done;
1835 }
1836 /*
1837 * potentially a pagein fault
1838 * if we make it through the state checks
1839 * above, then we'll count it as such
1840 */
1841 my_fault = my_fault_type;
1842
1843 /*
1844 * Retry with same object/offset, since new data may
1845 * be in a different page (i.e., m is meaningless at
1846 * this point).
1847 */
1848 continue;
1849 }
1850dont_look_for_page:
1851 /*
1852 * We get here if the object has no pager, or an existence map
1853 * exists and indicates the page isn't present on the pager
1854 * or we're unwiring a page. If a pager exists, but there
1855 * is no existence map, then the m->vmp_absent case above handles
1856 * the zero-fill (ZF) case when the pager can't provide the page.
1857 */
1858#if TRACEFAULTPAGE
1859 dbgTrace(0xBEEF0014, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1860#endif
1861 if (object == first_object)
1862 first_m = m;
1863 else
1864 assert(m == VM_PAGE_NULL);
1865
1866 XPR(XPR_VM_FAULT,
1867 "vm_f_page: no pager obj 0x%X, offset 0x%X, page 0x%X, next_obj 0x%X\n",
1868 object, offset, m,
1869 object->shadow, 0);
1870
1871 next_object = object->shadow;
1872
1873 if (next_object == VM_OBJECT_NULL) {
1874 /*
1875 * we've hit the bottom of the shadow chain,
1876 * fill the page in the top object with zeros.
1877 */
1878 assert(!must_be_resident);
1879
1880 if (object != first_object) {
1881 vm_object_paging_end(object);
1882 vm_object_unlock(object);
1883
1884 object = first_object;
1885 offset = first_offset;
1886 vm_object_lock(object);
1887 }
1888 m = first_m;
1889 assert(VM_PAGE_OBJECT(m) == object);
1890 first_m = VM_PAGE_NULL;
1891
1892 /*
1893 * check for any conditions that prevent
1894 * us from creating a new zero-fill page.
1895 * vm_fault_check will do all of the
1896 * fault cleanup in the case of an error condition,
1897 * including resetting the thread_interrupt_level.
1898 */
1899 error = vm_fault_check(object, m, first_m, interruptible_state, (type_of_fault == NULL) ? TRUE : FALSE);
1900
1901 if (error != VM_FAULT_SUCCESS)
1902 return (error);
1903
1904 if (m == VM_PAGE_NULL) {
1905 m = vm_page_grab_options(grab_options);
1906
1907 if (m == VM_PAGE_NULL) {
1908 vm_fault_cleanup(object, VM_PAGE_NULL);
1909 thread_interrupt_level(interruptible_state);
1910
1911 return (VM_FAULT_MEMORY_SHORTAGE);
1912 }
1913 vm_page_insert(m, object, offset);
1914 }
1915 if (fault_info->mark_zf_absent && no_zero_fill == TRUE)
1916 m->vmp_absent = TRUE;
1917
1918 my_fault = vm_fault_zero_page(m, no_zero_fill);
1919
1920 break;
1921
1922 } else {
1923 /*
1924 * Move on to the next object. Lock the next
1925 * object before unlocking the current one.
1926 */
1927 if ((object != first_object) || must_be_resident)
1928 vm_object_paging_end(object);
1929
1930 offset += object->vo_shadow_offset;
1931 fault_info->lo_offset += object->vo_shadow_offset;
1932 fault_info->hi_offset += object->vo_shadow_offset;
1933 access_required = VM_PROT_READ;
1934
1935 vm_object_lock(next_object);
1936 vm_object_unlock(object);
1937
1938 object = next_object;
1939 vm_object_paging_begin(object);
1940 }
1941 }
1942
1943 /*
1944 * PAGE HAS BEEN FOUND.
1945 *
1946 * This page (m) is:
1947 * busy, so that we can play with it;
1948 * not absent, so that nobody else will fill it;
1949 * possibly eligible for pageout;
1950 *
1951 * The top-level page (first_m) is:
1952 * VM_PAGE_NULL if the page was found in the
1953 * top-level object;
1954 * busy, not absent, and ineligible for pageout.
1955 *
1956 * The current object (object) is locked. A paging
1957 * reference is held for the current and top-level
1958 * objects.
1959 */
1960
1961#if TRACEFAULTPAGE
1962 dbgTrace(0xBEEF0015, (unsigned int) object, (unsigned int) m); /* (TEST/DEBUG) */
1963#endif
1964#if EXTRA_ASSERTIONS
1965 assert(m->vmp_busy && !m->vmp_absent);
1966 assert((first_m == VM_PAGE_NULL) ||
1967 (first_m->vmp_busy && !first_m->vmp_absent &&
1968 !first_m->vmp_active && !first_m->vmp_inactive && !first_m->vmp_secluded));
1969#endif /* EXTRA_ASSERTIONS */
1970
1971 XPR(XPR_VM_FAULT,
1972 "vm_f_page: FOUND obj 0x%X, off 0x%X, page 0x%X, 1_obj 0x%X, 1_m 0x%X\n",
1973 object, offset, m,
1974 first_object, first_m);
1975
1976 /*
1977 * If the page is being written, but isn't
1978 * already owned by the top-level object,
1979 * we have to copy it into a new page owned
1980 * by the top-level object.
1981 */
1982 if (object != first_object) {
1983
1984#if TRACEFAULTPAGE
1985 dbgTrace(0xBEEF0016, (unsigned int) object, (unsigned int) fault_type); /* (TEST/DEBUG) */
1986#endif
1987 if (fault_type & VM_PROT_WRITE) {
1988 vm_page_t copy_m;
1989
1990 /*
1991 * We only really need to copy if we
1992 * want to write it.
1993 */
1994 assert(!must_be_resident);
1995
1996 /*
1997 * If we try to collapse first_object at this
1998 * point, we may deadlock when we try to get
1999 * the lock on an intermediate object (since we
2000 * have the bottom object locked). We can't
2001 * unlock the bottom object, because the page
2002 * we found may move (by collapse) if we do.
2003 *
2004 * Instead, we first copy the page. Then, when
2005 * we have no more use for the bottom object,
2006 * we unlock it and try to collapse.
2007 *
2008 * Note that we copy the page even if we didn't
2009 * need to... that's the breaks.
2010 */
2011
2012 /*
2013 * Allocate a page for the copy
2014 */
2015 copy_m = vm_page_grab_options(grab_options);
2016
2017 if (copy_m == VM_PAGE_NULL) {
2018 RELEASE_PAGE(m);
2019
2020 vm_fault_cleanup(object, first_m);
2021 thread_interrupt_level(interruptible_state);
2022
2023 return (VM_FAULT_MEMORY_SHORTAGE);
2024 }
2025 XPR(XPR_VM_FAULT,
2026 "vm_f_page: page_copy obj 0x%X, offset 0x%X, m 0x%X, copy_m 0x%X\n",
2027 object, offset,
2028 m, copy_m, 0);
2029
2030 vm_page_copy(m, copy_m);
2031
2032 /*
2033 * If another map is truly sharing this
2034 * page with us, we have to flush all
2035 * uses of the original page, since we
2036 * can't distinguish those which want the
2037 * original from those which need the
2038 * new copy.
2039 *
2040 * XXXO If we know that only one map has
2041 * access to this page, then we could
2042 * avoid the pmap_disconnect() call.
2043 */
2044 if (m->vmp_pmapped)
2045 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2046
2047 if (m->vmp_clustered) {
2048 VM_PAGE_COUNT_AS_PAGEIN(m);
2049 VM_PAGE_CONSUME_CLUSTERED(m);
2050 }
2051 assert(!m->vmp_cleaning);
2052
2053 /*
2054 * We no longer need the old page or object.
2055 */
2056 RELEASE_PAGE(m);
2057
2058 /*
2059 * This check helps with marking the object as having a sequential pattern.
2060 * Normally we'd miss doing this below because this fault is about COW to
2061 * the first_object, i.e. bring the page in from disk and push it to the object
2062 * above, but don't update the file object's sequential pattern.
2063 */
2064 if (object->internal == FALSE) {
2065 vm_fault_is_sequential(object, offset, fault_info->behavior);
2066 }
2067
2068 vm_object_paging_end(object);
2069 vm_object_unlock(object);
2070
2071 my_fault = DBG_COW_FAULT;
2072 VM_STAT_INCR(cow_faults);
2073 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
2074 current_task()->cow_faults++;
2075
2076 object = first_object;
2077 offset = first_offset;
2078
2079 vm_object_lock(object);
2080 /*
2081 * get rid of the placeholder
2082 * page that we soldered in earlier
2083 */
2084 VM_PAGE_FREE(first_m);
2085 first_m = VM_PAGE_NULL;
2086
2087 /*
2088 * and replace it with the
2089 * page we just copied the data into
2090 */
2091 assert(copy_m->vmp_busy);
2092 vm_page_insert(copy_m, object, offset);
2093 SET_PAGE_DIRTY(copy_m, TRUE);
2094
2095 m = copy_m;
2096 /*
2097 * Now that we've gotten the copy out of the
2098 * way, let's try to collapse the top object.
2099 * But we have to play ugly games with
2100 * paging_in_progress to do that...
2101 */
2102 vm_object_paging_end(object);
2103 vm_object_collapse(object, offset, TRUE);
2104 vm_object_paging_begin(object);
2105
2106 } else
2107 *protection &= (~VM_PROT_WRITE);
2108 }
2109 /*
2110 * Now check whether the page needs to be pushed into the
2111 * copy object. The use of asymmetric copy on write for
2112 * shared temporary objects means that we may do two copies to
2113 * satisfy the fault; one above to get the page from a
2114 * shadowed object, and one here to push it into the copy.
2115 */
2116 try_failed_count = 0;
2117
2118 while ((copy_object = first_object->copy) != VM_OBJECT_NULL) {
2119 vm_object_offset_t copy_offset;
2120 vm_page_t copy_m;
2121
2122#if TRACEFAULTPAGE
2123 dbgTrace(0xBEEF0017, (unsigned int) copy_object, (unsigned int) fault_type); /* (TEST/DEBUG) */
2124#endif
2125 /*
2126 * If the page is being written, but hasn't been
2127 * copied to the copy-object, we have to copy it there.
2128 */
2129 if ((fault_type & VM_PROT_WRITE) == 0) {
2130 *protection &= ~VM_PROT_WRITE;
2131 break;
2132 }
2133
2134 /*
2135 * If the page was guaranteed to be resident,
2136 * we must have already performed the copy.
2137 */
2138 if (must_be_resident)
2139 break;
2140
2141 /*
2142 * Try to get the lock on the copy_object.
2143 */
2144 if (!vm_object_lock_try(copy_object)) {
2145
2146 vm_object_unlock(object);
2147 try_failed_count++;
2148
2149 mutex_pause(try_failed_count); /* wait a bit */
2150 vm_object_lock(object);
2151
2152 continue;
2153 }
2154 try_failed_count = 0;
2155
2156 /*
2157 * Make another reference to the copy-object,
2158 * to keep it from disappearing during the
2159 * copy.
2160 */
2161 vm_object_reference_locked(copy_object);
2162
2163 /*
2164 * Does the page exist in the copy?
2165 */
2166 copy_offset = first_offset - copy_object->vo_shadow_offset;
2167
2168 if (copy_object->vo_size <= copy_offset)
2169 /*
2170 * Copy object doesn't cover this page -- do nothing.
2171 */
2172 ;
2173 else if ((copy_m = vm_page_lookup(copy_object, copy_offset)) != VM_PAGE_NULL) {
2174 /*
2175 * Page currently exists in the copy object
2176 */
2177 if (copy_m->vmp_busy) {
2178 /*
2179 * If the page is being brought
2180 * in, wait for it and then retry.
2181 */
2182 RELEASE_PAGE(m);
2183
2184 /*
2185 * take an extra ref so object won't die
2186 */
2187 vm_object_reference_locked(copy_object);
2188 vm_object_unlock(copy_object);
2189 vm_fault_cleanup(object, first_m);
2190 counter(c_vm_fault_page_block_backoff_kernel++);
2191
2192 vm_object_lock(copy_object);
2193 assert(copy_object->ref_count > 0);
2194 VM_OBJ_RES_DECR(copy_object);
2195 vm_object_lock_assert_exclusive(copy_object);
2196 copy_object->ref_count--;
2197 assert(copy_object->ref_count > 0);
2198 copy_m = vm_page_lookup(copy_object, copy_offset);
2199
2200 if (copy_m != VM_PAGE_NULL && copy_m->vmp_busy) {
2201 PAGE_ASSERT_WAIT(copy_m, interruptible);
2202
2203 vm_object_unlock(copy_object);
2204 wait_result = thread_block(THREAD_CONTINUE_NULL);
2205 vm_object_deallocate(copy_object);
2206
2207 goto backoff;
2208 } else {
2209 vm_object_unlock(copy_object);
2210 vm_object_deallocate(copy_object);
2211 thread_interrupt_level(interruptible_state);
2212
2213 return (VM_FAULT_RETRY);
2214 }
2215 }
2216 }
2217 else if (!PAGED_OUT(copy_object, copy_offset)) {
2218 /*
2219 * If PAGED_OUT is TRUE, then the page used to exist
2220 * in the copy-object, and has already been paged out.
2221 * We don't need to repeat this. If PAGED_OUT is
2222 * FALSE, then either we don't know (!pager_created,
2223 * for example) or it hasn't been paged out.
2224 * (VM_EXTERNAL_STATE_UNKNOWN||VM_EXTERNAL_STATE_ABSENT)
2225 * We must copy the page to the copy object.
2226 *
2227 * Allocate a page for the copy
2228 */
2229 copy_m = vm_page_alloc(copy_object, copy_offset);
2230
2231 if (copy_m == VM_PAGE_NULL) {
2232 RELEASE_PAGE(m);
2233
2234 VM_OBJ_RES_DECR(copy_object);
2235 vm_object_lock_assert_exclusive(copy_object);
2236 copy_object->ref_count--;
2237 assert(copy_object->ref_count > 0);
2238
2239 vm_object_unlock(copy_object);
2240 vm_fault_cleanup(object, first_m);
2241 thread_interrupt_level(interruptible_state);
2242
2243 return (VM_FAULT_MEMORY_SHORTAGE);
2244 }
2245 /*
2246 * Must copy page into copy-object.
2247 */
2248 vm_page_copy(m, copy_m);
2249
2250 /*
2251 * If the old page was in use by any users
2252 * of the copy-object, it must be removed
2253 * from all pmaps. (We can't know which
2254 * pmaps use it.)
2255 */
2256 if (m->vmp_pmapped)
2257 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
2258
2259 if (m->vmp_clustered) {
2260 VM_PAGE_COUNT_AS_PAGEIN(m);
2261 VM_PAGE_CONSUME_CLUSTERED(m);
2262 }
2263 /*
2264 * If there's a pager, then immediately
2265 * page out this page, using the "initialize"
2266 * option. Else, we use the copy.
2267 */
2268 if ((!copy_object->pager_ready)
2269 || VM_COMPRESSOR_PAGER_STATE_GET(copy_object, copy_offset) == VM_EXTERNAL_STATE_ABSENT
2270 ) {
2271
2272 vm_page_lockspin_queues();
2273 assert(!m->vmp_cleaning);
2274 vm_page_activate(copy_m);
2275 vm_page_unlock_queues();
2276
2277 SET_PAGE_DIRTY(copy_m, TRUE);
2278 PAGE_WAKEUP_DONE(copy_m);
2279
2280 } else {
2281
2282 assert(copy_m->vmp_busy == TRUE);
2283 assert(!m->vmp_cleaning);
2284
2285 /*
2286 * dirty is protected by the object lock
2287 */
2288 SET_PAGE_DIRTY(copy_m, TRUE);
2289
2290 /*
2291 * The page is already ready for pageout:
2292 * not on pageout queues and busy.
2293 * Unlock everything except the
2294 * copy_object itself.
2295 */
2296 vm_object_unlock(object);
2297
2298 /*
2299 * Write the page to the copy-object,
2300 * flushing it from the kernel.
2301 */
2302 vm_pageout_initialize_page(copy_m);
2303
2304 /*
2305 * Since the pageout may have
2306 * temporarily dropped the
2307 * copy_object's lock, we
2308 * check whether we'll have
2309 * to deallocate the hard way.
2310 */
2311 if ((copy_object->shadow != object) || (copy_object->ref_count == 1)) {
2312 vm_object_unlock(copy_object);
2313 vm_object_deallocate(copy_object);
2314 vm_object_lock(object);
2315
2316 continue;
2317 }
2318 /*
2319 * Pick back up the old object's
2320 * lock. [It is safe to do so,
2321 * since it must be deeper in the
2322 * object tree.]
2323 */
2324 vm_object_lock(object);
2325 }
2326
2327 /*
2328 * Because we're pushing a page upward
2329 * in the object tree, we must restart
2330 * any faults that are waiting here.
2331 * [Note that this is an expansion of
2332 * PAGE_WAKEUP that uses the THREAD_RESTART
2333 * wait result]. Can't turn off the page's
2334 * busy bit because we're not done with it.
2335 */
2336 if (m->vmp_wanted) {
2337 m->vmp_wanted = FALSE;
2338 thread_wakeup_with_result((event_t) m, THREAD_RESTART);
2339 }
2340 }
2341 /*
2342 * The reference count on copy_object must be
2343 * at least 2: one for our extra reference,
2344 * and at least one from the outside world
2345 * (we checked that when we last locked
2346 * copy_object).
2347 */
2348 vm_object_lock_assert_exclusive(copy_object);
2349 copy_object->ref_count--;
2350 assert(copy_object->ref_count > 0);
2351
2352 VM_OBJ_RES_DECR(copy_object);
2353 vm_object_unlock(copy_object);
2354
2355 break;
2356 }
2357
2358done:
2359 *result_page = m;
2360 *top_page = first_m;
2361
2362 XPR(XPR_VM_FAULT,
2363 "vm_f_page: DONE obj 0x%X, offset 0x%X, m 0x%X, first_m 0x%X\n",
2364 object, offset, m, first_m, 0);
2365
2366 if (m != VM_PAGE_NULL) {
2367 assert(VM_PAGE_OBJECT(m) == object);
2368
2369 retval = VM_FAULT_SUCCESS;
2370
2371 if (my_fault == DBG_PAGEIN_FAULT) {
2372
2373 VM_PAGE_COUNT_AS_PAGEIN(m);
2374
2375 if (object->internal)
2376 my_fault = DBG_PAGEIND_FAULT;
2377 else
2378 my_fault = DBG_PAGEINV_FAULT;
2379
2380 /*
2381 * evaluate access pattern and update state
2382 * vm_fault_deactivate_behind depends on the
2383 * state being up to date
2384 */
2385 vm_fault_is_sequential(object, offset, fault_info->behavior);
2386 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2387
2388 } else if (type_of_fault == NULL && my_fault == DBG_CACHE_HIT_FAULT) {
2389 /*
2390 * we weren't called from vm_fault, so handle the
2391 * accounting here for hits in the cache
2392 */
2393 if (m->vmp_clustered) {
2394 VM_PAGE_COUNT_AS_PAGEIN(m);
2395 VM_PAGE_CONSUME_CLUSTERED(m);
2396 }
2397 vm_fault_is_sequential(object, offset, fault_info->behavior);
2398 vm_fault_deactivate_behind(object, offset, fault_info->behavior);
2399
2400 } else if (my_fault == DBG_COMPRESSOR_FAULT || my_fault == DBG_COMPRESSOR_SWAPIN_FAULT) {
2401
2402 VM_STAT_INCR(decompressions);
2403 }
2404 if (type_of_fault)
2405 *type_of_fault = my_fault;
2406 } else {
2407 retval = VM_FAULT_SUCCESS_NO_VM_PAGE;
2408 assert(first_m == VM_PAGE_NULL);
2409 assert(object == first_object);
2410 }
2411
2412 thread_interrupt_level(interruptible_state);
2413
2414#if TRACEFAULTPAGE
2415 dbgTrace(0xBEEF001A, (unsigned int) VM_FAULT_SUCCESS, 0); /* (TEST/DEBUG) */
2416#endif
2417 return retval;
2418
2419backoff:
2420 thread_interrupt_level(interruptible_state);
2421
2422 if (wait_result == THREAD_INTERRUPTED)
2423 return (VM_FAULT_INTERRUPTED);
2424 return (VM_FAULT_RETRY);
2425
2426#undef RELEASE_PAGE
2427}
2428
2429
2430
2431/*
2432 * CODE SIGNING:
2433 * When soft faulting a page, we have to validate the page if:
2434 * 1. the page is being mapped in user space
2435 * 2. the page hasn't already been found to be "tainted"
2436 * 3. the page belongs to a code-signed object
2437 * 4. the page has not been validated yet or has been mapped for write.
2438 */
2439#define VM_FAULT_NEED_CS_VALIDATION(pmap, page, page_obj) \
2440 ((pmap) != kernel_pmap /*1*/ && \
2441 !(page)->vmp_cs_tainted /*2*/ && \
2442 (page_obj)->code_signed /*3*/ && \
2443 (!(page)->vmp_cs_validated || (page)->vmp_wpmapped /*4*/))
2444
2445
2446/*
2447 * page queue lock must NOT be held
2448 * m->vmp_object must be locked
2449 *
2450 * NOTE: m->vmp_object could be locked "shared" only if we are called
2451 * from vm_fault() as part of a soft fault. If so, we must be
2452 * careful not to modify the VM object in any way that is not
2453 * legal under a shared lock...
2454 */
2455extern int panic_on_cs_killed;
2456extern int proc_selfpid(void);
2457extern char *proc_name_address(void *p);
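/* counters: code-signing-tainted pages rejected vs. accepted at fault time */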
2458unsigned long cs_enter_tainted_rejected = 0;
2459unsigned long cs_enter_tainted_accepted = 0;
2460kern_return_t
2461vm_fault_enter(vm_page_t m,
2462 pmap_t pmap,
2463 vm_map_offset_t vaddr,
2464 vm_prot_t prot,
2465 vm_prot_t caller_prot,
2466 boolean_t wired,
2467 boolean_t change_wiring,
2468 vm_tag_t wire_tag,
2469 vm_object_fault_info_t fault_info,
2470 boolean_t *need_retry,
2471 int *type_of_fault)
2472{
2473 kern_return_t kr, pe_result;
2474 boolean_t previously_pmapped = m->vmp_pmapped;
2475 boolean_t must_disconnect = 0;
2476 boolean_t map_is_switched, map_is_switch_protected;
2477 boolean_t cs_violation;
2478 int cs_enforcement_enabled;
2479 vm_prot_t fault_type;
2480 vm_object_t object;
2481 boolean_t no_cache = fault_info->no_cache;
2482 boolean_t cs_bypass = fault_info->cs_bypass;
2483 int pmap_options = fault_info->pmap_options;
2484
2485 fault_type = change_wiring ? VM_PROT_NONE : caller_prot;
2486 object = VM_PAGE_OBJECT(m);
2487
2488 vm_object_lock_assert_held(object);
2489
2490#if KASAN
2491 if (pmap == kernel_pmap) {
2492 kasan_notify_address(vaddr, PAGE_SIZE);
2493 }
2494#endif
2495
2496 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_NOTOWNED);
2497
2498 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
2499 assert(m->vmp_fictitious);
2500 return KERN_SUCCESS;
2501 }
2502
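	/*
	 * a zero-fill fault inserted a brand new page into the object,
	 * so the object lock must be held exclusively
	 */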
2503 if (*type_of_fault == DBG_ZERO_FILL_FAULT) {
2504
2505 vm_object_lock_assert_exclusive(object);
2506
2507 } else if ((fault_type & VM_PROT_WRITE) == 0 &&
2508 (!m->vmp_wpmapped
2509#if VM_OBJECT_ACCESS_TRACKING
2510 || object->access_tracking
2511#endif /* VM_OBJECT_ACCESS_TRACKING */
2512 )) {
2513 /*
2514 * This is not a "write" fault, so we
2515 * might not have taken the object lock
2516 * exclusively and we might not be able
2517 * to update the "wpmapped" bit in
2518 * vm_fault_enter().
2519 * Let's just grant read access to
2520 * the page for now and we'll
2521 * soft-fault again if we need write
2522 * access later...
2523 */
2524
2525 /* This had better not be a JIT page. */
2526 if (!pmap_has_prot_policy(prot)) {
2527 prot &= ~VM_PROT_WRITE;
2528 } else {
2529 assert(cs_bypass);
2530 }
2531 }
2532 if (m->vmp_pmapped == FALSE) {
2533
2534 if (m->vmp_clustered) {
2535 if (*type_of_fault == DBG_CACHE_HIT_FAULT) {
2536 /*
2537 * found it in the cache, but this
2538 * is the first fault-in of the page (m->vmp_pmapped == FALSE)
2539 * so it must have come in as part of
2540 * a cluster... account 1 pagein against it
2541 */
2542 if (object->internal)
2543 *type_of_fault = DBG_PAGEIND_FAULT;
2544 else
2545 *type_of_fault = DBG_PAGEINV_FAULT;
2546
2547 VM_PAGE_COUNT_AS_PAGEIN(m);
2548 }
2549 VM_PAGE_CONSUME_CLUSTERED(m);
2550 }
2551 }
2552
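	/*
	 * fire the DTrace address-space fault probe (and the kernel
	 * fault probe for kernel_pmap); COW faults are reported via
	 * the cow_fault probe instead
	 */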
2553 if (*type_of_fault != DBG_COW_FAULT) {
2554 DTRACE_VM2(as_fault, int, 1, (uint64_t *), NULL);
2555
2556 if (pmap == kernel_pmap) {
2557 DTRACE_VM2(kernel_asflt, int, 1, (uint64_t *), NULL);
2558 }
2559 }
2560
2561 /* Validate code signature if necessary. */
2562 if (!cs_bypass &&
2563 VM_FAULT_NEED_CS_VALIDATION(pmap, m, object)) {
2564 vm_object_lock_assert_exclusive(object);
2565
2566 if (m->vmp_cs_validated) {
2567 vm_cs_revalidates++;
2568 }
2569
2570 /* VM map is locked, so 1 ref will remain on VM object -
2571 * so no harm if vm_page_validate_cs drops the object lock */
2572
2573#if PMAP_CS
2574 if (fault_info->pmap_cs_associated &&
2575 pmap_cs_enforced(pmap) &&
2576 !m->vmp_cs_validated &&
2577 !m->vmp_cs_tainted &&
2578 !m->vmp_cs_nx &&
2579 (prot & VM_PROT_EXECUTE) &&
2580 (caller_prot & VM_PROT_EXECUTE)) {
2581 /*
2582 * With pmap_cs, the pmap layer will validate the
2583 * code signature for any executable pmap mapping.
2584 * No need for us to validate this page too:
2585 * in pmap_cs we trust...
2586 */
2587 vm_cs_defer_to_pmap_cs++;
2588 } else {
2589 vm_cs_defer_to_pmap_cs_not++;
2590 vm_page_validate_cs(m);
2591 }
2592#else /* PMAP_CS */
2593 vm_page_validate_cs(m);
2594#endif /* PMAP_CS */
2595 }
2596
2597#define page_immutable(m,prot) ((m)->vmp_cs_validated /*&& ((prot) & VM_PROT_EXECUTE)*/)
2598#define page_nx(m) ((m)->vmp_cs_nx)
2599
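	/*
	 * the map is considered "switched" if the faulting pmap belongs
	 * to the current thread's map but not to the current task's map
	 */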
2600 map_is_switched = ((pmap != vm_map_pmap(current_task()->map)) &&
2601 (pmap == vm_map_pmap(current_thread()->map)));
2602 map_is_switch_protected = current_thread()->map->switch_protect;
2603
2604 /* If the map is switched, and is switch-protected, we must protect
2605 * some pages from being write-faulted: immutable pages because by
2606 * definition they may not be written, and executable pages because that
2607 * would provide a way to inject unsigned code.
2608 * If the page is immutable, we can simply return. However, we can't
2609 * immediately determine whether a page is executable anywhere. But,
2610 * we can disconnect it everywhere and remove the executable protection
2611 * from the current map. We do that below right before we do the
2612 * PMAP_ENTER.
2613 */
2614 cs_enforcement_enabled = cs_process_enforcement(NULL);
2615
2616 if(cs_enforcement_enabled && map_is_switched &&
2617 map_is_switch_protected && page_immutable(m, prot) &&
2618 (prot & VM_PROT_WRITE))
2619 {
2620 return KERN_CODESIGN_ERROR;
2621 }
2622
2623 if (cs_enforcement_enabled && page_nx(m) && (prot & VM_PROT_EXECUTE)) {
2624 if (cs_debug)
2625 printf("page marked to be NX, not letting it be mapped EXEC\n");
2626 return KERN_CODESIGN_ERROR;
2627 }
2628
2629 /* A page could be tainted, or pose a risk of being tainted later.
2630 * Check whether the receiving process wants it, and make it feel
2631 * the consequences (that happens in cs_invalid_page()).
2632 * For CS Enforcement, two other conditions will
2633 * cause that page to be tainted as well:
2634 * - pmapping an unsigned page executable - this means unsigned code;
2635 * - writeable mapping of a validated page - the content of that page
2636 * can be changed without the kernel noticing, therefore unsigned
2637 * code can be created
2638 */
2639 if (cs_bypass) {
2640 /* code-signing is bypassed */
2641 cs_violation = FALSE;
2642 } else if (m->vmp_cs_tainted) {
2643 /* tainted page */
2644 cs_violation = TRUE;
2645 } else if (!cs_enforcement_enabled) {
2646 /* no further code-signing enforcement */
2647 cs_violation = FALSE;
2648 } else if (page_immutable(m, prot) &&
2649 ((prot & VM_PROT_WRITE) ||
2650 m->vmp_wpmapped)) {
2651 /*
2652 * The page should be immutable, but is in danger of being
2653 * modified.
2654 * This is the case where we want policy from the code
2655 * directory - is the page immutable or not? For now we have
2656 * to assume that code pages will be immutable, data pages not.
2657 * We'll assume a page is a code page if it has a code directory
2658 * and we fault for execution.
2659 * That is good enough since if we faulted the code page for
2660 * writing in another map before, it is wpmapped; if we fault
2661 * it for writing in this map later it will also be faulted for
2662 * executing at the same time; and if we fault for writing in
2663 * another map later, we will disconnect it from this pmap so
2664 * we'll notice the change.
2665 */
2666 cs_violation = TRUE;
2667 } else if (!m->vmp_cs_validated &&
2668 (prot & VM_PROT_EXECUTE)
2669#if PMAP_CS
2670 /*
2671 * Executable pages will be validated by pmap_cs;
2672 * in pmap_cs we trust...
2673 * If pmap_cs is turned off, this is a code-signing
2674 * violation.
2675 */
2676 && ! (pmap_cs_enforced(pmap))
2677#endif /* PMAP_CS */
2678 ) {
2679 cs_violation = TRUE;
2680 } else {
2681 cs_violation = FALSE;
2682 }
2683
2684 if (cs_violation) {
2685 /* We will have a tainted page. Have to handle the special case
2686 * of a switched map now. If the map is not switched, standard
2687 * procedure applies - call cs_invalid_page().
2688 * If the map is switched, the real owner is invalid already.
2689 * There is no point in invalidating the switching process since
2690 * it will not be executing from the map. So we don't call
2691 * cs_invalid_page() in that case. */
2692 boolean_t reject_page, cs_killed;
2693 if(map_is_switched) {
2694 assert(pmap==vm_map_pmap(current_thread()->map));
2695 assert(!(prot & VM_PROT_WRITE) || (map_is_switch_protected == FALSE));
2696 reject_page = FALSE;
2697 } else {
2698 if (cs_debug > 5)
2699 printf("vm_fault: signed: %s validate: %s tainted: %s wpmapped: %s prot: 0x%x\n",
2700 object->code_signed ? "yes" : "no",
2701 m->vmp_cs_validated ? "yes" : "no",
2702 m->vmp_cs_tainted ? "yes" : "no",
2703 m->vmp_wpmapped ? "yes" : "no",
2704 (int)prot);
2705 reject_page = cs_invalid_page((addr64_t) vaddr, &cs_killed);
2706 }
2707
2708 if (reject_page) {
2709 /* reject the invalid page: abort the page fault */
2710 int pid;
2711 const char *procname;
2712 task_t task;
2713 vm_object_t file_object, shadow;
2714 vm_object_offset_t file_offset;
2715 char *pathname, *filename;
2716 vm_size_t pathname_len, filename_len;
2717 boolean_t truncated_path;
2718#define __PATH_MAX 1024
2719 struct timespec mtime, cs_mtime;
2720 int shadow_depth;
2721 os_reason_t codesigning_exit_reason = OS_REASON_NULL;
2722
2723 kr = KERN_CODESIGN_ERROR;
2724 cs_enter_tainted_rejected++;
2725
2726 /* get process name and pid */
2727 procname = "?";
2728 task = current_task();
2729 pid = proc_selfpid();
2730 if (task->bsd_info != NULL)
2731 procname = proc_name_address(task->bsd_info);
2732
2733 /* get file's VM object */
2734 file_object = object;
2735 file_offset = m->vmp_offset;
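			/*
			 * walk hand-over-hand down the shadow chain to the
			 * bottom object (the one backed by the file's pager,
			 * if any), accumulating the shadow offsets along the way
			 */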
2736 for (shadow = file_object->shadow,
2737 shadow_depth = 0;
2738 shadow != VM_OBJECT_NULL;
2739 shadow = file_object->shadow,
2740 shadow_depth++) {
2741 vm_object_lock_shared(shadow);
2742 if (file_object != object) {
2743 vm_object_unlock(file_object);
2744 }
2745 file_offset += file_object->vo_shadow_offset;
2746 file_object = shadow;
2747 }
2748
2749 mtime.tv_sec = 0;
2750 mtime.tv_nsec = 0;
2751 cs_mtime.tv_sec = 0;
2752 cs_mtime.tv_nsec = 0;
2753
2754 /* get file's pathname and/or filename */
2755 pathname = NULL;
2756 filename = NULL;
2757 pathname_len = 0;
2758 filename_len = 0;
2759 truncated_path = FALSE;
2760 /* no pager -> no file -> no pathname, use "<nil>" in that case */
2761 if (file_object->pager != NULL) {
2762 pathname = (char *)kalloc(__PATH_MAX * 2);
2763 if (pathname) {
2764 pathname[0] = '\0';
2765 pathname_len = __PATH_MAX;
2766 filename = pathname + pathname_len;
2767 filename_len = __PATH_MAX;
2768 }
2769 vnode_pager_get_object_name(file_object->pager,
2770 pathname,
2771 pathname_len,
2772 filename,
2773 filename_len,
2774 &truncated_path);
2775 if (pathname) {
2776 /* safety first... */
2777 pathname[__PATH_MAX-1] = '\0';
2778 filename[__PATH_MAX-1] = '\0';
2779 }
2780 vnode_pager_get_object_mtime(file_object->pager,
2781 &mtime,
2782 &cs_mtime);
2783 }
2784 printf("CODE SIGNING: process %d[%s]: "
2785 "rejecting invalid page at address 0x%llx "
2786 "from offset 0x%llx in file \"%s%s%s\" "
2787 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2788 "(signed:%d validated:%d tainted:%d nx:%d "
2789 "wpmapped:%d dirty:%d depth:%d)\n",
2790 pid, procname, (addr64_t) vaddr,
2791 file_offset,
2792 (pathname ? pathname : "<nil>"),
2793 (truncated_path ? "/.../" : ""),
2794 (truncated_path ? filename : ""),
2795 cs_mtime.tv_sec, cs_mtime.tv_nsec,
2796 ((cs_mtime.tv_sec == mtime.tv_sec &&
2797 cs_mtime.tv_nsec == mtime.tv_nsec)
2798 ? "=="
2799 : "!="),
2800 mtime.tv_sec, mtime.tv_nsec,
2801 object->code_signed,
2802 m->vmp_cs_validated,
2803 m->vmp_cs_tainted,
2804 m->vmp_cs_nx,
2805 m->vmp_wpmapped,
2806 m->vmp_dirty,
2807 shadow_depth);
2808
2809 /*
2810 * We currently only generate an exit reason if cs_invalid_page directly killed a process. If cs_invalid_page
2811 * did not kill the process (more the case on desktop), vm_fault_enter will not satisfy the fault and whether the
2812 * process dies is dependent on whether there is a signal handler registered for SIGSEGV and how that handler
2813 * will deal with the segmentation fault.
2814 */
2815 if (cs_killed) {
2816 KERNEL_DEBUG_CONSTANT(BSDDBG_CODE(DBG_BSD_PROC, BSD_PROC_EXITREASON_CREATE) | DBG_FUNC_NONE,
2817 pid, OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE, 0, 0);
2818
2819 codesigning_exit_reason = os_reason_create(OS_REASON_CODESIGNING, CODESIGNING_EXIT_REASON_INVALID_PAGE);
2820 if (codesigning_exit_reason == NULL) {
2821 printf("vm_fault_enter: failed to allocate codesigning exit reason\n");
2822 } else {
2823 mach_vm_address_t data_addr = 0;
2824 struct codesigning_exit_reason_info *ceri = NULL;
2825 uint32_t reason_buffer_size_estimate = kcdata_estimate_required_buffer_size(1, sizeof(*ceri));
2826
2827 if (os_reason_alloc_buffer_noblock(codesigning_exit_reason, reason_buffer_size_estimate)) {
2828 printf("vm_fault_enter: failed to allocate buffer for codesigning exit reason\n");
2829 } else {
2830 if (KERN_SUCCESS == kcdata_get_memory_addr(&codesigning_exit_reason->osr_kcd_descriptor,
2831 EXIT_REASON_CODESIGNING_INFO, sizeof(*ceri), &data_addr)) {
2832 ceri = (struct codesigning_exit_reason_info *)data_addr;
2833 static_assert(__PATH_MAX == sizeof(ceri->ceri_pathname));
2834
2835 ceri->ceri_virt_addr = vaddr;
2836 ceri->ceri_file_offset = file_offset;
2837 if (pathname)
2838 strncpy((char *)&ceri->ceri_pathname, pathname, sizeof(ceri->ceri_pathname));
2839 else
2840 ceri->ceri_pathname[0] = '\0';
2841 if (filename)
2842 strncpy((char *)&ceri->ceri_filename, filename, sizeof(ceri->ceri_filename));
2843 else
2844 ceri->ceri_filename[0] = '\0';
2845 ceri->ceri_path_truncated = (truncated_path);
2846 ceri->ceri_codesig_modtime_secs = cs_mtime.tv_sec;
2847 ceri->ceri_codesig_modtime_nsecs = cs_mtime.tv_nsec;
2848 ceri->ceri_page_modtime_secs = mtime.tv_sec;
2849 ceri->ceri_page_modtime_nsecs = mtime.tv_nsec;
2850 ceri->ceri_object_codesigned = (object->code_signed);
2851 ceri->ceri_page_codesig_validated = (m->vmp_cs_validated);
2852 ceri->ceri_page_codesig_tainted = (m->vmp_cs_tainted);
2853 ceri->ceri_page_codesig_nx = (m->vmp_cs_nx);
2854 ceri->ceri_page_wpmapped = (m->vmp_wpmapped);
2855 ceri->ceri_page_slid = 0;
2856 ceri->ceri_page_dirty = (m->vmp_dirty);
2857 ceri->ceri_page_shadow_depth = shadow_depth;
2858 } else {
2859#if DEBUG || DEVELOPMENT
2860 panic("vm_fault_enter: failed to allocate kcdata for codesigning exit reason");
2861#else
2862 printf("vm_fault_enter: failed to allocate kcdata for codesigning exit reason\n");
2863#endif /* DEBUG || DEVELOPMENT */
2864 /* Free the buffer */
2865 os_reason_alloc_buffer_noblock(codesigning_exit_reason, 0);
2866 }
2867 }
2868 }
2869
2870 set_thread_exit_reason(current_thread(), codesigning_exit_reason, FALSE);
2871 }
2872 if (panic_on_cs_killed &&
2873 object->object_is_shared_cache) {
2874 panic("CODE SIGNING: process %d[%s]: "
2875 "rejecting invalid page at address 0x%llx "
2876 "from offset 0x%llx in file \"%s%s%s\" "
2877 "(cs_mtime:%lu.%ld %s mtime:%lu.%ld) "
2878 "(signed:%d validated:%d tainted:%d nx:%d "
2879 "wpmapped:%d dirty:%d depth:%d)\n",
2880 pid, procname, (addr64_t) vaddr,
2881 file_offset,
2882 (pathname ? pathname : "<nil>"),
2883 (truncated_path ? "/.../" : ""),
2884 (truncated_path ? filename : ""),
2885 cs_mtime.tv_sec, cs_mtime.tv_nsec,
2886 ((cs_mtime.tv_sec == mtime.tv_sec &&
2887 cs_mtime.tv_nsec == mtime.tv_nsec)
2888 ? "=="
2889 : "!="),
2890 mtime.tv_sec, mtime.tv_nsec,
2891 object->code_signed,
2892 m->vmp_cs_validated,
2893 m->vmp_cs_tainted,
2894 m->vmp_cs_nx,
2895 m->vmp_wpmapped,
2896 m->vmp_dirty,
2897 shadow_depth);
2898 }
2899
2900 if (file_object != object) {
2901 vm_object_unlock(file_object);
2902 }
2903 if (pathname_len != 0) {
2904 kfree(pathname, __PATH_MAX * 2);
2905 pathname = NULL;
2906 filename = NULL;
2907 }
2908 } else {
2909 /* proceed with the invalid page */
2910 kr = KERN_SUCCESS;
2911 if (!m->vmp_cs_validated &&
2912 !object->code_signed) {
2913 /*
2914 * This page has not been (fully) validated but
2915 * does not belong to a code-signed object
2916 * so it should not be forcefully considered
2917 * as tainted.
2918 * We're just concerned about it here because
2919 * we've been asked to "execute" it but that
2920 * does not mean that it should cause other
2921 * accesses to fail.
2922 * This happens when a debugger sets a
2923 * breakpoint and we then execute code in
2924 * that page. Marking the page as "tainted"
2925 * would cause any inspection tool ("leaks",
2926 * "vmmap", "CrashReporter", ...) to get killed
2927 * due to code-signing violation on that page,
2928 * even though they're just reading it and not
2929 * executing from it.
2930 */
2931 } else {
2932 /*
2933 * Page might have been tainted before or not;
2934 * now it definitively is. If the page wasn't
2935 * tainted, we must disconnect it from all
2936 * pmaps later, to force existing mappings
2937 * through that code path for re-consideration
2938 * of the validity of that page.
2939 */
2940 must_disconnect = !m->vmp_cs_tainted;
2941 m->vmp_cs_tainted = TRUE;
2942 }
2943 cs_enter_tainted_accepted++;
2944 }
2945 if (kr != KERN_SUCCESS) {
2946 if (cs_debug) {
2947 printf("CODESIGNING: vm_fault_enter(0x%llx): "
2948 "*** INVALID PAGE ***\n",
2949 (long long)vaddr);
2950 }
2951#if !SECURE_KERNEL
2952 if (cs_enforcement_panic) {
2953 panic("CODESIGNING: panicking on invalid page\n");
2954 }
2955#endif
2956 }
2957
2958 } else {
2959 /* proceed with the valid page */
2960 kr = KERN_SUCCESS;
2961 }
2962
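	/*
	 * the helper macros below take the page queues lock only when
	 * it is first needed and drop it only if it was ever taken
	 */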
2963 boolean_t page_queues_locked = FALSE;
2964#define __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED() \
2965MACRO_BEGIN \
2966 if (! page_queues_locked) { \
2967 page_queues_locked = TRUE; \
2968 vm_page_lockspin_queues(); \
2969 } \
2970MACRO_END
2971#define __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED() \
2972MACRO_BEGIN \
2973 if (page_queues_locked) { \
2974 page_queues_locked = FALSE; \
2975 vm_page_unlock_queues(); \
2976 } \
2977MACRO_END
2978
2979 /*
2980 * Hold queues lock to manipulate
2981 * the page queues. Change wiring
2982 * case is obvious.
2983 */
2984 assert((m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) || object != compressor_object);
2985
2986#if CONFIG_BACKGROUND_QUEUE
2987 vm_page_update_background_state(m);
2988#endif
2989 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
2990 /*
2991 * Compressor pages are neither wired
2992 * nor pageable and should never change.
2993 */
2994 assert(object == compressor_object);
2995 } else if (change_wiring) {
2996 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
2997
2998 if (wired) {
2999 if (kr == KERN_SUCCESS) {
3000 vm_page_wire(m, wire_tag, TRUE);
3001 }
3002 } else {
3003 vm_page_unwire(m, TRUE);
3004 }
3005 /* we keep the page queues lock, if we need it later */
3006
3007 } else {
3008 if (object->internal == TRUE) {
3009 /*
3010 * don't allow anonymous pages on
3011 * the speculative queues
3012 */
3013 no_cache = FALSE;
3014 }
3015 if (kr != KERN_SUCCESS) {
3016 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3017 vm_page_deactivate(m);
3018 /* we keep the page queues lock, if we need it later */
3019 } else if (((m->vmp_q_state == VM_PAGE_NOT_ON_Q) ||
3020 (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ||
3021 (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) ||
3022 ((m->vmp_q_state != VM_PAGE_ON_THROTTLED_Q) && no_cache)) &&
3023 !VM_PAGE_WIRED(m)) {
3024
3025 if (vm_page_local_q &&
3026 (*type_of_fault == DBG_COW_FAULT ||
3027 *type_of_fault == DBG_ZERO_FILL_FAULT) ) {
3028 struct vpl *lq;
3029 uint32_t lid;
3030
3031 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3032
3033 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3034 vm_object_lock_assert_exclusive(object);
3035
3036 /*
3037 * we got a local queue to stuff this
3038 * new page on...
3039 * it's safe to manipulate local and
3040 * local_id at this point since we're
3041 * behind an exclusive object lock and
3042 * the page is not on any global queue.
3043 *
3044 * we'll use the current cpu number to
3045 * select the queue... note that we don't
3046 * need to disable preemption... we're
3047 * going to be behind the local queue's
3048 * lock to do the real work
3049 */
3050 lid = cpu_number();
3051
3052 lq = &vm_page_local_q[lid].vpl_un.vpl;
3053
3054 VPL_LOCK(&lq->vpl_lock);
3055
3056 vm_page_check_pageable_safe(m);
3057 vm_page_queue_enter(&lq->vpl_queue, m,
3058 vm_page_t, vmp_pageq);
3059 m->vmp_q_state = VM_PAGE_ON_ACTIVE_LOCAL_Q;
3060 m->vmp_local_id = lid;
3061 lq->vpl_count++;
3062
3063 if (object->internal)
3064 lq->vpl_internal_count++;
3065 else
3066 lq->vpl_external_count++;
3067
3068 VPL_UNLOCK(&lq->vpl_lock);
3069
3070 if (lq->vpl_count > vm_page_local_q_soft_limit)
3071 {
3072 /*
3073 * we're beyond the soft limit
3074 * for the local queue...
3075 * vm_page_reactivate_local will
3076 * 'try' to take the global page
3077 * queue lock... if it can't,
3078 * that's ok... we'll let the
3079 * queue continue to grow up
3080 * to the hard limit... at that
3081 * point we'll wait for the
3082 * lock... once we've got the
3083 * lock, we'll transfer all of
3084 * the pages from the local
3085 * queue to the global active
3086 * queue
3087 */
3088 vm_page_reactivate_local(lid, FALSE, FALSE);
3089 }
3090 } else {
3091
3092 __VM_PAGE_LOCKSPIN_QUEUES_IF_NEEDED();
3093
3094 /*
3095 * test again now that we hold the
3096 * page queue lock
3097 */
3098 if (!VM_PAGE_WIRED(m)) {
3099 if (m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3100 vm_page_queues_remove(m, FALSE);
3101
3102 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3103 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_fault_reactivated, 1);
3104 }
3105
3106 if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m) ||
3107 no_cache) {
3108 /*
3109 * If this is a no_cache mapping
3110 * and the page has never been
3111 * mapped before or was
3112 * previously a no_cache page,
3113 * then we want to leave pages
3114 * in the speculative state so
3115 * that they can be readily
3116 * recycled if free memory runs
3117 * low. Otherwise the page is
3118 * activated as normal.
3119 */
3120
3121 if (no_cache &&
3122 (!previously_pmapped ||
3123 m->vmp_no_cache)) {
3124 m->vmp_no_cache = TRUE;
3125
3126 if (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)
3127 vm_page_speculate(m, FALSE);
3128
3129 } else if ( !VM_PAGE_ACTIVE_OR_INACTIVE(m)) {
3130 vm_page_activate(m);
3131 }
3132 }
3133 }
3134 /* we keep the page queues lock, if we need it later */
3135 }
3136 }
3137 }
3138 /* we're done with the page queues lock, if we ever took it */
3139 __VM_PAGE_UNLOCK_QUEUES_IF_NEEDED();
3140
3141
3142 /* If we have a KERN_SUCCESS from the previous checks, we either have
3143 * a good page, or a tainted page that has been accepted by the process.
3144 * In both cases the page will be entered into the pmap.
3145 * If the page is writeable, we need to disconnect it from other pmaps
3146 * now so those processes can take note.
3147 */
3148 if (kr == KERN_SUCCESS) {
3149 /*
3150 * NOTE: we may only hold the vm_object lock SHARED
3151 * at this point, so we need the phys_page lock to
3152 * properly serialize updating the pmapped and
3153 * xpmapped bits
3154 */
3155 if ((prot & VM_PROT_EXECUTE) && !m->vmp_xpmapped) {
3156 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3157
3158 pmap_lock_phys_page(phys_page);
3159 /*
3160 * go ahead and take the opportunity
3161 * to set 'pmapped' here so that we don't
3162 * need to grab this lock a 2nd time
3163 * just below
3164 */
3165 m->vmp_pmapped = TRUE;
3166
3167 if (!m->vmp_xpmapped) {
3168
3169 m->vmp_xpmapped = TRUE;
3170
3171 pmap_unlock_phys_page(phys_page);
3172
3173 if (!object->internal)
3174 OSAddAtomic(1, &vm_page_xpmapped_external_count);
3175
3176#if defined(__arm__) || defined(__arm64__)
3177 pmap_sync_page_data_phys(phys_page);
3178#else
3179 if (object->internal &&
3180 object->pager != NULL) {
3181 /*
3182 * This page could have been
3183 * uncompressed by the
3184 * compressor pager and its
3185 * contents might be only in
3186 * the data cache.
3187 * Since it's being mapped for
3188 * "execute" for the first time,
3189 * make sure the icache is in
3190 * sync.
3191 */
3192 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3193 pmap_sync_page_data_phys(phys_page);
3194 }
3195#endif
3196 } else
3197 pmap_unlock_phys_page(phys_page);
3198 } else {
3199 if (m->vmp_pmapped == FALSE) {
3200 ppnum_t phys_page = VM_PAGE_GET_PHYS_PAGE(m);
3201
3202 pmap_lock_phys_page(phys_page);
3203 m->vmp_pmapped = TRUE;
3204 pmap_unlock_phys_page(phys_page);
3205 }
3206 }
3207
3208 if (fault_type & VM_PROT_WRITE) {
3209
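		/*
		 * first write-mapping of this page: mark it wpmapped and,
		 * for file-backed objects, charge a deferred logical write
		 * against the current task
		 */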
3210 if (m->vmp_wpmapped == FALSE) {
3211 vm_object_lock_assert_exclusive(object);
3212 if (!object->internal && object->pager) {
3213 task_update_logical_writes(current_task(), PAGE_SIZE, TASK_WRITE_DEFERRED, vnode_pager_lookup_vnode(object->pager));
3214 }
3215 m->vmp_wpmapped = TRUE;
3216 }
3217 if (must_disconnect) {
3218 /*
3219 * We can only get here
3220 * because of the CSE logic
3221 */
3222 assert(cs_enforcement_enabled);
3223 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3224 /*
3225 * If we are faulting for a write, we can clear
3226 * the execute bit - that will ensure the page is
3227 * checked again before being executable, which
3228 * protects against a map switch.
3229 * This only happens the first time the page
3230 * gets tainted, so we won't get stuck here
3231 * to make an already writeable page executable.
3232 */
3233 if (!cs_bypass){
3234 assert(!pmap_has_prot_policy(prot));
3235 prot &= ~VM_PROT_EXECUTE;
3236 }
3237 }
3238 }
3239 assert(VM_PAGE_OBJECT(m) == object);
3240
3241#if VM_OBJECT_ACCESS_TRACKING
3242 if (object->access_tracking) {
3243 DTRACE_VM2(access_tracking, vm_map_offset_t, vaddr, int, fault_type);
3244 if (fault_type & VM_PROT_WRITE) {
3245 object->access_tracking_writes++;
3246 vm_object_access_tracking_writes++;
3247 } else {
3248 object->access_tracking_reads++;
3249 vm_object_access_tracking_reads++;
3250 }
3251 }
3252#endif /* VM_OBJECT_ACCESS_TRACKING */
3253
3254#if PMAP_CS
3255 /*
3256 * If CS enforcement is on, we don't ask for an executable page if the
3257 * fault does not call for execution, because that can fail in
3258 * situations where the caller only actually wanted read access.
3259 * However, it may be better to instead retry without execute on
3260 * failure, or pass a flag into pmap_enter to do the right thing.
3261 */
3262 // TODO: <rdar://problem/30997388> maybe do something better than masking out VM_PROT_EXECUTE on non-execute faults
3263 if (pmap_cs_enforced(pmap) && !(caller_prot & VM_PROT_EXECUTE)) {
3264 prot &= ~VM_PROT_EXECUTE;
3265 }
3266#endif
3267
3268 /* Prevent a deadlock by not
3269 * holding the object lock if we need to wait for a page in
3270 * pmap_enter() - <rdar://problem/7138958> */
3271 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type, 0,
3272 wired,
3273 pmap_options | PMAP_OPTIONS_NOWAIT,
3274 pe_result);
3275#if __x86_64__
3276 if (pe_result == KERN_INVALID_ARGUMENT &&
3277 pmap == PMAP_NULL &&
3278 wired) {
3279 /*
3280 * Wiring a page in a pmap-less VM map:
3281 * VMware's "vmmon" kernel extension does this
3282 * to grab pages.
3283 * Let it proceed even though the PMAP_ENTER() failed.
3284 */
3285 pe_result = KERN_SUCCESS;
3286 }
3287#endif /* __x86_64__ */
3288
3289 if(pe_result == KERN_RESOURCE_SHORTAGE) {
3290
3291 if (need_retry) {
3292 /*
3293 * this will be non-null in the case where we hold the lock
3294 * on the top-object in this chain... we can't just drop
3295 * the lock on the object we're inserting the page into
3296 * and recall the PMAP_ENTER since we can still cause
3297 * a deadlock if one of the critical paths tries to
3298 * acquire the lock on the top-object and we're blocked
3299 * in PMAP_ENTER waiting for memory... our only recourse
3300 * is to deal with it at a higher level where we can
3301 * drop both locks.
3302 */
3303 *need_retry = TRUE;
3304 vm_pmap_enter_retried++;
3305 goto after_the_pmap_enter;
3306 }
3307 /* The nonblocking version of pmap_enter did not succeed,
3308 * and we don't need to drop other locks and retry
3309 * at the level above us, so
3310 * use the blocking version instead. This requires marking
3311 * the page busy and unlocking the object */
3312 boolean_t was_busy = m->vmp_busy;
3313
3314 vm_object_lock_assert_exclusive(object);
3315
3316 m->vmp_busy = TRUE;
3317 vm_object_unlock(object);
3318
3319 PMAP_ENTER_OPTIONS(pmap, vaddr, m, prot, fault_type,
3320 0, wired,
3321 pmap_options, pe_result);
3322
3323 assert(VM_PAGE_OBJECT(m) == object);
3324
3325 /* Take the object lock again. */
3326 vm_object_lock(object);
3327
3328 /* If the page was busy, someone else will wake it up.
3329 * Otherwise, we have to do it now. */
3330 assert(m->vmp_busy);
3331 if(!was_busy) {
3332 PAGE_WAKEUP_DONE(m);
3333 }
3334 vm_pmap_enter_blocked++;
3335 }
3336
3337 kr = pe_result;
3338 }
3339
3340after_the_pmap_enter:
3341 return kr;
3342}
3343
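/*
 * vm_pre_fault:
 * fault in the page at 'vaddr' in the current map for read access,
 * but only if no physical page is mapped there yet
 */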
3344void
3345vm_pre_fault(vm_map_offset_t vaddr)
3346{
3347 if (pmap_find_phys(current_map()->pmap, vaddr) == 0) {
3348
3349 vm_fault(current_map(), /* map */
3350 vaddr, /* vaddr */
3351 VM_PROT_READ, /* fault_type */
3352 FALSE, /* change_wiring */
3353 VM_KERN_MEMORY_NONE, /* tag - not wiring */
3354 THREAD_UNINT, /* interruptible */
3355 NULL, /* caller_pmap */
3356 0 /* caller_pmap_addr */);
3357 }
3358}
3359
3360
3361/*
3362 * Routine: vm_fault
3363 * Purpose:
3364 * Handle page faults, including pseudo-faults
3365 * used to change the wiring status of pages.
3366 * Returns:
3367 * Explicit continuations have been removed.
3368 * Implementation:
3369 * vm_fault and vm_fault_page save mucho state
3370 * in the moral equivalent of a closure. The state
3371 * structure is allocated when first entering vm_fault
3372 * and deallocated when leaving vm_fault.
3373 */
3374
3375extern int _map_enter_debug;
3376extern uint64_t get_current_unique_pid(void);
3377
3378unsigned long vm_fault_collapse_total = 0;
3379unsigned long vm_fault_collapse_skipped = 0;
3380
3381
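/*
 * vm_fault_external:
 * same as vm_fault(), except the wiring tag is derived from the
 * caller's backtrace via vm_tag_bt()
 */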
3382kern_return_t
3383vm_fault_external(
3384 vm_map_t map,
3385 vm_map_offset_t vaddr,
3386 vm_prot_t fault_type,
3387 boolean_t change_wiring,
3388 int interruptible,
3389 pmap_t caller_pmap,
3390 vm_map_offset_t caller_pmap_addr)
3391{
3392 return vm_fault_internal(map, vaddr, fault_type, change_wiring, vm_tag_bt(),
3393 interruptible, caller_pmap, caller_pmap_addr,
3394 NULL);
3395}
3396
3397kern_return_t
3398vm_fault(
3399 vm_map_t map,
3400 vm_map_offset_t vaddr,
3401 vm_prot_t fault_type,
3402 boolean_t change_wiring,
3403 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3404 int interruptible,
3405 pmap_t caller_pmap,
3406 vm_map_offset_t caller_pmap_addr)
3407{
3408 return vm_fault_internal(map, vaddr, fault_type, change_wiring, wire_tag,
3409 interruptible, caller_pmap, caller_pmap_addr,
3410 NULL);
3411}
3412
3413kern_return_t
3414vm_fault_internal(
3415 vm_map_t map,
3416 vm_map_offset_t vaddr,
3417 vm_prot_t caller_prot,
3418 boolean_t change_wiring,
3419 vm_tag_t wire_tag, /* if wiring must pass tag != VM_KERN_MEMORY_NONE */
3420 int interruptible,
3421 pmap_t caller_pmap,
3422 vm_map_offset_t caller_pmap_addr,
3423 ppnum_t *physpage_p)
3424{
3425 vm_map_version_t version; /* Map version for verification */
3426 boolean_t wired; /* Should mapping be wired down? */
3427 vm_object_t object; /* Top-level object */
3428 vm_object_offset_t offset; /* Top-level offset */
3429 vm_prot_t prot; /* Protection for mapping */
3430 vm_object_t old_copy_object; /* Saved copy object */
3431 vm_page_t result_page; /* Result of vm_fault_page */
3432 vm_page_t top_page; /* Placeholder page */
3433 kern_return_t kr;
3434
3435 vm_page_t m; /* Fast access to result_page */
3436 kern_return_t error_code;
3437 vm_object_t cur_object;
3438 vm_object_t m_object = NULL;
3439 vm_object_offset_t cur_offset;
3440 vm_page_t cur_m;
3441 vm_object_t new_object;
3442 int type_of_fault;
3443 pmap_t pmap;
3444 wait_interrupt_t interruptible_state;
3445 vm_map_t real_map = map;
3446 vm_map_t original_map = map;
3447 boolean_t object_locks_dropped = FALSE;
3448 vm_prot_t fault_type;
3449 vm_prot_t original_fault_type;
3450 struct vm_object_fault_info fault_info = {};
3451 boolean_t need_collapse = FALSE;
3452 boolean_t need_retry = FALSE;
3453 boolean_t *need_retry_ptr = NULL;
3454 int object_lock_type = 0;
3455 int cur_object_lock_type;
3456 vm_object_t top_object = VM_OBJECT_NULL;
3457 vm_object_t written_on_object = VM_OBJECT_NULL;
3458 memory_object_t written_on_pager = NULL;
3459 vm_object_offset_t written_on_offset = 0;
3460 int throttle_delay;
3461 int compressed_count_delta;
3462 int grab_options;
3463 vm_map_offset_t trace_vaddr;
3464 vm_map_offset_t trace_real_vaddr;
3465#if DEVELOPMENT || DEBUG
3466 vm_map_offset_t real_vaddr;
3467
3468 real_vaddr = vaddr;
3469#endif /* DEVELOPMENT || DEBUG */
3470 trace_real_vaddr = vaddr;
3471 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
3472
3473 if (map == kernel_map) {
3474 trace_vaddr = VM_KERNEL_ADDRHIDE(vaddr);
3475 trace_real_vaddr = VM_KERNEL_ADDRHIDE(trace_real_vaddr);
3476 } else {
3477 trace_vaddr = vaddr;
3478 }
3479
3480 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3481 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_START,
3482 ((uint64_t)trace_vaddr >> 32),
3483 trace_vaddr,
3484 (map == kernel_map),
3485 0,
3486 0);
3487
3488 if (get_preemption_level() != 0) {
3489 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
3490 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
3491 ((uint64_t)trace_vaddr >> 32),
3492 trace_vaddr,
3493 KERN_FAILURE,
3494 0,
3495 0);
3496
3497 return (KERN_FAILURE);
3498 }
3499
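	/*
	 * record the start time of faults taken by realtime threads
	 */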
3500 thread_t cthread = current_thread();
3501 boolean_t rtfault = (cthread->sched_mode == TH_MODE_REALTIME);
3502 uint64_t fstart = 0;
3503
3504 if (rtfault) {
3505 fstart = mach_continuous_time();
3506 }
3507
3508 interruptible_state = thread_interrupt_level(interruptible);
3509
3510 fault_type = (change_wiring ? VM_PROT_NONE : caller_prot);
3511
3512 VM_STAT_INCR(faults);
3513 current_task()->faults++;
3514 original_fault_type = fault_type;
3515
3516 if (fault_type & VM_PROT_WRITE)
3517 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3518 else
3519 object_lock_type = OBJECT_LOCK_SHARED;
3520
3521 cur_object_lock_type = OBJECT_LOCK_SHARED;
3522
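	/*
	 * a kernel-map write fault that lands inside the compressor
	 * map is never legitimate... panic with the bounds to aid
	 * diagnosis
	 */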
3523 if ((map == kernel_map) && (caller_prot & VM_PROT_WRITE)) {
3524 if (compressor_map) {
3525 if ((vaddr >= vm_map_min(compressor_map)) && (vaddr < vm_map_max(compressor_map))) {
3526 panic("Write fault on compressor map, va: %p type: %u bounds: %p->%p", (void *) vaddr, caller_prot, (void *) vm_map_min(compressor_map), (void *) vm_map_max(compressor_map));
3527
3528 }
3529 }
3530 }
3531RetryFault:
3532 assert(written_on_object == VM_OBJECT_NULL);
3533
3534 /*
3535 * assume we will hit a page in the cache;
3536 * otherwise, explicitly override with
3537 * the real fault type once we determine it
3538 */
3539 type_of_fault = DBG_CACHE_HIT_FAULT;
3540
3541 /*
3542 * Find the backing store object and offset into
3543 * it to begin the search.
3544 */
3545 fault_type = original_fault_type;
3546 map = original_map;
3547 vm_map_lock_read(map);
3548
3549 kr = vm_map_lookup_locked(&map, vaddr, fault_type,
3550 object_lock_type, &version,
3551 &object, &offset, &prot, &wired,
3552 &fault_info,
3553 &real_map);
3554
3555 if (kr != KERN_SUCCESS) {
3556 vm_map_unlock_read(map);
3557 goto done;
3558 }
3559 pmap = real_map->pmap;
3560 fault_info.interruptible = interruptible;
3561 fault_info.stealth = FALSE;
3562 fault_info.io_sync = FALSE;
3563 fault_info.mark_zf_absent = FALSE;
3564 fault_info.batch_pmap_op = FALSE;
3565
3566 /*
3567 * If the page is wired, we must fault for the current protection
3568 * value, to avoid further faults.
3569 */
3570 if (wired) {
3571 fault_type = prot | VM_PROT_WRITE;
3572 /*
3573 * since we're treating this fault as a 'write'
3574 * we must hold the top object lock exclusively
3575 */
3576 if (object_lock_type == OBJECT_LOCK_SHARED) {
3577
3578 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3579
3580 if (vm_object_lock_upgrade(object) == FALSE) {
3581 /*
3582 * couldn't upgrade, so explicitly
3583 * take the lock exclusively
3584 */
3585 vm_object_lock(object);
3586 }
3587 }
3588 }
3589
3590#if VM_FAULT_CLASSIFY
3591 /*
3592 * Temporary data gathering code
3593 */
3594 vm_fault_classify(object, offset, fault_type);
3595#endif
3596 /*
3597 * Fast fault code. The basic idea is to do as much as
3598 * possible while holding the map lock and object locks.
3599 * Busy pages are not used until the object lock has to
3600 * be dropped to do something (copy, zero fill, pmap enter).
3601 * Similarly, paging references aren't acquired until that
3602 * point, and object references aren't used.
3603 *
3604 * If we can figure out what to do
3605 * (zero fill, copy on write, pmap enter) while holding
3606 * the locks, then it gets done. Otherwise, we give up,
3607 * and use the original fault path (which doesn't hold
3608 * the map lock, and relies on busy pages).
3609 * The give up cases include:
3610 * - Have to talk to pager.
3611 * - Page is busy, absent or in error.
3612 * - Pager has locked out desired access.
3613 * - Fault needs to be restarted.
3614 * - Have to push page into copy object.
3615 *
3616 * The code is an infinite loop that moves one level down
3617 * the shadow chain each time. cur_object and cur_offset
3618 * refer to the current object being examined. object and offset
3619 * are the original object from the map. The loop is at the
3620 * top level if and only if object and cur_object are the same.
3621 *
3622 * Invariants: Map lock is held throughout. Lock is held on
3623 * original object and cur_object (if different) when
3624 * continuing or exiting loop.
3625 *
3626 */
3627
3628#if defined(__arm64__)
3629 /*
3630 * Fail if reading an execute-only page in a
3631 * pmap that enforces execute-only protection.
3632 */
3633 if (fault_type == VM_PROT_READ &&
3634 (prot & VM_PROT_EXECUTE) &&
3635 !(prot & VM_PROT_READ) &&
3636 pmap_enforces_execute_only(pmap)) {
3637 vm_object_unlock(object);
3638 vm_map_unlock_read(map);
3639 if (real_map != map) {
3640 vm_map_unlock(real_map);
3641 }
3642 kr = KERN_PROTECTION_FAILURE;
3643 goto done;
3644 }
3645#endif
3646
3647 /*
3648 * If this page is to be inserted in a copy delay object
3649 * for writing, and if the object has a copy, then the
3650 * copy delay strategy is implemented in the slow fault page.
3651 */
3652 if (object->copy_strategy == MEMORY_OBJECT_COPY_DELAY &&
3653 object->copy != VM_OBJECT_NULL && (fault_type & VM_PROT_WRITE))
3654 goto handle_copy_delay;
3655
3656 cur_object = object;
3657 cur_offset = offset;
3658
3659 grab_options = 0;
3660#if CONFIG_SECLUDED_MEMORY
3661 if (object->can_grab_secluded) {
3662 grab_options |= VM_PAGE_GRAB_SECLUDED;
3663 }
3664#endif /* CONFIG_SECLUDED_MEMORY */
3665
3666 while (TRUE) {
3667 if (!cur_object->pager_created &&
3668 cur_object->phys_contiguous) /* superpage */
3669 break;
3670
3671 if (cur_object->blocked_access) {
3672 /*
3673 * Access to this VM object has been blocked.
3674 * Let the slow path handle it.
3675 */
3676 break;
3677 }
3678
3679 m = vm_page_lookup(cur_object, cur_offset);
3680 m_object = NULL;
3681
3682 if (m != VM_PAGE_NULL) {
3683 m_object = cur_object;
3684
3685 if (m->vmp_busy) {
3686 wait_result_t result;
3687
3688 /*
3689 * in order to do the PAGE_ASSERT_WAIT, we must
3690				 * have the object that 'm' belongs to locked exclusively
3691 */
3692 if (object != cur_object) {
3693
3694 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3695
3696 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3697
3698 if (vm_object_lock_upgrade(cur_object) == FALSE) {
3699 /*
3700 * couldn't upgrade so go do a full retry
3701 * immediately since we can no longer be
3702 * certain about cur_object (since we
3703 * don't hold a reference on it)...
3704 * first drop the top object lock
3705 */
3706 vm_object_unlock(object);
3707
3708 vm_map_unlock_read(map);
3709 if (real_map != map)
3710 vm_map_unlock(real_map);
3711
3712 goto RetryFault;
3713 }
3714 }
3715 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3716
3717 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3718
3719 if (vm_object_lock_upgrade(object) == FALSE) {
3720 /*
3721						 * couldn't upgrade, so explicitly take the lock
3722 * exclusively and go relookup the page since we
3723 * will have dropped the object lock and
3724 * a different thread could have inserted
3725 * a page at this offset
3726 * no need for a full retry since we're
3727 * at the top level of the object chain
3728 */
3729 vm_object_lock(object);
3730
3731 continue;
3732 }
3733 }
3734 if ((m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) && m_object->internal) {
3735 /*
3736 * m->vmp_busy == TRUE and the object is locked exclusively
3737					 * if m->vmp_q_state is still VM_PAGE_ON_PAGEOUT_Q after we acquire the
3738 * queues lock, we are guaranteed that it is stable on
3739 * the pageout queue and therefore reclaimable
3740 *
3741 * NOTE: this is only true for the internal pageout queue
3742 * in the compressor world
3743 */
3744 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
3745
3746 vm_page_lock_queues();
3747
3748 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
3749 vm_pageout_throttle_up(m);
3750 vm_page_unlock_queues();
3751
3752 PAGE_WAKEUP_DONE(m);
3753 goto reclaimed_from_pageout;
3754 }
3755 vm_page_unlock_queues();
3756 }
3757 if (object != cur_object)
3758 vm_object_unlock(object);
3759
3760 vm_map_unlock_read(map);
3761 if (real_map != map)
3762 vm_map_unlock(real_map);
3763
3764 result = PAGE_ASSERT_WAIT(m, interruptible);
3765
3766 vm_object_unlock(cur_object);
3767
3768 if (result == THREAD_WAITING) {
3769 result = thread_block(THREAD_CONTINUE_NULL);
3770
3771 counter(c_vm_fault_page_block_busy_kernel++);
3772 }
3773 if (result == THREAD_AWAKENED || result == THREAD_RESTART)
3774 goto RetryFault;
3775
3776 kr = KERN_ABORTED;
3777 goto done;
3778 }
3779reclaimed_from_pageout:
3780 if (m->vmp_laundry) {
3781 if (object != cur_object) {
3782 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3783 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3784
3785 vm_object_unlock(object);
3786 vm_object_unlock(cur_object);
3787
3788 vm_map_unlock_read(map);
3789 if (real_map != map)
3790 vm_map_unlock(real_map);
3791
3792 goto RetryFault;
3793 }
3794
3795 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3796
3797 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3798
3799 if (vm_object_lock_upgrade(object) == FALSE) {
3800 /*
3801						 * couldn't upgrade, so explicitly take the lock
3802 * exclusively and go relookup the page since we
3803 * will have dropped the object lock and
3804 * a different thread could have inserted
3805 * a page at this offset
3806 * no need for a full retry since we're
3807 * at the top level of the object chain
3808 */
3809 vm_object_lock(object);
3810
3811 continue;
3812 }
3813 }
3814 vm_pageout_steal_laundry(m, FALSE);
3815 }
3816
3817 if (VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
3818 /*
3819 * Guard page: let the slow path deal with it
3820 */
3821 break;
3822 }
3823 if (m->vmp_unusual && (m->vmp_error || m->vmp_restart || m->vmp_private || m->vmp_absent)) {
3824 /*
3825 * Unusual case... let the slow path deal with it
3826 */
3827 break;
3828 }
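			/*
			 * If faulting on this object is defined to be an error
			 * (e.g. a purgeable object whose contents are no longer
			 * available), fail the fault rather than hand back a page.
			 */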
3829 if (VM_OBJECT_PURGEABLE_FAULT_ERROR(m_object)) {
3830 if (object != cur_object)
3831 vm_object_unlock(object);
3832 vm_map_unlock_read(map);
3833 if (real_map != map)
3834 vm_map_unlock(real_map);
3835 vm_object_unlock(cur_object);
3836 kr = KERN_MEMORY_ERROR;
3837 goto done;
3838 }
3839 assert(m_object == VM_PAGE_OBJECT(m));
3840
3841 if (VM_FAULT_NEED_CS_VALIDATION(map->pmap, m, m_object) ||
3842 (physpage_p != NULL && (prot & VM_PROT_WRITE))) {
3843upgrade_for_validation:
3844 /*
3845 * We might need to validate this page
3846 * against its code signature, so we
3847 * want to hold the VM object exclusively.
3848 */
3849 if (object != cur_object) {
3850 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
3851 vm_object_unlock(object);
3852 vm_object_unlock(cur_object);
3853
3854 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3855
3856 vm_map_unlock_read(map);
3857 if (real_map != map)
3858 vm_map_unlock(real_map);
3859
3860 goto RetryFault;
3861 }
3862
3863 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
3864
3865 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
3866
3867 if (vm_object_lock_upgrade(object) == FALSE) {
3868 /*
3869						 * couldn't upgrade, so explicitly take the lock
3870 * exclusively and go relookup the page since we
3871 * will have dropped the object lock and
3872 * a different thread could have inserted
3873 * a page at this offset
3874 * no need for a full retry since we're
3875 * at the top level of the object chain
3876 */
3877 vm_object_lock(object);
3878
3879 continue;
3880 }
3881 }
3882 }
3883 /*
3884			 * Two cases of map-in faults:
3885 * - At top level w/o copy object.
3886 * - Read fault anywhere.
3887 * --> must disallow write.
3888 */
3889
3890 if (object == cur_object && object->copy == VM_OBJECT_NULL) {
3891
3892 goto FastPmapEnter;
3893 }
3894
3895 if ((fault_type & VM_PROT_WRITE) == 0) {
3896 if (!pmap_has_prot_policy(prot)) {
3897 prot &= ~VM_PROT_WRITE;
3898 } else {
3899 /*
3900 * For a protection that the pmap cares
3901 * about, we must hand over the full
3902 * set of protections (so that the pmap
3903 * layer can apply any desired policy).
3904 * This means that cs_bypass must be
3905 * set, as this can force us to pass
3906 * RWX.
3907 */
3908 assert(fault_info.cs_bypass);
3909 }
3910
3911 if (object != cur_object) {
3912 /*
3913 * We still need to hold the top object
3914 * lock here to prevent a race between
3915 * a read fault (taking only "shared"
3916 * locks) and a write fault (taking
3917 * an "exclusive" lock on the top
3918					 * object).
3919 * Otherwise, as soon as we release the
3920 * top lock, the write fault could
3921 * proceed and actually complete before
3922 * the read fault, and the copied page's
3923 * translation could then be overwritten
3924 * by the read fault's translation for
3925 * the original page.
3926 *
3927 * Let's just record what the top object
3928 * is and we'll release it later.
3929 */
3930 top_object = object;
3931
3932 /*
3933 * switch to the object that has the new page
3934 */
3935 object = cur_object;
3936 object_lock_type = cur_object_lock_type;
3937 }
3938FastPmapEnter:
3939 assert(m_object == VM_PAGE_OBJECT(m));
3940
3941 /*
3942 * prepare for the pmap_enter...
3943 * object and map are both locked
3944 * m contains valid data
3945			 * object == VM_PAGE_OBJECT(m)
3946 * cur_object == NULL or it's been unlocked
3947 * no paging references on either object or cur_object
3948 */
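			/*
			 * Pass a need_retry pointer only when we're still holding
			 * the top object's lock or just a shared lock on "object":
			 * in that case vm_fault_enter() may not be able to complete
			 * the PMAP_ENTER and will ask us to drop our locks and
			 * retry (see the need_retry handling below).
			 */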
3949 if (top_object != VM_OBJECT_NULL || object_lock_type != OBJECT_LOCK_EXCLUSIVE)
3950 need_retry_ptr = &need_retry;
3951 else
3952 need_retry_ptr = NULL;
3953
3954 if (caller_pmap) {
3955 kr = vm_fault_enter(m,
3956 caller_pmap,
3957 caller_pmap_addr,
3958 prot,
3959 caller_prot,
3960 wired,
3961 change_wiring,
3962 wire_tag,
3963 &fault_info,
3964 need_retry_ptr,
3965 &type_of_fault);
3966 } else {
3967 kr = vm_fault_enter(m,
3968 pmap,
3969 vaddr,
3970 prot,
3971 caller_prot,
3972 wired,
3973 change_wiring,
3974 wire_tag,
3975 &fault_info,
3976 need_retry_ptr,
3977 &type_of_fault);
3978 }
3979#if DEVELOPMENT || DEBUG
3980 {
3981 int event_code = 0;
3982
3983 if (m_object->internal)
3984 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
3985 else if (m_object->object_is_shared_cache)
3986 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
3987 else
3988 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
3989
3990 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
3991
3992 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
3993 }
3994#endif
3995 if (kr == KERN_SUCCESS &&
3996 physpage_p != NULL) {
3997 /* for vm_map_wire_and_extract() */
3998 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
3999 if (prot & VM_PROT_WRITE) {
4000 vm_object_lock_assert_exclusive(m_object);
4001 m->vmp_dirty = TRUE;
4002 }
4003 }
4004
4005 if (top_object != VM_OBJECT_NULL) {
4006 /*
4007 * It's safe to drop the top object
4008 * now that we've done our
4009 * vm_fault_enter(). Any other fault
4010 * in progress for that virtual
4011 * address will either find our page
4012 * and translation or put in a new page
4013 * and translation.
4014 */
4015 vm_object_unlock(top_object);
4016 top_object = VM_OBJECT_NULL;
4017 }
4018
4019 if (need_collapse == TRUE)
4020 vm_object_collapse(object, offset, TRUE);
4021
4022 if (need_retry == FALSE &&
4023 (type_of_fault == DBG_PAGEIND_FAULT || type_of_fault == DBG_PAGEINV_FAULT || type_of_fault == DBG_CACHE_HIT_FAULT)) {
4024 /*
4025 * evaluate access pattern and update state
4026 * vm_fault_deactivate_behind depends on the
4027 * state being up to date
4028 */
4029 vm_fault_is_sequential(m_object, cur_offset, fault_info.behavior);
4030
4031 vm_fault_deactivate_behind(m_object, cur_offset, fault_info.behavior);
4032 }
4033 /*
4034 * That's it, clean up and return.
4035 */
4036 if (m->vmp_busy)
4037 PAGE_WAKEUP_DONE(m);
4038
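			/*
			 * Remember the pager and offset of a freshly written
			 * external (typically file-backed) page so that, once all
			 * locks are dropped, the 'done' path can notify the vnode
			 * pager that the file has been dirtied.
			 */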
4039 if (need_retry == FALSE && !m_object->internal && (fault_type & VM_PROT_WRITE)) {
4040
4041 vm_object_paging_begin(m_object);
4042
4043 assert(written_on_object == VM_OBJECT_NULL);
4044 written_on_object = m_object;
4045 written_on_pager = m_object->pager;
4046 written_on_offset = m_object->paging_offset + m->vmp_offset;
4047 }
4048 vm_object_unlock(object);
4049
4050 vm_map_unlock_read(map);
4051 if (real_map != map)
4052 vm_map_unlock(real_map);
4053
4054 if (need_retry == TRUE) {
4055 /*
4056 * vm_fault_enter couldn't complete the PMAP_ENTER...
4057 * at this point we don't hold any locks so it's safe
4058 * to ask the pmap layer to expand the page table to
4059 * accommodate this mapping... once expanded, we'll
4060 * re-drive the fault which should result in vm_fault_enter
4061 * being able to successfully enter the mapping this time around
4062 */
4063 (void)pmap_enter_options(
4064 pmap, vaddr, 0, 0, 0, 0, 0,
4065 PMAP_OPTIONS_NOENTER, NULL);
4066
4067 need_retry = FALSE;
4068 goto RetryFault;
4069 }
4070 goto done;
4071 }
4072 /*
4073 * COPY ON WRITE FAULT
4074 */
4075 assert(object_lock_type == OBJECT_LOCK_EXCLUSIVE);
4076
4077 /*
4078 * If objects match, then
4079 * object->copy must not be NULL (else control
4080		 * would be in the previous code block), and we
4081		 * have a potential push into the copy object
4082		 * which we can't cope with here.
4083 */
4084 if (cur_object == object) {
4085 /*
4086 * must take the slow path to
4087 * deal with the copy push
4088 */
4089 break;
4090 }
4091
4092 /*
4093 * This is now a shadow based copy on write
4094 * fault -- it requires a copy up the shadow
4095 * chain.
4096 */
4097 assert(m_object == VM_PAGE_OBJECT(m));
4098
4099 if ((cur_object_lock_type == OBJECT_LOCK_SHARED) &&
4100 VM_FAULT_NEED_CS_VALIDATION(NULL, m, m_object)) {
4101 goto upgrade_for_validation;
4102 }
4103
4104 /*
4105 * Allocate a page in the original top level
4106 * object. Give up if allocate fails. Also
4107 * need to remember current page, as it's the
4108 * source of the copy.
4109 *
4110 * at this point we hold locks on both
4111 * object and cur_object... no need to take
4112 * paging refs or mark pages BUSY since
4113 * we don't drop either object lock until
4114 * the page has been copied and inserted
4115 */
4116 cur_m = m;
4117 m = vm_page_grab_options(grab_options);
4118 m_object = NULL;
4119
4120 if (m == VM_PAGE_NULL) {
4121 /*
4122 * no free page currently available...
4123 * must take the slow path
4124 */
4125 break;
4126 }
4127 /*
4128 * Now do the copy. Mark the source page busy...
4129 *
4130 * NOTE: This code holds the map lock across
4131 * the page copy.
4132 */
4133 vm_page_copy(cur_m, m);
4134 vm_page_insert(m, object, offset);
4135 m_object = object;
4136 SET_PAGE_DIRTY(m, FALSE);
4137
4138 /*
4139 * Now cope with the source page and object
4140 */
4141 if (object->ref_count > 1 && cur_m->vmp_pmapped)
4142 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(cur_m));
4143
4144 if (cur_m->vmp_clustered) {
4145 VM_PAGE_COUNT_AS_PAGEIN(cur_m);
4146 VM_PAGE_CONSUME_CLUSTERED(cur_m);
4147 vm_fault_is_sequential(cur_object, cur_offset, fault_info.behavior);
4148 }
4149 need_collapse = TRUE;
4150
4151 if (!cur_object->internal &&
4152 cur_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY) {
4153 /*
4154 * The object from which we've just
4155 * copied a page is most probably backed
4156 * by a vnode. We don't want to waste too
4157 * much time trying to collapse the VM objects
4158 * and create a bottleneck when several tasks
4159 * map the same file.
4160 */
4161 if (cur_object->copy == object) {
4162 /*
4163 * Shared mapping or no COW yet.
4164 * We can never collapse a copy
4165 * object into its backing object.
4166 */
4167 need_collapse = FALSE;
4168 } else if (cur_object->copy == object->shadow &&
4169 object->shadow->resident_page_count == 0) {
4170 /*
4171 * Shared mapping after a COW occurred.
4172 */
4173 need_collapse = FALSE;
4174 }
4175 }
4176 vm_object_unlock(cur_object);
4177
4178 if (need_collapse == FALSE)
4179 vm_fault_collapse_skipped++;
4180 vm_fault_collapse_total++;
4181
4182 type_of_fault = DBG_COW_FAULT;
4183 VM_STAT_INCR(cow_faults);
4184 DTRACE_VM2(cow_fault, int, 1, (uint64_t *), NULL);
4185 current_task()->cow_faults++;
4186
4187 goto FastPmapEnter;
4188
4189 } else {
4190 /*
4191 * No page at cur_object, cur_offset... m == NULL
4192 */
4193 if (cur_object->pager_created) {
4194 int compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
4195
4196 if (MUST_ASK_PAGER(cur_object, cur_offset, compressor_external_state) == TRUE) {
4197 int my_fault_type;
4198 int c_flags = C_DONT_BLOCK;
4199 boolean_t insert_cur_object = FALSE;
4200
4201 /*
4202 * May have to talk to a pager...
4203 * if so, take the slow path by
4204 * doing a 'break' from the while (TRUE) loop
4205 *
4206					 * compressor_external_state will only be set to VM_EXTERNAL_STATE_EXISTS
4207 * if the compressor is active and the page exists there
4208 */
4209 if (compressor_external_state != VM_EXTERNAL_STATE_EXISTS)
4210 break;
4211
4212 if (map == kernel_map || real_map == kernel_map) {
4213 /*
4214 * can't call into the compressor with the kernel_map
4215 * lock held, since the compressor may try to operate
4216 * on the kernel map in order to return an empty c_segment
4217 */
4218 break;
4219 }
4220 if (object != cur_object) {
4221 if (fault_type & VM_PROT_WRITE)
4222 c_flags |= C_KEEP;
4223 else
4224 insert_cur_object = TRUE;
4225 }
4226 if (insert_cur_object == TRUE) {
4227
4228 if (cur_object_lock_type == OBJECT_LOCK_SHARED) {
4229
4230 cur_object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4231
4232 if (vm_object_lock_upgrade(cur_object) == FALSE) {
4233 /*
4234 * couldn't upgrade so go do a full retry
4235 * immediately since we can no longer be
4236 * certain about cur_object (since we
4237 * don't hold a reference on it)...
4238 * first drop the top object lock
4239 */
4240 vm_object_unlock(object);
4241
4242 vm_map_unlock_read(map);
4243 if (real_map != map)
4244 vm_map_unlock(real_map);
4245
4246 goto RetryFault;
4247 }
4248 }
4249 } else if (object_lock_type == OBJECT_LOCK_SHARED) {
4250
4251 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4252
4253 if (object != cur_object) {
4254 /*
4255 * we can't go for the upgrade on the top
4256 * lock since the upgrade may block waiting
4257 * for readers to drain... since we hold
4258 * cur_object locked at this point, waiting
4259 * for the readers to drain would represent
4260 * a lock order inversion since the lock order
4261 * for objects is the reference order in the
4262							 * shadow chain
4263 */
4264 vm_object_unlock(object);
4265 vm_object_unlock(cur_object);
4266
4267 vm_map_unlock_read(map);
4268 if (real_map != map)
4269 vm_map_unlock(real_map);
4270
4271 goto RetryFault;
4272 }
4273 if (vm_object_lock_upgrade(object) == FALSE) {
4274 /*
4275							 * couldn't upgrade, so explicitly take the lock
4276 * exclusively and go relookup the page since we
4277 * will have dropped the object lock and
4278 * a different thread could have inserted
4279 * a page at this offset
4280 * no need for a full retry since we're
4281 * at the top level of the object chain
4282 */
4283 vm_object_lock(object);
4284
4285 continue;
4286 }
4287 }
4288 m = vm_page_grab_options(grab_options);
4289 m_object = NULL;
4290
4291 if (m == VM_PAGE_NULL) {
4292 /*
4293 * no free page currently available...
4294 * must take the slow path
4295 */
4296 break;
4297 }
4298
4299 /*
4300 * The object is and remains locked
4301 * so no need to take a
4302 * "paging_in_progress" reference.
4303 */
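					/*
					 * Note whether "cur_object", whose compressed
					 * page count is about to change, is locked
					 * shared or exclusive, so that
					 * vm_compressor_pager_count() can update the
					 * accounting accordingly.
					 */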
4304 boolean_t shared_lock;
4305 if ((object == cur_object &&
4306 object_lock_type == OBJECT_LOCK_EXCLUSIVE) ||
4307 (object != cur_object &&
4308 cur_object_lock_type == OBJECT_LOCK_EXCLUSIVE)) {
4309 shared_lock = FALSE;
4310 } else {
4311 shared_lock = TRUE;
4312 }
4313
4314 kr = vm_compressor_pager_get(
4315 cur_object->pager,
4316 (cur_offset +
4317 cur_object->paging_offset),
4318 VM_PAGE_GET_PHYS_PAGE(m),
4319 &my_fault_type,
4320 c_flags,
4321 &compressed_count_delta);
4322
4323 vm_compressor_pager_count(
4324 cur_object->pager,
4325 compressed_count_delta,
4326 shared_lock,
4327 cur_object);
4328
4329 if (kr != KERN_SUCCESS) {
4330 vm_page_release(m, FALSE);
4331 m = VM_PAGE_NULL;
4332 break;
4333 }
4334 m->vmp_dirty = TRUE;
4335
4336 /*
4337 * If the object is purgeable, its
4338 * owner's purgeable ledgers will be
4339 * updated in vm_page_insert() but the
4340 * page was also accounted for in a
4341 * "compressed purgeable" ledger, so
4342 * update that now.
4343 */
4344 if (object != cur_object &&
4345 !insert_cur_object) {
4346 /*
4347 * We're not going to insert
4348 * the decompressed page into
4349 * the object it came from.
4350 *
4351 * We're dealing with a
4352 * copy-on-write fault on
4353 * "object".
4354 * We're going to decompress
4355 * the page directly into the
4356 * target "object" while
4357						 * keeping the compressed
4358 * page for "cur_object", so
4359 * no ledger update in that
4360 * case.
4361 */
4362 } else if (((cur_object->purgable ==
4363 VM_PURGABLE_DENY) &&
4364 (!cur_object->vo_ledger_tag)) ||
4365 (cur_object->vo_owner ==
4366 NULL)) {
4367 /*
4368 * "cur_object" is not purgeable
4369						 * and is not ledger-tagged, or
4370 * there's no owner for it,
4371 * so no owner's ledgers to
4372 * update.
4373 */
4374 } else {
4375 /*
4376 * One less compressed
4377 * purgeable/tagged page for
4378 * cur_object's owner.
4379 */
4380 vm_object_owner_compressed_update(
4381 cur_object,
4382 -1);
4383 }
4384
4385 if (insert_cur_object) {
4386 vm_page_insert(m, cur_object, cur_offset);
4387 m_object = cur_object;
4388 } else {
4389 vm_page_insert(m, object, offset);
4390 m_object = object;
4391 }
4392
4393 if ((m_object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_USE_DEFAULT) {
4394 /*
4395 * If the page is not cacheable,
4396 * we can't let its contents
4397 * linger in the data cache
4398 * after the decompression.
4399 */
4400 pmap_sync_page_attributes_phys(VM_PAGE_GET_PHYS_PAGE(m));
4401 }
4402
4403 type_of_fault = my_fault_type;
4404
4405 VM_STAT_INCR(decompressions);
4406
4407 if (cur_object != object) {
4408 if (insert_cur_object) {
4409 top_object = object;
4410 /*
4411 * switch to the object that has the new page
4412 */
4413 object = cur_object;
4414 object_lock_type = cur_object_lock_type;
4415 } else {
4416 vm_object_unlock(cur_object);
4417 cur_object = object;
4418 }
4419 }
4420 goto FastPmapEnter;
4421 }
4422 /*
4423 * existence map present and indicates
4424 * that the pager doesn't have this page
4425 */
4426 }
4427 if (cur_object->shadow == VM_OBJECT_NULL) {
4428 /*
4429 * Zero fill fault. Page gets
4430 * inserted into the original object.
4431 */
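				/*
				 * Refuse to zero fill if the shadow chain has been
				 * severed, the object has hit a purgeable error, or
				 * this is one of the kernel-managed objects that a
				 * fault must never populate (the slow path makes the
				 * same check below).
				 */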
4432 if (cur_object->shadow_severed ||
4433 VM_OBJECT_PURGEABLE_FAULT_ERROR(cur_object) ||
4434 cur_object == compressor_object ||
4435 cur_object == kernel_object ||
4436 cur_object == vm_submap_object) {
4437 if (object != cur_object)
4438 vm_object_unlock(cur_object);
4439 vm_object_unlock(object);
4440
4441 vm_map_unlock_read(map);
4442 if (real_map != map)
4443 vm_map_unlock(real_map);
4444
4445 kr = KERN_MEMORY_ERROR;
4446 goto done;
4447 }
4448 if (cur_object != object) {
4449 vm_object_unlock(cur_object);
4450
4451 cur_object = object;
4452 }
4453 if (object_lock_type == OBJECT_LOCK_SHARED) {
4454
4455 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4456
4457 if (vm_object_lock_upgrade(object) == FALSE) {
4458 /*
4459 * couldn't upgrade so do a full retry on the fault
4460 * since we dropped the object lock which
4461 * could allow another thread to insert
4462 * a page at this offset
4463 */
4464 vm_map_unlock_read(map);
4465 if (real_map != map)
4466 vm_map_unlock(real_map);
4467
4468 goto RetryFault;
4469 }
4470 }
4471 m = vm_page_alloc(object, offset);
4472 m_object = NULL;
4473
4474 if (m == VM_PAGE_NULL) {
4475 /*
4476 * no free page currently available...
4477 * must take the slow path
4478 */
4479 break;
4480 }
4481 m_object = object;
4482
4483 /*
4484 * Now zero fill page...
4485 * the page is probably going to
4486 * be written soon, so don't bother
4487 * to clear the modified bit
4488 *
4489 * NOTE: This code holds the map
4490 * lock across the zero fill.
4491 */
4492 type_of_fault = vm_fault_zero_page(m, map->no_zero_fill);
4493
4494 goto FastPmapEnter;
4495 }
4496 /*
4497 * On to the next level in the shadow chain
4498 */
4499 cur_offset += cur_object->vo_shadow_offset;
4500 new_object = cur_object->shadow;
4501
4502 /*
4503 * take the new_object's lock with the indicated state
4504 */
4505 if (cur_object_lock_type == OBJECT_LOCK_SHARED)
4506 vm_object_lock_shared(new_object);
4507 else
4508 vm_object_lock(new_object);
4509
4510 if (cur_object != object)
4511 vm_object_unlock(cur_object);
4512
4513 cur_object = new_object;
4514
4515 continue;
4516 }
4517 }
4518 /*
4519 * Cleanup from fast fault failure. Drop any object
4520 * lock other than original and drop map lock.
4521 */
4522 if (object != cur_object)
4523 vm_object_unlock(cur_object);
4524
4525 /*
4526 * must own the object lock exclusively at this point
4527 */
4528 if (object_lock_type == OBJECT_LOCK_SHARED) {
4529 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
4530
4531 if (vm_object_lock_upgrade(object) == FALSE) {
4532 /*
4533			 * couldn't upgrade, so explicitly
4534 * take the lock exclusively
4535 * no need to retry the fault at this
4536 * point since "vm_fault_page" will
4537 * completely re-evaluate the state
4538 */
4539 vm_object_lock(object);
4540 }
4541 }
4542
4543handle_copy_delay:
4544 vm_map_unlock_read(map);
4545 if (real_map != map)
4546 vm_map_unlock(real_map);
4547
4548 if (__improbable(object == compressor_object ||
4549 object == kernel_object ||
4550 object == vm_submap_object)) {
4551 /*
4552 * These objects are explicitly managed and populated by the
4553 * kernel. The virtual ranges backed by these objects should
4554 * either have wired pages or "holes" that are not supposed to
4555 * be accessed at all until they get explicitly populated.
4556 * We should never have to resolve a fault on a mapping backed
4557 * by one of these VM objects and providing a zero-filled page
4558 * would be wrong here, so let's fail the fault and let the
4559 * caller crash or recover.
4560 */
4561 vm_object_unlock(object);
4562 kr = KERN_MEMORY_ERROR;
4563 goto done;
4564 }
4565
4566 assert(object != compressor_object);
4567 assert(object != kernel_object);
4568 assert(object != vm_submap_object);
4569
4570 /*
4571 * Make a reference to this object to
4572 * prevent its disposal while we are messing with
4573 * it. Once we have the reference, the map is free
4574 * to be diddled. Since objects reference their
4575 * shadows (and copies), they will stay around as well.
4576 */
4577 vm_object_reference_locked(object);
4578 vm_object_paging_begin(object);
4579
4580 XPR(XPR_VM_FAULT,"vm_fault -> vm_fault_page\n",0,0,0,0,0);
4581
4582 error_code = 0;
4583
4584 result_page = VM_PAGE_NULL;
4585 kr = vm_fault_page(object, offset, fault_type,
4586 (change_wiring && !wired),
4587 FALSE, /* page not looked up */
4588 &prot, &result_page, &top_page,
4589 &type_of_fault,
4590 &error_code, map->no_zero_fill,
4591 FALSE, &fault_info);
4592
4593 /*
4594 * if kr != VM_FAULT_SUCCESS, then the paging reference
4595 * has been dropped and the object unlocked... the ref_count
4596 * is still held
4597 *
4598 * if kr == VM_FAULT_SUCCESS, then the paging reference
4599 * is still held along with the ref_count on the original object
4600 *
4601 * the object is returned locked with a paging reference
4602 *
4603 * if top_page != NULL, then it's BUSY and the
4604 * object it belongs to has a paging reference
4605 * but is returned unlocked
4606 */
4607 if (kr != VM_FAULT_SUCCESS &&
4608 kr != VM_FAULT_SUCCESS_NO_VM_PAGE) {
4609 /*
4610 * we didn't succeed, lose the object reference immediately.
4611 */
4612 vm_object_deallocate(object);
4613
4614 /*
4615 * See why we failed, and take corrective action.
4616 */
4617 switch (kr) {
4618 case VM_FAULT_MEMORY_SHORTAGE:
4619 if (vm_page_wait((change_wiring) ?
4620 THREAD_UNINT :
4621 THREAD_ABORTSAFE))
4622 goto RetryFault;
4623 /*
4624 * fall thru
4625 */
4626 case VM_FAULT_INTERRUPTED:
4627 kr = KERN_ABORTED;
4628 goto done;
4629 case VM_FAULT_RETRY:
4630 goto RetryFault;
4631 case VM_FAULT_MEMORY_ERROR:
4632 if (error_code)
4633 kr = error_code;
4634 else
4635 kr = KERN_MEMORY_ERROR;
4636 goto done;
4637 default:
4638 panic("vm_fault: unexpected error 0x%x from "
4639 "vm_fault_page()\n", kr);
4640 }
4641 }
4642 m = result_page;
4643 m_object = NULL;
4644
4645 if (m != VM_PAGE_NULL) {
4646 m_object = VM_PAGE_OBJECT(m);
4647 assert((change_wiring && !wired) ?
4648 (top_page == VM_PAGE_NULL) :
4649 ((top_page == VM_PAGE_NULL) == (m_object == object)));
4650 }
4651
4652 /*
4653 * What to do with the resulting page from vm_fault_page
4654 * if it doesn't get entered into the physical map:
4655 */
4656#define RELEASE_PAGE(m) \
4657 MACRO_BEGIN \
4658 PAGE_WAKEUP_DONE(m); \
4659 if ( !VM_PAGE_PAGEABLE(m)) { \
4660 vm_page_lockspin_queues(); \
4661 if ( !VM_PAGE_PAGEABLE(m)) \
4662 vm_page_activate(m); \
4663 vm_page_unlock_queues(); \
4664 } \
4665 MACRO_END
4666
4667
4668 object_locks_dropped = FALSE;
4669 /*
4670 * We must verify that the maps have not changed
4671 * since our last lookup. vm_map_verify() needs the
4672 * map lock (shared) but we are holding object locks.
4673 * So we do a try_lock() first and, if that fails, we
4674 * drop the object locks and go in for the map lock again.
4675 */
4676 if (!vm_map_try_lock_read(original_map)) {
4677
4678 if (m != VM_PAGE_NULL) {
4679 old_copy_object = m_object->copy;
4680 vm_object_unlock(m_object);
4681 } else {
4682 old_copy_object = VM_OBJECT_NULL;
4683 vm_object_unlock(object);
4684 }
4685
4686 object_locks_dropped = TRUE;
4687
4688 vm_map_lock_read(original_map);
4689 }
4690
4691 if ((map != original_map) || !vm_map_verify(map, &version)) {
4692
4693 if (object_locks_dropped == FALSE) {
4694 if (m != VM_PAGE_NULL) {
4695 old_copy_object = m_object->copy;
4696 vm_object_unlock(m_object);
4697 } else {
4698 old_copy_object = VM_OBJECT_NULL;
4699 vm_object_unlock(object);
4700 }
4701
4702 object_locks_dropped = TRUE;
4703 }
4704
4705 /*
4706 * no object locks are held at this point
4707 */
4708 vm_object_t retry_object;
4709 vm_object_offset_t retry_offset;
4710 vm_prot_t retry_prot;
4711
4712 /*
4713 * To avoid trying to write_lock the map while another
4714 * thread has it read_locked (in vm_map_pageable), we
4715 * do not try for write permission. If the page is
4716 * still writable, we will get write permission. If it
4717 * is not, or has been marked needs_copy, we enter the
4718 * mapping without write permission, and will merely
4719 * take another fault.
4720 */
4721 map = original_map;
4722
4723 kr = vm_map_lookup_locked(&map, vaddr,
4724 fault_type & ~VM_PROT_WRITE,
4725 OBJECT_LOCK_EXCLUSIVE, &version,
4726 &retry_object, &retry_offset, &retry_prot,
4727 &wired,
4728 &fault_info,
4729 &real_map);
4730 pmap = real_map->pmap;
4731
4732 if (kr != KERN_SUCCESS) {
4733 vm_map_unlock_read(map);
4734
4735 if (m != VM_PAGE_NULL) {
4736 assert(VM_PAGE_OBJECT(m) == m_object);
4737
4738 /*
4739 * retake the lock so that
4740 * we can drop the paging reference
4741 * in vm_fault_cleanup and do the
4742 * PAGE_WAKEUP_DONE in RELEASE_PAGE
4743 */
4744 vm_object_lock(m_object);
4745
4746 RELEASE_PAGE(m);
4747
4748 vm_fault_cleanup(m_object, top_page);
4749 } else {
4750 /*
4751 * retake the lock so that
4752 * we can drop the paging reference
4753 * in vm_fault_cleanup
4754 */
4755 vm_object_lock(object);
4756
4757 vm_fault_cleanup(object, top_page);
4758 }
4759 vm_object_deallocate(object);
4760
4761 goto done;
4762 }
4763 vm_object_unlock(retry_object);
4764
4765 if ((retry_object != object) || (retry_offset != offset)) {
4766
4767 vm_map_unlock_read(map);
4768 if (real_map != map)
4769 vm_map_unlock(real_map);
4770
4771 if (m != VM_PAGE_NULL) {
4772 assert(VM_PAGE_OBJECT(m) == m_object);
4773
4774 /*
4775 * retake the lock so that
4776 * we can drop the paging reference
4777 * in vm_fault_cleanup and do the
4778 * PAGE_WAKEUP_DONE in RELEASE_PAGE
4779 */
4780 vm_object_lock(m_object);
4781
4782 RELEASE_PAGE(m);
4783
4784 vm_fault_cleanup(m_object, top_page);
4785 } else {
4786 /*
4787 * retake the lock so that
4788 * we can drop the paging reference
4789 * in vm_fault_cleanup
4790 */
4791 vm_object_lock(object);
4792
4793 vm_fault_cleanup(object, top_page);
4794 }
4795 vm_object_deallocate(object);
4796
4797 goto RetryFault;
4798 }
4799 /*
4800 * Check whether the protection has changed or the object
4801 * has been copied while we left the map unlocked.
4802 */
4803 if (pmap_has_prot_policy(retry_prot)) {
4804 /* If the pmap layer cares, pass the full set. */
4805 prot = retry_prot;
4806 } else {
4807 prot &= retry_prot;
4808 }
4809 }
4810
4811 if (object_locks_dropped == TRUE) {
4812 if (m != VM_PAGE_NULL) {
4813 vm_object_lock(m_object);
4814
4815 if (m_object->copy != old_copy_object) {
4816 /*
4817 * The copy object changed while the top-level object
4818 * was unlocked, so take away write permission.
4819 */
4820 assert(!pmap_has_prot_policy(prot));
4821 prot &= ~VM_PROT_WRITE;
4822 }
4823 } else
4824 vm_object_lock(object);
4825
4826 object_locks_dropped = FALSE;
4827 }
4828
4829 /*
4830 * If we want to wire down this page, but no longer have
4831 * adequate permissions, we must start all over.
4832 */
4833 if (wired && (fault_type != (prot | VM_PROT_WRITE))) {
4834
4835 vm_map_unlock_read(map);
4836 if (real_map != map)
4837 vm_map_unlock(real_map);
4838
4839 if (m != VM_PAGE_NULL) {
4840 assert(VM_PAGE_OBJECT(m) == m_object);
4841
4842 RELEASE_PAGE(m);
4843
4844 vm_fault_cleanup(m_object, top_page);
4845 } else
4846 vm_fault_cleanup(object, top_page);
4847
4848 vm_object_deallocate(object);
4849
4850 goto RetryFault;
4851 }
4852 if (m != VM_PAGE_NULL) {
4853 /*
4854 * Put this page into the physical map.
4855 * We had to do the unlock above because pmap_enter
4856 * may cause other faults. The page may be on
4857 * the pageout queues. If the pageout daemon comes
4858 * across the page, it will remove it from the queues.
4859 */
4860 if (caller_pmap) {
4861 kr = vm_fault_enter(m,
4862 caller_pmap,
4863 caller_pmap_addr,
4864 prot,
4865 caller_prot,
4866 wired,
4867 change_wiring,
4868 wire_tag,
4869 &fault_info,
4870 NULL,
4871 &type_of_fault);
4872 } else {
4873 kr = vm_fault_enter(m,
4874 pmap,
4875 vaddr,
4876 prot,
4877 caller_prot,
4878 wired,
4879 change_wiring,
4880 wire_tag,
4881 &fault_info,
4882 NULL,
4883 &type_of_fault);
4884 }
4885 assert(VM_PAGE_OBJECT(m) == m_object);
4886
4887#if DEVELOPMENT || DEBUG
4888 {
4889 int event_code = 0;
4890
4891 if (m_object->internal)
4892 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_INTERNAL));
4893 else if (m_object->object_is_shared_cache)
4894 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_SHAREDCACHE));
4895 else
4896 event_code = (MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_REAL_FAULT_ADDR_EXTERNAL));
4897
4898 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE, event_code, trace_real_vaddr, (fault_info.user_tag << 16) | (caller_prot << 8) | type_of_fault, m->vmp_offset, get_current_unique_pid(), 0);
4899
4900 DTRACE_VM6(real_fault, vm_map_offset_t, real_vaddr, vm_map_offset_t, m->vmp_offset, int, event_code, int, caller_prot, int, type_of_fault, int, fault_info.user_tag);
4901 }
4902#endif
4903 if (kr != KERN_SUCCESS) {
4904 /* abort this page fault */
4905 vm_map_unlock_read(map);
4906 if (real_map != map)
4907 vm_map_unlock(real_map);
4908 PAGE_WAKEUP_DONE(m);
4909 vm_fault_cleanup(m_object, top_page);
4910 vm_object_deallocate(object);
4911 goto done;
4912 }
4913 if (physpage_p != NULL) {
4914 /* for vm_map_wire_and_extract() */
4915 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
4916 if (prot & VM_PROT_WRITE) {
4917 vm_object_lock_assert_exclusive(m_object);
4918 m->vmp_dirty = TRUE;
4919 }
4920 }
4921 } else {
4922
4923 vm_map_entry_t entry;
4924 vm_map_offset_t laddr;
4925 vm_map_offset_t ldelta, hdelta;
4926
4927 /*
4928 * do a pmap block mapping from the physical address
4929 * in the object
4930 */
4931
4932 if (real_map != map)
4933 vm_map_unlock(real_map);
4934
4935 if (original_map != map) {
4936 vm_map_unlock_read(map);
4937 vm_map_lock_read(original_map);
4938 map = original_map;
4939 }
4940 real_map = map;
4941
4942 laddr = vaddr;
4943 hdelta = 0xFFFFF000;
4944 ldelta = 0xFFFFF000;
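		/*
		 * ldelta/hdelta start out at a large page-aligned cap and are
		 * clamped below to the distance from the faulting address to
		 * the start and end of the enclosing map entry; together they
		 * bound the size of the block mapping set up further down.
		 */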
4945
4946 while (vm_map_lookup_entry(map, laddr, &entry)) {
4947 if (ldelta > (laddr - entry->vme_start))
4948 ldelta = laddr - entry->vme_start;
4949 if (hdelta > (entry->vme_end - laddr))
4950 hdelta = entry->vme_end - laddr;
4951 if (entry->is_sub_map) {
4952
4953 laddr = ((laddr - entry->vme_start)
4954 + VME_OFFSET(entry));
4955 vm_map_lock_read(VME_SUBMAP(entry));
4956
4957 if (map != real_map)
4958 vm_map_unlock_read(map);
4959 if (entry->use_pmap) {
4960 vm_map_unlock_read(real_map);
4961 real_map = VME_SUBMAP(entry);
4962 }
4963 map = VME_SUBMAP(entry);
4964
4965 } else {
4966 break;
4967 }
4968 }
4969
4970 if (vm_map_lookup_entry(map, laddr, &entry) &&
4971 (VME_OBJECT(entry) != NULL) &&
4972 (VME_OBJECT(entry) == object)) {
4973 int superpage;
4974
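			/*
			 * Only treat this as a superpage mapping if the object
			 * is physically contiguous, mapped from offset 0, covers
			 * the whole map entry, and the entry is aligned to the
			 * object's size.
			 */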
4975 if (!object->pager_created &&
4976 object->phys_contiguous &&
4977 VME_OFFSET(entry) == 0 &&
4978 (entry->vme_end - entry->vme_start == object->vo_size) &&
4979 VM_MAP_PAGE_ALIGNED(entry->vme_start, (object->vo_size-1))) {
4980 superpage = VM_MEM_SUPERPAGE;
4981 } else {
4982 superpage = 0;
4983 }
4984
4985 if (superpage && physpage_p) {
4986 /* for vm_map_wire_and_extract() */
4987 *physpage_p = (ppnum_t)
4988 ((((vm_map_offset_t)
4989 object->vo_shadow_offset)
4990 + VME_OFFSET(entry)
4991 + (laddr - entry->vme_start))
4992 >> PAGE_SHIFT);
4993 }
4994
4995 if (caller_pmap) {
4996 /*
4997 * Set up a block mapped area
4998 */
4999 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5000 kr = pmap_map_block(caller_pmap,
5001 (addr64_t)(caller_pmap_addr - ldelta),
5002 (ppnum_t)((((vm_map_offset_t) (VME_OBJECT(entry)->vo_shadow_offset)) +
5003 VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5004 (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5005 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5006
5007 if (kr != KERN_SUCCESS) {
5008 goto cleanup;
5009 }
5010 } else {
5011 /*
5012 * Set up a block mapped area
5013 */
5014 assert((uint32_t)((ldelta + hdelta) >> PAGE_SHIFT) == ((ldelta + hdelta) >> PAGE_SHIFT));
5015 kr = pmap_map_block(real_map->pmap,
5016 (addr64_t)(vaddr - ldelta),
5017 (ppnum_t)((((vm_map_offset_t)(VME_OBJECT(entry)->vo_shadow_offset)) +
5018 VME_OFFSET(entry) + (laddr - entry->vme_start) - ldelta) >> PAGE_SHIFT),
5019 (uint32_t)((ldelta + hdelta) >> PAGE_SHIFT), prot,
5020 (VM_WIMG_MASK & (int)object->wimg_bits) | superpage, 0);
5021
5022 if (kr != KERN_SUCCESS) {
5023 goto cleanup;
5024 }
5025 }
5026 }
5027 }
5028
5029 /*
5030 * Success
5031 */
5032 kr = KERN_SUCCESS;
5033
5034 /*
5035 * TODO: could most of the done cases just use cleanup?
5036 */
5037cleanup:
5038 /*
5039 * Unlock everything, and return
5040 */
5041 vm_map_unlock_read(map);
5042 if (real_map != map)
5043 vm_map_unlock(real_map);
5044
5045 if (m != VM_PAGE_NULL) {
5046 assert(VM_PAGE_OBJECT(m) == m_object);
5047
5048 if (!m_object->internal && (fault_type & VM_PROT_WRITE)) {
5049
5050 vm_object_paging_begin(m_object);
5051
5052 assert(written_on_object == VM_OBJECT_NULL);
5053 written_on_object = m_object;
5054 written_on_pager = m_object->pager;
5055 written_on_offset = m_object->paging_offset + m->vmp_offset;
5056 }
5057 PAGE_WAKEUP_DONE(m);
5058
5059 vm_fault_cleanup(m_object, top_page);
5060 } else
5061 vm_fault_cleanup(object, top_page);
5062
5063 vm_object_deallocate(object);
5064
5065#undef RELEASE_PAGE
5066
5067done:
5068 thread_interrupt_level(interruptible_state);
5069
5070 /*
5071 * Only I/O throttle on faults which cause a pagein/swapin.
5072 */
5073 if ((type_of_fault == DBG_PAGEIND_FAULT) || (type_of_fault == DBG_PAGEINV_FAULT) || (type_of_fault == DBG_COMPRESSOR_SWAPIN_FAULT)) {
5074 throttle_lowpri_io(1);
5075 } else {
5076 if (kr == KERN_SUCCESS && type_of_fault != DBG_CACHE_HIT_FAULT && type_of_fault != DBG_GUARD_FAULT) {
5077
5078 if ((throttle_delay = vm_page_throttled(TRUE))) {
5079
5080 if (vm_debug_events) {
5081 if (type_of_fault == DBG_COMPRESSOR_FAULT)
5082 VM_DEBUG_EVENT(vmf_compressordelay, VMF_COMPRESSORDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5083 else if (type_of_fault == DBG_COW_FAULT)
5084 VM_DEBUG_EVENT(vmf_cowdelay, VMF_COWDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5085 else
5086 VM_DEBUG_EVENT(vmf_zfdelay, VMF_ZFDELAY, DBG_FUNC_NONE, throttle_delay, 0, 0, 0);
5087 }
5088 delay(throttle_delay);
5089 }
5090 }
5091 }
5092
5093 if (written_on_object) {
5094
5095 vnode_pager_dirtied(written_on_pager, written_on_offset, written_on_offset + PAGE_SIZE_64);
5096
5097 vm_object_lock(written_on_object);
5098 vm_object_paging_end(written_on_object);
5099 vm_object_unlock(written_on_object);
5100
5101 written_on_object = VM_OBJECT_NULL;
5102 }
5103
5104 if (rtfault) {
5105 vm_record_rtfault(cthread, fstart, trace_vaddr, type_of_fault);
5106 }
5107
5108 KERNEL_DEBUG_CONSTANT_IST(KDEBUG_TRACE,
5109 (MACHDBG_CODE(DBG_MACH_VM, 2)) | DBG_FUNC_END,
5110 ((uint64_t)trace_vaddr >> 32),
5111 trace_vaddr,
5112 kr,
5113 type_of_fault,
5114 0);
5115
5116 return (kr);
5117}
5118
5119/*
5120 * vm_fault_wire:
5121 *
5122 * Wire down a range of virtual addresses in a map.
5123 */
5124kern_return_t
5125vm_fault_wire(
5126 vm_map_t map,
5127 vm_map_entry_t entry,
5128 vm_prot_t prot,
5129 vm_tag_t wire_tag,
5130 pmap_t pmap,
5131 vm_map_offset_t pmap_addr,
5132 ppnum_t *physpage_p)
5133{
5134 vm_map_offset_t va;
5135 vm_map_offset_t end_addr = entry->vme_end;
5136 kern_return_t rc;
5137
5138 assert(entry->in_transition);
5139
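	/*
	 * Physically contiguous memory is wired by its nature, so there is
	 * nothing to simulate a fault for here (vm_fault_unwire() makes the
	 * matching assumption).
	 */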
5140 if ((VME_OBJECT(entry) != NULL) &&
5141 !entry->is_sub_map &&
5142 VME_OBJECT(entry)->phys_contiguous) {
5143 return KERN_SUCCESS;
5144 }
5145
5146 /*
5147 * Inform the physical mapping system that the
5148 * range of addresses may not fault, so that
5149 * page tables and such can be locked down as well.
5150 */
5151
5152 pmap_pageable(pmap, pmap_addr,
5153 pmap_addr + (end_addr - entry->vme_start), FALSE);
5154
5155 /*
5156 * We simulate a fault to get the page and enter it
5157 * in the physical map.
5158 */
5159
5160 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5161 rc = vm_fault_wire_fast(map, va, prot, wire_tag, entry, pmap,
5162 pmap_addr + (va - entry->vme_start),
5163 physpage_p);
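		/*
		 * The fast path only handles the common case (page resident,
		 * no copy object in the way, etc.); for anything else fall
		 * back to the full fault path.
		 */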
5164 if (rc != KERN_SUCCESS) {
5165 rc = vm_fault_internal(map, va, prot, TRUE, wire_tag,
5166 ((pmap == kernel_pmap)
5167 ? THREAD_UNINT
5168 : THREAD_ABORTSAFE),
5169 pmap,
5170 (pmap_addr +
5171 (va - entry->vme_start)),
5172 physpage_p);
5173 DTRACE_VM2(softlock, int, 1, (uint64_t *), NULL);
5174 }
5175
5176 if (rc != KERN_SUCCESS) {
5177 struct vm_map_entry tmp_entry = *entry;
5178
5179 /* unwire wired pages */
5180 tmp_entry.vme_end = va;
5181 vm_fault_unwire(map,
5182 &tmp_entry, FALSE, pmap, pmap_addr);
5183
5184 return rc;
5185 }
5186 }
5187 return KERN_SUCCESS;
5188}
5189
5190/*
5191 * vm_fault_unwire:
5192 *
5193 * Unwire a range of virtual addresses in a map.
5194 */
5195void
5196vm_fault_unwire(
5197 vm_map_t map,
5198 vm_map_entry_t entry,
5199 boolean_t deallocate,
5200 pmap_t pmap,
5201 vm_map_offset_t pmap_addr)
5202{
5203 vm_map_offset_t va;
5204 vm_map_offset_t end_addr = entry->vme_end;
5205 vm_object_t object;
5206 struct vm_object_fault_info fault_info = {};
5207 unsigned int unwired_pages;
5208
5209 object = (entry->is_sub_map) ? VM_OBJECT_NULL : VME_OBJECT(entry);
5210
5211 /*
5212 * If it's marked phys_contiguous, then vm_fault_wire() didn't actually
5213 * do anything since such memory is wired by default. So we don't have
5214 * anything to undo here.
5215 */
5216
5217 if (object != VM_OBJECT_NULL && object->phys_contiguous)
5218 return;
5219
5220 fault_info.interruptible = THREAD_UNINT;
5221 fault_info.behavior = entry->behavior;
5222 fault_info.user_tag = VME_ALIAS(entry);
5223 if (entry->iokit_acct ||
5224 (!entry->is_sub_map && !entry->use_pmap)) {
5225 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5226 }
5227 fault_info.lo_offset = VME_OFFSET(entry);
5228 fault_info.hi_offset = (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
5229 fault_info.no_cache = entry->no_cache;
5230 fault_info.stealth = TRUE;
5231
5232 unwired_pages = 0;
5233
5234 /*
5235 * Since the pages are wired down, we must be able to
5236 * get their mappings from the physical map system.
5237 */
5238
5239 for (va = entry->vme_start; va < end_addr; va += PAGE_SIZE) {
5240
5241 if (object == VM_OBJECT_NULL) {
5242 if (pmap) {
5243 pmap_change_wiring(pmap,
5244 pmap_addr + (va - entry->vme_start), FALSE);
5245 }
5246 (void) vm_fault(map, va, VM_PROT_NONE,
5247 TRUE, VM_KERN_MEMORY_NONE, THREAD_UNINT, pmap, pmap_addr);
5248 } else {
5249 vm_prot_t prot;
5250 vm_page_t result_page;
5251 vm_page_t top_page;
5252 vm_object_t result_object;
5253 vm_fault_return_t result;
5254
5255 /* cap cluster size at maximum UPL size */
5256 upl_size_t cluster_size;
5257 if (os_sub_overflow(end_addr, va, &cluster_size)) {
5258 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5259 }
5260 fault_info.cluster_size = cluster_size;
5261
5262 do {
5263 prot = VM_PROT_NONE;
5264
5265 vm_object_lock(object);
5266 vm_object_paging_begin(object);
5267 XPR(XPR_VM_FAULT,
5268 "vm_fault_unwire -> vm_fault_page\n",
5269 0,0,0,0,0);
5270 result_page = VM_PAGE_NULL;
5271 result = vm_fault_page(
5272 object,
5273 (VME_OFFSET(entry) +
5274 (va - entry->vme_start)),
5275 VM_PROT_NONE, TRUE,
5276 FALSE, /* page not looked up */
5277 &prot, &result_page, &top_page,
5278 (int *)0,
5279 NULL, map->no_zero_fill,
5280 FALSE, &fault_info);
5281 } while (result == VM_FAULT_RETRY);
5282
5283 /*
5284 * If this was a mapping to a file on a device that has been forcibly
5285 * unmounted, then we won't get a page back from vm_fault_page(). Just
5286 * move on to the next one in case the remaining pages are mapped from
5287 * different objects. During a forced unmount, the object is terminated
5288			 * so the alive flag will be false if this happens. A forced unmount
5289			 * will occur when an external disk is unplugged before the user does an
5290 * eject, so we don't want to panic in that situation.
5291 */
5292
5293 if (result == VM_FAULT_MEMORY_ERROR && !object->alive)
5294 continue;
5295
5296 if (result == VM_FAULT_MEMORY_ERROR &&
5297 object == kernel_object) {
5298 /*
5299 * This must have been allocated with
5300 * KMA_KOBJECT and KMA_VAONLY and there's
5301 * no physical page at this offset.
5302 * We're done (no page to free).
5303 */
5304 assert(deallocate);
5305 continue;
5306 }
5307
5308 if (result != VM_FAULT_SUCCESS)
5309 panic("vm_fault_unwire: failure");
5310
5311 result_object = VM_PAGE_OBJECT(result_page);
5312
5313 if (deallocate) {
5314 assert(VM_PAGE_GET_PHYS_PAGE(result_page) !=
5315 vm_page_fictitious_addr);
5316 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(result_page));
5317 if (VM_PAGE_WIRED(result_page)) {
5318 unwired_pages++;
5319 }
5320 VM_PAGE_FREE(result_page);
5321 } else {
5322 if ((pmap) && (VM_PAGE_GET_PHYS_PAGE(result_page) != vm_page_guard_addr))
5323 pmap_change_wiring(pmap,
5324 pmap_addr + (va - entry->vme_start), FALSE);
5325
5326
5327 if (VM_PAGE_WIRED(result_page)) {
5328 vm_page_lockspin_queues();
5329 vm_page_unwire(result_page, TRUE);
5330 vm_page_unlock_queues();
5331 unwired_pages++;
5332 }
5333				if (entry->zero_wired_pages) {
5334 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(result_page));
5335 entry->zero_wired_pages = FALSE;
5336 }
5337
5338 PAGE_WAKEUP_DONE(result_page);
5339 }
5340 vm_fault_cleanup(result_object, top_page);
5341 }
5342 }
5343
5344 /*
5345 * Inform the physical mapping system that the range
5346 * of addresses may fault, so that page tables and
5347 * such may be unwired themselves.
5348 */
5349
5350 pmap_pageable(pmap, pmap_addr,
5351 pmap_addr + (end_addr - entry->vme_start), TRUE);
5352
5353 if (kernel_object == object) {
5354 vm_tag_update_size(fault_info.user_tag, -ptoa_64(unwired_pages));
5355 }
5356}
5357
5358/*
5359 * vm_fault_wire_fast:
5360 *
5361 * Handle common case of a wire down page fault at the given address.
5362 * If successful, the page is inserted into the associated physical map.
5363 * The map entry is passed in to avoid the overhead of a map lookup.
5364 *
5365 * NOTE: the given address should be truncated to the
5366 * proper page address.
5367 *
5368 * KERN_SUCCESS is returned if the page fault is handled; otherwise,
5369 * a standard error specifying why the fault is fatal is returned.
5370 *
5371 * The map in question must be referenced, and remains so.
5372 * Caller has a read lock on the map.
5373 *
5374 * This is a stripped version of vm_fault() for wiring pages. Anything
5375 * other than the common case will return KERN_FAILURE, and the caller
5376 * is expected to call vm_fault().
5377 */
5378static kern_return_t
5379vm_fault_wire_fast(
5380 __unused vm_map_t map,
5381 vm_map_offset_t va,
5382 __unused vm_prot_t caller_prot,
5383 vm_tag_t wire_tag,
5384 vm_map_entry_t entry,
5385 pmap_t pmap,
5386 vm_map_offset_t pmap_addr,
5387 ppnum_t *physpage_p)
5388{
5389 vm_object_t object;
5390 vm_object_offset_t offset;
5391 vm_page_t m;
5392 vm_prot_t prot;
5393 thread_t thread = current_thread();
5394 int type_of_fault;
5395 kern_return_t kr;
5396 struct vm_object_fault_info fault_info = {};
5397
5398 VM_STAT_INCR(faults);
5399
5400 if (thread != THREAD_NULL && thread->task != TASK_NULL)
5401 thread->task->faults++;
5402
5403/*
5404 * Recovery actions
5405 */
5406
5407#undef RELEASE_PAGE
5408#define RELEASE_PAGE(m) { \
5409 PAGE_WAKEUP_DONE(m); \
5410 vm_page_lockspin_queues(); \
5411 vm_page_unwire(m, TRUE); \
5412 vm_page_unlock_queues(); \
5413}
5414
5415
5416#undef UNLOCK_THINGS
5417#define UNLOCK_THINGS { \
5418 vm_object_paging_end(object); \
5419 vm_object_unlock(object); \
5420}
5421
5422#undef UNLOCK_AND_DEALLOCATE
5423#define UNLOCK_AND_DEALLOCATE { \
5424 UNLOCK_THINGS; \
5425 vm_object_deallocate(object); \
5426}
5427/*
5428 * Give up and have caller do things the hard way.
5429 */
5430
5431#define GIVE_UP { \
5432 UNLOCK_AND_DEALLOCATE; \
5433 return(KERN_FAILURE); \
5434}
5435
5436
5437 /*
5438 * If this entry is not directly to a vm_object, bail out.
5439 */
5440 if (entry->is_sub_map) {
5441 assert(physpage_p == NULL);
5442 return(KERN_FAILURE);
5443 }
5444
5445 /*
5446 * Find the backing store object and offset into it.
5447 */
5448
5449 object = VME_OBJECT(entry);
5450 offset = (va - entry->vme_start) + VME_OFFSET(entry);
5451 prot = entry->protection;
5452
5453 /*
5454 * Make a reference to this object to prevent its
5455 * disposal while we are messing with it.
5456 */
5457
5458 vm_object_lock(object);
5459 vm_object_reference_locked(object);
5460 vm_object_paging_begin(object);
5461
5462 /*
5463 * INVARIANTS (through entire routine):
5464 *
5465 * 1) At all times, we must either have the object
5466 * lock or a busy page in some object to prevent
5467 * some other thread from trying to bring in
5468 * the same page.
5469 *
5470 * 2) Once we have a busy page, we must remove it from
5471 * the pageout queues, so that the pageout daemon
5472 * will not grab it away.
5473 *
5474 */
5475
5476 /*
5477 * Look for page in top-level object. If it's not there or
5478 * there's something going on, give up.
5479 */
5480 m = vm_page_lookup(object, offset);
5481 if ((m == VM_PAGE_NULL) || (m->vmp_busy) ||
5482 (m->vmp_unusual && ( m->vmp_error || m->vmp_restart || m->vmp_absent))) {
5483
5484 GIVE_UP;
5485 }
5486 if (m->vmp_fictitious &&
5487 VM_PAGE_GET_PHYS_PAGE(m) == vm_page_guard_addr) {
5488 /*
5489 * Guard pages are fictitious pages and are never
5490 * entered into a pmap, so let's say it's been wired...
5491 */
5492 kr = KERN_SUCCESS;
5493 goto done;
5494 }
5495
5496 /*
5497 * Wire the page down now. All bail outs beyond this
5498 * point must unwire the page.
5499 */
5500
5501 vm_page_lockspin_queues();
5502 vm_page_wire(m, wire_tag, TRUE);
5503 vm_page_unlock_queues();
5504
5505 /*
5506 * Mark page busy for other threads.
5507 */
5508 assert(!m->vmp_busy);
5509 m->vmp_busy = TRUE;
5510 assert(!m->vmp_absent);
5511
5512 /*
5513 * Give up if the page is being written and there's a copy object
5514 */
5515 if ((object->copy != VM_OBJECT_NULL) && (prot & VM_PROT_WRITE)) {
5516 RELEASE_PAGE(m);
5517 GIVE_UP;
5518 }
5519
5520 fault_info.user_tag = VME_ALIAS(entry);
5521 fault_info.pmap_options = 0;
5522 if (entry->iokit_acct ||
5523 (!entry->is_sub_map && !entry->use_pmap)) {
5524 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
5525 }
5526
5527 /*
5528 * Put this page into the physical map.
5529 */
5530 type_of_fault = DBG_CACHE_HIT_FAULT;
5531 kr = vm_fault_enter(m,
5532 pmap,
5533 pmap_addr,
5534 prot,
5535 prot,
5536 TRUE, /* wired */
5537 FALSE, /* change_wiring */
5538 wire_tag,
5539 &fault_info,
5540 NULL,
5541 &type_of_fault);
5542 if (kr != KERN_SUCCESS) {
5543 RELEASE_PAGE(m);
5544 GIVE_UP;
5545 }
5546
5547done:
5548 /*
5549 * Unlock everything, and return
5550 */
5551
5552 if (physpage_p) {
5553 /* for vm_map_wire_and_extract() */
5554 if (kr == KERN_SUCCESS) {
5555 assert(object == VM_PAGE_OBJECT(m));
5556 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
5557 if (prot & VM_PROT_WRITE) {
5558 vm_object_lock_assert_exclusive(object);
5559 m->vmp_dirty = TRUE;
5560 }
5561 } else {
5562 *physpage_p = 0;
5563 }
5564 }
5565
5566 PAGE_WAKEUP_DONE(m);
5567 UNLOCK_AND_DEALLOCATE;
5568
5569 return kr;
5570
5571}
5572
5573/*
5574 * Routine: vm_fault_copy_cleanup
5575 * Purpose:
5576 * Release a page used by vm_fault_copy.
5577 */
5578
5579static void
5580vm_fault_copy_cleanup(
5581 vm_page_t page,
5582 vm_page_t top_page)
5583{
5584 vm_object_t object = VM_PAGE_OBJECT(page);
5585
5586 vm_object_lock(object);
5587 PAGE_WAKEUP_DONE(page);
5588 if ( !VM_PAGE_PAGEABLE(page)) {
5589 vm_page_lockspin_queues();
5590 if ( !VM_PAGE_PAGEABLE(page)) {
5591 vm_page_activate(page);
5592 }
5593 vm_page_unlock_queues();
5594 }
5595 vm_fault_cleanup(object, top_page);
5596}
5597
5598static void
5599vm_fault_copy_dst_cleanup(
5600 vm_page_t page)
5601{
5602 vm_object_t object;
5603
5604 if (page != VM_PAGE_NULL) {
5605 object = VM_PAGE_OBJECT(page);
5606 vm_object_lock(object);
5607 vm_page_lockspin_queues();
5608 vm_page_unwire(page, TRUE);
5609 vm_page_unlock_queues();
5610 vm_object_paging_end(object);
5611 vm_object_unlock(object);
5612 }
5613}
5614
5615/*
5616 * Routine: vm_fault_copy
5617 *
5618 * Purpose:
5619 * Copy pages from one virtual memory object to another --
5620 * neither the source nor destination pages need be resident.
5621 *
5622 * Before actually copying a page, the version associated with
5623 *		the destination address map will be verified.
5624 *
5625 * In/out conditions:
5626 * The caller must hold a reference, but not a lock, to
5627 * each of the source and destination objects and to the
5628 * destination map.
5629 *
5630 * Results:
5631 * Returns KERN_SUCCESS if no errors were encountered in
5632 * reading or writing the data. Returns KERN_INTERRUPTED if
5633 * the operation was interrupted (only possible if the
5634 * "interruptible" argument is asserted). Other return values
5635 * indicate a permanent error in copying the data.
5636 *
5637 * The actual amount of data copied will be returned in the
5638 * "copy_size" argument. In the event that the destination map
5639 * verification failed, this amount may be less than the amount
5640 * requested.
5641 */
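/*
 * Minimal illustrative sketch of the calling contract (placeholder
 * names, not code from this file); the caller already holds references
 * on both objects and on dst_map:
 *
 *	vm_map_size_t copied = len;
 *	kr = vm_fault_copy(src_object, src_offset, &copied,
 *	                   dst_object, dst_offset, dst_map,
 *	                   &dst_version, THREAD_UNINT);
 *
 * On return, "copied" may be less than "len" if the destination map
 * version check failed; the caller still owns its references.
 */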
5642kern_return_t
5643vm_fault_copy(
5644 vm_object_t src_object,
5645 vm_object_offset_t src_offset,
5646 vm_map_size_t *copy_size, /* INOUT */
5647 vm_object_t dst_object,
5648 vm_object_offset_t dst_offset,
5649 vm_map_t dst_map,
5650 vm_map_version_t *dst_version,
5651 int interruptible)
5652{
5653 vm_page_t result_page;
5654
5655 vm_page_t src_page;
5656 vm_page_t src_top_page;
5657 vm_prot_t src_prot;
5658
5659 vm_page_t dst_page;
5660 vm_page_t dst_top_page;
5661 vm_prot_t dst_prot;
5662
5663 vm_map_size_t amount_left;
5664 vm_object_t old_copy_object;
5665 vm_object_t result_page_object = NULL;
5666 kern_return_t error = 0;
5667 vm_fault_return_t result;
5668
5669 vm_map_size_t part_size;
5670 struct vm_object_fault_info fault_info_src = {};
5671 struct vm_object_fault_info fault_info_dst = {};
5672
5673 /*
5674 * In order not to confuse the clustered pageins, align
5675 * the different offsets on a page boundary.
5676 */
5677
5678#define RETURN(x) \
5679 MACRO_BEGIN \
5680 *copy_size -= amount_left; \
5681 MACRO_RETURN(x); \
5682 MACRO_END
5683
5684 amount_left = *copy_size;
5685
5686 fault_info_src.interruptible = interruptible;
5687 fault_info_src.behavior = VM_BEHAVIOR_SEQUENTIAL;
5688 fault_info_src.lo_offset = vm_object_trunc_page(src_offset);
5689 fault_info_src.hi_offset = fault_info_src.lo_offset + amount_left;
5690 fault_info_src.stealth = TRUE;
5691
5692 fault_info_dst.interruptible = interruptible;
5693 fault_info_dst.behavior = VM_BEHAVIOR_SEQUENTIAL;
5694 fault_info_dst.lo_offset = vm_object_trunc_page(dst_offset);
5695 fault_info_dst.hi_offset = fault_info_dst.lo_offset + amount_left;
5696 fault_info_dst.stealth = TRUE;
5697
5698 do { /* while (amount_left > 0) */
5699 /*
5700 * There may be a deadlock if both source and destination
5701 * pages are the same. To avoid this deadlock, the copy must
5702 * start by getting the destination page in order to apply
5703 * COW semantics if any.
5704 */
5705
5706 RetryDestinationFault: ;
5707
5708 dst_prot = VM_PROT_WRITE|VM_PROT_READ;
5709
5710 vm_object_lock(dst_object);
5711 vm_object_paging_begin(dst_object);
5712
5713 /* cap cluster size at maximum UPL size */
5714 upl_size_t cluster_size;
5715 if (os_convert_overflow(amount_left, &cluster_size)) {
5716 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5717 }
5718 fault_info_dst.cluster_size = cluster_size;
5719
5720 XPR(XPR_VM_FAULT,"vm_fault_copy -> vm_fault_page\n",0,0,0,0,0);
5721 dst_page = VM_PAGE_NULL;
5722 result = vm_fault_page(dst_object,
5723 vm_object_trunc_page(dst_offset),
5724 VM_PROT_WRITE|VM_PROT_READ,
5725 FALSE,
5726 FALSE, /* page not looked up */
5727 &dst_prot, &dst_page, &dst_top_page,
5728 (int *)0,
5729 &error,
5730 dst_map->no_zero_fill,
5731 FALSE, &fault_info_dst);
5732 switch (result) {
5733 case VM_FAULT_SUCCESS:
5734 break;
5735 case VM_FAULT_RETRY:
5736 goto RetryDestinationFault;
5737 case VM_FAULT_MEMORY_SHORTAGE:
5738 if (vm_page_wait(interruptible))
5739 goto RetryDestinationFault;
5740 /* fall thru */
5741 case VM_FAULT_INTERRUPTED:
5742 RETURN(MACH_SEND_INTERRUPTED);
5743 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5744 /* success but no VM page: fail the copy */
5745 vm_object_paging_end(dst_object);
5746 vm_object_unlock(dst_object);
5747 /*FALLTHROUGH*/
5748 case VM_FAULT_MEMORY_ERROR:
5749 if (error)
5750 return (error);
5751 else
5752 return(KERN_MEMORY_ERROR);
5753 default:
5754 panic("vm_fault_copy: unexpected error 0x%x from "
5755 "vm_fault_page()\n", result);
5756 }
5757 assert ((dst_prot & VM_PROT_WRITE) != VM_PROT_NONE);
5758
5759 assert(dst_object == VM_PAGE_OBJECT(dst_page));
5760 old_copy_object = dst_object->copy;
5761
		/*
		 * There exists the possibility that the source and
		 * destination pages are the same.  But we can't
		 * easily determine that now.  If they are the same,
		 * faulting in the source page would deadlock on the
		 * busy destination page.  To prevent this we wire the
		 * destination page so we can drop its busy bit without
		 * having the pageout daemon steal the page.  We clean
		 * up the top page but keep the paging reference on the
		 * object holding the dest page so it doesn't go away.
		 */
5773
5774 vm_page_lockspin_queues();
5775 vm_page_wire(dst_page, VM_KERN_MEMORY_OSFMK, TRUE);
5776 vm_page_unlock_queues();
5777 PAGE_WAKEUP_DONE(dst_page);
5778 vm_object_unlock(dst_object);
5779
5780 if (dst_top_page != VM_PAGE_NULL) {
5781 vm_object_lock(dst_object);
5782 VM_PAGE_FREE(dst_top_page);
5783 vm_object_paging_end(dst_object);
5784 vm_object_unlock(dst_object);
5785 }
5786
5787 RetrySourceFault: ;
5788
5789 if (src_object == VM_OBJECT_NULL) {
5790 /*
5791 * No source object. We will just
5792 * zero-fill the page in dst_object.
5793 */
5794 src_page = VM_PAGE_NULL;
5795 result_page = VM_PAGE_NULL;
5796 } else {
5797 vm_object_lock(src_object);
5798 src_page = vm_page_lookup(src_object,
5799 vm_object_trunc_page(src_offset));
5800 if (src_page == dst_page) {
5801 src_prot = dst_prot;
5802 result_page = VM_PAGE_NULL;
5803 } else {
5804 src_prot = VM_PROT_READ;
5805 vm_object_paging_begin(src_object);
5806
5807 /* cap cluster size at maximum UPL size */
5808 if (os_convert_overflow(amount_left, &cluster_size)) {
5809 cluster_size = 0 - (upl_size_t)PAGE_SIZE;
5810 }
5811 fault_info_src.cluster_size = cluster_size;
5812
5813 XPR(XPR_VM_FAULT,
5814 "vm_fault_copy(2) -> vm_fault_page\n",
5815 0,0,0,0,0);
5816 result_page = VM_PAGE_NULL;
5817 result = vm_fault_page(
5818 src_object,
5819 vm_object_trunc_page(src_offset),
5820 VM_PROT_READ, FALSE,
5821 FALSE, /* page not looked up */
5822 &src_prot,
5823 &result_page, &src_top_page,
5824 (int *)0, &error, FALSE,
5825 FALSE, &fault_info_src);
5826
5827 switch (result) {
5828 case VM_FAULT_SUCCESS:
5829 break;
5830 case VM_FAULT_RETRY:
5831 goto RetrySourceFault;
5832 case VM_FAULT_MEMORY_SHORTAGE:
5833 if (vm_page_wait(interruptible))
5834 goto RetrySourceFault;
5835 /* fall thru */
5836 case VM_FAULT_INTERRUPTED:
5837 vm_fault_copy_dst_cleanup(dst_page);
5838 RETURN(MACH_SEND_INTERRUPTED);
5839 case VM_FAULT_SUCCESS_NO_VM_PAGE:
5840 /* success but no VM page: fail */
5841 vm_object_paging_end(src_object);
5842 vm_object_unlock(src_object);
5843 /*FALLTHROUGH*/
5844 case VM_FAULT_MEMORY_ERROR:
5845 vm_fault_copy_dst_cleanup(dst_page);
5846 if (error)
5847 return (error);
5848 else
5849 return(KERN_MEMORY_ERROR);
5850 default:
5851 panic("vm_fault_copy(2): unexpected "
5852 "error 0x%x from "
5853 "vm_fault_page()\n", result);
5854 }
5855
5856 result_page_object = VM_PAGE_OBJECT(result_page);
5857 assert((src_top_page == VM_PAGE_NULL) ==
5858 (result_page_object == src_object));
5859 }
5860 assert ((src_prot & VM_PROT_READ) != VM_PROT_NONE);
5861 vm_object_unlock(result_page_object);
5862 }
5863
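		/*
		 * Verify that the destination map has not changed since the
		 * caller captured "dst_version"; if it has, stop here and
		 * report the partial amount copied via "copy_size".
		 */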
5864 vm_map_lock_read(dst_map);
5865
5866 if (!vm_map_verify(dst_map, dst_version)) {
5867 vm_map_unlock_read(dst_map);
5868 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5869 vm_fault_copy_cleanup(result_page, src_top_page);
5870 vm_fault_copy_dst_cleanup(dst_page);
5871 break;
5872 }
5873 assert(dst_object == VM_PAGE_OBJECT(dst_page));
5874
5875 vm_object_lock(dst_object);
5876
5877 if (dst_object->copy != old_copy_object) {
5878 vm_object_unlock(dst_object);
5879 vm_map_unlock_read(dst_map);
5880 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5881 vm_fault_copy_cleanup(result_page, src_top_page);
5882 vm_fault_copy_dst_cleanup(dst_page);
5883 break;
5884 }
5885 vm_object_unlock(dst_object);
5886
5887 /*
5888 * Copy the page, and note that it is dirty
5889 * immediately.
5890 */
5891
5892 if (!page_aligned(src_offset) ||
5893 !page_aligned(dst_offset) ||
5894 !page_aligned(amount_left)) {
5895
5896 vm_object_offset_t src_po,
5897 dst_po;
5898
5899 src_po = src_offset - vm_object_trunc_page(src_offset);
5900 dst_po = dst_offset - vm_object_trunc_page(dst_offset);
5901
5902 if (dst_po > src_po) {
5903 part_size = PAGE_SIZE - dst_po;
5904 } else {
5905 part_size = PAGE_SIZE - src_po;
5906 }
			if (part_size > amount_left) {
5908 part_size = amount_left;
5909 }
5910
5911 if (result_page == VM_PAGE_NULL) {
5912 assert((vm_offset_t) dst_po == dst_po);
5913 assert((vm_size_t) part_size == part_size);
5914 vm_page_part_zero_fill(dst_page,
5915 (vm_offset_t) dst_po,
5916 (vm_size_t) part_size);
5917 } else {
5918 assert((vm_offset_t) src_po == src_po);
5919 assert((vm_offset_t) dst_po == dst_po);
5920 assert((vm_size_t) part_size == part_size);
5921 vm_page_part_copy(result_page,
5922 (vm_offset_t) src_po,
5923 dst_page,
5924 (vm_offset_t) dst_po,
5925 (vm_size_t)part_size);
				if (!dst_page->vmp_dirty) {
5927 vm_object_lock(dst_object);
5928 SET_PAGE_DIRTY(dst_page, TRUE);
5929 vm_object_unlock(dst_object);
5930 }
5931
5932 }
5933 } else {
5934 part_size = PAGE_SIZE;
5935
5936 if (result_page == VM_PAGE_NULL)
5937 vm_page_zero_fill(dst_page);
			else {
5939 vm_object_lock(result_page_object);
5940 vm_page_copy(result_page, dst_page);
5941 vm_object_unlock(result_page_object);
5942
				if (!dst_page->vmp_dirty) {
5944 vm_object_lock(dst_object);
5945 SET_PAGE_DIRTY(dst_page, TRUE);
5946 vm_object_unlock(dst_object);
5947 }
5948 }
5949
5950 }
5951
		/*
		 * Unlock everything, release the pages and
		 * advance to the next chunk of the copy.
		 */
5955
5956 vm_map_unlock_read(dst_map);
5957
5958 if (result_page != VM_PAGE_NULL && src_page != dst_page)
5959 vm_fault_copy_cleanup(result_page, src_top_page);
5960 vm_fault_copy_dst_cleanup(dst_page);
5961
5962 amount_left -= part_size;
5963 src_offset += part_size;
5964 dst_offset += part_size;
5965 } while (amount_left > 0);
5966
5967 RETURN(KERN_SUCCESS);
5968#undef RETURN
5969
5970 /*NOTREACHED*/
5971}
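
/*
 * Illustrative sketch (not compiled): how a caller might consume the INOUT
 * "copy_size" and the interrupted/partial-copy semantics documented above.
 * The helper name "example_copy_into_map" is hypothetical, and it assumes
 * the caller already captured "dst_version" during an earlier map lookup;
 * it is not an existing kernel interface.
 */
#if 0
static kern_return_t
example_copy_into_map(
	vm_object_t		src_object,
	vm_object_offset_t	src_offset,
	vm_object_t		dst_object,
	vm_object_offset_t	dst_offset,
	vm_map_t		dst_map,
	vm_map_version_t	*dst_version,	/* assumed: from a prior lookup */
	vm_map_size_t		size,
	int			interruptible)
{
	kern_return_t	kr;
	vm_map_size_t	copy_size;

	copy_size = size;			/* IN: amount requested */
	kr = vm_fault_copy(src_object, src_offset,
	    &copy_size,				/* OUT: amount actually copied */
	    dst_object, dst_offset,
	    dst_map, dst_version, interruptible);

	if (kr == MACH_SEND_INTERRUPTED) {
		/* interrupted: the caller may simply retry */
		return kr;
	}
	if (kr != KERN_SUCCESS) {
		/* permanent error while reading or writing the data */
		return kr;
	}
	if (copy_size < size) {
		/*
		 * The destination map failed verification part way through:
		 * only "copy_size" bytes were copied.  The caller must
		 * re-lookup the destination and retry the remainder
		 * (not shown here).
		 */
		return KERN_FAILURE;
	}
	return KERN_SUCCESS;
}
#endif /* illustrative sketch */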
5972
5973#if VM_FAULT_CLASSIFY
5974/*
5975 * Temporary statistics gathering support.
5976 */
5977
5978/*
5979 * Statistics arrays:
5980 */
5981#define VM_FAULT_TYPES_MAX 5
5982#define VM_FAULT_LEVEL_MAX 8
5983
5984int vm_fault_stats[VM_FAULT_TYPES_MAX][VM_FAULT_LEVEL_MAX];
5985
5986#define VM_FAULT_TYPE_ZERO_FILL 0
5987#define VM_FAULT_TYPE_MAP_IN 1
5988#define VM_FAULT_TYPE_PAGER 2
5989#define VM_FAULT_TYPE_COPY 3
5990#define VM_FAULT_TYPE_OTHER 4
5991
5992
5993void
5994vm_fault_classify(vm_object_t object,
5995 vm_object_offset_t offset,
5996 vm_prot_t fault_type)
5997{
5998 int type, level = 0;
5999 vm_page_t m;
6000
6001 while (TRUE) {
6002 m = vm_page_lookup(object, offset);
6003 if (m != VM_PAGE_NULL) {
6004 if (m->vmp_busy || m->vmp_error || m->vmp_restart || m->vmp_absent) {
6005 type = VM_FAULT_TYPE_OTHER;
6006 break;
6007 }
6008 if (((fault_type & VM_PROT_WRITE) == 0) ||
6009 ((level == 0) && object->copy == VM_OBJECT_NULL)) {
6010 type = VM_FAULT_TYPE_MAP_IN;
6011 break;
6012 }
6013 type = VM_FAULT_TYPE_COPY;
6014 break;
		} else {
6017 if (object->pager_created) {
6018 type = VM_FAULT_TYPE_PAGER;
6019 break;
6020 }
6021 if (object->shadow == VM_OBJECT_NULL) {
6022 type = VM_FAULT_TYPE_ZERO_FILL;
6023 break;
6024 }
6025
6026 offset += object->vo_shadow_offset;
6027 object = object->shadow;
6028 level++;
6029 continue;
6030 }
6031 }
6032
	if (level >= VM_FAULT_LEVEL_MAX)
		level = VM_FAULT_LEVEL_MAX - 1;	/* clamp to the last bucket */
6035
6036 vm_fault_stats[type][level] += 1;
6037
6038 return;
6039}
6040
6041/* cleanup routine to call from debugger */
6042
6043void
6044vm_fault_classify_init(void)
6045{
6046 int type, level;
6047
6048 for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
6049 for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
6050 vm_fault_stats[type][level] = 0;
6051 }
6052 }
6053
6054 return;
6055}
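
/*
 * Illustrative sketch (not compiled): a debugger-callable dump of the
 * vm_fault_stats matrix gathered above.  The function name is hypothetical;
 * it only reads the statistics arrays defined in this file.
 */
#if 0
void
vm_fault_classify_dump(void)
{
	int type, level;

	for (type = 0; type < VM_FAULT_TYPES_MAX; type++) {
		for (level = 0; level < VM_FAULT_LEVEL_MAX; level++) {
			if (vm_fault_stats[type][level] != 0) {
				printf("vm_fault_stats[%d][%d] = %d\n",
				    type, level, vm_fault_stats[type][level]);
			}
		}
	}
}
#endif /* illustrative sketch */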
6056#endif /* VM_FAULT_CLASSIFY */
6057
6058vm_offset_t
6059kdp_lightweight_fault(vm_map_t map, vm_offset_t cur_target_addr)
6060{
6061 vm_map_entry_t entry;
6062 vm_object_t object;
6063 vm_offset_t object_offset;
6064 vm_page_t m;
6065 int compressor_external_state, compressed_count_delta;
6066 int compressor_flags = (C_DONT_BLOCK | C_KEEP | C_KDP);
6067 int my_fault_type = VM_PROT_READ;
6068 kern_return_t kr;
6069
6070 if (not_in_kdp) {
6071 panic("kdp_lightweight_fault called from outside of debugger context");
6072 }
6073
6074 assert(map != VM_MAP_NULL);
6075
6076 assert((cur_target_addr & PAGE_MASK) == 0);
6077 if ((cur_target_addr & PAGE_MASK) != 0) {
6078 return 0;
6079 }
6080
6081 if (kdp_lck_rw_lock_is_acquired_exclusive(&map->lock)) {
6082 return 0;
6083 }
6084
6085 if (!vm_map_lookup_entry(map, cur_target_addr, &entry)) {
6086 return 0;
6087 }
6088
6089 if (entry->is_sub_map) {
6090 return 0;
6091 }
6092
6093 object = VME_OBJECT(entry);
6094 if (object == VM_OBJECT_NULL) {
6095 return 0;
6096 }
6097
6098 object_offset = cur_target_addr - entry->vme_start + VME_OFFSET(entry);
6099
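	/*
	 * Walk the object shadow chain without blocking: any page or
	 * object state that would require waiting or taking a lock makes
	 * us bail out and return 0 ("not safely resolvable right now").
	 */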
6100 while (TRUE) {
6101 if (kdp_lck_rw_lock_is_acquired_exclusive(&object->Lock)) {
6102 return 0;
6103 }
6104
6105 if (object->pager_created && (object->paging_in_progress ||
6106 object->activity_in_progress)) {
6107 return 0;
6108 }
6109
6110 m = kdp_vm_page_lookup(object, object_offset);
6111
6112 if (m != VM_PAGE_NULL) {
6113
6114 if ((object->wimg_bits & VM_WIMG_MASK) != VM_WIMG_DEFAULT) {
6115 return 0;
6116 }
6117
6118 if (m->vmp_laundry || m->vmp_busy || m->vmp_free_when_done || m->vmp_absent || m->vmp_error || m->vmp_cleaning ||
6119 m->vmp_overwriting || m->vmp_restart || m->vmp_unusual) {
6120 return 0;
6121 }
6122
6123 assert(!m->vmp_private);
6124 if (m->vmp_private) {
6125 return 0;
6126 }
6127
6128 assert(!m->vmp_fictitious);
6129 if (m->vmp_fictitious) {
6130 return 0;
6131 }
6132
6133 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
6134 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
6135 return 0;
6136 }
6137
6138 return ptoa(VM_PAGE_GET_PHYS_PAGE(m));
6139 }
6140
6141 compressor_external_state = VM_EXTERNAL_STATE_UNKNOWN;
6142
6143 if (object->pager_created && MUST_ASK_PAGER(object, object_offset, compressor_external_state)) {
6144 if (compressor_external_state == VM_EXTERNAL_STATE_EXISTS) {
6145 kr = vm_compressor_pager_get(object->pager, (object_offset + object->paging_offset),
6146 kdp_compressor_decompressed_page_ppnum, &my_fault_type,
6147 compressor_flags, &compressed_count_delta);
6148 if (kr == KERN_SUCCESS) {
6149 return kdp_compressor_decompressed_page_paddr;
6150 } else {
6151 return 0;
6152 }
6153 }
6154 }
6155
6156 if (object->shadow == VM_OBJECT_NULL) {
6157 return 0;
6158 }
6159
6160 object_offset += object->vo_shadow_offset;
6161 object = object->shadow;
6162 }
6163
6164}
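
/*
 * Illustrative sketch (not compiled): how a debugger-context consumer might
 * use kdp_lightweight_fault() to translate a user virtual address into a
 * physical address without blocking.  The helper name
 * "example_kdp_vaddr_to_paddr" is hypothetical; a return of 0 simply means
 * the address is not safely resolvable right now.
 */
#if 0
static vm_offset_t
example_kdp_vaddr_to_paddr(vm_map_t map, vm_offset_t vaddr)
{
	vm_offset_t paddr;

	/* kdp_lightweight_fault() requires a page-aligned address */
	paddr = kdp_lightweight_fault(map, vaddr & ~((vm_offset_t)PAGE_MASK));
	if (paddr == 0) {
		return 0;	/* page not resident or not safely decompressible */
	}
	/* add back the offset within the page */
	return paddr + (vaddr & PAGE_MASK);
}
#endif /* illustrative sketch */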
6165
6166/*
6167 * vm_page_validate_cs_fast():
6168 * Performs a few quick checks to determine if the page's code signature
 * really needs to be fully validated.  The page could:
6170 * 1. have been modified (i.e. automatically tainted),
6171 * 2. have already been validated,
6172 * 3. have already been found to be tainted,
6173 * 4. no longer have a backing store.
6174 * Returns FALSE if the page needs to be fully validated.
6175 */
6176static boolean_t
6177vm_page_validate_cs_fast(
6178 vm_page_t page)
6179{
6180 vm_object_t object;
6181
6182 object = VM_PAGE_OBJECT(page);
6183 vm_object_lock_assert_held(object);
6184
6185 if (page->vmp_wpmapped && !page->vmp_cs_tainted) {
6186 /*
6187 * This page was mapped for "write" access sometime in the
6188 * past and could still be modifiable in the future.
6189 * Consider it tainted.
6190 * [ If the page was already found to be "tainted", no
6191 * need to re-validate. ]
6192 */
6193 vm_object_lock_assert_exclusive(object);
6194 page->vmp_cs_validated = TRUE;
6195 page->vmp_cs_tainted = TRUE;
6196 if (cs_debug) {
6197 printf("CODESIGNING: %s: "
6198 "page %p obj %p off 0x%llx "
6199 "was modified\n",
6200 __FUNCTION__,
6201 page, object, page->vmp_offset);
6202 }
6203 vm_cs_validated_dirtied++;
6204 }
6205
6206 if (page->vmp_cs_validated || page->vmp_cs_tainted) {
6207 return TRUE;
6208 }
6209 vm_object_lock_assert_exclusive(object);
6210
6211#if CHECK_CS_VALIDATION_BITMAP
6212 kern_return_t kr;
6213
6214 kr = vnode_pager_cs_check_validation_bitmap(
6215 object->pager,
6216 page->vmp_offset + object->paging_offset,
6217 CS_BITMAP_CHECK);
6218 if (kr == KERN_SUCCESS) {
6219 page->vmp_cs_validated = TRUE;
6220 page->vmp_cs_tainted = FALSE;
6221 vm_cs_bitmap_validated++;
6222 return TRUE;
6223 }
6224#endif /* CHECK_CS_VALIDATION_BITMAP */
6225
6226 if (!object->alive || object->terminating || object->pager == NULL) {
6227 /*
		 * The object is terminating or we don't have its pager,
		 * so we can't validate the data...
6230 */
6231 return TRUE;
6232 }
6233
6234 /* we need to really validate this page */
6235 vm_object_lock_assert_exclusive(object);
6236 return FALSE;
6237}
6238
6239void
6240vm_page_validate_cs_mapped_slow(
6241 vm_page_t page,
6242 const void *kaddr)
6243{
6244 vm_object_t object;
6245 memory_object_offset_t mo_offset;
6246 memory_object_t pager;
6247 struct vnode *vnode;
6248 boolean_t validated;
6249 unsigned tainted;
6250
6251 assert(page->vmp_busy);
6252 object = VM_PAGE_OBJECT(page);
6253 vm_object_lock_assert_exclusive(object);
6254
6255 vm_cs_validates++;
6256
6257 /*
6258 * Since we get here to validate a page that was brought in by
	 * the pager, we know that this pager is all set up and ready
6260 * by now.
6261 */
6262 assert(object->code_signed);
6263 assert(!object->internal);
6264 assert(object->pager != NULL);
6265 assert(object->pager_ready);
6266
6267 pager = object->pager;
6268 assert(object->paging_in_progress);
6269 vnode = vnode_pager_lookup_vnode(pager);
6270 mo_offset = page->vmp_offset + object->paging_offset;
6271
6272 /* verify the SHA1 hash for this page */
6273 tainted = 0;
6274 validated = cs_validate_range(vnode,
6275 pager,
6276 mo_offset,
6277 (const void *)((const char *)kaddr),
6278 PAGE_SIZE_64,
6279 &tainted);
6280
6281 if (tainted & CS_VALIDATE_TAINTED) {
6282 page->vmp_cs_tainted = TRUE;
6283 }
6284 if (tainted & CS_VALIDATE_NX) {
6285 page->vmp_cs_nx = TRUE;
6286 }
6287 if (validated) {
6288 page->vmp_cs_validated = TRUE;
6289 }
6290
6291#if CHECK_CS_VALIDATION_BITMAP
6292 if (page->vmp_cs_validated && !page->vmp_cs_tainted) {
6293 vnode_pager_cs_check_validation_bitmap(object->pager,
6294 mo_offset,
6295 CS_BITMAP_SET);
6296 }
6297#endif /* CHECK_CS_VALIDATION_BITMAP */
6298}
6299
6300void
6301vm_page_validate_cs_mapped(
6302 vm_page_t page,
6303 const void *kaddr)
6304{
6305 if (!vm_page_validate_cs_fast(page)) {
6306 vm_page_validate_cs_mapped_slow(page, kaddr);
6307 }
6308}
6309
6310void
6311vm_page_validate_cs(
6312 vm_page_t page)
6313{
6314 vm_object_t object;
6315 vm_object_offset_t offset;
6316 vm_map_offset_t koffset;
6317 vm_map_size_t ksize;
6318 vm_offset_t kaddr;
6319 kern_return_t kr;
6320 boolean_t busy_page;
6321 boolean_t need_unmap;
6322
6323 object = VM_PAGE_OBJECT(page);
6324 vm_object_lock_assert_held(object);
6325
6326 if (vm_page_validate_cs_fast(page)) {
6327 return;
6328 }
6329 vm_object_lock_assert_exclusive(object);
6330
6331 assert(object->code_signed);
6332 offset = page->vmp_offset;
6333
6334 busy_page = page->vmp_busy;
6335 if (!busy_page) {
6336 /* keep page busy while we map (and unlock) the VM object */
6337 page->vmp_busy = TRUE;
6338 }
6339
6340 /*
6341 * Take a paging reference on the VM object
6342 * to protect it from collapse or bypass,
6343 * and keep it from disappearing too.
6344 */
6345 vm_object_paging_begin(object);
6346
6347 /* map the page in the kernel address space */
6348 ksize = PAGE_SIZE_64;
6349 koffset = 0;
6350 need_unmap = FALSE;
6351 kr = vm_paging_map_object(page,
6352 object,
6353 offset,
6354 VM_PROT_READ,
6355 FALSE, /* can't unlock object ! */
6356 &ksize,
6357 &koffset,
6358 &need_unmap);
6359 if (kr != KERN_SUCCESS) {
6360 panic("%s: could not map page: 0x%x\n", __FUNCTION__, kr);
6361 }
6362 kaddr = CAST_DOWN(vm_offset_t, koffset);
6363
6364 /* validate the mapped page */
6365 vm_page_validate_cs_mapped_slow(page, (const void *) kaddr);
6366
6367 assert(page->vmp_busy);
6368 assert(object == VM_PAGE_OBJECT(page));
6369 vm_object_lock_assert_exclusive(object);
6370
6371 if (!busy_page) {
6372 PAGE_WAKEUP_DONE(page);
6373 }
6374 if (need_unmap) {
		/* unmap the page from the kernel address space */
6376 vm_paging_unmap_object(object, koffset, koffset + ksize);
6377 koffset = 0;
6378 ksize = 0;
6379 kaddr = 0;
6380 }
6381 vm_object_paging_end(object);
6382}
6383
6384void
6385vm_page_validate_cs_mapped_chunk(
6386 vm_page_t page,
6387 const void *kaddr,
6388 vm_offset_t chunk_offset,
6389 vm_size_t chunk_size,
6390 boolean_t *validated_p,
6391 unsigned *tainted_p)
6392{
6393 vm_object_t object;
6394 vm_object_offset_t offset, offset_in_page;
6395 memory_object_t pager;
6396 struct vnode *vnode;
6397 boolean_t validated;
6398 unsigned tainted;
6399
6400 *validated_p = FALSE;
6401 *tainted_p = 0;
6402
6403 assert(page->vmp_busy);
6404 object = VM_PAGE_OBJECT(page);
6405 vm_object_lock_assert_exclusive(object);
6406
6407 assert(object->code_signed);
6408 offset = page->vmp_offset;
6409
6410 if (!object->alive || object->terminating || object->pager == NULL) {
6411 /*
		 * The object is terminating or we don't have its pager,
		 * so we can't validate the data...
6414 */
6415 return;
6416 }
6417 /*
6418 * Since we get here to validate a page that was brought in by
	 * the pager, we know that this pager is all set up and ready
6420 * by now.
6421 */
6422 assert(!object->internal);
6423 assert(object->pager != NULL);
6424 assert(object->pager_ready);
6425
6426 pager = object->pager;
6427 assert(object->paging_in_progress);
6428 vnode = vnode_pager_lookup_vnode(pager);
6429
6430 /* verify the signature for this chunk */
6431 offset_in_page = chunk_offset;
6432 assert(offset_in_page < PAGE_SIZE);
6433
6434 tainted = 0;
6435 validated = cs_validate_range(vnode,
6436 pager,
6437 (object->paging_offset +
6438 offset +
6439 offset_in_page),
6440 (const void *)((const char *)kaddr
6441 + offset_in_page),
6442 chunk_size,
6443 &tainted);
6444 if (validated) {
6445 *validated_p = TRUE;
6446 }
6447 if (tainted) {
6448 *tainted_p = tainted;
6449 }
6450}
6451
static void
vm_rtfrecord_lock(void)
{
	lck_spin_lock(&vm_rtfr_slock);
}

static void
vm_rtfrecord_unlock(void)
{
	lck_spin_unlock(&vm_rtfr_slock);
}

unsigned int
vmrtfaultinfo_bufsz(void)
{
	return (vmrtf_num_records * sizeof(vm_rtfault_record_t));
}
6463
6464#include <kern/backtrace.h>
6465
static void
vm_record_rtfault(thread_t cthread, uint64_t fstart, vm_map_offset_t fault_vaddr, int type_of_fault)
{
6467 uint64_t fend = mach_continuous_time();
6468
6469 uint64_t cfpc = 0;
6470 uint64_t ctid = cthread->thread_id;
6471 uint64_t cupid = get_current_unique_pid();
6472
6473 uintptr_t bpc = 0;
6474 uint32_t bfrs = 0;
6475 bool u64 = false;
6476
6477 /* Capture a single-frame backtrace; this extracts just the program
6478 * counter at the point of the fault into "bpc", and should perform no
6479 * further user stack traversals, thus avoiding copyin()s and further
6480 * faults.
6481 */
6482 int btr = backtrace_thread_user(cthread, &bpc, 1U, &bfrs, &u64);
6483
6484 if ((btr == 0) && (bfrs > 0)) {
6485 cfpc = bpc;
6486 }
6487
6488 assert((fstart != 0) && fend >= fstart);
6489 vm_rtfrecord_lock();
6490 assert(vmrtfrs.vmrtfr_curi <= vmrtfrs.vmrtfr_maxi);
6491
6492 vmrtfrs.vmrtf_total++;
6493 vm_rtfault_record_t *cvmr = &vmrtfrs.vm_rtf_records[vmrtfrs.vmrtfr_curi++];
6494
6495 cvmr->rtfabstime = fstart;
6496 cvmr->rtfduration = fend - fstart;
6497 cvmr->rtfaddr = fault_vaddr;
6498 cvmr->rtfpc = cfpc;
6499 cvmr->rtftype = type_of_fault;
6500 cvmr->rtfupid = cupid;
6501 cvmr->rtftid = ctid;
6502
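	/* wrap the ring-buffer index once it passes the last slot */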
6503 if (vmrtfrs.vmrtfr_curi > vmrtfrs.vmrtfr_maxi) {
6504 vmrtfrs.vmrtfr_curi = 0;
6505 }
6506
6507 vm_rtfrecord_unlock();
6508}
6509
int
vmrtf_extract(uint64_t cupid, __unused boolean_t isroot, int vrecordsz, void *vrecords, int *vmrtfrv)
{
6511 vm_rtfault_record_t *cvmrd = vrecords;
6512 size_t residue = vrecordsz;
6513 int numextracted = 0;
6514 boolean_t early_exit = FALSE;
6515
6516 vm_rtfrecord_lock();
6517
6518 for (int vmfi = 0; vmfi <= vmrtfrs.vmrtfr_maxi; vmfi++) {
6519
6520 if (residue < sizeof(vm_rtfault_record_t)) {
6521 early_exit = TRUE;
6522 break;
6523 }
6524
6525 if (vmrtfrs.vm_rtf_records[vmfi].rtfupid != cupid) {
6526#if DEVELOPMENT || DEBUG
6527 if (isroot == FALSE) {
6528 continue;
6529 }
6530#else
6531 continue;
#endif /* DEVELOPMENT || DEBUG */
6533 }
6534
6535 *cvmrd = vmrtfrs.vm_rtf_records[vmfi];
6536 cvmrd++;
6537 residue -= sizeof(vm_rtfault_record_t);
6538 numextracted++;
6539 }
6540
6541 vm_rtfrecord_unlock();
6542
6543 *vmrtfrv = numextracted;
6544 return (early_exit);
6545}
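
/*
 * Illustrative sketch (not compiled): pairing vmrtfaultinfo_bufsz() with
 * vmrtf_extract() to pull the recorded real-time faults for the current
 * task.  The helper name "example_dump_rtfaults" and the fixed 8-record
 * local buffer are hypothetical; a real consumer would size its buffer
 * from vmrtfaultinfo_bufsz().
 */
#if 0
static void
example_dump_rtfaults(void)
{
	vm_rtfault_record_t	records[8];	/* hypothetical small buffer */
	int			nextracted = 0;
	int			i;

	/*
	 * vmrtfaultinfo_bufsz() returns the size needed to hold every
	 * record, i.e. vmrtf_num_records * sizeof(vm_rtfault_record_t).
	 */
	(void) vmrtf_extract(get_current_unique_pid(), FALSE,
	    (int) sizeof(records), records, &nextracted);

	for (i = 0; i < nextracted; i++) {
		printf("rtfault: addr 0x%llx pc 0x%llx duration %llu type %d\n",
		    (unsigned long long) records[i].rtfaddr,
		    (unsigned long long) records[i].rtfpc,
		    (unsigned long long) records[i].rtfduration,
		    records[i].rtftype);
	}
}
#endif /* illustrative sketch */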
6546