1/*
2 * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: vm/vm_map.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * Virtual memory mapping module.
64 */
65
66#include <mach/vm_types.h>
67#include <mach_assert.h>
68
69#include <vm/vm_options.h>
70
71#include <libkern/OSAtomic.h>
72
73#include <mach/kern_return.h>
74#include <mach/port.h>
75#include <mach/vm_attributes.h>
76#include <mach/vm_param.h>
77#include <mach/vm_behavior.h>
78#include <mach/vm_statistics.h>
79#include <mach/memory_object.h>
80#include <mach/mach_vm.h>
81#include <machine/cpu_capabilities.h>
82#include <mach/sdt.h>
83
84#include <kern/assert.h>
85#include <kern/backtrace.h>
86#include <kern/counter.h>
87#include <kern/exc_guard.h>
88#include <kern/kalloc.h>
89#include <kern/zalloc_internal.h>
90
91#include <vm/cpm.h>
92#include <vm/vm_compressor.h>
93#include <vm/vm_compressor_pager.h>
94#include <vm/vm_init.h>
95#include <vm/vm_fault.h>
96#include <vm/vm_map_internal.h>
97#include <vm/vm_object.h>
98#include <vm/vm_page.h>
99#include <vm/vm_pageout.h>
100#include <vm/pmap.h>
101#include <vm/vm_kern.h>
102#include <ipc/ipc_port.h>
103#include <kern/sched_prim.h>
104#include <kern/misc_protos.h>
105
106#include <mach/vm_map_server.h>
107#include <mach/mach_host_server.h>
108#include <vm/vm_memtag.h>
109#include <vm/vm_protos.h>
110#include <vm/vm_purgeable_internal.h>
111#include <vm/vm_reclaim_internal.h>
112
114#include <vm/vm_shared_region.h>
115#include <vm/vm_map_store.h>
116
117#include <san/kasan.h>
118
119#include <sys/resource.h>
120#include <sys/random.h>
121#include <sys/codesign.h>
122#include <sys/code_signing.h>
123#include <sys/mman.h>
124#include <sys/reboot.h>
125#include <sys/kdebug_triage.h>
126
127#include <libkern/section_keywords.h>
128
129#if DEVELOPMENT || DEBUG
130extern int proc_selfcsflags(void);
131int vm_log_xnu_user_debug = 0;
132int panic_on_unsigned_execute = 0;
133int panic_on_mlock_failure = 0;
134#endif /* DEVELOPMENT || DEBUG */
135
136#if MACH_ASSERT
137int debug4k_filter = 0;
138char debug4k_proc_name[1024] = "";
139int debug4k_proc_filter = (int)-1 & ~(1 << __DEBUG4K_FAULT);
140int debug4k_panic_on_misaligned_sharing = 0;
141const char *debug4k_category_name[] = {
142 "error", /* 0 */
143 "life", /* 1 */
144 "load", /* 2 */
145 "fault", /* 3 */
146 "copy", /* 4 */
147 "share", /* 5 */
148 "adjust", /* 6 */
149 "pmap", /* 7 */
150 "mementry", /* 8 */
151 "iokit", /* 9 */
152 "upl", /* 10 */
153 "exc", /* 11 */
154 "vfs" /* 12 */
155};
156#endif /* MACH_ASSERT */
157int debug4k_no_cow_copyin = 0;
158
159
160#if __arm64__
161extern const int fourk_binary_compatibility_unsafe;
162extern const int fourk_binary_compatibility_allow_wx;
163#endif /* __arm64__ */
164extern void qsort(void *a, size_t n, size_t es, int (*cmp)(const void *, const void *));
165extern int proc_selfpid(void);
166extern char *proc_name_address(void *p);
167extern char *proc_best_name(struct proc *p);
168
169#if VM_MAP_DEBUG_APPLE_PROTECT
170int vm_map_debug_apple_protect = 0;
171#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
172#if VM_MAP_DEBUG_FOURK
173int vm_map_debug_fourk = 0;
174#endif /* VM_MAP_DEBUG_FOURK */
175
176#if DEBUG || DEVELOPMENT
177static TUNABLE(bool, vm_map_executable_immutable,
178 "vm_map_executable_immutable", true);
179#else
180#define vm_map_executable_immutable true
181#endif
182
183os_refgrp_decl(static, map_refgrp, "vm_map", NULL);
184
185extern u_int32_t random(void); /* from <libkern/libkern.h> */
186/* Internal prototypes
187 */
188
189typedef struct vm_map_zap {
190 vm_map_entry_t vmz_head;
191 vm_map_entry_t *vmz_tail;
192} *vm_map_zap_t;
193
194#define VM_MAP_ZAP_DECLARE(zap) \
195 struct vm_map_zap zap = { .vmz_tail = &zap.vmz_head }
196
197static vm_map_entry_t vm_map_entry_insert(
198 vm_map_t map,
199 vm_map_entry_t insp_entry,
200 vm_map_offset_t start,
201 vm_map_offset_t end,
202 vm_object_t object,
203 vm_object_offset_t offset,
204 vm_map_kernel_flags_t vmk_flags,
205 boolean_t needs_copy,
206 vm_prot_t cur_protection,
207 vm_prot_t max_protection,
208 vm_inherit_t inheritance,
209 boolean_t clear_map_aligned);
210
211static void vm_map_simplify_range(
212 vm_map_t map,
213 vm_map_offset_t start,
214 vm_map_offset_t end); /* forward */
215
216static boolean_t vm_map_range_check(
217 vm_map_t map,
218 vm_map_offset_t start,
219 vm_map_offset_t end,
220 vm_map_entry_t *entry);
221
222static void vm_map_submap_pmap_clean(
223 vm_map_t map,
224 vm_map_offset_t start,
225 vm_map_offset_t end,
226 vm_map_t sub_map,
227 vm_map_offset_t offset);
228
229static void vm_map_pmap_enter(
230 vm_map_t map,
231 vm_map_offset_t addr,
232 vm_map_offset_t end_addr,
233 vm_object_t object,
234 vm_object_offset_t offset,
235 vm_prot_t protection);
236
237static void _vm_map_clip_end(
238 struct vm_map_header *map_header,
239 vm_map_entry_t entry,
240 vm_map_offset_t end);
241
242static void _vm_map_clip_start(
243 struct vm_map_header *map_header,
244 vm_map_entry_t entry,
245 vm_map_offset_t start);
246
247static kmem_return_t vm_map_delete(
248 vm_map_t map,
249 vm_map_offset_t start,
250 vm_map_offset_t end,
251 vmr_flags_t flags,
252 kmem_guard_t guard,
253 vm_map_zap_t zap);
254
255static void vm_map_copy_insert(
256 vm_map_t map,
257 vm_map_entry_t after_where,
258 vm_map_copy_t copy);
259
260static kern_return_t vm_map_copy_overwrite_unaligned(
261 vm_map_t dst_map,
262 vm_map_entry_t entry,
263 vm_map_copy_t copy,
264 vm_map_address_t start,
265 boolean_t discard_on_success);
266
267static kern_return_t vm_map_copy_overwrite_aligned(
268 vm_map_t dst_map,
269 vm_map_entry_t tmp_entry,
270 vm_map_copy_t copy,
271 vm_map_offset_t start,
272 pmap_t pmap);
273
274static kern_return_t vm_map_copyin_kernel_buffer(
275 vm_map_t src_map,
276 vm_map_address_t src_addr,
277 vm_map_size_t len,
278 boolean_t src_destroy,
279 vm_map_copy_t *copy_result); /* OUT */
280
281static kern_return_t vm_map_copyout_kernel_buffer(
282 vm_map_t map,
283 vm_map_address_t *addr, /* IN/OUT */
284 vm_map_copy_t copy,
285 vm_map_size_t copy_size,
286 boolean_t overwrite,
287 boolean_t consume_on_success);
288
289static void vm_map_fork_share(
290 vm_map_t old_map,
291 vm_map_entry_t old_entry,
292 vm_map_t new_map);
293
294static boolean_t vm_map_fork_copy(
295 vm_map_t old_map,
296 vm_map_entry_t *old_entry_p,
297 vm_map_t new_map,
298 int vm_map_copyin_flags);
299
300static kern_return_t vm_map_wire_nested(
301 vm_map_t map,
302 vm_map_offset_t start,
303 vm_map_offset_t end,
304 vm_prot_t caller_prot,
305 vm_tag_t tag,
306 boolean_t user_wire,
307 pmap_t map_pmap,
308 vm_map_offset_t pmap_addr,
309 ppnum_t *physpage_p);
310
311static kern_return_t vm_map_unwire_nested(
312 vm_map_t map,
313 vm_map_offset_t start,
314 vm_map_offset_t end,
315 boolean_t user_wire,
316 pmap_t map_pmap,
317 vm_map_offset_t pmap_addr);
318
319static kern_return_t vm_map_overwrite_submap_recurse(
320 vm_map_t dst_map,
321 vm_map_offset_t dst_addr,
322 vm_map_size_t dst_size);
323
324static kern_return_t vm_map_copy_overwrite_nested(
325 vm_map_t dst_map,
326 vm_map_offset_t dst_addr,
327 vm_map_copy_t copy,
328 boolean_t interruptible,
329 pmap_t pmap,
330 boolean_t discard_on_success);
331
332static kern_return_t vm_map_remap_extract(
333 vm_map_t map,
334 vm_map_offset_t addr,
335 vm_map_size_t size,
336 boolean_t copy,
337 vm_map_copy_t map_copy,
338 vm_prot_t *cur_protection,
339 vm_prot_t *max_protection,
340 vm_inherit_t inheritance,
341 vm_map_kernel_flags_t vmk_flags);
342
343static kern_return_t vm_map_remap_range_allocate(
344 vm_map_t map,
345 vm_map_address_t *address,
346 vm_map_size_t size,
347 vm_map_offset_t mask,
348 vm_map_kernel_flags_t vmk_flags,
349 vm_map_entry_t *map_entry,
350 vm_map_zap_t zap_list);
351
352static void vm_map_region_look_for_page(
353 vm_map_t map,
354 vm_map_offset_t va,
355 vm_object_t object,
356 vm_object_offset_t offset,
357 int max_refcnt,
358 unsigned short depth,
359 vm_region_extended_info_t extended,
360 mach_msg_type_number_t count);
361
362static int vm_map_region_count_obj_refs(
363 vm_map_entry_t entry,
364 vm_object_t object);
365
366
367static kern_return_t vm_map_willneed(
368 vm_map_t map,
369 vm_map_offset_t start,
370 vm_map_offset_t end);
371
372static kern_return_t vm_map_reuse_pages(
373 vm_map_t map,
374 vm_map_offset_t start,
375 vm_map_offset_t end);
376
377static kern_return_t vm_map_reusable_pages(
378 vm_map_t map,
379 vm_map_offset_t start,
380 vm_map_offset_t end);
381
382static kern_return_t vm_map_can_reuse(
383 vm_map_t map,
384 vm_map_offset_t start,
385 vm_map_offset_t end);
386
387static kern_return_t vm_map_zero(
388 vm_map_t map,
389 vm_map_offset_t start,
390 vm_map_offset_t end);
391
392static kern_return_t vm_map_random_address_for_size(
393 vm_map_t map,
394 vm_map_offset_t *address,
395 vm_map_size_t size,
396 vm_map_kernel_flags_t vmk_flags);
397
398
399#if CONFIG_MAP_RANGES
400
401static vm_map_range_id_t vm_map_user_range_resolve(
402 vm_map_t map,
403 mach_vm_address_t addr,
404 mach_vm_address_t size,
405 mach_vm_range_t range);
406
407#endif /* CONFIG_MAP_RANGES */
408#if MACH_ASSERT
409static kern_return_t vm_map_pageout(
410 vm_map_t map,
411 vm_map_offset_t start,
412 vm_map_offset_t end);
413#endif /* MACH_ASSERT */
414
415kern_return_t vm_map_corpse_footprint_collect(
416 vm_map_t old_map,
417 vm_map_entry_t old_entry,
418 vm_map_t new_map);
419void vm_map_corpse_footprint_collect_done(
420 vm_map_t new_map);
421void vm_map_corpse_footprint_destroy(
422 vm_map_t map);
423kern_return_t vm_map_corpse_footprint_query_page_info(
424 vm_map_t map,
425 vm_map_offset_t va,
426 int *disposition_p);
427void vm_map_footprint_query_page_info(
428 vm_map_t map,
429 vm_map_entry_t map_entry,
430 vm_map_offset_t curr_s_offset,
431 int *disposition_p);
432
433#if CONFIG_MAP_RANGES
434static void vm_map_range_map_init(void);
435#endif /* CONFIG_MAP_RANGES */
436
437pid_t find_largest_process_vm_map_entries(void);
438
439extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code,
440 mach_exception_data_type_t subcode);
441
442/*
443 * Macros to copy a vm_map_entry. We must be careful to correctly
444 * manage the wired page count. vm_map_entry_copy() creates a new
 * map entry that maps the same memory - the wired count in the new entry
446 * must be set to zero. vm_map_entry_copy_full() creates a new
447 * entry that is identical to the old entry. This preserves the
448 * wire count; it's used for map splitting and zone changing in
449 * vm_map_copyout.
450 */
451
452static inline void
453vm_map_entry_copy_csm_assoc(
454 vm_map_t map __unused,
455 vm_map_entry_t new __unused,
456 vm_map_entry_t old __unused)
457{
458#if CODE_SIGNING_MONITOR
459 /* when code signing monitor is enabled, we want to reset on copy */
460 new->csm_associated = FALSE;
461#else
462 /* when code signing monitor is not enabled, assert as a sanity check */
463 assert(new->csm_associated == FALSE);
464#endif
465#if DEVELOPMENT || DEBUG
466 if (new->vme_xnu_user_debug && vm_log_xnu_user_debug) {
467 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] resetting vme_xnu_user_debug\n",
468 proc_selfpid(),
469 (get_bsdtask_info(current_task())
470 ? proc_name_address(get_bsdtask_info(current_task()))
471 : "?"),
472 __FUNCTION__, __LINE__,
473 map, new, new->vme_start, new->vme_end);
474 }
475#endif /* DEVELOPMENT || DEBUG */
476 new->vme_xnu_user_debug = FALSE;
477}
478
479/*
480 * The "used_for_jit" flag was copied from OLD to NEW in vm_map_entry_copy().
481 * But for security reasons on some platforms, we don't want the
482 * new mapping to be "used for jit", so we reset the flag here.
483 */
484static inline void
485vm_map_entry_copy_code_signing(
486 vm_map_t map,
487 vm_map_entry_t new,
488 vm_map_entry_t old __unused)
489{
490 if (VM_MAP_POLICY_ALLOW_JIT_COPY(map)) {
491 assert(new->used_for_jit == old->used_for_jit);
492 } else {
493 if (old->used_for_jit) {
494 DTRACE_VM3(cs_wx,
495 uint64_t, new->vme_start,
496 uint64_t, new->vme_end,
497 vm_prot_t, new->protection);
            printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
                proc_selfpid(),
                (get_bsdtask_info(current_task())
                ? proc_name_address(get_bsdtask_info(current_task()))
                : "?"),
                __FUNCTION__,
                "removing execute access");
505 new->protection &= ~VM_PROT_EXECUTE;
506 new->max_protection &= ~VM_PROT_EXECUTE;
507 }
508 new->used_for_jit = FALSE;
509 }
510}
511
512static inline void
513vm_map_entry_copy_full(
514 vm_map_entry_t new,
515 vm_map_entry_t old)
516{
517#if MAP_ENTRY_CREATION_DEBUG
518 btref_put(new->vme_creation_bt);
519 btref_retain(old->vme_creation_bt);
520#endif
521#if MAP_ENTRY_INSERTION_DEBUG
522 btref_put(new->vme_insertion_bt);
523 btref_retain(old->vme_insertion_bt);
524#endif
525#if VM_BTLOG_TAGS
526 /* Discard the btref that might be in the new entry */
527 if (new->vme_kernel_object) {
528 btref_put(new->vme_tag_btref);
529 }
530 /* Retain the btref in the old entry to account for its copy */
531 if (old->vme_kernel_object) {
532 btref_retain(old->vme_tag_btref);
533 }
534#endif /* VM_BTLOG_TAGS */
535 *new = *old;
536}
537
538static inline void
539vm_map_entry_copy(
540 vm_map_t map,
541 vm_map_entry_t new,
542 vm_map_entry_t old)
543{
544 vm_map_entry_copy_full(new, old);
545
546 new->is_shared = FALSE;
547 new->needs_wakeup = FALSE;
548 new->in_transition = FALSE;
549 new->wired_count = 0;
550 new->user_wired_count = 0;
551 new->vme_permanent = FALSE;
552 vm_map_entry_copy_code_signing(map, new, old);
553 vm_map_entry_copy_csm_assoc(map, new, old);
554 if (new->iokit_acct) {
555 assertf(!new->use_pmap, "old %p new %p\n", old, new);
556 new->iokit_acct = FALSE;
557 new->use_pmap = TRUE;
558 }
559 new->vme_resilient_codesign = FALSE;
560 new->vme_resilient_media = FALSE;
561 new->vme_atomic = FALSE;
562 new->vme_no_copy_on_read = FALSE;
563}
564
565/*
566 * Normal lock_read_to_write() returns FALSE/0 on failure.
 * These functions evaluate to zero on success and to a non-zero value on failure.
568 */
569__attribute__((always_inline))
570int
571vm_map_lock_read_to_write(vm_map_t map)
572{
    if (lck_rw_lock_shared_to_exclusive(&(map)->lock)) {
574 DTRACE_VM(vm_map_lock_upgrade);
575 return 0;
576 }
577 return 1;
578}
579
580__attribute__((always_inline))
581boolean_t
582vm_map_try_lock(vm_map_t map)
583{
    if (lck_rw_try_lock_exclusive(&(map)->lock)) {
585 DTRACE_VM(vm_map_lock_w);
586 return TRUE;
587 }
588 return FALSE;
589}
590
591__attribute__((always_inline))
592boolean_t
593vm_map_try_lock_read(vm_map_t map)
594{
    if (lck_rw_try_lock_shared(&(map)->lock)) {
596 DTRACE_VM(vm_map_lock_r);
597 return TRUE;
598 }
599 return FALSE;
600}
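/*
 * Illustrative usage sketch (not part of the build): because a failed
 * upgrade has already dropped the shared lock (see
 * lck_rw_lock_shared_to_exclusive()), a caller escalating from a read
 * lock typically re-takes the lock and restarts its lookup on failure;
 * "restart" is a hypothetical label in that caller:
 *
 *	vm_map_lock_read(map);
 *	...
 *	if (vm_map_lock_read_to_write(map)) {
 *		vm_map_lock_read(map);	(upgrade failed, lock was dropped)
 *		goto restart;
 *	}
 *	... modify entries under the exclusive lock ...
 *	vm_map_unlock(map);
 */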
601
602/*!
603 * @function kdp_vm_map_is_acquired_exclusive
604 *
605 * @abstract
 * Checks whether the vm map lock is held exclusively.
607 *
608 * @discussion
609 * NOT SAFE: To be used only by kernel debugger.
610 *
611 * @param map map to check
612 *
613 * @returns TRUE if the map is acquired exclusively.
614 */
615boolean_t
616kdp_vm_map_is_acquired_exclusive(vm_map_t map)
617{
    return kdp_lck_rw_lock_is_acquired_exclusive(&map->lock);
619}
620
621/*
622 * Routines to get the page size the caller should
623 * use while inspecting the target address space.
624 * Use the "_safely" variant if the caller is dealing with a user-provided
625 * array whose size depends on the page size, to avoid any overflow or
626 * underflow of a user-allocated buffer.
627 */
628int
629vm_self_region_page_shift_safely(
630 vm_map_t target_map)
631{
632 int effective_page_shift = 0;
633
634 if (PAGE_SIZE == (4096)) {
635 /* x86_64 and 4k watches: always use 4k */
636 return PAGE_SHIFT;
637 }
638 /* did caller provide an explicit page size for this thread to use? */
639 effective_page_shift = thread_self_region_page_shift();
640 if (effective_page_shift) {
641 /* use the explicitly-provided page size */
642 return effective_page_shift;
643 }
644 /* no explicit page size: use the caller's page size... */
645 effective_page_shift = VM_MAP_PAGE_SHIFT(current_map());
    if (effective_page_shift == VM_MAP_PAGE_SHIFT(target_map)) {
647 /* page size match: safe to use */
648 return effective_page_shift;
649 }
650 /* page size mismatch */
651 return -1;
652}
653int
654vm_self_region_page_shift(
655 vm_map_t target_map)
656{
657 int effective_page_shift;
658
659 effective_page_shift = vm_self_region_page_shift_safely(target_map);
660 if (effective_page_shift == -1) {
661 /* no safe value but OK to guess for caller */
662 effective_page_shift = MIN(VM_MAP_PAGE_SHIFT(current_map()),
663 VM_MAP_PAGE_SHIFT(target_map));
664 }
665 return effective_page_shift;
666}
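/*
 * Illustrative sketch (hypothetical caller): a routine that fills a
 * user-allocated, per-page array for [start, end) in "target_map" would
 * size its work with the "_safely" variant and bail out on a mismatch
 * rather than risk over- or under-running the user's buffer:
 *
 *	int shift = vm_self_region_page_shift_safely(target_map);
 *
 *	if (shift == -1) {
 *		return KERN_INVALID_ARGUMENT;
 *	}
 *	num_pages = (end - start) >> shift;
 *	... copy out exactly "num_pages" elements, never more ...
 */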
667
668
669/*
670 * Decide if we want to allow processes to execute from their data or stack areas.
671 * override_nx() returns true if we do. Data/stack execution can be enabled independently
672 * for 32 and 64 bit processes. Set the VM_ABI_32 or VM_ABI_64 flags in allow_data_exec
673 * or allow_stack_exec to enable data execution for that type of data area for that particular
674 * ABI (or both by or'ing the flags together). These are initialized in the architecture
675 * specific pmap files since the default behavior varies according to architecture. The
676 * main reason it varies is because of the need to provide binary compatibility with old
677 * applications that were written before these restrictions came into being. In the old
678 * days, an app could execute anything it could read, but this has slowly been tightened
679 * up over time. The default behavior is:
680 *
681 * 32-bit PPC apps may execute from both stack and data areas
 * 32-bit Intel apps may execute from data areas but not stack
683 * 64-bit PPC/Intel apps may not execute from either data or stack
684 *
685 * An application on any architecture may override these defaults by explicitly
686 * adding PROT_EXEC permission to the page in question with the mprotect(2)
687 * system call. This code here just determines what happens when an app tries to
688 * execute from a page that lacks execute permission.
689 *
690 * Note that allow_data_exec or allow_stack_exec may also be modified by sysctl to change the
691 * default behavior for both 32 and 64 bit apps on a system-wide basis. Furthermore,
692 * a Mach-O header flag bit (MH_NO_HEAP_EXECUTION) can be used to forcibly disallow
693 * execution from data areas for a particular binary even if the arch normally permits it. As
694 * a final wrinkle, a posix_spawn attribute flag can be used to negate this opt-in header bit
695 * to support some complicated use cases, notably browsers with out-of-process plugins that
696 * are not all NX-safe.
697 */
698
699extern int allow_data_exec, allow_stack_exec;
700
701int
702override_nx(vm_map_t map, uint32_t user_tag) /* map unused on arm */
703{
704 int current_abi;
705
706 if (map->pmap == kernel_pmap) {
707 return FALSE;
708 }
709
710 /*
711 * Determine if the app is running in 32 or 64 bit mode.
712 */
713
714 if (vm_map_is_64bit(map)) {
715 current_abi = VM_ABI_64;
716 } else {
717 current_abi = VM_ABI_32;
718 }
719
720 /*
721 * Determine if we should allow the execution based on whether it's a
722 * stack or data area and the current architecture.
723 */
724
725 if (user_tag == VM_MEMORY_STACK) {
726 return allow_stack_exec & current_abi;
727 }
728
729 return (allow_data_exec & current_abi) && (map->map_disallow_data_exec == FALSE);
730}
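/*
 * Worked example of the check above (values are illustrative): if
 * allow_data_exec is VM_ABI_32 (the historical 32-bit Intel default
 * described above), then for a fault in a data area:
 *
 *	32-bit task:  allow_data_exec & VM_ABI_32  != 0   -> override NX
 *	64-bit task:  allow_data_exec & VM_ABI_64  == 0   -> normal fault
 *
 * allow_stack_exec is consulted the same way for stack mappings, and even
 * a successful mask check is vetoed when the map was marked with
 * map_disallow_data_exec (the MH_NO_HEAP_EXECUTION case).
 */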
731
732
733/*
734 * Virtual memory maps provide for the mapping, protection,
735 * and sharing of virtual memory objects. In addition,
736 * this module provides for an efficient virtual copy of
737 * memory from one map to another.
738 *
739 * Synchronization is required prior to most operations.
740 *
741 * Maps consist of an ordered doubly-linked list of simple
742 * entries; a single hint is used to speed up lookups.
743 *
744 * Sharing maps have been deleted from this version of Mach.
745 * All shared objects are now mapped directly into the respective
746 * maps. This requires a change in the copy on write strategy;
747 * the asymmetric (delayed) strategy is used for shared temporary
748 * objects instead of the symmetric (shadow) strategy. All maps
749 * are now "top level" maps (either task map, kernel map or submap
750 * of the kernel map).
751 *
 * Since portions of maps are specified by start/end addresses,
753 * which may not align with existing map entries, all
754 * routines merely "clip" entries to these start/end values.
755 * [That is, an entry is split into two, bordering at a
756 * start or end value.] Note that these clippings may not
757 * always be necessary (as the two resulting entries are then
758 * not changed); however, the clipping is done for convenience.
759 * No attempt is currently made to "glue back together" two
760 * abutting entries.
761 *
762 * The symmetric (shadow) copy strategy implements virtual copy
763 * by copying VM object references from one map to
764 * another, and then marking both regions as copy-on-write.
765 * It is important to note that only one writeable reference
766 * to a VM object region exists in any map when this strategy
767 * is used -- this means that shadow object creation can be
 * delayed until a write operation occurs. The asymmetric (delayed)
769 * strategy allows multiple maps to have writeable references to
770 * the same region of a vm object, and hence cannot delay creating
771 * its copy objects. See vm_object_copy_quickly() in vm_object.c.
772 * Copying of permanent objects is completely different; see
773 * vm_object_copy_strategically() in vm_object.c.
774 */
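/*
 * Illustrative sketch of the clipping described above (addresses are
 * hypothetical): an operation on [0x3000, 0x5000) applied to a map whose
 * single entry covers [0x1000, 0x9000) first clips that entry into
 *
 *	[0x1000, 0x3000)  [0x3000, 0x5000)  [0x5000, 0x9000)
 *
 * and then acts only on the middle entry.  The two outer entries are
 * left untouched and are not glued back together afterwards.
 */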
775
776ZONE_DECLARE_ID(ZONE_ID_VM_MAP_COPY, struct vm_map_copy);
777
778#define VM_MAP_ZONE_NAME "maps"
779#define VM_MAP_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
780
781#define VM_MAP_ENTRY_ZONE_NAME "VM map entries"
782#define VM_MAP_ENTRY_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
783
784#define VM_MAP_HOLES_ZONE_NAME "VM map holes"
785#define VM_MAP_HOLES_ZFLAGS (ZC_NOENCRYPT | ZC_VM)
786
787/*
788 * Asserts that a vm_map_copy object is coming from the
789 * vm_map_copy_zone to ensure that it isn't a fake constructed
790 * anywhere else.
791 */
792void
793vm_map_copy_require(struct vm_map_copy *copy)
794{
    zone_id_require(ZONE_ID_VM_MAP_COPY, sizeof(struct vm_map_copy), copy);
796}
797
798/*
799 * vm_map_require:
800 *
801 * Ensures that the argument is memory allocated from the genuine
802 * vm map zone. (See zone_id_require_allow_foreign).
803 */
804void
805vm_map_require(vm_map_t map)
806{
    zone_id_require(ZONE_ID_VM_MAP, sizeof(struct _vm_map), map);
808}
809
810#define VM_MAP_EARLY_COUNT_MAX 16
811static __startup_data vm_offset_t map_data;
812static __startup_data vm_size_t map_data_size;
813static __startup_data vm_offset_t kentry_data;
814static __startup_data vm_size_t kentry_data_size;
815static __startup_data vm_offset_t map_holes_data;
816static __startup_data vm_size_t map_holes_data_size;
817static __startup_data vm_map_t *early_map_owners[VM_MAP_EARLY_COUNT_MAX];
818static __startup_data uint32_t early_map_count;
819
820#if XNU_TARGET_OS_OSX
821#define NO_COALESCE_LIMIT ((1024 * 128) - 1)
822#else /* XNU_TARGET_OS_OSX */
823#define NO_COALESCE_LIMIT 0
824#endif /* XNU_TARGET_OS_OSX */
825
826/* Skip acquiring locks if we're in the midst of a kernel core dump */
827unsigned int not_in_kdp = 1;
828
829unsigned int vm_map_set_cache_attr_count = 0;
830
831kern_return_t
832vm_map_set_cache_attr(
833 vm_map_t map,
834 vm_map_offset_t va)
835{
836 vm_map_entry_t map_entry;
837 vm_object_t object;
838 kern_return_t kr = KERN_SUCCESS;
839
840 vm_map_lock_read(map);
841
    if (!vm_map_lookup_entry(map, va, &map_entry) ||
843 map_entry->is_sub_map) {
844 /*
845 * that memory is not properly mapped
846 */
847 kr = KERN_INVALID_ARGUMENT;
848 goto done;
849 }
850 object = VME_OBJECT(map_entry);
851
852 if (object == VM_OBJECT_NULL) {
853 /*
854 * there should be a VM object here at this point
855 */
856 kr = KERN_INVALID_ARGUMENT;
857 goto done;
858 }
859 vm_object_lock(object);
860 object->set_cache_attr = TRUE;
861 vm_object_unlock(object);
862
863 vm_map_set_cache_attr_count++;
864done:
865 vm_map_unlock_read(map);
866
867 return kr;
868}
869
870
871#if CONFIG_CODE_DECRYPTION
872/*
873 * vm_map_apple_protected:
874 * This remaps the requested part of the object with an object backed by
875 * the decrypting pager.
876 * crypt_info contains entry points and session data for the crypt module.
877 * The crypt_info block will be copied by vm_map_apple_protected. The data structures
878 * referenced in crypt_info must remain valid until crypt_info->crypt_end() is called.
879 */
880kern_return_t
881vm_map_apple_protected(
882 vm_map_t map,
883 vm_map_offset_t start,
884 vm_map_offset_t end,
885 vm_object_offset_t crypto_backing_offset,
886 struct pager_crypt_info *crypt_info,
887 uint32_t cryptid)
888{
889 boolean_t map_locked;
890 kern_return_t kr;
891 vm_map_entry_t map_entry;
892 struct vm_map_entry tmp_entry;
893 memory_object_t unprotected_mem_obj;
894 vm_object_t protected_object;
895 vm_map_offset_t map_addr;
896 vm_map_offset_t start_aligned, end_aligned;
897 vm_object_offset_t crypto_start, crypto_end;
898 boolean_t cache_pager;
899
900 map_locked = FALSE;
901 unprotected_mem_obj = MEMORY_OBJECT_NULL;
902
903 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
904 return KERN_INVALID_ADDRESS;
905 }
906 start_aligned = vm_map_trunc_page(start, PAGE_MASK_64);
907 end_aligned = vm_map_round_page(end, PAGE_MASK_64);
908 start_aligned = vm_map_trunc_page(start_aligned, VM_MAP_PAGE_MASK(map));
909 end_aligned = vm_map_round_page(end_aligned, VM_MAP_PAGE_MASK(map));
910
911#if __arm64__
912 /*
913 * "start" and "end" might be 4K-aligned but not 16K-aligned,
914 * so we might have to loop and establish up to 3 mappings:
915 *
916 * + the first 16K-page, which might overlap with the previous
917 * 4K-aligned mapping,
918 * + the center,
919 * + the last 16K-page, which might overlap with the next
920 * 4K-aligned mapping.
921 * Each of these mapping might be backed by a vnode pager (if
922 * properly page-aligned) or a "fourk_pager", itself backed by a
923 * vnode pager (if 4K-aligned but not page-aligned).
924 */
925#endif /* __arm64__ */
926
927 map_addr = start_aligned;
928 for (map_addr = start_aligned;
929 map_addr < end;
930 map_addr = tmp_entry.vme_end) {
931 vm_map_lock(map);
932 map_locked = TRUE;
933
934 /* lookup the protected VM object */
        if (!vm_map_lookup_entry(map,
            map_addr,
            &map_entry) ||
938 map_entry->is_sub_map ||
939 VME_OBJECT(map_entry) == VM_OBJECT_NULL) {
940 /* that memory is not properly mapped */
941 kr = KERN_INVALID_ARGUMENT;
942 goto done;
943 }
944
        /* ensure mapped memory is mapped as executable,
         * except for the model decryption flow */
947 if ((cryptid != CRYPTID_MODEL_ENCRYPTION) &&
948 !(map_entry->protection & VM_PROT_EXECUTE)) {
949 kr = KERN_INVALID_ARGUMENT;
950 goto done;
951 }
952
953 /* get the protected object to be decrypted */
954 protected_object = VME_OBJECT(map_entry);
955 if (protected_object == VM_OBJECT_NULL) {
956 /* there should be a VM object here at this point */
957 kr = KERN_INVALID_ARGUMENT;
958 goto done;
959 }
960 /* ensure protected object stays alive while map is unlocked */
961 vm_object_reference(protected_object);
962
963 /* limit the map entry to the area we want to cover */
        vm_map_clip_start(map, map_entry, start_aligned);
        vm_map_clip_end(map, map_entry, end_aligned);
966
967 tmp_entry = *map_entry;
968 map_entry = VM_MAP_ENTRY_NULL; /* not valid after unlocking map */
969 vm_map_unlock(map);
970 map_locked = FALSE;
971
972 /*
973 * This map entry might be only partially encrypted
974 * (if not fully "page-aligned").
975 */
976 crypto_start = 0;
977 crypto_end = tmp_entry.vme_end - tmp_entry.vme_start;
978 if (tmp_entry.vme_start < start) {
979 if (tmp_entry.vme_start != start_aligned) {
980 kr = KERN_INVALID_ADDRESS;
                vm_object_deallocate(protected_object);
982 goto done;
983 }
984 crypto_start += (start - tmp_entry.vme_start);
985 }
986 if (tmp_entry.vme_end > end) {
987 if (tmp_entry.vme_end != end_aligned) {
988 kr = KERN_INVALID_ADDRESS;
                vm_object_deallocate(protected_object);
990 goto done;
991 }
992 crypto_end -= (tmp_entry.vme_end - end);
993 }
994
995 /*
996 * This "extra backing offset" is needed to get the decryption
997 * routine to use the right key. It adjusts for the possibly
998 * relative offset of an interposed "4K" pager...
999 */
1000 if (crypto_backing_offset == (vm_object_offset_t) -1) {
            crypto_backing_offset = VME_OFFSET(&tmp_entry);
1002 }
1003
1004 cache_pager = TRUE;
1005#if XNU_TARGET_OS_OSX
1006 if (vm_map_is_alien(map)) {
1007 cache_pager = FALSE;
1008 }
1009#endif /* XNU_TARGET_OS_OSX */
1010
1011 /*
1012 * Lookup (and create if necessary) the protected memory object
1013 * matching that VM object.
1014 * If successful, this also grabs a reference on the memory object,
1015 * to guarantee that it doesn't go away before we get a chance to map
1016 * it.
1017 */
        unprotected_mem_obj = apple_protect_pager_setup(
            protected_object,
            VME_OFFSET(&tmp_entry),
            crypto_backing_offset,
            crypt_info,
            crypto_start,
            crypto_end,
            cache_pager);
1026
1027 /* release extra ref on protected object */
        vm_object_deallocate(protected_object);
1029
1030 if (unprotected_mem_obj == NULL) {
1031 kr = KERN_FAILURE;
1032 goto done;
1033 }
1034
1035 /* can overwrite an immutable mapping */
1036 vm_map_kernel_flags_t vmk_flags = {
1037 .vmf_fixed = true,
1038 .vmf_overwrite = true,
1039 .vmkf_overwrite_immutable = true,
1040 };
1041#if __arm64__
1042 if (tmp_entry.used_for_jit &&
1043 (VM_MAP_PAGE_SHIFT(map) != FOURK_PAGE_SHIFT ||
1044 PAGE_SHIFT != FOURK_PAGE_SHIFT) &&
1045 fourk_binary_compatibility_unsafe &&
1046 fourk_binary_compatibility_allow_wx) {
            printf("** FOURK_COMPAT [%d]: "
1048 "allowing write+execute at 0x%llx\n",
1049 proc_selfpid(), tmp_entry.vme_start);
1050 vmk_flags.vmkf_map_jit = TRUE;
1051 }
1052#endif /* __arm64__ */
1053
1054 /* map this memory object in place of the current one */
1055 map_addr = tmp_entry.vme_start;
        kr = vm_map_enter_mem_object(map,
            &map_addr,
            (tmp_entry.vme_end -
            tmp_entry.vme_start),
            (mach_vm_offset_t) 0,
            vmk_flags,
            (ipc_port_t)(uintptr_t) unprotected_mem_obj,
            0,
            TRUE,
            tmp_entry.protection,
            tmp_entry.max_protection,
            tmp_entry.inheritance);
1068 assertf(kr == KERN_SUCCESS,
1069 "kr = 0x%x\n", kr);
1070 assertf(map_addr == tmp_entry.vme_start,
1071 "map_addr=0x%llx vme_start=0x%llx tmp_entry=%p\n",
1072 (uint64_t)map_addr,
1073 (uint64_t) tmp_entry.vme_start,
1074 &tmp_entry);
1075
1076#if VM_MAP_DEBUG_APPLE_PROTECT
1077 if (vm_map_debug_apple_protect) {
1078 printf("APPLE_PROTECT: map %p [0x%llx:0x%llx] pager %p:"
1079 " backing:[object:%p,offset:0x%llx,"
1080 "crypto_backing_offset:0x%llx,"
1081 "crypto_start:0x%llx,crypto_end:0x%llx]\n",
1082 map,
1083 (uint64_t) map_addr,
1084 (uint64_t) (map_addr + (tmp_entry.vme_end -
1085 tmp_entry.vme_start)),
1086 unprotected_mem_obj,
1087 protected_object,
1088 VME_OFFSET(&tmp_entry),
1089 crypto_backing_offset,
1090 crypto_start,
1091 crypto_end);
1092 }
1093#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1094
1095 /*
1096 * Release the reference obtained by
1097 * apple_protect_pager_setup().
1098 * The mapping (if it succeeded) is now holding a reference on
1099 * the memory object.
1100 */
        memory_object_deallocate(unprotected_mem_obj);
1102 unprotected_mem_obj = MEMORY_OBJECT_NULL;
1103
1104 /* continue with next map entry */
1105 crypto_backing_offset += (tmp_entry.vme_end -
1106 tmp_entry.vme_start);
1107 crypto_backing_offset -= crypto_start;
1108 }
1109 kr = KERN_SUCCESS;
1110
1111done:
1112 if (map_locked) {
1113 vm_map_unlock(map);
1114 }
1115 return kr;
1116}
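/*
 * Illustrative caller sketch (hypothetical values): the crypt_info block
 * itself is copied by vm_map_apple_protected(), but the session it points
 * to must outlive the mapping, as noted in the comment above:
 *
 *	struct pager_crypt_info crypt_info = { ... set up by the crypt module ... };
 *
 *	kr = vm_map_apple_protected(map, start, end,
 *	    (vm_object_offset_t)-1,	(use the mapping's own offset)
 *	    &crypt_info, cryptid);
 *
 * Everything referenced by crypt_info must stay valid until the pager
 * invokes the crypt_end() callback it was given.
 */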
1117#endif /* CONFIG_CODE_DECRYPTION */
1118
1119
1120LCK_GRP_DECLARE(vm_map_lck_grp, "vm_map");
1121LCK_ATTR_DECLARE(vm_map_lck_attr, 0, 0);
1122LCK_ATTR_DECLARE(vm_map_lck_rw_attr, 0, LCK_ATTR_DEBUG);
1123
1124#if XNU_TARGET_OS_OSX
1125#define MALLOC_NO_COW_DEFAULT 1
1126#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 1
1127#else /* XNU_TARGET_OS_OSX */
1128#define MALLOC_NO_COW_DEFAULT 1
1129#define MALLOC_NO_COW_EXCEPT_FORK_DEFAULT 0
1130#endif /* XNU_TARGET_OS_OSX */
1131TUNABLE(int, malloc_no_cow, "malloc_no_cow", MALLOC_NO_COW_DEFAULT);
1132TUNABLE(int, malloc_no_cow_except_fork, "malloc_no_cow_except_fork", MALLOC_NO_COW_EXCEPT_FORK_DEFAULT);
1133uint64_t vm_memory_malloc_no_cow_mask = 0ULL;
1134#if DEBUG
1135int vm_check_map_sanity = 0;
1136#endif
1137
1138/*
1139 * vm_map_init:
1140 *
1141 * Initialize the vm_map module. Must be called before
1142 * any other vm_map routines.
1143 *
1144 * Map and entry structures are allocated from zones -- we must
1145 * initialize those zones.
1146 *
1147 * There are three zones of interest:
1148 *
1149 * vm_map_zone: used to allocate maps.
1150 * vm_map_entry_zone: used to allocate map entries.
1151 *
1152 * LP32:
1153 * vm_map_entry_reserved_zone: fallback zone for kernel map entries
1154 *
1155 * The kernel allocates map entries from a special zone that is initially
1156 * "crammed" with memory. It would be difficult (perhaps impossible) for
 * the kernel to allocate more memory to an entry zone when it became
1158 * empty since the very act of allocating memory implies the creation
1159 * of a new entry.
1160 */
1161__startup_func
1162void
1163vm_map_init(void)
1164{
1165
1166#if MACH_ASSERT
1167 PE_parse_boot_argn("debug4k_filter", &debug4k_filter,
1168 sizeof(debug4k_filter));
1169#endif /* MACH_ASSERT */
1170
    zone_create_ext(VM_MAP_ZONE_NAME, sizeof(struct _vm_map),
        VM_MAP_ZFLAGS, ZONE_ID_VM_MAP, NULL);
1173
1174 /*
1175 * Don't quarantine because we always need elements available
1176 * Disallow GC on this zone... to aid the GC.
1177 */
    zone_create_ext(VM_MAP_ENTRY_ZONE_NAME,
        sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
        ZONE_ID_VM_MAP_ENTRY, ^(zone_t z) {
        z->z_elems_rsv = (uint16_t)(32 *
            (ml_early_cpu_max_number() + 1));
    });

    zone_create_ext(VM_MAP_HOLES_ZONE_NAME,
        sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
        ZONE_ID_VM_MAP_HOLES, ^(zone_t z) {
        z->z_elems_rsv = (uint16_t)(16 * 1024 / zone_elem_outer_size(z));
    });

    zone_create_ext("VM map copies", sizeof(struct vm_map_copy),
        ZC_NOENCRYPT, ZONE_ID_VM_MAP_COPY, NULL);
1193
1194 /*
1195 * Add the stolen memory to zones, adjust zone size and stolen counts.
1196 */
    zone_cram_early(vm_map_zone, map_data, map_data_size);
    zone_cram_early(vm_map_entry_zone, kentry_data, kentry_data_size);
    zone_cram_early(vm_map_holes_zone, map_holes_data, map_holes_data_size);
    printf("VM bootstrap: %d maps, %d entries and %d holes available\n",
        zone_count_free(vm_map_zone),
        zone_count_free(vm_map_entry_zone),
        zone_count_free(vm_map_holes_zone));
1204
1205 /*
1206 * Since these are covered by zones, remove them from stolen page accounting.
1207 */
1208 VM_PAGE_MOVE_STOLEN(atop_64(map_data_size) + atop_64(kentry_data_size) + atop_64(map_holes_data_size));
1209
1210#if VM_MAP_DEBUG_APPLE_PROTECT
1211 PE_parse_boot_argn("vm_map_debug_apple_protect",
1212 &vm_map_debug_apple_protect,
1213 sizeof(vm_map_debug_apple_protect));
1214#endif /* VM_MAP_DEBUG_APPLE_PROTECT */
1215#if VM_MAP_DEBUG_APPLE_FOURK
1216 PE_parse_boot_argn("vm_map_debug_fourk",
1217 &vm_map_debug_fourk,
1218 sizeof(vm_map_debug_fourk));
1219#endif /* VM_MAP_DEBUG_FOURK */
1220
1221 if (malloc_no_cow) {
1222 vm_memory_malloc_no_cow_mask = 0ULL;
1223 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC;
1224 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_SMALL;
1225 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_MEDIUM;
1226#if XNU_TARGET_OS_OSX
1227 /*
1228 * On macOS, keep copy-on-write for MALLOC_LARGE because
1229 * realloc() may use vm_copy() to transfer the old contents
1230 * to the new location.
1231 */
1232#else /* XNU_TARGET_OS_OSX */
1233 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE;
1234 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSABLE;
1235 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_LARGE_REUSED;
1236#endif /* XNU_TARGET_OS_OSX */
1237// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_HUGE;
1238// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_REALLOC;
1239 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_TINY;
1240 vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_MALLOC_NANO;
1241// vm_memory_malloc_no_cow_mask |= 1ULL << VM_MEMORY_TCMALLOC;
        PE_parse_boot_argn("vm_memory_malloc_no_cow_mask",
            &vm_memory_malloc_no_cow_mask,
            sizeof(vm_memory_malloc_no_cow_mask));
1245 }
1246
1247#if CONFIG_MAP_RANGES
1248 vm_map_range_map_init();
1249#endif /* CONFIG_MAP_RANGES */
1250
1251#if DEBUG
1252 PE_parse_boot_argn("vm_check_map_sanity", &vm_check_map_sanity, sizeof(vm_check_map_sanity));
1253 if (vm_check_map_sanity) {
1254 kprintf("VM sanity checking enabled\n");
1255 } else {
1256 kprintf("VM sanity checking disabled. Set bootarg vm_check_map_sanity=1 to enable\n");
1257 }
1258#endif /* DEBUG */
1259
1260#if DEVELOPMENT || DEBUG
1261 PE_parse_boot_argn("panic_on_unsigned_execute",
1262 &panic_on_unsigned_execute,
1263 sizeof(panic_on_unsigned_execute));
1264 PE_parse_boot_argn("panic_on_mlock_failure",
1265 &panic_on_mlock_failure,
1266 sizeof(panic_on_mlock_failure));
1267#endif /* DEVELOPMENT || DEBUG */
1268}
1269
1270__startup_func
1271static void
1272vm_map_steal_memory(void)
1273{
1274 /*
     * We need to reserve enough memory to support bootstrapping VM maps
1276 * and the zone subsystem.
1277 *
1278 * The VM Maps that need to function before zones can support them
1279 * are the ones registered with vm_map_will_allocate_early_map(),
1280 * which are:
1281 * - the kernel map
1282 * - the various submaps used by zones (pgz, meta, ...)
1283 *
1284 * We also need enough entries and holes to support them
1285 * until zone_metadata_init() is called, which is when
1286 * the zone allocator becomes capable of expanding dynamically.
1287 *
1288 * We need:
1289 * - VM_MAP_EARLY_COUNT_MAX worth of VM Maps.
1290 * - To allow for 3-4 entries per map, but the kernel map
1291 * needs a multiple of VM_MAP_EARLY_COUNT_MAX entries
1292 * to describe the submaps, so double it (and make it 8x too)
1293 * - To allow for holes between entries,
1294 * hence needs the same budget as entries
1295 */
    map_data_size = zone_get_early_alloc_size(VM_MAP_ZONE_NAME,
        sizeof(struct _vm_map), VM_MAP_ZFLAGS,
        VM_MAP_EARLY_COUNT_MAX);

    kentry_data_size = zone_get_early_alloc_size(VM_MAP_ENTRY_ZONE_NAME,
        sizeof(struct vm_map_entry), VM_MAP_ENTRY_ZFLAGS,
        8 * VM_MAP_EARLY_COUNT_MAX);

    map_holes_data_size = zone_get_early_alloc_size(VM_MAP_HOLES_ZONE_NAME,
        sizeof(struct vm_map_links), VM_MAP_HOLES_ZFLAGS,
        8 * VM_MAP_EARLY_COUNT_MAX);
1307
1308 /*
1309 * Steal a contiguous range of memory so that a simple range check
1310 * can validate early addresses being freed/crammed to these
1311 * zones
1312 */
    map_data = zone_early_mem_init(map_data_size + kentry_data_size +
        map_holes_data_size);
1315 kentry_data = map_data + map_data_size;
1316 map_holes_data = kentry_data + kentry_data_size;
1317}
1318STARTUP(PMAP_STEAL, STARTUP_RANK_FIRST, vm_map_steal_memory);
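/*
 * Illustrative arithmetic for the reservation above (assuming
 * VM_MAP_EARLY_COUNT_MAX == 16, as defined earlier in this file):
 *
 *	maps:    VM_MAP_EARLY_COUNT_MAX      == 16 elements
 *	entries: 8 * VM_MAP_EARLY_COUNT_MAX  == 128 elements
 *	holes:   8 * VM_MAP_EARLY_COUNT_MAX  == 128 elements
 *
 * zone_get_early_alloc_size() turns each element budget into a byte size
 * for that zone, and the three reservations are carved out of one
 * contiguous block so early pointers can be validated with a simple range
 * check before being crammed into their zones.
 */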
1319
1320__startup_func
1321static void
1322vm_kernel_boostraped(void)
1323{
    zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_ENTRY]);
    zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_HOLES]);
    zone_enable_caching(&zone_array[ZONE_ID_VM_MAP_COPY]);

    printf("VM bootstrap done: %d maps, %d entries and %d holes left\n",
        zone_count_free(vm_map_zone),
        zone_count_free(vm_map_entry_zone),
        zone_count_free(vm_map_holes_zone));
1332}
1333STARTUP(ZALLOC, STARTUP_RANK_SECOND, vm_kernel_boostraped);
1334
1335void
1336vm_map_disable_hole_optimization(vm_map_t map)
1337{
1338 vm_map_entry_t head_entry, hole_entry, next_hole_entry;
1339
1340 if (map->holelistenabled) {
1341 head_entry = hole_entry = CAST_TO_VM_MAP_ENTRY(map->holes_list);
1342
1343 while (hole_entry != NULL) {
1344 next_hole_entry = hole_entry->vme_next;
1345
1346 hole_entry->vme_next = NULL;
1347 hole_entry->vme_prev = NULL;
1348 zfree_id(ZONE_ID_VM_MAP_HOLES, hole_entry);
1349
1350 if (next_hole_entry == head_entry) {
1351 hole_entry = NULL;
1352 } else {
1353 hole_entry = next_hole_entry;
1354 }
1355 }
1356
1357 map->holes_list = NULL;
1358 map->holelistenabled = FALSE;
1359
1360 map->first_free = vm_map_first_entry(map);
1361 SAVE_HINT_HOLE_WRITE(map, NULL);
1362 }
1363}
1364
1365boolean_t
1366vm_kernel_map_is_kernel(vm_map_t map)
1367{
1368 return map->pmap == kernel_pmap;
1369}
1370
1371/*
1372 * vm_map_create:
1373 *
1374 * Creates and returns a new empty VM map with
1375 * the given physical map structure, and having
1376 * the given lower and upper address bounds.
1377 */
1378
1379extern vm_map_t vm_map_create_external(
1380 pmap_t pmap,
1381 vm_map_offset_t min_off,
1382 vm_map_offset_t max_off,
1383 boolean_t pageable);
1384
1385vm_map_t
1386vm_map_create_external(
1387 pmap_t pmap,
1388 vm_map_offset_t min,
1389 vm_map_offset_t max,
1390 boolean_t pageable)
1391{
1392 vm_map_create_options_t options = VM_MAP_CREATE_DEFAULT;
1393
1394 if (pageable) {
1395 options |= VM_MAP_CREATE_PAGEABLE;
1396 }
    return vm_map_create_options(pmap, min, max, options);
1398}
1399
1400__startup_func
1401void
1402vm_map_will_allocate_early_map(vm_map_t *owner)
1403{
1404 if (early_map_count >= VM_MAP_EARLY_COUNT_MAX) {
1405 panic("VM_MAP_EARLY_COUNT_MAX is too low");
1406 }
1407
1408 early_map_owners[early_map_count++] = owner;
1409}
1410
1411__startup_func
1412void
1413vm_map_relocate_early_maps(vm_offset_t delta)
1414{
1415 for (uint32_t i = 0; i < early_map_count; i++) {
1416 vm_address_t addr = (vm_address_t)*early_map_owners[i];
1417
1418 *early_map_owners[i] = (vm_map_t)(addr + delta);
1419 }
1420
1421 early_map_count = ~0u;
1422}
1423
1424/*
1425 * Routine: vm_map_relocate_early_elem
1426 *
1427 * Purpose:
1428 * Early zone elements are allocated in a temporary part
1429 * of the address space.
1430 *
1431 * Once the zones live in their final place, the early
1432 * VM maps, map entries and map holes need to be relocated.
1433 *
1434 * It involves rewriting any vm_map_t, vm_map_entry_t or
1435 * pointers to vm_map_links. Other pointers to other types
1436 * are fine.
1437 *
1438 * Fortunately, pointers to those types are self-contained
1439 * in those zones, _except_ for pointers to VM maps,
1440 * which are tracked during early boot and fixed with
1441 * vm_map_relocate_early_maps().
1442 */
1443__startup_func
1444void
1445vm_map_relocate_early_elem(
1446 uint32_t zone_id,
1447 vm_offset_t new_addr,
1448 vm_offset_t delta)
1449{
1450#define relocate(type_t, field) ({ \
1451 typeof(((type_t)NULL)->field) *__field = &((type_t)new_addr)->field; \
1452 if (*__field) { \
1453 *__field = (typeof(*__field))((vm_offset_t)*__field + delta); \
1454 } \
1455})
1456
1457 switch (zone_id) {
1458 case ZONE_ID_VM_MAP:
1459 case ZONE_ID_VM_MAP_ENTRY:
1460 case ZONE_ID_VM_MAP_HOLES:
1461 break;
1462
1463 default:
1464 panic("Unexpected zone ID %d", zone_id);
1465 }
1466
1467 if (zone_id == ZONE_ID_VM_MAP) {
1468 relocate(vm_map_t, hdr.links.prev);
1469 relocate(vm_map_t, hdr.links.next);
1470 ((vm_map_t)new_addr)->pmap = kernel_pmap;
1471#ifdef VM_MAP_STORE_USE_RB
1472 relocate(vm_map_t, hdr.rb_head_store.rbh_root);
1473#endif /* VM_MAP_STORE_USE_RB */
1474 relocate(vm_map_t, hint);
1475 relocate(vm_map_t, hole_hint);
1476 relocate(vm_map_t, first_free);
1477 return;
1478 }
1479
1480 relocate(struct vm_map_links *, prev);
1481 relocate(struct vm_map_links *, next);
1482
1483 if (zone_id == ZONE_ID_VM_MAP_ENTRY) {
1484#ifdef VM_MAP_STORE_USE_RB
1485 relocate(vm_map_entry_t, store.entry.rbe_left);
1486 relocate(vm_map_entry_t, store.entry.rbe_right);
1487 relocate(vm_map_entry_t, store.entry.rbe_parent);
1488#endif /* VM_MAP_STORE_USE_RB */
1489 if (((vm_map_entry_t)new_addr)->is_sub_map) {
1490 /* no object to relocate because we haven't made any */
1491 ((vm_map_entry_t)new_addr)->vme_submap +=
1492 delta >> VME_SUBMAP_SHIFT;
1493 }
1494#if MAP_ENTRY_CREATION_DEBUG
1495 relocate(vm_map_entry_t, vme_creation_maphdr);
1496#endif /* MAP_ENTRY_CREATION_DEBUG */
1497 }
1498
1499#undef relocate
1500}
1501
1502vm_map_t
1503vm_map_create_options(
1504 pmap_t pmap,
1505 vm_map_offset_t min,
1506 vm_map_offset_t max,
1507 vm_map_create_options_t options)
1508{
1509 vm_map_t result;
1510
1511#if DEBUG || DEVELOPMENT
1512 if (__improbable(startup_phase < STARTUP_SUB_ZALLOC)) {
1513 if (early_map_count != ~0u && early_map_count !=
1514 zone_count_allocated(vm_map_zone) + 1) {
1515 panic("allocating %dth early map, owner not known",
1516 zone_count_allocated(vm_map_zone) + 1);
1517 }
1518 if (early_map_count != ~0u && pmap && pmap != kernel_pmap) {
1519 panic("allocating %dth early map for non kernel pmap",
1520 early_map_count);
1521 }
1522 }
1523#endif /* DEBUG || DEVELOPMENT */
1524
1525 result = zalloc_id(ZONE_ID_VM_MAP, Z_WAITOK | Z_NOFAIL | Z_ZERO);
1526
    vm_map_store_init(&result->hdr);
    result->hdr.entries_pageable = (bool)(options & VM_MAP_CREATE_PAGEABLE);
    vm_map_set_page_shift(result, PAGE_SHIFT);
1530
1531 result->size_limit = RLIM_INFINITY; /* default unlimited */
1532 result->data_limit = RLIM_INFINITY; /* default unlimited */
1533 result->user_wire_limit = MACH_VM_MAX_ADDRESS; /* default limit is unlimited */
1534 os_ref_init_count_raw(&result->map_refcnt, &map_refgrp, 1);
1535 result->pmap = pmap;
1536 result->min_offset = min;
1537 result->max_offset = max;
1538 result->first_free = vm_map_to_entry(result);
1539 result->hint = vm_map_to_entry(result);
1540
1541 if (options & VM_MAP_CREATE_NEVER_FAULTS) {
1542 assert(pmap == kernel_pmap);
1543 result->never_faults = true;
1544 }
1545
1546 /* "has_corpse_footprint" and "holelistenabled" are mutually exclusive */
1547 if (options & VM_MAP_CREATE_CORPSE_FOOTPRINT) {
1548 result->has_corpse_footprint = true;
1549 } else if (!(options & VM_MAP_CREATE_DISABLE_HOLELIST)) {
1550 struct vm_map_links *hole_entry;
1551
1552 hole_entry = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
1553 hole_entry->start = min;
1554#if defined(__arm64__)
1555 hole_entry->end = result->max_offset;
1556#else
1557 hole_entry->end = MAX(max, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1558#endif
1559 result->holes_list = result->hole_hint = hole_entry;
1560 hole_entry->prev = hole_entry->next = CAST_TO_VM_MAP_ENTRY(hole_entry);
1561 result->holelistenabled = true;
1562 }
1563
1564 vm_map_lock_init(result);
1565
1566 return result;
1567}
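/*
 * Illustrative usage sketch (hypothetical caller): creating a pageable
 * map over [min, max) for an already-constructed pmap, then dropping the
 * initial reference when done with it:
 *
 *	vm_map_t map;
 *
 *	map = vm_map_create_options(pmap, min, max, VM_MAP_CREATE_PAGEABLE);
 *	...
 *	vm_map_deallocate(map);
 *
 * vm_map_create_external() is the exported wrapper and only exposes the
 * "pageable" choice; the other VM_MAP_CREATE_* options stay kernel-internal.
 */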
1568
1569/*
1570 * Adjusts a submap that was made by kmem_suballoc()
1571 * before it knew where it would be mapped,
1572 * so that it has the right min/max offsets.
1573 *
1574 * We do not need to hold any locks:
1575 * only the caller knows about this map,
1576 * and it is not published on any entry yet.
1577 */
1578static void
1579vm_map_adjust_offsets(
1580 vm_map_t map,
1581 vm_map_offset_t min_off,
1582 vm_map_offset_t max_off)
1583{
1584 assert(map->min_offset == 0);
1585 assert(map->max_offset == max_off - min_off);
1586 assert(map->hdr.nentries == 0);
1587 assert(os_ref_get_count_raw(&map->map_refcnt) == 2);
1588
1589 map->min_offset = min_off;
1590 map->max_offset = max_off;
1591
1592 if (map->holelistenabled) {
1593 struct vm_map_links *hole = map->holes_list;
1594
1595 hole->start = min_off;
1596#if defined(__arm64__)
1597 hole->end = max_off;
1598#else
1599 hole->end = MAX(max_off, (vm_map_offset_t)MACH_VM_MAX_ADDRESS);
1600#endif
1601 }
1602}
1603
1604
1605vm_map_size_t
1606vm_map_adjusted_size(vm_map_t map)
1607{
1608 const struct vm_reserved_region *regions = NULL;
1609 size_t num_regions = 0;
1610 mach_vm_size_t reserved_size = 0, map_size = 0;
1611
1612 if (map == NULL || (map->size == 0)) {
1613 return 0;
1614 }
1615
1616 map_size = map->size;
1617
1618 if (map->reserved_regions == FALSE || !vm_map_is_exotic(map) || map->terminated) {
1619 /*
1620 * No special reserved regions or not an exotic map or the task
1621 * is terminating and these special regions might have already
1622 * been deallocated.
1623 */
1624 return map_size;
1625 }
1626
    num_regions = ml_get_vm_reserved_regions(vm_map_is_64bit(map), &regions);
1628 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
1629
1630 while (num_regions) {
1631 reserved_size += regions[--num_regions].vmrr_size;
1632 }
1633
1634 /*
1635 * There are a few places where the map is being switched out due to
1636 * 'termination' without that bit being set (e.g. exec and corpse purging).
1637 * In those cases, we could have the map's regions being deallocated on
1638 * a core while some accounting process is trying to get the map's size.
1639 * So this assert can't be enabled till all those places are uniform in
1640 * their use of the 'map->terminated' bit.
1641 *
1642 * assert(map_size >= reserved_size);
1643 */
1644
1645 return (map_size >= reserved_size) ? (map_size - reserved_size) : map_size;
1646}
1647
1648/*
1649 * vm_map_entry_create: [ internal use only ]
1650 *
1651 * Allocates a VM map entry for insertion in the
1652 * given map (or map copy). No fields are filled.
1653 *
1654 * The VM entry will be zero initialized, except for:
1655 * - behavior set to VM_BEHAVIOR_DEFAULT
1656 * - inheritance set to VM_INHERIT_DEFAULT
1657 */
1658#define vm_map_entry_create(map) _vm_map_entry_create(&(map)->hdr)
1659
1660#define vm_map_copy_entry_create(copy) _vm_map_entry_create(&(copy)->cpy_hdr)
1661
1662static vm_map_entry_t
1663_vm_map_entry_create(
1664 struct vm_map_header *map_header __unused)
1665{
1666 vm_map_entry_t entry = NULL;
1667
1668 entry = zalloc_id(ZONE_ID_VM_MAP_ENTRY, Z_WAITOK | Z_ZERO);
1669
1670 /*
1671 * Help the compiler with what we know to be true,
1672 * so that the further bitfields inits have good codegen.
1673 *
1674 * See rdar://87041299
1675 */
1676 __builtin_assume(entry->vme_object_value == 0);
1677 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 1) == 0);
1678 __builtin_assume(*(uint64_t *)(&entry->vme_object_value + 2) == 0);
1679
1680 static_assert(VM_MAX_TAG_VALUE <= VME_ALIAS_MASK,
1681 "VME_ALIAS_MASK covers tags");
1682
1683 static_assert(VM_BEHAVIOR_DEFAULT == 0,
1684 "can skip zeroing of the behavior field");
1685 entry->inheritance = VM_INHERIT_DEFAULT;
1686
1687#if MAP_ENTRY_CREATION_DEBUG
1688 entry->vme_creation_maphdr = map_header;
1689 entry->vme_creation_bt = btref_get(__builtin_frame_address(0),
1690 BTREF_GET_NOWAIT);
1691#endif
1692 return entry;
1693}
1694
1695/*
1696 * vm_map_entry_dispose: [ internal use only ]
1697 *
1698 * Inverse of vm_map_entry_create.
1699 *
1700 * write map lock held so no need to
1701 * do anything special to insure correctness
 * do anything special to ensure correctness
1703 */
1704static void
1705vm_map_entry_dispose(
1706 vm_map_entry_t entry)
1707{
1708#if VM_BTLOG_TAGS
1709 if (entry->vme_kernel_object) {
1710 btref_put(entry->vme_tag_btref);
1711 }
1712#endif /* VM_BTLOG_TAGS */
1713#if MAP_ENTRY_CREATION_DEBUG
1714 btref_put(entry->vme_creation_bt);
1715#endif
1716#if MAP_ENTRY_INSERTION_DEBUG
1717 btref_put(entry->vme_insertion_bt);
1718#endif
1719 zfree(vm_map_entry_zone, entry);
1720}
1721
1722#define vm_map_copy_entry_dispose(copy_entry) \
1723 vm_map_entry_dispose(copy_entry)
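/*
 * Illustrative sketch (hypothetical caller, map locked for writing):
 * entries come back zero-filled except for the defaults noted above, so a
 * caller only sets what it needs before linking, and disposes of the
 * entry if the insertion is abandoned:
 *
 *	vm_map_entry_t entry;
 *
 *	entry = vm_map_entry_create(map);
 *	entry->vme_start = start;
 *	entry->vme_end = end;
 *	...
 *	vm_map_entry_dispose(entry);	(if the entry is never linked)
 */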
1724
1725static vm_map_entry_t
1726vm_map_zap_first_entry(
1727 vm_map_zap_t list)
1728{
1729 return list->vmz_head;
1730}
1731
1732static vm_map_entry_t
1733vm_map_zap_last_entry(
1734 vm_map_zap_t list)
1735{
1736 assert(vm_map_zap_first_entry(list));
1737 return __container_of(list->vmz_tail, struct vm_map_entry, vme_next);
1738}
1739
1740static void
1741vm_map_zap_append(
1742 vm_map_zap_t list,
1743 vm_map_entry_t entry)
1744{
1745 entry->vme_next = VM_MAP_ENTRY_NULL;
1746 *list->vmz_tail = entry;
1747 list->vmz_tail = &entry->vme_next;
1748}
1749
1750static vm_map_entry_t
1751vm_map_zap_pop(
1752 vm_map_zap_t list)
1753{
1754 vm_map_entry_t head = list->vmz_head;
1755
1756 if (head != VM_MAP_ENTRY_NULL &&
1757 (list->vmz_head = head->vme_next) == VM_MAP_ENTRY_NULL) {
1758 list->vmz_tail = &list->vmz_head;
1759 }
1760
1761 return head;
1762}
1763
1764static void
1765vm_map_zap_dispose(
1766 vm_map_zap_t list)
1767{
1768 vm_map_entry_t entry;
1769
1770 while ((entry = vm_map_zap_pop(list))) {
1771 if (entry->is_sub_map) {
1772 vm_map_deallocate(VME_SUBMAP(entry));
1773 } else {
1774 vm_object_deallocate(VME_OBJECT(entry));
1775 }
1776
1777 vm_map_entry_dispose(entry);
1778 }
1779}
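/*
 * Illustrative sketch of the zap-list pattern (vm_map_destroy() below is
 * a real example): entries are unlinked and queued while the map lock is
 * held, and only torn down, together with their objects or submaps, once
 * the lock has been dropped:
 *
 *	VM_MAP_ZAP_DECLARE(zap);
 *
 *	vm_map_lock(map);
 *	(void)vm_map_delete(map, start, end, flags, KMEM_GUARD_NONE, &zap);
 *	vm_map_unlock(map);
 *
 *	vm_map_zap_dispose(&zap);
 */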
1780
1781#if MACH_ASSERT
1782static boolean_t first_free_check = FALSE;
1783boolean_t
1784first_free_is_valid(
1785 vm_map_t map)
1786{
1787 if (!first_free_check) {
1788 return TRUE;
1789 }
1790
1791 return first_free_is_valid_store( map );
1792}
1793#endif /* MACH_ASSERT */
1794
1795
1796#define vm_map_copy_entry_link(copy, after_where, entry) \
1797 _vm_map_store_entry_link(&(copy)->cpy_hdr, after_where, (entry))
1798
1799#define vm_map_copy_entry_unlink(copy, entry) \
1800 _vm_map_store_entry_unlink(&(copy)->cpy_hdr, (entry), false)
1801
1802/*
1803 * vm_map_destroy:
1804 *
1805 * Actually destroy a map.
1806 */
1807void
1808vm_map_destroy(
1809 vm_map_t map)
1810{
1811 /* final cleanup: this is not allowed to fail */
1812 vmr_flags_t flags = VM_MAP_REMOVE_NO_FLAGS;
1813
1814 VM_MAP_ZAP_DECLARE(zap);
1815
1816 vm_map_lock(map);
1817
1818 map->terminated = true;
1819 /* clean up regular map entries */
    (void)vm_map_delete(map, map->min_offset, map->max_offset, flags,
        KMEM_GUARD_NONE, &zap);
    /* clean up leftover special mappings (commpage, GPU carveout, etc...) */
    (void)vm_map_delete(map, 0x0, 0xFFFFFFFFFFFFF000ULL, flags,
        KMEM_GUARD_NONE, &zap);
1825
1826 vm_map_disable_hole_optimization(map);
1827 vm_map_corpse_footprint_destroy(map);
1828
1829 vm_map_unlock(map);
1830
1831 vm_map_zap_dispose(&zap);
1832
1833 assert(map->hdr.nentries == 0);
1834
1835 if (map->pmap) {
1836 pmap_destroy(map->pmap);
1837 }
1838
1839 lck_rw_destroy(&map->lock, &vm_map_lck_grp);
1840
1841#if CONFIG_MAP_RANGES
1842 kfree_data(map->extra_ranges,
1843 map->extra_ranges_count * sizeof(struct vm_map_user_range));
1844#endif
1845
1846 zfree_id(ZONE_ID_VM_MAP, map);
1847}
1848
1849/*
1850 * Returns pid of the task with the largest number of VM map entries.
1851 * Used in the zone-map-exhaustion jetsam path.
1852 */
1853pid_t
1854find_largest_process_vm_map_entries(void)
1855{
1856 pid_t victim_pid = -1;
1857 int max_vm_map_entries = 0;
1858 task_t task = TASK_NULL;
1859 queue_head_t *task_list = &tasks;
1860
1861 lck_mtx_lock(&tasks_threads_lock);
1862 queue_iterate(task_list, task, task_t, tasks) {
1863 if (task == kernel_task || !task->active) {
1864 continue;
1865 }
1866
1867 vm_map_t task_map = task->map;
1868 if (task_map != VM_MAP_NULL) {
1869 int task_vm_map_entries = task_map->hdr.nentries;
1870 if (task_vm_map_entries > max_vm_map_entries) {
1871 max_vm_map_entries = task_vm_map_entries;
1872 victim_pid = pid_from_task(task);
1873 }
1874 }
1875 }
1876 lck_mtx_unlock(&tasks_threads_lock);
1877
1878 printf("zone_map_exhaustion: victim pid %d, vm region count: %d\n", victim_pid, max_vm_map_entries);
1879 return victim_pid;
1880}
1881
1882
1883/*
1884 * vm_map_lookup_entry: [ internal use only ]
1885 *
1886 * Calls into the vm map store layer to find the map
1887 * entry containing (or immediately preceding) the
1888 * specified address in the given map; the entry is returned
1889 * in the "entry" parameter. The boolean
1890 * result indicates whether the address is
1891 * actually contained in the map.
1892 */
1893boolean_t
1894vm_map_lookup_entry(
1895 vm_map_t map,
1896 vm_map_offset_t address,
1897 vm_map_entry_t *entry) /* OUT */
1898{
1899 if (VM_KERNEL_ADDRESS(address)) {
1900 address = VM_KERNEL_STRIP_UPTR(address);
1901 }
1902
1903
1904#if CONFIG_PROB_GZALLOC
1905 if (map->pmap == kernel_pmap) {
1906 assertf(!pgz_owned(address),
1907 "it is the responsibility of callers to unguard PGZ addresses");
1908 }
1909#endif /* CONFIG_PROB_GZALLOC */
1910 return vm_map_store_lookup_entry( map, address, entry );
1911}
1912
1913boolean_t
1914vm_map_lookup_entry_or_next(
1915 vm_map_t map,
1916 vm_map_offset_t address,
1917 vm_map_entry_t *entry) /* OUT */
1918{
1919 if (vm_map_lookup_entry(map, address, entry)) {
1920 return true;
1921 }
1922
1923 *entry = (*entry)->vme_next;
1924 return false;
1925}
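
/*
 * Illustrative sketch (not part of the build): how a lookup result is
 * typically consumed.  When the address is mapped, "entry" is the entry
 * containing it; otherwise "entry" is the immediately preceding entry
 * (possibly the map header), which is the usual insertion point.  The
 * names "addr" and "entry" are placeholders.
 *
 *	vm_map_entry_t entry;
 *
 *	vm_map_lock(map);
 *	if (vm_map_lookup_entry(map, addr, &entry)) {
 *		// addr falls within [entry->vme_start, entry->vme_end)
 *	} else {
 *		// addr is unmapped; entry precedes the hole containing addr
 *	}
 *	vm_map_unlock(map);
 */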
1926
1927#if CONFIG_PROB_GZALLOC
1928boolean_t
1929vm_map_lookup_entry_allow_pgz(
1930 vm_map_t map,
1931 vm_map_offset_t address,
1932 vm_map_entry_t *entry) /* OUT */
1933{
1934 if (VM_KERNEL_ADDRESS(address)) {
1935 address = VM_KERNEL_STRIP_UPTR(address);
1936 }
1937 return vm_map_store_lookup_entry( map, address, entry );
1938}
1939#endif /* CONFIG_PROB_GZALLOC */
1940
1941/*
1942 * Routine: vm_map_range_invalid_panic
1943 * Purpose:
1944 * Panic on detection of an invalid range id.
1945 */
1946__abortlike
1947static void
1948vm_map_range_invalid_panic(
1949 vm_map_t map,
1950 vm_map_range_id_t range_id)
1951{
1952 panic("invalid range ID (%u) for map %p", range_id, map);
1953}
1954
1955/*
1956 * Routine: vm_map_get_range
1957 * Purpose:
1958 * Adjust bounds based on security policy.
1959 */
1960static struct mach_vm_range
1961vm_map_get_range(
1962 vm_map_t map,
1963 vm_map_address_t *address,
1964 vm_map_kernel_flags_t *vmk_flags,
1965 vm_map_size_t size,
1966 bool *is_ptr)
1967{
1968 struct mach_vm_range effective_range = {};
1969 vm_map_range_id_t range_id = vmk_flags->vmkf_range_id;
1970
1971 if (map == kernel_map) {
1972 effective_range = kmem_ranges[range_id];
1973
1974 if (startup_phase >= STARTUP_SUB_KMEM) {
1975 /*
1976 * Hint provided by caller is zeroed as the range is restricted to a
1977 * subset of the entire kernel_map VA, which could put the hint outside
1978 * the range, causing vm_map_store_find_space to fail.
1979 */
1980 *address = 0ull;
1981 /*
1982 * Ensure that range_id passed in by the caller is within meaningful
1983 * bounds. Range id of KMEM_RANGE_ID_NONE will cause vm_map_locate_space
1984 * to fail as the corresponding range is invalid. Range id larger than
1985 * KMEM_RANGE_ID_MAX will lead to an OOB access.
1986 */
1987 if ((range_id == KMEM_RANGE_ID_NONE) ||
1988 (range_id > KMEM_RANGE_ID_MAX)) {
1989 vm_map_range_invalid_panic(map, range_id);
1990 }
1991
1992 /*
1993 * Pointer ranges use kmem_locate_space to do allocations.
1994 *
1995 * Non-pointer fronts look like [ Small | Large | Permanent ].
1996 * Adjust range for allocations larger than KMEM_SMALLMAP_THRESHOLD.
1997 * Allocations smaller than KMEM_SMALLMAP_THRESHOLD are allowed to
1998 * use the entire range.
1999 */
2000 if (range_id < KMEM_RANGE_ID_SPRAYQTN) {
2001 *is_ptr = true;
2002 } else if (size >= KMEM_SMALLMAP_THRESHOLD) {
2003 effective_range = kmem_large_ranges[range_id];
2004 }
2005 }
2006#if CONFIG_MAP_RANGES
2007 } else if (map->uses_user_ranges) {
2008 switch (range_id) {
2009 case UMEM_RANGE_ID_DEFAULT:
2010 effective_range = map->default_range;
2011 break;
2012 case UMEM_RANGE_ID_HEAP:
2013 effective_range = map->data_range;
2014 break;
2015 case UMEM_RANGE_ID_FIXED:
2016 /*
2017 * anywhere allocations with an address in "FIXED"
2018 * makes no sense, leave the range empty
2019 */
2020 break;
2021
2022 default:
2023 vm_map_range_invalid_panic(map, range_id);
2024 }
2025#endif /* CONFIG_MAP_RANGES */
2026 } else {
2027 /*
2028 * If the minimum is 0, bump it up by PAGE_SIZE. We want to
2029 * limit allocations of PAGEZERO to explicit requests: its
2030 * normal use is to catch NULL dereferences, and many
2031 * applications also treat pointers with a value of 0 as
2032 * special, so suddenly having address 0 contain usable
2033 * memory would tend to confuse those applications.
2034 */
2035 effective_range.min_address = MAX(map->min_offset, VM_MAP_PAGE_SIZE(map));
2036 effective_range.max_address = map->max_offset;
2037 }
2038
2039 return effective_range;
2040}
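
/*
 * Illustrative sketch (not part of the build): the effect of the range
 * selection above for a kernel_map allocation on a non-pointer front
 * (range_id >= KMEM_RANGE_ID_SPRAYQTN), whose layout is
 * [ Small | Large | Permanent ].  "size" and "range_id" stand in for the
 * caller's values.
 *
 *	struct mach_vm_range r;
 *
 *	if (size >= KMEM_SMALLMAP_THRESHOLD) {
 *		r = kmem_large_ranges[range_id];	// large sub-range only
 *	} else {
 *		r = kmem_ranges[range_id];		// the entire front
 *	}
 */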
2041
2042/*
2043 * Routine: vm_map_locate_space
2044 * Purpose:
2045 * Finds a range in the specified virtual address map,
2046 * returning the start of that range,
2047 * as well as the entry right before it.
2048 */
2049kern_return_t
2050vm_map_locate_space(
2051 vm_map_t map,
2052 vm_map_size_t size,
2053 vm_map_offset_t mask,
2054 vm_map_kernel_flags_t vmk_flags,
2055 vm_map_offset_t *start_inout,
2056 vm_map_entry_t *entry_out)
2057{
2058 struct mach_vm_range effective_range = {};
2059 vm_map_size_t guard_offset;
2060 vm_map_offset_t hint, limit;
2061 vm_map_entry_t entry;
2062 bool is_kmem_ptr_range = false;
2063
2064 /*
2065 * Only supported by vm_map_enter() with a fixed address.
2066 */
2067 assert(!vmk_flags.vmkf_beyond_max);
2068
2069 if (__improbable(map->wait_for_space)) {
2070 /*
2071 * support for "wait_for_space" is minimal,
2072 * its only consumer is the ipc_kernel_copy_map.
2073 */
2074 assert(!map->holelistenabled &&
2075 !vmk_flags.vmkf_last_free &&
2076 !vmk_flags.vmkf_keep_map_locked &&
2077 !vmk_flags.vmkf_map_jit &&
2078 !vmk_flags.vmf_random_addr &&
2079 *start_inout <= map->min_offset);
2080 } else if (vmk_flags.vmkf_last_free) {
2081 assert(!vmk_flags.vmkf_map_jit &&
2082 !vmk_flags.vmf_random_addr);
2083 }
2084
2085 if (vmk_flags.vmkf_guard_before) {
2086 guard_offset = VM_MAP_PAGE_SIZE(map);
2087 assert(size > guard_offset);
2088 size -= guard_offset;
2089 } else {
2090 assert(size != 0);
2091 guard_offset = 0;
2092 }
2093
2094 /*
2095 * Validate range_id from flags and get associated range
2096 */
2097 effective_range = vm_map_get_range(map, start_inout, &vmk_flags, size,
2098     &is_kmem_ptr_range);
2099
2100 if (is_kmem_ptr_range) {
2101 return kmem_locate_space(size + guard_offset, vmk_flags.vmkf_range_id,
2102     vmk_flags.vmkf_last_free, start_inout, entry_out);
2103 }
2104
2105#if XNU_TARGET_OS_OSX
2106 if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2107 assert(map != kernel_map);
2108 effective_range.max_address = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2109 }
2110#endif /* XNU_TARGET_OS_OSX */
2111
2112again:
2113 if (vmk_flags.vmkf_last_free) {
2114 hint = *start_inout;
2115
2116 if (hint == 0 || hint > effective_range.max_address) {
2117 hint = effective_range.max_address;
2118 }
2119 if (hint <= effective_range.min_address) {
2120 return KERN_NO_SPACE;
2121 }
2122 limit = effective_range.min_address;
2123 } else {
2124 hint = *start_inout;
2125
2126 if (vmk_flags.vmkf_map_jit) {
2127 if (map->jit_entry_exists &&
2128 !VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
2129 return KERN_INVALID_ARGUMENT;
2130 }
2131 if (VM_MAP_POLICY_ALLOW_JIT_RANDOM_ADDRESS(map)) {
2132 vmk_flags.vmf_random_addr = true;
2133 }
2134 }
2135
2136 if (vmk_flags.vmf_random_addr) {
2137 kern_return_t kr;
2138
2139 kr = vm_map_random_address_for_size(map, &hint, size, vmk_flags);
2140 if (kr != KERN_SUCCESS) {
2141 return kr;
2142 }
2143 }
2144#if __x86_64__
2145 else if ((hint == 0 || hint == vm_map_min(map)) &&
2146 !map->disable_vmentry_reuse &&
2147 map->vmmap_high_start != 0) {
2148 hint = map->vmmap_high_start;
2149 }
2150#endif /* __x86_64__ */
2151
2152 if (hint < effective_range.min_address) {
2153 hint = effective_range.min_address;
2154 }
2155 if (effective_range.max_address <= hint) {
2156 return KERN_NO_SPACE;
2157 }
2158
2159 limit = effective_range.max_address;
2160 }
2161 entry = vm_map_store_find_space(map,
2162     hint, limit, vmk_flags.vmkf_last_free,
2163     guard_offset, size, mask,
2164     start_inout);
2165
2166 if (__improbable(entry == NULL)) {
2167 if (map->wait_for_space &&
2168 guard_offset + size <=
2169 effective_range.max_address - effective_range.min_address) {
2170 assert_wait((event_t)map, THREAD_ABORTSAFE);
2171 vm_map_unlock(map);
2172 thread_block(THREAD_CONTINUE_NULL);
2173 vm_map_lock(map);
2174 goto again;
2175 }
2176 return KERN_NO_SPACE;
2177 }
2178
2179 if (entry_out) {
2180 *entry_out = entry;
2181 }
2182 return KERN_SUCCESS;
2183}
2184
2185
2186/*
2187 * Routine: vm_map_find_space
2188 * Purpose:
2189 * Allocate a range in the specified virtual address map,
2190 * returning the entry allocated for that range.
2191 * Used by kmem_alloc, etc.
2192 *
2193 * The map must NOT be locked. It will be returned locked
2194 * on KERN_SUCCESS, unlocked on failure.
2195 *
2196 * If an entry is allocated, the object/offset fields
2197 * are initialized to zero.
2198 */
2199kern_return_t
2200vm_map_find_space(
2201 vm_map_t map,
2202 vm_map_offset_t hint_address,
2203 vm_map_size_t size,
2204 vm_map_offset_t mask,
2205 vm_map_kernel_flags_t vmk_flags,
2206 vm_map_entry_t *o_entry) /* OUT */
2207{
2208 vm_map_entry_t new_entry, entry;
2209 kern_return_t kr;
2210
2211 if (size == 0) {
2212 return KERN_INVALID_ARGUMENT;
2213 }
2214
2215 new_entry = vm_map_entry_create(map);
2216 new_entry->use_pmap = true;
2217 new_entry->protection = VM_PROT_DEFAULT;
2218 new_entry->max_protection = VM_PROT_ALL;
2219
2220 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
2221 new_entry->map_aligned = true;
2222 }
2223 if (vmk_flags.vmf_permanent) {
2224 new_entry->vme_permanent = true;
2225 }
2226
2227 vm_map_lock(map);
2228
2229 kr = vm_map_locate_space(map, size, mask, vmk_flags,
2230     &hint_address, &entry);
2231 if (kr != KERN_SUCCESS) {
2232 vm_map_unlock(map);
2233 vm_map_entry_dispose(new_entry);
2234 return kr;
2235 }
2236 new_entry->vme_start = hint_address;
2237 new_entry->vme_end = hint_address + size;
2238
2239 /*
2240 * At this point,
2241 *
2242 * - new_entry's "vme_start" and "vme_end" should define
2243 * the endpoints of the available new range,
2244 *
2245 * - and "entry" should refer to the region before
2246 * the new range,
2247 *
2248 * - and the map should still be locked.
2249 */
2250
2251 assert(page_aligned(new_entry->vme_start));
2252 assert(page_aligned(new_entry->vme_end));
2253 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_start, VM_MAP_PAGE_MASK(map)));
2254 assert(VM_MAP_PAGE_ALIGNED(new_entry->vme_end, VM_MAP_PAGE_MASK(map)));
2255
2256 /*
2257 * Insert the new entry into the list
2258 */
2259
2260 vm_map_store_entry_link(map, entry, new_entry,
2261     VM_MAP_KERNEL_FLAGS_NONE);
2262 map->size += size;
2263
2264 /*
2265 * Update the lookup hint
2266 */
2267 SAVE_HINT_MAP_WRITE(map, new_entry);
2268
2269 *o_entry = new_entry;
2270 return KERN_SUCCESS;
2271}
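
/*
 * Illustrative sketch (not part of the build): the locking contract of
 * vm_map_find_space() as seen by a kmem_alloc-style caller.  "size" and
 * the vmk_flags value are placeholders chosen by the caller.
 *
 *	vm_map_kernel_flags_t vmk_flags;	// caller-chosen "anywhere" flags
 *	vm_map_entry_t entry;
 *	kern_return_t kr;
 *
 *	// the map must be unlocked here
 *	kr = vm_map_find_space(kernel_map, 0, size, 0, vmk_flags, &entry);
 *	if (kr == KERN_SUCCESS) {
 *		// the map is returned locked; [entry->vme_start, entry->vme_end)
 *		// is the newly linked range, with object/offset zeroed
 *		vm_map_unlock(kernel_map);
 *	}
 */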
2272
2273int vm_map_pmap_enter_print = FALSE;
2274int vm_map_pmap_enter_enable = FALSE;
2275
2276/*
2277 * Routine: vm_map_pmap_enter [internal only]
2278 *
2279 * Description:
2280 * Force pages from the specified object to be entered into
2281 * the pmap at the specified address if they are present.
2282 * As soon as a page is not found in the object, the scan ends.
2283 *
2284 * Returns:
2285 * Nothing.
2286 *
2287 * In/out conditions:
2288 * The source map should not be locked on entry.
2289 */
2290__unused static void
2291vm_map_pmap_enter(
2292 vm_map_t map,
2293 vm_map_offset_t addr,
2294 vm_map_offset_t end_addr,
2295 vm_object_t object,
2296 vm_object_offset_t offset,
2297 vm_prot_t protection)
2298{
2299 int type_of_fault;
2300 kern_return_t kr;
2301 uint8_t object_lock_type = 0;
2302 struct vm_object_fault_info fault_info = {};
2303
2304 if (map->pmap == 0) {
2305 return;
2306 }
2307
2308 assert(VM_MAP_PAGE_SHIFT(map) == PAGE_SHIFT);
2309
2310 while (addr < end_addr) {
2311 vm_page_t m;
2312
2313
2314 /*
2315 * TODO:
2316 * From vm_map_enter(), we come into this function without the map
2317 * lock held or the object lock held.
2318 * We haven't taken a reference on the object either.
2319 * We should do a proper lookup on the map to make sure
2320 * that things are sane before we go locking objects that
2321 * could have been deallocated from under us.
2322 */
2323
2324 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
2325 vm_object_lock(object);
2326
2327 m = vm_page_lookup(object, offset);
2328
2329 if (m == VM_PAGE_NULL || m->vmp_busy || m->vmp_fictitious ||
2330 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_absent))) {
2331 vm_object_unlock(object);
2332 return;
2333 }
2334
2335 if (vm_map_pmap_enter_print) {
2336 printf("vm_map_pmap_enter:");
2337 printf("map: %p, addr: %llx, object: %p, offset: %llx\n",
2338 map, (unsigned long long)addr, object, (unsigned long long)offset);
2339 }
2340 type_of_fault = DBG_CACHE_HIT_FAULT;
2341 kr = vm_fault_enter(m, map->pmap,
2342     addr,
2343     PAGE_SIZE, 0,
2344     protection, protection,
2345     VM_PAGE_WIRED(m),
2346     FALSE, /* change_wiring */
2347     VM_KERN_MEMORY_NONE, /* tag - not wiring */
2348     &fault_info,
2349     NULL, /* need_retry */
2350     &type_of_fault,
2351     &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
2352
2353 vm_object_unlock(object);
2354
2355 offset += PAGE_SIZE_64;
2356 addr += PAGE_SIZE;
2357 }
2358}
2359
2360#define MAX_TRIES_TO_GET_RANDOM_ADDRESS 1000
2361static kern_return_t
2362vm_map_random_address_for_size(
2363 vm_map_t map,
2364 vm_map_offset_t *address,
2365 vm_map_size_t size,
2366 vm_map_kernel_flags_t vmk_flags)
2367{
2368 kern_return_t kr = KERN_SUCCESS;
2369 int tries = 0;
2370 vm_map_offset_t random_addr = 0;
2371 vm_map_offset_t hole_end;
2372
2373 vm_map_entry_t next_entry = VM_MAP_ENTRY_NULL;
2374 vm_map_entry_t prev_entry = VM_MAP_ENTRY_NULL;
2375 vm_map_size_t vm_hole_size = 0;
2376 vm_map_size_t addr_space_size;
2377 bool is_kmem_ptr;
2378 struct mach_vm_range effective_range;
2379
2380 effective_range = vm_map_get_range(map, address, &vmk_flags, size,
2381     &is_kmem_ptr);
2382
2383 addr_space_size = effective_range.max_address - effective_range.min_address;
2384 if (size >= addr_space_size) {
2385 return KERN_NO_SPACE;
2386 }
2387 addr_space_size -= size;
2388
2389 assert(VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map)));
2390
2391 while (tries < MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2392 if (startup_phase < STARTUP_SUB_ZALLOC) {
2393 random_addr = (vm_map_offset_t)early_random();
2394 } else {
2395 random_addr = (vm_map_offset_t)random();
2396 }
2397 random_addr <<= VM_MAP_PAGE_SHIFT(map);
2398 random_addr = vm_map_trunc_page(
2399 effective_range.min_address + (random_addr % addr_space_size),
2400 VM_MAP_PAGE_MASK(map));
2401
2402#if CONFIG_PROB_GZALLOC
2403 if (map->pmap == kernel_pmap && pgz_owned(random_addr)) {
2404 continue;
2405 }
2406#endif /* CONFIG_PROB_GZALLOC */
2407
2408 if (vm_map_lookup_entry(map, random_addr, &prev_entry) == FALSE) {
2409 if (prev_entry == vm_map_to_entry(map)) {
2410 next_entry = vm_map_first_entry(map);
2411 } else {
2412 next_entry = prev_entry->vme_next;
2413 }
2414 if (next_entry == vm_map_to_entry(map)) {
2415 hole_end = vm_map_max(map);
2416 } else {
2417 hole_end = next_entry->vme_start;
2418 }
2419 vm_hole_size = hole_end - random_addr;
2420 if (vm_hole_size >= size) {
2421 *address = random_addr;
2422 break;
2423 }
2424 }
2425 tries++;
2426 }
2427
2428 if (tries == MAX_TRIES_TO_GET_RANDOM_ADDRESS) {
2429 kr = KERN_NO_SPACE;
2430 }
2431 return kr;
2432}
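
/*
 * Worked example for the computation above (figures are illustrative
 * only): with 16K map pages (VM_MAP_PAGE_SHIFT == 14), an effective
 * range of [0x100000000, 0x200000000) and size 0x4000, addr_space_size
 * is 0x100000000 - 0x4000.  A raw random value of 0x1234 becomes
 * 0x1234 << 14 == 0x48d0000, so the candidate address is
 * trunc_page(0x100000000 + (0x48d0000 % addr_space_size)) == 0x1048d0000,
 * which is then checked against the map for a hole of at least "size".
 */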
2433
2434static boolean_t
2435vm_memory_malloc_no_cow(
2436 int alias)
2437{
2438 uint64_t alias_mask;
2439
2440 if (!malloc_no_cow) {
2441 return FALSE;
2442 }
2443 if (alias > 63) {
2444 return FALSE;
2445 }
2446 alias_mask = 1ULL << alias;
2447 if (alias_mask & vm_memory_malloc_no_cow_mask) {
2448 return TRUE;
2449 }
2450 return FALSE;
2451}
2452
2453uint64_t vm_map_enter_RLIMIT_AS_count = 0;
2454uint64_t vm_map_enter_RLIMIT_DATA_count = 0;
2455/*
2456 * Routine: vm_map_enter
2457 *
2458 * Description:
2459 * Allocate a range in the specified virtual address map.
2460 * The resulting range will refer to memory defined by
2461 * the given memory object and offset into that object.
2462 *
2463 * Arguments are as defined in the vm_map call.
2464 */
2465static unsigned int vm_map_enter_restore_successes = 0;
2466static unsigned int vm_map_enter_restore_failures = 0;
2467kern_return_t
2468vm_map_enter(
2469 vm_map_t map,
2470 vm_map_offset_t *address, /* IN/OUT */
2471 vm_map_size_t size,
2472 vm_map_offset_t mask,
2473 vm_map_kernel_flags_t vmk_flags,
2474 vm_object_t object,
2475 vm_object_offset_t offset,
2476 boolean_t needs_copy,
2477 vm_prot_t cur_protection,
2478 vm_prot_t max_protection,
2479 vm_inherit_t inheritance)
2480{
2481 vm_map_entry_t entry, new_entry;
2482 vm_map_offset_t start, tmp_start, tmp_offset;
2483 vm_map_offset_t end, tmp_end;
2484 vm_map_offset_t tmp2_start, tmp2_end;
2485 vm_map_offset_t step;
2486 kern_return_t result = KERN_SUCCESS;
2487 bool map_locked = FALSE;
2488 bool pmap_empty = TRUE;
2489 bool new_mapping_established = FALSE;
2490 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
2491 const bool anywhere = !vmk_flags.vmf_fixed;
2492 const bool purgable = vmk_flags.vmf_purgeable;
2493 const bool overwrite = vmk_flags.vmf_overwrite;
2494 const bool no_cache = vmk_flags.vmf_no_cache;
2495 const bool is_submap = vmk_flags.vmkf_submap;
2496 const bool permanent = vmk_flags.vmf_permanent;
2497 const bool no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
2498 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
2499 const bool iokit_acct = vmk_flags.vmkf_iokit_acct;
2500 const bool resilient_codesign = vmk_flags.vmf_resilient_codesign;
2501 const bool resilient_media = vmk_flags.vmf_resilient_media;
2502 const bool entry_for_tpro = vmk_flags.vmf_tpro;
2503 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
2504 const vm_tag_t alias = vmk_flags.vm_tag;
2505 vm_tag_t user_alias;
2506 kern_return_t kr;
2507 bool clear_map_aligned = FALSE;
2508 vm_map_size_t chunk_size = 0;
2509 vm_object_t caller_object;
2510 VM_MAP_ZAP_DECLARE(zap_old_list);
2511 VM_MAP_ZAP_DECLARE(zap_new_list);
2512
2513 caller_object = object;
2514
2515 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
2516
2517 if (vmk_flags.vmf_4gb_chunk) {
2518#if defined(__LP64__)
2519 chunk_size = (4ULL * 1024 * 1024 * 1024); /* max. 4GB chunks for the new allocation */
2520#else /* __LP64__ */
2521 chunk_size = ANON_CHUNK_SIZE;
2522#endif /* __LP64__ */
2523 } else {
2524 chunk_size = ANON_CHUNK_SIZE;
2525 }
2526
2527
2528
2529 if (superpage_size) {
2530 switch (superpage_size) {
2531 /*
2532 * Note that the current implementation only supports
2533 * a single superpage size, SUPERPAGE_SIZE, per
2534 * architecture. Once more sizes are to be supported,
2535 * SUPERPAGE_SIZE has to be replaced with a lookup
2536 * of the size based on superpage_size.
2537 */
2538#ifdef __x86_64__
2539 case SUPERPAGE_SIZE_ANY:
2540 /* handle it like 2 MB and round up to page size */
2541 size = (size + 2 * 1024 * 1024 - 1) & ~(2 * 1024 * 1024 - 1);
2542 OS_FALLTHROUGH;
2543 case SUPERPAGE_SIZE_2MB:
2544 break;
2545#endif
2546 default:
2547 return KERN_INVALID_ARGUMENT;
2548 }
2549 mask = SUPERPAGE_SIZE - 1;
2550 if (size & (SUPERPAGE_SIZE - 1)) {
2551 return KERN_INVALID_ARGUMENT;
2552 }
2553 inheritance = VM_INHERIT_NONE; /* fork() children won't inherit superpages */
2554 }
2555
2556
2557 if ((cur_protection & VM_PROT_WRITE) &&
2558 (cur_protection & VM_PROT_EXECUTE) &&
2559#if XNU_TARGET_OS_OSX
2560 map->pmap != kernel_pmap &&
2561 (cs_process_global_enforcement() ||
2562 (vmk_flags.vmkf_cs_enforcement_override
2563 ? vmk_flags.vmkf_cs_enforcement
2564 : (vm_map_cs_enforcement(map)
2565#if __arm64__
2566 || !VM_MAP_IS_EXOTIC(map)
2567#endif /* __arm64__ */
2568 ))) &&
2569#endif /* XNU_TARGET_OS_OSX */
2570#if CODE_SIGNING_MONITOR
2571 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
2572#endif
2573 (VM_MAP_POLICY_WX_FAIL(map) ||
2574 VM_MAP_POLICY_WX_STRIP_X(map)) &&
2575 !entry_for_jit) {
2576 boolean_t vm_protect_wx_fail = VM_MAP_POLICY_WX_FAIL(map);
2577
2578 DTRACE_VM3(cs_wx,
2579 uint64_t, 0,
2580 uint64_t, 0,
2581 vm_prot_t, cur_protection);
2582 printf("CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. %s\n",
2583     proc_selfpid(),
2584     (get_bsdtask_info(current_task())
2585     ? proc_name_address(get_bsdtask_info(current_task()))
2586 : "?"),
2587 __FUNCTION__,
2588 (vm_protect_wx_fail ? "failing" : "turning off execute"));
2589 cur_protection &= ~VM_PROT_EXECUTE;
2590 if (vm_protect_wx_fail) {
2591 return KERN_PROTECTION_FAILURE;
2592 }
2593 }
2594
2595 if (entry_for_jit
2596 && cur_protection != VM_PROT_ALL) {
2597 /*
2598 * Native macOS processes and all non-macOS processes are
2599 * expected to create JIT regions via mmap(MAP_JIT, RWX) but
2600 * the RWX requirement was not enforced, and thus, we must live
2601 * with our sins. We are now dealing with a JIT mapping without
2602 * RWX.
2603 *
2604 * We deal with these by letting the MAP_JIT stick in order
2605 * to avoid CS violations when these pages are mapped executable
2606 * down the line. In order to appease the page table monitor (you
2607 * know what I'm talking about), these pages will end up being
2608 * marked as XNU_USER_DEBUG, which will be allowed because we
2609 * don't enforce the code signing monitor on macOS systems. If
2610 * the user-space application ever changes permissions to RWX,
2611 * which they are allowed to since the mapping was originally
2612 * created with MAP_JIT, then they'll switch over to using the
2613 * XNU_USER_JIT type, and won't be allowed to downgrade any
2614 * more after that.
2615 *
2616 * When not on macOS, a MAP_JIT mapping without VM_PROT_ALL is
2617 * strictly disallowed.
2618 */
2619
2620#if XNU_TARGET_OS_OSX
2621 /*
2622 * Continue to allow non-RWX JIT
2623 */
2624#else
2625 /* non-macOS: reject JIT regions without RWX */
2626 DTRACE_VM3(cs_wx,
2627 uint64_t, 0,
2628 uint64_t, 0,
2629 vm_prot_t, cur_protection);
2630 printf("CODE SIGNING: %d[%s] %s(%d): JIT requires RWX: failing. \n",
2631 proc_selfpid(),
2632 (get_bsdtask_info(current_task())
2633 ? proc_name_address(get_bsdtask_info(current_task()))
2634 : "?"),
2635 __FUNCTION__,
2636 cur_protection);
2637 return KERN_PROTECTION_FAILURE;
2638#endif
2639 }
2640
2641 /*
2642 * If the task has requested executable lockdown,
2643 * deny any new executable mapping.
2644 */
2645 if (map->map_disallow_new_exec == TRUE) {
2646 if (cur_protection & VM_PROT_EXECUTE) {
2647 return KERN_PROTECTION_FAILURE;
2648 }
2649 }
2650
2651 if (resilient_codesign) {
2652 assert(!is_submap);
2653 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
2654 if ((cur_protection | max_protection) & reject_prot) {
2655 return KERN_PROTECTION_FAILURE;
2656 }
2657 }
2658
2659 if (resilient_media) {
2660 assert(!is_submap);
2661// assert(!needs_copy);
2662 if (object != VM_OBJECT_NULL &&
2663 !object->internal) {
2664 /*
2665 * This mapping is directly backed by an external
2666 * memory manager (e.g. a vnode pager for a file):
2667 * we would not have any safe place to inject
2668 * a zero-filled page if an actual page is not
2669 * available, without possibly impacting the actual
2670 * contents of the mapped object (e.g. the file),
2671 * so we can't provide any media resiliency here.
2672 */
2673 return KERN_INVALID_ARGUMENT;
2674 }
2675 }
2676
2677 if (entry_for_tpro) {
2678 /*
2679 * TPRO overrides the effective permissions of the region
2680 * and explicitly maps as RW. Ensure we have been passed
2681 * the expected permissions. We accept a read-only
2682 * `cur_protection`, as that will be handled on fault.
2683 */
2684 if (!(max_protection & VM_PROT_READ) ||
2685 !(max_protection & VM_PROT_WRITE) ||
2686 !(cur_protection & VM_PROT_READ)) {
2687 return KERN_PROTECTION_FAILURE;
2688 }
2689
2690 /*
2691 * We can now downgrade the cur_protection to RO. This is a mild lie
2692 * to the VM layer. But TPRO will be responsible for toggling the
2693 * protections between RO/RW.
2694 */
2695 cur_protection = VM_PROT_READ;
2696 }
2697
2698 if (is_submap) {
2699 vm_map_t submap;
2700 if (purgable) {
2701 /* submaps can not be purgeable */
2702 return KERN_INVALID_ARGUMENT;
2703 }
2704 if (object == VM_OBJECT_NULL) {
2705 /* submaps can not be created lazily */
2706 return KERN_INVALID_ARGUMENT;
2707 }
2708 submap = (vm_map_t) object;
2709 if (VM_MAP_PAGE_SHIFT(submap) != VM_MAP_PAGE_SHIFT(map)) {
2710 /* page size mismatch */
2711 return KERN_INVALID_ARGUMENT;
2712 }
2713 }
2714 if (vmk_flags.vmkf_already) {
2715 /*
2716 * VM_FLAGS_ALREADY says that it's OK if the same mapping
2717 * is already present. For it to be meaningful, the requested
2718 * mapping has to be at a fixed address (!VM_FLAGS_ANYWHERE) and
2719 * we shouldn't try to remove what was mapped there first
2720 * (!VM_FLAGS_OVERWRITE).
2721 */
2722 if (!vmk_flags.vmf_fixed || vmk_flags.vmf_overwrite) {
2723 return KERN_INVALID_ARGUMENT;
2724 }
2725 }
2726
2727 if (size == 0 ||
2728 (offset & MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK_64)) != 0) {
2729 *address = 0;
2730 return KERN_INVALID_ARGUMENT;
2731 }
2732
2733 if (map->pmap == kernel_pmap) {
2734 user_alias = VM_KERN_MEMORY_NONE;
2735 } else {
2736 user_alias = alias;
2737 }
2738
2739 if (user_alias == VM_MEMORY_MALLOC_MEDIUM) {
2740 chunk_size = MALLOC_MEDIUM_CHUNK_SIZE;
2741 }
2742
2743#define RETURN(value) { result = value; goto BailOut; }
2744
2745 assertf(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK), "0x%llx", (uint64_t)*address);
2746 assertf(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK), "0x%llx", (uint64_t)size);
2747 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
2748 assertf(page_aligned(*address), "0x%llx", (uint64_t)*address);
2749 assertf(page_aligned(size), "0x%llx", (uint64_t)size);
2750 }
2751
2752 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2753 !VM_MAP_PAGE_ALIGNED(size, VM_MAP_PAGE_MASK(map))) {
2754 /*
2755 * In most cases, the caller rounds the size up to the
2756 * map's page size.
2757 * If we get a size that is explicitly not map-aligned here,
2758 * we'll have to respect the caller's wish and mark the
2759 * mapping as "not map-aligned" to avoid tripping the
2760 * map alignment checks later.
2761 */
2762 clear_map_aligned = TRUE;
2763 }
2764 if (!anywhere &&
2765 VM_MAP_PAGE_MASK(map) >= PAGE_MASK &&
2766 !VM_MAP_PAGE_ALIGNED(*address, VM_MAP_PAGE_MASK(map))) {
2767 /*
2768 * We've been asked to map at a fixed address and that
2769 * address is not aligned to the map's specific alignment.
2770 * The caller should know what it's doing (i.e. most likely
2771 * mapping some fragmented copy map, transferring memory from
2772 * a VM map with a different alignment), so clear map_aligned
2773 * for this new VM map entry and proceed.
2774 */
2775 clear_map_aligned = TRUE;
2776 }
2777
2778 /*
2779 * Only zero-fill objects are allowed to be purgable.
2780 * LP64todo - limit purgable objects to 32-bits for now
2781 */
2782 if (purgable &&
2783 (offset != 0 ||
2784 (object != VM_OBJECT_NULL &&
2785 (object->vo_size != size ||
2786 object->purgable == VM_PURGABLE_DENY))
2787#if __LP64__
2788 || size > ANON_MAX_SIZE
2789#endif
2790 )) {
2791 return KERN_INVALID_ARGUMENT;
2792 }
2793
2794 start = *address;
2795
2796 if (anywhere) {
2797 vm_map_lock(map);
2798 map_locked = TRUE;
2799
2800 result = vm_map_locate_space(map, size, mask, vmk_flags,
2801     &start, &entry);
2802 if (result != KERN_SUCCESS) {
2803 goto BailOut;
2804 }
2805
2806 *address = start;
2807 end = start + size;
2808 assert(VM_MAP_PAGE_ALIGNED(*address,
2809 VM_MAP_PAGE_MASK(map)));
2810 } else {
2811 vm_map_offset_t effective_min_offset, effective_max_offset;
2812
2813 effective_min_offset = map->min_offset;
2814 effective_max_offset = map->max_offset;
2815
2816 if (vmk_flags.vmkf_beyond_max) {
2817 /*
2818 * Allow an insertion beyond the map's max offset.
2819 */
2820 effective_max_offset = 0x00000000FFFFF000ULL;
2821 if (vm_map_is_64bit(map)) {
2822 effective_max_offset = 0xFFFFFFFFFFFFF000ULL;
2823 }
2824#if XNU_TARGET_OS_OSX
2825 } else if (__improbable(vmk_flags.vmkf_32bit_map_va)) {
2826 effective_max_offset = MIN(map->max_offset, 0x00000000FFFFF000ULL);
2827#endif /* XNU_TARGET_OS_OSX */
2828 }
2829
2830 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT &&
2831 !overwrite &&
2832 user_alias == VM_MEMORY_REALLOC) {
2833 /*
2834 * Force realloc() to switch to a new allocation,
2835 * to prevent 4k-fragmented virtual ranges.
2836 */
2837// DEBUG4K_ERROR("no realloc in place");
2838 return KERN_NO_SPACE;
2839 }
2840
2841 /*
2842 * Verify that:
2843 * the address doesn't itself violate
2844 * the mask requirement.
2845 */
2846
2847 vm_map_lock(map);
2848 map_locked = TRUE;
2849 if ((start & mask) != 0) {
2850 RETURN(KERN_NO_SPACE);
2851 }
2852
2853#if CONFIG_MAP_RANGES
2854 if (map->uses_user_ranges) {
2855 struct mach_vm_range r;
2856
2857 vm_map_user_range_resolve(map, start, 1, &r);
2858 if (r.max_address == 0) {
2859 RETURN(KERN_INVALID_ADDRESS);
2860 }
2861 effective_min_offset = r.min_address;
2862 effective_max_offset = r.max_address;
2863 }
2864#endif /* CONFIG_MAP_RANGES */
2865
2866 if ((startup_phase >= STARTUP_SUB_KMEM) && !is_submap &&
2867 (map == kernel_map)) {
2868 mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
2869 effective_min_offset = r->min_address;
2870 effective_max_offset = r->max_address;
2871 }
2872
2873 /*
2874 * ... the address is within bounds
2875 */
2876
2877 end = start + size;
2878
2879 if ((start < effective_min_offset) ||
2880 (end > effective_max_offset) ||
2881 (start >= end)) {
2882 RETURN(KERN_INVALID_ADDRESS);
2883 }
2884
2885 if (overwrite) {
2886 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_TO_OVERWRITE;
2887 kern_return_t remove_kr;
2888
2889 /*
2890 * Fixed mapping and "overwrite" flag: attempt to
2891 * remove all existing mappings in the specified
2892 * address range, saving them in our "zap_old_list".
2893 *
2894 * This avoids releasing the VM map lock in
2895 * vm_map_entry_delete() and allows atomicity
2896 * when we want to replace some mappings with a new one.
2897 * It also allows us to restore the old VM mappings if the
2898 * new mapping fails.
2899 */
2900 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
2901
2902 if (vmk_flags.vmkf_overwrite_immutable) {
2903 /* we can overwrite immutable mappings */
2904 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
2905 }
2906 if (vmk_flags.vmkf_remap_prot_copy) {
2907 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
2908 }
2909 remove_kr = vm_map_delete(map, start, end, remove_flags,
2910     KMEM_GUARD_NONE, &zap_old_list).kmr_return;
2911 if (remove_kr) {
2912 /* XXX FBDP restore zap_old_list? */
2913 RETURN(remove_kr);
2914 }
2915 }
2916
2917 /*
2918 * ... the starting address isn't allocated
2919 */
2920
2921 if (vm_map_lookup_entry(map, start, &entry)) {
2922 if (!(vmk_flags.vmkf_already)) {
2923 RETURN(KERN_NO_SPACE);
2924 }
2925 /*
2926 * Check if what's already there is what we want.
2927 */
2928 tmp_start = start;
2929 tmp_offset = offset;
2930 if (entry->vme_start < start) {
2931 tmp_start -= start - entry->vme_start;
2932 tmp_offset -= start - entry->vme_start;
2933 }
2934 for (; entry->vme_start < end;
2935 entry = entry->vme_next) {
2936 /*
2937 * Check if the mapping's attributes
2938 * match the existing map entry.
2939 */
2940 if (entry == vm_map_to_entry(map) ||
2941 entry->vme_start != tmp_start ||
2942 entry->is_sub_map != is_submap ||
2943 VME_OFFSET(entry) != tmp_offset ||
2944 entry->needs_copy != needs_copy ||
2945 entry->protection != cur_protection ||
2946 entry->max_protection != max_protection ||
2947 entry->inheritance != inheritance ||
2948 entry->iokit_acct != iokit_acct ||
2949 VME_ALIAS(entry) != alias) {
2950 /* not the same mapping ! */
2951 RETURN(KERN_NO_SPACE);
2952 }
2953 /*
2954 * Check if the same object is being mapped.
2955 */
2956 if (is_submap) {
2957 if (VME_SUBMAP(entry) !=
2958 (vm_map_t) object) {
2959 /* not the same submap */
2960 RETURN(KERN_NO_SPACE);
2961 }
2962 } else {
2963 if (VME_OBJECT(entry) != object) {
2964 /* not the same VM object... */
2965 vm_object_t obj2;
2966
2967 obj2 = VME_OBJECT(entry);
2968 if ((obj2 == VM_OBJECT_NULL ||
2969 obj2->internal) &&
2970 (object == VM_OBJECT_NULL ||
2971 object->internal)) {
2972 /*
2973 * ... but both are
2974 * anonymous memory,
2975 * so equivalent.
2976 */
2977 } else {
2978 RETURN(KERN_NO_SPACE);
2979 }
2980 }
2981 }
2982
2983 tmp_offset += entry->vme_end - entry->vme_start;
2984 tmp_start += entry->vme_end - entry->vme_start;
2985 if (entry->vme_end >= end) {
2986 /* reached the end of our mapping */
2987 break;
2988 }
2989 }
2990 /* it all matches: let's use what's already there ! */
2991 RETURN(KERN_MEMORY_PRESENT);
2992 }
2993
2994 /*
2995 * ... the next region doesn't overlap the
2996 * end point.
2997 */
2998
2999 if ((entry->vme_next != vm_map_to_entry(map)) &&
3000 (entry->vme_next->vme_start < end)) {
3001 RETURN(KERN_NO_SPACE);
3002 }
3003 }
3004
3005 /*
3006 * At this point,
3007 * "start" and "end" should define the endpoints of the
3008 * available new range, and
3009 * "entry" should refer to the region before the new
3010 * range, and
3011 *
3012 * the map should be locked.
3013 */
3014
3015 /*
3016 * See whether we can avoid creating a new entry (and object) by
3017 * extending one of our neighbors. [So far, we only attempt to
3018 * extend from below.] Note that we can never extend/join
3019 * purgable objects because they need to remain distinct
3020 * entities in order to implement their "volatile object"
3021 * semantics.
3022 */
3023
3024 if (purgable ||
3025 entry_for_jit ||
3026 entry_for_tpro ||
3027 vm_memory_malloc_no_cow(user_alias)) {
3028 if (object == VM_OBJECT_NULL) {
3029 object = vm_object_allocate(size);
3030 vm_object_lock(object);
3031 object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3032 VM_OBJECT_SET_TRUE_SHARE(object, FALSE);
3033 if (malloc_no_cow_except_fork &&
3034 !purgable &&
3035 !entry_for_jit &&
3036 !entry_for_tpro &&
3037 vm_memory_malloc_no_cow(user_alias)) {
3038 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY_FORK;
3039 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
3040 }
3041 if (purgable) {
3042 task_t owner;
3043 VM_OBJECT_SET_PURGABLE(object, VM_PURGABLE_NONVOLATILE);
3044 if (map->pmap == kernel_pmap) {
3045 /*
3046 * Purgeable mappings made in a kernel
3047 * map are "owned" by the kernel itself
3048 * rather than the current user task
3049 * because they're likely to be used by
3050 * more than this user task (see
3051 * execargs_purgeable_allocate(), for
3052 * example).
3053 */
3054 owner = kernel_task;
3055 } else {
3056 owner = current_task();
3057 }
3058 assert(object->vo_owner == NULL);
3059 assert(object->resident_page_count == 0);
3060 assert(object->wired_page_count == 0);
3061 vm_purgeable_nonvolatile_enqueue(object, owner);
3062 }
3063 vm_object_unlock(object);
3064 offset = (vm_object_offset_t)0;
3065 }
3066 } else if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
3067 /* no coalescing if address space uses sub-pages */
3068 } else if ((is_submap == FALSE) &&
3069 (object == VM_OBJECT_NULL) &&
3070 (entry != vm_map_to_entry(map)) &&
3071 (entry->vme_end == start) &&
3072 (!entry->is_shared) &&
3073 (!entry->is_sub_map) &&
3074 (!entry->in_transition) &&
3075 (!entry->needs_wakeup) &&
3076 (entry->behavior == VM_BEHAVIOR_DEFAULT) &&
3077 (entry->protection == cur_protection) &&
3078 (entry->max_protection == max_protection) &&
3079 (entry->inheritance == inheritance) &&
3080 ((user_alias == VM_MEMORY_REALLOC) ||
3081 (VME_ALIAS(entry) == alias)) &&
3082 (entry->no_cache == no_cache) &&
3083 (entry->vme_permanent == permanent) &&
3084 /* no coalescing for immutable executable mappings */
3085 !((entry->protection & VM_PROT_EXECUTE) &&
3086 entry->vme_permanent) &&
3087 (!entry->superpage_size && !superpage_size) &&
3088 /*
3089 * No coalescing if not map-aligned, to avoid propagating
3090 * that condition any further than needed:
3091 */
3092 (!entry->map_aligned || !clear_map_aligned) &&
3093 (!entry->zero_wired_pages) &&
3094 (!entry->used_for_jit && !entry_for_jit) &&
3095#if __arm64e__
3096 (!entry->used_for_tpro && !entry_for_tpro) &&
3097#endif
3098 (!entry->csm_associated) &&
3099 (entry->iokit_acct == iokit_acct) &&
3100 (!entry->vme_resilient_codesign) &&
3101 (!entry->vme_resilient_media) &&
3102 (!entry->vme_atomic) &&
3103 (entry->vme_no_copy_on_read == no_copy_on_read) &&
3104
3105 ((entry->vme_end - entry->vme_start) + size <=
3106 (user_alias == VM_MEMORY_REALLOC ?
3107 ANON_CHUNK_SIZE :
3108 NO_COALESCE_LIMIT)) &&
3109
3110 (entry->wired_count == 0)) { /* implies user_wired_count == 0 */
3111 if (vm_object_coalesce(VME_OBJECT(entry),
3112     VM_OBJECT_NULL,
3113     VME_OFFSET(entry),
3114     (vm_object_offset_t) 0,
3115     (vm_map_size_t)(entry->vme_end - entry->vme_start),
3116     (vm_map_size_t)(end - entry->vme_end))) {
3117 /*
3118 * Coalesced the two objects - can extend
3119 * the previous map entry to include the
3120 * new range.
3121 */
3122 map->size += (end - entry->vme_end);
3123 assert(entry->vme_start < end);
3124 assert(VM_MAP_PAGE_ALIGNED(end,
3125 VM_MAP_PAGE_MASK(map)));
3126 if (__improbable(vm_debug_events)) {
3127 DTRACE_VM5(map_entry_extend, vm_map_t, map, vm_map_entry_t, entry, vm_address_t, entry->vme_start, vm_address_t, entry->vme_end, vm_address_t, end);
3128 }
3129 entry->vme_end = end;
3130 if (map->holelistenabled) {
3131 vm_map_store_update_first_free(map, entry, TRUE);
3132 } else {
3133 vm_map_store_update_first_free(map, map->first_free, TRUE);
3134 }
3135 new_mapping_established = TRUE;
3136 RETURN(KERN_SUCCESS);
3137 }
3138 }
3139
3140 step = superpage_size ? SUPERPAGE_SIZE : (end - start);
3141 new_entry = NULL;
3142
3143 if (vmk_flags.vmkf_submap_adjust) {
3144 vm_map_adjust_offsets((vm_map_t)caller_object, start, end);
3145 offset = start;
3146 }
3147
3148 for (tmp2_start = start; tmp2_start < end; tmp2_start += step) {
3149 tmp2_end = tmp2_start + step;
3150 /*
3151 * Create a new entry
3152 *
3153 * XXX FBDP
3154 * The reserved "page zero" in each process's address space can
3155 * be arbitrarily large. Splitting it into separate objects and
3156 * therefore different VM map entries serves no purpose and just
3157 * slows down operations on the VM map, so let's not split the
3158 * allocation into chunks if the max protection is NONE. That
3159 * memory should never be accessible, so it will never get to the
3160 * default pager.
3161 */
3162 tmp_start = tmp2_start;
3163 if (!is_submap &&
3164 object == VM_OBJECT_NULL &&
3165 size > chunk_size &&
3166 max_protection != VM_PROT_NONE &&
3167 superpage_size == 0) {
3168 tmp_end = tmp_start + chunk_size;
3169 } else {
3170 tmp_end = tmp2_end;
3171 }
3172 do {
3173 if (!is_submap &&
3174 object != VM_OBJECT_NULL &&
3175 object->internal &&
3176 offset + (tmp_end - tmp_start) > object->vo_size) {
3177// printf("FBDP object %p size 0x%llx overmapping offset 0x%llx size 0x%llx\n", object, object->vo_size, offset, (uint64_t)(tmp_end - tmp_start));
3178 DTRACE_VM5(vm_map_enter_overmap,
3179 vm_map_t, map,
3180 vm_map_address_t, tmp_start,
3181 vm_map_address_t, tmp_end,
3182 vm_object_offset_t, offset,
3183 vm_object_size_t, object->vo_size);
3184 }
3185 new_entry = vm_map_entry_insert(map,
3186     entry, tmp_start, tmp_end,
3187     object, offset, vmk_flags,
3188     needs_copy,
3189     cur_protection, max_protection,
3190     (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3191     VM_INHERIT_NONE : inheritance),
3192     clear_map_aligned);
3193
3194 assert(!is_kernel_object(object) || (VM_KERN_MEMORY_NONE != alias));
3195
3196 if (resilient_codesign) {
3197 int reject_prot = (needs_copy ? VM_PROT_ALLEXEC : (VM_PROT_WRITE | VM_PROT_ALLEXEC));
3198 if (!((cur_protection | max_protection) & reject_prot)) {
3199 new_entry->vme_resilient_codesign = TRUE;
3200 }
3201 }
3202
3203 if (resilient_media &&
3204 (object == VM_OBJECT_NULL ||
3205 object->internal)) {
3206 new_entry->vme_resilient_media = TRUE;
3207 }
3208
3209 assert(!new_entry->iokit_acct);
3210 if (!is_submap &&
3211 object != VM_OBJECT_NULL &&
3212 (object->purgable != VM_PURGABLE_DENY ||
3213 object->vo_ledger_tag)) {
3214 assert(new_entry->use_pmap);
3215 assert(!new_entry->iokit_acct);
3216 /*
3217 * Turn off pmap accounting since
3218 * purgeable (or tagged) objects have their
3219 * own ledgers.
3220 */
3221 new_entry->use_pmap = FALSE;
3222 } else if (!is_submap &&
3223 iokit_acct &&
3224 object != VM_OBJECT_NULL &&
3225 object->internal) {
3226 /* alternate accounting */
3227 assert(!new_entry->iokit_acct);
3228 assert(new_entry->use_pmap);
3229 new_entry->iokit_acct = TRUE;
3230 new_entry->use_pmap = FALSE;
3231 DTRACE_VM4(
3232 vm_map_iokit_mapped_region,
3233 vm_map_t, map,
3234 vm_map_offset_t, new_entry->vme_start,
3235 vm_map_offset_t, new_entry->vme_end,
3236 int, VME_ALIAS(new_entry));
3237 vm_map_iokit_mapped_region(
3238 map,
3239 (new_entry->vme_end -
3240 new_entry->vme_start));
3241 } else if (!is_submap) {
3242 assert(!new_entry->iokit_acct);
3243 assert(new_entry->use_pmap);
3244 }
3245
3246 if (is_submap) {
3247 vm_map_t submap;
3248 boolean_t submap_is_64bit;
3249 boolean_t use_pmap;
3250
3251 assert(new_entry->is_sub_map);
3252 assert(!new_entry->use_pmap);
3253 assert(!new_entry->iokit_acct);
3254 submap = (vm_map_t) object;
3255 submap_is_64bit = vm_map_is_64bit(submap);
3256 use_pmap = vmk_flags.vmkf_nested_pmap;
3257#ifndef NO_NESTED_PMAP
3258 if (use_pmap && submap->pmap == NULL) {
3259 ledger_t ledger = map->pmap->ledger;
3260 /* we need a sub pmap to nest... */
3261 submap->pmap = pmap_create_options(ledger, 0,
3262     submap_is_64bit ? PMAP_CREATE_64BIT : 0);
3263 if (submap->pmap == NULL) {
3264 /* let's proceed without nesting... */
3265 }
3266#if defined(__arm64__)
3267 else {
3268 pmap_set_nested(submap->pmap);
3269 }
3270#endif
3271 }
3272 if (use_pmap && submap->pmap != NULL) {
3273 if (VM_MAP_PAGE_SHIFT(map) != VM_MAP_PAGE_SHIFT(submap)) {
3274 DEBUG4K_ERROR("map %p (%d) submap %p (%d): incompatible page sizes\n", map, VM_MAP_PAGE_SHIFT(map), submap, VM_MAP_PAGE_SHIFT(submap));
3275 kr = KERN_FAILURE;
3276 } else {
3277 kr = pmap_nest(map->pmap,
3278 submap->pmap,
3279 tmp_start,
3280 tmp_end - tmp_start);
3281 }
3282 if (kr != KERN_SUCCESS) {
3283 printf("vm_map_enter: "
3284 "pmap_nest(0x%llx,0x%llx) "
3285 "error 0x%x\n",
3286 (long long)tmp_start,
3287 (long long)tmp_end,
3288 kr);
3289 } else {
3290 /* we're now nested ! */
3291 new_entry->use_pmap = TRUE;
3292 pmap_empty = FALSE;
3293 }
3294 }
3295#endif /* NO_NESTED_PMAP */
3296 }
3297 entry = new_entry;
3298
3299 if (superpage_size) {
3300 vm_page_t pages, m;
3301 vm_object_t sp_object;
3302 vm_object_offset_t sp_offset;
3303
3304 VME_OFFSET_SET(entry, 0);
3305
3306 /* allocate one superpage */
3307 kr = cpm_allocate(SUPERPAGE_SIZE, &pages, 0, SUPERPAGE_NBASEPAGES - 1, TRUE, 0);
3308 if (kr != KERN_SUCCESS) {
3309 /* deallocate whole range... */
3310 new_mapping_established = TRUE;
3311 /* ... but only up to "tmp_end" */
3312 size -= end - tmp_end;
3313 RETURN(kr);
3314 }
3315
3316 /* create one vm_object per superpage */
3317 sp_object = vm_object_allocate((vm_map_size_t)(entry->vme_end - entry->vme_start));
3318 vm_object_lock(sp_object);
3319 sp_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
3320 VM_OBJECT_SET_PHYS_CONTIGUOUS(sp_object, TRUE);
3321 sp_object->vo_shadow_offset = (vm_object_offset_t)VM_PAGE_GET_PHYS_PAGE(pages) * PAGE_SIZE;
3322 VME_OBJECT_SET(entry, sp_object, false, 0);
3323 assert(entry->use_pmap);
3324
3325 /* enter the base pages into the object */
3326 for (sp_offset = 0;
3327 sp_offset < SUPERPAGE_SIZE;
3328 sp_offset += PAGE_SIZE) {
3329 m = pages;
3330 pmap_zero_page(VM_PAGE_GET_PHYS_PAGE(m));
3331 pages = NEXT_PAGE(m);
3332 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
3333 vm_page_insert_wired(m, sp_object, sp_offset, VM_KERN_MEMORY_OSFMK);
3334 }
3335 vm_object_unlock(sp_object);
3336 }
3337 } while (tmp_end != tmp2_end &&
3338 (tmp_start = tmp_end) &&
3339 (tmp_end = (tmp2_end - tmp_end > chunk_size) ?
3340 tmp_end + chunk_size : tmp2_end));
3341 }
3342
3343 new_mapping_established = TRUE;
3344
3345BailOut:
3346 assert(map_locked == TRUE);
3347
3348 /*
3349 * Address space limit enforcement (RLIMIT_AS and RLIMIT_DATA):
3350 * If we have identified and possibly established the new mapping(s),
3351 * make sure we did not go beyond the address space limit.
3352 */
3353 if (result == KERN_SUCCESS) {
3354 if (map->size_limit != RLIM_INFINITY &&
3355 map->size > map->size_limit) {
3356 /*
3357 * Establishing the requested mappings would exceed
3358 * the process's RLIMIT_AS limit: fail with
3359 * KERN_NO_SPACE.
3360 */
3361 result = KERN_NO_SPACE;
3362 printf("%d[%s] %s: map size 0x%llx over RLIMIT_AS 0x%llx\n",
3363     proc_selfpid(),
3364     (get_bsdtask_info(current_task())
3365     ? proc_name_address(get_bsdtask_info(current_task()))
3366 : "?"),
3367 __FUNCTION__,
3368 (uint64_t) map->size,
3369 (uint64_t) map->size_limit);
3370 DTRACE_VM2(vm_map_enter_RLIMIT_AS,
3371 vm_map_size_t, map->size,
3372 uint64_t, map->size_limit);
3373 vm_map_enter_RLIMIT_AS_count++;
3374 } else if (map->data_limit != RLIM_INFINITY &&
3375 map->size > map->data_limit) {
3376 /*
3377 * Establishing the requested mappings would exceed
3378 * the process's RLIMIT_DATA limit: fail with
3379 * KERN_NO_SPACE.
3380 */
3381 result = KERN_NO_SPACE;
3382 printf("%d[%s] %s: map size 0x%llx over RLIMIT_DATA 0x%llx\n",
3383     proc_selfpid(),
3384     (get_bsdtask_info(current_task())
3385     ? proc_name_address(get_bsdtask_info(current_task()))
3386 : "?"),
3387 __FUNCTION__,
3388 (uint64_t) map->size,
3389 (uint64_t) map->data_limit);
3390 DTRACE_VM2(vm_map_enter_RLIMIT_DATA,
3391 vm_map_size_t, map->size,
3392 uint64_t, map->data_limit);
3393 vm_map_enter_RLIMIT_DATA_count++;
3394 }
3395 }
3396
3397 if (result == KERN_SUCCESS) {
3398 vm_prot_t pager_prot;
3399 memory_object_t pager;
3400
3401#if DEBUG
3402 if (pmap_empty &&
3403 !(vmk_flags.vmkf_no_pmap_check)) {
3404 assert(pmap_is_empty(map->pmap,
3405 *address,
3406 *address + size));
3407 }
3408#endif /* DEBUG */
3409
3410 /*
3411 * For "named" VM objects, let the pager know that the
3412 * memory object is being mapped. Some pagers need to keep
3413 * track of this, to know when they can reclaim the memory
3414 * object, for example.
3415 * VM calls memory_object_map() for each mapping (specifying
3416 * the protection of each mapping) and calls
3417 * memory_object_last_unmap() when all the mappings are gone.
3418 */
3419 pager_prot = max_protection;
3420 if (needs_copy) {
3421 /*
3422 * Copy-On-Write mapping: won't modify
3423 * the memory object.
3424 */
3425 pager_prot &= ~VM_PROT_WRITE;
3426 }
3427 if (!is_submap &&
3428 object != VM_OBJECT_NULL &&
3429 object->named &&
3430 object->pager != MEMORY_OBJECT_NULL) {
3431 vm_object_lock(object);
3432 pager = object->pager;
3433 if (object->named &&
3434 pager != MEMORY_OBJECT_NULL) {
3435 assert(object->pager_ready);
3436 vm_object_mapping_wait(object, THREAD_UNINT);
3437 vm_object_mapping_begin(object);
3438 vm_object_unlock(object);
3439
3440 kr = memory_object_map(pager, pager_prot);
3441 assert(kr == KERN_SUCCESS);
3442
3443 vm_object_lock(object);
3444 vm_object_mapping_end(object);
3445 }
3446 vm_object_unlock(object);
3447 }
3448 }
3449
3450 assert(map_locked == TRUE);
3451
3452 if (new_mapping_established) {
3453 /*
3454 * If we release the map lock for any reason below,
3455 * another thread could deallocate our new mapping,
3456 * releasing the caller's reference on "caller_object",
3457 * which was transferred to the mapping.
3458 * If this was the only reference, the object could be
3459 * destroyed.
3460 *
3461 * We need to take an extra reference on "caller_object"
3462 * to keep it alive if we need to return the caller's
3463 * reference to the caller in case of failure.
3464 */
3465 if (is_submap) {
3466 vm_map_reference((vm_map_t)caller_object);
3467 } else {
3468 vm_object_reference(caller_object);
3469 }
3470 }
3471
3472 if (!keep_map_locked) {
3473 vm_map_unlock(map);
3474 map_locked = FALSE;
3475 entry = VM_MAP_ENTRY_NULL;
3476 new_entry = VM_MAP_ENTRY_NULL;
3477 }
3478
3479 /*
3480 * We can't hold the map lock if we enter this block.
3481 */
3482
3483 if (result == KERN_SUCCESS) {
3484 /* Wire down the new entry if the user
3485 * requested all new map entries be wired.
3486 */
3487 if ((map->wiring_required) || (superpage_size)) {
3488 assert(!keep_map_locked);
3489 pmap_empty = FALSE; /* pmap won't be empty */
3490 kr = vm_map_wire_kernel(map, start, end,
3491     cur_protection, VM_KERN_MEMORY_MLOCK,
3492 TRUE);
3493 result = kr;
3494 }
3495
3496 }
3497
3498 if (result != KERN_SUCCESS) {
3499 if (new_mapping_established) {
3500 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
3501
3502 /*
3503 * We have to get rid of the new mappings since we
3504 * won't make them available to the user.
3505 * Try to do that atomically, to minimize the risk
3506 * that someone else creates new mappings in that range.
3507 */
3508 if (!map_locked) {
3509 vm_map_lock(map);
3510 map_locked = TRUE;
3511 }
3512 remove_flags |= VM_MAP_REMOVE_NO_MAP_ALIGN;
3513 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
3514 if (permanent) {
3515 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
3516 }
3517 (void) vm_map_delete(map,
3518     *address, *address + size,
3519     remove_flags,
3520     KMEM_GUARD_NONE, &zap_new_list);
3521 }
3522
3523 if (vm_map_zap_first_entry(&zap_old_list)) {
3524 vm_map_entry_t entry1, entry2;
3525
3526 /*
3527 * The new mapping failed. Attempt to restore
3528 * the old mappings, saved in "zap_old_list".
3529 */
3530 if (!map_locked) {
3531 vm_map_lock(map);
3532 map_locked = TRUE;
3533 }
3534
3535 /* first check if the coast is still clear */
3536 start = vm_map_zap_first_entry(&zap_old_list)->vme_start;
3537 end = vm_map_zap_last_entry(&zap_old_list)->vme_end;
3538
3539 if (vm_map_lookup_entry(map, start, &entry1) ||
3540     vm_map_lookup_entry(map, end, &entry2) ||
3541 entry1 != entry2) {
3542 /*
3543 * Part of that range has already been
3544 * re-mapped: we can't restore the old
3545 * mappings...
3546 */
3547 vm_map_enter_restore_failures++;
3548 } else {
3549 /*
3550 * Transfer the saved map entries from
3551 * "zap_old_list" to the original "map",
3552 * inserting them all after "entry1".
3553 */
3554 while ((entry2 = vm_map_zap_pop(&zap_old_list))) {
3555 vm_map_size_t entry_size;
3556
3557 entry_size = (entry2->vme_end -
3558 entry2->vme_start);
3559 vm_map_store_entry_link(map, entry1, entry2,
3560 VM_MAP_KERNEL_FLAGS_NONE);
3561 map->size += entry_size;
3562 entry1 = entry2;
3563 }
3564 if (map->wiring_required) {
3565 /*
3566 * XXX TODO: we should rewire the
3567 * old pages here...
3568 */
3569 }
3570 vm_map_enter_restore_successes++;
3571 }
3572 }
3573 }
3574
3575 /*
3576 * The caller is responsible for releasing the lock if it requested to
3577 * keep the map locked.
3578 */
3579 if (map_locked && !keep_map_locked) {
3580 vm_map_unlock(map);
3581 }
3582
3583 vm_map_zap_dispose(&zap_old_list);
3584 vm_map_zap_dispose(&zap_new_list);
3585
3586 if (new_mapping_established) {
3587 /*
3588 * The caller had a reference on "caller_object" and we
3589 * transferred that reference to the mapping.
3590 * We also took an extra reference on "caller_object" to keep
3591 * it alive while the map was unlocked.
3592 */
3593 if (result == KERN_SUCCESS) {
3594 /*
3595 * On success, the caller's reference on the object gets
3596 * transferred to the mapping.
3597 * Release our extra reference.
3598 */
3599 if (is_submap) {
3600 vm_map_deallocate((vm_map_t)caller_object);
3601 } else {
3602 vm_object_deallocate(caller_object);
3603 }
3604 } else {
3605 /*
3606 * On error, the caller expects to still have a
3607 * reference on the object it gave us.
3608 * Let's use our extra reference for that.
3609 */
3610 }
3611 }
3612
3613 return result;
3614
3615#undef RETURN
3616}
3617
3618#if __arm64__
3619extern const struct memory_object_pager_ops fourk_pager_ops;
3620kern_return_t
3621vm_map_enter_fourk(
3622 vm_map_t map,
3623 vm_map_offset_t *address, /* IN/OUT */
3624 vm_map_size_t size,
3625 vm_map_offset_t mask,
3626 vm_map_kernel_flags_t vmk_flags,
3627 vm_object_t object,
3628 vm_object_offset_t offset,
3629 boolean_t needs_copy,
3630 vm_prot_t cur_protection,
3631 vm_prot_t max_protection,
3632 vm_inherit_t inheritance)
3633{
3634 vm_map_entry_t entry, new_entry;
3635 vm_map_offset_t start, fourk_start;
3636 vm_map_offset_t end, fourk_end;
3637 vm_map_size_t fourk_size;
3638 kern_return_t result = KERN_SUCCESS;
3639 boolean_t map_locked = FALSE;
3640 boolean_t pmap_empty = TRUE;
3641 boolean_t new_mapping_established = FALSE;
3642 const bool keep_map_locked = vmk_flags.vmkf_keep_map_locked;
3643 const bool anywhere = !vmk_flags.vmf_fixed;
3644 const bool purgable = vmk_flags.vmf_purgeable;
3645 const bool overwrite = vmk_flags.vmf_overwrite;
3646 const bool is_submap = vmk_flags.vmkf_submap;
3647 const bool entry_for_jit = vmk_flags.vmkf_map_jit;
3648 const unsigned int superpage_size = vmk_flags.vmf_superpage_size;
3649 vm_map_offset_t effective_min_offset, effective_max_offset;
3650 kern_return_t kr;
3651 boolean_t clear_map_aligned = FALSE;
3652 memory_object_t fourk_mem_obj;
3653 vm_object_t fourk_object;
3654 vm_map_offset_t fourk_pager_offset;
3655 int fourk_pager_index_start, fourk_pager_index_num;
3656 int cur_idx;
3657 boolean_t fourk_copy;
3658 vm_object_t copy_object;
3659 vm_object_offset_t copy_offset;
3660 VM_MAP_ZAP_DECLARE(zap_list);
3661
3662 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
3663 panic("%s:%d", __FUNCTION__, __LINE__);
3664 }
3665 fourk_mem_obj = MEMORY_OBJECT_NULL;
3666 fourk_object = VM_OBJECT_NULL;
3667
3668 if (superpage_size) {
3669 return KERN_NOT_SUPPORTED;
3670 }
3671
3672 if ((cur_protection & VM_PROT_WRITE) &&
3673 (cur_protection & VM_PROT_EXECUTE) &&
3674#if XNU_TARGET_OS_OSX
3675 map->pmap != kernel_pmap &&
3676 (vm_map_cs_enforcement(map)
3677#if __arm64__
3678 || !VM_MAP_IS_EXOTIC(map)
3679#endif /* __arm64__ */
3680 ) &&
3681#endif /* XNU_TARGET_OS_OSX */
3682#if CODE_SIGNING_MONITOR
3683 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
3684#endif
3685 !entry_for_jit) {
3686 DTRACE_VM3(cs_wx,
3687 uint64_t, 0,
3688 uint64_t, 0,
3689 vm_prot_t, cur_protection);
3690 printf(format: "CODE SIGNING: %d[%s] %s: curprot cannot be write+execute. "
3691 "turning off execute\n",
3692 proc_selfpid(),
3693 (get_bsdtask_info(current_task())
3694 ? proc_name_address(p: get_bsdtask_info(current_task()))
3695 : "?"),
3696 __FUNCTION__);
3697 cur_protection &= ~VM_PROT_EXECUTE;
3698 }
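 /*
  * Illustrative effect of the policy above (hypothetical request, not
  * tied to a specific caller): a non-JIT request for
  * VM_PROT_READ | VM_PROT_WRITE | VM_PROT_EXECUTE on a map with
  * code-signing enforcement is downgraded here to
  * VM_PROT_READ | VM_PROT_WRITE rather than being rejected.
  */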
3699
3700 /*
3701 * If the task has requested executable lockdown,
3702 * deny any new executable mapping.
3703 */
3704 if (map->map_disallow_new_exec == TRUE) {
3705 if (cur_protection & VM_PROT_EXECUTE) {
3706 return KERN_PROTECTION_FAILURE;
3707 }
3708 }
3709
3710 if (is_submap) {
3711 return KERN_NOT_SUPPORTED;
3712 }
3713 if (vmk_flags.vmkf_already) {
3714 return KERN_NOT_SUPPORTED;
3715 }
3716 if (purgable || entry_for_jit) {
3717 return KERN_NOT_SUPPORTED;
3718 }
3719
3720 effective_min_offset = map->min_offset;
3721
3722 if (vmk_flags.vmkf_beyond_max) {
3723 return KERN_NOT_SUPPORTED;
3724 } else {
3725 effective_max_offset = map->max_offset;
3726 }
3727
3728 if (size == 0 ||
3729 (offset & FOURK_PAGE_MASK) != 0) {
3730 *address = 0;
3731 return KERN_INVALID_ARGUMENT;
3732 }
3733
3734#define RETURN(value) { result = value; goto BailOut; }
3735
3736 assert(VM_MAP_PAGE_ALIGNED(*address, FOURK_PAGE_MASK));
3737 assert(VM_MAP_PAGE_ALIGNED(size, FOURK_PAGE_MASK));
3738
3739 if (!anywhere && overwrite) {
3740 return KERN_NOT_SUPPORTED;
3741 }
3742
3743 fourk_start = *address;
3744 fourk_size = size;
3745 fourk_end = fourk_start + fourk_size;
3746
3747 start = vm_map_trunc_page(*address, VM_MAP_PAGE_MASK(map));
3748 end = vm_map_round_page(fourk_end, VM_MAP_PAGE_MASK(map));
3749 size = end - start;
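 /*
  * Worked example (assuming a 16KB native map page size): a request
  * of *address = 0x100005000, size = 0x3000 yields
  * fourk_start = 0x100005000 and fourk_end = 0x100008000, while the
  * map-aligned range actually carved out is start = 0x100004000,
  * end = 0x100008000, i.e. size = 0x4000.
  */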
3750
3751 if (anywhere) {
3752 return KERN_NOT_SUPPORTED;
3753 } else {
3754 /*
3755 * Verify that:
3756 * the address doesn't itself violate
3757 * the mask requirement.
3758 */
3759
3760 vm_map_lock(map);
3761 map_locked = TRUE;
3762 if ((start & mask) != 0) {
3763 RETURN(KERN_NO_SPACE);
3764 }
3765
3766 /*
3767 * ... the address is within bounds
3768 */
3769
3770 end = start + size;
3771
3772 if ((start < effective_min_offset) ||
3773 (end > effective_max_offset) ||
3774 (start >= end)) {
3775 RETURN(KERN_INVALID_ADDRESS);
3776 }
3777
3778 /*
3779 * ... the starting address isn't allocated
3780 */
3781 if (vm_map_lookup_entry(map, address: start, entry: &entry)) {
3782 vm_object_t cur_object, shadow_object;
3783
3784 /*
3785 * We might already have some 4K mappings
3786 * in a 16K page here.
3787 */
3788
3789 if (entry->vme_end - entry->vme_start
3790 != SIXTEENK_PAGE_SIZE) {
3791 RETURN(KERN_NO_SPACE);
3792 }
3793 if (entry->is_sub_map) {
3794 RETURN(KERN_NO_SPACE);
3795 }
3796 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
3797 RETURN(KERN_NO_SPACE);
3798 }
3799
3800 /* go all the way down the shadow chain */
3801 cur_object = VME_OBJECT(entry);
3802 vm_object_lock(cur_object);
3803 while (cur_object->shadow != VM_OBJECT_NULL) {
3804 shadow_object = cur_object->shadow;
3805 vm_object_lock(shadow_object);
3806 vm_object_unlock(cur_object);
3807 cur_object = shadow_object;
3808 shadow_object = VM_OBJECT_NULL;
3809 }
3810 if (cur_object->internal ||
3811 cur_object->pager == NULL) {
3812 vm_object_unlock(cur_object);
3813 RETURN(KERN_NO_SPACE);
3814 }
3815 if (cur_object->pager->mo_pager_ops
3816 != &fourk_pager_ops) {
3817 vm_object_unlock(cur_object);
3818 RETURN(KERN_NO_SPACE);
3819 }
3820 fourk_object = cur_object;
3821 fourk_mem_obj = fourk_object->pager;
3822
3823 /* keep the "4K" object alive */
3824 vm_object_reference_locked(fourk_object);
3825 memory_object_reference(object: fourk_mem_obj);
3826 vm_object_unlock(fourk_object);
3827
3828 /* merge permissions */
3829 entry->protection |= cur_protection;
3830 entry->max_protection |= max_protection;
3831
3832 if ((entry->protection & VM_PROT_WRITE) &&
3833 (entry->protection & VM_PROT_ALLEXEC) &&
3834 fourk_binary_compatibility_unsafe &&
3835 fourk_binary_compatibility_allow_wx) {
3836 /* write+execute: need to be "jit" */
3837 entry->used_for_jit = TRUE;
3838 }
3839 goto map_in_fourk_pager;
3840 }
3841
3842 /*
3843 * ... the next region doesn't overlap the
3844 * end point.
3845 */
3846
3847 if ((entry->vme_next != vm_map_to_entry(map)) &&
3848 (entry->vme_next->vme_start < end)) {
3849 RETURN(KERN_NO_SPACE);
3850 }
3851 }
3852
3853 /*
3854 * At this point,
3855 * "start" and "end" should define the endpoints of the
3856 * available new range, and
3857 * "entry" should refer to the region before the new
3858 * range, and
3859 *
3860 * the map should be locked.
3861 */
3862
3863 /* create a new "4K" pager */
3864 fourk_mem_obj = fourk_pager_create();
3865 fourk_object = fourk_pager_to_vm_object(mem_obj: fourk_mem_obj);
3866 assert(fourk_object);
3867
3868 /* keep the "4" object alive */
3869 vm_object_reference(fourk_object);
3870
3871 /* create a "copy" object, to map the "4K" object copy-on-write */
3872 fourk_copy = TRUE;
3873 result = vm_object_copy_strategically(src_object: fourk_object,
3874 src_offset: 0,
3875 size: end - start,
3876 false, /* forking */
3877 dst_object: &copy_object,
3878 dst_offset: &copy_offset,
3879 dst_needs_copy: &fourk_copy);
3880 assert(result == KERN_SUCCESS);
3881 assert(copy_object != VM_OBJECT_NULL);
3882 assert(copy_offset == 0);
3883
3884 /* map the "4K" pager's copy object */
3885 new_entry = vm_map_entry_insert(map,
3886 insp_entry: entry,
3887 vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map)),
3888 vm_map_round_page(end, VM_MAP_PAGE_MASK(map)),
3889 object: copy_object,
3890 offset: 0, /* offset */
3891 vmk_flags,
3892 FALSE, /* needs_copy */
3893 cur_protection, max_protection,
3894 inheritance: (entry_for_jit && !VM_MAP_POLICY_ALLOW_JIT_INHERIT(map) ?
3895 VM_INHERIT_NONE : inheritance),
3896 clear_map_aligned);
3897 entry = new_entry;
3898
3899#if VM_MAP_DEBUG_FOURK
3900 if (vm_map_debug_fourk) {
3901 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] new pager %p\n",
3902 map,
3903 (uint64_t) entry->vme_start,
3904 (uint64_t) entry->vme_end,
3905 fourk_mem_obj);
3906 }
3907#endif /* VM_MAP_DEBUG_FOURK */
3908
3909 new_mapping_established = TRUE;
3910
3911map_in_fourk_pager:
3912 /* "map" the original "object" where it belongs in the "4K" pager */
3913 fourk_pager_offset = (fourk_start & SIXTEENK_PAGE_MASK);
3914 fourk_pager_index_start = (int) (fourk_pager_offset / FOURK_PAGE_SIZE);
3915 if (fourk_size > SIXTEENK_PAGE_SIZE) {
3916 fourk_pager_index_num = 4;
3917 } else {
3918 fourk_pager_index_num = (int) (fourk_size / FOURK_PAGE_SIZE);
3919 }
3920 if (fourk_pager_index_start + fourk_pager_index_num > 4) {
3921 fourk_pager_index_num = 4 - fourk_pager_index_start;
3922 }
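 /*
  * Worked example (same hypothetical request as above:
  * fourk_start = 0x100005000, fourk_size = 0x3000):
  * fourk_pager_offset = 0x1000, so fourk_pager_index_start = 1 and
  * fourk_pager_index_num = 3; the loop below therefore populates 4K
  * slots 1, 2 and 3 of the 16K page backed by the "4K" pager.
  */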
3923 for (cur_idx = 0;
3924 cur_idx < fourk_pager_index_num;
3925 cur_idx++) {
3926 vm_object_t old_object;
3927 vm_object_offset_t old_offset;
3928
3929 kr = fourk_pager_populate(mem_obj: fourk_mem_obj,
3930 TRUE, /* overwrite */
3931 index: fourk_pager_index_start + cur_idx,
3932 new_backing_object: object,
3933 new_backing_offset: (object
3934 ? (offset +
3935 (cur_idx * FOURK_PAGE_SIZE))
3936 : 0),
3937 old_backing_object: &old_object,
3938 old_backing_offset: &old_offset);
3939#if VM_MAP_DEBUG_FOURK
3940 if (vm_map_debug_fourk) {
3941 if (old_object == (vm_object_t) -1 &&
3942 old_offset == (vm_object_offset_t) -1) {
3943 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3944 "pager [%p:0x%llx] "
3945 "populate[%d] "
3946 "[object:%p,offset:0x%llx]\n",
3947 map,
3948 (uint64_t) entry->vme_start,
3949 (uint64_t) entry->vme_end,
3950 fourk_mem_obj,
3951 VME_OFFSET(entry),
3952 fourk_pager_index_start + cur_idx,
3953 object,
3954 (object
3955 ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3956 : 0));
3957 } else {
3958 printf("FOURK_PAGER: map %p [0x%llx:0x%llx] "
3959 "pager [%p:0x%llx] "
3960 "populate[%d] [object:%p,offset:0x%llx] "
3961 "old [%p:0x%llx]\n",
3962 map,
3963 (uint64_t) entry->vme_start,
3964 (uint64_t) entry->vme_end,
3965 fourk_mem_obj,
3966 VME_OFFSET(entry),
3967 fourk_pager_index_start + cur_idx,
3968 object,
3969 (object
3970 ? (offset + (cur_idx * FOURK_PAGE_SIZE))
3971 : 0),
3972 old_object,
3973 old_offset);
3974 }
3975 }
3976#endif /* VM_MAP_DEBUG_FOURK */
3977
3978 assert(kr == KERN_SUCCESS);
3979 if (object != old_object &&
3980 object != VM_OBJECT_NULL &&
3981 object != (vm_object_t) -1) {
3982 vm_object_reference(object);
3983 }
3984 if (object != old_object &&
3985 old_object != VM_OBJECT_NULL &&
3986 old_object != (vm_object_t) -1) {
3987 vm_object_deallocate(object: old_object);
3988 }
3989 }
3990
3991BailOut:
3992 assert(map_locked == TRUE);
3993
3994 if (result == KERN_SUCCESS) {
3995 vm_prot_t pager_prot;
3996 memory_object_t pager;
3997
3998#if DEBUG
3999 if (pmap_empty &&
4000 !(vmk_flags.vmkf_no_pmap_check)) {
4001 assert(pmap_is_empty(map->pmap,
4002 *address,
4003 *address + size));
4004 }
4005#endif /* DEBUG */
4006
4007 /*
4008 * For "named" VM objects, let the pager know that the
4009 * memory object is being mapped. Some pagers need to keep
4010 * track of this, to know when they can reclaim the memory
4011 * object, for example.
4012 * VM calls memory_object_map() for each mapping (specifying
4013 * the protection of each mapping) and calls
4014 * memory_object_last_unmap() when all the mappings are gone.
4015 */
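 /*
  * Hypothetical pager-side sketch of that protocol (names are
  * illustrative, not a real pager in this file):
  *
  *	kern_return_t
  *	my_pager_map(memory_object_t mem_obj, vm_prot_t prot)
  *	{
  *		my_pager_t pager = my_pager_lookup(mem_obj);
  *		pager->is_mapped = TRUE;   // at least one live mapping
  *		return KERN_SUCCESS;
  *	}
  *
  *	kern_return_t
  *	my_pager_last_unmap(memory_object_t mem_obj)
  *	{
  *		my_pager_t pager = my_pager_lookup(mem_obj);
  *		pager->is_mapped = FALSE;  // last mapping gone: may reclaim
  *		return KERN_SUCCESS;
  *	}
  */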
4016 pager_prot = max_protection;
4017 if (needs_copy) {
4018 /*
4019 * Copy-On-Write mapping: won't modify
4020 * the memory object.
4021 */
4022 pager_prot &= ~VM_PROT_WRITE;
4023 }
4024 if (!is_submap &&
4025 object != VM_OBJECT_NULL &&
4026 object->named &&
4027 object->pager != MEMORY_OBJECT_NULL) {
4028 vm_object_lock(object);
4029 pager = object->pager;
4030 if (object->named &&
4031 pager != MEMORY_OBJECT_NULL) {
4032 assert(object->pager_ready);
4033 vm_object_mapping_wait(object, THREAD_UNINT);
4034 vm_object_mapping_begin(object);
4035 vm_object_unlock(object);
4036
4037 kr = memory_object_map(memory_object: pager, prot: pager_prot);
4038 assert(kr == KERN_SUCCESS);
4039
4040 vm_object_lock(object);
4041 vm_object_mapping_end(object);
4042 }
4043 vm_object_unlock(object);
4044 }
4045 if (!is_submap &&
4046 fourk_object != VM_OBJECT_NULL &&
4047 fourk_object->named &&
4048 fourk_object->pager != MEMORY_OBJECT_NULL) {
4049 vm_object_lock(fourk_object);
4050 pager = fourk_object->pager;
4051 if (fourk_object->named &&
4052 pager != MEMORY_OBJECT_NULL) {
4053 assert(fourk_object->pager_ready);
4054 vm_object_mapping_wait(fourk_object,
4055 THREAD_UNINT);
4056 vm_object_mapping_begin(fourk_object);
4057 vm_object_unlock(fourk_object);
4058
4059 kr = memory_object_map(memory_object: pager, VM_PROT_READ);
4060 assert(kr == KERN_SUCCESS);
4061
4062 vm_object_lock(fourk_object);
4063 vm_object_mapping_end(fourk_object);
4064 }
4065 vm_object_unlock(fourk_object);
4066 }
4067 }
4068
4069 if (fourk_object != VM_OBJECT_NULL) {
4070 vm_object_deallocate(object: fourk_object);
4071 fourk_object = VM_OBJECT_NULL;
4072 memory_object_deallocate(object: fourk_mem_obj);
4073 fourk_mem_obj = MEMORY_OBJECT_NULL;
4074 }
4075
4076 assert(map_locked == TRUE);
4077
4078 if (!keep_map_locked) {
4079 vm_map_unlock(map);
4080 map_locked = FALSE;
4081 }
4082
4083 /*
4084 * We can't hold the map lock if we enter this block.
4085 */
4086
4087 if (result == KERN_SUCCESS) {
4088 /* Wire down the new entry if the user
4089 * requested all new map entries be wired.
4090 */
4091 if ((map->wiring_required) || (superpage_size)) {
4092 assert(!keep_map_locked);
4093 pmap_empty = FALSE; /* pmap won't be empty */
4094 kr = vm_map_wire_kernel(map, start, end,
4095 access_type: new_entry->protection, VM_KERN_MEMORY_MLOCK,
4096 TRUE);
4097 result = kr;
4098 }
4099
4100 }
4101
4102 if (result != KERN_SUCCESS) {
4103 if (new_mapping_established) {
4104 /*
4105 * We have to get rid of the new mappings since we
4106 * won't make them available to the user.
4107 * Try to do that atomically, to minimize the risk
4108 * that someone else creates new mappings in that range.
4109 */
4110
4111 if (!map_locked) {
4112 vm_map_lock(map);
4113 map_locked = TRUE;
4114 }
4115 (void)vm_map_delete(map, start: *address, end: *address + size,
4116 flags: VM_MAP_REMOVE_NO_MAP_ALIGN | VM_MAP_REMOVE_NO_YIELD,
4117 KMEM_GUARD_NONE, zap: &zap_list);
4118 }
4119 }
4120
4121 /*
4122 * The caller is responsible for releasing the lock if it requested to
4123 * keep the map locked.
4124 */
4125 if (map_locked && !keep_map_locked) {
4126 vm_map_unlock(map);
4127 }
4128
4129 vm_map_zap_dispose(list: &zap_list);
4130
4131 return result;
4132
4133#undef RETURN
4134}
4135#endif /* __arm64__ */
4136
4137/*
4138 * Counters for the prefault optimization.
4139 */
4140int64_t vm_prefault_nb_pages = 0;
4141int64_t vm_prefault_nb_bailout = 0;
4142
4143static kern_return_t
4144vm_map_enter_mem_object_helper(
4145 vm_map_t target_map,
4146 vm_map_offset_t *address,
4147 vm_map_size_t initial_size,
4148 vm_map_offset_t mask,
4149 vm_map_kernel_flags_t vmk_flags,
4150 ipc_port_t port,
4151 vm_object_offset_t offset,
4152 boolean_t copy,
4153 vm_prot_t cur_protection,
4154 vm_prot_t max_protection,
4155 vm_inherit_t inheritance,
4156 upl_page_list_ptr_t page_list,
4157 unsigned int page_list_count)
4158{
4159 vm_map_address_t map_addr;
4160 vm_map_size_t map_size;
4161 vm_object_t object;
4162 vm_object_size_t size;
4163 kern_return_t result;
4164 boolean_t mask_cur_protection, mask_max_protection;
4165 boolean_t kernel_prefault, try_prefault = (page_list_count != 0);
4166 vm_map_offset_t offset_in_mapping = 0;
4167#if __arm64__
4168 boolean_t fourk = vmk_flags.vmkf_fourk;
4169#endif /* __arm64__ */
4170
4171 if (VM_MAP_PAGE_SHIFT(map: target_map) < PAGE_SHIFT) {
4172 /* XXX TODO4K prefaulting depends on page size... */
4173 try_prefault = FALSE;
4174 }
4175
4176 assertf(vmk_flags.__vmkf_unused == 0, "vmk_flags unused=0x%x\n", vmk_flags.__vmkf_unused);
4177 vm_map_kernel_flags_update_range_id(flags: &vmk_flags, map: target_map);
4178
4179 mask_cur_protection = cur_protection & VM_PROT_IS_MASK;
4180 mask_max_protection = max_protection & VM_PROT_IS_MASK;
4181 cur_protection &= ~VM_PROT_IS_MASK;
4182 max_protection &= ~VM_PROT_IS_MASK;
4183
4184 /*
4185 * Check arguments for validity
4186 */
4187 if ((target_map == VM_MAP_NULL) ||
4188 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4189 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
4190 (inheritance > VM_INHERIT_LAST_VALID) ||
4191 (try_prefault && (copy || !page_list)) ||
4192 initial_size == 0) {
4193 return KERN_INVALID_ARGUMENT;
4194 }
4195
4196 if (__improbable((cur_protection & max_protection) != cur_protection)) {
4197 /* cur is more permissive than max */
4198 cur_protection &= max_protection;
4199 }
4200
4201#if __arm64__
4202 if (cur_protection & VM_PROT_EXECUTE) {
4203 cur_protection |= VM_PROT_READ;
4204 }
4205
4206 if (fourk && VM_MAP_PAGE_SHIFT(map: target_map) < PAGE_SHIFT) {
4207 /* no "fourk" if map is using a sub-page page size */
4208 fourk = FALSE;
4209 }
4210 if (fourk) {
4211 map_addr = vm_map_trunc_page(*address, FOURK_PAGE_MASK);
4212 map_size = vm_map_round_page(initial_size, FOURK_PAGE_MASK);
4213 } else
4214#endif /* __arm64__ */
4215 {
4216 map_addr = vm_map_trunc_page(*address,
4217 VM_MAP_PAGE_MASK(target_map));
4218 map_size = vm_map_round_page(initial_size,
4219 VM_MAP_PAGE_MASK(target_map));
4220 }
4221 if (map_size == 0) {
4222 return KERN_INVALID_ARGUMENT;
4223 }
4224 size = vm_object_round_page(initial_size);
4225
4226 /*
4227 * Find the vm object (if any) corresponding to this port.
4228 */
4229 if (!IP_VALID(port)) {
4230 object = VM_OBJECT_NULL;
4231 offset = 0;
4232 copy = FALSE;
4233 } else if (ip_kotype(port) == IKOT_NAMED_ENTRY) {
4234 vm_named_entry_t named_entry;
4235 vm_object_offset_t data_offset;
4236
4237 named_entry = mach_memory_entry_from_port(port);
4238
4239 if (vmk_flags.vmf_return_data_addr ||
4240 vmk_flags.vmf_return_4k_data_addr) {
4241 data_offset = named_entry->data_offset;
4242 offset += named_entry->data_offset;
4243 } else {
4244 data_offset = 0;
4245 }
4246
4247 /* a few checks to make sure user is obeying rules */
4248 if (mask_max_protection) {
4249 max_protection &= named_entry->protection;
4250 }
4251 if (mask_cur_protection) {
4252 cur_protection &= named_entry->protection;
4253 }
4254 if ((named_entry->protection & max_protection) !=
4255 max_protection) {
4256 return KERN_INVALID_RIGHT;
4257 }
4258 if ((named_entry->protection & cur_protection) !=
4259 cur_protection) {
4260 return KERN_INVALID_RIGHT;
4261 }
4262 if (offset + size <= offset) {
4263 /* overflow */
4264 return KERN_INVALID_ARGUMENT;
4265 }
4266 if (named_entry->size < (offset + initial_size)) {
4267 return KERN_INVALID_ARGUMENT;
4268 }
4269
4270 if (named_entry->is_copy) {
4271 /* for a vm_map_copy, we can only map it whole */
4272 if ((size != named_entry->size) &&
4273 (vm_map_round_page(size,
4274 VM_MAP_PAGE_MASK(target_map)) ==
4275 named_entry->size)) {
4276 /* XXX FBDP use the rounded size... */
4277 size = vm_map_round_page(
4278 size,
4279 VM_MAP_PAGE_MASK(target_map));
4280 }
4281 }
4282
4283 /* The caller's "offset" parameter is relative to the start of the */
4284 /* named entry; convert it into an offset within the backing object. */
4285 offset = offset + named_entry->offset;
4286
4287 if (!VM_MAP_PAGE_ALIGNED(size,
4288 VM_MAP_PAGE_MASK(target_map))) {
4289 /*
4290 * Let's not map more than requested;
4291 * vm_map_enter() will handle this "not map-aligned"
4292 * case.
4293 */
4294 map_size = size;
4295 }
4296
4297 named_entry_lock(named_entry);
4298 if (named_entry->is_sub_map) {
4299 vm_map_t submap;
4300
4301 if (vmk_flags.vmf_return_data_addr ||
4302 vmk_flags.vmf_return_4k_data_addr) {
4303 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for submap.");
4304 }
4305
4306 submap = named_entry->backing.map;
4307 vm_map_reference(map: submap);
4308 named_entry_unlock(named_entry);
4309
4310 vmk_flags.vmkf_submap = TRUE;
4311
4312 result = vm_map_enter(map: target_map,
4313 address: &map_addr,
4314 size: map_size,
4315 mask,
4316 vmk_flags,
4317 object: (vm_object_t)(uintptr_t) submap,
4318 offset,
4319 needs_copy: copy,
4320 cur_protection,
4321 max_protection,
4322 inheritance);
4323 if (result != KERN_SUCCESS) {
4324 vm_map_deallocate(map: submap);
4325 } else {
4326 /*
4327 * No need to lock "submap" just to check its
4328 * "mapped" flag: that flag is never reset
4329 * once it's been set and if we race, we'll
4330 * just end up setting it twice, which is OK.
4331 */
4332 if (submap->mapped_in_other_pmaps == FALSE &&
4333 vm_map_pmap(submap) != PMAP_NULL &&
4334 vm_map_pmap(submap) !=
4335 vm_map_pmap(target_map)) {
4336 /*
4337 * This submap is being mapped in a map
4338 * that uses a different pmap.
4339 * Set its "mapped_in_other_pmaps" flag
4340 * to indicate that we now need to
4341 * remove mappings from all pmaps rather
4342 * than just the submap's pmap.
4343 */
4344 vm_map_lock(submap);
4345 submap->mapped_in_other_pmaps = TRUE;
4346 vm_map_unlock(submap);
4347 }
4348 *address = map_addr;
4349 }
4350 return result;
4351 } else if (named_entry->is_copy) {
4352 kern_return_t kr;
4353 vm_map_copy_t copy_map;
4354 vm_map_entry_t copy_entry;
4355 vm_map_offset_t copy_addr;
4356 vm_map_copy_t target_copy_map;
4357 vm_map_offset_t overmap_start, overmap_end;
4358 vm_map_offset_t trimmed_start;
4359 vm_map_size_t target_size;
4360
4361 if (!vm_map_kernel_flags_check_vmflags(vmk_flags,
4362 vm_flags_mask: (VM_FLAGS_FIXED |
4363 VM_FLAGS_ANYWHERE |
4364 VM_FLAGS_OVERWRITE |
4365 VM_FLAGS_RETURN_4K_DATA_ADDR |
4366 VM_FLAGS_RETURN_DATA_ADDR))) {
4367 named_entry_unlock(named_entry);
4368 return KERN_INVALID_ARGUMENT;
4369 }
4370
4371 copy_map = named_entry->backing.copy;
4372 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
4373 if (copy_map->type != VM_MAP_COPY_ENTRY_LIST) {
4374 /* unsupported type; should not happen */
4375 printf(format: "vm_map_enter_mem_object: "
4376 "memory_entry->backing.copy "
4377 "unsupported type 0x%x\n",
4378 copy_map->type);
4379 named_entry_unlock(named_entry);
4380 return KERN_INVALID_ARGUMENT;
4381 }
4382
4383 if (VM_MAP_PAGE_SHIFT(map: target_map) != copy_map->cpy_hdr.page_shift) {
4384 DEBUG4K_SHARE("copy_map %p offset %llx size 0x%llx pgshift %d -> target_map %p pgshift %d\n", copy_map, offset, (uint64_t)map_size, copy_map->cpy_hdr.page_shift, target_map, VM_MAP_PAGE_SHIFT(target_map));
4385 }
4386
4387 if (vmk_flags.vmf_return_data_addr ||
4388 vmk_flags.vmf_return_4k_data_addr) {
4389 offset_in_mapping = offset & VM_MAP_PAGE_MASK(target_map);
4390 if (vmk_flags.vmf_return_4k_data_addr) {
4391 offset_in_mapping &= ~((signed)(0xFFF));
4392 }
4393 }
4394
4395 target_copy_map = VM_MAP_COPY_NULL;
4396 target_size = copy_map->size;
4397 overmap_start = 0;
4398 overmap_end = 0;
4399 trimmed_start = 0;
4400 if (copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(map: target_map)) {
4401 DEBUG4K_ADJUST("adjusting...\n");
4402 kr = vm_map_copy_adjust_to_target(
4403 copy_map,
4404 offset /* includes data_offset */,
4405 size: initial_size,
4406 target_map,
4407 copy,
4408 target_copy_map_p: &target_copy_map,
4409 overmap_start_p: &overmap_start,
4410 overmap_end_p: &overmap_end,
4411 trimmed_start_p: &trimmed_start);
4412 if (kr != KERN_SUCCESS) {
4413 named_entry_unlock(named_entry);
4414 return kr;
4415 }
4416 target_size = target_copy_map->size;
4417 if (trimmed_start >= data_offset) {
4418 data_offset = offset & VM_MAP_PAGE_MASK(target_map);
4419 } else {
4420 data_offset -= trimmed_start;
4421 }
4422 } else {
4423 /*
4424 * Assert that the vm_map_copy is coming from the right
4425 * zone and hasn't been forged
4426 */
4427 vm_map_copy_require(copy: copy_map);
4428 target_copy_map = copy_map;
4429 }
4430
4431 vm_map_kernel_flags_t rsv_flags = vmk_flags;
4432
4433 vm_map_kernel_flags_and_vmflags(vmk_flags: &rsv_flags,
4434 vm_flags_mask: (VM_FLAGS_FIXED |
4435 VM_FLAGS_ANYWHERE |
4436 VM_FLAGS_OVERWRITE |
4437 VM_FLAGS_RETURN_4K_DATA_ADDR |
4438 VM_FLAGS_RETURN_DATA_ADDR));
4439
4440 /* reserve a contiguous range */
4441 kr = vm_map_enter(map: target_map,
4442 address: &map_addr,
4443 vm_map_round_page(target_size, VM_MAP_PAGE_MASK(target_map)),
4444 mask,
4445 vmk_flags: rsv_flags,
4446 VM_OBJECT_NULL,
4447 offset: 0,
4448 FALSE, /* copy */
4449 cur_protection,
4450 max_protection,
4451 inheritance);
4452 if (kr != KERN_SUCCESS) {
4453 DEBUG4K_ERROR("kr 0x%x\n", kr);
4454 if (target_copy_map != copy_map) {
4455 vm_map_copy_discard(copy: target_copy_map);
4456 target_copy_map = VM_MAP_COPY_NULL;
4457 }
4458 named_entry_unlock(named_entry);
4459 return kr;
4460 }
4461
4462 copy_addr = map_addr;
4463
4464 for (copy_entry = vm_map_copy_first_entry(target_copy_map);
4465 copy_entry != vm_map_copy_to_entry(target_copy_map);
4466 copy_entry = copy_entry->vme_next) {
4467 vm_map_t copy_submap = VM_MAP_NULL;
4468 vm_object_t copy_object = VM_OBJECT_NULL;
4469 vm_map_size_t copy_size;
4470 vm_object_offset_t copy_offset;
4471 boolean_t do_copy = false;
4472
4473 if (copy_entry->is_sub_map) {
4474 copy_submap = VME_SUBMAP(copy_entry);
4475 copy_object = (vm_object_t)copy_submap;
4476 } else {
4477 copy_object = VME_OBJECT(copy_entry);
4478 }
4479 copy_offset = VME_OFFSET(entry: copy_entry);
4480 copy_size = (copy_entry->vme_end -
4481 copy_entry->vme_start);
4482
4483 /* sanity check */
4484 if ((copy_addr + copy_size) >
4485 (map_addr +
4486 overmap_start + overmap_end +
4487 named_entry->size /* XXX full size */)) {
4488 /* over-mapping too much !? */
4489 kr = KERN_INVALID_ARGUMENT;
4490 DEBUG4K_ERROR("kr 0x%x\n", kr);
4491 /* abort */
4492 break;
4493 }
4494
4495 /* take a reference on the object */
4496 if (copy_entry->is_sub_map) {
4497 vm_map_reference(map: copy_submap);
4498 } else {
4499 if (!copy &&
4500 copy_object != VM_OBJECT_NULL &&
4501 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
4502 bool is_writable;
4503
4504 /*
4505 * We need to resolve our side of this
4506 * "symmetric" copy-on-write now; we
4507 * need a new object to map and share,
4508 * instead of the current one which
4509 * might still be shared with the
4510 * original mapping.
4511 *
4512 * Note: A "vm_map_copy_t" does not
4513 * have a lock but we're protected by
4514 * the named entry's lock here.
4515 */
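 /*
  * Illustrative before/after (hypothetical objects O and S):
  * before, this copy entry and the original mapping both point at
  * the same symmetric-COW object O; after the shadow below, the
  * copy entry points at a new object S whose backing (shadow)
  * object is O, writable pmap mappings of O are write-protected so
  * later writes fault and copy, and S is marked true_share with a
  * COPY_DELAY strategy so it can be mapped and shared directly.
  */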
4516 // assert(copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
4517 VME_OBJECT_SHADOW(entry: copy_entry, length: copy_size, TRUE);
4518 assert(copy_object != VME_OBJECT(copy_entry));
4519 is_writable = false;
4520 if (copy_entry->protection & VM_PROT_WRITE) {
4521 is_writable = true;
4522#if __arm64e__
4523 } else if (copy_entry->used_for_tpro) {
4524 is_writable = true;
4525#endif /* __arm64e__ */
4526 }
4527 if (!copy_entry->needs_copy && is_writable) {
4528 vm_prot_t prot;
4529
4530 prot = copy_entry->protection & ~VM_PROT_WRITE;
4531 vm_object_pmap_protect(object: copy_object,
4532 offset: copy_offset,
4533 size: copy_size,
4534 PMAP_NULL,
4535 PAGE_SIZE,
4536 pmap_start: 0,
4537 prot);
4538 }
4539 copy_entry->needs_copy = FALSE;
4540 copy_entry->is_shared = TRUE;
4541 copy_object = VME_OBJECT(copy_entry);
4542 copy_offset = VME_OFFSET(entry: copy_entry);
4543 vm_object_lock(copy_object);
4544 /* we're about to make a shared mapping of this object */
4545 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
4546 VM_OBJECT_SET_TRUE_SHARE(object: copy_object, TRUE);
4547 vm_object_unlock(copy_object);
4548 }
4549
4550 if (copy_object != VM_OBJECT_NULL &&
4551 copy_object->named &&
4552 copy_object->pager != MEMORY_OBJECT_NULL &&
4553 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4554 memory_object_t pager;
4555 vm_prot_t pager_prot;
4556
4557 /*
4558 * For "named" VM objects, let the pager know that the
4559 * memory object is being mapped. Some pagers need to keep
4560 * track of this, to know when they can reclaim the memory
4561 * object, for example.
4562 * VM calls memory_object_map() for each mapping (specifying
4563 * the protection of each mapping) and calls
4564 * memory_object_last_unmap() when all the mappings are gone.
4565 */
4566 pager_prot = max_protection;
4567 if (copy) {
4568 /*
4569 * Copy-On-Write mapping: won't modify the
4570 * memory object.
4571 */
4572 pager_prot &= ~VM_PROT_WRITE;
4573 }
4574 vm_object_lock(copy_object);
4575 pager = copy_object->pager;
4576 if (copy_object->named &&
4577 pager != MEMORY_OBJECT_NULL &&
4578 copy_object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4579 assert(copy_object->pager_ready);
4580 vm_object_mapping_wait(copy_object, THREAD_UNINT);
4581 vm_object_mapping_begin(copy_object);
4582 vm_object_unlock(copy_object);
4583
4584 kr = memory_object_map(memory_object: pager, prot: pager_prot);
4585 assert(kr == KERN_SUCCESS);
4586
4587 vm_object_lock(copy_object);
4588 vm_object_mapping_end(copy_object);
4589 }
4590 vm_object_unlock(copy_object);
4591 }
4592
4593 /*
4594 * Perform the copy if requested
4595 */
4596
4597 if (copy && copy_object != VM_OBJECT_NULL) {
4598 vm_object_t new_object;
4599 vm_object_offset_t new_offset;
4600
4601 result = vm_object_copy_strategically(src_object: copy_object, src_offset: copy_offset,
4602 size: copy_size,
4603 false, /* forking */
4604 dst_object: &new_object, dst_offset: &new_offset,
4605 dst_needs_copy: &do_copy);
4606
4607
4608 if (result == KERN_MEMORY_RESTART_COPY) {
4609 boolean_t success;
4610 boolean_t src_needs_copy;
4611
4612 /*
4613 * XXX
4614 * We currently ignore src_needs_copy.
4615 * This really is the issue of how to make
4616 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4617 * non-kernel users to use. Solution forthcoming.
4618 * In the meantime, since we don't allow non-kernel
4619 * memory managers to specify symmetric copy,
4620 * we won't run into problems here.
4621 */
4622 new_object = copy_object;
4623 new_offset = copy_offset;
4624 success = vm_object_copy_quickly(object: new_object,
4625 src_offset: new_offset,
4626 size: copy_size,
4627 src_needs_copy: &src_needs_copy,
4628 dst_needs_copy: &do_copy);
4629 assert(success);
4630 result = KERN_SUCCESS;
4631 }
4632 if (result != KERN_SUCCESS) {
4633 kr = result;
4634 break;
4635 }
4636
4637 copy_object = new_object;
4638 copy_offset = new_offset;
4639 /*
4640 * No extra object reference for the mapping:
4641 * the mapping should be the only thing keeping
4642 * this new object alive.
4643 */
4644 } else {
4645 /*
4646 * We already have the right object
4647 * to map.
4648 */
4649 copy_object = VME_OBJECT(copy_entry);
4650 /* take an extra ref for the mapping below */
4651 vm_object_reference(copy_object);
4652 }
4653 }
4654
4655 /*
4656 * If the caller does not want a specific
4657 * tag for this new mapping: use
4658 * the tag of the original mapping.
4659 */
4660 vm_map_kernel_flags_t vmk_remap_flags = {
4661 .vmkf_submap = copy_entry->is_sub_map,
4662 };
4663
4664 vm_map_kernel_flags_set_vmflags(vmk_flags: &vmk_remap_flags,
4665 vm_flags: vm_map_kernel_flags_vmflags(vmk_flags),
4666 vm_tag: vmk_flags.vm_tag ?: VME_ALIAS(copy_entry));
4667
4668 /* over-map the object into destination */
4669 vmk_remap_flags.vmf_fixed = true;
4670 vmk_remap_flags.vmf_overwrite = true;
4671
4672 if (!copy && !copy_entry->is_sub_map) {
4673 /*
4674 * copy-on-write should have been
4675 * resolved at this point, or we would
4676 * end up sharing instead of copying.
4677 */
4678 assert(!copy_entry->needs_copy);
4679 }
4680#if XNU_TARGET_OS_OSX
4681 if (copy_entry->used_for_jit) {
4682 vmk_remap_flags.vmkf_map_jit = TRUE;
4683 }
4684#endif /* XNU_TARGET_OS_OSX */
4685
4686 kr = vm_map_enter(map: target_map,
4687 address: &copy_addr,
4688 size: copy_size,
4689 mask: (vm_map_offset_t) 0,
4690 vmk_flags: vmk_remap_flags,
4691 object: copy_object,
4692 offset: copy_offset,
4693 needs_copy: ((copy_object == NULL)
4694 ? FALSE
4695 : (copy || copy_entry->needs_copy)),
4696 cur_protection,
4697 max_protection,
4698 inheritance);
4699 if (kr != KERN_SUCCESS) {
4700 DEBUG4K_SHARE("failed kr 0x%x\n", kr);
4701 if (copy_entry->is_sub_map) {
4702 vm_map_deallocate(map: copy_submap);
4703 } else {
4704 vm_object_deallocate(object: copy_object);
4705 }
4706 /* abort */
4707 break;
4708 }
4709
4710 /* next mapping */
4711 copy_addr += copy_size;
4712 }
4713
4714 if (kr == KERN_SUCCESS) {
4715 if (vmk_flags.vmf_return_data_addr ||
4716 vmk_flags.vmf_return_4k_data_addr) {
4717 *address = map_addr + offset_in_mapping;
4718 } else {
4719 *address = map_addr;
4720 }
4721 if (overmap_start) {
4722 *address += overmap_start;
4723 DEBUG4K_SHARE("map %p map_addr 0x%llx offset_in_mapping 0x%llx overmap_start 0x%llx -> *address 0x%llx\n", target_map, (uint64_t)map_addr, (uint64_t) offset_in_mapping, (uint64_t)overmap_start, (uint64_t)*address);
4724 }
4725 }
4726 named_entry_unlock(named_entry);
4727 if (target_copy_map != copy_map) {
4728 vm_map_copy_discard(copy: target_copy_map);
4729 target_copy_map = VM_MAP_COPY_NULL;
4730 }
4731
4732 if (kr != KERN_SUCCESS && !vmk_flags.vmf_overwrite) {
4733 /* deallocate the contiguous range */
4734 (void) vm_deallocate(target_task: target_map,
4735 address: map_addr,
4736 size: map_size);
4737 }
4738
4739 return kr;
4740 }
4741
4742 if (named_entry->is_object) {
4743 unsigned int access;
4744 unsigned int wimg_mode;
4745
4746 /* we are mapping a VM object */
4747
4748 access = named_entry->access;
4749
4750 if (vmk_flags.vmf_return_data_addr ||
4751 vmk_flags.vmf_return_4k_data_addr) {
4752 offset_in_mapping = offset - VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4753 if (vmk_flags.vmf_return_4k_data_addr) {
4754 offset_in_mapping &= ~((signed)(0xFFF));
4755 }
4756 offset = VM_MAP_TRUNC_PAGE(offset, VM_MAP_PAGE_MASK(target_map));
4757 map_size = VM_MAP_ROUND_PAGE((offset + offset_in_mapping + initial_size) - offset, VM_MAP_PAGE_MASK(target_map));
4758 }
4759
4760 object = vm_named_entry_to_vm_object(named_entry);
4761 assert(object != VM_OBJECT_NULL);
4762 vm_object_lock(object);
4763 named_entry_unlock(named_entry);
4764
4765 vm_object_reference_locked(object);
4766
4767 wimg_mode = object->wimg_bits;
4768 vm_prot_to_wimg(prot: access, wimg: &wimg_mode);
4769 if (object->wimg_bits != wimg_mode) {
4770 vm_object_change_wimg_mode(object, wimg_mode);
4771 }
4772
4773 vm_object_unlock(object);
4774 } else {
4775 panic("invalid VM named entry %p", named_entry);
4776 }
4777 } else if (ip_kotype(port) == IKOT_MEMORY_OBJECT) {
4778 /*
4779 * JMM - This is temporary until we unify named entries
4780 * and raw memory objects.
4781 *
4782 * Detected fake ip_kotype for a memory object. In
4783 * this case, the port isn't really a port at all, but
4784 * instead is just a raw memory object.
4785 */
4786 if (vmk_flags.vmf_return_data_addr ||
4787 vmk_flags.vmf_return_4k_data_addr) {
4788 panic("VM_FLAGS_RETURN_DATA_ADDR not expected for raw memory object.");
4789 }
4790
4791 object = memory_object_to_vm_object(mem_obj: (memory_object_t)port);
4792 if (object == VM_OBJECT_NULL) {
4793 return KERN_INVALID_OBJECT;
4794 }
4795 vm_object_reference(object);
4796
4797 /* wait for object (if any) to be ready */
4798 if (object != VM_OBJECT_NULL) {
4799 if (is_kernel_object(object)) {
4800 printf(format: "Warning: Attempt to map kernel object"
4801 " by a non-private kernel entity\n");
4802 return KERN_INVALID_OBJECT;
4803 }
4804 if (!object->pager_ready) {
4805 vm_object_lock(object);
4806
4807 while (!object->pager_ready) {
4808 vm_object_wait(object,
4809 VM_OBJECT_EVENT_PAGER_READY,
4810 THREAD_UNINT);
4811 vm_object_lock(object);
4812 }
4813 vm_object_unlock(object);
4814 }
4815 }
4816 } else {
4817 return KERN_INVALID_OBJECT;
4818 }
4819
4820 if (object != VM_OBJECT_NULL &&
4821 object->named &&
4822 object->pager != MEMORY_OBJECT_NULL &&
4823 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4824 memory_object_t pager;
4825 vm_prot_t pager_prot;
4826 kern_return_t kr;
4827
4828 /*
4829 * For "named" VM objects, let the pager know that the
4830 * memory object is being mapped. Some pagers need to keep
4831 * track of this, to know when they can reclaim the memory
4832 * object, for example.
4833 * VM calls memory_object_map() for each mapping (specifying
4834 * the protection of each mapping) and calls
4835 * memory_object_last_unmap() when all the mappings are gone.
4836 */
4837 pager_prot = max_protection;
4838 if (copy) {
4839 /*
4840 * Copy-On-Write mapping: won't modify the
4841 * memory object.
4842 */
4843 pager_prot &= ~VM_PROT_WRITE;
4844 }
4845 vm_object_lock(object);
4846 pager = object->pager;
4847 if (object->named &&
4848 pager != MEMORY_OBJECT_NULL &&
4849 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
4850 assert(object->pager_ready);
4851 vm_object_mapping_wait(object, THREAD_UNINT);
4852 vm_object_mapping_begin(object);
4853 vm_object_unlock(object);
4854
4855 kr = memory_object_map(memory_object: pager, prot: pager_prot);
4856 assert(kr == KERN_SUCCESS);
4857
4858 vm_object_lock(object);
4859 vm_object_mapping_end(object);
4860 }
4861 vm_object_unlock(object);
4862 }
4863
4864 /*
4865 * Perform the copy if requested
4866 */
4867
4868 if (copy) {
4869 vm_object_t new_object;
4870 vm_object_offset_t new_offset;
4871
4872 result = vm_object_copy_strategically(src_object: object, src_offset: offset,
4873 size: map_size,
4874 false, /* forking */
4875 dst_object: &new_object, dst_offset: &new_offset,
4876 dst_needs_copy: &copy);
4877
4878
4879 if (result == KERN_MEMORY_RESTART_COPY) {
4880 boolean_t success;
4881 boolean_t src_needs_copy;
4882
4883 /*
4884 * XXX
4885 * We currently ignore src_needs_copy.
4886 * This really is the issue of how to make
4887 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
4888 * non-kernel users to use. Solution forthcoming.
4889 * In the meantime, since we don't allow non-kernel
4890 * memory managers to specify symmetric copy,
4891 * we won't run into problems here.
4892 */
4893 new_object = object;
4894 new_offset = offset;
4895 success = vm_object_copy_quickly(object: new_object,
4896 src_offset: new_offset,
4897 size: map_size,
4898 src_needs_copy: &src_needs_copy,
4899 dst_needs_copy: &copy);
4900 assert(success);
4901 result = KERN_SUCCESS;
4902 }
4903 /*
4904 * Throw away the reference to the
4905 * original object, as it won't be mapped.
4906 */
4907
4908 vm_object_deallocate(object);
4909
4910 if (result != KERN_SUCCESS) {
4911 return result;
4912 }
4913
4914 object = new_object;
4915 offset = new_offset;
4916 }
4917
4918 /*
4919 * If non-kernel users want to try to prefault pages, the mapping and prefault
4920 * need to be atomic.
4921 */
4922 kernel_prefault = (try_prefault && vm_kernel_map_is_kernel(map: target_map));
4923 vmk_flags.vmkf_keep_map_locked = (try_prefault && !kernel_prefault);
4924
4925#if __arm64__
4926 if (fourk) {
4927 /* map this object in a "4K" pager */
4928 result = vm_map_enter_fourk(map: target_map,
4929 address: &map_addr,
4930 size: map_size,
4931 mask: (vm_map_offset_t) mask,
4932 vmk_flags,
4933 object,
4934 offset,
4935 needs_copy: copy,
4936 cur_protection,
4937 max_protection,
4938 inheritance);
4939 } else
4940#endif /* __arm64__ */
4941 {
4942 result = vm_map_enter(map: target_map,
4943 address: &map_addr, size: map_size,
4944 mask: (vm_map_offset_t)mask,
4945 vmk_flags,
4946 object, offset,
4947 needs_copy: copy,
4948 cur_protection, max_protection,
4949 inheritance);
4950 }
4951 if (result != KERN_SUCCESS) {
4952 vm_object_deallocate(object);
4953 }
4954
4955 /*
4956 * Try to prefault, and do not forget to release the vm map lock.
4957 */
4958 if (result == KERN_SUCCESS && try_prefault) {
4959 mach_vm_address_t va = map_addr;
4960 kern_return_t kr = KERN_SUCCESS;
4961 unsigned int i = 0;
4962 int pmap_options;
4963
4964 pmap_options = kernel_prefault ? 0 : PMAP_OPTIONS_NOWAIT;
4965 if (object->internal) {
4966 pmap_options |= PMAP_OPTIONS_INTERNAL;
4967 }
4968
4969 for (i = 0; i < page_list_count; ++i) {
4970 if (!UPL_VALID_PAGE(page_list, i)) {
4971 if (kernel_prefault) {
4972 assertf(FALSE, "kernel_prefault && !UPL_VALID_PAGE");
4973 result = KERN_MEMORY_ERROR;
4974 break;
4975 }
4976 } else {
4977 /*
4978 * If this call fails, we should stop trying
4979 * to optimize, since the remaining calls are
4980 * likely to fail too.
4981 *
4982 * We do not report an error for such a
4983 * failure, though: prefaulting is an
4984 * optimization, not something critical.
4985 */
4986 kr = pmap_enter_options(pmap: target_map->pmap,
4987 v: va, UPL_PHYS_PAGE(page_list, i),
4988 prot: cur_protection, VM_PROT_NONE,
4989 flags: 0, TRUE, options: pmap_options, NULL, mapping_type: PMAP_MAPPING_TYPE_INFER);
4990 if (kr != KERN_SUCCESS) {
4991 OSIncrementAtomic64(address: &vm_prefault_nb_bailout);
4992 if (kernel_prefault) {
4993 result = kr;
4994 }
4995 break;
4996 }
4997 OSIncrementAtomic64(address: &vm_prefault_nb_pages);
4998 }
4999
5000 /* Next virtual address */
5001 va += PAGE_SIZE;
5002 }
5003 if (vmk_flags.vmkf_keep_map_locked) {
5004 vm_map_unlock(target_map);
5005 }
5006 }
5007
5008 if (vmk_flags.vmf_return_data_addr ||
5009 vmk_flags.vmf_return_4k_data_addr) {
5010 *address = map_addr + offset_in_mapping;
5011 } else {
5012 *address = map_addr;
5013 }
5014 return result;
5015}
5016
5017kern_return_t
5018vm_map_enter_mem_object(
5019 vm_map_t target_map,
5020 vm_map_offset_t *address,
5021 vm_map_size_t initial_size,
5022 vm_map_offset_t mask,
5023 vm_map_kernel_flags_t vmk_flags,
5024 ipc_port_t port,
5025 vm_object_offset_t offset,
5026 boolean_t copy,
5027 vm_prot_t cur_protection,
5028 vm_prot_t max_protection,
5029 vm_inherit_t inheritance)
5030{
5031 kern_return_t ret;
5032
5033 /* range_id is set by vm_map_enter_mem_object_helper */
5034 ret = vm_map_enter_mem_object_helper(target_map,
5035 address,
5036 initial_size,
5037 mask,
5038 vmk_flags,
5039 port,
5040 offset,
5041 copy,
5042 cur_protection,
5043 max_protection,
5044 inheritance,
5045 NULL,
5046 page_list_count: 0);
5047
5048#if KASAN
5049 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5050 kasan_notify_address(*address, initial_size);
5051 }
5052#endif
5053
5054 return ret;
5055}
5056
5057kern_return_t
5058vm_map_enter_mem_object_prefault(
5059 vm_map_t target_map,
5060 vm_map_offset_t *address,
5061 vm_map_size_t initial_size,
5062 vm_map_offset_t mask,
5063 vm_map_kernel_flags_t vmk_flags,
5064 ipc_port_t port,
5065 vm_object_offset_t offset,
5066 vm_prot_t cur_protection,
5067 vm_prot_t max_protection,
5068 upl_page_list_ptr_t page_list,
5069 unsigned int page_list_count)
5070{
5071 kern_return_t ret;
5072
5073 /* range_id is set by vm_map_enter_mem_object_helper */
5074 ret = vm_map_enter_mem_object_helper(target_map,
5075 address,
5076 initial_size,
5077 mask,
5078 vmk_flags,
5079 port,
5080 offset,
5081 FALSE,
5082 cur_protection,
5083 max_protection,
5084 VM_INHERIT_DEFAULT,
5085 page_list,
5086 page_list_count);
5087
5088#if KASAN
5089 if (ret == KERN_SUCCESS && address && target_map->pmap == kernel_pmap) {
5090 kasan_notify_address(*address, initial_size);
5091 }
5092#endif
5093
5094 return ret;
5095}
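/*
 * Hypothetical usage sketch for the prefault variant above (the port,
 * page list and sizes are illustrative, not from a real caller): the
 * valid entries of "pl" are entered into the target pmap at mapping
 * time, so the first access does not take a soft fault.
 *
 *	upl_page_info_t pl[4];	// filled in by the caller, e.g. from a UPL
 *	vm_map_offset_t addr = 0;
 *	kr = vm_map_enter_mem_object_prefault(map, &addr,
 *	    4 * PAGE_SIZE, 0, vmk_flags, entry_port, 0,
 *	    VM_PROT_READ, VM_PROT_READ | VM_PROT_WRITE,
 *	    pl, 4);
 */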
5096
5097
5098kern_return_t
5099vm_map_enter_mem_object_control(
5100 vm_map_t target_map,
5101 vm_map_offset_t *address,
5102 vm_map_size_t initial_size,
5103 vm_map_offset_t mask,
5104 vm_map_kernel_flags_t vmk_flags,
5105 memory_object_control_t control,
5106 vm_object_offset_t offset,
5107 boolean_t copy,
5108 vm_prot_t cur_protection,
5109 vm_prot_t max_protection,
5110 vm_inherit_t inheritance)
5111{
5112 vm_map_address_t map_addr;
5113 vm_map_size_t map_size;
5114 vm_object_t object;
5115 vm_object_size_t size;
5116 kern_return_t result;
5117 memory_object_t pager;
5118 vm_prot_t pager_prot;
5119 kern_return_t kr;
5120#if __arm64__
5121 boolean_t fourk = vmk_flags.vmkf_fourk;
5122#endif /* __arm64__ */
5123
5124 /*
5125 * Check arguments for validity
5126 */
5127 if ((target_map == VM_MAP_NULL) ||
5128 (cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5129 (max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)) ||
5130 (inheritance > VM_INHERIT_LAST_VALID) ||
5131 initial_size == 0) {
5132 return KERN_INVALID_ARGUMENT;
5133 }
5134
5135 if (__improbable((cur_protection & max_protection) != cur_protection)) {
5136 /* cur is more permissive than max */
5137 cur_protection &= max_protection;
5138 }
5139
5140#if __arm64__
5141 if (fourk && VM_MAP_PAGE_MASK(target_map) < PAGE_MASK) {
5142 fourk = FALSE;
5143 }
5144
5145 if (fourk) {
5146 map_addr = vm_map_trunc_page(*address,
5147 FOURK_PAGE_MASK);
5148 map_size = vm_map_round_page(initial_size,
5149 FOURK_PAGE_MASK);
5150 } else
5151#endif /* __arm64__ */
5152 {
5153 map_addr = vm_map_trunc_page(*address,
5154 VM_MAP_PAGE_MASK(target_map));
5155 map_size = vm_map_round_page(initial_size,
5156 VM_MAP_PAGE_MASK(target_map));
5157 }
5158 size = vm_object_round_page(initial_size);
5159
5160 object = memory_object_control_to_vm_object(control);
5161
5162 if (object == VM_OBJECT_NULL) {
5163 return KERN_INVALID_OBJECT;
5164 }
5165
5166 if (is_kernel_object(object)) {
5167 printf(format: "Warning: Attempt to map kernel object"
5168 " by a non-private kernel entity\n");
5169 return KERN_INVALID_OBJECT;
5170 }
5171
5172 vm_object_lock(object);
5173 object->ref_count++;
5174
5175 /*
5176 * For "named" VM objects, let the pager know that the
5177 * memory object is being mapped. Some pagers need to keep
5178 * track of this, to know when they can reclaim the memory
5179 * object, for example.
5180 * VM calls memory_object_map() for each mapping (specifying
5181 * the protection of each mapping) and calls
5182 * memory_object_last_unmap() when all the mappings are gone.
5183 */
5184 pager_prot = max_protection;
5185 if (copy) {
5186 pager_prot &= ~VM_PROT_WRITE;
5187 }
5188 pager = object->pager;
5189 if (object->named &&
5190 pager != MEMORY_OBJECT_NULL &&
5191 object->copy_strategy != MEMORY_OBJECT_COPY_NONE) {
5192 assert(object->pager_ready);
5193 vm_object_mapping_wait(object, THREAD_UNINT);
5194 vm_object_mapping_begin(object);
5195 vm_object_unlock(object);
5196
5197 kr = memory_object_map(memory_object: pager, prot: pager_prot);
5198 assert(kr == KERN_SUCCESS);
5199
5200 vm_object_lock(object);
5201 vm_object_mapping_end(object);
5202 }
5203 vm_object_unlock(object);
5204
5205 /*
5206 * Perform the copy if requested
5207 */
5208
5209 if (copy) {
5210 vm_object_t new_object;
5211 vm_object_offset_t new_offset;
5212
5213 result = vm_object_copy_strategically(src_object: object, src_offset: offset, size,
5214 false, /* forking */
5215 dst_object: &new_object, dst_offset: &new_offset,
5216 dst_needs_copy: &copy);
5217
5218
5219 if (result == KERN_MEMORY_RESTART_COPY) {
5220 boolean_t success;
5221 boolean_t src_needs_copy;
5222
5223 /*
5224 * XXX
5225 * We currently ignore src_needs_copy.
5226 * This really is the issue of how to make
5227 * MEMORY_OBJECT_COPY_SYMMETRIC safe for
5228 * non-kernel users to use. Solution forthcoming.
5229 * In the meantime, since we don't allow non-kernel
5230 * memory managers to specify symmetric copy,
5231 * we won't run into problems here.
5232 */
5233 new_object = object;
5234 new_offset = offset;
5235 success = vm_object_copy_quickly(object: new_object,
5236 src_offset: new_offset, size,
5237 src_needs_copy: &src_needs_copy,
5238 dst_needs_copy: &copy);
5239 assert(success);
5240 result = KERN_SUCCESS;
5241 }
5242 /*
5243 * Throw away the reference to the
5244 * original object, as it won't be mapped.
5245 */
5246
5247 vm_object_deallocate(object);
5248
5249 if (result != KERN_SUCCESS) {
5250 return result;
5251 }
5252
5253 object = new_object;
5254 offset = new_offset;
5255 }
5256
5257#if __arm64__
5258 if (fourk) {
5259 result = vm_map_enter_fourk(map: target_map,
5260 address: &map_addr,
5261 size: map_size,
5262 mask: (vm_map_offset_t)mask,
5263 vmk_flags,
5264 object, offset,
5265 needs_copy: copy,
5266 cur_protection, max_protection,
5267 inheritance);
5268 } else
5269#endif /* __arm64__ */
5270 {
5271 result = vm_map_enter(map: target_map,
5272 address: &map_addr, size: map_size,
5273 mask: (vm_map_offset_t)mask,
5274 vmk_flags,
5275 object, offset,
5276 needs_copy: copy,
5277 cur_protection, max_protection,
5278 inheritance);
5279 }
5280 if (result != KERN_SUCCESS) {
5281 vm_object_deallocate(object);
5282 }
5283 *address = map_addr;
5284
5285 return result;
5286}
5287
5288
5289#if VM_CPM
5290
5291#ifdef MACH_ASSERT
5292extern pmap_paddr_t avail_start, avail_end;
5293#endif
5294
5295/*
5296 * Allocate memory in the specified map, with the caveat that
5297 * the memory is physically contiguous. This call may fail
5298 * if the system can't find sufficient contiguous memory.
5299 * This call may cause or lead to heart-stopping amounts of
5300 * paging activity.
5301 *
5302 * Memory obtained from this call should be freed in the
5303 * normal way, viz., via vm_deallocate.
5304 */
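/*
 * Hypothetical usage sketch (size and flags are illustrative):
 *
 *	vm_map_offset_t addr = 0;
 *	kr = vm_map_enter_cpm(map, &addr, 16 * PAGE_SIZE,
 *	    VM_MAP_KERNEL_FLAGS_NONE);	// default flags: "anywhere" placement
 *	...
 *	(void) vm_deallocate(map, addr, 16 * PAGE_SIZE);	// normal teardown
 */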
5305kern_return_t
5306vm_map_enter_cpm(
5307 vm_map_t map,
5308 vm_map_offset_t *addr,
5309 vm_map_size_t size,
5310 vm_map_kernel_flags_t vmk_flags)
5311{
5312 vm_object_t cpm_obj;
5313 pmap_t pmap;
5314 vm_page_t m, pages;
5315 kern_return_t kr;
5316 vm_map_offset_t va, start, end, offset;
5317#if MACH_ASSERT
5318 vm_map_offset_t prev_addr = 0;
5319#endif /* MACH_ASSERT */
5320 uint8_t object_lock_type = 0;
5321
5322 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
5323 /* XXX TODO4K do we need to support this? */
5324 *addr = 0;
5325 return KERN_NOT_SUPPORTED;
5326 }
5327
5328 if (size == 0) {
5329 *addr = 0;
5330 return KERN_SUCCESS;
5331 }
5332 if (vmk_flags.vmf_fixed) {
5333 *addr = vm_map_trunc_page(*addr,
5334 VM_MAP_PAGE_MASK(map));
5335 } else {
5336 *addr = vm_map_min(map);
5337 }
5338 size = vm_map_round_page(size,
5339 VM_MAP_PAGE_MASK(map));
5340
5341 /*
5342 * LP64todo - cpm_allocate should probably allow
5343 * allocations of >4GB, but not with the current
5344 * algorithm, so just cast down the size for now.
5345 */
5346 if (size > VM_MAX_ADDRESS) {
5347 return KERN_RESOURCE_SHORTAGE;
5348 }
5349 if ((kr = cpm_allocate(CAST_DOWN(vm_size_t, size),
5350 &pages, 0, 0, TRUE, flags)) != KERN_SUCCESS) {
5351 return kr;
5352 }
5353
5354 cpm_obj = vm_object_allocate((vm_object_size_t)size);
5355 assert(cpm_obj != VM_OBJECT_NULL);
5356 assert(cpm_obj->internal);
5357 assert(cpm_obj->vo_size == (vm_object_size_t)size);
5358 assert(cpm_obj->can_persist == FALSE);
5359 assert(cpm_obj->pager_created == FALSE);
5360 assert(cpm_obj->pageout == FALSE);
5361 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5362
5363 /*
5364 * Insert pages into object.
5365 */
5366 object_lock_type = OBJECT_LOCK_EXCLUSIVE;
5367 vm_object_lock(cpm_obj);
5368 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5369 m = pages;
5370 pages = NEXT_PAGE(m);
5371 *(NEXT_PAGE_PTR(m)) = VM_PAGE_NULL;
5372
5373 assert(!m->vmp_gobbled);
5374 assert(!m->vmp_wanted);
5375 assert(!m->vmp_pageout);
5376 assert(!m->vmp_tabled);
5377 assert(VM_PAGE_WIRED(m));
5378 assert(m->vmp_busy);
5379 assert(VM_PAGE_GET_PHYS_PAGE(m) >= (avail_start >> PAGE_SHIFT) && VM_PAGE_GET_PHYS_PAGE(m) <= (avail_end >> PAGE_SHIFT));
5380
5381 m->vmp_busy = FALSE;
5382 vm_page_insert(m, cpm_obj, offset);
5383 }
5384 assert(cpm_obj->resident_page_count == size / PAGE_SIZE);
5385 vm_object_unlock(cpm_obj);
5386
5387 /*
5388 * Hang onto a reference on the object in case a
5389 * multi-threaded application for some reason decides
5390 * to deallocate the portion of the address space into
5391 * which we will insert this object.
5392 *
5393 * Unfortunately, we must insert the object now before
5394 * we can talk to the pmap module about which addresses
5395 * must be wired down. Hence, the race with a multi-
5396 * threaded app.
5397 */
5398 vm_object_reference(cpm_obj);
5399
5400 /*
5401 * Insert object into map.
5402 */
5403
5404 kr = vm_map_enter(
5405 map,
5406 addr,
5407 size,
5408 (vm_map_offset_t)0,
5409 vmk_flags,
5410 cpm_obj,
5411 (vm_object_offset_t)0,
5412 FALSE,
5413 VM_PROT_ALL,
5414 VM_PROT_ALL,
5415 VM_INHERIT_DEFAULT);
5416
5417 if (kr != KERN_SUCCESS) {
5418 /*
5419 * A CPM object doesn't have can_persist set,
5420 * so all we have to do is deallocate it to
5421 * free up these pages.
5422 */
5423 assert(cpm_obj->pager_created == FALSE);
5424 assert(cpm_obj->can_persist == FALSE);
5425 assert(cpm_obj->pageout == FALSE);
5426 assert(cpm_obj->shadow == VM_OBJECT_NULL);
5427 vm_object_deallocate(cpm_obj); /* kill acquired ref */
5428 vm_object_deallocate(cpm_obj); /* kill creation ref */
5429 }
5430
5431 /*
5432 * Inform the physical mapping system that the
5433 * range of addresses may not fault, so that
5434 * page tables and such can be locked down as well.
5435 */
5436 start = *addr;
5437 end = start + size;
5438 pmap = vm_map_pmap(map);
5439 pmap_pageable(pmap, start, end, FALSE);
5440
5441 /*
5442 * Enter each page into the pmap, to avoid faults.
5443 * Note that this loop could be coded more efficiently,
5444 * if the need arose, rather than looking up each page
5445 * again.
5446 */
5447 for (offset = 0, va = start; offset < size;
5448 va += PAGE_SIZE, offset += PAGE_SIZE) {
5449 int type_of_fault;
5450
5451 vm_object_lock(cpm_obj);
5452 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5453 assert(m != VM_PAGE_NULL);
5454
5455 vm_page_zero_fill(m);
5456
5457 type_of_fault = DBG_ZERO_FILL_FAULT;
5458
5459 vm_fault_enter(m, pmap, va,
5460 PAGE_SIZE, 0,
5461 VM_PROT_ALL, VM_PROT_WRITE,
5462 VM_PAGE_WIRED(m),
5463 FALSE, /* change_wiring */
5464 VM_KERN_MEMORY_NONE, /* tag - not wiring */
5465 FALSE, /* cs_bypass */
5466 0, /* user_tag */
5467 0, /* pmap_options */
5468 NULL, /* need_retry */
5469 &type_of_fault,
5470 &object_lock_type); /* Exclusive lock mode. Will remain unchanged.*/
5471
5472 vm_object_unlock(cpm_obj);
5473 }
5474
5475#if MACH_ASSERT
5476 /*
5477 * Verify ordering in address space.
5478 */
5479 for (offset = 0; offset < size; offset += PAGE_SIZE) {
5480 vm_object_lock(cpm_obj);
5481 m = vm_page_lookup(cpm_obj, (vm_object_offset_t)offset);
5482 vm_object_unlock(cpm_obj);
5483 if (m == VM_PAGE_NULL) {
5484 panic("vm_allocate_cpm: obj %p off 0x%llx no page",
5485 cpm_obj, (uint64_t)offset);
5486 }
5487 assert(m->vmp_tabled);
5488 assert(!m->vmp_busy);
5489 assert(!m->vmp_wanted);
5490 assert(!m->vmp_fictitious);
5491 assert(!m->vmp_private);
5492 assert(!m->vmp_absent);
5493 assert(!m->vmp_cleaning);
5494 assert(!m->vmp_laundry);
5495 assert(!m->vmp_precious);
5496 assert(!m->vmp_clustered);
5497 if (offset != 0) {
5498 if (VM_PAGE_GET_PHYS_PAGE(m) != prev_addr + 1) {
5499 printf("start 0x%llx end 0x%llx va 0x%llx\n",
5500 (uint64_t)start, (uint64_t)end, (uint64_t)va);
5501 printf("obj %p off 0x%llx\n", cpm_obj, (uint64_t)offset);
5502 printf("m %p prev_address 0x%llx\n", m, (uint64_t)prev_addr);
5503 panic("vm_allocate_cpm: pages not contig!");
5504 }
5505 }
5506 prev_addr = VM_PAGE_GET_PHYS_PAGE(m);
5507 }
5508#endif /* MACH_ASSERT */
5509
5510 vm_object_deallocate(cpm_obj); /* kill extra ref */
5511
5512 return kr;
5513}
5514
5515
5516#else /* VM_CPM */
5517
5518/*
5519 * Interface is defined in all cases, but unless the kernel
5520 * is built explicitly for this option, the interface does
5521 * nothing.
5522 */
5523
5524kern_return_t
5525vm_map_enter_cpm(
5526 __unused vm_map_t map,
5527 __unused vm_map_offset_t *addr,
5528 __unused vm_map_size_t size,
5529 __unused vm_map_kernel_flags_t vmk_flags)
5530{
5531 return KERN_FAILURE;
5532}
5533#endif /* VM_CPM */
5534
5535/* Not used without nested pmaps */
5536#ifndef NO_NESTED_PMAP
5537/*
5538 * Clip and unnest a portion of a nested submap mapping.
5539 */
5540
5541
5542static void
5543vm_map_clip_unnest(
5544 vm_map_t map,
5545 vm_map_entry_t entry,
5546 vm_map_offset_t start_unnest,
5547 vm_map_offset_t end_unnest)
5548{
5549 vm_map_offset_t old_start_unnest = start_unnest;
5550 vm_map_offset_t old_end_unnest = end_unnest;
5551
5552 assert(entry->is_sub_map);
5553 assert(VME_SUBMAP(entry) != NULL);
5554 assert(entry->use_pmap);
5555
5556 /*
5557 * Query the platform for the optimal unnest range.
5558 * DRK: There's some duplication of effort here, since
5559 * callers may have adjusted the range to some extent. This
5560 * routine was introduced to support 1GiB subtree nesting
5561 * for x86 platforms, which can also nest on 2MiB boundaries
5562 * depending on size/alignment.
5563 */
5564 if (pmap_adjust_unnest_parameters(map->pmap, &start_unnest, &end_unnest)) {
5565 assert(VME_SUBMAP(entry)->is_nested_map);
5566 assert(!VME_SUBMAP(entry)->disable_vmentry_reuse);
5567 log_unnest_badness(map,
5568 start_unnest: old_start_unnest,
5569 end_unnest: old_end_unnest,
5570 VME_SUBMAP(entry)->is_nested_map,
5571 lowest_unnestable_addr: (entry->vme_start +
5572 VME_SUBMAP(entry)->lowest_unnestable_start -
5573 VME_OFFSET(entry)));
5574 }
5575
5576 if (entry->vme_start > start_unnest ||
5577 entry->vme_end < end_unnest) {
5578 panic("vm_map_clip_unnest(0x%llx,0x%llx): "
5579 "bad nested entry: start=0x%llx end=0x%llx\n",
5580 (long long)start_unnest, (long long)end_unnest,
5581 (long long)entry->vme_start, (long long)entry->vme_end);
5582 }
5583
5584 if (start_unnest > entry->vme_start) {
5585 _vm_map_clip_start(map_header: &map->hdr,
5586 entry,
5587 start: start_unnest);
5588 if (map->holelistenabled) {
5589 vm_map_store_update_first_free(map, NULL, FALSE);
5590 } else {
5591 vm_map_store_update_first_free(map, entry: map->first_free, FALSE);
5592 }
5593 }
5594 if (entry->vme_end > end_unnest) {
5595 _vm_map_clip_end(map_header: &map->hdr,
5596 entry,
5597 end: end_unnest);
5598 if (map->holelistenabled) {
5599 vm_map_store_update_first_free(map, NULL, FALSE);
5600 } else {
5601 vm_map_store_update_first_free(map, entry: map->first_free, FALSE);
5602 }
5603 }
5604
5605 pmap_unnest(map->pmap,
5606 entry->vme_start,
5607 entry->vme_end - entry->vme_start);
5608 if ((map->mapped_in_other_pmaps) && os_ref_get_count_raw(rc: &map->map_refcnt) != 0) {
5609 /* clean up parent map/maps */
5610 vm_map_submap_pmap_clean(
5611 map, start: entry->vme_start,
5612 end: entry->vme_end,
5613 VME_SUBMAP(entry),
5614 offset: VME_OFFSET(entry));
5615 }
5616 entry->use_pmap = FALSE;
5617 if ((map->pmap != kernel_pmap) &&
5618 (VME_ALIAS(entry) == VM_MEMORY_SHARED_PMAP)) {
5619 VME_ALIAS_SET(entry, VM_MEMORY_UNSHARED_PMAP);
5620 }
5621}
5622#endif /* NO_NESTED_PMAP */
5623
5624__abortlike
5625static void
5626__vm_map_clip_atomic_entry_panic(
5627 vm_map_t map,
5628 vm_map_entry_t entry,
5629 vm_map_offset_t where)
5630{
5631 panic("vm_map_clip(%p): Attempting to clip an atomic VM map entry "
5632 "%p [0x%llx:0x%llx] at 0x%llx", map, entry,
5633 (uint64_t)entry->vme_start,
5634 (uint64_t)entry->vme_end,
5635 (uint64_t)where);
5636}
5637
5638/*
5639 * vm_map_clip_start: [ internal use only ]
5640 *
5641 * Asserts that the given entry begins at or after
5642 * the specified address; if necessary,
5643 * it splits the entry into two.
5644 */
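/*
 * Worked example (hypothetical addresses): clipping an entry covering
 * [0x1000, 0x5000) at startaddr 0x3000 inserts a new entry for
 * [0x1000, 0x3000) in front of it and shrinks the original entry to
 * [0x3000, 0x5000), advancing its object offset by 0x2000, so that
 * the entry passed in now begins exactly at "startaddr".
 */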
5645void
5646vm_map_clip_start(
5647 vm_map_t map,
5648 vm_map_entry_t entry,
5649 vm_map_offset_t startaddr)
5650{
5651#ifndef NO_NESTED_PMAP
5652 if (entry->is_sub_map &&
5653 entry->use_pmap &&
5654 startaddr >= entry->vme_start) {
5655 vm_map_offset_t start_unnest, end_unnest;
5656
5657 /*
5658 * Make sure "startaddr" is no longer in a nested range
5659 * before we clip. Unnest only the minimum range the platform
5660 * can handle.
5661 * vm_map_clip_unnest may perform additional adjustments to
5662 * the unnest range.
5663 */
5664		start_unnest = startaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
5665		end_unnest = start_unnest + pmap_shared_region_size_min(map->pmap);
5666 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5667 }
5668#endif /* NO_NESTED_PMAP */
5669 if (startaddr > entry->vme_start) {
5670 if (!entry->is_sub_map &&
5671 VME_OBJECT(entry) &&
5672 VME_OBJECT(entry)->phys_contiguous) {
5673			pmap_remove(map->pmap,
5674			    (addr64_t)(entry->vme_start),
5675			    (addr64_t)(entry->vme_end));
5676 }
5677 if (entry->vme_atomic) {
5678			__vm_map_clip_atomic_entry_panic(map, entry, startaddr);
5679 }
5680
5681 DTRACE_VM5(
5682 vm_map_clip_start,
5683 vm_map_t, map,
5684 vm_map_offset_t, entry->vme_start,
5685 vm_map_offset_t, entry->vme_end,
5686 vm_map_offset_t, startaddr,
5687 int, VME_ALIAS(entry));
5688
5689		_vm_map_clip_start(&map->hdr, entry, startaddr);
5690 if (map->holelistenabled) {
5691 vm_map_store_update_first_free(map, NULL, FALSE);
5692 } else {
5693			vm_map_store_update_first_free(map, map->first_free, FALSE);
5694 }
5695 }
5696}
5697
5698
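/*
 * Clip helper for vm_map_copy entries: same splitting logic as
 * vm_map_clip_start(), but it operates on the copy's entry list
 * (cpy_hdr) and needs no pmap/unnest work, since copy entries are
 * not mapped anywhere yet.
 */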
5699#define vm_map_copy_clip_start(copy, entry, startaddr) \
5700 MACRO_BEGIN \
5701 if ((startaddr) > (entry)->vme_start) \
5702 _vm_map_clip_start(&(copy)->cpy_hdr,(entry),(startaddr)); \
5703 MACRO_END
5704
5705/*
5706 * This routine is called only when it is known that
5707 * the entry must be split.
5708 */
5709static void
5710_vm_map_clip_start(
5711 struct vm_map_header *map_header,
5712 vm_map_entry_t entry,
5713 vm_map_offset_t start)
5714{
5715 vm_map_entry_t new_entry;
5716
5717 /*
5718 * Split off the front portion --
5719 * note that we must insert the new
5720 * entry BEFORE this one, so that
5721 * this entry has the specified starting
5722 * address.
5723 */
5724
5725 if (entry->map_aligned) {
5726 assert(VM_MAP_PAGE_ALIGNED(start,
5727 VM_MAP_HDR_PAGE_MASK(map_header)));
5728 }
5729
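	/*
	 * The new entry keeps the original offset and covers
	 * [vme_start, start); the original entry is advanced to cover
	 * [start, vme_end) with its object offset bumped accordingly.
	 */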
5730 new_entry = _vm_map_entry_create(map_header);
5731	vm_map_entry_copy_full(new_entry, entry);
5732
5733 new_entry->vme_end = start;
5734 assert(new_entry->vme_start < new_entry->vme_end);
5735	VME_OFFSET_SET(entry, VME_OFFSET(entry) + (start - entry->vme_start));
5736 if (__improbable(start >= entry->vme_end)) {
5737 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new start 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, start);
5738 }
5739 assert(start < entry->vme_end);
5740 entry->vme_start = start;
5741
5742#if VM_BTLOG_TAGS
5743 if (new_entry->vme_kernel_object) {
5744 btref_retain(new_entry->vme_tag_btref);
5745 }
5746#endif /* VM_BTLOG_TAGS */
5747
5748	_vm_map_store_entry_link(map_header, entry->vme_prev, new_entry);
5749
5750 if (entry->is_sub_map) {
5751 vm_map_reference(VME_SUBMAP(new_entry));
5752 } else {
5753 vm_object_reference(VME_OBJECT(new_entry));
5754 }
5755}
5756
5757
5758/*
5759 * vm_map_clip_end: [ internal use only ]
5760 *
5761 * Asserts that the given entry ends at or before
5762 * the specified address; if necessary,
5763 * it splits the entry into two.
5764 */
5765void
5766vm_map_clip_end(
5767 vm_map_t map,
5768 vm_map_entry_t entry,
5769 vm_map_offset_t endaddr)
5770{
5771 if (endaddr > entry->vme_end) {
5772 /*
5773 * Within the scope of this clipping, limit "endaddr" to
5774 * the end of this map entry...
5775 */
5776 endaddr = entry->vme_end;
5777 }
5778#ifndef NO_NESTED_PMAP
5779 if (entry->is_sub_map && entry->use_pmap) {
5780 vm_map_offset_t start_unnest, end_unnest;
5781
5782 /*
5783 * Make sure the range between the start of this entry and
5784 * the new "endaddr" is no longer nested before we clip.
5785 * Unnest only the minimum range the platform can handle.
5786 * vm_map_clip_unnest may perform additional adjustments to
5787 * the unnest range.
5788 */
5789 start_unnest = entry->vme_start;
5790 end_unnest =
5791		    (endaddr + pmap_shared_region_size_min(map->pmap) - 1) &
5792		    ~(pmap_shared_region_size_min(map->pmap) - 1);
5793 vm_map_clip_unnest(map, entry, start_unnest, end_unnest);
5794 }
5795#endif /* NO_NESTED_PMAP */
5796 if (endaddr < entry->vme_end) {
5797 if (!entry->is_sub_map &&
5798 VME_OBJECT(entry) &&
5799 VME_OBJECT(entry)->phys_contiguous) {
5800			pmap_remove(map->pmap,
5801			    (addr64_t)(entry->vme_start),
5802			    (addr64_t)(entry->vme_end));
5803 }
5804 if (entry->vme_atomic) {
5805			__vm_map_clip_atomic_entry_panic(map, entry, endaddr);
5806 }
5807 DTRACE_VM5(
5808 vm_map_clip_end,
5809 vm_map_t, map,
5810 vm_map_offset_t, entry->vme_start,
5811 vm_map_offset_t, entry->vme_end,
5812 vm_map_offset_t, endaddr,
5813 int, VME_ALIAS(entry));
5814
5815		_vm_map_clip_end(&map->hdr, entry, endaddr);
5816 if (map->holelistenabled) {
5817 vm_map_store_update_first_free(map, NULL, FALSE);
5818 } else {
5819			vm_map_store_update_first_free(map, map->first_free, FALSE);
5820 }
5821 }
5822}
5823
5824
5825#define vm_map_copy_clip_end(copy, entry, endaddr) \
5826 MACRO_BEGIN \
5827 if ((endaddr) < (entry)->vme_end) \
5828 _vm_map_clip_end(&(copy)->cpy_hdr,(entry),(endaddr)); \
5829 MACRO_END
5830
5831/*
5832 * This routine is called only when it is known that
5833 * the entry must be split.
5834 */
5835static void
5836_vm_map_clip_end(
5837 struct vm_map_header *map_header,
5838 vm_map_entry_t entry,
5839 vm_map_offset_t end)
5840{
5841 vm_map_entry_t new_entry;
5842
5843 /*
5844 * Create a new entry and insert it
5845 * AFTER the specified entry
5846 */
5847
5848 if (entry->map_aligned) {
5849 assert(VM_MAP_PAGE_ALIGNED(end,
5850 VM_MAP_HDR_PAGE_MASK(map_header)));
5851 }
5852
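	/*
	 * The original entry keeps [vme_start, end); the new entry covers
	 * [end, vme_end) with its object offset advanced by the size of
	 * the front portion.
	 */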
5853 new_entry = _vm_map_entry_create(map_header);
5854	vm_map_entry_copy_full(new_entry, entry);
5855
5856 if (__improbable(end <= entry->vme_start)) {
5857 panic("mapHdr %p entry %p start 0x%llx end 0x%llx new end 0x%llx", map_header, entry, entry->vme_start, entry->vme_end, end);
5858 }
5859 assert(entry->vme_start < end);
5860 new_entry->vme_start = entry->vme_end = end;
5861	VME_OFFSET_SET(new_entry,
5862	    VME_OFFSET(new_entry) + (end - entry->vme_start));
5863 assert(new_entry->vme_start < new_entry->vme_end);
5864
5865#if VM_BTLOG_TAGS
5866 if (new_entry->vme_kernel_object) {
5867 btref_retain(new_entry->vme_tag_btref);
5868 }
5869#endif /* VM_BTLOG_TAGS */
5870
5871	_vm_map_store_entry_link(map_header, entry, new_entry);
5872
5873 if (entry->is_sub_map) {
5874 vm_map_reference(VME_SUBMAP(new_entry));
5875 } else {
5876 vm_object_reference(VME_OBJECT(new_entry));
5877 }
5878}
5879
5880
5881/*
5882 * VM_MAP_RANGE_CHECK: [ internal use only ]
5883 *
5884 * Asserts that the starting and ending region
5885 * addresses fall within the valid range of the map.
5886 */
5887#define VM_MAP_RANGE_CHECK(map, start, end) \
5888 MACRO_BEGIN \
5889 if (start < vm_map_min(map)) \
5890 start = vm_map_min(map); \
5891 if (end > vm_map_max(map)) \
5892 end = vm_map_max(map); \
5893 if (start > end) \
5894 start = end; \
5895 MACRO_END
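/*
 * Note that VM_MAP_RANGE_CHECK silently clamps the range to the map's
 * valid bounds rather than reporting an error; a fully out-of-range
 * request degenerates into an empty range.
 */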
5896
5897/*
5898 * vm_map_range_check: [ internal use only ]
5899 *
5900 *	Check that the region defined by the specified start and
5901 *	end addresses is wholly contained within a single map
5902 *	entry or a set of adjacent map entries of the specified map,
5903 *	i.e. the specified region contains no unmapped space.
5904 * If any or all of the region is unmapped, FALSE is returned.
5905 * Otherwise, TRUE is returned and if the output argument 'entry'
5906 * is not NULL it points to the map entry containing the start
5907 * of the region.
5908 *
5909 * The map is locked for reading on entry and is left locked.
5910 */
5911static boolean_t
5912vm_map_range_check(
5913 vm_map_t map,
5914 vm_map_offset_t start,
5915 vm_map_offset_t end,
5916 vm_map_entry_t *entry)
5917{
5918 vm_map_entry_t cur;
5919 vm_map_offset_t prev;
5920
5921 /*
5922 * Basic sanity checks first
5923 */
5924 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
5925 return FALSE;
5926 }
5927
5928 /*
5929 * Check first if the region starts within a valid
5930 * mapping for the map.
5931 */
5932	if (!vm_map_lookup_entry(map, start, &cur)) {
5933 return FALSE;
5934 }
5935
5936 /*
5937 * Optimize for the case that the region is contained
5938 * in a single map entry.
5939 */
5940 if (entry != (vm_map_entry_t *) NULL) {
5941 *entry = cur;
5942 }
5943 if (end <= cur->vme_end) {
5944 return TRUE;
5945 }
5946
5947 /*
5948 * If the region is not wholly contained within a
5949 * single entry, walk the entries looking for holes.
5950 */
5951 prev = cur->vme_end;
5952 cur = cur->vme_next;
5953 while ((cur != vm_map_to_entry(map)) && (prev == cur->vme_start)) {
5954 if (end <= cur->vme_end) {
5955 return TRUE;
5956 }
5957 prev = cur->vme_end;
5958 cur = cur->vme_next;
5959 }
5960 return FALSE;
5961}
5962
5963/*
5964 * vm_map_protect:
5965 *
5966 * Sets the protection of the specified address
5967 * region in the target map. If "set_max" is
5968 * specified, the maximum protection is to be set;
5969 * otherwise, only the current protection is affected.
5970 */
5971kern_return_t
5972vm_map_protect(
5973 vm_map_t map,
5974 vm_map_offset_t start,
5975 vm_map_offset_t end,
5976 vm_prot_t new_prot,
5977 boolean_t set_max)
5978{
5979 vm_map_entry_t current;
5980 vm_map_offset_t prev;
5981 vm_map_entry_t entry;
5982 vm_prot_t new_max;
5983 int pmap_options = 0;
5984 kern_return_t kr;
5985
5986 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
5987 return KERN_INVALID_ARGUMENT;
5988 }
5989
5990 if (new_prot & VM_PROT_COPY) {
5991 vm_map_offset_t new_start;
5992 vm_prot_t cur_prot, max_prot;
5993 vm_map_kernel_flags_t kflags;
5994
5995 /* LP64todo - see below */
5996 if (start >= map->max_offset) {
5997 return KERN_INVALID_ADDRESS;
5998 }
5999
6000 if ((new_prot & VM_PROT_ALLEXEC) &&
6001 map->pmap != kernel_pmap &&
6002 (vm_map_cs_enforcement(map)
6003#if XNU_TARGET_OS_OSX && __arm64__
6004 || !VM_MAP_IS_EXOTIC(map)
6005#endif /* XNU_TARGET_OS_OSX && __arm64__ */
6006 ) &&
6007 VM_MAP_POLICY_WX_FAIL(map)) {
6008 DTRACE_VM3(cs_wx,
6009 uint64_t, (uint64_t) start,
6010 uint64_t, (uint64_t) end,
6011 vm_prot_t, new_prot);
6012			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
6013			    proc_selfpid(),
6014			    (get_bsdtask_info(current_task())
6015			    ? proc_name_address(get_bsdtask_info(current_task()))
6016			    : "?"),
6017 __FUNCTION__, __LINE__,
6018#if DEVELOPMENT || DEBUG
6019 (uint64_t)start,
6020 (uint64_t)end,
6021#else /* DEVELOPMENT || DEBUG */
6022 (uint64_t)0,
6023 (uint64_t)0,
6024#endif /* DEVELOPMENT || DEBUG */
6025 new_prot);
6026 return KERN_PROTECTION_FAILURE;
6027 }
6028
6029 /*
6030 * Let vm_map_remap_extract() know that it will need to:
6031 * + make a copy of the mapping
6032 * + add VM_PROT_WRITE to the max protections
6033 * + remove any protections that are no longer allowed from the
6034 * max protections (to avoid any WRITE/EXECUTE conflict, for
6035 * example).
6036 * Note that "max_prot" is an IN/OUT parameter only for this
6037 * specific (VM_PROT_COPY) case. It's usually an OUT parameter
6038 * only.
6039 */
6040 max_prot = new_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC);
6041 cur_prot = VM_PROT_NONE;
6042 kflags = VM_MAP_KERNEL_FLAGS_FIXED(.vmf_overwrite = true);
6043 kflags.vmkf_remap_prot_copy = true;
6044 kflags.vmkf_tpro_enforcement_override = !vm_map_tpro_enforcement(map);
6045 new_start = start;
6046		kr = vm_map_remap(map,
6047		    &new_start,
6048		    end - start,
6049		    0, /* mask */
6050		    kflags,
6051		    map,
6052		    start,
6053		    TRUE, /* copy-on-write remapping! */
6054		    &cur_prot, /* IN/OUT */
6055		    &max_prot, /* IN/OUT */
6056		    VM_INHERIT_DEFAULT);
6057 if (kr != KERN_SUCCESS) {
6058 return kr;
6059 }
6060 new_prot &= ~VM_PROT_COPY;
6061 }
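	/*
	 * At this point the VM_PROT_COPY request has been satisfied by
	 * remapping the range copy-on-write in place (vm_map_remap with
	 * vmkf_remap_prot_copy); the remaining protection change below
	 * proceeds as for a normal vm_map_protect() call.
	 */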
6062
6063 vm_map_lock(map);
6064
6065 /* LP64todo - remove this check when vm_map_commpage64()
6066 * no longer has to stuff in a map_entry for the commpage
6067 * above the map's max_offset.
6068 */
6069 if (start >= map->max_offset) {
6070 vm_map_unlock(map);
6071 return KERN_INVALID_ADDRESS;
6072 }
6073
6074 while (1) {
6075 /*
6076 * Lookup the entry. If it doesn't start in a valid
6077 * entry, return an error.
6078 */
6079		if (!vm_map_lookup_entry(map, start, &entry)) {
6080 vm_map_unlock(map);
6081 return KERN_INVALID_ADDRESS;
6082 }
6083
6084 if (entry->superpage_size && (start & (SUPERPAGE_SIZE - 1))) { /* extend request to whole entry */
6085 start = SUPERPAGE_ROUND_DOWN(start);
6086 continue;
6087 }
6088 break;
6089 }
6090 if (entry->superpage_size) {
6091 end = SUPERPAGE_ROUND_UP(end);
6092 }
6093
6094 /*
6095 * Make a first pass to check for protection and address
6096 * violations.
6097 */
6098
6099 current = entry;
6100 prev = current->vme_start;
6101 while ((current != vm_map_to_entry(map)) &&
6102 (current->vme_start < end)) {
6103 /*
6104 * If there is a hole, return an error.
6105 */
6106 if (current->vme_start != prev) {
6107 vm_map_unlock(map);
6108 return KERN_INVALID_ADDRESS;
6109 }
6110
6111 new_max = current->max_protection;
6112
6113#if defined(__x86_64__)
6114 /* Allow max mask to include execute prot bits if this map doesn't enforce CS */
6115 if (set_max && (new_prot & VM_PROT_ALLEXEC) && !vm_map_cs_enforcement(map)) {
6116 new_max = (new_max & ~VM_PROT_ALLEXEC) | (new_prot & VM_PROT_ALLEXEC);
6117 }
6118#elif CODE_SIGNING_MONITOR
6119 if (set_max && (new_prot & VM_PROT_EXECUTE) && (csm_address_space_exempt(map->pmap) == KERN_SUCCESS)) {
6120 new_max |= VM_PROT_EXECUTE;
6121 }
6122#endif
6123 if ((new_prot & new_max) != new_prot) {
6124 vm_map_unlock(map);
6125 return KERN_PROTECTION_FAILURE;
6126 }
6127
6128 if (current->used_for_jit &&
6129		    pmap_has_prot_policy(map->pmap, current->translated_allow_execute, current->protection)) {
6130 vm_map_unlock(map);
6131 return KERN_PROTECTION_FAILURE;
6132 }
6133
6134#if __arm64e__
6135 /* Disallow remapping hw assisted TPRO mappings */
6136 if (current->used_for_tpro) {
6137 vm_map_unlock(map);
6138 return KERN_PROTECTION_FAILURE;
6139 }
6140#endif /* __arm64e__ */
6141
6142
6143 if ((new_prot & VM_PROT_WRITE) &&
6144 (new_prot & VM_PROT_ALLEXEC) &&
6145#if XNU_TARGET_OS_OSX
6146 map->pmap != kernel_pmap &&
6147 (vm_map_cs_enforcement(map)
6148#if __arm64__
6149 || !VM_MAP_IS_EXOTIC(map)
6150#endif /* __arm64__ */
6151 ) &&
6152#endif /* XNU_TARGET_OS_OSX */
6153#if CODE_SIGNING_MONITOR
6154 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
6155#endif
6156 !(current->used_for_jit)) {
6157 DTRACE_VM3(cs_wx,
6158 uint64_t, (uint64_t) current->vme_start,
6159 uint64_t, (uint64_t) current->vme_end,
6160 vm_prot_t, new_prot);
6161			printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
6162			    proc_selfpid(),
6163			    (get_bsdtask_info(current_task())
6164			    ? proc_name_address(get_bsdtask_info(current_task()))
6165			    : "?"),
6166 __FUNCTION__, __LINE__,
6167#if DEVELOPMENT || DEBUG
6168 (uint64_t)current->vme_start,
6169 (uint64_t)current->vme_end,
6170#else /* DEVELOPMENT || DEBUG */
6171 (uint64_t)0,
6172 (uint64_t)0,
6173#endif /* DEVELOPMENT || DEBUG */
6174 new_prot);
6175 new_prot &= ~VM_PROT_ALLEXEC;
6176 if (VM_MAP_POLICY_WX_FAIL(map)) {
6177 vm_map_unlock(map);
6178 return KERN_PROTECTION_FAILURE;
6179 }
6180 }
6181
6182 /*
6183 * If the task has requested executable lockdown,
6184 * deny both:
6185 * - adding executable protections OR
6186 * - adding write protections to an existing executable mapping.
6187 */
6188 if (map->map_disallow_new_exec == TRUE) {
6189 if ((new_prot & VM_PROT_ALLEXEC) ||
6190 ((current->protection & VM_PROT_EXECUTE) && (new_prot & VM_PROT_WRITE))) {
6191 vm_map_unlock(map);
6192 return KERN_PROTECTION_FAILURE;
6193 }
6194 }
6195
6196 prev = current->vme_end;
6197 current = current->vme_next;
6198 }
6199
6200#if __arm64__
6201 if (end > prev &&
6202 end == vm_map_round_page(prev, VM_MAP_PAGE_MASK(map))) {
6203 vm_map_entry_t prev_entry;
6204
6205 prev_entry = current->vme_prev;
6206 if (prev_entry != vm_map_to_entry(map) &&
6207 !prev_entry->map_aligned &&
6208 (vm_map_round_page(prev_entry->vme_end,
6209 VM_MAP_PAGE_MASK(map))
6210 == end)) {
6211 /*
6212 * The last entry in our range is not "map-aligned"
6213 * but it would have reached all the way to "end"
6214 * if it had been map-aligned, so this is not really
6215 * a hole in the range and we can proceed.
6216 */
6217 prev = end;
6218 }
6219 }
6220#endif /* __arm64__ */
6221
6222 if (end > prev) {
6223 vm_map_unlock(map);
6224 return KERN_INVALID_ADDRESS;
6225 }
6226
6227 /*
6228 * Go back and fix up protections.
6229 * Clip to start here if the range starts within
6230 * the entry.
6231 */
6232
6233 current = entry;
6234 if (current != vm_map_to_entry(map)) {
6235 /* clip and unnest if necessary */
6236		vm_map_clip_start(map, current, start);
6237 }
6238
6239 while ((current != vm_map_to_entry(map)) &&
6240 (current->vme_start < end)) {
6241 vm_prot_t old_prot;
6242
6243		vm_map_clip_end(map, current, end);
6244
6245#if DEVELOPMENT || DEBUG
6246 if (current->csm_associated && vm_log_xnu_user_debug) {
6247 printf("FBDP %d[%s] %s(0x%llx,0x%llx,0x%x) on map %p entry %p [0x%llx:0x%llx 0x%x/0x%x] csm_associated\n",
6248 proc_selfpid(),
6249 (get_bsdtask_info(current_task())
6250 ? proc_name_address(get_bsdtask_info(current_task()))
6251 : "?"),
6252 __FUNCTION__,
6253 (uint64_t)start,
6254 (uint64_t)end,
6255 new_prot,
6256 map, current,
6257 current->vme_start,
6258 current->vme_end,
6259 current->protection,
6260 current->max_protection);
6261 }
6262#endif /* DEVELOPMENT || DEBUG */
6263
6264 if (current->is_sub_map) {
6265 /* clipping did unnest if needed */
6266 assert(!current->use_pmap);
6267 }
6268
6269 old_prot = current->protection;
6270
6271 if (set_max) {
6272 current->max_protection = new_prot;
6273 /* Consider either EXECUTE or UEXEC as EXECUTE for this masking */
6274 current->protection = (new_prot & old_prot);
6275 } else {
6276 current->protection = new_prot;
6277 }
6278
6279#if CODE_SIGNING_MONITOR
6280 if (!current->vme_xnu_user_debug &&
6281 /* a !csm_associated mapping becoming executable */
6282 ((!current->csm_associated &&
6283 !(old_prot & VM_PROT_EXECUTE) &&
6284 (current->protection & VM_PROT_EXECUTE))
6285 ||
6286 /* a csm_associated mapping becoming writable */
6287 (current->csm_associated &&
6288 !(old_prot & VM_PROT_WRITE) &&
6289 (current->protection & VM_PROT_WRITE)))) {
6290 /*
6291 * This mapping has not already been marked as
6292 * "user_debug" and it is either:
6293 * 1. not code-signing-monitored and becoming executable
6294 * 2. code-signing-monitored and becoming writable,
6295 * so inform the CodeSigningMonitor and mark the
6296 * mapping as "user_debug" if appropriate.
6297 */
6298 vm_map_kernel_flags_t vmk_flags;
6299 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
6300 /* pretend it's a vm_protect(VM_PROT_COPY)... */
6301 vmk_flags.vmkf_remap_prot_copy = true;
6302 kr = vm_map_entry_cs_associate(map, current, vmk_flags);
6303#if DEVELOPMENT || DEBUG
6304 if (vm_log_xnu_user_debug) {
6305 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] prot 0x%x -> 0x%x cs_associate -> %d user_debug=%d\n",
6306 proc_selfpid(),
6307 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
6308 __FUNCTION__, __LINE__,
6309 map, current,
6310 current->vme_start, current->vme_end,
6311 old_prot, current->protection,
6312 kr, current->vme_xnu_user_debug);
6313 }
6314#endif /* DEVELOPMENT || DEBUG */
6315 }
6316#endif /* CODE_SIGNING_MONITOR */
6317
6318 /*
6319 * Update physical map if necessary.
6320 * If the request is to turn off write protection,
6321 * we won't do it for real (in pmap). This is because
6322 * it would cause copy-on-write to fail. We've already
6323		 * set the new protection in the map, so if a
6324 * write-protect fault occurred, it will be fixed up
6325 * properly, COW or not.
6326 */
6327 if (current->protection != old_prot) {
6328			/* Look one level in, since we support nested pmaps */
6329			/* from mapped submaps which are direct entries */
6330			/* in our map. */
6331
6332 vm_prot_t prot;
6333
6334 prot = current->protection;
6335 if (current->is_sub_map || (VME_OBJECT(current) == NULL) || (VME_OBJECT(current) != compressor_object)) {
6336 prot &= ~VM_PROT_WRITE;
6337 } else {
6338 assert(!VME_OBJECT(current)->code_signed);
6339 assert(VME_OBJECT(current)->copy_strategy == MEMORY_OBJECT_COPY_NONE);
6340 if (prot & VM_PROT_WRITE) {
6341 /*
6342 * For write requests on the
6343					 * compressor, we will ask the
6344 * pmap layer to prevent us from
6345 * taking a write fault when we
6346 * attempt to access the mapping
6347 * next.
6348 */
6349 pmap_options |= PMAP_OPTIONS_PROTECT_IMMEDIATE;
6350 }
6351 }
6352
6353 if (override_nx(map, VME_ALIAS(current)) && prot) {
6354 prot |= VM_PROT_EXECUTE;
6355 }
6356
6357#if DEVELOPMENT || DEBUG
6358 if (!(old_prot & VM_PROT_EXECUTE) &&
6359 (prot & VM_PROT_EXECUTE) &&
6360 panic_on_unsigned_execute &&
6361 (proc_selfcsflags() & CS_KILL)) {
6362 panic("vm_map_protect(%p,0x%llx,0x%llx) old=0x%x new=0x%x - <rdar://23770418> code-signing bypass?", map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, old_prot, prot);
6363 }
6364#endif /* DEVELOPMENT || DEBUG */
6365
6366			if (pmap_has_prot_policy(map->pmap, current->translated_allow_execute, prot)) {
6367 if (current->wired_count) {
6368 panic("vm_map_protect(%p,0x%llx,0x%llx) new=0x%x wired=%x",
6369 map, (uint64_t)current->vme_start, (uint64_t)current->vme_end, prot, current->wired_count);
6370 }
6371
6372 /* If the pmap layer cares about this
6373 * protection type, force a fault for
6374 * each page so that vm_fault will
6375 * repopulate the page with the full
6376 * set of protections.
6377 */
6378 /*
6379 * TODO: We don't seem to need this,
6380 * but this is due to an internal
6381 * implementation detail of
6382 * pmap_protect. Do we want to rely
6383 * on this?
6384 */
6385 prot = VM_PROT_NONE;
6386 }
6387
6388 if (current->is_sub_map && current->use_pmap) {
6389				pmap_protect(VME_SUBMAP(current)->pmap,
6390				    current->vme_start,
6391				    current->vme_end,
6392				    prot);
6393			} else {
6394				pmap_protect_options(map->pmap,
6395				    current->vme_start,
6396				    current->vme_end,
6397				    prot,
6398				    pmap_options,
6399				    NULL);
6400 }
6401 }
6402 current = current->vme_next;
6403 }
6404
6405 current = entry;
6406 while ((current != vm_map_to_entry(map)) &&
6407 (current->vme_start <= end)) {
6408		vm_map_simplify_entry(map, current);
6409 current = current->vme_next;
6410 }
6411
6412 vm_map_unlock(map);
6413 return KERN_SUCCESS;
6414}
6415
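/*
 * Illustrative (hypothetical) usage: to make an existing mapping
 * read-only without touching its maximum protections, a caller would
 * do something like:
 *
 *	kr = vm_map_protect(map, addr, addr + size, VM_PROT_READ, FALSE);
 *
 * whereas passing set_max == TRUE lowers max_protection and intersects
 * the current protection with the new maximum.
 */
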
6416/*
6417 * vm_map_inherit:
6418 *
6419 * Sets the inheritance of the specified address
6420 * range in the target map. Inheritance
6421 * affects how the map will be shared with
6422 * child maps at the time of vm_map_fork.
6423 */
6424kern_return_t
6425vm_map_inherit(
6426 vm_map_t map,
6427 vm_map_offset_t start,
6428 vm_map_offset_t end,
6429 vm_inherit_t new_inheritance)
6430{
6431 vm_map_entry_t entry;
6432 vm_map_entry_t temp_entry;
6433
6434 vm_map_lock(map);
6435
6436 VM_MAP_RANGE_CHECK(map, start, end);
6437
6438 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6439 vm_map_unlock(map);
6440 return KERN_INVALID_ADDRESS;
6441 }
6442
6443	if (vm_map_lookup_entry(map, start, &temp_entry)) {
6444 entry = temp_entry;
6445 } else {
6446 temp_entry = temp_entry->vme_next;
6447 entry = temp_entry;
6448 }
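	/*
	 * At this point "entry" is the first map entry at or after "start":
	 * if the lookup failed, temp_entry was left pointing at the entry
	 * preceding "start", so we stepped forward to its successor.
	 */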
6449
6450 /* first check entire range for submaps which can't support the */
6451 /* given inheritance. */
6452 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6453 if (entry->is_sub_map) {
6454 if (new_inheritance == VM_INHERIT_COPY) {
6455 vm_map_unlock(map);
6456 return KERN_INVALID_ARGUMENT;
6457 }
6458 }
6459
6460 entry = entry->vme_next;
6461 }
6462
6463 entry = temp_entry;
6464 if (entry != vm_map_to_entry(map)) {
6465 /* clip and unnest if necessary */
6466		vm_map_clip_start(map, entry, start);
6467 }
6468
6469 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
6470		vm_map_clip_end(map, entry, end);
6471 if (entry->is_sub_map) {
6472 /* clip did unnest if needed */
6473 assert(!entry->use_pmap);
6474 }
6475
6476 entry->inheritance = new_inheritance;
6477
6478 entry = entry->vme_next;
6479 }
6480
6481 vm_map_unlock(map);
6482 return KERN_SUCCESS;
6483}
6484
6485/*
6486 * Update the accounting for the amount of wired memory in this map. If the user has
6487 * exceeded the defined limits, then we fail. Wiring on behalf of the kernel never fails.
6488 */
6489
6490static kern_return_t
6491add_wire_counts(
6492 vm_map_t map,
6493 vm_map_entry_t entry,
6494 boolean_t user_wire)
6495{
6496 vm_map_size_t size;
6497
6498 bool first_wire = entry->wired_count == 0 && entry->user_wired_count == 0;
6499
6500 if (user_wire) {
6501 unsigned int total_wire_count = vm_page_wire_count + vm_lopage_free_count;
6502
6503 /*
6504 * We're wiring memory at the request of the user. Check if this is the first time the user is wiring
6505 * this map entry.
6506 */
6507
6508 if (entry->user_wired_count == 0) {
6509 size = entry->vme_end - entry->vme_start;
6510
6511 /*
6512 * Since this is the first time the user is wiring this map entry, check to see if we're
6513 * exceeding the user wire limits. There is a per map limit which is the smaller of either
6514 * the process's rlimit or the global vm_per_task_user_wire_limit which caps this value. There is also
6515 * a system-wide limit on the amount of memory all users can wire. If the user is over either
6516 * limit, then we fail.
6517 */
6518
6519 if (size + map->user_wire_size > MIN(map->user_wire_limit, vm_per_task_user_wire_limit) ||
6520 size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6521 if (size + ptoa_64(total_wire_count) > vm_global_user_wire_limit) {
6522#if DEVELOPMENT || DEBUG
6523 if (panic_on_mlock_failure) {
6524 panic("mlock: Over global wire limit. %llu bytes wired and requested to wire %llu bytes more", ptoa_64(total_wire_count), (uint64_t) size);
6525 }
6526#endif /* DEVELOPMENT || DEBUG */
6527 os_atomic_inc(&vm_add_wire_count_over_global_limit, relaxed);
6528 } else {
6529 os_atomic_inc(&vm_add_wire_count_over_user_limit, relaxed);
6530#if DEVELOPMENT || DEBUG
6531 if (panic_on_mlock_failure) {
6532 panic("mlock: Over process wire limit. %llu bytes wired and requested to wire %llu bytes more", (uint64_t) map->user_wire_size, (uint64_t) size);
6533 }
6534#endif /* DEVELOPMENT || DEBUG */
6535 }
6536 return KERN_RESOURCE_SHORTAGE;
6537 }
6538
6539 /*
6540 * The first time the user wires an entry, we also increment the wired_count and add this to
6541 * the total that has been wired in the map.
6542 */
6543
6544 if (entry->wired_count >= MAX_WIRE_COUNT) {
6545 return KERN_FAILURE;
6546 }
6547
6548 entry->wired_count++;
6549 map->user_wire_size += size;
6550 }
6551
6552 if (entry->user_wired_count >= MAX_WIRE_COUNT) {
6553 return KERN_FAILURE;
6554 }
6555
6556 entry->user_wired_count++;
6557 } else {
6558 /*
6559 * The kernel's wiring the memory. Just bump the count and continue.
6560 */
6561
6562 if (entry->wired_count >= MAX_WIRE_COUNT) {
6563 panic("vm_map_wire: too many wirings");
6564 }
6565
6566 entry->wired_count++;
6567 }
6568
6569 if (first_wire) {
6570		vme_btref_consider_and_set(entry, __builtin_frame_address(0));
6571 }
6572
6573 return KERN_SUCCESS;
6574}
6575
6576/*
6577 * Update the memory wiring accounting now that the given map entry is being unwired.
6578 */
6579
6580static void
6581subtract_wire_counts(
6582 vm_map_t map,
6583 vm_map_entry_t entry,
6584 boolean_t user_wire)
6585{
6586 if (user_wire) {
6587 /*
6588 * We're unwiring memory at the request of the user. See if we're removing the last user wire reference.
6589 */
6590
6591 if (entry->user_wired_count == 1) {
6592 /*
6593 * We're removing the last user wire reference. Decrement the wired_count and the total
6594 * user wired memory for this map.
6595 */
6596
6597 assert(entry->wired_count >= 1);
6598 entry->wired_count--;
6599 map->user_wire_size -= entry->vme_end - entry->vme_start;
6600 }
6601
6602 assert(entry->user_wired_count >= 1);
6603 entry->user_wired_count--;
6604 } else {
6605 /*
6606 * The kernel is unwiring the memory. Just update the count.
6607 */
6608
6609 assert(entry->wired_count >= 1);
6610 entry->wired_count--;
6611 }
6612
6613 vme_btref_consider_and_put(entry);
6614}
6615
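/*
 * Count of wire requests rejected because they covered executable
 * mappings in a code-signing-enforced map (see vm_map_wire_nested()).
 */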
6616int cs_executable_wire = 0;
6617
6618/*
6619 * vm_map_wire:
6620 *
6621 * Sets the pageability of the specified address range in the
6622 * target map as wired. Regions specified as not pageable require
6623 * locked-down physical memory and physical page maps. The
6624 * access_type variable indicates types of accesses that must not
6625 * generate page faults. This is checked against protection of
6626 * memory being locked-down.
6627 *
6628 * The map must not be locked, but a reference must remain to the
6629 * map throughout the call.
6630 */
6631static kern_return_t
6632vm_map_wire_nested(
6633 vm_map_t map,
6634 vm_map_offset_t start,
6635 vm_map_offset_t end,
6636 vm_prot_t caller_prot,
6637 vm_tag_t tag,
6638 boolean_t user_wire,
6639 pmap_t map_pmap,
6640 vm_map_offset_t pmap_addr,
6641 ppnum_t *physpage_p)
6642{
6643 vm_map_entry_t entry;
6644 vm_prot_t access_type;
6645 struct vm_map_entry *first_entry, tmp_entry;
6646 vm_map_t real_map;
6647 vm_map_offset_t s, e;
6648 kern_return_t rc;
6649 boolean_t need_wakeup;
6650 boolean_t main_map = FALSE;
6651 wait_interrupt_t interruptible_state;
6652 thread_t cur_thread;
6653 unsigned int last_timestamp;
6654 vm_map_size_t size;
6655 boolean_t wire_and_extract;
6656 vm_prot_t extra_prots;
6657
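	/*
	 * Ask the lookup/fault path below to resolve any copy-on-write now
	 * (VM_PROT_COPY) and, on code-signing-enforced maps, to refuse the
	 * copy if the mapping is executable, since the copied pages would
	 * no longer be code-signed.
	 */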
6658 extra_prots = VM_PROT_COPY;
6659 extra_prots |= VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6660#if XNU_TARGET_OS_OSX
6661 if (map->pmap == kernel_pmap ||
6662 !vm_map_cs_enforcement(map)) {
6663 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6664 }
6665#endif /* XNU_TARGET_OS_OSX */
6666#if CODE_SIGNING_MONITOR
6667 if (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) {
6668 extra_prots &= ~VM_PROT_COPY_FAIL_IF_EXECUTABLE;
6669 }
6670#endif /* CODE_SIGNING_MONITOR */
6671
6672 access_type = (caller_prot & (VM_PROT_ALL | VM_PROT_ALLEXEC));
6673
6674 wire_and_extract = FALSE;
6675 if (physpage_p != NULL) {
6676 /*
6677 * The caller wants the physical page number of the
6678 * wired page. We return only one physical page number
6679 * so this works for only one page at a time.
6680 */
6681 if ((end - start) != PAGE_SIZE) {
6682 return KERN_INVALID_ARGUMENT;
6683 }
6684 wire_and_extract = TRUE;
6685 *physpage_p = 0;
6686 }
6687
6688 vm_map_lock(map);
6689 if (map_pmap == NULL) {
6690 main_map = TRUE;
6691 }
6692 last_timestamp = map->timestamp;
6693
6694 VM_MAP_RANGE_CHECK(map, start, end);
6695 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
6696 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
6697
6698 if (start == end) {
6699 /* We wired what the caller asked for, zero pages */
6700 vm_map_unlock(map);
6701 return KERN_SUCCESS;
6702 }
6703
6704 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
6705 vm_map_unlock(map);
6706 return KERN_INVALID_ADDRESS;
6707 }
6708
6709 need_wakeup = FALSE;
6710 cur_thread = current_thread();
6711
6712 s = start;
6713 rc = KERN_SUCCESS;
6714
6715	if (vm_map_lookup_entry(map, s, &first_entry)) {
6716 entry = first_entry;
6717 /*
6718 * vm_map_clip_start will be done later.
6719 * We don't want to unnest any nested submaps here !
6720 */
6721 } else {
6722 /* Start address is not in map */
6723 rc = KERN_INVALID_ADDRESS;
6724 goto done;
6725 }
6726
6727 while ((entry != vm_map_to_entry(map)) && (s < end)) {
6728 /*
6729 * At this point, we have wired from "start" to "s".
6730 * We still need to wire from "s" to "end".
6731 *
6732 * "entry" hasn't been clipped, so it could start before "s"
6733 * and/or end after "end".
6734 */
6735
6736 /* "e" is how far we want to wire in this entry */
6737 e = entry->vme_end;
6738 if (e > end) {
6739 e = end;
6740 }
6741
6742 /*
6743 * If another thread is wiring/unwiring this entry then
6744 * block after informing other thread to wake us up.
6745 */
6746 if (entry->in_transition) {
6747 wait_result_t wait_result;
6748
6749 /*
6750 * We have not clipped the entry. Make sure that
6751 * the start address is in range so that the lookup
6752 * below will succeed.
6753 * "s" is the current starting point: we've already
6754 * wired from "start" to "s" and we still have
6755 * to wire from "s" to "end".
6756 */
6757
6758 entry->needs_wakeup = TRUE;
6759
6760 /*
6761 * wake up anybody waiting on entries that we have
6762 * already wired.
6763 */
6764 if (need_wakeup) {
6765 vm_map_entry_wakeup(map);
6766 need_wakeup = FALSE;
6767 }
6768 /*
6769 * User wiring is interruptible
6770 */
6771 wait_result = vm_map_entry_wait(map,
6772 (user_wire) ? THREAD_ABORTSAFE :
6773 THREAD_UNINT);
6774 if (user_wire && wait_result == THREAD_INTERRUPTED) {
6775 /*
6776 * undo the wirings we have done so far
6777 * We do not clear the needs_wakeup flag,
6778 * because we cannot tell if we were the
6779 * only one waiting.
6780 */
6781 rc = KERN_FAILURE;
6782 goto done;
6783 }
6784
6785 /*
6786 * Cannot avoid a lookup here. reset timestamp.
6787 */
6788 last_timestamp = map->timestamp;
6789
6790 /*
6791 * The entry could have been clipped, look it up again.
6792			 * The worst that can happen is that it no longer exists.
6793 */
6794			if (!vm_map_lookup_entry(map, s, &first_entry)) {
6795 /*
6796				 * User: undo everything up to the previous
6797				 * entry. Let vm_map_unwire worry about
6798 * checking the validity of the range.
6799 */
6800 rc = KERN_FAILURE;
6801 goto done;
6802 }
6803 entry = first_entry;
6804 continue;
6805 }
6806
6807 if (entry->is_sub_map) {
6808 vm_map_offset_t sub_start;
6809 vm_map_offset_t sub_end;
6810 vm_map_offset_t local_start;
6811 vm_map_offset_t local_end;
6812 pmap_t pmap;
6813
6814 if (wire_and_extract) {
6815 /*
6816 * Wiring would result in copy-on-write
6817 * which would not be compatible with
6818 * the sharing we have with the original
6819 * provider of this memory.
6820 */
6821 rc = KERN_INVALID_ARGUMENT;
6822 goto done;
6823 }
6824
6825			vm_map_clip_start(map, entry, s);
6826			vm_map_clip_end(map, entry, end);
6827
6828 sub_start = VME_OFFSET(entry);
6829 sub_end = entry->vme_end;
6830 sub_end += VME_OFFSET(entry) - entry->vme_start;
6831
6832 local_end = entry->vme_end;
6833 if (map_pmap == NULL) {
6834 vm_object_t object;
6835 vm_object_offset_t offset;
6836 vm_prot_t prot;
6837 boolean_t wired;
6838 vm_map_entry_t local_entry;
6839 vm_map_version_t version;
6840 vm_map_t lookup_map;
6841
6842 if (entry->use_pmap) {
6843 pmap = VME_SUBMAP(entry)->pmap;
6844 /* ppc implementation requires that */
6845 /* submaps pmap address ranges line */
6846 /* up with parent map */
6847#ifdef notdef
6848 pmap_addr = sub_start;
6849#endif
6850 pmap_addr = s;
6851 } else {
6852 pmap = map->pmap;
6853 pmap_addr = s;
6854 }
6855
6856 if (entry->wired_count) {
6857 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6858 goto done;
6859 }
6860
6861 /*
6862 * The map was not unlocked:
6863 * no need to goto re-lookup.
6864 * Just go directly to next entry.
6865 */
6866 entry = entry->vme_next;
6867 s = entry->vme_start;
6868 continue;
6869 }
6870
6871 /* call vm_map_lookup_and_lock_object to */
6872 /* cause any needs copy to be */
6873 /* evaluated */
6874 local_start = entry->vme_start;
6875 lookup_map = map;
6876 vm_map_lock_write_to_read(map);
6877				rc = vm_map_lookup_and_lock_object(
6878					&lookup_map, local_start,
6879					(access_type | extra_prots),
6880					OBJECT_LOCK_EXCLUSIVE,
6881					&version, &object,
6882					&offset, &prot, &wired,
6883					NULL,
6884					&real_map, NULL);
6885 if (rc != KERN_SUCCESS) {
6886 vm_map_unlock_read(lookup_map);
6887 assert(map_pmap == NULL);
6888					vm_map_unwire(map, start,
6889					    s, user_wire);
6890 return rc;
6891 }
6892 vm_object_unlock(object);
6893 if (real_map != lookup_map) {
6894 vm_map_unlock(real_map);
6895 }
6896 vm_map_unlock_read(lookup_map);
6897 vm_map_lock(map);
6898
6899 /* we unlocked, so must re-lookup */
6900				if (!vm_map_lookup_entry(map,
6901					local_start,
6902					&local_entry)) {
6903 rc = KERN_FAILURE;
6904 goto done;
6905 }
6906
6907 /*
6908 * entry could have been "simplified",
6909 * so re-clip
6910 */
6911 entry = local_entry;
6912 assert(s == local_start);
6913				vm_map_clip_start(map, entry, s);
6914				vm_map_clip_end(map, entry, end);
6915 /* re-compute "e" */
6916 e = entry->vme_end;
6917 if (e > end) {
6918 e = end;
6919 }
6920
6921 /* did we have a change of type? */
6922 if (!entry->is_sub_map) {
6923 last_timestamp = map->timestamp;
6924 continue;
6925 }
6926 } else {
6927 local_start = entry->vme_start;
6928 pmap = map_pmap;
6929 }
6930
6931 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
6932 goto done;
6933 }
6934
6935 entry->in_transition = TRUE;
6936
6937 vm_map_unlock(map);
6938			rc = vm_map_wire_nested(VME_SUBMAP(entry),
6939			    sub_start, sub_end,
6940			    caller_prot, tag,
6941			    user_wire, pmap, pmap_addr,
6942			    NULL);
6943 vm_map_lock(map);
6944
6945 /*
6946 * Find the entry again. It could have been clipped
6947 * after we unlocked the map.
6948 */
6949			if (!vm_map_lookup_entry(map, local_start,
6950			    &first_entry)) {
6951 panic("vm_map_wire: re-lookup failed");
6952 }
6953 entry = first_entry;
6954
6955 assert(local_start == s);
6956 /* re-compute "e" */
6957 e = entry->vme_end;
6958 if (e > end) {
6959 e = end;
6960 }
6961
6962 last_timestamp = map->timestamp;
6963 while ((entry != vm_map_to_entry(map)) &&
6964 (entry->vme_start < e)) {
6965 assert(entry->in_transition);
6966 entry->in_transition = FALSE;
6967 if (entry->needs_wakeup) {
6968 entry->needs_wakeup = FALSE;
6969 need_wakeup = TRUE;
6970 }
6971 if (rc != KERN_SUCCESS) {/* from vm_*_wire */
6972 subtract_wire_counts(map, entry, user_wire);
6973 }
6974 entry = entry->vme_next;
6975 }
6976 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
6977 goto done;
6978 }
6979
6980 /* no need to relookup again */
6981 s = entry->vme_start;
6982 continue;
6983 }
6984
6985 /*
6986 * If this entry is already wired then increment
6987 * the appropriate wire reference count.
6988 */
6989 if (entry->wired_count) {
6990 if ((entry->protection & access_type) != access_type) {
6991 /* found a protection problem */
6992
6993 /*
6994 * XXX FBDP
6995 * We should always return an error
6996 * in this case but since we didn't
6997 * enforce it before, let's do
6998 * it only for the new "wire_and_extract"
6999 * code path for now...
7000 */
7001 if (wire_and_extract) {
7002 rc = KERN_PROTECTION_FAILURE;
7003 goto done;
7004 }
7005 }
7006
7007 /*
7008 * entry is already wired down, get our reference
7009 * after clipping to our range.
7010 */
7011			vm_map_clip_start(map, entry, s);
7012			vm_map_clip_end(map, entry, end);
7013
7014 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7015 goto done;
7016 }
7017
7018 if (wire_and_extract) {
7019 vm_object_t object;
7020 vm_object_offset_t offset;
7021 vm_page_t m;
7022
7023 /*
7024 * We don't have to "wire" the page again
7025				 * but we still have to "extract" its
7026 * physical page number, after some sanity
7027 * checks.
7028 */
7029 assert((entry->vme_end - entry->vme_start)
7030 == PAGE_SIZE);
7031 assert(!entry->needs_copy);
7032 assert(!entry->is_sub_map);
7033 assert(VME_OBJECT(entry));
7034 if (((entry->vme_end - entry->vme_start)
7035 != PAGE_SIZE) ||
7036 entry->needs_copy ||
7037 entry->is_sub_map ||
7038 VME_OBJECT(entry) == VM_OBJECT_NULL) {
7039 rc = KERN_INVALID_ARGUMENT;
7040 goto done;
7041 }
7042
7043 object = VME_OBJECT(entry);
7044 offset = VME_OFFSET(entry);
7045 /* need exclusive lock to update m->dirty */
7046 if (entry->protection & VM_PROT_WRITE) {
7047 vm_object_lock(object);
7048 } else {
7049 vm_object_lock_shared(object);
7050 }
7051 m = vm_page_lookup(object, offset);
7052 assert(m != VM_PAGE_NULL);
7053 assert(VM_PAGE_WIRED(m));
7054 if (m != VM_PAGE_NULL && VM_PAGE_WIRED(m)) {
7055 *physpage_p = VM_PAGE_GET_PHYS_PAGE(m);
7056 if (entry->protection & VM_PROT_WRITE) {
7057 vm_object_lock_assert_exclusive(
7058 object);
7059 m->vmp_dirty = TRUE;
7060 }
7061 } else {
7062 /* not already wired !? */
7063 *physpage_p = 0;
7064 }
7065 vm_object_unlock(object);
7066 }
7067
7068 /* map was not unlocked: no need to relookup */
7069 entry = entry->vme_next;
7070 s = entry->vme_start;
7071 continue;
7072 }
7073
7074 /*
7075 * Unwired entry or wire request transmitted via submap
7076 */
7077
7078 /*
7079 * Wiring would copy the pages to the shadow object.
7080 * The shadow object would not be code-signed so
7081 * attempting to execute code from these copied pages
7082 * would trigger a code-signing violation.
7083 */
7084
7085 if ((entry->protection & VM_PROT_EXECUTE)
7086#if XNU_TARGET_OS_OSX
7087 &&
7088 map->pmap != kernel_pmap &&
7089 (vm_map_cs_enforcement(map)
7090#if __arm64__
7091 || !VM_MAP_IS_EXOTIC(map)
7092#endif /* __arm64__ */
7093 )
7094#endif /* XNU_TARGET_OS_OSX */
7095#if CODE_SIGNING_MONITOR
7096 &&
7097 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS)
7098#endif
7099 ) {
7100#if MACH_ASSERT
7101 printf("pid %d[%s] wiring executable range from "
7102 "0x%llx to 0x%llx: rejected to preserve "
7103 "code-signing\n",
7104 proc_selfpid(),
7105 (get_bsdtask_info(current_task())
7106 ? proc_name_address(get_bsdtask_info(current_task()))
7107 : "?"),
7108 (uint64_t) entry->vme_start,
7109 (uint64_t) entry->vme_end);
7110#endif /* MACH_ASSERT */
7111 DTRACE_VM2(cs_executable_wire,
7112 uint64_t, (uint64_t)entry->vme_start,
7113 uint64_t, (uint64_t)entry->vme_end);
7114 cs_executable_wire++;
7115 rc = KERN_PROTECTION_FAILURE;
7116 goto done;
7117 }
7118
7119 /*
7120 * Perform actions of vm_map_lookup that need the write
7121 * lock on the map: create a shadow object for a
7122 * copy-on-write region, or an object for a zero-fill
7123 * region.
7124 */
7125 size = entry->vme_end - entry->vme_start;
7126 /*
7127 * If wiring a copy-on-write page, we need to copy it now
7128 * even if we're only (currently) requesting read access.
7129 * This is aggressive, but once it's wired we can't move it.
7130 */
7131 if (entry->needs_copy) {
7132 if (wire_and_extract) {
7133 /*
7134 * We're supposed to share with the original
7135 * provider so should not be "needs_copy"
7136 */
7137 rc = KERN_INVALID_ARGUMENT;
7138 goto done;
7139 }
7140
7141			VME_OBJECT_SHADOW(entry, size,
7142			    vm_map_always_shadow(map));
7143 entry->needs_copy = FALSE;
7144 } else if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
7145 if (wire_and_extract) {
7146 /*
7147 * We're supposed to share with the original
7148 * provider so should already have an object.
7149 */
7150 rc = KERN_INVALID_ARGUMENT;
7151 goto done;
7152 }
7153			VME_OBJECT_SET(entry, vm_object_allocate(size), false, 0);
7154			VME_OFFSET_SET(entry, (vm_object_offset_t)0);
7155 assert(entry->use_pmap);
7156 } else if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7157 if (wire_and_extract) {
7158 /*
7159 * We're supposed to share with the original
7160 * provider so should not be COPY_SYMMETRIC.
7161 */
7162 rc = KERN_INVALID_ARGUMENT;
7163 goto done;
7164 }
7165 /*
7166 * Force an unrequested "copy-on-write" but only for
7167 * the range we're wiring.
7168 */
7169// printf("FBDP %s:%d map %p entry %p [ 0x%llx 0x%llx ] s 0x%llx end 0x%llx wire&extract=%d\n", __FUNCTION__, __LINE__, map, entry, (uint64_t)entry->vme_start, (uint64_t)entry->vme_end, (uint64_t)s, (uint64_t)end, wire_and_extract);
7170			vm_map_clip_start(map, entry, s);
7171			vm_map_clip_end(map, entry, end);
7172 /* recompute "size" */
7173 size = entry->vme_end - entry->vme_start;
7174 /* make a shadow object */
7175 vm_object_t orig_object;
7176 vm_object_offset_t orig_offset;
7177 orig_object = VME_OBJECT(entry);
7178 orig_offset = VME_OFFSET(entry);
7179			VME_OBJECT_SHADOW(entry, size, vm_map_always_shadow(map));
7180 if (VME_OBJECT(entry) != orig_object) {
7181 /*
7182 * This mapping has not been shared (or it would be
7183 * COPY_DELAY instead of COPY_SYMMETRIC) and it has
7184 * not been copied-on-write (or it would be marked
7185 * as "needs_copy" and would have been handled above
7186 * and also already write-protected).
7187 * We still need to write-protect here to prevent
7188 * other threads from modifying these pages while
7189 * we're in the process of copying and wiring
7190 * the copied pages.
7191 * Since the mapping is neither shared nor COWed,
7192 * we only need to write-protect the PTEs for this
7193 * mapping.
7194 */
7195				vm_object_pmap_protect(orig_object,
7196				    orig_offset,
7197				    size,
7198				    map->pmap,
7199				    VM_MAP_PAGE_SIZE(map),
7200				    entry->vme_start,
7201				    entry->protection & ~VM_PROT_WRITE);
7202 }
7203 }
7204 if (VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7205 /*
7206 * Make the object COPY_DELAY to get a stable object
7207 * to wire.
7208 * That should avoid creating long shadow chains while
7209 * wiring/unwiring the same range repeatedly.
7210 * That also prevents part of the object from being
7211 * wired while another part is "needs_copy", which
7212 * could result in conflicting rules wrt copy-on-write.
7213 */
7214 vm_object_t object;
7215
7216 object = VME_OBJECT(entry);
7217 vm_object_lock(object);
7218 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
7219 assertf(vm_object_round_page(VME_OFFSET(entry) + size) - vm_object_trunc_page(VME_OFFSET(entry)) == object->vo_size,
7220 "object %p size 0x%llx entry %p [0x%llx:0x%llx:0x%llx] size 0x%llx\n",
7221 object, (uint64_t)object->vo_size,
7222 entry,
7223 (uint64_t)entry->vme_start,
7224 (uint64_t)entry->vme_end,
7225 (uint64_t)VME_OFFSET(entry),
7226 (uint64_t)size);
7227 assertf(object->ref_count == 1,
7228 "object %p ref_count %d\n",
7229 object, object->ref_count);
7230 assertf(!entry->needs_copy,
7231 "entry %p\n", entry);
7232 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7233 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
7234 }
7235 vm_object_unlock(object);
7236 }
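		/*
		 * The entry's object is now stable (COPY_DELAY, no pending
		 * copy-on-write), so clip the entry to the wire range and
		 * fault its pages in wired via vm_fault_wire() below.
		 */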
7237
7238		vm_map_clip_start(map, entry, s);
7239		vm_map_clip_end(map, entry, end);
7240
7241 /* re-compute "e" */
7242 e = entry->vme_end;
7243 if (e > end) {
7244 e = end;
7245 }
7246
7247 /*
7248 * Check for holes and protection mismatch.
7249 * Holes: Next entry should be contiguous unless this
7250 * is the end of the region.
7251 * Protection: Access requested must be allowed, unless
7252 * wiring is by protection class
7253 */
7254 if ((entry->vme_end < end) &&
7255 ((entry->vme_next == vm_map_to_entry(map)) ||
7256 (entry->vme_next->vme_start > entry->vme_end))) {
7257 /* found a hole */
7258 rc = KERN_INVALID_ADDRESS;
7259 goto done;
7260 }
7261 if ((entry->protection & access_type) != access_type) {
7262 /* found a protection problem */
7263 rc = KERN_PROTECTION_FAILURE;
7264 goto done;
7265 }
7266
7267 assert(entry->wired_count == 0 && entry->user_wired_count == 0);
7268
7269 if ((rc = add_wire_counts(map, entry, user_wire)) != KERN_SUCCESS) {
7270 goto done;
7271 }
7272
7273 entry->in_transition = TRUE;
7274
7275 /*
7276 * This entry might get split once we unlock the map.
7277 * In vm_fault_wire(), we need the current range as
7278 * defined by this entry. In order for this to work
7279 * along with a simultaneous clip operation, we make a
7280 * temporary copy of this entry and use that for the
7281 * wiring. Note that the underlying objects do not
7282 * change during a clip.
7283 */
7284 tmp_entry = *entry;
7285
7286 /*
7287		 * The in_transition state guarantees that the entry
7288		 * (or entries for this range, if a split occurred) will be
7289 * there when the map lock is acquired for the second time.
7290 */
7291 vm_map_unlock(map);
7292
7293 if (!user_wire && cur_thread != THREAD_NULL) {
7294 interruptible_state = thread_interrupt_level(THREAD_UNINT);
7295 } else {
7296 interruptible_state = THREAD_UNINT;
7297 }
7298
7299 if (map_pmap) {
7300			rc = vm_fault_wire(map,
7301			    &tmp_entry, caller_prot, tag, map_pmap, pmap_addr,
7302			    physpage_p);
7303		} else {
7304			rc = vm_fault_wire(map,
7305			    &tmp_entry, caller_prot, tag, map->pmap,
7306			    tmp_entry.vme_start,
7307			    physpage_p);
7308 }
7309
7310 if (!user_wire && cur_thread != THREAD_NULL) {
7311			thread_interrupt_level(interruptible_state);
7312 }
7313
7314 vm_map_lock(map);
7315
7316 if (last_timestamp + 1 != map->timestamp) {
7317 /*
7318 * Find the entry again. It could have been clipped
7319 * after we unlocked the map.
7320 */
7321			if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7322			    &first_entry)) {
7323 panic("vm_map_wire: re-lookup failed");
7324 }
7325
7326 entry = first_entry;
7327 }
7328
7329 last_timestamp = map->timestamp;
7330
7331 while ((entry != vm_map_to_entry(map)) &&
7332 (entry->vme_start < tmp_entry.vme_end)) {
7333 assert(entry->in_transition);
7334 entry->in_transition = FALSE;
7335 if (entry->needs_wakeup) {
7336 entry->needs_wakeup = FALSE;
7337 need_wakeup = TRUE;
7338 }
7339 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7340 subtract_wire_counts(map, entry, user_wire);
7341 }
7342 entry = entry->vme_next;
7343 }
7344
7345 if (rc != KERN_SUCCESS) { /* from vm_*_wire */
7346 goto done;
7347 }
7348
7349 if ((entry != vm_map_to_entry(map)) && /* we still have entries in the map */
7350 (tmp_entry.vme_end != end) && /* AND, we are not at the end of the requested range */
7351 (entry->vme_start != tmp_entry.vme_end)) { /* AND, the next entry is not contiguous. */
7352 /* found a "new" hole */
7353 s = tmp_entry.vme_end;
7354 rc = KERN_INVALID_ADDRESS;
7355 goto done;
7356 }
7357
7358 s = entry->vme_start;
7359 } /* end while loop through map entries */
7360
7361done:
7362 if (rc == KERN_SUCCESS) {
7363 /* repair any damage we may have made to the VM map */
7364 vm_map_simplify_range(map, start, end);
7365 }
7366
7367 vm_map_unlock(map);
7368
7369 /*
7370 * wake up anybody waiting on entries we wired.
7371 */
7372 if (need_wakeup) {
7373 vm_map_entry_wakeup(map);
7374 }
7375
7376 if (rc != KERN_SUCCESS) {
7377 /* undo what has been wired so far */
7378		vm_map_unwire_nested(map, start, s, user_wire,
7379		    map_pmap, pmap_addr);
7380 if (physpage_p) {
7381 *physpage_p = 0;
7382 }
7383 }
7384
7385 return rc;
7386}
7387
7388kern_return_t
7389vm_map_wire_external(
7390 vm_map_t map,
7391 vm_map_offset_t start,
7392 vm_map_offset_t end,
7393 vm_prot_t caller_prot,
7394 boolean_t user_wire)
7395{
7396 kern_return_t kret;
7397
7398	kret = vm_map_wire_nested(map, start, end, caller_prot, vm_tag_bt(),
7399	    user_wire, (pmap_t)NULL, 0, NULL);
7400 return kret;
7401}
7402
7403kern_return_t
7404vm_map_wire_kernel(
7405 vm_map_t map,
7406 vm_map_offset_t start,
7407 vm_map_offset_t end,
7408 vm_prot_t caller_prot,
7409 vm_tag_t tag,
7410 boolean_t user_wire)
7411{
7412 kern_return_t kret;
7413
7414	kret = vm_map_wire_nested(map, start, end, caller_prot, tag,
7415	    user_wire, (pmap_t)NULL, 0, NULL);
7416 return kret;
7417}
7418
7419kern_return_t
7420vm_map_wire_and_extract_external(
7421 vm_map_t map,
7422 vm_map_offset_t start,
7423 vm_prot_t caller_prot,
7424 boolean_t user_wire,
7425 ppnum_t *physpage_p)
7426{
7427 kern_return_t kret;
7428
7429	kret = vm_map_wire_nested(map,
7430	    start,
7431	    start + VM_MAP_PAGE_SIZE(map),
7432	    caller_prot,
7433	    vm_tag_bt(),
7434	    user_wire,
7435	    (pmap_t)NULL,
7436	    0,
7437	    physpage_p);
7438 if (kret != KERN_SUCCESS &&
7439 physpage_p != NULL) {
7440 *physpage_p = 0;
7441 }
7442 return kret;
7443}
7444
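/*
 * The *_external variants above derive the VM tag from the caller's
 * backtrace (vm_tag_bt()), while vm_map_wire_kernel() takes an explicit
 * tag; all of them funnel into vm_map_wire_nested().
 */
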
7445/*
7446 * vm_map_unwire:
7447 *
7448 * Sets the pageability of the specified address range in the target
7449 * as pageable. Regions specified must have been wired previously.
7450 *
7451 * The map must not be locked, but a reference must remain to the map
7452 * throughout the call.
7453 *
7454 * Kernel will panic on failures. User unwire ignores holes and
7455 *	unwired and in-transition entries to avoid losing memory by leaving
7456 * it unwired.
7457 */
7458static kern_return_t
7459vm_map_unwire_nested(
7460 vm_map_t map,
7461 vm_map_offset_t start,
7462 vm_map_offset_t end,
7463 boolean_t user_wire,
7464 pmap_t map_pmap,
7465 vm_map_offset_t pmap_addr)
7466{
7467 vm_map_entry_t entry;
7468 struct vm_map_entry *first_entry, tmp_entry;
7469 boolean_t need_wakeup;
7470 boolean_t main_map = FALSE;
7471 unsigned int last_timestamp;
7472
7473 vm_map_lock(map);
7474 if (map_pmap == NULL) {
7475 main_map = TRUE;
7476 }
7477 last_timestamp = map->timestamp;
7478
7479 VM_MAP_RANGE_CHECK(map, start, end);
7480 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
7481 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
7482
7483 if (start == end) {
7484 /* We unwired what the caller asked for: zero pages */
7485 vm_map_unlock(map);
7486 return KERN_SUCCESS;
7487 }
7488
7489 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
7490 vm_map_unlock(map);
7491 return KERN_INVALID_ADDRESS;
7492 }
7493
7494	if (vm_map_lookup_entry(map, start, &first_entry)) {
7495 entry = first_entry;
7496 /*
7497 * vm_map_clip_start will be done later.
7498 * We don't want to unnest any nested sub maps here !
7499 */
7500 } else {
7501 if (!user_wire) {
7502 panic("vm_map_unwire: start not found");
7503 }
7504 /* Start address is not in map. */
7505 vm_map_unlock(map);
7506 return KERN_INVALID_ADDRESS;
7507 }
7508
7509 if (entry->superpage_size) {
7510 /* superpages are always wired */
7511 vm_map_unlock(map);
7512 return KERN_INVALID_ADDRESS;
7513 }
7514
7515 need_wakeup = FALSE;
7516 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
7517 if (entry->in_transition) {
7518 /*
7519 * 1)
7520 * Another thread is wiring down this entry. Note
7521			 * Another thread is wiring down this entry. Note
7522			 * that, were it not for the other thread, we would
7523			 * be unwiring an unwired entry. This is not
7524 * we did not wire.
7525 *
7526 * 2)
7527 * Another thread is unwiring this entry. We did not
7528 * have a reference to it, because if we did, this
7529 * entry will not be getting unwired now.
7530 */
7531 if (!user_wire) {
7532 /*
7533 * XXX FBDP
7534 * This could happen: there could be some
7535 * overlapping vslock/vsunlock operations
7536 * going on.
7537 * We should probably just wait and retry,
7538 * but then we have to be careful that this
7539 * entry could get "simplified" after
7540 * "in_transition" gets unset and before
7541 * we re-lookup the entry, so we would
7542 * have to re-clip the entry to avoid
7543 * re-unwiring what we have already unwired...
7544 * See vm_map_wire_nested().
7545 *
7546 * Or we could just ignore "in_transition"
7547				 * here and proceed to decrement the wired
7548 * count(s) on this entry. That should be fine
7549 * as long as "wired_count" doesn't drop all
7550 * the way to 0 (and we should panic if THAT
7551 * happens).
7552 */
7553 panic("vm_map_unwire: in_transition entry");
7554 }
7555
7556 entry = entry->vme_next;
7557 continue;
7558 }
7559
7560 if (entry->is_sub_map) {
7561 vm_map_offset_t sub_start;
7562 vm_map_offset_t sub_end;
7563 vm_map_offset_t local_end;
7564 pmap_t pmap;
7565
7566			vm_map_clip_start(map, entry, start);
7567			vm_map_clip_end(map, entry, end);
7568
7569 sub_start = VME_OFFSET(entry);
7570 sub_end = entry->vme_end - entry->vme_start;
7571 sub_end += VME_OFFSET(entry);
7572 local_end = entry->vme_end;
7573 if (map_pmap == NULL) {
7574 if (entry->use_pmap) {
7575 pmap = VME_SUBMAP(entry)->pmap;
7576 pmap_addr = sub_start;
7577 } else {
7578 pmap = map->pmap;
7579 pmap_addr = start;
7580 }
7581 if (entry->wired_count == 0 ||
7582 (user_wire && entry->user_wired_count == 0)) {
7583 if (!user_wire) {
7584 panic("vm_map_unwire: entry is unwired");
7585 }
7586 entry = entry->vme_next;
7587 continue;
7588 }
7589
7590 /*
7591 * Check for holes: the next entry should be
7592 * contiguous unless this is the end of the
7593 * region.
7594 */
7595 if (((entry->vme_end < end) &&
7596 ((entry->vme_next == vm_map_to_entry(map)) ||
7597 (entry->vme_next->vme_start
7598 > entry->vme_end)))) {
7599 if (!user_wire) {
7600 panic("vm_map_unwire: non-contiguous region");
7601 }
7602/*
7603 * entry = entry->vme_next;
7604 * continue;
7605 */
7606 }
7607
7608 subtract_wire_counts(map, entry, user_wire);
7609
7610 if (entry->wired_count != 0) {
7611 entry = entry->vme_next;
7612 continue;
7613 }
7614
7615 entry->in_transition = TRUE;
7616 tmp_entry = *entry;/* see comment in vm_map_wire() */
7617
7618 /*
7619 * We can unlock the map now. The in_transition state
7620 * guarantees the existence of the entry.
7621 */
7622 vm_map_unlock(map);
7623 vm_map_unwire_nested(VME_SUBMAP(entry),
7624 sub_start, sub_end, user_wire, pmap, pmap_addr);
7625 vm_map_lock(map);
7626
7627 if (last_timestamp + 1 != map->timestamp) {
7628 /*
7629 * Find the entry again. It could have been
7630 * clipped or deleted after we unlocked the map.
7631 */
7632 if (!vm_map_lookup_entry(map,
7633 tmp_entry.vme_start,
7634 &first_entry)) {
7635 if (!user_wire) {
7636 panic("vm_map_unwire: re-lookup failed");
7637 }
7638 entry = first_entry->vme_next;
7639 } else {
7640 entry = first_entry;
7641 }
7642 }
7643 last_timestamp = map->timestamp;
7644
7645 /*
7646 * clear transition bit for all constituent entries
7647 * that were in the original entry (saved in
7648 * tmp_entry). Also check for waiters.
7649 */
7650 while ((entry != vm_map_to_entry(map)) &&
7651 (entry->vme_start < tmp_entry.vme_end)) {
7652 assert(entry->in_transition);
7653 entry->in_transition = FALSE;
7654 if (entry->needs_wakeup) {
7655 entry->needs_wakeup = FALSE;
7656 need_wakeup = TRUE;
7657 }
7658 entry = entry->vme_next;
7659 }
7660 continue;
7661 } else {
7662 tmp_entry = *entry;
7663 vm_map_unlock(map);
7664 vm_map_unwire_nested(VME_SUBMAP(entry),
7665 sub_start, sub_end, user_wire, map_pmap,
7666 pmap_addr);
7667 vm_map_lock(map);
7668
7669 if (last_timestamp + 1 != map->timestamp) {
7670 /*
7671 * Find the entry again. It could have been
7672 * clipped or deleted after we unlocked the map.
7673 */
7674 if (!vm_map_lookup_entry(map,
7675 tmp_entry.vme_start,
7676 &first_entry)) {
7677 if (!user_wire) {
7678 panic("vm_map_unwire: re-lookup failed");
7679 }
7680 entry = first_entry->vme_next;
7681 } else {
7682 entry = first_entry;
7683 }
7684 }
7685 last_timestamp = map->timestamp;
7686 }
7687 }
7688
7689
7690 if ((entry->wired_count == 0) ||
7691 (user_wire && entry->user_wired_count == 0)) {
7692 if (!user_wire) {
7693 panic("vm_map_unwire: entry is unwired");
7694 }
7695
7696 entry = entry->vme_next;
7697 continue;
7698 }
7699
7700 assert(entry->wired_count > 0 &&
7701 (!user_wire || entry->user_wired_count > 0));
7702
7703 vm_map_clip_start(map, entry, start);
7704 vm_map_clip_end(map, entry, end);
7705
7706 /*
7707 * Check for holes: the next entry should be
7708 * contiguous unless this is the end of the
7709 * region.
7710 */
7711 if (((entry->vme_end < end) &&
7712 ((entry->vme_next == vm_map_to_entry(map)) ||
7713 (entry->vme_next->vme_start > entry->vme_end)))) {
7714 if (!user_wire) {
7715 panic("vm_map_unwire: non-contiguous region");
7716 }
7717 entry = entry->vme_next;
7718 continue;
7719 }
7720
7721 subtract_wire_counts(map, entry, user_wire);
7722
7723 if (entry->wired_count != 0) {
7724 entry = entry->vme_next;
7725 continue;
7726 }
7727
7728 if (entry->zero_wired_pages) {
7729 entry->zero_wired_pages = FALSE;
7730 }
7731
7732 entry->in_transition = TRUE;
7733 tmp_entry = *entry; /* see comment in vm_map_wire() */
7734
7735 /*
7736 * We can unlock the map now. The in_transition state
7737 * guarantees the existence of the entry.
7738 */
7739 vm_map_unlock(map);
7740 if (map_pmap) {
7741 vm_fault_unwire(map, &tmp_entry, FALSE, map_pmap,
7742 pmap_addr, tmp_entry.vme_end);
7743 } else {
7744 vm_fault_unwire(map, &tmp_entry, FALSE, map->pmap,
7745 tmp_entry.vme_start, tmp_entry.vme_end);
7746 }
7747 vm_map_lock(map);
7748
7749 if (last_timestamp + 1 != map->timestamp) {
7750 /*
7751 * Find the entry again. It could have been clipped
7752 * or deleted after we unlocked the map.
7753 */
7754 if (!vm_map_lookup_entry(map, tmp_entry.vme_start,
7755 &first_entry)) {
7756 if (!user_wire) {
7757 panic("vm_map_unwire: re-lookup failed");
7758 }
7759 entry = first_entry->vme_next;
7760 } else {
7761 entry = first_entry;
7762 }
7763 }
7764 last_timestamp = map->timestamp;
7765
7766 /*
7767 * clear transition bit for all constituent entries that
7768 * were in the original entry (saved in tmp_entry). Also
7769 * check for waiters.
7770 */
7771 while ((entry != vm_map_to_entry(map)) &&
7772 (entry->vme_start < tmp_entry.vme_end)) {
7773 assert(entry->in_transition);
7774 entry->in_transition = FALSE;
7775 if (entry->needs_wakeup) {
7776 entry->needs_wakeup = FALSE;
7777 need_wakeup = TRUE;
7778 }
7779 entry = entry->vme_next;
7780 }
7781 }
7782
7783 /*
7784 * We might have fragmented the address space when we wired this
7785 * range of addresses. Attempt to re-coalesce these VM map entries
7786 * with their neighbors now that they're no longer wired.
7787 * Under some circumstances, address space fragmentation can
7788 * prevent VM object shadow chain collapsing, which can cause
7789 * swap space leaks.
7790 */
7791 vm_map_simplify_range(map, start, end);
7792
7793 vm_map_unlock(map);
7794 /*
7795 * wake up anybody waiting on entries that we have unwired.
7796 */
7797 if (need_wakeup) {
7798 vm_map_entry_wakeup(map);
7799 }
7800 return KERN_SUCCESS;
7801}
7802
7803kern_return_t
7804vm_map_unwire(
7805 vm_map_t map,
7806 vm_map_offset_t start,
7807 vm_map_offset_t end,
7808 boolean_t user_wire)
7809{
7810 return vm_map_unwire_nested(map, start, end,
7811 user_wire, (pmap_t)NULL, 0);
7812}
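/*
 * Editorial sketch (not part of the original source): how a caller that
 * previously user-wired a range would undo it with vm_map_unwire(). The
 * variables "map", "addr" and "size" are assumptions standing in for the
 * caller's own state; TRUE marks this as a user wiring.
 */
#if 0
	kern_return_t kr;

	kr = vm_map_unwire(map, addr, addr + size, TRUE /* user_wire */);
	if (kr != KERN_SUCCESS) {
		/* part of the range was not wired or was not contiguous */
	}
#endif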
7813
7814
7815/*
7816 * vm_map_entry_zap: [ internal use only ]
7817 *
7818 * Remove the entry from the target map
7819 * and put it on a zap list.
7820 */
7821static void
7822vm_map_entry_zap(
7823 vm_map_t map,
7824 vm_map_entry_t entry,
7825 vm_map_zap_t zap)
7826{
7827 vm_map_offset_t s, e;
7828
7829 s = entry->vme_start;
7830 e = entry->vme_end;
7831 assert(VM_MAP_PAGE_ALIGNED(s, FOURK_PAGE_MASK));
7832 assert(VM_MAP_PAGE_ALIGNED(e, FOURK_PAGE_MASK));
7833 if (VM_MAP_PAGE_MASK(map) >= PAGE_MASK) {
7834 assert(page_aligned(s));
7835 assert(page_aligned(e));
7836 }
7837 if (entry->map_aligned == TRUE) {
7838 assert(VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map)));
7839 assert(VM_MAP_PAGE_ALIGNED(e, VM_MAP_PAGE_MASK(map)));
7840 }
7841 assert(entry->wired_count == 0);
7842 assert(entry->user_wired_count == 0);
7843 assert(!entry->vme_permanent);
7844
7845 vm_map_store_entry_unlink(map, entry, false);
7846 map->size -= e - s;
7847
7848 vm_map_zap_append(zap, entry);
7849}
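/*
 * Editorial sketch (not part of the original source): the zap-list pattern
 * used by callers of vm_map_entry_zap(). Entries are unlinked while the map
 * lock is held and only disposed of after the lock is dropped, mirroring
 * vm_map_remove_and_unlock() later in this file; "map" and "entry" are
 * assumed to be provided by the caller.
 */
#if 0
	VM_MAP_ZAP_DECLARE(zap);

	vm_map_lock(map);
	/* ... locate the entry to remove ... */
	vm_map_entry_zap(map, entry, &zap);
	vm_map_unlock(map);
	/* free the unlinked entries outside of the map lock */
	vm_map_zap_dispose(&zap);
#endif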
7850
7851static void
7852vm_map_submap_pmap_clean(
7853 vm_map_t map,
7854 vm_map_offset_t start,
7855 vm_map_offset_t end,
7856 vm_map_t sub_map,
7857 vm_map_offset_t offset)
7858{
7859 vm_map_offset_t submap_start;
7860 vm_map_offset_t submap_end;
7861 vm_map_size_t remove_size;
7862 vm_map_entry_t entry;
7863
7864 submap_end = offset + (end - start);
7865 submap_start = offset;
7866
7867 vm_map_lock_read(sub_map);
7868 if (vm_map_lookup_entry(sub_map, offset, &entry)) {
7869 remove_size = (entry->vme_end - entry->vme_start);
7870 if (offset > entry->vme_start) {
7871 remove_size -= offset - entry->vme_start;
7872 }
7873
7874
7875 if (submap_end < entry->vme_end) {
7876 remove_size -=
7877 entry->vme_end - submap_end;
7878 }
7879 if (entry->is_sub_map) {
7880 vm_map_submap_pmap_clean(
7881 sub_map,
7882 start,
7883 start + remove_size,
7884 VME_SUBMAP(entry),
7885 VME_OFFSET(entry));
7886 } else {
7887 if (map->mapped_in_other_pmaps &&
7888 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7889 VME_OBJECT(entry) != NULL) {
7890 vm_object_pmap_protect_options(
7891 VME_OBJECT(entry),
7892 (VME_OFFSET(entry) +
7893 offset -
7894 entry->vme_start),
7895 remove_size,
7896 PMAP_NULL,
7897 PAGE_SIZE,
7898 entry->vme_start,
7899 VM_PROT_NONE,
7900 PMAP_OPTIONS_REMOVE);
7901 } else {
7902 pmap_remove(map->pmap,
7903 (addr64_t)start,
7904 (addr64_t)(start + remove_size));
7905 }
7906 }
7907 }
7908
7909 entry = entry->vme_next;
7910
7911 while ((entry != vm_map_to_entry(sub_map))
7912 && (entry->vme_start < submap_end)) {
7913 remove_size = (entry->vme_end - entry->vme_start);
7914 if (submap_end < entry->vme_end) {
7915 remove_size -= entry->vme_end - submap_end;
7916 }
7917 if (entry->is_sub_map) {
7918 vm_map_submap_pmap_clean(
7919 sub_map,
7920 (start + entry->vme_start) - offset,
7921 ((start + entry->vme_start) - offset) + remove_size,
7922 VME_SUBMAP(entry),
7923 VME_OFFSET(entry));
7924 } else {
7925 if (map->mapped_in_other_pmaps &&
7926 os_ref_get_count_raw(&map->map_refcnt) != 0 &&
7927 VME_OBJECT(entry) != NULL) {
7928 vm_object_pmap_protect_options(
7929 VME_OBJECT(entry),
7930 VME_OFFSET(entry),
7931 remove_size,
7932 PMAP_NULL,
7933 PAGE_SIZE,
7934 entry->vme_start,
7935 VM_PROT_NONE,
7936 PMAP_OPTIONS_REMOVE);
7937 } else {
7938 pmap_remove(map->pmap,
7939 (addr64_t)((start + entry->vme_start)
7940 - offset),
7941 (addr64_t)(((start + entry->vme_start)
7942 - offset) + remove_size));
7943 }
7944 }
7945 entry = entry->vme_next;
7946 }
7947 vm_map_unlock_read(sub_map);
7948 return;
7949}
7950
7951/*
7952 * virt_memory_guard_ast:
7953 *
7954 * Handle the AST callout for a virtual memory guard.
7955 * raise an EXC_GUARD exception and terminate the task
7956 * if configured to do so.
7957 */
7958void
7959virt_memory_guard_ast(
7960 thread_t thread,
7961 mach_exception_data_type_t code,
7962 mach_exception_data_type_t subcode)
7963{
7964 task_t task = get_threadtask(thread);
7965 assert(task != kernel_task);
7966 assert(task == current_task());
7967 kern_return_t sync_exception_result;
7968 uint32_t behavior;
7969
7970 behavior = task->task_exc_guard;
7971
7972 /* Is delivery enabled */
7973 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7974 return;
7975 }
7976
7977 /* If only once, make sure we're that once */
7978 while (behavior & TASK_EXC_GUARD_VM_ONCE) {
7979 uint32_t new_behavior = behavior & ~TASK_EXC_GUARD_VM_DELIVER;
7980
7981 if (OSCompareAndSwap(behavior, new_behavior, &task->task_exc_guard)) {
7982 break;
7983 }
7984 behavior = task->task_exc_guard;
7985 if ((behavior & TASK_EXC_GUARD_VM_DELIVER) == 0) {
7986 return;
7987 }
7988 }
7989
7990 const bool fatal = task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL;
7991 /* Raise exception synchronously and see if handler claimed it */
7992 sync_exception_result = task_exception_notify(EXC_GUARD, code, subcode, fatal);
7993
7994 if (fatal) {
7995 /*
7996 * If Synchronous EXC_GUARD delivery was successful then
7997 * kill the process and return, else kill the process
7998 * and deliver the exception via EXC_CORPSE_NOTIFY.
7999 */
8000 if (sync_exception_result == KERN_SUCCESS) {
8001 task_bsdtask_kill(current_task());
8002 } else {
8003 exit_with_guard_exception(current_proc(), code, subcode);
8004 }
8005 } else if (task->task_exc_guard & TASK_EXC_GUARD_VM_CORPSE) {
8006 /*
8007 * If the synchronous EXC_GUARD delivery was not successful,
8008 * raise a simulated crash.
8009 */
8010 if (sync_exception_result != KERN_SUCCESS) {
8011 task_violated_guard(code, subcode, NULL, FALSE);
8012 }
8013 }
8014}
8015
8016/*
8017 * vm_map_guard_exception:
8018 *
8019 * Generate a GUARD_TYPE_VIRTUAL_MEMORY EXC_GUARD exception.
8020 *
8021 * Right now, we do this when we find nothing mapped, or a
8022 * gap in the mapping when a user address space deallocate
8023 * was requested. We report the address of the first gap found.
8024 */
8025static void
8026vm_map_guard_exception(
8027 vm_map_offset_t gap_start,
8028 unsigned reason)
8029{
8030 mach_exception_code_t code = 0;
8031 unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
8032 unsigned int target = 0; /* should we pass in pid associated with map? */
8033 mach_exception_data_type_t subcode = (uint64_t)gap_start;
8034 boolean_t fatal = FALSE;
8035
8036 task_t task = current_task_early();
8037
8038 /* Can't deliver exceptions to a NULL task (early boot) or kernel task */
8039 if (task == NULL || task == kernel_task) {
8040 return;
8041 }
8042
8043 EXC_GUARD_ENCODE_TYPE(code, guard_type);
8044 EXC_GUARD_ENCODE_FLAVOR(code, reason);
8045 EXC_GUARD_ENCODE_TARGET(code, target);
8046
8047 if (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL) {
8048 fatal = TRUE;
8049 }
8050 thread_guard_violation(current_thread(), code, subcode, fatal);
8051}
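/*
 * Editorial sketch (not part of the original source): how vm_map_delete()
 * reports a deallocation gap through this helper (see the VMDS_FOUND_GAP
 * handling later in this file); "gap_start" is the first unmapped address
 * found in the requested range.
 */
#if 0
	vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
#endif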
8052
8053static kern_return_t
8054vm_map_delete_submap_recurse(
8055 vm_map_t submap,
8056 vm_map_offset_t submap_start,
8057 vm_map_offset_t submap_end)
8058{
8059 vm_map_entry_t submap_entry;
8060
8061 /*
8062 * Verify that the submap does not contain any "permanent" entries
8063 * within the specified range.
8064 * We do not care about gaps.
8065 */
8066
8067 vm_map_lock(submap);
8068
8069 if (!vm_map_lookup_entry(submap, submap_start, &submap_entry)) {
8070 submap_entry = submap_entry->vme_next;
8071 }
8072
8073 for (;
8074 submap_entry != vm_map_to_entry(submap) &&
8075 submap_entry->vme_start < submap_end;
8076 submap_entry = submap_entry->vme_next) {
8077 if (submap_entry->vme_permanent) {
8078 /* "permanent" entry -> fail */
8079 vm_map_unlock(submap);
8080 return KERN_PROTECTION_FAILURE;
8081 }
8082 }
8083 /* no "permanent" entries in the range -> success */
8084 vm_map_unlock(submap);
8085 return KERN_SUCCESS;
8086}
8087
8088__abortlike
8089static void
8090__vm_map_delete_misaligned_panic(
8091 vm_map_t map,
8092 vm_map_offset_t start,
8093 vm_map_offset_t end)
8094{
8095 panic("vm_map_delete(%p,0x%llx,0x%llx): start is not aligned to 0x%x",
8096 map, (uint64_t)start, (uint64_t)end, VM_MAP_PAGE_SIZE(map));
8097}
8098
8099__abortlike
8100static void
8101__vm_map_delete_failed_panic(
8102 vm_map_t map,
8103 vm_map_offset_t start,
8104 vm_map_offset_t end,
8105 kern_return_t kr)
8106{
8107 panic("vm_map_delete(%p,0x%llx,0x%llx): failed unexpected with %d",
8108 map, (uint64_t)start, (uint64_t)end, kr);
8109}
8110
8111__abortlike
8112static void
8113__vm_map_delete_gap_panic(
8114 vm_map_t map,
8115 vm_map_offset_t where,
8116 vm_map_offset_t start,
8117 vm_map_offset_t end)
8118{
8119 panic("vm_map_delete(%p,0x%llx,0x%llx): no map entry at 0x%llx",
8120 map, (uint64_t)start, (uint64_t)end, (uint64_t)where);
8121}
8122
8123__abortlike
8124static void
8125__vm_map_delete_permanent_panic(
8126 vm_map_t map,
8127 vm_map_offset_t start,
8128 vm_map_offset_t end,
8129 vm_map_entry_t entry)
8130{
8131 panic("vm_map_delete(%p,0x%llx,0x%llx): "
8132 "Attempting to remove permanent VM map entry %p [0x%llx:0x%llx]",
8133 map, (uint64_t)start, (uint64_t)end, entry,
8134 (uint64_t)entry->vme_start,
8135 (uint64_t)entry->vme_end);
8136}
8137
8138__options_decl(vm_map_delete_state_t, uint32_t, {
8139 VMDS_NONE = 0x0000,
8140
8141 VMDS_FOUND_GAP = 0x0001,
8142 VMDS_GAPS_OK = 0x0002,
8143
8144 VMDS_KERNEL_PMAP = 0x0004,
8145 VMDS_NEEDS_LOOKUP = 0x0008,
8146 VMDS_NEEDS_WAKEUP = 0x0010,
8147 VMDS_KERNEL_KMEMPTR = 0x0020
8148});
8149
8150/*
8151 * vm_map_delete: [ internal use only ]
8152 *
8153 * Deallocates the given address range from the target map.
8154 * Removes all user wirings. Unwires one kernel wiring if
8155 * VM_MAP_REMOVE_KUNWIRE is set. Waits for kernel wirings to go
8156 * away if VM_MAP_REMOVE_WAIT_FOR_KWIRE is set. Sleeps
8157 * interruptibly if VM_MAP_REMOVE_INTERRUPTIBLE is set.
8158 *
8159 *
8160 * When the map is a kernel map, then any error in removing mappings
8161 * will lead to a panic so that clients do not have to repeat the panic
8162 * code at each call site. If VM_MAP_REMOVE_INTERRUPTIBLE
8163 * is also passed, then KERN_ABORTED will not lead to a panic.
8164 *
8165 * This routine is called with map locked and leaves map locked.
8166 */
8167static kmem_return_t
8168vm_map_delete(
8169 vm_map_t map,
8170 vm_map_offset_t start,
8171 vm_map_offset_t end,
8172 vmr_flags_t flags,
8173 kmem_guard_t guard,
8174 vm_map_zap_t zap_list)
8175{
8176 vm_map_entry_t entry, next;
8177 int interruptible;
8178 vm_map_offset_t gap_start = 0;
8179 vm_map_offset_t clear_in_transition_end = 0;
8180 __unused vm_map_offset_t save_start = start;
8181 __unused vm_map_offset_t save_end = end;
8182 vm_map_delete_state_t state = VMDS_NONE;
8183 kmem_return_t ret = { };
8184 vm_map_range_id_t range_id = 0;
8185 struct kmem_page_meta *meta = NULL;
8186 uint32_t size_idx, slot_idx;
8187 struct mach_vm_range slot;
8188
8189 if (vm_map_pmap(map) == kernel_pmap) {
8190 state |= VMDS_KERNEL_PMAP;
8191 range_id = kmem_addr_get_range(start, end - start);
8192 if (kmem_is_ptr_range(range_id)) {
8193 state |= VMDS_KERNEL_KMEMPTR;
8194 slot_idx = kmem_addr_get_slot_idx(start, end, range_id, &meta,
8195 &size_idx, &slot);
8196 }
8197 }
8198
8199 if (map->terminated || os_ref_get_count_raw(&map->map_refcnt) == 0) {
8200 state |= VMDS_GAPS_OK;
8201 }
8202
8203 if (map->corpse_source &&
8204 !(flags & VM_MAP_REMOVE_TO_OVERWRITE) &&
8205 !map->terminated) {
8206 /*
8207 * The map is being used for corpse-related diagnostics.
8208 * So skip any entry removal to avoid perturbing the map state.
8209 * The cleanup will happen in task_terminate_internal after the
8210 * call to task_port_no_senders.
8211 */
8212 goto out;
8213 }
8214
8215 interruptible = (flags & VM_MAP_REMOVE_INTERRUPTIBLE) ?
8216 THREAD_ABORTSAFE : THREAD_UNINT;
8217
8218 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) == 0 &&
8219 (start & VM_MAP_PAGE_MASK(map))) {
8220 __vm_map_delete_misaligned_panic(map, start, end);
8221 }
8222
8223 if ((state & VMDS_GAPS_OK) == 0) {
8224 /*
8225 * If the map isn't terminated then all deletions must have
8226 * no gaps, and be within the [min, max) of the map.
8227 *
8228 * We got here without VM_MAP_RANGE_CHECK() being called,
8229 * and hence must validate bounds manually.
8230 *
8231 * It is worth noting that because vm_deallocate() will
8232 * round_page() the deallocation size, it's possible for "end"
8233 * to be 0 here due to overflow. We hence must treat it as being
8234 * beyond vm_map_max(map).
8235 *
8236 * Similarly, end < start means some wrap-around happened,
8237 * which should cause an error or panic.
8238 */
8239 if (end == 0 || end > vm_map_max(map)) {
8240 state |= VMDS_FOUND_GAP;
8241 gap_start = vm_map_max(map);
8242 if (state & VMDS_KERNEL_PMAP) {
8243 __vm_map_delete_gap_panic(map,
8244 gap_start, start, end);
8245 }
8246 goto out;
8247 }
8248
8249 if (end < start) {
8250 if (state & VMDS_KERNEL_PMAP) {
8251 __vm_map_delete_gap_panic(map,
8252 vm_map_max(map), start, end);
8253 }
8254 ret.kmr_return = KERN_INVALID_ARGUMENT;
8255 goto out;
8256 }
8257
8258 if (start < vm_map_min(map)) {
8259 state |= VMDS_FOUND_GAP;
8260 gap_start = start;
8261 if (state & VMDS_KERNEL_PMAP) {
8262 __vm_map_delete_gap_panic(map,
8263 gap_start, start, end);
8264 }
8265 goto out;
8266 }
8267 } else {
8268 /*
8269 * If the map is terminated, we must accept start/end
8270 * being beyond the boundaries of the map as this is
8271 * how some of the mappings like commpage mappings
8272 * can be destroyed (they're outside of those bounds).
8273 *
8274 * end < start is still something we can't cope with,
8275 * so just bail.
8276 */
8277 if (end < start) {
8278 goto out;
8279 }
8280 }
8281
8282
8283 /*
8284 * Find the start of the region.
8285 *
8286 * If in a superpage, extend the range
8287 * to include the start of the mapping.
8288 */
8289 while (vm_map_lookup_entry_or_next(map, start, &entry)) {
8290 if (entry->superpage_size && (start & ~SUPERPAGE_MASK)) {
8291 start = SUPERPAGE_ROUND_DOWN(start);
8292 } else {
8293 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8294 break;
8295 }
8296 }
8297
8298 if (entry->superpage_size) {
8299 end = SUPERPAGE_ROUND_UP(end);
8300 }
8301
8302 /*
8303 * Step through all entries in this region
8304 */
8305 for (vm_map_offset_t s = start; s < end;) {
8306 /*
8307 * At this point, we have deleted all the memory entries
8308 * in [start, s) and are proceeding with the [s, end) range.
8309 *
8310 * This loop might drop the map lock, and it is possible that
8311 * some memory was already reallocated within [start, s)
8312 * and we don't want to mess with those entries.
8313 *
8314 * Some of those entries could even have been re-assembled
8315 * with an entry after "s" (in vm_map_simplify_entry()), so
8316 * we may have to vm_map_clip_start() again.
8317 *
8318 * When clear_in_transition_end is set, we had marked
8319 * [start, clear_in_transition_end) as "in_transition"
8320 * during a previous iteration and we need to clear it.
8321 */
8322
8323 /*
8324 * Step 1: If needed (because we dropped locks),
8325 * lookup the entry again.
8326 *
8327 * If we're coming back from unwiring (Step 5),
8328 * we also need to mark the entries as no longer
8329 * in transition after that.
8330 */
8331
8332 if (state & VMDS_NEEDS_LOOKUP) {
8333 state &= ~VMDS_NEEDS_LOOKUP;
8334
8335 if (vm_map_lookup_entry_or_next(map, s, &entry)) {
8336 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8337 }
8338
8339 if (state & VMDS_KERNEL_KMEMPTR) {
8340 kmem_validate_slot(s, meta, size_idx, slot_idx);
8341 }
8342 }
8343
8344 if (clear_in_transition_end) {
8345 for (vm_map_entry_t it = entry;
8346 it != vm_map_to_entry(map) &&
8347 it->vme_start < clear_in_transition_end;
8348 it = it->vme_next) {
8349 assert(it->in_transition);
8350 it->in_transition = FALSE;
8351 if (it->needs_wakeup) {
8352 it->needs_wakeup = FALSE;
8353 state |= VMDS_NEEDS_WAKEUP;
8354 }
8355 }
8356
8357 clear_in_transition_end = 0;
8358 }
8359
8360
8361 /*
8362 * Step 2: Perform various policy checks
8363 * before we do _anything_ to this entry.
8364 */
8365
8366 if (entry == vm_map_to_entry(map) || s < entry->vme_start) {
8367 if (state & (VMDS_GAPS_OK | VMDS_FOUND_GAP)) {
8368 /*
8369 * Either we found a gap already,
8370 * or we are tearing down a map,
8371 * keep going.
8372 */
8373 } else if (state & VMDS_KERNEL_PMAP) {
8374 __vm_map_delete_gap_panic(map, s, start, end);
8375 } else if (s < end) {
8376 state |= VMDS_FOUND_GAP;
8377 gap_start = s;
8378 }
8379
8380 if (entry == vm_map_to_entry(map) ||
8381 end <= entry->vme_start) {
8382 break;
8383 }
8384
8385 s = entry->vme_start;
8386 }
8387
8388 if (state & VMDS_KERNEL_PMAP) {
8389 /*
8390 * In the kernel map and its submaps,
8391 * permanent entries never die, even
8392 * if VM_MAP_REMOVE_IMMUTABLE is passed.
8393 */
8394 if (entry->vme_permanent) {
8395 __vm_map_delete_permanent_panic(map, start, end, entry);
8396 }
8397
8398 if (flags & VM_MAP_REMOVE_GUESS_SIZE) {
8399 end = entry->vme_end;
8400 flags &= ~VM_MAP_REMOVE_GUESS_SIZE;
8401 }
8402
8403 /*
8404 * In the kernel map and its submaps,
8405 * the removal of an atomic/guarded entry is strict.
8406 *
8407 * An atomic entry is processed only if it was
8408 * specifically targeted.
8409 *
8410 * We might have deleted non-atomic entries before
8411 * we reach this point, however...
8412 */
8413 kmem_entry_validate_guard(map, entry,
8414 start, end - start, guard);
8415 }
8416
8417 /*
8418 * Step 2.1: handle "permanent" and "submap" entries
8419 * *before* clipping to avoid triggering some unnecessary
8420 * un-nesting of the shared region.
8421 */
8422 if (entry->vme_permanent && entry->is_sub_map) {
8423// printf("FBDP %s:%d permanent submap...\n", __FUNCTION__, __LINE__);
8424 /*
8425 * Un-mapping a "permanent" mapping of a user-space
8426 * submap is not allowed unless...
8427 */
8428 if (flags & VM_MAP_REMOVE_IMMUTABLE) {
8429 /*
8430 * a. explicitly requested by the kernel caller.
8431 */
8432// printf("FBDP %s:%d flags & REMOVE_IMMUTABLE\n", __FUNCTION__, __LINE__);
8433 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8434 developer_mode_state()) {
8435 /*
8436 * b. we're in "developer" mode (for
8437 * breakpoints, dtrace probes, ...).
8438 */
8439// printf("FBDP %s:%d flags & REMOVE_IMMUTABLE_CODE\n", __FUNCTION__, __LINE__);
8440 } else if (map->terminated) {
8441 /*
8442 * c. this is the final address space cleanup.
8443 */
8444// printf("FBDP %s:%d map->terminated\n", __FUNCTION__, __LINE__);
8445 } else {
8446 vm_map_offset_t submap_start, submap_end;
8447 kern_return_t submap_kr;
8448
8449 /*
8450 * Check if there are any "permanent" mappings
8451 * in this range in the submap.
8452 */
8453 if (entry->in_transition) {
8454 /* can that even happen ? */
8455 goto in_transition;
8456 }
8457 /* compute the clipped range in the submap */
8458 submap_start = s - entry->vme_start;
8459 submap_start += VME_OFFSET(entry);
8460 submap_end = end - entry->vme_start;
8461 submap_end += VME_OFFSET(entry);
8462 submap_kr = vm_map_delete_submap_recurse(
8463 VME_SUBMAP(entry),
8464 submap_start,
8465 submap_end);
8466 if (submap_kr != KERN_SUCCESS) {
8467 /*
8468 * There are some "permanent" mappings
8469 * in the submap: we are not allowed
8470 * to remove this range.
8471 */
8472 printf("%d[%s] removing permanent submap entry "
8473 "%p [0x%llx:0x%llx] prot 0x%x/0x%x -> KERN_PROT_FAILURE\n",
8474 proc_selfpid(),
8475 (get_bsdtask_info(current_task())
8476 ? proc_name_address(get_bsdtask_info(current_task()))
8477 : "?"), entry,
8478 (uint64_t)entry->vme_start,
8479 (uint64_t)entry->vme_end,
8480 entry->protection,
8481 entry->max_protection);
8482 DTRACE_VM6(vm_map_delete_permanent_deny_submap,
8483 vm_map_entry_t, entry,
8484 vm_map_offset_t, entry->vme_start,
8485 vm_map_offset_t, entry->vme_end,
8486 vm_prot_t, entry->protection,
8487 vm_prot_t, entry->max_protection,
8488 int, VME_ALIAS(entry));
8489 ret.kmr_return = KERN_PROTECTION_FAILURE;
8490 goto out;
8491 }
8492 /* no permanent mappings: proceed */
8493 }
8494 }
8495
8496 /*
8497 * Step 3: Perform any clipping needed.
8498 *
8499 * After this, "entry" starts at "s", ends before "end"
8500 */
8501
8502 if (entry->vme_start < s) {
8503 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8504 entry->map_aligned &&
8505 !VM_MAP_PAGE_ALIGNED(s, VM_MAP_PAGE_MASK(map))) {
8506 /*
8507 * The entry will no longer be map-aligned
8508 * after clipping and the caller said it's OK.
8509 */
8510 entry->map_aligned = FALSE;
8511 }
8512 vm_map_clip_start(map, entry, s);
8513 SAVE_HINT_MAP_WRITE(map, entry->vme_prev);
8514 }
8515
8516 if (end < entry->vme_end) {
8517 if ((flags & VM_MAP_REMOVE_NO_MAP_ALIGN) &&
8518 entry->map_aligned &&
8519 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map))) {
8520 /*
8521 * The entry will no longer be map-aligned
8522 * after clipping and the caller said it's OK.
8523 */
8524 entry->map_aligned = FALSE;
8525 }
8526 vm_map_clip_end(map, entry, end);
8527 }
8528
8529 if (entry->vme_permanent && entry->is_sub_map) {
8530 /*
8531 * We already went through step 2.1 which did not deny
8532 * the removal of this "permanent" and "is_sub_map"
8533 * entry.
8534 * Now that we've clipped what we actually want to
8535 * delete, undo the "permanent" part to allow the
8536 * removal to proceed.
8537 */
8538 DTRACE_VM6(vm_map_delete_permanent_allow_submap,
8539 vm_map_entry_t, entry,
8540 vm_map_offset_t, entry->vme_start,
8541 vm_map_offset_t, entry->vme_end,
8542 vm_prot_t, entry->protection,
8543 vm_prot_t, entry->max_protection,
8544 int, VME_ALIAS(entry));
8545 entry->vme_permanent = false;
8546 }
8547
8548 assert(s == entry->vme_start);
8549 assert(entry->vme_end <= end);
8550
8551
8552 /*
8553 * Step 4: If the entry is in flux, wait for this to resolve.
8554 */
8555
8556 if (entry->in_transition) {
8557 wait_result_t wait_result;
8558
8559in_transition:
8560 /*
8561 * Another thread is wiring/unwiring this entry.
8562 * Let the other thread know we are waiting.
8563 */
8564
8565 entry->needs_wakeup = TRUE;
8566
8567 /*
8568 * wake up anybody waiting on entries that we have
8569 * already unwired/deleted.
8570 */
8571 if (state & VMDS_NEEDS_WAKEUP) {
8572 vm_map_entry_wakeup(map);
8573 state &= ~VMDS_NEEDS_WAKEUP;
8574 }
8575
8576 wait_result = vm_map_entry_wait(map, interruptible);
8577
8578 if (interruptible &&
8579 wait_result == THREAD_INTERRUPTED) {
8580 /*
8581 * We do not clear the needs_wakeup flag,
8582 * since we cannot tell if we were the only one.
8583 */
8584 ret.kmr_return = KERN_ABORTED;
8585 return ret;
8586 }
8587
8588 /*
8589 * The entry could have been clipped or it
8590 * may not exist anymore. Look it up again.
8591 */
8592 state |= VMDS_NEEDS_LOOKUP;
8593 continue;
8594 }
8595
8596
8597 /*
8598 * Step 5: Handle wiring
8599 */
8600
8601 if (entry->wired_count) {
8602 struct vm_map_entry tmp_entry;
8603 boolean_t user_wire;
8604 unsigned int last_timestamp;
8605
8606 user_wire = entry->user_wired_count > 0;
8607
8608 /*
8609 * Remove a kernel wiring if requested
8610 */
8611 if (flags & VM_MAP_REMOVE_KUNWIRE) {
8612 entry->wired_count--;
8613 vme_btref_consider_and_put(entry);
8614 }
8615
8616 /*
8617 * Remove all user wirings for proper accounting
8618 */
8619 while (entry->user_wired_count) {
8620 subtract_wire_counts(map, entry, user_wire);
8621 }
8622
8623 /*
8624 * All our DMA I/O operations in IOKit are currently
8625 * done by wiring through the map entries of the task
8626 * requesting the I/O.
8627 *
8628 * Because of this, we must always wait for kernel wirings
8629 * to go away on the entries before deleting them.
8630 *
8631 * Any caller who wants to actually remove a kernel wiring
8632 * should explicitly set the VM_MAP_REMOVE_KUNWIRE flag to
8633 * properly remove one wiring instead of blasting through
8634 * them all.
8635 */
8636 if (entry->wired_count != 0) {
8637 assert(map != kernel_map);
8638 /*
8639 * Cannot continue. Typical case is when
8640 * a user thread has physical I/O pending on
8641 * this page. Either wait for the
8642 * kernel wiring to go away or return an
8643 * error.
8644 */
8645 wait_result_t wait_result;
8646
8647 entry->needs_wakeup = TRUE;
8648 wait_result = vm_map_entry_wait(map,
8649 interruptible);
8650
8651 if (interruptible &&
8652 wait_result == THREAD_INTERRUPTED) {
8653 /*
8654 * We do not clear the
8655 * needs_wakeup flag, since we
8656 * cannot tell if we were the
8657 * only one.
8658 */
8659 ret.kmr_return = KERN_ABORTED;
8660 return ret;
8661 }
8662
8663
8664 /*
8665 * The entry could have been clipped or
8666 * it may not exist anymore. Look it
8667 * up again.
8668 */
8669 state |= VMDS_NEEDS_LOOKUP;
8670 continue;
8671 }
8672
8673 /*
8674 * We can unlock the map now.
8675 *
8676 * The entry might be split once we unlock the map,
8677 * but we need the range as defined by this entry
8678 * to be stable. So we must make a local copy.
8679 *
8680 * The underlying objects do not change during clips,
8681 * and the in_transition state guarantees existence
8682 * of the entry.
8683 */
8684 last_timestamp = map->timestamp;
8685 entry->in_transition = TRUE;
8686 tmp_entry = *entry;
8687 vm_map_unlock(map);
8688
8689 if (tmp_entry.is_sub_map) {
8690 vm_map_t sub_map;
8691 vm_map_offset_t sub_start, sub_end;
8692 pmap_t pmap;
8693 vm_map_offset_t pmap_addr;
8694
8695
8696 sub_map = VME_SUBMAP(&tmp_entry);
8697 sub_start = VME_OFFSET(&tmp_entry);
8698 sub_end = sub_start + (tmp_entry.vme_end -
8699 tmp_entry.vme_start);
8700 if (tmp_entry.use_pmap) {
8701 pmap = sub_map->pmap;
8702 pmap_addr = tmp_entry.vme_start;
8703 } else {
8704 pmap = map->pmap;
8705 pmap_addr = tmp_entry.vme_start;
8706 }
8707 (void) vm_map_unwire_nested(sub_map,
8708 sub_start, sub_end,
8709 user_wire,
8710 pmap, pmap_addr);
8711 } else {
8712 vm_map_offset_t entry_end = tmp_entry.vme_end;
8713 vm_map_offset_t max_end;
8714
8715 if (flags & VM_MAP_REMOVE_NOKUNWIRE_LAST) {
8716 max_end = end - VM_MAP_PAGE_SIZE(map);
8717 if (entry_end > max_end) {
8718 entry_end = max_end;
8719 }
8720 }
8721
8722 if (tmp_entry.vme_kernel_object) {
8723 pmap_protect_options(
8724 map->pmap,
8725 tmp_entry.vme_start,
8726 entry_end,
8727 VM_PROT_NONE,
8728 PMAP_OPTIONS_REMOVE,
8729 NULL);
8730 }
8731 vm_fault_unwire(map, &tmp_entry,
8732 tmp_entry.vme_kernel_object, map->pmap,
8733 tmp_entry.vme_start, entry_end);
8734 }
8735
8736 vm_map_lock(map);
8737
8738 /*
8739 * Unwiring happened, we can now go back to deleting
8740 * them (after we clear the in_transition bit for the range).
8741 */
8742 if (last_timestamp + 1 != map->timestamp) {
8743 state |= VMDS_NEEDS_LOOKUP;
8744 }
8745 clear_in_transition_end = tmp_entry.vme_end;
8746 continue;
8747 }
8748
8749 assert(entry->wired_count == 0);
8750 assert(entry->user_wired_count == 0);
8751
8752
8753 /*
8754 * Step 6: Entry is unwired and ready for us to delete !
8755 */
8756
8757 if (!entry->vme_permanent) {
8758 /*
8759 * Typical case: the entry really shouldn't be permanent
8760 */
8761 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE_CODE) &&
8762 (entry->protection & VM_PROT_EXECUTE) &&
8763 developer_mode_state()) {
8764 /*
8765 * Allow debuggers to undo executable mappings
8766 * when developer mode is on.
8767 */
8768#if 0
8769 printf("FBDP %d[%s] removing permanent executable entry "
8770 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8771 proc_selfpid(),
8772 (current_task()->bsd_info
8773 ? proc_name_address(current_task()->bsd_info)
8774 : "?"), entry,
8775 (uint64_t)entry->vme_start,
8776 (uint64_t)entry->vme_end,
8777 entry->protection,
8778 entry->max_protection);
8779#endif
8780 entry->vme_permanent = FALSE;
8781 } else if ((flags & VM_MAP_REMOVE_IMMUTABLE) || map->terminated) {
8782#if 0
8783 printf("FBDP %d[%s] removing permanent entry "
8784 "%p [0x%llx:0x%llx] prot 0x%x/0x%x\n",
8785 proc_selfpid(),
8786 (current_task()->bsd_info
8787 ? proc_name_address(current_task()->bsd_info)
8788 : "?"), entry,
8789 (uint64_t)entry->vme_start,
8790 (uint64_t)entry->vme_end,
8791 entry->protection,
8792 entry->max_protection);
8793#endif
8794 entry->vme_permanent = FALSE;
8795#if CODE_SIGNING_MONITOR
8796 } else if ((entry->protection & VM_PROT_EXECUTE) && !csm_enabled()) {
8797 entry->vme_permanent = FALSE;
8798
8799 printf("%d[%s] %s(0x%llx,0x%llx): "
8800 "code signing monitor disabled, allowing for permanent executable entry [0x%llx:0x%llx] "
8801 "prot 0x%x/0x%x\n",
8802 proc_selfpid(),
8803 (get_bsdtask_info(current_task())
8804 ? proc_name_address(get_bsdtask_info(current_task()))
8805 : "?"),
8806 __FUNCTION__,
8807 (uint64_t)start,
8808 (uint64_t)end,
8809 (uint64_t)entry->vme_start,
8810 (uint64_t)entry->vme_end,
8811 entry->protection,
8812 entry->max_protection);
8813#endif
8814 } else {
8815 DTRACE_VM6(vm_map_delete_permanent,
8816 vm_map_entry_t, entry,
8817 vm_map_offset_t, entry->vme_start,
8818 vm_map_offset_t, entry->vme_end,
8819 vm_prot_t, entry->protection,
8820 vm_prot_t, entry->max_protection,
8821 int, VME_ALIAS(entry));
8822 }
8823
8824 if (entry->is_sub_map) {
8825 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
8826 "map %p (%d) entry %p submap %p (%d)\n",
8827 map, VM_MAP_PAGE_SHIFT(map), entry,
8828 VME_SUBMAP(entry),
8829 VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
8830 if (entry->use_pmap) {
8831#ifndef NO_NESTED_PMAP
8832 int pmap_flags;
8833
8834 if (map->terminated) {
8835 /*
8836 * This is the final cleanup of the
8837 * address space being terminated.
8838 * No new mappings are expected and
8839 * we don't really need to unnest the
8840 * shared region (and lose the "global"
8841 * pmap mappings, if applicable).
8842 *
8843 * Tell the pmap layer that we're
8844 * "clean" wrt nesting.
8845 */
8846 pmap_flags = PMAP_UNNEST_CLEAN;
8847 } else {
8848 /*
8849 * We're unmapping part of the nested
8850 * shared region, so we can't keep the
8851 * nested pmap.
8852 */
8853 pmap_flags = 0;
8854 }
8855 pmap_unnest_options(
8856 map->pmap,
8857 (addr64_t)entry->vme_start,
8858 entry->vme_end - entry->vme_start,
8859 pmap_flags);
8860#endif /* NO_NESTED_PMAP */
8861 if (map->mapped_in_other_pmaps &&
8862 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8863 /* clean up parent map/maps */
8864 vm_map_submap_pmap_clean(
8865 map, entry->vme_start,
8866 entry->vme_end,
8867 VME_SUBMAP(entry),
8868 VME_OFFSET(entry));
8869 }
8870 } else {
8871 vm_map_submap_pmap_clean(
8872 map, entry->vme_start, entry->vme_end,
8873 VME_SUBMAP(entry),
8874 VME_OFFSET(entry));
8875 }
8876 } else if (entry->vme_kernel_object ||
8877 VME_OBJECT(entry) == compressor_object) {
8878 /*
8879 * nothing to do
8880 */
8881 } else if (map->mapped_in_other_pmaps &&
8882 os_ref_get_count_raw(&map->map_refcnt) != 0) {
8883 vm_object_pmap_protect_options(
8884 VME_OBJECT(entry), VME_OFFSET(entry),
8885 entry->vme_end - entry->vme_start,
8886 PMAP_NULL,
8887 PAGE_SIZE,
8888 entry->vme_start,
8889 VM_PROT_NONE,
8890 PMAP_OPTIONS_REMOVE);
8891 } else if ((VME_OBJECT(entry) != VM_OBJECT_NULL) ||
8892 (state & VMDS_KERNEL_PMAP)) {
8893 /* Remove translations associated
8894 * with this range unless the entry
8895 * does not have an object, or
8896 * it's the kernel map or a descendant
8897 * since the platform could potentially
8898 * create "backdoor" mappings invisible
8899 * to the VM. It is expected that
8900 * objectless, non-kernel ranges
8901 * do not have such VM invisible
8902 * translations.
8903 */
8904 pmap_remove_options(map->pmap,
8905 (addr64_t)entry->vme_start,
8906 (addr64_t)entry->vme_end,
8907 PMAP_OPTIONS_REMOVE);
8908 }
8909
8910#if DEBUG
8911 /*
8912 * All pmap mappings for this map entry must have been
8913 * cleared by now.
8914 */
8915 assert(pmap_is_empty(map->pmap,
8916 entry->vme_start,
8917 entry->vme_end));
8918#endif /* DEBUG */
8919
8920 if (entry->iokit_acct) {
8921 /* alternate accounting */
8922 DTRACE_VM4(vm_map_iokit_unmapped_region,
8923 vm_map_t, map,
8924 vm_map_offset_t, entry->vme_start,
8925 vm_map_offset_t, entry->vme_end,
8926 int, VME_ALIAS(entry));
8927 vm_map_iokit_unmapped_region(map,
8928 (entry->vme_end -
8929 entry->vme_start));
8930 entry->iokit_acct = FALSE;
8931 entry->use_pmap = FALSE;
8932 }
8933
8934 /* move "s" forward */
8935 s = entry->vme_end;
8936 next = entry->vme_next;
8937 if (!entry->map_aligned) {
8938 vm_map_offset_t rounded_s;
8939
8940 /*
8941 * Skip artificial gap due to mis-aligned entry
8942 * on devices with a page size smaller than the
8943 * map's page size (e.g. a 16k task on a 4k device).
8944 */
8945 rounded_s = VM_MAP_ROUND_PAGE(s, VM_MAP_PAGE_MASK(map));
8946 if (next == vm_map_to_entry(map)) {
8947 s = rounded_s;
8948 } else if (s < rounded_s) {
8949 s = MIN(rounded_s, next->vme_start);
8950 }
8951 }
8952 ret.kmr_size += s - entry->vme_start;
8953
8954 if (entry->vme_permanent) {
8955 /*
8956 * A permanent entry can not be removed, so leave it
8957 * in place but remove all access permissions.
8958 */
8959 if (!entry->csm_associated) {
8960 printf("%s:%d %d[%s] map %p entry %p [ 0x%llx - 0x%llx ] submap %d prot 0x%x/0x%x -> 0/0\n",
8961 __FUNCTION__, __LINE__,
8962 proc_selfpid(),
8963 (get_bsdtask_info(current_task())
8964 ? proc_name_address(get_bsdtask_info(current_task()))
8965 : "?"),
8966 map,
8967 entry,
8968 (uint64_t)entry->vme_start,
8969 (uint64_t)entry->vme_end,
8970 entry->is_sub_map,
8971 entry->protection,
8972 entry->max_protection);
8973 }
8974 DTRACE_VM6(vm_map_delete_permanent_prot_none,
8975 vm_map_entry_t, entry,
8976 vm_map_offset_t, entry->vme_start,
8977 vm_map_offset_t, entry->vme_end,
8978 vm_prot_t, entry->protection,
8979 vm_prot_t, entry->max_protection,
8980 int, VME_ALIAS(entry));
8981 entry->protection = VM_PROT_NONE;
8982 entry->max_protection = VM_PROT_NONE;
8983 } else {
8984 vm_map_entry_zap(map, entry, zap_list);
8985 }
8986
8987 entry = next;
8988 next = VM_MAP_ENTRY_NULL;
8989
8990 if ((flags & VM_MAP_REMOVE_NO_YIELD) == 0 && s < end) {
8991 unsigned int last_timestamp = map->timestamp++;
8992
8993 if (lck_rw_lock_yield_exclusive(&map->lock,
8994 LCK_RW_YIELD_ANY_WAITER)) {
8995 if (last_timestamp != map->timestamp + 1) {
8996 state |= VMDS_NEEDS_LOOKUP;
8997 }
8998 } else {
8999 /* we didn't yield, undo our change */
9000 map->timestamp--;
9001 }
9002 }
9003 }
9004
9005 if (map->wait_for_space) {
9006 thread_wakeup((event_t) map);
9007 }
9008
9009 if (state & VMDS_NEEDS_WAKEUP) {
9010 vm_map_entry_wakeup(map);
9011 }
9012
9013out:
9014 if ((state & VMDS_KERNEL_PMAP) && ret.kmr_return) {
9015 __vm_map_delete_failed_panic(map, start, end, ret.kmr_return);
9016 }
9017
9018 if (state & VMDS_KERNEL_KMEMPTR) {
9019 kmem_free_space(start, end, range_id, &slot);
9020 }
9021
9022 if (state & VMDS_FOUND_GAP) {
9023 DTRACE_VM3(kern_vm_deallocate_gap,
9024 vm_map_offset_t, gap_start,
9025 vm_map_offset_t, save_start,
9026 vm_map_offset_t, save_end);
9027 if (flags & VM_MAP_REMOVE_GAPS_FAIL) {
9028 ret.kmr_return = KERN_INVALID_VALUE;
9029 } else {
9030 vm_map_guard_exception(gap_start, kGUARD_EXC_DEALLOC_GAP);
9031 }
9032 }
9033
9034 return ret;
9035}
9036
9037kmem_return_t
9038vm_map_remove_and_unlock(
9039 vm_map_t map,
9040 vm_map_offset_t start,
9041 vm_map_offset_t end,
9042 vmr_flags_t flags,
9043 kmem_guard_t guard)
9044{
9045 kmem_return_t ret;
9046 VM_MAP_ZAP_DECLARE(zap);
9047
9048 ret = vm_map_delete(map, start, end, flags, guard, &zap);
9049 vm_map_unlock(map);
9050
9051 vm_map_zap_dispose(&zap);
9052
9053 return ret;
9054}
9055
9056/*
9057 * vm_map_remove_guard:
9058 *
9059 * Remove the given address range from the target map.
9060 * This is the exported form of vm_map_delete.
9061 */
9062kmem_return_t
9063vm_map_remove_guard(
9064 vm_map_t map,
9065 vm_map_offset_t start,
9066 vm_map_offset_t end,
9067 vmr_flags_t flags,
9068 kmem_guard_t guard)
9069{
9070 vm_map_lock(map);
9071 return vm_map_remove_and_unlock(map, start, end, flags, guard);
9072}
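/*
 * Editorial sketch (not part of the original source): a typical kernel
 * caller removing an unguarded range; VM_MAP_REMOVE_NO_FLAGS and
 * KMEM_GUARD_NONE are the "no special handling" values used elsewhere in
 * this file, and the range [start, end) is assumed to be page aligned.
 */
#if 0
	kmem_return_t kmr;

	kmr = vm_map_remove_guard(map, start, end,
	    VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
	if (kmr.kmr_return != KERN_SUCCESS) {
		/* the range could not be (fully) removed */
	}
#endif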
9073
9074/*
9075 * vm_map_terminate:
9076 *
9077 * Clean out a task's map.
9078 */
9079kern_return_t
9080vm_map_terminate(
9081 vm_map_t map)
9082{
9083 vm_map_lock(map);
9084 map->terminated = TRUE;
9085 vm_map_disable_hole_optimization(map);
9086 (void)vm_map_remove_and_unlock(map, map->min_offset, map->max_offset,
9087 VM_MAP_REMOVE_NO_FLAGS, KMEM_GUARD_NONE);
9088 return KERN_SUCCESS;
9089}
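/*
 * Editorial sketch (not part of the original source): vm_map_terminate()
 * only empties the map; the assumption here is that the task-teardown path
 * then drops its reference with vm_map_deallocate() to actually destroy it.
 */
#if 0
	vm_map_terminate(map);
	vm_map_deallocate(map);
#endif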
9090
9091/*
9092 * Routine: vm_map_copy_allocate
9093 *
9094 * Description:
9095 * Allocates and initializes a map copy object.
9096 */
9097static vm_map_copy_t
9098vm_map_copy_allocate(uint16_t type)
9099{
9100 vm_map_copy_t new_copy;
9101
9102 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO);
9103 new_copy->type = type;
9104 if (type == VM_MAP_COPY_ENTRY_LIST) {
9105 new_copy->c_u.hdr.rb_head_store.rbh_root = (void*)(int)SKIP_RB_TREE;
9106 vm_map_store_init(&new_copy->cpy_hdr);
9107 }
9108 return new_copy;
9109}
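/*
 * Editorial sketch (not part of the original source): pairing of
 * vm_map_copy_allocate() with vm_map_copy_discard(). The entry-list variant
 * is shown; how the list gets populated is elided since that depends on the
 * caller (vm_map_copyin() and friends).
 */
#if 0
	vm_map_copy_t copy;

	copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
	/* ... link map entries into copy->cpy_hdr ... */
	vm_map_copy_discard(copy);
#endif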
9110
9111/*
9112 * Routine: vm_map_copy_discard
9113 *
9114 * Description:
9115 * Dispose of a map copy object (returned by
9116 * vm_map_copyin).
9117 */
9118void
9119vm_map_copy_discard(
9120 vm_map_copy_t copy)
9121{
9122 if (copy == VM_MAP_COPY_NULL) {
9123 return;
9124 }
9125
9126 /*
9127 * Assert that the vm_map_copy is coming from the right
9128 * zone and hasn't been forged
9129 */
9130 vm_map_copy_require(copy);
9131
9132 switch (copy->type) {
9133 case VM_MAP_COPY_ENTRY_LIST:
9134 while (vm_map_copy_first_entry(copy) !=
9135 vm_map_copy_to_entry(copy)) {
9136 vm_map_entry_t entry = vm_map_copy_first_entry(copy);
9137
9138 vm_map_copy_entry_unlink(copy, entry);
9139 if (entry->is_sub_map) {
9140 vm_map_deallocate(VME_SUBMAP(entry));
9141 } else {
9142 vm_object_deallocate(VME_OBJECT(entry));
9143 }
9144 vm_map_copy_entry_dispose(entry);
9145 }
9146 break;
9147 case VM_MAP_COPY_KERNEL_BUFFER:
9148
9149 /*
9150 * The vm_map_copy_t and possibly the data buffer were
9151 * allocated by a single call to kalloc_data(), i.e. the
9152 * vm_map_copy_t was not allocated out of the zone.
9153 */
9154 if (copy->size > msg_ool_size_small || copy->offset) {
9155 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
9156 (long long)copy->size, (long long)copy->offset);
9157 }
9158 kfree_data(copy->cpy_kdata, copy->size);
9159 }
9160 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
9161}
9162
9163#if XNU_PLATFORM_MacOSX
9164
9165/*
9166 * Routine: vm_map_copy_copy
9167 *
9168 * Description:
9169 * Move the information in a map copy object to
9170 * a new map copy object, leaving the old one
9171 * empty.
9172 *
9173 * This is used by kernel routines that need
9174 * to look at out-of-line data (in copyin form)
9175 * before deciding whether to return SUCCESS.
9176 * If the routine returns FAILURE, the original
9177 * copy object will be deallocated; therefore,
9178 * these routines must make a copy of the copy
9179 * object and leave the original empty so that
9180 * deallocation will not fail.
9181 */
9182vm_map_copy_t
9183vm_map_copy_copy(
9184 vm_map_copy_t copy)
9185{
9186 vm_map_copy_t new_copy;
9187
9188 if (copy == VM_MAP_COPY_NULL) {
9189 return VM_MAP_COPY_NULL;
9190 }
9191
9192 /*
9193 * Assert that the vm_map_copy is coming from the right
9194 * zone and hasn't been forged
9195 */
9196 vm_map_copy_require(copy);
9197
9198 /*
9199 * Allocate a new copy object, and copy the information
9200 * from the old one into it.
9201 */
9202
9203 new_copy = zalloc_id(ZONE_ID_VM_MAP_COPY, Z_WAITOK | Z_ZERO | Z_NOFAIL);
9204 memcpy((void *) new_copy, (void *) copy, sizeof(struct vm_map_copy));
9205#if __has_feature(ptrauth_calls)
9206 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9207 new_copy->cpy_kdata = copy->cpy_kdata;
9208 }
9209#endif
9210
9211 if (copy->type == VM_MAP_COPY_ENTRY_LIST) {
9212 /*
9213 * The links in the entry chain must be
9214 * changed to point to the new copy object.
9215 */
9216 vm_map_copy_first_entry(copy)->vme_prev
9217 = vm_map_copy_to_entry(new_copy);
9218 vm_map_copy_last_entry(copy)->vme_next
9219 = vm_map_copy_to_entry(new_copy);
9220 }
9221
9222 /*
9223 * Change the old copy object into one that contains
9224 * nothing to be deallocated.
9225 */
9226 bzero(copy, sizeof(struct vm_map_copy));
9227 copy->type = VM_MAP_COPY_KERNEL_BUFFER;
9228
9229 /*
9230 * Return the new object.
9231 */
9232 return new_copy;
9233}
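/*
 * Editorial sketch (not part of the original source): the intended usage
 * described above. A routine that must inspect copyin data before deciding
 * whether to fail takes a private copy first, so that discarding the
 * (now empty) original later cannot free the data twice; "inspection_failed"
 * is a hypothetical placeholder for the caller's own check.
 */
#if 0
	vm_map_copy_t new_copy;

	new_copy = vm_map_copy_copy(copy);      /* "copy" is left empty */
	if (inspection_failed) {
		vm_map_copy_discard(new_copy);  /* original stays safe to discard */
		return KERN_FAILURE;
	}
#endif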
9234
9235#endif /* XNU_PLATFORM_MacOSX */
9236
9237static boolean_t
9238vm_map_entry_is_overwritable(
9239 vm_map_t dst_map __unused,
9240 vm_map_entry_t entry)
9241{
9242 if (!(entry->protection & VM_PROT_WRITE)) {
9243 /* can't overwrite if not writable */
9244 return FALSE;
9245 }
9246#if !__x86_64__
9247 if (entry->used_for_jit &&
9248 vm_map_cs_enforcement(dst_map) &&
9249 !dst_map->cs_debugged) {
9250 /*
9251 * Can't overwrite a JIT region while cs_enforced
9252 * and not cs_debugged.
9253 */
9254 return FALSE;
9255 }
9256
9257#if __arm64e__
9258 /* Do not allow overwriting HW-assisted TPRO entries */
9259 if (entry->used_for_tpro) {
9260 return FALSE;
9261 }
9262#endif /* __arm64e__ */
9263
9264 if (entry->vme_permanent) {
9265 if (entry->is_sub_map) {
9266 /*
9267 * We can't tell if the submap contains "permanent"
9268 * entries within the range targeted by the caller.
9269 * The caller will have to check for that with
9270 * vm_map_overwrite_submap_recurse() for example.
9271 */
9272 } else {
9273 /*
9274 * Do not allow overwriting of a "permanent"
9275 * entry.
9276 */
9277 DTRACE_VM6(vm_map_delete_permanent_deny_overwrite,
9278 vm_map_entry_t, entry,
9279 vm_map_offset_t, entry->vme_start,
9280 vm_map_offset_t, entry->vme_end,
9281 vm_prot_t, entry->protection,
9282 vm_prot_t, entry->max_protection,
9283 int, VME_ALIAS(entry));
9284 return FALSE;
9285 }
9286 }
9287#endif /* !__x86_64__ */
9288 return TRUE;
9289}
9290
9291static kern_return_t
9292vm_map_overwrite_submap_recurse(
9293 vm_map_t dst_map,
9294 vm_map_offset_t dst_addr,
9295 vm_map_size_t dst_size)
9296{
9297 vm_map_offset_t dst_end;
9298 vm_map_entry_t tmp_entry;
9299 vm_map_entry_t entry;
9300 kern_return_t result;
9301 boolean_t encountered_sub_map = FALSE;
9302
9303
9304
9305 /*
9306 * Verify that the destination is all writeable
9307 * initially. We have to trunc the destination
9308 * address and round the copy size or we'll end up
9309 * splitting entries in strange ways.
9310 */
9311
9312 dst_end = vm_map_round_page(dst_addr + dst_size,
9313 VM_MAP_PAGE_MASK(dst_map));
9314 vm_map_lock(dst_map);
9315
9316start_pass_1:
9317 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9318 vm_map_unlock(dst_map);
9319 return KERN_INVALID_ADDRESS;
9320 }
9321
9322 vm_map_clip_start(dst_map,
9323 tmp_entry,
9324 vm_map_trunc_page(dst_addr,
9325 VM_MAP_PAGE_MASK(dst_map)));
9326 if (tmp_entry->is_sub_map) {
9327 /* clipping did unnest if needed */
9328 assert(!tmp_entry->use_pmap);
9329 }
9330
9331 for (entry = tmp_entry;;) {
9332 vm_map_entry_t next;
9333
9334 next = entry->vme_next;
9335 while (entry->is_sub_map) {
9336 vm_map_offset_t sub_start;
9337 vm_map_offset_t sub_end;
9338 vm_map_offset_t local_end;
9339
9340 if (entry->in_transition) {
9341 /*
9342 * Say that we are waiting, and wait for entry.
9343 */
9344 entry->needs_wakeup = TRUE;
9345 vm_map_entry_wait(dst_map, THREAD_UNINT);
9346
9347 goto start_pass_1;
9348 }
9349
9350 encountered_sub_map = TRUE;
9351 sub_start = VME_OFFSET(entry);
9352
9353 if (entry->vme_end < dst_end) {
9354 sub_end = entry->vme_end;
9355 } else {
9356 sub_end = dst_end;
9357 }
9358 sub_end -= entry->vme_start;
9359 sub_end += VME_OFFSET(entry);
9360 local_end = entry->vme_end;
9361 vm_map_unlock(dst_map);
9362
9363 result = vm_map_overwrite_submap_recurse(
9364 VME_SUBMAP(entry),
9365 sub_start,
9366 sub_end - sub_start);
9367
9368 if (result != KERN_SUCCESS) {
9369 return result;
9370 }
9371 if (dst_end <= entry->vme_end) {
9372 return KERN_SUCCESS;
9373 }
9374 vm_map_lock(dst_map);
9375 if (!vm_map_lookup_entry(dst_map, local_end,
9376 &tmp_entry)) {
9377 vm_map_unlock(dst_map);
9378 return KERN_INVALID_ADDRESS;
9379 }
9380 entry = tmp_entry;
9381 next = entry->vme_next;
9382 }
9383
9384 if (!(entry->protection & VM_PROT_WRITE)) {
9385 vm_map_unlock(dst_map);
9386 return KERN_PROTECTION_FAILURE;
9387 }
9388
9389 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9390 vm_map_unlock(dst_map);
9391 return KERN_PROTECTION_FAILURE;
9392 }
9393
9394 /*
9395 * If the entry is in transition, we must wait
9396 * for it to exit that state. Anything could happen
9397 * when we unlock the map, so start over.
9398 */
9399 if (entry->in_transition) {
9400 /*
9401 * Say that we are waiting, and wait for entry.
9402 */
9403 entry->needs_wakeup = TRUE;
9404 vm_map_entry_wait(dst_map, THREAD_UNINT);
9405
9406 goto start_pass_1;
9407 }
9408
9409/*
9410 * our range is contained completely within this map entry
9411 */
9412 if (dst_end <= entry->vme_end) {
9413 vm_map_unlock(dst_map);
9414 return KERN_SUCCESS;
9415 }
9416/*
9417 * check that range specified is contiguous region
9418 */
9419 if ((next == vm_map_to_entry(dst_map)) ||
9420 (next->vme_start != entry->vme_end)) {
9421 vm_map_unlock(dst_map);
9422 return KERN_INVALID_ADDRESS;
9423 }
9424
9425 /*
9426 * Check for permanent objects in the destination.
9427 */
9428 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9429 ((!VME_OBJECT(entry)->internal) ||
9430 (VME_OBJECT(entry)->true_share))) {
9431 if (encountered_sub_map) {
9432 vm_map_unlock(dst_map);
9433 return KERN_FAILURE;
9434 }
9435 }
9436
9437
9438 entry = next;
9439 }/* for */
9440 vm_map_unlock(dst_map);
9441 return KERN_SUCCESS;
9442}
9443
9444/*
9445 * Routine: vm_map_copy_overwrite
9446 *
9447 * Description:
9448 * Copy the memory described by the map copy
9449 * object (copy; returned by vm_map_copyin) onto
9450 * the specified destination region (dst_map, dst_addr).
9451 * The destination must be writeable.
9452 *
9453 * Unlike vm_map_copyout, this routine actually
9454 * writes over previously-mapped memory. If the
9455 * previous mapping was to a permanent (user-supplied)
9456 * memory object, it is preserved.
9457 *
9458 * The attributes (protection and inheritance) of the
9459 * destination region are preserved.
9460 *
9461 * If successful, consumes the copy object.
9462 * Otherwise, the caller is responsible for it.
9463 *
9464 * Implementation notes:
9465 * To overwrite aligned temporary virtual memory, it is
9466 * sufficient to remove the previous mapping and insert
9467 * the new copy. This replacement is done either on
9468 * the whole region (if no permanent virtual memory
9469 * objects are embedded in the destination region) or
9470 * in individual map entries.
9471 *
9472 * To overwrite permanent virtual memory, it is necessary
9473 * to copy each page, as the external memory management
9474 * interface currently does not provide any optimizations.
9475 *
9476 * Unaligned memory also has to be copied. It is possible
9477 * to use 'vm_trickery' to copy the aligned data. This is
9478 * not done but not hard to implement.
9479 *
9480 * Once a page of permanent memory has been overwritten,
9481 * it is impossible to interrupt this function; otherwise,
9482 * the call would be neither atomic nor location-independent.
9483 * The kernel-state portion of a user thread must be
9484 * interruptible.
9485 *
9486 * It may be expensive to forward all requests that might
9487 * overwrite permanent memory (vm_write, vm_copy) to
9488 * uninterruptible kernel threads. This routine may be
9489 * called by interruptible threads; however, success is
9490 * not guaranteed -- if the request cannot be performed
9491 * atomically and interruptibly, an error indication is
9492 * returned.
9493 *
9494 * Callers of this function must call vm_map_copy_require on
9495 * previously created vm_map_copy_t or pass a newly created
9496 * one to ensure that it hasn't been forged.
9497 */
9498static kern_return_t
9499vm_map_copy_overwrite_nested(
9500 vm_map_t dst_map,
9501 vm_map_address_t dst_addr,
9502 vm_map_copy_t copy,
9503 boolean_t interruptible,
9504 pmap_t pmap,
9505 boolean_t discard_on_success)
9506{
9507 vm_map_offset_t dst_end;
9508 vm_map_entry_t tmp_entry;
9509 vm_map_entry_t entry;
9510 kern_return_t kr;
9511 boolean_t aligned = TRUE;
9512 boolean_t contains_permanent_objects = FALSE;
9513 boolean_t encountered_sub_map = FALSE;
9514 vm_map_offset_t base_addr;
9515 vm_map_size_t copy_size;
9516 vm_map_size_t total_size;
9517 uint16_t copy_page_shift;
9518
9519 /*
9520 * Check for special kernel buffer allocated
9521 * by new_ipc_kmsg_copyin.
9522 */
9523
9524 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
9525 kr = vm_map_copyout_kernel_buffer(
9526 dst_map, &dst_addr,
9527 copy, copy->size, TRUE, discard_on_success);
9528 return kr;
9529 }
9530
9531 /*
9532 * Only works for entry lists at the moment. Will
9533 * support page lists later.
9534 */
9535
9536 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
9537
9538 if (copy->size == 0) {
9539 if (discard_on_success) {
9540 vm_map_copy_discard(copy);
9541 }
9542 return KERN_SUCCESS;
9543 }
9544
9545 copy_page_shift = copy->cpy_hdr.page_shift;
9546
9547 /*
9548 * Verify that the destination is all writeable
9549 * initially. We have to trunc the destination
9550 * address and round the copy size or we'll end up
9551 * splitting entries in strange ways.
9552 */
9553
9554 if (!VM_MAP_PAGE_ALIGNED(copy->size,
9555 VM_MAP_PAGE_MASK(dst_map)) ||
9556 !VM_MAP_PAGE_ALIGNED(copy->offset,
9557 VM_MAP_PAGE_MASK(dst_map)) ||
9558 !VM_MAP_PAGE_ALIGNED(dst_addr,
9559 VM_MAP_PAGE_MASK(dst_map)) ||
9560 copy_page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
9561 aligned = FALSE;
9562 dst_end = vm_map_round_page(dst_addr + copy->size,
9563 VM_MAP_PAGE_MASK(dst_map));
9564 } else {
9565 dst_end = dst_addr + copy->size;
9566 }
9567
9568 vm_map_lock(dst_map);
9569
9570 /* LP64todo - remove this check when vm_map_commpage64()
9571 * no longer has to stuff in a map_entry for the commpage
9572 * above the map's max_offset.
9573 */
9574 if (dst_addr >= dst_map->max_offset) {
9575 vm_map_unlock(dst_map);
9576 return KERN_INVALID_ADDRESS;
9577 }
9578
9579start_pass_1:
9580 if (!vm_map_lookup_entry(dst_map, dst_addr, &tmp_entry)) {
9581 vm_map_unlock(dst_map);
9582 return KERN_INVALID_ADDRESS;
9583 }
9584 vm_map_clip_start(dst_map,
9585 tmp_entry,
9586 vm_map_trunc_page(dst_addr,
9587 VM_MAP_PAGE_MASK(dst_map)));
9588 for (entry = tmp_entry;;) {
9589 vm_map_entry_t next = entry->vme_next;
9590
9591 while (entry->is_sub_map) {
9592 vm_map_offset_t sub_start;
9593 vm_map_offset_t sub_end;
9594 vm_map_offset_t local_end;
9595
9596 if (entry->in_transition) {
9597 /*
9598 * Say that we are waiting, and wait for entry.
9599 */
9600 entry->needs_wakeup = TRUE;
9601 vm_map_entry_wait(dst_map, THREAD_UNINT);
9602
9603 goto start_pass_1;
9604 }
9605
9606 local_end = entry->vme_end;
9607 if (!(entry->needs_copy)) {
9608				/* if needs_copy is set, we are a COW */
9609				/* submap; in that case we just replace */
9610				/* it, so the following check is not */
9611				/* needed. */
9612 encountered_sub_map = TRUE;
9613 sub_start = VME_OFFSET(entry);
9614
9615 if (entry->vme_end < dst_end) {
9616 sub_end = entry->vme_end;
9617 } else {
9618 sub_end = dst_end;
9619 }
9620 sub_end -= entry->vme_start;
9621 sub_end += VME_OFFSET(entry);
9622 vm_map_unlock(dst_map);
9623
9624 kr = vm_map_overwrite_submap_recurse(
9625 VME_SUBMAP(entry),
9626 dst_addr: sub_start,
9627 dst_size: sub_end - sub_start);
9628 if (kr != KERN_SUCCESS) {
9629 return kr;
9630 }
9631 vm_map_lock(dst_map);
9632 }
9633
9634 if (dst_end <= entry->vme_end) {
9635 goto start_overwrite;
9636 }
9637 if (!vm_map_lookup_entry(map: dst_map, address: local_end,
9638 entry: &entry)) {
9639 vm_map_unlock(dst_map);
9640 return KERN_INVALID_ADDRESS;
9641 }
9642 next = entry->vme_next;
9643 }
9644
9645 if (!(entry->protection & VM_PROT_WRITE)) {
9646 vm_map_unlock(dst_map);
9647 return KERN_PROTECTION_FAILURE;
9648 }
9649
9650 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
9651 vm_map_unlock(dst_map);
9652 return KERN_PROTECTION_FAILURE;
9653 }
9654
9655 /*
9656 * If the entry is in transition, we must wait
9657 * for it to exit that state. Anything could happen
9658 * when we unlock the map, so start over.
9659 */
9660 if (entry->in_transition) {
9661 /*
9662 * Say that we are waiting, and wait for entry.
9663 */
9664 entry->needs_wakeup = TRUE;
9665 vm_map_entry_wait(dst_map, THREAD_UNINT);
9666
9667 goto start_pass_1;
9668 }
9669
9670/*
9671 * our range is contained completely within this map entry
9672 */
9673 if (dst_end <= entry->vme_end) {
9674 break;
9675 }
9676/*
9677 * check that range specified is contiguous region
9678 */
9679 if ((next == vm_map_to_entry(dst_map)) ||
9680 (next->vme_start != entry->vme_end)) {
9681 vm_map_unlock(dst_map);
9682 return KERN_INVALID_ADDRESS;
9683 }
9684
9685
9686 /*
9687 * Check for permanent objects in the destination.
9688 */
9689 if ((VME_OBJECT(entry) != VM_OBJECT_NULL) &&
9690 ((!VME_OBJECT(entry)->internal) ||
9691 (VME_OBJECT(entry)->true_share))) {
9692 contains_permanent_objects = TRUE;
9693 }
9694
9695 entry = next;
9696 }/* for */
9697
9698start_overwrite:
9699 /*
9700 * If there are permanent objects in the destination, then
9701 * the copy cannot be interrupted.
9702 */
9703
9704 if (interruptible && contains_permanent_objects) {
9705 vm_map_unlock(dst_map);
9706 return KERN_FAILURE; /* XXX */
9707 }
9708
9709 /*
9710 *
9711	 *	Make a second pass, overwriting the data.
9712	 *	At the beginning of each loop iteration,
9713	 *	the next entry to be overwritten is "tmp_entry"
9714	 *	(initially, the value returned from the lookup above),
9715	 *	and the starting address expected in that entry
9716	 *	is "base_addr".
9717 */
9718
9719 total_size = copy->size;
9720 if (encountered_sub_map) {
9721 copy_size = 0;
9722 /* re-calculate tmp_entry since we've had the map */
9723 /* unlocked */
9724 if (!vm_map_lookup_entry( map: dst_map, address: dst_addr, entry: &tmp_entry)) {
9725 vm_map_unlock(dst_map);
9726 return KERN_INVALID_ADDRESS;
9727 }
9728 } else {
9729 copy_size = copy->size;
9730 }
9731
9732 base_addr = dst_addr;
9733 while (TRUE) {
9734		/* deconstruct the copy object and process it in parts, */
9735		/* but only in the sub_map, interruptible case */
9736 vm_map_entry_t copy_entry;
9737 vm_map_entry_t previous_prev = VM_MAP_ENTRY_NULL;
9738 vm_map_entry_t next_copy = VM_MAP_ENTRY_NULL;
9739 int nentries;
9740 int remaining_entries = 0;
9741 vm_map_offset_t new_offset = 0;
9742
9743 for (entry = tmp_entry; copy_size == 0;) {
9744 vm_map_entry_t next;
9745
9746 next = entry->vme_next;
9747
9748			/* tmp_entry and the base address are moved along */
9749			/* each time we encounter a sub-map.  Otherwise, */
9750			/* entry can outpace tmp_entry, and copy_size */
9751			/* may reflect the distance between them. */
9752			/* If the current entry is found to be in transition, */
9753			/* we start over either at the beginning or at the */
9754			/* last submap encountered, as dictated by base_addr, */
9755			/* and we zero copy_size accordingly. */
9756 if (entry->in_transition) {
9757 /*
9758 * Say that we are waiting, and wait for entry.
9759 */
9760 entry->needs_wakeup = TRUE;
9761 vm_map_entry_wait(dst_map, THREAD_UNINT);
9762
9763 if (!vm_map_lookup_entry(map: dst_map, address: base_addr,
9764 entry: &tmp_entry)) {
9765 vm_map_unlock(dst_map);
9766 return KERN_INVALID_ADDRESS;
9767 }
9768 copy_size = 0;
9769 entry = tmp_entry;
9770 continue;
9771 }
9772 if (entry->is_sub_map) {
9773 vm_map_offset_t sub_start;
9774 vm_map_offset_t sub_end;
9775 vm_map_offset_t local_end;
9776
9777 if (entry->needs_copy) {
9778					/* if this is a COW submap, */
9779					/* just back the range with an */
9780					/* anonymous entry */
9781 assert(!entry->vme_permanent);
9782 if (entry->vme_end < dst_end) {
9783 sub_end = entry->vme_end;
9784 } else {
9785 sub_end = dst_end;
9786 }
9787 if (entry->vme_start < base_addr) {
9788 sub_start = base_addr;
9789 } else {
9790 sub_start = entry->vme_start;
9791 }
9792 vm_map_clip_end(
9793 map: dst_map, entry, endaddr: sub_end);
9794 vm_map_clip_start(
9795 map: dst_map, entry, startaddr: sub_start);
9796 assert(!entry->use_pmap);
9797 assert(!entry->iokit_acct);
9798 entry->use_pmap = TRUE;
9799 vm_map_deallocate(VME_SUBMAP(entry));
9800 assert(!entry->vme_permanent);
9801 VME_OBJECT_SET(entry, VM_OBJECT_NULL, false, context: 0);
9802 VME_OFFSET_SET(entry, offset: 0);
9803 entry->is_shared = FALSE;
9804 entry->needs_copy = FALSE;
9805 entry->protection = VM_PROT_DEFAULT;
9806 entry->max_protection = VM_PROT_ALL;
9807 entry->wired_count = 0;
9808 entry->user_wired_count = 0;
9809 if (entry->inheritance
9810 == VM_INHERIT_SHARE) {
9811 entry->inheritance = VM_INHERIT_COPY;
9812 }
9813 continue;
9814 }
9815 /* first take care of any non-sub_map */
9816 /* entries to send */
9817 if (base_addr < entry->vme_start) {
9818 /* stuff to send */
9819 copy_size =
9820 entry->vme_start - base_addr;
9821 break;
9822 }
9823 sub_start = VME_OFFSET(entry);
9824
9825 if (entry->vme_end < dst_end) {
9826 sub_end = entry->vme_end;
9827 } else {
9828 sub_end = dst_end;
9829 }
9830 sub_end -= entry->vme_start;
9831 sub_end += VME_OFFSET(entry);
9832 local_end = entry->vme_end;
9833 vm_map_unlock(dst_map);
9834 copy_size = sub_end - sub_start;
9835
9836 /* adjust the copy object */
9837 if (total_size > copy_size) {
9838 vm_map_size_t local_size = 0;
9839 vm_map_size_t entry_size;
9840
9841 nentries = 1;
9842 new_offset = copy->offset;
9843 copy_entry = vm_map_copy_first_entry(copy);
9844 while (copy_entry !=
9845 vm_map_copy_to_entry(copy)) {
9846 entry_size = copy_entry->vme_end -
9847 copy_entry->vme_start;
9848 if ((local_size < copy_size) &&
9849 ((local_size + entry_size)
9850 >= copy_size)) {
9851 vm_map_copy_clip_end(copy,
9852 copy_entry,
9853 copy_entry->vme_start +
9854 (copy_size - local_size));
9855 entry_size = copy_entry->vme_end -
9856 copy_entry->vme_start;
9857 local_size += entry_size;
9858 new_offset += entry_size;
9859 }
9860 if (local_size >= copy_size) {
9861 next_copy = copy_entry->vme_next;
9862 copy_entry->vme_next =
9863 vm_map_copy_to_entry(copy);
9864 previous_prev =
9865 copy->cpy_hdr.links.prev;
9866 copy->cpy_hdr.links.prev = copy_entry;
9867 copy->size = copy_size;
9868 remaining_entries =
9869 copy->cpy_hdr.nentries;
9870 remaining_entries -= nentries;
9871 copy->cpy_hdr.nentries = nentries;
9872 break;
9873 } else {
9874 local_size += entry_size;
9875 new_offset += entry_size;
9876 nentries++;
9877 }
9878 copy_entry = copy_entry->vme_next;
9879 }
9880 }
9881
9882 if ((entry->use_pmap) && (pmap == NULL)) {
9883 kr = vm_map_copy_overwrite_nested(
9884 VME_SUBMAP(entry),
9885 dst_addr: sub_start,
9886 copy,
9887 interruptible,
9888 VME_SUBMAP(entry)->pmap,
9889 TRUE);
9890 } else if (pmap != NULL) {
9891 kr = vm_map_copy_overwrite_nested(
9892 VME_SUBMAP(entry),
9893 dst_addr: sub_start,
9894 copy,
9895 interruptible, pmap,
9896 TRUE);
9897 } else {
9898 kr = vm_map_copy_overwrite_nested(
9899 VME_SUBMAP(entry),
9900 dst_addr: sub_start,
9901 copy,
9902 interruptible,
9903 pmap: dst_map->pmap,
9904 TRUE);
9905 }
9906 if (kr != KERN_SUCCESS) {
9907 if (next_copy != NULL) {
9908 copy->cpy_hdr.nentries +=
9909 remaining_entries;
9910 copy->cpy_hdr.links.prev->vme_next =
9911 next_copy;
9912 copy->cpy_hdr.links.prev
9913 = previous_prev;
9914 copy->size = total_size;
9915 }
9916 return kr;
9917 }
9918 if (dst_end <= local_end) {
9919 return KERN_SUCCESS;
9920 }
9921 /* otherwise copy no longer exists, it was */
9922 /* destroyed after successful copy_overwrite */
9923 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
9924 copy->offset = new_offset;
9925 copy->cpy_hdr.page_shift = copy_page_shift;
9926
9927 total_size -= copy_size;
9928 copy_size = 0;
9929 /* put back remainder of copy in container */
9930 if (next_copy != NULL) {
9931 copy->cpy_hdr.nentries = remaining_entries;
9932 copy->cpy_hdr.links.next = next_copy;
9933 copy->cpy_hdr.links.prev = previous_prev;
9934 copy->size = total_size;
9935 next_copy->vme_prev =
9936 vm_map_copy_to_entry(copy);
9937 next_copy = NULL;
9938 }
9939 base_addr = local_end;
9940 vm_map_lock(dst_map);
9941 if (!vm_map_lookup_entry(map: dst_map,
9942 address: local_end, entry: &tmp_entry)) {
9943 vm_map_unlock(dst_map);
9944 return KERN_INVALID_ADDRESS;
9945 }
9946 entry = tmp_entry;
9947 continue;
9948 }
9949 if (dst_end <= entry->vme_end) {
9950 copy_size = dst_end - base_addr;
9951 break;
9952 }
9953
9954 if ((next == vm_map_to_entry(dst_map)) ||
9955 (next->vme_start != entry->vme_end)) {
9956 vm_map_unlock(dst_map);
9957 return KERN_INVALID_ADDRESS;
9958 }
9959
9960 entry = next;
9961 }/* for */
9962
9963 next_copy = NULL;
9964 nentries = 1;
9965
9966 /* adjust the copy object */
9967 if (total_size > copy_size) {
9968 vm_map_size_t local_size = 0;
9969 vm_map_size_t entry_size;
9970
9971 new_offset = copy->offset;
9972 copy_entry = vm_map_copy_first_entry(copy);
9973 while (copy_entry != vm_map_copy_to_entry(copy)) {
9974 entry_size = copy_entry->vme_end -
9975 copy_entry->vme_start;
9976 if ((local_size < copy_size) &&
9977 ((local_size + entry_size)
9978 >= copy_size)) {
9979 vm_map_copy_clip_end(copy, copy_entry,
9980 copy_entry->vme_start +
9981 (copy_size - local_size));
9982 entry_size = copy_entry->vme_end -
9983 copy_entry->vme_start;
9984 local_size += entry_size;
9985 new_offset += entry_size;
9986 }
9987 if (local_size >= copy_size) {
9988 next_copy = copy_entry->vme_next;
9989 copy_entry->vme_next =
9990 vm_map_copy_to_entry(copy);
9991 previous_prev =
9992 copy->cpy_hdr.links.prev;
9993 copy->cpy_hdr.links.prev = copy_entry;
9994 copy->size = copy_size;
9995 remaining_entries =
9996 copy->cpy_hdr.nentries;
9997 remaining_entries -= nentries;
9998 copy->cpy_hdr.nentries = nentries;
9999 break;
10000 } else {
10001 local_size += entry_size;
10002 new_offset += entry_size;
10003 nentries++;
10004 }
10005 copy_entry = copy_entry->vme_next;
10006 }
10007 }
10008
10009 if (aligned) {
10010 pmap_t local_pmap;
10011
10012 if (pmap) {
10013 local_pmap = pmap;
10014 } else {
10015 local_pmap = dst_map->pmap;
10016 }
10017
10018 if ((kr = vm_map_copy_overwrite_aligned(
10019 dst_map, tmp_entry, copy,
10020 start: base_addr, pmap: local_pmap)) != KERN_SUCCESS) {
10021 if (next_copy != NULL) {
10022 copy->cpy_hdr.nentries +=
10023 remaining_entries;
10024 copy->cpy_hdr.links.prev->vme_next =
10025 next_copy;
10026 copy->cpy_hdr.links.prev =
10027 previous_prev;
10028 copy->size += copy_size;
10029 }
10030 return kr;
10031 }
10032 vm_map_unlock(dst_map);
10033 } else {
10034 /*
10035 * Performance gain:
10036 *
10037		 * If the copy and dst addresses are misaligned but share the same
10038		 * offset within the page, we can copy the misaligned parts
10039		 * unaligned and copy the rest aligned.  If they are
10040		 * aligned but the length is unaligned, we simply need to copy
10041		 * the trailing bit unaligned.  In that case we'll need to split
10042		 * off the misaligned bits of the region!
10043 */
10044 /* ALWAYS UNLOCKS THE dst_map MAP */
10045 kr = vm_map_copy_overwrite_unaligned(
10046 dst_map,
10047 entry: tmp_entry,
10048 copy,
10049 start: base_addr,
10050 discard_on_success);
10051 if (kr != KERN_SUCCESS) {
10052 if (next_copy != NULL) {
10053 copy->cpy_hdr.nentries +=
10054 remaining_entries;
10055 copy->cpy_hdr.links.prev->vme_next =
10056 next_copy;
10057 copy->cpy_hdr.links.prev =
10058 previous_prev;
10059 copy->size += copy_size;
10060 }
10061 return kr;
10062 }
10063 }
10064 total_size -= copy_size;
10065 if (total_size == 0) {
10066 break;
10067 }
10068 base_addr += copy_size;
10069 copy_size = 0;
10070 copy->offset = new_offset;
10071 if (next_copy != NULL) {
10072 copy->cpy_hdr.nentries = remaining_entries;
10073 copy->cpy_hdr.links.next = next_copy;
10074 copy->cpy_hdr.links.prev = previous_prev;
10075 next_copy->vme_prev = vm_map_copy_to_entry(copy);
10076 copy->size = total_size;
10077 }
10078 vm_map_lock(dst_map);
10079 while (TRUE) {
10080 if (!vm_map_lookup_entry(map: dst_map,
10081 address: base_addr, entry: &tmp_entry)) {
10082 vm_map_unlock(dst_map);
10083 return KERN_INVALID_ADDRESS;
10084 }
10085 if (tmp_entry->in_transition) {
10086 entry->needs_wakeup = TRUE;
10087 vm_map_entry_wait(dst_map, THREAD_UNINT);
10088 } else {
10089 break;
10090 }
10091 }
10092 vm_map_clip_start(map: dst_map,
10093 entry: tmp_entry,
10094 vm_map_trunc_page(base_addr,
10095 VM_MAP_PAGE_MASK(dst_map)));
10096
10097 entry = tmp_entry;
10098 } /* while */
10099
10100 /*
10101 * Throw away the vm_map_copy object
10102 */
10103 if (discard_on_success) {
10104 vm_map_copy_discard(copy);
10105 }
10106
10107 return KERN_SUCCESS;
10108}/* vm_map_copy_overwrite_nested */
10109
10110kern_return_t
10111vm_map_copy_overwrite(
10112 vm_map_t dst_map,
10113 vm_map_offset_t dst_addr,
10114 vm_map_copy_t copy,
10115 vm_map_size_t copy_size,
10116 boolean_t interruptible)
10117{
10118 vm_map_size_t head_size, tail_size;
10119 vm_map_copy_t head_copy, tail_copy;
10120 vm_map_offset_t head_addr, tail_addr;
10121 vm_map_entry_t entry;
10122 kern_return_t kr;
10123 vm_map_offset_t effective_page_mask, effective_page_size;
10124 uint16_t copy_page_shift;
10125
10126 head_size = 0;
10127 tail_size = 0;
10128 head_copy = NULL;
10129 tail_copy = NULL;
10130 head_addr = 0;
10131 tail_addr = 0;
10132
10133 /*
10134 * Check for null copy object.
10135 */
10136 if (copy == VM_MAP_COPY_NULL) {
10137 return KERN_SUCCESS;
10138 }
10139
10140 if (__improbable(vm_map_range_overflows(dst_map, dst_addr, copy_size))) {
10141 return KERN_INVALID_ADDRESS;
10142 }
10143
10144 /*
10145 * Assert that the vm_map_copy is coming from the right
10146 * zone and hasn't been forged
10147 */
10148 vm_map_copy_require(copy);
10149
10150 if (interruptible ||
10151 copy->type != VM_MAP_COPY_ENTRY_LIST) {
10152 /*
10153 * We can't split the "copy" map if we're interruptible
10154 * or if we don't have a "copy" map...
10155 */
10156blunt_copy:
10157 kr = vm_map_copy_overwrite_nested(dst_map,
10158 dst_addr,
10159 copy,
10160 interruptible,
10161 pmap: (pmap_t) NULL,
10162 TRUE);
10163 if (kr) {
10164 ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_FULL_NESTED_ERROR), arg: kr /* arg */);
10165 }
10166 return kr;
10167 }
10168
10169 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy);
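
	/*
	 * The head/tail optimization below assumes native-size (or larger)
	 * pages on both sides: if either the copy or the destination map
	 * uses sub-native pages (e.g. a 4K copy on a 16K-page system),
	 * fall back to the blunt path.
	 */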
10170 if (copy_page_shift < PAGE_SHIFT ||
10171 VM_MAP_PAGE_SHIFT(map: dst_map) < PAGE_SHIFT) {
10172 goto blunt_copy;
10173 }
10174
10175 if (VM_MAP_PAGE_SHIFT(map: dst_map) < PAGE_SHIFT) {
10176 effective_page_mask = VM_MAP_PAGE_MASK(dst_map);
10177 } else {
10178 effective_page_mask = MAX(VM_MAP_PAGE_MASK(dst_map), PAGE_MASK);
10179 effective_page_mask = MAX(VM_MAP_COPY_PAGE_MASK(copy),
10180 effective_page_mask);
10181 }
10182 effective_page_size = effective_page_mask + 1;
10183
10184 if (copy_size < VM_MAP_COPY_OVERWRITE_OPTIMIZATION_THRESHOLD_PAGES * effective_page_size) {
10185 /*
10186 * Too small to bother with optimizing...
10187 */
10188 goto blunt_copy;
10189 }
10190
10191 if ((dst_addr & effective_page_mask) !=
10192 (copy->offset & effective_page_mask)) {
10193 /*
10194 * Incompatible mis-alignment of source and destination...
10195 */
10196 goto blunt_copy;
10197 }
10198
10199 /*
10200 * Proper alignment or identical mis-alignment at the beginning.
10201	 * Let's try to do a small unaligned copy first (if needed)
10202 * and then an aligned copy for the rest.
10203 */
10204 if (!vm_map_page_aligned(offset: dst_addr, mask: effective_page_mask)) {
10205 head_addr = dst_addr;
10206 head_size = (effective_page_size -
10207 (copy->offset & effective_page_mask));
10208 head_size = MIN(head_size, copy_size);
10209 }
10210 if (!vm_map_page_aligned(offset: copy->offset + copy_size,
10211 mask: effective_page_mask)) {
10212 /*
10213 * Mis-alignment at the end.
10214 * Do an aligned copy up to the last page and
10215 * then an unaligned copy for the remaining bytes.
10216 */
10217 tail_size = ((copy->offset + copy_size) &
10218 effective_page_mask);
10219 tail_size = MIN(tail_size, copy_size);
10220 tail_addr = dst_addr + copy_size - tail_size;
10221 assert(tail_addr >= head_addr + head_size);
10222 }
10223 assert(head_size + tail_size <= copy_size);
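
	/*
	 * Worked example (hypothetical numbers; 16K effective pages, so
	 * effective_page_mask == 0x3fff): with copy->offset == 0x4100,
	 * copy_size == 0x29f00 and dst_addr == 0x10100, source and
	 * destination share the same page offset (0x100), so the split
	 * above yields head_addr == 0x10100 and
	 * head_size == 0x4000 - 0x100 == 0x3f00 (the head copy ends on
	 * the 0x14000 boundary), while
	 * tail_size == 0x2e000 & 0x3fff == 0x2000 and
	 * tail_addr == 0x38000.  The aligned middle [0x14000, 0x38000)
	 * is handled by the page-aligned path.
	 */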
10224
10225 if (head_size + tail_size == copy_size) {
10226 /*
10227 * It's all unaligned, no optimization possible...
10228 */
10229 goto blunt_copy;
10230 }
10231
10232 /*
10233 * Can't optimize if there are any submaps in the
10234 * destination due to the way we free the "copy" map
10235 * progressively in vm_map_copy_overwrite_nested()
10236 * in that case.
10237 */
10238 vm_map_lock_read(dst_map);
10239 if (!vm_map_lookup_entry(map: dst_map, address: dst_addr, entry: &entry)) {
10240 vm_map_unlock_read(dst_map);
10241 goto blunt_copy;
10242 }
10243 for (;
10244 (entry != vm_map_to_entry(dst_map) &&
10245 entry->vme_start < dst_addr + copy_size);
10246 entry = entry->vme_next) {
10247 if (entry->is_sub_map) {
10248 vm_map_unlock_read(dst_map);
10249 goto blunt_copy;
10250 }
10251 }
10252 vm_map_unlock_read(dst_map);
10253
10254 if (head_size) {
10255 /*
10256 * Unaligned copy of the first "head_size" bytes, to reach
10257 * a page boundary.
10258 */
10259
10260 /*
10261 * Extract "head_copy" out of "copy".
10262 */
10263 head_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10264 head_copy->cpy_hdr.entries_pageable =
10265 copy->cpy_hdr.entries_pageable;
10266 head_copy->cpy_hdr.page_shift = copy_page_shift;
10267
10268 entry = vm_map_copy_first_entry(copy);
10269 if (entry->vme_end < copy->offset + head_size) {
10270 head_size = entry->vme_end - copy->offset;
10271 }
10272
10273 head_copy->offset = copy->offset;
10274 head_copy->size = head_size;
10275 copy->offset += head_size;
10276 copy->size -= head_size;
10277 copy_size -= head_size;
10278 assert(copy_size > 0);
10279
10280 vm_map_copy_clip_end(copy, entry, copy->offset);
10281 vm_map_copy_entry_unlink(copy, entry);
10282 vm_map_copy_entry_link(head_copy,
10283 vm_map_copy_to_entry(head_copy),
10284 entry);
10285
10286 /*
10287 * Do the unaligned copy.
10288 */
10289 kr = vm_map_copy_overwrite_nested(dst_map,
10290 dst_addr: head_addr,
10291 copy: head_copy,
10292 interruptible,
10293 pmap: (pmap_t) NULL,
10294 FALSE);
10295 if (kr != KERN_SUCCESS) {
10296 ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_HEAD_NESTED_ERROR), arg: kr /* arg */);
10297 goto done;
10298 }
10299 }
10300
10301 if (tail_size) {
10302 /*
10303 * Extract "tail_copy" out of "copy".
10304 */
10305 tail_copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
10306 tail_copy->cpy_hdr.entries_pageable =
10307 copy->cpy_hdr.entries_pageable;
10308 tail_copy->cpy_hdr.page_shift = copy_page_shift;
10309
10310 tail_copy->offset = copy->offset + copy_size - tail_size;
10311 tail_copy->size = tail_size;
10312
10313 copy->size -= tail_size;
10314 copy_size -= tail_size;
10315 assert(copy_size > 0);
10316
10317 entry = vm_map_copy_last_entry(copy);
10318 vm_map_copy_clip_start(copy, entry, tail_copy->offset);
10319 entry = vm_map_copy_last_entry(copy);
10320 vm_map_copy_entry_unlink(copy, entry);
10321 vm_map_copy_entry_link(tail_copy,
10322 vm_map_copy_last_entry(tail_copy),
10323 entry);
10324 }
10325
10326 /*
10327 * If we are here from ipc_kmsg_copyout_ool_descriptor(),
10328	 * we want to avoid TOCTOU issues w.r.t. copy->size, but
10329 * we don't need to change vm_map_copy_overwrite_nested()
10330 * and all other vm_map_copy_overwrite variants.
10331 *
10332 * So we assign the original copy_size that was passed into
10333 * this routine back to copy.
10334 *
10335	 * This use of the local 'copy_size' passed into this routine is
10336	 * to try to protect against TOCTOU attacks where the kernel
10337	 * has been exploited.  We don't expect this to be an issue
10338 * during normal system operation.
10339 */
10340 assertf(copy->size == copy_size,
10341 "Mismatch of copy sizes. Expected 0x%llx, Got 0x%llx\n", (uint64_t) copy_size, (uint64_t) copy->size);
10342 copy->size = copy_size;
10343
10344 /*
10345 * Copy most (or possibly all) of the data.
10346 */
10347 kr = vm_map_copy_overwrite_nested(dst_map,
10348 dst_addr: dst_addr + head_size,
10349 copy,
10350 interruptible,
10351 pmap: (pmap_t) NULL,
10352 FALSE);
10353 if (kr != KERN_SUCCESS) {
10354 ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_NESTED_ERROR), arg: kr /* arg */);
10355 goto done;
10356 }
10357
10358 if (tail_size) {
10359 kr = vm_map_copy_overwrite_nested(dst_map,
10360 dst_addr: tail_addr,
10361 copy: tail_copy,
10362 interruptible,
10363 pmap: (pmap_t) NULL,
10364 FALSE);
10365 if (kr) {
10366 ktriage_record(thread_id: thread_tid(thread: current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOVERWRITE_PARTIAL_TAIL_NESTED_ERROR), arg: kr /* arg */);
10367 }
10368 }
10369
10370done:
10371 assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
10372 if (kr == KERN_SUCCESS) {
10373 /*
10374 * Discard all the copy maps.
10375 */
10376 if (head_copy) {
10377 vm_map_copy_discard(copy: head_copy);
10378 head_copy = NULL;
10379 }
10380 vm_map_copy_discard(copy);
10381 if (tail_copy) {
10382 vm_map_copy_discard(copy: tail_copy);
10383 tail_copy = NULL;
10384 }
10385 } else {
10386 /*
10387 * Re-assemble the original copy map.
10388 */
10389 if (head_copy) {
10390 entry = vm_map_copy_first_entry(head_copy);
10391 vm_map_copy_entry_unlink(head_copy, entry);
10392 vm_map_copy_entry_link(copy,
10393 vm_map_copy_to_entry(copy),
10394 entry);
10395 copy->offset -= head_size;
10396 copy->size += head_size;
10397 vm_map_copy_discard(copy: head_copy);
10398 head_copy = NULL;
10399 }
10400 if (tail_copy) {
10401 entry = vm_map_copy_last_entry(tail_copy);
10402 vm_map_copy_entry_unlink(tail_copy, entry);
10403 vm_map_copy_entry_link(copy,
10404 vm_map_copy_last_entry(copy),
10405 entry);
10406 copy->size += tail_size;
10407 vm_map_copy_discard(copy: tail_copy);
10408 tail_copy = NULL;
10409 }
10410 }
10411 return kr;
10412}
10413
10414
10415/*
10416 * Routine: vm_map_copy_overwrite_unaligned [internal use only]
10417 *
10418 *	Description:
10419 * Physically copy unaligned data
10420 *
10421 * Implementation:
10422 *	Unaligned parts of pages have to be physically copied.  We use
10423 *	a modified form of vm_fault_copy (which understands non-aligned
10424 *	page offsets and sizes) to do the copy.  We attempt to copy as
10425 *	much memory in one go as possible; however, vm_fault_copy copies
10426 *	within a single memory object, so we have to take the smallest of
10427 *	"amount left", "source object data size" and "target object data
10428 *	size".  With unaligned data we don't need to split regions, so the
10429 *	source (copy) object should be a single map entry; the target
10430 *	range, however, may be split over multiple map entries.  In any
10431 *	event we are pessimistic about these assumptions.
10432 *
10433 * Callers of this function must call vm_map_copy_require on
10434 * previously created vm_map_copy_t or pass a newly created
10435 * one to ensure that it hasn't been forged.
10436 *
10437 * Assumptions:
10438 * dst_map is locked on entry and is return locked on success,
10439 * unlocked on error.
10440 */
10441
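/*
 * Minimal sketch (disabled; "example_unaligned_chunk" is hypothetical)
 * of the size selection described above: each pass copies the smallest
 * of "amount left", the bytes remaining in the current source copy
 * entry, and the bytes remaining in the current destination entry.
 */
#if 0
static vm_map_size_t
example_unaligned_chunk(
	vm_map_size_t   amount_left,
	vm_map_size_t   src_size,       /* left in the source copy entry */
	vm_map_size_t   dst_size)       /* left in the destination entry */
{
	vm_map_size_t chunk;

	/* bounded by whichever entry runs out first... */
	chunk = (dst_size < src_size) ? dst_size : src_size;
	/* ...and by the total amount still to be copied */
	if (chunk > amount_left) {
		chunk = amount_left;
	}
	return chunk;
}
#endif
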
10442static kern_return_t
10443vm_map_copy_overwrite_unaligned(
10444 vm_map_t dst_map,
10445 vm_map_entry_t entry,
10446 vm_map_copy_t copy,
10447 vm_map_offset_t start,
10448 boolean_t discard_on_success)
10449{
10450 vm_map_entry_t copy_entry;
10451 vm_map_entry_t copy_entry_next;
10452 vm_map_version_t version;
10453 vm_object_t dst_object;
10454 vm_object_offset_t dst_offset;
10455 vm_object_offset_t src_offset;
10456 vm_object_offset_t entry_offset;
10457 vm_map_offset_t entry_end;
10458 vm_map_size_t src_size,
10459 dst_size,
10460 copy_size,
10461 amount_left;
10462 kern_return_t kr = KERN_SUCCESS;
10463
10464
10465 copy_entry = vm_map_copy_first_entry(copy);
10466
10467 vm_map_lock_write_to_read(dst_map);
10468
10469 src_offset = copy->offset - trunc_page_mask_64(copy->offset, VM_MAP_COPY_PAGE_MASK(copy));
10470 amount_left = copy->size;
10471/*
10472 * the copy is unaligned, so we never clipped this entry; we need the
10473 * offset into the vm_object, not just into the data.
10474 */
10475 while (amount_left > 0) {
10476 if (entry == vm_map_to_entry(dst_map)) {
10477 vm_map_unlock_read(dst_map);
10478 return KERN_INVALID_ADDRESS;
10479 }
10480
10481 /* "start" must be within the current map entry */
10482 assert((start >= entry->vme_start) && (start < entry->vme_end));
10483
10484 /*
10485 * Check protection again
10486 */
10487 if (!(entry->protection & VM_PROT_WRITE)) {
10488 vm_map_unlock_read(dst_map);
10489 return KERN_PROTECTION_FAILURE;
10490 }
10491 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10492 vm_map_unlock_read(dst_map);
10493 return KERN_PROTECTION_FAILURE;
10494 }
10495
10496 /*
10497 * If the entry is in transition, we must wait
10498 * for it to exit that state. Anything could happen
10499 * when we unlock the map, so start over.
10500 */
10501 if (entry->in_transition) {
10502 /*
10503 * Say that we are waiting, and wait for entry.
10504 */
10505 entry->needs_wakeup = TRUE;
10506 vm_map_entry_wait(dst_map, THREAD_UNINT);
10507
10508 goto RetryLookup;
10509 }
10510
10511 dst_offset = start - entry->vme_start;
10512
10513 dst_size = entry->vme_end - start;
10514
10515 src_size = copy_entry->vme_end -
10516 (copy_entry->vme_start + src_offset);
10517
10518 if (dst_size < src_size) {
10519/*
10520 * we can only copy dst_size bytes before
10521 * we have to get the next destination entry
10522 */
10523 copy_size = dst_size;
10524 } else {
10525/*
10526 * we can only copy src_size bytes before
10527 * we have to get the next source copy entry
10528 */
10529 copy_size = src_size;
10530 }
10531
10532 if (copy_size > amount_left) {
10533 copy_size = amount_left;
10534 }
10535/*
10536 *	Entry needs copy: create a shadow object for the
10537 *	copy-on-write region.
10538 */
10539 if (entry->needs_copy) {
10540 if (vm_map_lock_read_to_write(map: dst_map)) {
10541 vm_map_lock_read(dst_map);
10542 goto RetryLookup;
10543 }
10544 VME_OBJECT_SHADOW(entry,
10545 length: (vm_map_size_t)(entry->vme_end
10546 - entry->vme_start),
10547 always: vm_map_always_shadow(map: dst_map));
10548 entry->needs_copy = FALSE;
10549 vm_map_lock_write_to_read(dst_map);
10550 }
10551 dst_object = VME_OBJECT(entry);
10552/*
10553 * unlike with the virtual (aligned) copy, we're going to fault on
10554 * this one, so we need a target object.
10555 */
10556 if (dst_object == VM_OBJECT_NULL) {
10557 if (vm_map_lock_read_to_write(map: dst_map)) {
10558 vm_map_lock_read(dst_map);
10559 goto RetryLookup;
10560 }
10561 dst_object = vm_object_allocate(size: (vm_map_size_t)
10562 entry->vme_end - entry->vme_start);
10563 VME_OBJECT_SET(entry, object: dst_object, false, context: 0);
10564 VME_OFFSET_SET(entry, offset: 0);
10565 assert(entry->use_pmap);
10566 vm_map_lock_write_to_read(dst_map);
10567 }
10568/*
10569 * Take an object reference and unlock the map.  The "entry" may
10570 * disappear or change when the map is unlocked.
10571 */
10572 vm_object_reference(dst_object);
10573 version.main_timestamp = dst_map->timestamp;
10574 entry_offset = VME_OFFSET(entry);
10575 entry_end = entry->vme_end;
10576 vm_map_unlock_read(dst_map);
10577/*
10578 * Copy as much as possible in one pass
10579 */
10580 kr = vm_fault_copy(
10581 VME_OBJECT(copy_entry),
10582 src_offset: VME_OFFSET(entry: copy_entry) + src_offset,
10583 copy_size: &copy_size,
10584 dst_object,
10585 dst_offset: entry_offset + dst_offset,
10586 dst_map,
10587 dst_version: &version,
10588 THREAD_UNINT );
10589
10590 start += copy_size;
10591 src_offset += copy_size;
10592 amount_left -= copy_size;
10593/*
10594 * Release the object reference
10595 */
10596 vm_object_deallocate(object: dst_object);
10597/*
10598 * If a hard error occurred, return it now
10599 */
10600 if (kr != KERN_SUCCESS) {
10601 return kr;
10602 }
10603
10604 if ((copy_entry->vme_start + src_offset) == copy_entry->vme_end
10605 || amount_left == 0) {
10606/*
10607 * all done with this copy entry, dispose.
10608 */
10609 copy_entry_next = copy_entry->vme_next;
10610
10611 if (discard_on_success) {
10612 vm_map_copy_entry_unlink(copy, copy_entry);
10613 assert(!copy_entry->is_sub_map);
10614 vm_object_deallocate(VME_OBJECT(copy_entry));
10615 vm_map_copy_entry_dispose(copy_entry);
10616 }
10617
10618 if (copy_entry_next == vm_map_copy_to_entry(copy) &&
10619 amount_left) {
10620/*
10621 *			not finished copying but ran out of source
10622 */
10623 return KERN_INVALID_ADDRESS;
10624 }
10625
10626 copy_entry = copy_entry_next;
10627
10628 src_offset = 0;
10629 }
10630
10631 if (amount_left == 0) {
10632 return KERN_SUCCESS;
10633 }
10634
10635 vm_map_lock_read(dst_map);
10636 if (version.main_timestamp == dst_map->timestamp) {
10637 if (start == entry_end) {
10638/*
10639 * destination region is split. Use the version
10640 * information to avoid a lookup in the normal
10641 * case.
10642 */
10643 entry = entry->vme_next;
10644/*
10645 * should be contiguous. Fail if we encounter
10646 * a hole in the destination.
10647 */
10648 if (start != entry->vme_start) {
10649 vm_map_unlock_read(dst_map);
10650 return KERN_INVALID_ADDRESS;
10651 }
10652 }
10653 } else {
10654/*
10655 * Map version check failed.
10656 * we must look up the entry because somebody
10657 * might have changed the map behind our backs.
10658 */
10659RetryLookup:
10660 if (!vm_map_lookup_entry(map: dst_map, address: start, entry: &entry)) {
10661 vm_map_unlock_read(dst_map);
10662 return KERN_INVALID_ADDRESS;
10663 }
10664 }
10665 }/* while */
10666
10667 return KERN_SUCCESS;
10668}/* vm_map_copy_overwrite_unaligned */
10669
10670/*
10671 * Routine: vm_map_copy_overwrite_aligned [internal use only]
10672 *
10673 * Description:
10674 * Does all the vm_trickery possible for whole pages.
10675 *
10676 * Implementation:
10677 *
10678 * If there are no permanent objects in the destination,
10679 * and the source and destination map entry zones match,
10680 * and the destination map entry is not shared,
10681 * then the map entries can be deleted and replaced
10682 * with those from the copy. The following code is the
10683 * basic idea of what to do, but there are lots of annoying
10684 * little details about getting protection and inheritance
10685 * right. Should add protection, inheritance, and sharing checks
10686 * to the above pass and make sure that no wiring is involved.
10687 *
10688 * Callers of this function must call vm_map_copy_require on
10689 * previously created vm_map_copy_t or pass a newly created
10690 * one to ensure that it hasn't been forged.
10691 */
10692
10693int vm_map_copy_overwrite_aligned_src_not_internal = 0;
10694int vm_map_copy_overwrite_aligned_src_not_symmetric = 0;
10695int vm_map_copy_overwrite_aligned_src_large = 0;
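
/*
 * Simplified sketch (disabled; "example_entry_is_swappable" is
 * hypothetical) of the "temporary, unshared destination" test described
 * above.  The real test in the routine below additionally excludes
 * permanent/JIT/TPRO mappings, executable ranges and pmaps with
 * protection policies, and also accepts entries with needs_copy set.
 */
#if 0
static boolean_t
example_entry_is_swappable(vm_map_entry_t entry)
{
	vm_object_t object = VME_OBJECT(entry);

	if (entry->is_shared) {
		return FALSE;
	}
	if (object == VM_OBJECT_NULL) {
		/* nothing backing the destination yet: safe to replace */
		return TRUE;
	}
	return object->internal &&
	       !object->true_share &&
	       object->copy_strategy != MEMORY_OBJECT_COPY_NONE;
}
#endif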
10696
10697static kern_return_t
10698vm_map_copy_overwrite_aligned(
10699 vm_map_t dst_map,
10700 vm_map_entry_t tmp_entry,
10701 vm_map_copy_t copy,
10702 vm_map_offset_t start,
10703 __unused pmap_t pmap)
10704{
10705 vm_object_t object;
10706 vm_map_entry_t copy_entry;
10707 vm_map_size_t copy_size;
10708 vm_map_size_t size;
10709 vm_map_entry_t entry;
10710
10711 while ((copy_entry = vm_map_copy_first_entry(copy))
10712 != vm_map_copy_to_entry(copy)) {
10713 copy_size = (copy_entry->vme_end - copy_entry->vme_start);
10714
10715 entry = tmp_entry;
10716 if (entry->is_sub_map) {
10717 /* unnested when clipped earlier */
10718 assert(!entry->use_pmap);
10719 }
10720 if (entry == vm_map_to_entry(dst_map)) {
10721 vm_map_unlock(dst_map);
10722 return KERN_INVALID_ADDRESS;
10723 }
10724 size = (entry->vme_end - entry->vme_start);
10725 /*
10726 * Make sure that no holes popped up in the
10727 * address map, and that the protection is
10728 * still valid, in case the map was unlocked
10729 * earlier.
10730 */
10731
10732 if ((entry->vme_start != start) || ((entry->is_sub_map)
10733 && !entry->needs_copy)) {
10734 vm_map_unlock(dst_map);
10735 return KERN_INVALID_ADDRESS;
10736 }
10737 assert(entry != vm_map_to_entry(dst_map));
10738
10739 /*
10740 * Check protection again
10741 */
10742
10743 if (!(entry->protection & VM_PROT_WRITE)) {
10744 vm_map_unlock(dst_map);
10745 return KERN_PROTECTION_FAILURE;
10746 }
10747
10748 if (!vm_map_entry_is_overwritable(dst_map, entry)) {
10749 vm_map_unlock(dst_map);
10750 return KERN_PROTECTION_FAILURE;
10751 }
10752
10753 /*
10754 * If the entry is in transition, we must wait
10755 * for it to exit that state. Anything could happen
10756 * when we unlock the map, so start over.
10757 */
10758 if (entry->in_transition) {
10759 /*
10760 * Say that we are waiting, and wait for entry.
10761 */
10762 entry->needs_wakeup = TRUE;
10763 vm_map_entry_wait(dst_map, THREAD_UNINT);
10764
10765 goto RetryLookup;
10766 }
10767
10768 /*
10769 * Adjust to source size first
10770 */
10771
10772 if (copy_size < size) {
10773 if (entry->map_aligned &&
10774 !VM_MAP_PAGE_ALIGNED(entry->vme_start + copy_size,
10775 VM_MAP_PAGE_MASK(dst_map))) {
10776 /* no longer map-aligned */
10777 entry->map_aligned = FALSE;
10778 }
10779 vm_map_clip_end(map: dst_map, entry, endaddr: entry->vme_start + copy_size);
10780 size = copy_size;
10781 }
10782
10783 /*
10784 * Adjust to destination size
10785 */
10786
10787 if (size < copy_size) {
10788 vm_map_copy_clip_end(copy, copy_entry,
10789 copy_entry->vme_start + size);
10790 copy_size = size;
10791 }
10792
10793 assert((entry->vme_end - entry->vme_start) == size);
10794 assert((tmp_entry->vme_end - tmp_entry->vme_start) == size);
10795 assert((copy_entry->vme_end - copy_entry->vme_start) == size);
10796
10797 /*
10798 * If the destination contains temporary unshared memory,
10799 * we can perform the copy by throwing it away and
10800 * installing the source data.
10801 *
10802 * Exceptions for mappings with special semantics:
10803 * + "permanent" entries,
10804 * + JIT regions,
10805 * + TPRO regions,
10806 * + pmap-specific protection policies,
10807 * + VM objects with COPY_NONE copy strategy.
10808 */
10809
10810 object = VME_OBJECT(entry);
10811 if ((!entry->is_shared &&
10812 !entry->vme_permanent &&
10813 !entry->used_for_jit &&
10814#if __arm64e__
10815 !entry->used_for_tpro &&
10816#endif /* __arm64e__ */
10817 !(entry->protection & VM_PROT_EXECUTE) &&
10818 !pmap_has_prot_policy(pmap: dst_map->pmap, translated_allow_execute: entry->translated_allow_execute, prot: entry->protection) &&
10819 ((object == VM_OBJECT_NULL) ||
10820 (object->internal &&
10821 !object->true_share &&
10822 object->copy_strategy != MEMORY_OBJECT_COPY_NONE))) ||
10823 entry->needs_copy) {
10824 vm_object_t old_object = VME_OBJECT(entry);
10825 vm_object_offset_t old_offset = VME_OFFSET(entry);
10826 vm_object_offset_t offset;
10827
10828 /*
10829 * Ensure that the source and destination aren't
10830 * identical
10831 */
10832 if (old_object == VME_OBJECT(copy_entry) &&
10833 old_offset == VME_OFFSET(entry: copy_entry)) {
10834 vm_map_copy_entry_unlink(copy, copy_entry);
10835 vm_map_copy_entry_dispose(copy_entry);
10836
10837 if (old_object != VM_OBJECT_NULL) {
10838 vm_object_deallocate(object: old_object);
10839 }
10840
10841 start = tmp_entry->vme_end;
10842 tmp_entry = tmp_entry->vme_next;
10843 continue;
10844 }
10845
10846#if XNU_TARGET_OS_OSX
10847#define __TRADEOFF1_OBJ_SIZE (64 * 1024 * 1024) /* 64 MB */
10848#define __TRADEOFF1_COPY_SIZE (128 * 1024) /* 128 KB */
10849 if (VME_OBJECT(copy_entry) != VM_OBJECT_NULL &&
10850 VME_OBJECT(copy_entry)->vo_size >= __TRADEOFF1_OBJ_SIZE &&
10851 copy_size <= __TRADEOFF1_COPY_SIZE) {
10852 /*
10853 * Virtual vs. Physical copy tradeoff #1.
10854 *
10855 * Copying only a few pages out of a large
10856 * object: do a physical copy instead of
10857 * a virtual copy, to avoid possibly keeping
10858 * the entire large object alive because of
10859 * those few copy-on-write pages.
10860 */
10861 vm_map_copy_overwrite_aligned_src_large++;
10862 goto slow_copy;
10863 }
10864#endif /* XNU_TARGET_OS_OSX */
10865
10866 if ((dst_map->pmap != kernel_pmap) &&
10867 (VME_ALIAS(entry) >= VM_MEMORY_MALLOC) &&
10868 (VME_ALIAS(entry) <= VM_MEMORY_MALLOC_MEDIUM)) {
10869 vm_object_t new_object, new_shadow;
10870
10871 /*
10872 * We're about to map something over a mapping
10873 * established by malloc()...
10874 */
10875 new_object = VME_OBJECT(copy_entry);
10876 if (new_object != VM_OBJECT_NULL) {
10877 vm_object_lock_shared(new_object);
10878 }
10879 while (new_object != VM_OBJECT_NULL &&
10880#if XNU_TARGET_OS_OSX
10881 !new_object->true_share &&
10882 new_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
10883#endif /* XNU_TARGET_OS_OSX */
10884 new_object->internal) {
10885 new_shadow = new_object->shadow;
10886 if (new_shadow == VM_OBJECT_NULL) {
10887 break;
10888 }
10889 vm_object_lock_shared(new_shadow);
10890 vm_object_unlock(new_object);
10891 new_object = new_shadow;
10892 }
10893 if (new_object != VM_OBJECT_NULL) {
10894 if (!new_object->internal) {
10895 /*
10896 * The new mapping is backed
10897 * by an external object. We
10898 * don't want malloc'ed memory
10899 * to be replaced with such a
10900 * non-anonymous mapping, so
10901 * let's go off the optimized
10902 * path...
10903 */
10904 vm_map_copy_overwrite_aligned_src_not_internal++;
10905 vm_object_unlock(new_object);
10906 goto slow_copy;
10907 }
10908#if XNU_TARGET_OS_OSX
10909 if (new_object->true_share ||
10910 new_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
10911 /*
10912 * Same if there's a "true_share"
10913 * object in the shadow chain, or
10914 * an object with a non-default
10915 * (SYMMETRIC) copy strategy.
10916 */
10917 vm_map_copy_overwrite_aligned_src_not_symmetric++;
10918 vm_object_unlock(new_object);
10919 goto slow_copy;
10920 }
10921#endif /* XNU_TARGET_OS_OSX */
10922 vm_object_unlock(new_object);
10923 }
10924 /*
10925 * The new mapping is still backed by
10926 * anonymous (internal) memory, so it's
10927 * OK to substitute it for the original
10928 * malloc() mapping.
10929 */
10930 }
10931
10932 if (old_object != VM_OBJECT_NULL) {
10933 assert(!entry->vme_permanent);
10934 if (entry->is_sub_map) {
10935 if (entry->use_pmap) {
10936#ifndef NO_NESTED_PMAP
10937 pmap_unnest(dst_map->pmap,
10938 (addr64_t)entry->vme_start,
10939 entry->vme_end - entry->vme_start);
10940#endif /* NO_NESTED_PMAP */
10941 if (dst_map->mapped_in_other_pmaps) {
10942 /* clean up parent */
10943 /* map/maps */
10944 vm_map_submap_pmap_clean(
10945 map: dst_map, start: entry->vme_start,
10946 end: entry->vme_end,
10947 VME_SUBMAP(entry),
10948 offset: VME_OFFSET(entry));
10949 }
10950 } else {
10951 vm_map_submap_pmap_clean(
10952 map: dst_map, start: entry->vme_start,
10953 end: entry->vme_end,
10954 VME_SUBMAP(entry),
10955 offset: VME_OFFSET(entry));
10956 }
10957 vm_map_deallocate(VME_SUBMAP(entry));
10958 } else {
10959 if (dst_map->mapped_in_other_pmaps) {
10960 vm_object_pmap_protect_options(
10961 VME_OBJECT(entry),
10962 offset: VME_OFFSET(entry),
10963 size: entry->vme_end
10964 - entry->vme_start,
10965 PMAP_NULL,
10966 PAGE_SIZE,
10967 pmap_start: entry->vme_start,
10968 VM_PROT_NONE,
10969 PMAP_OPTIONS_REMOVE);
10970 } else {
10971 pmap_remove_options(
10972 map: dst_map->pmap,
10973 s: (addr64_t)(entry->vme_start),
10974 e: (addr64_t)(entry->vme_end),
10975 PMAP_OPTIONS_REMOVE);
10976 }
10977 vm_object_deallocate(object: old_object);
10978 }
10979 }
10980
10981 if (entry->iokit_acct) {
10982 /* keep using iokit accounting */
10983 entry->use_pmap = FALSE;
10984 } else {
10985 /* use pmap accounting */
10986 entry->use_pmap = TRUE;
10987 }
10988 assert(!entry->vme_permanent);
10989 VME_OBJECT_SET(entry, VME_OBJECT(copy_entry), false, context: 0);
10990 object = VME_OBJECT(entry);
10991 entry->needs_copy = copy_entry->needs_copy;
10992 entry->wired_count = 0;
10993 entry->user_wired_count = 0;
10994 offset = VME_OFFSET(entry: copy_entry);
10995 VME_OFFSET_SET(entry, offset);
10996
10997 vm_map_copy_entry_unlink(copy, copy_entry);
10998 vm_map_copy_entry_dispose(copy_entry);
10999
11000 /*
11001			 * We could try to push pages into the pmap at this point, BUT
11002			 * this optimization only saved on average 2 us per page if ALL
11003			 * the pages in the source were currently mapped
11004			 * and ALL the pages in the dest were touched.  If fewer than
11005			 * 2/3 of the pages were touched, this optimization actually cost
11006			 * more cycles.  It also puts a lot of pressure on the pmap layer w.r.t. mapping structures.
11007 */
11008
11009 /*
11010 * Set up for the next iteration. The map
11011 * has not been unlocked, so the next
11012 * address should be at the end of this
11013 * entry, and the next map entry should be
11014 * the one following it.
11015 */
11016
11017 start = tmp_entry->vme_end;
11018 tmp_entry = tmp_entry->vme_next;
11019 } else {
11020 vm_map_version_t version;
11021 vm_object_t dst_object;
11022 vm_object_offset_t dst_offset;
11023 kern_return_t r;
11024
11025slow_copy:
11026 if (entry->needs_copy) {
11027 VME_OBJECT_SHADOW(entry,
11028 length: (entry->vme_end -
11029 entry->vme_start),
11030 always: vm_map_always_shadow(map: dst_map));
11031 entry->needs_copy = FALSE;
11032 }
11033
11034 dst_object = VME_OBJECT(entry);
11035 dst_offset = VME_OFFSET(entry);
11036
11037 /*
11038 * Take an object reference, and record
11039 * the map version information so that the
11040 * map can be safely unlocked.
11041 */
11042
11043 if (dst_object == VM_OBJECT_NULL) {
11044 /*
11045 * We would usually have just taken the
11046 * optimized path above if the destination
11047 * object has not been allocated yet. But we
11048 * now disable that optimization if the copy
11049 * entry's object is not backed by anonymous
11050 * memory to avoid replacing malloc'ed
11051 * (i.e. re-usable) anonymous memory with a
11052 * not-so-anonymous mapping.
11053 * So we have to handle this case here and
11054 * allocate a new VM object for this map entry.
11055 */
11056 dst_object = vm_object_allocate(
11057 size: entry->vme_end - entry->vme_start);
11058 dst_offset = 0;
11059 VME_OBJECT_SET(entry, object: dst_object, false, context: 0);
11060 VME_OFFSET_SET(entry, offset: dst_offset);
11061 assert(entry->use_pmap);
11062 }
11063
11064 vm_object_reference(dst_object);
11065
11066 /* account for unlock bumping up timestamp */
11067 version.main_timestamp = dst_map->timestamp + 1;
11068
11069 vm_map_unlock(dst_map);
11070
11071 /*
11072 * Copy as much as possible in one pass
11073 */
11074
11075 copy_size = size;
11076 r = vm_fault_copy(
11077 VME_OBJECT(copy_entry),
11078 src_offset: VME_OFFSET(entry: copy_entry),
11079 copy_size: &copy_size,
11080 dst_object,
11081 dst_offset,
11082 dst_map,
11083 dst_version: &version,
11084 THREAD_UNINT );
11085
11086 /*
11087 * Release the object reference
11088 */
11089
11090 vm_object_deallocate(object: dst_object);
11091
11092 /*
11093 * If a hard error occurred, return it now
11094 */
11095
11096 if (r != KERN_SUCCESS) {
11097 return r;
11098 }
11099
11100 if (copy_size != 0) {
11101 /*
11102 * Dispose of the copied region
11103 */
11104
11105 vm_map_copy_clip_end(copy, copy_entry,
11106 copy_entry->vme_start + copy_size);
11107 vm_map_copy_entry_unlink(copy, copy_entry);
11108 vm_object_deallocate(VME_OBJECT(copy_entry));
11109 vm_map_copy_entry_dispose(copy_entry);
11110 }
11111
11112 /*
11113 * Pick up in the destination map where we left off.
11114 *
11115 * Use the version information to avoid a lookup
11116 * in the normal case.
11117 */
11118
11119 start += copy_size;
11120 vm_map_lock(dst_map);
11121 if (version.main_timestamp == dst_map->timestamp &&
11122 copy_size != 0) {
11123 /* We can safely use saved tmp_entry value */
11124
11125 if (tmp_entry->map_aligned &&
11126 !VM_MAP_PAGE_ALIGNED(
11127 start,
11128 VM_MAP_PAGE_MASK(dst_map))) {
11129 /* no longer map-aligned */
11130 tmp_entry->map_aligned = FALSE;
11131 }
11132 vm_map_clip_end(map: dst_map, entry: tmp_entry, endaddr: start);
11133 tmp_entry = tmp_entry->vme_next;
11134 } else {
11135 /* Must do lookup of tmp_entry */
11136
11137RetryLookup:
11138 if (!vm_map_lookup_entry(map: dst_map, address: start, entry: &tmp_entry)) {
11139 vm_map_unlock(dst_map);
11140 return KERN_INVALID_ADDRESS;
11141 }
11142 if (tmp_entry->map_aligned &&
11143 !VM_MAP_PAGE_ALIGNED(
11144 start,
11145 VM_MAP_PAGE_MASK(dst_map))) {
11146 /* no longer map-aligned */
11147 tmp_entry->map_aligned = FALSE;
11148 }
11149 vm_map_clip_start(map: dst_map, entry: tmp_entry, startaddr: start);
11150 }
11151 }
11152 }/* while */
11153
11154 return KERN_SUCCESS;
11155}/* vm_map_copy_overwrite_aligned */
11156
11157/*
11158 * Routine: vm_map_copyin_kernel_buffer [internal use only]
11159 *
11160 * Description:
11161 * Copy in data to a kernel buffer from space in the
11162 * source map. The original space may be optionally
11163 * deallocated.
11164 *
11165 * If successful, returns a new copy object.
11166 */
11167static kern_return_t
11168vm_map_copyin_kernel_buffer(
11169 vm_map_t src_map,
11170 vm_map_offset_t src_addr,
11171 vm_map_size_t len,
11172 boolean_t src_destroy,
11173 vm_map_copy_t *copy_result)
11174{
11175 kern_return_t kr;
11176 vm_map_copy_t copy;
11177 void *kdata;
11178
11179 if (len > msg_ool_size_small) {
11180 return KERN_INVALID_ARGUMENT;
11181 }
11182
11183 kdata = kalloc_data(len, Z_WAITOK);
11184 if (kdata == NULL) {
11185 return KERN_RESOURCE_SHORTAGE;
11186 }
11187 kr = copyinmap(map: src_map, fromaddr: src_addr, todata: kdata, length: (vm_size_t)len);
11188 if (kr != KERN_SUCCESS) {
11189 kfree_data(kdata, len);
11190 return kr;
11191 }
11192
11193 copy = vm_map_copy_allocate(VM_MAP_COPY_KERNEL_BUFFER);
11194 copy->cpy_kdata = kdata;
11195 copy->size = len;
11196 copy->offset = 0;
11197
11198 if (src_destroy) {
11199 vmr_flags_t flags = VM_MAP_REMOVE_INTERRUPTIBLE;
11200
11201 if (src_map == kernel_map) {
11202 flags |= VM_MAP_REMOVE_KUNWIRE;
11203 }
11204
11205 (void)vm_map_remove_guard(map: src_map,
11206 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
11207 vm_map_round_page(src_addr + len, VM_MAP_PAGE_MASK(src_map)),
11208 flags, KMEM_GUARD_NONE);
11209 }
11210
11211 *copy_result = copy;
11212 return KERN_SUCCESS;
11213}
11214
11215/*
11216 * Routine: vm_map_copyout_kernel_buffer [internal use only]
11217 *
11218 * Description:
11219 * Copy out data from a kernel buffer into space in the
11220 *	destination map.  The space may optionally be dynamically
11221 * allocated.
11222 *
11223 * If successful, consumes the copy object.
11224 * Otherwise, the caller is responsible for it.
11225 *
11226 * Callers of this function must call vm_map_copy_require on
11227 * previously created vm_map_copy_t or pass a newly created
11228 * one to ensure that it hasn't been forged.
11229 */
11230static int vm_map_copyout_kernel_buffer_failures = 0;
11231static kern_return_t
11232vm_map_copyout_kernel_buffer(
11233 vm_map_t map,
11234 vm_map_address_t *addr, /* IN/OUT */
11235 vm_map_copy_t copy,
11236 vm_map_size_t copy_size,
11237 boolean_t overwrite,
11238 boolean_t consume_on_success)
11239{
11240 kern_return_t kr = KERN_SUCCESS;
11241 thread_t thread = current_thread();
11242
11243 assert(copy->size == copy_size);
11244
11245 /*
11246 * check for corrupted vm_map_copy structure
11247 */
11248 if (copy_size > msg_ool_size_small || copy->offset) {
11249 panic("Invalid vm_map_copy_t sz:%lld, ofst:%lld",
11250 (long long)copy->size, (long long)copy->offset);
11251 }
11252
11253 if (!overwrite) {
11254 /*
11255 * Allocate space in the target map for the data
11256 */
11257 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11258
11259 if (map == kernel_map) {
11260 vmk_flags.vmkf_range_id = KMEM_RANGE_ID_DATA;
11261 }
11262
11263 *addr = 0;
11264 kr = vm_map_enter(map,
11265 address: addr,
11266 vm_map_round_page(copy_size,
11267 VM_MAP_PAGE_MASK(map)),
11268 mask: (vm_map_offset_t) 0,
11269 vmk_flags,
11270 VM_OBJECT_NULL,
11271 offset: (vm_object_offset_t) 0,
11272 FALSE,
11273 VM_PROT_DEFAULT,
11274 VM_PROT_ALL,
11275 VM_INHERIT_DEFAULT);
11276 if (kr != KERN_SUCCESS) {
11277 return kr;
11278 }
11279#if KASAN
11280 if (map->pmap == kernel_pmap) {
11281 kasan_notify_address(*addr, copy->size);
11282 }
11283#endif
11284 }
11285
11286 /*
11287 * Copyout the data from the kernel buffer to the target map.
11288 */
11289 if (thread->map == map) {
11290 /*
11291 * If the target map is the current map, just do
11292 * the copy.
11293 */
11294 assert((vm_size_t)copy_size == copy_size);
11295 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11296 kr = KERN_INVALID_ADDRESS;
11297 }
11298 } else {
11299 vm_map_t oldmap;
11300
11301 /*
11302 * If the target map is another map, assume the
11303 * target's address space identity for the duration
11304 * of the copy.
11305 */
11306 vm_map_reference(map);
11307 oldmap = vm_map_switch(map);
11308
11309 assert((vm_size_t)copy_size == copy_size);
11310 if (copyout(copy->cpy_kdata, *addr, (vm_size_t)copy_size)) {
11311 vm_map_copyout_kernel_buffer_failures++;
11312 kr = KERN_INVALID_ADDRESS;
11313 }
11314
11315 (void) vm_map_switch(map: oldmap);
11316 vm_map_deallocate(map);
11317 }
11318
11319 if (kr != KERN_SUCCESS) {
11320 /* the copy failed, clean up */
11321 if (!overwrite) {
11322 /*
11323 * Deallocate the space we allocated in the target map.
11324 */
11325 (void) vm_map_remove(map,
11326 vm_map_trunc_page(*addr,
11327 VM_MAP_PAGE_MASK(map)),
11328 vm_map_round_page((*addr +
11329 vm_map_round_page(copy_size,
11330 VM_MAP_PAGE_MASK(map))),
11331 VM_MAP_PAGE_MASK(map)));
11332 *addr = 0;
11333 }
11334 } else {
11335		/* copy was successful, discard the copy structure */
11336 if (consume_on_success) {
11337 kfree_data(copy->cpy_kdata, copy_size);
11338 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11339 }
11340 }
11341
11342 return kr;
11343}
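
/*
 * Sketch (disabled; "example_kernel_buffer_roundtrip" is hypothetical)
 * of the round trip the two kernel-buffer helpers above implement for
 * small out-of-line data: copy the source range into a kernel buffer,
 * then copy it out into freshly allocated space in a destination map.
 */
#if 0
static kern_return_t
example_kernel_buffer_roundtrip(
	vm_map_t         src_map,
	vm_map_offset_t  src_addr,
	vm_map_size_t    len,           /* must be <= msg_ool_size_small */
	vm_map_t         dst_map,
	vm_map_address_t *dst_addr)     /* OUT */
{
	vm_map_copy_t copy;
	kern_return_t kr;

	kr = vm_map_copyin_kernel_buffer(src_map, src_addr, len,
	    FALSE,                      /* keep the source mapping */
	    &copy);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	*dst_addr = 0;
	kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr, copy,
	    copy->size,
	    FALSE,                      /* allocate new space */
	    TRUE);                      /* consume copy on success */
	if (kr != KERN_SUCCESS) {
		/* the copy was not consumed; drop it ourselves */
		vm_map_copy_discard(copy);
	}
	return kr;
}
#endif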
11344
11345/*
11346 * Routine: vm_map_copy_insert [internal use only]
11347 *
11348 * Description:
11349 * Link a copy chain ("copy") into a map at the
11350 * specified location (after "where").
11351 *
11352 * Callers of this function must call vm_map_copy_require on
11353 * previously created vm_map_copy_t or pass a newly created
11354 * one to ensure that it hasn't been forged.
11355 * Side effects:
11356 * The copy chain is destroyed.
11357 */
11358static void
11359vm_map_copy_insert(
11360 vm_map_t map,
11361 vm_map_entry_t after_where,
11362 vm_map_copy_t copy)
11363{
11364 vm_map_entry_t entry;
11365
11366 while (vm_map_copy_first_entry(copy) != vm_map_copy_to_entry(copy)) {
11367 entry = vm_map_copy_first_entry(copy);
11368 vm_map_copy_entry_unlink(copy, entry);
11369 vm_map_store_entry_link(map, after_where, entry,
11370 VM_MAP_KERNEL_FLAGS_NONE);
11371 after_where = entry;
11372 }
11373 zfree_id(ZONE_ID_VM_MAP_COPY, copy);
11374}
11375
11376/*
11377 * Callers of this function must call vm_map_copy_require on
11378 * previously created vm_map_copy_t or pass a newly created
11379 * one to ensure that it hasn't been forged.
11380 */
11381void
11382vm_map_copy_remap(
11383 vm_map_t map,
11384 vm_map_entry_t where,
11385 vm_map_copy_t copy,
11386 vm_map_offset_t adjustment,
11387 vm_prot_t cur_prot,
11388 vm_prot_t max_prot,
11389 vm_inherit_t inheritance)
11390{
11391 vm_map_entry_t copy_entry, new_entry;
11392
11393 for (copy_entry = vm_map_copy_first_entry(copy);
11394 copy_entry != vm_map_copy_to_entry(copy);
11395 copy_entry = copy_entry->vme_next) {
11396 /* get a new VM map entry for the map */
11397 new_entry = vm_map_entry_create(map);
11398 /* copy the "copy entry" to the new entry */
11399 vm_map_entry_copy(map, new: new_entry, old: copy_entry);
11400 /* adjust "start" and "end" */
11401 new_entry->vme_start += adjustment;
11402 new_entry->vme_end += adjustment;
11403 /* clear some attributes */
11404 new_entry->inheritance = inheritance;
11405 new_entry->protection = cur_prot;
11406 new_entry->max_protection = max_prot;
11407 new_entry->behavior = VM_BEHAVIOR_DEFAULT;
11408 /* take an extra reference on the entry's "object" */
11409 if (new_entry->is_sub_map) {
11410 assert(!new_entry->use_pmap); /* not nested */
11411 vm_map_reference(VME_SUBMAP(new_entry));
11412 } else {
11413 vm_object_reference(VME_OBJECT(new_entry));
11414 }
11415 /* insert the new entry in the map */
11416 vm_map_store_entry_link(map, after_where: where, entry: new_entry,
11417 VM_MAP_KERNEL_FLAGS_NONE);
11418 /* continue inserting the "copy entries" after the new entry */
11419 where = new_entry;
11420 }
11421}
11422
11423
11424/*
11425 * Returns true if *size matches (or is in the range of) copy->size.
11426 * Upon returning true, the *size field is updated with the actual size of the
11427 * copy object (may be different for VM_MAP_COPY_ENTRY_LIST types)
11428 */
11429boolean_t
11430vm_map_copy_validate_size(
11431 vm_map_t dst_map,
11432 vm_map_copy_t copy,
11433 vm_map_size_t *size)
11434{
11435 if (copy == VM_MAP_COPY_NULL) {
11436 return FALSE;
11437 }
11438
11439 /*
11440 * Assert that the vm_map_copy is coming from the right
11441 * zone and hasn't been forged
11442 */
11443 vm_map_copy_require(copy);
11444
11445 vm_map_size_t copy_sz = copy->size;
11446 vm_map_size_t sz = *size;
11447 switch (copy->type) {
11448 case VM_MAP_COPY_KERNEL_BUFFER:
11449 if (sz == copy_sz) {
11450 return TRUE;
11451 }
11452 break;
11453 case VM_MAP_COPY_ENTRY_LIST:
11454 /*
11455 * potential page-size rounding prevents us from exactly
11456 * validating this flavor of vm_map_copy, but we can at least
11457 * assert that it's within a range.
11458 */
11459 if (copy_sz >= sz &&
11460 copy_sz <= vm_map_round_page(sz, VM_MAP_PAGE_MASK(dst_map))) {
11461 *size = copy_sz;
11462 return TRUE;
11463 }
11464 break;
11465 default:
11466 break;
11467 }
11468 return FALSE;
11469}
11470
11471/*
11472 * Routine: vm_map_copyout_size
11473 *
11474 * Description:
11475 * Copy out a copy chain ("copy") into newly-allocated
11476 * space in the destination map. Uses a prevalidated
11477 * size for the copy object (vm_map_copy_validate_size).
11478 *
11479 * If successful, consumes the copy object.
11480 * Otherwise, the caller is responsible for it.
11481 */
11482kern_return_t
11483vm_map_copyout_size(
11484 vm_map_t dst_map,
11485 vm_map_address_t *dst_addr, /* OUT */
11486 vm_map_copy_t copy,
11487 vm_map_size_t copy_size)
11488{
11489 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy_size,
11490 TRUE, /* consume_on_success */
11491 VM_PROT_DEFAULT,
11492 VM_PROT_ALL,
11493 VM_INHERIT_DEFAULT);
11494}
11495
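/*
 * Sketch (disabled; "example_validated_copyout" is hypothetical) of how
 * vm_map_copy_validate_size() and vm_map_copyout_size() above pair up:
 * validate a size supplied by an untrusted source against the copy
 * object, then copy out using the validated size.
 */
#if 0
static kern_return_t
example_validated_copyout(
	vm_map_t         dst_map,
	vm_map_copy_t    copy,
	vm_map_size_t    claimed_size,  /* e.g. from an IPC descriptor */
	vm_map_address_t *dst_addr)     /* OUT */
{
	vm_map_size_t size = claimed_size;

	if (!vm_map_copy_validate_size(dst_map, copy, &size)) {
		return KERN_FAILURE;
	}
	/* "size" now reflects the actual size of the copy object */
	return vm_map_copyout_size(dst_map, dst_addr, copy, size);
}
#endif
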
11496/*
11497 * Routine: vm_map_copyout
11498 *
11499 * Description:
11500 * Copy out a copy chain ("copy") into newly-allocated
11501 * space in the destination map.
11502 *
11503 * If successful, consumes the copy object.
11504 * Otherwise, the caller is responsible for it.
11505 */
11506kern_return_t
11507vm_map_copyout(
11508 vm_map_t dst_map,
11509 vm_map_address_t *dst_addr, /* OUT */
11510 vm_map_copy_t copy)
11511{
11512 return vm_map_copyout_internal(dst_map, dst_addr, copy, copy ? copy->size : 0,
11513 TRUE, /* consume_on_success */
11514 VM_PROT_DEFAULT,
11515 VM_PROT_ALL,
11516 VM_INHERIT_DEFAULT);
11517}
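/*
 * Usage sketch: a minimal copyin/copyout round trip, assuming valid
 * "src_map"/"dst_map" and a readable [src_addr, src_addr + len) range
 * (all caller-provided, hypothetical names):
 *
 *	vm_map_copy_t copy;
 *	vm_map_address_t dst_addr;
 *	kern_return_t kr;
 *
 *	kr = vm_map_copyin(src_map, src_addr, len, FALSE, &copy);
 *	if (kr == KERN_SUCCESS) {
 *		kr = vm_map_copyout(dst_map, &dst_addr, copy);
 *		if (kr != KERN_SUCCESS) {
 *			vm_map_copy_discard(copy); failed copyout does not consume "copy"
 *		}
 *	}
 */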
11518
11519kern_return_t
11520vm_map_copyout_internal(
11521 vm_map_t dst_map,
11522 vm_map_address_t *dst_addr, /* OUT */
11523 vm_map_copy_t copy,
11524 vm_map_size_t copy_size,
11525 boolean_t consume_on_success,
11526 vm_prot_t cur_protection,
11527 vm_prot_t max_protection,
11528 vm_inherit_t inheritance)
11529{
11530 vm_map_size_t size;
11531 vm_map_size_t adjustment;
11532 vm_map_offset_t start;
11533 vm_object_offset_t vm_copy_start;
11534 vm_map_entry_t last;
11535 vm_map_entry_t entry;
11536 vm_map_copy_t original_copy;
11537 kern_return_t kr;
11538 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_ANYWHERE();
11539
11540 /*
11541 * Check for null copy object.
11542 */
11543
11544 if (copy == VM_MAP_COPY_NULL) {
11545 *dst_addr = 0;
11546 return KERN_SUCCESS;
11547 }
11548
11549 /*
11550 * Assert that the vm_map_copy is coming from the right
11551 * zone and hasn't been forged
11552 */
11553 vm_map_copy_require(copy);
11554
11555 if (copy->size != copy_size) {
11556 *dst_addr = 0;
11557 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SIZE_ERROR), KERN_FAILURE /* arg */);
11558 return KERN_FAILURE;
11559 }
11560
11561 /*
11562 * Check for special kernel buffer allocated
11563 * by new_ipc_kmsg_copyin.
11564 */
11565
11566 if (copy->type == VM_MAP_COPY_KERNEL_BUFFER) {
11567 kr = vm_map_copyout_kernel_buffer(dst_map, dst_addr,
11568 copy, copy_size, FALSE,
11569 consume_on_success);
11570 if (kr) {
11571 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_KERNEL_BUFFER_ERROR), kr /* arg */);
11572 }
11573 return kr;
11574 }
11575
11576 original_copy = copy;
11577 if (copy->cpy_hdr.page_shift != VM_MAP_PAGE_SHIFT(dst_map)) {
11578 vm_map_copy_t target_copy;
11579 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
11580
11581 target_copy = VM_MAP_COPY_NULL;
11582 DEBUG4K_ADJUST("adjusting...\n");
11583 kr = vm_map_copy_adjust_to_target(
11584 copy,
11585 0, /* offset */
11586 copy->size, /* size */
11587 dst_map,
11588 TRUE, /* copy */
11589 &target_copy,
11590 &overmap_start,
11591 &overmap_end,
11592 &trimmed_start);
11593 if (kr != KERN_SUCCESS) {
11594 DEBUG4K_COPY("adjust failed 0x%x\n", kr);
11595 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_ADJUSTING_ERROR), kr /* arg */);
11596 return kr;
11597 }
11598 DEBUG4K_COPY("copy %p (%d 0x%llx 0x%llx) dst_map %p (%d) target_copy %p (%d 0x%llx 0x%llx) overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx\n", copy, copy->cpy_hdr.page_shift, copy->offset, (uint64_t)copy->size, dst_map, VM_MAP_PAGE_SHIFT(dst_map), target_copy, target_copy->cpy_hdr.page_shift, target_copy->offset, (uint64_t)target_copy->size, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start);
11599 if (target_copy != copy) {
11600 copy = target_copy;
11601 }
11602 copy_size = copy->size;
11603 }
11604
11605 /*
11606 * Find space for the data
11607 */
11608
11609 vm_copy_start = vm_map_trunc_page((vm_map_size_t)copy->offset,
11610 VM_MAP_COPY_PAGE_MASK(copy));
11611 size = vm_map_round_page((vm_map_size_t)copy->offset + copy_size,
11612 VM_MAP_COPY_PAGE_MASK(copy))
11613 - vm_copy_start;
11614
11615 vm_map_kernel_flags_update_range_id(&vmk_flags, dst_map);
11616
11617 vm_map_lock(dst_map);
11618 kr = vm_map_locate_space(dst_map, size, 0, vmk_flags,
11619 &start, &last);
11620 if (kr != KERN_SUCCESS) {
11621 vm_map_unlock(dst_map);
11622 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_COPYOUT_INTERNAL_SPACE_ERROR), kr /* arg */);
11623 return kr;
11624 }
11625
11626 adjustment = start - vm_copy_start;
11627 if (!consume_on_success) {
11628 /*
11629 * We're not allowed to consume "copy", so we'll have to
11630 * copy its map entries into the destination map below.
11631 * No need to re-allocate map entries from the correct
11632 * (pageable or not) zone, since we'll get new map entries
11633 * during the transfer.
11634 * We'll also adjust the map entries' "start" and "end"
11635 * during the transfer, to keep "copy"'s entries consistent
11636 * with its "offset".
11637 */
11638 goto after_adjustments;
11639 }
11640
11641 /*
11642 * Since we're going to just drop the map
11643 * entries from the copy into the destination
11644 * map, they must come from the same pool.
11645 */
11646
11647 if (copy->cpy_hdr.entries_pageable != dst_map->hdr.entries_pageable) {
11648 /*
11649 * Mismatches occur when dealing with the default
11650 * pager.
11651 */
11652 vm_map_entry_t next, new;
11653
11654 /*
11655 * Find the zone that the copies were allocated from
11656 */
11657
11658 entry = vm_map_copy_first_entry(copy);
11659
11660 /*
11661 * Reinitialize the copy so that vm_map_copy_entry_link
11662 * will work.
11663 */
11664 vm_map_store_copy_reset(copy, entry);
11665 copy->cpy_hdr.entries_pageable = dst_map->hdr.entries_pageable;
11666
11667 /*
11668 * Copy each entry.
11669 */
11670 while (entry != vm_map_copy_to_entry(copy)) {
11671 new = vm_map_copy_entry_create(copy);
11672 vm_map_entry_copy_full(new, entry);
11673 new->vme_no_copy_on_read = FALSE;
11674 assert(!new->iokit_acct);
11675 if (new->is_sub_map) {
11676 /* clr address space specifics */
11677 new->use_pmap = FALSE;
11678 }
11679 vm_map_copy_entry_link(copy,
11680 vm_map_copy_last_entry(copy),
11681 new);
11682 next = entry->vme_next;
11683 vm_map_entry_dispose(entry);
11684 entry = next;
11685 }
11686 }
11687
11688 /*
11689 * Adjust the addresses in the copy chain, and
11690 * reset the region attributes.
11691 */
11692
11693 for (entry = vm_map_copy_first_entry(copy);
11694 entry != vm_map_copy_to_entry(copy);
11695 entry = entry->vme_next) {
11696 if (VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT) {
11697 /*
11698 * We're injecting this copy entry into a map that
11699 * has the standard page alignment, so clear
11700 * "map_aligned" (which might have been inherited
11701 * from the original map entry).
11702 */
11703 entry->map_aligned = FALSE;
11704 }
11705
11706 entry->vme_start += adjustment;
11707 entry->vme_end += adjustment;
11708
11709 if (entry->map_aligned) {
11710 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start,
11711 VM_MAP_PAGE_MASK(dst_map)));
11712 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end,
11713 VM_MAP_PAGE_MASK(dst_map)));
11714 }
11715
11716 entry->inheritance = VM_INHERIT_DEFAULT;
11717 entry->protection = VM_PROT_DEFAULT;
11718 entry->max_protection = VM_PROT_ALL;
11719 entry->behavior = VM_BEHAVIOR_DEFAULT;
11720
11721 /*
11722 * If the entry is now wired,
11723 * map the pages into the destination map.
11724 */
11725 if (entry->wired_count != 0) {
11726 vm_map_offset_t va;
11727 vm_object_offset_t offset;
11728 vm_object_t object;
11729 vm_prot_t prot;
11730 int type_of_fault;
11731 uint8_t object_lock_type = OBJECT_LOCK_EXCLUSIVE;
11732
11733 /* TODO4K would need to use actual page size */
11734 assert(VM_MAP_PAGE_SHIFT(dst_map) == PAGE_SHIFT);
11735
11736 object = VME_OBJECT(entry);
11737 offset = VME_OFFSET(entry);
11738 va = entry->vme_start;
11739
11740 pmap_pageable(dst_map->pmap,
11741 entry->vme_start,
11742 entry->vme_end,
11743 TRUE);
11744
11745 while (va < entry->vme_end) {
11746 vm_page_t m;
11747 struct vm_object_fault_info fault_info = {};
11748
11749 /*
11750 * Look up the page in the object.
11751 * Assert that the page will be found in the
11752 * top object:
11753 * either
11754 * the object was newly created by
11755 * vm_object_copy_slowly, and has
11756 * copies of all of the pages from
11757 * the source object
11758 * or
11759 * the object was moved from the old
11760 * map entry; because the old map
11761 * entry was wired, all of the pages
11762 * were in the top-level object.
11763 * (XXX not true if we wire pages for
11764 * reading)
11765 */
11766 vm_object_lock(object);
11767
11768 m = vm_page_lookup(object, offset);
11769 if (m == VM_PAGE_NULL || !VM_PAGE_WIRED(m) ||
11770 m->vmp_absent) {
11771 panic("vm_map_copyout: wiring %p", m);
11772 }
11773
11774 prot = entry->protection;
11775
11776 if (override_nx(dst_map, VME_ALIAS(entry)) &&
11777 prot) {
11778 prot |= VM_PROT_EXECUTE;
11779 }
11780
11781 type_of_fault = DBG_CACHE_HIT_FAULT;
11782
11783 fault_info.user_tag = VME_ALIAS(entry);
11784 fault_info.pmap_options = 0;
11785 if (entry->iokit_acct ||
11786 (!entry->is_sub_map && !entry->use_pmap)) {
11787 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
11788 }
11789 if (entry->vme_xnu_user_debug &&
11790 !VM_PAGE_OBJECT(m)->code_signed) {
11791 /*
11792 * Modified code-signed executable
11793 * region: this page does not belong
11794 * to a code-signed VM object, so it
11795 * must have been copied and should
11796 * therefore be typed XNU_USER_DEBUG
11797 * rather than XNU_USER_EXEC.
11798 */
11799 fault_info.pmap_options |= PMAP_OPTIONS_XNU_USER_DEBUG;
11800 }
11801
11802 vm_fault_enter(m,
11803 dst_map->pmap,
11804 va,
11805 PAGE_SIZE, 0,
11806 prot,
11807 prot,
11808 VM_PAGE_WIRED(m),
11809 FALSE, /* change_wiring */
11810 VM_KERN_MEMORY_NONE, /* tag - not wiring */
11811 &fault_info,
11812 NULL, /* need_retry */
11813 &type_of_fault,
11814 &object_lock_type); /* Exclusive mode lock. Will remain unchanged. */
11815
11816 vm_object_unlock(object);
11817
11818 offset += PAGE_SIZE_64;
11819 va += PAGE_SIZE;
11820 }
11821 }
11822 }
11823
11824after_adjustments:
11825
11826 /*
11827 * Correct the page alignment for the result
11828 */
11829
11830 *dst_addr = start + (copy->offset - vm_copy_start);
11831
11832#if KASAN
11833 kasan_notify_address(*dst_addr, size);
11834#endif
11835
11836 /*
11837 * Update the hints and the map size
11838 */
11839
11840 if (consume_on_success) {
11841 SAVE_HINT_MAP_WRITE(dst_map, vm_map_copy_last_entry(copy));
11842 } else {
11843 SAVE_HINT_MAP_WRITE(dst_map, last);
11844 }
11845
11846 dst_map->size += size;
11847
11848 /*
11849 * Link in the copy
11850 */
11851
11852 if (consume_on_success) {
11853 vm_map_copy_insert(dst_map, last, copy);
11854 if (copy != original_copy) {
11855 vm_map_copy_discard(original_copy);
11856 original_copy = VM_MAP_COPY_NULL;
11857 }
11858 } else {
11859 vm_map_copy_remap(dst_map, last, copy, adjustment,
11860 cur_protection, max_protection,
11861 inheritance);
11862 if (copy != original_copy && original_copy != VM_MAP_COPY_NULL) {
11863 vm_map_copy_discard(copy);
11864 copy = original_copy;
11865 }
11866 }
11867
11868
11869 vm_map_unlock(dst_map);
11870
11871 /*
11872 * XXX If wiring_required, call vm_map_pageable
11873 */
11874
11875 return KERN_SUCCESS;
11876}
11877
11878/*
11879 * Routine: vm_map_copyin
11880 *
11881 * Description:
11882 * see vm_map_copyin_common. Exported via Unsupported.exports.
11883 *
11884 */
11885
11886#undef vm_map_copyin
11887
11888kern_return_t
11889vm_map_copyin(
11890 vm_map_t src_map,
11891 vm_map_address_t src_addr,
11892 vm_map_size_t len,
11893 boolean_t src_destroy,
11894 vm_map_copy_t *copy_result) /* OUT */
11895{
11896 return vm_map_copyin_common(src_map, src_addr, len, src_destroy,
11897 FALSE, copy_result, FALSE);
11898}
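/*
 * Equivalence sketch: with src_destroy == TRUE, the wrapper above behaves
 * like the flags-based internal form (see vm_map_copyin_common below),
 * assuming the same hypothetical caller-side arguments:
 *
 *	kr = vm_map_copyin_internal(src_map, src_addr, len,
 *	    VM_MAP_COPYIN_SRC_DESTROY, &copy);
 */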
11899
11900/*
11901 * Routine: vm_map_copyin_common
11902 *
11903 * Description:
11904 * Copy the specified region (src_addr, len) from the
11905 * source address space (src_map), possibly removing
11906 * the region from the source address space (src_destroy).
11907 *
11908 * Returns:
11909 * A vm_map_copy_t object (copy_result), suitable for
11910 * insertion into another address space (using vm_map_copyout),
11911 * copying over another address space region (using
11912 * vm_map_copy_overwrite). If the copy is unused, it
11913 * should be destroyed (using vm_map_copy_discard).
11914 *
11915 * In/out conditions:
11916 * The source map should not be locked on entry.
11917 */
11918
11919typedef struct submap_map {
11920 vm_map_t parent_map;
11921 vm_map_offset_t base_start;
11922 vm_map_offset_t base_end;
11923 vm_map_size_t base_len;
11924 struct submap_map *next;
11925} submap_map_t;
11926
11927kern_return_t
11928vm_map_copyin_common(
11929 vm_map_t src_map,
11930 vm_map_address_t src_addr,
11931 vm_map_size_t len,
11932 boolean_t src_destroy,
11933 __unused boolean_t src_volatile,
11934 vm_map_copy_t *copy_result, /* OUT */
11935 boolean_t use_maxprot)
11936{
11937 int flags;
11938
11939 flags = 0;
11940 if (src_destroy) {
11941 flags |= VM_MAP_COPYIN_SRC_DESTROY;
11942 }
11943 if (use_maxprot) {
11944 flags |= VM_MAP_COPYIN_USE_MAXPROT;
11945 }
11946 return vm_map_copyin_internal(src_map,
11947 src_addr,
11948 len,
11949 flags,
11950 copy_result);
11951}
11952kern_return_t
11953vm_map_copyin_internal(
11954 vm_map_t src_map,
11955 vm_map_address_t src_addr,
11956 vm_map_size_t len,
11957 int flags,
11958 vm_map_copy_t *copy_result) /* OUT */
11959{
11960 vm_map_entry_t tmp_entry; /* Result of last map lookup --
11961 * in multi-level lookup, this
11962 * entry contains the actual
11963 * vm_object/offset.
11964 */
11965 vm_map_entry_t new_entry = VM_MAP_ENTRY_NULL; /* Map entry for copy */
11966
11967 vm_map_offset_t src_start; /* Start of current entry --
11968 * where copy is taking place now
11969 */
11970 vm_map_offset_t src_end; /* End of entire region to be
11971 * copied */
11972 vm_map_offset_t src_base;
11973 vm_map_t base_map = src_map;
11974 boolean_t map_share = FALSE;
11975 submap_map_t *parent_maps = NULL;
11976
11977 vm_map_copy_t copy; /* Resulting copy */
11978 vm_map_address_t copy_addr;
11979 vm_map_size_t copy_size;
11980 boolean_t src_destroy;
11981 boolean_t use_maxprot;
11982 boolean_t preserve_purgeable;
11983 boolean_t entry_was_shared;
11984 vm_map_entry_t saved_src_entry;
11985
11986
11987 if (flags & ~VM_MAP_COPYIN_ALL_FLAGS) {
11988 return KERN_INVALID_ARGUMENT;
11989 }
11990
11991#if CONFIG_KERNEL_TAGGING
11992 if (src_map->pmap == kernel_pmap) {
11993 src_addr = vm_memtag_canonicalize_address(src_addr);
11994 }
11995#endif /* CONFIG_KERNEL_TAGGING */
11996
11997 src_destroy = (flags & VM_MAP_COPYIN_SRC_DESTROY) ? TRUE : FALSE;
11998 use_maxprot = (flags & VM_MAP_COPYIN_USE_MAXPROT) ? TRUE : FALSE;
11999 preserve_purgeable =
12000 (flags & VM_MAP_COPYIN_PRESERVE_PURGEABLE) ? TRUE : FALSE;
12001
12002 /*
12003 * Check for copies of zero bytes.
12004 */
12005
12006 if (len == 0) {
12007 *copy_result = VM_MAP_COPY_NULL;
12008 return KERN_SUCCESS;
12009 }
12010
12011 /*
12012 * Check that the end address doesn't overflow
12013 */
12014 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12015 return KERN_INVALID_ADDRESS;
12016 }
12017 src_end = src_addr + len;
12018 if (src_end < src_addr) {
12019 return KERN_INVALID_ADDRESS;
12020 }
12021
12022 /*
12023 * Compute (page aligned) start and end of region
12024 */
12025 src_start = vm_map_trunc_page(src_addr,
12026 VM_MAP_PAGE_MASK(src_map));
12027 src_end = vm_map_round_page(src_end,
12028 VM_MAP_PAGE_MASK(src_map));
12029 if (src_end < src_addr) {
12030 return KERN_INVALID_ADDRESS;
12031 }
12032
12033 /*
12034 * If the copy is sufficiently small, use a kernel buffer instead
12035 * of making a virtual copy. The theory being that the cost of
12036 * setting up VM (and taking C-O-W faults) dominates the copy costs
12037 * for small regions.
12038 */
12039 if ((len <= msg_ool_size_small) &&
12040 !use_maxprot &&
12041 !preserve_purgeable &&
12042 !(flags & VM_MAP_COPYIN_ENTRY_LIST) &&
12043 /*
12044 * Since the "msg_ool_size_small" threshold was increased and
12045 * vm_map_copyin_kernel_buffer() doesn't handle accesses beyond the
12046 * address space limits, we revert to doing a virtual copy if the
12047 * copied range goes beyond those limits. Otherwise, mach_vm_read()
12048 * of the commpage would now fail when it used to work.
12049 */
12050 (src_start >= vm_map_min(src_map) &&
12051 src_start < vm_map_max(src_map) &&
12052 src_end >= vm_map_min(src_map) &&
12053 src_end < vm_map_max(src_map))) {
12054 return vm_map_copyin_kernel_buffer(src_map, src_addr, len,
12055 src_destroy, copy_result);
12056 }
12057
12058 /*
12059 * Allocate a header element for the list.
12060 *
12061 * Use the start and end in the header to
12062 * remember the endpoints prior to rounding.
12063 */
12064
12065 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12066 copy->cpy_hdr.entries_pageable = TRUE;
12067 copy->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(src_map);
12068 copy->offset = src_addr;
12069 copy->size = len;
12070
12071 new_entry = vm_map_copy_entry_create(copy);
12072
12073#define RETURN(x) \
12074 MACRO_BEGIN \
12075 vm_map_unlock(src_map); \
12076 if(src_map != base_map) \
12077 vm_map_deallocate(src_map); \
12078 if (new_entry != VM_MAP_ENTRY_NULL) \
12079 vm_map_copy_entry_dispose(new_entry); \
12080 vm_map_copy_discard(copy); \
12081 { \
12082 submap_map_t *_ptr; \
12083 \
12084 for(_ptr = parent_maps; _ptr != NULL; _ptr = parent_maps) { \
12085 parent_maps=parent_maps->next; \
12086 if (_ptr->parent_map != base_map) \
12087 vm_map_deallocate(_ptr->parent_map); \
12088 kfree_type(submap_map_t, _ptr); \
12089 } \
12090 } \
12091 MACRO_RETURN(x); \
12092 MACRO_END
12093
12094 /*
12095 * Find the beginning of the region.
12096 */
12097
12098 vm_map_lock(src_map);
12099
12100 /*
12101 * Lookup the original "src_addr" rather than the truncated
12102 * "src_start", in case "src_start" falls in a non-map-aligned
12103 * map entry *before* the map entry that contains "src_addr"...
12104 */
12105 if (!vm_map_lookup_entry(src_map, src_addr, &tmp_entry)) {
12106 RETURN(KERN_INVALID_ADDRESS);
12107 }
12108 if (!tmp_entry->is_sub_map) {
12109 /*
12110 * ... but clip to the map-rounded "src_start" rather than
12111 * "src_addr" to preserve map-alignment. We'll adjust the
12112 * first copy entry at the end, if needed.
12113 */
12114 vm_map_clip_start(src_map, tmp_entry, src_start);
12115 }
12116 if (src_start < tmp_entry->vme_start) {
12117 /*
12118 * Move "src_start" up to the start of the
12119 * first map entry to copy.
12120 */
12121 src_start = tmp_entry->vme_start;
12122 }
12123 /* set for later submap fix-up */
12124 copy_addr = src_start;
12125
12126 /*
12127 * Go through entries until we get to the end.
12128 */
12129
12130 while (TRUE) {
12131 vm_map_entry_t src_entry = tmp_entry; /* Top-level entry */
12132 vm_map_size_t src_size; /* Size of source
12133 * map entry (in both
12134 * maps)
12135 */
12136
12137 vm_object_t src_object; /* Object to copy */
12138 vm_object_offset_t src_offset;
12139
12140 vm_object_t new_copy_object;/* vm_object_copy_* result */
12141
12142 boolean_t src_needs_copy; /* Should source map
12143 * be made read-only
12144 * for copy-on-write?
12145 */
12146
12147 boolean_t new_entry_needs_copy; /* Will new entry be COW? */
12148
12149 boolean_t was_wired; /* Was source wired? */
12150 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
12151 vm_map_version_t version; /* Version before locks
12152 * dropped to make copy
12153 */
12154 kern_return_t result; /* Return value from
12155 * copy_strategically.
12156 */
12157 while (tmp_entry->is_sub_map) {
12158 vm_map_size_t submap_len;
12159 submap_map_t *ptr;
12160
12161 ptr = kalloc_type(submap_map_t, Z_WAITOK);
12162 ptr->next = parent_maps;
12163 parent_maps = ptr;
12164 ptr->parent_map = src_map;
12165 ptr->base_start = src_start;
12166 ptr->base_end = src_end;
12167 submap_len = tmp_entry->vme_end - src_start;
12168 if (submap_len > (src_end - src_start)) {
12169 submap_len = src_end - src_start;
12170 }
12171 ptr->base_len = submap_len;
12172
12173 src_start -= tmp_entry->vme_start;
12174 src_start += VME_OFFSET(tmp_entry);
12175 src_end = src_start + submap_len;
12176 src_map = VME_SUBMAP(tmp_entry);
12177 vm_map_lock(src_map);
12178 /* keep an outstanding reference for all maps in */
12179 /* the parents tree except the base map */
12180 vm_map_reference(src_map);
12181 vm_map_unlock(ptr->parent_map);
12182 if (!vm_map_lookup_entry(
12183 src_map, src_start, &tmp_entry)) {
12184 RETURN(KERN_INVALID_ADDRESS);
12185 }
12186 map_share = TRUE;
12187 if (!tmp_entry->is_sub_map) {
12188 vm_map_clip_start(src_map, tmp_entry, src_start);
12189 }
12190 src_entry = tmp_entry;
12191 }
12192 /* we are now in the lowest level submap... */
12193
12194 if ((VME_OBJECT(tmp_entry) != VM_OBJECT_NULL) &&
12195 (VME_OBJECT(tmp_entry)->phys_contiguous)) {
12196 /* This is not supported for now. In the future */
12197 /* we will need to detect the phys_contig */
12198 /* condition and then upgrade copy_slowly */
12199 /* to do a physical copy from the device-memory- */
12200 /* based object. We can piggy-back off of */
12201 /* the was_wired boolean to set up the */
12202 /* proper handling. */
12203 RETURN(KERN_PROTECTION_FAILURE);
12204 }
12205 /*
12206 * Create a new address map entry to hold the result.
12207 * Fill in the fields from the appropriate source entries.
12208 * We must unlock the source map to do this if we need
12209 * to allocate a map entry.
12210 */
12211 if (new_entry == VM_MAP_ENTRY_NULL) {
12212 version.main_timestamp = src_map->timestamp;
12213 vm_map_unlock(src_map);
12214
12215 new_entry = vm_map_copy_entry_create(copy);
12216
12217 vm_map_lock(src_map);
12218 if ((version.main_timestamp + 1) != src_map->timestamp) {
12219 if (!vm_map_lookup_entry(src_map, src_start,
12220 &tmp_entry)) {
12221 RETURN(KERN_INVALID_ADDRESS);
12222 }
12223 if (!tmp_entry->is_sub_map) {
12224 vm_map_clip_start(src_map, tmp_entry, src_start);
12225 }
12226 continue; /* restart w/ new tmp_entry */
12227 }
12228 }
12229
12230 /*
12231 * Verify that the region can be read.
12232 */
12233 if (((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE &&
12234 !use_maxprot) ||
12235 (src_entry->max_protection & VM_PROT_READ) == 0) {
12236 RETURN(KERN_PROTECTION_FAILURE);
12237 }
12238
12239 /*
12240 * Clip against the endpoints of the entire region.
12241 */
12242
12243 vm_map_clip_end(src_map, src_entry, src_end);
12244
12245 src_size = src_entry->vme_end - src_start;
12246 src_object = VME_OBJECT(src_entry);
12247 src_offset = VME_OFFSET(src_entry);
12248 was_wired = (src_entry->wired_count != 0);
12249
12250 vm_map_entry_copy(src_map, new_entry, src_entry);
12251 if (new_entry->is_sub_map) {
12252 /* clr address space specifics */
12253 new_entry->use_pmap = FALSE;
12254 } else {
12255 /*
12256 * We're dealing with a copy-on-write operation,
12257 * so the resulting mapping should not inherit the
12258 * original mapping's accounting settings.
12259 * "iokit_acct" should have been cleared in
12260 * vm_map_entry_copy().
12261 * "use_pmap" should be reset to its default (TRUE)
12262 * so that the new mapping gets accounted for in
12263 * the task's memory footprint.
12264 */
12265 assert(!new_entry->iokit_acct);
12266 new_entry->use_pmap = TRUE;
12267 }
12268
12269 /*
12270 * Attempt non-blocking copy-on-write optimizations.
12271 */
12272
12273 /*
12274 * If we are destroying the source, and the object
12275 * is internal, we could move the object reference
12276 * from the source to the copy. The copy is
12277 * copy-on-write only if the source is.
12278 * We make another reference to the object, because
12279 * destroying the source entry will deallocate it.
12280 *
12281 * This memory transfer has to be atomic, (to prevent
12282 * the VM object from being shared or copied while
12283 * it's being moved here), so we could only do this
12284 * if we won't have to unlock the VM map until the
12285 * original mapping has been fully removed.
12286 */
12287
12288RestartCopy:
12289 if ((src_object == VM_OBJECT_NULL ||
12290 (!was_wired && !map_share && !tmp_entry->is_shared
12291 && !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT))) &&
12292 vm_object_copy_quickly(
12293 VME_OBJECT(new_entry),
12294 src_offset,
12295 src_size,
12296 &src_needs_copy,
12297 &new_entry_needs_copy)) {
12298 new_entry->needs_copy = new_entry_needs_copy;
12299
12300 /*
12301 * Handle copy-on-write obligations
12302 */
12303
12304 if (src_needs_copy && !tmp_entry->needs_copy) {
12305 vm_prot_t prot;
12306
12307 prot = src_entry->protection & ~VM_PROT_WRITE;
12308
12309 if (override_nx(src_map, VME_ALIAS(src_entry))
12310 && prot) {
12311 prot |= VM_PROT_EXECUTE;
12312 }
12313
12314 vm_object_pmap_protect(
12315 src_object,
12316 src_offset,
12317 src_size,
12318 (src_entry->is_shared ?
12319 PMAP_NULL
12320 : src_map->pmap),
12321 VM_MAP_PAGE_SIZE(src_map),
12322 src_entry->vme_start,
12323 prot);
12324
12325 assert(tmp_entry->wired_count == 0);
12326 tmp_entry->needs_copy = TRUE;
12327 }
12328
12329 /*
12330 * The map has never been unlocked, so it's safe
12331 * to move to the next entry rather than doing
12332 * another lookup.
12333 */
12334
12335 goto CopySuccessful;
12336 }
12337
12338 entry_was_shared = tmp_entry->is_shared;
12339
12340 /*
12341 * Take an object reference, so that we may
12342 * release the map lock(s).
12343 */
12344
12345 assert(src_object != VM_OBJECT_NULL);
12346 vm_object_reference(src_object);
12347
12348 /*
12349 * Record the timestamp for later verification.
12350 * Unlock the map.
12351 */
12352
12353 version.main_timestamp = src_map->timestamp;
12354 vm_map_unlock(src_map); /* Increments timestamp once! */
12355 saved_src_entry = src_entry;
12356 tmp_entry = VM_MAP_ENTRY_NULL;
12357 src_entry = VM_MAP_ENTRY_NULL;
12358
12359 /*
12360 * Perform the copy
12361 */
12362
12363 if (was_wired ||
12364 (src_object->copy_strategy == MEMORY_OBJECT_COPY_DELAY_FORK &&
12365 !(flags & VM_MAP_COPYIN_FORK)) ||
12366 (debug4k_no_cow_copyin &&
12367 VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT)) {
12368CopySlowly:
12369 vm_object_lock(src_object);
12370 result = vm_object_copy_slowly(
12371 src_object,
12372 src_offset,
12373 src_size,
12374 THREAD_UNINT,
12375 &new_copy_object);
12376 /* VME_OBJECT_SET will reset used_for_jit|tpro, so preserve it. */
12377 saved_used_for_jit = new_entry->used_for_jit;
12378 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12379 new_entry->used_for_jit = saved_used_for_jit;
12380 VME_OFFSET_SET(new_entry,
12381 src_offset - vm_object_trunc_page(src_offset));
12382 new_entry->needs_copy = FALSE;
12383 } else if (src_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC &&
12384 (entry_was_shared || map_share)) {
12385 vm_object_t new_object;
12386
12387 vm_object_lock_shared(src_object);
12388 new_object = vm_object_copy_delayed(
12389 src_object,
12390 src_offset,
12391 src_size,
12392 TRUE);
12393 if (new_object == VM_OBJECT_NULL) {
12394 goto CopySlowly;
12395 }
12396
12397 VME_OBJECT_SET(new_entry, new_object, false, 0);
12398 assert(new_entry->wired_count == 0);
12399 new_entry->needs_copy = TRUE;
12400 assert(!new_entry->iokit_acct);
12401 assert(new_object->purgable == VM_PURGABLE_DENY);
12402 assertf(new_entry->use_pmap, "src_map %p new_entry %p\n", src_map, new_entry);
12403 result = KERN_SUCCESS;
12404 } else {
12405 vm_object_offset_t new_offset;
12406 new_offset = VME_OFFSET(new_entry);
12407 result = vm_object_copy_strategically(src_object,
12408 src_offset,
12409 src_size,
12410 (flags & VM_MAP_COPYIN_FORK),
12411 &new_copy_object,
12412 &new_offset,
12413 &new_entry_needs_copy);
12414 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
12415 saved_used_for_jit = new_entry->used_for_jit;
12416 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
12417 new_entry->used_for_jit = saved_used_for_jit;
12418 if (new_offset != VME_OFFSET(new_entry)) {
12419 VME_OFFSET_SET(new_entry, new_offset);
12420 }
12421
12422 new_entry->needs_copy = new_entry_needs_copy;
12423 }
12424
12425 if (result == KERN_SUCCESS &&
12426 ((preserve_purgeable &&
12427 src_object->purgable != VM_PURGABLE_DENY) ||
12428 new_entry->used_for_jit)) {
12429 /*
12430 * Purgeable objects should be COPY_NONE, true share;
12431 * this should be propagated to the copy.
12432 *
12433 * Also force mappings the pmap specially protects to
12434 * be COPY_NONE; trying to COW these mappings would
12435 * change the effective protections, which could have
12436 * side effects if the pmap layer relies on the
12437 * specified protections.
12438 */
12439
12440 vm_object_t new_object;
12441
12442 new_object = VME_OBJECT(new_entry);
12443 assert(new_object != src_object);
12444 vm_object_lock(new_object);
12445 assert(new_object->ref_count == 1);
12446 assert(new_object->shadow == VM_OBJECT_NULL);
12447 assert(new_object->vo_copy == VM_OBJECT_NULL);
12448 assert(new_object->vo_owner == NULL);
12449
12450 new_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
12451
12452 if (preserve_purgeable &&
12453 src_object->purgable != VM_PURGABLE_DENY) {
12454 VM_OBJECT_SET_TRUE_SHARE(new_object, TRUE);
12455
12456 /* start as non-volatile with no owner... */
12457 VM_OBJECT_SET_PURGABLE(new_object, VM_PURGABLE_NONVOLATILE);
12458 vm_purgeable_nonvolatile_enqueue(new_object, NULL);
12459 /* ... and move to src_object's purgeable state */
12460 if (src_object->purgable != VM_PURGABLE_NONVOLATILE) {
12461 int state;
12462 state = src_object->purgable;
12463 vm_object_purgable_control(
12464 new_object,
12465 VM_PURGABLE_SET_STATE_FROM_KERNEL,
12466 &state);
12467 }
12468 /* no pmap accounting for purgeable objects */
12469 new_entry->use_pmap = FALSE;
12470 }
12471
12472 vm_object_unlock(new_object);
12473 new_object = VM_OBJECT_NULL;
12474 }
12475
12476 if (result != KERN_SUCCESS &&
12477 result != KERN_MEMORY_RESTART_COPY) {
12478 vm_map_lock(src_map);
12479 RETURN(result);
12480 }
12481
12482 /*
12483 * Throw away the extra reference
12484 */
12485
12486 vm_object_deallocate(src_object);
12487
12488 /*
12489 * Verify that the map has not substantially
12490 * changed while the copy was being made.
12491 */
12492
12493 vm_map_lock(src_map);
12494
12495 if ((version.main_timestamp + 1) == src_map->timestamp) {
12496 /* src_map hasn't changed: src_entry is still valid */
12497 src_entry = saved_src_entry;
12498 goto VerificationSuccessful;
12499 }
12500
12501 /*
12502 * Simple version comparison failed.
12503 *
12504 * Retry the lookup and verify that the
12505 * same object/offset are still present.
12506 *
12507 * [Note: a memory manager that colludes with
12508 * the calling task can detect that we have
12509 * cheated. While the map was unlocked, the
12510 * mapping could have been changed and restored.]
12511 */
12512
12513 if (!vm_map_lookup_entry(src_map, src_start, &tmp_entry)) {
12514 if (result != KERN_MEMORY_RESTART_COPY) {
12515 vm_object_deallocate(VME_OBJECT(new_entry));
12516 VME_OBJECT_SET(new_entry, VM_OBJECT_NULL, false, 0);
12517 /* reset accounting state */
12518 new_entry->iokit_acct = FALSE;
12519 new_entry->use_pmap = TRUE;
12520 }
12521 RETURN(KERN_INVALID_ADDRESS);
12522 }
12523
12524 src_entry = tmp_entry;
12525 vm_map_clip_start(src_map, src_entry, src_start);
12526
12527 if ((((src_entry->protection & VM_PROT_READ) == VM_PROT_NONE) &&
12528 !use_maxprot) ||
12529 ((src_entry->max_protection & VM_PROT_READ) == 0)) {
12530 goto VerificationFailed;
12531 }
12532
12533 if (src_entry->vme_end < new_entry->vme_end) {
12534 /*
12535 * This entry might have been shortened
12536 * (vm_map_clip_end) or been replaced with
12537 * an entry that ends closer to "src_start"
12538 * than before.
12539 * Adjust "new_entry" accordingly; copying
12540 * less memory would be correct but we also
12541 * redo the copy (see below) if the new entry
12542 * no longer points at the same object/offset.
12543 */
12544 assert(VM_MAP_PAGE_ALIGNED(src_entry->vme_end,
12545 VM_MAP_COPY_PAGE_MASK(copy)));
12546 new_entry->vme_end = src_entry->vme_end;
12547 src_size = new_entry->vme_end - src_start;
12548 } else if (src_entry->vme_end > new_entry->vme_end) {
12549 /*
12550 * This entry might have been extended
12551 * (vm_map_entry_simplify() or coalesce)
12552 * or been replaced with an entry that ends farther
12553 * from "src_start" than before.
12554 *
12555 * We've called vm_object_copy_*() only on
12556 * the previous <start:end> range, so we can't
12557 * just extend new_entry. We have to re-do
12558 * the copy based on the new entry as if it was
12559 * pointing at a different object/offset (see
12560 * "Verification failed" below).
12561 */
12562 }
12563
12564 if ((VME_OBJECT(src_entry) != src_object) ||
12565 (VME_OFFSET(src_entry) != src_offset) ||
12566 (src_entry->vme_end > new_entry->vme_end)) {
12567 /*
12568 * Verification failed.
12569 *
12570 * Start over with this top-level entry.
12571 */
12572
12573VerificationFailed: ;
12574
12575 vm_object_deallocate(VME_OBJECT(new_entry));
12576 tmp_entry = src_entry;
12577 continue;
12578 }
12579
12580 /*
12581 * Verification succeeded.
12582 */
12583
12584VerificationSuccessful:;
12585
12586 if (result == KERN_MEMORY_RESTART_COPY) {
12587 goto RestartCopy;
12588 }
12589
12590 /*
12591 * Copy succeeded.
12592 */
12593
12594CopySuccessful: ;
12595
12596 /*
12597 * Link in the new copy entry.
12598 */
12599
12600 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy),
12601 new_entry);
12602
12603 /*
12604 * Determine whether the entire region
12605 * has been copied.
12606 */
12607 src_base = src_start;
12608 src_start = new_entry->vme_end;
12609 new_entry = VM_MAP_ENTRY_NULL;
12610 while ((src_start >= src_end) && (src_end != 0)) {
12611 submap_map_t *ptr;
12612
12613 if (src_map == base_map) {
12614 /* back to the top */
12615 break;
12616 }
12617
12618 ptr = parent_maps;
12619 assert(ptr != NULL);
12620 parent_maps = parent_maps->next;
12621
12622 /* fix up the damage we did in that submap */
12623 vm_map_simplify_range(src_map,
12624 src_base,
12625 src_end);
12626
12627 vm_map_unlock(src_map);
12628 vm_map_deallocate(src_map);
12629 vm_map_lock(ptr->parent_map);
12630 src_map = ptr->parent_map;
12631 src_base = ptr->base_start;
12632 src_start = ptr->base_start + ptr->base_len;
12633 src_end = ptr->base_end;
12634 if (!vm_map_lookup_entry(src_map,
12635 src_start,
12636 &tmp_entry) &&
12637 (src_end > src_start)) {
12638 RETURN(KERN_INVALID_ADDRESS);
12639 }
12640 kfree_type(submap_map_t, ptr);
12641 if (parent_maps == NULL) {
12642 map_share = FALSE;
12643 }
12644 src_entry = tmp_entry->vme_prev;
12645 }
12646
12647 if ((VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT) &&
12648 (src_start >= src_addr + len) &&
12649 (src_addr + len != 0)) {
12650 /*
12651 * Stop copying now, even though we haven't reached
12652 * "src_end". We'll adjust the end of the last copy
12653 * entry at the end, if needed.
12654 *
12655 * If src_map's alignment is different from the
12656 * system's page-alignment, there could be
12657 * extra non-map-aligned map entries between
12658 * the original (non-rounded) "src_addr + len"
12659 * and the rounded "src_end".
12660 * We do not want to copy those map entries since
12661 * they're not part of the copied range.
12662 */
12663 break;
12664 }
12665
12666 if ((src_start >= src_end) && (src_end != 0)) {
12667 break;
12668 }
12669
12670 /*
12671 * Verify that there are no gaps in the region
12672 */
12673
12674 tmp_entry = src_entry->vme_next;
12675 if ((tmp_entry->vme_start != src_start) ||
12676 (tmp_entry == vm_map_to_entry(src_map))) {
12677 RETURN(KERN_INVALID_ADDRESS);
12678 }
12679 }
12680
12681 /*
12682 * If the source should be destroyed, do it now, since the
12683 * copy was successful.
12684 */
12685 if (src_destroy) {
12686 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_FLAGS;
12687
12688 if (src_map == kernel_map) {
12689 remove_flags |= VM_MAP_REMOVE_KUNWIRE;
12690 }
12691 (void)vm_map_remove_and_unlock(src_map,
12692 vm_map_trunc_page(src_addr, VM_MAP_PAGE_MASK(src_map)),
12693 src_end,
12694 remove_flags,
12695 KMEM_GUARD_NONE);
12696 } else {
12697 /* fix up the damage we did in the base map */
12698 vm_map_simplify_range(
12699 src_map,
12700 vm_map_trunc_page(src_addr,
12701 VM_MAP_PAGE_MASK(src_map)),
12702 vm_map_round_page(src_end,
12703 VM_MAP_PAGE_MASK(src_map)));
12704 vm_map_unlock(src_map);
12705 }
12706
12707 tmp_entry = VM_MAP_ENTRY_NULL;
12708
12709 if (VM_MAP_PAGE_SHIFT(src_map) > PAGE_SHIFT &&
12710 VM_MAP_PAGE_SHIFT(src_map) != VM_MAP_COPY_PAGE_SHIFT(copy)) {
12711 vm_map_offset_t original_start, original_offset, original_end;
12712
12713 assert(VM_MAP_COPY_PAGE_MASK(copy) == PAGE_MASK);
12714
12715 /* adjust alignment of first copy_entry's "vme_start" */
12716 tmp_entry = vm_map_copy_first_entry(copy);
12717 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12718 vm_map_offset_t adjustment;
12719
12720 original_start = tmp_entry->vme_start;
12721 original_offset = VME_OFFSET(tmp_entry);
12722
12723 /* map-align the start of the first copy entry... */
12724 adjustment = (tmp_entry->vme_start -
12725 vm_map_trunc_page(
12726 tmp_entry->vme_start,
12727 VM_MAP_PAGE_MASK(src_map)));
12728 tmp_entry->vme_start -= adjustment;
12729 VME_OFFSET_SET(tmp_entry,
12730 VME_OFFSET(tmp_entry) - adjustment);
12731 copy_addr -= adjustment;
12732 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12733 /* ... adjust for mis-aligned start of copy range */
12734 adjustment =
12735 (vm_map_trunc_page(copy->offset,
12736 PAGE_MASK) -
12737 vm_map_trunc_page(copy->offset,
12738 VM_MAP_PAGE_MASK(src_map)));
12739 if (adjustment) {
12740 assert(page_aligned(adjustment));
12741 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12742 tmp_entry->vme_start += adjustment;
12743 VME_OFFSET_SET(tmp_entry,
12744 (VME_OFFSET(tmp_entry) +
12745 adjustment));
12746 copy_addr += adjustment;
12747 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12748 }
12749
12750 /*
12751 * Assert that the adjustments haven't exposed
12752 * more than was originally copied...
12753 */
12754 assert(tmp_entry->vme_start >= original_start);
12755 assert(VME_OFFSET(tmp_entry) >= original_offset);
12756 /*
12757 * ... and that it did not adjust outside of
12758 * a single 16K page.
12759 */
12760 assert(vm_map_trunc_page(tmp_entry->vme_start,
12761 VM_MAP_PAGE_MASK(src_map)) ==
12762 vm_map_trunc_page(original_start,
12763 VM_MAP_PAGE_MASK(src_map)));
12764 }
12765
12766 /* adjust alignment of last copy_entry's "vme_end" */
12767 tmp_entry = vm_map_copy_last_entry(copy);
12768 if (tmp_entry != vm_map_copy_to_entry(copy)) {
12769 vm_map_offset_t adjustment;
12770
12771 original_end = tmp_entry->vme_end;
12772
12773 /* map-align the end of the last copy entry... */
12774 tmp_entry->vme_end =
12775 vm_map_round_page(tmp_entry->vme_end,
12776 VM_MAP_PAGE_MASK(src_map));
12777 /* ... adjust for mis-aligned end of copy range */
12778 adjustment =
12779 (vm_map_round_page((copy->offset +
12780 copy->size),
12781 VM_MAP_PAGE_MASK(src_map)) -
12782 vm_map_round_page((copy->offset +
12783 copy->size),
12784 PAGE_MASK));
12785 if (adjustment) {
12786 assert(page_aligned(adjustment));
12787 assert(adjustment < VM_MAP_PAGE_SIZE(src_map));
12788 tmp_entry->vme_end -= adjustment;
12789 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12790 }
12791
12792 /*
12793 * Assert that the adjustments haven't exposed
12794 * more than was originally copied...
12795 */
12796 assert(tmp_entry->vme_end <= original_end);
12797 /*
12798 * ... and that it did not adjust outside of
12799 * a single 16K page.
12800 */
12801 assert(vm_map_round_page(tmp_entry->vme_end,
12802 VM_MAP_PAGE_MASK(src_map)) ==
12803 vm_map_round_page(original_end,
12804 VM_MAP_PAGE_MASK(src_map)));
12805 }
12806 }
12807
12808 /* Fix-up start and end points in copy. This is necessary */
12809 /* when the various entries in the copy object were picked */
12810 /* up from different sub-maps */
12811
12812 tmp_entry = vm_map_copy_first_entry(copy);
12813 copy_size = 0; /* compute actual size */
12814 while (tmp_entry != vm_map_copy_to_entry(copy)) {
12815 assert(VM_MAP_PAGE_ALIGNED(
12816 copy_addr + (tmp_entry->vme_end -
12817 tmp_entry->vme_start),
12818 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12819 assert(VM_MAP_PAGE_ALIGNED(
12820 copy_addr,
12821 MIN(VM_MAP_COPY_PAGE_MASK(copy), PAGE_MASK)));
12822
12823 /*
12824 * The copy_entries will be injected directly into the
12825 * destination map and might not be "map aligned" there...
12826 */
12827 tmp_entry->map_aligned = FALSE;
12828
12829 tmp_entry->vme_end = copy_addr +
12830 (tmp_entry->vme_end - tmp_entry->vme_start);
12831 tmp_entry->vme_start = copy_addr;
12832 assert(tmp_entry->vme_start < tmp_entry->vme_end);
12833 copy_addr += tmp_entry->vme_end - tmp_entry->vme_start;
12834 copy_size += tmp_entry->vme_end - tmp_entry->vme_start;
12835 tmp_entry = (struct vm_map_entry *)tmp_entry->vme_next;
12836 }
12837
12838 if (VM_MAP_PAGE_SHIFT(src_map) != PAGE_SHIFT &&
12839 copy_size < copy->size) {
12840 /*
12841 * The actual size of the VM map copy is smaller than what
12842 * was requested by the caller. This must be because some
12843 * PAGE_SIZE-sized pages are missing at the end of the last
12844 * VM_MAP_PAGE_SIZE(src_map)-sized chunk of the range.
12845 * The caller might not have been aware of those missing
12846 * pages and might not want to be aware of it, which is
12847 * fine as long as they don't try to access (and crash on)
12848 * those missing pages.
12849 * Let's adjust the size of the "copy", to avoid failing
12850 * in vm_map_copyout() or vm_map_copy_overwrite().
12851 */
12852 assert(vm_map_round_page(copy_size,
12853 VM_MAP_PAGE_MASK(src_map)) ==
12854 vm_map_round_page(copy->size,
12855 VM_MAP_PAGE_MASK(src_map)));
12856 copy->size = copy_size;
12857 }
12858
12859 *copy_result = copy;
12860 return KERN_SUCCESS;
12861
12862#undef RETURN
12863}
12864
12865kern_return_t
12866vm_map_copy_extract(
12867 vm_map_t src_map,
12868 vm_map_address_t src_addr,
12869 vm_map_size_t len,
12870 boolean_t do_copy,
12871 vm_map_copy_t *copy_result, /* OUT */
12872 vm_prot_t *cur_prot, /* IN/OUT */
12873 vm_prot_t *max_prot, /* IN/OUT */
12874 vm_inherit_t inheritance,
12875 vm_map_kernel_flags_t vmk_flags)
12876{
12877 vm_map_copy_t copy;
12878 kern_return_t kr;
12879 vm_prot_t required_cur_prot, required_max_prot;
12880
12881 /*
12882 * Check for copies of zero bytes.
12883 */
12884
12885 if (len == 0) {
12886 *copy_result = VM_MAP_COPY_NULL;
12887 return KERN_SUCCESS;
12888 }
12889
12890 /*
12891 * Check that the end address doesn't overflow
12892 */
12893 if (src_addr + len < src_addr) {
12894 return KERN_INVALID_ADDRESS;
12895 }
12896 if (__improbable(vm_map_range_overflows(src_map, src_addr, len))) {
12897 return KERN_INVALID_ADDRESS;
12898 }
12899
12900 if (VM_MAP_PAGE_SIZE(src_map) < PAGE_SIZE) {
12901 DEBUG4K_SHARE("src_map %p src_addr 0x%llx src_end 0x%llx\n", src_map, (uint64_t)src_addr, (uint64_t)(src_addr + len));
12902 }
12903
12904 required_cur_prot = *cur_prot;
12905 required_max_prot = *max_prot;
12906
12907 /*
12908 * Allocate a header element for the list.
12909 *
12910 * Use the start and end in the header to
12911 * remember the endpoints prior to rounding.
12912 */
12913
12914 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
12915 copy->cpy_hdr.entries_pageable = vmk_flags.vmkf_copy_pageable;
12916 copy->offset = 0;
12917 copy->size = len;
12918
12919 kr = vm_map_remap_extract(src_map,
12920 src_addr,
12921 len,
12922 do_copy, /* copy */
12923 copy,
12924 cur_prot, /* IN/OUT */
12925 max_prot, /* IN/OUT */
12926 inheritance,
12927 vmk_flags);
12928 if (kr != KERN_SUCCESS) {
12929 vm_map_copy_discard(copy);
12930 return kr;
12931 }
12932 if (required_cur_prot != VM_PROT_NONE) {
12933 assert((*cur_prot & required_cur_prot) == required_cur_prot);
12934 assert((*max_prot & required_max_prot) == required_max_prot);
12935 }
12936
12937 *copy_result = copy;
12938 return KERN_SUCCESS;
12939}
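/*
 * Usage sketch (hypothetical caller-side variables): request at least read
 * access and let the routine report the effective protections of the
 * extracted range; "addr", "size" and "vmk_flags" are caller-provided:
 *
 *	vm_prot_t cur = VM_PROT_READ, max = VM_PROT_READ;
 *	kr = vm_map_copy_extract(src_map, addr, size, FALSE, &copy,
 *	    &cur, &max, VM_INHERIT_DEFAULT, vmk_flags);
 *
 * On success, "cur"/"max" are updated to the actual protections, which are
 * asserted above to include the requested ones.
 */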
12940
12941static void
12942vm_map_fork_share(
12943 vm_map_t old_map,
12944 vm_map_entry_t old_entry,
12945 vm_map_t new_map)
12946{
12947 vm_object_t object;
12948 vm_map_entry_t new_entry;
12949
12950 /*
12951 * New sharing code. New map entry
12952 * references original object. Internal
12953 * objects use asynchronous copy algorithm for
12954 * future copies. First make sure we have
12955 * the right object. If we need a shadow,
12956 * or someone else already has one, then
12957 * make a new shadow and share it.
12958 */
12959
12960 if (!old_entry->is_sub_map) {
12961 object = VME_OBJECT(old_entry);
12962 }
12963
12964 if (old_entry->is_sub_map) {
12965 assert(old_entry->wired_count == 0);
12966#ifndef NO_NESTED_PMAP
12967#if !PMAP_FORK_NEST
12968 if (old_entry->use_pmap) {
12969 kern_return_t result;
12970
12971 result = pmap_nest(new_map->pmap,
12972 (VME_SUBMAP(old_entry))->pmap,
12973 (addr64_t)old_entry->vme_start,
12974 (uint64_t)(old_entry->vme_end - old_entry->vme_start));
12975 if (result) {
12976 panic("vm_map_fork_share: pmap_nest failed!");
12977 }
12978 }
12979#endif /* !PMAP_FORK_NEST */
12980#endif /* NO_NESTED_PMAP */
12981 } else if (object == VM_OBJECT_NULL) {
12982 object = vm_object_allocate((vm_map_size_t)(old_entry->vme_end -
12983 old_entry->vme_start));
12984 VME_OFFSET_SET(old_entry, 0);
12985 VME_OBJECT_SET(old_entry, object, false, 0);
12986 old_entry->use_pmap = TRUE;
12987// assert(!old_entry->needs_copy);
12988 } else if (object->copy_strategy !=
12989 MEMORY_OBJECT_COPY_SYMMETRIC) {
12990 /*
12991 * We are already using an asymmetric
12992 * copy, and therefore we already have
12993 * the right object.
12994 */
12995
12996 assert(!old_entry->needs_copy);
12997 } else if (old_entry->needs_copy || /* case 1 */
12998 object->shadowed || /* case 2 */
12999 (!object->true_share && /* case 3 */
13000 !old_entry->is_shared &&
13001 (object->vo_size >
13002 (vm_map_size_t)(old_entry->vme_end -
13003 old_entry->vme_start)))) {
13004 bool is_writable;
13005
13006 /*
13007 * We need to create a shadow.
13008 * There are three cases here.
13009 * In the first case, we need to
13010 * complete a deferred symmetrical
13011 * copy that we participated in.
13012 * In the second and third cases,
13013 * we need to create the shadow so
13014 * that changes that we make to the
13015 * object do not interfere with
13016 * any symmetrical copies which
13017 * have occurred (case 2) or which
13018 * might occur (case 3).
13019 *
13020 * The first case is when we had
13021 * deferred shadow object creation
13022 * via the entry->needs_copy mechanism.
13023 * This mechanism only works when
13024 * only one entry points to the source
13025 * object, and we are about to create
13026 * a second entry pointing to the
13027 * same object. The problem is that
13028 * there is no way of mapping from
13029 * an object to the entries pointing
13030 * to it. (Deferred shadow creation
13031 * works with one entry because it occurs
13032 * at fault time, and we walk from the
13033 * entry to the object when handling
13034 * the fault.)
13035 *
13036 * The second case is when the object
13037 * to be shared has already been copied
13038 * with a symmetric copy, but we point
13039 * directly to the object without
13040 * needs_copy set in our entry. (This
13041 * can happen because different ranges
13042 * of an object can be pointed to by
13043 * different entries. In particular,
13044 * a single entry pointing to an object
13045 * can be split by a call to vm_inherit,
13046 * which, combined with task_create, can
13047 * result in the different entries
13048 * having different needs_copy values.)
13049 * The shadowed flag in the object allows
13050 * us to detect this case. The problem
13051 * with this case is that if this object
13052 * has or will have shadows, then we
13053 * must not perform an asymmetric copy
13054 * of this object, since such a copy
13055 * allows the object to be changed, which
13056 * will break the previous symmetrical
13057 * copies (which rely upon the object
13058 * not changing). In a sense, the shadowed
13059 * flag says "don't change this object".
13060 * We fix this by creating a shadow
13061 * object for this object, and sharing
13062 * that. This works because we are free
13063 * to change the shadow object (and thus
13064 * to use an asymmetric copy strategy);
13065 * this is also semantically correct,
13066 * since this object is temporary, and
13067 * therefore a copy of the object is
13068 * as good as the object itself. (This
13069 * is not true for permanent objects,
13070 * since the pager needs to see changes,
13071 * which won't happen if the changes
13072 * are made to a copy.)
13073 *
13074 * The third case is when the object
13075 * to be shared has parts sticking
13076 * outside of the entry we're working
13077 * with, and thus may in the future
13078 * be subject to a symmetrical copy.
13079 * (This is a preemptive version of
13080 * case 2.)
13081 */
13082 VME_OBJECT_SHADOW(old_entry,
13083 (vm_map_size_t) (old_entry->vme_end -
13084 old_entry->vme_start),
13085 vm_map_always_shadow(old_map));
13086
13087 /*
13088 * If we're making a shadow for other than
13089 * copy on write reasons, then we have
13090 * to remove write permission.
13091 */
13092
13093 is_writable = false;
13094 if (old_entry->protection & VM_PROT_WRITE) {
13095 is_writable = true;
13096#if __arm64e__
13097 } else if (old_entry->used_for_tpro) {
13098 is_writable = true;
13099#endif /* __arm64e__ */
13100 }
13101 if (!old_entry->needs_copy && is_writable) {
13102 vm_prot_t prot;
13103
13104 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13105 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13106 __FUNCTION__, old_map, old_map->pmap,
13107 old_entry,
13108 (uint64_t)old_entry->vme_start,
13109 (uint64_t)old_entry->vme_end,
13110 old_entry->protection);
13111 }
13112
13113 prot = old_entry->protection & ~VM_PROT_WRITE;
13114
13115 if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13116 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13117 __FUNCTION__, old_map, old_map->pmap,
13118 old_entry,
13119 (uint64_t)old_entry->vme_start,
13120 (uint64_t)old_entry->vme_end,
13121 prot);
13122 }
13123
13124 if (override_nx(old_map, VME_ALIAS(old_entry)) && prot) {
13125 prot |= VM_PROT_EXECUTE;
13126 }
13127
13128
13129 if (old_map->mapped_in_other_pmaps) {
13130 vm_object_pmap_protect(
13131 VME_OBJECT(old_entry),
13132 VME_OFFSET(old_entry),
13133 (old_entry->vme_end -
13134 old_entry->vme_start),
13135 PMAP_NULL,
13136 PAGE_SIZE,
13137 old_entry->vme_start,
13138 prot);
13139 } else {
13140 pmap_protect(old_map->pmap,
13141 old_entry->vme_start,
13142 old_entry->vme_end,
13143 prot);
13144 }
13145 }
13146
13147 old_entry->needs_copy = FALSE;
13148 object = VME_OBJECT(old_entry);
13149 }
13150
13151
13152 /*
13153 * If object was using a symmetric copy strategy,
13154 * change its copy strategy to the default
13155 * asymmetric copy strategy, which is copy_delay
13156 * in the non-norma case and copy_call in the
13157 * norma case. Bump the reference count for the
13158 * new entry.
13159 */
13160
13161 if (old_entry->is_sub_map) {
13162 vm_map_reference(VME_SUBMAP(old_entry));
13163 } else {
13164 vm_object_lock(object);
13165 vm_object_reference_locked(object);
13166 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
13167 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
13168 }
13169 vm_object_unlock(object);
13170 }
13171
13172 /*
13173 * Clone the entry, using object ref from above.
13174 * Mark both entries as shared.
13175 */
13176
13177 new_entry = vm_map_entry_create(new_map); /* Never the kernel map or descendants */
13178 vm_map_entry_copy(old_map, new_entry, old_entry);
13179 old_entry->is_shared = TRUE;
13180 new_entry->is_shared = TRUE;
13181
13182 /*
13183 * We're dealing with a shared mapping, so the resulting mapping
13184 * should inherit some of the original mapping's accounting settings.
13185 * "iokit_acct" should have been cleared in vm_map_entry_copy().
13186 * "use_pmap" should stay the same as before (if it hasn't been reset
13187 * to TRUE when we cleared "iokit_acct").
13188 */
13189 assert(!new_entry->iokit_acct);
13190
13191 /*
13192 * If the old entry's inheritance is VM_INHERIT_NONE,
13193 * the new entry is for a corpse fork, so remove the
13194 * write permission from the new entry.
13195 */
13196 if (old_entry->inheritance == VM_INHERIT_NONE) {
13197 new_entry->protection &= ~VM_PROT_WRITE;
13198 new_entry->max_protection &= ~VM_PROT_WRITE;
13199 }
13200
13201 /*
13202 * Insert the entry into the new map -- we
13203 * know we're inserting at the end of the new
13204 * map.
13205 */
13206
13207 vm_map_store_entry_link(new_map, vm_map_last_entry(new_map), new_entry,
13208 VM_MAP_KERNEL_FLAGS_NONE);
13209
13210 /*
13211 * Update the physical map
13212 */
13213
13214 if (old_entry->is_sub_map) {
13215 /* Bill Angell pmap support goes here */
13216 } else {
13217 pmap_copy(new_map->pmap, old_map->pmap, new_entry->vme_start,
13218 old_entry->vme_end - old_entry->vme_start,
13219 old_entry->vme_start);
13220 }
13221}
13222
13223static boolean_t
13224vm_map_fork_copy(
13225 vm_map_t old_map,
13226 vm_map_entry_t *old_entry_p,
13227 vm_map_t new_map,
13228 int vm_map_copyin_flags)
13229{
13230 vm_map_entry_t old_entry = *old_entry_p;
13231 vm_map_size_t entry_size = old_entry->vme_end - old_entry->vme_start;
13232 vm_map_offset_t start = old_entry->vme_start;
13233 vm_map_copy_t copy;
13234 vm_map_entry_t last = vm_map_last_entry(new_map);
13235
13236 vm_map_unlock(old_map);
13237 /*
13238 * Use maxprot version of copyin because we
13239 * care about whether this memory can ever
13240 * be accessed, not just whether it's accessible
13241 * right now.
13242 */
13243 vm_map_copyin_flags |= VM_MAP_COPYIN_USE_MAXPROT;
13244 if (vm_map_copyin_internal(old_map, start, entry_size,
13245 vm_map_copyin_flags, &copy)
13246 != KERN_SUCCESS) {
13247 /*
13248 * The map might have changed while it
13249 * was unlocked, check it again. Skip
13250 * any blank space or permanently
13251 * unreadable region.
13252 */
13253 vm_map_lock(old_map);
13254 if (!vm_map_lookup_entry(old_map, start, &last) ||
13255 (last->max_protection & VM_PROT_READ) == VM_PROT_NONE) {
13256 last = last->vme_next;
13257 }
13258 *old_entry_p = last;
13259
13260 /*
13261 * XXX For some error returns, want to
13262 * XXX skip to the next element. Note
13263 * that INVALID_ADDRESS and
13264 * PROTECTION_FAILURE are handled above.
13265 */
13266
13267 return FALSE;
13268 }
13269
13270 /*
13271 * Assert that the vm_map_copy is coming from the right
13272 * zone and hasn't been forged
13273 */
13274 vm_map_copy_require(copy);
13275
13276 /*
13277 * Insert the copy into the new map
13278 */
13279 vm_map_copy_insert(new_map, last, copy);
13280
13281 /*
13282 * Pick up the traversal at the end of
13283 * the copied region.
13284 */
13285
13286 vm_map_lock(old_map);
13287 start += entry_size;
13288 if (!vm_map_lookup_entry(old_map, start, &last)) {
13289 last = last->vme_next;
13290 } else {
13291 if (last->vme_start == start) {
13292 /*
13293 * No need to clip here and we don't
13294 * want to cause any unnecessary
13295 * unnesting...
13296 */
13297 } else {
13298 vm_map_clip_start(old_map, last, start);
13299 }
13300 }
13301 *old_entry_p = last;
13302
13303 return TRUE;
13304}
13305
13306#if PMAP_FORK_NEST
13307#define PMAP_FORK_NEST_DEBUG 0
13308static inline void
13309vm_map_fork_unnest(
13310 pmap_t new_pmap,
13311 vm_map_offset_t pre_nested_start,
13312 vm_map_offset_t pre_nested_end,
13313 vm_map_offset_t start,
13314 vm_map_offset_t end)
13315{
13316 kern_return_t kr;
13317 vm_map_offset_t nesting_mask, start_unnest, end_unnest;
13318
13319 assertf(pre_nested_start <= pre_nested_end,
13320 "pre_nested start 0x%llx end 0x%llx",
13321 (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13322 assertf(start <= end,
13323 "start 0x%llx end 0x%llx",
13324 (uint64_t) start, (uint64_t)end);
13325
13326 if (pre_nested_start == pre_nested_end) {
13327 /* nothing was pre-nested: done */
13328 return;
13329 }
13330 if (end <= pre_nested_start) {
13331 /* fully before pre-nested range: done */
13332 return;
13333 }
13334 if (start >= pre_nested_end) {
13335 /* fully after pre-nested range: done */
13336 return;
13337 }
13338 /* ignore parts of range outside of pre_nested range */
13339 if (start < pre_nested_start) {
13340 start = pre_nested_start;
13341 }
13342 if (end > pre_nested_end) {
13343 end = pre_nested_end;
13344 }
13345 nesting_mask = pmap_shared_region_size_min(new_pmap) - 1;
13346 start_unnest = start & ~nesting_mask;
13347 end_unnest = (end + nesting_mask) & ~nesting_mask;
13348 kr = pmap_unnest(new_pmap,
13349 (addr64_t)start_unnest,
13350 (uint64_t)(end_unnest - start_unnest));
13351#if PMAP_FORK_NEST_DEBUG
13352 printf("PMAP_FORK_NEST %s:%d new_pmap %p 0x%llx:0x%llx -> pmap_unnest 0x%llx:0x%llx kr 0x%x\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)start, (uint64_t)end, (uint64_t)start_unnest, (uint64_t)end_unnest, kr);
13353#endif /* PMAP_FORK_NEST_DEBUG */
13354 assertf(kr == KERN_SUCCESS,
13355 "0x%llx 0x%llx pmap_unnest(%p, 0x%llx, 0x%llx) -> 0x%x",
13356 (uint64_t)start, (uint64_t)end, new_pmap,
13357 (uint64_t)start_unnest, (uint64_t)(end_unnest - start_unnest),
13358 kr);
13359}
13360#endif /* PMAP_FORK_NEST */
13361
13362void
13363vm_map_inherit_limits(vm_map_t new_map, const struct _vm_map *old_map)
13364{
13365 new_map->size_limit = old_map->size_limit;
13366 new_map->data_limit = old_map->data_limit;
13367 new_map->user_wire_limit = old_map->user_wire_limit;
13368 new_map->reserved_regions = old_map->reserved_regions;
13369}
13370
13371/*
13372 * vm_map_fork:
13373 *
13374 * Create and return a new map based on the old
13375 * map, according to the inheritance values on the
13376 * regions in that map and the options.
13377 *
13378 * The source map must not be locked.
13379 */
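/*
 * Summary of how each inheritance value is handled below:
 * - VM_INHERIT_NONE:  the entry is skipped, unless the
 *   VM_MAP_FORK_SHARE_IF_INHERIT_NONE option promotes it to SHARE
 *   (and the entry is readable and not backed by a device pager).
 * - VM_INHERIT_SHARE: vm_map_fork_share() maps the same object or
 *   submap into the child.
 * - VM_INHERIT_COPY:  try a symmetric copy via vm_object_copy_quickly();
 *   wired entries, true_share objects and submaps fall back to the
 *   vm_map_fork_copy() slow path.
 */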
13380vm_map_t
13381vm_map_fork(
13382 ledger_t ledger,
13383 vm_map_t old_map,
13384 int options)
13385{
13386 pmap_t new_pmap;
13387 vm_map_t new_map;
13388 vm_map_entry_t old_entry;
13389 vm_map_size_t new_size = 0, entry_size;
13390 vm_map_entry_t new_entry;
13391 boolean_t src_needs_copy;
13392 boolean_t new_entry_needs_copy;
13393 boolean_t pmap_is64bit;
13394 int vm_map_copyin_flags;
13395 vm_inherit_t old_entry_inheritance;
13396 int map_create_options;
13397 kern_return_t footprint_collect_kr;
13398
13399 if (options & ~(VM_MAP_FORK_SHARE_IF_INHERIT_NONE |
13400 VM_MAP_FORK_PRESERVE_PURGEABLE |
13401 VM_MAP_FORK_CORPSE_FOOTPRINT)) {
13402 /* unsupported option */
13403 return VM_MAP_NULL;
13404 }
13405
13406 pmap_is64bit =
13407#if defined(__i386__) || defined(__x86_64__)
13408 old_map->pmap->pm_task_map != TASK_MAP_32BIT;
13409#elif defined(__arm64__)
13410 old_map->pmap->is_64bit;
13411#else
13412#error Unknown architecture.
13413#endif
13414
13415 unsigned int pmap_flags = 0;
13416 pmap_flags |= pmap_is64bit ? PMAP_CREATE_64BIT : 0;
13417#if defined(HAS_APPLE_PAC)
13418 pmap_flags |= old_map->pmap->disable_jop ? PMAP_CREATE_DISABLE_JOP : 0;
13419#endif
13420#if CONFIG_ROSETTA
13421 pmap_flags |= old_map->pmap->is_rosetta ? PMAP_CREATE_ROSETTA : 0;
13422#endif
13423#if PMAP_CREATE_FORCE_4K_PAGES
13424 if (VM_MAP_PAGE_SIZE(old_map) == FOURK_PAGE_SIZE &&
13425 PAGE_SIZE != FOURK_PAGE_SIZE) {
13426 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
13427 }
13428#endif /* PMAP_CREATE_FORCE_4K_PAGES */
13429	new_pmap = pmap_create_options(ledger, (vm_map_size_t) 0, pmap_flags);
13430	if (new_pmap == NULL) {
13431		return VM_MAP_NULL;
13432	}
13433
13434	vm_map_reference(old_map);
13435 vm_map_lock(old_map);
13436
13437 map_create_options = 0;
13438 if (old_map->hdr.entries_pageable) {
13439 map_create_options |= VM_MAP_CREATE_PAGEABLE;
13440 }
13441 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13442 map_create_options |= VM_MAP_CREATE_CORPSE_FOOTPRINT;
13443 footprint_collect_kr = KERN_SUCCESS;
13444 }
13445	new_map = vm_map_create_options(new_pmap,
13446	    old_map->min_offset,
13447	    old_map->max_offset,
13448	    map_create_options);
13449
13450 /* inherit cs_enforcement */
13451	vm_map_cs_enforcement_set(new_map, old_map->cs_enforcement);
13452
13453	vm_map_lock(new_map);
13454	vm_commit_pagezero_status(new_map);
13455	/* inherit the parent map's page size */
13456	vm_map_set_page_shift(new_map, VM_MAP_PAGE_SHIFT(old_map));
13457
13458 /* inherit the parent rlimits */
13459 vm_map_inherit_limits(new_map, old_map);
13460
13461#if CONFIG_MAP_RANGES
13462 /* inherit the parent map's VM ranges */
13463 vm_map_range_fork(new_map, old_map);
13464#endif
13465
13466#if CODE_SIGNING_MONITOR
13467 /* Prepare the monitor for the fork */
13468 csm_fork_prepare(old_map->pmap, new_pmap);
13469#endif
13470
13471#if PMAP_FORK_NEST
13472 /*
13473 * Pre-nest the shared region's pmap.
13474 */
13475 vm_map_offset_t pre_nested_start = 0, pre_nested_end = 0;
13476 pmap_fork_nest(old_map->pmap, new_pmap,
13477 &pre_nested_start, &pre_nested_end);
13478#if PMAP_FORK_NEST_DEBUG
13479 printf("PMAP_FORK_NEST %s:%d old %p new %p pre_nested start 0x%llx end 0x%llx\n", __FUNCTION__, __LINE__, old_map->pmap, new_pmap, (uint64_t)pre_nested_start, (uint64_t)pre_nested_end);
13480#endif /* PMAP_FORK_NEST_DEBUG */
13481#endif /* PMAP_FORK_NEST */
13482
13483 for (old_entry = vm_map_first_entry(old_map); old_entry != vm_map_to_entry(old_map);) {
13484 /*
13485 * Abort any corpse collection if the system is shutting down.
13486 */
13487 if ((options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13488 get_system_inshutdown()) {
13489#if PMAP_FORK_NEST
13490 new_entry = vm_map_last_entry(new_map);
13491 if (new_entry == vm_map_to_entry(new_map)) {
13492 /* unnest all that was pre-nested */
13493 vm_map_fork_unnest(new_pmap,
13494 pre_nested_start, pre_nested_end,
13495 vm_map_min(new_map), vm_map_max(new_map));
13496 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13497 /* unnest hole at the end, if pre-nested */
13498 vm_map_fork_unnest(new_pmap,
13499 pre_nested_start, pre_nested_end,
13500 new_entry->vme_end, vm_map_max(new_map));
13501 }
13502#endif /* PMAP_FORK_NEST */
13503 vm_map_corpse_footprint_collect_done(new_map);
13504 vm_map_unlock(new_map);
13505 vm_map_unlock(old_map);
13506			vm_map_deallocate(new_map);
13507			vm_map_deallocate(old_map);
13508			printf("Aborting corpse map due to system shutdown\n");
13509 return VM_MAP_NULL;
13510 }
13511
13512 entry_size = old_entry->vme_end - old_entry->vme_start;
13513
13514#if PMAP_FORK_NEST
13515 /*
13516 * Undo any unnecessary pre-nesting.
13517 */
13518 vm_map_offset_t prev_end;
13519 if (old_entry == vm_map_first_entry(old_map)) {
13520 prev_end = vm_map_min(old_map);
13521 } else {
13522 prev_end = old_entry->vme_prev->vme_end;
13523 }
13524 if (prev_end < old_entry->vme_start) {
13525 /* unnest hole before this entry, if pre-nested */
13526 vm_map_fork_unnest(new_pmap,
13527 pre_nested_start, pre_nested_end,
13528 prev_end, old_entry->vme_start);
13529 }
13530 if (old_entry->is_sub_map && old_entry->use_pmap) {
13531 /* keep this entry nested in the child */
13532#if PMAP_FORK_NEST_DEBUG
13533 printf("PMAP_FORK_NEST %s:%d new_pmap %p keeping 0x%llx:0x%llx nested\n", __FUNCTION__, __LINE__, new_pmap, (uint64_t)old_entry->vme_start, (uint64_t)old_entry->vme_end);
13534#endif /* PMAP_FORK_NEST_DEBUG */
13535 } else {
13536 /* undo nesting for this entry, if pre-nested */
13537 vm_map_fork_unnest(new_pmap,
13538 pre_nested_start, pre_nested_end,
13539 old_entry->vme_start, old_entry->vme_end);
13540 }
13541#endif /* PMAP_FORK_NEST */
13542
13543 old_entry_inheritance = old_entry->inheritance;
13544 /*
13545	 * If the caller used the VM_MAP_FORK_SHARE_IF_INHERIT_NONE option,
13546	 * share readable VM_INHERIT_NONE entries that are not backed by a
13547	 * device pager.
13548 */
13549 if (old_entry_inheritance == VM_INHERIT_NONE &&
13550 (options & VM_MAP_FORK_SHARE_IF_INHERIT_NONE) &&
13551 (old_entry->protection & VM_PROT_READ) &&
13552 !(!old_entry->is_sub_map &&
13553 VME_OBJECT(old_entry) != NULL &&
13554 VME_OBJECT(old_entry)->pager != NULL &&
13555 is_device_pager_ops(
13556 VME_OBJECT(old_entry)->pager->mo_pager_ops))) {
13557 old_entry_inheritance = VM_INHERIT_SHARE;
13558 }
13559
13560 if (old_entry_inheritance != VM_INHERIT_NONE &&
13561 (options & VM_MAP_FORK_CORPSE_FOOTPRINT) &&
13562 footprint_collect_kr == KERN_SUCCESS) {
13563 /*
13564 * The corpse won't have old_map->pmap to query
13565 * footprint information, so collect that data now
13566 * and store it in new_map->vmmap_corpse_footprint
13567 * for later autopsy.
13568 */
13569 footprint_collect_kr =
13570 vm_map_corpse_footprint_collect(old_map,
13571 old_entry,
13572 new_map);
13573 }
13574
13575 switch (old_entry_inheritance) {
13576 case VM_INHERIT_NONE:
13577 break;
13578
13579 case VM_INHERIT_SHARE:
13580 vm_map_fork_share(old_map, old_entry, new_map);
13581 new_size += entry_size;
13582 break;
13583
13584 case VM_INHERIT_COPY:
13585
13586 /*
13587 * Inline the copy_quickly case;
13588 * upon failure, fall back on call
13589 * to vm_map_fork_copy.
13590 */
13591
13592 if (old_entry->is_sub_map) {
13593 break;
13594 }
13595 if ((old_entry->wired_count != 0) ||
13596 ((VME_OBJECT(old_entry) != NULL) &&
13597 (VME_OBJECT(old_entry)->true_share))) {
13598 goto slow_vm_map_fork_copy;
13599 }
13600
13601 new_entry = vm_map_entry_create(new_map); /* never the kernel map or descendants */
13602			vm_map_entry_copy(old_map, new_entry, old_entry);
13603 if (old_entry->vme_permanent) {
13604 /* inherit "permanent" on fork() */
13605 new_entry->vme_permanent = TRUE;
13606 }
13607
13608 if (new_entry->used_for_jit == TRUE && new_map->jit_entry_exists == FALSE) {
13609 new_map->jit_entry_exists = TRUE;
13610 }
13611
13612 if (new_entry->is_sub_map) {
13613 /* clear address space specifics */
13614 new_entry->use_pmap = FALSE;
13615 } else {
13616 /*
13617 * We're dealing with a copy-on-write operation,
13618 * so the resulting mapping should not inherit
13619 * the original mapping's accounting settings.
13620 * "iokit_acct" should have been cleared in
13621 * vm_map_entry_copy().
13622 * "use_pmap" should be reset to its default
13623 * (TRUE) so that the new mapping gets
13624 * accounted for in the task's memory footprint.
13625 */
13626 assert(!new_entry->iokit_acct);
13627 new_entry->use_pmap = TRUE;
13628 }
13629
13630			if (!vm_object_copy_quickly(
13631				    VME_OBJECT(new_entry),
13632				    VME_OFFSET(old_entry),
13633				    (old_entry->vme_end -
13634				    old_entry->vme_start),
13635				    &src_needs_copy,
13636				    &new_entry_needs_copy)) {
13637				vm_map_entry_dispose(new_entry);
13638 goto slow_vm_map_fork_copy;
13639 }
13640
13641 /*
13642 * Handle copy-on-write obligations
13643 */
13644
13645 if (src_needs_copy && !old_entry->needs_copy) {
13646 vm_prot_t prot;
13647
13648				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, old_entry->protection)) {
13649 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13650 __FUNCTION__,
13651 old_map, old_map->pmap, old_entry,
13652 (uint64_t)old_entry->vme_start,
13653 (uint64_t)old_entry->vme_end,
13654 old_entry->protection);
13655 }
13656
13657 prot = old_entry->protection & ~VM_PROT_WRITE;
13658
13659				if (override_nx(old_map, VME_ALIAS(old_entry))
13660				    && prot) {
13661					prot |= VM_PROT_EXECUTE;
13662				}
13663
13664				if (pmap_has_prot_policy(old_map->pmap, old_entry->translated_allow_execute, prot)) {
13665 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
13666 __FUNCTION__,
13667 old_map, old_map->pmap, old_entry,
13668 (uint64_t)old_entry->vme_start,
13669 (uint64_t)old_entry->vme_end,
13670 prot);
13671 }
13672
13673				vm_object_pmap_protect(
13674					VME_OBJECT(old_entry),
13675					VME_OFFSET(old_entry),
13676					(old_entry->vme_end -
13677					old_entry->vme_start),
13678					((old_entry->is_shared
13679					|| old_map->mapped_in_other_pmaps)
13680					? PMAP_NULL :
13681					old_map->pmap),
13682					VM_MAP_PAGE_SIZE(old_map),
13683					old_entry->vme_start,
13684					prot);
13685
13686 assert(old_entry->wired_count == 0);
13687 old_entry->needs_copy = TRUE;
13688 }
13689 new_entry->needs_copy = new_entry_needs_copy;
13690
13691 /*
13692 * Insert the entry at the end
13693 * of the map.
13694 */
13695
13696			vm_map_store_entry_link(new_map,
13697			    vm_map_last_entry(new_map),
13698			    new_entry,
13699			    VM_MAP_KERNEL_FLAGS_NONE);
13700 new_size += entry_size;
13701 break;
13702
13703slow_vm_map_fork_copy:
13704 vm_map_copyin_flags = VM_MAP_COPYIN_FORK;
13705 if (options & VM_MAP_FORK_PRESERVE_PURGEABLE) {
13706 vm_map_copyin_flags |=
13707 VM_MAP_COPYIN_PRESERVE_PURGEABLE;
13708 }
13709			if (vm_map_fork_copy(old_map,
13710			    &old_entry,
13711			    new_map,
13712			    vm_map_copyin_flags)) {
13713 new_size += entry_size;
13714 }
13715 continue;
13716 }
13717 old_entry = old_entry->vme_next;
13718 }
13719
13720#if PMAP_FORK_NEST
13721 new_entry = vm_map_last_entry(new_map);
13722 if (new_entry == vm_map_to_entry(new_map)) {
13723 /* unnest all that was pre-nested */
13724 vm_map_fork_unnest(new_pmap,
13725 pre_nested_start, pre_nested_end,
13726 vm_map_min(new_map), vm_map_max(new_map));
13727 } else if (new_entry->vme_end < vm_map_max(new_map)) {
13728 /* unnest hole at the end, if pre-nested */
13729 vm_map_fork_unnest(new_pmap,
13730 pre_nested_start, pre_nested_end,
13731 new_entry->vme_end, vm_map_max(new_map));
13732 }
13733#endif /* PMAP_FORK_NEST */
13734
13735#if defined(__arm64__)
13736	pmap_insert_commpage(new_map->pmap);
13737#endif /* __arm64__ */
13738
13739 new_map->size = new_size;
13740
13741 if (options & VM_MAP_FORK_CORPSE_FOOTPRINT) {
13742 vm_map_corpse_footprint_collect_done(new_map);
13743 }
13744
13745 /* Propagate JIT entitlement for the pmap layer. */
13746	if (pmap_get_jit_entitled(old_map->pmap)) {
13747		/* Tell the pmap that it supports JIT. */
13748		pmap_set_jit_entitled(new_map->pmap);
13749	}
13750
13751	/* Propagate TPRO settings for the pmap layer */
13752	if (pmap_get_tpro(old_map->pmap)) {
13753		/* Tell the pmap that it supports TPRO */
13754		pmap_set_tpro(new_map->pmap);
13755	}
13756
13757
13758 vm_map_unlock(new_map);
13759 vm_map_unlock(old_map);
13760	vm_map_deallocate(old_map);
13761
13762 return new_map;
13763}
13764
13765/*
13766 * vm_map_exec:
13767 *
13768	 * Set up the "new_map" with the proper execution environment according
13769	 * to the type of executable (platform, 64-bit, chroot environment).
13770 * Map the comm page and shared region, etc...
13771 */
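/*
 * The work below proceeds in this order: map the comm page, enter the
 * shared region, then reserve any machine-specific regions reported by
 * ml_get_vm_reserved_regions() as permanent VM_PROT_NONE entries.
 */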
13772kern_return_t
13773vm_map_exec(
13774 vm_map_t new_map,
13775 task_t task,
13776 boolean_t is64bit,
13777 void *fsroot,
13778 cpu_type_t cpu,
13779 cpu_subtype_t cpu_subtype,
13780 boolean_t reslide,
13781 boolean_t is_driverkit,
13782 uint32_t rsr_version)
13783{
13784 SHARED_REGION_TRACE_DEBUG(
13785 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): ->\n",
13786 (void *)VM_KERNEL_ADDRPERM(current_task()),
13787 (void *)VM_KERNEL_ADDRPERM(new_map),
13788 (void *)VM_KERNEL_ADDRPERM(task),
13789 (void *)VM_KERNEL_ADDRPERM(fsroot),
13790 cpu,
13791 cpu_subtype));
13792	(void) vm_commpage_enter(new_map, task, is64bit);
13793
13794	(void) vm_shared_region_enter(new_map, task, is64bit, fsroot, cpu, cpu_subtype, reslide, is_driverkit, rsr_version);
13795
13796 SHARED_REGION_TRACE_DEBUG(
13797 ("shared_region: task %p: vm_map_exec(%p,%p,%p,0x%x,0x%x): <-\n",
13798 (void *)VM_KERNEL_ADDRPERM(current_task()),
13799 (void *)VM_KERNEL_ADDRPERM(new_map),
13800 (void *)VM_KERNEL_ADDRPERM(task),
13801 (void *)VM_KERNEL_ADDRPERM(fsroot),
13802 cpu,
13803 cpu_subtype));
13804
13805 /*
13806 * Some devices have region(s) of memory that shouldn't get allocated by
13807 * user processes. The following code creates dummy vm_map_entry_t's for each
13808	 * of the regions that need to be reserved to prevent any allocations in
13809 * those regions.
13810 */
13811 kern_return_t kr = KERN_FAILURE;
13812 vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT();
13813 vmk_flags.vmkf_beyond_max = true;
13814
13815 const struct vm_reserved_region *regions = NULL;
13816	size_t num_regions = ml_get_vm_reserved_regions(is64bit, &regions);
13817 assert((num_regions == 0) || (num_regions > 0 && regions != NULL));
13818
13819 for (size_t i = 0; i < num_regions; ++i) {
13820 vm_map_offset_t address = regions[i].vmrr_addr;
13821
13822		kr = vm_map_enter(
13823			new_map,
13824			&address,
13825			regions[i].vmrr_size,
13826			(vm_map_offset_t)0,
13827			vmk_flags,
13828			VM_OBJECT_NULL,
13829			(vm_object_offset_t)0,
13830			FALSE,
13831			VM_PROT_NONE,
13832			VM_PROT_NONE,
13833			VM_INHERIT_COPY);
13834
13835 if (kr != KERN_SUCCESS) {
13836 panic("Failed to reserve %s region in user map %p %d", regions[i].vmrr_name, new_map, kr);
13837 }
13838 }
13839
13840 new_map->reserved_regions = (num_regions ? TRUE : FALSE);
13841
13842 return KERN_SUCCESS;
13843}
13844
13845uint64_t vm_map_lookup_and_lock_object_copy_slowly_count = 0;
13846uint64_t vm_map_lookup_and_lock_object_copy_slowly_size = 0;
13847uint64_t vm_map_lookup_and_lock_object_copy_slowly_max = 0;
13848uint64_t vm_map_lookup_and_lock_object_copy_slowly_restart = 0;
13849uint64_t vm_map_lookup_and_lock_object_copy_slowly_error = 0;
13850uint64_t vm_map_lookup_and_lock_object_copy_strategically_count = 0;
13851uint64_t vm_map_lookup_and_lock_object_copy_strategically_size = 0;
13852uint64_t vm_map_lookup_and_lock_object_copy_strategically_max = 0;
13853uint64_t vm_map_lookup_and_lock_object_copy_strategically_restart = 0;
13854uint64_t vm_map_lookup_and_lock_object_copy_strategically_error = 0;
13855uint64_t vm_map_lookup_and_lock_object_copy_shadow_count = 0;
13856uint64_t vm_map_lookup_and_lock_object_copy_shadow_size = 0;
13857uint64_t vm_map_lookup_and_lock_object_copy_shadow_max = 0;
13858/*
13859 * vm_map_lookup_and_lock_object:
13860 *
13861 * Finds the VM object, offset, and
13862 * protection for a given virtual address in the
13863 * specified map, assuming a page fault of the
13864 * type specified.
13865 *
13866 * Returns the (object, offset, protection) for
13867 * this address, whether it is wired down, and whether
13868 * this map has the only reference to the data in question.
13869 * In order to later verify this lookup, a "version"
13870 * is returned.
13871 * If contended != NULL, *contended will be set to
13872 * true iff the thread had to spin or block to acquire
13873 * an exclusive lock.
13874 *
13875 * The map MUST be locked by the caller and WILL be
13876 * locked on exit. In order to guarantee the
13877 * existence of the returned object, it is returned
13878 * locked.
13879 *
13880 * If a lookup is requested with "write protection"
13881 * specified, the map may be changed to perform virtual
13882 * copying operations, although the data referenced will
13883 * remain the same.
13884 */
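/*
 * Typical caller pattern (illustrative sketch only, simplified from what
 * the fault path does; the local variable names are made up):
 *
 *	vm_map_lock_read(map);
 *	kr = vm_map_lookup_and_lock_object(&map, vaddr, fault_type,
 *	    OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot,
 *	    &wired, &fault_info, &real_map, NULL);
 *	if (kr != KERN_SUCCESS) { ... }
 *	if (real_map != map) {
 *		vm_map_unlock(real_map);
 *	}
 *	vm_map_unlock_read(map);	// "object" stays locked
 *	... resolve the fault against (object, offset) ...
 *	vm_map_lock_read(map);
 *	if (!vm_map_verify(map, &version)) {
 *		// the map changed while unlocked: redo the lookup
 *	}
 */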
13885kern_return_t
13886vm_map_lookup_and_lock_object(
13887 vm_map_t *var_map, /* IN/OUT */
13888 vm_map_offset_t vaddr,
13889 vm_prot_t fault_type,
13890 int object_lock_type,
13891 vm_map_version_t *out_version, /* OUT */
13892 vm_object_t *object, /* OUT */
13893 vm_object_offset_t *offset, /* OUT */
13894 vm_prot_t *out_prot, /* OUT */
13895 boolean_t *wired, /* OUT */
13896 vm_object_fault_info_t fault_info, /* OUT */
13897 vm_map_t *real_map, /* OUT */
13898 bool *contended) /* OUT */
13899{
13900 vm_map_entry_t entry;
13901 vm_map_t map = *var_map;
13902 vm_map_t old_map = *var_map;
13903 vm_map_t cow_sub_map_parent = VM_MAP_NULL;
13904 vm_map_offset_t cow_parent_vaddr = 0;
13905 vm_map_offset_t old_start = 0;
13906 vm_map_offset_t old_end = 0;
13907 vm_prot_t prot;
13908 boolean_t mask_protections;
13909 boolean_t force_copy;
13910 boolean_t no_force_copy_if_executable;
13911 boolean_t submap_needed_copy;
13912 vm_prot_t original_fault_type;
13913 vm_map_size_t fault_page_mask;
13914
13915 /*
13916 * VM_PROT_MASK means that the caller wants us to use "fault_type"
13917 * as a mask against the mapping's actual protections, not as an
13918 * absolute value.
13919 */
13920 mask_protections = (fault_type & VM_PROT_IS_MASK) ? TRUE : FALSE;
13921 force_copy = (fault_type & VM_PROT_COPY) ? TRUE : FALSE;
13922 no_force_copy_if_executable = (fault_type & VM_PROT_COPY_FAIL_IF_EXECUTABLE) ? TRUE : FALSE;
13923 fault_type &= VM_PROT_ALL;
13924 original_fault_type = fault_type;
13925 if (contended) {
13926 *contended = false;
13927 }
13928
13929 *real_map = map;
13930
13931 fault_page_mask = MIN(VM_MAP_PAGE_MASK(map), PAGE_MASK);
13932 vaddr = VM_MAP_TRUNC_PAGE(vaddr, fault_page_mask);
13933
13934RetryLookup:
13935 fault_type = original_fault_type;
13936
13937 /*
13938 * If the map has an interesting hint, try it before calling
13939 * full blown lookup routine.
13940 */
13941 entry = map->hint;
13942
13943 if ((entry == vm_map_to_entry(map)) ||
13944 (vaddr < entry->vme_start) || (vaddr >= entry->vme_end)) {
13945 vm_map_entry_t tmp_entry;
13946
13947 /*
13948 * Entry was either not a valid hint, or the vaddr
13949 * was not contained in the entry, so do a full lookup.
13950 */
13951		if (!vm_map_lookup_entry(map, vaddr, &tmp_entry)) {
13952 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
13953 vm_map_unlock(cow_sub_map_parent);
13954 }
13955 if ((*real_map != map)
13956 && (*real_map != cow_sub_map_parent)) {
13957 vm_map_unlock(*real_map);
13958 }
13959 return KERN_INVALID_ADDRESS;
13960 }
13961
13962 entry = tmp_entry;
13963 }
13964 if (map == old_map) {
13965 old_start = entry->vme_start;
13966 old_end = entry->vme_end;
13967 }
13968
13969 /*
13970 * Handle submaps. Drop lock on upper map, submap is
13971 * returned locked.
13972 */
13973
13974 submap_needed_copy = FALSE;
13975submap_recurse:
13976 if (entry->is_sub_map) {
13977 vm_map_offset_t local_vaddr;
13978 vm_map_offset_t end_delta;
13979 vm_map_offset_t start_delta;
13980 vm_map_offset_t top_entry_saved_start;
13981 vm_object_offset_t top_entry_saved_offset;
13982 vm_map_entry_t submap_entry, saved_submap_entry;
13983 vm_object_offset_t submap_entry_offset;
13984 vm_object_size_t submap_entry_size;
13985 vm_prot_t subentry_protection;
13986 vm_prot_t subentry_max_protection;
13987 boolean_t subentry_no_copy_on_read;
13988 boolean_t subentry_permanent;
13989 boolean_t subentry_csm_associated;
13990#if __arm64e__
13991 boolean_t subentry_used_for_tpro;
13992#endif /* __arm64e__ */
13993 boolean_t mapped_needs_copy = FALSE;
13994 vm_map_version_t version;
13995
13996 assertf(VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)) >= VM_MAP_PAGE_SHIFT(map),
13997 "map %p (%d) entry %p submap %p (%d)\n",
13998 map, VM_MAP_PAGE_SHIFT(map), entry,
13999 VME_SUBMAP(entry), VM_MAP_PAGE_SHIFT(VME_SUBMAP(entry)));
14000
14001 local_vaddr = vaddr;
14002 top_entry_saved_start = entry->vme_start;
14003 top_entry_saved_offset = VME_OFFSET(entry);
14004
14005 if ((entry->use_pmap &&
14006 !((fault_type & VM_PROT_WRITE) ||
14007 force_copy))) {
14008 /* if real_map equals map we unlock below */
14009 if ((*real_map != map) &&
14010 (*real_map != cow_sub_map_parent)) {
14011 vm_map_unlock(*real_map);
14012 }
14013 *real_map = VME_SUBMAP(entry);
14014 }
14015
14016 if (entry->needs_copy &&
14017 ((fault_type & VM_PROT_WRITE) ||
14018 force_copy)) {
14019 if (!mapped_needs_copy) {
14020 if (vm_map_lock_read_to_write(map)) {
14021 vm_map_lock_read(map);
14022 *real_map = map;
14023 goto RetryLookup;
14024 }
14025 vm_map_lock_read(VME_SUBMAP(entry));
14026 *var_map = VME_SUBMAP(entry);
14027 cow_sub_map_parent = map;
14028 /* reset base to map before cow object */
14029 /* this is the map which will accept */
14030 /* the new cow object */
14031 old_start = entry->vme_start;
14032 old_end = entry->vme_end;
14033 cow_parent_vaddr = vaddr;
14034 mapped_needs_copy = TRUE;
14035 } else {
14036 vm_map_lock_read(VME_SUBMAP(entry));
14037 *var_map = VME_SUBMAP(entry);
14038 if ((cow_sub_map_parent != map) &&
14039 (*real_map != map)) {
14040 vm_map_unlock(map);
14041 }
14042 }
14043 } else {
14044 if (entry->needs_copy) {
14045 submap_needed_copy = TRUE;
14046 }
14047 vm_map_lock_read(VME_SUBMAP(entry));
14048 *var_map = VME_SUBMAP(entry);
14049 /* leave map locked if it is a target */
14050 /* cow sub_map above otherwise, just */
14051 /* follow the maps down to the object */
14052 /* here we unlock knowing we are not */
14053 /* revisiting the map. */
14054 if ((*real_map != map) && (map != cow_sub_map_parent)) {
14055 vm_map_unlock_read(map);
14056 }
14057 }
14058
14059 entry = NULL;
14060 map = *var_map;
14061
14062 /* calculate the offset in the submap for vaddr */
14063 local_vaddr = (local_vaddr - top_entry_saved_start) + top_entry_saved_offset;
14064 assertf(VM_MAP_PAGE_ALIGNED(local_vaddr, fault_page_mask),
14065 "local_vaddr 0x%llx entry->vme_start 0x%llx fault_page_mask 0x%llx\n",
14066 (uint64_t)local_vaddr, (uint64_t)top_entry_saved_start, (uint64_t)fault_page_mask);
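		/*
		 * Illustrative example (made-up addresses): if the top-level
		 * entry starts at 0x7ff800000000 and maps the submap at
		 * offset 0x1000000, a fault at vaddr 0x7ff800004000 gives
		 * local_vaddr = (0x7ff800004000 - 0x7ff800000000) + 0x1000000
		 *             = 0x1004000.
		 */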
14067
14068RetrySubMap:
14069		if (!vm_map_lookup_entry(map, local_vaddr, &submap_entry)) {
14070 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14071 vm_map_unlock(cow_sub_map_parent);
14072 }
14073 if ((*real_map != map)
14074 && (*real_map != cow_sub_map_parent)) {
14075 vm_map_unlock(*real_map);
14076 }
14077 *real_map = map;
14078 return KERN_INVALID_ADDRESS;
14079 }
14080
14081 /* find the attenuated shadow of the underlying object */
14082 /* on our target map */
14083
14084		/* In English: the submap object may extend beyond the */
14085		/* region mapped by the entry, or may only fill a portion */
14086		/* of it. For our purposes, we only care if the object */
14087		/* doesn't fill the entry. In that case the area which will */
14088		/* ultimately be clipped in the top map only needs */
14089		/* to be as big as the portion of the underlying entry */
14090		/* which is mapped. */
14091 start_delta = submap_entry->vme_start > top_entry_saved_offset ?
14092 submap_entry->vme_start - top_entry_saved_offset : 0;
14093
14094 end_delta =
14095 (top_entry_saved_offset + start_delta + (old_end - old_start)) <=
14096 submap_entry->vme_end ?
14097 0 : (top_entry_saved_offset +
14098 (old_end - old_start))
14099 - submap_entry->vme_end;
14100
14101 old_start += start_delta;
14102 old_end -= end_delta;
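		/*
		 * Illustrative example (made-up values): if the parent entry
		 * maps [0x1000, 0x9000) at submap offset 0 and the submap
		 * entry only covers [0x2000, 0x6000), then start_delta and
		 * end_delta are both 0x2000, so [old_start, old_end) is
		 * narrowed to [0x3000, 0x7000) -- the part of the parent
		 * range actually backed by this submap entry.
		 */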
14103
14104 if (submap_entry->is_sub_map) {
14105 entry = submap_entry;
14106 vaddr = local_vaddr;
14107 goto submap_recurse;
14108 }
14109
14110 if (((fault_type & VM_PROT_WRITE) ||
14111 force_copy)
14112 && cow_sub_map_parent) {
14113 vm_object_t sub_object, copy_object;
14114 vm_object_offset_t copy_offset;
14115 vm_map_offset_t local_start;
14116 vm_map_offset_t local_end;
14117 boolean_t object_copied = FALSE;
14118 vm_object_offset_t object_copied_offset = 0;
14119 boolean_t object_copied_needs_copy = FALSE;
14120 kern_return_t kr = KERN_SUCCESS;
14121
14122 if (vm_map_lock_read_to_write(map)) {
14123 vm_map_lock_read(map);
14124 old_start -= start_delta;
14125 old_end += end_delta;
14126 goto RetrySubMap;
14127 }
14128
14129
14130 sub_object = VME_OBJECT(submap_entry);
14131 if (sub_object == VM_OBJECT_NULL) {
14132 sub_object =
14133 vm_object_allocate(
14134					    (vm_map_size_t)
14135					    (submap_entry->vme_end -
14136					    submap_entry->vme_start));
14137				VME_OBJECT_SET(submap_entry, sub_object, false, 0);
14138				VME_OFFSET_SET(submap_entry, 0);
14139 assert(!submap_entry->is_sub_map);
14140 assert(submap_entry->use_pmap);
14141 }
14142 local_start = local_vaddr -
14143 (cow_parent_vaddr - old_start);
14144 local_end = local_vaddr +
14145 (old_end - cow_parent_vaddr);
14146			vm_map_clip_start(map, submap_entry, local_start);
14147			vm_map_clip_end(map, submap_entry, local_end);
14148 if (submap_entry->is_sub_map) {
14149 /* unnesting was done when clipping */
14150 assert(!submap_entry->use_pmap);
14151 }
14152
14153			/* This is the COW case: let's connect */
14154 /* an entry in our space to the underlying */
14155 /* object in the submap, bypassing the */
14156 /* submap. */
14157			submap_entry_offset = VME_OFFSET(submap_entry);
14158 submap_entry_size = submap_entry->vme_end - submap_entry->vme_start;
14159
14160 if ((submap_entry->wired_count != 0 ||
14161 sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) &&
14162 (submap_entry->protection & VM_PROT_EXECUTE) &&
14163 no_force_copy_if_executable) {
14164// printf("FBDP map %p entry %p start 0x%llx end 0x%llx wired %d strat %d\n", map, submap_entry, (uint64_t)local_start, (uint64_t)local_end, submap_entry->wired_count, sub_object->copy_strategy);
14165 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14166 vm_map_unlock(cow_sub_map_parent);
14167 }
14168 if ((*real_map != map)
14169 && (*real_map != cow_sub_map_parent)) {
14170 vm_map_unlock(*real_map);
14171 }
14172 *real_map = map;
14173				ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_NO_COW_ON_EXECUTABLE), 0 /* arg */);
14174 vm_map_lock_write_to_read(map);
14175 kr = KERN_PROTECTION_FAILURE;
14176 DTRACE_VM4(submap_no_copy_executable,
14177 vm_map_t, map,
14178 vm_object_offset_t, submap_entry_offset,
14179 vm_object_size_t, submap_entry_size,
14180 int, kr);
14181 return kr;
14182 }
14183
14184 if (submap_entry->wired_count != 0) {
14185 vm_object_reference(sub_object);
14186
14187 assertf(VM_MAP_PAGE_ALIGNED(VME_OFFSET(submap_entry), VM_MAP_PAGE_MASK(map)),
14188 "submap_entry %p offset 0x%llx\n",
14189 submap_entry, VME_OFFSET(submap_entry));
14190
14191 DTRACE_VM6(submap_copy_slowly,
14192 vm_map_t, cow_sub_map_parent,
14193 vm_map_offset_t, vaddr,
14194 vm_map_t, map,
14195 vm_object_size_t, submap_entry_size,
14196 int, submap_entry->wired_count,
14197 int, sub_object->copy_strategy);
14198
14199 saved_submap_entry = submap_entry;
14200 version.main_timestamp = map->timestamp;
14201 vm_map_unlock(map); /* Increments timestamp by 1 */
14202 submap_entry = VM_MAP_ENTRY_NULL;
14203
14204 vm_object_lock(sub_object);
14205				kr = vm_object_copy_slowly(sub_object,
14206				    submap_entry_offset,
14207				    submap_entry_size,
14208				    FALSE,
14209				    &copy_object);
14210				object_copied = TRUE;
14211				object_copied_offset = 0;
14212				/* 4k: account for extra offset in physical page */
14213				object_copied_offset += submap_entry_offset - vm_object_trunc_page(submap_entry_offset);
14214				object_copied_needs_copy = FALSE;
14215				vm_object_deallocate(sub_object);
14216
14217 vm_map_lock(map);
14218
14219 if (kr != KERN_SUCCESS &&
14220 kr != KERN_MEMORY_RESTART_COPY) {
14221 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14222 vm_map_unlock(cow_sub_map_parent);
14223 }
14224 if ((*real_map != map)
14225 && (*real_map != cow_sub_map_parent)) {
14226 vm_map_unlock(*real_map);
14227 }
14228 *real_map = map;
14229					vm_object_deallocate(copy_object);
14230					copy_object = VM_OBJECT_NULL;
14231					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_SLOWLY_FAILED), 0 /* arg */);
14232 vm_map_lock_write_to_read(map);
14233 DTRACE_VM4(submap_copy_error_slowly,
14234 vm_object_t, sub_object,
14235 vm_object_offset_t, submap_entry_offset,
14236 vm_object_size_t, submap_entry_size,
14237 int, kr);
14238 vm_map_lookup_and_lock_object_copy_slowly_error++;
14239 return kr;
14240 }
14241
14242 if ((kr == KERN_SUCCESS) &&
14243 (version.main_timestamp + 1) == map->timestamp) {
14244 submap_entry = saved_submap_entry;
14245 } else {
14246 saved_submap_entry = NULL;
14247 old_start -= start_delta;
14248 old_end += end_delta;
14249					vm_object_deallocate(copy_object);
14250 copy_object = VM_OBJECT_NULL;
14251 vm_map_lock_write_to_read(map);
14252 vm_map_lookup_and_lock_object_copy_slowly_restart++;
14253 goto RetrySubMap;
14254 }
14255 vm_map_lookup_and_lock_object_copy_slowly_count++;
14256 vm_map_lookup_and_lock_object_copy_slowly_size += submap_entry_size;
14257 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_slowly_max) {
14258 vm_map_lookup_and_lock_object_copy_slowly_max = submap_entry_size;
14259 }
14260 } else if (sub_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
14261				submap_entry_offset = VME_OFFSET(submap_entry);
14262 copy_object = VM_OBJECT_NULL;
14263 object_copied_offset = submap_entry_offset;
14264 object_copied_needs_copy = FALSE;
14265 DTRACE_VM6(submap_copy_strategically,
14266 vm_map_t, cow_sub_map_parent,
14267 vm_map_offset_t, vaddr,
14268 vm_map_t, map,
14269 vm_object_size_t, submap_entry_size,
14270 int, submap_entry->wired_count,
14271 int, sub_object->copy_strategy);
14272				kr = vm_object_copy_strategically(
14273					sub_object,
14274					submap_entry_offset,
14275					submap_entry->vme_end - submap_entry->vme_start,
14276					false, /* forking */
14277					&copy_object,
14278					&object_copied_offset,
14279					&object_copied_needs_copy);
14280 if (kr == KERN_MEMORY_RESTART_COPY) {
14281 old_start -= start_delta;
14282 old_end += end_delta;
14283					vm_object_deallocate(copy_object);
14284 copy_object = VM_OBJECT_NULL;
14285 vm_map_lock_write_to_read(map);
14286 vm_map_lookup_and_lock_object_copy_strategically_restart++;
14287 goto RetrySubMap;
14288 }
14289 if (kr != KERN_SUCCESS) {
14290 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14291 vm_map_unlock(cow_sub_map_parent);
14292 }
14293 if ((*real_map != map)
14294 && (*real_map != cow_sub_map_parent)) {
14295 vm_map_unlock(*real_map);
14296 }
14297 *real_map = map;
14298					vm_object_deallocate(copy_object);
14299					copy_object = VM_OBJECT_NULL;
14300					ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_SUBMAP_COPY_STRAT_FAILED), 0 /* arg */);
14301 vm_map_lock_write_to_read(map);
14302 DTRACE_VM4(submap_copy_error_strategically,
14303 vm_object_t, sub_object,
14304 vm_object_offset_t, submap_entry_offset,
14305 vm_object_size_t, submap_entry_size,
14306 int, kr);
14307 vm_map_lookup_and_lock_object_copy_strategically_error++;
14308 return kr;
14309 }
14310 assert(copy_object != VM_OBJECT_NULL);
14311 assert(copy_object != sub_object);
14312 object_copied = TRUE;
14313 vm_map_lookup_and_lock_object_copy_strategically_count++;
14314 vm_map_lookup_and_lock_object_copy_strategically_size += submap_entry_size;
14315 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_strategically_max) {
14316 vm_map_lookup_and_lock_object_copy_strategically_max = submap_entry_size;
14317 }
14318 } else {
14319 /* set up shadow object */
14320 object_copied = FALSE;
14321 copy_object = sub_object;
14322 vm_object_lock(sub_object);
14323 vm_object_reference_locked(sub_object);
14324				VM_OBJECT_SET_SHADOWED(sub_object, TRUE);
14325 vm_object_unlock(sub_object);
14326
14327 assert(submap_entry->wired_count == 0);
14328 submap_entry->needs_copy = TRUE;
14329
14330 prot = submap_entry->protection;
14331				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14332 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14333 __FUNCTION__,
14334 map, map->pmap, submap_entry,
14335 (uint64_t)submap_entry->vme_start,
14336 (uint64_t)submap_entry->vme_end,
14337 prot);
14338 }
14339 prot = prot & ~VM_PROT_WRITE;
14340				if (pmap_has_prot_policy(map->pmap, submap_entry->translated_allow_execute, prot)) {
14341 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
14342 __FUNCTION__,
14343 map, map->pmap, submap_entry,
14344 (uint64_t)submap_entry->vme_start,
14345 (uint64_t)submap_entry->vme_end,
14346 prot);
14347 }
14348
14349				if (override_nx(old_map,
14350 VME_ALIAS(submap_entry))
14351 && prot) {
14352 prot |= VM_PROT_EXECUTE;
14353 }
14354
14355				vm_object_pmap_protect(
14356					sub_object,
14357					VME_OFFSET(submap_entry),
14358					submap_entry->vme_end -
14359					submap_entry->vme_start,
14360					(submap_entry->is_shared
14361					|| map->mapped_in_other_pmaps) ?
14362					PMAP_NULL : map->pmap,
14363					VM_MAP_PAGE_SIZE(map),
14364					submap_entry->vme_start,
14365					prot);
14366 vm_map_lookup_and_lock_object_copy_shadow_count++;
14367 vm_map_lookup_and_lock_object_copy_shadow_size += submap_entry_size;
14368 if (submap_entry_size > vm_map_lookup_and_lock_object_copy_shadow_max) {
14369 vm_map_lookup_and_lock_object_copy_shadow_max = submap_entry_size;
14370 }
14371 }
14372
14373 /*
14374 * Adjust the fault offset to the submap entry.
14375 */
14376 copy_offset = (local_vaddr -
14377 submap_entry->vme_start +
14378			    VME_OFFSET(submap_entry));
14379
14380			/* This works differently from the */
14381			/* normal submap case. We go back */
14382			/* to the parent of the cow map and */
14383			/* clip out the target portion of */
14384			/* the sub_map, substituting the */
14385			/* new copy object. */
14386
14387 subentry_protection = submap_entry->protection;
14388 subentry_max_protection = submap_entry->max_protection;
14389 subentry_no_copy_on_read = submap_entry->vme_no_copy_on_read;
14390 subentry_permanent = submap_entry->vme_permanent;
14391 subentry_csm_associated = submap_entry->csm_associated;
14392#if __arm64e__
14393 subentry_used_for_tpro = submap_entry->used_for_tpro;
14394#endif // __arm64e__
14395 vm_map_unlock(map);
14396 submap_entry = NULL; /* not valid after map unlock */
14397
14398 local_start = old_start;
14399 local_end = old_end;
14400 map = cow_sub_map_parent;
14401 *var_map = cow_sub_map_parent;
14402 vaddr = cow_parent_vaddr;
14403 cow_sub_map_parent = NULL;
14404
14405 if (!vm_map_lookup_entry(map,
14406			    vaddr, &entry)) {
14407 if ((cow_sub_map_parent) && (cow_sub_map_parent != map)) {
14408 vm_map_unlock(cow_sub_map_parent);
14409 }
14410 if ((*real_map != map)
14411 && (*real_map != cow_sub_map_parent)) {
14412 vm_map_unlock(*real_map);
14413 }
14414 *real_map = map;
14415 vm_object_deallocate(
14416				    copy_object);
14417 copy_object = VM_OBJECT_NULL;
14418 vm_map_lock_write_to_read(map);
14419 DTRACE_VM4(submap_lookup_post_unlock,
14420 uint64_t, (uint64_t)entry->vme_start,
14421 uint64_t, (uint64_t)entry->vme_end,
14422 vm_map_offset_t, vaddr,
14423 int, object_copied);
14424 return KERN_INVALID_ADDRESS;
14425 }
14426
14427 /* clip out the portion of space */
14428 /* mapped by the sub map which */
14429 /* corresponds to the underlying */
14430 /* object */
14431
14432 /*
14433 * Clip (and unnest) the smallest nested chunk
14434 * possible around the faulting address...
14435 */
14436			local_start = vaddr & ~(pmap_shared_region_size_min(map->pmap) - 1);
14437			local_end = local_start + pmap_shared_region_size_min(map->pmap);
14438 /*
14439 * ... but don't go beyond the "old_start" to "old_end"
14440 * range, to avoid spanning over another VM region
14441 * with a possibly different VM object and/or offset.
14442 */
14443 if (local_start < old_start) {
14444 local_start = old_start;
14445 }
14446 if (local_end > old_end) {
14447 local_end = old_end;
14448 }
14449 /*
14450 * Adjust copy_offset to the start of the range.
14451 */
14452 copy_offset -= (vaddr - local_start);
14453
14454			vm_map_clip_start(map, entry, local_start);
14455			vm_map_clip_end(map, entry, local_end);
14456 if (entry->is_sub_map) {
14457 /* unnesting was done when clipping */
14458 assert(!entry->use_pmap);
14459 }
14460
14461 /* substitute copy object for */
14462 /* shared map entry */
14463 vm_map_deallocate(VME_SUBMAP(entry));
14464 assert(!entry->iokit_acct);
14465 entry->use_pmap = TRUE;
14466			VME_OBJECT_SET(entry, copy_object, false, 0);
14467
14468 /* propagate the submap entry's protections */
14469 if (entry->protection != VM_PROT_READ) {
14470 /*
14471 * Someone has already altered the top entry's
14472 * protections via vm_protect(VM_PROT_COPY).
14473 * Respect these new values and ignore the
14474 * submap entry's protections.
14475 */
14476 } else {
14477 /*
14478 * Regular copy-on-write: propagate the submap
14479 * entry's protections to the top map entry.
14480 */
14481 entry->protection |= subentry_protection;
14482 }
14483 entry->max_protection |= subentry_max_protection;
14484 /* propagate some attributes from subentry */
14485 entry->vme_no_copy_on_read = subentry_no_copy_on_read;
14486 entry->vme_permanent = subentry_permanent;
14487 entry->csm_associated = subentry_csm_associated;
14488#if __arm64e__
14489 /* propagate TPRO iff the destination map has TPRO enabled */
14490 if (subentry_used_for_tpro && vm_map_tpro(map)) {
14491 entry->used_for_tpro = subentry_used_for_tpro;
14492 }
14493#endif /* __arm64e */
14494 if ((entry->protection & VM_PROT_WRITE) &&
14495 (entry->protection & VM_PROT_EXECUTE) &&
14496#if XNU_TARGET_OS_OSX
14497 map->pmap != kernel_pmap &&
14498 (vm_map_cs_enforcement(map)
14499#if __arm64__
14500 || !VM_MAP_IS_EXOTIC(map)
14501#endif /* __arm64__ */
14502 ) &&
14503#endif /* XNU_TARGET_OS_OSX */
14504#if CODE_SIGNING_MONITOR
14505 (csm_address_space_exempt(map->pmap) != KERN_SUCCESS) &&
14506#endif
14507 !(entry->used_for_jit) &&
14508 VM_MAP_POLICY_WX_STRIP_X(map)) {
14509 DTRACE_VM3(cs_wx,
14510 uint64_t, (uint64_t)entry->vme_start,
14511 uint64_t, (uint64_t)entry->vme_end,
14512 vm_prot_t, entry->protection);
14513				printf("CODE SIGNING: %d[%s] %s:%d(0x%llx,0x%llx,0x%x) can't have both write and exec at the same time\n",
14514				    proc_selfpid(),
14515				    (get_bsdtask_info(current_task())
14516				    ? proc_name_address(get_bsdtask_info(current_task()))
14517 : "?"),
14518 __FUNCTION__, __LINE__,
14519#if DEVELOPMENT || DEBUG
14520 (uint64_t)entry->vme_start,
14521 (uint64_t)entry->vme_end,
14522#else /* DEVELOPMENT || DEBUG */
14523 (uint64_t)0,
14524 (uint64_t)0,
14525#endif /* DEVELOPMENT || DEBUG */
14526 entry->protection);
14527 entry->protection &= ~VM_PROT_EXECUTE;
14528 }
14529
14530 if (object_copied) {
14531				VME_OFFSET_SET(entry, local_start - old_start + object_copied_offset);
14532 entry->needs_copy = object_copied_needs_copy;
14533 entry->is_shared = FALSE;
14534 } else {
14535 assert(VME_OBJECT(entry) != VM_OBJECT_NULL);
14536 assert(VME_OBJECT(entry)->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC);
14537 assert(entry->wired_count == 0);
14538				VME_OFFSET_SET(entry, copy_offset);
14539 entry->needs_copy = TRUE;
14540 if (map != old_map) {
14541 entry->is_shared = TRUE;
14542 }
14543 }
14544 if (entry->inheritance == VM_INHERIT_SHARE) {
14545 entry->inheritance = VM_INHERIT_COPY;
14546 }
14547
14548 vm_map_lock_write_to_read(map);
14549 } else {
14550 if ((cow_sub_map_parent)
14551 && (cow_sub_map_parent != *real_map)
14552 && (cow_sub_map_parent != map)) {
14553 vm_map_unlock(cow_sub_map_parent);
14554 }
14555 entry = submap_entry;
14556 vaddr = local_vaddr;
14557 }
14558 }
14559
14560 /*
14561 * Check whether this task is allowed to have
14562 * this page.
14563 */
14564
14565 prot = entry->protection;
14566
14567	if (override_nx(old_map, VME_ALIAS(entry)) && prot) {
14568 /*
14569 * HACK -- if not a stack, then allow execution
14570 */
14571 prot |= VM_PROT_EXECUTE;
14572 }
14573
14574#if __arm64e__
14575 /*
14576 * If the entry we're dealing with is TPRO and we have a write
14577 * fault, inject VM_PROT_WRITE into protections. This allows us
14578 * to maintain RO permissions when not marked as TPRO.
14579 */
14580 if (entry->used_for_tpro && (fault_type & VM_PROT_WRITE)) {
14581 prot |= VM_PROT_WRITE;
14582 }
14583#endif /* __arm64e__ */
14584 if (mask_protections) {
14585 fault_type &= prot;
14586 if (fault_type == VM_PROT_NONE) {
14587 goto protection_failure;
14588 }
14589 }
14590 if (((fault_type & prot) != fault_type)
14591#if __arm64__
14592 /* prefetch abort in execute-only page */
14593 && !(prot == VM_PROT_EXECUTE && fault_type == (VM_PROT_READ | VM_PROT_EXECUTE))
14594#elif defined(__x86_64__)
14595 /* Consider the UEXEC bit when handling an EXECUTE fault */
14596 && !((fault_type & VM_PROT_EXECUTE) && !(prot & VM_PROT_EXECUTE) && (prot & VM_PROT_UEXEC))
14597#endif
14598 ) {
14599protection_failure:
14600 if (*real_map != map) {
14601 vm_map_unlock(*real_map);
14602 }
14603 *real_map = map;
14604
14605 if ((fault_type & VM_PROT_EXECUTE) && prot) {
14606			log_stack_execution_failure((addr64_t)vaddr, prot);
14607 }
14608
14609 DTRACE_VM2(prot_fault, int, 1, (uint64_t *), NULL);
14610 DTRACE_VM3(prot_fault_detailed, vm_prot_t, fault_type, vm_prot_t, prot, void *, vaddr);
14611 /*
14612 * Noisy (esp. internally) and can be inferred from CrashReports. So OFF for now.
14613 *
14614 * ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_PROTECTION_FAILURE), 0);
14615 */
14616 return KERN_PROTECTION_FAILURE;
14617 }
14618
14619 /*
14620 * If this page is not pageable, we have to get
14621 * it for all possible accesses.
14622 */
14623
14624 *wired = (entry->wired_count != 0);
14625 if (*wired) {
14626 fault_type = prot;
14627 }
14628
14629 /*
14630	 * If the entry was copy-on-write, we either make the copy now or demote the permissions allowed.
14631 */
14632
14633 if (entry->needs_copy) {
14634 /*
14635 * If we want to write the page, we may as well
14636 * handle that now since we've got the map locked.
14637 *
14638 * If we don't need to write the page, we just
14639 * demote the permissions allowed.
14640 */
14641
14642 if ((fault_type & VM_PROT_WRITE) || *wired || force_copy) {
14643 /*
14644 * Make a new object, and place it in the
14645 * object chain. Note that no new references
14646 * have appeared -- one just moved from the
14647 * map to the new object.
14648 */
14649
14650 if (vm_map_lock_read_to_write(map)) {
14651 vm_map_lock_read(map);
14652 goto RetryLookup;
14653 }
14654
14655 if (VME_OBJECT(entry)->shadowed == FALSE) {
14656 vm_object_lock(VME_OBJECT(entry));
14657 VM_OBJECT_SET_SHADOWED(VME_OBJECT(entry), TRUE);
14658 vm_object_unlock(VME_OBJECT(entry));
14659 }
14660 VME_OBJECT_SHADOW(entry,
14661			    (vm_map_size_t) (entry->vme_end -
14662			    entry->vme_start),
14663			    vm_map_always_shadow(map));
14664 entry->needs_copy = FALSE;
14665
14666 vm_map_lock_write_to_read(map);
14667 }
14668 if ((fault_type & VM_PROT_WRITE) == 0 && *wired == 0) {
14669 /*
14670 * We're attempting to read a copy-on-write
14671 * page -- don't allow writes.
14672 */
14673
14674 prot &= (~VM_PROT_WRITE);
14675 }
14676 }
14677
14678 if (submap_needed_copy && (prot & VM_PROT_WRITE)) {
14679 /*
14680 * We went through a "needs_copy" submap without triggering
14681 * a copy, so granting write access to the page would bypass
14682 * that submap's "needs_copy".
14683 */
14684 assert(!(fault_type & VM_PROT_WRITE));
14685 assert(!*wired);
14686 assert(!force_copy);
14687 // printf("FBDP %d[%s] submap_needed_copy for %p 0x%llx\n", proc_selfpid(), proc_name_address(current_task()->bsd_info), map, vaddr);
14688 prot &= ~VM_PROT_WRITE;
14689 }
14690
14691 /*
14692 * Create an object if necessary.
14693 */
14694 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
14695 if (vm_map_lock_read_to_write(map)) {
14696 vm_map_lock_read(map);
14697 goto RetryLookup;
14698 }
14699
14700 VME_OBJECT_SET(entry,
14701		    vm_object_allocate(
14702			    (vm_map_size_t)(entry->vme_end -
14703			    entry->vme_start)), false, 0);
14704		VME_OFFSET_SET(entry, 0);
14705 assert(entry->use_pmap);
14706 vm_map_lock_write_to_read(map);
14707 }
14708
14709 /*
14710 * Return the object/offset from this entry. If the entry
14711 * was copy-on-write or empty, it has been fixed up. Also
14712 * return the protection.
14713 */
14714
14715 *offset = (vaddr - entry->vme_start) + VME_OFFSET(entry);
14716 *object = VME_OBJECT(entry);
14717 *out_prot = prot;
14718 KDBG_FILTERED(MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_MAP_LOOKUP_OBJECT), VM_KERNEL_UNSLIDE_OR_PERM(*object), (unsigned long) VME_ALIAS(entry), 0, 0);
14719
14720 if (fault_info) {
14721 fault_info->interruptible = THREAD_UNINT; /* for now... */
14722 /* ... the caller will change "interruptible" if needed */
14723 fault_info->cluster_size = 0;
14724 fault_info->user_tag = VME_ALIAS(entry);
14725 fault_info->pmap_options = 0;
14726 if (entry->iokit_acct ||
14727 (!entry->is_sub_map && !entry->use_pmap)) {
14728 fault_info->pmap_options |= PMAP_OPTIONS_ALT_ACCT;
14729 }
14730 fault_info->behavior = entry->behavior;
14731 fault_info->lo_offset = VME_OFFSET(entry);
14732 fault_info->hi_offset =
14733 (entry->vme_end - entry->vme_start) + VME_OFFSET(entry);
14734 fault_info->no_cache = entry->no_cache;
14735 fault_info->stealth = FALSE;
14736 fault_info->io_sync = FALSE;
14737 if (entry->used_for_jit ||
14738#if CODE_SIGNING_MONITOR
14739 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
14740#endif
14741 entry->vme_resilient_codesign) {
14742 fault_info->cs_bypass = TRUE;
14743 } else {
14744 fault_info->cs_bypass = FALSE;
14745 }
14746 fault_info->csm_associated = FALSE;
14747#if CODE_SIGNING_MONITOR
14748 if (entry->csm_associated) {
14749 /*
14750 * The pmap layer will validate this page
14751 * before allowing it to be executed from.
14752 */
14753 fault_info->csm_associated = TRUE;
14754 }
14755#endif
14756 fault_info->mark_zf_absent = FALSE;
14757 fault_info->batch_pmap_op = FALSE;
14758 fault_info->resilient_media = entry->vme_resilient_media;
14759 fault_info->fi_xnu_user_debug = entry->vme_xnu_user_debug;
14760 fault_info->no_copy_on_read = entry->vme_no_copy_on_read;
14761#if __arm64e__
14762 fault_info->fi_used_for_tpro = entry->used_for_tpro;
14763#else /* __arm64e__ */
14764 fault_info->fi_used_for_tpro = FALSE;
14765#endif
14766 if (entry->translated_allow_execute) {
14767 fault_info->pmap_options |= PMAP_OPTIONS_TRANSLATED_ALLOW_EXECUTE;
14768 }
14769 }
14770
14771 /*
14772 * Lock the object to prevent it from disappearing
14773 */
14774 if (object_lock_type == OBJECT_LOCK_EXCLUSIVE) {
14775 if (contended == NULL) {
14776 vm_object_lock(*object);
14777 } else {
14778 *contended = vm_object_lock_check_contended(*object);
14779 }
14780 } else {
14781 vm_object_lock_shared(*object);
14782 }
14783
14784 /*
14785 * Save the version number
14786 */
14787
14788 out_version->main_timestamp = map->timestamp;
14789
14790 return KERN_SUCCESS;
14791}
14792
14793
14794/*
14795 * vm_map_verify:
14796 *
14797 * Verifies that the map in question has not changed
14798 * since the given version. The map has to be locked
14799 * ("shared" mode is fine) before calling this function
14800 * and it will be returned locked too.
14801 */
14802boolean_t
14803vm_map_verify(
14804 vm_map_t map,
14805 vm_map_version_t *version) /* REF */
14806{
14807 boolean_t result;
14808
14809 vm_map_lock_assert_held(map);
14810 result = (map->timestamp == version->main_timestamp);
14811
14812 return result;
14813}
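
/*
 * vm_map_verify() is meant to be paired with
 * vm_map_lookup_and_lock_object(): callers save the returned version,
 * drop the map lock to do their work, re-lock the map and call
 * vm_map_verify().  If it returns FALSE they must redo the lookup,
 * since the map's timestamp changed while it was unlocked.
 */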
14814
14815/*
14816 * TEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARYTEMPORARY
14817 * Goes away after regular vm_region_recurse function migrates to
14818 * 64 bits
14819 * vm_region_recurse: A form of vm_region which follows the
14820 * submaps in a target map
14821 *
14822 */
14823
14824kern_return_t
14825vm_map_region_recurse_64(
14826 vm_map_t map,
14827 vm_map_offset_t *address, /* IN/OUT */
14828 vm_map_size_t *size, /* OUT */
14829 natural_t *nesting_depth, /* IN/OUT */
14830 vm_region_submap_info_64_t submap_info, /* IN/OUT */
14831 mach_msg_type_number_t *count) /* IN/OUT */
14832{
14833 mach_msg_type_number_t original_count;
14834 vm_region_extended_info_data_t extended;
14835 vm_map_entry_t tmp_entry;
14836 vm_map_offset_t user_address;
14837 unsigned int user_max_depth;
14838
14839 /*
14840 * "curr_entry" is the VM map entry preceding or including the
14841 * address we're looking for.
14842 * "curr_map" is the map or sub-map containing "curr_entry".
14843 * "curr_address" is the equivalent of the top map's "user_address"
14844 * in the current map.
14845 * "curr_offset" is the cumulated offset of "curr_map" in the
14846 * target task's address space.
14847 * "curr_depth" is the depth of "curr_map" in the chain of
14848 * sub-maps.
14849 *
14850 * "curr_max_below" and "curr_max_above" limit the range (around
14851 * "curr_address") we should take into account in the current (sub)map.
14852 * They limit the range to what's visible through the map entries
14853 * we've traversed from the top map to the current map.
14854 *
14855 */
14856 vm_map_entry_t curr_entry;
14857 vm_map_address_t curr_address;
14858 vm_map_offset_t curr_offset;
14859 vm_map_t curr_map;
14860 unsigned int curr_depth;
14861 vm_map_offset_t curr_max_below, curr_max_above;
14862 vm_map_offset_t curr_skip;
14863
14864 /*
14865 * "next_" is the same as "curr_" but for the VM region immediately
14866 * after the address we're looking for. We need to keep track of this
14867 * too because we want to return info about that region if the
14868 * address we're looking for is not mapped.
14869 */
14870 vm_map_entry_t next_entry;
14871 vm_map_offset_t next_offset;
14872 vm_map_offset_t next_address;
14873 vm_map_t next_map;
14874 unsigned int next_depth;
14875 vm_map_offset_t next_max_below, next_max_above;
14876 vm_map_offset_t next_skip;
14877
14878 boolean_t look_for_pages;
14879 vm_region_submap_short_info_64_t short_info;
14880 boolean_t do_region_footprint;
14881 int effective_page_size, effective_page_shift;
14882 boolean_t submap_needed_copy;
14883
14884 if (map == VM_MAP_NULL) {
14885 /* no address space to work on */
14886 return KERN_INVALID_ARGUMENT;
14887 }
14888
14889	effective_page_shift = vm_self_region_page_shift(map);
14890 effective_page_size = (1 << effective_page_shift);
14891
14892 if (*count < VM_REGION_SUBMAP_SHORT_INFO_COUNT_64) {
14893 /*
14894 * "info" structure is not big enough and
14895 * would overflow
14896 */
14897 return KERN_INVALID_ARGUMENT;
14898 }
14899
14900 do_region_footprint = task_self_region_footprint();
14901 original_count = *count;
14902
14903 if (original_count < VM_REGION_SUBMAP_INFO_V0_COUNT_64) {
14904 *count = VM_REGION_SUBMAP_SHORT_INFO_COUNT_64;
14905 look_for_pages = FALSE;
14906 short_info = (vm_region_submap_short_info_64_t) submap_info;
14907 submap_info = NULL;
14908 } else {
14909 look_for_pages = TRUE;
14910 *count = VM_REGION_SUBMAP_INFO_V0_COUNT_64;
14911 short_info = NULL;
14912
14913 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
14914 *count = VM_REGION_SUBMAP_INFO_V1_COUNT_64;
14915 }
14916 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
14917 *count = VM_REGION_SUBMAP_INFO_V2_COUNT_64;
14918 }
14919 }
14920
14921 user_address = *address;
14922 user_max_depth = *nesting_depth;
14923 submap_needed_copy = FALSE;
14924
14925 if (not_in_kdp) {
14926 vm_map_lock_read(map);
14927 }
14928
14929recurse_again:
14930 curr_entry = NULL;
14931 curr_map = map;
14932 curr_address = user_address;
14933 curr_offset = 0;
14934 curr_skip = 0;
14935 curr_depth = 0;
14936 curr_max_above = ((vm_map_offset_t) -1) - curr_address;
14937 curr_max_below = curr_address;
14938
14939 next_entry = NULL;
14940 next_map = NULL;
14941 next_address = 0;
14942 next_offset = 0;
14943 next_skip = 0;
14944 next_depth = 0;
14945 next_max_above = (vm_map_offset_t) -1;
14946 next_max_below = (vm_map_offset_t) -1;
14947
14948 for (;;) {
14949		if (vm_map_lookup_entry(curr_map,
14950		    curr_address,
14951		    &tmp_entry)) {
14952 /* tmp_entry contains the address we're looking for */
14953 curr_entry = tmp_entry;
14954 } else {
14955 vm_map_offset_t skip;
14956 /*
14957 * The address is not mapped. "tmp_entry" is the
14958 * map entry preceding the address. We want the next
14959 * one, if it exists.
14960 */
14961 curr_entry = tmp_entry->vme_next;
14962
14963 if (curr_entry == vm_map_to_entry(curr_map) ||
14964 (curr_entry->vme_start >=
14965 curr_address + curr_max_above)) {
14966 /* no next entry at this level: stop looking */
14967 if (not_in_kdp) {
14968 vm_map_unlock_read(curr_map);
14969 }
14970 curr_entry = NULL;
14971 curr_map = NULL;
14972 curr_skip = 0;
14973 curr_offset = 0;
14974 curr_depth = 0;
14975 curr_max_above = 0;
14976 curr_max_below = 0;
14977 break;
14978 }
14979
14980 /* adjust current address and offset */
14981 skip = curr_entry->vme_start - curr_address;
14982 curr_address = curr_entry->vme_start;
14983 curr_skip += skip;
14984 curr_offset += skip;
14985 curr_max_above -= skip;
14986 curr_max_below = 0;
14987 }
14988
14989 /*
14990 * Is the next entry at this level closer to the address (or
14991 * deeper in the submap chain) than the one we had
14992 * so far?
14993 */
14994 tmp_entry = curr_entry->vme_next;
14995 if (tmp_entry == vm_map_to_entry(curr_map)) {
14996 /* no next entry at this level */
14997 } else if (tmp_entry->vme_start >=
14998 curr_address + curr_max_above) {
14999 /*
15000 * tmp_entry is beyond the scope of what we mapped of
15001 * this submap in the upper level: ignore it.
15002 */
15003 } else if ((next_entry == NULL) ||
15004 (tmp_entry->vme_start + curr_offset <=
15005 next_entry->vme_start + next_offset)) {
15006 /*
15007 * We didn't have a "next_entry" or this one is
15008 * closer to the address we're looking for:
15009 * use this "tmp_entry" as the new "next_entry".
15010 */
15011 if (next_entry != NULL) {
15012 /* unlock the last "next_map" */
15013 if (next_map != curr_map && not_in_kdp) {
15014 vm_map_unlock_read(next_map);
15015 }
15016 }
15017 next_entry = tmp_entry;
15018 next_map = curr_map;
15019 next_depth = curr_depth;
15020 next_address = next_entry->vme_start;
15021 next_skip = curr_skip;
15022 next_skip += (next_address - curr_address);
15023 next_offset = curr_offset;
15024 next_offset += (next_address - curr_address);
15025 next_max_above = MIN(next_max_above, curr_max_above);
15026 next_max_above = MIN(next_max_above,
15027 next_entry->vme_end - next_address);
15028 next_max_below = MIN(next_max_below, curr_max_below);
15029 next_max_below = MIN(next_max_below,
15030 next_address - next_entry->vme_start);
15031 }
15032
15033 /*
15034 * "curr_max_{above,below}" allow us to keep track of the
15035 * portion of the submap that is actually mapped at this level:
15036 * the rest of that submap is irrelevant to us, since it's not
15037 * mapped here.
15038 * The relevant portion of the map starts at
15039 * "VME_OFFSET(curr_entry)" up to the size of "curr_entry".
15040 */
15041 curr_max_above = MIN(curr_max_above,
15042 curr_entry->vme_end - curr_address);
15043 curr_max_below = MIN(curr_max_below,
15044 curr_address - curr_entry->vme_start);
15045
15046 if (!curr_entry->is_sub_map ||
15047 curr_depth >= user_max_depth) {
15048 /*
15049 * We hit a leaf map or we reached the maximum depth
15050 * we could, so stop looking. Keep the current map
15051 * locked.
15052 */
15053 break;
15054 }
15055
15056 /*
15057 * Get down to the next submap level.
15058 */
15059
15060 if (curr_entry->needs_copy) {
15061 /* everything below this is effectively copy-on-write */
15062 submap_needed_copy = TRUE;
15063 }
15064
15065 /*
15066 * Lock the next level and unlock the current level,
15067 * unless we need to keep it locked to access the "next_entry"
15068 * later.
15069 */
15070 if (not_in_kdp) {
15071 vm_map_lock_read(VME_SUBMAP(curr_entry));
15072 }
15073 if (curr_map == next_map) {
15074 /* keep "next_map" locked in case we need it */
15075 } else {
15076 /* release this map */
15077 if (not_in_kdp) {
15078 vm_map_unlock_read(curr_map);
15079 }
15080 }
15081
15082 /*
15083 * Adjust the offset. "curr_entry" maps the submap
15084 * at relative address "curr_entry->vme_start" in the
15085 * curr_map but skips the first "VME_OFFSET(curr_entry)"
15086 * bytes of the submap.
15087 * "curr_offset" always represents the offset of a virtual
15088 * address in the curr_map relative to the absolute address
15089 * space (i.e. the top-level VM map).
15090 */
15091 curr_offset +=
15092 (VME_OFFSET(curr_entry) - curr_entry->vme_start);
15093 curr_address = user_address + curr_offset;
15094 /* switch to the submap */
15095 curr_map = VME_SUBMAP(curr_entry);
15096 curr_depth++;
15097 curr_entry = NULL;
15098 }
15099
15100// LP64todo: all the current tools are 32bit, obviously never worked for 64b
15101// so probably should be a real 32b ID vs. ptr.
15102// Current users just check for equality
15103
15104 if (curr_entry == NULL) {
15105 /* no VM region contains the address... */
15106
15107 if (do_region_footprint && /* we want footprint numbers */
15108 next_entry == NULL && /* & there are no more regions */
15109 /* & we haven't already provided our fake region: */
15110 user_address <= vm_map_last_entry(map)->vme_end) {
15111 ledger_amount_t ledger_resident, ledger_compressed;
15112
15113 /*
15114 * Add a fake memory region to account for
15115 * purgeable and/or ledger-tagged memory that
15116 * counts towards this task's memory footprint,
15117 * i.e. the resident/compressed pages of non-volatile
15118 * objects owned by that task.
15119 */
15120 task_ledgers_footprint(map->pmap->ledger,
15121 &ledger_resident,
15122 &ledger_compressed);
15123 if (ledger_resident + ledger_compressed == 0) {
15124 /* no purgeable memory usage to report */
15125 return KERN_INVALID_ADDRESS;
15126 }
15127 /* fake region to show nonvolatile footprint */
15128 if (look_for_pages) {
15129 submap_info->protection = VM_PROT_DEFAULT;
15130 submap_info->max_protection = VM_PROT_DEFAULT;
15131 submap_info->inheritance = VM_INHERIT_DEFAULT;
15132 submap_info->offset = 0;
15133 submap_info->user_tag = -1;
15134 submap_info->pages_resident = (unsigned int) (ledger_resident / effective_page_size);
15135 submap_info->pages_shared_now_private = 0;
15136 submap_info->pages_swapped_out = (unsigned int) (ledger_compressed / effective_page_size);
15137 submap_info->pages_dirtied = submap_info->pages_resident;
15138 submap_info->ref_count = 1;
15139 submap_info->shadow_depth = 0;
15140 submap_info->external_pager = 0;
15141 submap_info->share_mode = SM_PRIVATE;
15142 if (submap_needed_copy) {
15143 submap_info->share_mode = SM_COW;
15144 }
15145 submap_info->is_submap = 0;
15146 submap_info->behavior = VM_BEHAVIOR_DEFAULT;
15147 submap_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15148 submap_info->user_wired_count = 0;
15149 submap_info->pages_reusable = 0;
15150 } else {
15151 short_info->user_tag = -1;
15152 short_info->offset = 0;
15153 short_info->protection = VM_PROT_DEFAULT;
15154 short_info->inheritance = VM_INHERIT_DEFAULT;
15155 short_info->max_protection = VM_PROT_DEFAULT;
15156 short_info->behavior = VM_BEHAVIOR_DEFAULT;
15157 short_info->user_wired_count = 0;
15158 short_info->is_submap = 0;
15159 short_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
15160 short_info->external_pager = 0;
15161 short_info->shadow_depth = 0;
15162 short_info->share_mode = SM_PRIVATE;
15163 if (submap_needed_copy) {
15164 short_info->share_mode = SM_COW;
15165 }
15166 short_info->ref_count = 1;
15167 }
15168 *nesting_depth = 0;
15169 *size = (vm_map_size_t) (ledger_resident + ledger_compressed);
15170// *address = user_address;
15171 *address = vm_map_last_entry(map)->vme_end;
15172 return KERN_SUCCESS;
15173 }
15174
15175 if (next_entry == NULL) {
15176 /* ... and no VM region follows it either */
15177 return KERN_INVALID_ADDRESS;
15178 }
15179 /* ... gather info about the next VM region */
15180 curr_entry = next_entry;
15181 curr_map = next_map; /* still locked ... */
15182 curr_address = next_address;
15183 curr_skip = next_skip;
15184 curr_offset = next_offset;
15185 curr_depth = next_depth;
15186 curr_max_above = next_max_above;
15187 curr_max_below = next_max_below;
15188 } else {
15189 /* we won't need "next_entry" after all */
15190 if (next_entry != NULL) {
15191 /* release "next_map" */
15192 if (next_map != curr_map && not_in_kdp) {
15193 vm_map_unlock_read(next_map);
15194 }
15195 }
15196 }
15197 next_entry = NULL;
15198 next_map = NULL;
15199 next_offset = 0;
15200 next_skip = 0;
15201 next_depth = 0;
15202 next_max_below = -1;
15203 next_max_above = -1;
15204
15205 if (curr_entry->is_sub_map &&
15206 curr_depth < user_max_depth) {
15207 /*
15208 * We're not as deep as we could be: we must have
15209 * gone back up after not finding anything mapped
15210 * below the original top-level map entry.
15211 * Let's move "curr_address" forward and recurse again.
15212 */
15213 user_address = curr_address;
15214 goto recurse_again;
15215 }
15216
15217 *nesting_depth = curr_depth;
15218 *size = curr_max_above + curr_max_below;
15219 *address = user_address + curr_skip - curr_max_below;
15220
15221 if (look_for_pages) {
15222 submap_info->user_tag = VME_ALIAS(curr_entry);
15223 submap_info->offset = VME_OFFSET(curr_entry);
15224 submap_info->protection = curr_entry->protection;
15225 submap_info->inheritance = curr_entry->inheritance;
15226 submap_info->max_protection = curr_entry->max_protection;
15227 submap_info->behavior = curr_entry->behavior;
15228 submap_info->user_wired_count = curr_entry->user_wired_count;
15229 submap_info->is_submap = curr_entry->is_sub_map;
15230 if (curr_entry->is_sub_map) {
15231 submap_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15232 } else {
15233 submap_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15234 }
15235 } else {
15236 short_info->user_tag = VME_ALIAS(curr_entry);
15237 short_info->offset = VME_OFFSET(curr_entry);
15238 short_info->protection = curr_entry->protection;
15239 short_info->inheritance = curr_entry->inheritance;
15240 short_info->max_protection = curr_entry->max_protection;
15241 short_info->behavior = curr_entry->behavior;
15242 short_info->user_wired_count = curr_entry->user_wired_count;
15243 short_info->is_submap = curr_entry->is_sub_map;
15244 if (curr_entry->is_sub_map) {
15245 short_info->object_id = VM_OBJECT_ID(VME_SUBMAP(curr_entry));
15246 } else {
15247 short_info->object_id = VM_OBJECT_ID(VME_OBJECT(curr_entry));
15248 }
15249 }
15250
15251 extended.pages_resident = 0;
15252 extended.pages_swapped_out = 0;
15253 extended.pages_shared_now_private = 0;
15254 extended.pages_dirtied = 0;
15255 extended.pages_reusable = 0;
15256 extended.external_pager = 0;
15257 extended.shadow_depth = 0;
15258 extended.share_mode = SM_EMPTY;
15259 extended.ref_count = 0;
15260
15261 if (not_in_kdp) {
15262 if (!curr_entry->is_sub_map) {
15263 vm_map_offset_t range_start, range_end;
15264 range_start = MAX((curr_address - curr_max_below),
15265 curr_entry->vme_start);
15266 range_end = MIN((curr_address + curr_max_above),
15267 curr_entry->vme_end);
15268 vm_map_region_walk(curr_map,
15269 range_start,
15270 curr_entry,
15271 (VME_OFFSET(curr_entry) +
15272 (range_start -
15273 curr_entry->vme_start)),
15274 range_end - range_start,
15275 &extended,
15276 look_for_pages, VM_REGION_EXTENDED_INFO_COUNT);
15277 if (extended.external_pager &&
15278 extended.ref_count == 2 &&
15279 extended.share_mode == SM_SHARED) {
15280 extended.share_mode = SM_PRIVATE;
15281 }
15282 if (submap_needed_copy) {
15283 extended.share_mode = SM_COW;
15284 }
15285 } else {
15286 if (curr_entry->use_pmap) {
15287 extended.share_mode = SM_TRUESHARED;
15288 } else {
15289 extended.share_mode = SM_PRIVATE;
15290 }
15291 extended.ref_count = os_ref_get_count_raw(&VME_SUBMAP(curr_entry)->map_refcnt);
15292 }
15293 }
15294
15295 if (look_for_pages) {
15296 submap_info->pages_resident = extended.pages_resident;
15297 submap_info->pages_swapped_out = extended.pages_swapped_out;
15298 submap_info->pages_shared_now_private =
15299 extended.pages_shared_now_private;
15300 submap_info->pages_dirtied = extended.pages_dirtied;
15301 submap_info->external_pager = extended.external_pager;
15302 submap_info->shadow_depth = extended.shadow_depth;
15303 submap_info->share_mode = extended.share_mode;
15304 submap_info->ref_count = extended.ref_count;
15305
15306 if (original_count >= VM_REGION_SUBMAP_INFO_V1_COUNT_64) {
15307 submap_info->pages_reusable = extended.pages_reusable;
15308 }
15309 if (original_count >= VM_REGION_SUBMAP_INFO_V2_COUNT_64) {
15310 if (curr_entry->is_sub_map) {
15311 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_SUBMAP(curr_entry));
15312 } else if (VME_OBJECT(curr_entry)) {
15313 submap_info->object_id_full = (vm_object_id_t)VM_KERNEL_ADDRHASH(VME_OBJECT(curr_entry));
15314 } else {
15315 submap_info->object_id_full = 0ull;
15316 }
15317 }
15318 } else {
15319 short_info->external_pager = extended.external_pager;
15320 short_info->shadow_depth = extended.shadow_depth;
15321 short_info->share_mode = extended.share_mode;
15322 short_info->ref_count = extended.ref_count;
15323 }
15324
15325 if (not_in_kdp) {
15326 vm_map_unlock_read(curr_map);
15327 }
15328
15329 return KERN_SUCCESS;
15330}
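
/*
 * Illustrative user-space sketch: the recursion above is normally driven
 * through mach_vm_region_recurse(), bumping the nesting depth whenever a
 * submap is reported. A minimal sketch assuming a user-space build with
 * <mach/mach.h>; guarded by "#if 0" so it is never compiled here.
 */
#if 0 /* example only, not compiled */
#include <mach/mach.h>
#include <mach/mach_vm.h>
#include <stdio.h>

static void
dump_regions(void)
{
	mach_vm_address_t addr = 0;
	natural_t depth = 0;

	for (;;) {
		mach_vm_size_t size = 0;
		vm_region_submap_info_data_64_t info;
		mach_msg_type_number_t count = VM_REGION_SUBMAP_INFO_COUNT_64;
		kern_return_t kr;

		kr = mach_vm_region_recurse(mach_task_self(), &addr, &size,
		    &depth, (vm_region_recurse_info_t)&info, &count);
		if (kr != KERN_SUCCESS) {
			break;          /* no region at or above "addr" */
		}
		if (info.is_submap) {
			depth++;        /* look inside the submap next time */
			continue;
		}
		printf("0x%llx-0x%llx depth %u prot 0x%x\n",
		    (unsigned long long)addr,
		    (unsigned long long)(addr + size),
		    depth, info.protection);
		addr += size;           /* move on to the next region */
	}
}
#endif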
15331
15332/*
15333 * vm_region:
15334 *
15335 * User call to obtain information about a region in
15336 * a task's address map. Several flavors of region
15337 * information are supported.
15338 *
15339 * XXX The reserved and behavior fields cannot be filled
15340 * in until the vm merge from the IK is completed, and
15341 * vm_reserve is implemented.
15342 */
15343
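/*
 * Illustrative user-space sketch: vm_map_region() is reached through the
 * mach_vm_region() MIG call. A minimal sketch assuming a user-space build;
 * guarded by "#if 0" so it is never compiled here.
 */
#if 0 /* example only, not compiled */
#include <mach/mach.h>
#include <mach/mach_vm.h>

/* Return the size of the region containing "addr", or 0 if it is unmapped. */
static mach_vm_size_t
region_size_for(mach_vm_address_t addr)
{
	mach_vm_address_t region_addr = addr;
	mach_vm_size_t region_size = 0;
	vm_region_basic_info_data_64_t info;
	mach_msg_type_number_t count = VM_REGION_BASIC_INFO_COUNT_64;
	mach_port_t object_name = MACH_PORT_NULL;  /* always returned as IP_NULL */
	kern_return_t kr;

	kr = mach_vm_region(mach_task_self(), &region_addr, &region_size,
	    VM_REGION_BASIC_INFO_64, (vm_region_info_t)&info,
	    &count, &object_name);
	if (kr != KERN_SUCCESS || region_addr > addr) {
		return 0;       /* "addr" itself is not mapped */
	}
	return region_size;
}
#endif
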
15344kern_return_t
15345vm_map_region(
15346 vm_map_t map,
15347 vm_map_offset_t *address, /* IN/OUT */
15348 vm_map_size_t *size, /* OUT */
15349 vm_region_flavor_t flavor, /* IN */
15350 vm_region_info_t info, /* OUT */
15351 mach_msg_type_number_t *count, /* IN/OUT */
15352 mach_port_t *object_name) /* OUT */
15353{
15354 vm_map_entry_t tmp_entry;
15355 vm_map_entry_t entry;
15356 vm_map_offset_t start;
15357
15358 if (map == VM_MAP_NULL) {
15359 return KERN_INVALID_ARGUMENT;
15360 }
15361
15362 switch (flavor) {
15363 case VM_REGION_BASIC_INFO:
15364 /* legacy for old 32-bit objects info */
15365 {
15366 vm_region_basic_info_t basic;
15367
15368 if (*count < VM_REGION_BASIC_INFO_COUNT) {
15369 return KERN_INVALID_ARGUMENT;
15370 }
15371
15372 basic = (vm_region_basic_info_t) info;
15373 *count = VM_REGION_BASIC_INFO_COUNT;
15374
15375 vm_map_lock_read(map);
15376
15377 start = *address;
15378 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15379 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15380 vm_map_unlock_read(map);
15381 return KERN_INVALID_ADDRESS;
15382 }
15383 } else {
15384 entry = tmp_entry;
15385 }
15386
15387 start = entry->vme_start;
15388
15389 basic->offset = (uint32_t)VME_OFFSET(entry);
15390 basic->protection = entry->protection;
15391 basic->inheritance = entry->inheritance;
15392 basic->max_protection = entry->max_protection;
15393 basic->behavior = entry->behavior;
15394 basic->user_wired_count = entry->user_wired_count;
15395 basic->reserved = entry->is_sub_map;
15396 *address = start;
15397 *size = (entry->vme_end - start);
15398
15399 if (object_name) {
15400 *object_name = IP_NULL;
15401 }
15402 if (entry->is_sub_map) {
15403 basic->shared = FALSE;
15404 } else {
15405 basic->shared = entry->is_shared;
15406 }
15407
15408 vm_map_unlock_read(map);
15409 return KERN_SUCCESS;
15410 }
15411
15412 case VM_REGION_BASIC_INFO_64:
15413 {
15414 vm_region_basic_info_64_t basic;
15415
15416 if (*count < VM_REGION_BASIC_INFO_COUNT_64) {
15417 return KERN_INVALID_ARGUMENT;
15418 }
15419
15420 basic = (vm_region_basic_info_64_t) info;
15421 *count = VM_REGION_BASIC_INFO_COUNT_64;
15422
15423 vm_map_lock_read(map);
15424
15425 start = *address;
15426 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15427 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15428 vm_map_unlock_read(map);
15429 return KERN_INVALID_ADDRESS;
15430 }
15431 } else {
15432 entry = tmp_entry;
15433 }
15434
15435 start = entry->vme_start;
15436
15437 basic->offset = VME_OFFSET(entry);
15438 basic->protection = entry->protection;
15439 basic->inheritance = entry->inheritance;
15440 basic->max_protection = entry->max_protection;
15441 basic->behavior = entry->behavior;
15442 basic->user_wired_count = entry->user_wired_count;
15443 basic->reserved = entry->is_sub_map;
15444 *address = start;
15445 *size = (entry->vme_end - start);
15446
15447 if (object_name) {
15448 *object_name = IP_NULL;
15449 }
15450 if (entry->is_sub_map) {
15451 basic->shared = FALSE;
15452 } else {
15453 basic->shared = entry->is_shared;
15454 }
15455
15456 vm_map_unlock_read(map);
15457 return KERN_SUCCESS;
15458 }
15459 case VM_REGION_EXTENDED_INFO:
15460 if (*count < VM_REGION_EXTENDED_INFO_COUNT) {
15461 return KERN_INVALID_ARGUMENT;
15462 }
15463 OS_FALLTHROUGH;
15464 case VM_REGION_EXTENDED_INFO__legacy:
15465 if (*count < VM_REGION_EXTENDED_INFO_COUNT__legacy) {
15466 return KERN_INVALID_ARGUMENT;
15467 }
15468
15469 {
15470 vm_region_extended_info_t extended;
15471 mach_msg_type_number_t original_count;
15472 int effective_page_size, effective_page_shift;
15473
15474 extended = (vm_region_extended_info_t) info;
15475
15476 effective_page_shift = vm_self_region_page_shift(map);
15477 effective_page_size = (1 << effective_page_shift);
15478
15479 vm_map_lock_read(map);
15480
15481 start = *address;
15482 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15483 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15484 vm_map_unlock_read(map);
15485 return KERN_INVALID_ADDRESS;
15486 }
15487 } else {
15488 entry = tmp_entry;
15489 }
15490 start = entry->vme_start;
15491
15492 extended->protection = entry->protection;
15493 extended->user_tag = VME_ALIAS(entry);
15494 extended->pages_resident = 0;
15495 extended->pages_swapped_out = 0;
15496 extended->pages_shared_now_private = 0;
15497 extended->pages_dirtied = 0;
15498 extended->external_pager = 0;
15499 extended->shadow_depth = 0;
15500
15501 original_count = *count;
15502 if (flavor == VM_REGION_EXTENDED_INFO__legacy) {
15503 *count = VM_REGION_EXTENDED_INFO_COUNT__legacy;
15504 } else {
15505 extended->pages_reusable = 0;
15506 *count = VM_REGION_EXTENDED_INFO_COUNT;
15507 }
15508
15509 vm_map_region_walk(map, start, entry, VME_OFFSET(entry), entry->vme_end - start, extended, TRUE, *count);
15510
15511 if (extended->external_pager && extended->ref_count == 2 && extended->share_mode == SM_SHARED) {
15512 extended->share_mode = SM_PRIVATE;
15513 }
15514
15515 if (object_name) {
15516 *object_name = IP_NULL;
15517 }
15518 *address = start;
15519 *size = (entry->vme_end - start);
15520
15521 vm_map_unlock_read(map);
15522 return KERN_SUCCESS;
15523 }
15524 case VM_REGION_TOP_INFO:
15525 {
15526 vm_region_top_info_t top;
15527
15528 if (*count < VM_REGION_TOP_INFO_COUNT) {
15529 return KERN_INVALID_ARGUMENT;
15530 }
15531
15532 top = (vm_region_top_info_t) info;
15533 *count = VM_REGION_TOP_INFO_COUNT;
15534
15535 vm_map_lock_read(map);
15536
15537 start = *address;
15538 if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
15539 if ((entry = tmp_entry->vme_next) == vm_map_to_entry(map)) {
15540 vm_map_unlock_read(map);
15541 return KERN_INVALID_ADDRESS;
15542 }
15543 } else {
15544 entry = tmp_entry;
15545 }
15546 start = entry->vme_start;
15547
15548 top->private_pages_resident = 0;
15549 top->shared_pages_resident = 0;
15550
15551 vm_map_region_top_walk(entry, top);
15552
15553 if (object_name) {
15554 *object_name = IP_NULL;
15555 }
15556 *address = start;
15557 *size = (entry->vme_end - start);
15558
15559 vm_map_unlock_read(map);
15560 return KERN_SUCCESS;
15561 }
15562 default:
15563 return KERN_INVALID_ARGUMENT;
15564 }
15565}
15566
15567#define OBJ_RESIDENT_COUNT(obj, entry_size) \
15568 MIN((entry_size), \
15569 ((obj)->all_reusable ? \
15570 (obj)->wired_page_count : \
15571 (obj)->resident_page_count - (obj)->reusable_page_count))
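
/*
 * Worked example (values are hypothetical): for an object with 64 resident
 * pages, 10 of them reusable and "all_reusable" clear, a 16-page entry
 * yields OBJ_RESIDENT_COUNT = MIN(16, 64 - 10) = 16, while a 128-page entry
 * over the same object yields MIN(128, 54) = 54. If "all_reusable" were
 * set, the wired page count would be used instead of (resident - reusable).
 */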
15572
15573void
15574vm_map_region_top_walk(
15575 vm_map_entry_t entry,
15576 vm_region_top_info_t top)
15577{
15578 if (entry->is_sub_map || VME_OBJECT(entry) == 0) {
15579 top->share_mode = SM_EMPTY;
15580 top->ref_count = 0;
15581 top->obj_id = 0;
15582 return;
15583 }
15584
15585 {
15586 struct vm_object *obj, *tmp_obj;
15587 int ref_count;
15588 uint32_t entry_size;
15589
15590 entry_size = (uint32_t) ((entry->vme_end - entry->vme_start) / PAGE_SIZE_64);
15591
15592 obj = VME_OBJECT(entry);
15593
15594 vm_object_lock(obj);
15595
15596 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15597 ref_count--;
15598 }
15599
15600 assert(obj->reusable_page_count <= obj->resident_page_count);
15601 if (obj->shadow) {
15602 if (ref_count == 1) {
15603 top->private_pages_resident =
15604 OBJ_RESIDENT_COUNT(obj, entry_size);
15605 } else {
15606 top->shared_pages_resident =
15607 OBJ_RESIDENT_COUNT(obj, entry_size);
15608 }
15609 top->ref_count = ref_count;
15610 top->share_mode = SM_COW;
15611
15612 while ((tmp_obj = obj->shadow)) {
15613 vm_object_lock(tmp_obj);
15614 vm_object_unlock(obj);
15615 obj = tmp_obj;
15616
15617 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15618 ref_count--;
15619 }
15620
15621 assert(obj->reusable_page_count <= obj->resident_page_count);
15622 top->shared_pages_resident +=
15623 OBJ_RESIDENT_COUNT(obj, entry_size);
15624 top->ref_count += ref_count - 1;
15625 }
15626 } else {
15627 if (entry->superpage_size) {
15628 top->share_mode = SM_LARGE_PAGE;
15629 top->shared_pages_resident = 0;
15630 top->private_pages_resident = entry_size;
15631 } else if (entry->needs_copy) {
15632 top->share_mode = SM_COW;
15633 top->shared_pages_resident =
15634 OBJ_RESIDENT_COUNT(obj, entry_size);
15635 } else {
15636 if (ref_count == 1 ||
15637 (ref_count == 2 && obj->named)) {
15638 top->share_mode = SM_PRIVATE;
15639 top->private_pages_resident =
15640 OBJ_RESIDENT_COUNT(obj,
15641 entry_size);
15642 } else {
15643 top->share_mode = SM_SHARED;
15644 top->shared_pages_resident =
15645 OBJ_RESIDENT_COUNT(obj,
15646 entry_size);
15647 }
15648 }
15649 top->ref_count = ref_count;
15650 }
15651
15652 vm_object_unlock(obj);
15653
15654 /* XXX K64: obj_id will be truncated */
15655 top->obj_id = (unsigned int) (uintptr_t)VM_KERNEL_ADDRHASH(obj);
15656 }
15657}
15658
15659void
15660vm_map_region_walk(
15661 vm_map_t map,
15662 vm_map_offset_t va,
15663 vm_map_entry_t entry,
15664 vm_object_offset_t offset,
15665 vm_object_size_t range,
15666 vm_region_extended_info_t extended,
15667 boolean_t look_for_pages,
15668 mach_msg_type_number_t count)
15669{
15670 struct vm_object *obj, *tmp_obj;
15671 vm_map_offset_t last_offset;
15672 int i;
15673 int ref_count;
15674 struct vm_object *shadow_object;
15675 unsigned short shadow_depth;
15676 boolean_t do_region_footprint;
15677 int effective_page_size, effective_page_shift;
15678 vm_map_offset_t effective_page_mask;
15679
15680 do_region_footprint = task_self_region_footprint();
15681
15682 if ((entry->is_sub_map) ||
15683 (VME_OBJECT(entry) == 0) ||
15684 (VME_OBJECT(entry)->phys_contiguous &&
15685 !entry->superpage_size)) {
15686 extended->share_mode = SM_EMPTY;
15687 extended->ref_count = 0;
15688 return;
15689 }
15690
15691 if (entry->superpage_size) {
15692 extended->shadow_depth = 0;
15693 extended->share_mode = SM_LARGE_PAGE;
15694 extended->ref_count = 1;
15695 extended->external_pager = 0;
15696
15697 /* TODO4K: Superpage in 4k mode? */
15698 extended->pages_resident = (unsigned int)(range >> PAGE_SHIFT);
15699 extended->shadow_depth = 0;
15700 return;
15701 }
15702
15703 effective_page_shift = vm_self_region_page_shift(map);
15704 effective_page_size = (1 << effective_page_shift);
15705 effective_page_mask = effective_page_size - 1;
15706
15707 offset = vm_map_trunc_page(offset, effective_page_mask);
15708
15709 obj = VME_OBJECT(entry);
15710
15711 vm_object_lock(obj);
15712
15713 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15714 ref_count--;
15715 }
15716
15717 if (look_for_pages) {
15718 for (last_offset = offset + range;
15719 offset < last_offset;
15720 offset += effective_page_size, va += effective_page_size) {
15721 if (do_region_footprint) {
15722 int disp;
15723
15724 disp = 0;
15725 if (map->has_corpse_footprint) {
15726 /*
15727 * Query the page info data we saved
15728 * while forking the corpse.
15729 */
15730 vm_map_corpse_footprint_query_page_info(
15731 map,
15732 va,
15733 &disp);
15734 } else {
15735 /*
15736 * Query the pmap.
15737 */
15738 vm_map_footprint_query_page_info(
15739 map,
15740 entry,
15741 va,
15742 &disp);
15743 }
15744 if (disp & VM_PAGE_QUERY_PAGE_PRESENT) {
15745 extended->pages_resident++;
15746 }
15747 if (disp & VM_PAGE_QUERY_PAGE_REUSABLE) {
15748 extended->pages_reusable++;
15749 }
15750 if (disp & VM_PAGE_QUERY_PAGE_DIRTY) {
15751 extended->pages_dirtied++;
15752 }
15753 if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
15754 extended->pages_swapped_out++;
15755 }
15756 continue;
15757 }
15758
15759 vm_map_region_look_for_page(map, va, obj,
15760 vm_object_trunc_page(offset), ref_count,
15761 0, extended, count);
15762 }
15763
15764 if (do_region_footprint) {
15765 goto collect_object_info;
15766 }
15767 } else {
15768collect_object_info:
15769 shadow_object = obj->shadow;
15770 shadow_depth = 0;
15771
15772 if (!(obj->internal)) {
15773 extended->external_pager = 1;
15774 }
15775
15776 if (shadow_object != VM_OBJECT_NULL) {
15777 vm_object_lock(shadow_object);
15778 for (;
15779 shadow_object != VM_OBJECT_NULL;
15780 shadow_depth++) {
15781 vm_object_t next_shadow;
15782
15783 if (!(shadow_object->internal)) {
15784 extended->external_pager = 1;
15785 }
15786
15787 next_shadow = shadow_object->shadow;
15788 if (next_shadow) {
15789 vm_object_lock(next_shadow);
15790 }
15791 vm_object_unlock(shadow_object);
15792 shadow_object = next_shadow;
15793 }
15794 }
15795 extended->shadow_depth = shadow_depth;
15796 }
15797
15798 if (extended->shadow_depth || entry->needs_copy) {
15799 extended->share_mode = SM_COW;
15800 } else {
15801 if (ref_count == 1) {
15802 extended->share_mode = SM_PRIVATE;
15803 } else {
15804 if (obj->true_share) {
15805 extended->share_mode = SM_TRUESHARED;
15806 } else {
15807 extended->share_mode = SM_SHARED;
15808 }
15809 }
15810 }
15811 extended->ref_count = ref_count - extended->shadow_depth;
15812
15813 for (i = 0; i < extended->shadow_depth; i++) {
15814 if ((tmp_obj = obj->shadow) == 0) {
15815 break;
15816 }
15817 vm_object_lock(tmp_obj);
15818 vm_object_unlock(obj);
15819
15820 if ((ref_count = tmp_obj->ref_count) > 1 && tmp_obj->paging_in_progress) {
15821 ref_count--;
15822 }
15823
15824 extended->ref_count += ref_count;
15825 obj = tmp_obj;
15826 }
15827 vm_object_unlock(obj);
15828
15829 if (extended->share_mode == SM_SHARED) {
15830 vm_map_entry_t cur;
15831 vm_map_entry_t last;
15832 int my_refs;
15833
15834 obj = VME_OBJECT(entry);
15835 last = vm_map_to_entry(map);
15836 my_refs = 0;
15837
15838 if ((ref_count = obj->ref_count) > 1 && obj->paging_in_progress) {
15839 ref_count--;
15840 }
15841 for (cur = vm_map_first_entry(map); cur != last; cur = cur->vme_next) {
15842 my_refs += vm_map_region_count_obj_refs(cur, obj);
15843 }
15844
15845 if (my_refs == ref_count) {
15846 extended->share_mode = SM_PRIVATE_ALIASED;
15847 } else if (my_refs > 1) {
15848 extended->share_mode = SM_SHARED_ALIASED;
15849 }
15850 }
15851}
15852
15853
15854/* object is locked on entry and locked on return */
15855
15856
15857static void
15858vm_map_region_look_for_page(
15859 __unused vm_map_t map,
15860 __unused vm_map_offset_t va,
15861 vm_object_t object,
15862 vm_object_offset_t offset,
15863 int max_refcnt,
15864 unsigned short depth,
15865 vm_region_extended_info_t extended,
15866 mach_msg_type_number_t count)
15867{
15868 vm_page_t p;
15869 vm_object_t shadow;
15870 int ref_count;
15871 vm_object_t caller_object;
15872
15873 shadow = object->shadow;
15874 caller_object = object;
15875
15876
15877 while (TRUE) {
15878 if (!(object->internal)) {
15879 extended->external_pager = 1;
15880 }
15881
15882 if ((p = vm_page_lookup(object, offset)) != VM_PAGE_NULL) {
15883 if (shadow && (max_refcnt == 1)) {
15884 extended->pages_shared_now_private++;
15885 }
15886
15887 if (!p->vmp_fictitious &&
15888 (p->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(p)))) {
15889 extended->pages_dirtied++;
15890 } else if (count >= VM_REGION_EXTENDED_INFO_COUNT) {
15891 if (p->vmp_reusable || object->all_reusable) {
15892 extended->pages_reusable++;
15893 }
15894 }
15895
15896 extended->pages_resident++;
15897
15898 if (object != caller_object) {
15899 vm_object_unlock(object);
15900 }
15901
15902 return;
15903 }
15904 if (object->internal &&
15905 object->alive &&
15906 !object->terminating &&
15907 object->pager_ready) {
15908 if (VM_COMPRESSOR_PAGER_STATE_GET(object, offset)
15909 == VM_EXTERNAL_STATE_EXISTS) {
15910 /* the pager has that page */
15911 extended->pages_swapped_out++;
15912 if (object != caller_object) {
15913 vm_object_unlock(object);
15914 }
15915 return;
15916 }
15917 }
15918
15919 if (shadow) {
15920 vm_object_lock(shadow);
15921
15922 if ((ref_count = shadow->ref_count) > 1 && shadow->paging_in_progress) {
15923 ref_count--;
15924 }
15925
15926 if (++depth > extended->shadow_depth) {
15927 extended->shadow_depth = depth;
15928 }
15929
15930 if (ref_count > max_refcnt) {
15931 max_refcnt = ref_count;
15932 }
15933
15934 if (object != caller_object) {
15935 vm_object_unlock(object);
15936 }
15937
15938 offset = offset + object->vo_shadow_offset;
15939 object = shadow;
15940 shadow = object->shadow;
15941 continue;
15942 }
15943 if (object != caller_object) {
15944 vm_object_unlock(object);
15945 }
15946 break;
15947 }
15948}
15949
15950static int
15951vm_map_region_count_obj_refs(
15952 vm_map_entry_t entry,
15953 vm_object_t object)
15954{
15955 int ref_count;
15956 vm_object_t chk_obj;
15957 vm_object_t tmp_obj;
15958
15959 if (entry->is_sub_map || VME_OBJECT(entry) == VM_OBJECT_NULL) {
15960 return 0;
15961 }
15962
15963 ref_count = 0;
15964 chk_obj = VME_OBJECT(entry);
15965 vm_object_lock(chk_obj);
15966
15967 while (chk_obj) {
15968 if (chk_obj == object) {
15969 ref_count++;
15970 }
15971 tmp_obj = chk_obj->shadow;
15972 if (tmp_obj) {
15973 vm_object_lock(tmp_obj);
15974 }
15975 vm_object_unlock(chk_obj);
15976
15977 chk_obj = tmp_obj;
15978 }
15979
15980 return ref_count;
15981}
15982
15983
15984/*
15985 * Routine: vm_map_simplify
15986 *
15987 * Description:
15988 * Attempt to simplify the map representation in
15989 * the vicinity of the given starting address.
15990 * Note:
15991 * This routine is intended primarily to keep the
15992 * kernel maps more compact -- they generally don't
15993 * benefit from the "expand a map entry" technology
15994 * at allocation time because the adjacent entry
15995 * is often wired down.
15996 */
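
/*
 * Minimal sketch of the core coalescing test applied by
 * vm_map_simplify_entry() below: two entries can merge only if they are
 * virtually adjacent, reference the same backing object, and their object
 * offsets are contiguous. The "toy_entry" type is a simplified stand-in,
 * not the real vm_map_entry layout; guarded by "#if 0".
 */
#if 0 /* example only, not compiled */
#include <stdbool.h>
#include <stdint.h>

struct toy_entry {
	uint64_t start, end;    /* virtual range [start, end) */
	void    *object;        /* backing VM object */
	uint64_t offset;        /* offset of "start" within that object */
};

static bool
toy_can_coalesce(const struct toy_entry *prev, const struct toy_entry *cur)
{
	/* The real test below also compares protections, inheritance,
	 * wiring, aliases and many other attributes. */
	return prev->end == cur->start &&
	       prev->object == cur->object &&
	       prev->offset + (prev->end - prev->start) == cur->offset;
}
#endif
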
15997void
15998vm_map_simplify_entry(
15999 vm_map_t map,
16000 vm_map_entry_t this_entry)
16001{
16002 vm_map_entry_t prev_entry;
16003
16004 prev_entry = this_entry->vme_prev;
16005
16006 if ((this_entry != vm_map_to_entry(map)) &&
16007 (prev_entry != vm_map_to_entry(map)) &&
16008
16009 (prev_entry->vme_end == this_entry->vme_start) &&
16010
16011 (prev_entry->is_sub_map == this_entry->is_sub_map) &&
16012 (prev_entry->vme_object_value == this_entry->vme_object_value) &&
16013 (prev_entry->vme_kernel_object == this_entry->vme_kernel_object) &&
16014 ((VME_OFFSET(prev_entry) + (prev_entry->vme_end -
16015 prev_entry->vme_start))
16016 == VME_OFFSET(this_entry)) &&
16017
16018 (prev_entry->behavior == this_entry->behavior) &&
16019 (prev_entry->needs_copy == this_entry->needs_copy) &&
16020 (prev_entry->protection == this_entry->protection) &&
16021 (prev_entry->max_protection == this_entry->max_protection) &&
16022 (prev_entry->inheritance == this_entry->inheritance) &&
16023 (prev_entry->use_pmap == this_entry->use_pmap) &&
16024 (VME_ALIAS(prev_entry) == VME_ALIAS(this_entry)) &&
16025 (prev_entry->no_cache == this_entry->no_cache) &&
16026 (prev_entry->vme_permanent == this_entry->vme_permanent) &&
16027 (prev_entry->map_aligned == this_entry->map_aligned) &&
16028 (prev_entry->zero_wired_pages == this_entry->zero_wired_pages) &&
16029 (prev_entry->used_for_jit == this_entry->used_for_jit) &&
16030#if __arm64e__
16031 (prev_entry->used_for_tpro == this_entry->used_for_tpro) &&
16032#endif
16033 (prev_entry->csm_associated == this_entry->csm_associated) &&
16034 (prev_entry->vme_xnu_user_debug == this_entry->vme_xnu_user_debug) &&
16035 (prev_entry->iokit_acct == this_entry->iokit_acct) &&
16036 (prev_entry->vme_resilient_codesign ==
16037 this_entry->vme_resilient_codesign) &&
16038 (prev_entry->vme_resilient_media ==
16039 this_entry->vme_resilient_media) &&
16040 (prev_entry->vme_no_copy_on_read == this_entry->vme_no_copy_on_read) &&
16041 (prev_entry->translated_allow_execute == this_entry->translated_allow_execute) &&
16042
16043 (prev_entry->wired_count == this_entry->wired_count) &&
16044 (prev_entry->user_wired_count == this_entry->user_wired_count) &&
16045
16046 ((prev_entry->vme_atomic == FALSE) && (this_entry->vme_atomic == FALSE)) &&
16047 (prev_entry->in_transition == FALSE) &&
16048 (this_entry->in_transition == FALSE) &&
16049 (prev_entry->needs_wakeup == FALSE) &&
16050 (this_entry->needs_wakeup == FALSE) &&
16051 (prev_entry->is_shared == this_entry->is_shared) &&
16052 (prev_entry->superpage_size == FALSE) &&
16053 (this_entry->superpage_size == FALSE)
16054 ) {
16055 if (prev_entry->vme_permanent) {
16056 assert(this_entry->vme_permanent);
16057 prev_entry->vme_permanent = false;
16058 }
16059 vm_map_store_entry_unlink(map, prev_entry, true);
16060 assert(prev_entry->vme_start < this_entry->vme_end);
16061 if (prev_entry->map_aligned) {
16062 assert(VM_MAP_PAGE_ALIGNED(prev_entry->vme_start,
16063 VM_MAP_PAGE_MASK(map)));
16064 }
16065 this_entry->vme_start = prev_entry->vme_start;
16066 VME_OFFSET_SET(this_entry, VME_OFFSET(prev_entry));
16067
16068 if (map->holelistenabled) {
16069 vm_map_store_update_first_free(map, this_entry, TRUE);
16070 }
16071
16072 if (prev_entry->is_sub_map) {
16073 vm_map_deallocate(VME_SUBMAP(prev_entry));
16074 } else {
16075 vm_object_deallocate(VME_OBJECT(prev_entry));
16076 }
16077 vm_map_entry_dispose(prev_entry);
16078 SAVE_HINT_MAP_WRITE(map, this_entry);
16079 }
16080}
16081
16082void
16083vm_map_simplify(
16084 vm_map_t map,
16085 vm_map_offset_t start)
16086{
16087 vm_map_entry_t this_entry;
16088
16089 vm_map_lock(map);
16090 if (vm_map_lookup_entry(map, start, &this_entry)) {
16091 vm_map_simplify_entry(map, this_entry);
16092 vm_map_simplify_entry(map, this_entry->vme_next);
16093 }
16094 vm_map_unlock(map);
16095}
16096
16097static void
16098vm_map_simplify_range(
16099 vm_map_t map,
16100 vm_map_offset_t start,
16101 vm_map_offset_t end)
16102{
16103 vm_map_entry_t entry;
16104
16105 /*
16106 * The map should be locked (for "write") by the caller.
16107 */
16108
16109 if (start >= end) {
16110 /* invalid address range */
16111 return;
16112 }
16113
16114 start = vm_map_trunc_page(start,
16115 VM_MAP_PAGE_MASK(map));
16116 end = vm_map_round_page(end,
16117 VM_MAP_PAGE_MASK(map));
16118
16119 if (!vm_map_lookup_entry(map, start, &entry)) {
16120 /* "start" is not mapped and "entry" ends before "start" */
16121 if (entry == vm_map_to_entry(map)) {
16122 /* start with first entry in the map */
16123 entry = vm_map_first_entry(map);
16124 } else {
16125 /* start with next entry */
16126 entry = entry->vme_next;
16127 }
16128 }
16129
16130 while (entry != vm_map_to_entry(map) &&
16131 entry->vme_start <= end) {
16132 /* try and coalesce "entry" with its previous entry */
16133 vm_map_simplify_entry(map, entry);
16134 entry = entry->vme_next;
16135 }
16136}
16137
16138
16139/*
16140 * Routine: vm_map_machine_attribute
16141 * Purpose:
16142 * Provide machine-specific attributes to mappings,
16143 * such as cacheability etc. for machines that provide
16144 * them. NUMA architectures and machines with big/strange
16145 * caches will use this.
16146 * Note:
16147 * Responsibilities for locking and checking are handled here,
16148 * everything else in the pmap module. If any non-volatile
16149 * information must be kept, the pmap module should handle
16150 * it itself. [This assumes that attributes do not
16151 * need to be inherited, which seems ok to me]
16152 */
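
/*
 * Hedged user-space sketch: this routine is reached through the Mach
 * vm_machine_attribute() interface. The prototype and MATTR_* values below
 * are assumed from <mach/vm_map.h> and <mach/vm_attributes.h>; treat this
 * as a sketch under those assumptions, not a guaranteed-stable API.
 */
#if 0 /* example only, not compiled */
#include <mach/mach.h>

/* Ask the kernel to flush CPU caches over a buffer we just patched. */
static kern_return_t
flush_caches(vm_address_t addr, vm_size_t len)
{
	vm_machine_attribute_val_t value = MATTR_VAL_CACHE_FLUSH;

	return vm_machine_attribute(mach_task_self(), addr, len,
	           MATTR_CACHE, &value);
}
#endif
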
16153kern_return_t
16154vm_map_machine_attribute(
16155 vm_map_t map,
16156 vm_map_offset_t start,
16157 vm_map_offset_t end,
16158 vm_machine_attribute_t attribute,
16159 vm_machine_attribute_val_t* value) /* IN/OUT */
16160{
16161 kern_return_t ret;
16162 vm_map_size_t sync_size;
16163 vm_map_entry_t entry;
16164
16165 if (start < vm_map_min(map) || end > vm_map_max(map)) {
16166 return KERN_INVALID_ADDRESS;
16167 }
16168 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16169 return KERN_INVALID_ADDRESS;
16170 }
16171
16172 /* Figure how much memory we need to flush (in page increments) */
16173 sync_size = end - start;
16174
16175 vm_map_lock(map);
16176
16177 if (attribute != MATTR_CACHE) {
16178 /* If we don't have to find physical addresses, we */
16179 /* don't have to do an explicit traversal here. */
16180 ret = pmap_attribute(map->pmap, start, end - start,
16181 attribute, value);
16182 vm_map_unlock(map);
16183 return ret;
16184 }
16185
16186 ret = KERN_SUCCESS; /* Assume it all worked */
16187
16188 while (sync_size) {
16189 if (vm_map_lookup_entry(map, start, &entry)) {
16190 vm_map_size_t sub_size;
16191 if ((entry->vme_end - start) > sync_size) {
16192 sub_size = sync_size;
16193 sync_size = 0;
16194 } else {
16195 sub_size = entry->vme_end - start;
16196 sync_size -= sub_size;
16197 }
16198 if (entry->is_sub_map) {
16199 vm_map_offset_t sub_start;
16200 vm_map_offset_t sub_end;
16201
16202 sub_start = (start - entry->vme_start)
16203 + VME_OFFSET(entry);
16204 sub_end = sub_start + sub_size;
16205 vm_map_machine_attribute(
16206 VME_SUBMAP(entry),
16207 sub_start,
16208 sub_end,
16209 attribute, value);
16210 } else if (VME_OBJECT(entry)) {
16211 vm_page_t m;
16212 vm_object_t object;
16213 vm_object_t base_object;
16214 vm_object_t last_object;
16215 vm_object_offset_t offset;
16216 vm_object_offset_t base_offset;
16217 vm_map_size_t range;
16218 range = sub_size;
16219 offset = (start - entry->vme_start)
16220 + VME_OFFSET(entry);
16221 offset = vm_object_trunc_page(offset);
16222 base_offset = offset;
16223 object = VME_OBJECT(entry);
16224 base_object = object;
16225 last_object = NULL;
16226
16227 vm_object_lock(object);
16228
16229 while (range) {
16230 m = vm_page_lookup(
16231 object, offset);
16232
16233 if (m && !m->vmp_fictitious) {
16234 ret =
16235 pmap_attribute_cache_sync(
16236 VM_PAGE_GET_PHYS_PAGE(m),
16237 PAGE_SIZE,
16238 attribute, value);
16239 } else if (object->shadow) {
16240 offset = offset + object->vo_shadow_offset;
16241 last_object = object;
16242 object = object->shadow;
16243 vm_object_lock(last_object->shadow);
16244 vm_object_unlock(last_object);
16245 continue;
16246 }
16247 if (range < PAGE_SIZE) {
16248 range = 0;
16249 } else {
16250 range -= PAGE_SIZE;
16251 }
16252
16253 if (base_object != object) {
16254 vm_object_unlock(object);
16255 vm_object_lock(base_object);
16256 object = base_object;
16257 }
16258 /* Bump to the next page */
16259 base_offset += PAGE_SIZE;
16260 offset = base_offset;
16261 }
16262 vm_object_unlock(object);
16263 }
16264 start += sub_size;
16265 } else {
16266 vm_map_unlock(map);
16267 return KERN_FAILURE;
16268 }
16269 }
16270
16271 vm_map_unlock(map);
16272
16273 return ret;
16274}
16275
16276/*
16277 * vm_map_behavior_set:
16278 *
16279 * Sets the paging reference behavior of the specified address
16280 * range in the target map. Paging reference behavior affects
16281 * how pagein operations resulting from faults on the map will be
16282 * clustered.
16283 */
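
/*
 * Illustrative user-space sketch: most callers reach this routine through
 * madvise(2). MADV_SEQUENTIAL / MADV_RANDOM set the persistent
 * VM_BEHAVIOR_* state recorded below, while advice such as MADV_DONTNEED
 * takes one of the immediate-action paths. Assumes a user-space build;
 * guarded by "#if 0".
 */
#if 0 /* example only, not compiled */
#include <sys/mman.h>

static void
advise_forward_scan(void *buf, size_t len)
{
	/* Persistent hint: tune fault clustering for a forward scan. */
	(void)madvise(buf, len, MADV_SEQUENTIAL);
}

static void
advise_done_with(void *buf, size_t len)
{
	/* Immediate action: the pages may be reclaimed without writeback. */
	(void)madvise(buf, len, MADV_DONTNEED);
}
#endif
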
16284kern_return_t
16285vm_map_behavior_set(
16286 vm_map_t map,
16287 vm_map_offset_t start,
16288 vm_map_offset_t end,
16289 vm_behavior_t new_behavior)
16290{
16291 vm_map_entry_t entry;
16292 vm_map_entry_t temp_entry;
16293
16294 if (start > end ||
16295 start < vm_map_min(map) ||
16296 end > vm_map_max(map)) {
16297 return KERN_NO_SPACE;
16298 }
16299 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
16300 return KERN_INVALID_ADDRESS;
16301 }
16302
16303 switch (new_behavior) {
16304 /*
16305 * This first block of behaviors all set a persistent state on the specified
16306 * memory range. All we have to do here is to record the desired behavior
16307 * in the vm_map_entry_t's.
16308 */
16309
16310 case VM_BEHAVIOR_DEFAULT:
16311 case VM_BEHAVIOR_RANDOM:
16312 case VM_BEHAVIOR_SEQUENTIAL:
16313 case VM_BEHAVIOR_RSEQNTL:
16314 case VM_BEHAVIOR_ZERO_WIRED_PAGES:
16315 vm_map_lock(map);
16316
16317 /*
16318 * The entire address range must be valid for the map.
16319 * Note that vm_map_range_check() does a
16320 * vm_map_lookup_entry() internally and returns the
16321 * entry containing the start of the address range if
16322 * the entire range is valid.
16323 */
16324 if (vm_map_range_check(map, start, end, &temp_entry)) {
16325 entry = temp_entry;
16326 vm_map_clip_start(map, entry, start);
16327 } else {
16328 vm_map_unlock(map);
16329 return KERN_INVALID_ADDRESS;
16330 }
16331
16332 while ((entry != vm_map_to_entry(map)) && (entry->vme_start < end)) {
16333 vm_map_clip_end(map, entry, end);
16334 if (entry->is_sub_map) {
16335 assert(!entry->use_pmap);
16336 }
16337
16338 if (new_behavior == VM_BEHAVIOR_ZERO_WIRED_PAGES) {
16339 entry->zero_wired_pages = TRUE;
16340 } else {
16341 entry->behavior = new_behavior;
16342 }
16343 entry = entry->vme_next;
16344 }
16345
16346 vm_map_unlock(map);
16347 break;
16348
16349 /*
16350 * The rest of these are different from the above in that they cause
16351 * an immediate action to take place as opposed to setting a behavior that
16352 * affects future actions.
16353 */
16354
16355 case VM_BEHAVIOR_WILLNEED:
16356 return vm_map_willneed(map, start, end);
16357
16358 case VM_BEHAVIOR_DONTNEED:
16359 return vm_map_msync(map, start, end - start, VM_SYNC_DEACTIVATE | VM_SYNC_CONTIGUOUS);
16360
16361 case VM_BEHAVIOR_FREE:
16362 return vm_map_msync(map, start, end - start, VM_SYNC_KILLPAGES | VM_SYNC_CONTIGUOUS);
16363
16364 case VM_BEHAVIOR_REUSABLE:
16365 return vm_map_reusable_pages(map, start, end);
16366
16367 case VM_BEHAVIOR_REUSE:
16368 return vm_map_reuse_pages(map, start, end);
16369
16370 case VM_BEHAVIOR_CAN_REUSE:
16371 return vm_map_can_reuse(map, start, end);
16372
16373#if MACH_ASSERT
16374 case VM_BEHAVIOR_PAGEOUT:
16375 return vm_map_pageout(map, start, end);
16376#endif /* MACH_ASSERT */
16377
16378 case VM_BEHAVIOR_ZERO:
16379 return vm_map_zero(map, start, end);
16380
16381 default:
16382 return KERN_INVALID_ARGUMENT;
16383 }
16384
16385 return KERN_SUCCESS;
16386}
16387
16388
16389/*
16390 * Internals for madvise(MADV_WILLNEED) system call.
16391 *
16392 * The implementation is to do:-
16393 * a) read-ahead if the mapping corresponds to a mapped regular file
16394 * b) or, fault in the pages (zero-fill, decompress etc) if it's an anonymous mapping
16395 */
16396
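/*
 * Illustrative user-space sketch: MADV_WILLNEED on a file mapping exercises
 * path (a) above (asynchronous read-ahead), while the same advice on
 * anonymous memory takes the pre-fault path (b). Assumes a user-space
 * build; guarded by "#if 0".
 */
#if 0 /* example only, not compiled */
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <stddef.h>

static void *
map_and_prefetch(const char *path, size_t *len_out)
{
	struct stat st;
	void *p;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		return NULL;
	}
	if (fstat(fd, &st) != 0) {
		close(fd);
		return NULL;
	}
	p = mmap(NULL, (size_t)st.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
	close(fd);                      /* the mapping keeps the file referenced */
	if (p == MAP_FAILED) {
		return NULL;
	}
	/* Hint: kick off read-ahead for the whole mapping. */
	(void)madvise(p, (size_t)st.st_size, MADV_WILLNEED);
	*len_out = (size_t)st.st_size;
	return p;
}
#endif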
16397
16398static kern_return_t
16399vm_map_willneed(
16400 vm_map_t map,
16401 vm_map_offset_t start,
16402 vm_map_offset_t end
16403 )
16404{
16405 vm_map_entry_t entry;
16406 vm_object_t object;
16407 memory_object_t pager;
16408 struct vm_object_fault_info fault_info = {};
16409 kern_return_t kr;
16410 vm_object_size_t len;
16411 vm_object_offset_t offset;
16412
16413 fault_info.interruptible = THREAD_UNINT; /* ignored value */
16414 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
16415 fault_info.stealth = TRUE;
16416
16417 /*
16418 * The MADV_WILLNEED operation doesn't require any changes to the
16419 * vm_map_entry_t's, so the read lock is sufficient.
16420 */
16421
16422 vm_map_lock_read(map);
16423
16424 /*
16425 * The madvise semantics require that the address range be fully
16426 * allocated with no holes. Otherwise, we're required to return
16427 * an error.
16428 */
16429
16430 if (!vm_map_range_check(map, start, end, &entry)) {
16431 vm_map_unlock_read(map);
16432 return KERN_INVALID_ADDRESS;
16433 }
16434
16435 /*
16436 * Examine each vm_map_entry_t in the range.
16437 */
16438 for (; entry != vm_map_to_entry(map) && start < end;) {
16439 /*
16440 * The first time through, the start address could be anywhere
16441 * within the vm_map_entry we found. So adjust the offset to
16442 * correspond. After that, the offset will always be zero to
16443 * correspond to the beginning of the current vm_map_entry.
16444 */
16445 offset = (start - entry->vme_start) + VME_OFFSET(entry);
16446
16447 /*
16448 * Set the length so we don't go beyond the end of the
16449 * map_entry or beyond the end of the range we were given.
16450 * This range could also span multiple map entries, all of which
16451 * map different files, so make sure we only do the right amount
16452 * of I/O for each object. Note that it's possible for there
16453 * to be multiple map entries all referring to the same object
16454 * but with different page permissions, but it's not worth
16455 * trying to optimize that case.
16456 */
16457 len = MIN(entry->vme_end - start, end - start);
16458
16459 if ((vm_size_t) len != len) {
16460 /* 32-bit overflow */
16461 len = (vm_size_t) (0 - PAGE_SIZE);
16462 }
16463 fault_info.cluster_size = (vm_size_t) len;
16464 fault_info.lo_offset = offset;
16465 fault_info.hi_offset = offset + len;
16466 fault_info.user_tag = VME_ALIAS(entry);
16467 fault_info.pmap_options = 0;
16468 if (entry->iokit_acct ||
16469 (!entry->is_sub_map && !entry->use_pmap)) {
16470 fault_info.pmap_options |= PMAP_OPTIONS_ALT_ACCT;
16471 }
16472 fault_info.fi_xnu_user_debug = entry->vme_xnu_user_debug;
16473
16474 /*
16475 * If the entry is a submap OR there's no read permission
16476 * to this mapping, then just skip it.
16477 */
16478 if ((entry->is_sub_map) || (entry->protection & VM_PROT_READ) == 0) {
16479 entry = entry->vme_next;
16480 start = entry->vme_start;
16481 continue;
16482 }
16483
16484 object = VME_OBJECT(entry);
16485
16486 if (object == NULL ||
16487 (object && object->internal)) {
16488 /*
16489 * Memory range backed by anonymous memory.
16490 */
16491 vm_size_t region_size = 0, effective_page_size = 0;
16492 vm_map_offset_t addr = 0, effective_page_mask = 0;
16493
16494 region_size = len;
16495 addr = start;
16496
16497 effective_page_mask = MIN(vm_map_page_mask(current_map()), PAGE_MASK);
16498 effective_page_size = effective_page_mask + 1;
16499
16500 vm_map_unlock_read(map);
16501
16502 while (region_size) {
16503 vm_pre_fault(
16504 vm_map_trunc_page(addr, effective_page_mask),
16505 VM_PROT_READ | VM_PROT_WRITE);
16506
16507 region_size -= effective_page_size;
16508 addr += effective_page_size;
16509 }
16510 } else {
16511 /*
16512 * Find the file object backing this map entry. If there is
16513 * none, then we simply ignore the "will need" advice for this
16514 * entry and go on to the next one.
16515 */
16516 if ((object = find_vnode_object(entry)) == VM_OBJECT_NULL) {
16517 entry = entry->vme_next;
16518 start = entry->vme_start;
16519 continue;
16520 }
16521
16522 vm_object_paging_begin(object);
16523 pager = object->pager;
16524 vm_object_unlock(object);
16525
16526 /*
16527 * The data_request() could take a long time, so let's
16528 * release the map lock to avoid blocking other threads.
16529 */
16530 vm_map_unlock_read(map);
16531
16532 /*
16533 * Get the data from the object asynchronously.
16534 *
16535 * Note that memory_object_data_request() places limits on the
16536 * amount of I/O it will do. Regardless of the len we
16537 * specified, it won't do more than MAX_UPL_TRANSFER_BYTES and it
16538 * silently truncates the len to that size. This isn't
16539 * necessarily bad since madvise shouldn't really be used to
16540 * page in unlimited amounts of data. Other Unix variants
16541 * limit the willneed case as well. If this turns out to be an
16542 * issue for developers, then we can always adjust the policy
16543 * here and still be backwards compatible since this is all
16544 * just "advice".
16545 */
16546 kr = memory_object_data_request(
16547 pager,
16548 vm_object_trunc_page(offset) + object->paging_offset,
16549 0, /* ignored */
16550 VM_PROT_READ,
16551 (memory_object_fault_info_t)&fault_info);
16552
16553 vm_object_lock(object);
16554 vm_object_paging_end(object);
16555 vm_object_unlock(object);
16556
16557 /*
16558 * If we couldn't do the I/O for some reason, just give up on
16559 * the madvise. We still return success to the user since
16560 * madvise isn't supposed to fail when the advice can't be
16561 * taken.
16562 */
16563
16564 if (kr != KERN_SUCCESS) {
16565 return KERN_SUCCESS;
16566 }
16567 }
16568
16569 start += len;
16570 if (start >= end) {
16571 /* done */
16572 return KERN_SUCCESS;
16573 }
16574
16575 /* look up next entry */
16576 vm_map_lock_read(map);
16577 if (!vm_map_lookup_entry(map, start, &entry)) {
16578 /*
16579 * There's a new hole in the address range.
16580 */
16581 vm_map_unlock_read(map);
16582 return KERN_INVALID_ADDRESS;
16583 }
16584 }
16585
16586 vm_map_unlock_read(map);
16587 return KERN_SUCCESS;
16588}
16589
16590static boolean_t
16591vm_map_entry_is_reusable(
16592 vm_map_entry_t entry)
16593{
16594 /* Only user map entries */
16595
16596 vm_object_t object;
16597
16598 if (entry->is_sub_map) {
16599 return FALSE;
16600 }
16601
16602 switch (VME_ALIAS(entry)) {
16603 case VM_MEMORY_MALLOC:
16604 case VM_MEMORY_MALLOC_SMALL:
16605 case VM_MEMORY_MALLOC_LARGE:
16606 case VM_MEMORY_REALLOC:
16607 case VM_MEMORY_MALLOC_TINY:
16608 case VM_MEMORY_MALLOC_LARGE_REUSABLE:
16609 case VM_MEMORY_MALLOC_LARGE_REUSED:
16610 /*
16611 * This is a malloc() memory region: check if it's still
16612 * in its original state and can be re-used for more
16613 * malloc() allocations.
16614 */
16615 break;
16616 default:
16617 /*
16618 * Not a malloc() memory region: let the caller decide if
16619 * it's re-usable.
16620 */
16621 return TRUE;
16622 }
16623
16624 if (/*entry->is_shared ||*/
16625 entry->is_sub_map ||
16626 entry->in_transition ||
16627 entry->protection != VM_PROT_DEFAULT ||
16628 entry->max_protection != VM_PROT_ALL ||
16629 entry->inheritance != VM_INHERIT_DEFAULT ||
16630 entry->no_cache ||
16631 entry->vme_permanent ||
16632 entry->superpage_size != FALSE ||
16633 entry->zero_wired_pages ||
16634 entry->wired_count != 0 ||
16635 entry->user_wired_count != 0) {
16636 return FALSE;
16637 }
16638
16639 object = VME_OBJECT(entry);
16640 if (object == VM_OBJECT_NULL) {
16641 return TRUE;
16642 }
16643 if (
16644#if 0
16645 /*
16646 * Let's proceed even if the VM object is potentially
16647 * shared.
16648 * We check for this later when processing the actual
16649 * VM pages, so the contents will be safe if shared.
16650 *
16651 * But we can still mark this memory region as "reusable" to
16652 * acknowledge that the caller did let us know that the memory
16653 * could be re-used and should not be penalized for holding
16654 * on to it. This allows its "resident size" to not include
16655 * the reusable range.
16656 */
16657 object->ref_count == 1 &&
16658#endif
16659 object->vo_copy == VM_OBJECT_NULL &&
16660 object->shadow == VM_OBJECT_NULL &&
16661 object->internal &&
16662 object->purgable == VM_PURGABLE_DENY &&
16663 object->wimg_bits == VM_WIMG_USE_DEFAULT &&
16664 !object->code_signed) {
16665 return TRUE;
16666 }
16667 return FALSE;
16668}
16669
16670static kern_return_t
16671vm_map_reuse_pages(
16672 vm_map_t map,
16673 vm_map_offset_t start,
16674 vm_map_offset_t end)
16675{
16676 vm_map_entry_t entry;
16677 vm_object_t object;
16678 vm_object_offset_t start_offset, end_offset;
16679
16680 /*
16681 * The MADV_REUSE operation doesn't require any changes to the
16682 * vm_map_entry_t's, so the read lock is sufficient.
16683 */
16684
16685 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16686 /*
16687 * XXX TODO4K
16688 * need to figure out what reusable means for a
16689 * portion of a native page.
16690 */
16691 return KERN_SUCCESS;
16692 }
16693
16694 vm_map_lock_read(map);
16695 assert(map->pmap != kernel_pmap); /* protect alias access */
16696
16697 /*
16698 * The madvise semantics require that the address range be fully
16699 * allocated with no holes. Otherwise, we're required to return
16700 * an error.
16701 */
16702
16703 if (!vm_map_range_check(map, start, end, &entry)) {
16704 vm_map_unlock_read(map);
16705 vm_page_stats_reusable.reuse_pages_failure++;
16706 return KERN_INVALID_ADDRESS;
16707 }
16708
16709 /*
16710 * Examine each vm_map_entry_t in the range.
16711 */
16712 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16713 entry = entry->vme_next) {
16714 /*
16715 * Sanity check on the VM map entry.
16716 */
16717 if (!vm_map_entry_is_reusable(entry)) {
16718 vm_map_unlock_read(map);
16719 vm_page_stats_reusable.reuse_pages_failure++;
16720 return KERN_INVALID_ADDRESS;
16721 }
16722
16723 /*
16724 * The first time through, the start address could be anywhere
16725 * within the vm_map_entry we found. So adjust the offset to
16726 * correspond.
16727 */
16728 if (entry->vme_start < start) {
16729 start_offset = start - entry->vme_start;
16730 } else {
16731 start_offset = 0;
16732 }
16733 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16734 start_offset += VME_OFFSET(entry);
16735 end_offset += VME_OFFSET(entry);
16736
16737 object = VME_OBJECT(entry);
16738 if (object != VM_OBJECT_NULL) {
16739 vm_object_lock(object);
16740 vm_object_reuse_pages(object, start_offset, end_offset,
16741 TRUE);
16742 vm_object_unlock(object);
16743 }
16744
16745 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSABLE) {
16746 /*
16747 * XXX
16748 * We do not hold the VM map exclusively here.
16749 * The "alias" field is not that critical, so it's
16750 * safe to update it here, as long as it is the only
16751 * one that can be modified while holding the VM map
16752 * "shared".
16753 */
16754 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSED);
16755 }
16756 }
16757
16758 vm_map_unlock_read(map);
16759 vm_page_stats_reusable.reuse_pages_success++;
16760 return KERN_SUCCESS;
16761}
16762
16763
16764static kern_return_t
16765vm_map_reusable_pages(
16766 vm_map_t map,
16767 vm_map_offset_t start,
16768 vm_map_offset_t end)
16769{
16770 vm_map_entry_t entry;
16771 vm_object_t object;
16772 vm_object_offset_t start_offset, end_offset;
16773 vm_map_offset_t pmap_offset;
16774
16775 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
16776 /*
16777 * XXX TODO4K
16778 * need to figure out what reusable means for a portion
16779 * of a native page.
16780 */
16781 return KERN_SUCCESS;
16782 }
16783
16784 /*
16785 * The MADV_REUSABLE operation doesn't require any changes to the
16786 * vm_map_entry_t's, so the read lock is sufficient.
16787 */
16788
16789 vm_map_lock_read(map);
16790 assert(map->pmap != kernel_pmap); /* protect alias access */
16791
16792 /*
16793 * The madvise semantics require that the address range be fully
16794 * allocated with no holes. Otherwise, we're required to return
16795 * an error.
16796 */
16797
16798 if (!vm_map_range_check(map, start, end, &entry)) {
16799 vm_map_unlock_read(map);
16800 vm_page_stats_reusable.reusable_pages_failure++;
16801 return KERN_INVALID_ADDRESS;
16802 }
16803
16804 /*
16805 * Examine each vm_map_entry_t in the range.
16806 */
16807 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16808 entry = entry->vme_next) {
16809 int kill_pages = 0;
16810 boolean_t reusable_no_write = FALSE;
16811
16812 /*
16813 * Sanity check on the VM map entry.
16814 */
16815 if (!vm_map_entry_is_reusable(entry)) {
16816 vm_map_unlock_read(map);
16817 vm_page_stats_reusable.reusable_pages_failure++;
16818 return KERN_INVALID_ADDRESS;
16819 }
16820
16821 if (!(entry->protection & VM_PROT_WRITE) && !entry->used_for_jit
16822#if __arm64e__
16823 && !entry->used_for_tpro
16824#endif
16825 ) {
16826 /* not writable: can't discard contents */
16827 vm_map_unlock_read(map);
16828 vm_page_stats_reusable.reusable_nonwritable++;
16829 vm_page_stats_reusable.reusable_pages_failure++;
16830 return KERN_PROTECTION_FAILURE;
16831 }
16832
16833 /*
16834 * The first time through, the start address could be anywhere
16835 * within the vm_map_entry we found. So adjust the offset to
16836 * correspond.
16837 */
16838 if (entry->vme_start < start) {
16839 start_offset = start - entry->vme_start;
16840 pmap_offset = start;
16841 } else {
16842 start_offset = 0;
16843 pmap_offset = entry->vme_start;
16844 }
16845 end_offset = MIN(end, entry->vme_end) - entry->vme_start;
16846 start_offset += VME_OFFSET(entry);
16847 end_offset += VME_OFFSET(entry);
16848
16849 object = VME_OBJECT(entry);
16850 if (object == VM_OBJECT_NULL) {
16851 continue;
16852 }
16853
16854 if (entry->protection & VM_PROT_EXECUTE) {
16855 /*
16856 * Executable mappings might be write-protected by
16857 * hardware, so do not attempt to write to these pages.
16858 */
16859 reusable_no_write = TRUE;
16860 }
16861
16862 if (entry->vme_xnu_user_debug) {
16863 /*
16864 * User debug pages might be write-protected by hardware,
16865 * so do not attempt to write to these pages.
16866 */
16867 reusable_no_write = TRUE;
16868 }
16869
16870 vm_object_lock(object);
16871 if (((object->ref_count == 1) ||
16872 (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC &&
16873 object->vo_copy == VM_OBJECT_NULL)) &&
16874 object->shadow == VM_OBJECT_NULL &&
16875 /*
16876 * "iokit_acct" entries are billed for their virtual size
16877 * (rather than for their resident pages only), so they
16878 * wouldn't benefit from making pages reusable, and it
16879 * would be hard to keep track of pages that are both
16880 * "iokit_acct" and "reusable" in the pmap stats and
16881 * ledgers.
16882 */
16883 !(entry->iokit_acct ||
16884 (!entry->is_sub_map && !entry->use_pmap))) {
16885 if (object->ref_count != 1) {
16886 vm_page_stats_reusable.reusable_shared++;
16887 }
16888 kill_pages = 1;
16889 } else {
16890 kill_pages = -1;
16891 }
16892 if (kill_pages != -1) {
16893 vm_object_deactivate_pages(object,
16894 start_offset,
16895 end_offset - start_offset,
16896 kill_pages,
16897 TRUE /*reusable_pages*/,
16898 reusable_no_write,
16899 map->pmap,
16900 pmap_offset);
16901 } else {
16902 vm_page_stats_reusable.reusable_pages_shared++;
16903 DTRACE_VM4(vm_map_reusable_pages_shared,
16904 unsigned int, VME_ALIAS(entry),
16905 vm_map_t, map,
16906 vm_map_entry_t, entry,
16907 vm_object_t, object);
16908 }
16909 vm_object_unlock(object);
16910
16911 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE ||
16912 VME_ALIAS(entry) == VM_MEMORY_MALLOC_LARGE_REUSED) {
16913 /*
16914 * XXX
16915 * We do not hold the VM map exclusively here.
16916 * The "alias" field is not that critical, so it's
16917 * safe to update it here, as long as it is the only
16918 * field that may be modified while the VM map is held
16919 * "shared".
16920 */
16921 VME_ALIAS_SET(entry, VM_MEMORY_MALLOC_LARGE_REUSABLE);
16922 }
16923 }
16924
16925 vm_map_unlock_read(map);
16926 vm_page_stats_reusable.reusable_pages_success++;
16927 return KERN_SUCCESS;
16928}
16929
16930
16931static kern_return_t
16932vm_map_can_reuse(
16933 vm_map_t map,
16934 vm_map_offset_t start,
16935 vm_map_offset_t end)
16936{
16937 vm_map_entry_t entry;
16938
16939 /*
16940 * The MADV_CAN_REUSE operation doesn't require any changes to the
16941 * vm_map_entry_t's, so the read lock is sufficient.
16942 */
16943
16944 vm_map_lock_read(map);
16945 assert(map->pmap != kernel_pmap); /* protect alias access */
16946
16947 /*
16948 * The madvise semantics require that the address range be fully
16949 * allocated with no holes. Otherwise, we're required to return
16950 * an error.
16951 */
16952
16953 if (!vm_map_range_check(map, start, end, &entry)) {
16954 vm_map_unlock_read(map);
16955 vm_page_stats_reusable.can_reuse_failure++;
16956 return KERN_INVALID_ADDRESS;
16957 }
16958
16959 /*
16960 * Examine each vm_map_entry_t in the range.
16961 */
16962 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
16963 entry = entry->vme_next) {
16964 /*
16965 * Sanity check on the VM map entry.
16966 */
16967 if (!vm_map_entry_is_reusable(entry)) {
16968 vm_map_unlock_read(map);
16969 vm_page_stats_reusable.can_reuse_failure++;
16970 return KERN_INVALID_ADDRESS;
16971 }
16972 }
16973
16974 vm_map_unlock_read(map);
16975 vm_page_stats_reusable.can_reuse_success++;
16976 return KERN_SUCCESS;
16977}
16978
16979
16980#if MACH_ASSERT
16981static kern_return_t
16982vm_map_pageout(
16983 vm_map_t map,
16984 vm_map_offset_t start,
16985 vm_map_offset_t end)
16986{
16987 vm_map_entry_t entry;
16988
16989 /*
16990 * The MADV_PAGEOUT operation doesn't require any changes to the
16991 * vm_map_entry_t's, so the read lock is sufficient.
16992 */
16993
16994 vm_map_lock_read(map);
16995
16996 /*
16997 * The madvise semantics require that the address range be fully
16998 * allocated with no holes. Otherwise, we're required to return
16999 * an error.
17000 */
17001
17002 if (!vm_map_range_check(map, start, end, &entry)) {
17003 vm_map_unlock_read(map);
17004 return KERN_INVALID_ADDRESS;
17005 }
17006
17007 /*
17008 * Examine each vm_map_entry_t in the range.
17009 */
17010 for (; entry != vm_map_to_entry(map) && entry->vme_start < end;
17011 entry = entry->vme_next) {
17012 vm_object_t object;
17013
17014 /*
17015 * Sanity check on the VM map entry.
17016 */
17017 if (entry->is_sub_map) {
17018 vm_map_t submap;
17019 vm_map_offset_t submap_start;
17020 vm_map_offset_t submap_end;
17021 vm_map_entry_t submap_entry;
17022
17023 submap = VME_SUBMAP(entry);
17024 submap_start = VME_OFFSET(entry);
17025 submap_end = submap_start + (entry->vme_end -
17026 entry->vme_start);
17027
17028 vm_map_lock_read(submap);
17029
17030 if (!vm_map_range_check(submap,
17031 submap_start,
17032 submap_end,
17033 &submap_entry)) {
17034 vm_map_unlock_read(submap);
17035 vm_map_unlock_read(map);
17036 return KERN_INVALID_ADDRESS;
17037 }
17038
17039 if (submap_entry->is_sub_map) {
17040 vm_map_unlock_read(submap);
17041 continue;
17042 }
17043
17044 object = VME_OBJECT(submap_entry);
17045 if (object == VM_OBJECT_NULL || !object->internal) {
17046 vm_map_unlock_read(submap);
17047 continue;
17048 }
17049
17050 vm_object_pageout(object);
17051
17052 vm_map_unlock_read(submap);
17053 submap = VM_MAP_NULL;
17054 submap_entry = VM_MAP_ENTRY_NULL;
17055 continue;
17056 }
17057
17058 object = VME_OBJECT(entry);
17059 if (object == VM_OBJECT_NULL || !object->internal) {
17060 continue;
17061 }
17062
17063 vm_object_pageout(object);
17064 }
17065
17066 vm_map_unlock_read(map);
17067 return KERN_SUCCESS;
17068}
17069#endif /* MACH_ASSERT */
17070
17071/*
17072 * This function determines if the zero operation can be run on the
17073 * respective entry. Additional checks on the object are in
17074 * vm_object_zero_preflight.
17075 */
17076static kern_return_t
17077vm_map_zero_entry_preflight(vm_map_entry_t entry)
17078{
17079 /*
17080 * Zeroing is restricted to writable non-executable entries and non-JIT
17081 * regions.
17082 */
17083 if (!(entry->protection & VM_PROT_WRITE) ||
17084 (entry->protection & VM_PROT_EXECUTE) ||
17085 entry->used_for_jit ||
17086 entry->vme_xnu_user_debug) {
17087 return KERN_PROTECTION_FAILURE;
17088 }
17089
17090 /*
17091 * Zeroing for copy on write isn't yet supported. Zeroing is also not
17092 * allowed for submaps.
17093 */
17094 if (entry->needs_copy || entry->is_sub_map) {
17095 return KERN_NO_ACCESS;
17096 }
17097
17098 return KERN_SUCCESS;
17099}
17100
17101/*
17102 * This function translates the requested start and end addresses into offsets within the entry's VM object
17103 */
17104static void
17105vm_map_get_bounds_in_object(
17106 vm_map_entry_t entry,
17107 vm_map_offset_t start,
17108 vm_map_offset_t end,
17109 vm_map_offset_t *start_offset,
17110 vm_map_offset_t *end_offset)
17111{
17112 if (entry->vme_start < start) {
17113 *start_offset = start - entry->vme_start;
17114 } else {
17115 *start_offset = 0;
17116 }
17117 *end_offset = MIN(end, entry->vme_end) - entry->vme_start;
17118 *start_offset += VME_OFFSET(entry);
17119 *end_offset += VME_OFFSET(entry);
17120}
17121
17122/*
17123 * This function iterates through the entries in the requested range
17124 * and zeroes any resident pages in the corresponding objects. Compressed
17125 * pages are dropped instead of being faulted in and zeroed.
17126 */
17127static kern_return_t
17128vm_map_zero(
17129 vm_map_t map,
17130 vm_map_offset_t start,
17131 vm_map_offset_t end)
17132{
17133 vm_map_entry_t entry;
17134 vm_map_offset_t cur = start;
17135 kern_return_t ret;
17136
17137 /*
17138 * This operation isn't supported where the map page size is less than
17139 * the hardware page size. The caller will need to handle the error and
17140 * explicitly zero the memory if needed.
17141 */
17142 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17143 return KERN_NO_ACCESS;
17144 }
17145
17146 /*
17147 * The MADV_ZERO operation doesn't require any changes to the
17148 * vm_map_entry_t's, so the read lock is sufficient.
17149 */
17150 vm_map_lock_read(map);
17151 assert(map->pmap != kernel_pmap); /* protect alias access */
17152
17153 /*
17154 * The madvise semantics require that the address range be fully
17155 * allocated with no holes. Otherwise, we're required to return
17156 * an error. This check needs to be redone if the map has changed.
17157 */
17158 if (!vm_map_range_check(map, cur, end, &entry)) {
17159 vm_map_unlock_read(map);
17160 return KERN_INVALID_ADDRESS;
17161 }
17162
17163 /*
17164 * Examine each vm_map_entry_t in the range.
17165 */
17166 while (entry != vm_map_to_entry(map) && entry->vme_start < end) {
17167 vm_map_offset_t cur_offset;
17168 vm_map_offset_t end_offset;
17169 unsigned int last_timestamp = map->timestamp;
17170 vm_object_t object = VME_OBJECT(entry);
17171
17172 ret = vm_map_zero_entry_preflight(entry);
17173 if (ret != KERN_SUCCESS) {
17174 vm_map_unlock_read(map);
17175 return ret;
17176 }
17177
17178 if (object == VM_OBJECT_NULL) {
17179 entry = entry->vme_next;
17180 continue;
17181 }
17182
17183 vm_map_get_bounds_in_object(entry, cur, end, &cur_offset, &end_offset);
17184 vm_object_lock(object);
17185 /*
17186 * Take a reference on the object as vm_object_zero will drop the object
17187 * lock when it encounters a busy page.
17188 */
17189 vm_object_reference_locked(object);
17190 vm_map_unlock_read(map);
17191
17192 ret = vm_object_zero(object, cur_offset, end_offset);
17193 vm_object_unlock(object);
17194 vm_object_deallocate(object);
17195 if (ret != KERN_SUCCESS) {
17196 return ret;
17197 }
17198 /*
17199 * Update cur as vm_object_zero has succeeded.
17200 */
17201 cur += (end_offset - cur_offset);
17202 if (cur == end) {
17203 return KERN_SUCCESS;
17204 }
17205
17206 /*
17207 * If the map timestamp has changed, restart by relooking up cur in the
17208 * map
17209 */
17210 vm_map_lock_read(map);
17211 if (last_timestamp != map->timestamp) {
17212 /*
17213 * Relookup cur in the map
17214 */
17215 if (!vm_map_range_check(map, cur, end, &entry)) {
17216 vm_map_unlock_read(map);
17217 return KERN_INVALID_ADDRESS;
17218 }
17219 continue;
17220 }
17221 /*
17222 * If the map hasn't changed proceed with the next entry
17223 */
17224 entry = entry->vme_next;
17225 }
17226
17227 vm_map_unlock_read(map);
17228 return KERN_SUCCESS;
17229}
17230
17231
17232/*
17233 * Routine: vm_map_entry_insert
17234 *
17235 * Description: This routine inserts a new vm_map_entry in a locked map.
17236 */
17237static vm_map_entry_t
17238vm_map_entry_insert(
17239 vm_map_t map,
17240 vm_map_entry_t insp_entry,
17241 vm_map_offset_t start,
17242 vm_map_offset_t end,
17243 vm_object_t object,
17244 vm_object_offset_t offset,
17245 vm_map_kernel_flags_t vmk_flags,
17246 boolean_t needs_copy,
17247 vm_prot_t cur_protection,
17248 vm_prot_t max_protection,
17249 vm_inherit_t inheritance,
17250 boolean_t clear_map_aligned)
17251{
17252 vm_map_entry_t new_entry;
17253 boolean_t map_aligned = FALSE;
17254
17255 assert(insp_entry != (vm_map_entry_t)0);
17256 vm_map_lock_assert_exclusive(map);
17257
17258#if DEVELOPMENT || DEBUG
17259 vm_object_offset_t end_offset = 0;
17260 assertf(!os_add_overflow(end - start, offset, &end_offset), "size 0x%llx, offset 0x%llx caused overflow", (uint64_t)(end - start), offset);
17261#endif /* DEVELOPMENT || DEBUG */
17262
17263 if (VM_MAP_PAGE_SHIFT(map) != PAGE_SHIFT) {
17264 map_aligned = TRUE;
17265 }
17266 if (clear_map_aligned &&
17267 (!VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)) ||
17268 !VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)))) {
17269 map_aligned = FALSE;
17270 }
17271 if (map_aligned) {
17272 assert(VM_MAP_PAGE_ALIGNED(start, VM_MAP_PAGE_MASK(map)));
17273 assert(VM_MAP_PAGE_ALIGNED(end, VM_MAP_PAGE_MASK(map)));
17274 } else {
17275 assert(page_aligned(start));
17276 assert(page_aligned(end));
17277 }
17278 assert(start < end);
17279
17280 new_entry = vm_map_entry_create(map);
17281
17282 new_entry->vme_start = start;
17283 new_entry->vme_end = end;
17284
17285 if (vmk_flags.vmkf_submap) {
17286 new_entry->vme_atomic = vmk_flags.vmkf_submap_atomic;
17287 VME_SUBMAP_SET(new_entry, (vm_map_t)object);
17288 } else {
17289 VME_OBJECT_SET(new_entry, object, false, 0);
17290 }
17291 VME_OFFSET_SET(new_entry, offset);
17292 VME_ALIAS_SET(new_entry, vmk_flags.vm_tag);
17293
17294 new_entry->map_aligned = map_aligned;
17295 new_entry->needs_copy = needs_copy;
17296 new_entry->inheritance = inheritance;
17297 new_entry->protection = cur_protection;
17298 new_entry->max_protection = max_protection;
17299 /*
17300 * submap: "use_pmap" means "nested".
17301 * default: false.
17302 *
17303 * object: "use_pmap" means "use pmap accounting" for footprint.
17304 * default: true.
17305 */
17306 new_entry->use_pmap = !vmk_flags.vmkf_submap;
17307 new_entry->no_cache = vmk_flags.vmf_no_cache;
17308 new_entry->vme_permanent = vmk_flags.vmf_permanent;
17309 new_entry->translated_allow_execute = vmk_flags.vmkf_translated_allow_execute;
17310 new_entry->vme_no_copy_on_read = vmk_flags.vmkf_no_copy_on_read;
17311 new_entry->superpage_size = (vmk_flags.vmf_superpage_size != 0);
17312
17313 if (vmk_flags.vmkf_map_jit) {
17314 if (!(map->jit_entry_exists) ||
17315 VM_MAP_POLICY_ALLOW_MULTIPLE_JIT(map)) {
17316 new_entry->used_for_jit = TRUE;
17317 map->jit_entry_exists = TRUE;
17318 }
17319 }
17320
17321 /*
17322 * Insert the new entry into the list.
17323 */
17324
17325 vm_map_store_entry_link(map, insp_entry, new_entry, vmk_flags);
17326 map->size += end - start;
17327
17328 /*
17329 * Update the free space hint and the lookup hint.
17330 */
17331
17332 SAVE_HINT_MAP_WRITE(map, new_entry);
17333 return new_entry;
17334}
17335
17336/*
17337 * Routine: vm_map_remap_extract
17338 *
17339 * Description: This routine returns a vm_map_entry list extracted from a map.
17340 */
17341static kern_return_t
17342vm_map_remap_extract(
17343 vm_map_t map,
17344 vm_map_offset_t addr,
17345 vm_map_size_t size,
17346 boolean_t copy,
17347 vm_map_copy_t map_copy,
17348 vm_prot_t *cur_protection, /* IN/OUT */
17349 vm_prot_t *max_protection, /* IN/OUT */
17350 /* What, no behavior? */
17351 vm_inherit_t inheritance,
17352 vm_map_kernel_flags_t vmk_flags)
17353{
17354 struct vm_map_header *map_header = &map_copy->cpy_hdr;
17355 kern_return_t result;
17356 vm_map_size_t mapped_size;
17357 vm_map_size_t tmp_size;
17358 vm_map_entry_t src_entry; /* result of last map lookup */
17359 vm_map_entry_t new_entry;
17360 vm_object_offset_t offset;
17361 vm_map_offset_t map_address;
17362 vm_map_offset_t src_start; /* start of entry to map */
17363 vm_map_offset_t src_end; /* end of region to be mapped */
17364 vm_object_t object;
17365 vm_map_version_t version;
17366 boolean_t src_needs_copy;
17367 boolean_t new_entry_needs_copy;
17368 vm_map_entry_t saved_src_entry;
17369 boolean_t src_entry_was_wired;
17370 vm_prot_t max_prot_for_prot_copy;
17371 vm_map_offset_t effective_page_mask;
17372 bool pageable, same_map;
17373 boolean_t vm_remap_legacy;
17374 vm_prot_t required_cur_prot, required_max_prot;
17375 vm_object_t new_copy_object; /* vm_object_copy_* result */
17376 boolean_t saved_used_for_jit; /* Saved used_for_jit. */
17377
17378 pageable = vmk_flags.vmkf_copy_pageable;
17379 same_map = vmk_flags.vmkf_copy_same_map;
17380
17381 effective_page_mask = MIN(PAGE_MASK, VM_MAP_PAGE_MASK(map));
17382
17383 assert(map != VM_MAP_NULL);
17384 assert(size != 0);
17385 assert(size == vm_map_round_page(size, effective_page_mask));
17386 assert(inheritance == VM_INHERIT_NONE ||
17387 inheritance == VM_INHERIT_COPY ||
17388 inheritance == VM_INHERIT_SHARE);
17389 assert(!(*cur_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17390 assert(!(*max_protection & ~(VM_PROT_ALL | VM_PROT_ALLEXEC)));
17391 assert((*cur_protection & *max_protection) == *cur_protection);
17392
17393 /*
17394 * Compute start and end of region.
17395 */
17396 src_start = vm_map_trunc_page(addr, effective_page_mask);
17397 src_end = vm_map_round_page(src_start + size, effective_page_mask);
17398
17399 /*
17400 * Initialize map_header.
17401 */
17402 map_header->nentries = 0;
17403 map_header->entries_pageable = pageable;
17404// map_header->page_shift = MIN(VM_MAP_PAGE_SHIFT(map), PAGE_SHIFT);
17405 map_header->page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(map);
17406 map_header->rb_head_store.rbh_root = (void *)(int)SKIP_RB_TREE;
17407 vm_map_store_init(map_header);
17408
17409 if (copy && vmk_flags.vmkf_remap_prot_copy) {
17410 /*
17411 * Special case for vm_map_protect(VM_PROT_COPY):
17412 * we want to set the new mappings' max protection to the
17413 * specified *max_protection...
17414 */
17415 max_prot_for_prot_copy = *max_protection & (VM_PROT_ALL | VM_PROT_ALLEXEC);
17416 /* ... but we want to use the vm_remap() legacy mode */
17417 *max_protection = VM_PROT_NONE;
17418 *cur_protection = VM_PROT_NONE;
17419 } else {
17420 max_prot_for_prot_copy = VM_PROT_NONE;
17421 }
17422
17423 if (*cur_protection == VM_PROT_NONE &&
17424 *max_protection == VM_PROT_NONE) {
17425 /*
17426 * vm_remap() legacy mode:
17427 * Extract all memory regions in the specified range and
17428 * collect the strictest set of protections allowed on the
17429 * entire range, so the caller knows what they can do with
17430 * the remapped range.
17431 * We start with VM_PROT_ALL and we'll remove the protections
17432 * missing from each memory region.
17433 */
17434 vm_remap_legacy = TRUE;
17435 *cur_protection = VM_PROT_ALL;
17436 *max_protection = VM_PROT_ALL;
17437 required_cur_prot = VM_PROT_NONE;
17438 required_max_prot = VM_PROT_NONE;
17439 } else {
17440 /*
17441 * vm_remap_new() mode:
17442 * Extract all memory regions in the specified range and
17443 * ensure that they have at least the protections specified
17444 * by the caller via *cur_protection and *max_protection.
17445 * The resulting mapping should have these protections.
17446 */
17447 vm_remap_legacy = FALSE;
17448 if (copy) {
17449 required_cur_prot = VM_PROT_NONE;
17450 required_max_prot = VM_PROT_READ;
17451 } else {
17452 required_cur_prot = *cur_protection;
17453 required_max_prot = *max_protection;
17454 }
17455 }
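/*
 * Illustrative sketch of the two calling conventions distinguished
 * above (hypothetical caller, not code from this file):
 *
 *	vm_prot_t cur = VM_PROT_NONE, max = VM_PROT_NONE;
 *	// legacy mode: VM_PROT_NONE in; on success the strictest
 *	// protections found across the extracted range come back out.
 *	kr = vm_map_remap_extract(map, addr, size, copy, map_copy,
 *	    &cur, &max, VM_INHERIT_DEFAULT, vmk_flags);
 *
 *	vm_prot_t cur2 = VM_PROT_READ, max2 = VM_PROT_READ | VM_PROT_WRITE;
 *	// vm_remap_new() mode: for a shared (copy == FALSE) extraction,
 *	// every entry in the range must already grant at least these
 *	// protections or the call fails with KERN_PROTECTION_FAILURE.
 *	kr = vm_map_remap_extract(map, addr, size, copy, map_copy,
 *	    &cur2, &max2, VM_INHERIT_DEFAULT, vmk_flags);
 */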
17456
17457 map_address = 0;
17458 mapped_size = 0;
17459 result = KERN_SUCCESS;
17460
17461 /*
17462 * The specified source virtual space might correspond to
17463 * multiple map entries, need to loop on them.
17464 */
17465 vm_map_lock(map);
17466
17467 if (map->pmap == kernel_pmap) {
17468 map_copy->is_kernel_range = true;
17469 map_copy->orig_range = kmem_addr_get_range(addr, size);
17470#if CONFIG_MAP_RANGES
17471 } else if (map->uses_user_ranges) {
17472 map_copy->is_user_range = true;
17473 map_copy->orig_range = vm_map_user_range_resolve(map, addr, size, NULL);
17474#endif /* CONFIG_MAP_RANGES */
17475 }
17476
17477 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
17478 /*
17479 * This address space uses sub-pages so the range might
17480 * not be re-mappable in an address space with larger
17481 * pages. Re-assemble any broken-up VM map entries to
17482 * improve our chances of making it work.
17483 */
17484 vm_map_simplify_range(map, src_start, src_end);
17485 }
17486 while (mapped_size != size) {
17487 vm_map_size_t entry_size;
17488
17489 /*
17490 * Find the beginning of the region.
17491 */
17492 if (!vm_map_lookup_entry(map, src_start, &src_entry)) {
17493 result = KERN_INVALID_ADDRESS;
17494 break;
17495 }
17496
17497 if (src_start < src_entry->vme_start ||
17498 (mapped_size && src_start != src_entry->vme_start)) {
17499 result = KERN_INVALID_ADDRESS;
17500 break;
17501 }
17502
17503 tmp_size = size - mapped_size;
17504 if (src_end > src_entry->vme_end) {
17505 tmp_size -= (src_end - src_entry->vme_end);
17506 }
17507
17508 entry_size = (vm_map_size_t)(src_entry->vme_end -
17509 src_entry->vme_start);
17510
17511 if (src_entry->is_sub_map &&
17512 vmk_flags.vmkf_copy_single_object) {
17513 vm_map_t submap;
17514 vm_map_offset_t submap_start;
17515 vm_map_size_t submap_size;
17516 boolean_t submap_needs_copy;
17517
17518 /*
17519 * No check for "required protection" on "src_entry"
17520 * because the protections that matter are the ones
17521 * on the submap's VM map entry, which will be checked
17522 * during the call to vm_map_remap_extract() below.
17523 */
17524 submap_size = src_entry->vme_end - src_start;
17525 if (submap_size > size) {
17526 submap_size = size;
17527 }
17528 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17529 submap = VME_SUBMAP(src_entry);
17530 if (copy) {
17531 /*
17532 * The caller wants a copy-on-write re-mapping,
17533 * so let's extract from the submap accordingly.
17534 */
17535 submap_needs_copy = TRUE;
17536 } else if (src_entry->needs_copy) {
17537 /*
17538 * The caller wants a shared re-mapping but the
17539 * submap is mapped with "needs_copy", so its
17540 * contents can't be shared as is. Extract the
17541 * contents of the submap as "copy-on-write".
17542 * The re-mapping won't be shared with the
17543 * original mapping but this is equivalent to
17544 * what happened with the original "remap from
17545 * submap" code.
17546 * The shared region is mapped "needs_copy", for
17547 * example.
17548 */
17549 submap_needs_copy = TRUE;
17550 } else {
17551 /*
17552 * The caller wants a shared re-mapping and
17553 * this mapping can be shared (no "needs_copy"),
17554 * so let's extract from the submap accordingly.
17555 * Kernel submaps are mapped without
17556 * "needs_copy", for example.
17557 */
17558 submap_needs_copy = FALSE;
17559 }
17560 vm_map_reference(submap);
17561 vm_map_unlock(map);
17562 src_entry = NULL;
17563 if (vm_remap_legacy) {
17564 *cur_protection = VM_PROT_NONE;
17565 *max_protection = VM_PROT_NONE;
17566 }
17567
17568 DTRACE_VM7(remap_submap_recurse,
17569 vm_map_t, map,
17570 vm_map_offset_t, addr,
17571 vm_map_size_t, size,
17572 boolean_t, copy,
17573 vm_map_offset_t, submap_start,
17574 vm_map_size_t, submap_size,
17575 boolean_t, submap_needs_copy);
17576
17577 result = vm_map_remap_extract(submap,
17578 submap_start,
17579 submap_size,
17580 submap_needs_copy,
17581 map_copy,
17582 cur_protection,
17583 max_protection,
17584 inheritance,
17585 vmk_flags);
17586 vm_map_deallocate(submap);
17587
17588 if (result == KERN_SUCCESS &&
17589 submap_needs_copy &&
17590 !copy) {
17591 /*
17592 * We were asked for a "shared"
17593 * re-mapping but had to ask for a
17594 * "copy-on-write" remapping of the
17595 * submap's mapping to honor the
17596 * submap's "needs_copy".
17597 * We now need to resolve that
17598 * pending "copy-on-write" to
17599 * get something we can share.
17600 */
17601 vm_map_entry_t copy_entry;
17602 vm_object_offset_t copy_offset;
17603 vm_map_size_t copy_size;
17604 vm_object_t copy_object;
17605 copy_entry = vm_map_copy_first_entry(map_copy);
17606 copy_size = copy_entry->vme_end - copy_entry->vme_start;
17607 copy_object = VME_OBJECT(copy_entry);
17608 copy_offset = VME_OFFSET(copy_entry);
17609 if (copy_object == VM_OBJECT_NULL) {
17610 assert(copy_offset == 0);
17611 assert(!copy_entry->needs_copy);
17612 if (copy_entry->max_protection == VM_PROT_NONE) {
17613 assert(copy_entry->protection == VM_PROT_NONE);
17614 /* nothing to share */
17615 } else {
17616 assert(copy_offset == 0);
17617 copy_object = vm_object_allocate(copy_size);
17618 VME_OFFSET_SET(copy_entry, 0);
17619 VME_OBJECT_SET(copy_entry, copy_object, false, 0);
17620 assert(copy_entry->use_pmap);
17621 }
17622 } else if (copy_object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17623 /* already shareable */
17624 assert(!copy_entry->needs_copy);
17625 } else if (copy_entry->needs_copy ||
17626 copy_object->shadowed ||
17627 (object->internal &&
17628 !object->true_share &&
17629 !copy_entry->is_shared &&
17630 copy_object->vo_size > copy_size)) {
17631 VME_OBJECT_SHADOW(copy_entry, copy_size, TRUE);
17632 assert(copy_entry->use_pmap);
17633 if (copy_entry->needs_copy) {
17634 /* already write-protected */
17635 } else {
17636 vm_prot_t prot;
17637 prot = copy_entry->protection & ~VM_PROT_WRITE;
17638 vm_object_pmap_protect(copy_object,
17639 copy_offset,
17640 copy_size,
17641 PMAP_NULL,
17642 PAGE_SIZE,
17643 0,
17644 prot);
17645 }
17646 copy_entry->needs_copy = FALSE;
17647 }
17648 copy_object = VME_OBJECT(copy_entry);
17649 copy_offset = VME_OFFSET(copy_entry);
17650 if (copy_object &&
17651 copy_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
17652 copy_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
17653 copy_object->true_share = TRUE;
17654 }
17655 }
17656
17657 return result;
17658 }
17659
17660 if (src_entry->is_sub_map) {
17661 /* protections for submap mapping are irrelevant here */
17662 } else if (((src_entry->protection & required_cur_prot) !=
17663 required_cur_prot) ||
17664 ((src_entry->max_protection & required_max_prot) !=
17665 required_max_prot)) {
17666 if (vmk_flags.vmkf_copy_single_object &&
17667 mapped_size != 0) {
17668 /*
17669 * Single object extraction.
17670 * We can't extract more with the required
17671 * protection but we've extracted some, so
17672 * stop there and declare success.
17673 * The caller should check the size of
17674 * the copy entry we've extracted.
17675 */
17676 result = KERN_SUCCESS;
17677 } else {
17678 /*
17679 * VM range extraction.
17680 * Required protection is not available
17681 * for this part of the range: fail.
17682 */
17683 result = KERN_PROTECTION_FAILURE;
17684 }
17685 break;
17686 }
17687
17688 if (src_entry->is_sub_map) {
17689 vm_map_t submap;
17690 vm_map_offset_t submap_start;
17691 vm_map_size_t submap_size;
17692 vm_map_copy_t submap_copy;
17693 vm_prot_t submap_curprot, submap_maxprot;
17694 boolean_t submap_needs_copy;
17695
17696 /*
17697 * No check for "required protection" on "src_entry"
17698 * because the protections that matter are the ones
17699 * on the submap's VM map entry, which will be checked
17700 * during the call to vm_map_copy_extract() below.
17701 */
17702 object = VM_OBJECT_NULL;
17703 submap_copy = VM_MAP_COPY_NULL;
17704
17705 /* find equivalent range in the submap */
17706 submap = VME_SUBMAP(src_entry);
17707 submap_start = VME_OFFSET(src_entry) + src_start - src_entry->vme_start;
17708 submap_size = tmp_size;
17709 if (copy) {
17710 /*
17711 * The caller wants a copy-on-write re-mapping,
17712 * so let's extract from the submap accordingly.
17713 */
17714 submap_needs_copy = TRUE;
17715 } else if (src_entry->needs_copy) {
17716 /*
17717 * The caller wants a shared re-mapping but the
17718 * submap is mapped with "needs_copy", so its
17719 * contents can't be shared as is. Extract the
17720 * contents of the submap as "copy-on-write".
17721 * The re-mapping won't be shared with the
17722 * original mapping but this is equivalent to
17723 * what happened with the original "remap from
17724 * submap" code.
17725 * The shared region is mapped "needs_copy", for
17726 * example.
17727 */
17728 submap_needs_copy = TRUE;
17729 } else {
17730 /*
17731 * The caller wants a shared re-mapping and
17732 * this mapping can be shared (no "needs_copy"),
17733 * so let's extract from the submap accordingly.
17734 * Kernel submaps are mapped without
17735 * "needs_copy", for example.
17736 */
17737 submap_needs_copy = FALSE;
17738 }
17739 /* extra ref to keep submap alive */
17740 vm_map_reference(submap);
17741
17742 DTRACE_VM7(remap_submap_recurse,
17743 vm_map_t, map,
17744 vm_map_offset_t, addr,
17745 vm_map_size_t, size,
17746 boolean_t, copy,
17747 vm_map_offset_t, submap_start,
17748 vm_map_size_t, submap_size,
17749 boolean_t, submap_needs_copy);
17750
17751 /*
17752 * The map can be safely unlocked since we
17753 * already hold a reference on the submap.
17754 *
17755 * No timestamp since we don't care if the map
17756 * gets modified while we're down in the submap.
17757 * We'll resume the extraction at src_start + tmp_size
17758 * anyway.
17759 */
17760 vm_map_unlock(map);
17761 src_entry = NULL; /* not valid once map is unlocked */
17762
17763 if (vm_remap_legacy) {
17764 submap_curprot = VM_PROT_NONE;
17765 submap_maxprot = VM_PROT_NONE;
17766 if (max_prot_for_prot_copy) {
17767 submap_maxprot = max_prot_for_prot_copy;
17768 }
17769 } else {
17770 assert(!max_prot_for_prot_copy);
17771 submap_curprot = *cur_protection;
17772 submap_maxprot = *max_protection;
17773 }
17774 result = vm_map_copy_extract(submap,
17775 submap_start,
17776 submap_size,
17777 submap_needs_copy,
17778 &submap_copy,
17779 &submap_curprot,
17780 &submap_maxprot,
17781 inheritance,
17782 vmk_flags);
17783
17784 /* release extra ref on submap */
17785 vm_map_deallocate(submap);
17786 submap = VM_MAP_NULL;
17787
17788 if (result != KERN_SUCCESS) {
17789 vm_map_lock(map);
17790 break;
17791 }
17792
17793 /* transfer submap_copy entries to map_header */
17794 while (vm_map_copy_first_entry(submap_copy) !=
17795 vm_map_copy_to_entry(submap_copy)) {
17796 vm_map_entry_t copy_entry;
17797 vm_map_size_t copy_entry_size;
17798
17799 copy_entry = vm_map_copy_first_entry(submap_copy);
17800
17801 /*
17802 * Prevent kernel_object from being exposed to
17803 * user space.
17804 */
17805 if (__improbable(copy_entry->vme_kernel_object)) {
17806 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17807 proc_selfpid(),
17808 (get_bsdtask_info(current_task())
17809 ? proc_name_address(get_bsdtask_info(current_task()))
17810 : "?"));
17811 DTRACE_VM(extract_kernel_only);
17812 result = KERN_INVALID_RIGHT;
17813 vm_map_copy_discard(submap_copy);
17814 submap_copy = VM_MAP_COPY_NULL;
17815 vm_map_lock(map);
17816 break;
17817 }
17818
17819#ifdef __arm64e__
17820 if (vmk_flags.vmkf_tpro_enforcement_override) {
17821 copy_entry->used_for_tpro = FALSE;
17822 }
17823#endif /* __arm64e__ */
17824
17825 vm_map_copy_entry_unlink(submap_copy, copy_entry);
17826 copy_entry_size = copy_entry->vme_end - copy_entry->vme_start;
17827 copy_entry->vme_start = map_address;
17828 copy_entry->vme_end = map_address + copy_entry_size;
17829 map_address += copy_entry_size;
17830 mapped_size += copy_entry_size;
17831 src_start += copy_entry_size;
17832 assert(src_start <= src_end);
17833 _vm_map_store_entry_link(map_header,
17834 map_header->links.prev,
17835 copy_entry);
17836 }
17837 /* done with submap_copy */
17838 vm_map_copy_discard(submap_copy);
17839
17840 if (vm_remap_legacy) {
17841 *cur_protection &= submap_curprot;
17842 *max_protection &= submap_maxprot;
17843 }
17844
17845 /* re-acquire the map lock and continue to next entry */
17846 vm_map_lock(map);
17847 continue;
17848 } else {
17849 object = VME_OBJECT(src_entry);
17850
17851 /*
17852 * Prevent kernel_object from being exposed to
17853 * user space.
17854 */
17855 if (__improbable(is_kernel_object(object))) {
17856 printf("%d[%s]: rejecting attempt to extract from kernel_object\n",
17857 proc_selfpid(),
17858 (get_bsdtask_info(current_task())
17859 ? proc_name_address(get_bsdtask_info(current_task()))
17860 : "?"));
17861 DTRACE_VM(extract_kernel_only);
17862 result = KERN_INVALID_RIGHT;
17863 break;
17864 }
17865
17866 if (src_entry->iokit_acct) {
17867 /*
17868 * This entry uses "IOKit accounting".
17869 */
17870 } else if (object != VM_OBJECT_NULL &&
17871 (object->purgable != VM_PURGABLE_DENY ||
17872 object->vo_ledger_tag != VM_LEDGER_TAG_NONE)) {
17873 /*
17874 * Purgeable objects have their own accounting:
17875 * no pmap accounting for them.
17876 */
17877 assertf(!src_entry->use_pmap,
17878 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17879 map,
17880 src_entry,
17881 (uint64_t)src_entry->vme_start,
17882 (uint64_t)src_entry->vme_end,
17883 src_entry->protection,
17884 src_entry->max_protection,
17885 VME_ALIAS(src_entry));
17886 } else {
17887 /*
17888 * Not IOKit or purgeable:
17889 * must be accounted by pmap stats.
17890 */
17891 assertf(src_entry->use_pmap,
17892 "map=%p src_entry=%p [0x%llx:0x%llx] 0x%x/0x%x %d",
17893 map,
17894 src_entry,
17895 (uint64_t)src_entry->vme_start,
17896 (uint64_t)src_entry->vme_end,
17897 src_entry->protection,
17898 src_entry->max_protection,
17899 VME_ALIAS(src_entry));
17900 }
17901
17902 if (object == VM_OBJECT_NULL) {
17903 assert(!src_entry->needs_copy);
17904 if (src_entry->max_protection == VM_PROT_NONE) {
17905 assert(src_entry->protection == VM_PROT_NONE);
17906 /*
17907 * No VM object and no permissions:
17908 * this must be a reserved range with
17909 * nothing to share or copy.
17910 * There could also be all sorts of
17911 * pmap shenanigans within that reserved
17912 * range, so let's just copy the map
17913 * entry as is to remap a similar
17914 * reserved range.
17915 */
17916 offset = 0; /* no object => no offset */
17917 goto copy_src_entry;
17918 }
17919 object = vm_object_allocate(entry_size);
17920 VME_OFFSET_SET(src_entry, 0);
17921 VME_OBJECT_SET(src_entry, object, false, 0);
17922 assert(src_entry->use_pmap);
17923 assert(!map->mapped_in_other_pmaps);
17924 } else if (src_entry->wired_count ||
17925 object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
17926 /*
17927 * A wired memory region should not have
17928 * any pending copy-on-write and needs to
17929 * keep pointing at the VM object that
17930 * contains the wired pages.
17931 * If we're sharing this memory (copy=false),
17932 * we'll share this VM object.
17933 * If we're copying this memory (copy=true),
17934 * we'll call vm_object_copy_slowly() below
17935 * and use the new VM object for the remapping.
17936 *
17937 * Or, we are already using an asymmetric
17938 * copy, and therefore we already have
17939 * the right object.
17940 */
17941 assert(!src_entry->needs_copy);
17942 } else if (src_entry->needs_copy || object->shadowed ||
17943 (object->internal && !object->true_share &&
17944 !src_entry->is_shared &&
17945 object->vo_size > entry_size)) {
17946 bool is_writable;
17947
17948 VME_OBJECT_SHADOW(src_entry, entry_size,
17949 vm_map_always_shadow(map));
17950 assert(src_entry->use_pmap);
17951
17952 is_writable = false;
17953 if (src_entry->protection & VM_PROT_WRITE) {
17954 is_writable = true;
17955#if __arm64e__
17956 } else if (src_entry->used_for_tpro) {
17957 is_writable = true;
17958#endif /* __arm64e__ */
17959 }
17960 if (!src_entry->needs_copy && is_writable) {
17961 vm_prot_t prot;
17962
17963 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
17964 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17965 __FUNCTION__,
17966 map, map->pmap,
17967 src_entry,
17968 (uint64_t)src_entry->vme_start,
17969 (uint64_t)src_entry->vme_end,
17970 src_entry->protection);
17971 }
17972
17973 prot = src_entry->protection & ~VM_PROT_WRITE;
17974
17975 if (override_nx(map,
17976 VME_ALIAS(src_entry))
17977 && prot) {
17978 prot |= VM_PROT_EXECUTE;
17979 }
17980
17981 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
17982 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
17983 __FUNCTION__,
17984 map, map->pmap,
17985 src_entry,
17986 (uint64_t)src_entry->vme_start,
17987 (uint64_t)src_entry->vme_end,
17988 prot);
17989 }
17990
17991 if (map->mapped_in_other_pmaps) {
17992 vm_object_pmap_protect(
17993 VME_OBJECT(src_entry),
17994 VME_OFFSET(src_entry),
17995 entry_size,
17996 PMAP_NULL,
17997 PAGE_SIZE,
17998 src_entry->vme_start,
17999 prot);
18000#if MACH_ASSERT
18001 } else if (__improbable(map->pmap == PMAP_NULL)) {
18002 extern boolean_t vm_tests_in_progress;
18003 assert(vm_tests_in_progress);
18004 /*
18005 * Some VM tests (in vm_tests.c)
18006 * sometimes want to use a VM
18007 * map without a pmap.
18008 * Otherwise, this should never
18009 * happen.
18010 */
18011#endif /* MACH_ASSERT */
18012 } else {
18013 pmap_protect(vm_map_pmap(map),
18014 src_entry->vme_start,
18015 src_entry->vme_end,
18016 prot);
18017 }
18018 }
18019
18020 object = VME_OBJECT(src_entry);
18021 src_entry->needs_copy = FALSE;
18022 }
18023
18024
18025 vm_object_lock(object);
18026 vm_object_reference_locked(object); /* object ref. for new entry */
18027 assert(!src_entry->needs_copy);
18028 if (object->copy_strategy ==
18029 MEMORY_OBJECT_COPY_SYMMETRIC) {
18030 /*
18031 * If we want to share this object (copy==0),
18032 * it needs to be COPY_DELAY.
18033 * If we want to copy this object (copy==1),
18034 * we can't just set "needs_copy" on our side
18035 * and expect the other side to do the same
18036 * (symmetrically), so we can't let the object
18037 * stay COPY_SYMMETRIC.
18038 * So we always switch from COPY_SYMMETRIC to
18039 * COPY_DELAY.
18040 */
18041 object->copy_strategy =
18042 MEMORY_OBJECT_COPY_DELAY;
18043 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
18044 }
18045 vm_object_unlock(object);
18046 }
18047
18048 offset = (VME_OFFSET(src_entry) +
18049 (src_start - src_entry->vme_start));
18050
18051copy_src_entry:
18052 new_entry = _vm_map_entry_create(map_header);
18053 vm_map_entry_copy(map, new_entry, src_entry);
18054 if (new_entry->is_sub_map) {
18055 /* clr address space specifics */
18056 new_entry->use_pmap = FALSE;
18057 } else if (copy) {
18058 /*
18059 * We're dealing with a copy-on-write operation,
18060 * so the resulting mapping should not inherit the
18061 * original mapping's accounting settings.
18062 * "use_pmap" should be reset to its default (TRUE)
18063 * so that the new mapping gets accounted for in
18064 * the task's memory footprint.
18065 */
18066 new_entry->use_pmap = TRUE;
18067 }
18068 /* "iokit_acct" was cleared in vm_map_entry_copy() */
18069 assert(!new_entry->iokit_acct);
18070
18071 new_entry->map_aligned = FALSE;
18072
18073 new_entry->vme_start = map_address;
18074 new_entry->vme_end = map_address + tmp_size;
18075 assert(new_entry->vme_start < new_entry->vme_end);
18076 if (copy && vmk_flags.vmkf_remap_prot_copy) {
18077 /* security: keep "permanent" and "csm_associated" */
18078 new_entry->vme_permanent = src_entry->vme_permanent;
18079 new_entry->csm_associated = src_entry->csm_associated;
18080 /*
18081 * Remapping for vm_map_protect(VM_PROT_COPY)
18082 * to convert a read-only mapping into a
18083 * copy-on-write version of itself but
18084 * with write access:
18085 * keep the original inheritance but let's not
18086 * add VM_PROT_WRITE to the max protection yet
18087 * since we want to do more security checks against
18088 * the target map.
18089 */
18090 new_entry->inheritance = src_entry->inheritance;
18091 new_entry->protection &= max_prot_for_prot_copy;
18092 } else {
18093 new_entry->inheritance = inheritance;
18094 if (!vm_remap_legacy) {
18095 new_entry->protection = *cur_protection;
18096 new_entry->max_protection = *max_protection;
18097 }
18098 }
18099#ifdef __arm64e__
18100 if (copy && vmk_flags.vmkf_tpro_enforcement_override) {
18101 new_entry->used_for_tpro = FALSE;
18102 }
18103#endif /* __arm64e__ */
18104 VME_OFFSET_SET(new_entry, offset);
18105
18106 /*
18107 * The new region has to be copied now if required.
18108 */
18109RestartCopy:
18110 if (!copy) {
18111 if (src_entry->used_for_jit == TRUE) {
18112 if (same_map) {
18113 } else if (!VM_MAP_POLICY_ALLOW_JIT_SHARING(map)) {
18114 /*
18115 * Cannot allow an entry describing a JIT
18116 * region to be shared across address spaces.
18117 */
18118 result = KERN_INVALID_ARGUMENT;
18119 vm_object_deallocate(object);
18120 vm_map_entry_dispose(new_entry);
18121 new_entry = VM_MAP_ENTRY_NULL;
18122 break;
18123 }
18124 }
18125
18126 src_entry->is_shared = TRUE;
18127 new_entry->is_shared = TRUE;
18128 if (!(new_entry->is_sub_map)) {
18129 new_entry->needs_copy = FALSE;
18130 }
18131 } else if (src_entry->is_sub_map) {
18132 /* make this a COW sub_map if not already */
18133 assert(new_entry->wired_count == 0);
18134 new_entry->needs_copy = TRUE;
18135 object = VM_OBJECT_NULL;
18136 } else if (src_entry->wired_count == 0 &&
18137 !(debug4k_no_cow_copyin && VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) &&
18138 vm_object_copy_quickly(VME_OBJECT(new_entry),
18139 VME_OFFSET(new_entry),
18140 (new_entry->vme_end -
18141 new_entry->vme_start),
18142 &src_needs_copy,
18143 &new_entry_needs_copy)) {
18144 new_entry->needs_copy = new_entry_needs_copy;
18145 new_entry->is_shared = FALSE;
18146 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18147
18148 /*
18149 * Handle copy_on_write semantics.
18150 */
18151 if (src_needs_copy && !src_entry->needs_copy) {
18152 vm_prot_t prot;
18153
18154 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, src_entry->protection)) {
18155 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18156 __FUNCTION__,
18157 map, map->pmap, src_entry,
18158 (uint64_t)src_entry->vme_start,
18159 (uint64_t)src_entry->vme_end,
18160 src_entry->protection);
18161 }
18162
18163 prot = src_entry->protection & ~VM_PROT_WRITE;
18164
18165 if (override_nx(map,
18166 VME_ALIAS(src_entry))
18167 && prot) {
18168 prot |= VM_PROT_EXECUTE;
18169 }
18170
18171 if (pmap_has_prot_policy(map->pmap, src_entry->translated_allow_execute, prot)) {
18172 panic("%s: map %p pmap %p entry %p 0x%llx:0x%llx prot 0x%x",
18173 __FUNCTION__,
18174 map, map->pmap, src_entry,
18175 (uint64_t)src_entry->vme_start,
18176 (uint64_t)src_entry->vme_end,
18177 prot);
18178 }
18179
18180 vm_object_pmap_protect(object,
18181 offset,
18182 entry_size,
18183 ((src_entry->is_shared
18184 || map->mapped_in_other_pmaps) ?
18185 PMAP_NULL : map->pmap),
18186 VM_MAP_PAGE_SIZE(map),
18187 src_entry->vme_start,
18188 prot);
18189
18190 assert(src_entry->wired_count == 0);
18191 src_entry->needs_copy = TRUE;
18192 }
18193 /*
18194 * Throw away the old object reference of the new entry.
18195 */
18196 vm_object_deallocate(object);
18197 } else {
18198 new_entry->is_shared = FALSE;
18199 assertf(new_entry->use_pmap, "map %p new_entry %p\n", map, new_entry);
18200
18201 src_entry_was_wired = (src_entry->wired_count > 0);
18202 saved_src_entry = src_entry;
18203 src_entry = VM_MAP_ENTRY_NULL;
18204
18205 /*
18206 * The map can be safely unlocked since we
18207 * already hold a reference on the object.
18208 *
18209 * Record the timestamp of the map for later
18210 * verification, and unlock the map.
18211 */
18212 version.main_timestamp = map->timestamp;
18213 vm_map_unlock(map); /* Increments timestamp once! */
18214
18215 /*
18216 * Perform the copy.
18217 */
18218 if (src_entry_was_wired > 0 ||
18219 (debug4k_no_cow_copyin &&
18220 VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT)) {
18221 vm_object_lock(object);
18222 result = vm_object_copy_slowly(
18223 object,
18224 offset,
18225 (new_entry->vme_end -
18226 new_entry->vme_start),
18227 THREAD_UNINT,
18228 &new_copy_object);
18229 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18230 saved_used_for_jit = new_entry->used_for_jit;
18231 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18232 new_entry->used_for_jit = saved_used_for_jit;
18233 VME_OFFSET_SET(new_entry, offset - vm_object_trunc_page(offset));
18234 new_entry->needs_copy = FALSE;
18235 } else {
18236 vm_object_offset_t new_offset;
18237
18238 new_offset = VME_OFFSET(new_entry);
18239 result = vm_object_copy_strategically(
18240 object,
18241 offset,
18242 (new_entry->vme_end -
18243 new_entry->vme_start),
18244 false, /* forking */
18245 &new_copy_object,
18246 &new_offset,
18247 &new_entry_needs_copy);
18248 /* VME_OBJECT_SET will reset used_for_jit, so preserve it. */
18249 saved_used_for_jit = new_entry->used_for_jit;
18250 VME_OBJECT_SET(new_entry, new_copy_object, false, 0);
18251 new_entry->used_for_jit = saved_used_for_jit;
18252 if (new_offset != VME_OFFSET(new_entry)) {
18253 VME_OFFSET_SET(new_entry, new_offset);
18254 }
18255
18256 new_entry->needs_copy = new_entry_needs_copy;
18257 }
18258
18259 /*
18260 * Throw away the old object reference of the new entry.
18261 */
18262 vm_object_deallocate(object);
18263
18264 if (result != KERN_SUCCESS &&
18265 result != KERN_MEMORY_RESTART_COPY) {
18266 vm_map_entry_dispose(new_entry);
18267 vm_map_lock(map);
18268 break;
18269 }
18270
18271 /*
18272 * Verify that the map has not substantially
18273 * changed while the copy was being made.
18274 */
18275
18276 vm_map_lock(map);
18277 if (version.main_timestamp + 1 != map->timestamp) {
18278 /*
18279 * Simple version comparison failed.
18280 *
18281 * Retry the lookup and verify that the
18282 * same object/offset are still present.
18283 */
18284 saved_src_entry = VM_MAP_ENTRY_NULL;
18285 vm_object_deallocate(VME_OBJECT(new_entry));
18286 vm_map_entry_dispose(new_entry);
18287 if (result == KERN_MEMORY_RESTART_COPY) {
18288 result = KERN_SUCCESS;
18289 }
18290 continue;
18291 }
18292 /* map hasn't changed: src_entry is still valid */
18293 src_entry = saved_src_entry;
18294 saved_src_entry = VM_MAP_ENTRY_NULL;
18295
18296 if (result == KERN_MEMORY_RESTART_COPY) {
18297 vm_object_reference(object);
18298 goto RestartCopy;
18299 }
18300 }
18301
18302 _vm_map_store_entry_link(map_header,
18303 map_header->links.prev, new_entry);
18304
18305 /* protections for submap mapping are irrelevant here */
18306 if (vm_remap_legacy && !src_entry->is_sub_map) {
18307 *cur_protection &= src_entry->protection;
18308 *max_protection &= src_entry->max_protection;
18309 }
18310
18311 map_address += tmp_size;
18312 mapped_size += tmp_size;
18313 src_start += tmp_size;
18314
18315 if (vmk_flags.vmkf_copy_single_object) {
18316 if (mapped_size != size) {
18317 DEBUG4K_SHARE("map %p addr 0x%llx size 0x%llx clipped copy at mapped_size 0x%llx\n",
18318 map, (uint64_t)addr, (uint64_t)size, (uint64_t)mapped_size);
18319 if (src_entry->vme_next != vm_map_to_entry(map) &&
18320 src_entry->vme_next->vme_object_value ==
18321 src_entry->vme_object_value) {
18322 /* XXX TODO4K */
18323 DEBUG4K_ERROR("could have extended copy to next entry...\n");
18324 }
18325 }
18326 break;
18327 }
18328 } /* end while */
18329
18330 vm_map_unlock(map);
18331 if (result != KERN_SUCCESS) {
18332 /*
18333 * Free all allocated elements.
18334 */
18335 for (src_entry = map_header->links.next;
18336 src_entry != CAST_TO_VM_MAP_ENTRY(&map_header->links);
18337 src_entry = new_entry) {
18338 new_entry = src_entry->vme_next;
18339 _vm_map_store_entry_unlink(map_header, src_entry, false);
18340 if (src_entry->is_sub_map) {
18341 vm_map_deallocate(VME_SUBMAP(src_entry));
18342 } else {
18343 vm_object_deallocate(VME_OBJECT(src_entry));
18344 }
18345 vm_map_entry_dispose(src_entry);
18346 }
18347 }
18348 return result;
18349}
18350
18351bool
18352vm_map_is_exotic(
18353 vm_map_t map)
18354{
18355 return VM_MAP_IS_EXOTIC(map);
18356}
18357
18358bool
18359vm_map_is_alien(
18360 vm_map_t map)
18361{
18362 return VM_MAP_IS_ALIEN(map);
18363}
18364
18365#if XNU_TARGET_OS_OSX
18366void
18367vm_map_mark_alien(
18368 vm_map_t map)
18369{
18370 vm_map_lock(map);
18371 map->is_alien = true;
18372 vm_map_unlock(map);
18373}
18374
18375void
18376vm_map_single_jit(
18377 vm_map_t map)
18378{
18379 vm_map_lock(map);
18380 map->single_jit = true;
18381 vm_map_unlock(map);
18382}
18383#endif /* XNU_TARGET_OS_OSX */
18384
18385
18386/*
18387 * Callers of this function must call vm_map_copy_require on
18388 * previously created vm_map_copy_t or pass a newly created
18389 * one to ensure that it hasn't been forged.
18390 */
18391static kern_return_t
18392vm_map_copy_to_physcopy(
18393 vm_map_copy_t copy_map,
18394 vm_map_t target_map)
18395{
18396 vm_map_size_t size;
18397 vm_map_entry_t entry;
18398 vm_map_entry_t new_entry;
18399 vm_object_t new_object;
18400 unsigned int pmap_flags;
18401 pmap_t new_pmap;
18402 vm_map_t new_map;
18403 vm_map_address_t src_start, src_end, src_cur;
18404 vm_map_address_t dst_start, dst_end, dst_cur;
18405 kern_return_t kr;
18406 void *kbuf;
18407
18408 /*
18409 * Perform the equivalent of vm_allocate() and memcpy().
18410 * Replace the mappings in "copy_map" with the newly allocated mapping.
18411 */
18412 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) BEFORE\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18413
18414 assert(copy_map->cpy_hdr.page_shift != VM_MAP_PAGE_MASK(target_map));
18415
18416 /* create a new pmap to map "copy_map" */
18417 pmap_flags = 0;
18418 assert(copy_map->cpy_hdr.page_shift == FOURK_PAGE_SHIFT);
18419#if PMAP_CREATE_FORCE_4K_PAGES
18420 pmap_flags |= PMAP_CREATE_FORCE_4K_PAGES;
18421#endif /* PMAP_CREATE_FORCE_4K_PAGES */
18422 pmap_flags |= PMAP_CREATE_64BIT;
18423 new_pmap = pmap_create_options(NULL, (vm_map_size_t)0, pmap_flags);
18424 if (new_pmap == NULL) {
18425 return KERN_RESOURCE_SHORTAGE;
18426 }
18427
18428 /* allocate new VM object */
18429 size = VM_MAP_ROUND_PAGE(copy_map->size, PAGE_MASK);
18430 new_object = vm_object_allocate(size);
18431 assert(new_object);
18432
18433 /* allocate new VM map entry */
18434 new_entry = vm_map_copy_entry_create(copy_map);
18435 assert(new_entry);
18436
18437 /* finish initializing new VM map entry */
18438 new_entry->protection = VM_PROT_DEFAULT;
18439 new_entry->max_protection = VM_PROT_DEFAULT;
18440 new_entry->use_pmap = TRUE;
18441
18442 /* make new VM map entry point to new VM object */
18443 new_entry->vme_start = 0;
18444 new_entry->vme_end = size;
18445 VME_OBJECT_SET(new_entry, new_object, false, 0);
18446 VME_OFFSET_SET(new_entry, 0);
18447
18448 /* create a new pageable VM map to map "copy_map" */
18449 new_map = vm_map_create_options(new_pmap, 0, MACH_VM_MAX_ADDRESS,
18450 VM_MAP_CREATE_PAGEABLE);
18451 assert(new_map);
18452 vm_map_set_page_shift(new_map, copy_map->cpy_hdr.page_shift);
18453
18454 /* map "copy_map" in the new VM map */
18455 src_start = 0;
18456 kr = vm_map_copyout_internal(
18457 new_map,
18458 &src_start,
18459 copy_map,
18460 copy_map->size,
18461 FALSE, /* consume_on_success */
18462 VM_PROT_DEFAULT,
18463 VM_PROT_DEFAULT,
18464 VM_INHERIT_DEFAULT);
18465 assert(kr == KERN_SUCCESS);
18466 src_end = src_start + copy_map->size;
18467
18468 /* map "new_object" in the new VM map */
18469 vm_object_reference(new_object);
18470 dst_start = 0;
18471 kr = vm_map_enter(new_map,
18472 &dst_start,
18473 size,
18474 0, /* mask */
18475 VM_MAP_KERNEL_FLAGS_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
18476 new_object,
18477 0, /* offset */
18478 FALSE, /* needs copy */
18479 VM_PROT_DEFAULT,
18480 VM_PROT_DEFAULT,
18481 VM_INHERIT_DEFAULT);
18482 assert(kr == KERN_SUCCESS);
18483 dst_end = dst_start + size;
18484
18485 /* get a kernel buffer */
18486 kbuf = kalloc_data(PAGE_SIZE, Z_WAITOK | Z_NOFAIL);
18487
18488 /* physically copy "copy_map" mappings to new VM object */
18489 for (src_cur = src_start, dst_cur = dst_start;
18490 src_cur < src_end;
18491 src_cur += PAGE_SIZE, dst_cur += PAGE_SIZE) {
18492 vm_size_t bytes;
18493
18494 bytes = PAGE_SIZE;
18495 if (src_cur + PAGE_SIZE > src_end) {
18496 /* partial copy for last page */
18497 bytes = src_end - src_cur;
18498 assert(bytes > 0 && bytes < PAGE_SIZE);
18499 /* rest of dst page should be zero-filled */
18500 }
18501 /* get bytes from src mapping */
18502 kr = copyinmap(new_map, src_cur, kbuf, bytes);
18503 if (kr != KERN_SUCCESS) {
18504 DEBUG4K_COPY("copyinmap(%p, 0x%llx, %p, 0x%llx) kr 0x%x\n", new_map, (uint64_t)src_cur, kbuf, (uint64_t)bytes, kr);
18505 }
18506 /* put bytes in dst mapping */
18507 assert(dst_cur < dst_end);
18508 assert(dst_cur + bytes <= dst_end);
18509 kr = copyoutmap(new_map, kbuf, dst_cur, bytes);
18510 if (kr != KERN_SUCCESS) {
18511 DEBUG4K_COPY("copyoutmap(%p, %p, 0x%llx, 0x%llx) kr 0x%x\n", new_map, kbuf, (uint64_t)dst_cur, (uint64_t)bytes, kr);
18512 }
18513 }
18514
18515 /* free kernel buffer */
18516 kfree_data(kbuf, PAGE_SIZE);
18517
18518 /* destroy new map */
18519 vm_map_destroy(new_map);
18520 new_map = VM_MAP_NULL;
18521
18522 /* dispose of the old map entries in "copy_map" */
18523 while (vm_map_copy_first_entry(copy_map) !=
18524 vm_map_copy_to_entry(copy_map)) {
18525 entry = vm_map_copy_first_entry(copy_map);
18526 vm_map_copy_entry_unlink(copy_map, entry);
18527 if (entry->is_sub_map) {
18528 vm_map_deallocate(VME_SUBMAP(entry));
18529 } else {
18530 vm_object_deallocate(VME_OBJECT(entry));
18531 }
18532 vm_map_copy_entry_dispose(entry);
18533 }
18534
18535 /* change "copy_map"'s page_size to match "target_map" */
18536 copy_map->cpy_hdr.page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18537 copy_map->offset = 0;
18538 copy_map->size = size;
18539
18540 /* insert new map entry in "copy_map" */
18541 assert(vm_map_copy_last_entry(copy_map) == vm_map_copy_to_entry(copy_map));
18542 vm_map_copy_entry_link(copy_map, vm_map_copy_last_entry(copy_map), new_entry);
18543
18544 DEBUG4K_COPY("copy_map %p (%d %d 0x%llx 0x%llx) AFTER\n", copy_map, copy_map->cpy_hdr.page_shift, copy_map->cpy_hdr.nentries, copy_map->offset, (uint64_t)copy_map->size);
18545 return KERN_SUCCESS;
18546}
18547
18548void
18549vm_map_copy_adjust_get_target_copy_map(
18550 vm_map_copy_t copy_map,
18551 vm_map_copy_t *target_copy_map_p);
18552void
18553vm_map_copy_adjust_get_target_copy_map(
18554 vm_map_copy_t copy_map,
18555 vm_map_copy_t *target_copy_map_p)
18556{
18557 vm_map_copy_t target_copy_map;
18558 vm_map_entry_t entry, target_entry;
18559
18560 if (*target_copy_map_p != VM_MAP_COPY_NULL) {
18561 /* the caller already has a "target_copy_map": use it */
18562 return;
18563 }
18564
18565 /* the caller wants us to create a new copy of "copy_map" */
18566 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18567 target_copy_map = vm_map_copy_allocate(copy_map->type);
18568 target_copy_map->offset = copy_map->offset;
18569 target_copy_map->size = copy_map->size;
18570 target_copy_map->cpy_hdr.page_shift = copy_map->cpy_hdr.page_shift;
18571 for (entry = vm_map_copy_first_entry(copy_map);
18572 entry != vm_map_copy_to_entry(copy_map);
18573 entry = entry->vme_next) {
18574 target_entry = vm_map_copy_entry_create(target_copy_map);
18575 vm_map_entry_copy_full(target_entry, entry);
18576 if (target_entry->is_sub_map) {
18577 vm_map_reference(VME_SUBMAP(target_entry));
18578 } else {
18579 vm_object_reference(VME_OBJECT(target_entry));
18580 }
18581 vm_map_copy_entry_link(
18582 target_copy_map,
18583 vm_map_copy_last_entry(target_copy_map),
18584 target_entry);
18585 }
18586 entry = VM_MAP_ENTRY_NULL;
18587 *target_copy_map_p = target_copy_map;
18588}
18589
18590/*
18591 * Callers of this function must call vm_map_copy_require on
18592 * previously created vm_map_copy_t or pass a newly created
18593 * one to ensure that it hasn't been forged.
18594 */
18595static void
18596vm_map_copy_trim(
18597 vm_map_copy_t copy_map,
18598 uint16_t new_page_shift,
18599 vm_map_offset_t trim_start,
18600 vm_map_offset_t trim_end)
18601{
18602 uint16_t copy_page_shift;
18603 vm_map_entry_t entry, next_entry;
18604
18605 assert(copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18606 assert(copy_map->cpy_hdr.nentries > 0);
18607
18608 trim_start += vm_map_copy_first_entry(copy_map)->vme_start;
18609 trim_end += vm_map_copy_first_entry(copy_map)->vme_start;
18610
18611 /* use the new page_shift to do the clipping */
18612 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18613 copy_map->cpy_hdr.page_shift = new_page_shift;
18614
18615 for (entry = vm_map_copy_first_entry(copy_map);
18616 entry != vm_map_copy_to_entry(copy_map);
18617 entry = next_entry) {
18618 next_entry = entry->vme_next;
18619 if (entry->vme_end <= trim_start) {
18620 /* entry fully before trim range: skip */
18621 continue;
18622 }
18623 if (entry->vme_start >= trim_end) {
18624 /* entry fully after trim range: done */
18625 break;
18626 }
18627 /* clip entry if needed */
18628 vm_map_copy_clip_start(copy_map, entry, trim_start);
18629 vm_map_copy_clip_end(copy_map, entry, trim_end);
18630 /* dispose of entry */
18631 copy_map->size -= entry->vme_end - entry->vme_start;
18632 vm_map_copy_entry_unlink(copy_map, entry);
18633 if (entry->is_sub_map) {
18634 vm_map_deallocate(VME_SUBMAP(entry));
18635 } else {
18636 vm_object_deallocate(VME_OBJECT(entry));
18637 }
18638 vm_map_copy_entry_dispose(entry);
18639 entry = VM_MAP_ENTRY_NULL;
18640 }
18641
18642 /* restore copy_map's original page_shift */
18643 copy_map->cpy_hdr.page_shift = copy_page_shift;
18644}
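/*
 * Illustrative sketch (not compiled): the comment above requires callers to
 * validate a previously created "vm_map_copy_t" with vm_map_copy_require()
 * before trimming it.  The trim offsets are relative to the first entry's
 * "vme_start", as in vm_map_copy_adjust_to_target() below.  The helper name
 * here is hypothetical.
 */
#if 0 /* example only */
static void
vm_map_copy_trim_example(vm_map_copy_t copy, uint16_t page_shift)
{
	vm_map_copy_require(copy);      /* reject forged copy objects */
	assert(copy->type == VM_MAP_COPY_ENTRY_LIST);
	/* drop the first page-sized chunk of the copy, clipping at the new page size */
	vm_map_copy_trim(copy, page_shift, 0, (vm_map_offset_t)1 << page_shift);
}
#endif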
18645
18646/*
18647 * Make any necessary adjustments to "copy_map" to allow it to be
18648 * mapped into "target_map".
18649 * If no changes were necessary, "target_copy_map" points to the
18650 * untouched "copy_map".
18651 * If changes are necessary, changes will be made to "target_copy_map".
18652 * If "target_copy_map" was NULL, we create a new "vm_map_copy_t" and
18653 * copy the original "copy_map" to it before applying the changes.
18654 * The caller should discard "target_copy_map" if it's not the same as
18655 * the original "copy_map".
18656 */
18657/* TODO4K: also adjust to sub-range in the copy_map -> add start&end? */
18658kern_return_t
18659vm_map_copy_adjust_to_target(
18660 vm_map_copy_t src_copy_map,
18661 vm_map_offset_t offset,
18662 vm_map_size_t size,
18663 vm_map_t target_map,
18664 boolean_t copy,
18665 vm_map_copy_t *target_copy_map_p,
18666 vm_map_offset_t *overmap_start_p,
18667 vm_map_offset_t *overmap_end_p,
18668 vm_map_offset_t *trimmed_start_p)
18669{
18670 vm_map_copy_t copy_map, target_copy_map;
18671 vm_map_size_t target_size;
18672 vm_map_size_t src_copy_map_size;
18673 vm_map_size_t overmap_start, overmap_end;
18674 int misalignments;
18675 vm_map_entry_t entry, target_entry;
18676 vm_map_offset_t addr_adjustment;
18677 vm_map_offset_t new_start, new_end;
18678 int copy_page_mask, target_page_mask;
18679 uint16_t copy_page_shift, target_page_shift;
18680 vm_map_offset_t trimmed_end;
18681
18682 /*
18683 * Assert that the vm_map_copy is coming from the right
18684 * zone and hasn't been forged
18685 */
	vm_map_copy_require(src_copy_map);
18687 assert(src_copy_map->type == VM_MAP_COPY_ENTRY_LIST);
18688
18689 /*
18690 * Start working with "src_copy_map" but we'll switch
18691 * to "target_copy_map" as soon as we start making adjustments.
18692 */
18693 copy_map = src_copy_map;
18694 src_copy_map_size = src_copy_map->size;
18695
18696 copy_page_shift = VM_MAP_COPY_PAGE_SHIFT(copy_map);
18697 copy_page_mask = VM_MAP_COPY_PAGE_MASK(copy_map);
	target_page_shift = (uint16_t)VM_MAP_PAGE_SHIFT(target_map);
18699 target_page_mask = VM_MAP_PAGE_MASK(target_map);
18700
18701 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p...\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, *target_copy_map_p);
18702
18703 target_copy_map = *target_copy_map_p;
18704 if (target_copy_map != VM_MAP_COPY_NULL) {
		vm_map_copy_require(target_copy_map);
18706 }
18707
18708 if (offset + size > copy_map->size) {
18709 DEBUG4K_ERROR("copy_map %p (%d->%d) copy_map->size 0x%llx offset 0x%llx size 0x%llx KERN_INVALID_ARGUMENT\n", copy_map, copy_page_shift, target_page_shift, (uint64_t)copy_map->size, (uint64_t)offset, (uint64_t)size);
18710 return KERN_INVALID_ARGUMENT;
18711 }
18712
18713 /* trim the end */
18714 trimmed_end = 0;
18715 new_end = VM_MAP_ROUND_PAGE(offset + size, target_page_mask);
18716 if (new_end < copy_map->size) {
18717 trimmed_end = src_copy_map_size - new_end;
18718 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim end from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)new_end, (uint64_t)copy_map->size);
18719 /* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    new_end, copy_map->size);
18725 }
18726
18727 /* trim the start */
18728 new_start = VM_MAP_TRUNC_PAGE(offset, target_page_mask);
18729 if (new_start != 0) {
18730 DEBUG4K_ADJUST("copy_map %p (%d->%d) copy %d offset 0x%llx size 0x%llx target_copy_map %p... trim start from 0x%llx to 0x%llx\n", copy_map, copy_page_shift, target_page_shift, copy, (uint64_t)offset, (uint64_t)size, target_copy_map, (uint64_t)0, (uint64_t)new_start);
18731 /* get "target_copy_map" if needed and adjust it */
		vm_map_copy_adjust_get_target_copy_map(copy_map,
		    &target_copy_map);
		copy_map = target_copy_map;
		vm_map_copy_trim(target_copy_map, target_page_shift,
		    0, new_start);
18737 }
18738 *trimmed_start_p = new_start;
18739
18740 /* target_size starts with what's left after trimming */
18741 target_size = copy_map->size;
18742 assertf(target_size == src_copy_map_size - *trimmed_start_p - trimmed_end,
18743 "target_size 0x%llx src_copy_map_size 0x%llx trimmed_start 0x%llx trimmed_end 0x%llx\n",
18744 (uint64_t)target_size, (uint64_t)src_copy_map_size,
18745 (uint64_t)*trimmed_start_p, (uint64_t)trimmed_end);
18746
18747 /* check for misalignments but don't adjust yet */
18748 misalignments = 0;
18749 overmap_start = 0;
18750 overmap_end = 0;
18751 if (copy_page_shift < target_page_shift) {
18752 /*
18753 * Remapping from 4K to 16K: check the VM object alignments
18754 * throughout the range.
18755 * If the start and end of the range are mis-aligned, we can
18756 * over-map to re-align, and adjust the "overmap" start/end
18757 * and "target_size" of the range accordingly.
18758 * If there is any mis-alignment within the range:
18759 * if "copy":
18760 * we can do immediate-copy instead of copy-on-write,
18761 * else:
18762 * no way to remap and share; fail.
18763 */
18764 for (entry = vm_map_copy_first_entry(copy_map);
18765 entry != vm_map_copy_to_entry(copy_map);
18766 entry = entry->vme_next) {
18767 vm_object_offset_t object_offset_start, object_offset_end;
18768
18769 object_offset_start = VME_OFFSET(entry);
18770 object_offset_end = object_offset_start;
18771 object_offset_end += entry->vme_end - entry->vme_start;
18772 if (object_offset_start & target_page_mask) {
18773 if (entry == vm_map_copy_first_entry(copy_map) && !copy) {
18774 overmap_start++;
18775 } else {
18776 misalignments++;
18777 }
18778 }
18779 if (object_offset_end & target_page_mask) {
18780 if (entry->vme_next == vm_map_copy_to_entry(copy_map) && !copy) {
18781 overmap_end++;
18782 } else {
18783 misalignments++;
18784 }
18785 }
18786 }
18787 }
18788 entry = VM_MAP_ENTRY_NULL;
18789
18790 /* decide how to deal with misalignments */
18791 assert(overmap_start <= 1);
18792 assert(overmap_end <= 1);
18793 if (!overmap_start && !overmap_end && !misalignments) {
18794 /* copy_map is properly aligned for target_map ... */
18795 if (*trimmed_start_p) {
18796 /* ... but we trimmed it, so still need to adjust */
18797 } else {
18798 /* ... and we didn't trim anything: we're done */
18799 if (target_copy_map == VM_MAP_COPY_NULL) {
18800 target_copy_map = copy_map;
18801 }
18802 *target_copy_map_p = target_copy_map;
18803 *overmap_start_p = 0;
18804 *overmap_end_p = 0;
18805 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18806 return KERN_SUCCESS;
18807 }
18808 } else if (misalignments && !copy) {
18809 /* can't "share" if misaligned */
18810 DEBUG4K_ADJUST("unsupported sharing\n");
18811#if MACH_ASSERT
18812 if (debug4k_panic_on_misaligned_sharing) {
18813 panic("DEBUG4k %s:%d unsupported sharing", __FUNCTION__, __LINE__);
18814 }
18815#endif /* MACH_ASSERT */
18816 DEBUG4K_ADJUST("copy_map %p (%d) target_map %p (%d) copy %d target_copy_map %p -> KERN_NOT_SUPPORTED\n", copy_map, copy_page_shift, target_map, target_page_shift, copy, *target_copy_map_p);
18817 return KERN_NOT_SUPPORTED;
18818 } else {
18819 /* can't virtual-copy if misaligned (but can physical-copy) */
18820 DEBUG4K_ADJUST("mis-aligned copying\n");
18821 }
18822
18823 /* get a "target_copy_map" if needed and switch to it */
	vm_map_copy_adjust_get_target_copy_map(copy_map, &target_copy_map);
18825 copy_map = target_copy_map;
18826
18827 if (misalignments && copy) {
18828 vm_map_size_t target_copy_map_size;
18829
18830 /*
18831 * Can't do copy-on-write with misaligned mappings.
18832 * Replace the mappings with a physical copy of the original
18833 * mappings' contents.
18834 */
18835 target_copy_map_size = target_copy_map->size;
		kern_return_t kr = vm_map_copy_to_physcopy(target_copy_map, target_map);
18837 if (kr != KERN_SUCCESS) {
18838 return kr;
18839 }
18840 *target_copy_map_p = target_copy_map;
18841 *overmap_start_p = 0;
18842 *overmap_end_p = target_copy_map->size - target_copy_map_size;
18843 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx)-> trimmed 0x%llx overmap start 0x%llx end 0x%llx PHYSCOPY\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18844 return KERN_SUCCESS;
18845 }
18846
18847 /* apply the adjustments */
18848 misalignments = 0;
18849 overmap_start = 0;
18850 overmap_end = 0;
18851 /* remove copy_map->offset, so that everything starts at offset 0 */
18852 addr_adjustment = copy_map->offset;
18853 /* also remove whatever we trimmed from the start */
18854 addr_adjustment += *trimmed_start_p;
18855 for (target_entry = vm_map_copy_first_entry(target_copy_map);
18856 target_entry != vm_map_copy_to_entry(target_copy_map);
18857 target_entry = target_entry->vme_next) {
18858 vm_object_offset_t object_offset_start, object_offset_end;
18859
18860 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx BEFORE\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
		object_offset_start = VME_OFFSET(target_entry);
18862 if (object_offset_start & target_page_mask) {
18863 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at start\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18864 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18865 /*
18866 * start of 1st entry is mis-aligned:
18867 * re-adjust by over-mapping.
18868 */
18869 overmap_start = object_offset_start - trunc_page_mask_64(object_offset_start, target_page_mask);
18870 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_start 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_start);
				VME_OFFSET_SET(target_entry, VME_OFFSET(target_entry) - overmap_start);
18872 } else {
18873 misalignments++;
18874 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18875 assert(copy);
18876 }
18877 }
18878
18879 if (target_entry == vm_map_copy_first_entry(target_copy_map)) {
18880 target_size += overmap_start;
18881 } else {
18882 target_entry->vme_start += overmap_start;
18883 }
18884 target_entry->vme_end += overmap_start;
18885
		object_offset_end = VME_OFFSET(target_entry) + target_entry->vme_end - target_entry->vme_start;
18887 if (object_offset_end & target_page_mask) {
18888 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx misaligned at end\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18889 if (target_entry->vme_next == vm_map_copy_to_entry(target_copy_map)) {
18890 /*
18891 * end of last entry is mis-aligned: re-adjust by over-mapping.
18892 */
18893 overmap_end = round_page_mask_64(object_offset_end, target_page_mask) - object_offset_end;
18894 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> overmap_end 0x%llx\n", target_entry, VME_OFFSET(target_entry), copy, (uint64_t)overmap_end);
18895 target_entry->vme_end += overmap_end;
18896 target_size += overmap_end;
18897 } else {
18898 misalignments++;
18899 DEBUG4K_ADJUST("entry %p offset 0x%llx copy %d -> misalignments %d\n", target_entry, VME_OFFSET(target_entry), copy, misalignments);
18900 assert(copy);
18901 }
18902 }
18903 target_entry->vme_start -= addr_adjustment;
18904 target_entry->vme_end -= addr_adjustment;
18905 DEBUG4K_ADJUST("copy %p (%d 0x%llx 0x%llx) entry %p [ 0x%llx 0x%llx ] object %p offset 0x%llx AFTER\n", target_copy_map, VM_MAP_COPY_PAGE_SHIFT(target_copy_map), target_copy_map->offset, (uint64_t)target_copy_map->size, target_entry, (uint64_t)target_entry->vme_start, (uint64_t)target_entry->vme_end, VME_OBJECT(target_entry), VME_OFFSET(target_entry));
18906 }
18907
18908 target_copy_map->size = target_size;
18909 target_copy_map->offset += overmap_start;
18910 target_copy_map->offset -= addr_adjustment;
18911 target_copy_map->cpy_hdr.page_shift = target_page_shift;
18912
18913// assert(VM_MAP_PAGE_ALIGNED(target_copy_map->size, target_page_mask));
18914// assert(VM_MAP_PAGE_ALIGNED(target_copy_map->offset, FOURK_PAGE_MASK));
18915 assert(overmap_start < VM_MAP_PAGE_SIZE(target_map));
18916 assert(overmap_end < VM_MAP_PAGE_SIZE(target_map));
18917
18918 *target_copy_map_p = target_copy_map;
18919 *overmap_start_p = overmap_start;
18920 *overmap_end_p = overmap_end;
18921
18922 DEBUG4K_ADJUST("copy_map %p (%d offset 0x%llx size 0x%llx) target_map %p (%d) copy %d target_copy_map %p (%d offset 0x%llx size 0x%llx) -> trimmed 0x%llx overmap start 0x%llx end 0x%llx KERN_SUCCESS\n", copy_map, copy_page_shift, (uint64_t)copy_map->offset, (uint64_t)copy_map->size, target_map, target_page_shift, copy, *target_copy_map_p, VM_MAP_COPY_PAGE_SHIFT(*target_copy_map_p), (uint64_t)(*target_copy_map_p)->offset, (uint64_t)(*target_copy_map_p)->size, (uint64_t)*trimmed_start_p, (uint64_t)*overmap_start_p, (uint64_t)*overmap_end_p);
18923 return KERN_SUCCESS;
18924}
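/*
 * Illustrative sketch (not compiled): typical use of
 * vm_map_copy_adjust_to_target(), following the contract in the comment
 * above -- if the adjusted copy is a new object, the caller discards it
 * separately from the original.  vm_map_range_physical_size() and
 * vm_map_remap() below are the real in-tree callers; the helper name here
 * is hypothetical.
 */
#if 0 /* example only */
static kern_return_t
adjust_copy_for_map_example(
	vm_map_copy_t   copy_map,
	vm_map_t        target_map)
{
	vm_map_copy_t   target_copy_map = VM_MAP_COPY_NULL; /* ask for a clone if needed */
	vm_map_offset_t overmap_start, overmap_end, trimmed_start;
	kern_return_t   kr;

	kr = vm_map_copy_adjust_to_target(copy_map,
	    0,                  /* offset within copy_map */
	    copy_map->size,     /* size */
	    target_map,
	    FALSE,              /* copy: share rather than physical-copy */
	    &target_copy_map,
	    &overmap_start,
	    &overmap_end,
	    &trimmed_start);
	if (kr == KERN_SUCCESS && target_copy_map != copy_map) {
		/* ... map "target_copy_map" into target_map, then ... */
		vm_map_copy_discard(target_copy_map);
	}
	return kr;
}
#endif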
18925
18926kern_return_t
18927vm_map_range_physical_size(
18928 vm_map_t map,
18929 vm_map_address_t start,
18930 mach_vm_size_t size,
18931 mach_vm_size_t * phys_size)
18932{
18933 kern_return_t kr;
18934 vm_map_copy_t copy_map, target_copy_map;
18935 vm_map_offset_t adjusted_start, adjusted_end;
18936 vm_map_size_t adjusted_size;
18937 vm_prot_t cur_prot, max_prot;
18938 vm_map_offset_t overmap_start, overmap_end, trimmed_start, end;
18939 vm_map_kernel_flags_t vmk_flags;
18940
18941 if (size == 0) {
18942 DEBUG4K_SHARE("map %p start 0x%llx size 0x%llx -> phys_size 0!\n", map, (uint64_t)start, (uint64_t)size);
18943 *phys_size = 0;
18944 return KERN_SUCCESS;
18945 }
18946
18947 adjusted_start = vm_map_trunc_page(start, VM_MAP_PAGE_MASK(map));
18948 adjusted_end = vm_map_round_page(start + size, VM_MAP_PAGE_MASK(map));
18949 if (__improbable(os_add_overflow(start, size, &end) ||
18950 adjusted_end <= adjusted_start)) {
18951 /* wraparound */
		printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, VM_MAP_PAGE_MASK(map));
18953 *phys_size = 0;
18954 return KERN_INVALID_ARGUMENT;
18955 }
18956 if (__improbable(vm_map_range_overflows(map, start, size))) {
18957 *phys_size = 0;
18958 return KERN_INVALID_ADDRESS;
18959 }
18960 assert(adjusted_end > adjusted_start);
18961 adjusted_size = adjusted_end - adjusted_start;
18962 *phys_size = adjusted_size;
18963 if (VM_MAP_PAGE_SIZE(map) == PAGE_SIZE) {
18964 return KERN_SUCCESS;
18965 }
18966 if (start == 0) {
18967 adjusted_start = vm_map_trunc_page(start, PAGE_MASK);
18968 adjusted_end = vm_map_round_page(start + size, PAGE_MASK);
18969 if (__improbable(adjusted_end <= adjusted_start)) {
18970 /* wraparound */
			printf("%s:%d(start=0x%llx, size=0x%llx) pgmask 0x%x: wraparound\n", __FUNCTION__, __LINE__, (uint64_t)start, (uint64_t)size, PAGE_MASK);
18972 *phys_size = 0;
18973 return KERN_INVALID_ARGUMENT;
18974 }
18975 assert(adjusted_end > adjusted_start);
18976 adjusted_size = adjusted_end - adjusted_start;
18977 *phys_size = adjusted_size;
18978 return KERN_SUCCESS;
18979 }
18980
18981 vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
18982 vmk_flags.vmkf_copy_pageable = TRUE;
18983 vmk_flags.vmkf_copy_same_map = TRUE;
18984 assert(adjusted_size != 0);
18985 cur_prot = VM_PROT_NONE; /* legacy mode */
18986 max_prot = VM_PROT_NONE; /* legacy mode */
	kr = vm_map_copy_extract(map, adjusted_start, adjusted_size,
	    FALSE /* copy */,
	    &copy_map,
	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT,
	    vmk_flags);
18992 if (kr != KERN_SUCCESS) {
18993 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
18994 //assert(0);
18995 *phys_size = 0;
18996 return kr;
18997 }
18998 assert(copy_map != VM_MAP_COPY_NULL);
18999 target_copy_map = copy_map;
19000 DEBUG4K_ADJUST("adjusting...\n");
	kr = vm_map_copy_adjust_to_target(
		copy_map,
		start - adjusted_start, /* offset */
		size, /* size */
		kernel_map,
		FALSE, /* copy */
		&target_copy_map,
		&overmap_start,
		&overmap_end,
		&trimmed_start);
19011 if (kr == KERN_SUCCESS) {
19012 if (target_copy_map->size != *phys_size) {
19013 DEBUG4K_ADJUST("map %p (%d) start 0x%llx size 0x%llx adjusted_start 0x%llx adjusted_end 0x%llx overmap_start 0x%llx overmap_end 0x%llx trimmed_start 0x%llx phys_size 0x%llx -> 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)start, (uint64_t)size, (uint64_t)adjusted_start, (uint64_t)adjusted_end, (uint64_t)overmap_start, (uint64_t)overmap_end, (uint64_t)trimmed_start, (uint64_t)*phys_size, (uint64_t)target_copy_map->size);
19014 }
19015 *phys_size = target_copy_map->size;
19016 } else {
19017 DEBUG4K_ERROR("map %p start 0x%llx 0x%llx size 0x%llx 0x%llx kr 0x%x\n", map, (uint64_t)start, (uint64_t)adjusted_start, size, (uint64_t)adjusted_size, kr);
19018 //assert(0);
19019 *phys_size = 0;
19020 }
	vm_map_copy_discard(copy_map);
19022 copy_map = VM_MAP_COPY_NULL;
19023
19024 return kr;
19025}
19026
19027
19028kern_return_t
19029memory_entry_check_for_adjustment(
19030 vm_map_t src_map,
19031 ipc_port_t port,
19032 vm_map_offset_t *overmap_start,
19033 vm_map_offset_t *overmap_end)
19034{
19035 kern_return_t kr = KERN_SUCCESS;
19036 vm_map_copy_t copy_map = VM_MAP_COPY_NULL, target_copy_map = VM_MAP_COPY_NULL;
19037
19038 assert(port);
19039 assertf(ip_kotype(port) == IKOT_NAMED_ENTRY, "Port Type expected: %d...received:%d\n", IKOT_NAMED_ENTRY, ip_kotype(port));
19040
19041 vm_named_entry_t named_entry;
19042
19043 named_entry = mach_memory_entry_from_port(port);
19044 named_entry_lock(named_entry);
19045 copy_map = named_entry->backing.copy;
19046 target_copy_map = copy_map;
19047
	if (src_map && VM_MAP_PAGE_SHIFT(src_map) < PAGE_SHIFT) {
19049 vm_map_offset_t trimmed_start;
19050
19051 trimmed_start = 0;
19052 DEBUG4K_ADJUST("adjusting...\n");
		kr = vm_map_copy_adjust_to_target(
			copy_map,
			0, /* offset */
			copy_map->size, /* size */
			src_map,
			FALSE, /* copy */
			&target_copy_map,
			overmap_start,
			overmap_end,
			&trimmed_start);
19063 assert(trimmed_start == 0);
19064 }
19065 named_entry_unlock(named_entry);
19066
19067 return kr;
19068}
19069
19070
19071/*
19072 * Routine: vm_remap
19073 *
19074 * Map portion of a task's address space.
19075 * Mapped region must not overlap more than
19076 * one vm memory object. Protections and
19077 * inheritance attributes remain the same
19078 * as in the original task and are out parameters.
19079 * Source and Target task can be identical
19080 * Other attributes are identical as for vm_map()
19081 */
19082kern_return_t
19083vm_map_remap(
19084 vm_map_t target_map,
19085 vm_map_address_t *address,
19086 vm_map_size_t size,
19087 vm_map_offset_t mask,
19088 vm_map_kernel_flags_t vmk_flags,
19089 vm_map_t src_map,
19090 vm_map_offset_t memory_address,
19091 boolean_t copy,
19092 vm_prot_t *cur_protection, /* IN/OUT */
19093 vm_prot_t *max_protection, /* IN/OUT */
19094 vm_inherit_t inheritance)
19095{
19096 kern_return_t result;
19097 vm_map_entry_t entry;
19098 vm_map_entry_t insp_entry = VM_MAP_ENTRY_NULL;
19099 vm_map_entry_t new_entry;
19100 vm_map_copy_t copy_map;
19101 vm_map_offset_t offset_in_mapping;
19102 vm_map_size_t target_size = 0;
19103 vm_map_size_t src_page_mask, target_page_mask;
19104 vm_map_offset_t overmap_start, overmap_end, trimmed_start;
19105 vm_map_offset_t initial_memory_address;
19106 vm_map_size_t initial_size;
19107 VM_MAP_ZAP_DECLARE(zap_list);
19108
19109 if (target_map == VM_MAP_NULL) {
19110 return KERN_INVALID_ARGUMENT;
19111 }
19112
19113 if (__improbable(vm_map_range_overflows(src_map, memory_address, size))) {
19114 return KERN_INVALID_ARGUMENT;
19115 }
19116
19117 if (__improbable((*cur_protection & *max_protection) != *cur_protection)) {
19118 /* cur is more permissive than max */
19119 return KERN_INVALID_ARGUMENT;
19120 }
19121
19122 initial_memory_address = memory_address;
19123 initial_size = size;
19124 src_page_mask = VM_MAP_PAGE_MASK(src_map);
19125 target_page_mask = VM_MAP_PAGE_MASK(target_map);
19126
19127 switch (inheritance) {
19128 case VM_INHERIT_NONE:
19129 case VM_INHERIT_COPY:
19130 case VM_INHERIT_SHARE:
19131 if (size != 0 && src_map != VM_MAP_NULL) {
19132 break;
19133 }
19134 OS_FALLTHROUGH;
19135 default:
19136 return KERN_INVALID_ARGUMENT;
19137 }
19138
19139 if (src_page_mask != target_page_mask) {
19140 if (copy) {
19141 DEBUG4K_COPY("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19142 } else {
19143 DEBUG4K_SHARE("src_map %p pgsz 0x%x addr 0x%llx size 0x%llx copy %d -> target_map %p pgsz 0x%x\n", src_map, VM_MAP_PAGE_SIZE(src_map), (uint64_t)memory_address, (uint64_t)size, copy, target_map, VM_MAP_PAGE_SIZE(target_map));
19144 }
19145 }
19146
19147 /*
19148 * If the user is requesting that we return the address of the
19149 * first byte of the data (rather than the base of the page),
19150 * then we use different rounding semantics: specifically,
19151 * we assume that (memory_address, size) describes a region
19152 * all of whose pages we must cover, rather than a base to be truncated
19153 * down and a size to be added to that base. So we figure out
19154 * the highest page that the requested region includes and make
19155 * sure that the size will cover it.
19156 *
	 * The key example we're worried about is of the form:
19158 *
19159 * memory_address = 0x1ff0, size = 0x20
19160 *
19161 * With the old semantics, we round down the memory_address to 0x1000
19162 * and round up the size to 0x1000, resulting in our covering *only*
19163 * page 0x1000. With the new semantics, we'd realize that the region covers
19164 * 0x1ff0-0x2010, and compute a size of 0x2000. Thus, we cover both page
19165 * 0x1000 and page 0x2000 in the region we remap.
19166 */
19167 if (vmk_flags.vmf_return_data_addr) {
19168 vm_map_offset_t range_start, range_end;
19169
19170 range_start = vm_map_trunc_page(memory_address, src_page_mask);
19171 range_end = vm_map_round_page(memory_address + size, src_page_mask);
19172 memory_address = range_start;
19173 size = range_end - range_start;
19174 offset_in_mapping = initial_memory_address - memory_address;
19175 } else {
19176 /*
19177 * IMPORTANT:
19178 * This legacy code path is broken: for the range mentioned
19179 * above [ memory_address = 0x1ff0,size = 0x20 ], which spans
19180 * two 4k pages, it yields [ memory_address = 0x1000,
19181 * size = 0x1000 ], which covers only the first 4k page.
19182 * BUT some code unfortunately depends on this bug, so we
19183 * can't fix it without breaking something.
		 * New code should get automatically opted into the new
		 * behavior with the VM_FLAGS_RETURN_DATA_ADDR flag.
19186 */
19187 offset_in_mapping = 0;
19188 memory_address = vm_map_trunc_page(memory_address, src_page_mask);
19189 size = vm_map_round_page(size, src_page_mask);
19190 initial_memory_address = memory_address;
19191 initial_size = size;
19192 }
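	/*
	 * Illustrative note (not compiled): with 4K source pages
	 * (src_page_mask == 0xFFF), the worked example from the comment
	 * above comes out as follows on the "return data addr" path.
	 */
#if 0 /* example only */
	assert(vm_map_trunc_page(0x1ff0, 0xFFF) == 0x1000);          /* range_start */
	assert(vm_map_round_page(0x1ff0 + 0x20, 0xFFF) == 0x3000);   /* range_end */
	/* size = 0x3000 - 0x1000 = 0x2000: pages 0x1000 and 0x2000 are both covered */
	/* offset_in_mapping = 0x1ff0 - 0x1000 = 0xff0 */
#endif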
19193
19194
19195 if (size == 0) {
19196 return KERN_INVALID_ARGUMENT;
19197 }
19198
19199 if (vmk_flags.vmf_resilient_media) {
19200 /* must be copy-on-write to be "media resilient" */
19201 if (!copy) {
19202 return KERN_INVALID_ARGUMENT;
19203 }
19204 }
19205
19206 vmk_flags.vmkf_copy_pageable = target_map->hdr.entries_pageable;
19207 vmk_flags.vmkf_copy_same_map = (src_map == target_map);
19208
19209 assert(size != 0);
	result = vm_map_copy_extract(src_map,
	    memory_address,
	    size,
	    copy, &copy_map,
	    cur_protection, /* IN/OUT */
	    max_protection, /* IN/OUT */
	    inheritance,
	    vmk_flags);
19218 if (result != KERN_SUCCESS) {
19219 return result;
19220 }
19221 assert(copy_map != VM_MAP_COPY_NULL);
19222
19223 /*
19224 * Handle the policy for vm map ranges
19225 *
19226 * If the maps differ, the target_map policy applies like for vm_map()
19227 * For same mapping remaps, we preserve the range.
19228 */
19229 if (vmk_flags.vmkf_copy_same_map) {
19230 vmk_flags.vmkf_range_id = copy_map->orig_range;
19231 } else {
		vm_map_kernel_flags_update_range_id(&vmk_flags, target_map);
19233 }
19234
19235 overmap_start = 0;
19236 overmap_end = 0;
19237 trimmed_start = 0;
19238 target_size = size;
19239 if (src_page_mask != target_page_mask) {
19240 vm_map_copy_t target_copy_map;
19241
19242 target_copy_map = copy_map; /* can modify "copy_map" itself */
19243 DEBUG4K_ADJUST("adjusting...\n");
		result = vm_map_copy_adjust_to_target(
			copy_map,
			offset_in_mapping, /* offset */
			initial_size,
			target_map,
			copy,
			&target_copy_map,
			&overmap_start,
			&overmap_end,
			&trimmed_start);
19254 if (result != KERN_SUCCESS) {
19255 DEBUG4K_COPY("failed to adjust 0x%x\n", result);
			vm_map_copy_discard(copy_map);
19257 return result;
19258 }
19259 if (trimmed_start == 0) {
19260 /* nothing trimmed: no adjustment needed */
19261 } else if (trimmed_start >= offset_in_mapping) {
19262 /* trimmed more than offset_in_mapping: nothing left */
19263 assert(overmap_start == 0);
19264 assert(overmap_end == 0);
19265 offset_in_mapping = 0;
19266 } else {
19267 /* trimmed some of offset_in_mapping: adjust */
19268 assert(overmap_start == 0);
19269 assert(overmap_end == 0);
19270 offset_in_mapping -= trimmed_start;
19271 }
19272 offset_in_mapping += overmap_start;
19273 target_size = target_copy_map->size;
19274 }
19275
19276 /*
19277 * Allocate/check a range of free virtual address
19278 * space for the target
19279 */
19280 *address = vm_map_trunc_page(*address, target_page_mask);
19281 vm_map_lock(target_map);
19282 target_size = vm_map_round_page(target_size, target_page_mask);
	result = vm_map_remap_range_allocate(target_map, address,
	    target_size, mask, vmk_flags,
	    &insp_entry, &zap_list);
19286
19287 for (entry = vm_map_copy_first_entry(copy_map);
19288 entry != vm_map_copy_to_entry(copy_map);
19289 entry = new_entry) {
19290 new_entry = entry->vme_next;
19291 vm_map_copy_entry_unlink(copy_map, entry);
19292 if (result == KERN_SUCCESS) {
19293 if (vmk_flags.vmkf_remap_prot_copy) {
19294 /*
19295 * This vm_map_remap() is for a
19296 * vm_protect(VM_PROT_COPY), so the caller
19297 * expects to be allowed to add write access
19298 * to this new mapping. This is done by
19299 * adding VM_PROT_WRITE to each entry's
19300 * max_protection... unless some security
19301 * settings disallow it.
19302 */
19303 bool allow_write = false;
19304 if (entry->vme_permanent) {
19305 /* immutable mapping... */
19306 if ((entry->max_protection & VM_PROT_EXECUTE) &&
19307 developer_mode_state()) {
19308 /*
19309 * ... but executable and
19310 * possibly being debugged,
19311 * so let's allow it to become
19312 * writable, for breakpoints
19313 * and dtrace probes, for
19314 * example.
19315 */
19316 allow_write = true;
19317 } else {
						printf("%d[%s] vm_remap(0x%llx,0x%llx) VM_PROT_COPY denied on permanent mapping prot 0x%x/0x%x developer %d\n",
						    proc_selfpid(),
						    (get_bsdtask_info(current_task())
						    ? proc_name_address(get_bsdtask_info(current_task()))
						    : "?"),
19323 (uint64_t)memory_address,
19324 (uint64_t)size,
19325 entry->protection,
19326 entry->max_protection,
19327 developer_mode_state());
19328 DTRACE_VM6(vm_map_delete_permanent_deny_protcopy,
19329 vm_map_entry_t, entry,
19330 vm_map_offset_t, entry->vme_start,
19331 vm_map_offset_t, entry->vme_end,
19332 vm_prot_t, entry->protection,
19333 vm_prot_t, entry->max_protection,
19334 int, VME_ALIAS(entry));
19335 }
19336 } else {
19337 allow_write = true;
19338 }
19339
19340 /*
19341 * VM_PROT_COPY: allow this mapping to become
19342 * writable, unless it was "permanent".
19343 */
19344 if (allow_write) {
19345 entry->max_protection |= VM_PROT_WRITE;
19346 }
19347 }
19348 if (vmk_flags.vmf_resilient_codesign) {
19349 /* no codesigning -> read-only access */
19350 entry->max_protection = VM_PROT_READ;
19351 entry->protection = VM_PROT_READ;
19352 entry->vme_resilient_codesign = TRUE;
19353 }
19354 entry->vme_start += *address;
19355 entry->vme_end += *address;
19356 assert(!entry->map_aligned);
19357 if (vmk_flags.vmf_resilient_media &&
19358 !entry->is_sub_map &&
19359 (VME_OBJECT(entry) == VM_OBJECT_NULL ||
19360 VME_OBJECT(entry)->internal)) {
19361 entry->vme_resilient_media = TRUE;
19362 }
19363 assert(VM_MAP_PAGE_ALIGNED(entry->vme_start, MIN(target_page_mask, PAGE_MASK)));
19364 assert(VM_MAP_PAGE_ALIGNED(entry->vme_end, MIN(target_page_mask, PAGE_MASK)));
19365 assert(VM_MAP_PAGE_ALIGNED(VME_OFFSET(entry), MIN(target_page_mask, PAGE_MASK)));
			vm_map_store_entry_link(target_map, insp_entry, entry,
			    vmk_flags);
19368 insp_entry = entry;
19369 } else {
19370 if (!entry->is_sub_map) {
19371 vm_object_deallocate(VME_OBJECT(entry));
19372 } else {
19373 vm_map_deallocate(VME_SUBMAP(entry));
19374 }
19375 vm_map_copy_entry_dispose(entry);
19376 }
19377 }
19378
19379 if (vmk_flags.vmf_resilient_codesign) {
19380 *cur_protection = VM_PROT_READ;
19381 *max_protection = VM_PROT_READ;
19382 }
19383
19384 if (result == KERN_SUCCESS) {
19385 target_map->size += target_size;
19386 SAVE_HINT_MAP_WRITE(target_map, insp_entry);
19387 }
19388 vm_map_unlock(target_map);
19389
	vm_map_zap_dispose(&zap_list);
19391
19392 if (result == KERN_SUCCESS && target_map->wiring_required) {
		result = vm_map_wire_kernel(target_map, *address,
		    *address + size, *cur_protection, VM_KERN_MEMORY_MLOCK,
		    TRUE);
19396 }
19397
19398 /*
19399 * If requested, return the address of the data pointed to by the
19400 * request, rather than the base of the resulting page.
19401 */
19402 if (vmk_flags.vmf_return_data_addr) {
19403 *address += offset_in_mapping;
19404 }
19405
19406 if (src_page_mask != target_page_mask) {
19407 DEBUG4K_SHARE("vm_remap(%p 0x%llx 0x%llx copy=%d-> %p 0x%llx 0x%llx result=0x%x\n", src_map, (uint64_t)memory_address, (uint64_t)size, copy, target_map, (uint64_t)*address, (uint64_t)offset_in_mapping, result);
19408 }
	vm_map_copy_discard(copy_map);
19410 copy_map = VM_MAP_COPY_NULL;
19411
19412 return result;
19413}
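/*
 * Illustrative sketch (not compiled): remapping a region of one task's map
 * into another at an "anywhere"-allocated address, as described in the
 * routine comment above.  The helper name is hypothetical and the flag
 * setup is a minimal assumption; VM_PROT_NONE for both protections selects
 * the legacy in/out behavior used elsewhere in this file.
 */
#if 0 /* example only */
static kern_return_t
remap_shared_view_example(
	vm_map_t         target_map,
	vm_map_t         src_map,
	vm_map_offset_t  src_addr,
	vm_map_size_t    size,
	vm_map_address_t *out_addr)
{
	vm_map_kernel_flags_t vmk_flags = VM_MAP_KERNEL_FLAGS_NONE;
	vm_prot_t cur_prot = VM_PROT_NONE;      /* IN/OUT: legacy mode */
	vm_prot_t max_prot = VM_PROT_NONE;      /* IN/OUT: legacy mode */

	vmk_flags.vmf_fixed = false;    /* let vm_map_locate_space() pick the address */
	*out_addr = 0;
	return vm_map_remap(target_map, out_addr, size,
	           0,                   /* mask */
	           vmk_flags,
	           src_map, src_addr,
	           FALSE,               /* copy: share with the source */
	           &cur_prot, &max_prot,
	           VM_INHERIT_DEFAULT);
}
#endif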
19414
19415/*
19416 * Routine: vm_map_remap_range_allocate
19417 *
19418 * Description:
19419 * Allocate a range in the specified virtual address map.
 *	Returns the address and the map entry just before the allocated
 *	range.
19422 *
19423 * Map must be locked.
19424 */
19425
19426static kern_return_t
19427vm_map_remap_range_allocate(
19428 vm_map_t map,
19429 vm_map_address_t *address, /* IN/OUT */
19430 vm_map_size_t size,
19431 vm_map_offset_t mask,
19432 vm_map_kernel_flags_t vmk_flags,
19433 vm_map_entry_t *map_entry, /* OUT */
19434 vm_map_zap_t zap_list)
19435{
19436 vm_map_entry_t entry;
19437 vm_map_offset_t start;
19438 kern_return_t kr;
19439
19440 start = *address;
19441
19442 if (!vmk_flags.vmf_fixed) {
		kr = vm_map_locate_space(map, size, mask, vmk_flags,
		    &start, &entry);
19445 if (kr != KERN_SUCCESS) {
19446 return kr;
19447 }
19448 *address = start;
19449 } else {
19450 vm_map_offset_t effective_min_offset, effective_max_offset;
19451 vm_map_entry_t temp_entry;
19452 vm_map_offset_t end;
19453
19454 effective_min_offset = map->min_offset;
19455 effective_max_offset = map->max_offset;
19456
19457 /*
19458 * Verify that:
19459 * the address doesn't itself violate
19460 * the mask requirement.
19461 */
19462
19463 if ((start & mask) != 0) {
19464 return KERN_NO_SPACE;
19465 }
19466
19467#if CONFIG_MAP_RANGES
19468 if (map->uses_user_ranges) {
19469 struct mach_vm_range r;
19470
19471 vm_map_user_range_resolve(map, start, 1, &r);
19472 if (r.max_address == 0) {
19473 return KERN_INVALID_ADDRESS;
19474 }
19475
19476 effective_min_offset = r.min_address;
19477 effective_max_offset = r.max_address;
19478 }
19479#endif /* CONFIG_MAP_RANGES */
19480 if (map == kernel_map) {
			mach_vm_range_t r = kmem_validate_range_for_overwrite(start, size);
			effective_min_offset = r->min_address;
			effective_max_offset = r->max_address;
19484 }
19485
19486 /*
19487 * ... the address is within bounds
19488 */
19489
19490 end = start + size;
19491
19492 if ((start < effective_min_offset) ||
19493 (end > effective_max_offset) ||
19494 (start >= end)) {
19495 return KERN_INVALID_ADDRESS;
19496 }
19497
19498 /*
19499 * If we're asked to overwrite whatever was mapped in that
19500 * range, first deallocate that range.
19501 */
19502 if (vmk_flags.vmf_overwrite) {
19503 vmr_flags_t remove_flags = VM_MAP_REMOVE_NO_MAP_ALIGN;
19504
19505 /*
19506 * We use a "zap_list" to avoid having to unlock
19507 * the "map" in vm_map_delete(), which would compromise
19508 * the atomicity of the "deallocate" and then "remap"
19509 * combination.
19510 */
19511 remove_flags |= VM_MAP_REMOVE_NO_YIELD;
19512
19513 if (vmk_flags.vmkf_overwrite_immutable) {
19514 remove_flags |= VM_MAP_REMOVE_IMMUTABLE;
19515 }
19516 if (vmk_flags.vmkf_remap_prot_copy) {
19517 remove_flags |= VM_MAP_REMOVE_IMMUTABLE_CODE;
19518 }
			kr = vm_map_delete(map, start, end, remove_flags,
			    KMEM_GUARD_NONE, zap_list).kmr_return;
19521 if (kr != KERN_SUCCESS) {
19522 /* XXX FBDP restore zap_list? */
19523 return kr;
19524 }
19525 }
19526
19527 /*
19528 * ... the starting address isn't allocated
19529 */
19530
		if (vm_map_lookup_entry(map, start, &temp_entry)) {
19532 return KERN_NO_SPACE;
19533 }
19534
19535 entry = temp_entry;
19536
19537 /*
19538 * ... the next region doesn't overlap the
19539 * end point.
19540 */
19541
19542 if ((entry->vme_next != vm_map_to_entry(map)) &&
19543 (entry->vme_next->vme_start < end)) {
19544 return KERN_NO_SPACE;
19545 }
19546 }
19547 *map_entry = entry;
19548 return KERN_SUCCESS;
19549}
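/*
 * Illustrative fragment (not compiled): the allocation helper above must be
 * called with "map" locked, and any entries it displaces on the overwrite
 * path are parked on the caller's zap list, to be disposed of only after
 * the map is unlocked -- the same pattern used by vm_map_remap() above.
 * "target_map", "size" and "vmk_flags" are assumed to be set up by the
 * caller.
 */
#if 0 /* example only */
	VM_MAP_ZAP_DECLARE(zap_list);
	vm_map_entry_t   insert_after;
	vm_map_address_t addr = 0;
	kern_return_t    kr;

	vm_map_lock(target_map);
	kr = vm_map_remap_range_allocate(target_map, &addr, size, 0,
	    vmk_flags, &insert_after, &zap_list);
	/* ... link new entries after "insert_after" while still holding the lock ... */
	vm_map_unlock(target_map);
	vm_map_zap_dispose(&zap_list);
#endif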
19550
19551/*
19552 * vm_map_switch:
19553 *
19554 * Set the address map for the current thread to the specified map
19555 */
19556
19557vm_map_t
19558vm_map_switch(
19559 vm_map_t map)
19560{
19561 thread_t thread = current_thread();
19562 vm_map_t oldmap = thread->map;
19563
19564
19565 /*
19566 * Deactivate the current map and activate the requested map
19567 */
19568 mp_disable_preemption();
19569 PMAP_SWITCH_USER(thread, map, cpu_number());
19570 mp_enable_preemption();
19571 return oldmap;
19572}
19573
19574
19575/*
19576 * Routine: vm_map_write_user
19577 *
19578 * Description:
19579 * Copy out data from a kernel space into space in the
19580 * destination map. The space must already exist in the
19581 * destination map.
19582 * NOTE: This routine should only be called by threads
19583 * which can block on a page fault. i.e. kernel mode user
19584 * threads.
19585 *
19586 */
19587kern_return_t
19588vm_map_write_user(
19589 vm_map_t map,
19590 void *src_p,
19591 vm_map_address_t dst_addr,
19592 vm_size_t size)
19593{
19594 kern_return_t kr = KERN_SUCCESS;
19595
19596 if (__improbable(vm_map_range_overflows(map, dst_addr, size))) {
19597 return KERN_INVALID_ADDRESS;
19598 }
19599
19600 if (current_map() == map) {
19601 if (copyout(src_p, dst_addr, size)) {
19602 kr = KERN_INVALID_ADDRESS;
19603 }
19604 } else {
19605 vm_map_t oldmap;
19606
19607 /* take on the identity of the target map while doing */
19608 /* the transfer */
19609
19610 vm_map_reference(map);
19611 oldmap = vm_map_switch(map);
19612 if (copyout(src_p, dst_addr, size)) {
19613 kr = KERN_INVALID_ADDRESS;
19614 }
		vm_map_switch(oldmap);
19616 vm_map_deallocate(map);
19617 }
19618 return kr;
19619}
19620
19621/*
19622 * Routine: vm_map_read_user
19623 *
19624 * Description:
19625 * Copy in data from a user space source map into the
19626 * kernel map. The space must already exist in the
19627 * kernel map.
19628 * NOTE: This routine should only be called by threads
19629 * which can block on a page fault. i.e. kernel mode user
19630 * threads.
19631 *
19632 */
19633kern_return_t
19634vm_map_read_user(
19635 vm_map_t map,
19636 vm_map_address_t src_addr,
19637 void *dst_p,
19638 vm_size_t size)
19639{
19640 kern_return_t kr = KERN_SUCCESS;
19641
19642 if (__improbable(vm_map_range_overflows(map, src_addr, size))) {
19643 return KERN_INVALID_ADDRESS;
19644 }
19645
19646 if (current_map() == map) {
19647 if (copyin(src_addr, dst_p, size)) {
19648 kr = KERN_INVALID_ADDRESS;
19649 }
19650 } else {
19651 vm_map_t oldmap;
19652
19653 /* take on the identity of the target map while doing */
19654 /* the transfer */
19655
19656 vm_map_reference(map);
19657 oldmap = vm_map_switch(map);
19658 if (copyin(src_addr, dst_p, size)) {
19659 kr = KERN_INVALID_ADDRESS;
19660 }
		vm_map_switch(oldmap);
19662 vm_map_deallocate(map);
19663 }
19664 return kr;
19665}
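/*
 * Illustrative sketch (not compiled): copying a small kernel value into a
 * (possibly non-current) user map and reading it back with the two helpers
 * above.  Must run in a thread context that can take page faults; the
 * helper name is hypothetical.
 */
#if 0 /* example only */
static kern_return_t
user_copy_roundtrip_example(vm_map_t user_map, vm_map_address_t user_addr)
{
	uint32_t        out_value = 0xfeedface;
	uint32_t        in_value = 0;
	kern_return_t   kr;

	kr = vm_map_write_user(user_map, &out_value, user_addr, sizeof(out_value));
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	return vm_map_read_user(user_map, user_addr, &in_value, sizeof(in_value));
}
#endif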
19666
19667
19668/*
19669 * vm_map_check_protection:
19670 *
19671 * Assert that the target map allows the specified
19672 * privilege on the entire address region given.
19673 * The entire region must be allocated.
19674 */
19675boolean_t
19676vm_map_check_protection(vm_map_t map, vm_map_offset_t start,
19677 vm_map_offset_t end, vm_prot_t protection)
19678{
19679 vm_map_entry_t entry;
19680 vm_map_entry_t tmp_entry;
19681
19682 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
19683 return FALSE;
19684 }
19685
19686 vm_map_lock(map);
19687
19688 if (start < vm_map_min(map) || end > vm_map_max(map) || start > end) {
19689 vm_map_unlock(map);
19690 return FALSE;
19691 }
19692
	if (!vm_map_lookup_entry(map, start, &tmp_entry)) {
19694 vm_map_unlock(map);
19695 return FALSE;
19696 }
19697
19698 entry = tmp_entry;
19699
19700 while (start < end) {
19701 if (entry == vm_map_to_entry(map)) {
19702 vm_map_unlock(map);
19703 return FALSE;
19704 }
19705
19706 /*
19707 * No holes allowed!
19708 */
19709
19710 if (start < entry->vme_start) {
19711 vm_map_unlock(map);
19712 return FALSE;
19713 }
19714
19715 /*
19716 * Check protection associated with entry.
19717 */
19718
19719 if ((entry->protection & protection) != protection) {
19720 vm_map_unlock(map);
19721 return FALSE;
19722 }
19723
19724 /* go to next entry */
19725
19726 start = entry->vme_end;
19727 entry = entry->vme_next;
19728 }
19729 vm_map_unlock(map);
19730 return TRUE;
19731}
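/*
 * Illustrative fragment (not compiled): verifying that an entire range is
 * mapped with read/write access before operating on it, per the routine
 * comment above.  "map", "start" and "size" are assumed caller variables.
 */
#if 0 /* example only */
	if (!vm_map_check_protection(map, start,
	    start + size, VM_PROT_READ | VM_PROT_WRITE)) {
		return KERN_PROTECTION_FAILURE;
	}
#endif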
19732
19733kern_return_t
19734vm_map_purgable_control(
19735 vm_map_t map,
19736 vm_map_offset_t address,
19737 vm_purgable_t control,
19738 int *state)
19739{
19740 vm_map_entry_t entry;
19741 vm_object_t object;
19742 kern_return_t kr;
19743 boolean_t was_nonvolatile;
19744
19745 /*
	 * Vet all the input parameters and current type and state of the
	 * underlying object. Return with an error if anything is amiss.
19748 */
19749 if (map == VM_MAP_NULL) {
19750 return KERN_INVALID_ARGUMENT;
19751 }
19752
19753 if (control != VM_PURGABLE_SET_STATE &&
19754 control != VM_PURGABLE_GET_STATE &&
19755 control != VM_PURGABLE_PURGE_ALL &&
19756 control != VM_PURGABLE_SET_STATE_FROM_KERNEL) {
19757 return KERN_INVALID_ARGUMENT;
19758 }
19759
19760 if (control == VM_PURGABLE_PURGE_ALL) {
19761 vm_purgeable_object_purge_all();
19762 return KERN_SUCCESS;
19763 }
19764
19765 if ((control == VM_PURGABLE_SET_STATE ||
19766 control == VM_PURGABLE_SET_STATE_FROM_KERNEL) &&
19767 (((*state & ~(VM_PURGABLE_ALL_MASKS)) != 0) ||
19768 ((*state & VM_PURGABLE_STATE_MASK) > VM_PURGABLE_STATE_MASK))) {
19769 return KERN_INVALID_ARGUMENT;
19770 }
19771
19772 vm_map_lock_read(map);
19773
	if (!vm_map_lookup_entry(map, address, &entry) || entry->is_sub_map) {
19775 /*
19776 * Must pass a valid non-submap address.
19777 */
19778 vm_map_unlock_read(map);
19779 return KERN_INVALID_ADDRESS;
19780 }
19781
19782 if ((entry->protection & VM_PROT_WRITE) == 0 &&
19783 control != VM_PURGABLE_GET_STATE) {
19784 /*
19785 * Can't apply purgable controls to something you can't write.
19786 */
19787 vm_map_unlock_read(map);
19788 return KERN_PROTECTION_FAILURE;
19789 }
19790
19791 object = VME_OBJECT(entry);
19792 if (object == VM_OBJECT_NULL ||
19793 object->purgable == VM_PURGABLE_DENY) {
19794 /*
19795 * Object must already be present and be purgeable.
19796 */
19797 vm_map_unlock_read(map);
19798 return KERN_INVALID_ARGUMENT;
19799 }
19800
19801 vm_object_lock(object);
19802
19803#if 00
19804 if (VME_OFFSET(entry) != 0 ||
19805 entry->vme_end - entry->vme_start != object->vo_size) {
19806 /*
19807 * Can only apply purgable controls to the whole (existing)
19808 * object at once.
19809 */
19810 vm_map_unlock_read(map);
19811 vm_object_unlock(object);
19812 return KERN_INVALID_ARGUMENT;
19813 }
19814#endif
19815
19816 assert(!entry->is_sub_map);
19817 assert(!entry->use_pmap); /* purgeable has its own accounting */
19818
19819 vm_map_unlock_read(map);
19820
19821 was_nonvolatile = (object->purgable == VM_PURGABLE_NONVOLATILE);
19822
19823 kr = vm_object_purgable_control(object, control, state);
19824
19825 if (was_nonvolatile &&
19826 object->purgable != VM_PURGABLE_NONVOLATILE &&
19827 map->pmap == kernel_pmap) {
19828#if DEBUG
19829 object->vo_purgeable_volatilizer = kernel_task;
19830#endif /* DEBUG */
19831 }
19832
19833 vm_object_unlock(object);
19834
19835 return kr;
19836}
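/*
 * Illustrative sketch (not compiled): marking a purgeable mapping volatile
 * and then querying its state through the control routine above.
 * "address" is assumed to fall within a writable mapping of a purgeable
 * VM object owned by "map"; the helper name is hypothetical.
 */
#if 0 /* example only */
static kern_return_t
make_volatile_example(vm_map_t map, vm_map_offset_t address)
{
	int             state = VM_PURGABLE_VOLATILE;
	kern_return_t   kr;

	kr = vm_map_purgable_control(map, address, VM_PURGABLE_SET_STATE, &state);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* read back the current state of the underlying object */
	return vm_map_purgable_control(map, address, VM_PURGABLE_GET_STATE, &state);
}
#endif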
19837
19838void
19839vm_map_footprint_query_page_info(
19840 vm_map_t map,
19841 vm_map_entry_t map_entry,
19842 vm_map_offset_t curr_s_offset,
19843 int *disposition_p)
19844{
19845 int pmap_disp;
19846 vm_object_t object = VM_OBJECT_NULL;
19847 int disposition;
19848 int effective_page_size;
19849
19850 vm_map_lock_assert_held(map);
19851 assert(!map->has_corpse_footprint);
19852 assert(curr_s_offset >= map_entry->vme_start);
19853 assert(curr_s_offset < map_entry->vme_end);
19854
19855 if (map_entry->is_sub_map) {
19856 if (!map_entry->use_pmap) {
19857 /* nested pmap: no footprint */
19858 *disposition_p = 0;
19859 return;
19860 }
19861 } else {
19862 object = VME_OBJECT(map_entry);
19863 if (object == VM_OBJECT_NULL) {
19864 /* nothing mapped here: no need to ask */
19865 *disposition_p = 0;
19866 return;
19867 }
19868 }
19869
19870 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
19871
19872 pmap_disp = 0;
19873
19874 /*
19875 * Query the pmap.
19876 */
	pmap_query_page_info(map->pmap, curr_s_offset, &pmap_disp);
19878
19879 /*
19880 * Compute this page's disposition.
19881 */
19882 disposition = 0;
19883
19884 /* deal with "alternate accounting" first */
19885 if (!map_entry->is_sub_map &&
19886 object->vo_no_footprint) {
19887 /* does not count in footprint */
19888 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19889 } else if (!map_entry->is_sub_map &&
19890 (object->purgable == VM_PURGABLE_NONVOLATILE ||
19891 (object->purgable == VM_PURGABLE_DENY &&
19892 object->vo_ledger_tag)) &&
19893 VM_OBJECT_OWNER(object) != NULL &&
19894 VM_OBJECT_OWNER(object)->map == map) {
19895 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
		if ((((curr_s_offset
		    - map_entry->vme_start
		    + VME_OFFSET(map_entry))
		    / effective_page_size) <
		    (object->resident_page_count +
		    vm_compressor_pager_get_count(object->pager)))) {
19902 /*
19903 * Non-volatile purgeable object owned
19904 * by this task: report the first
19905 * "#resident + #compressed" pages as
19906 * "resident" (to show that they
19907 * contribute to the footprint) but not
19908 * "dirty" (to avoid double-counting
19909 * with the fake "non-volatile" region
19910 * we'll report at the end of the
19911 * address space to account for all
19912 * (mapped or not) non-volatile memory
19913 * owned by this task.
19914 */
19915 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19916 }
19917 } else if (!map_entry->is_sub_map &&
19918 (object->purgable == VM_PURGABLE_VOLATILE ||
19919 object->purgable == VM_PURGABLE_EMPTY) &&
19920 VM_OBJECT_OWNER(object) != NULL &&
19921 VM_OBJECT_OWNER(object)->map == map) {
19922 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19923 if ((((curr_s_offset
19924 - map_entry->vme_start
		    + VME_OFFSET(map_entry))
19926 / effective_page_size) <
19927 object->wired_page_count)) {
19928 /*
19929 * Volatile|empty purgeable object owned
19930 * by this task: report the first
19931 * "#wired" pages as "resident" (to
19932 * show that they contribute to the
19933 * footprint) but not "dirty" (to avoid
19934 * double-counting with the fake
19935 * "non-volatile" region we'll report
19936 * at the end of the address space to
19937 * account for all (mapped or not)
19938 * non-volatile memory owned by this
19939 * task.
19940 */
19941 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19942 }
19943 } else if (!map_entry->is_sub_map &&
19944 map_entry->iokit_acct &&
19945 object->internal &&
19946 object->purgable == VM_PURGABLE_DENY) {
19947 /*
19948 * Non-purgeable IOKit memory: phys_footprint
19949 * includes the entire virtual mapping.
19950 */
19951 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19952 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19953 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19954 } else if (pmap_disp & (PMAP_QUERY_PAGE_ALTACCT |
19955 PMAP_QUERY_PAGE_COMPRESSED_ALTACCT)) {
19956 /* alternate accounting */
19957#if __arm64__ && (DEVELOPMENT || DEBUG)
19958 if (map->pmap->footprint_was_suspended) {
19959 /*
19960 * The assertion below can fail if dyld
19961 * suspended footprint accounting
19962 * while doing some adjustments to
19963 * this page; the mapping would say
19964 * "use pmap accounting" but the page
19965 * would be marked "alternate
19966 * accounting".
19967 */
19968 } else
19969#endif /* __arm64__ && (DEVELOPMENT || DEBUG) */
19970 {
19971 assertf(!map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19972 }
19973 disposition = 0;
19974 } else {
19975 if (pmap_disp & PMAP_QUERY_PAGE_PRESENT) {
19976 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19977 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
19978 disposition |= VM_PAGE_QUERY_PAGE_REF;
19979 if (pmap_disp & PMAP_QUERY_PAGE_INTERNAL) {
19980 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
19981 } else {
19982 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
19983 }
19984 if (pmap_disp & PMAP_QUERY_PAGE_REUSABLE) {
19985 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
19986 }
19987 } else if (pmap_disp & PMAP_QUERY_PAGE_COMPRESSED) {
19988 assertf(map_entry->use_pmap, "offset 0x%llx map_entry %p", (uint64_t) curr_s_offset, map_entry);
19989 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
19990 }
19991 }
19992
19993 *disposition_p = disposition;
19994}
19995
19996kern_return_t
19997vm_map_page_query_internal(
19998 vm_map_t target_map,
19999 vm_map_offset_t offset,
20000 int *disposition,
20001 int *ref_count)
20002{
20003 kern_return_t kr;
20004 vm_page_info_basic_data_t info;
20005 mach_msg_type_number_t count;
20006
20007 count = VM_PAGE_INFO_BASIC_COUNT;
	kr = vm_map_page_info(target_map,
	    offset,
	    VM_PAGE_INFO_BASIC,
	    (vm_page_info_t) &info,
	    &count);
20013 if (kr == KERN_SUCCESS) {
20014 *disposition = info.disposition;
20015 *ref_count = info.ref_count;
20016 } else {
20017 *disposition = 0;
20018 *ref_count = 0;
20019 }
20020
20021 return kr;
20022}
20023
20024kern_return_t
20025vm_map_page_info(
20026 vm_map_t map,
20027 vm_map_offset_t offset,
20028 vm_page_info_flavor_t flavor,
20029 vm_page_info_t info,
20030 mach_msg_type_number_t *count)
20031{
	return vm_map_page_range_info_internal(map,
	    offset, /* start of range */
	    (offset + 1), /* this will get rounded in the call to the page boundary */
	    (int)-1, /* effective_page_shift: unspecified */
	    flavor,
	    info,
	    count);
20039}
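/*
 * Illustrative sketch (not compiled): fetching the basic page-level info
 * for a single address through the wrapper above, mirroring what
 * vm_map_page_query_internal() does for its callers.  The helper name is
 * hypothetical.
 */
#if 0 /* example only */
static int
page_disposition_example(vm_map_t map, vm_map_offset_t addr)
{
	vm_page_info_basic_data_t       info;
	mach_msg_type_number_t          count = VM_PAGE_INFO_BASIC_COUNT;

	if (vm_map_page_info(map, addr, VM_PAGE_INFO_BASIC,
	    (vm_page_info_t)&info, &count) != KERN_SUCCESS) {
		return 0;
	}
	return info.disposition;
}
#endif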
20040
20041kern_return_t
20042vm_map_page_range_info_internal(
20043 vm_map_t map,
20044 vm_map_offset_t start_offset,
20045 vm_map_offset_t end_offset,
20046 int effective_page_shift,
20047 vm_page_info_flavor_t flavor,
20048 vm_page_info_t info,
20049 mach_msg_type_number_t *count)
20050{
20051 vm_map_entry_t map_entry = VM_MAP_ENTRY_NULL;
20052 vm_object_t object = VM_OBJECT_NULL, curr_object = VM_OBJECT_NULL;
20053 vm_page_t m = VM_PAGE_NULL;
20054 kern_return_t retval = KERN_SUCCESS;
20055 int disposition = 0;
20056 int ref_count = 0;
20057 int depth = 0, info_idx = 0;
20058 vm_page_info_basic_t basic_info = 0;
20059 vm_map_offset_t offset_in_page = 0, offset_in_object = 0, curr_offset_in_object = 0;
20060 vm_map_offset_t start = 0, end = 0, curr_s_offset = 0, curr_e_offset = 0;
20061 boolean_t do_region_footprint;
20062 ledger_amount_t ledger_resident, ledger_compressed;
20063 int effective_page_size;
20064 vm_map_offset_t effective_page_mask;
20065
20066 switch (flavor) {
20067 case VM_PAGE_INFO_BASIC:
20068 if (*count != VM_PAGE_INFO_BASIC_COUNT) {
20069 /*
20070 * The "vm_page_info_basic_data" structure was not
20071 * properly padded, so allow the size to be off by
20072 * one to maintain backwards binary compatibility...
20073 */
20074 if (*count != VM_PAGE_INFO_BASIC_COUNT - 1) {
20075 return KERN_INVALID_ARGUMENT;
20076 }
20077 }
20078 break;
20079 default:
20080 return KERN_INVALID_ARGUMENT;
20081 }
20082
20083 if (effective_page_shift == -1) {
		effective_page_shift = vm_self_region_page_shift_safely(map);
20085 if (effective_page_shift == -1) {
20086 return KERN_INVALID_ARGUMENT;
20087 }
20088 }
20089 effective_page_size = (1 << effective_page_shift);
20090 effective_page_mask = effective_page_size - 1;
20091
20092 do_region_footprint = task_self_region_footprint();
20093 disposition = 0;
20094 ref_count = 0;
20095 depth = 0;
20096 info_idx = 0; /* Tracks the next index within the info structure to be filled.*/
20097 retval = KERN_SUCCESS;
20098
20099 if (__improbable(vm_map_range_overflows(map, start_offset, end_offset - start_offset))) {
20100 return KERN_INVALID_ADDRESS;
20101 }
20102
20103 offset_in_page = start_offset & effective_page_mask;
20104 start = vm_map_trunc_page(start_offset, effective_page_mask);
20105 end = vm_map_round_page(end_offset, effective_page_mask);
20106
20107 if (end < start) {
20108 return KERN_INVALID_ARGUMENT;
20109 }
20110
20111 assert((end - start) <= MAX_PAGE_RANGE_QUERY);
20112
20113 vm_map_lock_read(map);
20114
	task_ledgers_footprint(map->pmap->ledger, &ledger_resident, &ledger_compressed);
20116
20117 for (curr_s_offset = start; curr_s_offset < end;) {
20118 /*
20119 * New lookup needs reset of these variables.
20120 */
20121 curr_object = object = VM_OBJECT_NULL;
20122 offset_in_object = 0;
20123 ref_count = 0;
20124 depth = 0;
20125
20126 if (do_region_footprint &&
20127 curr_s_offset >= vm_map_last_entry(map)->vme_end) {
20128 /*
20129 * Request for "footprint" info about a page beyond
20130 * the end of address space: this must be for
20131 * the fake region vm_map_region_recurse_64()
20132 * reported to account for non-volatile purgeable
20133 * memory owned by this task.
20134 */
20135 disposition = 0;
20136
20137 if (curr_s_offset - vm_map_last_entry(map)->vme_end <=
20138 (unsigned) ledger_compressed) {
20139 /*
20140 * We haven't reported all the "non-volatile
20141 * compressed" pages yet, so report this fake
20142 * page as "compressed".
20143 */
20144 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20145 } else {
20146 /*
20147 * We've reported all the non-volatile
				 * compressed pages but not all the non-volatile
				 * pages, so report this fake page as
20150 * "resident dirty".
20151 */
20152 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20153 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20154 disposition |= VM_PAGE_QUERY_PAGE_REF;
20155 }
20156 switch (flavor) {
20157 case VM_PAGE_INFO_BASIC:
20158 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20159 basic_info->disposition = disposition;
20160 basic_info->ref_count = 1;
20161 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20162 basic_info->offset = 0;
20163 basic_info->depth = 0;
20164
20165 info_idx++;
20166 break;
20167 }
20168 curr_s_offset += effective_page_size;
20169 continue;
20170 }
20171
20172 /*
20173 * First, find the map entry covering "curr_s_offset", going down
20174 * submaps if necessary.
20175 */
		if (!vm_map_lookup_entry(map, curr_s_offset, &map_entry)) {
20177 /* no entry -> no object -> no page */
20178
20179 if (curr_s_offset < vm_map_min(map)) {
20180 /*
20181 * Illegal address that falls below map min.
20182 */
20183 curr_e_offset = MIN(end, vm_map_min(map));
20184 } else if (curr_s_offset >= vm_map_max(map)) {
20185 /*
20186 * Illegal address that falls on/after map max.
20187 */
20188 curr_e_offset = end;
20189 } else if (map_entry == vm_map_to_entry(map)) {
20190 /*
20191 * Hit a hole.
20192 */
20193 if (map_entry->vme_next == vm_map_to_entry(map)) {
20194 /*
20195 * Empty map.
20196 */
20197 curr_e_offset = MIN(map->max_offset, end);
20198 } else {
20199 /*
20200 * Hole at start of the map.
20201 */
20202 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20203 }
20204 } else {
20205 if (map_entry->vme_next == vm_map_to_entry(map)) {
20206 /*
20207 * Hole at the end of the map.
20208 */
20209 curr_e_offset = MIN(map->max_offset, end);
20210 } else {
20211 curr_e_offset = MIN(map_entry->vme_next->vme_start, end);
20212 }
20213 }
20214
20215 assert(curr_e_offset >= curr_s_offset);
20216
20217 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20218
20219 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20220
20221 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20222
20223 curr_s_offset = curr_e_offset;
20224
20225 info_idx += num_pages;
20226
20227 continue;
20228 }
20229
20230 /* compute offset from this map entry's start */
20231 offset_in_object = curr_s_offset - map_entry->vme_start;
20232
20233 /* compute offset into this map entry's object (or submap) */
20234 offset_in_object += VME_OFFSET(map_entry);
20235
20236 if (map_entry->is_sub_map) {
20237 vm_map_t sub_map = VM_MAP_NULL;
20238 vm_page_info_t submap_info = 0;
20239 vm_map_offset_t submap_s_offset = 0, submap_e_offset = 0, range_len = 0;
20240
20241 range_len = MIN(map_entry->vme_end, end) - curr_s_offset;
20242
20243 submap_s_offset = offset_in_object;
20244 submap_e_offset = submap_s_offset + range_len;
20245
20246 sub_map = VME_SUBMAP(map_entry);
20247
20248 vm_map_reference(sub_map);
20249 vm_map_unlock_read(map);
20250
20251 submap_info = (vm_page_info_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20252
20253 assertf(VM_MAP_PAGE_SHIFT(sub_map) >= VM_MAP_PAGE_SHIFT(map),
20254 "Submap page size (%d) differs from current map (%d)\n", VM_MAP_PAGE_SIZE(sub_map), VM_MAP_PAGE_SIZE(map));
20255
20256 retval = vm_map_page_range_info_internal(sub_map,
20257 submap_s_offset,
20258 submap_e_offset,
20259 effective_page_shift,
20260 VM_PAGE_INFO_BASIC,
20261 (vm_page_info_t) submap_info,
20262 count);
20263
20264 assert(retval == KERN_SUCCESS);
20265
20266 vm_map_lock_read(map);
20267 vm_map_deallocate(sub_map);
20268
20269 /* Move the "info" index by the number of pages we inspected.*/
20270 info_idx += range_len >> effective_page_shift;
20271
20272 /* Move our current offset by the size of the range we inspected.*/
20273 curr_s_offset += range_len;
20274
20275 continue;
20276 }
20277
20278 object = VME_OBJECT(map_entry);
20279
20280 if (object == VM_OBJECT_NULL) {
20281 /*
20282 * We don't have an object here and, hence,
20283 * no pages to inspect. We'll fill up the
20284 * info structure appropriately.
20285 */
20286
20287 curr_e_offset = MIN(map_entry->vme_end, end);
20288
20289 uint64_t num_pages = (curr_e_offset - curr_s_offset) >> effective_page_shift;
20290
20291 void *info_ptr = (void*) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20292
20293 bzero(info_ptr, num_pages * sizeof(struct vm_page_info_basic));
20294
20295 curr_s_offset = curr_e_offset;
20296
20297 info_idx += num_pages;
20298
20299 continue;
20300 }
20301
20302 if (do_region_footprint) {
20303 disposition = 0;
20304 if (map->has_corpse_footprint) {
20305 /*
20306 * Query the page info data we saved
20307 * while forking the corpse.
20308 */
20309 vm_map_corpse_footprint_query_page_info(
20310 map,
20311 curr_s_offset,
20312 &disposition);
20313 } else {
20314 /*
20315 * Query the live pmap for footprint info
20316 * about this page.
20317 */
20318 vm_map_footprint_query_page_info(
20319 map,
20320 map_entry,
20321 curr_s_offset,
20322 &disposition);
20323 }
20324 switch (flavor) {
20325 case VM_PAGE_INFO_BASIC:
20326 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20327 basic_info->disposition = disposition;
20328 basic_info->ref_count = 1;
20329 basic_info->object_id = VM_OBJECT_ID_FAKE(map, task_ledgers.purgeable_nonvolatile);
20330 basic_info->offset = 0;
20331 basic_info->depth = 0;
20332
20333 info_idx++;
20334 break;
20335 }
20336 curr_s_offset += effective_page_size;
20337 continue;
20338 }
20339
20340 vm_object_reference(object);
20341 /*
20342 * Shared mode -- so we can allow other readers
20343 * to grab the lock too.
20344 */
20345 vm_object_lock_shared(object);
20346
20347 curr_e_offset = MIN(map_entry->vme_end, end);
20348
20349 vm_map_unlock_read(map);
20350
20351 map_entry = NULL; /* map is unlocked, the entry is no longer valid. */
20352
20353 curr_object = object;
20354
20355 for (; curr_s_offset < curr_e_offset;) {
20356 if (object == curr_object) {
20357 ref_count = curr_object->ref_count - 1; /* account for our object reference above. */
20358 } else {
20359 ref_count = curr_object->ref_count;
20360 }
20361
20362 curr_offset_in_object = offset_in_object;
20363
20364 for (;;) {
20365 m = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset_in_object));
20366
20367 if (m != VM_PAGE_NULL) {
20368 disposition |= VM_PAGE_QUERY_PAGE_PRESENT;
20369 break;
20370 } else {
20371 if (curr_object->internal &&
20372 curr_object->alive &&
20373 !curr_object->terminating &&
20374 curr_object->pager_ready) {
20375 if (VM_COMPRESSOR_PAGER_STATE_GET(curr_object, vm_object_trunc_page(curr_offset_in_object))
20376 == VM_EXTERNAL_STATE_EXISTS) {
20377 /* the pager has that page */
20378 disposition |= VM_PAGE_QUERY_PAGE_PAGED_OUT;
20379 break;
20380 }
20381 }
20382
20383 /*
20384 * Go down the VM object shadow chain until we find the page
20385 * we're looking for.
20386 */
20387
20388 if (curr_object->shadow != VM_OBJECT_NULL) {
20389 vm_object_t shadow = VM_OBJECT_NULL;
20390
20391 curr_offset_in_object += curr_object->vo_shadow_offset;
20392 shadow = curr_object->shadow;
20393
20394 vm_object_lock_shared(shadow);
20395 vm_object_unlock(curr_object);
20396
20397 curr_object = shadow;
20398 depth++;
20399 continue;
20400 } else {
20401 break;
20402 }
20403 }
20404 }
20405
20406 /*
20407 * The ref_count is not strictly accurate: it counts the entities holding
20408 * a reference on the object, which may not be mapping the object or the
20409 * section holding the target page. Still, it is a ballpark figure and,
20410 * though an overcount, it picks up the copy-on-write cases.
20411 *
20412 * We could also get a picture of page sharing from pmap_attributes, but
20413 * that would undercount, as only faulted-in mappings would show up.
20414 */
20415
20416 if ((curr_object == object) && curr_object->shadow) {
20417 disposition |= VM_PAGE_QUERY_PAGE_COPIED;
20418 }
20419
20420 if (!curr_object->internal) {
20421 disposition |= VM_PAGE_QUERY_PAGE_EXTERNAL;
20422 }
20423
20424 if (m != VM_PAGE_NULL) {
20425 if (m->vmp_fictitious) {
20426 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
20427 } else {
20428 if (m->vmp_dirty || pmap_is_modified(VM_PAGE_GET_PHYS_PAGE(m))) {
20429 disposition |= VM_PAGE_QUERY_PAGE_DIRTY;
20430 }
20431
20432 if (m->vmp_reference || pmap_is_referenced(VM_PAGE_GET_PHYS_PAGE(m))) {
20433 disposition |= VM_PAGE_QUERY_PAGE_REF;
20434 }
20435
20436 if (m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
20437 disposition |= VM_PAGE_QUERY_PAGE_SPECULATIVE;
20438 }
20439
20440 /*
20441 * XXX TODO4K:
20442 * when this routine deals with 4k
20443 * pages, check the appropriate CS bit
20444 * here.
20445 */
20446 if (m->vmp_cs_validated) {
20447 disposition |= VM_PAGE_QUERY_PAGE_CS_VALIDATED;
20448 }
20449 if (m->vmp_cs_tainted) {
20450 disposition |= VM_PAGE_QUERY_PAGE_CS_TAINTED;
20451 }
20452 if (m->vmp_cs_nx) {
20453 disposition |= VM_PAGE_QUERY_PAGE_CS_NX;
20454 }
20455 if (m->vmp_reusable || curr_object->all_reusable) {
20456 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
20457 }
20458 }
20459 }
20460
20461 switch (flavor) {
20462 case VM_PAGE_INFO_BASIC:
20463 basic_info = (vm_page_info_basic_t) (((uintptr_t) info) + (info_idx * sizeof(struct vm_page_info_basic)));
20464 basic_info->disposition = disposition;
20465 basic_info->ref_count = ref_count;
20466 basic_info->object_id = (vm_object_id_t) (uintptr_t)
20467 VM_KERNEL_ADDRHASH(curr_object);
20468 basic_info->offset =
20469 (memory_object_offset_t) curr_offset_in_object + offset_in_page;
20470 basic_info->depth = depth;
20471
20472 info_idx++;
20473 break;
20474 }
20475
20476 disposition = 0;
20477 offset_in_page = 0; // This doesn't really make sense for any offset other than the starting offset.
20478
20479 /*
20480 * Move to next offset in the range and in our object.
20481 */
20482 curr_s_offset += effective_page_size;
20483 offset_in_object += effective_page_size;
20484 curr_offset_in_object = offset_in_object;
20485
20486 if (curr_object != object) {
20487 vm_object_unlock(curr_object);
20488
20489 curr_object = object;
20490
20491 vm_object_lock_shared(curr_object);
20492 } else {
20493 vm_object_lock_yield_shared(curr_object);
20494 }
20495 }
20496
20497 vm_object_unlock(curr_object);
20498 vm_object_deallocate(curr_object);
20499
20500 vm_map_lock_read(map);
20501 }
20502
20503 vm_map_unlock_read(map);
20504 return retval;
20505}
20506
20507/*
20508 * vm_map_msync
20509 *
20510 * Synchronizes the specified memory range with its backing store
20511 * image by either flushing or cleaning the contents out to the
20512 * appropriate memory manager, engaging in a memory object synchronize
20513 * dialog with the manager. The client doesn't return until the manager
20514 * issues an m_o_s_completed message. MIG magically converts the user
20515 * task parameter to the task's address map.
20516 *
20517 * interpretation of sync_flags
20518 * VM_SYNC_INVALIDATE - discard pages, only return precious
20519 * pages to manager.
20520 *
20521 * VM_SYNC_INVALIDATE & (VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS)
20522 * - discard pages, write dirty or precious
20523 * pages back to memory manager.
20524 *
20525 * VM_SYNC_SYNCHRONOUS | VM_SYNC_ASYNCHRONOUS
20526 * - write dirty or precious pages back to
20527 * the memory manager.
20528 *
20529 * VM_SYNC_CONTIGUOUS - does everything normally, but if there
20530 * is a hole in the region, and we would
20531 * have returned KERN_SUCCESS, return
20532 * KERN_INVALID_ADDRESS instead.
20533 *
20534 * NOTE
20535 * The memory object attributes have not yet been implemented; this
20536 * function will have to deal with the invalidate attribute.
20537 *
20538 * RETURNS
20539 * KERN_INVALID_TASK Bad task parameter
20540 * KERN_INVALID_ARGUMENT both sync and async were specified.
20541 * KERN_SUCCESS The usual.
20542 * KERN_INVALID_ADDRESS There was a hole in the region.
20543 */
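/*
 * Illustrative sketch (not part of the build): how a caller such as the BSD
 * msync() path might drive this routine, assuming "addr" and "len" describe a
 * user range in the current task. The flag combination below requests a
 * synchronous write-back and asks for KERN_INVALID_ADDRESS if the range
 * contains a hole:
 *
 *	kern_return_t kr;
 *	vm_sync_t flags = VM_SYNC_SYNCHRONOUS | VM_SYNC_CONTIGUOUS;
 *
 *	kr = vm_map_msync(current_map(),
 *	    (vm_map_address_t)addr,
 *	    (vm_map_size_t)len,
 *	    flags);
 *	if (kr == KERN_INVALID_ADDRESS) {
 *		... the range had a hole; msync() would typically report ENOMEM ...
 *	}
 */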
20544
20545kern_return_t
20546vm_map_msync(
20547 vm_map_t map,
20548 vm_map_address_t address,
20549 vm_map_size_t size,
20550 vm_sync_t sync_flags)
20551{
20552 vm_map_entry_t entry;
20553 vm_map_size_t amount_left;
20554 vm_object_offset_t offset;
20555 vm_object_offset_t start_offset, end_offset;
20556 boolean_t do_sync_req;
20557 boolean_t had_hole = FALSE;
20558 vm_map_offset_t pmap_offset;
20559
20560 if ((sync_flags & VM_SYNC_ASYNCHRONOUS) &&
20561 (sync_flags & VM_SYNC_SYNCHRONOUS)) {
20562 return KERN_INVALID_ARGUMENT;
20563 }
20564
20565 if (__improbable(vm_map_range_overflows(map, address, size))) {
20566 return KERN_INVALID_ADDRESS;
20567 }
20568
20569 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20570 DEBUG4K_SHARE("map %p address 0x%llx size 0x%llx flags 0x%x\n", map, (uint64_t)address, (uint64_t)size, sync_flags);
20571 }
20572
20573 /*
20574 * align address and size on page boundaries
20575 */
20576 size = (vm_map_round_page(address + size,
20577 VM_MAP_PAGE_MASK(map)) -
20578 vm_map_trunc_page(address,
20579 VM_MAP_PAGE_MASK(map)));
20580 address = vm_map_trunc_page(address,
20581 VM_MAP_PAGE_MASK(map));
20582
20583 if (map == VM_MAP_NULL) {
20584 return KERN_INVALID_TASK;
20585 }
20586
20587 if (size == 0) {
20588 return KERN_SUCCESS;
20589 }
20590
20591 amount_left = size;
20592
20593 while (amount_left > 0) {
20594 vm_object_size_t flush_size;
20595 vm_object_t object;
20596
20597 vm_map_lock(map);
20598 if (!vm_map_lookup_entry(map,
20599 address,
20600 &entry)) {
20601 vm_map_size_t skip;
20602
20603 /*
20604 * hole in the address map.
20605 */
20606 had_hole = TRUE;
20607
20608 if (sync_flags & VM_SYNC_KILLPAGES) {
20609 /*
20610 * For VM_SYNC_KILLPAGES, there should be
20611 * no holes in the range, since we couldn't
20612 * prevent someone else from allocating in
20613 * that hole and we wouldn't want to "kill"
20614 * their pages.
20615 */
20616 vm_map_unlock(map);
20617 break;
20618 }
20619
20620 /*
20621 * Check for empty map.
20622 */
20623 if (entry == vm_map_to_entry(map) &&
20624 entry->vme_next == entry) {
20625 vm_map_unlock(map);
20626 break;
20627 }
20628 /*
20629 * Check that we don't wrap and that
20630 * we have at least one real map entry.
20631 */
20632 if ((map->hdr.nentries == 0) ||
20633 (entry->vme_next->vme_start < address)) {
20634 vm_map_unlock(map);
20635 break;
20636 }
20637 /*
20638 * Move up to the next entry if needed
20639 */
20640 skip = (entry->vme_next->vme_start - address);
20641 if (skip >= amount_left) {
20642 amount_left = 0;
20643 } else {
20644 amount_left -= skip;
20645 }
20646 address = entry->vme_next->vme_start;
20647 vm_map_unlock(map);
20648 continue;
20649 }
20650
20651 offset = address - entry->vme_start;
20652 pmap_offset = address;
20653
20654 /*
20655 * do we have more to flush than is contained in this
20656 * entry ?
20657 */
20658 if (amount_left + entry->vme_start + offset > entry->vme_end) {
20659 flush_size = entry->vme_end -
20660 (entry->vme_start + offset);
20661 } else {
20662 flush_size = amount_left;
20663 }
20664 amount_left -= flush_size;
20665 address += flush_size;
20666
20667 if (entry->is_sub_map == TRUE) {
20668 vm_map_t local_map;
20669 vm_map_offset_t local_offset;
20670
20671 local_map = VME_SUBMAP(entry);
20672 local_offset = VME_OFFSET(entry);
20673 vm_map_reference(local_map);
20674 vm_map_unlock(map);
20675 if (vm_map_msync(
20676 local_map,
20677 local_offset,
20678 flush_size,
20679 sync_flags) == KERN_INVALID_ADDRESS) {
20680 had_hole = TRUE;
20681 }
20682 vm_map_deallocate(local_map);
20683 continue;
20684 }
20685 object = VME_OBJECT(entry);
20686
20687 /*
20688 * We can't sync this object if the object has not been
20689 * created yet
20690 */
20691 if (object == VM_OBJECT_NULL) {
20692 vm_map_unlock(map);
20693 continue;
20694 }
20695 offset += VME_OFFSET(entry);
20696
20697 vm_object_lock(object);
20698
20699 if (sync_flags & (VM_SYNC_KILLPAGES | VM_SYNC_DEACTIVATE)) {
20700 int kill_pages = 0;
20701
20702 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20703 /*
20704 * This is a destructive operation and so we
20705 * err on the side of limiting the range of
20706 * the operation.
20707 */
20708 start_offset = vm_object_round_page(offset);
20709 end_offset = vm_object_trunc_page(offset + flush_size);
20710
20711 if (end_offset <= start_offset) {
20712 vm_object_unlock(object);
20713 vm_map_unlock(map);
20714 continue;
20715 }
20716
20717 pmap_offset += start_offset - offset;
20718 } else {
20719 start_offset = offset;
20720 end_offset = offset + flush_size;
20721 }
20722
20723 if (sync_flags & VM_SYNC_KILLPAGES) {
20724 if (((object->ref_count == 1) ||
20725 ((object->copy_strategy !=
20726 MEMORY_OBJECT_COPY_SYMMETRIC) &&
20727 (object->vo_copy == VM_OBJECT_NULL))) &&
20728 (object->shadow == VM_OBJECT_NULL)) {
20729 if (object->ref_count != 1) {
20730 vm_page_stats_reusable.free_shared++;
20731 }
20732 kill_pages = 1;
20733 } else {
20734 kill_pages = -1;
20735 }
20736 }
20737 if (kill_pages != -1) {
20738 vm_object_deactivate_pages(
20739 object,
20740 start_offset,
20741 (vm_object_size_t) (end_offset - start_offset),
20742 kill_pages,
20743 FALSE, /* reusable_pages */
20744 FALSE, /* reusable_no_write */
20745 map->pmap,
20746 pmap_offset);
20747 }
20748 vm_object_unlock(object);
20749 vm_map_unlock(map);
20750 continue;
20751 }
20752 /*
20753 * We can't sync this object if there isn't a pager.
20754 * Don't bother to sync internal objects, since there can't
20755 * be any "permanent" storage for these objects anyway.
20756 */
20757 if ((object->pager == MEMORY_OBJECT_NULL) ||
20758 (object->internal) || (object->private)) {
20759 vm_object_unlock(object);
20760 vm_map_unlock(map);
20761 continue;
20762 }
20763 /*
20764 * keep reference on the object until syncing is done
20765 */
20766 vm_object_reference_locked(object);
20767 vm_object_unlock(object);
20768
20769 vm_map_unlock(map);
20770
20771 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
20772 start_offset = vm_object_trunc_page(offset);
20773 end_offset = vm_object_round_page(offset + flush_size);
20774 } else {
20775 start_offset = offset;
20776 end_offset = offset + flush_size;
20777 }
20778
20779 do_sync_req = vm_object_sync(object,
20780 start_offset,
20781 (end_offset - start_offset),
20782 sync_flags & VM_SYNC_INVALIDATE,
20783 ((sync_flags & VM_SYNC_SYNCHRONOUS) ||
20784 (sync_flags & VM_SYNC_ASYNCHRONOUS)),
20785 sync_flags & VM_SYNC_SYNCHRONOUS);
20786
20787 if ((sync_flags & VM_SYNC_INVALIDATE) && object->resident_page_count == 0) {
20788 /*
20789 * clear out the clustering and read-ahead hints
20790 */
20791 vm_object_lock(object);
20792
20793 object->pages_created = 0;
20794 object->pages_used = 0;
20795 object->sequential = 0;
20796 object->last_alloc = 0;
20797
20798 vm_object_unlock(object);
20799 }
20800 vm_object_deallocate(object);
20801 } /* while */
20802
20803 /* for proper msync() behaviour */
20804 if (had_hole == TRUE && (sync_flags & VM_SYNC_CONTIGUOUS)) {
20805 return KERN_INVALID_ADDRESS;
20806 }
20807
20808 return KERN_SUCCESS;
20809} /* vm_map_msync */
20810
20811void
20812vm_named_entry_associate_vm_object(
20813 vm_named_entry_t named_entry,
20814 vm_object_t object,
20815 vm_object_offset_t offset,
20816 vm_object_size_t size,
20817 vm_prot_t prot)
20818{
20819 vm_map_copy_t copy;
20820 vm_map_entry_t copy_entry;
20821
20822 assert(!named_entry->is_sub_map);
20823 assert(!named_entry->is_copy);
20824 assert(!named_entry->is_object);
20825 assert(!named_entry->internal);
20826 assert(named_entry->backing.copy == VM_MAP_COPY_NULL);
20827
20828 copy = vm_map_copy_allocate(VM_MAP_COPY_ENTRY_LIST);
20829 copy->offset = offset;
20830 copy->size = size;
20831 copy->cpy_hdr.page_shift = (uint16_t)PAGE_SHIFT;
20832
20833 copy_entry = vm_map_copy_entry_create(copy);
20834 copy_entry->protection = prot;
20835 copy_entry->max_protection = prot;
20836 copy_entry->use_pmap = TRUE;
20837 copy_entry->vme_start = VM_MAP_TRUNC_PAGE(offset, PAGE_MASK);
20838 copy_entry->vme_end = VM_MAP_ROUND_PAGE(offset + size, PAGE_MASK);
20839 VME_OBJECT_SET(copy_entry, object, false, 0);
20840 VME_OFFSET_SET(copy_entry, vm_object_trunc_page(offset));
20841 vm_map_copy_entry_link(copy, vm_map_copy_last_entry(copy), copy_entry);
20842
20843 named_entry->backing.copy = copy;
20844 named_entry->is_object = TRUE;
20845 if (object->internal) {
20846 named_entry->internal = TRUE;
20847 }
20848
20849 DEBUG4K_MEMENTRY("named_entry %p copy %p object %p offset 0x%llx size 0x%llx prot 0x%x\n",
20850 named_entry, copy, object, offset, size, prot);
20851}
20852
20853vm_object_t
20854vm_named_entry_to_vm_object(
20855 vm_named_entry_t named_entry)
20856{
20857 vm_map_copy_t copy;
20858 vm_map_entry_t copy_entry;
20859 vm_object_t object;
20860
20861 assert(!named_entry->is_sub_map);
20862 assert(!named_entry->is_copy);
20863 assert(named_entry->is_object);
20864 copy = named_entry->backing.copy;
20865 assert(copy != VM_MAP_COPY_NULL);
20866 /*
20867 * Assert that the vm_map_copy is coming from the right
20868 * zone and hasn't been forged
20869 */
20870 vm_map_copy_require(copy);
20871 assert(copy->cpy_hdr.nentries == 1);
20872 copy_entry = vm_map_copy_first_entry(copy);
20873 object = VME_OBJECT(copy_entry);
20874
20875 DEBUG4K_MEMENTRY("%p -> %p -> %p [0x%llx 0x%llx 0x%llx 0x%x/0x%x ] -> %p offset 0x%llx size 0x%llx prot 0x%x\n", named_entry, copy, copy_entry, (uint64_t)copy_entry->vme_start, (uint64_t)copy_entry->vme_end, copy_entry->vme_offset, copy_entry->protection, copy_entry->max_protection, object, named_entry->offset, named_entry->size, named_entry->protection);
20876
20877 return object;
20878}
20879
20880/*
20881 * Routine: convert_port_entry_to_map
20882 * Purpose:
20883 * Convert from a port specifying an entry or a task
20884 * to a map. Doesn't consume the port ref; produces a map ref,
20885 * which may be null. Unlike convert_port_to_map, the
20886 * port may be either a task port or a named-entry-backed port.
20887 * Conditions:
20888 * Nothing locked.
20889 */
20890
20891vm_map_t
20892convert_port_entry_to_map(
20893 ipc_port_t port)
20894{
20895 vm_map_t map = VM_MAP_NULL;
20896 vm_named_entry_t named_entry;
20897
20898 if (!IP_VALID(port)) {
20899 return VM_MAP_NULL;
20900 }
20901
20902 if (ip_kotype(port) != IKOT_NAMED_ENTRY) {
20903 return convert_port_to_map(port);
20904 }
20905
20906 named_entry = mach_memory_entry_from_port(port);
20907
20908 if ((named_entry->is_sub_map) &&
20909 (named_entry->protection & VM_PROT_WRITE)) {
20910 map = named_entry->backing.map;
20911 if (map->pmap != PMAP_NULL) {
20912 if (map->pmap == kernel_pmap) {
20913 panic("userspace has access "
20914 "to a kernel map %p", map);
20915 }
20916 pmap_require(map->pmap);
20917 }
20918 vm_map_reference(map);
20919 }
20920
20921 return map;
20922}
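/*
 * Illustrative sketch (hypothetical caller, not lifted from a real call site):
 * a caller holding an ipc_port_t that names either a task or a writable
 * sub-map named entry could do:
 *
 *	vm_map_t map = convert_port_entry_to_map(port);
 *
 *	if (map != VM_MAP_NULL) {
 *		... operate on the map ...
 *		vm_map_deallocate(map);		... drop the map ref produced above ...
 *	}
 *
 * Since the port reference is not consumed, the caller still releases the
 * port separately.
 */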
20923
20924/*
20925 * Export routines to other components for the things we access locally through
20926 * macros.
20927 */
20928#undef current_map
20929vm_map_t
20930current_map(void)
20931{
20932 return current_map_fast();
20933}
20934
20935/*
20936 * vm_map_reference:
20937 *
20938 * Takes a reference on the specified map.
20939 */
20940void
20941vm_map_reference(
20942 vm_map_t map)
20943{
20944 if (__probable(map != VM_MAP_NULL)) {
20945 vm_map_require(map);
20946 os_ref_retain_raw(&map->map_refcnt, &map_refgrp);
20947 }
20948}
20949
20950/*
20951 * vm_map_deallocate:
20952 *
20953 * Removes a reference from the specified map,
20954 * destroying it if no references remain.
20955 * The map should not be locked.
20956 */
20957void
20958vm_map_deallocate(
20959 vm_map_t map)
20960{
20961 if (__probable(map != VM_MAP_NULL)) {
20962 vm_map_require(map);
20963 if (os_ref_release_raw(&map->map_refcnt, &map_refgrp) == 0) {
20964 vm_map_destroy(map);
20965 }
20966 }
20967}
20968
20969void
20970vm_map_inspect_deallocate(
20971 vm_map_inspect_t map)
20972{
20973 vm_map_deallocate((vm_map_t)map);
20974}
20975
20976void
20977vm_map_read_deallocate(
20978 vm_map_read_t map)
20979{
20980 vm_map_deallocate((vm_map_t)map);
20981}
20982
20983
20984void
20985vm_map_disable_NX(vm_map_t map)
20986{
20987 if (map == NULL) {
20988 return;
20989 }
20990 if (map->pmap == NULL) {
20991 return;
20992 }
20993
20994 pmap_disable_NX(map->pmap);
20995}
20996
20997void
20998vm_map_disallow_data_exec(vm_map_t map)
20999{
21000 if (map == NULL) {
21001 return;
21002 }
21003
21004 map->map_disallow_data_exec = TRUE;
21005}
21006
21007/* XXX Consider making these constants (VM_MAX_ADDRESS and MACH_VM_MAX_ADDRESS)
21008 * more descriptive.
21009 */
21010void
21011vm_map_set_32bit(vm_map_t map)
21012{
21013#if defined(__arm64__)
21014 map->max_offset = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_DEVICE);
21015#else
21016 map->max_offset = (vm_map_offset_t)VM_MAX_ADDRESS;
21017#endif
21018}
21019
21020
21021void
21022vm_map_set_64bit(vm_map_t map)
21023{
21024#if defined(__arm64__)
21025 map->max_offset = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
21026#else
21027 map->max_offset = (vm_map_offset_t)MACH_VM_MAX_ADDRESS;
21028#endif
21029}
21030
21031/*
21032 * Expand the maximum size of an existing map to the maximum supported.
21033 */
21034void
21035vm_map_set_jumbo(vm_map_t map)
21036{
21037#if defined (__arm64__) && !XNU_TARGET_OS_OSX
21038 vm_map_set_max_addr(map, ~0);
21039#else /* arm64 */
21040 (void) map;
21041#endif
21042}
21043
21044/*
21045 * This map has a JIT entitlement
21046 */
21047void
21048vm_map_set_jit_entitled(vm_map_t map)
21049{
21050#if defined (__arm64__)
21051 pmap_set_jit_entitled(map->pmap);
21052#else /* arm64 */
21053 (void) map;
21054#endif
21055}
21056
21057/*
21058 * Get the status of this map's TPRO flag
21059 */
21060boolean_t
21061vm_map_tpro(vm_map_t map)
21062{
21063#if defined (__arm64e__)
21064 return pmap_get_tpro(map->pmap);
21065#else /* arm64e */
21066 (void) map;
21067 return FALSE;
21068#endif
21069}
21070
21071/*
21072 * This map has TPRO enabled
21073 */
21074void
21075vm_map_set_tpro(vm_map_t map)
21076{
21077#if defined (__arm64e__)
21078 pmap_set_tpro(map->pmap);
21079#else /* arm64e */
21080 (void) map;
21081#endif
21082}
21083
21084/*
21085 * Does this map have TPRO enforcement enabled?
21086 */
21087boolean_t
21088vm_map_tpro_enforcement(vm_map_t map)
21089{
21090 return map->tpro_enforcement;
21091}
21092
21093/*
21094 * Set TPRO enforcement for this map
21095 */
21096void
21097vm_map_set_tpro_enforcement(vm_map_t map)
21098{
21099 if (vm_map_tpro(map)) {
21100 vm_map_lock(map);
21101 map->tpro_enforcement = TRUE;
21102 vm_map_unlock(map);
21103 }
21104}
21105
21106/*
21107 * Enable TPRO on the requested region
21108 *
21109 * Note:
21110 * This routine is primarily intended to be called during/soon after map
21111 * creation before the associated task has been released to run. It is only
21112 * currently safe when we have no resident pages.
21113 */
21114boolean_t
21115vm_map_set_tpro_range(
21116 __unused vm_map_t map,
21117 __unused vm_map_address_t start,
21118 __unused vm_map_address_t end)
21119{
21120 return TRUE;
21121}
21122
21123/*
21124 * Expand the maximum size of an existing map.
21125 */
21126void
21127vm_map_set_max_addr(vm_map_t map, vm_map_offset_t new_max_offset)
21128{
21129#if defined(__arm64__)
21130 vm_map_offset_t max_supported_offset;
21131 vm_map_offset_t old_max_offset;
21132
21133 vm_map_lock(map);
21134
21135 old_max_offset = map->max_offset;
21136 max_supported_offset = pmap_max_offset(vm_map_is_64bit(map), ARM_PMAP_MAX_OFFSET_JUMBO);
21137
21138 new_max_offset = trunc_page(new_max_offset);
21139
21140 /* The address space cannot be shrunk using this routine. */
21141 if (old_max_offset >= new_max_offset) {
21142 vm_map_unlock(map);
21143 return;
21144 }
21145
21146 if (max_supported_offset < new_max_offset) {
21147 new_max_offset = max_supported_offset;
21148 }
21149
21150 map->max_offset = new_max_offset;
21151
21152 if (map->holelistenabled) {
21153 if (map->holes_list->prev->vme_end == old_max_offset) {
21154 /*
21155 * There is already a hole at the end of the map; simply make it bigger.
21156 */
21157 map->holes_list->prev->vme_end = map->max_offset;
21158 } else {
21159 /*
21160 * There is no hole at the end, so we need to create a new hole
21161 * for the new empty space we're creating.
21162 */
21163 struct vm_map_links *new_hole;
21164
21165 new_hole = zalloc_id(ZONE_ID_VM_MAP_HOLES, Z_WAITOK | Z_NOFAIL);
21166 new_hole->start = old_max_offset;
21167 new_hole->end = map->max_offset;
21168 new_hole->prev = map->holes_list->prev;
21169 new_hole->next = (struct vm_map_entry *)map->holes_list;
21170 map->holes_list->prev->vme_next = (struct vm_map_entry *)new_hole;
21171 map->holes_list->prev = (struct vm_map_entry *)new_hole;
21172 }
21173 }
21174
21175 vm_map_unlock(map);
21176#else
21177 (void)map;
21178 (void)new_max_offset;
21179#endif
21180}
21181
21182vm_map_offset_t
21183vm_compute_max_offset(boolean_t is64)
21184{
21185#if defined(__arm64__)
21186 return pmap_max_offset(is64, ARM_PMAP_MAX_OFFSET_DEVICE);
21187#else
21188 return is64 ? (vm_map_offset_t)MACH_VM_MAX_ADDRESS : (vm_map_offset_t)VM_MAX_ADDRESS;
21189#endif
21190}
21191
21192void
21193vm_map_get_max_aslr_slide_section(
21194 vm_map_t map __unused,
21195 int64_t *max_sections,
21196 int64_t *section_size)
21197{
21198#if defined(__arm64__)
21199 *max_sections = 3;
21200 *section_size = ARM_TT_TWIG_SIZE;
21201#else
21202 *max_sections = 1;
21203 *section_size = 0;
21204#endif
21205}
21206
21207uint64_t
21208vm_map_get_max_aslr_slide_pages(vm_map_t map)
21209{
21210#if defined(__arm64__)
21211 /* Limit arm64 slide to 16MB to conserve contiguous VA space in the more
21212 * limited embedded address space; this is also meant to minimize pmap
21213 * memory usage on 16KB page systems.
21214 */
21215 return 1 << (24 - VM_MAP_PAGE_SHIFT(map));
21216#else
21217 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21218#endif
21219}
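/*
 * Worked example for the arm64 case above: with 16KB pages
 * (VM_MAP_PAGE_SHIFT(map) == 14) the expression yields
 * 1 << (24 - 14) = 1024 possible slide positions, i.e. 1024 * 16KB = 16MB of
 * slide range; with 4KB pages (shift 12) it yields 1 << 12 = 4096 positions,
 * again 16MB. The constant 24 is what pins the total slide range to 16MB
 * independent of the map's page size.
 */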
21220
21221uint64_t
21222vm_map_get_max_loader_aslr_slide_pages(vm_map_t map)
21223{
21224#if defined(__arm64__)
21225 /* We limit the loader slide to 4MB, in order to ensure at least 8 bits
21226 * of independent entropy on 16KB page systems.
21227 */
21228 return 1 << (22 - VM_MAP_PAGE_SHIFT(map));
21229#else
21230 return 1 << (vm_map_is_64bit(map) ? 16 : 8);
21231#endif
21232}
21233
21234boolean_t
21235vm_map_is_64bit(
21236 vm_map_t map)
21237{
21238 return map->max_offset > ((vm_map_offset_t)VM_MAX_ADDRESS);
21239}
21240
21241boolean_t
21242vm_map_has_hard_pagezero(
21243 vm_map_t map,
21244 vm_map_offset_t pagezero_size)
21245{
21246 /*
21247 * XXX FBDP
21248 * We should lock the VM map (for read) here but we can get away
21249 * with it for now because there can't really be any race condition:
21250 * the VM map's min_offset is changed only when the VM map is created
21251 * and when the zero page is established (when the binary gets loaded),
21252 * and this routine gets called only when the task terminates and the
21253 * VM map is being torn down, and when a new map is created via
21254 * load_machfile()/execve().
21255 */
21256 return map->min_offset >= pagezero_size;
21257}
21258
21259/*
21260 * Raise a VM map's maximum offset.
21261 */
21262kern_return_t
21263vm_map_raise_max_offset(
21264 vm_map_t map,
21265 vm_map_offset_t new_max_offset)
21266{
21267 kern_return_t ret;
21268
21269 vm_map_lock(map);
21270 ret = KERN_INVALID_ADDRESS;
21271
21272 if (new_max_offset >= map->max_offset) {
21273 if (!vm_map_is_64bit(map)) {
21274 if (new_max_offset <= (vm_map_offset_t)VM_MAX_ADDRESS) {
21275 map->max_offset = new_max_offset;
21276 ret = KERN_SUCCESS;
21277 }
21278 } else {
21279 if (new_max_offset <= (vm_map_offset_t)MACH_VM_MAX_ADDRESS) {
21280 map->max_offset = new_max_offset;
21281 ret = KERN_SUCCESS;
21282 }
21283 }
21284 }
21285
21286 vm_map_unlock(map);
21287 return ret;
21288}
21289
21290
21291/*
21292 * Raise a VM map's minimum offset.
21293 * To strictly enforce "page zero" reservation.
21294 */
21295kern_return_t
21296vm_map_raise_min_offset(
21297 vm_map_t map,
21298 vm_map_offset_t new_min_offset)
21299{
21300 vm_map_entry_t first_entry;
21301
21302 new_min_offset = vm_map_round_page(new_min_offset,
21303 VM_MAP_PAGE_MASK(map));
21304
21305 vm_map_lock(map);
21306
21307 if (new_min_offset < map->min_offset) {
21308 /*
21309 * Can't move min_offset backwards, as that would expose
21310 * a part of the address space that was previously, and for
21311 * possibly good reasons, inaccessible.
21312 */
21313 vm_map_unlock(map);
21314 return KERN_INVALID_ADDRESS;
21315 }
21316 if (new_min_offset >= map->max_offset) {
21317 /* can't go beyond the end of the address space */
21318 vm_map_unlock(map);
21319 return KERN_INVALID_ADDRESS;
21320 }
21321
21322 first_entry = vm_map_first_entry(map);
21323 if (first_entry != vm_map_to_entry(map) &&
21324 first_entry->vme_start < new_min_offset) {
21325 /*
21326 * Some memory was already allocated below the new
21327 * minimum offset. It's too late to change it now...
21328 */
21329 vm_map_unlock(map);
21330 return KERN_NO_SPACE;
21331 }
21332
21333 map->min_offset = new_min_offset;
21334
21335 if (map->holelistenabled) {
21336 assert(map->holes_list);
21337 map->holes_list->start = new_min_offset;
21338 assert(new_min_offset < map->holes_list->end);
21339 }
21340
21341 vm_map_unlock(map);
21342
21343 return KERN_SUCCESS;
21344}
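/*
 * Illustrative sketch (assumption about the Mach-O loader, not the actual
 * load_machfile() code): after sizing the __PAGEZERO segment, the loader
 * could reserve it by raising the map's minimum offset:
 *
 *	if (vm_map_raise_min_offset(map, (vm_map_offset_t)pagezero_size)
 *	    != KERN_SUCCESS) {
 *		... something is already mapped below pagezero_size ...
 *	}
 */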
21345
21346/*
21347 * Set the limit on the maximum amount of address space and user wired memory allowed for this map.
21348 * This is basically a copy of the RLIMIT_AS and RLIMIT_MEMLOCK rlimit value maintained by the BSD
21349 * side of the kernel. The limits are checked in the mach VM side, so we keep a copy so we don't
21350 * have to reach over to the BSD data structures.
21351 */
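/*
 * Illustrative sketch (assumption about the BSD caller, not the actual
 * setrlimit() code): when a process's RLIMIT_AS changes, the BSD side would
 * mirror the new value into the Mach VM map roughly like this:
 *
 *	uint64_t new_as_limit = ...;		(new rlim_cur, or RLIM_INFINITY)
 *
 *	if (vm_map_set_size_limit(current_map(), new_as_limit) == KERN_FAILURE) {
 *		... requested limit is below the map's current size; reject ...
 *	}
 *
 * vm_map_set_data_limit() and vm_map_set_user_wire_limit() below play the
 * analogous role for the data-size and locked-memory limits.
 */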
21352
21353uint64_t vm_map_set_size_limit_count = 0;
21354kern_return_t
21355vm_map_set_size_limit(vm_map_t map, uint64_t new_size_limit)
21356{
21357 kern_return_t kr;
21358
21359 vm_map_lock(map);
21360 if (new_size_limit < map->size) {
21361 /* new limit should not be lower than its current size */
21362 DTRACE_VM2(vm_map_set_size_limit_fail,
21363 vm_map_size_t, map->size,
21364 uint64_t, new_size_limit);
21365 kr = KERN_FAILURE;
21366 } else if (new_size_limit == map->size_limit) {
21367 /* no change */
21368 kr = KERN_SUCCESS;
21369 } else {
21370 /* set new limit */
21371 DTRACE_VM2(vm_map_set_size_limit,
21372 vm_map_size_t, map->size,
21373 uint64_t, new_size_limit);
21374 if (new_size_limit != RLIM_INFINITY) {
21375 vm_map_set_size_limit_count++;
21376 }
21377 map->size_limit = new_size_limit;
21378 kr = KERN_SUCCESS;
21379 }
21380 vm_map_unlock(map);
21381 return kr;
21382}
21383
21384uint64_t vm_map_set_data_limit_count = 0;
21385kern_return_t
21386vm_map_set_data_limit(vm_map_t map, uint64_t new_data_limit)
21387{
21388 kern_return_t kr;
21389
21390 vm_map_lock(map);
21391 if (new_data_limit < map->size) {
21392 /* new limit should not be lower than its current size */
21393 DTRACE_VM2(vm_map_set_data_limit_fail,
21394 vm_map_size_t, map->size,
21395 uint64_t, new_data_limit);
21396 kr = KERN_FAILURE;
21397 } else if (new_data_limit == map->data_limit) {
21398 /* no change */
21399 kr = KERN_SUCCESS;
21400 } else {
21401 /* set new limit */
21402 DTRACE_VM2(vm_map_set_data_limit,
21403 vm_map_size_t, map->size,
21404 uint64_t, new_data_limit);
21405 if (new_data_limit != RLIM_INFINITY) {
21406 vm_map_set_data_limit_count++;
21407 }
21408 map->data_limit = new_data_limit;
21409 kr = KERN_SUCCESS;
21410 }
21411 vm_map_unlock(map);
21412 return kr;
21413}
21414
21415void
21416vm_map_set_user_wire_limit(vm_map_t map,
21417 vm_size_t limit)
21418{
21419 vm_map_lock(map);
21420 map->user_wire_limit = limit;
21421 vm_map_unlock(map);
21422}
21423
21424
21425void
21426vm_map_switch_protect(vm_map_t map,
21427 boolean_t val)
21428{
21429 vm_map_lock(map);
21430 map->switch_protect = val;
21431 vm_map_unlock(map);
21432}
21433
21434extern int cs_process_enforcement_enable;
21435boolean_t
21436vm_map_cs_enforcement(
21437 vm_map_t map)
21438{
21439 if (cs_process_enforcement_enable) {
21440 return TRUE;
21441 }
21442 return map->cs_enforcement;
21443}
21444
21445kern_return_t
21446vm_map_cs_wx_enable(
21447 __unused vm_map_t map)
21448{
21449#if CODE_SIGNING_MONITOR
21450 kern_return_t ret = csm_allow_invalid_code(vm_map_pmap(map));
21451 if ((ret == KERN_SUCCESS) || (ret == KERN_NOT_SUPPORTED)) {
21452 return KERN_SUCCESS;
21453 }
21454 return ret;
21455#else
21456 /* The VM manages WX memory entirely on its own */
21457 return KERN_SUCCESS;
21458#endif
21459}
21460
21461kern_return_t
21462vm_map_csm_allow_jit(
21463 __unused vm_map_t map)
21464{
21465#if CODE_SIGNING_MONITOR
21466 return csm_allow_jit_region(vm_map_pmap(map));
21467#else
21468 /* No code signing monitor to enforce JIT policy */
21469 return KERN_SUCCESS;
21470#endif
21471}
21472
21473void
21474vm_map_cs_debugged_set(
21475 vm_map_t map,
21476 boolean_t val)
21477{
21478 vm_map_lock(map);
21479 map->cs_debugged = val;
21480 vm_map_unlock(map);
21481}
21482
21483void
21484vm_map_cs_enforcement_set(
21485 vm_map_t map,
21486 boolean_t val)
21487{
21488 vm_map_lock(map);
21489 map->cs_enforcement = val;
21490 pmap_set_vm_map_cs_enforced(map->pmap, val);
21491 vm_map_unlock(map);
21492}
21493
21494/*
21495 * IOKit has mapped a region into this map; adjust the pmap's ledgers appropriately.
21496 * phys_footprint is a composite limit consisting of iokit + physmem, so we need to
21497 * bump both counters.
21498 */
21499void
21500vm_map_iokit_mapped_region(vm_map_t map, vm_size_t bytes)
21501{
21502 pmap_t pmap = vm_map_pmap(map);
21503
21504 ledger_credit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21505 ledger_credit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21506}
21507
21508void
21509vm_map_iokit_unmapped_region(vm_map_t map, vm_size_t bytes)
21510{
21511 pmap_t pmap = vm_map_pmap(map);
21512
21513 ledger_debit(pmap->ledger, task_ledgers.iokit_mapped, bytes);
21514 ledger_debit(pmap->ledger, task_ledgers.phys_footprint, bytes);
21515}
21516
21517/* Add (generate) code signature for memory range */
21518#if CONFIG_DYNAMIC_CODE_SIGNING
21519kern_return_t
21520vm_map_sign(vm_map_t map,
21521 vm_map_offset_t start,
21522 vm_map_offset_t end)
21523{
21524 vm_map_entry_t entry;
21525 vm_page_t m;
21526 vm_object_t object;
21527
21528 /*
21529 * Vet all the input parameters and current type and state of the
21530 * underlying object. Return with an error if anything is amiss.
21531 */
21532 if (map == VM_MAP_NULL) {
21533 return KERN_INVALID_ARGUMENT;
21534 }
21535
21536 if (__improbable(vm_map_range_overflows(map, start, end - start))) {
21537 return KERN_INVALID_ADDRESS;
21538 }
21539
21540 vm_map_lock_read(map);
21541
21542 if (!vm_map_lookup_entry(map, start, &entry) || entry->is_sub_map) {
21543 /*
21544 * Must pass a valid non-submap address.
21545 */
21546 vm_map_unlock_read(map);
21547 return KERN_INVALID_ADDRESS;
21548 }
21549
21550 if ((entry->vme_start > start) || (entry->vme_end < end)) {
21551 /*
21552 * Map entry doesn't cover the requested range. Not handling
21553 * this situation currently.
21554 */
21555 vm_map_unlock_read(map);
21556 return KERN_INVALID_ARGUMENT;
21557 }
21558
21559 object = VME_OBJECT(entry);
21560 if (object == VM_OBJECT_NULL) {
21561 /*
21562 * Object must already be present or we can't sign.
21563 */
21564 vm_map_unlock_read(map);
21565 return KERN_INVALID_ARGUMENT;
21566 }
21567
21568 vm_object_lock(object);
21569 vm_map_unlock_read(map);
21570
21571 while (start < end) {
21572 uint32_t refmod;
21573
21574 m = vm_page_lookup(object,
21575 start - entry->vme_start + VME_OFFSET(entry));
21576 if (m == VM_PAGE_NULL) {
21577 /* should we try to fault a page here? We can probably
21578 * demand that it exists and is locked for this request. */
21579 vm_object_unlock(object);
21580 return KERN_FAILURE;
21581 }
21582 /* deal with special page status */
21583 if (m->vmp_busy ||
21584 (m->vmp_unusual && (VMP_ERROR_GET(m) || m->vmp_restart || m->vmp_private || m->vmp_absent))) {
21585 vm_object_unlock(object);
21586 return KERN_FAILURE;
21587 }
21588
21589 /* Page is OK... now "validate" it */
21590 /* This is the place where we'll call out to create a code
21591 * directory, later */
21592 /* XXX TODO4K: deal with 4k subpages individually? */
21593 m->vmp_cs_validated = VMP_CS_ALL_TRUE;
21594
21595 /* The page is now "clean" for codesigning purposes. That means
21596 * we don't consider it as modified (wpmapped) anymore. But
21597 * we'll disconnect the page so we note any future modification
21598 * attempts. */
21599 m->vmp_wpmapped = FALSE;
21600 refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
21601
21602 /* Pull the dirty status from the pmap, since we cleared the
21603 * wpmapped bit */
21604 if ((refmod & VM_MEM_MODIFIED) && !m->vmp_dirty) {
21605 SET_PAGE_DIRTY(m, FALSE);
21606 }
21607
21608 /* On to the next page */
21609 start += PAGE_SIZE;
21610 }
21611 vm_object_unlock(object);
21612
21613 return KERN_SUCCESS;
21614}
21615#endif
21616
21617kern_return_t
21618vm_map_partial_reap(vm_map_t map, unsigned int *reclaimed_resident, unsigned int *reclaimed_compressed)
21619{
21620 vm_map_entry_t entry = VM_MAP_ENTRY_NULL;
21621 vm_map_entry_t next_entry;
21622 kern_return_t kr = KERN_SUCCESS;
21623 VM_MAP_ZAP_DECLARE(zap_list);
21624
21625 vm_map_lock(map);
21626
21627 for (entry = vm_map_first_entry(map);
21628 entry != vm_map_to_entry(map);
21629 entry = next_entry) {
21630 next_entry = entry->vme_next;
21631
21632 if (!entry->is_sub_map &&
21633 VME_OBJECT(entry) &&
21634 (VME_OBJECT(entry)->internal == TRUE) &&
21635 (VME_OBJECT(entry)->ref_count == 1)) {
21636 *reclaimed_resident += VME_OBJECT(entry)->resident_page_count;
21637 *reclaimed_compressed += vm_compressor_pager_get_count(VME_OBJECT(entry)->pager);
21638
21639 (void)vm_map_delete(map, entry->vme_start,
21640 entry->vme_end, VM_MAP_REMOVE_NO_YIELD,
21641 KMEM_GUARD_NONE, &zap_list);
21642 }
21643 }
21644
21645 vm_map_unlock(map);
21646
21647 vm_map_zap_dispose(&zap_list);
21648
21649 return kr;
21650}
21651
21652
21653#if DEVELOPMENT || DEBUG
21654
21655int
21656vm_map_disconnect_page_mappings(
21657 vm_map_t map,
21658 boolean_t do_unnest)
21659{
21660 vm_map_entry_t entry;
21661 ledger_amount_t byte_count = 0;
21662
21663 if (do_unnest == TRUE) {
21664#ifndef NO_NESTED_PMAP
21665 vm_map_lock(map);
21666
21667 for (entry = vm_map_first_entry(map);
21668 entry != vm_map_to_entry(map);
21669 entry = entry->vme_next) {
21670 if (entry->is_sub_map && entry->use_pmap) {
21671 /*
21672 * Make sure the range between the start of this entry and
21673 * the end of this entry is no longer nested, so that
21674 * we will only remove mappings from the pmap in use by
21675 * this task.
21676 */
21677 vm_map_clip_unnest(map, entry, entry->vme_start, entry->vme_end);
21678 }
21679 }
21680 vm_map_unlock(map);
21681#endif
21682 }
21683 vm_map_lock_read(map);
21684
21685 ledger_get_balance(map->pmap->ledger, task_ledgers.phys_mem, &byte_count);
21686
21687 for (entry = vm_map_first_entry(map);
21688 entry != vm_map_to_entry(map);
21689 entry = entry->vme_next) {
21690 if (!entry->is_sub_map && ((VME_OBJECT(entry) == 0) ||
21691 (VME_OBJECT(entry)->phys_contiguous))) {
21692 continue;
21693 }
21694 if (entry->is_sub_map) {
21695 assert(!entry->use_pmap);
21696 }
21697
21698 pmap_remove_options(map->pmap, entry->vme_start, entry->vme_end, 0);
21699 }
21700 vm_map_unlock_read(map);
21701
21702 return (int) (byte_count / VM_MAP_PAGE_SIZE(map));
21703}
21704
21705kern_return_t
21706vm_map_inject_error(vm_map_t map, vm_map_offset_t vaddr)
21707{
21708 vm_object_t object = NULL;
21709 vm_object_offset_t offset;
21710 vm_prot_t prot;
21711 boolean_t wired;
21712 vm_map_version_t version;
21713 vm_map_t real_map;
21714 int result = KERN_FAILURE;
21715
21716 vaddr = vm_map_trunc_page(vaddr, PAGE_MASK);
21717 vm_map_lock(map);
21718
21719 result = vm_map_lookup_and_lock_object(&map, vaddr, VM_PROT_READ,
21720 OBJECT_LOCK_EXCLUSIVE, &version, &object, &offset, &prot, &wired,
21721 NULL, &real_map, NULL);
21722 if (object == NULL) {
21723 result = KERN_MEMORY_ERROR;
21724 } else if (object->pager) {
21725 result = vm_compressor_pager_inject_error(object->pager,
21726 offset);
21727 } else {
21728 result = KERN_MEMORY_PRESENT;
21729 }
21730
21731 if (object != NULL) {
21732 vm_object_unlock(object);
21733 }
21734
21735 if (real_map != map) {
21736 vm_map_unlock(real_map);
21737 }
21738 vm_map_unlock(map);
21739
21740 return result;
21741}
21742
21743#endif
21744
21745
21746#if CONFIG_FREEZE
21747
21748
21749extern struct freezer_context freezer_context_global;
21750AbsoluteTime c_freezer_last_yield_ts = 0;
21751
21752extern unsigned int memorystatus_freeze_private_shared_pages_ratio;
21753extern unsigned int memorystatus_freeze_shared_mb_per_process_max;
21754
21755kern_return_t
21756vm_map_freeze(
21757 task_t task,
21758 unsigned int *purgeable_count,
21759 unsigned int *wired_count,
21760 unsigned int *clean_count,
21761 unsigned int *dirty_count,
21762 unsigned int dirty_budget,
21763 unsigned int *shared_count,
21764 int *freezer_error_code,
21765 boolean_t eval_only)
21766{
21767 vm_map_entry_t entry2 = VM_MAP_ENTRY_NULL;
21768 kern_return_t kr = KERN_SUCCESS;
21769 boolean_t evaluation_phase = TRUE;
21770 vm_object_t cur_shared_object = NULL;
21771 int cur_shared_obj_ref_cnt = 0;
21772 unsigned int dirty_private_count = 0, dirty_shared_count = 0, obj_pages_snapshot = 0;
21773
21774 *purgeable_count = *wired_count = *clean_count = *dirty_count = *shared_count = 0;
21775
21776 /*
21777 * We need the exclusive lock here so that we can
21778 * block any page faults or lookups while we are
21779 * in the middle of freezing this vm map.
21780 */
21781 vm_map_t map = task->map;
21782
21783 vm_map_lock(map);
21784
21785 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
21786
21787 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21788 if (vm_compressor_low_on_space()) {
21789 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21790 }
21791
21792 if (vm_swap_low_on_space()) {
21793 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21794 }
21795
21796 kr = KERN_NO_SPACE;
21797 goto done;
21798 }
21799
21800 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE == FALSE) {
21801 /*
21802 * In-memory compressor backing the freezer. No disk.
21803 * So no need to do the evaluation phase.
21804 */
21805 evaluation_phase = FALSE;
21806
21807 if (eval_only == TRUE) {
21808 /*
21809 * We don't support 'eval_only' mode
21810 * in this non-swap config.
21811 */
21812 *freezer_error_code = FREEZER_ERROR_GENERIC;
21813 kr = KERN_INVALID_ARGUMENT;
21814 goto done;
21815 }
21816
21817 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21818 clock_get_uptime(&c_freezer_last_yield_ts);
21819 }
21820again:
21821
21822 for (entry2 = vm_map_first_entry(map);
21823 entry2 != vm_map_to_entry(map);
21824 entry2 = entry2->vme_next) {
21825 vm_object_t src_object;
21826
21827 if (entry2->is_sub_map) {
21828 continue;
21829 }
21830
21831 src_object = VME_OBJECT(entry2);
21832 if (!src_object ||
21833 src_object->phys_contiguous ||
21834 !src_object->internal) {
21835 continue;
21836 }
21837
21838 /* If eligible, scan the entry, moving eligible pages over to our parent object */
21839
21840 if (VM_CONFIG_FREEZER_SWAP_IS_ACTIVE) {
21841 /*
21842 * We skip purgeable objects during evaluation phase only.
21843 * If we decide to freeze this process, we'll explicitly
21844 * purge these objects before we go around again with
21845 * 'evaluation_phase' set to FALSE.
21846 */
21847
21848 if ((src_object->purgable == VM_PURGABLE_EMPTY) || (src_object->purgable == VM_PURGABLE_VOLATILE)) {
21849 /*
21850 * We want to purge objects that may not belong to this task but are mapped
21851 * in this task alone. Since we already purged this task's purgeable memory
21852 * at the end of a successful evaluation phase, we want to avoid doing no-op calls
21853 * on this task's purgeable objects. Hence the check for only volatile objects.
21854 */
21855 if (evaluation_phase ||
21856 src_object->purgable != VM_PURGABLE_VOLATILE ||
21857 src_object->ref_count != 1) {
21858 continue;
21859 }
21860 vm_object_lock(src_object);
21861 if (src_object->purgable == VM_PURGABLE_VOLATILE &&
21862 src_object->ref_count == 1) {
21863 purgeable_q_t old_queue;
21864
21865 /* object should be on a purgeable queue */
21866 assert(src_object->objq.next != NULL &&
21867 src_object->objq.prev != NULL);
21868 /* move object from its volatile queue to the nonvolatile queue */
21869 old_queue = vm_purgeable_object_remove(src_object);
21870 assert(old_queue);
21871 if (src_object->purgeable_when_ripe) {
21872 /* remove a token from that volatile queue */
21873 vm_page_lock_queues();
21874 vm_purgeable_token_delete_first(old_queue);
21875 vm_page_unlock_queues();
21876 }
21877 /* purge the object */
21878 vm_object_purge(src_object, 0);
21879 }
21880 vm_object_unlock(src_object);
21881 continue;
21882 }
21883
21884 /*
21885 * Pages belonging to this object could be swapped to disk.
21886 * Make sure it's not a shared object because we could end
21887 * up just bringing it back in again.
21888 *
21889 * We try to optimize somewhat by checking for objects that are mapped
21890 * more than once within our own map. But we don't do full searches,
21891 * we just look at the entries following our current entry.
21892 */
21893
21894 if (src_object->ref_count > 1) {
21895 if (src_object != cur_shared_object) {
21896 obj_pages_snapshot = (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21897 dirty_shared_count += obj_pages_snapshot;
21898
21899 cur_shared_object = src_object;
21900 cur_shared_obj_ref_cnt = 1;
21901 continue;
21902 } else {
21903 cur_shared_obj_ref_cnt++;
21904 if (src_object->ref_count == cur_shared_obj_ref_cnt) {
21905 /*
21906 * Fall through to below and treat this object as private.
21907 * So deduct its pages from our shared total and add it to the
21908 * private total.
21909 */
21910
21911 dirty_shared_count -= obj_pages_snapshot;
21912 dirty_private_count += obj_pages_snapshot;
21913 } else {
21914 continue;
21915 }
21916 }
21917 }
21918
21919
21920 if (src_object->ref_count == 1) {
21921 dirty_private_count += (src_object->resident_page_count - src_object->wired_page_count) + vm_compressor_pager_get_count(src_object->pager);
21922 }
21923
21924 if (evaluation_phase == TRUE) {
21925 continue;
21926 }
21927 }
21928
21929 uint32_t paged_out_count = vm_object_compressed_freezer_pageout(src_object, dirty_budget);
21930 *wired_count += src_object->wired_page_count;
21931
21932 if (vm_compressor_low_on_space() || vm_swap_low_on_space()) {
21933 if (vm_compressor_low_on_space()) {
21934 *freezer_error_code = FREEZER_ERROR_NO_COMPRESSOR_SPACE;
21935 }
21936
21937 if (vm_swap_low_on_space()) {
21938 *freezer_error_code = FREEZER_ERROR_NO_SWAP_SPACE;
21939 }
21940
21941 kr = KERN_NO_SPACE;
21942 break;
21943 }
21944 if (paged_out_count >= dirty_budget) {
21945 break;
21946 }
21947 dirty_budget -= paged_out_count;
21948 }
21949
21950 *shared_count = (unsigned int) ((dirty_shared_count * PAGE_SIZE_64) / (1024 * 1024ULL));
21951 if (evaluation_phase) {
21952 unsigned int shared_pages_threshold = (memorystatus_freeze_shared_mb_per_process_max * 1024 * 1024ULL) / PAGE_SIZE_64;
21953
21954 if (dirty_shared_count > shared_pages_threshold) {
21955 *freezer_error_code = FREEZER_ERROR_EXCESS_SHARED_MEMORY;
21956 kr = KERN_FAILURE;
21957 goto done;
21958 }
21959
21960 if (dirty_shared_count &&
21961 ((dirty_private_count / dirty_shared_count) < memorystatus_freeze_private_shared_pages_ratio)) {
21962 *freezer_error_code = FREEZER_ERROR_LOW_PRIVATE_SHARED_RATIO;
21963 kr = KERN_FAILURE;
21964 goto done;
21965 }
21966
21967 evaluation_phase = FALSE;
21968 dirty_shared_count = dirty_private_count = 0;
21969
21970 freezer_context_global.freezer_ctx_uncompressed_pages = 0;
21971 clock_get_uptime(&c_freezer_last_yield_ts);
21972
21973 if (eval_only) {
21974 kr = KERN_SUCCESS;
21975 goto done;
21976 }
21977
21978 vm_purgeable_purge_task_owned(task);
21979
21980 goto again;
21981 } else {
21982 kr = KERN_SUCCESS;
21983 }
21984
21985done:
21986 vm_map_unlock(map);
21987
21988 if ((eval_only == FALSE) && (kr == KERN_SUCCESS)) {
21989 vm_object_compressed_freezer_done();
21990 }
21991 return kr;
21992}
21993
21994#endif
21995
21996/*
21997 * vm_map_entry_should_cow_for_true_share:
21998 *
21999 * Determines if the map entry should be clipped and setup for copy-on-write
22000 * to avoid applying "true_share" to a large VM object when only a subset is
22001 * targeted.
22002 *
22003 * For now, we target only the map entries created for the Objective C
22004 * Garbage Collector, which initially have the following properties:
22005 * - alias == VM_MEMORY_MALLOC
22006 * - wired_count == 0
22007 * - !needs_copy
22008 * and a VM object with:
22009 * - internal
22010 * - copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC
22011 * - !true_share
22012 * - vo_size == ANON_CHUNK_SIZE
22013 *
22014 * Only non-kernel map entries.
22015 */
22016boolean_t
22017vm_map_entry_should_cow_for_true_share(
22018 vm_map_entry_t entry)
22019{
22020 vm_object_t object;
22021
22022 if (entry->is_sub_map) {
22023 /* entry does not point at a VM object */
22024 return FALSE;
22025 }
22026
22027 if (entry->needs_copy) {
22028 /* already set for copy_on_write: done! */
22029 return FALSE;
22030 }
22031
22032 if (VME_ALIAS(entry) != VM_MEMORY_MALLOC &&
22033 VME_ALIAS(entry) != VM_MEMORY_MALLOC_SMALL) {
22034 /* not a malloc heap or Obj-C Garbage Collector heap */
22035 return FALSE;
22036 }
22037
22038 if (entry->wired_count) {
22039 /* wired: can't change the map entry... */
22040 vm_counters.should_cow_but_wired++;
22041 return FALSE;
22042 }
22043
22044 object = VME_OBJECT(entry);
22045
22046 if (object == VM_OBJECT_NULL) {
22047 /* no object yet... */
22048 return FALSE;
22049 }
22050
22051 if (!object->internal) {
22052 /* not an internal object */
22053 return FALSE;
22054 }
22055
22056 if (object->copy_strategy != MEMORY_OBJECT_COPY_SYMMETRIC) {
22057 /* not the default copy strategy */
22058 return FALSE;
22059 }
22060
22061 if (object->true_share) {
22062 /* already true_share: too late to avoid it */
22063 return FALSE;
22064 }
22065
22066 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC &&
22067 object->vo_size != ANON_CHUNK_SIZE) {
22068 /* ... not an object created for the ObjC Garbage Collector */
22069 return FALSE;
22070 }
22071
22072 if (VME_ALIAS(entry) == VM_MEMORY_MALLOC_SMALL &&
22073 object->vo_size != 2048 * 4096) {
22074 /* ... not a "MALLOC_SMALL" heap */
22075 return FALSE;
22076 }
22077
22078 /*
22079 * All the criteria match: we have a large object being targeted for "true_share".
22080 * To limit the adverse side-effects linked with "true_share", tell the caller to
22081 * try and avoid setting up the entire object for "true_share" by clipping the
22082 * targeted range and setting it up for copy-on-write.
22083 */
22084 return TRUE;
22085}
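/*
 * Illustrative sketch (hypothetical caller, based on the pattern this helper
 * was written for): a routine about to mark memory "true_share" can consult
 * the check above and, when it returns TRUE, clip the entry to just the
 * targeted range ("start" and "length" here are placeholders) and set it up
 * for copy-on-write instead of tainting the whole object:
 *
 *	if (vm_map_entry_should_cow_for_true_share(entry) &&
 *	    VME_OBJECT(entry)->vo_size > length) {
 *		vm_map_clip_start(map, entry, start);
 *		vm_map_clip_end(map, entry, start + length);
 *		entry->needs_copy = TRUE;
 *	}
 */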
22086
22087uint64_t vm_map_range_overflows_count = 0;
22088TUNABLE_WRITEABLE(boolean_t, vm_map_range_overflows_log, "vm_map_range_overflows_log", FALSE);
22089bool
22090vm_map_range_overflows(
22091 vm_map_t map,
22092 vm_map_offset_t addr,
22093 vm_map_size_t size)
22094{
22095 vm_map_offset_t start, end, sum;
22096 vm_map_offset_t pgmask;
22097
22098 if (size == 0) {
22099 /* empty range -> no overflow */
22100 return false;
22101 }
22102 pgmask = vm_map_page_mask(map);
22103 start = vm_map_trunc_page_mask(addr, pgmask);
22104 end = vm_map_round_page_mask(addr + size, pgmask);
22105 if (__improbable(os_add_overflow(addr, size, &sum) || end <= start)) {
22106 vm_map_range_overflows_count++;
22107 if (vm_map_range_overflows_log) {
22108 printf("%d[%s] vm_map_range_overflows addr 0x%llx size 0x%llx pgmask 0x%llx\n",
22109 proc_selfpid(),
22110 proc_best_name(current_proc()),
22111 (uint64_t)addr,
22112 (uint64_t)size,
22113 (uint64_t)pgmask);
22114 }
22115 DTRACE_VM4(vm_map_range_overflows,
22116 vm_map_t, map,
22117 uint32_t, pgmask,
22118 uint64_t, (uint64_t)addr,
22119 uint64_t, (uint64_t)size);
22120 return true;
22121 }
22122 return false;
22123}
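/*
 * Usage note: callers in this file run this check on user-supplied ranges
 * before doing any page rounding, e.g.:
 *
 *	if (__improbable(vm_map_range_overflows(map, address, size))) {
 *		return KERN_INVALID_ADDRESS;
 *	}
 *
 * A zero-size range never "overflows"; a non-empty range is rejected when
 * "addr + size" wraps, or when rounding the end up to the map's page boundary
 * wraps so that the rounded end does not lie above the truncated start.
 */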
22124
22125vm_map_offset_t
22126vm_map_round_page_mask(
22127 vm_map_offset_t offset,
22128 vm_map_offset_t mask)
22129{
22130 return VM_MAP_ROUND_PAGE(offset, mask);
22131}
22132
22133vm_map_offset_t
22134vm_map_trunc_page_mask(
22135 vm_map_offset_t offset,
22136 vm_map_offset_t mask)
22137{
22138 return VM_MAP_TRUNC_PAGE(offset, mask);
22139}
22140
22141boolean_t
22142vm_map_page_aligned(
22143 vm_map_offset_t offset,
22144 vm_map_offset_t mask)
22145{
22146 return ((offset) & mask) == 0;
22147}
22148
22149int
22150vm_map_page_shift(
22151 vm_map_t map)
22152{
22153 return VM_MAP_PAGE_SHIFT(map);
22154}
22155
22156int
22157vm_map_page_size(
22158 vm_map_t map)
22159{
22160 return VM_MAP_PAGE_SIZE(map);
22161}
22162
22163vm_map_offset_t
22164vm_map_page_mask(
22165 vm_map_t map)
22166{
22167 return VM_MAP_PAGE_MASK(map);
22168}
22169
22170kern_return_t
22171vm_map_set_page_shift(
22172 vm_map_t map,
22173 int pageshift)
22174{
22175 if (map->hdr.nentries != 0) {
22176 /* too late to change page size */
22177 return KERN_FAILURE;
22178 }
22179
22180 map->hdr.page_shift = (uint16_t)pageshift;
22181
22182 return KERN_SUCCESS;
22183}
22184
22185kern_return_t
22186vm_map_query_volatile(
22187 vm_map_t map,
22188 mach_vm_size_t *volatile_virtual_size_p,
22189 mach_vm_size_t *volatile_resident_size_p,
22190 mach_vm_size_t *volatile_compressed_size_p,
22191 mach_vm_size_t *volatile_pmap_size_p,
22192 mach_vm_size_t *volatile_compressed_pmap_size_p)
22193{
22194 mach_vm_size_t volatile_virtual_size;
22195 mach_vm_size_t volatile_resident_count;
22196 mach_vm_size_t volatile_compressed_count;
22197 mach_vm_size_t volatile_pmap_count;
22198 mach_vm_size_t volatile_compressed_pmap_count;
22199 mach_vm_size_t resident_count;
22200 vm_map_entry_t entry;
22201 vm_object_t object;
22202
22203 /* map should be locked by caller */
22204
22205 volatile_virtual_size = 0;
22206 volatile_resident_count = 0;
22207 volatile_compressed_count = 0;
22208 volatile_pmap_count = 0;
22209 volatile_compressed_pmap_count = 0;
22210
22211 for (entry = vm_map_first_entry(map);
22212 entry != vm_map_to_entry(map);
22213 entry = entry->vme_next) {
22214 mach_vm_size_t pmap_resident_bytes, pmap_compressed_bytes;
22215
22216 if (entry->is_sub_map) {
22217 continue;
22218 }
22219 if (!(entry->protection & VM_PROT_WRITE)) {
22220 continue;
22221 }
22222 object = VME_OBJECT(entry);
22223 if (object == VM_OBJECT_NULL) {
22224 continue;
22225 }
22226 if (object->purgable != VM_PURGABLE_VOLATILE &&
22227 object->purgable != VM_PURGABLE_EMPTY) {
22228 continue;
22229 }
22230 if (VME_OFFSET(entry)) {
22231 /*
22232 * If the map entry has been split and the object now
22233 * appears several times in the VM map, we don't want
22234 * to count the object's resident_page_count more than
22235 * once. We count it only for the first one, starting
22236 * at offset 0 and ignore the other VM map entries.
22237 */
22238 continue;
22239 }
22240 resident_count = object->resident_page_count;
22241 if ((VME_OFFSET(entry) / PAGE_SIZE) >= resident_count) {
22242 resident_count = 0;
22243 } else {
22244 resident_count -= (VME_OFFSET(entry) / PAGE_SIZE);
22245 }
22246
22247 volatile_virtual_size += entry->vme_end - entry->vme_start;
22248 volatile_resident_count += resident_count;
22249 if (object->pager) {
22250 volatile_compressed_count +=
			    vm_compressor_pager_get_count(object->pager);
22252 }
22253 pmap_compressed_bytes = 0;
22254 pmap_resident_bytes =
		    pmap_query_resident(map->pmap,
		    entry->vme_start,
		    entry->vme_end,
		    &pmap_compressed_bytes);
22259 volatile_pmap_count += (pmap_resident_bytes / PAGE_SIZE);
22260 volatile_compressed_pmap_count += (pmap_compressed_bytes
22261 / PAGE_SIZE);
22262 }
22263
22264 /* map is still locked on return */
22265
22266 *volatile_virtual_size_p = volatile_virtual_size;
22267 *volatile_resident_size_p = volatile_resident_count * PAGE_SIZE;
22268 *volatile_compressed_size_p = volatile_compressed_count * PAGE_SIZE;
22269 *volatile_pmap_size_p = volatile_pmap_count * PAGE_SIZE;
22270 *volatile_compressed_pmap_size_p = volatile_compressed_pmap_count * PAGE_SIZE;
22271
22272 return KERN_SUCCESS;
22273}
22274
22275void
22276vm_map_sizes(vm_map_t map,
22277 vm_map_size_t * psize,
22278 vm_map_size_t * pfree,
22279 vm_map_size_t * plargest_free)
22280{
22281 vm_map_entry_t entry;
22282 vm_map_offset_t prev;
22283 vm_map_size_t free, total_free, largest_free;
22284 boolean_t end;
22285
22286 if (!map) {
22287 *psize = *pfree = *plargest_free = 0;
22288 return;
22289 }
22290 total_free = largest_free = 0;
22291
22292 vm_map_lock_read(map);
22293 if (psize) {
22294 *psize = map->max_offset - map->min_offset;
22295 }
22296
22297 prev = map->min_offset;
22298 for (entry = vm_map_first_entry(map);; entry = entry->vme_next) {
22299 end = (entry == vm_map_to_entry(map));
22300
22301 if (end) {
22302 free = entry->vme_end - prev;
22303 } else {
22304 free = entry->vme_start - prev;
22305 }
22306
22307 total_free += free;
22308 if (free > largest_free) {
22309 largest_free = free;
22310 }
22311
22312 if (end) {
22313 break;
22314 }
22315 prev = entry->vme_end;
22316 }
22317 vm_map_unlock_read(map);
22318 if (pfree) {
22319 *pfree = total_free;
22320 }
22321 if (plargest_free) {
22322 *plargest_free = largest_free;
22323 }
22324}
22325
22326#if VM_SCAN_FOR_SHADOW_CHAIN
22327int vm_map_shadow_max(vm_map_t map);
22328int
22329vm_map_shadow_max(
22330 vm_map_t map)
22331{
22332 int shadows, shadows_max;
22333 vm_map_entry_t entry;
22334 vm_object_t object, next_object;
22335
22336 if (map == NULL) {
22337 return 0;
22338 }
22339
22340 shadows_max = 0;
22341
22342 vm_map_lock_read(map);
22343
22344 for (entry = vm_map_first_entry(map);
22345 entry != vm_map_to_entry(map);
22346 entry = entry->vme_next) {
22347 if (entry->is_sub_map) {
22348 continue;
22349 }
22350 object = VME_OBJECT(entry);
22351 if (object == NULL) {
22352 continue;
22353 }
22354 vm_object_lock_shared(object);
22355 for (shadows = 0;
22356 object->shadow != NULL;
22357 shadows++, object = next_object) {
22358 next_object = object->shadow;
22359 vm_object_lock_shared(next_object);
22360 vm_object_unlock(object);
22361 }
22362 vm_object_unlock(object);
22363 if (shadows > shadows_max) {
22364 shadows_max = shadows;
22365 }
22366 }
22367
22368 vm_map_unlock_read(map);
22369
22370 return shadows_max;
22371}
22372#endif /* VM_SCAN_FOR_SHADOW_CHAIN */
22373
22374void
22375vm_commit_pagezero_status(vm_map_t lmap)
22376{
22377 pmap_advise_pagezero_range(lmap->pmap, lmap->min_offset);
22378}
22379
22380#if __x86_64__
22381void
22382vm_map_set_high_start(
22383 vm_map_t map,
22384 vm_map_offset_t high_start)
22385{
22386 map->vmmap_high_start = high_start;
22387}
22388#endif /* __x86_64__ */
22389
22390#if CODE_SIGNING_MONITOR
22391
22392kern_return_t
22393vm_map_entry_cs_associate(
22394 vm_map_t map,
22395 vm_map_entry_t entry,
22396 vm_map_kernel_flags_t vmk_flags)
22397{
22398 vm_object_t cs_object, cs_shadow, backing_object;
22399 vm_object_offset_t cs_offset, backing_offset;
22400 void *cs_blobs;
22401 struct vnode *cs_vnode;
22402 kern_return_t cs_ret;
22403
22404 if (map->pmap == NULL ||
22405 entry->is_sub_map || /* XXX FBDP: recurse on sub-range? */
22406 (csm_address_space_exempt(map->pmap) == KERN_SUCCESS) ||
22407 VME_OBJECT(entry) == VM_OBJECT_NULL) {
22408 return KERN_SUCCESS;
22409 }
22410
22411 if (!(entry->protection & VM_PROT_EXECUTE)) {
22412 /*
22413 * This memory region is not executable, so the code-signing
22414 * monitor would usually not care about it...
22415 */
22416 if (vmk_flags.vmkf_remap_prot_copy &&
22417 (entry->max_protection & VM_PROT_EXECUTE)) {
22418 /*
22419 * ... except if the memory region is being remapped
22420 * from r-x/r-x to rw-/rwx via vm_protect(VM_PROT_COPY)
22421 * which is what a debugger or dtrace would be doing
22422 * to prepare to modify an executable page to insert
22423 * a breakpoint or activate a probe.
22424 * In that case, fall through so that we can mark
22425 * this region as being "debugged" and no longer
22426 * strictly code-signed.
22427 */
22428 } else {
22429 /*
22430 * Really not executable, so no need to tell the
22431 * code-signing monitor.
22432 */
22433 return KERN_SUCCESS;
22434 }
22435 }
22436
22437 vm_map_lock_assert_exclusive(map);
22438
22439 /*
22440 * Check for a debug association mapping before we check for used_for_jit. This
22441 * allows non-RWX JIT on macOS systems to masquerade their mappings as USER_DEBUG
22442 * pages instead of USER_JIT. These non-RWX JIT pages cannot be marked as USER_JIT
22443 * since they are mapped with RW or RX permissions, which the page table monitor
22444 * denies on USER_JIT pages. Given that, if they're not mapped as USER_DEBUG,
22445 * they will be mapped as USER_EXEC, and that will cause another page table monitor
22446 * violation when those USER_EXEC pages are mapped as RW.
22447 *
22448 * Since these pages switch between RW and RX through mprotect, they mimic what
22449 * we expect a debugger to do. As the code signing monitor does not enforce mappings
22450 * on macOS systems, this works in our favor here and allows us to continue to
22451 * support these legacy-programmed applications without sacrificing security on
22452 * the page table or the code signing monitor. We don't need to explicitly check
22453 * for entry_for_jit here and the mapping permissions. If the initial mapping is
22454 * created with RX, then the application must map it as RW in order to first write
22455 * to the page (MAP_JIT mappings must be private and anonymous). The switch to
22456 * RX will cause vm_map_protect to mark the entry as vmkf_remap_prot_copy.
22457 * Similarly, if the mapping was created as RW, and then switched to RX,
22458 * vm_map_protect will again mark the entry as a copy, and both these cases
22459 * lead to this if-statement being entered.
22460 *
22461 * For more information: rdar://115313336.
22462 */
22463 if (vmk_flags.vmkf_remap_prot_copy) {
22464 cs_ret = csm_associate_debug_region(
22465 map->pmap,
22466 entry->vme_start,
22467 entry->vme_end - entry->vme_start);
22468
22469 /*
22470 * csm_associate_debug_region returns not supported when the code signing
22471 * monitor is disabled. This is intentional, since cs_ret is checked towards
22472 * the end of the function, and if it is not supported, then we still want the
22473 * VM to perform code-signing enforcement on this entry. That said, if we don't
22474 * mark this as a xnu_user_debug page when the code-signing monitor is disabled,
22475 * then it never gets retyped to XNU_USER_DEBUG frame type, which then causes
22476 * an issue with debugging (since it'll be mapped in as XNU_USER_EXEC in some
22477 * cases, which will cause a violation when attempted to be mapped as writable).
22478 */
22479 if ((cs_ret == KERN_SUCCESS) || (cs_ret == KERN_NOT_SUPPORTED)) {
22480 entry->vme_xnu_user_debug = TRUE;
22481 }
22482#if DEVELOPMENT || DEBUG
22483 if (vm_log_xnu_user_debug) {
22484 printf("FBDP %d[%s] %s:%d map %p entry %p [ 0x%llx 0x%llx ] vme_xnu_user_debug=%d cs_ret %d\n",
22485 proc_selfpid(),
22486 (get_bsdtask_info(current_task()) ? proc_name_address(get_bsdtask_info(current_task())) : "?"),
22487 __FUNCTION__, __LINE__,
22488 map, entry,
22489 (uint64_t)entry->vme_start, (uint64_t)entry->vme_end,
22490 entry->vme_xnu_user_debug,
22491 cs_ret);
22492 }
22493#endif /* DEVELOPMENT || DEBUG */
22494 goto done;
22495 }
22496
22497 if (entry->used_for_jit) {
22498 cs_ret = csm_associate_jit_region(
22499 map->pmap,
22500 entry->vme_start,
22501 entry->vme_end - entry->vme_start);
22502 goto done;
22503 }
22504
22505 cs_object = VME_OBJECT(entry);
22506 vm_object_lock_shared(cs_object);
22507 cs_offset = VME_OFFSET(entry);
22508
22509 /* find the VM object backed by the code-signed vnode */
22510 for (;;) {
22511 /* go to the bottom of cs_object's shadow chain */
22512 for (;
22513 cs_object->shadow != VM_OBJECT_NULL;
22514 cs_object = cs_shadow) {
22515 cs_shadow = cs_object->shadow;
22516 cs_offset += cs_object->vo_shadow_offset;
22517 vm_object_lock_shared(cs_shadow);
22518 vm_object_unlock(cs_object);
22519 }
22520 if (cs_object->internal ||
22521 cs_object->pager == MEMORY_OBJECT_NULL) {
22522 vm_object_unlock(cs_object);
22523 return KERN_SUCCESS;
22524 }
22525
22526 cs_offset += cs_object->paging_offset;
22527
22528 /*
22529 * cs_object could be backed by a:
22530 * vnode_pager
22531 * apple_protect_pager
22532 * shared_region_pager
22533 * fourk_pager (multiple backing objects -> fail?)
22534 * ask the pager if it has a backing VM object
22535 */
22536 if (!memory_object_backing_object(cs_object->pager,
22537 cs_offset,
22538 &backing_object,
22539 &backing_offset)) {
22540 /* no backing object: cs_object is it */
22541 break;
22542 }
22543
22544 /* look down the backing object's shadow chain */
22545 vm_object_lock_shared(backing_object);
22546 vm_object_unlock(cs_object);
22547 cs_object = backing_object;
22548 cs_offset = backing_offset;
22549 }
22550
22551 cs_vnode = vnode_pager_lookup_vnode(cs_object->pager);
22552 if (cs_vnode == NULL) {
22553 /* no vnode, no code signatures to associate */
22554 cs_ret = KERN_SUCCESS;
22555 } else {
22556 cs_ret = vnode_pager_get_cs_blobs(cs_vnode,
22557 &cs_blobs);
22558 assert(cs_ret == KERN_SUCCESS);
22559 cs_ret = cs_associate_blob_with_mapping(map->pmap,
22560 entry->vme_start,
22561 (entry->vme_end - entry->vme_start),
22562 cs_offset,
22563 cs_blobs);
22564 }
22565 vm_object_unlock(cs_object);
22566 cs_object = VM_OBJECT_NULL;
22567
22568done:
22569 if (cs_ret == KERN_SUCCESS) {
22570 DTRACE_VM2(vm_map_entry_cs_associate_success,
22571 vm_map_offset_t, entry->vme_start,
22572 vm_map_offset_t, entry->vme_end);
22573 if (vm_map_executable_immutable) {
22574 /*
22575 * Prevent this executable
22576 * mapping from being unmapped
22577 * or modified.
22578 */
22579 entry->vme_permanent = TRUE;
22580 }
22581 /*
22582 * pmap says it will validate the
22583 * code-signing validity of pages
22584 * faulted in via this mapping, so
22585 * this map entry should be marked so
22586 * that vm_fault() bypasses code-signing
22587 * validation for faults coming through
22588 * this mapping.
22589 */
22590 entry->csm_associated = TRUE;
22591 } else if (cs_ret == KERN_NOT_SUPPORTED) {
22592 /*
22593 * pmap won't check the code-signing
22594 * validity of pages faulted in via
22595 * this mapping, so VM should keep
22596 * doing it.
22597 */
22598 DTRACE_VM3(vm_map_entry_cs_associate_off,
22599 vm_map_offset_t, entry->vme_start,
22600 vm_map_offset_t, entry->vme_end,
22601 int, cs_ret);
22602 } else {
22603 /*
22604 * A real error: do not allow
22605 * execution in this mapping.
22606 */
22607 DTRACE_VM3(vm_map_entry_cs_associate_failure,
22608 vm_map_offset_t, entry->vme_start,
22609 vm_map_offset_t, entry->vme_end,
22610 int, cs_ret);
22611 if (vmk_flags.vmkf_overwrite_immutable) {
22612 /*
22613 * We can get here when we remap an apple_protect pager
22614 * on top of an already cs_associated executable mapping
22615 * with the same code signatures, so we don't want to
22616 * lose VM_PROT_EXECUTE in that case...
22617 */
22618 } else {
22619 entry->protection &= ~VM_PROT_ALLEXEC;
22620 entry->max_protection &= ~VM_PROT_ALLEXEC;
22621 }
22622 }
22623
22624 return cs_ret;
22625}
22626
22627#endif /* CODE_SIGNING_MONITOR */
22628
22629inline bool
22630vm_map_is_corpse_source(vm_map_t map)
22631{
22632 bool status = false;
22633 if (map) {
22634 vm_map_lock_read(map);
22635 status = map->corpse_source;
22636 vm_map_unlock_read(map);
22637 }
22638 return status;
22639}
22640
22641inline void
22642vm_map_set_corpse_source(vm_map_t map)
22643{
22644 if (map) {
22645 vm_map_lock(map);
22646 map->corpse_source = true;
22647 vm_map_unlock(map);
22648 }
22649}
22650
22651inline void
22652vm_map_unset_corpse_source(vm_map_t map)
22653{
22654 if (map) {
22655 vm_map_lock(map);
22656 map->corpse_source = false;
22657 vm_map_unlock(map);
22658 }
22659}
22660/*
22661 * FORKED CORPSE FOOTPRINT
22662 *
22663 * A forked corpse gets a copy of the original VM map but its pmap is mostly
22664 * empty since it never ran and never got to fault in any pages.
22665 * Collecting footprint info (via "sysctl vm.self_region_footprint") for
22666 * a forked corpse would therefore return very little information.
22667 *
22668 * When forking a corpse, we can pass the VM_MAP_FORK_CORPSE_FOOTPRINT option
22669 * to vm_map_fork() to collect footprint information from the original VM map
22670 * and its pmap, and store it in the forked corpse's VM map. That information
22671 * is stored in place of the VM map's "hole list" since we'll never need to
22672 * lookup for holes in the corpse's map.
22673 *
22674 * The corpse's footprint info looks like this:
22675 *
22676 * vm_map->vmmap_corpse_footprint points to pageable kernel memory laid out
22677 * as follows:
22678 * +---------------------------------------+
22679 * header-> | cf_size |
22680 * +-------------------+-------------------+
22681 * | cf_last_region | cf_last_zeroes |
22682 * +-------------------+-------------------+
22683 * region1-> | cfr_vaddr |
22684 * +-------------------+-------------------+
22685 * | cfr_num_pages | d0 | d1 | d2 | d3 |
22686 * +---------------------------------------+
22687 * | d4 | d5 | ... |
22688 * +---------------------------------------+
22689 * | ... |
22690 * +-------------------+-------------------+
22691 * | dy | dz | na | na | cfr_vaddr... | <-region2
22692 * +-------------------+-------------------+
22693 * | cfr_vaddr (ctd) | cfr_num_pages |
22694 * +---------------------------------------+
22695 * | d0 | d1 ... |
22696 * +---------------------------------------+
22697 * ...
22698 * +---------------------------------------+
22699 * last region-> | cfr_vaddr |
22700 * +---------------------------------------+
22701 * + cfr_num_pages | d0 | d1 | d2 | d3 |
22702 * +---------------------------------------+
22703 * ...
22704 * +---------------------------------------+
22705 * | dx | dy | dz | na | na | na | na | na |
22706 * +---------------------------------------+
22707 *
22708 * where:
22709 * cf_size: total size of the buffer (rounded to page size)
22710 * cf_last_region: offset in the buffer of the last "region" sub-header
22711 * cf_last_zeroes: number of trailing "zero" dispositions at the end
22712 * of last region
22713 * cfr_vaddr: virtual address of the start of the covered "region"
22714 * cfr_num_pages: number of pages in the covered "region"
22715 * d*: disposition of the page at that virtual address
22716 * Regions in the buffer are word-aligned.
22717 *
22718 * We estimate the size of the buffer based on the number of memory regions
22719 * and the virtual size of the address space. While copying each memory region
22720 * during vm_map_fork(), we also collect the footprint info for that region
22721 * and store it in the buffer, packing it as much as possible (coalescing
22722 * contiguous memory regions to avoid having too many region headers and
22723 * avoiding long streaks of "zero" page dispositions by splitting footprint
22724 * "regions", so the number of regions in the footprint buffer might not match
22725 * the number of memory regions in the address space.
22726 *
22727 * We also have to copy the original task's "nonvolatile" ledgers since that's
22728 * part of the footprint and will need to be reported to any tool asking for
22729 * the footprint information of the forked corpse.
22730 */
22731
22732uint64_t vm_map_corpse_footprint_count = 0;
22733uint64_t vm_map_corpse_footprint_size_avg = 0;
22734uint64_t vm_map_corpse_footprint_size_max = 0;
22735uint64_t vm_map_corpse_footprint_full = 0;
22736uint64_t vm_map_corpse_footprint_no_buf = 0;
22737
22738struct vm_map_corpse_footprint_header {
22739 vm_size_t cf_size; /* allocated buffer size */
22740 uint32_t cf_last_region; /* offset of last region in buffer */
22741 union {
22742 uint32_t cfu_last_zeroes; /* during creation:
22743 * number of "zero" dispositions at
22744 * end of last region */
22745 uint32_t cfu_hint_region; /* during lookup:
22746 * offset of last looked up region */
22747#define cf_last_zeroes cfu.cfu_last_zeroes
22748#define cf_hint_region cfu.cfu_hint_region
22749 } cfu;
22750};
22751typedef uint8_t cf_disp_t;
22752struct vm_map_corpse_footprint_region {
22753 vm_map_offset_t cfr_vaddr; /* region start virtual address */
22754 uint32_t cfr_num_pages; /* number of pages in this "region" */
22755 cf_disp_t cfr_disposition[0]; /* disposition of each page */
22756} __attribute__((packed));
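
/*
 * Worked example of the layout described above, assuming a 64-bit kernel
 * (16-byte header, 12-byte packed region header, 1 byte per disposition):
 *
 *   the first region starts at offset 16 (cf_last_region right after the
 *   header); if it covers 5 pages, its dispositions end at offset
 *   16 + 12 + 5 = 33, and the next region starts at roundup(33, 4) = 36.
 *
 * This is the same arithmetic vm_map_corpse_footprint_new_region() uses
 * below when it closes one region and opens the next.
 */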
22757
22758static cf_disp_t
22759vm_page_disposition_to_cf_disp(
22760 int disposition)
22761{
22762 assert(sizeof(cf_disp_t) == 1);
22763 /* relocate bits that don't fit in a "uint8_t" */
22764 if (disposition & VM_PAGE_QUERY_PAGE_REUSABLE) {
22765 disposition |= VM_PAGE_QUERY_PAGE_FICTITIOUS;
22766 }
22767 /* cast gets rid of extra bits */
22768 return (cf_disp_t) disposition;
22769}
22770
22771static int
22772vm_page_cf_disp_to_disposition(
22773 cf_disp_t cf_disp)
22774{
22775 int disposition;
22776
22777 assert(sizeof(cf_disp_t) == 1);
22778 disposition = (int) cf_disp;
22779 /* move relocated bits back in place */
22780 if (cf_disp & VM_PAGE_QUERY_PAGE_FICTITIOUS) {
22781 disposition |= VM_PAGE_QUERY_PAGE_REUSABLE;
22782 disposition &= ~VM_PAGE_QUERY_PAGE_FICTITIOUS;
22783 }
22784 return disposition;
22785}
22786
22787/*
22788 * vm_map_corpse_footprint_new_region:
22789 * closes the current footprint "region" and creates a new one
22790 *
22791 * Returns NULL if there's not enough space in the buffer for a new region.
22792 */
22793static struct vm_map_corpse_footprint_region *
22794vm_map_corpse_footprint_new_region(
22795 struct vm_map_corpse_footprint_header *footprint_header)
22796{
22797 uintptr_t footprint_edge;
22798 uint32_t new_region_offset;
22799 struct vm_map_corpse_footprint_region *footprint_region;
22800 struct vm_map_corpse_footprint_region *new_footprint_region;
22801
22802 footprint_edge = ((uintptr_t)footprint_header +
22803 footprint_header->cf_size);
22804 footprint_region = ((struct vm_map_corpse_footprint_region *)
22805 ((char *)footprint_header +
22806 footprint_header->cf_last_region));
22807 assert((uintptr_t)footprint_region + sizeof(*footprint_region) <=
22808 footprint_edge);
22809
22810 /* get rid of trailing zeroes in the last region */
22811 assert(footprint_region->cfr_num_pages >=
22812 footprint_header->cf_last_zeroes);
22813 footprint_region->cfr_num_pages -=
22814 footprint_header->cf_last_zeroes;
22815 footprint_header->cf_last_zeroes = 0;
22816
22817 /* reuse this region if it's now empty */
22818 if (footprint_region->cfr_num_pages == 0) {
22819 return footprint_region;
22820 }
22821
22822 /* compute offset of new region */
22823 new_region_offset = footprint_header->cf_last_region;
22824 new_region_offset += sizeof(*footprint_region);
22825 new_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
22826 new_region_offset = roundup(new_region_offset, sizeof(int));
22827
22828 /* check if we're going over the edge */
22829 if (((uintptr_t)footprint_header +
22830 new_region_offset +
22831 sizeof(*footprint_region)) >=
22832 footprint_edge) {
22833 /* over the edge: no new region */
22834 return NULL;
22835 }
22836
22837 /* adjust offset of last region in header */
22838 footprint_header->cf_last_region = new_region_offset;
22839
22840 new_footprint_region = (struct vm_map_corpse_footprint_region *)
22841 ((char *)footprint_header +
22842 footprint_header->cf_last_region);
22843 new_footprint_region->cfr_vaddr = 0;
22844 new_footprint_region->cfr_num_pages = 0;
22845 /* caller needs to initialize new region */
22846
22847 return new_footprint_region;
22848}
22849
22850/*
22851 * vm_map_corpse_footprint_collect:
22852 * collect footprint information for "old_entry" in "old_map" and
22853 * stores it in "new_map"'s vmmap_footprint_info.
22854 */
22855kern_return_t
22856vm_map_corpse_footprint_collect(
22857 vm_map_t old_map,
22858 vm_map_entry_t old_entry,
22859 vm_map_t new_map)
22860{
22861 vm_map_offset_t va;
22862 kern_return_t kr;
22863 struct vm_map_corpse_footprint_header *footprint_header;
22864 struct vm_map_corpse_footprint_region *footprint_region;
22865 struct vm_map_corpse_footprint_region *new_footprint_region;
22866 cf_disp_t *next_disp_p;
22867 uintptr_t footprint_edge;
22868 uint32_t num_pages_tmp;
22869 int effective_page_size;
22870
22871 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(old_map));
22872
22873 va = old_entry->vme_start;
22874
22875 vm_map_lock_assert_exclusive(old_map);
22876 vm_map_lock_assert_exclusive(new_map);
22877
22878 assert(new_map->has_corpse_footprint);
22879 assert(!old_map->has_corpse_footprint);
22880 if (!new_map->has_corpse_footprint ||
22881 old_map->has_corpse_footprint) {
22882 /*
22883 * This can only transfer footprint info from a
22884 * map with a live pmap to a map with a corpse footprint.
22885 */
22886 return KERN_NOT_SUPPORTED;
22887 }
22888
22889 if (new_map->vmmap_corpse_footprint == NULL) {
22890 vm_offset_t buf;
22891 vm_size_t buf_size;
22892
22893 buf = 0;
22894 buf_size = (sizeof(*footprint_header) +
22895 (old_map->hdr.nentries
22896 *
22897 (sizeof(*footprint_region) +
22898 +3)) /* potential alignment for each region */
22899 +
22900 ((old_map->size / effective_page_size)
22901 *
22902 sizeof(cf_disp_t))); /* disposition for each page */
22903// printf("FBDP corpse map %p guestimate footprint size 0x%llx\n", new_map, (uint64_t) buf_size);
		buf_size = round_page(buf_size);
22905
22906 /* limit buffer to 1 page to validate overflow detection */
22907// buf_size = PAGE_SIZE;
22908
22909 /* limit size to a somewhat sane amount */
22910#if XNU_TARGET_OS_OSX
22911#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (8*1024*1024) /* 8MB */
22912#else /* XNU_TARGET_OS_OSX */
22913#define VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE (256*1024) /* 256KB */
22914#endif /* XNU_TARGET_OS_OSX */
22915 if (buf_size > VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE) {
22916 buf_size = VM_MAP_CORPSE_FOOTPRINT_INFO_MAX_SIZE;
22917 }
22918
22919 /*
22920 * Allocate the pageable buffer (with a trailing guard page).
22921 * It will be zero-filled on demand.
22922 */
		kr = kmem_alloc(kernel_map, &buf, buf_size + PAGE_SIZE,
		    KMA_DATA | KMA_PAGEABLE | KMA_GUARD_LAST,
		    VM_KERN_MEMORY_DIAG);
22926 if (kr != KERN_SUCCESS) {
22927 vm_map_corpse_footprint_no_buf++;
22928 return kr;
22929 }
22930
22931 /* initialize header and 1st region */
22932 footprint_header = (struct vm_map_corpse_footprint_header *)buf;
22933 new_map->vmmap_corpse_footprint = footprint_header;
22934
22935 footprint_header->cf_size = buf_size;
22936 footprint_header->cf_last_region =
22937 sizeof(*footprint_header);
22938 footprint_header->cf_last_zeroes = 0;
22939
22940 footprint_region = (struct vm_map_corpse_footprint_region *)
22941 ((char *)footprint_header +
22942 footprint_header->cf_last_region);
22943 footprint_region->cfr_vaddr = 0;
22944 footprint_region->cfr_num_pages = 0;
22945 } else {
22946 /* retrieve header and last region */
22947 footprint_header = (struct vm_map_corpse_footprint_header *)
22948 new_map->vmmap_corpse_footprint;
22949 footprint_region = (struct vm_map_corpse_footprint_region *)
22950 ((char *)footprint_header +
22951 footprint_header->cf_last_region);
22952 }
22953 footprint_edge = ((uintptr_t)footprint_header +
22954 footprint_header->cf_size);
22955
22956 if ((footprint_region->cfr_vaddr +
22957 (((vm_map_offset_t)footprint_region->cfr_num_pages) *
22958 effective_page_size))
22959 != old_entry->vme_start) {
22960 uint64_t num_pages_delta, num_pages_delta_size;
22961 uint32_t region_offset_delta_size;
22962
22963 /*
22964 * Not the next contiguous virtual address:
22965 * start a new region or store "zero" dispositions for
22966 * the missing pages?
22967 */
22968 /* size of gap in actual page dispositions */
22969 num_pages_delta = ((old_entry->vme_start -
22970 footprint_region->cfr_vaddr) / effective_page_size)
22971 - footprint_region->cfr_num_pages;
22972 num_pages_delta_size = num_pages_delta * sizeof(cf_disp_t);
22973 /* size of gap as a new footprint region header */
22974 region_offset_delta_size =
22975 (sizeof(*footprint_region) +
22976 roundup(((footprint_region->cfr_num_pages -
22977 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)),
22978 sizeof(int)) -
22979 ((footprint_region->cfr_num_pages -
22980 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)));
22981// printf("FBDP %s:%d region 0x%x 0x%llx 0x%x vme_start 0x%llx pages_delta 0x%llx region_delta 0x%x\n", __FUNCTION__, __LINE__, footprint_header->cf_last_region, footprint_region->cfr_vaddr, footprint_region->cfr_num_pages, old_entry->vme_start, num_pages_delta, region_offset_delta);
22982 if (region_offset_delta_size < num_pages_delta_size ||
22983 os_add3_overflow(footprint_region->cfr_num_pages,
22984 (uint32_t) num_pages_delta,
22985 1,
22986 &num_pages_tmp)) {
22987 /*
22988 * Storing data for this gap would take more space
22989 * than inserting a new footprint region header:
22990 * let's start a new region and save space. If it's a
22991 * tie, let's avoid using a new region, since that
22992 * would require more region hops to find the right
22993 * range during lookups.
22994 *
22995 * If the current region's cfr_num_pages would overflow
22996 * if we added "zero" page dispositions for the gap,
22997 * no choice but to start a new region.
22998 */
22999// printf("FBDP %s:%d new region\n", __FUNCTION__, __LINE__);
23000 new_footprint_region =
23001 vm_map_corpse_footprint_new_region(footprint_header);
23002 /* check that we're not going over the edge */
23003 if (new_footprint_region == NULL) {
23004 goto over_the_edge;
23005 }
23006 footprint_region = new_footprint_region;
23007 /* initialize new region as empty */
23008 footprint_region->cfr_vaddr = old_entry->vme_start;
23009 footprint_region->cfr_num_pages = 0;
23010 } else {
23011 /*
23012 * Store "zero" page dispositions for the missing
23013 * pages.
23014 */
23015// printf("FBDP %s:%d zero gap\n", __FUNCTION__, __LINE__);
23016 for (; num_pages_delta > 0; num_pages_delta--) {
23017 next_disp_p = (cf_disp_t *)
23018 ((uintptr_t) footprint_region +
23019 sizeof(*footprint_region));
23020 next_disp_p += footprint_region->cfr_num_pages;
23021 /* check that we're not going over the edge */
23022 if ((uintptr_t)next_disp_p >= footprint_edge) {
23023 goto over_the_edge;
23024 }
23025 /* store "zero" disposition for this gap page */
23026 footprint_region->cfr_num_pages++;
23027 *next_disp_p = (cf_disp_t) 0;
23028 footprint_header->cf_last_zeroes++;
23029 }
23030 }
23031 }
23032
23033 for (va = old_entry->vme_start;
23034 va < old_entry->vme_end;
23035 va += effective_page_size) {
23036 int disposition;
23037 cf_disp_t cf_disp;
23038
		vm_map_footprint_query_page_info(old_map,
		    old_entry,
		    va,
		    &disposition);
23043 cf_disp = vm_page_disposition_to_cf_disp(disposition);
23044
23045// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP collect map %p va 0x%llx disp 0x%x\n", new_map, va, disp);
23046
23047 if (cf_disp == 0 && footprint_region->cfr_num_pages == 0) {
23048 /*
23049 * Ignore "zero" dispositions at start of
23050 * region: just move start of region.
23051 */
23052 footprint_region->cfr_vaddr += effective_page_size;
23053 continue;
23054 }
23055
23056 /* would region's cfr_num_pages overflow? */
23057 if (os_add_overflow(footprint_region->cfr_num_pages, 1,
23058 &num_pages_tmp)) {
23059 /* overflow: create a new region */
23060 new_footprint_region =
23061 vm_map_corpse_footprint_new_region(
23062 footprint_header);
23063 if (new_footprint_region == NULL) {
23064 goto over_the_edge;
23065 }
23066 footprint_region = new_footprint_region;
23067 footprint_region->cfr_vaddr = va;
23068 footprint_region->cfr_num_pages = 0;
23069 }
23070
23071 next_disp_p = (cf_disp_t *) ((uintptr_t) footprint_region +
23072 sizeof(*footprint_region));
23073 next_disp_p += footprint_region->cfr_num_pages;
23074 /* check that we're not going over the edge */
23075 if ((uintptr_t)next_disp_p >= footprint_edge) {
23076 goto over_the_edge;
23077 }
		/* store this disposition */
23079 *next_disp_p = cf_disp;
23080 footprint_region->cfr_num_pages++;
23081
23082 if (cf_disp != 0) {
23083 /* non-zero disp: break the current zero streak */
23084 footprint_header->cf_last_zeroes = 0;
23085 /* done */
23086 continue;
23087 }
23088
23089 /* zero disp: add to the current streak of zeroes */
23090 footprint_header->cf_last_zeroes++;
23091 if ((footprint_header->cf_last_zeroes +
23092 roundup(((footprint_region->cfr_num_pages -
23093 footprint_header->cf_last_zeroes) * sizeof(cf_disp_t)) &
23094 (sizeof(int) - 1),
23095 sizeof(int))) <
23096 (sizeof(*footprint_header))) {
23097 /*
23098 * There are not enough trailing "zero" dispositions
23099 * (+ the extra padding we would need for the previous
23100 * region); creating a new region would not save space
23101 * at this point, so let's keep this "zero" disposition
23102 * in this region and reconsider later.
23103 */
23104 continue;
23105 }
23106 /*
23107 * Create a new region to avoid having too many consecutive
23108 * "zero" dispositions.
23109 */
23110 new_footprint_region =
23111 vm_map_corpse_footprint_new_region(footprint_header);
23112 if (new_footprint_region == NULL) {
23113 goto over_the_edge;
23114 }
23115 footprint_region = new_footprint_region;
23116 /* initialize the new region as empty ... */
23117 footprint_region->cfr_num_pages = 0;
23118 /* ... and skip this "zero" disp */
23119 footprint_region->cfr_vaddr = va + effective_page_size;
23120 }
23121
23122 return KERN_SUCCESS;
23123
23124over_the_edge:
23125// printf("FBDP map %p footprint was full for va 0x%llx\n", new_map, va);
23126 vm_map_corpse_footprint_full++;
23127 return KERN_RESOURCE_SHORTAGE;
23128}
23129
23130/*
23131 * vm_map_corpse_footprint_collect_done:
23132 * completes the footprint collection by getting rid of any remaining
23133 * trailing "zero" dispositions and trimming the unused part of the
23134 * kernel buffer
23135 */
23136void
23137vm_map_corpse_footprint_collect_done(
23138 vm_map_t new_map)
23139{
23140 struct vm_map_corpse_footprint_header *footprint_header;
23141 struct vm_map_corpse_footprint_region *footprint_region;
23142 vm_size_t buf_size, actual_size;
23143 kern_return_t kr;
23144
23145 assert(new_map->has_corpse_footprint);
23146 if (!new_map->has_corpse_footprint ||
23147 new_map->vmmap_corpse_footprint == NULL) {
23148 return;
23149 }
23150
23151 footprint_header = (struct vm_map_corpse_footprint_header *)
23152 new_map->vmmap_corpse_footprint;
23153 buf_size = footprint_header->cf_size;
23154
23155 footprint_region = (struct vm_map_corpse_footprint_region *)
23156 ((char *)footprint_header +
23157 footprint_header->cf_last_region);
23158
23159 /* get rid of trailing zeroes in last region */
23160 assert(footprint_region->cfr_num_pages >= footprint_header->cf_last_zeroes);
23161 footprint_region->cfr_num_pages -= footprint_header->cf_last_zeroes;
23162 footprint_header->cf_last_zeroes = 0;
23163
23164 actual_size = (vm_size_t)(footprint_header->cf_last_region +
23165 sizeof(*footprint_region) +
23166 (footprint_region->cfr_num_pages * sizeof(cf_disp_t)));
23167
23168// printf("FBDP map %p buf_size 0x%llx actual_size 0x%llx\n", new_map, (uint64_t) buf_size, (uint64_t) actual_size);
23169 vm_map_corpse_footprint_size_avg =
23170 (((vm_map_corpse_footprint_size_avg *
23171 vm_map_corpse_footprint_count) +
23172 actual_size) /
23173 (vm_map_corpse_footprint_count + 1));
23174 vm_map_corpse_footprint_count++;
23175 if (actual_size > vm_map_corpse_footprint_size_max) {
23176 vm_map_corpse_footprint_size_max = actual_size;
23177 }
23178
	actual_size = round_page(actual_size);
23180 if (buf_size > actual_size) {
		kr = vm_deallocate(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size +
		    PAGE_SIZE), /* trailing guard page */
		    (buf_size - actual_size));
23186 assertf(kr == KERN_SUCCESS,
23187 "trim: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23188 footprint_header,
23189 (uint64_t) buf_size,
23190 (uint64_t) actual_size,
23191 kr);
		kr = vm_protect(kernel_map,
		    ((vm_address_t)footprint_header +
		    actual_size),
23195 PAGE_SIZE,
23196 FALSE, /* set_maximum */
23197 VM_PROT_NONE);
23198 assertf(kr == KERN_SUCCESS,
23199 "guard: footprint_header %p buf_size 0x%llx actual_size 0x%llx kr=0x%x\n",
23200 footprint_header,
23201 (uint64_t) buf_size,
23202 (uint64_t) actual_size,
23203 kr);
23204 }
23205
23206 footprint_header->cf_size = actual_size;
23207}
23208
23209/*
23210 * vm_map_corpse_footprint_query_page_info:
23211 * retrieves the disposition of the page at virtual address "vaddr"
23212 * in the forked corpse's VM map
23213 *
23214 * This is the equivalent of vm_map_footprint_query_page_info() for a forked corpse.
23215 */
23216kern_return_t
23217vm_map_corpse_footprint_query_page_info(
23218 vm_map_t map,
23219 vm_map_offset_t va,
23220 int *disposition_p)
23221{
23222 struct vm_map_corpse_footprint_header *footprint_header;
23223 struct vm_map_corpse_footprint_region *footprint_region;
23224 uint32_t footprint_region_offset;
23225 vm_map_offset_t region_start, region_end;
23226 int disp_idx;
23227 kern_return_t kr;
23228 int effective_page_size;
23229 cf_disp_t cf_disp;
23230
23231 if (!map->has_corpse_footprint) {
23232 *disposition_p = 0;
23233 kr = KERN_INVALID_ARGUMENT;
23234 goto done;
23235 }
23236
23237 footprint_header = map->vmmap_corpse_footprint;
23238 if (footprint_header == NULL) {
23239 *disposition_p = 0;
23240// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23241 kr = KERN_INVALID_ARGUMENT;
23242 goto done;
23243 }
23244
23245 /* start looking at the hint ("cf_hint_region") */
23246 footprint_region_offset = footprint_header->cf_hint_region;
23247
23248 effective_page_size = MIN(PAGE_SIZE, VM_MAP_PAGE_SIZE(map));
23249
23250lookup_again:
23251 if (footprint_region_offset < sizeof(*footprint_header)) {
23252 /* hint too low: start from 1st region */
23253 footprint_region_offset = sizeof(*footprint_header);
23254 }
23255 if (footprint_region_offset >= footprint_header->cf_last_region) {
23256 /* hint too high: re-start from 1st region */
23257 footprint_region_offset = sizeof(*footprint_header);
23258 }
23259 footprint_region = (struct vm_map_corpse_footprint_region *)
23260 ((char *)footprint_header + footprint_region_offset);
23261 region_start = footprint_region->cfr_vaddr;
23262 region_end = (region_start +
23263 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23264 effective_page_size));
23265 if (va < region_start &&
23266 footprint_region_offset != sizeof(*footprint_header)) {
23267 /* our range starts before the hint region */
23268
23269 /* reset the hint (in a racy way...) */
23270 footprint_header->cf_hint_region = sizeof(*footprint_header);
23271 /* lookup "va" again from 1st region */
23272 footprint_region_offset = sizeof(*footprint_header);
23273 goto lookup_again;
23274 }
23275
23276 while (va >= region_end) {
23277 if (footprint_region_offset >= footprint_header->cf_last_region) {
23278 break;
23279 }
23280 /* skip the region's header */
23281 footprint_region_offset += sizeof(*footprint_region);
23282 /* skip the region's page dispositions */
23283 footprint_region_offset += (footprint_region->cfr_num_pages * sizeof(cf_disp_t));
23284 /* align to next word boundary */
23285 footprint_region_offset =
23286 roundup(footprint_region_offset,
23287 sizeof(int));
23288 footprint_region = (struct vm_map_corpse_footprint_region *)
23289 ((char *)footprint_header + footprint_region_offset);
23290 region_start = footprint_region->cfr_vaddr;
23291 region_end = (region_start +
23292 ((vm_map_offset_t)(footprint_region->cfr_num_pages) *
23293 effective_page_size));
23294 }
23295 if (va < region_start || va >= region_end) {
23296 /* page not found */
23297 *disposition_p = 0;
23298// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23299 kr = KERN_SUCCESS;
23300 goto done;
23301 }
23302
23303 /* "va" found: set the lookup hint for next lookup (in a racy way...) */
23304 footprint_header->cf_hint_region = footprint_region_offset;
23305
23306 /* get page disposition for "va" in this region */
23307 disp_idx = (int) ((va - footprint_region->cfr_vaddr) / effective_page_size);
23308 cf_disp = footprint_region->cfr_disposition[disp_idx];
23309 *disposition_p = vm_page_cf_disp_to_disposition(cf_disp);
23310 kr = KERN_SUCCESS;
23311done:
23312// if (va < SHARED_REGION_BASE_ARM64) printf("FBDP %d query map %p va 0x%llx disp 0x%x\n", __LINE__, map, va, *disposition_p);
23313 /* dtrace -n 'vminfo:::footprint_query_page_info { printf("map 0x%p va 0x%llx disp 0x%x kr 0x%x", arg0, arg1, arg2, arg3); }' */
23314 DTRACE_VM4(footprint_query_page_info,
23315 vm_map_t, map,
23316 vm_map_offset_t, va,
23317 int, *disposition_p,
23318 kern_return_t, kr);
23319
23320 return kr;
23321}
23322
23323void
23324vm_map_corpse_footprint_destroy(
23325 vm_map_t map)
23326{
23327 if (map->has_corpse_footprint &&
23328 map->vmmap_corpse_footprint != 0) {
23329 struct vm_map_corpse_footprint_header *footprint_header;
23330 vm_size_t buf_size;
23331 kern_return_t kr;
23332
23333 footprint_header = map->vmmap_corpse_footprint;
23334 buf_size = footprint_header->cf_size;
		kr = vm_deallocate(kernel_map,
		    (vm_offset_t) map->vmmap_corpse_footprint,
		    ((vm_size_t) buf_size
		    + PAGE_SIZE)); /* trailing guard page */
23339 assertf(kr == KERN_SUCCESS, "kr=0x%x\n", kr);
23340 map->vmmap_corpse_footprint = 0;
23341 map->has_corpse_footprint = FALSE;
23342 }
23343}
23344
23345/*
23346 * vm_map_copy_footprint_ledgers:
23347 * copies any ledger that's relevant to the memory footprint of "old_task"
23348 * into the forked corpse's task ("new_task")
23349 */
23350void
23351vm_map_copy_footprint_ledgers(
23352 task_t old_task,
23353 task_t new_task)
23354{
	vm_map_copy_ledger(old_task, new_task, task_ledgers.phys_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.purgeable_nonvolatile_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.internal_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.iokit_mapped);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.alternate_accounting_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.page_table);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.tagged_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.network_nonvolatile_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.media_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.graphics_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.neural_footprint_compressed);
	vm_map_copy_ledger(old_task, new_task, task_ledgers.wired_mem);
23375}
23376
23377/*
23378 * vm_map_copy_ledger:
23379 * copy a single ledger from "old_task" to "new_task"
23380 */
23381void
23382vm_map_copy_ledger(
23383 task_t old_task,
23384 task_t new_task,
23385 int ledger_entry)
23386{
23387 ledger_amount_t old_balance, new_balance, delta;
23388
23389 assert(new_task->map->has_corpse_footprint);
23390 if (!new_task->map->has_corpse_footprint) {
23391 return;
23392 }
23393
23394 /* turn off sanity checks for the ledger we're about to mess with */
	ledger_disable_panic_on_negative(new_task->ledger,
	    ledger_entry);
23397
23398 /* adjust "new_task" to match "old_task" */
	ledger_get_balance(old_task->ledger,
	    ledger_entry,
	    &old_balance);
	ledger_get_balance(new_task->ledger,
	    ledger_entry,
	    &new_balance);
23405 if (new_balance == old_balance) {
23406 /* new == old: done */
23407 } else if (new_balance > old_balance) {
23408 /* new > old ==> new -= new - old */
23409 delta = new_balance - old_balance;
		ledger_debit(new_task->ledger,
		    ledger_entry,
		    delta);
23413 } else {
23414 /* new < old ==> new += old - new */
23415 delta = old_balance - new_balance;
		ledger_credit(new_task->ledger,
		    ledger_entry,
		    delta);
23419 }
23420}
23421
23422/*
23423 * vm_map_get_pmap:
23424 * returns the pmap associated with the vm_map
23425 */
23426pmap_t
23427vm_map_get_pmap(vm_map_t map)
23428{
23429 return vm_map_pmap(map);
23430}
23431
23432#if CONFIG_MAP_RANGES
23433static bitmap_t vm_map_user_range_heap_map[BITMAP_LEN(VM_MEMORY_COUNT)];
23434
23435static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23436static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23437
23438/*
23439 * vm_map_range_map_init:
23440 * initializes the VM range ID map to enable index lookup
23441 * of user VM ranges based on VM tag from userspace.
23442 */
23443static void
23444vm_map_range_map_init(void)
23445{
23446 /*
23447 * VM_MEMORY_MALLOC{,_NANO} are skipped on purpose:
23448 * - the former is malloc metadata which should be kept separate
23449 * - the latter has its own ranges
23450 */
23451 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_HUGE);
23452 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE);
23453 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_LARGE_REUSED);
23454 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_MEDIUM);
23455 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_PROB_GUARD);
23456 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_SMALL);
23457 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_MALLOC_TINY);
23458 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_TCMALLOC);
23459 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LIBNETWORK);
23460 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOACCELERATOR);
23461 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IOSURFACE);
23462 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_IMAGEIO);
23463 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREGRAPHICS);
23464 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_CORESERVICES);
23465 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_COREDATA);
23466 bitmap_set(vm_map_user_range_heap_map, VM_MEMORY_LAYERKIT);
23467}
23468
23469static struct mach_vm_range
23470vm_map_range_random_uniform(
23471 vm_map_size_t req_size,
23472 vm_map_offset_t min_addr,
23473 vm_map_offset_t max_addr,
23474 vm_map_offset_t offmask)
23475{
23476 vm_map_offset_t random_addr;
23477 struct mach_vm_range alloc;
23478
23479 req_size = (req_size + offmask) & ~offmask;
23480 min_addr = (min_addr + offmask) & ~offmask;
23481 max_addr = max_addr & ~offmask;
23482
23483 read_random(&random_addr, sizeof(random_addr));
23484 random_addr %= (max_addr - req_size - min_addr);
23485 random_addr &= ~offmask;
23486
23487 alloc.min_address = min_addr + random_addr;
23488 alloc.max_address = min_addr + random_addr + req_size;
23489 return alloc;
23490}
23491
23492static vm_map_offset_t
23493vm_map_range_offmask(void)
23494{
23495 uint32_t pte_depth;
23496
23497 /*
23498 * PTE optimizations
23499 *
23500 *
23501 * 16k pages systems
23502 * ~~~~~~~~~~~~~~~~~
23503 *
23504 * A single L1 (sub-)page covers the address space.
23505 * - L2 pages cover 64G,
23506 * - L3 pages cover 32M.
23507 *
23508 * On embedded, the dynamic VA range is 64G and uses a single L2 page.
23509 * As a result, we really only need to align the ranges to 32M to avoid
23510 * partial L3 pages.
23511 *
23512 * On macOS, the usage of L2 pages will increase, so as a result we will
23513 * want to align ranges to 64G in order to utilize them fully.
23514 *
23515 *
23516 * 4k pages systems
23517 * ~~~~~~~~~~~~~~~~
23518 *
23519 * A single L0 (sub-)page covers the address space.
23520 * - L1 pages cover 512G,
23521 * - L2 pages cover 1G,
23522 * - L3 pages cover 2M.
23523 *
23524 * The long tail of processes on a system will tend to have a VA usage
	 * (ignoring the shared regions) in the 100s of MB order of magnitude.
23526 * This is achievable with a single L1 and a few L2s without
23527 * randomization.
23528 *
23529 * However once randomization is introduced, the system will immediately
23530 * need several L1s and many more L2s. As a result:
23531 *
23532 * - on embedded devices, the cost of these extra pages isn't
23533 * sustainable, and we just disable the feature entirely,
23534 *
23535 * - on macOS we align ranges to a 512G boundary so that the extra L1
23536 * pages can be used to their full potential.
23537 */
23538
23539 /*
23540 * note, this function assumes _non exotic mappings_
23541 * which is why it uses the native kernel's PAGE_SHIFT.
23542 */
23543#if XNU_PLATFORM_MacOSX
23544 pte_depth = PAGE_SHIFT > 12 ? 2 : 3;
23545#else /* !XNU_PLATFORM_MacOSX */
23546 pte_depth = PAGE_SHIFT > 12 ? 1 : 0;
23547#endif /* !XNU_PLATFORM_MacOSX */
23548
23549 if (pte_depth == 0) {
23550 return 0;
23551 }
23552
23553 return (1ull << ((PAGE_SHIFT - 3) * pte_depth + PAGE_SHIFT)) - 1;
23554}
23555
23556/*
23557 * vm_map_range_configure:
23558 * configures the user vm_map ranges by increasing the maximum VA range of
23559 * the map and carving out a range at the end of VA space (searching backwards
23560 * in the newly expanded map).
23561 */
23562kern_return_t
23563vm_map_range_configure(vm_map_t map)
23564{
23565 const vm_map_offset_t offmask = vm_map_range_offmask();
23566 struct mach_vm_range data_range;
23567 vm_map_offset_t default_end;
23568 kern_return_t kr;
23569
23570 if (!vm_map_is_64bit(map) || vm_map_is_exotic(map) || offmask == 0) {
23571 /*
23572 * No point doing vm ranges in a 32bit address space.
23573 */
23574 return KERN_NOT_SUPPORTED;
23575 }
23576
23577 /* Should not be applying ranges to kernel map or kernel map submaps */
23578 assert(vm_map_pmap(map) != kernel_pmap);
23579
23580#if XNU_PLATFORM_MacOSX
23581
23582 /*
23583 * on macOS, the address space is a massive 47 bits (128T),
23584 * with several carve outs that processes can't use:
23585 * - the shared region
23586 * - the commpage region
23587 * - the GPU carve out (if applicable)
23588 *
23589 * and when nano-malloc is in use it desires memory at the 96T mark.
23590 *
23591 * However, their location is architecture dependent:
23592 * - On intel, the shared region and commpage are
23593 * at the very end of the usable address space (above +127T),
23594 * and there is no GPU carve out, and pthread wants to place
23595 * threads at the 112T mark (0x70T).
23596 *
23597 * - On arm64, these are in the same spot as on embedded devices:
23598 * o shared region: [ 6G, 10G) [ will likely grow over time ]
23599 * o commpage region: [63G, 64G)
23600 * o GPU carve out: [64G, 448G)
23601 *
	 * This is convenient because the mappings at the end of the address
23603 * space (when they exist) are made by the kernel.
23604 *
23605 * The policy is to allocate a random 1T for the data heap
23606 * in the end of the address-space in the:
23607 * - [0x71, 0x7f) range on Intel (to leave space for pthread stacks)
23608 * - [0x61, 0x7f) range on ASM (to leave space for Nano malloc).
23609 */
23610
23611 /* see NANOZONE_SIGNATURE in libmalloc */
23612#if __x86_64__
23613 default_end = 0x71ull << 40;
23614#else
23615 default_end = 0x61ull << 40;
23616#endif
23617 data_range = vm_map_range_random_uniform(1ull << 40,
23618 default_end, 0x7full << 40, offmask);
23619
23620#else /* !XNU_PLATFORM_MacOSX */
23621
23622 /*
23623 * Embedded devices:
23624 *
23625 * The default VA Size scales with the device physical memory.
23626 *
23627 * Out of that:
23628 * - the "zero" page typically uses 4G + some slide
23629 * - the shared region uses SHARED_REGION_SIZE bytes (4G)
23630 *
23631 * Without the use of jumbo or any adjustment to the address space,
23632 * a default VM map typically looks like this:
23633 *
23634 * 0G -->╒════════════╕
23635 * │ pagezero │
23636 * │ + slide │
23637 * ~4G -->╞════════════╡<-- vm_map_min(map)
23638 * │ │
23639 * 6G -->├────────────┤
23640 * │ shared │
23641 * │ region │
23642 * 10G -->├────────────┤
23643 * │ │
23644 * max_va -->├────────────┤<-- vm_map_max(map)
23645 * │ │
23646 * ╎ jumbo ╎
23647 * ╎ ╎
23648 * │ │
23649 * 63G -->╞════════════╡<-- MACH_VM_MAX_ADDRESS
23650 * │ commpage │
23651 * 64G -->├────────────┤<-- MACH_VM_MIN_GPU_CARVEOUT_ADDRESS
23652 * │ │
23653 * ╎ GPU ╎
23654 * ╎ carveout ╎
23655 * │ │
23656 * 448G -->├────────────┤<-- MACH_VM_MAX_GPU_CARVEOUT_ADDRESS
23657 * │ │
23658 * ╎ ╎
23659 * ╎ ╎
23660 * │ │
23661 * 512G -->╘════════════╛<-- (1ull << ARM_16K_TT_L1_SHIFT)
23662 *
23663 * When this drawing was made, "max_va" was smaller than
23664 * ARM64_MAX_OFFSET_DEVICE_LARGE (~15.5G), leaving shy of
23665 * 12G of address space for the zero-page, slide, files,
23666 * binaries, heap ...
23667 *
23668 * We will want to make a "heap/data" carve out inside
23669 * the jumbo range of half of that usable space, assuming
	 * that this is less than a fourth of the jumbo range.
23671 *
23672 * The assert below intends to catch when max_va grows
23673 * too large for this heuristic.
23674 */
23675
23676 vm_map_lock_read(map);
23677 default_end = vm_map_max(map);
23678 vm_map_unlock_read(map);
23679
23680 /*
23681 * Check that we're not already jumbo'd,
23682 * or our address space was somehow modified.
23683 *
23684 * If so we cannot guarantee that we can set up the ranges
23685 * safely without interfering with the existing map.
23686 */
23687 if (default_end > vm_compute_max_offset(true)) {
23688 return KERN_NO_SPACE;
23689 }
23690
23691 if (pmap_max_offset(true, ARM_PMAP_MAX_OFFSET_DEFAULT)) {
23692 /*
23693 * an override boot-arg was set, disable user-ranges
23694 *
23695 * XXX: this is problematic because it means these boot-args
23696 * no longer test the behavior changing the value
23697 * of ARM64_MAX_OFFSET_DEVICE_* would have.
23698 */
23699 return KERN_NOT_SUPPORTED;
23700 }
23701
23702 /* expand the default VM space to the largest possible address */
23703 vm_map_set_jumbo(map);
23704
23705 assert3u(7 * GiB(10) / 2, <=, vm_map_max(map) - default_end);
23706 data_range = vm_map_range_random_uniform(GiB(10),
23707 default_end + PAGE_SIZE, vm_map_max(map), offmask);
23708
23709#endif /* !XNU_PLATFORM_MacOSX */
23710
23711 /*
23712 * Poke holes so that ASAN or people listing regions
23713 * do not think this space is free.
23714 */
23715
23716 if (default_end != data_range.min_address) {
23717 kr = vm_map_enter(map, &default_end,
23718 data_range.min_address - default_end,
23719 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23720 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23721 assert(kr == KERN_SUCCESS);
23722 }
23723
23724 if (data_range.max_address != vm_map_max(map)) {
23725 vm_map_entry_t entry;
23726 vm_size_t size;
23727
23728 vm_map_lock_read(map);
23729 vm_map_lookup_entry_or_next(map, data_range.max_address, &entry);
		if (entry == vm_map_to_entry(map)) {
23731 size = vm_map_max(map) - data_range.max_address;
23732 } else {
23733 size = entry->vme_start - data_range.max_address;
23734 }
23735 vm_map_unlock_read(map);
23736
23737 kr = vm_map_enter(map, &data_range.max_address, size,
23738 0, VM_MAP_KERNEL_FLAGS_FIXED_PERMANENT(), VM_OBJECT_NULL,
23739 0, FALSE, VM_PROT_NONE, VM_PROT_NONE, VM_INHERIT_DEFAULT);
23740 assert(kr == KERN_SUCCESS);
23741 }
23742
23743 vm_map_lock(map);
23744 map->default_range.min_address = vm_map_min(map);
23745 map->default_range.max_address = default_end;
23746 map->data_range = data_range;
23747 map->uses_user_ranges = true;
23748 vm_map_unlock(map);
23749
23750 return KERN_SUCCESS;
23751}
23752
23753/*
23754 * vm_map_range_fork:
23755 * clones the array of ranges from old_map to new_map in support
23756 * of a VM map fork.
23757 */
23758void
23759vm_map_range_fork(vm_map_t new_map, vm_map_t old_map)
23760{
23761 if (!old_map->uses_user_ranges) {
23762 /* nothing to do */
23763 return;
23764 }
23765
23766 new_map->default_range = old_map->default_range;
23767 new_map->data_range = old_map->data_range;
23768
23769 if (old_map->extra_ranges_count) {
23770 vm_map_user_range_t otable, ntable;
23771 uint16_t count;
23772
23773 otable = old_map->extra_ranges;
23774 count = old_map->extra_ranges_count;
23775 ntable = kalloc_data(count * sizeof(struct vm_map_user_range),
23776 Z_WAITOK | Z_ZERO | Z_NOFAIL);
23777 memcpy(ntable, otable,
23778 count * sizeof(struct vm_map_user_range));
23779
23780 new_map->extra_ranges_count = count;
23781 new_map->extra_ranges = ntable;
23782 }
23783
23784 new_map->uses_user_ranges = true;
23785}
23786
23787/*
23788 * vm_map_get_user_range:
23789 * copy the VM user range for the given VM map and range ID.
23790 */
23791kern_return_t
23792vm_map_get_user_range(
23793 vm_map_t map,
23794 vm_map_range_id_t range_id,
23795 mach_vm_range_t range)
23796{
23797 if (map == NULL || !map->uses_user_ranges || range == NULL) {
23798 return KERN_INVALID_ARGUMENT;
23799 }
23800
23801 switch (range_id) {
23802 case UMEM_RANGE_ID_DEFAULT:
23803 *range = map->default_range;
23804 return KERN_SUCCESS;
23805
23806 case UMEM_RANGE_ID_HEAP:
23807 *range = map->data_range;
23808 return KERN_SUCCESS;
23809
23810 default:
23811 return KERN_INVALID_ARGUMENT;
23812 }
23813}
23814
23815static vm_map_range_id_t
23816vm_map_user_range_resolve(
23817 vm_map_t map,
23818 mach_vm_address_t addr,
23819 mach_vm_size_t size,
23820 mach_vm_range_t range)
23821{
23822 struct mach_vm_range tmp;
23823
23824 vm_map_lock_assert_held(map);
23825
23826 static_assert(UMEM_RANGE_ID_DEFAULT == MACH_VM_RANGE_DEFAULT);
23827 static_assert(UMEM_RANGE_ID_HEAP == MACH_VM_RANGE_DATA);
23828
23829 if (mach_vm_range_contains(&map->default_range, addr, size)) {
23830 if (range) {
23831 *range = map->default_range;
23832 }
23833 return UMEM_RANGE_ID_DEFAULT;
23834 }
23835
23836 if (mach_vm_range_contains(&map->data_range, addr, size)) {
23837 if (range) {
23838 *range = map->data_range;
23839 }
23840 return UMEM_RANGE_ID_HEAP;
23841 }
23842
23843 for (size_t i = 0; i < map->extra_ranges_count; i++) {
23844 vm_map_user_range_t r = &map->extra_ranges[i];
23845
23846 tmp.min_address = r->vmur_min_address;
23847 tmp.max_address = r->vmur_max_address;
23848
23849 if (mach_vm_range_contains(&tmp, addr, size)) {
23850 if (range) {
23851 *range = tmp;
23852 }
23853 return r->vmur_range_id;
23854 }
23855 }
23856
23857 if (range) {
23858 range->min_address = range->max_address = 0;
23859 }
23860 return UMEM_RANGE_ID_DEFAULT;
23861}
23862
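/*
 * qsort() comparators: order user ranges / v1 recipes by ascending
 * min_address (entries with equal start addresses compare equal).
 */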
23863static int
23864vm_map_user_range_cmp(const void *e1, const void *e2)
23865{
23866 const struct vm_map_user_range *r1 = e1;
23867 const struct vm_map_user_range *r2 = e2;
23868
23869 if (r1->vmur_min_address != r2->vmur_min_address) {
23870 return r1->vmur_min_address < r2->vmur_min_address ? -1 : 1;
23871 }
23872
23873 return 0;
23874}
23875
23876static int
23877mach_vm_range_recipe_v1_cmp(const void *e1, const void *e2)
23878{
23879 const mach_vm_range_recipe_v1_t *r1 = e1;
23880 const mach_vm_range_recipe_v1_t *r2 = e2;
23881
23882 if (r1->range.min_address != r2->range.min_address) {
23883 return r1->range.min_address < r2->range.min_address ? -1 : 1;
23884 }
23885
23886 return 0;
23887}
23888
23889/*!
23890 * @function mach_vm_range_create_v1()
23891 *
23892 * @brief
23893 * Handle the backend for mach_vm_range_create() for the
23894 * MACH_VM_RANGE_FLAVOR_V1 flavor.
23895 *
23896 * @description
23897 * This call allows the caller to create "ranges" in the map of a task
23898 * that have special semantics/policies around placement of
23899 * new allocations (in the vm_map_locate_space() sense).
23900 *
23901 * @returns
23902 * - KERN_SUCCESS on success
23903 * - KERN_INVALID_ARGUMENT for incorrect arguments
23904 * - KERN_NO_SPACE if the maximum number of ranges would be exceeded
23905 * - KERN_MEMORY_PRESENT if any of the requested ranges
23906 * overlaps with existing ranges or allocations in the map.
23907 */
23908static kern_return_t
23909mach_vm_range_create_v1(
23910 vm_map_t map,
23911 mach_vm_range_recipe_v1_t *recipe,
23912 uint32_t new_count)
23913{
23914 const vm_offset_t mask = VM_MAP_PAGE_MASK(map);
23915 vm_map_user_range_t table;
23916 kern_return_t kr = KERN_SUCCESS;
23917 uint16_t count;
23918
23919 struct mach_vm_range void1 = {
23920 .min_address = map->default_range.max_address,
23921 .max_address = map->data_range.min_address,
23922 };
23923 struct mach_vm_range void2 = {
23924 .min_address = map->data_range.max_address,
23925 .max_address = vm_map_max(map),
23926 };
23927
23928 qsort(recipe, new_count, sizeof(mach_vm_range_recipe_v1_t),
23929 mach_vm_range_recipe_v1_cmp);
23930
23931 /*
23932 * Step 1: Validate that the recipes have no intersections.
23933 */
23934
23935 for (size_t i = 0; i < new_count; i++) {
23936 mach_vm_range_t r = &recipe[i].range;
23937 mach_vm_size_t s;
23938
23939 if (recipe[i].flags) {
23940 return KERN_INVALID_ARGUMENT;
23941 }
23942
23943 static_assert(UMEM_RANGE_ID_FIXED == MACH_VM_RANGE_FIXED);
23944 switch (recipe[i].range_tag) {
23945 case MACH_VM_RANGE_FIXED:
23946 break;
23947 default:
23948 return KERN_INVALID_ARGUMENT;
23949 }
23950
23951 if (!VM_MAP_PAGE_ALIGNED(r->min_address, mask) ||
23952 !VM_MAP_PAGE_ALIGNED(r->max_address, mask) ||
23953 r->min_address >= r->max_address) {
23954 return KERN_INVALID_ARGUMENT;
23955 }
23956
23957 s = mach_vm_range_size(r);
23958 if (!mach_vm_range_contains(&void1, r->min_address, s) &&
23959 !mach_vm_range_contains(&void2, r->min_address, s)) {
23960 return KERN_INVALID_ARGUMENT;
23961 }
23962
23963 if (i > 0 && recipe[i - 1].range.max_address >
23964 recipe[i].range.min_address) {
23965 return KERN_INVALID_ARGUMENT;
23966 }
23967 }
23968
23969 vm_map_lock(map);
23970
23971 table = map->extra_ranges;
23972 count = map->extra_ranges_count;
23973
23974 if (count + new_count > VM_MAP_EXTRA_RANGES_MAX) {
23975 kr = KERN_NO_SPACE;
23976 goto out_unlock;
23977 }
23978
23979 /*
23980 * Step 2: Check that there is no intersection with existing ranges.
23981 */
23982
23983 for (size_t i = 0, j = 0; i < new_count && j < count;) {
23984 mach_vm_range_t r1 = &recipe[i].range;
23985 vm_map_user_range_t r2 = &table[j];
23986
23987 if (r1->max_address <= r2->vmur_min_address) {
23988 i++;
23989 } else if (r2->vmur_max_address <= r1->min_address) {
23990 j++;
23991 } else {
23992 kr = KERN_MEMORY_PRESENT;
23993 goto out_unlock;
23994 }
23995 }
23996
23997 /*
23998 * Step 3: commit the new ranges.
23999 */
24000
24001 static_assert(VM_MAP_EXTRA_RANGES_MAX * sizeof(struct vm_map_user_range) <=
24002 KALLOC_SAFE_ALLOC_SIZE);
24003
24004 table = krealloc_data(table,
24005 count * sizeof(struct vm_map_user_range),
24006 (count + new_count) * sizeof(struct vm_map_user_range),
24007 Z_ZERO | Z_WAITOK | Z_NOFAIL);
24008
24009 for (size_t i = 0; i < new_count; i++) {
24010 static_assert(MACH_VM_MAX_ADDRESS < (1ull << 56));
24011
24012 table[count + i] = (struct vm_map_user_range){
24013 .vmur_min_address = recipe[i].range.min_address,
24014 .vmur_max_address = recipe[i].range.max_address,
24015 .vmur_range_id = (vm_map_range_id_t)recipe[i].range_tag,
24016 };
24017 }
24018
24019 qsort(table, count + new_count,
24020 sizeof(struct vm_map_user_range), vm_map_user_range_cmp);
24021
24022 map->extra_ranges_count += new_count;
24023 map->extra_ranges = table;
24024
24025out_unlock:
24026 vm_map_unlock(map);
24027
24028 if (kr == KERN_SUCCESS) {
24029 for (size_t i = 0; i < new_count; i++) {
24030 vm_map_kernel_flags_t vmk_flags = {
24031 .vmf_fixed = true,
24032 .vmf_overwrite = true,
24033 .vmkf_overwrite_immutable = true,
24034 .vm_tag = recipe[i].vm_tag,
24035 };
24036 __assert_only kern_return_t kr2;
24037
24038 kr2 = vm_map_enter(map, &recipe[i].range.min_address,
24039 mach_vm_range_size(&recipe[i].range),
24040 0, vmk_flags, VM_OBJECT_NULL, 0, FALSE,
24041 VM_PROT_NONE, VM_PROT_ALL,
24042 VM_INHERIT_DEFAULT);
24043 assert(kr2 == KERN_SUCCESS);
24044 }
24045 }
24046 return kr;
24047}
24048
24049kern_return_t
24050mach_vm_range_create(
24051 vm_map_t map,
24052 mach_vm_range_flavor_t flavor,
24053 mach_vm_range_recipes_raw_t recipe,
24054 natural_t size)
24055{
24056 if (map != current_map()) {
24057 return KERN_INVALID_ARGUMENT;
24058 }
24059
24060 if (!map->uses_user_ranges) {
24061 return KERN_NOT_SUPPORTED;
24062 }
24063
24064 if (size == 0) {
24065 return KERN_SUCCESS;
24066 }
24067
24068 if (flavor == MACH_VM_RANGE_FLAVOR_V1) {
24069 mach_vm_range_recipe_v1_t *array;
24070
24071 if (size % sizeof(mach_vm_range_recipe_v1_t)) {
24072 return KERN_INVALID_ARGUMENT;
24073 }
24074
24075 size /= sizeof(mach_vm_range_recipe_v1_t);
24076 if (size > VM_MAP_EXTRA_RANGES_MAX) {
24077 return KERN_NO_SPACE;
24078 }
24079
24080 array = (mach_vm_range_recipe_v1_t *)recipe;
24081 return mach_vm_range_create_v1(map, array, size);
24082 }
24083
24084 return KERN_INVALID_ARGUMENT;
24085}
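
/*
 * Illustrative sketch of a MACH_VM_RANGE_FLAVOR_V1 submission for the
 * interface above.  The addresses are placeholders and VM_MEMORY_MALLOC is
 * just an example tag; the range must be page aligned and fall inside one
 * of the unused gaps around the data range, as validated by
 * mach_vm_range_create_v1().
 *
 *	mach_vm_range_recipe_v1_t recipe = {
 *		.flags     = 0,                   // must be 0
 *		.range_tag = MACH_VM_RANGE_FIXED, // only accepted tag
 *		.vm_tag    = VM_MEMORY_MALLOC,    // example tag
 *		.range     = {
 *			.min_address = min_addr,  // placeholder, page aligned
 *			.max_address = max_addr,  // placeholder, page aligned
 *		},
 *	};
 *
 *	kr = mach_vm_range_create(current_map(), MACH_VM_RANGE_FLAVOR_V1,
 *	    (mach_vm_range_recipes_raw_t)&recipe, sizeof(recipe));
 */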
24086
24087#else /* !CONFIG_MAP_RANGES */
24088
24089kern_return_t
24090mach_vm_range_create(
24091 vm_map_t map,
24092 mach_vm_range_flavor_t flavor,
24093 mach_vm_range_recipes_raw_t recipe,
24094 natural_t size)
24095{
24096#pragma unused(map, flavor, recipe, size)
24097 return KERN_NOT_SUPPORTED;
24098}
24099
24100#endif /* !CONFIG_MAP_RANGES */
24101
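/*
 * vm_map_kernel_flags_update_range_id:
 *	picks a placement range for a request that did not name one
 *	explicitly: kernel_map requests default to the kernel data range;
 *	for user maps with ranges, requests whose vm_tag is marked in
 *	vm_map_user_range_heap_map are steered into the heap range.
 */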
24102void
24103vm_map_kernel_flags_update_range_id(vm_map_kernel_flags_t *vmkf, vm_map_t map)
24104{
24105 if (map == kernel_map) {
24106 if (vmkf->vmkf_range_id == KMEM_RANGE_ID_NONE) {
24107 vmkf->vmkf_range_id = KMEM_RANGE_ID_DATA;
24108 }
24109#if CONFIG_MAP_RANGES
24110 } else if (vmkf->vm_tag < VM_MEMORY_COUNT &&
24111 vmkf->vmkf_range_id == UMEM_RANGE_ID_DEFAULT &&
24112 bitmap_test(vm_map_user_range_heap_map, vmkf->vm_tag)) {
24113 vmkf->vmkf_range_id = UMEM_RANGE_ID_HEAP;
24114#endif /* CONFIG_MAP_RANGES */
24115 }
24116}
24117
24118/*
24119 * vm_map_entry_has_device_pager:
24120 * Check if the vm map entry specified by the virtual address has a device pager.
24121 * If the vm map entry does not exist or if the map is NULL, this returns FALSE.
24122 */
24123boolean_t
24124vm_map_entry_has_device_pager(vm_map_t map, vm_map_offset_t vaddr)
24125{
24126 vm_map_entry_t entry;
24127 vm_object_t object;
24128 boolean_t result;
24129
24130 if (map == NULL) {
24131 return FALSE;
24132 }
24133
24134 vm_map_lock(map);
24135 while (TRUE) {
24136 if (!vm_map_lookup_entry(map, vaddr, &entry)) {
24137 result = FALSE;
24138 break;
24139 }
24140 if (entry->is_sub_map) {
24141 // Check the submap
24142 vm_map_t submap = VME_SUBMAP(entry);
24143 assert(submap != NULL);
24144 vm_map_lock(submap);
24145 vm_map_unlock(map);
24146 map = submap;
24147 continue;
24148 }
24149 object = VME_OBJECT(entry);
24150 if (object != NULL && object->pager != NULL && is_device_pager_ops(object->pager->mo_pager_ops)) {
24151 result = TRUE;
24152 break;
24153 }
24154 result = FALSE;
24155 break;
24156 }
24157
24158 vm_map_unlock(map);
24159 return result;
24160}
24161
24162
24163#if MACH_ASSERT
24164
24165extern int pmap_ledgers_panic;
24166extern int pmap_ledgers_panic_leeway;
24167
24168#define LEDGER_DRIFT(__LEDGER) \
24169 int __LEDGER##_over; \
24170 ledger_amount_t __LEDGER##_over_total; \
24171 ledger_amount_t __LEDGER##_over_max; \
24172 int __LEDGER##_under; \
24173 ledger_amount_t __LEDGER##_under_total; \
24174 ledger_amount_t __LEDGER##_under_max
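/*
 * For example, LEDGER_DRIFT(phys_footprint) expands to the six fields
 * phys_footprint_over, phys_footprint_over_total, phys_footprint_over_max,
 * phys_footprint_under, phys_footprint_under_total and
 * phys_footprint_under_max in the statistics structure below.
 */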
24175
24176struct {
24177 uint64_t num_pmaps_checked;
24178
24179 LEDGER_DRIFT(phys_footprint);
24180 LEDGER_DRIFT(internal);
24181 LEDGER_DRIFT(internal_compressed);
24182 LEDGER_DRIFT(external);
24183 LEDGER_DRIFT(reusable);
24184 LEDGER_DRIFT(iokit_mapped);
24185 LEDGER_DRIFT(alternate_accounting);
24186 LEDGER_DRIFT(alternate_accounting_compressed);
24187 LEDGER_DRIFT(page_table);
24188 LEDGER_DRIFT(purgeable_volatile);
24189 LEDGER_DRIFT(purgeable_nonvolatile);
24190 LEDGER_DRIFT(purgeable_volatile_compressed);
24191 LEDGER_DRIFT(purgeable_nonvolatile_compressed);
24192 LEDGER_DRIFT(tagged_nofootprint);
24193 LEDGER_DRIFT(tagged_footprint);
24194 LEDGER_DRIFT(tagged_nofootprint_compressed);
24195 LEDGER_DRIFT(tagged_footprint_compressed);
24196 LEDGER_DRIFT(network_volatile);
24197 LEDGER_DRIFT(network_nonvolatile);
24198 LEDGER_DRIFT(network_volatile_compressed);
24199 LEDGER_DRIFT(network_nonvolatile_compressed);
24200 LEDGER_DRIFT(media_nofootprint);
24201 LEDGER_DRIFT(media_footprint);
24202 LEDGER_DRIFT(media_nofootprint_compressed);
24203 LEDGER_DRIFT(media_footprint_compressed);
24204 LEDGER_DRIFT(graphics_nofootprint);
24205 LEDGER_DRIFT(graphics_footprint);
24206 LEDGER_DRIFT(graphics_nofootprint_compressed);
24207 LEDGER_DRIFT(graphics_footprint_compressed);
24208 LEDGER_DRIFT(neural_nofootprint);
24209 LEDGER_DRIFT(neural_footprint);
24210 LEDGER_DRIFT(neural_nofootprint_compressed);
24211 LEDGER_DRIFT(neural_footprint_compressed);
24212} pmap_ledgers_drift;
24213
24214void
24215vm_map_pmap_check_ledgers(
24216 pmap_t pmap,
24217 ledger_t ledger,
24218 int pid,
24219 char *procname)
24220{
24221 ledger_amount_t bal;
24222 boolean_t do_panic;
24223
24224 do_panic = FALSE;
24225
24226 pmap_ledgers_drift.num_pmaps_checked++;
24227
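/*
 * LEDGER_CHECK_BALANCE(x) logs any non-zero balance of task ledger entry
 * "x", folds it into the pmap_ledgers_drift statistics, and arms do_panic
 * when the entry is flagged panic-on-negative, or when pmap_ledgers_panic
 * is set and the imbalance exceeds pmap_ledgers_panic_leeway pages.
 */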
24228#define LEDGER_CHECK_BALANCE(__LEDGER) \
24229MACRO_BEGIN \
24230 int panic_on_negative = TRUE; \
24231 ledger_get_balance(ledger, \
24232 task_ledgers.__LEDGER, \
24233 &bal); \
24234 ledger_get_panic_on_negative(ledger, \
24235 task_ledgers.__LEDGER, \
24236 &panic_on_negative); \
24237 if (bal != 0) { \
24238 if (panic_on_negative || \
24239 (pmap_ledgers_panic && \
24240 pmap_ledgers_panic_leeway > 0 && \
24241 (bal > (pmap_ledgers_panic_leeway * PAGE_SIZE) || \
24242 bal < (-pmap_ledgers_panic_leeway * PAGE_SIZE)))) { \
24243 do_panic = TRUE; \
24244 } \
24245 printf("LEDGER BALANCE proc %d (%s) " \
24246 "\"%s\" = %lld\n", \
24247 pid, procname, #__LEDGER, bal); \
24248 if (bal > 0) { \
24249 pmap_ledgers_drift.__LEDGER##_over++; \
24250 pmap_ledgers_drift.__LEDGER##_over_total += bal; \
24251 if (bal > pmap_ledgers_drift.__LEDGER##_over_max) { \
24252 pmap_ledgers_drift.__LEDGER##_over_max = bal; \
24253 } \
24254 } else if (bal < 0) { \
24255 pmap_ledgers_drift.__LEDGER##_under++; \
24256 pmap_ledgers_drift.__LEDGER##_under_total += bal; \
24257 if (bal < pmap_ledgers_drift.__LEDGER##_under_max) { \
24258 pmap_ledgers_drift.__LEDGER##_under_max = bal; \
24259 } \
24260 } \
24261 } \
24262MACRO_END
24263
24264 LEDGER_CHECK_BALANCE(phys_footprint);
24265 LEDGER_CHECK_BALANCE(internal);
24266 LEDGER_CHECK_BALANCE(internal_compressed);
24267 LEDGER_CHECK_BALANCE(external);
24268 LEDGER_CHECK_BALANCE(reusable);
24269 LEDGER_CHECK_BALANCE(iokit_mapped);
24270 LEDGER_CHECK_BALANCE(alternate_accounting);
24271 LEDGER_CHECK_BALANCE(alternate_accounting_compressed);
24272 LEDGER_CHECK_BALANCE(page_table);
24273 LEDGER_CHECK_BALANCE(purgeable_volatile);
24274 LEDGER_CHECK_BALANCE(purgeable_nonvolatile);
24275 LEDGER_CHECK_BALANCE(purgeable_volatile_compressed);
24276 LEDGER_CHECK_BALANCE(purgeable_nonvolatile_compressed);
24277 LEDGER_CHECK_BALANCE(tagged_nofootprint);
24278 LEDGER_CHECK_BALANCE(tagged_footprint);
24279 LEDGER_CHECK_BALANCE(tagged_nofootprint_compressed);
24280 LEDGER_CHECK_BALANCE(tagged_footprint_compressed);
24281 LEDGER_CHECK_BALANCE(network_volatile);
24282 LEDGER_CHECK_BALANCE(network_nonvolatile);
24283 LEDGER_CHECK_BALANCE(network_volatile_compressed);
24284 LEDGER_CHECK_BALANCE(network_nonvolatile_compressed);
24285 LEDGER_CHECK_BALANCE(media_nofootprint);
24286 LEDGER_CHECK_BALANCE(media_footprint);
24287 LEDGER_CHECK_BALANCE(media_nofootprint_compressed);
24288 LEDGER_CHECK_BALANCE(media_footprint_compressed);
24289 LEDGER_CHECK_BALANCE(graphics_nofootprint);
24290 LEDGER_CHECK_BALANCE(graphics_footprint);
24291 LEDGER_CHECK_BALANCE(graphics_nofootprint_compressed);
24292 LEDGER_CHECK_BALANCE(graphics_footprint_compressed);
24293 LEDGER_CHECK_BALANCE(neural_nofootprint);
24294 LEDGER_CHECK_BALANCE(neural_footprint);
24295 LEDGER_CHECK_BALANCE(neural_nofootprint_compressed);
24296 LEDGER_CHECK_BALANCE(neural_footprint_compressed);
24297
24298 if (do_panic) {
24299 if (pmap_ledgers_panic) {
24300 panic("pmap_destroy(%p) %d[%s] has imbalanced ledgers",
24301 pmap, pid, procname);
24302 } else {
24303 printf("pmap_destroy(%p) %d[%s] has imbalanced ledgers\n",
24304 pmap, pid, procname);
24305 }
24306 }
24307}
24308
24309void
24310vm_map_pmap_set_process(
24311 vm_map_t map,
24312 int pid,
24313 char *procname)
24314{
24315 pmap_set_process(vm_map_pmap(map), pid, procname);
24316}
24317
24318#endif /* MACH_ASSERT */
24319