1/*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28/*
29 * @OSF_COPYRIGHT@
30 */
31/*
32 * Mach Operating System
33 * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
48 * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56/*
57 */
58/*
59 * File: vm/vm_pageout.c
60 * Author: Avadis Tevanian, Jr., Michael Wayne Young
61 * Date: 1985
62 *
63 * The proverbial page-out daemon.
64 */
65
66#include <stdint.h>
67#include <ptrauth.h>
68
69#include <debug.h>
70
71#include <mach/mach_types.h>
72#include <mach/memory_object.h>
73#include <mach/mach_host_server.h>
74#include <mach/upl.h>
75#include <mach/vm_map.h>
76#include <mach/vm_param.h>
77#include <mach/vm_statistics.h>
78#include <mach/sdt.h>
79
80#include <kern/kern_types.h>
81#include <kern/counter.h>
82#include <kern/host_statistics.h>
83#include <kern/machine.h>
84#include <kern/misc_protos.h>
85#include <kern/sched.h>
86#include <kern/thread.h>
87#include <kern/kalloc.h>
88#include <kern/zalloc_internal.h>
89#include <kern/policy_internal.h>
90#include <kern/thread_group.h>
91
92#include <os/log.h>
93
94#include <sys/kdebug_triage.h>
95
96#include <machine/vm_tuning.h>
97#include <machine/commpage.h>
98
99#include <vm/pmap.h>
100#include <vm/vm_compressor_pager.h>
101#include <vm/vm_fault.h>
102#include <vm/vm_map_internal.h>
103#include <vm/vm_object.h>
104#include <vm/vm_page.h>
105#include <vm/vm_pageout.h>
106#include <vm/vm_protos.h> /* must be last */
107#include <vm/memory_object.h>
108#include <vm/vm_purgeable_internal.h>
109#include <vm/vm_shared_region.h>
110#include <vm/vm_compressor.h>
111
112#include <san/kasan.h>
113
114#if CONFIG_PHANTOM_CACHE
115#include <vm/vm_phantom_cache.h>
116#endif
117
118#if UPL_DEBUG
119#include <libkern/OSDebug.h>
120#endif
121
122extern int cs_debug;
123
124#if CONFIG_MBUF_MCACHE
125extern void mbuf_drain(boolean_t);
126#endif /* CONFIG_MBUF_MCACHE */
127
128#if VM_PRESSURE_EVENTS
129#if CONFIG_JETSAM
130extern unsigned int memorystatus_available_pages;
131extern unsigned int memorystatus_available_pages_pressure;
132extern unsigned int memorystatus_available_pages_critical;
133#else /* CONFIG_JETSAM */
134extern uint64_t memorystatus_available_pages;
135extern uint64_t memorystatus_available_pages_pressure;
136extern uint64_t memorystatus_available_pages_critical;
137#endif /* CONFIG_JETSAM */
138
139extern unsigned int memorystatus_frozen_count;
140extern unsigned int memorystatus_suspended_count;
141extern vm_pressure_level_t memorystatus_vm_pressure_level;
142
143extern lck_mtx_t memorystatus_jetsam_fg_band_lock;
144extern uint32_t memorystatus_jetsam_fg_band_waiters;
145
146void vm_pressure_response(void);
147extern void consider_vm_pressure_events(void);
148
149#define MEMORYSTATUS_SUSPENDED_THRESHOLD 4
150#endif /* VM_PRESSURE_EVENTS */
151
152SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread;
153SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread;
154#if CONFIG_VPS_DYNAMIC_PRIO
155TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
156#else
157const bool vps_dynamic_priority_enabled = false;
158#endif
159boolean_t vps_yield_for_pgqlockwaiters = TRUE;
160
161#ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */
162#if !XNU_TARGET_OS_OSX
163#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024
164#else /* !XNU_TARGET_OS_OSX */
165#define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096
166#endif /* !XNU_TARGET_OS_OSX */
167#endif
168
169#ifndef VM_PAGEOUT_DEADLOCK_RELIEF
170#define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */
171#endif
172
173#ifndef VM_PAGE_LAUNDRY_MAX
174#define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */
#endif /* VM_PAGE_LAUNDRY_MAX */
176
177#ifndef VM_PAGEOUT_BURST_WAIT
178#define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */
179#endif /* VM_PAGEOUT_BURST_WAIT */
180
181#ifndef VM_PAGEOUT_EMPTY_WAIT
182#define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */
183#endif /* VM_PAGEOUT_EMPTY_WAIT */
184
185#ifndef VM_PAGEOUT_DEADLOCK_WAIT
186#define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */
187#endif /* VM_PAGEOUT_DEADLOCK_WAIT */
188
189#ifndef VM_PAGEOUT_IDLE_WAIT
190#define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */
191#endif /* VM_PAGEOUT_IDLE_WAIT */
192
193#ifndef VM_PAGEOUT_SWAP_WAIT
194#define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */
195#endif /* VM_PAGEOUT_SWAP_WAIT */
196
197
198#ifndef VM_PAGE_SPECULATIVE_TARGET
199#define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage))
200#endif /* VM_PAGE_SPECULATIVE_TARGET */
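/*
 * Worked example of the integer arithmetic above (illustrative only; the
 * percentage value of 3 is an assumed example, the real value lives in
 * vm_pageout_state.vm_page_speculative_percentage):
 *
 *	100 / 3 = 33					(integer division)
 *	VM_PAGE_SPECULATIVE_TARGET(1000000) = 1000000 / 33 = 30303 pages
 *
 * i.e. roughly 3.03% of "total" rather than exactly 3%, because the divisor
 * is truncated before the final division.
 */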
201
202
203/*
204 * To obtain a reasonable LRU approximation, the inactive queue
205 * needs to be large enough to give pages on it a chance to be
206 * referenced a second time. This macro defines the fraction
207 * of active+inactive pages that should be inactive.
208 * The pageout daemon uses it to update vm_page_inactive_target.
209 *
210 * If vm_page_free_count falls below vm_page_free_target and
211 * vm_page_inactive_count is below vm_page_inactive_target,
212 * then the pageout daemon starts running.
213 */
214
215#ifndef VM_PAGE_INACTIVE_TARGET
216#define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2)
217#endif /* VM_PAGE_INACTIVE_TARGET */
218
219/*
220 * Once the pageout daemon starts running, it keeps going
221 * until vm_page_free_count meets or exceeds vm_page_free_target.
222 */
223
224#ifndef VM_PAGE_FREE_TARGET
225#if !XNU_TARGET_OS_OSX
226#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100)
227#else /* !XNU_TARGET_OS_OSX */
228#define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80)
229#endif /* !XNU_TARGET_OS_OSX */
230#endif /* VM_PAGE_FREE_TARGET */
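/*
 * Worked example with a hypothetical argument of 1,000,000 pages
 * (illustrative only):
 *
 *	embedded:	VM_PAGE_FREE_TARGET(1000000) = 15 + 1000000 / 100 = 10015
 *	macOS:		VM_PAGE_FREE_TARGET(1000000) = 15 + 1000000 /  80 = 12515
 *
 * Compare the fixed VM_PAGE_FREE_TARGET_LIMIT values defined below.
 */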
231
232
233/*
234 * The pageout daemon always starts running once vm_page_free_count
235 * falls below vm_page_free_min.
236 */
237
238#ifndef VM_PAGE_FREE_MIN
239#if !XNU_TARGET_OS_OSX
240#define VM_PAGE_FREE_MIN(free) (10 + (free) / 200)
241#else /* !XNU_TARGET_OS_OSX */
242#define VM_PAGE_FREE_MIN(free) (10 + (free) / 100)
243#endif /* !XNU_TARGET_OS_OSX */
244#endif /* VM_PAGE_FREE_MIN */
245
246#if !XNU_TARGET_OS_OSX
247#define VM_PAGE_FREE_RESERVED_LIMIT 100
248#define VM_PAGE_FREE_MIN_LIMIT 1500
249#define VM_PAGE_FREE_TARGET_LIMIT 2000
250#else /* !XNU_TARGET_OS_OSX */
251#define VM_PAGE_FREE_RESERVED_LIMIT 1700
252#define VM_PAGE_FREE_MIN_LIMIT 3500
253#define VM_PAGE_FREE_TARGET_LIMIT 4000
254#endif /* !XNU_TARGET_OS_OSX */
255
256/*
257 * When vm_page_free_count falls below vm_page_free_reserved,
258 * only vm-privileged threads can allocate pages. vm-privilege
259 * allows the pageout daemon and default pager (and any other
260 * associated threads needed for default pageout) to continue
261 * operation by dipping into the reserved pool of pages.
262 */
263
264#ifndef VM_PAGE_FREE_RESERVED
265#define VM_PAGE_FREE_RESERVED(n) \
266 ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n))
267#endif /* VM_PAGE_FREE_RESERVED */
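/*
 * Worked example (illustrative only): with VM_PAGE_LAUNDRY_MAX of 128
 * (defined above), the base reserve is 6 * 128 = 768 pages, so
 *
 *	VM_PAGE_FREE_RESERVED(100) = 768 + 100 = 868 pages
 *
 * where "n" is the caller-supplied margin added on top of the laundry-based
 * reserve.
 */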
268
269/*
270 * When we dequeue pages from the inactive list, they are
 * reactivated (i.e., put back on the active queue) if referenced.
272 * However, it is possible to starve the free list if other
273 * processors are referencing pages faster than we can turn off
274 * the referenced bit. So we limit the number of reactivations
275 * we will make per call of vm_pageout_scan().
276 */
277#define VM_PAGE_REACTIVATE_LIMIT_MAX 20000
278
279#ifndef VM_PAGE_REACTIVATE_LIMIT
280#if !XNU_TARGET_OS_OSX
281#define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2)
282#else /* !XNU_TARGET_OS_OSX */
#define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20, VM_PAGE_REACTIVATE_LIMIT_MAX))
284#endif /* !XNU_TARGET_OS_OSX */
285#endif /* VM_PAGE_REACTIVATE_LIMIT */
286#define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000
287
288int vm_pageout_protect_realtime = true;
289
290extern boolean_t hibernate_cleaning_in_progress;
291
292struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT];
293struct pgo_iothread_state pgo_iothread_external_state;
294
295#if VM_PRESSURE_EVENTS
296void vm_pressure_thread(void);
297
298boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void);
299boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void);
300
301boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void);
302boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void);
303#endif
304
305static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t);
306static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t);
307static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t);
308
309extern void vm_pageout_continue(void);
310extern void vm_pageout_scan(void);
311
312boolean_t vm_pageout_running = FALSE;
313
314uint32_t vm_page_upl_tainted = 0;
315uint32_t vm_page_iopl_tainted = 0;
316
317#if XNU_TARGET_OS_OSX
318static boolean_t vm_pageout_waiter = FALSE;
319#endif /* XNU_TARGET_OS_OSX */
320
321
322#if DEVELOPMENT || DEBUG
323struct vm_pageout_debug vm_pageout_debug;
324#endif
325struct vm_pageout_vminfo vm_pageout_vminfo;
326struct vm_pageout_state vm_pageout_state;
327struct vm_config vm_config;
328
329struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED;
330struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED;
331#if DEVELOPMENT || DEBUG
332struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED;
333#endif /* DEVELOPMENT || DEBUG */
334
335int vm_upl_wait_for_pages = 0;
336vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL;
337
338boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL;
339
340int vm_debug_events = 0;
341
342LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
343
344#if CONFIG_MEMORYSTATUS
345extern void memorystatus_kill_on_vps_starvation(void);
346
347uint32_t vm_pageout_memorystatus_fb_factor_nr = 5;
348uint32_t vm_pageout_memorystatus_fb_factor_dr = 2;
349
350#endif
351
352#if __AMP__
353
354
355/*
356 * Bind compressor threads to e-cores unless there are multiple non-e clusters
357 */
358#if (MAX_CPU_CLUSTERS > 2)
359#define VM_COMPRESSOR_EBOUND_DEFAULT false
360#else
361#define VM_COMPRESSOR_EBOUND_DEFAULT true
362#endif
363
364TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
365int vm_pgo_pbound = 0;
366extern void thread_bind_cluster_type(thread_t, char, bool);
367
368#endif /* __AMP__ */
369
370
371/*
372 * Routine: vm_pageout_object_terminate
373 * Purpose:
374 * Destroy the pageout_object, and perform all of the
375 * required cleanup actions.
376 *
377 * In/Out conditions:
378 * The object must be locked, and will be returned locked.
379 */
380void
381vm_pageout_object_terminate(
382 vm_object_t object)
383{
384 vm_object_t shadow_object;
385
386 /*
387 * Deal with the deallocation (last reference) of a pageout object
388 * (used for cleaning-in-place) by dropping the paging references/
389 * freeing pages in the original object.
390 */
391
392 assert(object->pageout);
393 shadow_object = object->shadow;
394 vm_object_lock(shadow_object);
395
396 while (!vm_page_queue_empty(&object->memq)) {
397 vm_page_t p, m;
398 vm_object_offset_t offset;
399
400 p = (vm_page_t) vm_page_queue_first(&object->memq);
401
402 assert(p->vmp_private);
403 assert(p->vmp_free_when_done);
404 p->vmp_free_when_done = FALSE;
405 assert(!p->vmp_cleaning);
406 assert(!p->vmp_laundry);
407
408 offset = p->vmp_offset;
409 VM_PAGE_FREE(p);
410 p = VM_PAGE_NULL;
411
		m = vm_page_lookup(shadow_object,
		    offset + object->vo_shadow_offset);
414
415 if (m == VM_PAGE_NULL) {
416 continue;
417 }
418
419 assert((m->vmp_dirty) || (m->vmp_precious) ||
420 (m->vmp_busy && m->vmp_cleaning));
421
422 /*
423 * Handle the trusted pager throttle.
424 * Also decrement the burst throttle (if external).
425 */
426 vm_page_lock_queues();
427 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
			vm_pageout_throttle_up(m);
429 }
430
431 /*
432 * Handle the "target" page(s). These pages are to be freed if
433 * successfully cleaned. Target pages are always busy, and are
434 * wired exactly once. The initial target pages are not mapped,
435 * (so cannot be referenced or modified) but converted target
436 * pages may have been modified between the selection as an
437 * adjacent page and conversion to a target.
438 */
439 if (m->vmp_free_when_done) {
440 assert(m->vmp_busy);
441 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
442 assert(m->vmp_wire_count == 1);
443 m->vmp_cleaning = FALSE;
444 m->vmp_free_when_done = FALSE;
445 /*
446 * Revoke all access to the page. Since the object is
447 * locked, and the page is busy, this prevents the page
448 * from being dirtied after the pmap_disconnect() call
449 * returns.
450 *
			 * Since the page is left "dirty" but "not modified", we
452 * can detect whether the page was redirtied during
453 * pageout by checking the modify state.
454 */
			if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
456 SET_PAGE_DIRTY(m, FALSE);
457 } else {
458 m->vmp_dirty = FALSE;
459 }
460
461 if (m->vmp_dirty) {
				vm_page_unwire(m, TRUE); /* reactivates */
463 counter_inc(&vm_statistics_reactivations);
464 PAGE_WAKEUP_DONE(m);
465 } else {
				vm_page_free(m); /* clears busy, etc. */
467 }
468 vm_page_unlock_queues();
469 continue;
470 }
471 /*
472 * Handle the "adjacent" pages. These pages were cleaned in
473 * place, and should be left alone.
474 * If prep_pin_count is nonzero, then someone is using the
475 * page, so make it active.
476 */
477 if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) {
478 if (m->vmp_reference) {
				vm_page_activate(m);
			} else {
				vm_page_deactivate(m);
482 }
483 }
484 if (m->vmp_overwriting) {
485 /*
486 * the (COPY_OUT_FROM == FALSE) request_page_list case
487 */
488 if (m->vmp_busy) {
489 /*
490 * We do not re-set m->vmp_dirty !
491 * The page was busy so no extraneous activity
492 * could have occurred. COPY_INTO is a read into the
493 * new pages. CLEAN_IN_PLACE does actually write
494 * out the pages but handling outside of this code
495 * will take care of resetting dirty. We clear the
496 * modify however for the Programmed I/O case.
497 */
				pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
499
500 m->vmp_busy = FALSE;
501 m->vmp_absent = FALSE;
502 } else {
503 /*
504 * alternate (COPY_OUT_FROM == FALSE) request_page_list case
505 * Occurs when the original page was wired
506 * at the time of the list request
507 */
508 assert(VM_PAGE_WIRED(m));
				vm_page_unwire(m, TRUE); /* reactivates */
510 }
511 m->vmp_overwriting = FALSE;
512 } else {
513 m->vmp_dirty = FALSE;
514 }
515 m->vmp_cleaning = FALSE;
516
517 /*
		 * Wake up any thread waiting for the page to leave the "cleaning" state.
519 */
520 PAGE_WAKEUP(m);
521 vm_page_unlock_queues();
522 }
523 /*
524 * Account for the paging reference taken in vm_paging_object_allocate.
525 */
526 vm_object_activity_end(shadow_object);
527 vm_object_unlock(shadow_object);
528
529 assert(object->ref_count == 0);
530 assert(object->paging_in_progress == 0);
531 assert(object->activity_in_progress == 0);
532 assert(object->resident_page_count == 0);
533 return;
534}
535
536/*
537 * Routine: vm_pageclean_setup
538 *
 * Purpose: set up a page to be cleaned (made non-dirty), but not
540 * necessarily flushed from the VM page cache.
541 * This is accomplished by cleaning in place.
542 *
543 * The page must not be busy, and new_object
544 * must be locked.
545 *
546 */
547static void
548vm_pageclean_setup(
549 vm_page_t m,
550 vm_page_t new_m,
551 vm_object_t new_object,
552 vm_object_offset_t new_offset)
553{
554 assert(!m->vmp_busy);
555#if 0
556 assert(!m->vmp_cleaning);
557#endif
558
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
560
561 /*
562 * Mark original page as cleaning in place.
563 */
564 m->vmp_cleaning = TRUE;
565 SET_PAGE_DIRTY(m, FALSE);
566 m->vmp_precious = FALSE;
567
568 /*
569 * Convert the fictitious page to a private shadow of
570 * the real page.
571 */
572 assert(new_m->vmp_fictitious);
573 assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr);
574 new_m->vmp_fictitious = FALSE;
575 new_m->vmp_private = TRUE;
576 new_m->vmp_free_when_done = TRUE;
577 VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m));
578
579 vm_page_lockspin_queues();
	vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
581 vm_page_unlock_queues();
582
	vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
584 assert(!new_m->vmp_wanted);
585 new_m->vmp_busy = FALSE;
586}
587
588/*
589 * Routine: vm_pageout_initialize_page
590 * Purpose:
591 * Causes the specified page to be initialized in
592 * the appropriate memory object. This routine is used to push
593 * pages into a copy-object when they are modified in the
594 * permanent object.
595 *
596 * The page is moved to a temporary object and paged out.
597 *
598 * In/out conditions:
599 * The page in question must not be on any pageout queues.
600 * The object to which it belongs must be locked.
601 * The page must be busy, but not hold a paging reference.
602 *
603 * Implementation:
604 * Move this page to a completely new object.
605 */
606void
607vm_pageout_initialize_page(
608 vm_page_t m)
609{
610 vm_object_t object;
611 vm_object_offset_t paging_offset;
612 memory_object_t pager;
613
614 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
615
616 object = VM_PAGE_OBJECT(m);
617
618 assert(m->vmp_busy);
619 assert(object->internal);
620
621 /*
622 * Verify that we really want to clean this page
623 */
624 assert(!m->vmp_absent);
625 assert(m->vmp_dirty);
626
627 /*
628 * Create a paging reference to let us play with the object.
629 */
630 paging_offset = m->vmp_offset + object->paging_offset;
631
632 if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) {
633 panic("reservation without pageout?"); /* alan */
634
635 VM_PAGE_FREE(m);
636 vm_object_unlock(object);
637
638 return;
639 }
640
641 /*
642 * If there's no pager, then we can't clean the page. This should
643 * never happen since this should be a copy object and therefore not
644 * an external object, so the pager should always be there.
645 */
646
647 pager = object->pager;
648
649 if (pager == MEMORY_OBJECT_NULL) {
650 panic("missing pager for copy object");
651
652 VM_PAGE_FREE(m);
653 return;
654 }
655
656 /*
657 * set the page for future call to vm_fault_list_request
658 */
	pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
660 SET_PAGE_DIRTY(m, FALSE);
661
662 /*
663 * keep the object from collapsing or terminating
664 */
665 vm_object_paging_begin(object);
666 vm_object_unlock(object);
667
668 /*
669 * Write the data to its pager.
670 * Note that the data is passed by naming the new object,
671 * not a virtual address; the pager interface has been
672 * manipulated to use the "internal memory" data type.
673 * [The object reference from its allocation is donated
674 * to the eventual recipient.]
675 */
	memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
677
678 vm_object_lock(object);
679 vm_object_paging_end(object);
680}
681
682
683/*
684 * vm_pageout_cluster:
685 *
686 * Given a page, queue it to the appropriate I/O thread,
687 * which will page it out and attempt to clean adjacent pages
688 * in the same operation.
689 *
690 * The object and queues must be locked. We will take a
691 * paging reference to prevent deallocation or collapse when we
692 * release the object lock back at the call site. The I/O thread
693 * is responsible for consuming this reference
694 *
695 * The page must not be on any pageout queue.
696 */
697#if DEVELOPMENT || DEBUG
698vmct_stats_t vmct_stats;
699
700int32_t vmct_active = 0;
701uint64_t vm_compressor_epoch_start = 0;
702uint64_t vm_compressor_epoch_stop = 0;
703
704typedef enum vmct_state_t {
705 VMCT_IDLE,
706 VMCT_AWAKENED,
707 VMCT_ACTIVE,
708} vmct_state_t;
709vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT];
710#endif
711
712
713
714static void
715vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q)
716{
717 vm_object_t object = VM_PAGE_OBJECT(m);
718
719 VM_PAGE_CHECK(m);
720 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
721 vm_object_lock_assert_exclusive(object);
722
723 /*
724 * Make sure it's OK to page this out.
725 */
726 assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m)));
727 assert(!m->vmp_cleaning && !m->vmp_laundry);
728 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
729
730 /*
731 * protect the object from collapse or termination
732 */
733 vm_object_activity_begin(object);
734
735
736 /*
737 * pgo_laundry count is tied to the laundry bit
738 */
739 m->vmp_laundry = TRUE;
740 q->pgo_laundry++;
741
742 m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q;
743 vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq);
744
745 // the benchmark queue will be woken up independently by the benchmark itself
746 if (
747 object->internal == TRUE
748#if DEVELOPMENT || DEBUG
749 && q != &vm_pageout_queue_benchmark
750#endif
751 ) {
752 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
753 m->vmp_busy = TRUE;
754 // Wake up the first compressor thread. It will wake subsequent threads if necessary.
		sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup, pgo_iothread_internal_state[0].pgo_iothread);
756 } else {
		sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
758 }
759 VM_PAGE_CHECK(m);
760}
761
762void
763vm_pageout_cluster(vm_page_t m)
764{
765 struct vm_pageout_queue *q;
766 vm_object_t object = VM_PAGE_OBJECT(m);
767 if (object->internal) {
768 q = &vm_pageout_queue_internal;
769 } else {
770 q = &vm_pageout_queue_external;
771 }
772 vm_pageout_cluster_to_queue(m, q);
773}
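/*
 * Minimal caller sketch for vm_pageout_cluster(), illustrating the locking
 * and page-state preconditions documented above.  This is not compiled in;
 * "obj" and "pg" are hypothetical locals and the assertions simply restate
 * the requirements the function itself asserts.
 */
#if 0
static void
vm_pageout_cluster_example(vm_object_t obj, vm_page_t pg)
{
	vm_object_lock(obj);                 /* exclusive object lock first */
	vm_page_lockspin_queues();           /* then the page queue lock */

	assert(pg->vmp_dirty || pg->vmp_precious);   /* page needs cleaning */
	assert(!VM_PAGE_WIRED(pg));                  /* and must not be wired */
	assert(pg->vmp_q_state == VM_PAGE_NOT_ON_Q); /* not already queued */

	/*
	 * Queues the page on the internal (compressor) or external pageout
	 * queue and takes an activity reference on "obj"; the I/O thread
	 * consumes that reference when it processes the page.
	 */
	vm_pageout_cluster(pg);

	vm_page_unlock_queues();
	vm_object_unlock(obj);
}
#endif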
774
775
776/*
777 * A page is back from laundry or we are stealing it back from
778 * the laundering state. See if there are some pages waiting to
779 * go to laundry and if we can let some of them go now.
780 *
781 * Object and page queues must be locked.
782 */
783void
784vm_pageout_throttle_up(
785 vm_page_t m)
786{
787 struct vm_pageout_queue *q;
788 vm_object_t m_object;
789
790 m_object = VM_PAGE_OBJECT(m);
791
792 assert(m_object != VM_OBJECT_NULL);
793 assert(!is_kernel_object(m_object));
794
795 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
796 vm_object_lock_assert_exclusive(m_object);
797
798 if (m_object->internal == TRUE) {
799 q = &vm_pageout_queue_internal;
800 } else {
801 q = &vm_pageout_queue_external;
802 }
803
804 if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
805 vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq);
806 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
807
808 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
809
810 vm_object_activity_end(m_object);
811
812 VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1);
813 }
814 if (m->vmp_laundry == TRUE) {
815 m->vmp_laundry = FALSE;
816 q->pgo_laundry--;
817
818 if (q->pgo_throttled == TRUE) {
819 q->pgo_throttled = FALSE;
820 thread_wakeup((event_t) &q->pgo_laundry);
821 }
822 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
823 q->pgo_draining = FALSE;
824 thread_wakeup((event_t) (&q->pgo_laundry + 1));
825 }
826 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1);
827 }
828}
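/*
 * Sketch of the waiter side of the wakeups issued above (illustrative only,
 * not compiled in; the real instances live in vm_pageout_scan() and
 * vm_pageout_page_queue()).  A thread throttled by a full laundry sleeps on
 * &q->pgo_laundry with pgo_throttled set; a thread waiting for the laundry
 * to drain sleeps on (&q->pgo_laundry + 1) with pgo_draining set.  Interrupt
 * handling is omitted for brevity.
 */
#if 0
static void
vm_pageout_wait_for_drain_example(struct vm_pageout_queue *q)
{
	vm_page_lock_queues();
	if (q->pgo_laundry != 0) {
		q->pgo_draining = TRUE;
		assert_wait((event_t) (&q->pgo_laundry + 1), THREAD_UNINT);
		vm_page_unlock_queues();
		/* vm_pageout_throttle_up() wakes us once pgo_laundry hits 0 */
		thread_block(THREAD_CONTINUE_NULL);
	} else {
		vm_page_unlock_queues();
	}
}
#endif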
829
830
831static void
832vm_pageout_throttle_up_batch(
833 struct vm_pageout_queue *q,
834 int batch_cnt)
835{
836 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
837
838 VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt);
839
840 q->pgo_laundry -= batch_cnt;
841
842 if (q->pgo_throttled == TRUE) {
843 q->pgo_throttled = FALSE;
844 thread_wakeup((event_t) &q->pgo_laundry);
845 }
846 if (q->pgo_draining == TRUE && q->pgo_laundry == 0) {
847 q->pgo_draining = FALSE;
848 thread_wakeup((event_t) (&q->pgo_laundry + 1));
849 }
850}
851
852
853
854/*
855 * VM memory pressure monitoring.
856 *
857 * vm_pageout_scan() keeps track of the number of pages it considers and
858 * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now].
859 *
 * record_memory_pressure() is called periodically from compute_averages()
 * and moves "vm_pageout_stat_now" forward, to start accumulating the number
 * of reclaimed pages in a new vm_pageout_stat[] bucket.
863 *
864 * mach_vm_pressure_monitor() collects past statistics about memory pressure.
865 * The caller provides the number of seconds ("nsecs") worth of statistics
866 * it wants, up to 30 seconds.
867 * It computes the number of pages reclaimed in the past "nsecs" seconds and
868 * also returns the number of pages the system still needs to reclaim at this
869 * moment in time.
870 */
871#if DEVELOPMENT || DEBUG
#define VM_PAGEOUT_STAT_SIZE ((30 * 8) + 1)
#else
#define VM_PAGEOUT_STAT_SIZE ((1 * 8) + 1)
875#endif
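/*
 * Bucket sizing, as implied by the conversion in mach_vm_pressure_monitor()
 * below ("units_of_monitor = 8 * nsecs_monitored"): each vm_pageout_stat[]
 * slot covers one 1/8-second sampling interval, so the ring holds
 *
 *	DEVELOPMENT || DEBUG:	30 * 8 + 1 = 241 slots	(~30 seconds of history)
 *	otherwise:		 1 * 8 + 1 =   9 slots	(~1 second of history)
 *
 * with the extra "+ 1" slot leaving room for the bucket currently being
 * filled.
 */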
876struct vm_pageout_stat {
877 unsigned long vm_page_active_count;
878 unsigned long vm_page_speculative_count;
879 unsigned long vm_page_inactive_count;
880 unsigned long vm_page_anonymous_count;
881
882 unsigned long vm_page_free_count;
883 unsigned long vm_page_wire_count;
884 unsigned long vm_page_compressor_count;
885
886 unsigned long vm_page_pages_compressed;
887 unsigned long vm_page_pageable_internal_count;
888 unsigned long vm_page_pageable_external_count;
889 unsigned long vm_page_xpmapped_external_count;
890
891 unsigned int pages_grabbed;
892 unsigned int pages_freed;
893
894 unsigned int pages_compressed;
895 unsigned int pages_grabbed_by_compressor;
896 unsigned int failed_compressions;
897
898 unsigned int pages_evicted;
899 unsigned int pages_purged;
900
901 unsigned int considered;
902 unsigned int considered_bq_internal;
903 unsigned int considered_bq_external;
904
905 unsigned int skipped_external;
906 unsigned int skipped_internal;
907 unsigned int filecache_min_reactivations;
908
909 unsigned int freed_speculative;
910 unsigned int freed_cleaned;
911 unsigned int freed_internal;
912 unsigned int freed_external;
913
914 unsigned int cleaned_dirty_external;
915 unsigned int cleaned_dirty_internal;
916
917 unsigned int inactive_referenced;
918 unsigned int inactive_nolock;
919 unsigned int reactivation_limit_exceeded;
920 unsigned int forced_inactive_reclaim;
921
922 unsigned int throttled_internal_q;
923 unsigned int throttled_external_q;
924
925 unsigned int phantom_ghosts_found;
926 unsigned int phantom_ghosts_added;
927
928 unsigned int vm_page_realtime_count;
929 unsigned int forcereclaimed_sharedcache;
930 unsigned int forcereclaimed_realtime;
931 unsigned int protected_sharedcache;
932 unsigned int protected_realtime;
933} vm_pageout_stats[VM_PAGEOUT_STAT_SIZE];
934
935unsigned int vm_pageout_stat_now = 0;
936
937#define VM_PAGEOUT_STAT_BEFORE(i) \
938 (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1)
939#define VM_PAGEOUT_STAT_AFTER(i) \
940 (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1)
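/*
 * Ring-index examples (release build, VM_PAGEOUT_STAT_SIZE == 9):
 *
 *	VM_PAGEOUT_STAT_BEFORE(0) == 8		(wraps to the last slot)
 *	VM_PAGEOUT_STAT_AFTER(8)  == 0		(wraps back to the first slot)
 *	VM_PAGEOUT_STAT_BEFORE(5) == 4,	VM_PAGEOUT_STAT_AFTER(5) == 6
 */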
941
942#if VM_PAGE_BUCKETS_CHECK
943int vm_page_buckets_check_interval = 80; /* in eighths of a second */
944#endif /* VM_PAGE_BUCKETS_CHECK */
945
946
947void
948record_memory_pressure(void);
949void
950record_memory_pressure(void)
951{
952 unsigned int vm_pageout_next;
953
954#if VM_PAGE_BUCKETS_CHECK
955 /* check the consistency of VM page buckets at regular interval */
956 static int counter = 0;
957 if ((++counter % vm_page_buckets_check_interval) == 0) {
958 vm_page_buckets_check();
959 }
960#endif /* VM_PAGE_BUCKETS_CHECK */
961
962 vm_pageout_state.vm_memory_pressure =
963 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative +
964 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned +
965 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal +
966 vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external;
967
	commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure);
969
970 /* move "now" forward */
971 vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now);
972
	bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
974
975 vm_pageout_stat_now = vm_pageout_next;
976}
977
978
979/*
980 * IMPORTANT
981 * mach_vm_ctl_page_free_wanted() is called indirectly, via
982 * mach_vm_pressure_monitor(), when taking a stackshot. Therefore,
983 * it must be safe in the restricted stackshot context. Locks and/or
984 * blocking are not allowable.
985 */
986unsigned int
987mach_vm_ctl_page_free_wanted(void)
988{
989 unsigned int page_free_target, page_free_count, page_free_wanted;
990
991 page_free_target = vm_page_free_target;
992 page_free_count = vm_page_free_count;
993 if (page_free_target > page_free_count) {
994 page_free_wanted = page_free_target - page_free_count;
995 } else {
996 page_free_wanted = 0;
997 }
998
999 return page_free_wanted;
1000}
1001
1002
1003/*
1004 * IMPORTANT:
1005 * mach_vm_pressure_monitor() is called when taking a stackshot, with
1006 * wait_for_pressure FALSE, so that code path must remain safe in the
 * restricted stackshot context. No blocking or locks are allowable
 * on that code path.
1009 */
1010
1011kern_return_t
1012mach_vm_pressure_monitor(
1013 boolean_t wait_for_pressure,
1014 unsigned int nsecs_monitored,
1015 unsigned int *pages_reclaimed_p,
1016 unsigned int *pages_wanted_p)
1017{
1018 wait_result_t wr;
1019 unsigned int vm_pageout_then, vm_pageout_now;
1020 unsigned int pages_reclaimed;
1021 unsigned int units_of_monitor;
1022
1023 units_of_monitor = 8 * nsecs_monitored;
1024 /*
1025 * We don't take the vm_page_queue_lock here because we don't want
1026 * vm_pressure_monitor() to get in the way of the vm_pageout_scan()
1027 * thread when it's trying to reclaim memory. We don't need fully
1028 * accurate monitoring anyway...
1029 */
1030
1031 if (wait_for_pressure) {
1032 /* wait until there's memory pressure */
1033 while (vm_page_free_count >= vm_page_free_target) {
			wr = assert_wait((event_t) &vm_page_free_wanted,
1035 THREAD_INTERRUPTIBLE);
1036 if (wr == THREAD_WAITING) {
1037 wr = thread_block(THREAD_CONTINUE_NULL);
1038 }
1039 if (wr == THREAD_INTERRUPTED) {
1040 return KERN_ABORTED;
1041 }
1042 if (wr == THREAD_AWAKENED) {
1043 /*
1044 * The memory pressure might have already
1045 * been relieved but let's not block again
1046 * and let's report that there was memory
1047 * pressure at some point.
1048 */
1049 break;
1050 }
1051 }
1052 }
1053
1054 /* provide the number of pages the system wants to reclaim */
1055 if (pages_wanted_p != NULL) {
1056 *pages_wanted_p = mach_vm_ctl_page_free_wanted();
1057 }
1058
1059 if (pages_reclaimed_p == NULL) {
1060 return KERN_SUCCESS;
1061 }
1062
1063 /* provide number of pages reclaimed in the last "nsecs_monitored" */
1064 vm_pageout_now = vm_pageout_stat_now;
1065 pages_reclaimed = 0;
1066 for (vm_pageout_then =
1067 VM_PAGEOUT_STAT_BEFORE(vm_pageout_now);
1068 vm_pageout_then != vm_pageout_now &&
1069 units_of_monitor-- != 0;
1070 vm_pageout_then =
1071 VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) {
1072 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative;
1073 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned;
1074 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal;
1075 pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external;
1076 }
1077 *pages_reclaimed_p = pages_reclaimed;
1078
1079 return KERN_SUCCESS;
1080}
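/*
 * Hedged usage sketch (not compiled in): an in-kernel caller sampling recent
 * reclaim activity through the interface above.  The 10-second window and
 * the log wording are illustrative; only the signature defined above is
 * assumed.
 */
#if 0
static void
vm_pressure_monitor_example(void)
{
	unsigned int reclaimed = 0;
	unsigned int wanted = 0;

	/* wait_for_pressure == FALSE: never block, just read the history */
	if (mach_vm_pressure_monitor(FALSE, 10, &reclaimed, &wanted) == KERN_SUCCESS) {
		printf("pageout: %u pages reclaimed recently, %u still wanted\n",
		    reclaimed, wanted);
	}
}
#endif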
1081
1082
1083
1084#if DEVELOPMENT || DEBUG
1085
1086static void
1087vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int);
1088
1089/*
1090 * condition variable used to make sure there is
1091 * only a single sweep going on at a time
1092 */
1093bool vm_pageout_disconnect_all_pages_active = false;
1094
1095void
1096vm_pageout_disconnect_all_pages()
1097{
1098 vm_page_lock_queues();
1099
1100 if (vm_pageout_disconnect_all_pages_active) {
1101 vm_page_unlock_queues();
1102 return;
1103 }
1104 vm_pageout_disconnect_all_pages_active = true;
1105
1106 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled,
1107 vm_page_throttled_count);
1108 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous,
1109 vm_page_anonymous_count);
1110 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive,
1111 (vm_page_inactive_count - vm_page_anonymous_count));
1112 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active,
1113 vm_page_active_count);
1114#ifdef CONFIG_SECLUDED_MEMORY
1115 vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded,
1116 vm_page_secluded_count);
1117#endif /* CONFIG_SECLUDED_MEMORY */
1118 vm_page_unlock_queues();
1119
1120 vm_pageout_disconnect_all_pages_active = false;
1121}
1122
1123/* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */
1124void
1125vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount)
1126{
1127 vm_page_t m;
1128 vm_object_t t_object = NULL;
1129 vm_object_t l_object = NULL;
1130 vm_object_t m_object = NULL;
1131 int delayed_unlock = 0;
1132 int try_failed_count = 0;
1133 int disconnected_count = 0;
1134 int paused_count = 0;
1135 int object_locked_count = 0;
1136
1137 KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1138 DBG_FUNC_START),
1139 q, qcount);
1140
1141 while (qcount && !vm_page_queue_empty(q)) {
1142 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1143
1144 m = (vm_page_t) vm_page_queue_first(q);
1145 m_object = VM_PAGE_OBJECT(m);
1146
1147 /*
1148 * check to see if we currently are working
1149 * with the same object... if so, we've
1150 * already got the lock
1151 */
1152 if (m_object != l_object) {
1153 /*
1154 * the object associated with candidate page is
1155 * different from the one we were just working
1156 * with... dump the lock if we still own it
1157 */
1158 if (l_object != NULL) {
1159 vm_object_unlock(l_object);
1160 l_object = NULL;
1161 }
1162 if (m_object != t_object) {
1163 try_failed_count = 0;
1164 }
1165
1166 /*
			 * Try to lock object; since we've already got the
1168 * page queues lock, we can only 'try' for this one.
1169 * if the 'try' fails, we need to do a mutex_pause
1170 * to allow the owner of the object lock a chance to
1171 * run...
1172 */
1173 if (!vm_object_lock_try_scan(m_object)) {
1174 if (try_failed_count > 20) {
1175 goto reenter_pg_on_q;
1176 }
1177 vm_page_unlock_queues();
1178 mutex_pause(try_failed_count++);
1179 vm_page_lock_queues();
1180 delayed_unlock = 0;
1181
1182 paused_count++;
1183
1184 t_object = m_object;
1185 continue;
1186 }
1187 object_locked_count++;
1188
1189 l_object = m_object;
1190 }
1191 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry ||
1192 m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) ||
1193 m->vmp_free_when_done) {
1194 /*
1195 * put it back on the head of its queue
1196 */
1197 goto reenter_pg_on_q;
1198 }
1199 if (m->vmp_pmapped == TRUE) {
1200 pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
1201
1202 disconnected_count++;
1203 }
1204reenter_pg_on_q:
1205 vm_page_queue_remove(q, m, vmp_pageq);
1206 vm_page_queue_enter(q, m, vmp_pageq);
1207
1208 qcount--;
1209 try_failed_count = 0;
1210
1211 if (delayed_unlock++ > 128) {
1212 if (l_object != NULL) {
1213 vm_object_unlock(l_object);
1214 l_object = NULL;
1215 }
1216 lck_mtx_yield(&vm_page_queue_lock);
1217 delayed_unlock = 0;
1218 }
1219 }
1220 if (l_object != NULL) {
1221 vm_object_unlock(l_object);
1222 l_object = NULL;
1223 }
1224
1225 KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) |
1226 DBG_FUNC_END),
1227 q, disconnected_count, object_locked_count, paused_count);
1228}
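/*
 * The object-lock dance above is a recurring pattern in this file: while the
 * page queue lock is held, an object lock may only be *tried*, and on failure
 * the queue lock must be dropped and the thread paused so the current owner
 * of the object lock can make progress.  Distilled sketch (illustrative only,
 * not compiled in):
 */
#if 0
static bool
vm_object_trylock_under_queues_example(vm_object_t obj, int *try_failed_count)
{
	LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);

	if (vm_object_lock_try_scan(obj)) {
		*try_failed_count = 0;
		return true;            /* got it; page queue lock still held */
	}
	/* back off so the owner of the object lock can run */
	vm_page_unlock_queues();
	mutex_pause((*try_failed_count)++);
	vm_page_lock_queues();
	return false;                   /* caller should re-evaluate the page */
}
#endif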
1229
1230extern char* proc_best_name(struct proc* proc);
1231
1232int
1233vm_toggle_task_selfdonate_pages(task_t task)
1234{
1235 int state = 0;
1236 if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) {
1237 printf("VM Donation mode is OFF on the system\n");
1238 return state;
1239 }
1240 if (task != kernel_task) {
1241 task_lock(task);
1242 if (!task->donates_own_pages) {
1243 printf("SELF DONATE for %s ON\n", proc_best_name(get_bsdtask_info(task)));
1244 task->donates_own_pages = true;
1245 state = 1;
1246 } else if (task->donates_own_pages) {
1247 printf("SELF DONATE for %s OFF\n", proc_best_name(get_bsdtask_info(task)));
1248 task->donates_own_pages = false;
1249 state = 0;
1250 }
1251 task_unlock(task);
1252 }
1253 return state;
1254}
1255#endif /* DEVELOPMENT || DEBUG */
1256
1257void
1258vm_task_set_selfdonate_pages(task_t task, bool donate)
1259{
1260 assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED);
1261 assert(task != kernel_task);
1262
1263 task_lock(task);
1264 task->donates_own_pages = donate;
1265 task_unlock(task);
1266}
1267
1268
1269
1270static size_t
1271vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool);
1272
1273/*
1274 * condition variable used to make sure there is
1275 * only a single sweep going on at a time
1276 */
1277boolean_t vm_pageout_anonymous_pages_active = FALSE;
1278
1279
1280void
1281vm_pageout_anonymous_pages()
1282{
1283 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
1284 vm_page_lock_queues();
1285
1286 if (vm_pageout_anonymous_pages_active == TRUE) {
1287 vm_page_unlock_queues();
1288 return;
1289 }
1290 vm_pageout_anonymous_pages_active = TRUE;
1291 vm_page_unlock_queues();
1292
1293 vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false);
1294 vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false);
1295 vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false);
1296
1297 if (VM_CONFIG_SWAP_IS_PRESENT) {
1298 vm_consider_swapping();
1299 }
1300
1301 vm_page_lock_queues();
1302 vm_pageout_anonymous_pages_active = FALSE;
1303 vm_page_unlock_queues();
1304 }
1305}
1306
1307
1308size_t
1309vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test)
1310{
1311 vm_page_t m;
1312 vm_object_t t_object = NULL;
1313 vm_object_t l_object = NULL;
1314 vm_object_t m_object = NULL;
1315 int delayed_unlock = 0;
1316 int try_failed_count = 0;
1317 int refmod_state;
1318 int pmap_options;
1319 struct vm_pageout_queue *iq;
1320 ppnum_t phys_page;
1321 size_t pages_moved = 0;
1322
1323
1324 iq = &vm_pageout_queue_internal;
1325
1326 vm_page_lock_queues();
1327
1328#if DEVELOPMENT || DEBUG
1329 if (perf_test) {
1330 iq = &vm_pageout_queue_benchmark;
1331 // ensure the benchmark queue isn't throttled
1332 iq->pgo_maxlaundry = (unsigned int) qcount;
1333 }
1334#endif /* DEVELOPMENT ||DEBUG */
1335
1336 while (qcount && !vm_page_queue_empty(q)) {
1337 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1338
1339 if (VM_PAGE_Q_THROTTLED(iq)) {
1340 if (l_object != NULL) {
1341 vm_object_unlock(l_object);
1342 l_object = NULL;
1343 }
1344 iq->pgo_draining = TRUE;
1345
			assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1347 vm_page_unlock_queues();
1348
1349 thread_block(THREAD_CONTINUE_NULL);
1350
1351 vm_page_lock_queues();
1352 delayed_unlock = 0;
1353 continue;
1354 }
1355 m = (vm_page_t) vm_page_queue_first(q);
1356 m_object = VM_PAGE_OBJECT(m);
1357
1358 /*
1359 * check to see if we currently are working
1360 * with the same object... if so, we've
1361 * already got the lock
1362 */
1363 if (m_object != l_object) {
1364 if (!m_object->internal) {
1365 goto reenter_pg_on_q;
1366 }
1367
1368 /*
1369 * the object associated with candidate page is
1370 * different from the one we were just working
1371 * with... dump the lock if we still own it
1372 */
1373 if (l_object != NULL) {
1374 vm_object_unlock(l_object);
1375 l_object = NULL;
1376 }
1377 if (m_object != t_object) {
1378 try_failed_count = 0;
1379 }
1380
1381 /*
			 * Try to lock object; since we've already got the
1383 * page queues lock, we can only 'try' for this one.
1384 * if the 'try' fails, we need to do a mutex_pause
1385 * to allow the owner of the object lock a chance to
1386 * run...
1387 */
1388 if (!vm_object_lock_try_scan(m_object)) {
1389 if (try_failed_count > 20) {
1390 goto reenter_pg_on_q;
1391 }
1392 vm_page_unlock_queues();
1393 mutex_pause(try_failed_count++);
1394 vm_page_lock_queues();
1395 delayed_unlock = 0;
1396
1397 t_object = m_object;
1398 continue;
1399 }
1400 l_object = m_object;
1401 }
1402 if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) {
1403 /*
1404 * page is not to be cleaned
1405 * put it back on the head of its queue
1406 */
1407 goto reenter_pg_on_q;
1408 }
1409 phys_page = VM_PAGE_GET_PHYS_PAGE(m);
1410
1411 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
			refmod_state = pmap_get_refmod(phys_page);
1413
1414 if (refmod_state & VM_MEM_REFERENCED) {
1415 m->vmp_reference = TRUE;
1416 }
1417 if (refmod_state & VM_MEM_MODIFIED) {
1418 SET_PAGE_DIRTY(m, FALSE);
1419 }
1420 }
1421 if (m->vmp_reference == TRUE) {
1422 m->vmp_reference = FALSE;
			pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1424 goto reenter_pg_on_q;
1425 }
1426 if (m->vmp_pmapped == TRUE) {
1427 if (m->vmp_dirty || m->vmp_precious) {
1428 pmap_options = PMAP_OPTIONS_COMPRESSOR;
1429 } else {
1430 pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
1431 }
			refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1433 if (refmod_state & VM_MEM_MODIFIED) {
1434 SET_PAGE_DIRTY(m, FALSE);
1435 }
1436 }
1437
1438 if (!m->vmp_dirty && !m->vmp_precious) {
1439 vm_page_unlock_queues();
1440 VM_PAGE_FREE(m);
1441 vm_page_lock_queues();
1442 delayed_unlock = 0;
1443
1444 goto next_pg;
1445 }
1446 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1447 if (!m_object->pager_initialized) {
1448 vm_page_unlock_queues();
1449
				vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1451
1452 if (!m_object->pager_initialized) {
					vm_object_compressor_pager_create(m_object);
1454 }
1455
1456 vm_page_lock_queues();
1457 delayed_unlock = 0;
1458 }
1459 if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) {
1460 goto reenter_pg_on_q;
1461 }
1462 /*
1463 * vm_object_compressor_pager_create will drop the object lock
1464 * which means 'm' may no longer be valid to use
1465 */
1466 continue;
1467 }
1468
1469 if (!perf_test) {
1470 /*
1471 * we've already factored out pages in the laundry which
1472 * means this page can't be on the pageout queue so it's
1473 * safe to do the vm_page_queues_remove
1474 */
1475 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
			vm_page_queues_remove(m, TRUE);
1477 if (donate) {
1478 /*
1479 * The compressor needs to see this bit to know
1480 * where this page needs to land. Also if stolen,
1481 * this bit helps put the page back in the right
1482 * special queue where it belongs.
1483 */
1484 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
1485 }
1486 } else {
1487 vm_page_queue_remove(q, m, vmp_pageq);
1488 }
1489
1490 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
1491
		vm_pageout_cluster_to_queue(m, iq);
1493
1494 pages_moved++;
1495 goto next_pg;
1496
1497reenter_pg_on_q:
1498 vm_page_queue_remove(q, m, vmp_pageq);
1499 vm_page_queue_enter(q, m, vmp_pageq);
1500next_pg:
1501 qcount--;
1502 try_failed_count = 0;
1503
1504 if (delayed_unlock++ > 128) {
1505 if (l_object != NULL) {
1506 vm_object_unlock(l_object);
1507 l_object = NULL;
1508 }
			lck_mtx_yield(&vm_page_queue_lock);
1510 delayed_unlock = 0;
1511 }
1512 }
1513 if (l_object != NULL) {
1514 vm_object_unlock(l_object);
1515 l_object = NULL;
1516 }
1517 vm_page_unlock_queues();
1518 return pages_moved;
1519}
1520
1521
1522
1523/*
1524 * function in BSD to apply I/O throttle to the pageout thread
1525 */
1526extern void vm_pageout_io_throttle(void);
1527
1528#define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \
1529 MACRO_BEGIN \
1530 /* \
1531 * If a "reusable" page somehow made it back into \
1532 * the active queue, it's been re-used and is not \
1533 * quite re-usable. \
1534 * If the VM object was "all_reusable", consider it \
1535 * as "all re-used" instead of converting it to \
1536 * "partially re-used", which could be expensive. \
1537 */ \
1538 assert(VM_PAGE_OBJECT((m)) == (obj)); \
1539 if ((m)->vmp_reusable || \
1540 (obj)->all_reusable) { \
1541 vm_object_reuse_pages((obj), \
1542 (m)->vmp_offset, \
1543 (m)->vmp_offset + PAGE_SIZE_64, \
1544 FALSE); \
1545 } \
1546 MACRO_END
1547
1548
1549#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64
1550#define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024
1551
1552#define FCS_IDLE 0
1553#define FCS_DELAYED 1
1554#define FCS_DEADLOCK_DETECTED 2
1555
1556struct flow_control {
1557 int state;
1558 mach_timespec_t ts;
1559};
1560
1561
1562uint64_t vm_pageout_rejected_bq_internal = 0;
1563uint64_t vm_pageout_rejected_bq_external = 0;
1564uint64_t vm_pageout_skipped_bq_internal = 0;
1565uint64_t vm_pageout_skipped_bq_external = 0;
1566
1567#define ANONS_GRABBED_LIMIT 2
1568
1569
1570#if 0
1571static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *);
1572#endif
1573static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int);
1574
1575#define VM_PAGEOUT_PB_NO_ACTION 0
1576#define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1
1577#define VM_PAGEOUT_PB_THREAD_YIELD 2
1578
1579
1580#if 0
1581static void
1582vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq)
1583{
1584 if (*local_freeq) {
1585 vm_page_unlock_queues();
1586
1587 VM_DEBUG_CONSTANT_EVENT(
1588 vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START,
1589 vm_page_free_count, 0, 0, 1);
1590
1591 vm_page_free_list(*local_freeq, TRUE);
1592
1593 VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END,
1594 vm_page_free_count, *local_freed, 0, 1);
1595
1596 *local_freeq = NULL;
1597 *local_freed = 0;
1598
1599 vm_page_lock_queues();
1600 } else {
1601 lck_mtx_yield(&vm_page_queue_lock);
1602 }
1603 *delayed_unlock = 1;
1604}
1605#endif
1606
1607
1608static void
1609vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock,
1610 vm_page_t *local_freeq, int *local_freed, int action)
1611{
1612 vm_page_unlock_queues();
1613
1614 if (*object != NULL) {
1615 vm_object_unlock(*object);
1616 *object = NULL;
1617 }
1618 if (*local_freeq) {
		vm_page_free_list(*local_freeq, TRUE);
1620
1621 *local_freeq = NULL;
1622 *local_freed = 0;
1623 }
1624 *delayed_unlock = 1;
1625
1626 switch (action) {
1627 case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER:
1628 vm_consider_waking_compactor_swapper();
1629 break;
1630 case VM_PAGEOUT_PB_THREAD_YIELD:
		thread_yield_internal(1);
1632 break;
1633 case VM_PAGEOUT_PB_NO_ACTION:
1634 default:
1635 break;
1636 }
1637 vm_page_lock_queues();
1638}
1639
1640
1641static struct vm_pageout_vminfo last;
1642
1643uint64_t last_vm_page_pages_grabbed = 0;
1644
1645extern uint32_t c_segment_pages_compressed;
1646
1647extern uint64_t shared_region_pager_reclaimed;
1648extern struct memory_object_pager_ops shared_region_pager_ops;
1649
1650void
1651update_vm_info(void)
1652{
1653 unsigned long tmp;
1654 uint64_t tmp64;
1655
1656 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count;
1657 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count;
1658 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count;
1659 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count;
1660
1661 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count;
1662 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count;
1663 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT;
1664
1665 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed;
1666 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count;
1667 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count;
1668 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count;
1669 vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count;
1670
1671 tmp = vm_pageout_vminfo.vm_pageout_considered_page;
1672 vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page);
1673 last.vm_pageout_considered_page = tmp;
1674
1675 tmp64 = vm_pageout_vminfo.vm_pageout_compressions;
1676 vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions);
1677 last.vm_pageout_compressions = tmp64;
1678
1679 tmp = vm_pageout_vminfo.vm_compressor_failed;
1680 vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed);
1681 last.vm_compressor_failed = tmp;
1682
1683 tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed;
1684 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed);
1685 last.vm_compressor_pages_grabbed = tmp64;
1686
1687 tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost;
1688 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost);
1689 last.vm_phantom_cache_found_ghost = tmp;
1690
1691 tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost;
1692 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost);
1693 last.vm_phantom_cache_added_ghost = tmp;
1694
1695 tmp64 = counter_load(&vm_page_grab_count);
1696 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed);
1697 last_vm_page_pages_grabbed = tmp64;
1698
1699 tmp = vm_pageout_vminfo.vm_page_pages_freed;
1700 vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed);
1701 last.vm_page_pages_freed = tmp;
1702
1703 if (vm_pageout_stats[vm_pageout_stat_now].considered) {
1704 tmp = vm_pageout_vminfo.vm_pageout_pages_evicted;
1705 vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted);
1706 last.vm_pageout_pages_evicted = tmp;
1707
1708 tmp = vm_pageout_vminfo.vm_pageout_pages_purged;
1709 vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged);
1710 last.vm_pageout_pages_purged = tmp;
1711
1712 tmp = vm_pageout_vminfo.vm_pageout_freed_speculative;
1713 vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative);
1714 last.vm_pageout_freed_speculative = tmp;
1715
1716 tmp = vm_pageout_vminfo.vm_pageout_freed_external;
1717 vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external);
1718 last.vm_pageout_freed_external = tmp;
1719
1720 tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced;
1721 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced);
1722 last.vm_pageout_inactive_referenced = tmp;
1723
1724 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external;
1725 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external);
1726 last.vm_pageout_scan_inactive_throttled_external = tmp;
1727
1728 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external;
1729 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external);
1730 last.vm_pageout_inactive_dirty_external = tmp;
1731
1732 tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned;
1733 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned);
1734 last.vm_pageout_freed_cleaned = tmp;
1735
1736 tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock;
1737 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock);
1738 last.vm_pageout_inactive_nolock = tmp;
1739
1740 tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal;
1741 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal);
1742 last.vm_pageout_scan_inactive_throttled_internal = tmp;
1743
1744 tmp = vm_pageout_vminfo.vm_pageout_skipped_external;
1745 vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external);
1746 last.vm_pageout_skipped_external = tmp;
1747
1748 tmp = vm_pageout_vminfo.vm_pageout_skipped_internal;
1749 vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal);
1750 last.vm_pageout_skipped_internal = tmp;
1751
1752 tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded;
1753 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded);
1754 last.vm_pageout_reactivation_limit_exceeded = tmp;
1755
1756 tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim;
1757 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim);
1758 last.vm_pageout_inactive_force_reclaim = tmp;
1759
1760 tmp = vm_pageout_vminfo.vm_pageout_freed_internal;
1761 vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal);
1762 last.vm_pageout_freed_internal = tmp;
1763
1764 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal;
1765 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal);
1766 last.vm_pageout_considered_bq_internal = tmp;
1767
1768 tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external;
1769 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external);
1770 last.vm_pageout_considered_bq_external = tmp;
1771
1772 tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated;
1773 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated);
1774 last.vm_pageout_filecache_min_reactivated = tmp;
1775
1776 tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal;
1777 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal);
1778 last.vm_pageout_inactive_dirty_internal = tmp;
1779
1780 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache;
1781 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache);
1782 last.vm_pageout_forcereclaimed_sharedcache = tmp;
1783
1784 tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime;
1785 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime);
1786 last.vm_pageout_forcereclaimed_realtime = tmp;
1787
1788 tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache;
1789 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache);
1790 last.vm_pageout_protected_sharedcache = tmp;
1791
1792 tmp = vm_pageout_vminfo.vm_pageout_protected_realtime;
1793 vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime);
1794 last.vm_pageout_protected_realtime = tmp;
1795 }
1796
1797 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE,
1798 vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count,
1799 vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count,
1800 vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count,
1801 vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count,
1802 0);
1803
1804 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE,
1805 vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count,
1806 vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count,
1807 vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count,
1808 0,
1809 0);
1810
1811 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE,
1812 vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed,
1813 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count,
1814 vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count,
1815 vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count,
1816 0);
1817
1818 if (vm_pageout_stats[vm_pageout_stat_now].considered ||
1819 vm_pageout_stats[vm_pageout_stat_now].pages_compressed ||
1820 vm_pageout_stats[vm_pageout_stat_now].failed_compressions) {
1821 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE,
1822 vm_pageout_stats[vm_pageout_stat_now].considered,
1823 vm_pageout_stats[vm_pageout_stat_now].freed_speculative,
1824 vm_pageout_stats[vm_pageout_stat_now].freed_external,
1825 vm_pageout_stats[vm_pageout_stat_now].inactive_referenced,
1826 0);
1827
1828 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE,
1829 vm_pageout_stats[vm_pageout_stat_now].throttled_external_q,
1830 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external,
1831 vm_pageout_stats[vm_pageout_stat_now].freed_cleaned,
1832 vm_pageout_stats[vm_pageout_stat_now].inactive_nolock,
1833 0);
1834
1835 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE,
1836 vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q,
1837 vm_pageout_stats[vm_pageout_stat_now].pages_compressed,
1838 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor,
1839 vm_pageout_stats[vm_pageout_stat_now].skipped_external,
1840 0);
1841
1842 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE,
1843 vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded,
1844 vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim,
1845 vm_pageout_stats[vm_pageout_stat_now].failed_compressions,
1846 vm_pageout_stats[vm_pageout_stat_now].freed_internal,
1847 0);
1848
1849 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE,
1850 vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal,
1851 vm_pageout_stats[vm_pageout_stat_now].considered_bq_external,
1852 vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations,
1853 vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal,
1854 0);
1855
1856 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO10)) | DBG_FUNC_NONE,
1857 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache,
1858 vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime,
1859 vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache,
1860 vm_pageout_stats[vm_pageout_stat_now].protected_realtime,
1861 0);
1862 }
1863 KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE,
1864 vm_pageout_stats[vm_pageout_stat_now].pages_grabbed,
1865 vm_pageout_stats[vm_pageout_stat_now].pages_freed,
1866 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found,
1867 vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added,
1868 0);
1869
1870 record_memory_pressure();
1871}
1872
1873extern boolean_t hibernation_vmqueues_inspection;
1874
1875/*
1876 * Return values for functions called by vm_pageout_scan
1877 * that control its flow.
1878 *
1879 * PROCEED -- vm_pageout_scan will keep making forward progress.
1880 * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns.
1881 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan aka continue.
1882 */
1883
1884#define VM_PAGEOUT_SCAN_PROCEED (0)
1885#define VM_PAGEOUT_SCAN_DONE_RETURN (1)
1886#define VM_PAGEOUT_SCAN_NEXT_ITERATION (2)
1887
1888/*
1889 * This function is called only from vm_pageout_scan and
1890 * it moves overflow secluded pages (one-at-a-time) to the
1891 * batched 'local' free Q or active Q.
1892 */
1893static void
1894vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed)
1895{
1896#if CONFIG_SECLUDED_MEMORY
1897 /*
1898 * Deal with secluded_q overflow.
1899 */
1900 if (vm_page_secluded_count > vm_page_secluded_target) {
1901 vm_page_t secluded_page;
1902
1903 /*
1904 * SECLUDED_AGING_BEFORE_ACTIVE:
1905 * Excess secluded pages go to the active queue and
1906 * will later go to the inactive queue.
1907 */
1908 assert((vm_page_secluded_count_free +
1909 vm_page_secluded_count_inuse) ==
1910 vm_page_secluded_count);
1911 secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded);
1912 assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q);
1913
1914 vm_page_queues_remove(secluded_page, FALSE);
1915 assert(!secluded_page->vmp_fictitious);
1916 assert(!VM_PAGE_WIRED(secluded_page));
1917
1918 if (secluded_page->vmp_object == 0) {
1919 /* transfer to free queue */
1920 assert(secluded_page->vmp_busy);
1921 secluded_page->vmp_snext = *local_freeq;
1922 *local_freeq = secluded_page;
1923 *local_freed += 1;
1924 } else {
1925 /* transfer to head of active queue */
1926 vm_page_enqueue_active(secluded_page, FALSE);
1927 secluded_page = VM_PAGE_NULL;
1928 }
1929 }
1930#else /* CONFIG_SECLUDED_MEMORY */
1931
1932#pragma unused(local_freeq)
1933#pragma unused(local_freed)
1934
1935 return;
1936
1937#endif /* CONFIG_SECLUDED_MEMORY */
1938}
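
/*
 * The helper above threads reclaimed pages onto the caller's batched
 * 'local' free queue via the page's vmp_snext link rather than freeing
 * them one at a time.  A minimal, self-contained sketch of that intrusive
 * singly-linked batching pattern follows; the names ex_page,
 * ex_local_free_push and ex_local_free_drain are invented for the
 * illustration and are not part of this file.
 */
#if 0 /* illustrative sketch only -- not compiled */
#include <stddef.h>

struct ex_page {
	struct ex_page *next;   /* plays the role of vmp_snext */
	int             num;
};

/* push one page onto the local batch, mirroring the *local_freeq / *local_freed idiom */
static void
ex_local_free_push(struct ex_page **local_freeq, int *local_freed, struct ex_page *p)
{
	p->next = *local_freeq;         /* link in at the head */
	*local_freeq = p;
	*local_freed += 1;              /* batch size, later checked against a flush threshold */
}

/* drain the whole batch with a single call, analogous to handing the list to vm_page_free_list() */
static void
ex_local_free_drain(struct ex_page **local_freeq, int *local_freed,
    void (*free_list)(struct ex_page *))
{
	if (*local_freeq != NULL) {
		free_list(*local_freeq);
		*local_freeq = NULL;
		*local_freed = 0;
	}
}
#endif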
1939
1940/*
1941 * This function is called only from vm_pageout_scan and
1942 * it initializes the loop targets for vm_pageout_scan().
1943 */
1944static void
1945vps_init_page_targets(void)
1946{
1947 /*
1948 * LD TODO: Other page targets should be calculated here too.
1949 */
1950 vm_page_anonymous_min = vm_page_inactive_target / 20;
1951
1952 if (vm_pageout_state.vm_page_speculative_percentage > 50) {
1953 vm_pageout_state.vm_page_speculative_percentage = 50;
1954 } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) {
1955 vm_pageout_state.vm_page_speculative_percentage = 1;
1956 }
1957
1958 vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count +
1959 vm_page_inactive_count);
1960}
1961
1962/*
1963 * This function is called only from vm_pageout_scan and
1964  * it purges a single VM object at a time and will either
1965  * make vm_pageout_scan() restart the loop or keep moving forward.
1966 */
1967static int
1968vps_purge_object()
1969{
1970 int force_purge;
1971
1972 assert(available_for_purge >= 0);
1973 force_purge = 0; /* no force-purging */
1974
1975#if VM_PRESSURE_EVENTS
1976 vm_pressure_level_t pressure_level;
1977
1978 pressure_level = memorystatus_vm_pressure_level;
1979
1980 if (pressure_level > kVMPressureNormal) {
1981 if (pressure_level >= kVMPressureCritical) {
1982 force_purge = vm_pageout_state.memorystatus_purge_on_critical;
1983 } else if (pressure_level >= kVMPressureUrgent) {
1984 force_purge = vm_pageout_state.memorystatus_purge_on_urgent;
1985 } else if (pressure_level >= kVMPressureWarning) {
1986 force_purge = vm_pageout_state.memorystatus_purge_on_warning;
1987 }
1988 }
1989#endif /* VM_PRESSURE_EVENTS */
1990
1991 if (available_for_purge || force_purge) {
1992 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START);
1993
1994 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0);
1995 if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1996 VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1);
1997 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0);
1998 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
1999
2000 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2001 }
2002 VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1);
2003 memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END);
2004 }
2005
2006 return VM_PAGEOUT_SCAN_PROCEED;
2007}
2008
2009/*
2010 * This function is called only from vm_pageout_scan and
2011 * it will try to age the next speculative Q if the oldest
2012 * one is empty.
2013 */
2014static int
2015vps_age_speculative_queue(boolean_t force_speculative_aging)
2016{
2017#define DELAY_SPECULATIVE_AGE 1000
2018
2019 /*
2020 * try to pull pages from the aging bins...
2021 * see vm_page.h for an explanation of how
2022 * this mechanism works
2023 */
2024 boolean_t can_steal = FALSE;
2025 int num_scanned_queues;
2026 static int delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop. */
2027 mach_timespec_t ts;
2028 struct vm_speculative_age_q *aq;
2029 struct vm_speculative_age_q *sq;
2030
2031 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2032
2033 aq = &vm_page_queue_speculative[speculative_steal_index];
2034
2035 num_scanned_queues = 0;
2036 while (vm_page_queue_empty(&aq->age_q) &&
2037 num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2038 speculative_steal_index++;
2039
2040 if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) {
2041 speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q;
2042 }
2043
2044 aq = &vm_page_queue_speculative[speculative_steal_index];
2045 }
2046
2047 if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) {
2048 /*
2049 * XXX We've scanned all the speculative
2050 * queues but still haven't found one
2051 * that is not empty, even though
2052 * vm_page_speculative_count is not 0.
2053 */
2054 if (!vm_page_queue_empty(&sq->age_q)) {
2055 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2056 }
2057#if DEVELOPMENT || DEBUG
2058 panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2059#endif
2060 /* readjust... */
2061 vm_page_speculative_count = 0;
2062 /* ... and continue */
2063 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2064 }
2065
2066 if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) {
2067 can_steal = TRUE;
2068 } else {
2069 if (!delay_speculative_age) {
2070 mach_timespec_t ts_fully_aged;
2071
2072 ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000;
2073 ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000)
2074 * 1000 * NSEC_PER_USEC;
2075
2076 ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts);
2077
2078 clock_sec_t sec;
2079 clock_nsec_t nsec;
2080 clock_get_system_nanotime(&sec, &nsec);
2081 ts.tv_sec = (unsigned int) sec;
2082 ts.tv_nsec = nsec;
2083
2084 if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) {
2085 can_steal = TRUE;
2086 } else {
2087 delay_speculative_age++;
2088 }
2089 } else {
2090 delay_speculative_age++;
2091 if (delay_speculative_age == DELAY_SPECULATIVE_AGE) {
2092 delay_speculative_age = 0;
2093 }
2094 }
2095 }
2096 if (can_steal == TRUE) {
2097 vm_page_speculate_ageit(aq);
2098 }
2099
2100 return VM_PAGEOUT_SCAN_PROCEED;
2101}
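
/*
 * The fully-aged deadline above is built by converting a millisecond
 * budget (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_page_speculative_q_age_ms)
 * into a (tv_sec, tv_nsec) pair.  A stand-alone sketch of that split is
 * shown below; the constants (16 bins, 333 ms per bin) are made up for
 * the example and are not the kernel's tunables.
 */
#if 0 /* illustrative sketch only -- not compiled */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC_EX 1000ULL

int
main(void)
{
	/* hypothetical values: 16 aging bins, 333 ms per bin */
	unsigned int max_age_q = 16;
	unsigned int q_age_ms  = 333;
	unsigned int total_ms  = max_age_q * q_age_ms;      /* 5328 ms */

	unsigned int tv_sec  = total_ms / 1000;             /* 5 s */
	uint64_t     tv_nsec = (uint64_t)(total_ms % 1000)  /* 328 ms */
	    * 1000 * NSEC_PER_USEC_EX;                      /* -> 328,000,000 ns */

	printf("deadline offset = %u s + %llu ns\n", tv_sec,
	    (unsigned long long)tv_nsec);
	return 0;
}
#endif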
2102
2103/*
2104 * This function is called only from vm_pageout_scan and
2105 * it evicts a single VM object from the cache.
2106 */
2107 static inline int
2108vps_object_cache_evict(vm_object_t *object_to_unlock)
2109{
2110 static int cache_evict_throttle = 0;
2111 struct vm_speculative_age_q *sq;
2112
2113 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2114
2115 if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) {
2116 int pages_evicted;
2117
2118 if (*object_to_unlock != NULL) {
2119 vm_object_unlock(*object_to_unlock);
2120 *object_to_unlock = NULL;
2121 }
2122 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0);
2123
2124 pages_evicted = vm_object_cache_evict(100, 10);
2125
2126 KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0);
2127
2128 if (pages_evicted) {
2129 vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted;
2130
2131 VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE,
2132 vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0);
2133 memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE);
2134
2135 /*
2136 * we just freed up to 100 pages,
2137 * so go back to the top of the main loop
2138 * and re-evaluate the memory situation
2139 */
2140 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2141 } else {
2142 cache_evict_throttle = 1000;
2143 }
2144 }
2145 if (cache_evict_throttle) {
2146 cache_evict_throttle--;
2147 }
2148
2149 return VM_PAGEOUT_SCAN_PROCEED;
2150}
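
/*
 * The static cache_evict_throttle above implements a simple countdown
 * back-off: an unproductive call to vm_object_cache_evict() arms a
 * 1000-pass cooldown, and each subsequent scan pass decrements it until
 * eviction may be attempted again.  The sketch below isolates that
 * pattern; the names are invented for the illustration.
 */
#if 0 /* illustrative sketch only -- not compiled */
/* returns 1 if expensive_work() made progress on this pass, 0 otherwise */
static int
ex_throttled_attempt(int (*expensive_work)(void))
{
	static int cooldown = 0;

	if (cooldown == 0) {
		if (expensive_work() > 0) {
			return 1;       /* productive: no cooldown armed */
		}
		cooldown = 1000;        /* nothing reclaimed: back off */
	}
	if (cooldown) {
		cooldown--;
	}
	return 0;
}
#endif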
2151
2152
2153/*
2154 * This function is called only from vm_pageout_scan and
2155 * it calculates the filecache min. that needs to be maintained
2156 * as we start to steal pages.
2157 */
2158static void
2159vps_calculate_filecache_min(void)
2160{
2161 int divisor = vm_pageout_state.vm_page_filecache_min_divisor;
2162
2163#if CONFIG_JETSAM
2164 /*
2165 * don't let the filecache_min fall below 15% of available memory
2166 * on systems with an active compressor that isn't nearing its
2167 * limits w/r to accepting new data
2168 *
2169 * on systems w/o the compressor/swapper, the filecache is always
2170 * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY
2171 * since most (if not all) of the anonymous pages are in the
2172 * throttled queue (which isn't counted as available) which
2173 * effectively disables this filter
2174 */
2175 if (vm_compressor_low_on_space() || divisor == 0) {
2176 vm_pageout_state.vm_page_filecache_min = 0;
2177 } else {
2178 vm_pageout_state.vm_page_filecache_min =
2179 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2180 }
2181#else
2182 if (vm_compressor_out_of_space() || divisor == 0) {
2183 vm_pageout_state.vm_page_filecache_min = 0;
2184 } else {
2185 /*
2186 * don't let the filecache_min fall below the specified critical level
2187 */
2188 vm_pageout_state.vm_page_filecache_min =
2189 ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor;
2190 }
2191#endif
2192 if (vm_page_free_count < (vm_page_free_reserved / 4)) {
2193 vm_pageout_state.vm_page_filecache_min = 0;
2194 }
2195}
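
/*
 * Both branches above compute the filecache floor as
 * (AVAILABLE_NON_COMPRESSED_MEMORY * 10) / divisor, so a smaller divisor
 * keeps a larger fraction of available memory for the filecache.  A tiny
 * numeric sketch follows; the page count and divisor are made up for the
 * example (a divisor of 66 would correspond to roughly 15%).
 */
#if 0 /* illustrative sketch only -- not compiled */
#include <stdio.h>

int
main(void)
{
	unsigned int available_non_compressed = 200000; /* hypothetical page count */
	unsigned int divisor = 66;                      /* ~15% floor */

	unsigned int filecache_min = (available_non_compressed * 10) / divisor;

	/* 200000 * 10 / 66 = 30303 pages, i.e. ~15.2% of available memory */
	printf("filecache_min = %u pages\n", filecache_min);
	return 0;
}
#endif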
2196
2197/*
2198 * This function is called only from vm_pageout_scan and
2199  * it updates the flow control time to detect if VM pageout scan
2200 * isn't making progress.
2201 */
2202static void
2203vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control)
2204{
2205 mach_timespec_t ts;
2206 clock_sec_t sec;
2207 clock_nsec_t nsec;
2208
2209 ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000;
2210 ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC;
2211 clock_get_system_nanotime(&sec, &nsec);
2212 flow_control->ts.tv_sec = (unsigned int) sec;
2213 flow_control->ts.tv_nsec = nsec;
2214 ADD_MACH_TIMESPEC(&flow_control->ts, &ts);
2215
2216 flow_control->state = FCS_DELAYED;
2217
2218 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++;
2219}
2220
2221/*
2222 * This function is called only from vm_pageout_scan and
2223 * it is the flow control logic of VM pageout scan which
2224 * controls if it should block and for how long.
2225 * Any blocking of vm_pageout_scan happens ONLY in this function.
2226 */
2227static int
2228vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock,
2229 vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count)
2230{
2231 boolean_t exceeded_burst_throttle = FALSE;
2232 unsigned int msecs = 0;
2233 uint32_t inactive_external_count;
2234 mach_timespec_t ts;
2235 struct vm_pageout_queue *iq;
2236 struct vm_pageout_queue *eq;
2237 struct vm_speculative_age_q *sq;
2238
2239 iq = &vm_pageout_queue_internal;
2240 eq = &vm_pageout_queue_external;
2241 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2242
2243 /*
2244 * Sometimes we have to pause:
2245 * 1) No inactive pages - nothing to do.
2246 * 2) Loop control - no acceptable pages found on the inactive queue
2247 * within the last vm_pageout_burst_inactive_throttle iterations
2248 * 3) Flow control - default pageout queue is full
2249 */
2250 if (vm_page_queue_empty(&vm_page_queue_inactive) &&
2251 vm_page_queue_empty(&vm_page_queue_anonymous) &&
2252 vm_page_queue_empty(&vm_page_queue_cleaned) &&
2253 vm_page_queue_empty(&sq->age_q)) {
2254 VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1);
2255 msecs = vm_pageout_state.vm_pageout_empty_wait;
2256 } else if (inactive_burst_count >=
2257 MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle,
2258 (vm_page_inactive_count +
2259 vm_page_speculative_count))) {
2260 VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1);
2261 msecs = vm_pageout_state.vm_pageout_burst_wait;
2262
2263 exceeded_burst_throttle = TRUE;
2264 } else if (VM_PAGE_Q_THROTTLED(iq) &&
2265 VM_DYNAMIC_PAGING_ENABLED()) {
2266 clock_sec_t sec;
2267 clock_nsec_t nsec;
2268
2269 switch (flow_control->state) {
2270 case FCS_IDLE:
2271 if ((vm_page_free_count + *local_freed) < vm_page_free_target &&
2272 vm_pageout_state.vm_restricted_to_single_processor == FALSE) {
2273 /*
2274 * since the compressor is running independently of vm_pageout_scan
2275 * let's not wait for it just yet... as long as we have a healthy supply
2276 * of filecache pages to work with, let's keep stealing those.
2277 */
2278 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2279
2280 if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min &&
2281 (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2282 *anons_grabbed = ANONS_GRABBED_LIMIT;
2283 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1);
2284 return VM_PAGEOUT_SCAN_PROCEED;
2285 }
2286 }
2287
2288 vps_flow_control_reset_deadlock_timer(flow_control);
2289 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2290
2291 break;
2292
2293 case FCS_DELAYED:
2294 clock_get_system_nanotime(&sec, &nsec);
2295 ts.tv_sec = (unsigned int) sec;
2296 ts.tv_nsec = nsec;
2297
2298 if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) {
2299 /*
2300 * the pageout thread for the default pager is potentially
2301 * deadlocked since the
2302 * default pager queue has been throttled for more than the
2303 * allowable time... we need to move some clean pages or dirty
2304 * pages belonging to the external pagers if they aren't throttled
2305 * vm_page_free_wanted represents the number of threads currently
2306 * blocked waiting for pages... we'll move one page for each of
2307 * these plus a fixed amount to break the logjam... once we're done
2308 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2309 * with a new timeout target since we have no way of knowing
2310 * whether we've broken the deadlock except through observation
2311 * of the queue associated with the default pager... we need to
2312 * stop moving pages and allow the system to run to see what
2313 * state it settles into.
2314 */
2315
2316 *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief +
2317 vm_page_free_wanted + vm_page_free_wanted_privileged;
2318 VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1);
2319 flow_control->state = FCS_DEADLOCK_DETECTED;
2320 thread_wakeup(VM_PAGEOUT_GC_EVENT);
2321 return VM_PAGEOUT_SCAN_PROCEED;
2322 }
2323 /*
2324 * just resniff instead of trying
2325 * to compute a new delay time... we're going to be
2326 * awakened immediately upon a laundry completion,
2327 * so we won't wait any longer than necessary
2328 */
2329 msecs = vm_pageout_state.vm_pageout_idle_wait;
2330 break;
2331
2332 case FCS_DEADLOCK_DETECTED:
2333 if (*vm_pageout_deadlock_target) {
2334 return VM_PAGEOUT_SCAN_PROCEED;
2335 }
2336
2337 vps_flow_control_reset_deadlock_timer(flow_control);
2338 msecs = vm_pageout_state.vm_pageout_deadlock_wait;
2339
2340 break;
2341 }
2342 } else {
2343 /*
2344 * No need to pause...
2345 */
2346 return VM_PAGEOUT_SCAN_PROCEED;
2347 }
2348
2349 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2350
2351 vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed,
2352 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
2353
2354 if (vm_page_free_count >= vm_page_free_target) {
2355 /*
2356 * we're here because
2357 * 1) someone else freed up some pages while we had
2358 * the queues unlocked above
2359 * and we've hit one of the 3 conditions that
2360 * cause us to pause the pageout scan thread
2361 *
2362 * since we already have enough free pages,
2363 * let's avoid stalling and return normally
2364 *
2365 * before we return, make sure the pageout I/O threads
2366 * are running throttled in case there are still requests
2367 * in the laundry... since we have enough free pages
2368 * we don't need the laundry to be cleaned in a timely
2369 * fashion... so let's avoid interfering with foreground
2370 * activity
2371 *
2372 * we don't want to hold vm_page_queue_free_lock when
2373 * calling vm_pageout_adjust_eq_iothrottle (since it
2374 * may cause other locks to be taken), we do the initial
2375 * check outside of the lock. Once we take the lock,
2376 * we recheck the condition since it may have changed.
2377 * if it has, no problem, we will make the threads
2378 * non-throttled before actually blocking
2379 */
2380 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
2381 }
2382 vm_free_page_lock();
2383
2384 if (vm_page_free_count >= vm_page_free_target &&
2385 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
2386 return VM_PAGEOUT_SCAN_DONE_RETURN;
2387 }
2388 vm_free_page_unlock();
2389
2390 if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) {
2391 /*
2392 * we're most likely about to block due to one of
2393 * the 3 conditions that cause vm_pageout_scan to
2394 * not be able to make forward progress w/r
2395 * to providing new pages to the free queue,
2396 * so unthrottle the I/O threads in case we
2397 * have laundry to be cleaned... it needs
2398 * to be completed ASAP.
2399 *
2400 * even if we don't block, we want the io threads
2401 * running unthrottled since the sum of free +
2402 * clean pages is still under our free target
2403 */
2404 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2405 }
2406 if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) {
2407 /*
2408 * if we get here we're below our free target and
2409 * we're stalling due to a full laundry queue or
2410 * we don't have any inactive pages other than
2411 * those in the clean queue...
2412 * however, we have pages on the clean queue that
2413 * can be moved to the free queue, so let's not
2414 * stall the pageout scan
2415 */
2416 flow_control->state = FCS_IDLE;
2417 return VM_PAGEOUT_SCAN_PROCEED;
2418 }
2419 if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) {
2420 flow_control->state = FCS_IDLE;
2421 return VM_PAGEOUT_SCAN_PROCEED;
2422 }
2423
2424 VM_CHECK_MEMORYSTATUS;
2425
2426 if (flow_control->state != FCS_IDLE) {
2427 VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1);
2428 }
2429
2430 iq->pgo_throttled = TRUE;
2431 assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2432
2433 vm_page_unlock_queues();
2434
2435 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
2436
2437 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START,
2438 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2439 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START);
2440
2441 thread_block(THREAD_CONTINUE_NULL);
2442
2443 VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END,
2444 iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0);
2445 memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END);
2446
2447 vm_page_lock_queues();
2448
2449 iq->pgo_throttled = FALSE;
2450
2451 vps_init_page_targets();
2452
2453 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2454}
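
/*
 * The throttle handling above is a small three-state machine: FCS_IDLE
 * arms a deadlock timer and moves to FCS_DELAYED, FCS_DELAYED either
 * keeps waiting on laundry completions or, once the deadline passes,
 * declares FCS_DEADLOCK_DETECTED, and the deadlock state falls back to
 * FCS_DELAYED after its relief target has been consumed.  The reduced
 * sketch below models just those transitions; names prefixed with ex_
 * are invented for the illustration.
 */
#if 0 /* illustrative sketch only -- not compiled */
typedef enum { EX_FCS_IDLE, EX_FCS_DELAYED, EX_FCS_DEADLOCK_DETECTED } ex_fcs_t;

/*
 * returns the next state, given whether the deadline has passed and
 * whether the deadlock relief target has been fully consumed
 */
static ex_fcs_t
ex_flow_control_step(ex_fcs_t state, int deadline_passed, int relief_done)
{
	switch (state) {
	case EX_FCS_IDLE:
		return EX_FCS_DELAYED;                  /* arm the deadlock timer */
	case EX_FCS_DELAYED:
		return deadline_passed ?
		       EX_FCS_DEADLOCK_DETECTED :       /* throttled for too long */
		       EX_FCS_DELAYED;                  /* keep waiting on laundry */
	case EX_FCS_DEADLOCK_DETECTED:
		return relief_done ?
		       EX_FCS_DELAYED :                 /* re-arm with a new deadline */
		       EX_FCS_DEADLOCK_DETECTED;        /* keep moving relief pages */
	}
	return state;
}
#endif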
2455
2456extern boolean_t vm_darkwake_mode;
2457/*
2458 * This function is called only from vm_pageout_scan and
2459 * it will find and return the most appropriate page to be
2460 * reclaimed.
2461 */
2462static int
2463vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous,
2464 boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call)
2465{
2466 vm_page_t m = NULL;
2467 vm_object_t m_object = VM_OBJECT_NULL;
2468 uint32_t inactive_external_count;
2469 struct vm_speculative_age_q *sq;
2470 struct vm_pageout_queue *iq;
2471 int retval = VM_PAGEOUT_SCAN_PROCEED;
2472
2473 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2474 iq = &vm_pageout_queue_internal;
2475
2476 *is_page_from_bg_q = FALSE;
2477
2478 m = NULL;
2479 m_object = VM_OBJECT_NULL;
2480
2481 if (VM_DYNAMIC_PAGING_ENABLED()) {
2482 assert(vm_page_throttled_count == 0);
2483 assert(vm_page_queue_empty(&vm_page_queue_throttled));
2484 }
2485
2486 /*
2487 * Try for a clean-queue inactive page.
2488 * These are pages that vm_pageout_scan tried to steal earlier, but
2489 * were dirty and had to be cleaned. Pick them up now that they are clean.
2490 */
2491 if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2492 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2493
2494 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q);
2495
2496 goto found_page;
2497 }
2498
2499 /*
2500 * The next most eligible pages are ones we paged in speculatively,
2501 * but which have not yet been touched and have been aged out.
2502 */
2503 if (!vm_page_queue_empty(&sq->age_q)) {
2504 m = (vm_page_t) vm_page_queue_first(&sq->age_q);
2505
2506 assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q);
2507
2508 if (!m->vmp_dirty || force_anonymous == FALSE) {
2509 goto found_page;
2510 } else {
2511 m = NULL;
2512 }
2513 }
2514
2515#if !CONFIG_JETSAM
2516 if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) {
2517 if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) {
2518 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate);
2519 assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
2520 goto found_page;
2521 }
2522 }
2523#endif /* !CONFIG_JETSAM */
2524
2525 if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) {
2526 vm_object_t bg_m_object = NULL;
2527
2528 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2529
2530 bg_m_object = VM_PAGE_OBJECT(m);
2531
2532 if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) {
2533 /*
2534 * This page is on the background queue
2535 * but not on a pageable queue OR is busy during
2536 * darkwake mode when the target is artificially lowered.
2537 * If it is busy during darkwake mode, and we don't skip it,
2538 * we will just swing back around and try again with the same
2539 * queue and might hit the same page or its neighbor in a
2540 * similar state. Both of these are transient states and will
2541 * get resolved, but, at this point let's ignore this page.
2542 */
2543 if (vm_darkwake_mode && m->vmp_busy) {
2544 if (bg_m_object->internal) {
2545 vm_pageout_skipped_bq_internal++;
2546 } else {
2547 vm_pageout_skipped_bq_external++;
2548 }
2549 }
2550 } else if (force_anonymous == FALSE || bg_m_object->internal) {
2551 if (bg_m_object->internal &&
2552 (VM_PAGE_Q_THROTTLED(iq) ||
2553 vm_compressor_out_of_space() == TRUE ||
2554 vm_page_free_count < (vm_page_free_reserved / 4))) {
2555 vm_pageout_skipped_bq_internal++;
2556 } else {
2557 *is_page_from_bg_q = TRUE;
2558
2559 if (bg_m_object->internal) {
2560 vm_pageout_vminfo.vm_pageout_considered_bq_internal++;
2561 } else {
2562 vm_pageout_vminfo.vm_pageout_considered_bq_external++;
2563 }
2564 goto found_page;
2565 }
2566 }
2567 }
2568
2569 inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count;
2570
2571 if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) ||
2572 (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) {
2573 *grab_anonymous = TRUE;
2574 *anons_grabbed = 0;
2575
2576 if (VM_CONFIG_SWAP_IS_ACTIVE) {
2577 vm_pageout_vminfo.vm_pageout_skipped_external++;
2578 } else {
2579 if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) {
2580 /*
2581 * No swap and we are in dangerously low levels of free memory.
2582 * If we keep going ahead with anonymous pages, we are going to run into a situation
2583 * where the compressor will be stuck waiting for free pages (if it isn't already).
2584 *
2585 * So, pick a file backed page...
2586 */
2587 *grab_anonymous = FALSE;
2588 *anons_grabbed = ANONS_GRABBED_LIMIT;
2589 vm_pageout_vminfo.vm_pageout_skipped_internal++;
2590 }
2591 }
2592 goto want_anonymous;
2593 }
2594 *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min);
2595
2596#if CONFIG_JETSAM
2597 /* If the file-backed pool has accumulated
2598 * significantly more pages than the jetsam
2599 * threshold, prefer to reclaim those
2600 * inline to minimise compute overhead of reclaiming
2601 * anonymous pages.
2602 * This calculation does not account for the CPU local
2603 * external page queues, as those are expected to be
2604 * much smaller relative to the global pools.
2605 */
2606
2607 struct vm_pageout_queue *eq = &vm_pageout_queue_external;
2608
2609 if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) {
2610 if (vm_page_pageable_external_count >
2611 vm_pageout_state.vm_page_filecache_min) {
2612 if ((vm_page_pageable_external_count *
2613 vm_pageout_memorystatus_fb_factor_dr) >
2614 (memorystatus_available_pages_critical *
2615 vm_pageout_memorystatus_fb_factor_nr)) {
2616 *grab_anonymous = FALSE;
2617
2618 VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1);
2619 }
2620 }
2621 if (*grab_anonymous) {
2622 VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1);
2623 }
2624 }
2625#endif /* CONFIG_JETSAM */
2626
2627want_anonymous:
2628 if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) {
2629 if (!vm_page_queue_empty(&vm_page_queue_inactive)) {
2630 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2631
2632 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q);
2633 *anons_grabbed = 0;
2634
2635 if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) {
2636 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2637 if ((++(*reactivated_this_call) % 100)) {
2638 vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++;
2639
2640 vm_page_activate(m);
2641 counter_inc(&vm_statistics_reactivations);
2642#if DEVELOPMENT || DEBUG
2643 if (*is_page_from_bg_q == TRUE) {
2644 if (m_object->internal) {
2645 vm_pageout_rejected_bq_internal++;
2646 } else {
2647 vm_pageout_rejected_bq_external++;
2648 }
2649 }
2650#endif /* DEVELOPMENT || DEBUG */
2651 vm_pageout_state.vm_pageout_inactive_used++;
2652
2653 m = NULL;
2654 retval = VM_PAGEOUT_SCAN_NEXT_ITERATION;
2655
2656 goto found_page;
2657 }
2658
2659 /*
2660 * steal 1 of the file backed pages even if
2661 * we are under the limit that has been set
2662 * for a healthy filecache
2663 */
2664 }
2665 }
2666 goto found_page;
2667 }
2668 }
2669 if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2670 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2671
2672 assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q);
2673 *anons_grabbed += 1;
2674
2675 goto found_page;
2676 }
2677
2678 m = NULL;
2679
2680found_page:
2681 *victim_page = m;
2682
2683 return retval;
2684}
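
/*
 * Stripped of the throttling and filecache-floor special cases, the
 * selection above walks a fixed priority order: cleaned pages, then aged
 * speculative pages, then (when enabled) donated and background pages,
 * and finally the file-backed inactive vs. anonymous queues chosen by
 * grab_anonymous/anons_grabbed.  The sketch below shows only that
 * first-non-empty-queue walk; the queue names are invented for the
 * illustration.
 */
#if 0 /* illustrative sketch only -- not compiled */
#include <stddef.h>

struct ex_queue {
	const char *name;
	int         count;      /* number of pages currently enqueued */
};

/* return the first non-empty queue in priority order, or NULL */
static struct ex_queue *
ex_pick_victim_queue(struct ex_queue *queues, size_t nqueues)
{
	for (size_t i = 0; i < nqueues; i++) {
		if (queues[i].count > 0) {
			return &queues[i];
		}
	}
	return NULL;
}

/*
 * usage:
 *   struct ex_queue q[] = {
 *       { "cleaned", 0 }, { "speculative_aged", 12 },
 *       { "inactive_external", 900 }, { "inactive_anonymous", 400 },
 *   };
 *   ex_pick_victim_queue(q, 4) returns &q[1] ("speculative_aged")
 */
#endif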
2685
2686/*
2687 * This function is called only from vm_pageout_scan and
2688 * it will put a page back on the active/inactive queue
2689 * if we can't reclaim it for some reason.
2690 */
2691static void
2692vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q)
2693{
2694 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
2695 vm_page_enqueue_inactive(m, FALSE);
2696 } else {
2697 vm_page_activate(m);
2698 }
2699
2700#if DEVELOPMENT || DEBUG
2701 vm_object_t m_object = VM_PAGE_OBJECT(m);
2702
2703 if (page_from_bg_q == TRUE) {
2704 if (m_object->internal) {
2705 vm_pageout_rejected_bq_internal++;
2706 } else {
2707 vm_pageout_rejected_bq_external++;
2708 }
2709 }
2710#endif /* DEVELOPMENT || DEBUG */
2711}
2712
2713/*
2714 * This function is called only from vm_pageout_scan and
2715 * it will try to grab the victim page's VM object (m_object)
2716 * which differs from the previous victim page's object (object).
2717 */
2718static int
2719vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q)
2720{
2721 struct vm_speculative_age_q *sq;
2722
2723 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
2724
2725 /*
2726 * the object associated with candidate page is
2727 * different from the one we were just working
2728 * with... dump the lock if we still own it
2729 */
2730 if (*object != NULL) {
2731 vm_object_unlock(*object);
2732 *object = NULL;
2733 }
2734 /*
2735 * Try to lock object; since we've already got the
2736 * page queues lock, we can only 'try' for this one.
2737 * if the 'try' fails, we need to do a mutex_pause
2738 * to allow the owner of the object lock a chance to
2739 * run... otherwise, we're likely to trip over this
2740 * object in the same state as we work our way through
2741 * the queue... clumps of pages associated with the same
2742 * object are fairly typical on the inactive and active queues
2743 */
2744 if (!vm_object_lock_try_scan(m_object)) {
2745 vm_page_t m_want = NULL;
2746
2747 vm_pageout_vminfo.vm_pageout_inactive_nolock++;
2748
2749 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
2750 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1);
2751 }
2752
2753 pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2754
2755 m->vmp_reference = FALSE;
2756
2757 if (!m_object->object_is_shared_cache) {
2758 /*
2759 * don't apply this optimization if this is the shared cache
2760 * object, it's too easy to get rid of very hot and important
2761 * pages...
2762 * m->vmp_object must be stable since we hold the page queues lock...
2763 * we can update the scan_collisions field sans the object lock
2764 * since it is a separate field and this is the only spot that does
2765 * a read-modify-write operation and it is never executed concurrently...
2766 * we can asynchronously set this field to 0 when creating a UPL, so it
2767 * is possible for the value to be a bit non-deterministic, but that's ok
2768 * since it's only used as a hint
2769 */
2770 m_object->scan_collisions = 1;
2771 }
2772 if (page_from_bg_q) {
2773 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background);
2774 } else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) {
2775 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned);
2776 } else if (!vm_page_queue_empty(&sq->age_q)) {
2777 m_want = (vm_page_t) vm_page_queue_first(&sq->age_q);
2778 } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) &&
2779 !vm_page_queue_empty(&vm_page_queue_inactive)) {
2780 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
2781 } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) {
2782 m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
2783 }
2784
2785 /*
2786 * this is the next object we're going to be interested in
2787 * try to make sure its available after the mutex_pause
2788 * returns control
2789 */
2790 if (m_want) {
2791 vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want);
2792 }
2793
2794 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
2795
2796 return VM_PAGEOUT_SCAN_NEXT_ITERATION;
2797 } else {
2798 *object = m_object;
2799 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
2800 }
2801
2802 return VM_PAGEOUT_SCAN_PROCEED;
2803}
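
/*
 * Because vm_pageout_scan already holds the page queues lock here, the
 * victim's object lock can only be taken with a try-lock; on failure the
 * page is requeued and the caller pauses so the lock owner can run.  The
 * sketch below shows that generic try-lock-or-back-off shape using
 * pthreads as a stand-in for the kernel lock primitives; it is an
 * illustration, not the kernel's actual locking discipline.
 */
#if 0 /* illustrative sketch only -- not compiled */
#include <pthread.h>

/*
 * Attempt to take the per-object lock while some outer lock is held.
 * Returns 1 on success; returns 0 when contended, in which case the
 * caller is expected to requeue the item and pause briefly (the role
 * played by vps_requeue_page() and mutex_pause() above) before retrying
 * with a different item.
 */
static int
ex_try_inner_lock(pthread_mutex_t *inner)
{
	return pthread_mutex_trylock(inner) == 0;
}
#endif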
2804
2805/*
2806 * This function is called only from vm_pageout_scan and
2807 * it notices that pageout scan may be rendered ineffective
2808 * due to a FS deadlock and will jetsam a process if possible.
2809 * If jetsam isn't supported, it'll move the page to the active
2810 * queue to try and get some different pages pushed onwards so
2811 * we can try to get out of this scenario.
2812 */
2813static void
2814vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit,
2815 boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q)
2816{
2817 struct vm_pageout_queue *eq;
2818 vm_object_t cur_object = VM_OBJECT_NULL;
2819
2820 cur_object = *object;
2821
2822 eq = &vm_pageout_queue_external;
2823
2824 if (cur_object->internal == FALSE) {
2825 /*
2826 * we need to break up the following potential deadlock case...
2827 * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written.
2828 * b) The thread doing the writing is waiting for pages while holding the truncate lock
2829 * c) Most of the pages in the inactive queue belong to this file.
2830 *
2831 * we are potentially in this deadlock because...
2832 * a) the external pageout queue is throttled
2833 * b) we're done with the active queue and moved on to the inactive queue
2834 * c) we've got a dirty external page
2835 *
2836 * since we don't know the reason for the external pageout queue being throttled we
2837 * must suspect that we are deadlocked, so move the current page onto the active queue
2838 * in an effort to cause a page from the active queue to 'age' to the inactive queue
2839 *
2840 * if we don't have jetsam configured (i.e. we have a dynamic pager), set
2841 * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous
2842 * pool the next time we select a victim page... if we can make enough new free pages,
2843 * the deadlock will break, the external pageout queue will empty and it will no longer
2844 * be throttled
2845 *
2846 * if we have jetsam configured, keep a count of the pages reactivated this way so
2847 * that we can try to find clean pages in the active/inactive queues before
2848 * deciding to jetsam a process
2849 */
2850 vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++;
2851
2852 vm_page_check_pageable_safe(m);
2853 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
2854 vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq);
2855 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
2856 vm_page_active_count++;
2857 vm_page_pageable_external_count++;
2858
2859 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE);
2860
2861#if CONFIG_MEMORYSTATUS && CONFIG_JETSAM
2862
2863#pragma unused(force_anonymous)
2864
2865 *vm_pageout_inactive_external_forced_reactivate_limit -= 1;
2866
2867 if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) {
2868 *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
2869 /*
2870 * Possible deadlock scenario so request jetsam action
2871 */
2872 memorystatus_kill_on_vps_starvation();
2873 VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_NONE,
2874 vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count);
2875 }
2876#else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2877
2878#pragma unused(vm_pageout_inactive_external_forced_reactivate_limit)
2879
2880 *force_anonymous = TRUE;
2881#endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */
2882 } else {
2883 vm_page_activate(m);
2884 counter_inc(&vm_statistics_reactivations);
2885
2886#if DEVELOPMENT || DEBUG
2887 if (is_page_from_bg_q == TRUE) {
2888 if (cur_object->internal) {
2889 vm_pageout_rejected_bq_internal++;
2890 } else {
2891 vm_pageout_rejected_bq_external++;
2892 }
2893 }
2894#endif /* DEVELOPMENT || DEBUG */
2895
2896 vm_pageout_state.vm_pageout_inactive_used++;
2897 }
2898}
2899
2900
2901void
2902vm_page_balance_inactive(int max_to_move)
2903{
2904 vm_page_t m;
2905
2906 LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED);
2907
2908 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
2909 /*
2910 * It is likely that the hibernation code path is
2911 * dealing with these very queues as we are about
2912 * to move pages around in/from them and completely
2913 * change the linkage of the pages.
2914 *
2915 * And so we skip the rebalancing of these queues.
2916 */
2917 return;
2918 }
2919 vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count +
2920 vm_page_inactive_count +
2921 vm_page_speculative_count);
2922
2923 while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) {
2924 VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1);
2925
2926 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
2927
2928 assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q);
2929 assert(!m->vmp_laundry);
2930 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
2931 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
2932
2933 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
2934
2935 /*
2936 * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise...
2937 *
2938 * a TLB flush isn't really needed here since at worst we'll miss the reference bit being
2939 * updated in the PTE if a remote processor still has this mapping cached in its TLB when the
2940 * new reference happens. If no further references happen on the page after that remote TLB flushes
2941 * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue
2942 * by pageout_scan, which is just fine since the last reference would have happened quite far
2943 * in the past (TLB caches don't hang around for very long), and of course could just as easily
2944 * have happened before we moved the page
2945 */
2946 if (m->vmp_pmapped == TRUE) {
2947 /*
2948 * We might be holding the page queue lock as a
2949 * spin lock and clearing the "referenced" bit could
2950 * take a while if there are lots of mappings of
2951 * that page, so make sure we acquire the lock as
2952 * a mutex to avoid a spinlock timeout.
2953 */
2954 vm_page_lockconvert_queues();
2955 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2956 }
2957
2958 /*
2959 * The page might be absent or busy,
2960 * but vm_page_deactivate can handle that.
2961 * FALSE indicates that we don't want a H/W clear reference
2962 */
2963 vm_page_deactivate_internal(m, FALSE);
2964 }
2965}
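
/*
 * The loop above deactivates pages from the head of the active queue
 * until inactive + speculative reaches the target computed by
 * VM_PAGE_INACTIVE_TARGET() (defined elsewhere as a fraction of the
 * pageable pool).  A reduced sketch of that balancing loop follows, with
 * an assumed one-third target purely for illustration.
 */
#if 0 /* illustrative sketch only -- not compiled */
static void
ex_balance_inactive(unsigned int *active, unsigned int *inactive,
    unsigned int speculative, int max_to_move)
{
	/* assumed target: one third of the pageable pool (illustrative only) */
	unsigned int target = (*active + *inactive + speculative) / 3;

	while (max_to_move-- > 0 &&
	    (*inactive + speculative) < target &&
	    *active > 0) {
		/*
		 * a real implementation clears the h/w reference bit here,
		 * then moves the page from the active to the inactive queue
		 */
		*active -= 1;
		*inactive += 1;
	}
}
#endif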
2966
2967/*
2968 * vm_pageout_scan does the dirty work for the pageout daemon.
2969 * It returns with both vm_page_queue_free_lock and vm_page_queue_lock
2970 * held and vm_page_free_wanted == 0.
2971 */
2972void
2973vm_pageout_scan(void)
2974{
2975 unsigned int loop_count = 0;
2976 unsigned int inactive_burst_count = 0;
2977 unsigned int reactivated_this_call;
2978 unsigned int reactivate_limit;
2979 vm_page_t local_freeq = NULL;
2980 int local_freed = 0;
2981 int delayed_unlock;
2982 int delayed_unlock_limit = 0;
2983 int refmod_state = 0;
2984 int vm_pageout_deadlock_target = 0;
2985 struct vm_pageout_queue *iq;
2986 struct vm_pageout_queue *eq;
2987 struct vm_speculative_age_q *sq;
2988 struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } };
2989 boolean_t inactive_throttled = FALSE;
2990 vm_object_t object = NULL;
2991 uint32_t inactive_reclaim_run;
2992 boolean_t grab_anonymous = FALSE;
2993 boolean_t force_anonymous = FALSE;
2994 boolean_t force_speculative_aging = FALSE;
2995 int anons_grabbed = 0;
2996 int page_prev_q_state = 0;
2997 boolean_t page_from_bg_q = FALSE;
2998 uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0;
2999 vm_object_t m_object = VM_OBJECT_NULL;
3000 int retval = 0;
3001 boolean_t lock_yield_check = FALSE;
3002
3003
3004 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START,
3005 vm_pageout_vminfo.vm_pageout_freed_speculative,
3006 vm_pageout_state.vm_pageout_inactive_clean,
3007 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3008 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3009
3010 flow_control.state = FCS_IDLE;
3011 iq = &vm_pageout_queue_internal;
3012 eq = &vm_pageout_queue_external;
3013 sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q];
3014
3015 /* Ask the pmap layer to return any pages it no longer needs. */
3016 pmap_release_pages_fast();
3017
3018 vm_page_lock_queues();
3019
3020 delayed_unlock = 1;
3021
3022 /*
3023 * Calculate the max number of referenced pages on the inactive
3024 * queue that we will reactivate.
3025 */
3026 reactivated_this_call = 0;
3027 reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count +
3028 vm_page_inactive_count);
3029 inactive_reclaim_run = 0;
3030
3031 vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count;
3032
3033 /*
3034 * We must limit the rate at which we send pages to the pagers
3035 * so that we don't tie up too many pages in the I/O queues.
3036 * We implement a throttling mechanism using the laundry count
3037 * to limit the number of pages outstanding to the default
3038 * and external pagers. We can bypass the throttles and look
3039 * for clean pages if the pageout queues don't drain in a timely
3040 * fashion since this may indicate that the pageout paths are
3041 * stalled waiting for memory, which only we can provide.
3042 */
3043
3044 vps_init_page_targets();
3045 assert(object == NULL);
3046 assert(delayed_unlock != 0);
3047
3048 for (;;) {
3049 vm_page_t m;
3050
3051 DTRACE_VM2(rev, int, 1, (uint64_t *), NULL);
3052
3053 if (lock_yield_check) {
3054 lock_yield_check = FALSE;
3055
3056 if (delayed_unlock++ > delayed_unlock_limit) {
3057 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3058 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3059 } else if (vm_pageout_scan_wants_object) {
3060 vm_page_unlock_queues();
3061 mutex_pause(0);
3062 vm_page_lock_queues();
3063 } else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3064 VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1);
3065 }
3066 }
3067
3068 if (vm_upl_wait_for_pages < 0) {
3069 vm_upl_wait_for_pages = 0;
3070 }
3071
3072 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages;
3073
3074 if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) {
3075 delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX;
3076 }
3077
3078 vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3079
3080 assert(delayed_unlock);
3081
3082 /*
3083 * maintain our balance
3084 */
3085 vm_page_balance_inactive(1);
3086
3087
3088 /**********************************************************************
3089 * above this point we're playing with the active and secluded queues
3090 * below this point we're playing with the throttling mechanisms
3091 * and the inactive queue
3092 **********************************************************************/
3093
3094 if (vm_page_free_count + local_freed >= vm_page_free_target) {
3095 vm_pageout_scan_wants_object = VM_OBJECT_NULL;
3096
3097 vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3098 VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER);
3099 /*
3100 * make sure the pageout I/O threads are running
3101 * throttled in case there are still requests
3102 * in the laundry... since we have met our targets
3103 * we don't need the laundry to be cleaned in a timely
3104 * fashion... so let's avoid interfering with foreground
3105 * activity
3106 */
3107 vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE);
3108
3109 vm_free_page_lock();
3110
3111 if ((vm_page_free_count >= vm_page_free_target) &&
3112 (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) {
3113 /*
3114 * done - we have met our target *and*
3115 * there is no one waiting for a page.
3116 */
3117return_from_scan:
3118 assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL);
3119
3120 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE,
3121 vm_pageout_state.vm_pageout_inactive,
3122 vm_pageout_state.vm_pageout_inactive_used, 0, 0);
3123 VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END,
3124 vm_pageout_vminfo.vm_pageout_freed_speculative,
3125 vm_pageout_state.vm_pageout_inactive_clean,
3126 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal,
3127 vm_pageout_vminfo.vm_pageout_inactive_dirty_external);
3128
3129 return;
3130 }
3131 vm_free_page_unlock();
3132 }
3133
3134 /*
3135 * Before anything, we check if we have any ripe volatile
3136 * objects around. If so, try to purge the first object.
3137 * If the purge fails, fall through to reclaim a page instead.
3138 * If the purge succeeds, go back to the top and reevaluate
3139 * the new memory situation.
3140 */
3141 retval = vps_purge_object();
3142
3143 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3144 /*
3145 * Success
3146 */
3147 if (object != NULL) {
3148 vm_object_unlock(object);
3149 object = NULL;
3150 }
3151
3152 lock_yield_check = FALSE;
3153 continue;
3154 }
3155
3156 /*
3157 * If our 'aged' queue is empty and we have some speculative pages
3158 * in the other queues, let's go through and see if we need to age
3159 * them.
3160 *
3161 * If we succeeded in aging a speculative Q or just that everything
3162 * looks normal w.r.t queue age and queue counts, we keep going onward.
3163 *
3164 * If, for some reason, we seem to have a mismatch between the spec.
3165 * page count and the page queues, we reset those variables and
3166 * restart the loop (LD TODO: Track this better?).
3167 */
3168 if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) {
3169 retval = vps_age_speculative_queue(force_speculative_aging);
3170
3171 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3172 lock_yield_check = FALSE;
3173 continue;
3174 }
3175 }
3176 force_speculative_aging = FALSE;
3177
3178 /*
3179 * Check to see if we need to evict objects from the cache.
3180 *
3181 * Note: 'object' here doesn't have anything to do with
3182 * the eviction part. We just need to make sure we have dropped
3183 * any object lock we might be holding if we need to go down
3184 * into the eviction logic.
3185 */
3186 retval = vps_object_cache_evict(&object);
3187
3188 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3189 lock_yield_check = FALSE;
3190 continue;
3191 }
3192
3193
3194 /*
3195 * Calculate our filecache_min that will affect the loop
3196 * going forward.
3197 */
3198 vps_calculate_filecache_min();
3199
3200 /*
3201 * LD TODO: Use a structure to hold all state variables for a single
3202 * vm_pageout_scan iteration and pass that structure to this function instead.
3203 */
3204 retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
3205 &delayed_unlock, &local_freeq, &local_freed,
3206 &vm_pageout_deadlock_target, inactive_burst_count);
3207
3208 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3209 if (loop_count >= vm_page_inactive_count) {
3210 loop_count = 0;
3211 }
3212
3213 inactive_burst_count = 0;
3214
3215 assert(object == NULL);
3216 assert(delayed_unlock != 0);
3217
3218 lock_yield_check = FALSE;
3219 continue;
3220 } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) {
3221 goto return_from_scan;
3222 }
3223
3224 flow_control.state = FCS_IDLE;
3225
3226 vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count),
3227 vm_pageout_inactive_external_forced_reactivate_limit);
3228 loop_count++;
3229 inactive_burst_count++;
3230 vm_pageout_state.vm_pageout_inactive++;
3231
3232 /*
3233 * Choose a victim.
3234 */
3235
3236 m = NULL;
3237 retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3238
3239 if (m == NULL) {
3240 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3241 inactive_burst_count = 0;
3242
3243 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3244 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3245 }
3246
3247 lock_yield_check = TRUE;
3248 continue;
3249 }
3250
3251 /*
3252 * if we've gotten here, we have no victim page.
3253 * check to see if we've not finished balancing the queues
3254 * or we have a page on the aged speculative queue that we
3255 * skipped due to force_anonymous == TRUE... or we have
3256 * speculative pages that we can prematurely age... if we're in
3257 * one of these cases we'll keep going, else panic
3258 */
3259 force_anonymous = FALSE;
3260 VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1);
3261
3262 if (!vm_page_queue_empty(&sq->age_q)) {
3263 lock_yield_check = TRUE;
3264 continue;
3265 }
3266
3267 if (vm_page_speculative_count) {
3268 force_speculative_aging = TRUE;
3269 lock_yield_check = TRUE;
3270 continue;
3271 }
3272 panic("vm_pageout: no victim");
3273
3274 /* NOTREACHED */
3275 }
3276
3277 assert(VM_PAGE_PAGEABLE(m));
3278 m_object = VM_PAGE_OBJECT(m);
3279 force_anonymous = FALSE;
3280
3281 page_prev_q_state = m->vmp_q_state;
3282 /*
3283 * we just found this page on one of our queues...
3284 * it can't also be on the pageout queue, so safe
3285 * to call vm_page_queues_remove
3286 */
3287 bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
3288 vm_page_queues_remove(m, TRUE);
3289 if (donate) {
3290 /*
3291 * The compressor needs to see this bit to know
3292 * where this page needs to land. Also if stolen,
3293 * this bit helps put the page back in the right
3294 * special queue where it belongs.
3295 */
3296 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
3297 }
3298
3299 assert(!m->vmp_laundry);
3300 assert(!m->vmp_private);
3301 assert(!m->vmp_fictitious);
3302 assert(!is_kernel_object(m_object));
3303 assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr);
3304
3305 vm_pageout_vminfo.vm_pageout_considered_page++;
3306
3307 DTRACE_VM2(scan, int, 1, (uint64_t *), NULL);
3308
3309 /*
3310 * check to see if we currently are working
3311 * with the same object... if so, we've
3312 * already got the lock
3313 */
3314 if (m_object != object) {
3315 boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT);
3316
3317 /*
3318 * vps_switch_object() will always drop the 'object' lock first
3319 * and then try to acquire the 'm_object' lock. So 'object' has to point to
3320 * either 'm_object' or NULL.
3321 */
3322 retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3323
3324 if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) {
3325 lock_yield_check = TRUE;
3326 continue;
3327 }
3328 }
3329 assert(m_object == object);
3330 assert(VM_PAGE_OBJECT(m) == m_object);
3331
3332 if (m->vmp_busy) {
3333 /*
3334 * Somebody is already playing with this page.
3335 * Put it back on the appropriate queue
3336 *
3337 */
3338 VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1);
3339
3340 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3341 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1);
3342 }
3343
3344 vps_requeue_page(m, page_prev_q_state, page_from_bg_q);
3345
3346 lock_yield_check = TRUE;
3347 continue;
3348 }
3349
3350 /*
3351 * if (m->vmp_cleaning && !m->vmp_free_when_done)
3352 * If already cleaning this page in place
3353 * just leave it off the paging queues.
3354 * We can leave the page mapped, and upl_commit_range
3355 * will put it on the clean queue.
3356 *
3357 * if (m->vmp_free_when_done && !m->vmp_cleaning)
3358 * an msync INVALIDATE is in progress...
3359 * this page has been marked for destruction
3360 * after it has been cleaned,
3361 * but not yet gathered into a UPL
3362 * where 'cleaning' will be set...
3363 * just leave it off the paging queues
3364 *
3365 * if (m->vmp_free_when_done && m->vmp_cleaning)
3366 * an msync INVALIDATE is in progress
3367 * and the UPL has already gathered this page...
3368 * just leave it off the paging queues
3369 */
3370 if (m->vmp_free_when_done || m->vmp_cleaning) {
3371 lock_yield_check = TRUE;
3372 continue;
3373 }
3374
3375
3376 /*
3377 * If it's absent, in error or the object is no longer alive,
3378 * we can reclaim the page... in the no longer alive case,
3379 * there are 2 states the page can be in that preclude us
3380 * from reclaiming it - busy or cleaning - that we've already
3381 * dealt with
3382 */
3383 if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive ||
3384 (!object->internal && object->pager == MEMORY_OBJECT_NULL)) {
3385 if (m->vmp_absent) {
3386 VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1);
3387 } else if (!object->alive ||
3388 (!object->internal &&
3389 object->pager == MEMORY_OBJECT_NULL)) {
3390 VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1);
3391 } else {
3392 VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1);
3393 }
3394reclaim_page:
3395 if (vm_pageout_deadlock_target) {
3396 VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1);
3397 vm_pageout_deadlock_target--;
3398 }
3399
3400 DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL);
3401
3402 if (object->internal) {
3403 DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL);
3404 } else {
3405 DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL);
3406 }
3407 assert(!m->vmp_cleaning);
3408 assert(!m->vmp_laundry);
3409
3410 if (!object->internal &&
3411 object->pager != NULL &&
3412 object->pager->mo_pager_ops == &shared_region_pager_ops) {
3413 shared_region_pager_reclaimed++;
3414 }
3415
3416 m->vmp_busy = TRUE;
3417
3418 /*
3419 * remove page from object here since we're already
3420 * behind the object lock... defer the rest of the work
3421 * we'd normally do in vm_page_free_prepare_object
3422 * until 'vm_page_free_list' is called
3423 */
3424 if (m->vmp_tabled) {
3425 vm_page_remove(page: m, TRUE);
3426 }
3427
3428 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
3429 m->vmp_snext = local_freeq;
3430 local_freeq = m;
3431 local_freed++;
3432
3433 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3434 vm_pageout_vminfo.vm_pageout_freed_speculative++;
3435 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3436 vm_pageout_vminfo.vm_pageout_freed_cleaned++;
3437 } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) {
3438 vm_pageout_vminfo.vm_pageout_freed_internal++;
3439 } else {
3440 vm_pageout_vminfo.vm_pageout_freed_external++;
3441 }
3442
3443 inactive_burst_count = 0;
3444
3445 lock_yield_check = TRUE;
3446 continue;
3447 }
3448 if (object->vo_copy == VM_OBJECT_NULL) {
3449 /*
3450 * No one else can have any interest in this page.
3451 * If this is an empty purgable object, the page can be
3452 * reclaimed even if dirty.
3453 * If the page belongs to a volatile purgable object, we
3454 * reactivate it if the compressor isn't active.
3455 */
3456 if (object->purgable == VM_PURGABLE_EMPTY) {
3457 if (m->vmp_pmapped == TRUE) {
3458 /* unmap the page */
3459 refmod_state = pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m));
3460 if (refmod_state & VM_MEM_MODIFIED) {
3461 SET_PAGE_DIRTY(m, FALSE);
3462 }
3463 }
3464 if (m->vmp_dirty || m->vmp_precious) {
3465 /* we saved the cost of cleaning this page ! */
3466 vm_page_purged_count++;
3467 }
3468 goto reclaim_page;
3469 }
3470
3471 if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
3472 /*
3473 * With the VM compressor, the cost of
3474 * reclaiming a page is much lower (no I/O),
3475 * so if we find a "volatile" page, it's better
3476 * to let it get compressed rather than letting
3477 * it occupy a full page until it gets purged.
3478 * So no need to check for "volatile" here.
3479 */
3480 } else if (object->purgable == VM_PURGABLE_VOLATILE) {
3481 /*
3482 * Avoid cleaning a "volatile" page which might
3483 * be purged soon.
3484 */
3485
3486 /* if it's wired, we can't put it on our queue */
3487 assert(!VM_PAGE_WIRED(m));
3488
3489 /* just stick it back on! */
3490 reactivated_this_call++;
3491
3492 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3493 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1);
3494 }
3495
3496 goto reactivate_page;
3497 }
3498 }
3499 /*
3500 * If it's being used, reactivate.
3501 * (Fictitious pages are either busy or absent.)
3502 * First, update the reference and dirty bits
3503 * to make sure the page is unreferenced.
3504 */
3505 refmod_state = -1;
3506
3507 if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) {
3508 refmod_state = pmap_get_refmod(pn: VM_PAGE_GET_PHYS_PAGE(m));
3509
3510 if (refmod_state & VM_MEM_REFERENCED) {
3511 m->vmp_reference = TRUE;
3512 }
3513 if (refmod_state & VM_MEM_MODIFIED) {
3514 SET_PAGE_DIRTY(m, FALSE);
3515 }
3516 }
3517
3518 if (m->vmp_reference || m->vmp_dirty) {
3519 /* deal with a rogue "reusable" page */
3520 VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object);
3521 }
3522
3523 if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) {
3524 vm_pageout_state.vm_page_xpmapped_min = 0;
3525 } else {
3526 vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor;
3527 }
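/*
 * Worked example (illustrative only): with the default divisors set up in
 * vm_pageout() below (40 with CONFIG_JETSAM, 36 otherwise), the expression
 * (vm_page_external_count * 10) / divisor keeps vm_page_xpmapped_min at
 * roughly 25% (10/40) or ~28% (10/36) of the external page count; a
 * divisor of 0 disables the floor entirely.
 */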
3528
3529 if (!m->vmp_no_cache &&
3530 page_from_bg_q == FALSE &&
3531 (m->vmp_reference || (m->vmp_xpmapped && !object->internal &&
3532 (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) {
3533 /*
3534 * The page we pulled off the inactive list has
3535 * been referenced. It is possible for other
3536 * processors to be touching pages faster than we
3537 * can clear the referenced bit and traverse the
3538 * inactive queue, so we limit the number of
3539 * reactivations.
3540 */
3541 if (++reactivated_this_call >= reactivate_limit &&
3542 !object->object_is_shared_cache &&
3543 !((m->vmp_realtime ||
3544 object->for_realtime) &&
3545 vm_pageout_protect_realtime)) {
3546 vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++;
3547 } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) {
3548 vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++;
3549 if (object->object_is_shared_cache) {
3550 vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++;
3551 } else if (m->vmp_realtime ||
3552 object->for_realtime) {
3553 vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++;
3554 }
3555 } else {
3556 uint32_t isinuse;
3557
3558 if (reactivated_this_call >= reactivate_limit) {
3559 if (object->object_is_shared_cache) {
3560 vm_pageout_vminfo.vm_pageout_protected_sharedcache++;
3561 } else if ((m->vmp_realtime ||
3562 object->for_realtime) &&
3563 vm_pageout_protect_realtime) {
3564 vm_pageout_vminfo.vm_pageout_protected_realtime++;
3565 }
3566 }
3567 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3568 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1);
3569 }
3570
3571 vm_pageout_vminfo.vm_pageout_inactive_referenced++;
3572reactivate_page:
3573 if (!object->internal && object->pager != MEMORY_OBJECT_NULL &&
3574 vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) {
3575 /*
3576 * no explicit mappings of this object exist
3577 * and it's not open via the filesystem
3578 */
3579 vm_page_deactivate(page: m);
3580 VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1);
3581 } else {
3582 /*
3583 * The page was/is being used, so put back on active list.
3584 */
3585 vm_page_activate(page: m);
3586 counter_inc(&vm_statistics_reactivations);
3587 inactive_burst_count = 0;
3588 }
3589#if DEVELOPMENT || DEBUG
3590 if (page_from_bg_q == TRUE) {
3591 if (m_object->internal) {
3592 vm_pageout_rejected_bq_internal++;
3593 } else {
3594 vm_pageout_rejected_bq_external++;
3595 }
3596 }
3597#endif /* DEVELOPMENT || DEBUG */
3598
3599 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3600 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3601 }
3602 vm_pageout_state.vm_pageout_inactive_used++;
3603
3604 lock_yield_check = TRUE;
3605 continue;
3606 }
3607 /*
3608 * Make sure we call pmap_get_refmod() if it
3609 * wasn't already called just above, to update
3610 * the dirty bit.
3611 */
3612 if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) {
3613 refmod_state = pmap_get_refmod(pn: VM_PAGE_GET_PHYS_PAGE(m));
3614 if (refmod_state & VM_MEM_MODIFIED) {
3615 SET_PAGE_DIRTY(m, FALSE);
3616 }
3617 }
3618 }
3619
3620 /*
3621 * we've got a candidate page to steal...
3622 *
3623 * m->vmp_dirty is up to date courtesy of the
3624 * preceding check for m->vmp_reference... if
3625 * we get here, then m->vmp_reference had to be
3626 * FALSE (or possibly "reactivate_limit" was
3627 * exceeded), but in either case we called
3628 * pmap_get_refmod() and updated both
3629 * m->vmp_reference and m->vmp_dirty
3630 *
3631 * if it's dirty or precious we need to
3632 * see if the target queue is throttled...
3633 * if it is, we need to skip over it by moving it back
3634 * to the end of the inactive queue
3635 */
3636
3637 inactive_throttled = FALSE;
3638
3639 if (m->vmp_dirty || m->vmp_precious) {
3640 if (object->internal) {
3641 if (VM_PAGE_Q_THROTTLED(iq)) {
3642 inactive_throttled = TRUE;
3643 }
3644 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3645 inactive_throttled = TRUE;
3646 }
3647 }
3648throttle_inactive:
3649 if (!VM_DYNAMIC_PAGING_ENABLED() &&
3650 object->internal && m->vmp_dirty &&
3651 (object->purgable == VM_PURGABLE_DENY ||
3652 object->purgable == VM_PURGABLE_NONVOLATILE ||
3653 object->purgable == VM_PURGABLE_VOLATILE)) {
3654 vm_page_check_pageable_safe(page: m);
3655 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
3656 vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq);
3657 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
3658 vm_page_throttled_count++;
3659
3660 VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1);
3661
3662 inactive_burst_count = 0;
3663
3664 lock_yield_check = TRUE;
3665 continue;
3666 }
3667 if (inactive_throttled == TRUE) {
3668 vps_deal_with_throttled_queues(m, object: &object, vm_pageout_inactive_external_forced_reactivate_limit: &vm_pageout_inactive_external_forced_reactivate_limit,
3669 force_anonymous: &force_anonymous, is_page_from_bg_q: page_from_bg_q);
3670
3671 inactive_burst_count = 0;
3672
3673 if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) {
3674 VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1);
3675 }
3676
3677 lock_yield_check = TRUE;
3678 continue;
3679 }
3680
3681 /*
3682 * we've got a page that we can steal...
3683 * eliminate all mappings and make sure
3684 * we have the up-to-date modified state
3685 *
3686 * if we need to do a pmap_disconnect then we
3687 * need to re-evaluate m->vmp_dirty since the pmap_disconnect
3688 * provides the true state atomically... the
3689 * page was still mapped up to the pmap_disconnect
3690 * and may have been dirtied at the last microsecond
3691 *
3692 * Note that if 'pmapped' is FALSE then the page is not
3693 * and has not been in any map, so there is no point calling
3694 * pmap_disconnect(). m->vmp_dirty could have been set in anticipation
3695 * of likely usage of the page.
3696 */
3697 if (m->vmp_pmapped == TRUE) {
3698 int pmap_options;
3699
3700 /*
3701 * Don't count this page as going into the compressor
3702 * if any of these are true:
3703 * 1) compressed pager isn't enabled
3704 * 2) Freezer enabled device with compressed pager
3705 * backend (exclusive use) i.e. most of the VM system
3706 * (including vm_pageout_scan) has no knowledge of
3707 * the compressor
3708 * 3) This page belongs to a file and hence will not be
3709 * sent into the compressor
3710 */
3711 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE ||
3712 object->internal == FALSE) {
3713 pmap_options = 0;
3714 } else if (m->vmp_dirty || m->vmp_precious) {
3715 /*
3716 * VM knows that this page is dirty (or
3717 * precious) and needs to be compressed
3718 * rather than freed.
3719 * Tell the pmap layer to count this page
3720 * as "compressed".
3721 */
3722 pmap_options = PMAP_OPTIONS_COMPRESSOR;
3723 } else {
3724 /*
3725 * VM does not know if the page needs to
3726 * be preserved but the pmap layer might tell
3727 * us if any mapping has "modified" it.
3728 * Let the pmap layer count this page
3729 * as compressed if and only if it has been
3730 * modified.
3731 */
3732 pmap_options =
3733 PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED;
3734 }
3735 refmod_state = pmap_disconnect_options(phys: VM_PAGE_GET_PHYS_PAGE(m),
3736 options: pmap_options,
3737 NULL);
3738 if (refmod_state & VM_MEM_MODIFIED) {
3739 SET_PAGE_DIRTY(m, FALSE);
3740 }
3741 }
3742
3743 /*
3744 * reset our count of pages that have been reclaimed
3745 * since the last page was 'stolen'
3746 */
3747 inactive_reclaim_run = 0;
3748
3749 /*
3750 * If it's clean and not precious, we can free the page.
3751 */
3752 if (!m->vmp_dirty && !m->vmp_precious) {
3753 vm_pageout_state.vm_pageout_inactive_clean++;
3754
3755 /*
3756 * OK, at this point we have found a page we are going to free.
3757 */
3758#if CONFIG_PHANTOM_CACHE
3759 if (!object->internal) {
3760 vm_phantom_cache_add_ghost(m);
3761 }
3762#endif
3763 goto reclaim_page;
3764 }
3765
3766 /*
3767 * The page may have been dirtied since the last check
3768 * for a throttled target queue (which may have been skipped
3769 * if the page was clean then). With the dirty page
3770 * disconnected here, we can make one final check.
3771 */
3772 if (object->internal) {
3773 if (VM_PAGE_Q_THROTTLED(iq)) {
3774 inactive_throttled = TRUE;
3775 }
3776 } else if (VM_PAGE_Q_THROTTLED(eq)) {
3777 inactive_throttled = TRUE;
3778 }
3779
3780 if (inactive_throttled == TRUE) {
3781 goto throttle_inactive;
3782 }
3783
3784#if VM_PRESSURE_EVENTS
3785#if CONFIG_JETSAM
3786
3787 /*
3788 * If Jetsam is enabled, then the sending
3789 * of memory pressure notifications is handled
3790 * from the same thread that takes care of high-water
3791 * and other jetsams i.e. the memorystatus_thread.
3792 */
3793
3794#else /* CONFIG_JETSAM */
3795
3796 vm_pressure_response();
3797
3798#endif /* CONFIG_JETSAM */
3799#endif /* VM_PRESSURE_EVENTS */
3800
3801 if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) {
3802 VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1);
3803 }
3804
3805 if (object->internal) {
3806 vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++;
3807 } else {
3808 vm_pageout_vminfo.vm_pageout_inactive_dirty_external++;
3809 }
3810
3811 /*
3812 * internal pages will go to the compressor...
3813 * external pages will go to the appropriate pager to be cleaned
3814 * and upon completion will end up on 'vm_page_queue_cleaned' which
3815 * is a preferred queue to steal from
3816 */
3817 vm_pageout_cluster(m);
3818 inactive_burst_count = 0;
3819
3820 /*
3821 * back to top of pageout scan loop
3822 */
3823 }
3824}
3825
3826
3827void
3828vm_page_free_reserve(
3829 int pages)
3830{
3831 int free_after_reserve;
3832
3833 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3834 if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) {
3835 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT;
3836 } else {
3837 vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT);
3838 }
3839 } else {
3840 if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) {
3841 vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT;
3842 } else {
3843 vm_page_free_reserved += pages;
3844 }
3845 }
3846 free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved;
3847
3848 vm_page_free_min = vm_page_free_reserved +
3849 VM_PAGE_FREE_MIN(free_after_reserve);
3850
3851 if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) {
3852 vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT;
3853 }
3854
3855 vm_page_free_target = vm_page_free_reserved +
3856 VM_PAGE_FREE_TARGET(free_after_reserve);
3857
3858 if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) {
3859 vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT;
3860 }
3861
3862 if (vm_page_free_target < vm_page_free_min + 5) {
3863 vm_page_free_target = vm_page_free_min + 5;
3864 }
3865
3866 vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2);
3867}
3868
3869/*
3870 * vm_pageout is the high level pageout daemon.
3871 */
3872
3873void
3874vm_pageout_continue(void)
3875{
3876 DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL);
3877 VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1);
3878
3879 vm_free_page_lock();
3880 vm_pageout_running = TRUE;
3881 vm_free_page_unlock();
3882
3883 vm_pageout_scan();
3884 /*
3885 * we hold both the vm_page_queue_free_lock
3886 * and the vm_page_queues_lock at this point
3887 */
3888 assert(vm_page_free_wanted == 0);
3889 assert(vm_page_free_wanted_privileged == 0);
3890 assert_wait(event: (event_t) &vm_page_free_wanted, THREAD_UNINT);
3891
3892 vm_pageout_running = FALSE;
3893#if XNU_TARGET_OS_OSX
3894 if (vm_pageout_waiter) {
3895 vm_pageout_waiter = FALSE;
3896 thread_wakeup((event_t)&vm_pageout_waiter);
3897 }
3898#endif /* XNU_TARGET_OS_OSX */
3899
3900 vm_free_page_unlock();
3901 vm_page_unlock_queues();
3902
3903 thread_block(continuation: (thread_continue_t)vm_pageout_continue);
3904 /*NOTREACHED*/
3905}
3906
3907#if XNU_TARGET_OS_OSX
3908kern_return_t
3909vm_pageout_wait(uint64_t deadline)
3910{
3911 kern_return_t kr;
3912
3913 vm_free_page_lock();
3914 for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) {
3915 vm_pageout_waiter = TRUE;
3916 if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
3917 lck: &vm_page_queue_free_lock, lck_sleep_action: LCK_SLEEP_DEFAULT,
3918 event: (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3919 kr = KERN_OPERATION_TIMED_OUT;
3920 }
3921 }
3922 vm_free_page_unlock();
3923
3924 return kr;
3925}
3926#endif /* XNU_TARGET_OS_OSX */
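/*
 * Hypothetical caller-side sketch for vm_pageout_wait() above (not part of
 * the build; clock_interval_to_deadline() is the standard helper assumed
 * here): give the pageout daemon up to one second to finish its current
 * pass before proceeding.
 *
 *	uint64_t deadline;
 *	clock_interval_to_deadline(1, NSEC_PER_SEC, &deadline);
 *	if (vm_pageout_wait(deadline) == KERN_OPERATION_TIMED_OUT) {
 *		// pageout was still running when the deadline expired
 *	}
 */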
3927
3928OS_NORETURN
3929static void
3930vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w)
3931{
3932 vm_page_t m = NULL;
3933 vm_object_t object;
3934 vm_object_offset_t offset;
3935 memory_object_t pager;
3936 struct vm_pageout_queue *q = ethr->q;
3937
3938 /* On systems with a compressor, the external IO thread clears its
3939 * VM privileged bit to accommodate large allocations (e.g. bulk UPL
3940 * creation)
3941 */
3942 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
3943 current_thread()->options &= ~TH_OPT_VMPRIV;
3944 }
3945
3946 sched_cond_ack(cond: &(ethr->pgo_wakeup));
3947
3948 while (true) {
3949 vm_page_lockspin_queues();
3950
3951 while (!vm_page_queue_empty(&q->pgo_pending)) {
3952 q->pgo_busy = TRUE;
3953 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
3954
3955 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
3956 VM_PAGE_CHECK(m);
3957 /*
3958 * grab a snapshot of the object and offset this
3959 * page is tabled in so that we can relookup this
3960 * page after we've taken the object lock - these
3961 * fields are stable while we hold the page queues lock
3962 * but as soon as we drop it, there is nothing to keep
3963 * this page in this object... we hold an activity_in_progress
3964 * on this object which will keep it from terminating
3965 */
3966 object = VM_PAGE_OBJECT(m);
3967 offset = m->vmp_offset;
3968
3969 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
3970 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
3971
3972 vm_page_unlock_queues();
3973
3974 vm_object_lock(object);
3975
3976 m = vm_page_lookup(object, offset);
3977
3978 if (m == NULL || m->vmp_busy || m->vmp_cleaning ||
3979 !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) {
3980 /*
3981 * it's either the same page that someone else has
3982 * started cleaning (or it's finished cleaning or
3983 * been put back on the pageout queue), or
3984 * the page has been freed or we have found a
3985 * new page at this offset... in all of these cases
3986 * we merely need to release the activity_in_progress
3987 * we took when we put the page on the pageout queue
3988 */
3989 vm_object_activity_end(object);
3990 vm_object_unlock(object);
3991
3992 vm_page_lockspin_queues();
3993 continue;
3994 }
3995 pager = object->pager;
3996
3997 if (pager == MEMORY_OBJECT_NULL) {
3998 /*
3999 * This pager has been destroyed by either
4000 * memory_object_destroy or vm_object_destroy, and
4001 * so there is nowhere for the page to go.
4002 */
4003 if (m->vmp_free_when_done) {
4004 /*
4005 * Just free the page... VM_PAGE_FREE takes
4006 * care of cleaning up all the state...
4007 * including doing the vm_pageout_throttle_up
4008 */
4009 VM_PAGE_FREE(m);
4010 } else {
4011 vm_page_lockspin_queues();
4012
4013 vm_pageout_throttle_up(m);
4014 vm_page_activate(page: m);
4015
4016 vm_page_unlock_queues();
4017
4018 /*
4019 * And we are done with it.
4020 */
4021 }
4022 vm_object_activity_end(object);
4023 vm_object_unlock(object);
4024
4025 vm_page_lockspin_queues();
4026 continue;
4027 }
4028 #if 0
4029 /*
4030 * we don't hold the page queue lock
4031 * so this check isn't safe to make
4032 */
4033 VM_PAGE_CHECK(m);
4034 #endif
4035 /*
4036 * give back the activity_in_progress reference we
4037 * took when we queued up this page and replace it
4038 * with a paging_in_progress reference that will
4039 * also keep the paging offset from changing and
4040 * prevent the object from terminating
4041 */
4042 vm_object_activity_end(object);
4043 vm_object_paging_begin(object);
4044 vm_object_unlock(object);
4045
4046 /*
4047 * Send the data to the pager.
4048 * any pageout clustering happens there
4049 */
4050 memory_object_data_return(memory_object: pager,
4051 offset: m->vmp_offset + object->paging_offset,
4052 PAGE_SIZE,
4053 NULL,
4054 NULL,
4055 FALSE,
4056 FALSE,
4057 upl_flags: 0);
4058
4059 vm_object_lock(object);
4060 vm_object_paging_end(object);
4061 vm_object_unlock(object);
4062
4063 vm_pageout_io_throttle();
4064
4065 vm_page_lockspin_queues();
4066 }
4067 q->pgo_busy = FALSE;
4068
4069 vm_page_unlock_queues();
4070 sched_cond_wait_parameter(cond: &(ethr->pgo_wakeup), THREAD_UNINT, continuation: (thread_continue_t)vm_pageout_iothread_external_continue, parameter: ethr);
4071 }
4072 /*NOTREACHED*/
4073}
4074
4075
4076#define MAX_FREE_BATCH 32
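/*
 * Pages compressed by vm_pageout_iothread_internal_continue() below are
 * collected on a thread-local free list and handed back to
 * vm_page_free_list() at most MAX_FREE_BATCH at a time, so the free list
 * is updated in batches rather than once per page.
 */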
4077uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by
4078 * this thread.
4079 */
4080
4081
4082OS_NORETURN
4083static void
4084vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w)
4085{
4086 struct vm_pageout_queue *q;
4087 vm_page_t m = NULL;
4088 boolean_t pgo_draining;
4089 vm_page_t local_q;
4090 int local_cnt;
4091 vm_page_t local_freeq = NULL;
4092 int local_freed = 0;
4093 int local_batch_size;
4094#if DEVELOPMENT || DEBUG
4095 int ncomps = 0;
4096 boolean_t marked_active = FALSE;
4097 int num_pages_processed = 0;
4098#endif
4099 void *chead = NULL;
4100
4101 KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0);
4102
4103 sched_cond_ack(cond: &(cq->pgo_wakeup));
4104
4105 q = cq->q;
4106
4107 while (true) {
4108#if DEVELOPMENT || DEBUG
4109 bool benchmark_accounting = false;
4110 /*
4111 * If we're running the compressor perf test, only process the benchmark pages.
4112 * We'll get back to our regular queue once the benchmark is done
4113 */
4114 if (compressor_running_perf_test) {
4115 q = cq->benchmark_q;
4116 if (!vm_page_queue_empty(&q->pgo_pending)) {
4117 benchmark_accounting = true;
4118 } else {
4119 q = cq->q;
4120 benchmark_accounting = false;
4121 }
4122 }
4123#endif /* DEVELOPMENT || DEBUG */
4124
4125#if __AMP__
4126 if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) {
4127 local_batch_size = (q->pgo_maxlaundry >> 3);
4128 local_batch_size = MAX(local_batch_size, 16);
4129 } else {
4130 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4131 }
4132#else
4133 local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2);
4134#endif
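/*
 * Illustrative arithmetic (hypothetical numbers): with a pgo_maxlaundry of
 * 128 and two compressor threads, the default expression above yields a
 * local_batch_size of 128 / (2 * 2) = 32 pages per pass; the __AMP__
 * E-bound case instead uses MAX(pgo_maxlaundry >> 3, 16).
 */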
4135
4136#if RECORD_THE_COMPRESSED_DATA
4137 if (q->pgo_laundry) {
4138 c_compressed_record_init();
4139 }
4140#endif
4141 while (true) {
4142 int pages_left_on_q = 0;
4143
4144 local_cnt = 0;
4145 local_q = NULL;
4146
4147 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0);
4148
4149 vm_page_lock_queues();
4150#if DEVELOPMENT || DEBUG
4151 if (marked_active == FALSE) {
4152 vmct_active++;
4153 vmct_state[cq->id] = VMCT_ACTIVE;
4154 marked_active = TRUE;
4155 if (vmct_active == 1) {
4156 vm_compressor_epoch_start = mach_absolute_time();
4157 }
4158 }
4159#endif
4160 KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4161
4162 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0);
4163
4164 while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) {
4165 vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq);
4166 assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q);
4167 VM_PAGE_CHECK(m);
4168
4169 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
4170 VM_PAGE_ZERO_PAGEQ_ENTRY(m);
4171 m->vmp_laundry = FALSE;
4172
4173 m->vmp_snext = local_q;
4174 local_q = m;
4175 local_cnt++;
4176 }
4177 if (local_q == NULL) {
4178 break;
4179 }
4180
4181 q->pgo_busy = TRUE;
4182
4183 if ((pgo_draining = q->pgo_draining) == FALSE) {
4184 vm_pageout_throttle_up_batch(q, batch_cnt: local_cnt);
4185 pages_left_on_q = q->pgo_laundry;
4186 } else {
4187 pages_left_on_q = q->pgo_laundry - local_cnt;
4188 }
4189
4190 vm_page_unlock_queues();
4191
4192#if !RECORD_THE_COMPRESSED_DATA
4193 if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) {
4194 // wake up the next compressor thread
4195 sched_cond_signal(cond: &pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
4196 thread: pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4197 }
4198#endif
4199 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0);
4200
4201 while (local_q) {
4202 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0);
4203
4204 m = local_q;
4205 local_q = m->vmp_snext;
4206 m->vmp_snext = NULL;
4207
4208 /*
4209 * Technically we need the pageq locks to manipulate this field.
4210 * However, this page has been removed from all queues and is only
4211 * known to this compressor thread dealing with this local queue.
4212 *
4213 * TODO LIONEL: Add a second localq that is the early localq and
4214 * put special pages like this one on that queue in the block above
4215 * under the pageq lock to avoid this 'works but not clean' logic.
4216 */
4217 void *donate_queue_head;
4218#if XNU_TARGET_OS_OSX
4219 donate_queue_head = &cq->current_early_swapout_chead;
4220#else /* XNU_TARGET_OS_OSX */
4221 donate_queue_head = &cq->current_late_swapout_chead;
4222#endif /* XNU_TARGET_OS_OSX */
4223 if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) {
4224 m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY;
4225 chead = donate_queue_head;
4226 } else {
4227 chead = &cq->current_regular_swapout_chead;
4228 }
4229
4230 if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) {
4231#if DEVELOPMENT || DEBUG
4232 ncomps++;
4233#endif
4234 KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0);
4235
4236 m->vmp_snext = local_freeq;
4237 local_freeq = m;
4238 local_freed++;
4239
4240 if (local_freed >= MAX_FREE_BATCH) {
4241 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4242
4243 vm_page_free_list(mem: local_freeq, TRUE);
4244
4245 local_freeq = NULL;
4246 local_freed = 0;
4247 }
4248 }
4249#if DEVELOPMENT || DEBUG
4250 num_pages_processed++;
4251#endif /* DEVELOPMENT || DEBUG */
4252#if !CONFIG_JETSAM
4253 while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4254 kern_return_t wait_result;
4255 int need_wakeup = 0;
4256
4257 if (local_freeq) {
4258 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4259
4260 vm_page_free_list(mem: local_freeq, TRUE);
4261 local_freeq = NULL;
4262 local_freed = 0;
4263
4264 continue;
4265 }
4266 vm_free_page_lock_spin();
4267
4268 if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) {
4269 if (vm_page_free_wanted_privileged++ == 0) {
4270 need_wakeup = 1;
4271 }
4272 wait_result = assert_wait(event: (event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4273
4274 vm_free_page_unlock();
4275
4276 if (need_wakeup) {
4277 thread_wakeup((event_t)&vm_page_free_wanted);
4278 }
4279
4280 if (wait_result == THREAD_WAITING) {
4281 thread_block(THREAD_CONTINUE_NULL);
4282 }
4283 } else {
4284 vm_free_page_unlock();
4285 }
4286 }
4287#endif
4288 }
4289 if (local_freeq) {
4290 OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions);
4291
4292 vm_page_free_list(mem: local_freeq, TRUE);
4293 local_freeq = NULL;
4294 local_freed = 0;
4295 }
4296 if (pgo_draining == TRUE) {
4297 vm_page_lockspin_queues();
4298 vm_pageout_throttle_up_batch(q, batch_cnt: local_cnt);
4299 vm_page_unlock_queues();
4300 }
4301 }
4302 KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0);
4303
4304 /*
4305 * queue lock is held and our q is empty
4306 */
4307 q->pgo_busy = FALSE;
4308#if DEVELOPMENT || DEBUG
4309 if (marked_active == TRUE) {
4310 vmct_active--;
4311 vmct_state[cq->id] = VMCT_IDLE;
4312
4313 if (vmct_active == 0) {
4314 vm_compressor_epoch_stop = mach_absolute_time();
4315 assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start,
4316 "Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4317 vm_compressor_epoch_start, vm_compressor_epoch_stop);
4318 /* This interval includes intervals where one or more
4319 * compressor threads were pre-empted
4320 */
4321 vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start;
4322 }
4323 }
4324 if (compressor_running_perf_test && benchmark_accounting) {
4325 /*
4326 * We could turn ON compressor_running_perf_test while still processing
4327 * regular non-benchmark pages. We shouldn't count them here else we
4328 * could overshoot. We might also still be populating that benchmark Q
4329 * and be under pressure. So we will go back to the regular queues. And
4330 * benchmark accounting will be off for that case too.
4331 */
4332 compressor_perf_test_pages_processed += num_pages_processed;
4333 thread_wakeup(&compressor_perf_test_pages_processed);
4334 }
4335#endif
4336 vm_page_unlock_queues();
4337#if DEVELOPMENT || DEBUG
4338 if (__improbable(vm_compressor_time_thread)) {
4339 vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self();
4340 vmct_stats.vmct_pages[cq->id] += ncomps;
4341 vmct_stats.vmct_iterations[cq->id]++;
4342 if (ncomps > vmct_stats.vmct_maxpages[cq->id]) {
4343 vmct_stats.vmct_maxpages[cq->id] = ncomps;
4344 }
4345 if (ncomps < vmct_stats.vmct_minpages[cq->id]) {
4346 vmct_stats.vmct_minpages[cq->id] = ncomps;
4347 }
4348 }
4349#endif
4350
4351 KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0);
4352#if DEVELOPMENT || DEBUG
4353 if (compressor_running_perf_test && benchmark_accounting) {
4354 /*
4355 * We've been exclusively compressing pages from the benchmark queue,
4356 * do 1 pass over the internal queue before blocking.
4357 */
4358 continue;
4359 }
4360#endif
4361
4362 sched_cond_wait_parameter(cond: &(cq->pgo_wakeup), THREAD_UNINT, continuation: (thread_continue_t)vm_pageout_iothread_internal_continue, parameter: (void *) cq);
4363 }
4364 /*NOTREACHED*/
4365}
4366
4367
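/*
 * vm_pageout_compress_page:
 * Hands a single dirty page to the compressor pager. On KERN_SUCCESS the
 * page has been compressed (and removed from its object if it was tabled)
 * and the caller is expected to free it; on failure the page is
 * reactivated here before returning. Sketch of the caller-side pattern
 * used by the internal iothread above (illustrative only):
 *
 *	if (vm_pageout_compress_page(chead, scratch_buf, m) == KERN_SUCCESS) {
 *		m->vmp_snext = local_freeq;	// batch for vm_page_free_list()
 *		local_freeq = m;
 *		local_freed++;
 *	}
 */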
4368kern_return_t
4369vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m)
4370{
4371 vm_object_t object;
4372 memory_object_t pager;
4373 int compressed_count_delta;
4374 kern_return_t retval;
4375
4376 object = VM_PAGE_OBJECT(m);
4377
4378 assert(!m->vmp_free_when_done);
4379 assert(!m->vmp_laundry);
4380
4381 pager = object->pager;
4382
4383 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4384 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0);
4385
4386 vm_object_lock(object);
4387
4388 /*
4389 * If there is no memory object for the page, create
4390 * one and hand it to the compression pager.
4391 */
4392
4393 if (!object->pager_initialized) {
4394 vm_object_collapse(object, offset: (vm_object_offset_t) 0, TRUE);
4395 }
4396 if (!object->pager_initialized) {
4397 vm_object_compressor_pager_create(object);
4398 }
4399
4400 pager = object->pager;
4401
4402 if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) {
4403 /*
4404 * Still no pager for the object,
4405 * or the pager has been destroyed.
4406 * Reactivate the page.
4407 *
4408 * Should only happen if there is no
4409 * compression pager
4410 */
4411 PAGE_WAKEUP_DONE(m);
4412
4413 vm_page_lockspin_queues();
4414 vm_page_activate(page: m);
4415 VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1);
4416 vm_page_unlock_queues();
4417
4418 /*
4419 * And we are done with it.
4420 */
4421 vm_object_activity_end(object);
4422 vm_object_unlock(object);
4423
4424 return KERN_FAILURE;
4425 }
4426 vm_object_unlock(object);
4427
4428 KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0);
4429 }
4430 assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL);
4431 assert(object->activity_in_progress > 0);
4432
4433#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4434 if (m->vmp_unmodified_ro == true) {
4435 os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed);
4436 }
4437#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4438
4439 retval = vm_compressor_pager_put(
4440 mem_obj: pager,
4441 offset: m->vmp_offset + object->paging_offset,
4442 ppnum: VM_PAGE_GET_PHYS_PAGE(m),
4443#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
4444 m->vmp_unmodified_ro,
4445#else /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4446 false,
4447#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
4448 current_chead,
4449 scratch_buf,
4450 compressed_count_delta_p: &compressed_count_delta);
4451
4452 vm_object_lock(object);
4453
4454 assert(object->activity_in_progress > 0);
4455 assert(VM_PAGE_OBJECT(m) == object);
4456 assert( !VM_PAGE_WIRED(m));
4457
4458 vm_compressor_pager_count(mem_obj: pager,
4459 compressed_count_delta,
4460 FALSE, /* shared_lock */
4461 object);
4462
4463 if (retval == KERN_SUCCESS) {
4464 /*
4465 * If the object is purgeable, its owner's
4466 * purgeable ledgers will be updated in
4467 * vm_page_remove() but the page still
4468 * contributes to the owner's memory footprint,
4469 * so account for it as such.
4470 */
4471 if ((object->purgable != VM_PURGABLE_DENY ||
4472 object->vo_ledger_tag) &&
4473 object->vo_owner != NULL) {
4474 /* one more compressed purgeable/tagged page */
4475 vm_object_owner_compressed_update(object,
4476 delta: compressed_count_delta);
4477 }
4478 counter_inc(&vm_statistics_compressions);
4479
4480 if (m->vmp_tabled) {
4481 vm_page_remove(page: m, TRUE);
4482 }
4483 } else {
4484 PAGE_WAKEUP_DONE(m);
4485
4486 vm_page_lockspin_queues();
4487
4488 vm_page_activate(page: m);
4489 vm_pageout_vminfo.vm_compressor_failed++;
4490
4491 vm_page_unlock_queues();
4492 }
4493 vm_object_activity_end(object);
4494 vm_object_unlock(object);
4495
4496 return retval;
4497}
4498
4499
4500static void
4501vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority)
4502{
4503 uint32_t policy;
4504
4505 if (hibernate_cleaning_in_progress == TRUE) {
4506 req_lowpriority = FALSE;
4507 }
4508
4509 if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) {
4510 vm_page_unlock_queues();
4511
4512 if (req_lowpriority == TRUE) {
4513 policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED;
4514 DTRACE_VM(laundrythrottle);
4515 } else {
4516 policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED;
4517 DTRACE_VM(laundryunthrottle);
4518 }
4519 proc_set_thread_policy(thread: ethr->pgo_iothread,
4520 TASK_POLICY_EXTERNAL, TASK_POLICY_IO, value: policy);
4521
4522 vm_page_lock_queues();
4523 ethr->q->pgo_lowpriority = req_lowpriority;
4524 }
4525}
4526
4527OS_NORETURN
4528static void
4529vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w)
4530{
4531 thread_t self = current_thread();
4532
4533 self->options |= TH_OPT_VMPRIV;
4534
4535 DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL);
4536
4537 proc_set_thread_policy(thread: self, TASK_POLICY_EXTERNAL,
4538 TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED);
4539
4540 vm_page_lock_queues();
4541
4542 vm_pageout_queue_external.pgo_lowpriority = TRUE;
4543 vm_pageout_queue_external.pgo_inited = TRUE;
4544
4545 vm_page_unlock_queues();
4546
4547#if CONFIG_THREAD_GROUPS
4548 thread_group_vm_add();
4549#endif /* CONFIG_THREAD_GROUPS */
4550
4551 vm_pageout_iothread_external_continue(ethr, w: 0);
4552 /*NOTREACHED*/
4553}
4554
4555
4556OS_NORETURN
4557static void
4558vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w)
4559{
4560 thread_t self = current_thread();
4561
4562 self->options |= TH_OPT_VMPRIV;
4563
4564 vm_page_lock_queues();
4565
4566 vm_pageout_queue_internal.pgo_lowpriority = TRUE;
4567 vm_pageout_queue_internal.pgo_inited = TRUE;
4568
4569#if DEVELOPMENT || DEBUG
4570 vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority;
4571 vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited;
4572 vm_pageout_queue_benchmark.pgo_busy = FALSE;
4573#endif /* DEVELOPMENT || DEBUG */
4574
4575 vm_page_unlock_queues();
4576
4577 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) {
4578 thread_vm_bind_group_add();
4579 }
4580
4581#if CONFIG_THREAD_GROUPS
4582 thread_group_vm_add();
4583#endif /* CONFIG_THREAD_GROUPS */
4584
4585#if __AMP__
4586 if (vm_compressor_ebound) {
4587 /*
4588 * Use the soft bound option for vm_compressor to allow it to run on
4589 * P-cores if E-cluster is unavailable.
4590 */
4591 thread_bind_cluster_type(self, 'E', true);
4592 }
4593#endif /* __AMP__ */
4594
4595 thread_set_thread_name(th: current_thread(), name: "VM_compressor");
4596#if DEVELOPMENT || DEBUG
4597 vmct_stats.vmct_minpages[cthr->id] = INT32_MAX;
4598#endif
4599 vm_pageout_iothread_internal_continue(cq: cthr, w: 0);
4600
4601 /*NOTREACHED*/
4602}
4603
4604kern_return_t
4605vm_set_buffer_cleanup_callout(boolean_t (*func)(int))
4606{
4607 if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) {
4608 return KERN_SUCCESS;
4609 } else {
4610 return KERN_FAILURE; /* Already set */
4611 }
4612}
4613
4614extern boolean_t memorystatus_manual_testing_on;
4615extern unsigned int memorystatus_level;
4616
4617
4618#if VM_PRESSURE_EVENTS
4619
4620boolean_t vm_pressure_events_enabled = FALSE;
4621
4622extern uint64_t next_warning_notification_sent_at_ts;
4623extern uint64_t next_critical_notification_sent_at_ts;
4624
4625#define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */
4626
4627/*
4628 * The last time there was a change in pressure level OR we forced a check
4629 * because the system is stuck in a non-normal pressure level.
4630 */
4631uint64_t vm_pressure_last_level_transition_abs = 0;
4632
4633/*
4634 * This is how long the system waits 'stuck' in an unchanged non-normal pressure
4635 * level before resending notifications for that level again.
4636 */
4637int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS;
4638
4639void
4640vm_pressure_response(void)
4641{
4642 vm_pressure_level_t old_level = kVMPressureNormal;
4643 int new_level = -1;
4644 unsigned int total_pages;
4645 uint64_t available_memory = 0;
4646 uint64_t curr_ts, abs_time_since_level_transition, time_in_ns;
4647 bool force_check = false;
4648 int time_in_mins;
4649
4650
4651 if (vm_pressure_events_enabled == FALSE) {
4652 return;
4653 }
4654
4655#if !XNU_TARGET_OS_OSX
4656
4657 available_memory = (uint64_t) memorystatus_available_pages;
4658
4659#else /* !XNU_TARGET_OS_OSX */
4660
4661 available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4662 memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY;
4663
4664#endif /* !XNU_TARGET_OS_OSX */
4665
4666 total_pages = (unsigned int) atop_64(max_mem);
4667#if CONFIG_SECLUDED_MEMORY
4668 total_pages -= vm_page_secluded_count;
4669#endif /* CONFIG_SECLUDED_MEMORY */
4670 memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages);
4671
4672 if (memorystatus_manual_testing_on) {
4673 return;
4674 }
4675
4676 curr_ts = mach_absolute_time();
4677 abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs;
4678
4679 absolutetime_to_nanoseconds(abstime: abs_time_since_level_transition, result: &time_in_ns);
4680 time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60);
4681 force_check = (time_in_mins >= vm_pressure_level_transition_threshold);
4682
4683 old_level = memorystatus_vm_pressure_level;
4684
4685 switch (memorystatus_vm_pressure_level) {
4686 case kVMPressureNormal:
4687 {
4688 if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4689 new_level = kVMPressureCritical;
4690 } else if (VM_PRESSURE_NORMAL_TO_WARNING()) {
4691 new_level = kVMPressureWarning;
4692 }
4693 break;
4694 }
4695
4696 case kVMPressureWarning:
4697 case kVMPressureUrgent:
4698 {
4699 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4700 new_level = kVMPressureNormal;
4701 } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) {
4702 new_level = kVMPressureCritical;
4703 } else if (force_check) {
4704 new_level = kVMPressureWarning;
4705 next_warning_notification_sent_at_ts = curr_ts;
4706 }
4707 break;
4708 }
4709
4710 case kVMPressureCritical:
4711 {
4712 if (VM_PRESSURE_WARNING_TO_NORMAL()) {
4713 new_level = kVMPressureNormal;
4714 } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) {
4715 new_level = kVMPressureWarning;
4716 } else if (force_check) {
4717 new_level = kVMPressureCritical;
4718 next_critical_notification_sent_at_ts = curr_ts;
4719 }
4720 break;
4721 }
4722
4723 default:
4724 return;
4725 }
4726
4727 if (new_level != -1 || force_check) {
4728 if (new_level != -1) {
4729 memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level;
4730
4731 if (new_level != (int) old_level) {
4732 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4733 new_level, old_level, 0, 0);
4734 }
4735 } else {
4736 VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE,
4737 new_level, old_level, force_check, 0);
4738 }
4739
4740 if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) {
4741 /*
4742 * We don't want to schedule a wakeup while hibernation is in progress
4743 * because that could collide with checks for non-monotonicity in the scheduler.
4744 * We do however do all the updates to memorystatus_vm_pressure_level because
4745 * we _might_ want to use that for decisions regarding which pages or how
4746 * many pages we want to dump in hibernation.
4747 */
4748 return;
4749 }
4750
4751 if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) {
4752 if (vm_pageout_state.vm_pressure_thread_running == FALSE) {
4753 thread_wakeup(&vm_pressure_thread);
4754 }
4755
4756 if (old_level != memorystatus_vm_pressure_level) {
4757 thread_wakeup(&vm_pageout_state.vm_pressure_changed);
4758 }
4759 vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */
4760 }
4761 }
4762}
4763#endif /* VM_PRESSURE_EVENTS */
4764
4765
4766/**
4767 * Called by a kernel thread to ask if a number of pages may be wired.
4768 */
4769kern_return_t
4770mach_vm_wire_level_monitor(int64_t requested_pages)
4771{
4772 if (requested_pages <= 0) {
4773 return KERN_INVALID_ARGUMENT;
4774 }
4775
4776 const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit);
4777 /**
4778 * Available pages can be negative in the case where more system memory is
4779 * wired than the threshold, so we must use a signed integer.
4780 */
4781 const int64_t available_pages = max_wire_pages - vm_page_wire_count;
4782
4783 if (requested_pages > available_pages) {
4784 return KERN_RESOURCE_SHORTAGE;
4785 }
4786 return KERN_SUCCESS;
4787}
4788
4789/*
4790 * Function called by a kernel thread to either get the current pressure level or
4791 * wait until memory pressure changes from a given level.
4792 */
4793kern_return_t
4794mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level)
4795{
4796#if !VM_PRESSURE_EVENTS
4797
4798 return KERN_FAILURE;
4799
4800#else /* VM_PRESSURE_EVENTS */
4801
4802 wait_result_t wr = 0;
4803 vm_pressure_level_t old_level = memorystatus_vm_pressure_level;
4804
4805 if (pressure_level == NULL) {
4806 return KERN_INVALID_ARGUMENT;
4807 }
4808
4809 if (*pressure_level == kVMPressureJetsam) {
4810 if (!wait_for_pressure) {
4811 return KERN_INVALID_ARGUMENT;
4812 }
4813
4814 lck_mtx_lock(lck: &memorystatus_jetsam_fg_band_lock);
4815 wr = assert_wait(event: (event_t)&memorystatus_jetsam_fg_band_waiters,
4816 THREAD_INTERRUPTIBLE);
4817 if (wr == THREAD_WAITING) {
4818 ++memorystatus_jetsam_fg_band_waiters;
4819 lck_mtx_unlock(lck: &memorystatus_jetsam_fg_band_lock);
4820 wr = thread_block(THREAD_CONTINUE_NULL);
4821 } else {
4822 lck_mtx_unlock(lck: &memorystatus_jetsam_fg_band_lock);
4823 }
4824 if (wr != THREAD_AWAKENED) {
4825 return KERN_ABORTED;
4826 }
4827 *pressure_level = kVMPressureJetsam;
4828 return KERN_SUCCESS;
4829 }
4830
4831 if (wait_for_pressure == TRUE) {
4832 while (old_level == *pressure_level) {
4833 wr = assert_wait(event: (event_t) &vm_pageout_state.vm_pressure_changed,
4834 THREAD_INTERRUPTIBLE);
4835 if (wr == THREAD_WAITING) {
4836 wr = thread_block(THREAD_CONTINUE_NULL);
4837 }
4838 if (wr == THREAD_INTERRUPTED) {
4839 return KERN_ABORTED;
4840 }
4841
4842 if (wr == THREAD_AWAKENED) {
4843 old_level = memorystatus_vm_pressure_level;
4844 }
4845 }
4846 }
4847
4848 *pressure_level = old_level;
4849 return KERN_SUCCESS;
4850#endif /* VM_PRESSURE_EVENTS */
4851}
4852
4853#if VM_PRESSURE_EVENTS
4854void
4855vm_pressure_thread(void)
4856{
4857 static boolean_t thread_initialized = FALSE;
4858
4859 if (thread_initialized == TRUE) {
4860 vm_pageout_state.vm_pressure_thread_running = TRUE;
4861 consider_vm_pressure_events();
4862 vm_pageout_state.vm_pressure_thread_running = FALSE;
4863 }
4864
4865#if CONFIG_THREAD_GROUPS
4866 thread_group_vm_add();
4867#endif /* CONFIG_THREAD_GROUPS */
4868
4869 thread_set_thread_name(th: current_thread(), name: "VM_pressure");
4870 thread_initialized = TRUE;
4871 assert_wait(event: (event_t) &vm_pressure_thread, THREAD_UNINT);
4872 thread_block(continuation: (thread_continue_t)vm_pressure_thread);
4873}
4874#endif /* VM_PRESSURE_EVENTS */
4875
4876
4877/*
4878 * called once per-second via "compute_averages"
4879 */
4880void
4881compute_pageout_gc_throttle(__unused void *arg)
4882{
4883 if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) {
4884 vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page;
4885
4886 thread_wakeup(VM_PAGEOUT_GC_EVENT);
4887 }
4888}
4889
4890/*
4891 * vm_pageout_garbage_collect can also be called when the zone allocator needs
4892 * to call zone_gc on a different thread in order to trigger zone-map-exhaustion
4893 * jetsams. We need to check if the zone map size is above its jetsam limit to
4894 * decide if this was indeed the case.
4895 *
4896 * We need to do this on a different thread because of the following reasons:
4897 *
4898 * 1. In the case of synchronous jetsams, the leaking process can try to jetsam
4899 * itself causing the system to hang. We perform synchronous jetsams if we're
4900 * leaking in the VM map entries zone, so the leaking process could be doing a
4901 * zalloc for a VM map entry while holding its vm_map lock, when it decides to
4902 * jetsam itself. We also need the vm_map lock on the process termination path,
4903 * which would now lead the dying process to deadlock against itself.
4904 *
4905 * 2. The jetsam path might need to allocate zone memory itself. We could try
4906 * using the non-blocking variant of zalloc for this path, but we can still
4907 * end up trying to do a kmem_alloc when the zone maps are almost full.
4908 */
4909__dead2
4910void
4911vm_pageout_garbage_collect(void *step, wait_result_t wr __unused)
4912{
4913 assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT);
4914
4915 if (step == VM_PAGEOUT_GC_INIT) {
4916 /* first time being called is not about GC */
4917#if CONFIG_THREAD_GROUPS
4918 thread_group_vm_add();
4919#endif /* CONFIG_THREAD_GROUPS */
4920 } else if (zone_map_nearing_exhaustion()) {
4921 /*
4922 * Woken up by the zone allocator for zone-map-exhaustion jetsams.
4923 *
4924 * Bail out after calling zone_gc (which triggers the
4925 * zone-map-exhaustion jetsams). If we fall through, the subsequent
4926 * operations that clear out a bunch of caches might allocate zone
4927 * memory themselves (for eg. vm_map operations would need VM map
4928 * entries). Since the zone map is almost full at this point, we
4929 * could end up with a panic. We just need to quickly jetsam a
4930 * process and exit here.
4931 *
4932 * It could so happen that we were woken up to relieve memory
4933 * pressure and the zone map also happened to be near its limit at
4934 * the time, in which case we'll skip out early. But that should be
4935 * ok; if memory pressure persists, the thread will simply be woken
4936 * up again.
4937 */
4938 zone_gc(level: ZONE_GC_JETSAM);
4939 } else {
4940 /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */
4941 boolean_t buf_large_zfree = FALSE;
4942 boolean_t first_try = TRUE;
4943
4944 stack_collect();
4945
4946 consider_machine_collect();
4947#if CONFIG_MBUF_MCACHE
4948 mbuf_drain(FALSE);
4949#endif /* CONFIG_MBUF_MCACHE */
4950
4951 do {
4952 if (consider_buffer_cache_collect != NULL) {
4953 buf_large_zfree = (*consider_buffer_cache_collect)(0);
4954 }
4955 if (first_try == TRUE || buf_large_zfree == TRUE) {
4956 /*
4957 * zone_gc should be last, because the other operations
4958 * might return memory to zones.
4959 */
4960 zone_gc(level: ZONE_GC_TRIM);
4961 }
4962 first_try = FALSE;
4963 } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target);
4964
4965 consider_machine_adjust();
4966 }
4967
4968 assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT);
4969
4970 thread_block_parameter(continuation: vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
4971 __builtin_unreachable();
4972}
4973
4974
4975#if VM_PAGE_BUCKETS_CHECK
4976#if VM_PAGE_FAKE_BUCKETS
4977extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end;
4978#endif /* VM_PAGE_FAKE_BUCKETS */
4979#endif /* VM_PAGE_BUCKETS_CHECK */
4980
4981
4982
4983void
4984vm_set_restrictions(unsigned int num_cpus)
4985{
4986 int vm_restricted_to_single_processor = 0;
4987
4988 if (PE_parse_boot_argn(arg_string: "vm_restricted_to_single_processor", arg_ptr: &vm_restricted_to_single_processor, max_arg: sizeof(vm_restricted_to_single_processor))) {
4989 kprintf(fmt: "Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4990 vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE);
4991 } else {
4992 assert(num_cpus > 0);
4993
4994 if (num_cpus <= 3) {
4995 /*
4996 * on systems with a limited number of CPUs, bind the
4997 * 4 major threads that can free memory and that tend to use
4998 * a fair bit of CPU under pressured conditions to a single processor.
4999 * This ensures that these threads don't hog all of the available CPUs
5000 * (important for camera launch), while allowing them to run independently
5001 * with respect to locks... the 4 threads are
5002 * vm_pageout_scan, vm_pageout_iothread_internal (compressor),
5003 * vm_compressor_swap_trigger_thread (minor and major compactions),
5004 * memorystatus_thread (jetsams).
5005 *
5006 * the first time the thread is run, it is responsible for checking the
5007 * state of vm_restricted_to_single_processor, and if TRUE it calls
5008 * thread_bind_master... someday this should be replaced with a group
5009 * scheduling mechanism and KPI.
5010 */
5011 vm_pageout_state.vm_restricted_to_single_processor = TRUE;
5012 } else {
5013 vm_pageout_state.vm_restricted_to_single_processor = FALSE;
5014 }
5015 }
5016}
5017
5018/*
5019 * Set up vm_config based on the vm_compressor_mode.
5020 * Must run BEFORE the pageout thread starts up.
5021 */
5022__startup_func
5023void
5024vm_config_init(void)
5025{
5026 bzero(s: &vm_config, n: sizeof(vm_config));
5027
5028 switch (vm_compressor_mode) {
5029 case VM_PAGER_DEFAULT:
5030 printf(format: "mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n");
5031 OS_FALLTHROUGH;
5032
5033 case VM_PAGER_COMPRESSOR_WITH_SWAP:
5034 vm_config.compressor_is_present = TRUE;
5035 vm_config.swap_is_present = TRUE;
5036 vm_config.compressor_is_active = TRUE;
5037 vm_config.swap_is_active = TRUE;
5038 break;
5039
5040 case VM_PAGER_COMPRESSOR_NO_SWAP:
5041 vm_config.compressor_is_present = TRUE;
5042 vm_config.swap_is_present = TRUE;
5043 vm_config.compressor_is_active = TRUE;
5044 break;
5045
5046 case VM_PAGER_FREEZER_DEFAULT:
5047 printf(format: "mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n");
5048 OS_FALLTHROUGH;
5049
5050 case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP:
5051 vm_config.compressor_is_present = TRUE;
5052 vm_config.swap_is_present = TRUE;
5053 break;
5054
5055 case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP:
5056 vm_config.compressor_is_present = TRUE;
5057 vm_config.swap_is_present = TRUE;
5058 vm_config.compressor_is_active = TRUE;
5059 vm_config.freezer_swap_is_active = TRUE;
5060 break;
5061
5062 case VM_PAGER_NOT_CONFIGURED:
5063 break;
5064
5065 default:
5066 printf(format: "unknown compressor mode - %x\n", vm_compressor_mode);
5067 break;
5068 }
5069}
5070
5071__startup_func
5072static void
5073vm_pageout_create_gc_thread(void)
5074{
5075 thread_t thread;
5076
5077 if (kernel_thread_create(continuation: vm_pageout_garbage_collect,
5078 VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, new_thread: &thread) != KERN_SUCCESS) {
5079 panic("vm_pageout_garbage_collect: create failed");
5080 }
5081 thread_set_thread_name(th: thread, name: "VM_pageout_garbage_collect");
5082 if (thread->reserved_stack == 0) {
5083 assert(thread->kernel_stack);
5084 thread->reserved_stack = thread->kernel_stack;
5085 }
5086
5087 /* thread is started in vm_pageout() */
5088 vm_pageout_gc_thread = thread;
5089}
5090STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread);
5091
5092void
5093vm_pageout(void)
5094{
5095 thread_t self = current_thread();
5096 thread_t thread;
5097 kern_return_t result;
5098 spl_t s;
5099
5100 /*
5101 * Set thread privileges.
5102 */
5103 s = splsched();
5104
5105#if CONFIG_VPS_DYNAMIC_PRIO
5106 if (vps_dynamic_priority_enabled) {
5107 sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE);
5108 thread_set_eager_preempt(self);
5109 } else {
5110 sched_set_kernel_thread_priority(self, BASEPRI_VM);
5111 }
5112#else /* CONFIG_VPS_DYNAMIC_PRIO */
5113 sched_set_kernel_thread_priority(thread: self, BASEPRI_VM);
5114#endif /* CONFIG_VPS_DYNAMIC_PRIO */
5115
5116 thread_lock(self);
5117 self->options |= TH_OPT_VMPRIV;
5118 thread_unlock(self);
5119
5120 if (!self->reserved_stack) {
5121 self->reserved_stack = self->kernel_stack;
5122 }
5123
5124 if (vm_pageout_state.vm_restricted_to_single_processor == TRUE &&
5125 !vps_dynamic_priority_enabled) {
5126 thread_vm_bind_group_add();
5127 }
5128
5129
5130#if CONFIG_THREAD_GROUPS
5131 thread_group_vm_add();
5132#endif /* CONFIG_THREAD_GROUPS */
5133
5134#if __AMP__
5135 PE_parse_boot_argn("vmpgo_pcluster", &vm_pgo_pbound, sizeof(vm_pgo_pbound));
5136 if (vm_pgo_pbound) {
5137 /*
5138 * Use the soft bound option for vm pageout to allow it to run on
5139 * E-cores if P-cluster is unavailable.
5140 */
5141 thread_bind_cluster_type(self, 'P', true);
5142 }
5143#endif /* __AMP__ */
5144
5145 PE_parse_boot_argn(arg_string: "vmpgo_protect_realtime",
5146 arg_ptr: &vm_pageout_protect_realtime,
5147 max_arg: sizeof(vm_pageout_protect_realtime));
5148 splx(s);
5149
5150 thread_set_thread_name(th: current_thread(), name: "VM_pageout_scan");
5151
5152 /*
5153 * Initialize some paging parameters.
5154 */
5155
5156 vm_pageout_state.vm_pressure_thread_running = FALSE;
5157 vm_pageout_state.vm_pressure_changed = FALSE;
5158 vm_pageout_state.memorystatus_purge_on_warning = 2;
5159 vm_pageout_state.memorystatus_purge_on_urgent = 5;
5160 vm_pageout_state.memorystatus_purge_on_critical = 8;
5161 vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS;
5162 vm_pageout_state.vm_page_speculative_percentage = 5;
5163 vm_pageout_state.vm_page_speculative_target = 0;
5164
5165 vm_pageout_state.vm_pageout_swap_wait = 0;
5166 vm_pageout_state.vm_pageout_idle_wait = 0;
5167 vm_pageout_state.vm_pageout_empty_wait = 0;
5168 vm_pageout_state.vm_pageout_burst_wait = 0;
5169 vm_pageout_state.vm_pageout_deadlock_wait = 0;
5170 vm_pageout_state.vm_pageout_deadlock_relief = 0;
5171 vm_pageout_state.vm_pageout_burst_inactive_throttle = 0;
5172
5173 vm_pageout_state.vm_pageout_inactive = 0;
5174 vm_pageout_state.vm_pageout_inactive_used = 0;
5175 vm_pageout_state.vm_pageout_inactive_clean = 0;
5176
5177 vm_pageout_state.vm_memory_pressure = 0;
5178 vm_pageout_state.vm_page_filecache_min = 0;
5179#if CONFIG_JETSAM
5180 vm_pageout_state.vm_page_filecache_min_divisor = 70;
5181 vm_pageout_state.vm_page_xpmapped_min_divisor = 40;
5182#else
5183 vm_pageout_state.vm_page_filecache_min_divisor = 27;
5184 vm_pageout_state.vm_page_xpmapped_min_divisor = 36;
5185#endif
5186 vm_pageout_state.vm_page_free_count_init = vm_page_free_count;
5187
5188 vm_pageout_state.vm_pageout_considered_page_last = 0;
5189
5190 if (vm_pageout_state.vm_pageout_swap_wait == 0) {
5191 vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT;
5192 }
5193
5194 if (vm_pageout_state.vm_pageout_idle_wait == 0) {
5195 vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT;
5196 }
5197
5198 if (vm_pageout_state.vm_pageout_burst_wait == 0) {
5199 vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT;
5200 }
5201
5202 if (vm_pageout_state.vm_pageout_empty_wait == 0) {
5203 vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT;
5204 }
5205
5206 if (vm_pageout_state.vm_pageout_deadlock_wait == 0) {
5207 vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT;
5208 }
5209
5210 if (vm_pageout_state.vm_pageout_deadlock_relief == 0) {
5211 vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF;
5212 }
5213
5214 if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) {
5215 vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE;
5216 }
5217 /*
5218 * even if we've already called vm_page_free_reserve,
5219 * call it again here to ensure that the targets are
5220 * accurately calculated (it uses vm_page_free_count_init);
5221 * calling it with an arg of 0 will not change the reserve
5222 * but will re-calculate free_min and free_target
5223 */
5224 if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) {
5225 vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
5226 } else {
5227 vm_page_free_reserve(0);
5228 }
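/*
 * Illustrative example (the numbers are hypothetical): if
 * VM_PAGE_FREE_RESERVED(processor_count) works out to 400 pages and
 * vm_page_free_reserved currently holds 100, the call above adds the
 * 300-page difference; if the reserve already meets or exceeds the target,
 * the 0-page call leaves the reserve untouched and simply recomputes
 * free_min and free_target from it.
 */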
5229
5230 bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
5231 bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5232
5233 vm_page_queue_init(&vm_pageout_queue_external.pgo_pending);
5234 vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX;
5235
5236 vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending);
5237
5238#if DEVELOPMENT || DEBUG
5239 bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue));
5240 vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending);
5241#endif /* DEVELOPMENT || DEBUG */
5242
5243
5244 /* internal pageout thread started when default pager registered first time */
5245 /* external pageout and garbage collection threads started here */
5246 struct pgo_iothread_state *ethr = &pgo_iothread_external_state;
5247 ethr->id = 0;
5248 ethr->q = &vm_pageout_queue_external;
5249 ethr->current_early_swapout_chead = NULL;
5250 ethr->current_regular_swapout_chead = NULL;
5251 ethr->current_late_swapout_chead = NULL;
5252 ethr->scratch_buf = NULL;
5253#if DEVELOPMENT || DEBUG
5254 ethr->benchmark_q = NULL;
5255#endif /* DEVELOPMENT || DEBUG */
5256 sched_cond_init(&(ethr->pgo_wakeup));
5257
5258 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
5259 (void *)ethr, BASEPRI_VM,
5260 &(ethr->pgo_iothread));
5261 if (result != KERN_SUCCESS) {
5262 panic("vm_pageout: Unable to create external thread (%d)\n", result);
5263 }
5264 thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");
5265
5266 thread_mtx_lock(vm_pageout_gc_thread);
5267 thread_start(vm_pageout_gc_thread);
5268 thread_mtx_unlock(vm_pageout_gc_thread);
5269
5270#if VM_PRESSURE_EVENTS
5271 result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
5272 BASEPRI_DEFAULT,
5273 &thread);
5274
5275 if (result != KERN_SUCCESS) {
5276 panic("vm_pressure_thread: create failed");
5277 }
5278
5279 thread_deallocate(thread);
5280#endif
5281
5282 vm_object_reaper_init();
5283
5284
5285 if (VM_CONFIG_COMPRESSOR_IS_PRESENT) {
5286 vm_compressor_init();
5287 }
5288
5289#if VM_PRESSURE_EVENTS
5290 vm_pressure_events_enabled = TRUE;
5291#endif /* VM_PRESSURE_EVENTS */
5292
5293#if CONFIG_PHANTOM_CACHE
5294 vm_phantom_cache_init();
5295#endif
5296#if VM_PAGE_BUCKETS_CHECK
5297#if VM_PAGE_FAKE_BUCKETS
5298 printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n",
5299 (uint64_t) vm_page_fake_buckets_start,
5300 (uint64_t) vm_page_fake_buckets_end);
5301 pmap_protect(kernel_pmap,
5302 vm_page_fake_buckets_start,
5303 vm_page_fake_buckets_end,
5304 VM_PROT_READ);
5305// *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */
5306#endif /* VM_PAGE_FAKE_BUCKETS */
5307#endif /* VM_PAGE_BUCKETS_CHECK */
5308
5309#if VM_OBJECT_TRACKING
5310 vm_object_tracking_init();
5311#endif /* VM_OBJECT_TRACKING */
5312
5313#if __arm64__
5314// vm_tests();
5315#endif /* __arm64__ */
5316
5317 vm_pageout_continue();
5318
5319 /*
5320 * Unreached code!
5321 *
5322 * The vm_pageout_continue() call above never returns, so the code below is never
5323 * executed. We take advantage of this to declare several DTrace VM related probe
5324 * points that our kernel doesn't have an analog for. These are probe points that
5325 * exist in Solaris and are in the DTrace documentation, so people may have written
5326 * scripts that use them. Declaring the probe points here means their scripts will
5327 * compile and execute which we want for portability of the scripts, but since this
5328 * section of code is never reached, the probe points will simply never fire. Yes,
5329 * this is basically a hack. The problem is the DTrace probe points were chosen with
5330 * Solaris specific VM events in mind, not portability to different VM implementations.
5331 */
5332
5333 DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL);
5334 DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL);
5335 DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL);
5336 DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL);
5337 DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL);
5338 DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL);
5339 DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL);
5340 /*NOTREACHED*/
5341}
5342
5343
5344
5345kern_return_t
5346vm_pageout_internal_start(void)
5347{
5348 kern_return_t result = KERN_SUCCESS;
5349 host_basic_info_data_t hinfo;
5350 vm_offset_t buf, bufsize;
5351
5352 assert(VM_CONFIG_COMPRESSOR_IS_PRESENT);
5353
5354 mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
5355#define BSD_HOST 1
5356 host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5357
5358 assert(hinfo.max_cpus > 0);
5359
5360#if !XNU_TARGET_OS_OSX
5361 vm_pageout_state.vm_compressor_thread_count = 1;
5362#else /* !XNU_TARGET_OS_OSX */
5363 if (hinfo.max_cpus > 4) {
5364 vm_pageout_state.vm_compressor_thread_count = 2;
5365 } else {
5366 vm_pageout_state.vm_compressor_thread_count = 1;
5367 }
5368#endif /* !XNU_TARGET_OS_OSX */
5369#if __AMP__
5370 if (vm_compressor_ebound) {
5371 vm_pageout_state.vm_compressor_thread_count = 2;
5372 }
5373#endif
5374 PE_parse_boot_argn("vmcomp_threads", &vm_pageout_state.vm_compressor_thread_count,
5375 sizeof(vm_pageout_state.vm_compressor_thread_count));
5376
5377 if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) {
5378 vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1;
5379 }
5380 if (vm_pageout_state.vm_compressor_thread_count <= 0) {
5381 vm_pageout_state.vm_compressor_thread_count = 1;
5382 } else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) {
5383 vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT;
5384 }
5385
5386 vm_pageout_queue_internal.pgo_maxlaundry =
5387 (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX;
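/*
 * For illustration (not a guarantee, numbers depend on configuration): with
 * 2 compressor threads the internal queue allows up to 8 * VM_PAGE_LAUNDRY_MAX
 * laundered pages in flight; the "vmpgoi_maxlaundry" boot-arg parsed just
 * below can override the computed value.
 */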
5388
5389 PE_parse_boot_argn("vmpgoi_maxlaundry",
5390 &vm_pageout_queue_internal.pgo_maxlaundry,
5391 sizeof(vm_pageout_queue_internal.pgo_maxlaundry));
5392
5393#if DEVELOPMENT || DEBUG
5394 // Note: this will be modified at enqueue-time such that the benchmark queue is never throttled
5395 vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry;
5396#endif /* DEVELOPMENT || DEBUG */
5397
5398 bufsize = COMPRESSOR_SCRATCH_BUF_SIZE;
5399
5400 kmem_alloc(kernel_map, &buf,
5401 bufsize * vm_pageout_state.vm_compressor_thread_count,
5402 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
5403 VM_KERN_MEMORY_COMPRESSOR);
5404
5405 for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) {
5406 struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i];
5407 iq->id = i;
5408 iq->q = &vm_pageout_queue_internal;
5409 iq->current_early_swapout_chead = NULL;
5410 iq->current_regular_swapout_chead = NULL;
5411 iq->current_late_swapout_chead = NULL;
5412 iq->scratch_buf = (char *)(buf + i * bufsize);
5413#if DEVELOPMENT || DEBUG
5414 iq->benchmark_q = &vm_pageout_queue_benchmark;
5415#endif /* DEVELOPMENT || DEBUG */
5416 sched_cond_init(&(iq->pgo_wakeup));
5417 result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
5418 (void *)iq, BASEPRI_VM,
5419 &(iq->pgo_iothread));
5420
5421 if (result != KERN_SUCCESS) {
5422 panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5423 }
5424 }
5425 return result;
5426}
5427
5428#if CONFIG_IOSCHED
5429/*
5430 * To support I/O Expedite for compressed files we mark the upls with special flags.
5431 * The way decmpfs works is that we create a big upl which marks all the pages needed to
5432 * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs
5433 * then issues smaller I/Os for compressed I/Os, deflates them and puts the data into the pages
5434 * then issues smaller I/Os for the compressed data, decompresses it and puts the data into the pages
5435 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
5436 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
5437 * by the req upl lock (the reverse link doesnt need synch. since we never inspect this link
5438 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
5439 */
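/*
 * Rough sketch of the linkage described above (illustrative only):
 *
 *   req UPL (UPL_DECMP_REQ)          --decmp_io_upl-->  real I/O UPL (UPL_DECMP_REAL_IO)
 *   real I/O UPL (UPL_DECMP_REAL_IO) --decmp_io_upl-->  req UPL (back pointer, unsynchronized)
 *
 * upl_set_decmp_info() below establishes both pointers while holding the
 * req UPL's lock and takes an extra reference on the req UPL, so the req UPL
 * cannot go away while a real I/O UPL still points back at it.
 */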
5440
5441
5442static void
5443upl_set_decmp_info(upl_t upl, upl_t src_upl)
5444{
5445 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5446
5447 upl_lock(src_upl);
5448 if (src_upl->decmp_io_upl) {
5449 /*
5450 * If there is already an alive real I/O UPL, ignore this new UPL.
5451 * This case should rarely happen and even if it does, it just means
5452 * that we might issue a spurious expedite which the driver is expected
5453 * to handle.
5454 */
5455 upl_unlock(src_upl);
5456 return;
5457 }
5458 src_upl->decmp_io_upl = (void *)upl;
5459 src_upl->ref_count++;
5460
5461 upl->flags |= UPL_DECMP_REAL_IO;
5462 upl->decmp_io_upl = (void *)src_upl;
5463 upl_unlock(src_upl);
5464}
5465#endif /* CONFIG_IOSCHED */
5466
5467#if UPL_DEBUG
5468int upl_debug_enabled = 1;
5469#else
5470int upl_debug_enabled = 0;
5471#endif
5472
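/*
 * Note on the allocation scheme in upl_create(): the upl structure comes from
 * kalloc_type(); UPL_CREATE_INTERNAL requests get a trailing upl_page_info
 * array (one entry per page) carved out of the same allocation, and
 * UPL_CREATE_LITE requests additionally get a separately allocated lite-list
 * bitmap with one bit per page.
 */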
5473static upl_t
5474upl_create(int type, int flags, upl_size_t size)
5475{
5476 uint32_t pages = (uint32_t)atop(round_page_32(size));
5477 upl_t upl;
5478
5479 assert(page_aligned(size));
5480
5481 /*
5482 * FIXME: this code assumes the allocation always succeeds,
5483 * however `pages` can be up to MAX_UPL_SIZE.
5484 *
5485 * The allocation size is above 32k (resp. 128k)
5486 * on 16k pages (resp. 4k), which kalloc might fail
5487 * to allocate.
5488 */
5489 upl = kalloc_type(struct upl, struct upl_page_info,
5490 (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO);
5491 if (type & UPL_CREATE_INTERNAL) {
5492 flags |= UPL_INTERNAL;
5493 }
5494
5495 if (type & UPL_CREATE_LITE) {
5496 flags |= UPL_LITE;
5497 if (pages) {
5498 upl->lite_list = bitmap_alloc(pages);
5499 }
5500 }
5501
5502 upl->flags = flags;
5503 upl->ref_count = 1;
5504 upl_lock_init(upl);
5505#if CONFIG_IOSCHED
5506 if (type & UPL_CREATE_IO_TRACKING) {
5507 upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5508 }
5509
5510 if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) {
5511 /* Only support expedite on internal UPLs */
5512 thread_t curthread = current_thread();
5513 upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages,
5514 Z_WAITOK | Z_ZERO);
5515 upl->flags |= UPL_EXPEDITE_SUPPORTED;
5516 if (curthread->decmp_upl != NULL) {
5517 upl_set_decmp_info(upl, curthread->decmp_upl);
5518 }
5519 }
5520#endif
5521#if CONFIG_IOSCHED || UPL_DEBUG
5522 if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) {
5523 upl->upl_creator = current_thread();
5524 upl->flags |= UPL_TRACKED_BY_OBJECT;
5525 }
5526#endif
5527
5528#if UPL_DEBUG
5529 upl->uple_create_btref = btref_get(__builtin_frame_address(0), 0);
5530#endif /* UPL_DEBUG */
5531
5532 return upl;
5533}
5534
5535static void
5536upl_destroy(upl_t upl)
5537{
5538 uint32_t pages;
5539
5540// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object);
5541
5542 if (upl->ext_ref_count) {
5543 panic("upl(%p) ext_ref_count", upl);
5544 }
5545
5546#if CONFIG_IOSCHED
5547 if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) {
5548 upl_t src_upl;
5549 src_upl = upl->decmp_io_upl;
5550 assert((src_upl->flags & UPL_DECMP_REQ) != 0);
5551 upl_lock(src_upl);
5552 src_upl->decmp_io_upl = NULL;
5553 upl_unlock(src_upl);
5554 upl_deallocate(src_upl);
5555 }
5556#endif /* CONFIG_IOSCHED */
5557
5558#if CONFIG_IOSCHED || UPL_DEBUG
5559 if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) &&
5560 !(upl->flags & UPL_VECTOR)) {
5561 vm_object_t object;
5562
5563 if (upl->flags & UPL_SHADOWED) {
5564 object = upl->map_object->shadow;
5565 } else {
5566 object = upl->map_object;
5567 }
5568
5569 vm_object_lock(object);
5570 queue_remove(&object->uplq, upl, upl_t, uplq);
5571 vm_object_activity_end(object);
5572 vm_object_collapse(object, 0, TRUE);
5573 vm_object_unlock(object);
5574 }
5575#endif
5576 /*
5577 * drop a reference on the map_object whether or
5578 * not a pageout object is inserted
5579 */
5580 if (upl->flags & UPL_SHADOWED) {
5581 vm_object_deallocate(upl->map_object);
5582 }
5583
5584 if (upl->flags & UPL_DEVICE_MEMORY) {
5585 pages = 1;
5586 } else {
5587 pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
5588 }
5589
5590 upl_lock_destroy(upl);
5591
5592#if CONFIG_IOSCHED
5593 if (upl->flags & UPL_EXPEDITE_SUPPORTED) {
5594 kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages);
5595 }
5596#endif
5597
5598#if UPL_DEBUG
5599 for (int i = 0; i < upl->upl_commit_index; i++) {
5600 btref_put(upl->upl_commit_records[i].c_btref);
5601 }
5602 btref_put(upl->uple_create_btref);
5603#endif /* UPL_DEBUG */
5604
5605 if ((upl->flags & UPL_LITE) && pages) {
5606 bitmap_free(upl->lite_list, pages);
5607 }
5608 kfree_type(struct upl, struct upl_page_info,
5609 (upl->flags & UPL_INTERNAL) ? pages : 0, upl);
5610}
5611
5612void
5613upl_deallocate(upl_t upl)
5614{
5615 upl_lock(upl);
5616
5617 if (--upl->ref_count == 0) {
5618 if (vector_upl_is_valid(upl)) {
5619 vector_upl_deallocate(upl);
5620 }
5621 upl_unlock(upl);
5622
5623 if (upl->upl_iodone) {
5624 upl_callout_iodone(upl);
5625 }
5626
5627 upl_destroy(upl);
5628 } else {
5629 upl_unlock(upl);
5630 }
5631}
5632
5633#if CONFIG_IOSCHED
5634void
5635upl_mark_decmp(upl_t upl)
5636{
5637 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
5638 upl->flags |= UPL_DECMP_REQ;
5639 upl->upl_creator->decmp_upl = (void *)upl;
5640 }
5641}
5642
5643void
5644upl_unmark_decmp(upl_t upl)
5645{
5646 if (upl && (upl->flags & UPL_DECMP_REQ)) {
5647 upl->upl_creator->decmp_upl = NULL;
5648 }
5649}
5650
5651#endif /* CONFIG_IOSCHED */
5652
5653#define VM_PAGE_Q_BACKING_UP(q) \
5654 ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10))
5655
5656boolean_t must_throttle_writes(void);
5657
5658boolean_t
5659must_throttle_writes()
5660{
5661 if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) &&
5662 vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) {
5663 return TRUE;
5664 }
5665
5666 return FALSE;
5667}
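/*
 * Worked example (illustrative numbers only): with pgo_maxlaundry == 128,
 * VM_PAGE_Q_BACKING_UP() reports the external queue as backing up once
 * pgo_laundry reaches (128 * 8) / 10 == 102 pages. must_throttle_writes()
 * additionally requires that the pageable external page count exceed 60% of
 * AVAILABLE_NON_COMPRESSED_MEMORY before callers are asked to throttle.
 */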
5668
5669int vm_page_delayed_work_ctx_needed = 0;
5670KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT);
5671
5672__startup_func
5673static void
5674vm_page_delayed_work_init_ctx(void)
5675{
5676 uint16_t min_delayed_work_ctx_allocated = 16;
5677
5678 /*
5679 * try really hard to always keep NCPU elements around in the zone
5680 * in order for the UPL code to almost always get an element.
5681 */
5682 if (min_delayed_work_ctx_allocated < zpercpu_count()) {
5683 min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count();
5684 }
5685
5686 zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5687}
5688STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx);
5689
5690struct vm_page_delayed_work*
5691vm_page_delayed_work_get_ctx(void)
5692{
5693 struct vm_page_delayed_work_ctx * dw_ctx = NULL;
5694
5695 dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT);
5696
5697 if (__probable(dw_ctx)) {
5698 dw_ctx->delayed_owner = current_thread();
5699 } else {
5700 vm_page_delayed_work_ctx_needed++;
5701 }
5702 return dw_ctx ? dw_ctx->dwp : NULL;
5703}
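/*
 * Typical caller pattern (a sketch; see vm_object_upl_request() below for a
 * real instance): grab a context, and if the zone is momentarily empty fall
 * back to a single on-stack vm_page_delayed_work entry with a dw_limit of 1.
 *
 *   dwp_start = vm_page_delayed_work_get_ctx();
 *   if (dwp_start == NULL) {
 *       dwp_start = &dw_array;      // on-stack fallback
 *       dw_limit = 1;
 *       dwp_finish_ctx = FALSE;     // nothing to return to the zone
 *   }
 */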
5704
5705void
5706vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp)
5707{
5708 struct vm_page_delayed_work_ctx *ldw_ctx;
5709
5710 ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp;
5711 ldw_ctx->delayed_owner = NULL;
5712
5713 zfree(dw_ctx_zone, ldw_ctx);
5714}
5715
5716/*
5717 * Routine: vm_object_upl_request
5718 * Purpose:
5719 * Cause the population of a portion of a vm_object.
5720 * Depending on the nature of the request, the pages
5721 * returned may contain valid data or be uninitialized.
5722 * A page list structure, listing the physical pages,
5723 * will be returned upon request.
5724 * This function is called by the file system or any other
5725 * supplier of backing store to a pager.
5726 * IMPORTANT NOTE: The caller must still respect the relationship
5727 * between the vm_object and its backing memory object. The
5728 * caller MUST NOT substitute changes in the backing file
5729 * without first doing a memory_object_lock_request on the
5730 * target range unless it is known that the pages are not
5731 * shared with another entity at the pager level.
5732 * Copy_in_to:
5733 * if a page list structure is present
5734 * return the mapped physical pages, where a
5735 * page is not present, return a non-initialized
5736 * one. If the no_sync bit is turned on, don't
5737 * call the pager unlock to synchronize with other
5738 * possible copies of the page. Leave pages busy
5739 * in the original object, if a page list structure
5740 * was specified. When a commit of the page list
5741 * pages is done, the dirty bit will be set for each one.
5742 * Copy_out_from:
5743 * If a page list structure is present, return
5744 * all mapped pages. Where a page does not exist
5745 * map a zero filled one. Leave pages busy in
5746 * the original object. If a page list structure
5747 * is not specified, this call is a no-op.
5748 *
5749 * Note: access of default pager objects has a rather interesting
5750 * twist. The caller of this routine, presumably the file system
5751 * page cache handling code, will never actually make a request
5752 * against a default pager backed object. Only the default
5753 * pager will make requests on backing store related vm_objects.
5754 * In this way the default pager can maintain the relationship
5755 * between backing store files (abstract memory objects) and
5756 * the vm_objects (cache objects) they support.
5757 *
5758 */
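/*
 * Illustrative call shape (hypothetical, not taken from a specific caller):
 * a pageout-style user would typically pass
 * UPL_SET_INTERNAL | UPL_SET_LITE | UPL_COPYOUT_FROM so that the UPL carries
 * its own page list, works on the object in place (no shadow object), and
 * returns the dirty pages of [offset, offset + size) for cleaning, while a
 * pagein-style user would clear UPL_COPYOUT_FROM and may set
 * UPL_RET_ONLY_ABSENT to populate only the pages missing from the cache.
 */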
5759
5760__private_extern__ kern_return_t
5761vm_object_upl_request(
5762 vm_object_t object,
5763 vm_object_offset_t offset,
5764 upl_size_t size,
5765 upl_t *upl_ptr,
5766 upl_page_info_array_t user_page_list,
5767 unsigned int *page_list_count,
5768 upl_control_flags_t cntrl_flags,
5769 vm_tag_t tag)
5770{
5771 vm_page_t dst_page = VM_PAGE_NULL;
5772 vm_object_offset_t dst_offset;
5773 upl_size_t xfer_size;
5774 unsigned int size_in_pages;
5775 boolean_t dirty;
5776 boolean_t hw_dirty;
5777 upl_t upl = NULL;
5778 unsigned int entry;
5779 vm_page_t alias_page = NULL;
5780 int refmod_state = 0;
5781 vm_object_t last_copy_object;
5782 uint32_t last_copy_version;
5783 struct vm_page_delayed_work dw_array;
5784 struct vm_page_delayed_work *dwp, *dwp_start;
5785 bool dwp_finish_ctx = TRUE;
5786 int dw_count;
5787 int dw_limit;
5788 int io_tracking_flag = 0;
5789 int grab_options;
5790 int page_grab_count = 0;
5791 ppnum_t phys_page;
5792 pmap_flush_context pmap_flush_context_storage;
5793 boolean_t pmap_flushes_delayed = FALSE;
5794#if DEVELOPMENT || DEBUG
5795 task_t task = current_task();
5796#endif /* DEVELOPMENT || DEBUG */
5797
5798 dwp_start = dwp = NULL;
5799
5800 if (cntrl_flags & ~UPL_VALID_FLAGS) {
5801 /*
5802 * For forward compatibility's sake,
5803 * reject any unknown flag.
5804 */
5805 return KERN_INVALID_VALUE;
5806 }
5807 if ((!object->internal) && (object->paging_offset != 0)) {
5808 panic("vm_object_upl_request: external object with non-zero paging offset");
5809 }
5810 if (object->phys_contiguous) {
5811 panic("vm_object_upl_request: contiguous object specified");
5812 }
5813
5814 assertf(page_aligned(offset) && page_aligned(size),
5815 "offset 0x%llx size 0x%x",
5816 offset, size);
5817
5818 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0);
5819
5820 dw_count = 0;
5821 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
5822 dwp_start = vm_page_delayed_work_get_ctx();
5823 if (dwp_start == NULL) {
5824 dwp_start = &dw_array;
5825 dw_limit = 1;
5826 dwp_finish_ctx = FALSE;
5827 }
5828
5829 dwp = dwp_start;
5830
5831 if (size > MAX_UPL_SIZE_BYTES) {
5832 size = MAX_UPL_SIZE_BYTES;
5833 }
5834
5835 if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) {
5836 *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT;
5837 }
5838
5839#if CONFIG_IOSCHED || UPL_DEBUG
5840 if (object->io_tracking || upl_debug_enabled) {
5841 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
5842 }
5843#endif
5844#if CONFIG_IOSCHED
5845 if (object->io_tracking) {
5846 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
5847 }
5848#endif
5849
5850 if (cntrl_flags & UPL_SET_INTERNAL) {
5851 if (cntrl_flags & UPL_SET_LITE) {
5852 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5853 } else {
5854 upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
5855 }
5856 user_page_list = size ? upl->page_list : NULL;
5857 } else {
5858 if (cntrl_flags & UPL_SET_LITE) {
5859 upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
5860 } else {
5861 upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5862 }
5863 }
5864 *upl_ptr = upl;
5865
5866 if (user_page_list) {
5867 user_page_list[0].device = FALSE;
5868 }
5869
5870 if (cntrl_flags & UPL_SET_LITE) {
5871 upl->map_object = object;
5872 } else {
5873 upl->map_object = vm_object_allocate(size);
5874 vm_object_lock(upl->map_object);
5875 /*
5876 * No need to lock the new object: nobody else knows
5877 * about it yet, so it's all ours so far.
5878 */
5879 upl->map_object->shadow = object;
5880 VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
5881 VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
5882 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
5883 upl->map_object->vo_shadow_offset = offset;
5884 upl->map_object->wimg_bits = object->wimg_bits;
5885 assertf(page_aligned(upl->map_object->vo_shadow_offset),
5886 "object %p shadow_offset 0x%llx",
5887 upl->map_object, upl->map_object->vo_shadow_offset);
5888 vm_object_unlock(upl->map_object);
5889
5890 alias_page = vm_page_grab_fictitious(TRUE);
5891
5892 upl->flags |= UPL_SHADOWED;
5893 }
5894 if (cntrl_flags & UPL_FOR_PAGEOUT) {
5895 upl->flags |= UPL_PAGEOUT;
5896 }
5897
5898 vm_object_lock(object);
5899 vm_object_activity_begin(object);
5900
5901 grab_options = 0;
5902#if CONFIG_SECLUDED_MEMORY
5903 if (object->can_grab_secluded) {
5904 grab_options |= VM_PAGE_GRAB_SECLUDED;
5905 }
5906#endif /* CONFIG_SECLUDED_MEMORY */
5907
5908 /*
5909 * we can lock in the paging_offset once paging_in_progress is set
5910 */
5911 upl->u_size = size;
5912 upl->u_offset = offset + object->paging_offset;
5913
5914#if CONFIG_IOSCHED || UPL_DEBUG
5915 if (object->io_tracking || upl_debug_enabled) {
5916 vm_object_activity_begin(object);
5917 queue_enter(&object->uplq, upl, upl_t, uplq);
5918 }
5919#endif
5920 if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) {
5921 /*
5922 * Honor copy-on-write obligations
5923 *
5924 * The caller is gathering these pages and
5925 * might modify their contents. We need to
5926 * make sure that the copy object has its own
5927 * private copies of these pages before we let
5928 * the caller modify them.
5929 */
5930 vm_object_update(object,
5931 offset,
5932 size,
5933 NULL,
5934 NULL,
5935 FALSE, /* should_return */
5936 MEMORY_OBJECT_COPY_SYNC,
5937 VM_PROT_NO_CHANGE);
5938
5939 VM_PAGEOUT_DEBUG(upl_cow, 1);
5940 VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT));
5941 }
5942 /*
5943 * remember which copy object we synchronized with
5944 */
5945 last_copy_object = object->vo_copy;
5946 last_copy_version = object->vo_copy_version;
5947 entry = 0;
5948
5949 xfer_size = size;
5950 dst_offset = offset;
5951 size_in_pages = size / PAGE_SIZE;
5952
5953 if (vm_page_free_count > (vm_page_free_target + size_in_pages) ||
5954 object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) {
5955 object->scan_collisions = 0;
5956 }
5957
5958 if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) {
5959 boolean_t isSSD = FALSE;
5960
5961#if !XNU_TARGET_OS_OSX
5962 isSSD = TRUE;
5963#else /* !XNU_TARGET_OS_OSX */
5964 vnode_pager_get_isSSD(object->pager, &isSSD);
5965#endif /* !XNU_TARGET_OS_OSX */
5966 vm_object_unlock(object);
5967
5968 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
5969
5970 if (isSSD == TRUE) {
5971 delay(1000 * size_in_pages);
5972 } else {
5973 delay(5000 * size_in_pages);
5974 }
5975 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
5976
5977 vm_object_lock(object);
5978 }
5979
5980 while (xfer_size) {
5981 dwp->dw_mask = 0;
5982
5983 if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) {
5984 vm_object_unlock(object);
5985 alias_page = vm_page_grab_fictitious(TRUE);
5986 vm_object_lock(object);
5987 }
5988 if (cntrl_flags & UPL_COPYOUT_FROM) {
5989 upl->flags |= UPL_PAGE_SYNC_DONE;
5990
5991 if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5992 dst_page->vmp_fictitious ||
5993 dst_page->vmp_absent ||
5994 VMP_ERROR_GET(dst_page) ||
5995 dst_page->vmp_cleaning ||
5996 (VM_PAGE_WIRED(dst_page))) {
5997 if (user_page_list) {
5998 user_page_list[entry].phys_addr = 0;
5999 }
6000
6001 goto try_next_page;
6002 }
6003 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6004
6005 /*
6006 * grab this up front...
6007 * a high percentage of the time we're going to
6008 * need the hardware modification state a bit later
6009 * anyway... so we can eliminate an extra call into
6010 * the pmap layer by grabbing it here and recording it
6011 */
6012 if (dst_page->vmp_pmapped) {
6013 refmod_state = pmap_get_refmod(phys_page);
6014 } else {
6015 refmod_state = 0;
6016 }
6017
6018 if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) {
6019 /*
6020 * page is on inactive list and referenced...
6021 * reactivate it now... this gets it out of the
6022 * way of vm_pageout_scan which would have to
6023 * reactivate it upon tripping over it
6024 */
6025 dwp->dw_mask |= DW_vm_page_activate;
6026 }
6027 if (cntrl_flags & UPL_RET_ONLY_DIRTY) {
6028 /*
6029 * we're only asking for DIRTY pages to be returned
6030 */
6031 if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) {
6032 /*
6033 * if this is the page stolen by vm_pageout_scan to be
6034 * cleaned (as opposed to a buddy being clustered in),
6035 * or this request is not being driven by a PAGEOUT cluster,
6036 * then we only need to check for the page being dirty or
6037 * precious to decide whether to return it
6038 */
6039 if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) {
6040 goto check_busy;
6041 }
6042 goto dont_return;
6043 }
6044 /*
6045 * this is a request for a PAGEOUT cluster and this page
6046 * is merely along for the ride as a 'buddy'... not only
6047 * does it have to be dirty to be returned, but it also
6048 * can't have been referenced recently...
6049 */
6050 if ((hibernate_cleaning_in_progress == TRUE ||
6051 (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) ||
6052 (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) &&
6053 ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) {
6054 goto check_busy;
6055 }
6056dont_return:
6057 /*
6058 * if we reach here, we're not to return
6059 * the page... go on to the next one
6060 */
6061 if (dst_page->vmp_laundry == TRUE) {
6062 /*
6063 * if we get here, the page is not 'cleaning' (filtered out above).
6064 * since it has been referenced, remove it from the laundry
6065 * so we don't pay the cost of an I/O to clean a page
6066 * we're just going to take back
6067 */
6068 vm_page_lockspin_queues();
6069
6070 vm_pageout_steal_laundry(dst_page, TRUE);
6071 vm_page_activate(dst_page);
6072
6073 vm_page_unlock_queues();
6074 }
6075 if (user_page_list) {
6076 user_page_list[entry].phys_addr = 0;
6077 }
6078
6079 goto try_next_page;
6080 }
6081check_busy:
6082 if (dst_page->vmp_busy) {
6083 if (cntrl_flags & UPL_NOBLOCK) {
6084 if (user_page_list) {
6085 user_page_list[entry].phys_addr = 0;
6086 }
6087 dwp->dw_mask = 0;
6088
6089 goto try_next_page;
6090 }
6091 /*
6092 * someone else is playing with the
6093 * page. We will have to wait.
6094 */
6095 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6096
6097 continue;
6098 }
6099 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6100 vm_page_lockspin_queues();
6101
6102 if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) {
6103 /*
6104 * we've buddied up a page for a clustered pageout
6105 * that has already been moved to the pageout
6106 * queue by pageout_scan... we need to remove
6107 * it from the queue and drop the laundry count
6108 * on that queue
6109 */
6110 vm_pageout_throttle_up(dst_page);
6111 }
6112 vm_page_unlock_queues();
6113 }
6114 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6115 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6116
6117 if (phys_page > upl->highest_page) {
6118 upl->highest_page = phys_page;
6119 }
6120
6121 assert(!pmap_is_noencrypt(phys_page));
6122
6123 if (cntrl_flags & UPL_SET_LITE) {
6124 unsigned int pg_num;
6125
6126 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6127 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6128 bitmap_set(upl->lite_list, pg_num);
6129
6130 if (hw_dirty) {
6131 if (pmap_flushes_delayed == FALSE) {
6132 pmap_flush_context_init(&pmap_flush_context_storage);
6133 pmap_flushes_delayed = TRUE;
6134 }
6135 pmap_clear_refmod_options(phys_page,
6136 VM_MEM_MODIFIED,
6137 PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE,
6138 &pmap_flush_context_storage);
6139 }
6140
6141 /*
6142 * Mark original page as cleaning
6143 * in place.
6144 */
6145 dst_page->vmp_cleaning = TRUE;
6146 dst_page->vmp_precious = FALSE;
6147 } else {
6148 /*
6149 * use pageclean setup, it is more
6150 * convenient even for the pageout
6151 * cases here
6152 */
6153 vm_object_lock(upl->map_object);
6154 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6155 vm_object_unlock(upl->map_object);
6156
6157 alias_page->vmp_absent = FALSE;
6158 alias_page = NULL;
6159 }
6160 if (dirty) {
6161 SET_PAGE_DIRTY(dst_page, FALSE);
6162 } else {
6163 dst_page->vmp_dirty = FALSE;
6164 }
6165
6166 if (!dirty) {
6167 dst_page->vmp_precious = TRUE;
6168 }
6169
6170 if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) {
6171 if (!VM_PAGE_WIRED(dst_page)) {
6172 dst_page->vmp_free_when_done = TRUE;
6173 }
6174 }
6175 } else {
6176 if ((cntrl_flags & UPL_WILL_MODIFY) &&
6177 (object->vo_copy != last_copy_object ||
6178 object->vo_copy_version != last_copy_version)) {
6179 /*
6180 * Honor copy-on-write obligations
6181 *
6182 * The copy object has changed since we
6183 * last synchronized for copy-on-write.
6184 * Another copy object might have been
6185 * inserted while we released the object's
6186 * lock. Since someone could have seen the
6187 * original contents of the remaining pages
6188 * through that new object, we have to
6189 * synchronize with it again for the remaining
6190 * pages only. The previous pages are "busy"
6191 * so they can not be seen through the new
6192 * mapping. The new mapping will see our
6193 * upcoming changes for those previous pages,
6194 * but that's OK since they couldn't see what
6195 * was there before. It's just a race anyway
6196 * and there's no guarantee of consistency or
6197 * atomicity. We just don't want new mappings
6198 * to see both the *before* and *after* pages.
6199 */
6200 if (object->vo_copy != VM_OBJECT_NULL) {
6201 vm_object_update(
6202 object,
6203 dst_offset, /* current offset */
6204 xfer_size, /* remaining size */
6205 NULL,
6206 NULL,
6207 FALSE, /* should_return */
6208 MEMORY_OBJECT_COPY_SYNC,
6209 VM_PROT_NO_CHANGE);
6210
6211 VM_PAGEOUT_DEBUG(upl_cow_again, 1);
6212 VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT));
6213 }
6214 /*
6215 * remember the copy object we synced with
6216 */
6217 last_copy_object = object->vo_copy;
6218 last_copy_version = object->vo_copy_version;
6219 }
6220 dst_page = vm_page_lookup(object, dst_offset);
6221
6222 if (dst_page != VM_PAGE_NULL) {
6223 if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6224 /*
6225 * skip over pages already present in the cache
6226 */
6227 if (user_page_list) {
6228 user_page_list[entry].phys_addr = 0;
6229 }
6230
6231 goto try_next_page;
6232 }
6233 if (dst_page->vmp_fictitious) {
6234 panic("need corner case for fictitious page");
6235 }
6236
6237 if (dst_page->vmp_busy || dst_page->vmp_cleaning) {
6238 /*
6239 * someone else is playing with the
6240 * page. We will have to wait.
6241 */
6242 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
6243
6244 continue;
6245 }
6246 if (dst_page->vmp_laundry) {
6247 vm_pageout_steal_laundry(dst_page, FALSE);
6248 }
6249 } else {
6250 if (object->private) {
6251 /*
6252 * This is a nasty wrinkle for users
6253 * of upl who encounter device or
6254 * private memory; however, it is
6255 * unavoidable: only a fault can
6256 * resolve the actual backing
6257 * physical page by asking the
6258 * backing device.
6259 */
6260 if (user_page_list) {
6261 user_page_list[entry].phys_addr = 0;
6262 }
6263
6264 goto try_next_page;
6265 }
6266 if (object->scan_collisions) {
6267 /*
6268 * the pageout_scan thread is trying to steal
6269 * pages from this object, but has run into our
6270 * lock... grab 2 pages from the head of the object...
6271 * the first is freed on behalf of pageout_scan, the
6272 * 2nd is for our own use... we use vm_object_page_grab
6273 * in both cases to avoid taking pages from the free
6274 * list since we are under memory pressure and our
6275 * lock on this object is getting in the way of
6276 * relieving it
6277 */
6278 dst_page = vm_object_page_grab(object);
6279
6280 if (dst_page != VM_PAGE_NULL) {
6281 vm_page_release(dst_page,
6282 FALSE);
6283 }
6284
6285 dst_page = vm_object_page_grab(object);
6286 }
6287 if (dst_page == VM_PAGE_NULL) {
6288 /*
6289 * need to allocate a page
6290 */
6291 dst_page = vm_page_grab_options(grab_options);
6292 if (dst_page != VM_PAGE_NULL) {
6293 page_grab_count++;
6294 }
6295 }
6296 if (dst_page == VM_PAGE_NULL) {
6297 if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) {
6298 /*
6299 * we don't want to stall waiting for pages to come onto the free list
6300 * while we're already holding absent pages in this UPL
6301 * the caller will deal with the empty slots
6302 */
6303 if (user_page_list) {
6304 user_page_list[entry].phys_addr = 0;
6305 }
6306
6307 goto try_next_page;
6308 }
6309 /*
6310 * no pages available... wait
6311 * then try again for the same
6312 * offset...
6313 */
6314 vm_object_unlock(object);
6315
6316 OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages);
6317
6318 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
6319
6320 VM_PAGE_WAIT();
6321 OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages);
6322
6323 VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
6324
6325 vm_object_lock(object);
6326
6327 continue;
6328 }
6329 vm_page_insert(dst_page, object, dst_offset);
6330
6331 dst_page->vmp_absent = TRUE;
6332 dst_page->vmp_busy = FALSE;
6333
6334 if (cntrl_flags & UPL_RET_ONLY_ABSENT) {
6335 /*
6336 * if UPL_RET_ONLY_ABSENT was specified,
6337 * then we're definitely setting up a
6338 * upl for a clustered read/pagein
6339 * operation... mark the pages as clustered
6340 * so upl_commit_range can put them on the
6341 * speculative list
6342 */
6343 dst_page->vmp_clustered = TRUE;
6344
6345 if (!(cntrl_flags & UPL_FILE_IO)) {
6346 counter_inc(&vm_statistics_pageins);
6347 }
6348 }
6349 }
6350 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6351
6352 dst_page->vmp_overwriting = TRUE;
6353
6354 if (dst_page->vmp_pmapped) {
6355 if (!(cntrl_flags & UPL_FILE_IO)) {
6356 /*
6357 * eliminate all mappings from the
6358 * original object and its progeny
6359 */
6360 refmod_state = pmap_disconnect(phys_page);
6361 } else {
6362 refmod_state = pmap_get_refmod(phys_page);
6363 }
6364 } else {
6365 refmod_state = 0;
6366 }
6367
6368 hw_dirty = refmod_state & VM_MEM_MODIFIED;
6369 dirty = hw_dirty ? TRUE : dst_page->vmp_dirty;
6370
6371 if (cntrl_flags & UPL_SET_LITE) {
6372 unsigned int pg_num;
6373
6374 pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE);
6375 assert(pg_num == (dst_offset - offset) / PAGE_SIZE);
6376 bitmap_set(upl->lite_list, pg_num);
6377
6378 if (hw_dirty) {
6379 pmap_clear_modify(phys_page);
6380 }
6381
6382 /*
6383 * Mark original page as cleaning
6384 * in place.
6385 */
6386 dst_page->vmp_cleaning = TRUE;
6387 dst_page->vmp_precious = FALSE;
6388 } else {
6389 /*
6390 * use pageclean setup, it is more
6391 * convenient even for the pageout
6392 * cases here
6393 */
6394 vm_object_lock(upl->map_object);
6395 vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6396 vm_object_unlock(upl->map_object);
6397
6398 alias_page->vmp_absent = FALSE;
6399 alias_page = NULL;
6400 }
6401
6402 if (cntrl_flags & UPL_REQUEST_SET_DIRTY) {
6403 upl->flags &= ~UPL_CLEAR_DIRTY;
6404 upl->flags |= UPL_SET_DIRTY;
6405 dirty = TRUE;
6406 /*
6407 * Page belonging to a code-signed object is about to
6408 * be written. Mark it tainted and disconnect it from
6409 * all pmaps so processes have to fault it back in and
6410 * deal with the tainted bit.
6411 */
6412 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
6413 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
6414 vm_page_upl_tainted++;
6415 if (dst_page->vmp_pmapped) {
6416 refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6417 if (refmod_state & VM_MEM_REFERENCED) {
6418 dst_page->vmp_reference = TRUE;
6419 }
6420 }
6421 }
6422 } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) {
6423 /*
6424 * clean in place for read implies
6425 * that a write will be done on all
6426 * the pages that are dirty before
6427 * a upl commit is done. The caller
6428 * is obligated to preserve the
6429 * contents of all pages marked dirty
6430 */
6431 upl->flags |= UPL_CLEAR_DIRTY;
6432 }
6433 dst_page->vmp_dirty = dirty;
6434
6435 if (!dirty) {
6436 dst_page->vmp_precious = TRUE;
6437 }
6438
6439 if (!VM_PAGE_WIRED(dst_page)) {
6440 /*
6441 * deny access to the target page while
6442 * it is being worked on
6443 */
6444 dst_page->vmp_busy = TRUE;
6445 } else {
6446 dwp->dw_mask |= DW_vm_page_wire;
6447 }
6448
6449 /*
6450 * We might be about to satisfy a fault which has been
6451 * requested. So no need for the "restart" bit.
6452 */
6453 dst_page->vmp_restart = FALSE;
6454 if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) {
6455 /*
6456 * expect the page to be used
6457 */
6458 dwp->dw_mask |= DW_set_reference;
6459 }
6460 if (cntrl_flags & UPL_PRECIOUS) {
6461 if (object->internal) {
6462 SET_PAGE_DIRTY(dst_page, FALSE);
6463 dst_page->vmp_precious = FALSE;
6464 } else {
6465 dst_page->vmp_precious = TRUE;
6466 }
6467 } else {
6468 dst_page->vmp_precious = FALSE;
6469 }
6470 }
6471 if (dst_page->vmp_busy) {
6472 upl->flags |= UPL_HAS_BUSY;
6473 }
6474
6475 if (phys_page > upl->highest_page) {
6476 upl->highest_page = phys_page;
6477 }
6478 assert(!pmap_is_noencrypt(phys_page));
6479 if (user_page_list) {
6480 user_page_list[entry].phys_addr = phys_page;
6481 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
6482 user_page_list[entry].absent = dst_page->vmp_absent;
6483 user_page_list[entry].dirty = dst_page->vmp_dirty;
6484 user_page_list[entry].precious = dst_page->vmp_precious;
6485 user_page_list[entry].device = FALSE;
6486 user_page_list[entry].needed = FALSE;
6487 if (dst_page->vmp_clustered == TRUE) {
6488 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
6489 } else {
6490 user_page_list[entry].speculative = FALSE;
6491 }
6492 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
6493 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
6494 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
6495 user_page_list[entry].mark = FALSE;
6496 }
6497 /*
6498 * if UPL_RET_ONLY_ABSENT is set, then
6499 * we are working with a fresh page and we've
6500 * just set the clustered flag on it to
6501 * indicate that it was dragged in as part of a
6502 * speculative cluster... so leave it alone
6503 */
6504 if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) {
6505 /*
6506 * someone is explicitly grabbing this page...
6507 * update clustered and speculative state
6508 *
6509 */
6510 if (dst_page->vmp_clustered) {
6511 VM_PAGE_CONSUME_CLUSTERED(dst_page);
6512 }
6513 }
6514try_next_page:
6515 if (dwp->dw_mask) {
6516 if (dwp->dw_mask & DW_vm_page_activate) {
6517 counter_inc(&vm_statistics_reactivations);
6518 }
6519
6520 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
6521
6522 if (dw_count >= dw_limit) {
6523 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6524
6525 dwp = dwp_start;
6526 dw_count = 0;
6527 }
6528 }
6529 entry++;
6530 dst_offset += PAGE_SIZE_64;
6531 xfer_size -= PAGE_SIZE;
6532 }
6533 if (dw_count) {
6534 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6535 dwp = dwp_start;
6536 dw_count = 0;
6537 }
6538
6539 if (alias_page != NULL) {
6540 VM_PAGE_FREE(alias_page);
6541 }
6542 if (pmap_flushes_delayed == TRUE) {
6543 pmap_flush(&pmap_flush_context_storage);
6544 }
6545
6546 if (page_list_count != NULL) {
6547 if (upl->flags & UPL_INTERNAL) {
6548 *page_list_count = 0;
6549 } else if (*page_list_count > entry) {
6550 *page_list_count = entry;
6551 }
6552 }
6553#if UPL_DEBUG
6554 upl->upl_state = 1;
6555#endif
6556 vm_object_unlock(object);
6557
6558 VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0);
6559#if DEVELOPMENT || DEBUG
6560 if (task != NULL) {
6561 ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count);
6562 }
6563#endif /* DEVELOPMENT || DEBUG */
6564
6565 if (dwp_start && dwp_finish_ctx) {
6566 vm_page_delayed_work_finish_ctx(dwp_start);
6567 dwp_start = dwp = NULL;
6568 }
6569
6570 return KERN_SUCCESS;
6571}
6572
6573/*
6574 * Routine: vm_object_super_upl_request
6575 * Purpose:
6576 * Cause the population of a portion of a vm_object
6577 * in much the same way as memory_object_upl_request.
6578 * Depending on the nature of the request, the pages
6579 * returned may contain valid data or be uninitialized.
6580 * However, the region may be expanded up to the super
6581 * cluster size provided.
6582 */
6583
6584__private_extern__ kern_return_t
6585vm_object_super_upl_request(
6586 vm_object_t object,
6587 vm_object_offset_t offset,
6588 upl_size_t size,
6589 upl_size_t super_cluster,
6590 upl_t *upl,
6591 upl_page_info_t *user_page_list,
6592 unsigned int *page_list_count,
6593 upl_control_flags_t cntrl_flags,
6594 vm_tag_t tag)
6595{
6596 if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) {
6597 return KERN_FAILURE;
6598 }
6599
6600 assert(object->paging_in_progress);
6601 offset = offset - object->paging_offset;
6602
6603 if (super_cluster > size) {
6604 vm_object_offset_t base_offset;
6605 upl_size_t super_size;
6606 vm_object_size_t super_size_64;
6607
6608 base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1));
6609 super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster;
6610 super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size;
6611 super_size = (upl_size_t) super_size_64;
6612 assert(super_size == super_size_64);
6613
6614 if (offset > (base_offset + super_size)) {
6615 panic("vm_object_super_upl_request: Missed target pageout"
6616 " %#llx,%#llx, %#x, %#x, %#x, %#llx\n",
6617 offset, base_offset, super_size, super_cluster,
6618 size, object->paging_offset);
6619 }
6620 /*
6621 * apparently there is a case where the vm requests a
6622 * page to be written out whose offset is beyond the
6623 * object size
6624 */
6625 if ((offset + size) > (base_offset + super_size)) {
6626 super_size_64 = (offset + size) - base_offset;
6627 super_size = (upl_size_t) super_size_64;
6628 assert(super_size == super_size_64);
6629 }
6630
6631 offset = base_offset;
6632 size = super_size;
6633 }
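/*
 * Worked example (hypothetical values): with offset == 0x3000, size == 0x2000
 * and super_cluster == 0x10000, base_offset becomes 0x0 and super_size stays
 * 0x10000 because the request fits in one cluster; had the request straddled
 * a cluster boundary, super_size would have been doubled, and either way it
 * is clipped against the object size before the expanded range is handed to
 * vm_object_upl_request().
 */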
6634 return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6635}
6636
6637int cs_executable_create_upl = 0;
6638extern int proc_selfpid(void);
6639extern char *proc_name_address(void *p);
6640
6641kern_return_t
6642vm_map_create_upl(
6643 vm_map_t map,
6644 vm_map_address_t offset,
6645 upl_size_t *upl_size,
6646 upl_t *upl,
6647 upl_page_info_array_t page_list,
6648 unsigned int *count,
6649 upl_control_flags_t *flags,
6650 vm_tag_t tag)
6651{
6652 vm_map_entry_t entry;
6653 upl_control_flags_t caller_flags;
6654 int force_data_sync;
6655 int sync_cow_data;
6656 vm_object_t local_object;
6657 vm_map_offset_t local_offset;
6658 vm_map_offset_t local_start;
6659 kern_return_t ret;
6660 vm_map_address_t original_offset;
6661 vm_map_size_t original_size, adjusted_size;
6662 vm_map_offset_t local_entry_start;
6663 vm_object_offset_t local_entry_offset;
6664 vm_object_offset_t offset_in_mapped_page;
6665 boolean_t release_map = FALSE;
6666
6667
6668start_with_map:
6669
6670 original_offset = offset;
6671 original_size = *upl_size;
6672 adjusted_size = original_size;
6673
6674 caller_flags = *flags;
6675
6676 if (caller_flags & ~UPL_VALID_FLAGS) {
6677 /*
6678 * For forward compatibility's sake,
6679 * reject any unknown flag.
6680 */
6681 ret = KERN_INVALID_VALUE;
6682 goto done;
6683 }
6684 force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC);
6685 sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM);
6686
6687 if (upl == NULL) {
6688 ret = KERN_INVALID_ARGUMENT;
6689 goto done;
6690 }
6691
6692REDISCOVER_ENTRY:
6693 vm_map_lock_read(map);
6694
6695 if (!vm_map_lookup_entry(map, offset, &entry)) {
6696 vm_map_unlock_read(map);
6697 ret = KERN_FAILURE;
6698 goto done;
6699 }
6700
6701 local_entry_start = entry->vme_start;
6702 local_entry_offset = VME_OFFSET(entry);
6703
6704 if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) {
6705 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags);
6706 }
6707
6708 if (entry->vme_end - original_offset < adjusted_size) {
6709 adjusted_size = entry->vme_end - original_offset;
6710 assert(adjusted_size > 0);
6711 *upl_size = (upl_size_t) adjusted_size;
6712 assert(*upl_size == adjusted_size);
6713 }
6714
6715 if (caller_flags & UPL_QUERY_OBJECT_TYPE) {
6716 *flags = 0;
6717
6718 if (!entry->is_sub_map &&
6719 VME_OBJECT(entry) != VM_OBJECT_NULL) {
6720 if (VME_OBJECT(entry)->private) {
6721 *flags = UPL_DEV_MEMORY;
6722 }
6723
6724 if (VME_OBJECT(entry)->phys_contiguous) {
6725 *flags |= UPL_PHYS_CONTIG;
6726 }
6727 }
6728 vm_map_unlock_read(map);
6729 ret = KERN_SUCCESS;
6730 goto done;
6731 }
6732
6733 offset_in_mapped_page = 0;
6734 if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) {
6735 offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map));
6736 *upl_size = (upl_size_t)
6737 (vm_map_round_page(original_offset + adjusted_size,
6738 VM_MAP_PAGE_MASK(map))
6739 - offset);
6740
6741 offset_in_mapped_page = original_offset - offset;
6742 assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map));
6743
6744 DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n", map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page);
6745 }
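/*
 * Worked example (hypothetical values): on a 4K VM map running on a 16K-page
 * kernel, a request at original_offset 0x1234 for 0x1000 bytes is realigned
 * to offset 0x1000 with *upl_size 0x2000 (rounded out to the map's 4K pages),
 * and offset_in_mapped_page records the 0x234 bytes the caller was offset
 * into that mapped page.
 */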
6746
6747 if (!entry->is_sub_map) {
6748 if (VME_OBJECT(entry) == VM_OBJECT_NULL ||
6749 !VME_OBJECT(entry)->phys_contiguous) {
6750 if (*upl_size > MAX_UPL_SIZE_BYTES) {
6751 *upl_size = MAX_UPL_SIZE_BYTES;
6752 }
6753 }
6754
6755 /*
6756 * Create an object if necessary.
6757 */
6758 if (VME_OBJECT(entry) == VM_OBJECT_NULL) {
6759 if (vm_map_lock_read_to_write(map)) {
6760 goto REDISCOVER_ENTRY;
6761 }
6762
6763 VME_OBJECT_SET(entry,
6764 vm_object_allocate((vm_size_t)
6765 vm_object_round_page((entry->vme_end - entry->vme_start))),
6766 false, 0);
6767 VME_OFFSET_SET(entry, 0);
6768 assert(entry->use_pmap);
6769
6770 vm_map_lock_write_to_read(map);
6771 }
6772
6773 if (!(caller_flags & UPL_COPYOUT_FROM) &&
6774 !(entry->protection & VM_PROT_WRITE)) {
6775 vm_map_unlock_read(map);
6776 ret = KERN_PROTECTION_FAILURE;
6777 goto done;
6778 }
6779 }
6780
6781#if !XNU_TARGET_OS_OSX
6782 if (map->pmap != kernel_pmap &&
6783 (caller_flags & UPL_COPYOUT_FROM) &&
6784 (entry->protection & VM_PROT_EXECUTE) &&
6785 !(entry->protection & VM_PROT_WRITE)) {
6786 vm_offset_t kaddr;
6787 vm_size_t ksize;
6788
6789 /*
6790 * We're about to create a read-only UPL backed by
6791 * memory from an executable mapping.
6792 * Wiring the pages would result in the pages being copied
6793 * (due to the "MAP_PRIVATE" mapping) and no longer
6794 * code-signed, so no longer eligible for execution.
6795 * Instead, let's copy the data into a kernel buffer and
6796 * create the UPL from this kernel buffer.
6797 * The kernel buffer is then freed, leaving the UPL holding
6798 * the last reference on the VM object, so the memory will
6799 * be released when the UPL is committed.
6800 */
6801
6802 vm_map_unlock_read(map);
6803 entry = VM_MAP_ENTRY_NULL;
6804 /* allocate kernel buffer */
6805 ksize = round_page(*upl_size);
6806 kaddr = 0;
6807 ret = kmem_alloc(kernel_map, &kaddr, ksize,
6808 KMA_PAGEABLE | KMA_DATA, tag);
6809 if (ret == KERN_SUCCESS) {
6810 /* copyin the user data */
6811 ret = copyinmap(map, offset, (void *)kaddr, *upl_size);
6812 }
6813 if (ret == KERN_SUCCESS) {
6814 if (ksize > *upl_size) {
6815 /* zero out the extra space in kernel buffer */
6816 memset((void *)(kaddr + *upl_size),
6817 0,
6818 ksize - *upl_size);
6819 }
6820 /* create the UPL from the kernel buffer */
6821 vm_object_offset_t offset_in_object;
6822 vm_object_offset_t offset_in_object_page;
6823
6824 offset_in_object = offset - local_entry_start + local_entry_offset;
6825 offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object);
6826 assert(offset_in_object_page < PAGE_SIZE);
6827 assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE);
6828 *upl_size -= offset_in_object_page + offset_in_mapped_page;
6829 ret = vm_map_create_upl(kernel_map,
6830 (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page),
6831 upl_size, upl, page_list, count, flags, tag);
6832 }
6833 if (kaddr != 0) {
6834 /* free the kernel buffer */
6835 kmem_free(kernel_map, kaddr, ksize);
6836 kaddr = 0;
6837 ksize = 0;
6838 }
6839#if DEVELOPMENT || DEBUG
6840 DTRACE_VM4(create_upl_from_executable,
6841 vm_map_t, map,
6842 vm_map_address_t, offset,
6843 upl_size_t, *upl_size,
6844 kern_return_t, ret);
6845#endif /* DEVELOPMENT || DEBUG */
6846 goto done;
6847 }
6848#endif /* !XNU_TARGET_OS_OSX */
6849
6850 if (!entry->is_sub_map) {
6851 local_object = VME_OBJECT(entry);
6852 assert(local_object != VM_OBJECT_NULL);
6853 }
6854
6855 if (!entry->is_sub_map &&
6856 !entry->needs_copy &&
6857 *upl_size != 0 &&
6858 local_object->vo_size > *upl_size && /* partial UPL */
6859 entry->wired_count == 0 && /* No COW for entries that are wired */
6860 (map->pmap != kernel_pmap) && /* alias checks */
6861 (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */
6862 ||
6863 ( /* case 2 */
6864 local_object->internal &&
6865 (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) &&
6866 local_object->ref_count > 1))) {
6867 vm_prot_t prot;
6868
6869 /*
6870 * Case 1:
6871 * Set up the targeted range for copy-on-write to avoid
6872 * applying true_share/copy_delay to the entire object.
6873 *
6874 * Case 2:
6875 * This map entry covers only part of an internal
6876 * object. There could be other map entries covering
6877 * other areas of this object and some of these map
6878 * entries could be marked as "needs_copy", which
6879 * assumes that the object is COPY_SYMMETRIC.
6880 * To avoid marking this object as COPY_DELAY and
6881 * "true_share", let's shadow it and mark the new
6882 * (smaller) object as "true_share" and COPY_DELAY.
6883 */
6884
6885 if (vm_map_lock_read_to_write(map)) {
6886 goto REDISCOVER_ENTRY;
6887 }
6888 vm_map_lock_assert_exclusive(map);
6889 assert(VME_OBJECT(entry) == local_object);
6890
6891 vm_map_clip_start(map,
6892 entry,
6893 vm_map_trunc_page(offset,
6894 VM_MAP_PAGE_MASK(map)));
6895 vm_map_clip_end(map,
6896 entry,
6897 vm_map_round_page(offset + *upl_size,
6898 VM_MAP_PAGE_MASK(map)));
6899 if ((entry->vme_end - offset) < *upl_size) {
6900 *upl_size = (upl_size_t) (entry->vme_end - offset);
6901 assert(*upl_size == entry->vme_end - offset);
6902 }
6903
6904 prot = entry->protection & ~VM_PROT_WRITE;
6905 if (override_nx(map, VME_ALIAS(entry)) && prot) {
6906 prot |= VM_PROT_EXECUTE;
6907 }
6908 vm_object_pmap_protect(local_object,
6909 VME_OFFSET(entry),
6910 entry->vme_end - entry->vme_start,
6911 ((entry->is_shared ||
6912 map->mapped_in_other_pmaps)
6913 ? PMAP_NULL
6914 : map->pmap),
6915 VM_MAP_PAGE_SIZE(map),
6916 entry->vme_start,
6917 prot);
6918
6919 assert(entry->wired_count == 0);
6920
6921 /*
6922 * Lock the VM object and re-check its status: if it's mapped
6923 * in another address space, we could still be racing with
6924 * another thread holding that other VM map exclusively.
6925 */
6926 vm_object_lock(local_object);
6927 if (local_object->true_share) {
6928 /* object is already in proper state: no COW needed */
6929 assert(local_object->copy_strategy !=
6930 MEMORY_OBJECT_COPY_SYMMETRIC);
6931 } else {
6932 /* not true_share: ask for copy-on-write below */
6933 assert(local_object->copy_strategy ==
6934 MEMORY_OBJECT_COPY_SYMMETRIC);
6935 entry->needs_copy = TRUE;
6936 }
6937 vm_object_unlock(local_object);
6938
6939 vm_map_lock_write_to_read(map);
6940 }
6941
6942 if (entry->needs_copy) {
6943 /*
6944 * Honor copy-on-write for COPY_SYMMETRIC
6945 * strategy.
6946 */
6947 vm_map_t local_map;
6948 vm_object_t object;
6949 vm_object_offset_t new_offset;
6950 vm_prot_t prot;
6951 boolean_t wired;
6952 vm_map_version_t version;
6953 vm_map_t real_map;
6954 vm_prot_t fault_type;
6955
6956 local_map = map;
6957
6958 if (caller_flags & UPL_COPYOUT_FROM) {
6959 fault_type = VM_PROT_READ | VM_PROT_COPY;
6960 vm_counters.create_upl_extra_cow++;
6961 vm_counters.create_upl_extra_cow_pages +=
6962 (entry->vme_end - entry->vme_start) / PAGE_SIZE;
6963 } else {
6964 fault_type = VM_PROT_WRITE;
6965 }
6966 if (vm_map_lookup_and_lock_object(var_map: &local_map,
6967 vaddr: offset, fault_type,
6968 OBJECT_LOCK_EXCLUSIVE,
6969 out_version: &version, object: &object,
6970 offset: &new_offset, out_prot: &prot, wired: &wired,
6971 NULL,
6972 real_map: &real_map, NULL) != KERN_SUCCESS) {
6973 if (fault_type == VM_PROT_WRITE) {
6974 vm_counters.create_upl_lookup_failure_write++;
6975 } else {
6976 vm_counters.create_upl_lookup_failure_copy++;
6977 }
6978 vm_map_unlock_read(local_map);
6979 ret = KERN_FAILURE;
6980 goto done;
6981 }
6982 if (real_map != local_map) {
6983 vm_map_unlock(real_map);
6984 }
6985 vm_map_unlock_read(local_map);
6986
6987 vm_object_unlock(object);
6988
6989 goto REDISCOVER_ENTRY;
6990 }
6991
6992 if (entry->is_sub_map) {
6993 vm_map_t submap;
6994
6995 submap = VME_SUBMAP(entry);
6996 local_start = entry->vme_start;
6997 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
6998
6999 vm_map_reference(map: submap);
7000 vm_map_unlock_read(map);
7001
7002 DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n", map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap);
7003 offset += offset_in_mapped_page;
7004 *upl_size -= offset_in_mapped_page;
7005
7006 if (release_map) {
7007 vm_map_deallocate(map);
7008 }
7009 map = submap;
7010 release_map = TRUE;
7011 offset = local_offset + (offset - local_start);
7012 goto start_with_map;
7013 }
7014
7015 if (sync_cow_data &&
7016 (VME_OBJECT(entry)->shadow ||
7017 VME_OBJECT(entry)->vo_copy)) {
7018 local_object = VME_OBJECT(entry);
7019 local_start = entry->vme_start;
7020 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7021
7022 vm_object_reference(local_object);
7023 vm_map_unlock_read(map);
7024
7025 if (local_object->shadow && local_object->vo_copy) {
7026 vm_object_lock_request(object: local_object->shadow,
7027 offset: ((vm_object_offset_t)
7028 ((offset - local_start) +
7029 local_offset) +
7030 local_object->vo_shadow_offset),
7031 size: *upl_size, FALSE,
7032 MEMORY_OBJECT_DATA_SYNC,
7033 VM_PROT_NO_CHANGE);
7034 }
7035 sync_cow_data = FALSE;
7036 vm_object_deallocate(object: local_object);
7037
7038 goto REDISCOVER_ENTRY;
7039 }
7040 if (force_data_sync) {
7041 local_object = VME_OBJECT(entry);
7042 local_start = entry->vme_start;
7043 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7044
7045 vm_object_reference(local_object);
7046 vm_map_unlock_read(map);
7047
7048 vm_object_lock_request(object: local_object,
7049 offset: ((vm_object_offset_t)
7050 ((offset - local_start) +
7051 local_offset)),
7052 size: (vm_object_size_t)*upl_size,
7053 FALSE,
7054 MEMORY_OBJECT_DATA_SYNC,
7055 VM_PROT_NO_CHANGE);
7056
7057 force_data_sync = FALSE;
7058 vm_object_deallocate(object: local_object);
7059
7060 goto REDISCOVER_ENTRY;
7061 }
7062 if (VME_OBJECT(entry)->private) {
7063 *flags = UPL_DEV_MEMORY;
7064 } else {
7065 *flags = 0;
7066 }
7067
7068 if (VME_OBJECT(entry)->phys_contiguous) {
7069 *flags |= UPL_PHYS_CONTIG;
7070 }
7071
7072 local_object = VME_OBJECT(entry);
7073 local_offset = (vm_map_offset_t)VME_OFFSET(entry);
7074 local_start = entry->vme_start;
7075
7076 /*
7077 * Wiring will copy the pages to the shadow object.
7078 * The shadow object will not be code-signed so
7079 * attempting to execute code from these copied pages
7080 * would trigger a code-signing violation.
7081 */
7082 if (entry->protection & VM_PROT_EXECUTE) {
7083#if MACH_ASSERT
7084 printf("pid %d[%s] create_upl out of executable range from "
7085 "0x%llx to 0x%llx: side effects may include "
7086 "code-signing violations later on\n",
7087 proc_selfpid(),
7088 (get_bsdtask_info(current_task())
7089 ? proc_name_address(get_bsdtask_info(current_task()))
7090 : "?"),
7091 (uint64_t) entry->vme_start,
7092 (uint64_t) entry->vme_end);
7093#endif /* MACH_ASSERT */
7094 DTRACE_VM2(cs_executable_create_upl,
7095 uint64_t, (uint64_t)entry->vme_start,
7096 uint64_t, (uint64_t)entry->vme_end);
7097 cs_executable_create_upl++;
7098 }
7099
7100 vm_object_lock(local_object);
7101
7102 /*
7103 * Ensure that this object is "true_share" and "copy_delay" now,
7104 * while we're still holding the VM map lock. After we unlock the map,
7105 * anything could happen to that mapping, including some copy-on-write
7106 * activity. We need to make sure that the IOPL will point at the
7107 * same memory as the mapping.
7108 */
7109 if (local_object->true_share) {
7110 assert(local_object->copy_strategy !=
7111 MEMORY_OBJECT_COPY_SYMMETRIC);
7112 } else if (!is_kernel_object(local_object) &&
7113 local_object != compressor_object &&
7114 !local_object->phys_contiguous) {
7115#if VM_OBJECT_TRACKING_OP_TRUESHARE
7116 if (!local_object->true_share &&
7117 vm_object_tracking_btlog) {
7118 btlog_record(vm_object_tracking_btlog, local_object,
7119 VM_OBJECT_TRACKING_OP_TRUESHARE,
7120 btref_get(__builtin_frame_address(0), 0));
7121 }
7122#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
7123 VM_OBJECT_SET_TRUE_SHARE(object: local_object, TRUE);
7124 if (local_object->copy_strategy ==
7125 MEMORY_OBJECT_COPY_SYMMETRIC) {
7126 local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
7127 }
7128 }
7129
7130 vm_object_reference_locked(local_object);
7131 vm_object_unlock(local_object);
7132
7133 vm_map_unlock_read(map);
7134
7135 offset += offset_in_mapped_page;
7136 assert(*upl_size > offset_in_mapped_page);
7137 *upl_size -= offset_in_mapped_page;
7138
7139 ret = vm_object_iopl_request(object: local_object,
7140 offset: ((vm_object_offset_t)
7141 ((offset - local_start) + local_offset)),
7142 size: *upl_size,
7143 upl_ptr: upl,
7144 user_page_list: page_list,
7145 page_list_count: count,
7146 cntrl_flags: caller_flags,
7147 tag);
7148 vm_object_deallocate(object: local_object);
7149
7150done:
7151 if (release_map) {
7152 vm_map_deallocate(map);
7153 }
7154
7155 return ret;
7156}
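
/*
 * Illustrative sketch (not part of the original source): one way a caller
 * might drive the UPL lifecycle implemented here -- create a UPL over a
 * small map range, perform I/O against the wired pages, then commit it.
 * The wrapper name example_wire_range_for_io() is hypothetical, the
 * vm_map_create_upl() argument list is assumed to match the recursive call
 * made above for the executable-range case, and the UPL_SET_* control
 * flags shown are only one plausible combination.
 */
#if 0 /* illustrative example only -- never compiled */
static kern_return_t
example_wire_range_for_io(vm_map_t map, vm_map_address_t addr)
{
	upl_t                   upl = UPL_NULL;
	upl_page_info_t         page_list[8];
	unsigned int            count = 8;
	upl_size_t              size = (upl_size_t)(8 * PAGE_SIZE);
	upl_control_flags_t     flags = UPL_SET_LITE | UPL_SET_IO_WIRE | UPL_COPYOUT_FROM;
	boolean_t               empty;
	kern_return_t           kr;

	kr = vm_map_create_upl(map, addr, &size, &upl, page_list, &count,
	    &flags, VM_KERN_MEMORY_DIAG);
	if (kr != KERN_SUCCESS) {
		return kr;
	}

	/* ... read from the wired pages described by page_list ... */

	/* release the wired pages, then the UPL itself */
	kr = upl_commit_range(upl, 0, size, 0, page_list, count, &empty);
	upl_deallocate(upl);
	return kr;
}
#endif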
7157
7158/*
7159 * Internal routine to enter a UPL into a VM map.
7160 *
7161 * JMM - This should just be doable through the standard
7162 * vm_map_enter() API.
7163 */
7164kern_return_t
7165vm_map_enter_upl_range(
7166 vm_map_t map,
7167 upl_t upl,
7168 vm_object_offset_t offset_to_map,
7169 upl_size_t size_to_map,
7170 vm_prot_t prot_to_map,
7171 vm_map_offset_t *dst_addr)
7172{
7173 vm_map_size_t size;
7174 vm_object_offset_t offset;
7175 vm_map_offset_t addr;
7176 vm_page_t m;
7177 kern_return_t kr;
7178 int isVectorUPL = 0, curr_upl = 0;
7179 upl_t vector_upl = NULL;
7180 mach_vm_offset_t vector_upl_dst_addr = 0;
7181 vm_map_t vector_upl_submap = NULL;
7182 upl_offset_t subupl_offset = 0;
7183 upl_size_t subupl_size = 0;
7184
7185 if (upl == UPL_NULL) {
7186 return KERN_INVALID_ARGUMENT;
7187 }
7188
7189 DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n", map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size);
7190 assert(map == kernel_map);
7191
7192 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7193 int mapped = 0, valid_upls = 0;
7194 vector_upl = upl;
7195
7196 upl_lock(vector_upl);
7197 for (curr_upl = 0; curr_upl < vector_upl_max_upls(upl: vector_upl); curr_upl++) {
7198 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7199 if (upl == NULL) {
7200 continue;
7201 }
7202 valid_upls++;
7203 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7204 mapped++;
7205 }
7206 }
7207
7208 if (mapped) {
7209 if (mapped != valid_upls) {
				panic("Only %d of the %d sub-upls within the Vector UPL are already mapped", mapped, valid_upls);
7211 } else {
7212 upl_unlock(vector_upl);
7213 return KERN_FAILURE;
7214 }
7215 }
7216
7217 if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) {
7218 panic("TODO4K: vector UPL not implemented");
7219 }
7220
7221 vector_upl_submap = kmem_suballoc(parent: map, addr: &vector_upl_dst_addr,
7222 size: vector_upl->u_size, vmc_options: VM_MAP_CREATE_DEFAULT,
7223 VM_FLAGS_ANYWHERE, flags: KMS_NOFAIL | KMS_DATA,
7224 VM_KERN_MEMORY_NONE).kmr_submap;
7225 map = vector_upl_submap;
7226 vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr);
7227 curr_upl = 0;
7228 } else {
7229 upl_lock(upl);
7230 }
7231
7232process_upl_to_enter:
7233 if (isVectorUPL) {
7234 if (curr_upl == vector_upl_max_upls(upl: vector_upl)) {
7235 *dst_addr = vector_upl_dst_addr;
7236 upl_unlock(vector_upl);
7237 return KERN_SUCCESS;
7238 }
7239 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7240 if (upl == NULL) {
7241 goto process_upl_to_enter;
7242 }
7243
7244 vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size);
7245 *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset);
7246 } else {
7247 /*
7248 * check to see if already mapped
7249 */
7250 if (UPL_PAGE_LIST_MAPPED & upl->flags) {
7251 upl_unlock(upl);
7252 return KERN_FAILURE;
7253 }
7254 }
7255
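	/*
	 * If this UPL hasn't been shadowed yet and its pages can't be mapped
	 * directly (some may be busy, or the UPL is neither a device-memory
	 * nor an IO-wire UPL and its object isn't physically contiguous),
	 * build a shadow object populated with wired fictitious "alias"
	 * pages that share the underlying physical pages, and map that
	 * object instead.
	 */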
7256 if ((!(upl->flags & UPL_SHADOWED)) &&
7257 ((upl->flags & UPL_HAS_BUSY) ||
7258 !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) {
7259 vm_object_t object;
7260 vm_page_t alias_page;
7261 vm_object_offset_t new_offset;
7262 unsigned int pg_num;
7263
7264 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7265 object = upl->map_object;
7266 upl->map_object = vm_object_allocate(vm_object_round_page(size));
7267
7268 vm_object_lock(upl->map_object);
7269
7270 upl->map_object->shadow = object;
7271 VM_OBJECT_SET_PAGEOUT(object: upl->map_object, TRUE);
7272 VM_OBJECT_SET_CAN_PERSIST(object: upl->map_object, FALSE);
7273 upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
7274 upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset;
7275 assertf(page_aligned(upl->map_object->vo_shadow_offset),
7276 "object %p shadow_offset 0x%llx",
7277 upl->map_object,
7278 (uint64_t)upl->map_object->vo_shadow_offset);
7279 upl->map_object->wimg_bits = object->wimg_bits;
7280 offset = upl->map_object->vo_shadow_offset;
7281 new_offset = 0;
7282
7283 upl->flags |= UPL_SHADOWED;
7284
7285 while (size) {
7286 pg_num = (unsigned int) (new_offset / PAGE_SIZE);
7287 assert(pg_num == new_offset / PAGE_SIZE);
7288
7289 if (bitmap_test(map: upl->lite_list, n: pg_num)) {
7290 alias_page = vm_page_grab_fictitious(TRUE);
7291
7292 vm_object_lock(object);
7293
7294 m = vm_page_lookup(object, offset);
7295 if (m == VM_PAGE_NULL) {
7296 panic("vm_upl_map: page missing");
7297 }
7298
7299 /*
7300 * Convert the fictitious page to a private
7301 * shadow of the real page.
7302 */
7303 assert(alias_page->vmp_fictitious);
7304 alias_page->vmp_fictitious = FALSE;
7305 alias_page->vmp_private = TRUE;
7306 alias_page->vmp_free_when_done = TRUE;
7307 /*
7308 * since m is a page in the upl it must
7309 * already be wired or BUSY, so it's
7310 * safe to assign the underlying physical
7311 * page to the alias
7312 */
7313 VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m));
7314
7315 vm_object_unlock(object);
7316
7317 vm_page_lockspin_queues();
7318 vm_page_wire(page: alias_page, VM_KERN_MEMORY_NONE, TRUE);
7319 vm_page_unlock_queues();
7320
7321 vm_page_insert_wired(page: alias_page, object: upl->map_object, offset: new_offset, VM_KERN_MEMORY_NONE);
7322
7323 assert(!alias_page->vmp_wanted);
7324 alias_page->vmp_busy = FALSE;
7325 alias_page->vmp_absent = FALSE;
7326 }
7327 size -= PAGE_SIZE;
7328 offset += PAGE_SIZE_64;
7329 new_offset += PAGE_SIZE_64;
7330 }
7331 vm_object_unlock(upl->map_object);
7332 }
7333 if (upl->flags & UPL_SHADOWED) {
7334 if (isVectorUPL) {
7335 offset = 0;
7336 } else {
7337 offset = offset_to_map;
7338 }
7339 } else {
7340 offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset;
7341 if (!isVectorUPL) {
7342 offset += offset_to_map;
7343 }
7344 }
7345
7346 if (isVectorUPL) {
7347 size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7348 } else {
7349 size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map);
7350 }
7351
7352 vm_object_reference(upl->map_object);
7353
7354 if (!isVectorUPL) {
7355 *dst_addr = 0;
7356 /*
7357 * NEED A UPL_MAP ALIAS
7358 */
7359 kr = vm_map_enter(map, address: dst_addr, size: (vm_map_size_t)size, mask: (vm_map_offset_t) 0,
7360 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
7361 object: upl->map_object, offset, FALSE,
7362 cur_protection: prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7363
7364 if (kr != KERN_SUCCESS) {
7365 vm_object_deallocate(object: upl->map_object);
7366 upl_unlock(upl);
7367 return kr;
7368 }
7369 } else {
7370 kr = vm_map_enter(map, address: dst_addr, size: (vm_map_size_t)size, mask: (vm_map_offset_t) 0,
7371 VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
7372 object: upl->map_object, offset, FALSE,
7373 cur_protection: prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
7374 if (kr) {
7375 panic("vm_map_enter failed for a Vector UPL");
7376 }
7377 }
7378 upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */
7379 /* this will have to be an increment rather than */
7380 /* an assignment. */
7381 vm_object_lock(upl->map_object);
7382
7383 for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) {
7384 m = vm_page_lookup(object: upl->map_object, offset);
7385
7386 if (m) {
7387 m->vmp_pmapped = TRUE;
7388
7389 /*
7390 * CODE SIGNING ENFORCEMENT: page has been wpmapped,
7391 * but only in kernel space. If this was on a user map,
7392 * we'd have to set the wpmapped bit.
7393 */
7394 /* m->vmp_wpmapped = TRUE; */
7395 assert(map->pmap == kernel_pmap);
7396
7397 kr = pmap_enter_check(pmap: map->pmap, virtual_address: addr, page: m, protection: prot_to_map, VM_PROT_NONE, flags: 0, TRUE);
7398
7399 assert(kr == KERN_SUCCESS);
7400#if KASAN
7401 kasan_notify_address(addr, PAGE_SIZE_64);
7402#endif
7403 }
7404 offset += PAGE_SIZE_64;
7405 }
7406 vm_object_unlock(upl->map_object);
7407
7408 /*
7409 * hold a reference for the mapping
7410 */
7411 upl->ref_count++;
7412 upl->flags |= UPL_PAGE_LIST_MAPPED;
7413 upl->kaddr = (vm_offset_t) *dst_addr;
7414 assert(upl->kaddr == *dst_addr);
7415
7416 if (isVectorUPL) {
7417 goto process_upl_to_enter;
7418 }
7419
7420 if (!isVectorUPL) {
7421 vm_map_offset_t addr_adjustment;
7422
7423 addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)));
7424 if (addr_adjustment) {
7425 assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK);
7426 DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n", (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment));
7427 *dst_addr += addr_adjustment;
7428 }
7429 }
7430
7431 upl_unlock(upl);
7432
7433 return KERN_SUCCESS;
7434}
7435
7436kern_return_t
7437vm_map_enter_upl(
7438 vm_map_t map,
7439 upl_t upl,
7440 vm_map_offset_t *dst_addr)
7441{
7442 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7443 return vm_map_enter_upl_range(map, upl, offset_to_map: 0, size_to_map: upl_size, VM_PROT_DEFAULT, dst_addr);
7444}
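
/*
 * Illustrative sketch (not part of the original source): mapping a UPL into
 * the kernel map so its pages can be addressed directly, then tearing the
 * mapping down again.  The helper name example_copy_from_upl() and its
 * parameters are hypothetical; note that vm_map_enter_upl() currently
 * asserts that the target map is kernel_map.
 */
#if 0 /* illustrative example only -- never compiled */
static kern_return_t
example_copy_from_upl(upl_t upl, void *dst, vm_size_t len)
{
	vm_map_offset_t kaddr;
	kern_return_t   kr;

	kr = vm_map_enter_upl(kernel_map, upl, &kaddr);
	if (kr != KERN_SUCCESS) {
		return kr;
	}
	/* the UPL's pages are now addressable at 'kaddr' */
	bcopy((const void *)(uintptr_t)kaddr, dst, len);

	/* remove the kernel mapping; the UPL itself stays valid */
	return vm_map_remove_upl(kernel_map, upl);
}
#endif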
7445
7446/*
7447 * Internal routine to remove a UPL mapping from a VM map.
7448 *
7449 * XXX - This should just be doable through a standard
7450 * vm_map_remove() operation. Otherwise, implicit clean-up
7451 * of the target map won't be able to correctly remove
7452 * these (and release the reference on the UPL). Having
7453 * to do this means we can't map these into user-space
7454 * maps yet.
7455 */
7456kern_return_t
7457vm_map_remove_upl_range(
7458 vm_map_t map,
7459 upl_t upl,
7460 __unused vm_object_offset_t offset_to_unmap,
7461 __unused upl_size_t size_to_unmap)
7462{
7463 vm_address_t addr;
7464 upl_size_t size;
7465 int isVectorUPL = 0, curr_upl = 0;
7466 upl_t vector_upl = NULL;
7467
7468 if (upl == UPL_NULL) {
7469 return KERN_INVALID_ARGUMENT;
7470 }
7471
7472 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7473 int unmapped = 0, valid_upls = 0;
7474 vector_upl = upl;
7475 upl_lock(vector_upl);
7476 for (curr_upl = 0; curr_upl < vector_upl_max_upls(upl: vector_upl); curr_upl++) {
7477 upl = vector_upl_subupl_byindex(vector_upl, curr_upl );
7478 if (upl == NULL) {
7479 continue;
7480 }
7481 valid_upls++;
7482 if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) {
7483 unmapped++;
7484 }
7485 }
7486
7487 if (unmapped) {
7488 if (unmapped != valid_upls) {
7489 panic("%d of the %d sub-upls within the Vector UPL is/are not mapped", unmapped, valid_upls);
7490 } else {
7491 upl_unlock(vector_upl);
7492 return KERN_FAILURE;
7493 }
7494 }
7495 curr_upl = 0;
7496 } else {
7497 upl_lock(upl);
7498 }
7499
7500process_upl_to_remove:
7501 if (isVectorUPL) {
7502 if (curr_upl == vector_upl_max_upls(upl: vector_upl)) {
7503 vm_map_t v_upl_submap;
7504 vm_offset_t v_upl_submap_dst_addr;
7505 vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr);
7506
7507 kmem_free_guard(map, addr: v_upl_submap_dst_addr,
7508 size: vector_upl->u_size, flags: KMF_NONE, KMEM_GUARD_SUBMAP);
7509 vm_map_deallocate(map: v_upl_submap);
7510 upl_unlock(vector_upl);
7511 return KERN_SUCCESS;
7512 }
7513
7514 upl = vector_upl_subupl_byindex(vector_upl, curr_upl++ );
7515 if (upl == NULL) {
7516 goto process_upl_to_remove;
7517 }
7518 }
7519
7520 if (upl->flags & UPL_PAGE_LIST_MAPPED) {
7521 addr = upl->kaddr;
7522 size = upl->u_mapped_size;
7523
7524 assert(upl->ref_count > 1);
7525 upl->ref_count--; /* removing mapping ref */
7526
7527 upl->flags &= ~UPL_PAGE_LIST_MAPPED;
7528 upl->kaddr = (vm_offset_t) 0;
7529 upl->u_mapped_size = 0;
7530
7531 if (isVectorUPL) {
7532 /*
7533 * If it's a Vectored UPL, we'll be removing the entire
			 * submap anyway, so no need to remove individual UPL
7535 * element mappings from within the submap
7536 */
7537 goto process_upl_to_remove;
7538 }
7539
7540 upl_unlock(upl);
7541
7542 vm_map_remove(map,
7543 vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)),
7544 vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map)));
7545 return KERN_SUCCESS;
7546 }
7547 upl_unlock(upl);
7548
7549 return KERN_FAILURE;
7550}
7551
7552kern_return_t
7553vm_map_remove_upl(
7554 vm_map_t map,
7555 upl_t upl)
7556{
7557 upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map));
7558 return vm_map_remove_upl_range(map, upl, offset_to_unmap: 0, size_to_unmap: upl_size);
7559}
7560
7561kern_return_t
7562upl_commit_range(
7563 upl_t upl,
7564 upl_offset_t offset,
7565 upl_size_t size,
7566 int flags,
7567 upl_page_info_t *page_list,
7568 mach_msg_type_number_t count,
7569 boolean_t *empty)
7570{
7571 upl_size_t xfer_size, subupl_size;
7572 vm_object_t shadow_object;
7573 vm_object_t object;
7574 vm_object_t m_object;
7575 vm_object_offset_t target_offset;
7576 upl_offset_t subupl_offset = offset;
7577 int entry;
7578 int occupied;
7579 int clear_refmod = 0;
7580 int pgpgout_count = 0;
7581 struct vm_page_delayed_work dw_array;
7582 struct vm_page_delayed_work *dwp, *dwp_start;
7583 bool dwp_finish_ctx = TRUE;
7584 int dw_count;
7585 int dw_limit;
7586 int isVectorUPL = 0;
7587 upl_t vector_upl = NULL;
7588 boolean_t should_be_throttled = FALSE;
7589
7590 vm_page_t nxt_page = VM_PAGE_NULL;
7591 int fast_path_possible = 0;
7592 int fast_path_full_commit = 0;
7593 int throttle_page = 0;
7594 int unwired_count = 0;
7595 int local_queue_count = 0;
7596 vm_page_t first_local, last_local;
7597 vm_object_offset_t obj_start, obj_end, obj_offset;
7598 kern_return_t kr = KERN_SUCCESS;
7599
7600// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags);
7601
7602 dwp_start = dwp = NULL;
7603
7604 subupl_size = size;
7605 *empty = FALSE;
7606
7607 if (upl == UPL_NULL) {
7608 return KERN_INVALID_ARGUMENT;
7609 }
7610
7611 dw_count = 0;
7612 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
7613 dwp_start = vm_page_delayed_work_get_ctx();
7614 if (dwp_start == NULL) {
7615 dwp_start = &dw_array;
7616 dw_limit = 1;
7617 dwp_finish_ctx = FALSE;
7618 }
7619
7620 dwp = dwp_start;
7621
7622 if (count == 0) {
7623 page_list = NULL;
7624 }
7625
7626 if ((isVectorUPL = vector_upl_is_valid(upl))) {
7627 vector_upl = upl;
7628 upl_lock(vector_upl);
7629 } else {
7630 upl_lock(upl);
7631 }
7632
7633process_upl_to_commit:
7634
7635 if (isVectorUPL) {
7636 size = subupl_size;
7637 offset = subupl_offset;
7638 if (size == 0) {
7639 upl_unlock(vector_upl);
7640 kr = KERN_SUCCESS;
7641 goto done;
7642 }
7643 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
7644 if (upl == NULL) {
7645 upl_unlock(vector_upl);
7646 kr = KERN_FAILURE;
7647 goto done;
7648 }
7649 page_list = upl->page_list;
7650 subupl_size -= size;
7651 subupl_offset += size;
7652 }
7653
7654#if UPL_DEBUG
7655 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
7656 upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
7657 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
7658 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
7659
7660 upl->upl_commit_index++;
7661 }
7662#endif
7663 if (upl->flags & UPL_DEVICE_MEMORY) {
7664 xfer_size = 0;
7665 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
7666 xfer_size = size;
7667 } else {
7668 if (!isVectorUPL) {
7669 upl_unlock(upl);
7670 } else {
7671 upl_unlock(vector_upl);
7672 }
7673 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
7674 kr = KERN_FAILURE;
7675 goto done;
7676 }
7677 if (upl->flags & UPL_SET_DIRTY) {
7678 flags |= UPL_COMMIT_SET_DIRTY;
7679 }
7680 if (upl->flags & UPL_CLEAR_DIRTY) {
7681 flags |= UPL_COMMIT_CLEAR_DIRTY;
7682 }
7683
7684 object = upl->map_object;
7685
7686 if (upl->flags & UPL_SHADOWED) {
7687 vm_object_lock(object);
7688 shadow_object = object->shadow;
7689 } else {
7690 shadow_object = object;
7691 }
7692 entry = offset / PAGE_SIZE;
7693 target_offset = (vm_object_offset_t)offset;
7694
7695 if (upl->flags & UPL_KERNEL_OBJECT) {
7696 vm_object_lock_shared(shadow_object);
7697 } else {
7698 vm_object_lock(shadow_object);
7699 }
7700
7701 VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object);
7702
7703 if (upl->flags & UPL_ACCESS_BLOCKED) {
7704 assert(shadow_object->blocked_access);
7705 shadow_object->blocked_access = FALSE;
7706 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
7707 }
7708
7709 if (shadow_object->code_signed) {
7710 /*
7711 * CODE SIGNING:
7712 * If the object is code-signed, do not let this UPL tell
7713 * us if the pages are valid or not. Let the pages be
7714 * validated by VM the normal way (when they get mapped or
7715 * copied).
7716 */
7717 flags &= ~UPL_COMMIT_CS_VALIDATED;
7718 }
7719 if (!page_list) {
7720 /*
7721 * No page list to get the code-signing info from !?
7722 */
7723 flags &= ~UPL_COMMIT_CS_VALIDATED;
7724 }
7725 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) {
7726 should_be_throttled = TRUE;
7727 }
7728
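
	/*
	 * Fast path: for a non-vector IO-wire UPL against an object that is
	 * not volatile/empty purgeable and still has resident pages (and when
	 * UPL_COMMIT_FREE_ABSENT isn't requested), unwire the pages here and
	 * gather them on a local queue that is spliced into the global page
	 * queues in a single operation further below, instead of pushing
	 * every page through the delayed-work machinery.
	 */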
7729 if ((upl->flags & UPL_IO_WIRE) &&
7730 !(flags & UPL_COMMIT_FREE_ABSENT) &&
7731 !isVectorUPL &&
7732 shadow_object->purgable != VM_PURGABLE_VOLATILE &&
7733 shadow_object->purgable != VM_PURGABLE_EMPTY) {
7734 if (!vm_page_queue_empty(&shadow_object->memq)) {
7735 if (shadow_object->internal && size == shadow_object->vo_size) {
7736 nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq);
7737 fast_path_full_commit = 1;
7738 }
7739 fast_path_possible = 1;
7740
7741 if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal &&
7742 (shadow_object->purgable == VM_PURGABLE_DENY ||
7743 shadow_object->purgable == VM_PURGABLE_NONVOLATILE ||
7744 shadow_object->purgable == VM_PURGABLE_VOLATILE)) {
7745 throttle_page = 1;
7746 }
7747 }
7748 }
7749 first_local = VM_PAGE_NULL;
7750 last_local = VM_PAGE_NULL;
7751
7752 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
7753 obj_end = obj_start + xfer_size;
7754 obj_start = vm_object_trunc_page(obj_start);
7755 obj_end = vm_object_round_page(obj_end);
7756 for (obj_offset = obj_start;
7757 obj_offset < obj_end;
7758 obj_offset += PAGE_SIZE) {
7759 vm_page_t t, m;
7760
7761 dwp->dw_mask = 0;
7762 clear_refmod = 0;
7763
7764 m = VM_PAGE_NULL;
7765
7766 if (upl->flags & UPL_LITE) {
7767 unsigned int pg_num;
7768
7769 if (nxt_page != VM_PAGE_NULL) {
7770 m = nxt_page;
7771 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
7772 target_offset = m->vmp_offset;
7773 }
7774 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
7775 assert(pg_num == target_offset / PAGE_SIZE);
7776
7777 if (bitmap_test(map: upl->lite_list, n: pg_num)) {
7778 bitmap_clear(map: upl->lite_list, n: pg_num);
7779
7780 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7781 m = vm_page_lookup(object: shadow_object, offset: obj_offset);
7782 }
7783 } else {
7784 m = NULL;
7785 }
7786 }
7787 if (upl->flags & UPL_SHADOWED) {
7788 if ((t = vm_page_lookup(object, offset: target_offset)) != VM_PAGE_NULL) {
7789 t->vmp_free_when_done = FALSE;
7790
7791 VM_PAGE_FREE(t);
7792
7793 if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
7794 m = vm_page_lookup(object: shadow_object, offset: target_offset + object->vo_shadow_offset);
7795 }
7796 }
7797 }
7798 if (m == VM_PAGE_NULL) {
7799 goto commit_next_page;
7800 }
7801
7802 m_object = VM_PAGE_OBJECT(m);
7803
7804 if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
7805 assert(m->vmp_busy);
7806
7807 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7808 goto commit_next_page;
7809 }
7810
7811 if (flags & UPL_COMMIT_CS_VALIDATED) {
7812 /*
7813 * CODE SIGNING:
7814 * Set the code signing bits according to
7815 * what the UPL says they should be.
7816 */
7817 m->vmp_cs_validated |= page_list[entry].cs_validated;
7818 m->vmp_cs_tainted |= page_list[entry].cs_tainted;
7819 m->vmp_cs_nx |= page_list[entry].cs_nx;
7820 }
7821 if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) {
7822 m->vmp_written_by_kernel = TRUE;
7823 }
7824
7825 if (upl->flags & UPL_IO_WIRE) {
7826 if (page_list) {
7827 page_list[entry].phys_addr = 0;
7828 }
7829
7830 if (flags & UPL_COMMIT_SET_DIRTY) {
7831 SET_PAGE_DIRTY(m, FALSE);
7832 } else if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7833 m->vmp_dirty = FALSE;
7834
7835 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7836 m->vmp_cs_validated &&
7837 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7838 /*
7839 * CODE SIGNING:
7840 * This page is no longer dirty
7841 * but could have been modified,
7842 * so it will need to be
7843 * re-validated.
7844 */
7845 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7846
7847 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7848
7849 pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m));
7850 }
7851 clear_refmod |= VM_MEM_MODIFIED;
7852 }
7853 if (upl->flags & UPL_ACCESS_BLOCKED) {
7854 /*
7855 * We blocked access to the pages in this UPL.
7856 * Clear the "busy" bit and wake up any waiter
7857 * for this page.
7858 */
7859 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7860 }
7861 if (fast_path_possible) {
7862 assert(m_object->purgable != VM_PURGABLE_EMPTY);
7863 assert(m_object->purgable != VM_PURGABLE_VOLATILE);
7864 if (m->vmp_absent) {
7865 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
7866 assert(m->vmp_wire_count == 0);
7867 assert(m->vmp_busy);
7868
7869 m->vmp_absent = FALSE;
7870 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7871 } else {
7872 if (m->vmp_wire_count == 0) {
7873 panic("wire_count == 0, m = %p, obj = %p", m, shadow_object);
7874 }
7875 assert(m->vmp_q_state == VM_PAGE_IS_WIRED);
7876
7877 /*
7878 * XXX FBDP need to update some other
7879 * counters here (purgeable_wired_count)
7880 * (ledgers), ...
7881 */
7882 assert(m->vmp_wire_count > 0);
7883 m->vmp_wire_count--;
7884
7885 if (m->vmp_wire_count == 0) {
7886 m->vmp_q_state = VM_PAGE_NOT_ON_Q;
7887 unwired_count++;
7888 }
7889 }
7890 if (m->vmp_wire_count == 0) {
7891 assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0);
7892
7893 if (last_local == VM_PAGE_NULL) {
7894 assert(first_local == VM_PAGE_NULL);
7895
7896 last_local = m;
7897 first_local = m;
7898 } else {
7899 assert(first_local != VM_PAGE_NULL);
7900
7901 m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
7902 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m);
7903 first_local = m;
7904 }
7905 local_queue_count++;
7906
7907 if (throttle_page) {
7908 m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q;
7909 } else {
7910 if (flags & UPL_COMMIT_INACTIVATE) {
7911 if (shadow_object->internal) {
7912 m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q;
7913 } else {
7914 m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q;
7915 }
7916 } else {
7917 m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q;
7918 }
7919 }
7920 }
7921 } else {
7922 if (flags & UPL_COMMIT_INACTIVATE) {
7923 dwp->dw_mask |= DW_vm_page_deactivate_internal;
7924 clear_refmod |= VM_MEM_REFERENCED;
7925 }
7926 if (m->vmp_absent) {
7927 if (flags & UPL_COMMIT_FREE_ABSENT) {
7928 dwp->dw_mask |= DW_vm_page_free;
7929 } else {
7930 m->vmp_absent = FALSE;
7931 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
7932
7933 if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) {
7934 dwp->dw_mask |= DW_vm_page_activate;
7935 }
7936 }
7937 } else {
7938 dwp->dw_mask |= DW_vm_page_unwire;
7939 }
7940 }
7941 goto commit_next_page;
7942 }
7943 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
7944
7945 if (page_list) {
7946 page_list[entry].phys_addr = 0;
7947 }
7948
		/*
		 * Make sure to clear the hardware modify or reference bits
		 * before releasing the BUSY bit on this page; otherwise we
		 * risk losing a legitimate change of state.
		 */
7956 if (flags & UPL_COMMIT_CLEAR_DIRTY) {
7957 m->vmp_dirty = FALSE;
7958
7959 clear_refmod |= VM_MEM_MODIFIED;
7960 }
7961 if (m->vmp_laundry) {
7962 dwp->dw_mask |= DW_vm_pageout_throttle_up;
7963 }
7964
7965 if (VM_PAGE_WIRED(m)) {
7966 m->vmp_free_when_done = FALSE;
7967 }
7968
7969 if (!(flags & UPL_COMMIT_CS_VALIDATED) &&
7970 m->vmp_cs_validated &&
7971 m->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
7972 /*
7973 * CODE SIGNING:
7974 * This page is no longer dirty
7975 * but could have been modified,
7976 * so it will need to be
7977 * re-validated.
7978 */
7979 m->vmp_cs_validated = VMP_CS_ALL_FALSE;
7980
7981 VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1);
7982
7983 pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m));
7984 }
7985 if (m->vmp_overwriting) {
7986 /*
7987 * the (COPY_OUT_FROM == FALSE) request_page_list case
7988 */
7989 if (m->vmp_busy) {
7990#if CONFIG_PHANTOM_CACHE
7991 if (m->vmp_absent && !m_object->internal) {
7992 dwp->dw_mask |= DW_vm_phantom_cache_update;
7993 }
7994#endif
7995 m->vmp_absent = FALSE;
7996
7997 dwp->dw_mask |= DW_clear_busy;
7998 } else {
7999 /*
8000 * alternate (COPY_OUT_FROM == FALSE) page_list case
8001 * Occurs when the original page was wired
8002 * at the time of the list request
8003 */
8004 assert(VM_PAGE_WIRED(m));
8005
8006 dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */
8007 }
8008 m->vmp_overwriting = FALSE;
8009 }
8010 m->vmp_cleaning = FALSE;
8011
8012 if (m->vmp_free_when_done) {
8013 /*
8014 * With the clean queue enabled, UPL_PAGEOUT should
8015 * no longer set the pageout bit. Its pages now go
8016 * to the clean queue.
8017 *
8018 * We don't use the cleaned Q anymore and so this
8019 * assert isn't correct. The code for the clean Q
8020 * still exists and might be used in the future. If we
8021 * go back to the cleaned Q, we will re-enable this
8022 * assert.
8023 *
8024 * assert(!(upl->flags & UPL_PAGEOUT));
8025 */
8026 assert(!m_object->internal);
8027
8028 m->vmp_free_when_done = FALSE;
8029
8030 if ((flags & UPL_COMMIT_SET_DIRTY) ||
8031 (m->vmp_pmapped && (pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
8032 /*
8033 * page was re-dirtied after we started
8034 * the pageout... reactivate it since
8035 * we don't know whether the on-disk
8036 * copy matches what is now in memory
8037 */
8038 SET_PAGE_DIRTY(m, FALSE);
8039
8040 dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP;
8041
8042 if (upl->flags & UPL_PAGEOUT) {
8043 counter_inc(&vm_statistics_reactivations);
8044 DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL);
8045 }
8046 } else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) {
8047 /*
8048 * Someone else might still be handling this
8049 * page (vm_fault() for example), so let's not
8050 * free it or "un-busy" it!
8051 * Put that page in the "speculative" queue
8052 * for now (since we would otherwise have freed
8053 * it) and let whoever is keeping the page
8054 * "busy" move it if needed when they're done
8055 * with it.
8056 */
8057 dwp->dw_mask |= DW_vm_page_speculate;
8058 } else {
8059 /*
8060 * page has been successfully cleaned
8061 * go ahead and free it for other use
8062 */
8063 if (m_object->internal) {
8064 DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL);
8065 } else {
8066 DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL);
8067 }
8068 m->vmp_dirty = FALSE;
8069 if (!(upl->flags & UPL_HAS_BUSY)) {
8070 assert(!m->vmp_busy);
8071 }
8072 m->vmp_busy = TRUE;
8073
8074 dwp->dw_mask |= DW_vm_page_free;
8075 }
8076 goto commit_next_page;
8077 }
		/*
		 * It is part of the semantics of COPYOUT_FROM UPLs that a
		 * commit implies a cache sync between the vm page and the
		 * backing store; this can be used to strip the precious bit
		 * as well as to clean the page.
		 */
8085 if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) {
8086 m->vmp_precious = FALSE;
8087 }
8088
8089 if (flags & UPL_COMMIT_SET_DIRTY) {
8090 SET_PAGE_DIRTY(m, FALSE);
8091 } else {
8092 m->vmp_dirty = FALSE;
8093 }
8094
8095 /* with the clean queue on, move *all* cleaned pages to the clean queue */
8096 if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) {
8097 pgpgout_count++;
8098
8099 counter_inc(&vm_statistics_pageouts);
8100 DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL);
8101
8102 dwp->dw_mask |= DW_enqueue_cleaned;
8103 } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) {
			/*
			 * page coming back in from being 'frozen'...
			 * it was dirty before it was frozen, so keep it dirty
			 * so that vm_page_activate will notice that it really
			 * belongs on the throttle queue and put it there
			 */
8110 SET_PAGE_DIRTY(m, FALSE);
8111 dwp->dw_mask |= DW_vm_page_activate;
8112 } else {
8113 if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) {
8114 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8115 clear_refmod |= VM_MEM_REFERENCED;
8116 } else if (!VM_PAGE_PAGEABLE(m)) {
8117 if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) {
8118 dwp->dw_mask |= DW_vm_page_speculate;
8119 } else if (m->vmp_reference) {
8120 dwp->dw_mask |= DW_vm_page_activate;
8121 } else {
8122 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8123 clear_refmod |= VM_MEM_REFERENCED;
8124 }
8125 }
8126 }
8127 if (upl->flags & UPL_ACCESS_BLOCKED) {
8128 /*
			 * We blocked access to the pages in this UPL.
8130 * Clear the "busy" bit on this page before we
8131 * wake up any waiter.
8132 */
8133 dwp->dw_mask |= DW_clear_busy;
8134 }
8135 /*
8136 * Wakeup any thread waiting for the page to be un-cleaning.
8137 */
8138 dwp->dw_mask |= DW_PAGE_WAKEUP;
8139
8140commit_next_page:
8141 if (clear_refmod) {
8142 pmap_clear_refmod(pn: VM_PAGE_GET_PHYS_PAGE(m), mask: clear_refmod);
8143 }
8144
8145 target_offset += PAGE_SIZE_64;
8146 xfer_size -= PAGE_SIZE;
8147 entry++;
8148
8149 if (dwp->dw_mask) {
8150 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8151 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8152
8153 if (dw_count >= dw_limit) {
8154 vm_page_do_delayed_work(object: shadow_object, VM_KERN_MEMORY_NONE, dwp: dwp_start, dw_count);
8155
8156 dwp = dwp_start;
8157 dw_count = 0;
8158 }
8159 } else {
8160 if (dwp->dw_mask & DW_clear_busy) {
8161 m->vmp_busy = FALSE;
8162 }
8163
8164 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8165 PAGE_WAKEUP(m);
8166 }
8167 }
8168 }
8169 }
8170 if (dw_count) {
8171 vm_page_do_delayed_work(object: shadow_object, VM_KERN_MEMORY_NONE, dwp: dwp_start, dw_count);
8172 dwp = dwp_start;
8173 dw_count = 0;
8174 }
8175
8176 if (fast_path_possible) {
8177 assert(shadow_object->purgable != VM_PURGABLE_VOLATILE);
8178 assert(shadow_object->purgable != VM_PURGABLE_EMPTY);
8179
8180 if (local_queue_count || unwired_count) {
8181 if (local_queue_count) {
8182 vm_page_t first_target;
8183 vm_page_queue_head_t *target_queue;
8184
8185 if (throttle_page) {
8186 target_queue = &vm_page_queue_throttled;
8187 } else {
8188 if (flags & UPL_COMMIT_INACTIVATE) {
8189 if (shadow_object->internal) {
8190 target_queue = &vm_page_queue_anonymous;
8191 } else {
8192 target_queue = &vm_page_queue_inactive;
8193 }
8194 } else {
8195 target_queue = &vm_page_queue_active;
8196 }
8197 }
8198 /*
				 * Transfer the entire local queue to the regular LRU page queues.
8200 */
8201 vm_page_lockspin_queues();
8202
8203 first_target = (vm_page_t) vm_page_queue_first(target_queue);
8204
8205 if (vm_page_queue_empty(target_queue)) {
8206 target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8207 } else {
8208 first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local);
8209 }
8210
8211 target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local);
8212 first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue);
8213 last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target);
8214
8215 /*
8216 * Adjust the global page counts.
8217 */
8218 if (throttle_page) {
8219 vm_page_throttled_count += local_queue_count;
8220 } else {
8221 if (flags & UPL_COMMIT_INACTIVATE) {
8222 if (shadow_object->internal) {
8223 vm_page_anonymous_count += local_queue_count;
8224 }
8225 vm_page_inactive_count += local_queue_count;
8226
8227 token_new_pagecount += local_queue_count;
8228 } else {
8229 vm_page_active_count += local_queue_count;
8230 }
8231
8232 if (shadow_object->internal) {
8233 vm_page_pageable_internal_count += local_queue_count;
8234 } else {
8235 vm_page_pageable_external_count += local_queue_count;
8236 }
8237 }
8238 } else {
8239 vm_page_lockspin_queues();
8240 }
8241 if (unwired_count) {
8242 vm_page_wire_count -= unwired_count;
8243 VM_CHECK_MEMORYSTATUS;
8244 }
8245 vm_page_unlock_queues();
8246
8247 VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count);
8248 }
8249 }
8250
8251 if (upl->flags & UPL_DEVICE_MEMORY) {
8252 occupied = 0;
8253 } else if (upl->flags & UPL_LITE) {
8254 uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8255
8256 occupied = !fast_path_full_commit &&
8257 !bitmap_is_empty(map: upl->lite_list, nbits: pages);
8258 } else {
8259 occupied = !vm_page_queue_empty(&upl->map_object->memq);
8260 }
8261 if (occupied == 0) {
		/*
		 * If this UPL element belongs to a Vector UPL and is
		 * empty, then this is the right function to deallocate
		 * it, so go ahead and set the *empty variable. From the
		 * caller's point of view, the UPL_COMMIT_NOTIFY_EMPTY flag
		 * is relevant for the Vector UPL and not for the internal
		 * UPLs.
		 */
8270 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8271 *empty = TRUE;
8272 }
8273
8274 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8275 /*
8276 * this is not a paging object
8277 * so we need to drop the paging reference
8278 * that was taken when we created the UPL
8279 * against this object
8280 */
8281 vm_object_activity_end(shadow_object);
8282 vm_object_collapse(object: shadow_object, offset: 0, TRUE);
8283 } else {
8284 /*
			 * we donated the paging reference to
8286 * the map object... vm_pageout_object_terminate
8287 * will drop this reference
8288 */
8289 }
8290 }
8291 VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag);
8292 vm_object_unlock(shadow_object);
8293 if (object != shadow_object) {
8294 vm_object_unlock(object);
8295 }
8296
8297 if (!isVectorUPL) {
8298 upl_unlock(upl);
8299 } else {
8300 /*
8301 * If we completed our operations on an UPL that is
8302 * part of a Vectored UPL and if empty is TRUE, then
8303 * we should go ahead and deallocate this UPL element.
8304 * Then we check if this was the last of the UPL elements
8305 * within that Vectored UPL. If so, set empty to TRUE
8306 * so that in ubc_upl_commit_range or ubc_upl_commit, we
8307 * can go ahead and deallocate the Vector UPL too.
8308 */
8309 if (*empty == TRUE) {
8310 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8311 upl_deallocate(upl);
8312 }
8313 goto process_upl_to_commit;
8314 }
8315 if (pgpgout_count) {
8316 DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL);
8317 }
8318
8319 kr = KERN_SUCCESS;
8320done:
8321 if (dwp_start && dwp_finish_ctx) {
8322 vm_page_delayed_work_finish_ctx(dwp: dwp_start);
8323 dwp_start = dwp = NULL;
8324 }
8325
8326 return kr;
8327}
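
/*
 * Illustrative sketch (not part of the original source): committing a UPL
 * in fixed-size chunks, the way a file system typically does as I/O on each
 * portion completes.  The helper name example_commit_in_chunks() and the
 * chunk size are hypothetical; whether *empty ends up being reported
 * depends on how the UPL was created (see the UPL_COMMIT_NOTIFY_EMPTY
 * handling above).
 */
#if 0 /* illustrative example only -- never compiled */
static void
example_commit_in_chunks(upl_t upl, upl_size_t total_size)
{
	const upl_size_t chunk = (upl_size_t)(16 * PAGE_SIZE);
	upl_offset_t     offset;
	upl_size_t       this_size;
	boolean_t        empty = FALSE;

	for (offset = 0; offset < total_size; offset += this_size) {
		this_size = MIN(chunk, total_size - offset);

		(void) upl_commit_range(upl, offset, this_size,
		    UPL_COMMIT_CLEAR_DIRTY, NULL, 0, &empty);
	}
	if (empty) {
		/* the final commit reported the UPL empty: release it */
		upl_deallocate(upl);
	}
}
#endif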
8328
8329kern_return_t
8330upl_abort_range(
8331 upl_t upl,
8332 upl_offset_t offset,
8333 upl_size_t size,
8334 int error,
8335 boolean_t *empty)
8336{
8337 upl_size_t xfer_size, subupl_size;
8338 vm_object_t shadow_object;
8339 vm_object_t object;
8340 vm_object_offset_t target_offset;
8341 upl_offset_t subupl_offset = offset;
8342 int occupied;
8343 struct vm_page_delayed_work dw_array;
8344 struct vm_page_delayed_work *dwp, *dwp_start;
8345 bool dwp_finish_ctx = TRUE;
8346 int dw_count;
8347 int dw_limit;
8348 int isVectorUPL = 0;
8349 upl_t vector_upl = NULL;
8350 vm_object_offset_t obj_start, obj_end, obj_offset;
8351 kern_return_t kr = KERN_SUCCESS;
8352
8353// DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error);
8354
8355 dwp_start = dwp = NULL;
8356
8357 subupl_size = size;
8358 *empty = FALSE;
8359
8360 if (upl == UPL_NULL) {
8361 return KERN_INVALID_ARGUMENT;
8362 }
8363
8364 if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) {
8365 return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, count: 0, empty);
8366 }
8367
8368 dw_count = 0;
8369 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
8370 dwp_start = vm_page_delayed_work_get_ctx();
8371 if (dwp_start == NULL) {
8372 dwp_start = &dw_array;
8373 dw_limit = 1;
8374 dwp_finish_ctx = FALSE;
8375 }
8376
8377 dwp = dwp_start;
8378
8379 if ((isVectorUPL = vector_upl_is_valid(upl))) {
8380 vector_upl = upl;
8381 upl_lock(vector_upl);
8382 } else {
8383 upl_lock(upl);
8384 }
8385
8386process_upl_to_abort:
8387 if (isVectorUPL) {
8388 size = subupl_size;
8389 offset = subupl_offset;
8390 if (size == 0) {
8391 upl_unlock(vector_upl);
8392 kr = KERN_SUCCESS;
8393 goto done;
8394 }
8395 upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size);
8396 if (upl == NULL) {
8397 upl_unlock(vector_upl);
8398 kr = KERN_FAILURE;
8399 goto done;
8400 }
8401 subupl_size -= size;
8402 subupl_offset += size;
8403 }
8404
8405 *empty = FALSE;
8406
8407#if UPL_DEBUG
8408 if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) {
8409 upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0);
8410 upl->upl_commit_records[upl->upl_commit_index].c_beg = offset;
8411 upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size);
8412 upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1;
8413
8414 upl->upl_commit_index++;
8415 }
8416#endif
8417 if (upl->flags & UPL_DEVICE_MEMORY) {
8418 xfer_size = 0;
8419 } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) {
8420 xfer_size = size;
8421 } else {
8422 if (!isVectorUPL) {
8423 upl_unlock(upl);
8424 } else {
8425 upl_unlock(vector_upl);
8426 }
8427 DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
8428 kr = KERN_FAILURE;
8429 goto done;
8430 }
8431 object = upl->map_object;
8432
8433 if (upl->flags & UPL_SHADOWED) {
8434 vm_object_lock(object);
8435 shadow_object = object->shadow;
8436 } else {
8437 shadow_object = object;
8438 }
8439
8440 target_offset = (vm_object_offset_t)offset;
8441
8442 if (upl->flags & UPL_KERNEL_OBJECT) {
8443 vm_object_lock_shared(shadow_object);
8444 } else {
8445 vm_object_lock(shadow_object);
8446 }
8447
8448 if (upl->flags & UPL_ACCESS_BLOCKED) {
8449 assert(shadow_object->blocked_access);
8450 shadow_object->blocked_access = FALSE;
8451 vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED);
8452 }
8453
8454 if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) {
8455 panic("upl_abort_range: kernel_object being DUMPED");
8456 }
8457
8458 obj_start = target_offset + upl->u_offset - shadow_object->paging_offset;
8459 obj_end = obj_start + xfer_size;
8460 obj_start = vm_object_trunc_page(obj_start);
8461 obj_end = vm_object_round_page(obj_end);
8462 for (obj_offset = obj_start;
8463 obj_offset < obj_end;
8464 obj_offset += PAGE_SIZE) {
8465 vm_page_t t, m;
8466 unsigned int pg_num;
8467 boolean_t needed;
8468
8469 pg_num = (unsigned int) (target_offset / PAGE_SIZE);
8470 assert(pg_num == target_offset / PAGE_SIZE);
8471
8472 needed = FALSE;
8473
8474 if (upl->flags & UPL_INTERNAL) {
8475 needed = upl->page_list[pg_num].needed;
8476 }
8477
8478 dwp->dw_mask = 0;
8479 m = VM_PAGE_NULL;
8480
8481 if (upl->flags & UPL_LITE) {
8482 if (bitmap_test(map: upl->lite_list, n: pg_num)) {
8483 bitmap_clear(map: upl->lite_list, n: pg_num);
8484
8485 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
8486 m = vm_page_lookup(object: shadow_object, offset: obj_offset);
8487 }
8488 }
8489 }
8490 if (upl->flags & UPL_SHADOWED) {
8491 if ((t = vm_page_lookup(object, offset: target_offset)) != VM_PAGE_NULL) {
8492 t->vmp_free_when_done = FALSE;
8493
8494 VM_PAGE_FREE(t);
8495
8496 if (m == VM_PAGE_NULL) {
8497 m = vm_page_lookup(object: shadow_object, offset: target_offset + object->vo_shadow_offset);
8498 }
8499 }
8500 }
8501 if ((upl->flags & UPL_KERNEL_OBJECT)) {
8502 goto abort_next_page;
8503 }
8504
8505 if (m != VM_PAGE_NULL) {
8506 assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR);
8507
8508 if (m->vmp_absent) {
8509 boolean_t must_free = TRUE;
8510
8511 /*
8512 * COPYOUT = FALSE case
8513 * check for error conditions which must
				 * be passed back to the page's customer
8515 */
8516 if (error & UPL_ABORT_RESTART) {
8517 m->vmp_restart = TRUE;
8518 m->vmp_absent = FALSE;
8519 m->vmp_unusual = TRUE;
8520 must_free = FALSE;
8521 } else if (error & UPL_ABORT_UNAVAILABLE) {
8522 m->vmp_restart = FALSE;
8523 m->vmp_unusual = TRUE;
8524 must_free = FALSE;
8525 } else if (error & UPL_ABORT_ERROR) {
8526 m->vmp_restart = FALSE;
8527 m->vmp_absent = FALSE;
8528 m->vmp_error = TRUE;
8529 m->vmp_unusual = TRUE;
8530 must_free = FALSE;
8531 }
8532 if (m->vmp_clustered && needed == FALSE) {
8533 /*
8534 * This page was a part of a speculative
8535 * read-ahead initiated by the kernel
8536 * itself. No one is expecting this
8537 * page and no one will clean up its
8538 * error state if it ever becomes valid
8539 * in the future.
8540 * We have to free it here.
8541 */
8542 must_free = TRUE;
8543 }
8544 m->vmp_cleaning = FALSE;
8545
8546 if (m->vmp_overwriting && !m->vmp_busy) {
8547 /*
8548 * this shouldn't happen since
8549 * this is an 'absent' page, but
8550 * it doesn't hurt to check for
8551 * the 'alternate' method of
8552 * stabilizing the page...
8553 * we will mark 'busy' to be cleared
8554 * in the following code which will
					 * take care of the primary stabilization
8556 * method (i.e. setting 'busy' to TRUE)
8557 */
8558 dwp->dw_mask |= DW_vm_page_unwire;
8559 }
8560 m->vmp_overwriting = FALSE;
8561
8562 dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP);
8563
8564 if (must_free == TRUE) {
8565 dwp->dw_mask |= DW_vm_page_free;
8566 } else {
8567 dwp->dw_mask |= DW_vm_page_activate;
8568 }
8569 } else {
8570 /*
8571 * Handle the trusted pager throttle.
8572 */
8573 if (m->vmp_laundry) {
8574 dwp->dw_mask |= DW_vm_pageout_throttle_up;
8575 }
8576
8577 if (upl->flags & UPL_ACCESS_BLOCKED) {
8578 /*
8579 * We blocked access to the pages in this UPL.
8580 * Clear the "busy" bit and wake up any waiter
8581 * for this page.
8582 */
8583 dwp->dw_mask |= DW_clear_busy;
8584 }
8585 if (m->vmp_overwriting) {
8586 if (m->vmp_busy) {
8587 dwp->dw_mask |= DW_clear_busy;
8588 } else {
8589 /*
8590 * deal with the 'alternate' method
8591 * of stabilizing the page...
8592 * we will either free the page
8593 * or mark 'busy' to be cleared
8594 * in the following code which will
						 * take care of the primary stabilization
8596 * method (i.e. setting 'busy' to TRUE)
8597 */
8598 dwp->dw_mask |= DW_vm_page_unwire;
8599 }
8600 m->vmp_overwriting = FALSE;
8601 }
8602 m->vmp_free_when_done = FALSE;
8603 m->vmp_cleaning = FALSE;
8604
8605 if (error & UPL_ABORT_DUMP_PAGES) {
8606 pmap_disconnect(phys: VM_PAGE_GET_PHYS_PAGE(m));
8607
8608 dwp->dw_mask |= DW_vm_page_free;
8609 } else {
8610 if (!(dwp->dw_mask & DW_vm_page_unwire)) {
8611 if (error & UPL_ABORT_REFERENCE) {
8612 /*
							 * we've been told to explicitly
8614 * reference this page... for
8615 * file I/O, this is done by
8616 * implementing an LRU on the inactive q
8617 */
8618 dwp->dw_mask |= DW_vm_page_lru;
8619 } else if (!VM_PAGE_PAGEABLE(m)) {
8620 dwp->dw_mask |= DW_vm_page_deactivate_internal;
8621 }
8622 }
8623 dwp->dw_mask |= DW_PAGE_WAKEUP;
8624 }
8625 }
8626 }
8627abort_next_page:
8628 target_offset += PAGE_SIZE_64;
8629 xfer_size -= PAGE_SIZE;
8630
8631 if (dwp->dw_mask) {
8632 if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) {
8633 VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count);
8634
8635 if (dw_count >= dw_limit) {
8636 vm_page_do_delayed_work(object: shadow_object, VM_KERN_MEMORY_NONE, dwp: dwp_start, dw_count);
8637
8638 dwp = dwp_start;
8639 dw_count = 0;
8640 }
8641 } else {
8642 if (dwp->dw_mask & DW_clear_busy) {
8643 m->vmp_busy = FALSE;
8644 }
8645
8646 if (dwp->dw_mask & DW_PAGE_WAKEUP) {
8647 PAGE_WAKEUP(m);
8648 }
8649 }
8650 }
8651 }
8652 if (dw_count) {
8653 vm_page_do_delayed_work(object: shadow_object, VM_KERN_MEMORY_NONE, dwp: dwp_start, dw_count);
8654 dwp = dwp_start;
8655 dw_count = 0;
8656 }
8657
8658 if (upl->flags & UPL_DEVICE_MEMORY) {
8659 occupied = 0;
8660 } else if (upl->flags & UPL_LITE) {
8661 uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK));
8662
8663 occupied = !bitmap_is_empty(map: upl->lite_list, nbits: pages);
8664 } else {
8665 occupied = !vm_page_queue_empty(&upl->map_object->memq);
8666 }
8667 if (occupied == 0) {
		/*
		 * If this UPL element belongs to a Vector UPL and is
		 * empty, then this is the right function to deallocate
		 * it, so go ahead and set the *empty variable. From the
		 * caller's point of view, the UPL_COMMIT_NOTIFY_EMPTY flag
		 * is relevant for the Vector UPL and not for the internal
		 * UPLs.
		 */
8676 if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) {
8677 *empty = TRUE;
8678 }
8679
8680 if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) {
8681 /*
8682 * this is not a paging object
8683 * so we need to drop the paging reference
8684 * that was taken when we created the UPL
8685 * against this object
8686 */
8687 vm_object_activity_end(shadow_object);
8688 vm_object_collapse(object: shadow_object, offset: 0, TRUE);
8689 } else {
8690 /*
			 * we donated the paging reference to
8692 * the map object... vm_pageout_object_terminate
8693 * will drop this reference
8694 */
8695 }
8696 }
8697 vm_object_unlock(shadow_object);
8698 if (object != shadow_object) {
8699 vm_object_unlock(object);
8700 }
8701
8702 if (!isVectorUPL) {
8703 upl_unlock(upl);
8704 } else {
8705 /*
8706 * If we completed our operations on an UPL that is
8707 * part of a Vectored UPL and if empty is TRUE, then
8708 * we should go ahead and deallocate this UPL element.
8709 * Then we check if this was the last of the UPL elements
8710 * within that Vectored UPL. If so, set empty to TRUE
8711 * so that in ubc_upl_abort_range or ubc_upl_abort, we
8712 * can go ahead and deallocate the Vector UPL too.
8713 */
8714 if (*empty == TRUE) {
8715 *empty = vector_upl_set_subupl(vector_upl, upl, 0);
8716 upl_deallocate(upl);
8717 }
8718 goto process_upl_to_abort;
8719 }
8720
8721 kr = KERN_SUCCESS;
8722
8723done:
8724 if (dwp_start && dwp_finish_ctx) {
8725 vm_page_delayed_work_finish_ctx(dwp: dwp_start);
8726 dwp_start = dwp = NULL;
8727 }
8728
8729 return kr;
8730}
8731
8732
8733kern_return_t
8734upl_abort(
8735 upl_t upl,
8736 int error)
8737{
8738 boolean_t empty;
8739
8740 if (upl == UPL_NULL) {
8741 return KERN_INVALID_ARGUMENT;
8742 }
8743
8744 return upl_abort_range(upl, offset: 0, size: upl->u_size, error, empty: &empty);
8745}
8746
8747
8748/* an option on commit should be wire */
8749kern_return_t
8750upl_commit(
8751 upl_t upl,
8752 upl_page_info_t *page_list,
8753 mach_msg_type_number_t count)
8754{
8755 boolean_t empty;
8756
8757 if (upl == UPL_NULL) {
8758 return KERN_INVALID_ARGUMENT;
8759 }
8760
8761 return upl_commit_range(upl, offset: 0, size: upl->u_size, flags: 0,
8762 page_list, count, empty: &empty);
8763}
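
/*
 * Illustrative sketch (not part of the original source): the
 * commit-versus-abort decision a pager or file system makes when an I/O
 * issued against a UPL completes.  The helper name example_upl_io_done()
 * and the choice of flags are hypothetical -- UPL_ABORT_ERROR is just one
 * plausible policy for a failed read.
 */
#if 0 /* illustrative example only -- never compiled */
static void
example_upl_io_done(upl_t upl, upl_size_t size, int ioerror)
{
	boolean_t empty;

	if (ioerror) {
		/* back out: flag the pages with an error instead of committing them */
		(void) upl_abort_range(upl, 0, size, UPL_ABORT_ERROR, &empty);
	} else {
		/* I/O succeeded: unwire the pages and update their state */
		(void) upl_commit_range(upl, 0, size, UPL_COMMIT_FREE_ABSENT,
		    NULL, 0, &empty);
	}
	if (empty) {
		upl_deallocate(upl);
	}
}
#endif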
8764
8765
8766void
8767iopl_valid_data(
8768 upl_t upl,
8769 vm_tag_t tag)
8770{
8771 vm_object_t object;
8772 vm_offset_t offset;
8773 vm_page_t m, nxt_page = VM_PAGE_NULL;
8774 upl_size_t size;
8775 int wired_count = 0;
8776
8777 if (upl == NULL) {
8778 panic("iopl_valid_data: NULL upl");
8779 }
8780 if (vector_upl_is_valid(upl)) {
8781 panic("iopl_valid_data: vector upl");
8782 }
8783 if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) {
8784 panic("iopl_valid_data: unsupported upl, flags = %x", upl->flags);
8785 }
8786
8787 object = upl->map_object;
8788
8789 if (is_kernel_object(object) || object == compressor_object) {
8790 panic("iopl_valid_data: object == kernel or compressor");
8791 }
8792
8793 if (object->purgable == VM_PURGABLE_VOLATILE ||
8794 object->purgable == VM_PURGABLE_EMPTY) {
8795 panic("iopl_valid_data: object %p purgable %d",
8796 object, object->purgable);
8797 }
8798
8799 size = upl_adjusted_size(upl, PAGE_MASK);
8800
8801 vm_object_lock(object);
8802 VM_OBJECT_WIRED_PAGE_UPDATE_START(object);
8803
8804 bool whole_object;
8805
8806 if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) {
8807 nxt_page = (vm_page_t)vm_page_queue_first(&object->memq);
8808 whole_object = true;
8809 } else {
8810 offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset);
8811 whole_object = false;
8812 }
8813
8814 while (size) {
8815 if (whole_object) {
8816 if (nxt_page != VM_PAGE_NULL) {
8817 m = nxt_page;
8818 nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq);
8819 }
8820 } else {
8821 m = vm_page_lookup(object, offset);
8822 offset += PAGE_SIZE;
8823
8824 if (m == VM_PAGE_NULL) {
8825 panic("iopl_valid_data: missing expected page at offset %lx", (long)offset);
8826 }
8827 }
8828 if (m->vmp_busy) {
8829 if (!m->vmp_absent) {
8830 panic("iopl_valid_data: busy page w/o absent");
8831 }
8832
8833 if (m->vmp_pageq.next || m->vmp_pageq.prev) {
8834 panic("iopl_valid_data: busy+absent page on page queue");
8835 }
8836 if (m->vmp_reusable) {
8837 panic("iopl_valid_data: %p is reusable", m);
8838 }
8839
8840 m->vmp_absent = FALSE;
8841 m->vmp_dirty = TRUE;
8842 assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q);
8843 assert(m->vmp_wire_count == 0);
8844 m->vmp_wire_count++;
8845 assert(m->vmp_wire_count);
8846 if (m->vmp_wire_count == 1) {
8847 m->vmp_q_state = VM_PAGE_IS_WIRED;
8848 wired_count++;
8849 } else {
8850 panic("iopl_valid_data: %p already wired", m);
8851 }
8852
8853 PAGE_WAKEUP_DONE(m);
8854 }
8855 size -= PAGE_SIZE;
8856 }
8857 if (wired_count) {
8858 VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count);
8859 assert(object->resident_page_count >= object->wired_page_count);
8860
8861 /* no need to adjust purgeable accounting for this object: */
8862 assert(object->purgable != VM_PURGABLE_VOLATILE);
8863 assert(object->purgable != VM_PURGABLE_EMPTY);
8864
8865 vm_page_lockspin_queues();
8866 vm_page_wire_count += wired_count;
8867 vm_page_unlock_queues();
8868 }
8869 VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag);
8870 vm_object_unlock(object);
8871}
8872
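/*
 * Editor's note (sketch of the pattern used above): iopl_valid_data() does
 * all per-page work under the object lock and batches the global wire count
 * into a single adjustment at the end, the usual xnu idiom for keeping the
 * page-queue lock hold time short:
 *
 *	int batched = 0;
 *
 *	vm_object_lock(object);
 *	// ... flip per-page state, counting each newly wired page ...
 *	if (batched) {
 *		vm_page_lockspin_queues();
 *		vm_page_wire_count += batched;
 *		vm_page_unlock_queues();
 *	}
 *	vm_object_unlock(object);
 */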
8873
8874void
8875vm_object_set_pmap_cache_attr(
8876 vm_object_t object,
8877 upl_page_info_array_t user_page_list,
8878 unsigned int num_pages,
8879 boolean_t batch_pmap_op)
8880{
8881 unsigned int cache_attr = 0;
8882
8883 cache_attr = object->wimg_bits & VM_WIMG_MASK;
8884 assert(user_page_list);
8885 if (cache_attr != VM_WIMG_USE_DEFAULT) {
8886 PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op);
8887 }
8888}
8889
8890
8891static bool
8892vm_object_iopl_wire_full(
8893 vm_object_t object,
8894 upl_t upl,
8895 upl_page_info_array_t user_page_list,
8896 upl_control_flags_t cntrl_flags,
8897 vm_tag_t tag)
8898{
8899 vm_page_t dst_page;
8900 unsigned int entry;
8901 int page_count;
8902 int delayed_unlock = 0;
8903 boolean_t retval = TRUE;
8904 ppnum_t phys_page;
8905
8906 vm_object_lock_assert_exclusive(object);
8907 assert(object->purgable != VM_PURGABLE_VOLATILE);
8908 assert(object->purgable != VM_PURGABLE_EMPTY);
8909 assert(object->pager == NULL);
8910 assert(object->vo_copy == NULL);
8911 assert(object->shadow == NULL);
8912
8913 page_count = object->resident_page_count;
8914 dst_page = (vm_page_t)vm_page_queue_first(&object->memq);
8915
8916 vm_page_lock_queues();
8917
8918 while (page_count--) {
8919 if (dst_page->vmp_busy ||
8920 dst_page->vmp_fictitious ||
8921 dst_page->vmp_absent ||
8922 VMP_ERROR_GET(dst_page) ||
8923 dst_page->vmp_cleaning ||
8924 dst_page->vmp_restart ||
8925 dst_page->vmp_laundry) {
8926 retval = FALSE;
8927 goto done;
8928 }
8929 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
8930 retval = FALSE;
8931 goto done;
8932 }
8933 dst_page->vmp_reference = TRUE;
8934
8935 vm_page_wire(dst_page, tag, FALSE);
8936
8937 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
8938 SET_PAGE_DIRTY(dst_page, FALSE);
8939 }
8940 entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE);
8941 assert(entry >= 0 && entry < object->resident_page_count);
8942 bitmap_set(upl->lite_list, entry);
8943
8944 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8945
8946 if (phys_page > upl->highest_page) {
8947 upl->highest_page = phys_page;
8948 }
8949
8950 if (user_page_list) {
8951 user_page_list[entry].phys_addr = phys_page;
8952 user_page_list[entry].absent = dst_page->vmp_absent;
8953 user_page_list[entry].dirty = dst_page->vmp_dirty;
8954 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
8955 user_page_list[entry].precious = dst_page->vmp_precious;
8956 user_page_list[entry].device = FALSE;
8957 user_page_list[entry].speculative = FALSE;
8958 user_page_list[entry].cs_validated = FALSE;
8959 user_page_list[entry].cs_tainted = FALSE;
8960 user_page_list[entry].cs_nx = FALSE;
8961 user_page_list[entry].needed = FALSE;
8962 user_page_list[entry].mark = FALSE;
8963 }
8964 if (delayed_unlock++ > 256) {
8965 delayed_unlock = 0;
8966 lck_mtx_yield(&vm_page_queue_lock);
8967
8968 VM_CHECK_MEMORYSTATUS;
8969 }
8970 dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq);
8971 }
8972done:
8973 vm_page_unlock_queues();
8974
8975 VM_CHECK_MEMORYSTATUS;
8976
8977 return retval;
8978}
8979
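/*
 * Editor's note (sketch): the delayed_unlock counter in
 * vm_object_iopl_wire_full() bounds how long the page-queue lock is held
 * while wiring a large resident set, periodically yielding to other
 * lockers:
 *
 *	int since_yield = 0;
 *
 *	vm_page_lock_queues();
 *	while (page_count--) {
 *		// ... wire one page ...
 *		if (since_yield++ > 256) {
 *			since_yield = 0;
 *			lck_mtx_yield(&vm_page_queue_lock);
 *		}
 *	}
 *	vm_page_unlock_queues();
 */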
8980
8981static kern_return_t
8982vm_object_iopl_wire_empty(
8983 vm_object_t object,
8984 upl_t upl,
8985 upl_page_info_array_t user_page_list,
8986 upl_control_flags_t cntrl_flags,
8987 vm_tag_t tag,
8988 vm_object_offset_t *dst_offset,
8989 int page_count,
8990 int *page_grab_count)
8991{
8992 vm_page_t dst_page;
8993 boolean_t no_zero_fill = FALSE;
8994 int interruptible;
8995 int pages_wired = 0;
8996 int pages_inserted = 0;
8997 int entry = 0;
8998 uint64_t delayed_ledger_update = 0;
8999 kern_return_t ret = KERN_SUCCESS;
9000 int grab_options;
9001 ppnum_t phys_page;
9002
9003 vm_object_lock_assert_exclusive(object);
9004 assert(object->purgable != VM_PURGABLE_VOLATILE);
9005 assert(object->purgable != VM_PURGABLE_EMPTY);
9006 assert(object->pager == NULL);
9007 assert(object->vo_copy == NULL);
9008 assert(object->shadow == NULL);
9009
9010 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9011 interruptible = THREAD_ABORTSAFE;
9012 } else {
9013 interruptible = THREAD_UNINT;
9014 }
9015
9016 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9017 no_zero_fill = TRUE;
9018 }
9019
9020 grab_options = 0;
9021#if CONFIG_SECLUDED_MEMORY
9022 if (object->can_grab_secluded) {
9023 grab_options |= VM_PAGE_GRAB_SECLUDED;
9024 }
9025#endif /* CONFIG_SECLUDED_MEMORY */
9026
9027 while (page_count--) {
9028 while ((dst_page = vm_page_grab_options(grab_options))
9029 == VM_PAGE_NULL) {
9030 OSAddAtomic(page_count, &vm_upl_wait_for_pages);
9031
9032 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9033
9034 if (vm_page_wait(interruptible) == FALSE) {
9035 /*
9036 * interrupted case
9037 */
9038 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9039
9040 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9041
9042 ret = MACH_SEND_INTERRUPTED;
9043 goto done;
9044 }
9045 OSAddAtomic(-page_count, &vm_upl_wait_for_pages);
9046
9047 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9048 }
9049 if (no_zero_fill == FALSE) {
9050 vm_page_zero_fill(dst_page);
9051 } else {
9052 dst_page->vmp_absent = TRUE;
9053 }
9054
9055 dst_page->vmp_reference = TRUE;
9056
9057 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9058 SET_PAGE_DIRTY(dst_page, FALSE);
9059 }
9060 if (dst_page->vmp_absent == FALSE) {
9061 assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q);
9062 assert(dst_page->vmp_wire_count == 0);
9063 dst_page->vmp_wire_count++;
9064 dst_page->vmp_q_state = VM_PAGE_IS_WIRED;
9065 assert(dst_page->vmp_wire_count);
9066 pages_wired++;
9067 PAGE_WAKEUP_DONE(dst_page);
9068 }
9069 pages_inserted++;
9070
9071 vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);
9072
9073 bitmap_set(upl->lite_list, entry);
9074
9075 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9076
9077 if (phys_page > upl->highest_page) {
9078 upl->highest_page = phys_page;
9079 }
9080
9081 if (user_page_list) {
9082 user_page_list[entry].phys_addr = phys_page;
9083 user_page_list[entry].absent = dst_page->vmp_absent;
9084 user_page_list[entry].dirty = dst_page->vmp_dirty;
9085 user_page_list[entry].free_when_done = FALSE;
9086 user_page_list[entry].precious = FALSE;
9087 user_page_list[entry].device = FALSE;
9088 user_page_list[entry].speculative = FALSE;
9089 user_page_list[entry].cs_validated = FALSE;
9090 user_page_list[entry].cs_tainted = FALSE;
9091 user_page_list[entry].cs_nx = FALSE;
9092 user_page_list[entry].needed = FALSE;
9093 user_page_list[entry].mark = FALSE;
9094 }
9095 entry++;
9096 *dst_offset += PAGE_SIZE_64;
9097 }
9098done:
9099 if (pages_wired) {
9100 vm_page_lockspin_queues();
9101 vm_page_wire_count += pages_wired;
9102 vm_page_unlock_queues();
9103 }
9104 if (pages_inserted) {
9105 if (object->internal) {
9106 OSAddAtomic(pages_inserted, &vm_page_internal_count);
9107 } else {
9108 OSAddAtomic(pages_inserted, &vm_page_external_count);
9109 }
9110 }
9111 if (delayed_ledger_update) {
9112 task_t owner;
9113 int ledger_idx_volatile;
9114 int ledger_idx_nonvolatile;
9115 int ledger_idx_volatile_compressed;
9116 int ledger_idx_nonvolatile_compressed;
9117 boolean_t do_footprint;
9118
9119 owner = VM_OBJECT_OWNER(object);
9120 assert(owner);
9121
9122 vm_object_ledger_tag_ledgers(object,
9123 &ledger_idx_volatile,
9124 &ledger_idx_nonvolatile,
9125 &ledger_idx_volatile_compressed,
9126 &ledger_idx_nonvolatile_compressed,
9127 &do_footprint);
9128
9129 /* more non-volatile bytes */
9130 ledger_credit(owner->ledger,
9131 ledger_idx_nonvolatile,
9132 delayed_ledger_update);
9133 if (do_footprint) {
9134 /* more footprint */
9135 ledger_credit(owner->ledger,
9136 task_ledgers.phys_footprint,
9137 delayed_ledger_update);
9138 }
9139 }
9140
9141 assert(page_grab_count);
9142 *page_grab_count = pages_inserted;
9143
9144 return ret;
9145}
9146
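/*
 * Editor's sketch of the deferred ledger update above (the per-page delta
 * amount is an assumption, not taken from this file):
 * vm_page_insert_internal() accumulates the newly resident non-volatile
 * bytes into delayed_ledger_update instead of crediting the owner once per
 * page, and the credit is applied a single time per call:
 *
 *	uint64_t delta = 0;
 *
 *	// for each page: vm_page_insert_internal(..., &delta);
 *	if (delta) {
 *		ledger_credit(owner->ledger, ledger_idx_nonvolatile, delta);
 *		if (do_footprint) {
 *			ledger_credit(owner->ledger,
 *			    task_ledgers.phys_footprint, delta);
 *		}
 *	}
 */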
9147
9148
9149kern_return_t
9150vm_object_iopl_request(
9151 vm_object_t object,
9152 vm_object_offset_t offset,
9153 upl_size_t size,
9154 upl_t *upl_ptr,
9155 upl_page_info_array_t user_page_list,
9156 unsigned int *page_list_count,
9157 upl_control_flags_t cntrl_flags,
9158 vm_tag_t tag)
9159{
9160 vm_page_t dst_page;
9161 vm_object_offset_t dst_offset;
9162 upl_size_t xfer_size;
9163 upl_t upl = NULL;
9164 unsigned int entry;
9165 int no_zero_fill = FALSE;
9166 unsigned int size_in_pages;
9167 int page_grab_count = 0;
9168 u_int32_t psize;
9169 kern_return_t ret;
9170 vm_prot_t prot;
9171 struct vm_object_fault_info fault_info = {};
9172 struct vm_page_delayed_work dw_array;
9173 struct vm_page_delayed_work *dwp, *dwp_start;
9174 bool dwp_finish_ctx = TRUE;
9175 int dw_count;
9176 int dw_limit;
9177 int dw_index;
9178 boolean_t caller_lookup;
9179 int io_tracking_flag = 0;
9180 int interruptible;
9181 ppnum_t phys_page;
9182
9183 boolean_t set_cache_attr_needed = FALSE;
9184 boolean_t free_wired_pages = FALSE;
9185 boolean_t fast_path_empty_req = FALSE;
9186 boolean_t fast_path_full_req = FALSE;
9187
9188#if DEVELOPMENT || DEBUG
9189 task_t task = current_task();
9190#endif /* DEVELOPMENT || DEBUG */
9191
9192 dwp_start = dwp = NULL;
9193
9194 vm_object_offset_t original_offset = offset;
9195 upl_size_t original_size = size;
9196
9197// DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags);
9198
9199 size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset));
9200 offset = vm_object_trunc_page(offset);
9201 if (size != original_size || offset != original_offset) {
9202 DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
9203 }
9204
9205 if (cntrl_flags & ~UPL_VALID_FLAGS) {
9206 /*
9207 * For forward compatibility's sake,
9208 * reject any unknown flag.
9209 */
9210 return KERN_INVALID_VALUE;
9211 }
9212 if (vm_lopage_needed == FALSE) {
9213 cntrl_flags &= ~UPL_NEED_32BIT_ADDR;
9214 }
9215
9216 if (cntrl_flags & UPL_NEED_32BIT_ADDR) {
9217 if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) {
9218 return KERN_INVALID_VALUE;
9219 }
9220
9221 if (object->phys_contiguous) {
9222 if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) {
9223 return KERN_INVALID_ADDRESS;
9224 }
9225
9226 if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) {
9227 return KERN_INVALID_ADDRESS;
9228 }
9229 }
9230 }
9231 if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) {
9232 no_zero_fill = TRUE;
9233 }
9234
9235 if (cntrl_flags & UPL_COPYOUT_FROM) {
9236 prot = VM_PROT_READ;
9237 } else {
9238 prot = VM_PROT_READ | VM_PROT_WRITE;
9239 }
9240
9241 if ((!object->internal) && (object->paging_offset != 0)) {
9242 panic("vm_object_iopl_request: external object with non-zero paging offset");
9243 }
9244
9245
9246 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0);
9247
9248#if CONFIG_IOSCHED || UPL_DEBUG
9249 if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) {
9250 io_tracking_flag |= UPL_CREATE_IO_TRACKING;
9251 }
9252#endif
9253
9254#if CONFIG_IOSCHED
9255 if (object->io_tracking) {
9256 /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */
9257 if (!is_kernel_object(object)) {
9258 io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP;
9259 }
9260 }
9261#endif
9262
9263 if (object->phys_contiguous) {
9264 psize = PAGE_SIZE;
9265 } else {
9266 psize = size;
9267
9268 dw_count = 0;
9269 dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT);
9270 dwp_start = vm_page_delayed_work_get_ctx();
9271 if (dwp_start == NULL) {
9272 dwp_start = &dw_array;
9273 dw_limit = 1;
9274 dwp_finish_ctx = FALSE;
9275 }
9276
9277 dwp = dwp_start;
9278 }
9279
9280 if (cntrl_flags & UPL_SET_INTERNAL) {
9281 upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9282 user_page_list = size ? upl->page_list : NULL;
9283 } else {
9284 upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9285 }
9286 if (user_page_list) {
9287 user_page_list[0].device = FALSE;
9288 }
9289 *upl_ptr = upl;
9290
9291 if (cntrl_flags & UPL_NOZEROFILLIO) {
9292 DTRACE_VM4(upl_nozerofillio,
9293 vm_object_t, object,
9294 vm_object_offset_t, offset,
9295 upl_size_t, size,
9296 upl_t, upl);
9297 }
9298
9299 upl->map_object = object;
9300 upl->u_offset = original_offset;
9301 upl->u_size = original_size;
9302
9303 size_in_pages = size / PAGE_SIZE;
9304
9305 if (is_kernel_object(object) &&
9306 !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) {
9307 upl->flags |= UPL_KERNEL_OBJECT;
9308#if UPL_DEBUG
9309 vm_object_lock(object);
9310#else
9311 vm_object_lock_shared(object);
9312#endif
9313 } else {
9314 vm_object_lock(object);
9315 vm_object_activity_begin(object);
9316 }
9317 /*
9318 * paging in progress also protects the paging_offset
9319 */
9320 upl->u_offset = original_offset + object->paging_offset;
9321
9322 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9323 /*
9324 * The user requested that access to the pages in this UPL
9325 * be blocked until the UPL is committed or aborted.
9326 */
9327 upl->flags |= UPL_ACCESS_BLOCKED;
9328 }
9329
9330#if CONFIG_IOSCHED || UPL_DEBUG
9331 if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
9332 vm_object_activity_begin(object);
9333 queue_enter(&object->uplq, upl, upl_t, uplq);
9334 }
9335#endif
9336
9337 if (object->phys_contiguous) {
9338 if (upl->flags & UPL_ACCESS_BLOCKED) {
9339 assert(!object->blocked_access);
9340 object->blocked_access = TRUE;
9341 }
9342
9343 vm_object_unlock(object);
9344
9345 /*
9346 * don't need any shadow mappings for this one
9347 * since it is already I/O memory
9348 */
9349 upl->flags |= UPL_DEVICE_MEMORY;
9350
9351 upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT);
9352
9353 if (user_page_list) {
9354 user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT);
9355 user_page_list[0].device = TRUE;
9356 }
9357 if (page_list_count != NULL) {
9358 if (upl->flags & UPL_INTERNAL) {
9359 *page_list_count = 0;
9360 } else {
9361 *page_list_count = 1;
9362 }
9363 }
9364
9365 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9366#if DEVELOPMENT || DEBUG
9367 if (task != NULL) {
9368 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9369 }
9370#endif /* DEVELOPMENT || DEBUG */
9371 return KERN_SUCCESS;
9372 }
9373 if (!is_kernel_object(object) && object != compressor_object) {
9374 /*
9375 * Protect user space from future COW operations
9376 */
9377#if VM_OBJECT_TRACKING_OP_TRUESHARE
9378 if (!object->true_share &&
9379 vm_object_tracking_btlog) {
9380 btlog_record(vm_object_tracking_btlog, object,
9381 VM_OBJECT_TRACKING_OP_TRUESHARE,
9382 btref_get(__builtin_frame_address(0), 0));
9383 }
9384#endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */
9385
9386 vm_object_lock_assert_exclusive(object);
9387 VM_OBJECT_SET_TRUE_SHARE(object, TRUE);
9388
9389 if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) {
9390 object->copy_strategy = MEMORY_OBJECT_COPY_DELAY;
9391 }
9392 }
9393
9394 if (!(cntrl_flags & UPL_COPYOUT_FROM) &&
9395 object->vo_copy != VM_OBJECT_NULL) {
9396 /*
9397 * Honor copy-on-write obligations
9398 *
9399 * The caller is gathering these pages and
9400 * might modify their contents. We need to
9401 * make sure that the copy object has its own
9402 * private copies of these pages before we let
9403 * the caller modify them.
9404 *
9405 * NOTE: someone else could map the original object
9406 * after we've done this copy-on-write here, and they
9407 * could then see an inconsistent picture of the memory
9408 * while it's being modified via the UPL. To prevent this,
9409 * we would have to block access to these pages until the
9410 * UPL is released. We could use the UPL_BLOCK_ACCESS
9411 * code path for that...
9412 */
9413 vm_object_update(object,
9414 offset,
9415 size,
9416 NULL,
9417 NULL,
9418 FALSE, /* should_return */
9419 MEMORY_OBJECT_COPY_SYNC,
9420 VM_PROT_NO_CHANGE);
9421 VM_PAGEOUT_DEBUG(iopl_cow, 1);
9422 VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT));
9423 }
9424 if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) &&
9425 object->purgable != VM_PURGABLE_VOLATILE &&
9426 object->purgable != VM_PURGABLE_EMPTY &&
9427 object->vo_copy == NULL &&
9428 size == object->vo_size &&
9429 offset == 0 &&
9430 object->shadow == NULL &&
9431 object->pager == NULL) {
9432 if (object->resident_page_count == size_in_pages) {
9433 assert(object != compressor_object);
9434 assert(!is_kernel_object(object));
9435 fast_path_full_req = TRUE;
9436 } else if (object->resident_page_count == 0) {
9437 assert(object != compressor_object);
9438 assert(!is_kernel_object(object));
9439 fast_path_empty_req = TRUE;
9440 set_cache_attr_needed = TRUE;
9441 }
9442 }
9443
9444 if (cntrl_flags & UPL_SET_INTERRUPTIBLE) {
9445 interruptible = THREAD_ABORTSAFE;
9446 } else {
9447 interruptible = THREAD_UNINT;
9448 }
9449
9450 entry = 0;
9451
9452 xfer_size = size;
9453 dst_offset = offset;
9454
9455 if (fast_path_full_req) {
9456 if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) {
9457 goto finish;
9458 }
9459 /*
9460 * we couldn't complete the processing of this request on the fast path
9461 * so fall through to the slow path and finish up
9462 */
9463 } else if (fast_path_empty_req) {
9464 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9465 ret = KERN_MEMORY_ERROR;
9466 goto return_err;
9467 }
9468 ret = vm_object_iopl_wire_empty(object, upl, user_page_list,
9469 cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
9470
9471 if (ret) {
9472 free_wired_pages = TRUE;
9473 goto return_err;
9474 }
9475 goto finish;
9476 }
9477
9478 fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL;
9479 fault_info.lo_offset = offset;
9480 fault_info.hi_offset = offset + xfer_size;
9481 fault_info.mark_zf_absent = TRUE;
9482 fault_info.interruptible = interruptible;
9483 fault_info.batch_pmap_op = TRUE;
9484
9485 while (xfer_size) {
9486 vm_fault_return_t result;
9487
9488 dwp->dw_mask = 0;
9489
9490 if (fast_path_full_req) {
9491 /*
9492 * if we get here, it means that we ran into a page
9493 * state we couldn't handle in the fast path and
9494 * bailed out to the slow path... since the order
9495 * we look at pages is different between the 2 paths,
9496 * the following check is needed to determine whether
9497 * this page was already processed in the fast path
9498 */
9499 if (bitmap_test(upl->lite_list, entry)) {
9500 goto skip_page;
9501 }
9502 }
9503 dst_page = vm_page_lookup(object, dst_offset);
9504
9505 if (dst_page == VM_PAGE_NULL ||
9506 dst_page->vmp_busy ||
9507 VMP_ERROR_GET(dst_page) ||
9508 dst_page->vmp_restart ||
9509 dst_page->vmp_absent ||
9510 dst_page->vmp_fictitious) {
9511 if (is_kernel_object(object)) {
9512 panic("vm_object_iopl_request: missing/bad page in kernel object");
9513 }
9514 if (object == compressor_object) {
9515 panic("vm_object_iopl_request: missing/bad page in compressor object");
9516 }
9517
9518 if (cntrl_flags & UPL_REQUEST_NO_FAULT) {
9519 ret = KERN_MEMORY_ERROR;
9520 goto return_err;
9521 }
9522 set_cache_attr_needed = TRUE;
9523
9524 /*
9525 * We just looked up the page and the result remains valid
9526 * until the object lock is released, so send it to
9527 * vm_fault_page() (as "dst_page"), to avoid having to
9528 * look it up again there.
9529 */
9530 caller_lookup = TRUE;
9531
9532 do {
9533 vm_page_t top_page;
9534 kern_return_t error_code;
9535
9536 fault_info.cluster_size = xfer_size;
9537
9538 vm_object_paging_begin(object);
9539
9540 result = vm_fault_page(object, dst_offset,
9541 prot | VM_PROT_WRITE, FALSE,
9542 caller_lookup,
9543 &prot, &dst_page, &top_page,
9544 (int *)0,
9545 &error_code, no_zero_fill,
9546 &fault_info);
9547
9548 /* our lookup is no longer valid at this point */
9549 caller_lookup = FALSE;
9550
9551 switch (result) {
9552 case VM_FAULT_SUCCESS:
9553 page_grab_count++;
9554
9555 if (!dst_page->vmp_absent) {
9556 PAGE_WAKEUP_DONE(dst_page);
9557 } else {
9558 /*
9559 * we only get back an absent page if we
9560 * requested that it not be zero-filled
9561 * because we are about to fill it via I/O
9562 *
9563 * absent pages should be left BUSY
9564 * to prevent them from being faulted
9565 * into an address space before we've
9566 * had a chance to complete the I/O on
9567 * them since they may contain info that
9568 * shouldn't be seen by the faulting task
9569 */
9570 }
9571 /*
9572 * Release paging references and
9573 * top-level placeholder page, if any.
9574 */
9575 if (top_page != VM_PAGE_NULL) {
9576 vm_object_t local_object;
9577
9578 local_object = VM_PAGE_OBJECT(top_page);
9579
9580 /*
9581 * comparing 2 packed pointers
9582 */
9583 if (top_page->vmp_object != dst_page->vmp_object) {
9584 vm_object_lock(local_object);
9585 VM_PAGE_FREE(top_page);
9586 vm_object_paging_end(local_object);
9587 vm_object_unlock(local_object);
9588 } else {
9589 VM_PAGE_FREE(top_page);
9590 vm_object_paging_end(local_object);
9591 }
9592 }
9593 vm_object_paging_end(object);
9594 break;
9595
9596 case VM_FAULT_RETRY:
9597 vm_object_lock(object);
9598 break;
9599
9600 case VM_FAULT_MEMORY_SHORTAGE:
9601 OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages);
9602
9603 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0);
9604
9605 if (vm_page_wait(interruptible)) {
9606 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9607
9608 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0);
9609 vm_object_lock(object);
9610
9611 break;
9612 }
9613 OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages);
9614
9615 VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1);
9616 ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
9617 OS_FALLTHROUGH;
9618
9619 case VM_FAULT_INTERRUPTED:
9620 error_code = MACH_SEND_INTERRUPTED;
9621 OS_FALLTHROUGH;
9622 case VM_FAULT_MEMORY_ERROR:
9623memory_error:
9624 ret = (error_code ? error_code: KERN_MEMORY_ERROR);
9625
9626 vm_object_lock(object);
9627 goto return_err;
9628
9629 case VM_FAULT_SUCCESS_NO_VM_PAGE:
9630 /* success but no page: fail */
9631 vm_object_paging_end(object);
9632 vm_object_unlock(object);
9633 goto memory_error;
9634
9635 default:
9636 panic("vm_object_iopl_request: unexpected error"
9637 " 0x%x from vm_fault_page()\n", result);
9638 }
9639 } while (result != VM_FAULT_SUCCESS);
9640 }
9641 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9642
9643 if (upl->flags & UPL_KERNEL_OBJECT) {
9644 goto record_phys_addr;
9645 }
9646
9647 if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) {
9648 dst_page->vmp_busy = TRUE;
9649 goto record_phys_addr;
9650 }
9651
9652 if (dst_page->vmp_cleaning) {
9653 /*
9654 * Someone else is cleaning this page in place.
9655 * In theory, we should be able to proceed and use this
9656 * page, but they'll probably end up clearing the "busy"
9657 * bit on it in upl_commit_range(); since they didn't set
9658 * it, they would clear our "busy" bit and open
9659 * us to race conditions.
9660 * We'd better wait for the cleaning to complete and
9661 * then try again.
9662 */
9663 VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1);
9664 PAGE_SLEEP(object, dst_page, THREAD_UNINT);
9665 continue;
9666 }
9667 if (dst_page->vmp_laundry) {
9668 vm_pageout_steal_laundry(dst_page, FALSE);
9669 }
9670
9671 if ((cntrl_flags & UPL_NEED_32BIT_ADDR) &&
9672 phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) {
9673 vm_page_t low_page;
9674 int refmod;
9675
9676 /*
9677 * support devices that can't DMA above 32 bits
9678 * by substituting pages from a pool of low address
9679 * memory for any pages we find above the 4G mark.
9680 * We can't substitute if the page is already wired because
9681 * we don't know whether that physical address has been
9682 * handed out to some other 64 bit capable DMA device to use
9683 */
9684 if (VM_PAGE_WIRED(dst_page)) {
9685 ret = KERN_PROTECTION_FAILURE;
9686 goto return_err;
9687 }
9688 low_page = vm_page_grablo();
9689
9690 if (low_page == VM_PAGE_NULL) {
9691 ret = KERN_RESOURCE_SHORTAGE;
9692 goto return_err;
9693 }
9694 /*
9695 * from here until the vm_page_replace completes
9696 * we mustn't drop the object lock... we don't
9697 * want anyone refaulting this page in and using
9698 * it after we disconnect it... we want the fault
9699 * to find the new page being substituted.
9700 */
9701 if (dst_page->vmp_pmapped) {
9702 refmod = pmap_disconnect(phys_page);
9703 } else {
9704 refmod = 0;
9705 }
9706
9707 if (!dst_page->vmp_absent) {
9708 vm_page_copy(dst_page, low_page);
9709 }
9710
9711 low_page->vmp_reference = dst_page->vmp_reference;
9712 low_page->vmp_dirty = dst_page->vmp_dirty;
9713 low_page->vmp_absent = dst_page->vmp_absent;
9714
9715 if (refmod & VM_MEM_REFERENCED) {
9716 low_page->vmp_reference = TRUE;
9717 }
9718 if (refmod & VM_MEM_MODIFIED) {
9719 SET_PAGE_DIRTY(low_page, FALSE);
9720 }
9721
9722 vm_page_replace(low_page, object, dst_offset);
9723
9724 dst_page = low_page;
9725 /*
9726 * vm_page_grablo returned the page marked
9727 * BUSY... we don't need a PAGE_WAKEUP_DONE
9728 * here, because we've never dropped the object lock
9729 */
9730 if (!dst_page->vmp_absent) {
9731 dst_page->vmp_busy = FALSE;
9732 }
9733
9734 phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9735 }
9736 if (!dst_page->vmp_busy) {
9737 dwp->dw_mask |= DW_vm_page_wire;
9738 }
9739
9740 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9741 /*
9742 * Mark the page "busy" to block any future page fault
9743 * on this page in addition to wiring it.
9744 * We'll also remove the mapping
9745 * of all these pages before leaving this routine.
9746 */
9747 assert(!dst_page->vmp_fictitious);
9748 dst_page->vmp_busy = TRUE;
9749 }
9750 /*
9751 * expect the page to be used
9752 * page queues lock must be held to set 'reference'
9753 */
9754 dwp->dw_mask |= DW_set_reference;
9755
9756 if (!(cntrl_flags & UPL_COPYOUT_FROM)) {
9757 SET_PAGE_DIRTY(dst_page, TRUE);
9758 /*
9759 * Page belonging to a code-signed object is about to
9760 * be written. Mark it tainted and disconnect it from
9761 * all pmaps so processes have to fault it back in and
9762 * deal with the tainted bit.
9763 */
9764 if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) {
9765 dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE;
9766 vm_page_iopl_tainted++;
9767 if (dst_page->vmp_pmapped) {
9768 int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
9769 if (refmod & VM_MEM_REFERENCED) {
9770 dst_page->vmp_reference = TRUE;
9771 }
9772 }
9773 }
9774 }
9775 if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) {
9776 pmap_sync_page_attributes_phys(phys_page);
9777 dst_page->vmp_written_by_kernel = FALSE;
9778 }
9779
9780record_phys_addr:
9781 if (dst_page->vmp_busy) {
9782 upl->flags |= UPL_HAS_BUSY;
9783 }
9784
9785 bitmap_set(upl->lite_list, entry);
9786
9787 if (phys_page > upl->highest_page) {
9788 upl->highest_page = phys_page;
9789 }
9790
9791 if (user_page_list) {
9792 user_page_list[entry].phys_addr = phys_page;
9793 user_page_list[entry].free_when_done = dst_page->vmp_free_when_done;
9794 user_page_list[entry].absent = dst_page->vmp_absent;
9795 user_page_list[entry].dirty = dst_page->vmp_dirty;
9796 user_page_list[entry].precious = dst_page->vmp_precious;
9797 user_page_list[entry].device = FALSE;
9798 user_page_list[entry].needed = FALSE;
9799 if (dst_page->vmp_clustered == TRUE) {
9800 user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE;
9801 } else {
9802 user_page_list[entry].speculative = FALSE;
9803 }
9804 user_page_list[entry].cs_validated = dst_page->vmp_cs_validated;
9805 user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted;
9806 user_page_list[entry].cs_nx = dst_page->vmp_cs_nx;
9807 user_page_list[entry].mark = FALSE;
9808 }
9809 if (!is_kernel_object(object) && object != compressor_object) {
9810 /*
9811 * someone is explicitly grabbing this page...
9812 * update clustered and speculative state
9813 *
9814 */
9815 if (dst_page->vmp_clustered) {
9816 VM_PAGE_CONSUME_CLUSTERED(dst_page);
9817 }
9818 }
9819skip_page:
9820 entry++;
9821 dst_offset += PAGE_SIZE_64;
9822 xfer_size -= PAGE_SIZE;
9823
9824 if (dwp->dw_mask) {
9825 VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count);
9826
9827 if (dw_count >= dw_limit) {
9828 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9829
9830 dwp = dwp_start;
9831 dw_count = 0;
9832 }
9833 }
9834 }
9835 assert(entry == size_in_pages);
9836
9837 if (dw_count) {
9838 vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9839 dwp = dwp_start;
9840 dw_count = 0;
9841 }
9842finish:
9843 if (user_page_list && set_cache_attr_needed == TRUE) {
9844 vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9845 }
9846
9847 if (page_list_count != NULL) {
9848 if (upl->flags & UPL_INTERNAL) {
9849 *page_list_count = 0;
9850 } else if (*page_list_count > size_in_pages) {
9851 *page_list_count = size_in_pages;
9852 }
9853 }
9854 vm_object_unlock(object);
9855
9856 if (cntrl_flags & UPL_BLOCK_ACCESS) {
9857 /*
9858 * We've marked all the pages "busy" so that future
9859 * page faults will block.
9860 * Now remove the mapping for these pages, so that they
9861 * can't be accessed without causing a page fault.
9862 */
9863 vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
9864 PMAP_NULL,
9865 PAGE_SIZE,
9866 0, VM_PROT_NONE);
9867 assert(!object->blocked_access);
9868 object->blocked_access = TRUE;
9869 }
9870
9871 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0);
9872#if DEVELOPMENT || DEBUG
9873 if (task != NULL) {
9874 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9875 }
9876#endif /* DEVELOPMENT || DEBUG */
9877
9878 if (dwp_start && dwp_finish_ctx) {
9879 vm_page_delayed_work_finish_ctx(dwp_start);
9880 dwp_start = dwp = NULL;
9881 }
9882
9883 return KERN_SUCCESS;
9884
9885return_err:
9886 dw_index = 0;
9887
9888 for (; offset < dst_offset; offset += PAGE_SIZE) {
9889 boolean_t need_unwire;
9890
9891 dst_page = vm_page_lookup(object, offset);
9892
9893 if (dst_page == VM_PAGE_NULL) {
9894 panic("vm_object_iopl_request: Wired page missing.");
9895 }
9896
9897 /*
9898 * if we've already processed this page in an earlier
9899 * dw_do_work, we need to undo the wiring... we will
9900 * leave the dirty and reference bits on if they
9901 * were set, since we don't have a good way of knowing
9902 * what the previous state was and we won't get here
9903 * under any normal circumstances... we will always
9904 * clear BUSY and wakeup any waiters via vm_page_free
9905 * or PAGE_WAKEUP_DONE
9906 */
9907 need_unwire = TRUE;
9908
9909 if (dw_count) {
9910 if ((dwp_start)[dw_index].dw_m == dst_page) {
9911 /*
9912 * still in the deferred work list
9913 * which means we haven't yet called
9914 * vm_page_wire on this page
9915 */
9916 need_unwire = FALSE;
9917
9918 dw_index++;
9919 dw_count--;
9920 }
9921 }
9922 vm_page_lock_queues();
9923
9924 if (dst_page->vmp_absent || free_wired_pages == TRUE) {
9925 vm_page_free(dst_page);
9926
9927 need_unwire = FALSE;
9928 } else {
9929 if (need_unwire == TRUE) {
9930 vm_page_unwire(dst_page, TRUE);
9931 }
9932
9933 PAGE_WAKEUP_DONE(dst_page);
9934 }
9935 vm_page_unlock_queues();
9936
9937 if (need_unwire == TRUE) {
9938 counter_inc(&vm_statistics_reactivations);
9939 }
9940 }
9941#if UPL_DEBUG
9942 upl->upl_state = 2;
9943#endif
9944 if (!(upl->flags & UPL_KERNEL_OBJECT)) {
9945 vm_object_activity_end(object);
9946 vm_object_collapse(object, 0, TRUE);
9947 }
9948 vm_object_unlock(object);
9949 upl_destroy(upl);
9950
9951 VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0);
9952#if DEVELOPMENT || DEBUG
9953 if (task != NULL) {
9954 ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count);
9955 }
9956#endif /* DEVELOPMENT || DEBUG */
9957
9958 if (dwp_start && dwp_finish_ctx) {
9959 vm_page_delayed_work_finish_ctx(dwp_start);
9960 dwp_start = dwp = NULL;
9961 }
9962 return ret;
9963}
9964
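/*
 * Editor's usage sketch (illustrative; the flag combination and the release
 * path are assumptions about a typical caller, not taken from this file):
 * an in-kernel client wires a range for I/O with an IOPL and releases it
 * once the transfer completes.
 *
 *	upl_t upl = NULL;
 *	unsigned int count = 0;
 *	kern_return_t kr;
 *
 *	kr = vm_object_iopl_request(object, offset, size, &upl, NULL, &count,
 *	    UPL_SET_INTERNAL | UPL_SET_LITE | UPL_SET_IO_WIRE,
 *	    VM_KERN_MEMORY_NONE);
 *	if (kr == KERN_SUCCESS) {
 *		// ... perform the I/O against the wired pages ...
 *		upl_commit(upl, NULL, 0);
 *		upl_deallocate(upl);
 *	}
 */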
9965kern_return_t
9966upl_transpose(
9967 upl_t upl1,
9968 upl_t upl2)
9969{
9970 kern_return_t retval;
9971 boolean_t upls_locked;
9972 vm_object_t object1, object2;
9973
9974 /* LD: Should mapped UPLs be eligible for a transpose? */
9975 if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) {
9976 return KERN_INVALID_ARGUMENT;
9977 }
9978
9979 upls_locked = FALSE;
9980
9981 /*
9982 * Since we need to lock both UPLs at the same time,
9983 * avoid deadlocks by always taking locks in the same order.
9984 */
9985 if (upl1 < upl2) {
9986 upl_lock(upl1);
9987 upl_lock(upl2);
9988 } else {
9989 upl_lock(upl2);
9990 upl_lock(upl1);
9991 }
9992 upls_locked = TRUE; /* the UPLs will need to be unlocked */
9993
9994 object1 = upl1->map_object;
9995 object2 = upl2->map_object;
9996
9997 if (upl1->u_offset != 0 || upl2->u_offset != 0 ||
9998 upl1->u_size != upl2->u_size) {
9999 /*
10000 * We deal only with full objects, not subsets.
10001 * That's because we exchange the entire backing store info
10002 * for the objects: pager, resident pages, etc... We can't do
10003 * only part of it.
10004 */
10005 retval = KERN_INVALID_VALUE;
10006 goto done;
10007 }
10008
10009 /*
10010 * Transpose the VM objects' backing store.
10011 */
10012 retval = vm_object_transpose(object1, object2,
10013 upl_adjusted_size(upl1, PAGE_MASK));
10014
10015 if (retval == KERN_SUCCESS) {
10016 /*
10017 * Make each UPL point to the correct VM object, i.e. the
10018 * object holding the pages that the UPL refers to...
10019 */
10020#if CONFIG_IOSCHED || UPL_DEBUG
10021 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10022 vm_object_lock(object1);
10023 vm_object_lock(object2);
10024 }
10025 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10026 queue_remove(&object1->uplq, upl1, upl_t, uplq);
10027 }
10028 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10029 queue_remove(&object2->uplq, upl2, upl_t, uplq);
10030 }
10031#endif
10032 upl1->map_object = object2;
10033 upl2->map_object = object1;
10034
10035#if CONFIG_IOSCHED || UPL_DEBUG
10036 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10037 queue_enter(&object2->uplq, upl1, upl_t, uplq);
10038 }
10039 if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) {
10040 queue_enter(&object1->uplq, upl2, upl_t, uplq);
10041 }
10042 if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) {
10043 vm_object_unlock(object2);
10044 vm_object_unlock(object1);
10045 }
10046#endif
10047 }
10048
10049done:
10050 /*
10051 * Cleanup.
10052 */
10053 if (upls_locked) {
10054 upl_unlock(upl1);
10055 upl_unlock(upl2);
10056 upls_locked = FALSE;
10057 }
10058
10059 return retval;
10060}
10061
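/*
 * Editor's note (sketch): upl_transpose() avoids an ABBA deadlock by always
 * acquiring the two UPL locks in pointer order, the standard idiom for any
 * pair of locks taken together:
 *
 *	if (a < b) {
 *		upl_lock(a);
 *		upl_lock(b);
 *	} else {
 *		upl_lock(b);
 *		upl_lock(a);
 *	}
 *	// ... work on both ...
 *	upl_unlock(a);
 *	upl_unlock(b);
 */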
10062void
10063upl_range_needed(
10064 upl_t upl,
10065 int index,
10066 int count)
10067{
10068 int size_in_pages;
10069
10070 if (!(upl->flags & UPL_INTERNAL) || count <= 0) {
10071 return;
10072 }
10073
10074 size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE;
10075
10076 while (count-- && index < size_in_pages) {
10077 upl->page_list[index++].needed = TRUE;
10078 }
10079}
10080
10081
10082/*
10083 * Reserve of virtual addresses in the kernel address space.
10084 * We need to map the physical pages in the kernel, so that we
10085 * can call the code-signing or slide routines with a kernel
10086 * virtual address. We keep this pool of pre-allocated kernel
10087 * virtual addresses so that we don't have to scan the kernel's
10088 * virtual address space each time we need to work with
10089 * a physical page.
10090 */
10091SIMPLE_LOCK_DECLARE(vm_paging_lock, 0);
10092#define VM_PAGING_NUM_PAGES 64
10093SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0;
10094bool vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, };
10095int vm_paging_max_index = 0;
10096int vm_paging_page_waiter = 0;
10097int vm_paging_page_waiter_total = 0;
10098
10099unsigned long vm_paging_no_kernel_page = 0;
10100unsigned long vm_paging_objects_mapped = 0;
10101unsigned long vm_paging_pages_mapped = 0;
10102unsigned long vm_paging_objects_mapped_slow = 0;
10103unsigned long vm_paging_pages_mapped_slow = 0;
10104
10105__startup_func
10106static void
10107vm_paging_map_init(void)
10108{
10109 kmem_alloc(kernel_map, &vm_paging_base_address,
10110 ptoa(VM_PAGING_NUM_PAGES),
10111 KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
10112 VM_KERN_MEMORY_NONE);
10113}
10114STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init);
10115
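/*
 * Editor's sketch of the pool addressing (derived from
 * vm_paging_map_object() and vm_paging_unmap_object() below): slot i of the
 * reserve corresponds to a fixed kernel virtual address, and an address
 * maps back to its slot, with vm_paging_page_inuse[] tracking ownership
 * under vm_paging_lock:
 *
 *	vm_map_offset_t va   = vm_paging_base_address + (i * PAGE_SIZE);
 *	int             slot = (int)((va - vm_paging_base_address) >> PAGE_SHIFT);
 */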
10116/*
10117 * vm_paging_map_object:
10118 * Maps part of a VM object's pages in the kernel
10119 * virtual address space, using the pre-allocated
10120 * kernel virtual addresses, if possible.
10121 * Context:
10122 * The VM object is locked. This lock will get
10123 * dropped and re-acquired though, so the caller
10124 * must make sure the VM object is kept alive
10125 * (by holding a VM map that has a reference
10126 * on it, for example, or taking an extra reference).
10127 * The page should also be kept busy to prevent
10128 * it from being reclaimed.
10129 */
10130kern_return_t
10131vm_paging_map_object(
10132 vm_page_t page,
10133 vm_object_t object,
10134 vm_object_offset_t offset,
10135 vm_prot_t protection,
10136 boolean_t can_unlock_object,
10137 vm_map_size_t *size, /* IN/OUT */
10138 vm_map_offset_t *address, /* OUT */
10139 boolean_t *need_unmap) /* OUT */
10140{
10141 kern_return_t kr;
10142 vm_map_offset_t page_map_offset;
10143 vm_map_size_t map_size;
10144 vm_object_offset_t object_offset;
10145 int i;
10146
10147 if (page != VM_PAGE_NULL && *size == PAGE_SIZE) {
10148 /* use permanent 1-to-1 kernel mapping of physical memory ? */
10149 *address = (vm_map_offset_t)
10150 phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10151 *need_unmap = FALSE;
10152 return KERN_SUCCESS;
10153
10154 assert(page->vmp_busy);
10155 /*
10156 * Use one of the pre-allocated kernel virtual addresses
10157 * and just enter the VM page in the kernel address space
10158 * at that virtual address.
10159 */
10160 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10161
10162 /*
10163 * Try and find an available kernel virtual address
10164 * from our pre-allocated pool.
10165 */
10166 page_map_offset = 0;
10167 for (;;) {
10168 for (i = 0; i < VM_PAGING_NUM_PAGES; i++) {
10169 if (vm_paging_page_inuse[i] == FALSE) {
10170 page_map_offset =
10171 vm_paging_base_address +
10172 (i * PAGE_SIZE);
10173 break;
10174 }
10175 }
10176 if (page_map_offset != 0) {
10177 /* found a space to map our page ! */
10178 break;
10179 }
10180
10181 if (can_unlock_object) {
10182 /*
10183 * If we can afford to unlock the VM object,
10184 * let's take the slow path now...
10185 */
10186 break;
10187 }
10188 /*
10189 * We can't afford to unlock the VM object, so
10190 * let's wait for a space to become available...
10191 */
10192 vm_paging_page_waiter_total++;
10193 vm_paging_page_waiter++;
10194 kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10195 if (kr == THREAD_WAITING) {
10196 simple_unlock(&vm_paging_lock);
10197 kr = thread_block(THREAD_CONTINUE_NULL);
10198 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10199 }
10200 vm_paging_page_waiter--;
10201 /* ... and try again */
10202 }
10203
10204 if (page_map_offset != 0) {
10205 /*
10206 * We found a kernel virtual address;
10207 * map the physical page to that virtual address.
10208 */
10209 if (i > vm_paging_max_index) {
10210 vm_paging_max_index = i;
10211 }
10212 vm_paging_page_inuse[i] = TRUE;
10213 simple_unlock(&vm_paging_lock);
10214
10215 page->vmp_pmapped = TRUE;
10216
10217 /*
10218 * Keep the VM object locked over the PMAP_ENTER
10219 * and the actual use of the page by the kernel,
10220 * or this pmap mapping might get undone by a
10221 * vm_object_pmap_protect() call...
10222 */
10223 kr = pmap_enter_check(kernel_pmap,
10224 page_map_offset,
10225 page,
10226 protection,
10227 VM_PROT_NONE,
10228 0,
10229 TRUE);
10230 assert(kr == KERN_SUCCESS);
10231 vm_paging_objects_mapped++;
10232 vm_paging_pages_mapped++;
10233 *address = page_map_offset;
10234 *need_unmap = TRUE;
10235
10236#if KASAN
10237 kasan_notify_address(page_map_offset, PAGE_SIZE);
10238#endif
10239
10240 /* all done and mapped, ready to use ! */
10241 return KERN_SUCCESS;
10242 }
10243
10244 /*
10245 * We ran out of pre-allocated kernel virtual
10246 * addresses. Just map the page in the kernel
10247 * the slow and regular way.
10248 */
10249 vm_paging_no_kernel_page++;
10250 simple_unlock(&vm_paging_lock);
10251 }
10252
10253 if (!can_unlock_object) {
10254 *address = 0;
10255 *size = 0;
10256 *need_unmap = FALSE;
10257 return KERN_NOT_SUPPORTED;
10258 }
10259
10260 object_offset = vm_object_trunc_page(offset);
10261 map_size = vm_map_round_page(*size,
10262 VM_MAP_PAGE_MASK(kernel_map));
10263
10264 /*
10265 * Try and map the required range of the object
10266 * in the kernel_map. Given that allocation is
10267 * for pageable memory, it shouldn't contain
10268 * pointers and is mapped into the data range.
10269 */
10270
10271 vm_object_reference_locked(object); /* for the map entry */
10272 vm_object_unlock(object);
10273
10274 kr = vm_map_enter(kernel_map,
10275 address,
10276 map_size,
10277 0,
10278 VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
10279 object,
10280 object_offset,
10281 FALSE,
10282 protection,
10283 VM_PROT_ALL,
10284 VM_INHERIT_NONE);
10285 if (kr != KERN_SUCCESS) {
10286 *address = 0;
10287 *size = 0;
10288 *need_unmap = FALSE;
10289 vm_object_deallocate(object); /* for the map entry */
10290 vm_object_lock(object);
10291 return kr;
10292 }
10293
10294 *size = map_size;
10295
10296 /*
10297 * Enter the mapped pages in the page table now.
10298 */
10299 vm_object_lock(object);
10300 /*
10301 * VM object must be kept locked from before PMAP_ENTER()
10302 * until after the kernel is done accessing the page(s).
10303 * Otherwise, the pmap mappings in the kernel could be
10304 * undone by a call to vm_object_pmap_protect().
10305 */
10306
10307 for (page_map_offset = 0;
10308 map_size != 0;
10309 map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) {
10310 page = vm_page_lookup(object, offset + page_map_offset);
10311 if (page == VM_PAGE_NULL) {
10312 printf("vm_paging_map_object: no page !?");
10313 vm_object_unlock(object);
10314 vm_map_remove(kernel_map, *address, *size);
10315 *address = 0;
10316 *size = 0;
10317 *need_unmap = FALSE;
10318 vm_object_lock(object);
10319 return KERN_MEMORY_ERROR;
10320 }
10321 page->vmp_pmapped = TRUE;
10322
10323 kr = pmap_enter_check(kernel_pmap,
10324 *address + page_map_offset,
10325 page,
10326 protection,
10327 VM_PROT_NONE,
10328 0,
10329 TRUE);
10330 assert(kr == KERN_SUCCESS);
10331#if KASAN
10332 kasan_notify_address(*address + page_map_offset, PAGE_SIZE);
10333#endif
10334 }
10335
10336 vm_paging_objects_mapped_slow++;
10337 vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64);
10338
10339 *need_unmap = TRUE;
10340
10341 return KERN_SUCCESS;
10342}
10343
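/*
 * Editor's usage sketch (illustrative; the caller context is assumed):
 * vm_paging_map_object() is paired with vm_paging_unmap_object() only when
 * a transient mapping was actually established, as reported via need_unmap.
 * The object must be locked and the page kept busy across the call:
 *
 *	vm_map_size_t   map_size = PAGE_SIZE;
 *	vm_map_offset_t kva = 0;
 *	boolean_t       need_unmap = FALSE;
 *	kern_return_t   kr;
 *
 *	kr = vm_paging_map_object(page, object, offset, VM_PROT_READ,
 *	    FALSE, &map_size, &kva, &need_unmap);
 *	if (kr == KERN_SUCCESS) {
 *		// ... access the page contents through kva ...
 *		if (need_unmap) {
 *			vm_paging_unmap_object(object, kva, kva + map_size);
 *		}
 *	}
 */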
10344/*
10345 * vm_paging_unmap_object:
10346 * Unmaps part of a VM object's pages from the kernel
10347 * virtual address space.
10348 * Context:
10349 * The VM object is locked. This lock will get
10350 * dropped and re-acquired though.
10351 */
10352void
10353vm_paging_unmap_object(
10354 vm_object_t object,
10355 vm_map_offset_t start,
10356 vm_map_offset_t end)
10357{
10358 int i;
10359
10360 if ((vm_paging_base_address == 0) ||
10361 (start < vm_paging_base_address) ||
10362 (end > (vm_paging_base_address
10363 + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) {
10364 /*
10365 * We didn't use our pre-allocated pool of
10366 * kernel virtual address. Deallocate the
10367 * virtual memory.
10368 */
10369 if (object != VM_OBJECT_NULL) {
10370 vm_object_unlock(object);
10371 }
10372 vm_map_remove(kernel_map, start, end);
10373 if (object != VM_OBJECT_NULL) {
10374 vm_object_lock(object);
10375 }
10376 } else {
10377 /*
10378 * We used a kernel virtual address from our
10379 * pre-allocated pool. Put it back in the pool
10380 * for next time.
10381 */
10382 assert(end - start == PAGE_SIZE);
10383 i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT);
10384 assert(i >= 0 && i < VM_PAGING_NUM_PAGES);
10385
10386 /* undo the pmap mapping */
10387 pmap_remove(kernel_pmap, start, end);
10388
10389 simple_lock(&vm_paging_lock, &vm_pageout_lck_grp);
10390 vm_paging_page_inuse[i] = FALSE;
10391 if (vm_paging_page_waiter) {
10392 thread_wakeup(&vm_paging_page_waiter);
10393 }
10394 simple_unlock(&vm_paging_lock);
10395 }
10396}
10397
10398
10399/*
10400 * page->vmp_object must be locked
10401 */
10402void
10403vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked)
10404{
10405 if (!queues_locked) {
10406 vm_page_lockspin_queues();
10407 }
10408
10409 page->vmp_free_when_done = FALSE;
10410 /*
10411 * need to drop the laundry count...
10412 * we may also need to remove it
10413 * from the I/O paging queue...
10414 * vm_pageout_throttle_up handles both cases
10415 *
10416 * the laundry and pageout_queue flags are cleared...
10417 */
10418 vm_pageout_throttle_up(page);
10419
10420 if (!queues_locked) {
10421 vm_page_unlock_queues();
10422 }
10423}
10424
10425#define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64
10426
10427upl_t
10428vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls)
10429{
10430 int i = 0;
10431 upl_t upl;
10432
10433 assert(max_upls > 0);
10434 if (max_upls == 0) {
10435 return NULL;
10436 }
10437
10438 if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) {
10439 max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT;
10440 }
10441 vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL);
10442
10443 upl = upl_create(0, UPL_VECTOR, 0);
10444 upl->vector_upl = vector_upl;
10445 upl->u_offset = upl_offset;
10446 vector_upl->size = 0;
10447 vector_upl->offset = upl_offset;
10448 vector_upl->invalid_upls = 0;
10449 vector_upl->num_upls = 0;
10450 vector_upl->pagelist = NULL;
10451 vector_upl->max_upls = max_upls;
10452
10453 for (i = 0; i < max_upls; i++) {
10454 vector_upl->upls[i].iostate.size = 0;
10455 vector_upl->upls[i].iostate.offset = 0;
10456 }
10457 return upl;
10458}
10459
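/*
 * Editor's sketch (assumed caller pattern, based only on the helpers
 * defined below): a vectored UPL is assembled by appending sub-UPLs, each
 * with its I/O state, and then materializing the combined page list:
 *
 *	upl_t vupl = vector_upl_create(0, n_elems);
 *
 *	// for each sub-UPL "s" covering io_size bytes at cur_offset:
 *	//	vector_upl_set_subupl(vupl, s, io_size);
 *	//	vector_upl_set_iostate(vupl, s, cur_offset, io_size);
 *
 *	vector_upl_set_pagelist(vupl);
 */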
10460uint32_t
10461vector_upl_max_upls(const upl_t upl)
10462{
10463 if (!vector_upl_is_valid(upl)) {
10464 return 0;
10465 }
10466 return ((vector_upl_t)(upl->vector_upl))->max_upls;
10467}
10468
10469void
10470vector_upl_deallocate(upl_t upl)
10471{
10472 vector_upl_t vector_upl = upl->vector_upl;
10473
10474 assert(vector_upl_is_valid(upl));
10475
10476 if (vector_upl->invalid_upls != vector_upl->num_upls) {
10477 panic("Deallocating non-empty Vectored UPL");
10478 }
10479 uint32_t max_upls = vector_upl->max_upls;
10480 kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist);
10481 kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl);
10482 upl->vector_upl = NULL;
10483}
10484
10485boolean_t
10486vector_upl_is_valid(upl_t upl)
10487{
10488 return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl;
10489}
10490
10491boolean_t
10492vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size)
10493{
10494 if (vector_upl_is_valid(upl)) {
10495 vector_upl_t vector_upl = upl->vector_upl;
10496
10497 if (vector_upl) {
10498 if (subupl) {
10499 if (io_size) {
10500 if (io_size < PAGE_SIZE) {
10501 io_size = PAGE_SIZE;
10502 }
10503 subupl->vector_upl = (void*)vector_upl;
10504 vector_upl->upls[vector_upl->num_upls++].elem = subupl;
10505 vector_upl->size += io_size;
10506 upl->u_size += io_size;
10507 } else {
10508 uint32_t i = 0, invalid_upls = 0;
10509 for (i = 0; i < vector_upl->num_upls; i++) {
10510 if (vector_upl->upls[i].elem == subupl) {
10511 break;
10512 }
10513 }
10514 if (i == vector_upl->num_upls) {
10515 panic("Trying to remove sub-upl when none exists");
10516 }
10517
10518 vector_upl->upls[i].elem = NULL;
10519 invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls,
10520 relaxed);
10521 if (invalid_upls == vector_upl->num_upls) {
10522 return TRUE;
10523 } else {
10524 return FALSE;
10525 }
10526 }
10527 } else {
10528 panic("vector_upl_set_subupl was passed a NULL upl element");
10529 }
10530 } else {
10531 panic("vector_upl_set_subupl was passed a non-vectored upl");
10532 }
10533 } else {
10534 panic("vector_upl_set_subupl was passed a NULL upl");
10535 }
10536
10537 return FALSE;
10538}
10539
10540void
10541vector_upl_set_pagelist(upl_t upl)
10542{
10543 if (vector_upl_is_valid(upl)) {
10544 uint32_t i = 0;
10545 vector_upl_t vector_upl = upl->vector_upl;
10546
10547 if (vector_upl) {
10548 vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0;
10549
10550 vector_upl->pagelist = kalloc_type(struct upl_page_info,
10551 atop(vector_upl->size), Z_WAITOK);
10552
10553 for (i = 0; i < vector_upl->num_upls; i++) {
10554 cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
10555 bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10556 pagelist_size += cur_upl_pagelist_size;
10557 if (vector_upl->upls[i].elem->highest_page > upl->highest_page) {
10558 upl->highest_page = vector_upl->upls[i].elem->highest_page;
10559 }
10560 }
10561 assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE)));
10562 } else {
10563 panic("vector_upl_set_pagelist was passed a non-vectored upl");
10564 }
10565 } else {
10566 panic("vector_upl_set_pagelist was passed a NULL upl");
10567 }
10568}
10569
10570upl_t
10571vector_upl_subupl_byindex(upl_t upl, uint32_t index)
10572{
10573 if (vector_upl_is_valid(upl)) {
10574 vector_upl_t vector_upl = upl->vector_upl;
10575 if (vector_upl) {
10576 if (index < vector_upl->num_upls) {
10577 return vector_upl->upls[index].elem;
10578 }
10579 } else {
10580 panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10581 }
10582 }
10583 return NULL;
10584}
10585
10586upl_t
10587vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size)
10588{
10589 if (vector_upl_is_valid(upl)) {
10590 uint32_t i = 0;
10591 vector_upl_t vector_upl = upl->vector_upl;
10592
10593 if (vector_upl) {
10594 upl_t subupl = NULL;
10595 vector_upl_iostates_t subupl_state;
10596
10597 for (i = 0; i < vector_upl->num_upls; i++) {
10598 subupl = vector_upl->upls[i].elem;
10599 subupl_state = vector_upl->upls[i].iostate;
10600 if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) {
10601 /* We could have been passed an offset/size pair that belongs
10602 * to an UPL element that has already been committed/aborted.
10603 * If so, return NULL.
10604 */
10605 if (subupl == NULL) {
10606 return NULL;
10607 }
10608 if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) {
10609 *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset;
10610 if (*upl_size > subupl_state.size) {
10611 *upl_size = subupl_state.size;
10612 }
10613 }
10614 if (*upl_offset >= subupl_state.offset) {
10615 *upl_offset -= subupl_state.offset;
10616 } else if (i) {
10617 panic("Vector UPL offset miscalculation");
10618 }
10619 return subupl;
10620 }
10621 }
10622 } else {
10623 panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
10624 }
10625 }
10626 return NULL;
10627}
10628
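/*
 * Worked example (editor's illustration): with two sub-UPLs whose iostates
 * are {offset 0, size 0x4000} and {offset 0x4000, size 0x8000}, a lookup
 * with *upl_offset == 0x5000 falls inside the second element, so the
 * routine above returns that sub-UPL and rewrites *upl_offset to
 * 0x5000 - 0x4000 = 0x1000, clipping *upl_size to what remains of that
 * element if the request runs past its end.
 */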
10629void
10630vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr)
10631{
10632 *v_upl_submap = NULL;
10633
10634 if (vector_upl_is_valid(upl)) {
10635 vector_upl_t vector_upl = upl->vector_upl;
10636 if (vector_upl) {
10637 *v_upl_submap = vector_upl->submap;
10638 *submap_dst_addr = vector_upl->submap_dst_addr;
10639 } else {
10640 panic("vector_upl_get_submap was passed a non-vectored UPL");
10641 }
10642 } else {
10643 panic("vector_upl_get_submap was passed a null UPL");
10644 }
10645}
10646
10647void
10648vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr)
10649{
10650 if (vector_upl_is_valid(upl)) {
10651 vector_upl_t vector_upl = upl->vector_upl;
10652 if (vector_upl) {
10653 vector_upl->submap = submap;
10654 vector_upl->submap_dst_addr = submap_dst_addr;
10655 } else {
10656 panic("vector_upl_get_submap was passed a non-vectored UPL");
10657 }
10658 } else {
10659 panic("vector_upl_get_submap was passed a NULL UPL");
10660 }
10661}
10662
10663void
10664vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size)
10665{
10666 if (vector_upl_is_valid(upl)) {
10667 uint32_t i = 0;
10668 vector_upl_t vector_upl = upl->vector_upl;
10669
10670 if (vector_upl) {
10671 for (i = 0; i < vector_upl->num_upls; i++) {
10672 if (vector_upl->upls[i].elem == subupl) {
10673 break;
10674 }
10675 }
10676
10677 if (i == vector_upl->num_upls) {
10678 panic("setting sub-upl iostate when none exists");
10679 }
10680
10681 vector_upl->upls[i].iostate.offset = offset;
10682 if (size < PAGE_SIZE) {
10683 size = PAGE_SIZE;
10684 }
10685 vector_upl->upls[i].iostate.size = size;
10686 } else {
10687 panic("vector_upl_set_iostate was passed a non-vectored UPL");
10688 }
10689 } else {
10690 panic("vector_upl_set_iostate was passed a NULL UPL");
10691 }
10692}
10693
10694void
10695vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size)
10696{
10697 if (vector_upl_is_valid(upl)) {
10698 uint32_t i = 0;
10699 vector_upl_t vector_upl = upl->vector_upl;
10700
10701 if (vector_upl) {
10702 for (i = 0; i < vector_upl->num_upls; i++) {
10703 if (vector_upl->upls[i].elem == subupl) {
10704 break;
10705 }
10706 }
10707
10708 if (i == vector_upl->num_upls) {
10709 panic("getting sub-upl iostate when none exists");
10710 }
10711
10712 *offset = vector_upl->upls[i].iostate.offset;
10713 *size = vector_upl->upls[i].iostate.size;
10714 } else {
10715 panic("vector_upl_get_iostate was passed a non-vectored UPL");
10716 }
10717 } else {
10718 panic("vector_upl_get_iostate was passed a NULL UPL");
10719 }
10720}
10721
10722void
10723vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size)
10724{
10725 if (vector_upl_is_valid(upl)) {
10726 vector_upl_t vector_upl = upl->vector_upl;
10727 if (vector_upl) {
10728 if (index < vector_upl->num_upls) {
10729 *offset = vector_upl->upls[index].iostate.offset;
10730 *size = vector_upl->upls[index].iostate.size;
10731 } else {
10732 *offset = *size = 0;
10733 }
10734 } else {
10735 panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
10736 }
10737 } else {
10738 panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10739 }
10740}
10741
10742void *
10743upl_get_internal_vectorupl(upl_t upl)
10744{
10745 return upl->vector_upl;
10746}
10747
10748upl_page_info_t *
10749upl_get_internal_vectorupl_pagelist(upl_t upl)
10750{
10751 return upl->vector_upl->pagelist;
10752}
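
/*
 * Return the page list for a UPL: the shared vector page list if the UPL is
 * vectored, otherwise the UPL's own page_list.
 */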
10753
10754upl_page_info_t *
10755upl_get_internal_page_list(upl_t upl)
10756{
10757 return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list;
10758}
10759
10760void
10761upl_clear_dirty(
10762 upl_t upl,
10763 boolean_t value)
10764{
10765 if (value) {
10766 upl->flags |= UPL_CLEAR_DIRTY;
10767 } else {
10768 upl->flags &= ~UPL_CLEAR_DIRTY;
10769 }
10770}
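
/*
 * Adjust the UPL's external reference count under the UPL lock.  Passing
 * TRUE takes a reference; passing FALSE drops one and panics if no external
 * reference is outstanding.
 */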
10771
10772void
10773upl_set_referenced(
10774 upl_t upl,
10775 boolean_t value)
10776{
10777 upl_lock(upl);
10778 if (value) {
10779 upl->ext_ref_count++;
10780 } else {
10781 if (!upl->ext_ref_count) {
			panic("upl_set_referenced: no external reference to release on upl %p", upl);
10783 }
10784 upl->ext_ref_count--;
10785 }
10786 upl_unlock(upl);
10787}
10788
10789#if CONFIG_IOSCHED
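/*
 * Record the block number and I/O size for every page covered by
 * [upl_offset, upl_offset + io_size) so the I/O scheduler can reprioritize
 * the request later.  This is a no-op unless the UPL was created with
 * UPL_EXPEDITE_SUPPORTED set.
 */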
10790void
10791upl_set_blkno(
10792 upl_t upl,
10793 vm_offset_t upl_offset,
10794 int io_size,
10795 int64_t blkno)
10796{
10797 int i, j;
10798 if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) {
10799 return;
10800 }
10801
10802 assert(upl->upl_reprio_info != 0);
10803 for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) {
10804 UPL_SET_REPRIO_INFO(upl, i, blkno, io_size);
10805 }
10806}
10807#endif
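
/*
 * Emit a DBG_MACH_VM_PRESSURE kdebug event carrying the current page queue
 * counters.  Only active when vm_debug_events is set; otherwise the
 * arguments are ignored.
 */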
10808
inline void
10810memoryshot(unsigned int event, unsigned int control)
10811{
10812 if (vm_debug_events) {
10813 KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control,
10814 vm_page_active_count, vm_page_inactive_count,
10815 vm_page_free_count, vm_page_speculative_count,
10816 vm_page_throttled_count);
10817 } else {
10818 (void) event;
10819 (void) control;
10820 }
10821}
10822
10823#ifdef MACH_BSD
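
/*
 * Exported accessors for upl_page_info_t fields; most are thin wrappers
 * around the corresponding UPL_* macros.
 */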
10824
10825boolean_t
10826upl_device_page(upl_page_info_t *upl)
10827{
10828 return UPL_DEVICE_PAGE(upl);
10829}
10830boolean_t
10831upl_page_present(upl_page_info_t *upl, int index)
10832{
10833 return UPL_PAGE_PRESENT(upl, index);
10834}
10835boolean_t
10836upl_speculative_page(upl_page_info_t *upl, int index)
10837{
10838 return UPL_SPECULATIVE_PAGE(upl, index);
10839}
10840boolean_t
10841upl_dirty_page(upl_page_info_t *upl, int index)
10842{
10843 return UPL_DIRTY_PAGE(upl, index);
10844}
10845boolean_t
10846upl_valid_page(upl_page_info_t *upl, int index)
10847{
10848 return UPL_VALID_PAGE(upl, index);
10849}
10850ppnum_t
10851upl_phys_page(upl_page_info_t *upl, int index)
10852{
10853 return UPL_PHYS_PAGE(upl, index);
10854}
10855
10856void
10857upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v)
10858{
10859 upl[index].mark = v;
10860}
10861
10862boolean_t
10863upl_page_get_mark(upl_page_info_t *upl, int index)
10864{
10865 return upl[index].mark;
10866}
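
/*
 * Debugging aid: walk the inactive, throttled, and anonymous queues, then
 * the active queue, under the page queue lock and print the number of
 * dirty, free-when-done (pageout), and precious pages found on each pass.
 */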
10867
10868void
10869vm_countdirtypages(void)
10870{
10871 vm_page_t m;
10872 int dpages;
10873 int pgopages;
10874 int precpages;
10875
10876
10877 dpages = 0;
10878 pgopages = 0;
10879 precpages = 0;
10880
10881 vm_page_lock_queues();
10882 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive);
10883 do {
10884 if (m == (vm_page_t)0) {
10885 break;
10886 }
10887
10888 if (m->vmp_dirty) {
10889 dpages++;
10890 }
10891 if (m->vmp_free_when_done) {
10892 pgopages++;
10893 }
10894 if (m->vmp_precious) {
10895 precpages++;
10896 }
10897
10898 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10899 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10900 if (m == (vm_page_t)0) {
10901 break;
10902 }
10903 } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m));
10904 vm_page_unlock_queues();
10905
10906 vm_page_lock_queues();
10907 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled);
10908 do {
10909 if (m == (vm_page_t)0) {
10910 break;
10911 }
10912
10913 dpages++;
10914 assert(m->vmp_dirty);
10915 assert(!m->vmp_free_when_done);
10916 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10917 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10918 if (m == (vm_page_t)0) {
10919 break;
10920 }
10921 } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m));
10922 vm_page_unlock_queues();
10923
10924 vm_page_lock_queues();
10925 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous);
10926 do {
10927 if (m == (vm_page_t)0) {
10928 break;
10929 }
10930
10931 if (m->vmp_dirty) {
10932 dpages++;
10933 }
10934 if (m->vmp_free_when_done) {
10935 pgopages++;
10936 }
10937 if (m->vmp_precious) {
10938 precpages++;
10939 }
10940
10941 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10942 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10943 if (m == (vm_page_t)0) {
10944 break;
10945 }
10946 } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m));
10947 vm_page_unlock_queues();
10948
	printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10950
10951 dpages = 0;
10952 pgopages = 0;
10953 precpages = 0;
10954
10955 vm_page_lock_queues();
10956 m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active);
10957
10958 do {
10959 if (m == (vm_page_t)0) {
10960 break;
10961 }
10962 if (m->vmp_dirty) {
10963 dpages++;
10964 }
10965 if (m->vmp_free_when_done) {
10966 pgopages++;
10967 }
10968 if (m->vmp_precious) {
10969 precpages++;
10970 }
10971
10972 assert(!is_kernel_object(VM_PAGE_OBJECT(m)));
10973 m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq);
10974 if (m == (vm_page_t)0) {
10975 break;
10976 }
10977 } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m));
10978 vm_page_unlock_queues();
10979
	printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10981}
10982#endif /* MACH_BSD */
10983
10984
10985#if CONFIG_IOSCHED
10986int
10987upl_get_cached_tier(upl_t upl)
10988{
10989 assert(upl);
10990 if (upl->flags & UPL_TRACKED_BY_OBJECT) {
10991 return upl->upl_priority;
10992 }
10993 return -1;
10994}
10995#endif /* CONFIG_IOSCHED */
10996
10997
10998void
10999upl_callout_iodone(upl_t upl)
11000{
11001 struct upl_io_completion *upl_ctx = upl->upl_iodone;
11002
11003 if (upl_ctx) {
11004 void (*iodone_func)(void *, int) = upl_ctx->io_done;
11005
11006 assert(upl_ctx->io_done);
11007
11008 (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error);
11009 }
11010}
11011
11012void
11013upl_set_iodone(upl_t upl, void *upl_iodone)
11014{
11015 upl->upl_iodone = (struct upl_io_completion *)upl_iodone;
11016}
11017
11018void
11019upl_set_iodone_error(upl_t upl, int error)
11020{
11021 struct upl_io_completion *upl_ctx = upl->upl_iodone;
11022
11023 if (upl_ctx) {
11024 upl_ctx->io_error = error;
11025 }
11026}
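
/*
 * Typical use of the iodone callout, as a rough sketch (the names below are
 * hypothetical, not an API defined in this file): the issuer attaches a
 * struct upl_io_completion before starting I/O, and the completion path
 * records any error and fires the callout once the I/O is finished:
 *
 *	struct upl_io_completion ctx = {
 *		.io_context = my_request,	// caller-owned cookie
 *		.io_done = my_done_fn,		// void (*)(void *, int)
 *		.io_error = 0,
 *	};
 *	upl_set_iodone(upl, &ctx);
 *	... issue the I/O ...
 *	if (error) {
 *		upl_set_iodone_error(upl, error);
 *	}
 *	upl_callout_iodone(upl);	// invokes my_done_fn(my_request, error)
 */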
11027
11028
11029ppnum_t
11030upl_get_highest_page(
11031 upl_t upl)
11032{
11033 return upl->highest_page;
11034}
11035
11036upl_size_t
11037upl_get_size(
11038 upl_t upl)
11039{
11040 return upl_adjusted_size(upl, PAGE_MASK);
11041}
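
/*
 * Example: with u_offset = 0x1800, u_size = 0x800 and pgmask = PAGE_MASK
 * (0xFFF for 4K pages), upl_adjusted_offset() returns 0x1000,
 * upl_adjusted_size() returns 0x1000, and upl_get_data_offset() returns
 * 0x800, the residue of u_offset within that first page.
 */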
11042
11043upl_size_t
11044upl_adjusted_size(
11045 upl_t upl,
11046 vm_map_offset_t pgmask)
11047{
11048 vm_object_offset_t start_offset, end_offset;
11049
11050 start_offset = trunc_page_mask_64(upl->u_offset, pgmask);
11051 end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask);
11052
11053 return (upl_size_t)(end_offset - start_offset);
11054}
11055
11056vm_object_offset_t
11057upl_adjusted_offset(
11058 upl_t upl,
11059 vm_map_offset_t pgmask)
11060{
11061 return trunc_page_mask_64(upl->u_offset, pgmask);
11062}
11063
11064vm_object_offset_t
11065upl_get_data_offset(
11066 upl_t upl)
11067{
11068 return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK);
11069}
11070
11071upl_t
11072upl_associated_upl(upl_t upl)
11073{
11074 return upl->associated_upl;
11075}
11076
11077void
11078upl_set_associated_upl(upl_t upl, upl_t associated_upl)
11079{
11080 upl->associated_upl = associated_upl;
11081}
11082
11083struct vnode *
11084upl_lookup_vnode(upl_t upl)
11085{
11086 if (!upl->map_object->internal) {
11087 return vnode_pager_lookup_vnode(upl->map_object->pager);
11088 } else {
11089 return NULL;
11090 }
11091}
11092
11093#if UPL_DEBUG
11094kern_return_t
11095upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2)
11096{
11097 upl->ubc_alias1 = alias1;
11098 upl->ubc_alias2 = alias2;
11099 return KERN_SUCCESS;
11100}
11101int
11102upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2)
11103{
11104 if (al) {
11105 *al = upl->ubc_alias1;
11106 }
11107 if (al2) {
11108 *al2 = upl->ubc_alias2;
11109 }
11110 return KERN_SUCCESS;
11111}
11112#endif /* UPL_DEBUG */
11113
11114#if VM_PRESSURE_EVENTS
11115/*
11116 * Upward trajectory.
11117 */
11118extern boolean_t vm_compressor_low_on_space(void);
11119
11120boolean_t
11121VM_PRESSURE_NORMAL_TO_WARNING(void)
11122{
11123 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11124 /* Available pages below our threshold */
11125 if (memorystatus_available_pages < memorystatus_available_pages_pressure) {
11126 /* No frozen processes to kill */
11127 if (memorystatus_frozen_count == 0) {
11128 /* Not enough suspended processes available. */
11129 if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) {
11130 return TRUE;
11131 }
11132 }
11133 }
11134 return FALSE;
11135 } else {
11136 return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0;
11137 }
11138}
11139
11140boolean_t
11141VM_PRESSURE_WARNING_TO_CRITICAL(void)
11142{
11143 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11144 /* Available pages below our threshold */
11145 if (memorystatus_available_pages < memorystatus_available_pages_critical) {
11146 return TRUE;
11147 }
11148 return FALSE;
11149 } else {
11150 return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11151 }
11152}
11153
11154/*
11155 * Downward trajectory.
11156 */
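/*
 * When the compressor is not active, these downward transitions require
 * about 15% of headroom above the corresponding threshold before the
 * pressure level is lowered.  For example, with
 * memorystatus_available_pages_pressure at 10000 pages, the warning level
 * only clears once more than 11500 pages are available.
 */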
11157boolean_t
11158VM_PRESSURE_WARNING_TO_NORMAL(void)
11159{
11160 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11161 /* Available pages above our threshold */
11162 unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100));
11163 if (memorystatus_available_pages > target_threshold) {
11164 return TRUE;
11165 }
11166 return FALSE;
11167 } else {
11168 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0;
11169 }
11170}
11171
11172boolean_t
11173VM_PRESSURE_CRITICAL_TO_WARNING(void)
11174{
11175 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11176 /* Available pages above our threshold */
11177 unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100));
11178 if (memorystatus_available_pages > target_threshold) {
11179 return TRUE;
11180 }
11181 return FALSE;
11182 } else {
11183 return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0;
11184 }
11185}
11186#endif /* VM_PRESSURE_EVENTS */
11187
11188#if DEVELOPMENT || DEBUG
11189bool compressor_running_perf_test;
11190uint64_t compressor_perf_test_pages_processed;
11191
11192kern_return_t
11193run_compressor_perf_test(
11194 user_addr_t buf,
11195 size_t buffer_size,
11196 uint64_t *time,
11197 uint64_t *bytes_compressed,
11198 uint64_t *compressor_growth);
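
/*
 * Walk [start_addr, start_addr + buffer_size) in the given map and, for
 * every resident page backed by a top-level anonymous object, pull the page
 * off its paging queue, clear its referenced state, and append it to
 * "queue".  Fails with KERN_INVALID_ARGUMENT if the range touches an
 * unmapped address, a submap, a wired entry, a shadowed or non-internal
 * object, or if the map's page size differs from the kernel's.
 */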
11199
11200static kern_return_t
11201move_pages_to_queue(
11202 vm_map_t map,
11203 user_addr_t start_addr,
11204 size_t buffer_size,
11205 vm_page_queue_head_t *queue,
11206 size_t *pages_moved)
11207{
11208 kern_return_t err = KERN_SUCCESS;
11209 vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL;
11210 boolean_t addr_in_map = FALSE;
11211 user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL;
11212 vm_object_t curr_object = VM_OBJECT_NULL;
11213 *pages_moved = 0;
11214
11215
11216 if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) {
11217 /*
		 * We don't currently support benchmarking maps whose page size
		 * differs from the kernel's.
11220 */
11221 return KERN_INVALID_ARGUMENT;
11222 }
11223
11224 if (os_add_overflow(start_addr, buffer_size, &end_addr)) {
11225 return KERN_INVALID_ARGUMENT;
11226 }
11227
11228 vm_map_lock_read(map);
11229 curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map));
11230 end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map));
11231
11232
11233 while (curr_addr < end_addr) {
11234 addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry);
11235 if (!addr_in_map) {
11236 err = KERN_INVALID_ARGUMENT;
11237 break;
11238 }
11239 curr_object = VME_OBJECT(curr_entry);
11240 if (curr_object) {
11241 vm_object_lock(curr_object);
11242 /* We really only want anonymous memory that's in the top level map and object here. */
11243 if (curr_entry->is_sub_map || curr_entry->wired_count != 0 ||
11244 curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) {
11245 err = KERN_INVALID_ARGUMENT;
11246 vm_object_unlock(curr_object);
11247 break;
11248 }
11249 vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry);
11250 vm_map_offset_t end_offset = MIN(curr_entry->vme_end, end_addr) -
11251 (curr_entry->vme_start + VME_OFFSET(curr_entry));
11252 vm_map_offset_t curr_offset = start_offset;
11253 vm_page_t curr_page;
11254 while (curr_offset < end_offset) {
11255 curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset));
11256 if (curr_page != VM_PAGE_NULL) {
11257 vm_page_lock_queues();
11258 if (curr_page->vmp_laundry) {
11259 vm_pageout_steal_laundry(curr_page, TRUE);
11260 }
					/*
					 * Any laundry page was already pulled out above, so this
					 * page cannot be on the pageout queue and it is safe to
					 * call vm_page_queues_remove().
					 */
11266 bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE);
11267 vm_page_queues_remove(curr_page, TRUE);
11268 if (donate) {
						/*
						 * vm_page_queues_remove() cleared the special-queue
						 * marker.  Restore it so the compressor knows where
						 * this page should land, and so a stolen page can be
						 * returned to the special queue it belongs on.
						 */
11275 curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE;
11276 }
11277 // Clear the referenced bit so we ensure this gets paged out
11278 curr_page->vmp_reference = false;
11279 if (curr_page->vmp_pmapped) {
11280 pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page),
11281 VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL);
11282 }
11283 vm_page_queue_enter(queue, curr_page, vmp_pageq);
11284 vm_page_unlock_queues();
11285 *pages_moved += 1;
11286 }
11287 curr_offset += PAGE_SIZE_64;
11288 curr_addr += PAGE_SIZE_64;
11289 }
			vm_object_unlock(curr_object);
		}
11292 }
11293 vm_map_unlock_read(map);
11294 return err;
11295}
11296
11297/*
11298 * Local queue for processing benchmark pages.
11299 * Can't be allocated on the stack because the pointer has to
11300 * be packable.
11301 */
11302vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED;
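/*
 * Compressor micro-benchmark (DEVELOPMENT/DEBUG only).  Pulls the caller's
 * buffer pages onto a private queue, snapshots c_segment_compressed_bytes,
 * hands the pages to the compressor via vm_pageout_page_queue(), wakes the
 * internal pageout thread, and waits until the compressor has processed
 * them all.  Reports the elapsed time, the amount of data handed to the
 * compressor, and the growth of the compressor pool.
 */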
11303kern_return_t
11304run_compressor_perf_test(
11305 user_addr_t buf,
11306 size_t buffer_size,
11307 uint64_t *time,
11308 uint64_t *bytes_compressed,
11309 uint64_t *compressor_growth)
11310{
11311 kern_return_t err = KERN_SUCCESS;
11312 if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) {
11313 return KERN_NOT_SUPPORTED;
11314 }
11315 if (current_task() == kernel_task) {
11316 return KERN_INVALID_ARGUMENT;
11317 }
11318 vm_page_lock_queues();
11319 if (compressor_running_perf_test) {
11320 /* Only run one instance of the benchmark at a time. */
11321 vm_page_unlock_queues();
11322 return KERN_RESOURCE_SHORTAGE;
11323 }
11324 vm_page_unlock_queues();
11325 size_t page_count = 0;
11326 vm_map_t map;
11327 vm_page_t p, next;
11328 uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0;
11329 uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0;
11330 *bytes_compressed = *compressor_growth = 0;
11331
11332 vm_page_queue_init(&compressor_perf_test_queue);
11333 map = current_task()->map;
11334 err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count);
11335 if (err != KERN_SUCCESS) {
11336 goto out;
11337 }
11338
11339 vm_page_lock_queues();
11340 compressor_running_perf_test = true;
11341 compressor_perf_test_pages_processed = 0;
	/*
	 * From this point on the compressor threads should process only the
	 * benchmark queue, so the difference in c_segment_compressed_bytes
	 * across the run tells us how many compressed bytes we ended up using.
	 */
11347 compressed_bytes_start = c_segment_compressed_bytes;
11348 vm_page_unlock_queues();
11349
11350 page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true);
11351
11352 vm_page_lock_queues();
11353 compressor_perf_test_start = mach_absolute_time();
11354
11355 // Wake up the compressor thread(s)
11356 sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup,
11357 pgo_iothread_internal_state[0].pgo_iothread);
11358
	/*
	 * Depending on when this test is run we may process exactly page_count
	 * pages or overshoot it, so wait while the processed count is still
	 * strictly less than page_count rather than waiting for equality.
	 */
11363 while (compressor_perf_test_pages_processed < page_count) {
11364 assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT);
11365 vm_page_unlock_queues();
11366 thread_block(THREAD_CONTINUE_NULL);
11367 vm_page_lock_queues();
11368 }
11369 compressor_perf_test_end = mach_absolute_time();
11370 compressed_bytes_end = c_segment_compressed_bytes;
11371 vm_page_unlock_queues();
11372
11373
11374out:
11375 /*
11376 * If we errored out above, then we could still have some pages
11377 * on the local queue. Make sure to put them back on the active queue before
11378 * returning so they're not orphaned.
11379 */
11380 vm_page_lock_queues();
11381 absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time);
11382 p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue);
11383 while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) {
11384 next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next);
11385
11386 vm_page_enqueue_active(p, FALSE);
11387 p = next;
11388 }
11389
11390 compressor_running_perf_test = false;
11391 vm_page_unlock_queues();
11392 if (err == KERN_SUCCESS) {
11393 *bytes_compressed = page_count * PAGE_SIZE_64;
11394 *compressor_growth = compressed_bytes_end - compressed_bytes_start;
11395 }
11396
11397 /*
11398 * pageout_scan will consider waking the compactor swapper
11399 * before it blocks. Do the same thing here before we return
	 * to ensure that back-to-back benchmark runs can't overly fragment the
11401 * compressor pool.
11402 */
11403 vm_consider_waking_compactor_swapper();
11404 return err;
11405}
11406#endif /* DEVELOPMENT || DEBUG */
11407