1 | /* |
2 | * Copyright (c) 2000-2020 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | /* |
29 | * @OSF_COPYRIGHT@ |
30 | */ |
31 | /* |
32 | * Mach Operating System |
33 | * Copyright (c) 1991,1990,1989,1988,1987 Carnegie Mellon University |
34 | * All Rights Reserved. |
35 | * |
36 | * Permission to use, copy, modify and distribute this software and its |
37 | * documentation is hereby granted, provided that both the copyright |
38 | * notice and this permission notice appear in all copies of the |
39 | * software, derivative works or modified versions, and any portions |
40 | * thereof, and that both notices appear in supporting documentation. |
41 | * |
42 | * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" |
43 | * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR |
44 | * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. |
45 | * |
46 | * Carnegie Mellon requests users of this software to return to |
47 | * |
48 | * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU |
49 | * School of Computer Science |
50 | * Carnegie Mellon University |
51 | * Pittsburgh PA 15213-3890 |
52 | * |
53 | * any improvements or extensions that they make and grant Carnegie Mellon |
54 | * the rights to redistribute these changes. |
55 | */ |
56 | /* |
57 | */ |
58 | /* |
59 | * File: vm/vm_pageout.c |
60 | * Author: Avadis Tevanian, Jr., Michael Wayne Young |
61 | * Date: 1985 |
62 | * |
63 | * The proverbial page-out daemon. |
64 | */ |
65 | |
66 | #include <stdint.h> |
67 | #include <ptrauth.h> |
68 | |
69 | #include <debug.h> |
70 | |
71 | #include <mach/mach_types.h> |
72 | #include <mach/memory_object.h> |
73 | #include <mach/mach_host_server.h> |
74 | #include <mach/upl.h> |
75 | #include <mach/vm_map.h> |
76 | #include <mach/vm_param.h> |
77 | #include <mach/vm_statistics.h> |
78 | #include <mach/sdt.h> |
79 | |
80 | #include <kern/kern_types.h> |
81 | #include <kern/counter.h> |
82 | #include <kern/host_statistics.h> |
83 | #include <kern/machine.h> |
84 | #include <kern/misc_protos.h> |
85 | #include <kern/sched.h> |
86 | #include <kern/thread.h> |
87 | #include <kern/kalloc.h> |
88 | #include <kern/zalloc_internal.h> |
89 | #include <kern/policy_internal.h> |
90 | #include <kern/thread_group.h> |
91 | |
92 | #include <os/log.h> |
93 | |
94 | #include <sys/kdebug_triage.h> |
95 | |
96 | #include <machine/vm_tuning.h> |
97 | #include <machine/commpage.h> |
98 | |
99 | #include <vm/pmap.h> |
100 | #include <vm/vm_compressor_pager.h> |
101 | #include <vm/vm_fault.h> |
102 | #include <vm/vm_map_internal.h> |
103 | #include <vm/vm_object.h> |
104 | #include <vm/vm_page.h> |
105 | #include <vm/vm_pageout.h> |
106 | #include <vm/vm_protos.h> /* must be last */ |
107 | #include <vm/memory_object.h> |
108 | #include <vm/vm_purgeable_internal.h> |
109 | #include <vm/vm_shared_region.h> |
110 | #include <vm/vm_compressor.h> |
111 | |
112 | #include <san/kasan.h> |
113 | |
114 | #if CONFIG_PHANTOM_CACHE |
115 | #include <vm/vm_phantom_cache.h> |
116 | #endif |
117 | |
118 | #if UPL_DEBUG |
119 | #include <libkern/OSDebug.h> |
120 | #endif |
121 | |
122 | extern int cs_debug; |
123 | |
124 | #if CONFIG_MBUF_MCACHE |
125 | extern void mbuf_drain(boolean_t); |
126 | #endif /* CONFIG_MBUF_MCACHE */ |
127 | |
128 | #if VM_PRESSURE_EVENTS |
129 | #if CONFIG_JETSAM |
130 | extern unsigned int memorystatus_available_pages; |
131 | extern unsigned int memorystatus_available_pages_pressure; |
132 | extern unsigned int memorystatus_available_pages_critical; |
133 | #else /* CONFIG_JETSAM */ |
134 | extern uint64_t memorystatus_available_pages; |
135 | extern uint64_t memorystatus_available_pages_pressure; |
136 | extern uint64_t memorystatus_available_pages_critical; |
137 | #endif /* CONFIG_JETSAM */ |
138 | |
139 | extern unsigned int memorystatus_frozen_count; |
140 | extern unsigned int memorystatus_suspended_count; |
141 | extern vm_pressure_level_t memorystatus_vm_pressure_level; |
142 | |
143 | extern lck_mtx_t memorystatus_jetsam_fg_band_lock; |
144 | extern uint32_t memorystatus_jetsam_fg_band_waiters; |
145 | |
146 | void vm_pressure_response(void); |
147 | extern void consider_vm_pressure_events(void); |
148 | |
149 | #define MEMORYSTATUS_SUSPENDED_THRESHOLD 4 |
150 | #endif /* VM_PRESSURE_EVENTS */ |
151 | |
152 | SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_scan_thread; |
153 | SECURITY_READ_ONLY_LATE(thread_t) vm_pageout_gc_thread; |
154 | #if CONFIG_VPS_DYNAMIC_PRIO |
TUNABLE(bool, vps_dynamic_priority_enabled, "vps_dynamic_priority_enabled", false);
156 | #else |
157 | const bool vps_dynamic_priority_enabled = false; |
158 | #endif |
159 | boolean_t vps_yield_for_pgqlockwaiters = TRUE; |
160 | |
161 | #ifndef VM_PAGEOUT_BURST_INACTIVE_THROTTLE /* maximum iterations of the inactive queue w/o stealing/cleaning a page */ |
162 | #if !XNU_TARGET_OS_OSX |
163 | #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 1024 |
164 | #else /* !XNU_TARGET_OS_OSX */ |
165 | #define VM_PAGEOUT_BURST_INACTIVE_THROTTLE 4096 |
166 | #endif /* !XNU_TARGET_OS_OSX */ |
167 | #endif |
168 | |
169 | #ifndef VM_PAGEOUT_DEADLOCK_RELIEF |
170 | #define VM_PAGEOUT_DEADLOCK_RELIEF 100 /* number of pages to move to break deadlock */ |
171 | #endif |
172 | |
173 | #ifndef VM_PAGE_LAUNDRY_MAX |
174 | #define VM_PAGE_LAUNDRY_MAX 128UL /* maximum pageouts on a given pageout queue */ |
#endif /* VM_PAGE_LAUNDRY_MAX */
176 | |
177 | #ifndef VM_PAGEOUT_BURST_WAIT |
178 | #define VM_PAGEOUT_BURST_WAIT 1 /* milliseconds */ |
179 | #endif /* VM_PAGEOUT_BURST_WAIT */ |
180 | |
181 | #ifndef VM_PAGEOUT_EMPTY_WAIT |
182 | #define VM_PAGEOUT_EMPTY_WAIT 50 /* milliseconds */ |
183 | #endif /* VM_PAGEOUT_EMPTY_WAIT */ |
184 | |
185 | #ifndef VM_PAGEOUT_DEADLOCK_WAIT |
186 | #define VM_PAGEOUT_DEADLOCK_WAIT 100 /* milliseconds */ |
187 | #endif /* VM_PAGEOUT_DEADLOCK_WAIT */ |
188 | |
189 | #ifndef VM_PAGEOUT_IDLE_WAIT |
190 | #define VM_PAGEOUT_IDLE_WAIT 10 /* milliseconds */ |
191 | #endif /* VM_PAGEOUT_IDLE_WAIT */ |
192 | |
193 | #ifndef VM_PAGEOUT_SWAP_WAIT |
194 | #define VM_PAGEOUT_SWAP_WAIT 10 /* milliseconds */ |
195 | #endif /* VM_PAGEOUT_SWAP_WAIT */ |
196 | |
197 | |
198 | #ifndef VM_PAGE_SPECULATIVE_TARGET |
199 | #define VM_PAGE_SPECULATIVE_TARGET(total) ((total) * 1 / (100 / vm_pageout_state.vm_page_speculative_percentage)) |
200 | #endif /* VM_PAGE_SPECULATIVE_TARGET */ |
201 | |
202 | |
203 | /* |
204 | * To obtain a reasonable LRU approximation, the inactive queue |
205 | * needs to be large enough to give pages on it a chance to be |
206 | * referenced a second time. This macro defines the fraction |
207 | * of active+inactive pages that should be inactive. |
208 | * The pageout daemon uses it to update vm_page_inactive_target. |
209 | * |
210 | * If vm_page_free_count falls below vm_page_free_target and |
211 | * vm_page_inactive_count is below vm_page_inactive_target, |
212 | * then the pageout daemon starts running. |
213 | */ |
214 | |
215 | #ifndef VM_PAGE_INACTIVE_TARGET |
216 | #define VM_PAGE_INACTIVE_TARGET(avail) ((avail) * 1 / 2) |
217 | #endif /* VM_PAGE_INACTIVE_TARGET */ |
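
#if 0
/*
 * Illustrative sketch only (never compiled): with a hypothetical pool of
 * 200000 active+inactive pageable pages, the pageout daemon aims to keep
 * half of them (100000) on the inactive queue. The page count is made up
 * for illustration.
 */
static unsigned int
vm_page_inactive_target_example(void)
{
    unsigned int avail = 200000;            /* hypothetical page count */

    return VM_PAGE_INACTIVE_TARGET(avail);  /* 200000 * 1 / 2 == 100000 */
}
#endif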
218 | |
219 | /* |
220 | * Once the pageout daemon starts running, it keeps going |
221 | * until vm_page_free_count meets or exceeds vm_page_free_target. |
222 | */ |
223 | |
224 | #ifndef VM_PAGE_FREE_TARGET |
225 | #if !XNU_TARGET_OS_OSX |
226 | #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 100) |
227 | #else /* !XNU_TARGET_OS_OSX */ |
228 | #define VM_PAGE_FREE_TARGET(free) (15 + (free) / 80) |
229 | #endif /* !XNU_TARGET_OS_OSX */ |
230 | #endif /* VM_PAGE_FREE_TARGET */ |
231 | |
232 | |
233 | /* |
234 | * The pageout daemon always starts running once vm_page_free_count |
235 | * falls below vm_page_free_min. |
236 | */ |
237 | |
238 | #ifndef VM_PAGE_FREE_MIN |
239 | #if !XNU_TARGET_OS_OSX |
240 | #define VM_PAGE_FREE_MIN(free) (10 + (free) / 200) |
241 | #else /* !XNU_TARGET_OS_OSX */ |
242 | #define VM_PAGE_FREE_MIN(free) (10 + (free) / 100) |
243 | #endif /* !XNU_TARGET_OS_OSX */ |
244 | #endif /* VM_PAGE_FREE_MIN */ |
245 | |
246 | #if !XNU_TARGET_OS_OSX |
247 | #define VM_PAGE_FREE_RESERVED_LIMIT 100 |
248 | #define VM_PAGE_FREE_MIN_LIMIT 1500 |
249 | #define VM_PAGE_FREE_TARGET_LIMIT 2000 |
250 | #else /* !XNU_TARGET_OS_OSX */ |
251 | #define VM_PAGE_FREE_RESERVED_LIMIT 1700 |
252 | #define VM_PAGE_FREE_MIN_LIMIT 3500 |
253 | #define VM_PAGE_FREE_TARGET_LIMIT 4000 |
254 | #endif /* !XNU_TARGET_OS_OSX */ |
255 | |
256 | /* |
257 | * When vm_page_free_count falls below vm_page_free_reserved, |
258 | * only vm-privileged threads can allocate pages. vm-privilege |
259 | * allows the pageout daemon and default pager (and any other |
260 | * associated threads needed for default pageout) to continue |
261 | * operation by dipping into the reserved pool of pages. |
262 | */ |
263 | |
264 | #ifndef VM_PAGE_FREE_RESERVED |
265 | #define VM_PAGE_FREE_RESERVED(n) \ |
266 | ((unsigned) (6 * VM_PAGE_LAUNDRY_MAX) + (n)) |
267 | #endif /* VM_PAGE_FREE_RESERVED */ |
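
#if 0
/*
 * Illustrative sketch only (never compiled): how the free-page thresholds
 * relate for a hypothetical pool of 100000 pages on an XNU_TARGET_OS_OSX
 * build. The VM_PAGE_FREE_*_LIMIT caps above are applied elsewhere during
 * VM startup and are not exercised here; the page count is made up.
 */
static void
vm_page_free_threshold_example(void)
{
    unsigned int pages = 100000;                    /* hypothetical page count */
    unsigned int free_target, free_min, free_reserved;

    free_target = VM_PAGE_FREE_TARGET(pages);       /* 15 + 100000 / 80  == 1265 */
    free_min = VM_PAGE_FREE_MIN(pages);             /* 10 + 100000 / 100 == 1010 */
    free_reserved = VM_PAGE_FREE_RESERVED(0);       /* 6 * VM_PAGE_LAUNDRY_MAX == 768 */

    /* reserved < min < target: privileged-only allocs, daemon must run, daemon may stop */
    assert(free_reserved < free_min && free_min < free_target);
}
#endif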
268 | |
269 | /* |
270 | * When we dequeue pages from the inactive list, they are |
271 | * reactivated (ie, put back on the active queue) if referenced. |
272 | * However, it is possible to starve the free list if other |
273 | * processors are referencing pages faster than we can turn off |
274 | * the referenced bit. So we limit the number of reactivations |
275 | * we will make per call of vm_pageout_scan(). |
276 | */ |
277 | #define VM_PAGE_REACTIVATE_LIMIT_MAX 20000 |
278 | |
279 | #ifndef VM_PAGE_REACTIVATE_LIMIT |
280 | #if !XNU_TARGET_OS_OSX |
281 | #define VM_PAGE_REACTIVATE_LIMIT(avail) (VM_PAGE_INACTIVE_TARGET(avail) / 2) |
282 | #else /* !XNU_TARGET_OS_OSX */ |
283 | #define VM_PAGE_REACTIVATE_LIMIT(avail) (MAX((avail) * 1 / 20,VM_PAGE_REACTIVATE_LIMIT_MAX)) |
284 | #endif /* !XNU_TARGET_OS_OSX */ |
285 | #endif /* VM_PAGE_REACTIVATE_LIMIT */ |
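
#if 0
/*
 * Illustrative sketch only (never compiled): the cap on reactivations per
 * vm_pageout_scan() pass. With a hypothetical 200000 pageable pages, the
 * XNU_TARGET_OS_OSX flavor of the macro evaluates to
 * MAX(200000 / 20, VM_PAGE_REACTIVATE_LIMIT_MAX) == 20000.
 */
static unsigned int
vm_page_reactivate_limit_example(void)
{
    unsigned int avail = 200000;            /* hypothetical page count */

    return VM_PAGE_REACTIVATE_LIMIT(avail);
}
#endif
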
286 | #define VM_PAGEOUT_INACTIVE_FORCE_RECLAIM 1000 |
287 | |
288 | int vm_pageout_protect_realtime = true; |
289 | |
290 | extern boolean_t hibernate_cleaning_in_progress; |
291 | |
292 | struct pgo_iothread_state pgo_iothread_internal_state[MAX_COMPRESSOR_THREAD_COUNT]; |
293 | struct pgo_iothread_state pgo_iothread_external_state; |
294 | |
295 | #if VM_PRESSURE_EVENTS |
296 | void vm_pressure_thread(void); |
297 | |
298 | boolean_t VM_PRESSURE_NORMAL_TO_WARNING(void); |
299 | boolean_t VM_PRESSURE_WARNING_TO_CRITICAL(void); |
300 | |
301 | boolean_t VM_PRESSURE_WARNING_TO_NORMAL(void); |
302 | boolean_t VM_PRESSURE_CRITICAL_TO_WARNING(void); |
303 | #endif |
304 | |
305 | static void vm_pageout_iothread_external(struct pgo_iothread_state *, wait_result_t); |
306 | static void vm_pageout_iothread_internal(struct pgo_iothread_state *, wait_result_t); |
307 | static void vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *, boolean_t); |
308 | |
309 | extern void vm_pageout_continue(void); |
310 | extern void vm_pageout_scan(void); |
311 | |
312 | boolean_t vm_pageout_running = FALSE; |
313 | |
314 | uint32_t vm_page_upl_tainted = 0; |
315 | uint32_t vm_page_iopl_tainted = 0; |
316 | |
317 | #if XNU_TARGET_OS_OSX |
318 | static boolean_t vm_pageout_waiter = FALSE; |
319 | #endif /* XNU_TARGET_OS_OSX */ |
320 | |
321 | |
322 | #if DEVELOPMENT || DEBUG |
323 | struct vm_pageout_debug vm_pageout_debug; |
324 | #endif |
325 | struct vm_pageout_vminfo vm_pageout_vminfo; |
326 | struct vm_pageout_state vm_pageout_state; |
327 | struct vm_config vm_config; |
328 | |
329 | struct vm_pageout_queue vm_pageout_queue_internal VM_PAGE_PACKED_ALIGNED; |
330 | struct vm_pageout_queue vm_pageout_queue_external VM_PAGE_PACKED_ALIGNED; |
331 | #if DEVELOPMENT || DEBUG |
332 | struct vm_pageout_queue vm_pageout_queue_benchmark VM_PAGE_PACKED_ALIGNED; |
333 | #endif /* DEVELOPMENT || DEBUG */ |
334 | |
335 | int vm_upl_wait_for_pages = 0; |
336 | vm_object_t vm_pageout_scan_wants_object = VM_OBJECT_NULL; |
337 | |
338 | boolean_t(*volatile consider_buffer_cache_collect)(int) = NULL; |
339 | |
340 | int vm_debug_events = 0; |
341 | |
LCK_GRP_DECLARE(vm_pageout_lck_grp, "vm_pageout");
343 | |
344 | #if CONFIG_MEMORYSTATUS |
345 | extern void memorystatus_kill_on_vps_starvation(void); |
346 | |
347 | uint32_t vm_pageout_memorystatus_fb_factor_nr = 5; |
348 | uint32_t vm_pageout_memorystatus_fb_factor_dr = 2; |
349 | |
350 | #endif |
351 | |
352 | #if __AMP__ |
353 | |
354 | |
355 | /* |
356 | * Bind compressor threads to e-cores unless there are multiple non-e clusters |
357 | */ |
358 | #if (MAX_CPU_CLUSTERS > 2) |
359 | #define VM_COMPRESSOR_EBOUND_DEFAULT false |
360 | #else |
361 | #define VM_COMPRESSOR_EBOUND_DEFAULT true |
362 | #endif |
363 | |
TUNABLE(bool, vm_compressor_ebound, "vmcomp_ecluster", VM_COMPRESSOR_EBOUND_DEFAULT);
365 | int vm_pgo_pbound = 0; |
366 | extern void thread_bind_cluster_type(thread_t, char, bool); |
367 | |
368 | #endif /* __AMP__ */ |
369 | |
370 | |
371 | /* |
372 | * Routine: vm_pageout_object_terminate |
373 | * Purpose: |
374 | * Destroy the pageout_object, and perform all of the |
375 | * required cleanup actions. |
376 | * |
377 | * In/Out conditions: |
378 | * The object must be locked, and will be returned locked. |
379 | */ |
380 | void |
381 | vm_pageout_object_terminate( |
382 | vm_object_t object) |
383 | { |
384 | vm_object_t shadow_object; |
385 | |
386 | /* |
387 | * Deal with the deallocation (last reference) of a pageout object |
388 | * (used for cleaning-in-place) by dropping the paging references/ |
389 | * freeing pages in the original object. |
390 | */ |
391 | |
392 | assert(object->pageout); |
393 | shadow_object = object->shadow; |
394 | vm_object_lock(shadow_object); |
395 | |
396 | while (!vm_page_queue_empty(&object->memq)) { |
397 | vm_page_t p, m; |
398 | vm_object_offset_t offset; |
399 | |
400 | p = (vm_page_t) vm_page_queue_first(&object->memq); |
401 | |
402 | assert(p->vmp_private); |
403 | assert(p->vmp_free_when_done); |
404 | p->vmp_free_when_done = FALSE; |
405 | assert(!p->vmp_cleaning); |
406 | assert(!p->vmp_laundry); |
407 | |
408 | offset = p->vmp_offset; |
409 | VM_PAGE_FREE(p); |
410 | p = VM_PAGE_NULL; |
411 | |
m = vm_page_lookup(shadow_object,
offset + object->vo_shadow_offset);
414 | |
415 | if (m == VM_PAGE_NULL) { |
416 | continue; |
417 | } |
418 | |
419 | assert((m->vmp_dirty) || (m->vmp_precious) || |
420 | (m->vmp_busy && m->vmp_cleaning)); |
421 | |
422 | /* |
423 | * Handle the trusted pager throttle. |
424 | * Also decrement the burst throttle (if external). |
425 | */ |
426 | vm_page_lock_queues(); |
427 | if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { |
vm_pageout_throttle_up(m);
429 | } |
430 | |
431 | /* |
432 | * Handle the "target" page(s). These pages are to be freed if |
433 | * successfully cleaned. Target pages are always busy, and are |
434 | * wired exactly once. The initial target pages are not mapped, |
435 | * (so cannot be referenced or modified) but converted target |
436 | * pages may have been modified between the selection as an |
437 | * adjacent page and conversion to a target. |
438 | */ |
439 | if (m->vmp_free_when_done) { |
440 | assert(m->vmp_busy); |
441 | assert(m->vmp_q_state == VM_PAGE_IS_WIRED); |
442 | assert(m->vmp_wire_count == 1); |
443 | m->vmp_cleaning = FALSE; |
444 | m->vmp_free_when_done = FALSE; |
445 | /* |
446 | * Revoke all access to the page. Since the object is |
447 | * locked, and the page is busy, this prevents the page |
448 | * from being dirtied after the pmap_disconnect() call |
449 | * returns. |
450 | * |
* Since the page is left "dirty" but "not modified", we
452 | * can detect whether the page was redirtied during |
453 | * pageout by checking the modify state. |
454 | */ |
if (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED) {
456 | SET_PAGE_DIRTY(m, FALSE); |
457 | } else { |
458 | m->vmp_dirty = FALSE; |
459 | } |
460 | |
461 | if (m->vmp_dirty) { |
vm_page_unwire(m, TRUE); /* reactivates */
463 | counter_inc(&vm_statistics_reactivations); |
464 | PAGE_WAKEUP_DONE(m); |
465 | } else { |
vm_page_free(m); /* clears busy, etc. */
467 | } |
468 | vm_page_unlock_queues(); |
469 | continue; |
470 | } |
471 | /* |
472 | * Handle the "adjacent" pages. These pages were cleaned in |
473 | * place, and should be left alone. |
474 | * If prep_pin_count is nonzero, then someone is using the |
475 | * page, so make it active. |
476 | */ |
477 | if ((m->vmp_q_state == VM_PAGE_NOT_ON_Q) && !m->vmp_private) { |
478 | if (m->vmp_reference) { |
vm_page_activate(m);
} else {
vm_page_deactivate(m);
482 | } |
483 | } |
484 | if (m->vmp_overwriting) { |
485 | /* |
486 | * the (COPY_OUT_FROM == FALSE) request_page_list case |
487 | */ |
488 | if (m->vmp_busy) { |
489 | /* |
490 | * We do not re-set m->vmp_dirty ! |
491 | * The page was busy so no extraneous activity |
492 | * could have occurred. COPY_INTO is a read into the |
493 | * new pages. CLEAN_IN_PLACE does actually write |
494 | * out the pages but handling outside of this code |
495 | * will take care of resetting dirty. We clear the |
496 | * modify however for the Programmed I/O case. |
497 | */ |
pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
499 | |
500 | m->vmp_busy = FALSE; |
501 | m->vmp_absent = FALSE; |
502 | } else { |
503 | /* |
504 | * alternate (COPY_OUT_FROM == FALSE) request_page_list case |
505 | * Occurs when the original page was wired |
506 | * at the time of the list request |
507 | */ |
508 | assert(VM_PAGE_WIRED(m)); |
vm_page_unwire(m, TRUE); /* reactivates */
510 | } |
511 | m->vmp_overwriting = FALSE; |
512 | } else { |
513 | m->vmp_dirty = FALSE; |
514 | } |
515 | m->vmp_cleaning = FALSE; |
516 | |
517 | /* |
518 | * Wakeup any thread waiting for the page to be un-cleaning. |
519 | */ |
520 | PAGE_WAKEUP(m); |
521 | vm_page_unlock_queues(); |
522 | } |
523 | /* |
524 | * Account for the paging reference taken in vm_paging_object_allocate. |
525 | */ |
526 | vm_object_activity_end(shadow_object); |
527 | vm_object_unlock(shadow_object); |
528 | |
529 | assert(object->ref_count == 0); |
530 | assert(object->paging_in_progress == 0); |
531 | assert(object->activity_in_progress == 0); |
532 | assert(object->resident_page_count == 0); |
533 | return; |
534 | } |
535 | |
536 | /* |
537 | * Routine: vm_pageclean_setup |
538 | * |
* Purpose: set up a page to be cleaned (made non-dirty), but not
540 | * necessarily flushed from the VM page cache. |
541 | * This is accomplished by cleaning in place. |
542 | * |
543 | * The page must not be busy, and new_object |
544 | * must be locked. |
545 | * |
546 | */ |
547 | static void |
548 | vm_pageclean_setup( |
549 | vm_page_t m, |
550 | vm_page_t new_m, |
551 | vm_object_t new_object, |
552 | vm_object_offset_t new_offset) |
553 | { |
554 | assert(!m->vmp_busy); |
555 | #if 0 |
556 | assert(!m->vmp_cleaning); |
557 | #endif |
558 | |
pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
560 | |
561 | /* |
562 | * Mark original page as cleaning in place. |
563 | */ |
564 | m->vmp_cleaning = TRUE; |
565 | SET_PAGE_DIRTY(m, FALSE); |
566 | m->vmp_precious = FALSE; |
567 | |
568 | /* |
569 | * Convert the fictitious page to a private shadow of |
570 | * the real page. |
571 | */ |
572 | assert(new_m->vmp_fictitious); |
573 | assert(VM_PAGE_GET_PHYS_PAGE(new_m) == vm_page_fictitious_addr); |
574 | new_m->vmp_fictitious = FALSE; |
575 | new_m->vmp_private = TRUE; |
576 | new_m->vmp_free_when_done = TRUE; |
577 | VM_PAGE_SET_PHYS_PAGE(new_m, VM_PAGE_GET_PHYS_PAGE(m)); |
578 | |
579 | vm_page_lockspin_queues(); |
vm_page_wire(new_m, VM_KERN_MEMORY_NONE, TRUE);
581 | vm_page_unlock_queues(); |
582 | |
vm_page_insert_wired(new_m, new_object, new_offset, VM_KERN_MEMORY_NONE);
584 | assert(!new_m->vmp_wanted); |
585 | new_m->vmp_busy = FALSE; |
586 | } |
587 | |
588 | /* |
589 | * Routine: vm_pageout_initialize_page |
590 | * Purpose: |
591 | * Causes the specified page to be initialized in |
592 | * the appropriate memory object. This routine is used to push |
593 | * pages into a copy-object when they are modified in the |
594 | * permanent object. |
595 | * |
596 | * The page is moved to a temporary object and paged out. |
597 | * |
598 | * In/out conditions: |
599 | * The page in question must not be on any pageout queues. |
600 | * The object to which it belongs must be locked. |
601 | * The page must be busy, but not hold a paging reference. |
602 | * |
603 | * Implementation: |
604 | * Move this page to a completely new object. |
605 | */ |
606 | void |
607 | vm_pageout_initialize_page( |
608 | vm_page_t m) |
609 | { |
610 | vm_object_t object; |
611 | vm_object_offset_t paging_offset; |
memory_object_t pager;
613 | |
614 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); |
615 | |
616 | object = VM_PAGE_OBJECT(m); |
617 | |
618 | assert(m->vmp_busy); |
619 | assert(object->internal); |
620 | |
621 | /* |
622 | * Verify that we really want to clean this page |
623 | */ |
624 | assert(!m->vmp_absent); |
625 | assert(m->vmp_dirty); |
626 | |
627 | /* |
628 | * Create a paging reference to let us play with the object. |
629 | */ |
630 | paging_offset = m->vmp_offset + object->paging_offset; |
631 | |
632 | if (m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_restart || (!m->vmp_dirty && !m->vmp_precious)) { |
633 | panic("reservation without pageout?" ); /* alan */ |
634 | |
635 | VM_PAGE_FREE(m); |
636 | vm_object_unlock(object); |
637 | |
638 | return; |
639 | } |
640 | |
641 | /* |
642 | * If there's no pager, then we can't clean the page. This should |
643 | * never happen since this should be a copy object and therefore not |
644 | * an external object, so the pager should always be there. |
645 | */ |
646 | |
647 | pager = object->pager; |
648 | |
649 | if (pager == MEMORY_OBJECT_NULL) { |
650 | panic("missing pager for copy object" ); |
651 | |
652 | VM_PAGE_FREE(m); |
653 | return; |
654 | } |
655 | |
656 | /* |
657 | * set the page for future call to vm_fault_list_request |
658 | */ |
pmap_clear_modify(VM_PAGE_GET_PHYS_PAGE(m));
660 | SET_PAGE_DIRTY(m, FALSE); |
661 | |
662 | /* |
663 | * keep the object from collapsing or terminating |
664 | */ |
665 | vm_object_paging_begin(object); |
666 | vm_object_unlock(object); |
667 | |
668 | /* |
669 | * Write the data to its pager. |
670 | * Note that the data is passed by naming the new object, |
671 | * not a virtual address; the pager interface has been |
672 | * manipulated to use the "internal memory" data type. |
673 | * [The object reference from its allocation is donated |
674 | * to the eventual recipient.] |
675 | */ |
memory_object_data_initialize(pager, paging_offset, PAGE_SIZE);
677 | |
678 | vm_object_lock(object); |
679 | vm_object_paging_end(object); |
680 | } |
681 | |
682 | |
683 | /* |
684 | * vm_pageout_cluster: |
685 | * |
686 | * Given a page, queue it to the appropriate I/O thread, |
687 | * which will page it out and attempt to clean adjacent pages |
688 | * in the same operation. |
689 | * |
690 | * The object and queues must be locked. We will take a |
691 | * paging reference to prevent deallocation or collapse when we |
692 | * release the object lock back at the call site. The I/O thread |
* is responsible for consuming this reference.
694 | * |
695 | * The page must not be on any pageout queue. |
696 | */ |
697 | #if DEVELOPMENT || DEBUG |
698 | vmct_stats_t vmct_stats; |
699 | |
700 | int32_t vmct_active = 0; |
701 | uint64_t vm_compressor_epoch_start = 0; |
702 | uint64_t vm_compressor_epoch_stop = 0; |
703 | |
704 | typedef enum vmct_state_t { |
705 | VMCT_IDLE, |
706 | VMCT_AWAKENED, |
707 | VMCT_ACTIVE, |
708 | } vmct_state_t; |
709 | vmct_state_t vmct_state[MAX_COMPRESSOR_THREAD_COUNT]; |
710 | #endif |
711 | |
712 | |
713 | |
714 | static void |
715 | vm_pageout_cluster_to_queue(vm_page_t m, struct vm_pageout_queue *q) |
716 | { |
717 | vm_object_t object = VM_PAGE_OBJECT(m); |
718 | |
719 | VM_PAGE_CHECK(m); |
720 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); |
721 | vm_object_lock_assert_exclusive(object); |
722 | |
723 | /* |
724 | * Make sure it's OK to page this out. |
725 | */ |
726 | assert((m->vmp_dirty || m->vmp_precious) && (!VM_PAGE_WIRED(m))); |
727 | assert(!m->vmp_cleaning && !m->vmp_laundry); |
728 | assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); |
729 | |
730 | /* |
731 | * protect the object from collapse or termination |
732 | */ |
733 | vm_object_activity_begin(object); |
734 | |
735 | |
736 | /* |
737 | * pgo_laundry count is tied to the laundry bit |
738 | */ |
739 | m->vmp_laundry = TRUE; |
740 | q->pgo_laundry++; |
741 | |
742 | m->vmp_q_state = VM_PAGE_ON_PAGEOUT_Q; |
743 | vm_page_queue_enter(&q->pgo_pending, m, vmp_pageq); |
744 | |
745 | // the benchmark queue will be woken up independently by the benchmark itself |
746 | if ( |
747 | object->internal == TRUE |
748 | #if DEVELOPMENT || DEBUG |
749 | && q != &vm_pageout_queue_benchmark |
750 | #endif |
751 | ) { |
752 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); |
753 | m->vmp_busy = TRUE; |
754 | // Wake up the first compressor thread. It will wake subsequent threads if necessary. |
sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup, pgo_iothread_internal_state[0].pgo_iothread);
} else {
sched_cond_signal(&pgo_iothread_external_state.pgo_wakeup, pgo_iothread_external_state.pgo_iothread);
758 | } |
759 | VM_PAGE_CHECK(m); |
760 | } |
761 | |
762 | void |
763 | vm_pageout_cluster(vm_page_t m) |
764 | { |
765 | struct vm_pageout_queue *q; |
766 | vm_object_t object = VM_PAGE_OBJECT(m); |
767 | if (object->internal) { |
768 | q = &vm_pageout_queue_internal; |
769 | } else { |
770 | q = &vm_pageout_queue_external; |
771 | } |
772 | vm_pageout_cluster_to_queue(m, q); |
773 | } |
774 | |
775 | |
776 | /* |
777 | * A page is back from laundry or we are stealing it back from |
778 | * the laundering state. See if there are some pages waiting to |
779 | * go to laundry and if we can let some of them go now. |
780 | * |
781 | * Object and page queues must be locked. |
782 | */ |
783 | void |
784 | vm_pageout_throttle_up( |
785 | vm_page_t m) |
786 | { |
787 | struct vm_pageout_queue *q; |
788 | vm_object_t m_object; |
789 | |
790 | m_object = VM_PAGE_OBJECT(m); |
791 | |
792 | assert(m_object != VM_OBJECT_NULL); |
793 | assert(!is_kernel_object(m_object)); |
794 | |
795 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); |
796 | vm_object_lock_assert_exclusive(m_object); |
797 | |
798 | if (m_object->internal == TRUE) { |
799 | q = &vm_pageout_queue_internal; |
800 | } else { |
801 | q = &vm_pageout_queue_external; |
802 | } |
803 | |
804 | if (m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { |
805 | vm_page_queue_remove(&q->pgo_pending, m, vmp_pageq); |
806 | m->vmp_q_state = VM_PAGE_NOT_ON_Q; |
807 | |
808 | VM_PAGE_ZERO_PAGEQ_ENTRY(m); |
809 | |
810 | vm_object_activity_end(m_object); |
811 | |
812 | VM_PAGEOUT_DEBUG(vm_page_steal_pageout_page, 1); |
813 | } |
814 | if (m->vmp_laundry == TRUE) { |
815 | m->vmp_laundry = FALSE; |
816 | q->pgo_laundry--; |
817 | |
818 | if (q->pgo_throttled == TRUE) { |
819 | q->pgo_throttled = FALSE; |
820 | thread_wakeup((event_t) &q->pgo_laundry); |
821 | } |
822 | if (q->pgo_draining == TRUE && q->pgo_laundry == 0) { |
823 | q->pgo_draining = FALSE; |
824 | thread_wakeup((event_t) (&q->pgo_laundry + 1)); |
825 | } |
826 | VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, 1); |
827 | } |
828 | } |
829 | |
830 | |
831 | static void |
832 | vm_pageout_throttle_up_batch( |
833 | struct vm_pageout_queue *q, |
834 | int batch_cnt) |
835 | { |
836 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); |
837 | |
838 | VM_PAGEOUT_DEBUG(vm_pageout_throttle_up_count, batch_cnt); |
839 | |
840 | q->pgo_laundry -= batch_cnt; |
841 | |
842 | if (q->pgo_throttled == TRUE) { |
843 | q->pgo_throttled = FALSE; |
844 | thread_wakeup((event_t) &q->pgo_laundry); |
845 | } |
846 | if (q->pgo_draining == TRUE && q->pgo_laundry == 0) { |
847 | q->pgo_draining = FALSE; |
848 | thread_wakeup((event_t) (&q->pgo_laundry + 1)); |
849 | } |
850 | } |
851 | |
852 | |
853 | |
854 | /* |
855 | * VM memory pressure monitoring. |
856 | * |
857 | * vm_pageout_scan() keeps track of the number of pages it considers and |
858 | * reclaims, in the currently active vm_pageout_stat[vm_pageout_stat_now]. |
859 | * |
860 | * compute_memory_pressure() is called every second from compute_averages() |
861 | * and moves "vm_pageout_stat_now" forward, to start accumulating the number |
* of reclaimed pages in a new vm_pageout_stat[] bucket.
863 | * |
864 | * mach_vm_pressure_monitor() collects past statistics about memory pressure. |
865 | * The caller provides the number of seconds ("nsecs") worth of statistics |
866 | * it wants, up to 30 seconds. |
867 | * It computes the number of pages reclaimed in the past "nsecs" seconds and |
868 | * also returns the number of pages the system still needs to reclaim at this |
869 | * moment in time. |
870 | */ |
871 | #if DEVELOPMENT || DEBUG |
872 | #define VM_PAGEOUT_STAT_SIZE (30 * 8) + 1 |
873 | #else |
874 | #define VM_PAGEOUT_STAT_SIZE (1 * 8) + 1 |
875 | #endif |
876 | struct vm_pageout_stat { |
877 | unsigned long vm_page_active_count; |
878 | unsigned long vm_page_speculative_count; |
879 | unsigned long vm_page_inactive_count; |
880 | unsigned long vm_page_anonymous_count; |
881 | |
882 | unsigned long vm_page_free_count; |
883 | unsigned long vm_page_wire_count; |
884 | unsigned long vm_page_compressor_count; |
885 | |
886 | unsigned long vm_page_pages_compressed; |
887 | unsigned long vm_page_pageable_internal_count; |
888 | unsigned long vm_page_pageable_external_count; |
889 | unsigned long vm_page_xpmapped_external_count; |
890 | |
891 | unsigned int pages_grabbed; |
892 | unsigned int pages_freed; |
893 | |
894 | unsigned int pages_compressed; |
895 | unsigned int pages_grabbed_by_compressor; |
896 | unsigned int failed_compressions; |
897 | |
898 | unsigned int pages_evicted; |
899 | unsigned int pages_purged; |
900 | |
901 | unsigned int considered; |
902 | unsigned int considered_bq_internal; |
903 | unsigned int considered_bq_external; |
904 | |
905 | unsigned int skipped_external; |
906 | unsigned int skipped_internal; |
907 | unsigned int filecache_min_reactivations; |
908 | |
909 | unsigned int freed_speculative; |
910 | unsigned int freed_cleaned; |
911 | unsigned int freed_internal; |
912 | unsigned int freed_external; |
913 | |
914 | unsigned int cleaned_dirty_external; |
915 | unsigned int cleaned_dirty_internal; |
916 | |
917 | unsigned int inactive_referenced; |
918 | unsigned int inactive_nolock; |
919 | unsigned int reactivation_limit_exceeded; |
920 | unsigned int forced_inactive_reclaim; |
921 | |
922 | unsigned int throttled_internal_q; |
923 | unsigned int throttled_external_q; |
924 | |
925 | unsigned int phantom_ghosts_found; |
926 | unsigned int phantom_ghosts_added; |
927 | |
928 | unsigned int vm_page_realtime_count; |
929 | unsigned int forcereclaimed_sharedcache; |
930 | unsigned int forcereclaimed_realtime; |
931 | unsigned int protected_sharedcache; |
932 | unsigned int protected_realtime; |
933 | } vm_pageout_stats[VM_PAGEOUT_STAT_SIZE]; |
934 | |
935 | unsigned int vm_pageout_stat_now = 0; |
936 | |
937 | #define VM_PAGEOUT_STAT_BEFORE(i) \ |
938 | (((i) == 0) ? VM_PAGEOUT_STAT_SIZE - 1 : (i) - 1) |
939 | #define VM_PAGEOUT_STAT_AFTER(i) \ |
940 | (((i) == VM_PAGEOUT_STAT_SIZE - 1) ? 0 : (i) + 1) |
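
#if 0
/*
 * Illustrative sketch only (never compiled): vm_pageout_stats[] is a ring
 * buffer indexed by vm_pageout_stat_now, so BEFORE/AFTER simply wrap at the
 * ends of the array. record_memory_pressure() below sums the bucket behind
 * "now" and then advances "now" with VM_PAGEOUT_STAT_AFTER();
 * mach_vm_pressure_monitor() walks backwards with VM_PAGEOUT_STAT_BEFORE().
 */
static void
vm_pageout_stat_ring_example(void)
{
    assert(VM_PAGEOUT_STAT_BEFORE(0) == VM_PAGEOUT_STAT_SIZE - 1); /* wraps to the last bucket */
    assert(VM_PAGEOUT_STAT_AFTER(VM_PAGEOUT_STAT_SIZE - 1) == 0);  /* wraps back to the first */
    assert(VM_PAGEOUT_STAT_AFTER(0) == 1);
}
#endif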
941 | |
942 | #if VM_PAGE_BUCKETS_CHECK |
943 | int vm_page_buckets_check_interval = 80; /* in eighths of a second */ |
944 | #endif /* VM_PAGE_BUCKETS_CHECK */ |
945 | |
946 | |
947 | void |
948 | record_memory_pressure(void); |
949 | void |
950 | record_memory_pressure(void) |
951 | { |
952 | unsigned int vm_pageout_next; |
953 | |
954 | #if VM_PAGE_BUCKETS_CHECK |
955 | /* check the consistency of VM page buckets at regular interval */ |
956 | static int counter = 0; |
957 | if ((++counter % vm_page_buckets_check_interval) == 0) { |
958 | vm_page_buckets_check(); |
959 | } |
960 | #endif /* VM_PAGE_BUCKETS_CHECK */ |
961 | |
962 | vm_pageout_state.vm_memory_pressure = |
963 | vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_speculative + |
964 | vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_cleaned + |
965 | vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_internal + |
966 | vm_pageout_stats[VM_PAGEOUT_STAT_BEFORE(vm_pageout_stat_now)].freed_external; |
967 | |
commpage_set_memory_pressure((unsigned int)vm_pageout_state.vm_memory_pressure);
969 | |
970 | /* move "now" forward */ |
971 | vm_pageout_next = VM_PAGEOUT_STAT_AFTER(vm_pageout_stat_now); |
972 | |
bzero(&vm_pageout_stats[vm_pageout_next], sizeof(struct vm_pageout_stat));
974 | |
975 | vm_pageout_stat_now = vm_pageout_next; |
976 | } |
977 | |
978 | |
979 | /* |
980 | * IMPORTANT |
981 | * mach_vm_ctl_page_free_wanted() is called indirectly, via |
982 | * mach_vm_pressure_monitor(), when taking a stackshot. Therefore, |
983 | * it must be safe in the restricted stackshot context. Locks and/or |
984 | * blocking are not allowable. |
985 | */ |
986 | unsigned int |
987 | mach_vm_ctl_page_free_wanted(void) |
988 | { |
989 | unsigned int page_free_target, page_free_count, page_free_wanted; |
990 | |
991 | page_free_target = vm_page_free_target; |
992 | page_free_count = vm_page_free_count; |
993 | if (page_free_target > page_free_count) { |
994 | page_free_wanted = page_free_target - page_free_count; |
995 | } else { |
996 | page_free_wanted = 0; |
997 | } |
998 | |
999 | return page_free_wanted; |
1000 | } |
1001 | |
1002 | |
1003 | /* |
1004 | * IMPORTANT: |
1005 | * mach_vm_pressure_monitor() is called when taking a stackshot, with |
1006 | * wait_for_pressure FALSE, so that code path must remain safe in the |
* restricted stackshot context. No blocking or locks are allowable
* on that code path.
1009 | */ |
1010 | |
1011 | kern_return_t |
1012 | mach_vm_pressure_monitor( |
1013 | boolean_t wait_for_pressure, |
1014 | unsigned int nsecs_monitored, |
1015 | unsigned int *pages_reclaimed_p, |
1016 | unsigned int *pages_wanted_p) |
1017 | { |
1018 | wait_result_t wr; |
1019 | unsigned int vm_pageout_then, vm_pageout_now; |
1020 | unsigned int pages_reclaimed; |
1021 | unsigned int units_of_monitor; |
1022 | |
1023 | units_of_monitor = 8 * nsecs_monitored; |
1024 | /* |
1025 | * We don't take the vm_page_queue_lock here because we don't want |
1026 | * vm_pressure_monitor() to get in the way of the vm_pageout_scan() |
1027 | * thread when it's trying to reclaim memory. We don't need fully |
1028 | * accurate monitoring anyway... |
1029 | */ |
1030 | |
1031 | if (wait_for_pressure) { |
1032 | /* wait until there's memory pressure */ |
1033 | while (vm_page_free_count >= vm_page_free_target) { |
wr = assert_wait((event_t) &vm_page_free_wanted,
1035 | THREAD_INTERRUPTIBLE); |
1036 | if (wr == THREAD_WAITING) { |
1037 | wr = thread_block(THREAD_CONTINUE_NULL); |
1038 | } |
1039 | if (wr == THREAD_INTERRUPTED) { |
1040 | return KERN_ABORTED; |
1041 | } |
1042 | if (wr == THREAD_AWAKENED) { |
1043 | /* |
1044 | * The memory pressure might have already |
1045 | * been relieved but let's not block again |
1046 | * and let's report that there was memory |
1047 | * pressure at some point. |
1048 | */ |
1049 | break; |
1050 | } |
1051 | } |
1052 | } |
1053 | |
1054 | /* provide the number of pages the system wants to reclaim */ |
1055 | if (pages_wanted_p != NULL) { |
1056 | *pages_wanted_p = mach_vm_ctl_page_free_wanted(); |
1057 | } |
1058 | |
1059 | if (pages_reclaimed_p == NULL) { |
1060 | return KERN_SUCCESS; |
1061 | } |
1062 | |
1063 | /* provide number of pages reclaimed in the last "nsecs_monitored" */ |
1064 | vm_pageout_now = vm_pageout_stat_now; |
1065 | pages_reclaimed = 0; |
1066 | for (vm_pageout_then = |
1067 | VM_PAGEOUT_STAT_BEFORE(vm_pageout_now); |
1068 | vm_pageout_then != vm_pageout_now && |
1069 | units_of_monitor-- != 0; |
1070 | vm_pageout_then = |
1071 | VM_PAGEOUT_STAT_BEFORE(vm_pageout_then)) { |
1072 | pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_speculative; |
1073 | pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_cleaned; |
1074 | pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_internal; |
1075 | pages_reclaimed += vm_pageout_stats[vm_pageout_then].freed_external; |
1076 | } |
1077 | *pages_reclaimed_p = pages_reclaimed; |
1078 | |
1079 | return KERN_SUCCESS; |
1080 | } |
1081 | |
1082 | |
1083 | |
1084 | #if DEVELOPMENT || DEBUG |
1085 | |
1086 | static void |
1087 | vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *, int); |
1088 | |
1089 | /* |
1090 | * condition variable used to make sure there is |
1091 | * only a single sweep going on at a time |
1092 | */ |
1093 | bool vm_pageout_disconnect_all_pages_active = false; |
1094 | |
1095 | void |
1096 | vm_pageout_disconnect_all_pages() |
1097 | { |
1098 | vm_page_lock_queues(); |
1099 | |
1100 | if (vm_pageout_disconnect_all_pages_active) { |
1101 | vm_page_unlock_queues(); |
1102 | return; |
1103 | } |
1104 | vm_pageout_disconnect_all_pages_active = true; |
1105 | |
1106 | vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_throttled, |
1107 | vm_page_throttled_count); |
1108 | vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_anonymous, |
1109 | vm_page_anonymous_count); |
1110 | vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_inactive, |
1111 | (vm_page_inactive_count - vm_page_anonymous_count)); |
1112 | vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_active, |
1113 | vm_page_active_count); |
#if CONFIG_SECLUDED_MEMORY
1115 | vm_pageout_disconnect_all_pages_in_queue(&vm_page_queue_secluded, |
1116 | vm_page_secluded_count); |
1117 | #endif /* CONFIG_SECLUDED_MEMORY */ |
1118 | vm_page_unlock_queues(); |
1119 | |
1120 | vm_pageout_disconnect_all_pages_active = false; |
1121 | } |
1122 | |
1123 | /* NB: assumes the page_queues lock is held on entry, returns with page queue lock held */ |
1124 | void |
1125 | vm_pageout_disconnect_all_pages_in_queue(vm_page_queue_head_t *q, int qcount) |
1126 | { |
1127 | vm_page_t m; |
1128 | vm_object_t t_object = NULL; |
1129 | vm_object_t l_object = NULL; |
1130 | vm_object_t m_object = NULL; |
1131 | int delayed_unlock = 0; |
1132 | int try_failed_count = 0; |
1133 | int disconnected_count = 0; |
1134 | int paused_count = 0; |
1135 | int object_locked_count = 0; |
1136 | |
1137 | KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) | |
1138 | DBG_FUNC_START), |
1139 | q, qcount); |
1140 | |
1141 | while (qcount && !vm_page_queue_empty(q)) { |
1142 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); |
1143 | |
1144 | m = (vm_page_t) vm_page_queue_first(q); |
1145 | m_object = VM_PAGE_OBJECT(m); |
1146 | |
1147 | /* |
1148 | * check to see if we currently are working |
1149 | * with the same object... if so, we've |
1150 | * already got the lock |
1151 | */ |
1152 | if (m_object != l_object) { |
1153 | /* |
1154 | * the object associated with candidate page is |
1155 | * different from the one we were just working |
1156 | * with... dump the lock if we still own it |
1157 | */ |
1158 | if (l_object != NULL) { |
1159 | vm_object_unlock(l_object); |
1160 | l_object = NULL; |
1161 | } |
1162 | if (m_object != t_object) { |
1163 | try_failed_count = 0; |
1164 | } |
1165 | |
1166 | /* |
* Try to lock object; since we've already got the
* page queues lock, we can only 'try' for this one.
* If the 'try' fails, we need to do a mutex_pause
1170 | * to allow the owner of the object lock a chance to |
1171 | * run... |
1172 | */ |
1173 | if (!vm_object_lock_try_scan(m_object)) { |
1174 | if (try_failed_count > 20) { |
1175 | goto reenter_pg_on_q; |
1176 | } |
1177 | vm_page_unlock_queues(); |
1178 | mutex_pause(try_failed_count++); |
1179 | vm_page_lock_queues(); |
1180 | delayed_unlock = 0; |
1181 | |
1182 | paused_count++; |
1183 | |
1184 | t_object = m_object; |
1185 | continue; |
1186 | } |
1187 | object_locked_count++; |
1188 | |
1189 | l_object = m_object; |
1190 | } |
1191 | if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || |
1192 | m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || |
1193 | m->vmp_free_when_done) { |
1194 | /* |
1195 | * put it back on the head of its queue |
1196 | */ |
1197 | goto reenter_pg_on_q; |
1198 | } |
1199 | if (m->vmp_pmapped == TRUE) { |
1200 | pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)); |
1201 | |
1202 | disconnected_count++; |
1203 | } |
1204 | reenter_pg_on_q: |
1205 | vm_page_queue_remove(q, m, vmp_pageq); |
1206 | vm_page_queue_enter(q, m, vmp_pageq); |
1207 | |
1208 | qcount--; |
1209 | try_failed_count = 0; |
1210 | |
1211 | if (delayed_unlock++ > 128) { |
1212 | if (l_object != NULL) { |
1213 | vm_object_unlock(l_object); |
1214 | l_object = NULL; |
1215 | } |
1216 | lck_mtx_yield(&vm_page_queue_lock); |
1217 | delayed_unlock = 0; |
1218 | } |
1219 | } |
1220 | if (l_object != NULL) { |
1221 | vm_object_unlock(l_object); |
1222 | l_object = NULL; |
1223 | } |
1224 | |
1225 | KDBG((MACHDBG_CODE(DBG_MACH_WORKINGSET, VM_DISCONNECT_ALL_PAGE_MAPPINGS) | |
1226 | DBG_FUNC_END), |
1227 | q, disconnected_count, object_locked_count, paused_count); |
1228 | } |
1229 | |
1230 | extern char* proc_best_name(struct proc* proc); |
1231 | |
1232 | int |
1233 | vm_toggle_task_selfdonate_pages(task_t task) |
1234 | { |
1235 | int state = 0; |
1236 | if (vm_page_donate_mode == VM_PAGE_DONATE_DISABLED) { |
1237 | printf("VM Donation mode is OFF on the system\n" ); |
1238 | return state; |
1239 | } |
1240 | if (task != kernel_task) { |
1241 | task_lock(task); |
1242 | if (!task->donates_own_pages) { |
1243 | printf("SELF DONATE for %s ON\n" , proc_best_name(get_bsdtask_info(task))); |
1244 | task->donates_own_pages = true; |
1245 | state = 1; |
1246 | } else if (task->donates_own_pages) { |
1247 | printf("SELF DONATE for %s OFF\n" , proc_best_name(get_bsdtask_info(task))); |
1248 | task->donates_own_pages = false; |
1249 | state = 0; |
1250 | } |
1251 | task_unlock(task); |
1252 | } |
1253 | return state; |
1254 | } |
1255 | #endif /* DEVELOPMENT || DEBUG */ |
1256 | |
1257 | void |
1258 | vm_task_set_selfdonate_pages(task_t task, bool donate) |
1259 | { |
1260 | assert(vm_page_donate_mode != VM_PAGE_DONATE_DISABLED); |
1261 | assert(task != kernel_task); |
1262 | |
1263 | task_lock(task); |
1264 | task->donates_own_pages = donate; |
1265 | task_unlock(task); |
1266 | } |
1267 | |
1268 | |
1269 | |
1270 | static size_t |
1271 | vm_pageout_page_queue(vm_page_queue_head_t *, size_t, bool); |
1272 | |
1273 | /* |
1274 | * condition variable used to make sure there is |
1275 | * only a single sweep going on at a time |
1276 | */ |
1277 | boolean_t vm_pageout_anonymous_pages_active = FALSE; |
1278 | |
1279 | |
1280 | void |
1281 | vm_pageout_anonymous_pages() |
1282 | { |
1283 | if (VM_CONFIG_COMPRESSOR_IS_PRESENT) { |
1284 | vm_page_lock_queues(); |
1285 | |
1286 | if (vm_pageout_anonymous_pages_active == TRUE) { |
1287 | vm_page_unlock_queues(); |
1288 | return; |
1289 | } |
1290 | vm_pageout_anonymous_pages_active = TRUE; |
1291 | vm_page_unlock_queues(); |
1292 | |
1293 | vm_pageout_page_queue(&vm_page_queue_throttled, vm_page_throttled_count, false); |
1294 | vm_pageout_page_queue(&vm_page_queue_anonymous, vm_page_anonymous_count, false); |
1295 | vm_pageout_page_queue(&vm_page_queue_active, vm_page_active_count, false); |
1296 | |
1297 | if (VM_CONFIG_SWAP_IS_PRESENT) { |
1298 | vm_consider_swapping(); |
1299 | } |
1300 | |
1301 | vm_page_lock_queues(); |
1302 | vm_pageout_anonymous_pages_active = FALSE; |
1303 | vm_page_unlock_queues(); |
1304 | } |
1305 | } |
1306 | |
1307 | |
1308 | size_t |
1309 | vm_pageout_page_queue(vm_page_queue_head_t *q, size_t qcount, bool perf_test) |
1310 | { |
1311 | vm_page_t m; |
1312 | vm_object_t t_object = NULL; |
1313 | vm_object_t l_object = NULL; |
1314 | vm_object_t m_object = NULL; |
1315 | int delayed_unlock = 0; |
1316 | int try_failed_count = 0; |
1317 | int refmod_state; |
1318 | int pmap_options; |
1319 | struct vm_pageout_queue *iq; |
1320 | ppnum_t phys_page; |
1321 | size_t pages_moved = 0; |
1322 | |
1323 | |
1324 | iq = &vm_pageout_queue_internal; |
1325 | |
1326 | vm_page_lock_queues(); |
1327 | |
1328 | #if DEVELOPMENT || DEBUG |
1329 | if (perf_test) { |
1330 | iq = &vm_pageout_queue_benchmark; |
1331 | // ensure the benchmark queue isn't throttled |
1332 | iq->pgo_maxlaundry = (unsigned int) qcount; |
1333 | } |
#endif /* DEVELOPMENT || DEBUG */
1335 | |
1336 | while (qcount && !vm_page_queue_empty(q)) { |
1337 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); |
1338 | |
1339 | if (VM_PAGE_Q_THROTTLED(iq)) { |
1340 | if (l_object != NULL) { |
1341 | vm_object_unlock(l_object); |
1342 | l_object = NULL; |
1343 | } |
1344 | iq->pgo_draining = TRUE; |
1345 | |
assert_wait((event_t) (&iq->pgo_laundry + 1), THREAD_INTERRUPTIBLE);
1347 | vm_page_unlock_queues(); |
1348 | |
1349 | thread_block(THREAD_CONTINUE_NULL); |
1350 | |
1351 | vm_page_lock_queues(); |
1352 | delayed_unlock = 0; |
1353 | continue; |
1354 | } |
1355 | m = (vm_page_t) vm_page_queue_first(q); |
1356 | m_object = VM_PAGE_OBJECT(m); |
1357 | |
1358 | /* |
1359 | * check to see if we currently are working |
1360 | * with the same object... if so, we've |
1361 | * already got the lock |
1362 | */ |
1363 | if (m_object != l_object) { |
1364 | if (!m_object->internal) { |
1365 | goto reenter_pg_on_q; |
1366 | } |
1367 | |
1368 | /* |
1369 | * the object associated with candidate page is |
1370 | * different from the one we were just working |
1371 | * with... dump the lock if we still own it |
1372 | */ |
1373 | if (l_object != NULL) { |
1374 | vm_object_unlock(l_object); |
1375 | l_object = NULL; |
1376 | } |
1377 | if (m_object != t_object) { |
1378 | try_failed_count = 0; |
1379 | } |
1380 | |
1381 | /* |
* Try to lock object; since we've already got the
* page queues lock, we can only 'try' for this one.
* If the 'try' fails, we need to do a mutex_pause
1385 | * to allow the owner of the object lock a chance to |
1386 | * run... |
1387 | */ |
1388 | if (!vm_object_lock_try_scan(m_object)) { |
1389 | if (try_failed_count > 20) { |
1390 | goto reenter_pg_on_q; |
1391 | } |
1392 | vm_page_unlock_queues(); |
1393 | mutex_pause(try_failed_count++); |
1394 | vm_page_lock_queues(); |
1395 | delayed_unlock = 0; |
1396 | |
1397 | t_object = m_object; |
1398 | continue; |
1399 | } |
1400 | l_object = m_object; |
1401 | } |
1402 | if (!m_object->alive || m->vmp_cleaning || m->vmp_laundry || m->vmp_busy || m->vmp_absent || VMP_ERROR_GET(m) || m->vmp_free_when_done) { |
1403 | /* |
1404 | * page is not to be cleaned |
1405 | * put it back on the head of its queue |
1406 | */ |
1407 | goto reenter_pg_on_q; |
1408 | } |
1409 | phys_page = VM_PAGE_GET_PHYS_PAGE(m); |
1410 | |
1411 | if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) { |
refmod_state = pmap_get_refmod(phys_page);
1413 | |
1414 | if (refmod_state & VM_MEM_REFERENCED) { |
1415 | m->vmp_reference = TRUE; |
1416 | } |
1417 | if (refmod_state & VM_MEM_MODIFIED) { |
1418 | SET_PAGE_DIRTY(m, FALSE); |
1419 | } |
1420 | } |
1421 | if (m->vmp_reference == TRUE) { |
1422 | m->vmp_reference = FALSE; |
pmap_clear_refmod_options(phys_page, VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
1424 | goto reenter_pg_on_q; |
1425 | } |
1426 | if (m->vmp_pmapped == TRUE) { |
1427 | if (m->vmp_dirty || m->vmp_precious) { |
1428 | pmap_options = PMAP_OPTIONS_COMPRESSOR; |
1429 | } else { |
1430 | pmap_options = PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; |
1431 | } |
refmod_state = pmap_disconnect_options(phys_page, pmap_options, NULL);
1433 | if (refmod_state & VM_MEM_MODIFIED) { |
1434 | SET_PAGE_DIRTY(m, FALSE); |
1435 | } |
1436 | } |
1437 | |
1438 | if (!m->vmp_dirty && !m->vmp_precious) { |
1439 | vm_page_unlock_queues(); |
1440 | VM_PAGE_FREE(m); |
1441 | vm_page_lock_queues(); |
1442 | delayed_unlock = 0; |
1443 | |
1444 | goto next_pg; |
1445 | } |
1446 | if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) { |
1447 | if (!m_object->pager_initialized) { |
1448 | vm_page_unlock_queues(); |
1449 | |
vm_object_collapse(m_object, (vm_object_offset_t) 0, TRUE);
1451 | |
1452 | if (!m_object->pager_initialized) { |
vm_object_compressor_pager_create(m_object);
1454 | } |
1455 | |
1456 | vm_page_lock_queues(); |
1457 | delayed_unlock = 0; |
1458 | } |
1459 | if (!m_object->pager_initialized || m_object->pager == MEMORY_OBJECT_NULL) { |
1460 | goto reenter_pg_on_q; |
1461 | } |
1462 | /* |
1463 | * vm_object_compressor_pager_create will drop the object lock |
1464 | * which means 'm' may no longer be valid to use |
1465 | */ |
1466 | continue; |
1467 | } |
1468 | |
1469 | if (!perf_test) { |
1470 | /* |
1471 | * we've already factored out pages in the laundry which |
1472 | * means this page can't be on the pageout queue so it's |
1473 | * safe to do the vm_page_queues_remove |
1474 | */ |
1475 | bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE); |
vm_page_queues_remove(m, TRUE);
1477 | if (donate) { |
1478 | /* |
1479 | * The compressor needs to see this bit to know |
1480 | * where this page needs to land. Also if stolen, |
1481 | * this bit helps put the page back in the right |
1482 | * special queue where it belongs. |
1483 | */ |
1484 | m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE; |
1485 | } |
1486 | } else { |
1487 | vm_page_queue_remove(q, m, vmp_pageq); |
1488 | } |
1489 | |
1490 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); |
1491 | |
vm_pageout_cluster_to_queue(m, iq);
1493 | |
1494 | pages_moved++; |
1495 | goto next_pg; |
1496 | |
1497 | reenter_pg_on_q: |
1498 | vm_page_queue_remove(q, m, vmp_pageq); |
1499 | vm_page_queue_enter(q, m, vmp_pageq); |
1500 | next_pg: |
1501 | qcount--; |
1502 | try_failed_count = 0; |
1503 | |
1504 | if (delayed_unlock++ > 128) { |
1505 | if (l_object != NULL) { |
1506 | vm_object_unlock(l_object); |
1507 | l_object = NULL; |
1508 | } |
lck_mtx_yield(&vm_page_queue_lock);
1510 | delayed_unlock = 0; |
1511 | } |
1512 | } |
1513 | if (l_object != NULL) { |
1514 | vm_object_unlock(l_object); |
1515 | l_object = NULL; |
1516 | } |
1517 | vm_page_unlock_queues(); |
1518 | return pages_moved; |
1519 | } |
1520 | |
1521 | |
1522 | |
1523 | /* |
1524 | * function in BSD to apply I/O throttle to the pageout thread |
1525 | */ |
1526 | extern void vm_pageout_io_throttle(void); |
1527 | |
1528 | #define VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, obj) \ |
1529 | MACRO_BEGIN \ |
1530 | /* \ |
1531 | * If a "reusable" page somehow made it back into \ |
1532 | * the active queue, it's been re-used and is not \ |
1533 | * quite re-usable. \ |
1534 | * If the VM object was "all_reusable", consider it \ |
1535 | * as "all re-used" instead of converting it to \ |
1536 | * "partially re-used", which could be expensive. \ |
1537 | */ \ |
1538 | assert(VM_PAGE_OBJECT((m)) == (obj)); \ |
1539 | if ((m)->vmp_reusable || \ |
1540 | (obj)->all_reusable) { \ |
1541 | vm_object_reuse_pages((obj), \ |
1542 | (m)->vmp_offset, \ |
1543 | (m)->vmp_offset + PAGE_SIZE_64, \ |
1544 | FALSE); \ |
1545 | } \ |
1546 | MACRO_END |
1547 | |
1548 | |
1549 | #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT 64 |
1550 | #define VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX 1024 |
1551 | |
1552 | #define FCS_IDLE 0 |
1553 | #define FCS_DELAYED 1 |
1554 | #define FCS_DEADLOCK_DETECTED 2 |
1555 | |
1556 | struct flow_control { |
1557 | int state; |
1558 | mach_timespec_t ts; |
1559 | }; |
1560 | |
1561 | |
1562 | uint64_t vm_pageout_rejected_bq_internal = 0; |
1563 | uint64_t vm_pageout_rejected_bq_external = 0; |
1564 | uint64_t vm_pageout_skipped_bq_internal = 0; |
1565 | uint64_t vm_pageout_skipped_bq_external = 0; |
1566 | |
1567 | #define ANONS_GRABBED_LIMIT 2 |
1568 | |
1569 | |
1570 | #if 0 |
1571 | static void vm_pageout_delayed_unlock(int *, int *, vm_page_t *); |
1572 | #endif |
1573 | static void vm_pageout_prepare_to_block(vm_object_t *, int *, vm_page_t *, int *, int); |
1574 | |
1575 | #define VM_PAGEOUT_PB_NO_ACTION 0 |
1576 | #define VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER 1 |
1577 | #define VM_PAGEOUT_PB_THREAD_YIELD 2 |
1578 | |
1579 | |
1580 | #if 0 |
1581 | static void |
1582 | vm_pageout_delayed_unlock(int *delayed_unlock, int *local_freed, vm_page_t *local_freeq) |
1583 | { |
1584 | if (*local_freeq) { |
1585 | vm_page_unlock_queues(); |
1586 | |
1587 | VM_DEBUG_CONSTANT_EVENT( |
1588 | vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_START, |
1589 | vm_page_free_count, 0, 0, 1); |
1590 | |
1591 | vm_page_free_list(*local_freeq, TRUE); |
1592 | |
1593 | VM_DEBUG_CONSTANT_EVENT(vm_pageout_freelist, VM_PAGEOUT_FREELIST, DBG_FUNC_END, |
1594 | vm_page_free_count, *local_freed, 0, 1); |
1595 | |
1596 | *local_freeq = NULL; |
1597 | *local_freed = 0; |
1598 | |
1599 | vm_page_lock_queues(); |
1600 | } else { |
1601 | lck_mtx_yield(&vm_page_queue_lock); |
1602 | } |
1603 | *delayed_unlock = 1; |
1604 | } |
1605 | #endif |
1606 | |
1607 | |
1608 | static void |
1609 | vm_pageout_prepare_to_block(vm_object_t *object, int *delayed_unlock, |
1610 | vm_page_t *local_freeq, int *local_freed, int action) |
1611 | { |
1612 | vm_page_unlock_queues(); |
1613 | |
1614 | if (*object != NULL) { |
1615 | vm_object_unlock(*object); |
1616 | *object = NULL; |
1617 | } |
1618 | if (*local_freeq) { |
vm_page_free_list(*local_freeq, TRUE);
1620 | |
1621 | *local_freeq = NULL; |
1622 | *local_freed = 0; |
1623 | } |
1624 | *delayed_unlock = 1; |
1625 | |
1626 | switch (action) { |
1627 | case VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER: |
1628 | vm_consider_waking_compactor_swapper(); |
1629 | break; |
1630 | case VM_PAGEOUT_PB_THREAD_YIELD: |
thread_yield_internal(1);
1632 | break; |
1633 | case VM_PAGEOUT_PB_NO_ACTION: |
1634 | default: |
1635 | break; |
1636 | } |
1637 | vm_page_lock_queues(); |
1638 | } |
1639 | |
1640 | |
1641 | static struct vm_pageout_vminfo last; |
1642 | |
1643 | uint64_t last_vm_page_pages_grabbed = 0; |
1644 | |
1645 | extern uint32_t c_segment_pages_compressed; |
1646 | |
1647 | extern uint64_t ; |
1648 | extern struct memory_object_pager_ops ; |
1649 | |
1650 | void |
1651 | update_vm_info(void) |
1652 | { |
1653 | unsigned long tmp; |
1654 | uint64_t tmp64; |
1655 | |
1656 | vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count = vm_page_active_count; |
1657 | vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count = vm_page_speculative_count; |
1658 | vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count = vm_page_inactive_count; |
1659 | vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count = vm_page_anonymous_count; |
1660 | |
1661 | vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count = vm_page_free_count; |
1662 | vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count = vm_page_wire_count; |
1663 | vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count = VM_PAGE_COMPRESSOR_COUNT; |
1664 | |
1665 | vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed = c_segment_pages_compressed; |
1666 | vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count = vm_page_pageable_internal_count; |
1667 | vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count = vm_page_pageable_external_count; |
1668 | vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count = vm_page_xpmapped_external_count; |
1669 | vm_pageout_stats[vm_pageout_stat_now].vm_page_realtime_count = vm_page_realtime_count; |
1670 | |
1671 | tmp = vm_pageout_vminfo.vm_pageout_considered_page; |
1672 | vm_pageout_stats[vm_pageout_stat_now].considered = (unsigned int)(tmp - last.vm_pageout_considered_page); |
1673 | last.vm_pageout_considered_page = tmp; |
1674 | |
1675 | tmp64 = vm_pageout_vminfo.vm_pageout_compressions; |
1676 | vm_pageout_stats[vm_pageout_stat_now].pages_compressed = (unsigned int)(tmp64 - last.vm_pageout_compressions); |
1677 | last.vm_pageout_compressions = tmp64; |
1678 | |
1679 | tmp = vm_pageout_vminfo.vm_compressor_failed; |
1680 | vm_pageout_stats[vm_pageout_stat_now].failed_compressions = (unsigned int)(tmp - last.vm_compressor_failed); |
1681 | last.vm_compressor_failed = tmp; |
1682 | |
1683 | tmp64 = vm_pageout_vminfo.vm_compressor_pages_grabbed; |
1684 | vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor = (unsigned int)(tmp64 - last.vm_compressor_pages_grabbed); |
1685 | last.vm_compressor_pages_grabbed = tmp64; |
1686 | |
1687 | tmp = vm_pageout_vminfo.vm_phantom_cache_found_ghost; |
1688 | vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found = (unsigned int)(tmp - last.vm_phantom_cache_found_ghost); |
1689 | last.vm_phantom_cache_found_ghost = tmp; |
1690 | |
1691 | tmp = vm_pageout_vminfo.vm_phantom_cache_added_ghost; |
1692 | vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added = (unsigned int)(tmp - last.vm_phantom_cache_added_ghost); |
1693 | last.vm_phantom_cache_added_ghost = tmp; |
1694 | |
1695 | tmp64 = counter_load(&vm_page_grab_count); |
1696 | vm_pageout_stats[vm_pageout_stat_now].pages_grabbed = (unsigned int)(tmp64 - last_vm_page_pages_grabbed); |
1697 | last_vm_page_pages_grabbed = tmp64; |
1698 | |
1699 | tmp = vm_pageout_vminfo.vm_page_pages_freed; |
1700 | vm_pageout_stats[vm_pageout_stat_now].pages_freed = (unsigned int)(tmp - last.vm_page_pages_freed); |
1701 | last.vm_page_pages_freed = tmp; |
1702 | |
1703 | if (vm_pageout_stats[vm_pageout_stat_now].considered) { |
1704 | tmp = vm_pageout_vminfo.vm_pageout_pages_evicted; |
1705 | vm_pageout_stats[vm_pageout_stat_now].pages_evicted = (unsigned int)(tmp - last.vm_pageout_pages_evicted); |
1706 | last.vm_pageout_pages_evicted = tmp; |
1707 | |
1708 | tmp = vm_pageout_vminfo.vm_pageout_pages_purged; |
1709 | vm_pageout_stats[vm_pageout_stat_now].pages_purged = (unsigned int)(tmp - last.vm_pageout_pages_purged); |
1710 | last.vm_pageout_pages_purged = tmp; |
1711 | |
1712 | tmp = vm_pageout_vminfo.vm_pageout_freed_speculative; |
1713 | vm_pageout_stats[vm_pageout_stat_now].freed_speculative = (unsigned int)(tmp - last.vm_pageout_freed_speculative); |
1714 | last.vm_pageout_freed_speculative = tmp; |
1715 | |
1716 | tmp = vm_pageout_vminfo.vm_pageout_freed_external; |
1717 | vm_pageout_stats[vm_pageout_stat_now].freed_external = (unsigned int)(tmp - last.vm_pageout_freed_external); |
1718 | last.vm_pageout_freed_external = tmp; |
1719 | |
1720 | tmp = vm_pageout_vminfo.vm_pageout_inactive_referenced; |
1721 | vm_pageout_stats[vm_pageout_stat_now].inactive_referenced = (unsigned int)(tmp - last.vm_pageout_inactive_referenced); |
1722 | last.vm_pageout_inactive_referenced = tmp; |
1723 | |
1724 | tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external; |
1725 | vm_pageout_stats[vm_pageout_stat_now].throttled_external_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_external); |
1726 | last.vm_pageout_scan_inactive_throttled_external = tmp; |
1727 | |
1728 | tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_external; |
1729 | vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_external); |
1730 | last.vm_pageout_inactive_dirty_external = tmp; |
1731 | |
1732 | tmp = vm_pageout_vminfo.vm_pageout_freed_cleaned; |
1733 | vm_pageout_stats[vm_pageout_stat_now].freed_cleaned = (unsigned int)(tmp - last.vm_pageout_freed_cleaned); |
1734 | last.vm_pageout_freed_cleaned = tmp; |
1735 | |
1736 | tmp = vm_pageout_vminfo.vm_pageout_inactive_nolock; |
1737 | vm_pageout_stats[vm_pageout_stat_now].inactive_nolock = (unsigned int)(tmp - last.vm_pageout_inactive_nolock); |
1738 | last.vm_pageout_inactive_nolock = tmp; |
1739 | |
1740 | tmp = vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal; |
1741 | vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q = (unsigned int)(tmp - last.vm_pageout_scan_inactive_throttled_internal); |
1742 | last.vm_pageout_scan_inactive_throttled_internal = tmp; |
1743 | |
1744 | tmp = vm_pageout_vminfo.vm_pageout_skipped_external; |
1745 | vm_pageout_stats[vm_pageout_stat_now].skipped_external = (unsigned int)(tmp - last.vm_pageout_skipped_external); |
1746 | last.vm_pageout_skipped_external = tmp; |
1747 | |
1748 | tmp = vm_pageout_vminfo.vm_pageout_skipped_internal; |
1749 | vm_pageout_stats[vm_pageout_stat_now].skipped_internal = (unsigned int)(tmp - last.vm_pageout_skipped_internal); |
1750 | last.vm_pageout_skipped_internal = tmp; |
1751 | |
1752 | tmp = vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded; |
1753 | vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded = (unsigned int)(tmp - last.vm_pageout_reactivation_limit_exceeded); |
1754 | last.vm_pageout_reactivation_limit_exceeded = tmp; |
1755 | |
1756 | tmp = vm_pageout_vminfo.vm_pageout_inactive_force_reclaim; |
1757 | vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim = (unsigned int)(tmp - last.vm_pageout_inactive_force_reclaim); |
1758 | last.vm_pageout_inactive_force_reclaim = tmp; |
1759 | |
1760 | tmp = vm_pageout_vminfo.vm_pageout_freed_internal; |
1761 | vm_pageout_stats[vm_pageout_stat_now].freed_internal = (unsigned int)(tmp - last.vm_pageout_freed_internal); |
1762 | last.vm_pageout_freed_internal = tmp; |
1763 | |
1764 | tmp = vm_pageout_vminfo.vm_pageout_considered_bq_internal; |
1765 | vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal = (unsigned int)(tmp - last.vm_pageout_considered_bq_internal); |
1766 | last.vm_pageout_considered_bq_internal = tmp; |
1767 | |
1768 | tmp = vm_pageout_vminfo.vm_pageout_considered_bq_external; |
1769 | vm_pageout_stats[vm_pageout_stat_now].considered_bq_external = (unsigned int)(tmp - last.vm_pageout_considered_bq_external); |
1770 | last.vm_pageout_considered_bq_external = tmp; |
1771 | |
1772 | tmp = vm_pageout_vminfo.vm_pageout_filecache_min_reactivated; |
1773 | vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations = (unsigned int)(tmp - last.vm_pageout_filecache_min_reactivated); |
1774 | last.vm_pageout_filecache_min_reactivated = tmp; |
1775 | |
1776 | tmp = vm_pageout_vminfo.vm_pageout_inactive_dirty_internal; |
1777 | vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal = (unsigned int)(tmp - last.vm_pageout_inactive_dirty_internal); |
1778 | last.vm_pageout_inactive_dirty_internal = tmp; |
1779 | |
1780 | tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache; |
1781 | vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_sharedcache); |
1782 | last.vm_pageout_forcereclaimed_sharedcache = tmp; |
1783 | |
1784 | tmp = vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime; |
1785 | vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime = (unsigned int)(tmp - last.vm_pageout_forcereclaimed_realtime); |
1786 | last.vm_pageout_forcereclaimed_realtime = tmp; |
1787 | |
1788 | tmp = vm_pageout_vminfo.vm_pageout_protected_sharedcache; |
1789 | vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache = (unsigned int)(tmp - last.vm_pageout_protected_sharedcache); |
1790 | last.vm_pageout_protected_sharedcache = tmp; |
1791 | |
1792 | tmp = vm_pageout_vminfo.vm_pageout_protected_realtime; |
1793 | vm_pageout_stats[vm_pageout_stat_now].protected_realtime = (unsigned int)(tmp - last.vm_pageout_protected_realtime); |
1794 | last.vm_pageout_protected_realtime = tmp; |
1795 | } |
1796 | |
1797 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO1)) | DBG_FUNC_NONE, |
1798 | vm_pageout_stats[vm_pageout_stat_now].vm_page_active_count, |
1799 | vm_pageout_stats[vm_pageout_stat_now].vm_page_speculative_count, |
1800 | vm_pageout_stats[vm_pageout_stat_now].vm_page_inactive_count, |
1801 | vm_pageout_stats[vm_pageout_stat_now].vm_page_anonymous_count, |
1802 | 0); |
1803 | |
1804 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO2)) | DBG_FUNC_NONE, |
1805 | vm_pageout_stats[vm_pageout_stat_now].vm_page_free_count, |
1806 | vm_pageout_stats[vm_pageout_stat_now].vm_page_wire_count, |
1807 | vm_pageout_stats[vm_pageout_stat_now].vm_page_compressor_count, |
1808 | 0, |
1809 | 0); |
1810 | |
1811 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO3)) | DBG_FUNC_NONE, |
1812 | vm_pageout_stats[vm_pageout_stat_now].vm_page_pages_compressed, |
1813 | vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_internal_count, |
1814 | vm_pageout_stats[vm_pageout_stat_now].vm_page_pageable_external_count, |
1815 | vm_pageout_stats[vm_pageout_stat_now].vm_page_xpmapped_external_count, |
1816 | 0); |
1817 | |
1818 | if (vm_pageout_stats[vm_pageout_stat_now].considered || |
1819 | vm_pageout_stats[vm_pageout_stat_now].pages_compressed || |
1820 | vm_pageout_stats[vm_pageout_stat_now].failed_compressions) { |
1821 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO4)) | DBG_FUNC_NONE, |
1822 | vm_pageout_stats[vm_pageout_stat_now].considered, |
1823 | vm_pageout_stats[vm_pageout_stat_now].freed_speculative, |
1824 | vm_pageout_stats[vm_pageout_stat_now].freed_external, |
1825 | vm_pageout_stats[vm_pageout_stat_now].inactive_referenced, |
1826 | 0); |
1827 | |
1828 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO5)) | DBG_FUNC_NONE, |
1829 | vm_pageout_stats[vm_pageout_stat_now].throttled_external_q, |
1830 | vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_external, |
1831 | vm_pageout_stats[vm_pageout_stat_now].freed_cleaned, |
1832 | vm_pageout_stats[vm_pageout_stat_now].inactive_nolock, |
1833 | 0); |
1834 | |
1835 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO6)) | DBG_FUNC_NONE, |
1836 | vm_pageout_stats[vm_pageout_stat_now].throttled_internal_q, |
1837 | vm_pageout_stats[vm_pageout_stat_now].pages_compressed, |
1838 | vm_pageout_stats[vm_pageout_stat_now].pages_grabbed_by_compressor, |
1839 | vm_pageout_stats[vm_pageout_stat_now].skipped_external, |
1840 | 0); |
1841 | |
1842 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO7)) | DBG_FUNC_NONE, |
1843 | vm_pageout_stats[vm_pageout_stat_now].reactivation_limit_exceeded, |
1844 | vm_pageout_stats[vm_pageout_stat_now].forced_inactive_reclaim, |
1845 | vm_pageout_stats[vm_pageout_stat_now].failed_compressions, |
1846 | vm_pageout_stats[vm_pageout_stat_now].freed_internal, |
1847 | 0); |
1848 | |
1849 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO8)) | DBG_FUNC_NONE, |
1850 | vm_pageout_stats[vm_pageout_stat_now].considered_bq_internal, |
1851 | vm_pageout_stats[vm_pageout_stat_now].considered_bq_external, |
1852 | vm_pageout_stats[vm_pageout_stat_now].filecache_min_reactivations, |
1853 | vm_pageout_stats[vm_pageout_stat_now].cleaned_dirty_internal, |
1854 | 0); |
1855 | |
1856 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO10)) | DBG_FUNC_NONE, |
1857 | vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_sharedcache, |
1858 | vm_pageout_stats[vm_pageout_stat_now].forcereclaimed_realtime, |
1859 | vm_pageout_stats[vm_pageout_stat_now].protected_sharedcache, |
1860 | vm_pageout_stats[vm_pageout_stat_now].protected_realtime, |
1861 | 0); |
1862 | } |
1863 | KERNEL_DEBUG_CONSTANT((MACHDBG_CODE(DBG_MACH_VM, VM_INFO9)) | DBG_FUNC_NONE, |
1864 | vm_pageout_stats[vm_pageout_stat_now].pages_grabbed, |
1865 | vm_pageout_stats[vm_pageout_stat_now].pages_freed, |
1866 | vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_found, |
1867 | vm_pageout_stats[vm_pageout_stat_now].phantom_ghosts_added, |
1868 | 0); |
1869 | |
1870 | record_memory_pressure(); |
1871 | } |
1872 | |
1873 | extern boolean_t hibernation_vmqueues_inspection; |
1874 | |
1875 | /* |
1876 | * Return values for functions called by vm_pageout_scan |
1877 | * that control its flow. |
1878 | * |
1879 | * PROCEED -- vm_pageout_scan will keep making forward progress. |
1880 | * DONE_RETURN -- page demand satisfied, work is done -> vm_pageout_scan returns. |
 * NEXT_ITERATION -- restart the 'for' loop in vm_pageout_scan, i.e. continue.
1882 | */ |
1883 | |
1884 | #define VM_PAGEOUT_SCAN_PROCEED (0) |
1885 | #define VM_PAGEOUT_SCAN_DONE_RETURN (1) |
1886 | #define VM_PAGEOUT_SCAN_NEXT_ITERATION (2) |
1887 | |
1888 | /* |
1889 | * This function is called only from vm_pageout_scan and |
 * it moves overflow secluded pages (one at a time) to the
1891 | * batched 'local' free Q or active Q. |
1892 | */ |
1893 | static void |
1894 | vps_deal_with_secluded_page_overflow(vm_page_t *local_freeq, int *local_freed) |
1895 | { |
1896 | #if CONFIG_SECLUDED_MEMORY |
1897 | /* |
1898 | * Deal with secluded_q overflow. |
1899 | */ |
1900 | if (vm_page_secluded_count > vm_page_secluded_target) { |
1901 | vm_page_t secluded_page; |
1902 | |
1903 | /* |
1904 | * SECLUDED_AGING_BEFORE_ACTIVE: |
1905 | * Excess secluded pages go to the active queue and |
1906 | * will later go to the inactive queue. |
1907 | */ |
1908 | assert((vm_page_secluded_count_free + |
1909 | vm_page_secluded_count_inuse) == |
1910 | vm_page_secluded_count); |
1911 | secluded_page = (vm_page_t)vm_page_queue_first(&vm_page_queue_secluded); |
1912 | assert(secluded_page->vmp_q_state == VM_PAGE_ON_SECLUDED_Q); |
1913 | |
1914 | vm_page_queues_remove(secluded_page, FALSE); |
1915 | assert(!secluded_page->vmp_fictitious); |
1916 | assert(!VM_PAGE_WIRED(secluded_page)); |
1917 | |
1918 | if (secluded_page->vmp_object == 0) { |
1919 | /* transfer to free queue */ |
1920 | assert(secluded_page->vmp_busy); |
1921 | secluded_page->vmp_snext = *local_freeq; |
1922 | *local_freeq = secluded_page; |
1923 | *local_freed += 1; |
1924 | } else { |
1925 | /* transfer to head of active queue */ |
1926 | vm_page_enqueue_active(secluded_page, FALSE); |
1927 | secluded_page = VM_PAGE_NULL; |
1928 | } |
1929 | } |
1930 | #else /* CONFIG_SECLUDED_MEMORY */ |
1931 | |
1932 | #pragma unused(local_freeq) |
1933 | #pragma unused(local_freed) |
1934 | |
1935 | return; |
1936 | |
1937 | #endif /* CONFIG_SECLUDED_MEMORY */ |
1938 | } |
1939 | |
1940 | /* |
1941 | * This function is called only from vm_pageout_scan and |
1942 | * it initializes the loop targets for vm_pageout_scan(). |
1943 | */ |
1944 | static void |
1945 | vps_init_page_targets(void) |
1946 | { |
1947 | /* |
1948 | * LD TODO: Other page targets should be calculated here too. |
1949 | */ |
1950 | vm_page_anonymous_min = vm_page_inactive_target / 20; |
1951 | |
1952 | if (vm_pageout_state.vm_page_speculative_percentage > 50) { |
1953 | vm_pageout_state.vm_page_speculative_percentage = 50; |
1954 | } else if (vm_pageout_state.vm_page_speculative_percentage <= 0) { |
1955 | vm_pageout_state.vm_page_speculative_percentage = 1; |
1956 | } |
1957 | |
1958 | vm_pageout_state.vm_page_speculative_target = VM_PAGE_SPECULATIVE_TARGET(vm_page_active_count + |
1959 | vm_page_inactive_count); |
1960 | } |
1961 | |
1962 | /* |
1963 | * This function is called only from vm_pageout_scan and |
 * it purges a single VM object at a time and will either
 * make vm_pageout_scan() restart the loop or keep moving forward.
1966 | */ |
1967 | static int |
vps_purge_object(void)
1969 | { |
1970 | int force_purge; |
1971 | |
1972 | assert(available_for_purge >= 0); |
1973 | force_purge = 0; /* no force-purging */ |
1974 | |
1975 | #if VM_PRESSURE_EVENTS |
1976 | vm_pressure_level_t pressure_level; |
1977 | |
1978 | pressure_level = memorystatus_vm_pressure_level; |
1979 | |
1980 | if (pressure_level > kVMPressureNormal) { |
1981 | if (pressure_level >= kVMPressureCritical) { |
1982 | force_purge = vm_pageout_state.memorystatus_purge_on_critical; |
1983 | } else if (pressure_level >= kVMPressureUrgent) { |
1984 | force_purge = vm_pageout_state.memorystatus_purge_on_urgent; |
1985 | } else if (pressure_level >= kVMPressureWarning) { |
1986 | force_purge = vm_pageout_state.memorystatus_purge_on_warning; |
1987 | } |
1988 | } |
1989 | #endif /* VM_PRESSURE_EVENTS */ |
1990 | |
1991 | if (available_for_purge || force_purge) { |
1992 | memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_START); |
1993 | |
1994 | VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_START, vm_page_free_count, 0, 0, 0); |
		if (vm_purgeable_object_purge_one(force_purge, C_DONT_BLOCK)) {
1996 | VM_PAGEOUT_DEBUG(vm_pageout_purged_objects, 1); |
1997 | VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, vm_page_free_count, 0, 0, 0); |
1998 | memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); |
1999 | |
2000 | return VM_PAGEOUT_SCAN_NEXT_ITERATION; |
2001 | } |
2002 | VM_DEBUG_EVENT(vm_pageout_purgeone, VM_PAGEOUT_PURGEONE, DBG_FUNC_END, 0, 0, 0, -1); |
2003 | memoryshot(VM_PAGEOUT_PURGEONE, DBG_FUNC_END); |
2004 | } |
2005 | |
2006 | return VM_PAGEOUT_SCAN_PROCEED; |
2007 | } |
2008 | |
2009 | /* |
2010 | * This function is called only from vm_pageout_scan and |
2011 | * it will try to age the next speculative Q if the oldest |
2012 | * one is empty. |
2013 | */ |
2014 | static int |
2015 | vps_age_speculative_queue(boolean_t force_speculative_aging) |
2016 | { |
2017 | #define DELAY_SPECULATIVE_AGE 1000 |
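	/*
	 * Once the fully-aged timestamp check below fails, re-checking it is
	 * skipped for DELAY_SPECULATIVE_AGE passes through the scan loop
	 * rather than recomputing the deadline on every call.
	 */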
2018 | |
2019 | /* |
2020 | * try to pull pages from the aging bins... |
2021 | * see vm_page.h for an explanation of how |
2022 | * this mechanism works |
2023 | */ |
2024 | boolean_t can_steal = FALSE; |
2025 | int num_scanned_queues; |
	static int delay_speculative_age = 0; /* depends on the # of times we go through the main pageout_scan loop. */
2027 | mach_timespec_t ts; |
2028 | struct vm_speculative_age_q *aq; |
2029 | struct vm_speculative_age_q *sq; |
2030 | |
2031 | sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; |
2032 | |
2033 | aq = &vm_page_queue_speculative[speculative_steal_index]; |
2034 | |
2035 | num_scanned_queues = 0; |
2036 | while (vm_page_queue_empty(&aq->age_q) && |
2037 | num_scanned_queues++ != VM_PAGE_MAX_SPECULATIVE_AGE_Q) { |
2038 | speculative_steal_index++; |
2039 | |
2040 | if (speculative_steal_index > VM_PAGE_MAX_SPECULATIVE_AGE_Q) { |
2041 | speculative_steal_index = VM_PAGE_MIN_SPECULATIVE_AGE_Q; |
2042 | } |
2043 | |
2044 | aq = &vm_page_queue_speculative[speculative_steal_index]; |
2045 | } |
2046 | |
2047 | if (num_scanned_queues == VM_PAGE_MAX_SPECULATIVE_AGE_Q + 1) { |
2048 | /* |
2049 | * XXX We've scanned all the speculative |
2050 | * queues but still haven't found one |
2051 | * that is not empty, even though |
2052 | * vm_page_speculative_count is not 0. |
2053 | */ |
2054 | if (!vm_page_queue_empty(&sq->age_q)) { |
2055 | return VM_PAGEOUT_SCAN_NEXT_ITERATION; |
2056 | } |
2057 | #if DEVELOPMENT || DEBUG |
	panic("vm_pageout_scan: vm_page_speculative_count=%d but queues are empty", vm_page_speculative_count);
2059 | #endif |
2060 | /* readjust... */ |
2061 | vm_page_speculative_count = 0; |
2062 | /* ... and continue */ |
2063 | return VM_PAGEOUT_SCAN_NEXT_ITERATION; |
2064 | } |
2065 | |
2066 | if (vm_page_speculative_count > vm_pageout_state.vm_page_speculative_target || force_speculative_aging == TRUE) { |
2067 | can_steal = TRUE; |
2068 | } else { |
2069 | if (!delay_speculative_age) { |
2070 | mach_timespec_t ts_fully_aged; |
2071 | |
2072 | ts_fully_aged.tv_sec = (VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) / 1000; |
2073 | ts_fully_aged.tv_nsec = ((VM_PAGE_MAX_SPECULATIVE_AGE_Q * vm_pageout_state.vm_page_speculative_q_age_ms) % 1000) |
2074 | * 1000 * NSEC_PER_USEC; |
2075 | |
2076 | ADD_MACH_TIMESPEC(&ts_fully_aged, &aq->age_ts); |
2077 | |
2078 | clock_sec_t sec; |
2079 | clock_nsec_t nsec; |
			clock_get_system_nanotime(&sec, &nsec);
2081 | ts.tv_sec = (unsigned int) sec; |
2082 | ts.tv_nsec = nsec; |
2083 | |
2084 | if (CMP_MACH_TIMESPEC(&ts, &ts_fully_aged) >= 0) { |
2085 | can_steal = TRUE; |
2086 | } else { |
2087 | delay_speculative_age++; |
2088 | } |
2089 | } else { |
2090 | delay_speculative_age++; |
2091 | if (delay_speculative_age == DELAY_SPECULATIVE_AGE) { |
2092 | delay_speculative_age = 0; |
2093 | } |
2094 | } |
2095 | } |
2096 | if (can_steal == TRUE) { |
2097 | vm_page_speculate_ageit(aq); |
2098 | } |
2099 | |
2100 | return VM_PAGEOUT_SCAN_PROCEED; |
2101 | } |
2102 | |
2103 | /* |
2104 | * This function is called only from vm_pageout_scan and |
2105 | * it evicts a single VM object from the cache. |
2106 | */ |
static inline int
2108 | vps_object_cache_evict(vm_object_t *object_to_unlock) |
2109 | { |
2110 | static int cache_evict_throttle = 0; |
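	/* after an eviction attempt frees nothing, count down 1000 calls before retrying */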
2111 | struct vm_speculative_age_q *sq; |
2112 | |
2113 | sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; |
2114 | |
2115 | if (vm_page_queue_empty(&sq->age_q) && cache_evict_throttle == 0) { |
2116 | int pages_evicted; |
2117 | |
2118 | if (*object_to_unlock != NULL) { |
2119 | vm_object_unlock(*object_to_unlock); |
2120 | *object_to_unlock = NULL; |
2121 | } |
2122 | KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_START, 0, 0, 0, 0, 0); |
2123 | |
2124 | pages_evicted = vm_object_cache_evict(100, 10); |
2125 | |
2126 | KERNEL_DEBUG_CONSTANT(0x13001ec | DBG_FUNC_END, pages_evicted, 0, 0, 0, 0); |
2127 | |
2128 | if (pages_evicted) { |
2129 | vm_pageout_vminfo.vm_pageout_pages_evicted += pages_evicted; |
2130 | |
2131 | VM_DEBUG_EVENT(vm_pageout_cache_evict, VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE, |
2132 | vm_page_free_count, pages_evicted, vm_pageout_vminfo.vm_pageout_pages_evicted, 0); |
2133 | memoryshot(VM_PAGEOUT_CACHE_EVICT, DBG_FUNC_NONE); |
2134 | |
2135 | /* |
2136 | * we just freed up to 100 pages, |
2137 | * so go back to the top of the main loop |
			 * and re-evaluate the memory situation
2139 | */ |
2140 | return VM_PAGEOUT_SCAN_NEXT_ITERATION; |
2141 | } else { |
2142 | cache_evict_throttle = 1000; |
2143 | } |
2144 | } |
2145 | if (cache_evict_throttle) { |
2146 | cache_evict_throttle--; |
2147 | } |
2148 | |
2149 | return VM_PAGEOUT_SCAN_PROCEED; |
2150 | } |
2151 | |
2152 | |
2153 | /* |
2154 | * This function is called only from vm_pageout_scan and |
 * it calculates the filecache minimum that needs to be maintained
2156 | * as we start to steal pages. |
2157 | */ |
2158 | static void |
2159 | vps_calculate_filecache_min(void) |
2160 | { |
2161 | int divisor = vm_pageout_state.vm_page_filecache_min_divisor; |
2162 | |
2163 | #if CONFIG_JETSAM |
2164 | /* |
2165 | * don't let the filecache_min fall below 15% of available memory |
2166 | * on systems with an active compressor that isn't nearing its |
2167 | * limits w/r to accepting new data |
2168 | * |
2169 | * on systems w/o the compressor/swapper, the filecache is always |
2170 | * a very large percentage of the AVAILABLE_NON_COMPRESSED_MEMORY |
2171 | * since most (if not all) of the anonymous pages are in the |
2172 | * throttled queue (which isn't counted as available) which |
2173 | * effectively disables this filter |
2174 | */ |
2175 | if (vm_compressor_low_on_space() || divisor == 0) { |
2176 | vm_pageout_state.vm_page_filecache_min = 0; |
2177 | } else { |
2178 | vm_pageout_state.vm_page_filecache_min = |
2179 | ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; |
2180 | } |
2181 | #else |
2182 | if (vm_compressor_out_of_space() || divisor == 0) { |
2183 | vm_pageout_state.vm_page_filecache_min = 0; |
2184 | } else { |
2185 | /* |
2186 | * don't let the filecache_min fall below the specified critical level |
2187 | */ |
2188 | vm_pageout_state.vm_page_filecache_min = |
2189 | ((AVAILABLE_NON_COMPRESSED_MEMORY) * 10) / divisor; |
2190 | } |
2191 | #endif |
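	/*
	 * If free pages have fallen below a quarter of the reserved pool,
	 * drop the filecache floor entirely so the scan is free to steal
	 * file-backed pages as well.
	 */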
2192 | if (vm_page_free_count < (vm_page_free_reserved / 4)) { |
2193 | vm_pageout_state.vm_page_filecache_min = 0; |
2194 | } |
2195 | } |
2196 | |
2197 | /* |
2198 | * This function is called only from vm_pageout_scan and |
 * it updates the flow control time used to detect whether VM pageout scan
2200 | * isn't making progress. |
2201 | */ |
2202 | static void |
2203 | vps_flow_control_reset_deadlock_timer(struct flow_control *flow_control) |
2204 | { |
2205 | mach_timespec_t ts; |
2206 | clock_sec_t sec; |
2207 | clock_nsec_t nsec; |
2208 | |
2209 | ts.tv_sec = vm_pageout_state.vm_pageout_deadlock_wait / 1000; |
2210 | ts.tv_nsec = (vm_pageout_state.vm_pageout_deadlock_wait % 1000) * 1000 * NSEC_PER_USEC; |
	clock_get_system_nanotime(&sec, &nsec);
2212 | flow_control->ts.tv_sec = (unsigned int) sec; |
2213 | flow_control->ts.tv_nsec = nsec; |
2214 | ADD_MACH_TIMESPEC(&flow_control->ts, &ts); |
2215 | |
2216 | flow_control->state = FCS_DELAYED; |
2217 | |
2218 | vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_internal++; |
2219 | } |
2220 | |
2221 | /* |
2222 | * This function is called only from vm_pageout_scan and |
2223 | * it is the flow control logic of VM pageout scan which |
 * controls whether it should block and for how long.
2225 | * Any blocking of vm_pageout_scan happens ONLY in this function. |
2226 | */ |
2227 | static int |
2228 | vps_flow_control(struct flow_control *flow_control, int *anons_grabbed, vm_object_t *object, int *delayed_unlock, |
2229 | vm_page_t *local_freeq, int *local_freed, int *vm_pageout_deadlock_target, unsigned int inactive_burst_count) |
2230 | { |
2231 | boolean_t exceeded_burst_throttle = FALSE; |
2232 | unsigned int msecs = 0; |
2233 | uint32_t inactive_external_count; |
2234 | mach_timespec_t ts; |
2235 | struct vm_pageout_queue *iq; |
2236 | struct vm_pageout_queue *eq; |
2237 | struct vm_speculative_age_q *sq; |
2238 | |
2239 | iq = &vm_pageout_queue_internal; |
2240 | eq = &vm_pageout_queue_external; |
2241 | sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; |
2242 | |
2243 | /* |
2244 | * Sometimes we have to pause: |
2245 | * 1) No inactive pages - nothing to do. |
2246 | * 2) Loop control - no acceptable pages found on the inactive queue |
2247 | * within the last vm_pageout_burst_inactive_throttle iterations |
2248 | * 3) Flow control - default pageout queue is full |
2249 | */ |
2250 | if (vm_page_queue_empty(&vm_page_queue_inactive) && |
2251 | vm_page_queue_empty(&vm_page_queue_anonymous) && |
2252 | vm_page_queue_empty(&vm_page_queue_cleaned) && |
2253 | vm_page_queue_empty(&sq->age_q)) { |
2254 | VM_PAGEOUT_DEBUG(vm_pageout_scan_empty_throttle, 1); |
2255 | msecs = vm_pageout_state.vm_pageout_empty_wait; |
2256 | } else if (inactive_burst_count >= |
2257 | MIN(vm_pageout_state.vm_pageout_burst_inactive_throttle, |
2258 | (vm_page_inactive_count + |
2259 | vm_page_speculative_count))) { |
2260 | VM_PAGEOUT_DEBUG(vm_pageout_scan_burst_throttle, 1); |
2261 | msecs = vm_pageout_state.vm_pageout_burst_wait; |
2262 | |
2263 | exceeded_burst_throttle = TRUE; |
2264 | } else if (VM_PAGE_Q_THROTTLED(iq) && |
2265 | VM_DYNAMIC_PAGING_ENABLED()) { |
2266 | clock_sec_t sec; |
2267 | clock_nsec_t nsec; |
2268 | |
2269 | switch (flow_control->state) { |
2270 | case FCS_IDLE: |
2271 | if ((vm_page_free_count + *local_freed) < vm_page_free_target && |
2272 | vm_pageout_state.vm_restricted_to_single_processor == FALSE) { |
2273 | /* |
2274 | * since the compressor is running independently of vm_pageout_scan |
2275 | * let's not wait for it just yet... as long as we have a healthy supply |
2276 | * of filecache pages to work with, let's keep stealing those. |
2277 | */ |
2278 | inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; |
2279 | |
2280 | if (vm_page_pageable_external_count > vm_pageout_state.vm_page_filecache_min && |
2281 | (inactive_external_count >= VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { |
2282 | *anons_grabbed = ANONS_GRABBED_LIMIT; |
2283 | VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle_deferred, 1); |
2284 | return VM_PAGEOUT_SCAN_PROCEED; |
2285 | } |
2286 | } |
2287 | |
2288 | vps_flow_control_reset_deadlock_timer(flow_control); |
2289 | msecs = vm_pageout_state.vm_pageout_deadlock_wait; |
2290 | |
2291 | break; |
2292 | |
2293 | case FCS_DELAYED: |
			clock_get_system_nanotime(&sec, &nsec);
2295 | ts.tv_sec = (unsigned int) sec; |
2296 | ts.tv_nsec = nsec; |
2297 | |
2298 | if (CMP_MACH_TIMESPEC(&ts, &flow_control->ts) >= 0) { |
2299 | /* |
2300 | * the pageout thread for the default pager is potentially |
2301 | * deadlocked since the |
2302 | * default pager queue has been throttled for more than the |
2303 | * allowable time... we need to move some clean pages or dirty |
2304 | * pages belonging to the external pagers if they aren't throttled |
2305 | * vm_page_free_wanted represents the number of threads currently |
2306 | * blocked waiting for pages... we'll move one page for each of |
2307 | * these plus a fixed amount to break the logjam... once we're done |
				 * moving this number of pages, we'll re-enter the FCS_DELAYED state
2309 | * with a new timeout target since we have no way of knowing |
2310 | * whether we've broken the deadlock except through observation |
2311 | * of the queue associated with the default pager... we need to |
2312 | * stop moving pages and allow the system to run to see what |
2313 | * state it settles into. |
2314 | */ |
2315 | |
2316 | *vm_pageout_deadlock_target = vm_pageout_state.vm_pageout_deadlock_relief + |
2317 | vm_page_free_wanted + vm_page_free_wanted_privileged; |
2318 | VM_PAGEOUT_DEBUG(vm_pageout_scan_deadlock_detected, 1); |
2319 | flow_control->state = FCS_DEADLOCK_DETECTED; |
2320 | thread_wakeup(VM_PAGEOUT_GC_EVENT); |
2321 | return VM_PAGEOUT_SCAN_PROCEED; |
2322 | } |
2323 | /* |
2324 | * just resniff instead of trying |
2325 | * to compute a new delay time... we're going to be |
2326 | * awakened immediately upon a laundry completion, |
2327 | * so we won't wait any longer than necessary |
2328 | */ |
2329 | msecs = vm_pageout_state.vm_pageout_idle_wait; |
2330 | break; |
2331 | |
2332 | case FCS_DEADLOCK_DETECTED: |
2333 | if (*vm_pageout_deadlock_target) { |
2334 | return VM_PAGEOUT_SCAN_PROCEED; |
2335 | } |
2336 | |
2337 | vps_flow_control_reset_deadlock_timer(flow_control); |
2338 | msecs = vm_pageout_state.vm_pageout_deadlock_wait; |
2339 | |
2340 | break; |
2341 | } |
2342 | } else { |
2343 | /* |
2344 | * No need to pause... |
2345 | */ |
2346 | return VM_PAGEOUT_SCAN_PROCEED; |
2347 | } |
2348 | |
2349 | vm_pageout_scan_wants_object = VM_OBJECT_NULL; |
2350 | |
2351 | vm_pageout_prepare_to_block(object, delayed_unlock, local_freeq, local_freed, |
2352 | VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); |
2353 | |
2354 | if (vm_page_free_count >= vm_page_free_target) { |
2355 | /* |
2356 | * we're here because |
2357 | * 1) someone else freed up some pages while we had |
2358 | * the queues unlocked above |
2359 | * and we've hit one of the 3 conditions that |
2360 | * cause us to pause the pageout scan thread |
2361 | * |
2362 | * since we already have enough free pages, |
2363 | * let's avoid stalling and return normally |
2364 | * |
2365 | * before we return, make sure the pageout I/O threads |
2366 | * are running throttled in case there are still requests |
2367 | * in the laundry... since we have enough free pages |
2368 | * we don't need the laundry to be cleaned in a timely |
2369 | * fashion... so let's avoid interfering with foreground |
2370 | * activity |
2371 | * |
2372 | * we don't want to hold vm_page_queue_free_lock when |
2373 | * calling vm_pageout_adjust_eq_iothrottle (since it |
		 * may cause other locks to be taken), we do the initial
2375 | * check outside of the lock. Once we take the lock, |
2376 | * we recheck the condition since it may have changed. |
2377 | * if it has, no problem, we will make the threads |
2378 | * non-throttled before actually blocking |
2379 | */ |
2380 | vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE); |
2381 | } |
2382 | vm_free_page_lock(); |
2383 | |
2384 | if (vm_page_free_count >= vm_page_free_target && |
2385 | (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { |
2386 | return VM_PAGEOUT_SCAN_DONE_RETURN; |
2387 | } |
2388 | vm_free_page_unlock(); |
2389 | |
2390 | if ((vm_page_free_count + vm_page_cleaned_count) < vm_page_free_target) { |
2391 | /* |
2392 | * we're most likely about to block due to one of |
2393 | * the 3 conditions that cause vm_pageout_scan to |
2394 | * not be able to make forward progress w/r |
2395 | * to providing new pages to the free queue, |
2396 | * so unthrottle the I/O threads in case we |
2397 | * have laundry to be cleaned... it needs |
2398 | * to be completed ASAP. |
2399 | * |
2400 | * even if we don't block, we want the io threads |
2401 | * running unthrottled since the sum of free + |
2402 | * clean pages is still under our free target |
2403 | */ |
2404 | vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE); |
2405 | } |
2406 | if (vm_page_cleaned_count > 0 && exceeded_burst_throttle == FALSE) { |
2407 | /* |
2408 | * if we get here we're below our free target and |
2409 | * we're stalling due to a full laundry queue or |
		 * we don't have any inactive pages other than
2411 | * those in the clean queue... |
2412 | * however, we have pages on the clean queue that |
2413 | * can be moved to the free queue, so let's not |
2414 | * stall the pageout scan |
2415 | */ |
2416 | flow_control->state = FCS_IDLE; |
2417 | return VM_PAGEOUT_SCAN_PROCEED; |
2418 | } |
2419 | if (flow_control->state == FCS_DELAYED && !VM_PAGE_Q_THROTTLED(iq)) { |
2420 | flow_control->state = FCS_IDLE; |
2421 | return VM_PAGEOUT_SCAN_PROCEED; |
2422 | } |
2423 | |
2424 | VM_CHECK_MEMORYSTATUS; |
2425 | |
2426 | if (flow_control->state != FCS_IDLE) { |
2427 | VM_PAGEOUT_DEBUG(vm_pageout_scan_throttle, 1); |
2428 | } |
2429 | |
2430 | iq->pgo_throttled = TRUE; |
	assert_wait_timeout((event_t) &iq->pgo_laundry, THREAD_INTERRUPTIBLE, msecs, 1000 * NSEC_PER_USEC);
2432 | |
2433 | vm_page_unlock_queues(); |
2434 | |
2435 | assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); |
2436 | |
2437 | VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START, |
2438 | iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); |
2439 | memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_START); |
2440 | |
2441 | thread_block(THREAD_CONTINUE_NULL); |
2442 | |
2443 | VM_DEBUG_EVENT(vm_pageout_thread_block, VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END, |
2444 | iq->pgo_laundry, iq->pgo_maxlaundry, msecs, 0); |
2445 | memoryshot(VM_PAGEOUT_THREAD_BLOCK, DBG_FUNC_END); |
2446 | |
2447 | vm_page_lock_queues(); |
2448 | |
2449 | iq->pgo_throttled = FALSE; |
2450 | |
2451 | vps_init_page_targets(); |
2452 | |
2453 | return VM_PAGEOUT_SCAN_NEXT_ITERATION; |
2454 | } |
2455 | |
2456 | extern boolean_t vm_darkwake_mode; |
2457 | /* |
2458 | * This function is called only from vm_pageout_scan and |
2459 | * it will find and return the most appropriate page to be |
2460 | * reclaimed. |
2461 | */ |
2462 | static int |
2463 | vps_choose_victim_page(vm_page_t *victim_page, int *anons_grabbed, boolean_t *grab_anonymous, boolean_t force_anonymous, |
2464 | boolean_t *is_page_from_bg_q, unsigned int *reactivated_this_call) |
2465 | { |
2466 | vm_page_t m = NULL; |
2467 | vm_object_t m_object = VM_OBJECT_NULL; |
2468 | uint32_t inactive_external_count; |
2469 | struct vm_speculative_age_q *sq; |
2470 | struct vm_pageout_queue *iq; |
2471 | int retval = VM_PAGEOUT_SCAN_PROCEED; |
2472 | |
2473 | sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; |
2474 | iq = &vm_pageout_queue_internal; |
2475 | |
2476 | *is_page_from_bg_q = FALSE; |
2477 | |
2478 | m = NULL; |
2479 | m_object = VM_OBJECT_NULL; |
2480 | |
2481 | if (VM_DYNAMIC_PAGING_ENABLED()) { |
2482 | assert(vm_page_throttled_count == 0); |
2483 | assert(vm_page_queue_empty(&vm_page_queue_throttled)); |
2484 | } |
2485 | |
2486 | /* |
2487 | * Try for a clean-queue inactive page. |
2488 | * These are pages that vm_pageout_scan tried to steal earlier, but |
2489 | * were dirty and had to be cleaned. Pick them up now that they are clean. |
2490 | */ |
2491 | if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { |
2492 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); |
2493 | |
2494 | assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q); |
2495 | |
2496 | goto found_page; |
2497 | } |
2498 | |
2499 | /* |
2500 | * The next most eligible pages are ones we paged in speculatively, |
2501 | * but which have not yet been touched and have been aged out. |
2502 | */ |
2503 | if (!vm_page_queue_empty(&sq->age_q)) { |
2504 | m = (vm_page_t) vm_page_queue_first(&sq->age_q); |
2505 | |
2506 | assert(m->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q); |
2507 | |
2508 | if (!m->vmp_dirty || force_anonymous == FALSE) { |
2509 | goto found_page; |
2510 | } else { |
2511 | m = NULL; |
2512 | } |
2513 | } |
2514 | |
2515 | #if !CONFIG_JETSAM |
2516 | if (vm_page_donate_mode != VM_PAGE_DONATE_DISABLED) { |
2517 | if (vm_page_donate_queue_ripe && !vm_page_queue_empty(&vm_page_queue_donate)) { |
2518 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_donate); |
2519 | assert(m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE); |
2520 | goto found_page; |
2521 | } |
2522 | } |
2523 | #endif /* !CONFIG_JETSAM */ |
2524 | |
2525 | if (vm_page_background_mode != VM_PAGE_BG_DISABLED && (vm_page_background_count > vm_page_background_target)) { |
2526 | vm_object_t bg_m_object = NULL; |
2527 | |
2528 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_background); |
2529 | |
2530 | bg_m_object = VM_PAGE_OBJECT(m); |
2531 | |
2532 | if (!VM_PAGE_PAGEABLE(m) || (vm_darkwake_mode && m->vmp_busy)) { |
2533 | /* |
2534 | * This page is on the background queue |
2535 | * but not on a pageable queue OR is busy during |
2536 | * darkwake mode when the target is artificially lowered. |
2537 | * If it is busy during darkwake mode, and we don't skip it, |
2538 | * we will just swing back around and try again with the same |
2539 | * queue and might hit the same page or its neighbor in a |
2540 | * similar state. Both of these are transient states and will |
2541 | * get resolved, but, at this point let's ignore this page. |
2542 | */ |
2543 | if (vm_darkwake_mode && m->vmp_busy) { |
2544 | if (bg_m_object->internal) { |
2545 | vm_pageout_skipped_bq_internal++; |
2546 | } else { |
2547 | vm_pageout_skipped_bq_external++; |
2548 | } |
2549 | } |
2550 | } else if (force_anonymous == FALSE || bg_m_object->internal) { |
2551 | if (bg_m_object->internal && |
2552 | (VM_PAGE_Q_THROTTLED(iq) || |
2553 | vm_compressor_out_of_space() == TRUE || |
2554 | vm_page_free_count < (vm_page_free_reserved / 4))) { |
2555 | vm_pageout_skipped_bq_internal++; |
2556 | } else { |
2557 | *is_page_from_bg_q = TRUE; |
2558 | |
2559 | if (bg_m_object->internal) { |
2560 | vm_pageout_vminfo.vm_pageout_considered_bq_internal++; |
2561 | } else { |
2562 | vm_pageout_vminfo.vm_pageout_considered_bq_external++; |
2563 | } |
2564 | goto found_page; |
2565 | } |
2566 | } |
2567 | } |
2568 | |
2569 | inactive_external_count = vm_page_inactive_count - vm_page_anonymous_count; |
2570 | |
2571 | if ((vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min || force_anonymous == TRUE) || |
2572 | (inactive_external_count < VM_PAGE_INACTIVE_TARGET(vm_page_pageable_external_count))) { |
2573 | *grab_anonymous = TRUE; |
2574 | *anons_grabbed = 0; |
2575 | |
2576 | if (VM_CONFIG_SWAP_IS_ACTIVE) { |
2577 | vm_pageout_vminfo.vm_pageout_skipped_external++; |
2578 | } else { |
2579 | if (vm_page_free_count < (COMPRESSOR_FREE_RESERVED_LIMIT * 2)) { |
2580 | /* |
2581 | * No swap and we are in dangerously low levels of free memory. |
2582 | * If we keep going ahead with anonymous pages, we are going to run into a situation |
2583 | * where the compressor will be stuck waiting for free pages (if it isn't already). |
2584 | * |
2585 | * So, pick a file backed page... |
2586 | */ |
2587 | *grab_anonymous = FALSE; |
2588 | *anons_grabbed = ANONS_GRABBED_LIMIT; |
2589 | vm_pageout_vminfo.vm_pageout_skipped_internal++; |
2590 | } |
2591 | } |
2592 | goto want_anonymous; |
2593 | } |
2594 | *grab_anonymous = (vm_page_anonymous_count > vm_page_anonymous_min); |
2595 | |
2596 | #if CONFIG_JETSAM |
2597 | /* If the file-backed pool has accumulated |
2598 | * significantly more pages than the jetsam |
2599 | * threshold, prefer to reclaim those |
2600 | * inline to minimise compute overhead of reclaiming |
2601 | * anonymous pages. |
2602 | * This calculation does not account for the CPU local |
2603 | * external page queues, as those are expected to be |
2604 | * much smaller relative to the global pools. |
2605 | */ |
2606 | |
2607 | struct vm_pageout_queue *eq = &vm_pageout_queue_external; |
2608 | |
2609 | if (*grab_anonymous == TRUE && !VM_PAGE_Q_THROTTLED(eq)) { |
2610 | if (vm_page_pageable_external_count > |
2611 | vm_pageout_state.vm_page_filecache_min) { |
2612 | if ((vm_page_pageable_external_count * |
2613 | vm_pageout_memorystatus_fb_factor_dr) > |
2614 | (memorystatus_available_pages_critical * |
2615 | vm_pageout_memorystatus_fb_factor_nr)) { |
2616 | *grab_anonymous = FALSE; |
2617 | |
2618 | VM_PAGEOUT_DEBUG(vm_grab_anon_overrides, 1); |
2619 | } |
2620 | } |
2621 | if (*grab_anonymous) { |
2622 | VM_PAGEOUT_DEBUG(vm_grab_anon_nops, 1); |
2623 | } |
2624 | } |
2625 | #endif /* CONFIG_JETSAM */ |
2626 | |
2627 | want_anonymous: |
2628 | if (*grab_anonymous == FALSE || *anons_grabbed >= ANONS_GRABBED_LIMIT || vm_page_queue_empty(&vm_page_queue_anonymous)) { |
2629 | if (!vm_page_queue_empty(&vm_page_queue_inactive)) { |
2630 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); |
2631 | |
2632 | assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_EXTERNAL_Q); |
2633 | *anons_grabbed = 0; |
2634 | |
2635 | if (vm_page_pageable_external_count < vm_pageout_state.vm_page_filecache_min) { |
2636 | if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { |
2637 | if ((++(*reactivated_this_call) % 100)) { |
2638 | vm_pageout_vminfo.vm_pageout_filecache_min_reactivated++; |
2639 | |
						vm_page_activate(m);
2641 | counter_inc(&vm_statistics_reactivations); |
2642 | #if DEVELOPMENT || DEBUG |
2643 | if (*is_page_from_bg_q == TRUE) { |
2644 | if (m_object->internal) { |
2645 | vm_pageout_rejected_bq_internal++; |
2646 | } else { |
2647 | vm_pageout_rejected_bq_external++; |
2648 | } |
2649 | } |
2650 | #endif /* DEVELOPMENT || DEBUG */ |
2651 | vm_pageout_state.vm_pageout_inactive_used++; |
2652 | |
2653 | m = NULL; |
2654 | retval = VM_PAGEOUT_SCAN_NEXT_ITERATION; |
2655 | |
2656 | goto found_page; |
2657 | } |
2658 | |
2659 | /* |
2660 | * steal 1 of the file backed pages even if |
2661 | * we are under the limit that has been set |
2662 | * for a healthy filecache |
2663 | */ |
2664 | } |
2665 | } |
2666 | goto found_page; |
2667 | } |
2668 | } |
2669 | if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { |
2670 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); |
2671 | |
2672 | assert(m->vmp_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q); |
2673 | *anons_grabbed += 1; |
2674 | |
2675 | goto found_page; |
2676 | } |
2677 | |
2678 | m = NULL; |
2679 | |
2680 | found_page: |
2681 | *victim_page = m; |
2682 | |
2683 | return retval; |
2684 | } |
2685 | |
2686 | /* |
2687 | * This function is called only from vm_pageout_scan and |
2688 | * it will put a page back on the active/inactive queue |
2689 | * if we can't reclaim it for some reason. |
2690 | */ |
2691 | static void |
2692 | vps_requeue_page(vm_page_t m, int page_prev_q_state, __unused boolean_t page_from_bg_q) |
2693 | { |
2694 | if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) { |
		vm_page_enqueue_inactive(m, FALSE);
	} else {
		vm_page_activate(m);
2698 | } |
2699 | |
2700 | #if DEVELOPMENT || DEBUG |
2701 | vm_object_t m_object = VM_PAGE_OBJECT(m); |
2702 | |
2703 | if (page_from_bg_q == TRUE) { |
2704 | if (m_object->internal) { |
2705 | vm_pageout_rejected_bq_internal++; |
2706 | } else { |
2707 | vm_pageout_rejected_bq_external++; |
2708 | } |
2709 | } |
2710 | #endif /* DEVELOPMENT || DEBUG */ |
2711 | } |
2712 | |
2713 | /* |
2714 | * This function is called only from vm_pageout_scan and |
2715 | * it will try to grab the victim page's VM object (m_object) |
2716 | * which differs from the previous victim page's object (object). |
2717 | */ |
2718 | static int |
2719 | vps_switch_object(vm_page_t m, vm_object_t m_object, vm_object_t *object, int page_prev_q_state, boolean_t avoid_anon_pages, boolean_t page_from_bg_q) |
2720 | { |
2721 | struct vm_speculative_age_q *sq; |
2722 | |
2723 | sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; |
2724 | |
2725 | /* |
2726 | * the object associated with candidate page is |
2727 | * different from the one we were just working |
2728 | * with... dump the lock if we still own it |
2729 | */ |
2730 | if (*object != NULL) { |
2731 | vm_object_unlock(*object); |
2732 | *object = NULL; |
2733 | } |
2734 | /* |
	 * Try to lock object; since we've already got the
2736 | * page queues lock, we can only 'try' for this one. |
2737 | * if the 'try' fails, we need to do a mutex_pause |
2738 | * to allow the owner of the object lock a chance to |
2739 | * run... otherwise, we're likely to trip over this |
2740 | * object in the same state as we work our way through |
2741 | * the queue... clumps of pages associated with the same |
2742 | * object are fairly typical on the inactive and active queues |
2743 | */ |
2744 | if (!vm_object_lock_try_scan(m_object)) { |
2745 | vm_page_t m_want = NULL; |
2746 | |
2747 | vm_pageout_vminfo.vm_pageout_inactive_nolock++; |
2748 | |
2749 | if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { |
2750 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_nolock, 1); |
2751 | } |
2752 | |
		pmap_clear_reference(VM_PAGE_GET_PHYS_PAGE(m));
2754 | |
2755 | m->vmp_reference = FALSE; |
2756 | |
2757 | if (!m_object->object_is_shared_cache) { |
2758 | /* |
2759 | * don't apply this optimization if this is the shared cache |
2760 | * object, it's too easy to get rid of very hot and important |
2761 | * pages... |
2762 | * m->vmp_object must be stable since we hold the page queues lock... |
2763 | * we can update the scan_collisions field sans the object lock |
2764 | * since it is a separate field and this is the only spot that does |
2765 | * a read-modify-write operation and it is never executed concurrently... |
2766 | * we can asynchronously set this field to 0 when creating a UPL, so it |
			 * is possible for the value to be a bit non-deterministic, but that's ok
2768 | * since it's only used as a hint |
2769 | */ |
2770 | m_object->scan_collisions = 1; |
2771 | } |
2772 | if (page_from_bg_q) { |
2773 | m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_background); |
2774 | } else if (!vm_page_queue_empty(&vm_page_queue_cleaned)) { |
2775 | m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_cleaned); |
2776 | } else if (!vm_page_queue_empty(&sq->age_q)) { |
2777 | m_want = (vm_page_t) vm_page_queue_first(&sq->age_q); |
2778 | } else if ((avoid_anon_pages || vm_page_queue_empty(&vm_page_queue_anonymous)) && |
2779 | !vm_page_queue_empty(&vm_page_queue_inactive)) { |
2780 | m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); |
2781 | } else if (!vm_page_queue_empty(&vm_page_queue_anonymous)) { |
2782 | m_want = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); |
2783 | } |
2784 | |
2785 | /* |
2786 | * this is the next object we're going to be interested in |
2787 | * try to make sure its available after the mutex_pause |
2788 | * returns control |
2789 | */ |
2790 | if (m_want) { |
2791 | vm_pageout_scan_wants_object = VM_PAGE_OBJECT(m_want); |
2792 | } |
2793 | |
2794 | vps_requeue_page(m, page_prev_q_state, page_from_bg_q); |
2795 | |
2796 | return VM_PAGEOUT_SCAN_NEXT_ITERATION; |
2797 | } else { |
2798 | *object = m_object; |
2799 | vm_pageout_scan_wants_object = VM_OBJECT_NULL; |
2800 | } |
2801 | |
2802 | return VM_PAGEOUT_SCAN_PROCEED; |
2803 | } |
2804 | |
2805 | /* |
2806 | * This function is called only from vm_pageout_scan and |
2807 | * it notices that pageout scan may be rendered ineffective |
2808 | * due to a FS deadlock and will jetsam a process if possible. |
2809 | * If jetsam isn't supported, it'll move the page to the active |
2810 | * queue to try and get some different pages pushed onwards so |
2811 | * we can try to get out of this scenario. |
2812 | */ |
2813 | static void |
2814 | vps_deal_with_throttled_queues(vm_page_t m, vm_object_t *object, uint32_t *vm_pageout_inactive_external_forced_reactivate_limit, |
2815 | boolean_t *force_anonymous, __unused boolean_t is_page_from_bg_q) |
2816 | { |
2817 | struct vm_pageout_queue *eq; |
2818 | vm_object_t cur_object = VM_OBJECT_NULL; |
2819 | |
2820 | cur_object = *object; |
2821 | |
2822 | eq = &vm_pageout_queue_external; |
2823 | |
2824 | if (cur_object->internal == FALSE) { |
2825 | /* |
2826 | * we need to break up the following potential deadlock case... |
2827 | * a) The external pageout thread is stuck on the truncate lock for a file that is being extended i.e. written. |
2828 | * b) The thread doing the writing is waiting for pages while holding the truncate lock |
2829 | * c) Most of the pages in the inactive queue belong to this file. |
2830 | * |
2831 | * we are potentially in this deadlock because... |
2832 | * a) the external pageout queue is throttled |
2833 | * b) we're done with the active queue and moved on to the inactive queue |
2834 | * c) we've got a dirty external page |
2835 | * |
2836 | * since we don't know the reason for the external pageout queue being throttled we |
2837 | * must suspect that we are deadlocked, so move the current page onto the active queue |
2838 | * in an effort to cause a page from the active queue to 'age' to the inactive queue |
2839 | * |
2840 | * if we don't have jetsam configured (i.e. we have a dynamic pager), set |
2841 | * 'force_anonymous' to TRUE to cause us to grab a page from the cleaned/anonymous |
2842 | * pool the next time we select a victim page... if we can make enough new free pages, |
2843 | * the deadlock will break, the external pageout queue will empty and it will no longer |
2844 | * be throttled |
2845 | * |
2846 | * if we have jetsam configured, keep a count of the pages reactivated this way so |
2847 | * that we can try to find clean pages in the active/inactive queues before |
2848 | * deciding to jetsam a process |
2849 | */ |
2850 | vm_pageout_vminfo.vm_pageout_scan_inactive_throttled_external++; |
2851 | |
		vm_page_check_pageable_safe(m);
2853 | assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); |
2854 | vm_page_queue_enter(&vm_page_queue_active, m, vmp_pageq); |
2855 | m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; |
2856 | vm_page_active_count++; |
2857 | vm_page_pageable_external_count++; |
2858 | |
2859 | vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, FALSE); |
2860 | |
2861 | #if CONFIG_MEMORYSTATUS && CONFIG_JETSAM |
2862 | |
2863 | #pragma unused(force_anonymous) |
2864 | |
2865 | *vm_pageout_inactive_external_forced_reactivate_limit -= 1; |
2866 | |
2867 | if (*vm_pageout_inactive_external_forced_reactivate_limit <= 0) { |
2868 | *vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; |
2869 | /* |
2870 | * Possible deadlock scenario so request jetsam action |
2871 | */ |
2872 | memorystatus_kill_on_vps_starvation(); |
2873 | VM_DEBUG_CONSTANT_EVENT(vm_pageout_jetsam, VM_PAGEOUT_JETSAM, DBG_FUNC_NONE, |
2874 | vm_page_active_count, vm_page_inactive_count, vm_page_free_count, vm_page_free_count); |
2875 | } |
2876 | #else /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ |
2877 | |
2878 | #pragma unused(vm_pageout_inactive_external_forced_reactivate_limit) |
2879 | |
2880 | *force_anonymous = TRUE; |
2881 | #endif /* CONFIG_MEMORYSTATUS && CONFIG_JETSAM */ |
2882 | } else { |
		vm_page_activate(m);
2884 | counter_inc(&vm_statistics_reactivations); |
2885 | |
2886 | #if DEVELOPMENT || DEBUG |
2887 | if (is_page_from_bg_q == TRUE) { |
2888 | if (cur_object->internal) { |
2889 | vm_pageout_rejected_bq_internal++; |
2890 | } else { |
2891 | vm_pageout_rejected_bq_external++; |
2892 | } |
2893 | } |
2894 | #endif /* DEVELOPMENT || DEBUG */ |
2895 | |
2896 | vm_pageout_state.vm_pageout_inactive_used++; |
2897 | } |
2898 | } |
2899 | |
2900 | |
2901 | void |
2902 | vm_page_balance_inactive(int max_to_move) |
2903 | { |
2904 | vm_page_t m; |
2905 | |
2906 | LCK_MTX_ASSERT(&vm_page_queue_lock, LCK_MTX_ASSERT_OWNED); |
2907 | |
2908 | if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) { |
2909 | /* |
2910 | * It is likely that the hibernation code path is |
2911 | * dealing with these very queues as we are about |
2912 | * to move pages around in/from them and completely |
2913 | * change the linkage of the pages. |
2914 | * |
2915 | * And so we skip the rebalancing of these queues. |
2916 | */ |
2917 | return; |
2918 | } |
2919 | vm_page_inactive_target = VM_PAGE_INACTIVE_TARGET(vm_page_active_count + |
2920 | vm_page_inactive_count + |
2921 | vm_page_speculative_count); |
2922 | |
2923 | while (max_to_move-- && (vm_page_inactive_count + vm_page_speculative_count) < vm_page_inactive_target) { |
2924 | VM_PAGEOUT_DEBUG(vm_pageout_balanced, 1); |
2925 | |
2926 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); |
2927 | |
2928 | assert(m->vmp_q_state == VM_PAGE_ON_ACTIVE_Q); |
2929 | assert(!m->vmp_laundry); |
2930 | assert(!is_kernel_object(VM_PAGE_OBJECT(m))); |
2931 | assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); |
2932 | |
2933 | DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); |
2934 | |
2935 | /* |
2936 | * by not passing in a pmap_flush_context we will forgo any TLB flushing, local or otherwise... |
2937 | * |
2938 | * a TLB flush isn't really needed here since at worst we'll miss the reference bit being |
2939 | * updated in the PTE if a remote processor still has this mapping cached in its TLB when the |
		 * new reference happens. If no further references happen on the page after that remote TLB flushes
2941 | * we'll see a clean, non-referenced page when it eventually gets pulled out of the inactive queue |
2942 | * by pageout_scan, which is just fine since the last reference would have happened quite far |
2943 | * in the past (TLB caches don't hang around for very long), and of course could just as easily |
2944 | * have happened before we moved the page |
2945 | */ |
2946 | if (m->vmp_pmapped == TRUE) { |
2947 | /* |
2948 | * We might be holding the page queue lock as a |
2949 | * spin lock and clearing the "referenced" bit could |
2950 | * take a while if there are lots of mappings of |
2951 | * that page, so make sure we acquire the lock as |
			 * a mutex to avoid a spinlock timeout.
2953 | */ |
2954 | vm_page_lockconvert_queues(); |
			pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(m), VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void *)NULL);
2956 | } |
2957 | |
2958 | /* |
2959 | * The page might be absent or busy, |
2960 | * but vm_page_deactivate can handle that. |
2961 | * FALSE indicates that we don't want a H/W clear reference |
2962 | */ |
		vm_page_deactivate_internal(m, FALSE);
2964 | } |
2965 | } |
2966 | |
2967 | /* |
2968 | * vm_pageout_scan does the dirty work for the pageout daemon. |
2969 | * It returns with both vm_page_queue_free_lock and vm_page_queue_lock |
2970 | * held and vm_page_free_wanted == 0. |
2971 | */ |
2972 | void |
2973 | vm_pageout_scan(void) |
2974 | { |
2975 | unsigned int loop_count = 0; |
2976 | unsigned int inactive_burst_count = 0; |
2977 | unsigned int reactivated_this_call; |
2978 | unsigned int reactivate_limit; |
2979 | vm_page_t local_freeq = NULL; |
2980 | int local_freed = 0; |
2981 | int delayed_unlock; |
2982 | int delayed_unlock_limit = 0; |
2983 | int refmod_state = 0; |
2984 | int vm_pageout_deadlock_target = 0; |
2985 | struct vm_pageout_queue *iq; |
2986 | struct vm_pageout_queue *eq; |
2987 | struct vm_speculative_age_q *sq; |
2988 | struct flow_control flow_control = { .state = 0, .ts = { .tv_sec = 0, .tv_nsec = 0 } }; |
2989 | boolean_t inactive_throttled = FALSE; |
2990 | vm_object_t object = NULL; |
2991 | uint32_t inactive_reclaim_run; |
2992 | boolean_t grab_anonymous = FALSE; |
2993 | boolean_t force_anonymous = FALSE; |
2994 | boolean_t force_speculative_aging = FALSE; |
2995 | int anons_grabbed = 0; |
2996 | int page_prev_q_state = 0; |
2997 | boolean_t page_from_bg_q = FALSE; |
2998 | uint32_t vm_pageout_inactive_external_forced_reactivate_limit = 0; |
2999 | vm_object_t m_object = VM_OBJECT_NULL; |
3000 | int retval = 0; |
3001 | boolean_t lock_yield_check = FALSE; |
3002 | |
3003 | |
3004 | VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_START, |
3005 | vm_pageout_vminfo.vm_pageout_freed_speculative, |
3006 | vm_pageout_state.vm_pageout_inactive_clean, |
3007 | vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, |
3008 | vm_pageout_vminfo.vm_pageout_inactive_dirty_external); |
3009 | |
3010 | flow_control.state = FCS_IDLE; |
3011 | iq = &vm_pageout_queue_internal; |
3012 | eq = &vm_pageout_queue_external; |
3013 | sq = &vm_page_queue_speculative[VM_PAGE_SPECULATIVE_AGED_Q]; |
3014 | |
3015 | /* Ask the pmap layer to return any pages it no longer needs. */ |
3016 | pmap_release_pages_fast(); |
3017 | |
3018 | vm_page_lock_queues(); |
3019 | |
3020 | delayed_unlock = 1; |
3021 | |
3022 | /* |
3023 | * Calculate the max number of referenced pages on the inactive |
3024 | * queue that we will reactivate. |
3025 | */ |
3026 | reactivated_this_call = 0; |
3027 | reactivate_limit = VM_PAGE_REACTIVATE_LIMIT(vm_page_active_count + |
3028 | vm_page_inactive_count); |
3029 | inactive_reclaim_run = 0; |
3030 | |
3031 | vm_pageout_inactive_external_forced_reactivate_limit = vm_page_active_count + vm_page_inactive_count; |
3032 | |
3033 | /* |
3034 | * We must limit the rate at which we send pages to the pagers |
3035 | * so that we don't tie up too many pages in the I/O queues. |
3036 | * We implement a throttling mechanism using the laundry count |
3037 | * to limit the number of pages outstanding to the default |
3038 | * and external pagers. We can bypass the throttles and look |
3039 | * for clean pages if the pageout queues don't drain in a timely |
3040 | * fashion since this may indicate that the pageout paths are |
3041 | * stalled waiting for memory, which only we can provide. |
3042 | */ |
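/*
 * Conceptual sketch of the laundry throttle described above.  The real
 * test is the VM_PAGE_Q_THROTTLED() macro; the comparison below is an
 * assumption about its general shape, not its exact definition.
 *
 *	struct vm_pageout_queue *q = object->internal ? iq : eq;
 *	boolean_t throttled = (q->pgo_laundry >= q->pgo_maxlaundry);
 *
 * While 'throttled' is true, dirty pages bound for that pager are requeued
 * rather than sent to the laundry, unless the queue appears stalled.
 */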
3043 | |
3044 | vps_init_page_targets(); |
3045 | assert(object == NULL); |
3046 | assert(delayed_unlock != 0); |
3047 | |
3048 | for (;;) { |
3049 | vm_page_t m; |
3050 | |
3051 | DTRACE_VM2(rev, int, 1, (uint64_t *), NULL); |
3052 | |
3053 | if (lock_yield_check) { |
3054 | lock_yield_check = FALSE; |
3055 | |
3056 | if (delayed_unlock++ > delayed_unlock_limit) { |
vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3058 | VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); |
3059 | } else if (vm_pageout_scan_wants_object) { |
3060 | vm_page_unlock_queues(); |
3061 | mutex_pause(0); |
3062 | vm_page_lock_queues(); |
} else if (vps_yield_for_pgqlockwaiters && lck_mtx_yield(&vm_page_queue_lock)) {
3064 | VM_PAGEOUT_DEBUG(vm_pageout_yield_for_free_pages, 1); |
3065 | } |
3066 | } |
3067 | |
3068 | if (vm_upl_wait_for_pages < 0) { |
3069 | vm_upl_wait_for_pages = 0; |
3070 | } |
3071 | |
3072 | delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT + vm_upl_wait_for_pages; |
3073 | |
3074 | if (delayed_unlock_limit > VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX) { |
3075 | delayed_unlock_limit = VM_PAGEOUT_DELAYED_UNLOCK_LIMIT_MAX; |
3076 | } |
3077 | |
vps_deal_with_secluded_page_overflow(&local_freeq, &local_freed);
3079 | |
3080 | assert(delayed_unlock); |
3081 | |
3082 | /* |
3083 | * maintain our balance |
3084 | */ |
vm_page_balance_inactive(1);
3086 | |
3087 | |
3088 | /********************************************************************** |
3089 | * above this point we're playing with the active and secluded queues |
3090 | * below this point we're playing with the throttling mechanisms |
3091 | * and the inactive queue |
3092 | **********************************************************************/ |
3093 | |
3094 | if (vm_page_free_count + local_freed >= vm_page_free_target) { |
3095 | vm_pageout_scan_wants_object = VM_OBJECT_NULL; |
3096 | |
vm_pageout_prepare_to_block(&object, &delayed_unlock, &local_freeq, &local_freed,
3098 | VM_PAGEOUT_PB_CONSIDER_WAKING_COMPACTOR_SWAPPER); |
3099 | /* |
3100 | * make sure the pageout I/O threads are running |
3101 | * throttled in case there are still requests |
3102 | * in the laundry... since we have met our targets |
3103 | * we don't need the laundry to be cleaned in a timely |
3104 | * fashion... so let's avoid interfering with foreground |
3105 | * activity |
3106 | */ |
3107 | vm_pageout_adjust_eq_iothrottle(&pgo_iothread_external_state, TRUE); |
3108 | |
3109 | vm_free_page_lock(); |
3110 | |
3111 | if ((vm_page_free_count >= vm_page_free_target) && |
3112 | (vm_page_free_wanted == 0) && (vm_page_free_wanted_privileged == 0)) { |
3113 | /* |
3114 | * done - we have met our target *and* |
3115 | * there is no one waiting for a page. |
3116 | */ |
3117 | return_from_scan: |
3118 | assert(vm_pageout_scan_wants_object == VM_OBJECT_NULL); |
3119 | |
3120 | VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_NONE, |
3121 | vm_pageout_state.vm_pageout_inactive, |
3122 | vm_pageout_state.vm_pageout_inactive_used, 0, 0); |
3123 | VM_DEBUG_CONSTANT_EVENT(vm_pageout_scan, VM_PAGEOUT_SCAN, DBG_FUNC_END, |
3124 | vm_pageout_vminfo.vm_pageout_freed_speculative, |
3125 | vm_pageout_state.vm_pageout_inactive_clean, |
3126 | vm_pageout_vminfo.vm_pageout_inactive_dirty_internal, |
3127 | vm_pageout_vminfo.vm_pageout_inactive_dirty_external); |
3128 | |
3129 | return; |
3130 | } |
3131 | vm_free_page_unlock(); |
3132 | } |
3133 | |
3134 | /* |
3135 | * Before anything, we check if we have any ripe volatile |
3136 | * objects around. If so, try to purge the first object. |
3137 | * If the purge fails, fall through to reclaim a page instead. |
* If the purge succeeds, go back to the top and reevaluate
3139 | * the new memory situation. |
3140 | */ |
3141 | retval = vps_purge_object(); |
3142 | |
3143 | if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { |
3144 | /* |
3145 | * Success |
3146 | */ |
3147 | if (object != NULL) { |
3148 | vm_object_unlock(object); |
3149 | object = NULL; |
3150 | } |
3151 | |
3152 | lock_yield_check = FALSE; |
3153 | continue; |
3154 | } |
3155 | |
3156 | /* |
3157 | * If our 'aged' queue is empty and we have some speculative pages |
3158 | * in the other queues, let's go through and see if we need to age |
3159 | * them. |
3160 | * |
3161 | * If we succeeded in aging a speculative Q or just that everything |
3162 | * looks normal w.r.t queue age and queue counts, we keep going onward. |
3163 | * |
3164 | * If, for some reason, we seem to have a mismatch between the spec. |
3165 | * page count and the page queues, we reset those variables and |
3166 | * restart the loop (LD TODO: Track this better?). |
3167 | */ |
3168 | if (vm_page_queue_empty(&sq->age_q) && vm_page_speculative_count) { |
3169 | retval = vps_age_speculative_queue(force_speculative_aging); |
3170 | |
3171 | if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { |
3172 | lock_yield_check = FALSE; |
3173 | continue; |
3174 | } |
3175 | } |
3176 | force_speculative_aging = FALSE; |
3177 | |
3178 | /* |
3179 | * Check to see if we need to evict objects from the cache. |
3180 | * |
3181 | * Note: 'object' here doesn't have anything to do with |
3182 | * the eviction part. We just need to make sure we have dropped |
3183 | * any object lock we might be holding if we need to go down |
3184 | * into the eviction logic. |
3185 | */ |
retval = vps_object_cache_evict(&object);
3187 | |
3188 | if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { |
3189 | lock_yield_check = FALSE; |
3190 | continue; |
3191 | } |
3192 | |
3193 | |
3194 | /* |
3195 | * Calculate our filecache_min that will affect the loop |
3196 | * going forward. |
3197 | */ |
3198 | vps_calculate_filecache_min(); |
3199 | |
3200 | /* |
3201 | * LD TODO: Use a structure to hold all state variables for a single |
3202 | * vm_pageout_scan iteration and pass that structure to this function instead. |
3203 | */ |
retval = vps_flow_control(&flow_control, &anons_grabbed, &object,
    &delayed_unlock, &local_freeq, &local_freed,
    &vm_pageout_deadlock_target, inactive_burst_count);
3207 | |
3208 | if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { |
3209 | if (loop_count >= vm_page_inactive_count) { |
3210 | loop_count = 0; |
3211 | } |
3212 | |
3213 | inactive_burst_count = 0; |
3214 | |
3215 | assert(object == NULL); |
3216 | assert(delayed_unlock != 0); |
3217 | |
3218 | lock_yield_check = FALSE; |
3219 | continue; |
3220 | } else if (retval == VM_PAGEOUT_SCAN_DONE_RETURN) { |
3221 | goto return_from_scan; |
3222 | } |
3223 | |
3224 | flow_control.state = FCS_IDLE; |
3225 | |
3226 | vm_pageout_inactive_external_forced_reactivate_limit = MIN((vm_page_active_count + vm_page_inactive_count), |
3227 | vm_pageout_inactive_external_forced_reactivate_limit); |
3228 | loop_count++; |
3229 | inactive_burst_count++; |
3230 | vm_pageout_state.vm_pageout_inactive++; |
3231 | |
3232 | /* |
3233 | * Choose a victim. |
3234 | */ |
3235 | |
3236 | m = NULL; |
retval = vps_choose_victim_page(&m, &anons_grabbed, &grab_anonymous, force_anonymous, &page_from_bg_q, &reactivated_this_call);
3238 | |
3239 | if (m == NULL) { |
3240 | if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { |
3241 | inactive_burst_count = 0; |
3242 | |
3243 | if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { |
3244 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); |
3245 | } |
3246 | |
3247 | lock_yield_check = TRUE; |
3248 | continue; |
3249 | } |
3250 | |
3251 | /* |
3252 | * if we've gotten here, we have no victim page. |
3253 | * check to see if we've not finished balancing the queues |
3254 | * or we have a page on the aged speculative queue that we |
* skipped due to force_anonymous == TRUE... or we have
* speculative pages that we can prematurely age... if we're in
* one of these cases we'll keep going, else panic
3258 | */ |
3259 | force_anonymous = FALSE; |
3260 | VM_PAGEOUT_DEBUG(vm_pageout_no_victim, 1); |
3261 | |
3262 | if (!vm_page_queue_empty(&sq->age_q)) { |
3263 | lock_yield_check = TRUE; |
3264 | continue; |
3265 | } |
3266 | |
3267 | if (vm_page_speculative_count) { |
3268 | force_speculative_aging = TRUE; |
3269 | lock_yield_check = TRUE; |
3270 | continue; |
3271 | } |
panic("vm_pageout: no victim");
3273 | |
3274 | /* NOTREACHED */ |
3275 | } |
3276 | |
3277 | assert(VM_PAGE_PAGEABLE(m)); |
3278 | m_object = VM_PAGE_OBJECT(m); |
3279 | force_anonymous = FALSE; |
3280 | |
3281 | page_prev_q_state = m->vmp_q_state; |
3282 | /* |
3283 | * we just found this page on one of our queues... |
3284 | * it can't also be on the pageout queue, so safe |
3285 | * to call vm_page_queues_remove |
3286 | */ |
3287 | bool donate = (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE); |
vm_page_queues_remove(m, TRUE);
3289 | if (donate) { |
3290 | /* |
3291 | * The compressor needs to see this bit to know |
3292 | * where this page needs to land. Also if stolen, |
3293 | * this bit helps put the page back in the right |
3294 | * special queue where it belongs. |
3295 | */ |
3296 | m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE; |
3297 | } |
3298 | |
3299 | assert(!m->vmp_laundry); |
3300 | assert(!m->vmp_private); |
3301 | assert(!m->vmp_fictitious); |
3302 | assert(!is_kernel_object(m_object)); |
3303 | assert(VM_PAGE_GET_PHYS_PAGE(m) != vm_page_guard_addr); |
3304 | |
3305 | vm_pageout_vminfo.vm_pageout_considered_page++; |
3306 | |
3307 | DTRACE_VM2(scan, int, 1, (uint64_t *), NULL); |
3308 | |
3309 | /* |
3310 | * check to see if we currently are working |
3311 | * with the same object... if so, we've |
3312 | * already got the lock |
3313 | */ |
3314 | if (m_object != object) { |
3315 | boolean_t avoid_anon_pages = (grab_anonymous == FALSE || anons_grabbed >= ANONS_GRABBED_LIMIT); |
3316 | |
3317 | /* |
3318 | * vps_switch_object() will always drop the 'object' lock first |
3319 | * and then try to acquire the 'm_object' lock. So 'object' has to point to |
3320 | * either 'm_object' or NULL. |
3321 | */ |
retval = vps_switch_object(m, m_object, &object, page_prev_q_state, avoid_anon_pages, page_from_bg_q);
3323 | |
3324 | if (retval == VM_PAGEOUT_SCAN_NEXT_ITERATION) { |
3325 | lock_yield_check = TRUE; |
3326 | continue; |
3327 | } |
3328 | } |
3329 | assert(m_object == object); |
3330 | assert(VM_PAGE_OBJECT(m) == m_object); |
3331 | |
3332 | if (m->vmp_busy) { |
3333 | /* |
3334 | * Somebody is already playing with this page. |
3335 | * Put it back on the appropriate queue |
3336 | * |
3337 | */ |
3338 | VM_PAGEOUT_DEBUG(vm_pageout_inactive_busy, 1); |
3339 | |
3340 | if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { |
3341 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_busy, 1); |
3342 | } |
3343 | |
3344 | vps_requeue_page(m, page_prev_q_state, page_from_bg_q); |
3345 | |
3346 | lock_yield_check = TRUE; |
3347 | continue; |
3348 | } |
3349 | |
3350 | /* |
3351 | * if (m->vmp_cleaning && !m->vmp_free_when_done) |
3352 | * If already cleaning this page in place |
* just leave it off the paging queues.
3354 | * We can leave the page mapped, and upl_commit_range |
3355 | * will put it on the clean queue. |
3356 | * |
3357 | * if (m->vmp_free_when_done && !m->vmp_cleaning) |
3358 | * an msync INVALIDATE is in progress... |
3359 | * this page has been marked for destruction |
3360 | * after it has been cleaned, |
3361 | * but not yet gathered into a UPL |
3362 | * where 'cleaning' will be set... |
3363 | * just leave it off the paging queues |
3364 | * |
* if (m->vmp_free_when_done && m->vmp_cleaning)
3366 | * an msync INVALIDATE is in progress |
3367 | * and the UPL has already gathered this page... |
3368 | * just leave it off the paging queues |
3369 | */ |
3370 | if (m->vmp_free_when_done || m->vmp_cleaning) { |
3371 | lock_yield_check = TRUE; |
3372 | continue; |
3373 | } |
3374 | |
3375 | |
3376 | /* |
3377 | * If it's absent, in error or the object is no longer alive, |
3378 | * we can reclaim the page... in the no longer alive case, |
3379 | * there are 2 states the page can be in that preclude us |
3380 | * from reclaiming it - busy or cleaning - that we've already |
3381 | * dealt with |
3382 | */ |
3383 | if (m->vmp_absent || VMP_ERROR_GET(m) || !object->alive || |
3384 | (!object->internal && object->pager == MEMORY_OBJECT_NULL)) { |
3385 | if (m->vmp_absent) { |
3386 | VM_PAGEOUT_DEBUG(vm_pageout_inactive_absent, 1); |
3387 | } else if (!object->alive || |
3388 | (!object->internal && |
3389 | object->pager == MEMORY_OBJECT_NULL)) { |
3390 | VM_PAGEOUT_DEBUG(vm_pageout_inactive_notalive, 1); |
3391 | } else { |
3392 | VM_PAGEOUT_DEBUG(vm_pageout_inactive_error, 1); |
3393 | } |
3394 | reclaim_page: |
3395 | if (vm_pageout_deadlock_target) { |
3396 | VM_PAGEOUT_DEBUG(vm_pageout_scan_inactive_throttle_success, 1); |
3397 | vm_pageout_deadlock_target--; |
3398 | } |
3399 | |
3400 | DTRACE_VM2(dfree, int, 1, (uint64_t *), NULL); |
3401 | |
3402 | if (object->internal) { |
3403 | DTRACE_VM2(anonfree, int, 1, (uint64_t *), NULL); |
3404 | } else { |
3405 | DTRACE_VM2(fsfree, int, 1, (uint64_t *), NULL); |
3406 | } |
3407 | assert(!m->vmp_cleaning); |
3408 | assert(!m->vmp_laundry); |
3409 | |
3410 | if (!object->internal && |
3411 | object->pager != NULL && |
3412 | object->pager->mo_pager_ops == &shared_region_pager_ops) { |
3413 | shared_region_pager_reclaimed++; |
3414 | } |
3415 | |
3416 | m->vmp_busy = TRUE; |
3417 | |
3418 | /* |
3419 | * remove page from object here since we're already |
3420 | * behind the object lock... defer the rest of the work |
3421 | * we'd normally do in vm_page_free_prepare_object |
3422 | * until 'vm_page_free_list' is called |
3423 | */ |
3424 | if (m->vmp_tabled) { |
vm_page_remove(m, TRUE);
3426 | } |
3427 | |
3428 | assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0); |
3429 | m->vmp_snext = local_freeq; |
3430 | local_freeq = m; |
3431 | local_freed++; |
3432 | |
3433 | if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) { |
3434 | vm_pageout_vminfo.vm_pageout_freed_speculative++; |
3435 | } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { |
3436 | vm_pageout_vminfo.vm_pageout_freed_cleaned++; |
3437 | } else if (page_prev_q_state == VM_PAGE_ON_INACTIVE_INTERNAL_Q) { |
3438 | vm_pageout_vminfo.vm_pageout_freed_internal++; |
3439 | } else { |
3440 | vm_pageout_vminfo.vm_pageout_freed_external++; |
3441 | } |
3442 | |
3443 | inactive_burst_count = 0; |
3444 | |
3445 | lock_yield_check = TRUE; |
3446 | continue; |
3447 | } |
3448 | if (object->vo_copy == VM_OBJECT_NULL) { |
3449 | /* |
3450 | * No one else can have any interest in this page. |
3451 | * If this is an empty purgable object, the page can be |
3452 | * reclaimed even if dirty. |
3453 | * If the page belongs to a volatile purgable object, we |
3454 | * reactivate it if the compressor isn't active. |
3455 | */ |
3456 | if (object->purgable == VM_PURGABLE_EMPTY) { |
3457 | if (m->vmp_pmapped == TRUE) { |
3458 | /* unmap the page */ |
refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
3460 | if (refmod_state & VM_MEM_MODIFIED) { |
3461 | SET_PAGE_DIRTY(m, FALSE); |
3462 | } |
3463 | } |
3464 | if (m->vmp_dirty || m->vmp_precious) { |
3465 | /* we saved the cost of cleaning this page ! */ |
3466 | vm_page_purged_count++; |
3467 | } |
3468 | goto reclaim_page; |
3469 | } |
3470 | |
3471 | if (VM_CONFIG_COMPRESSOR_IS_ACTIVE) { |
3472 | /* |
3473 | * With the VM compressor, the cost of |
3474 | * reclaiming a page is much lower (no I/O), |
3475 | * so if we find a "volatile" page, it's better |
3476 | * to let it get compressed rather than letting |
3477 | * it occupy a full page until it gets purged. |
3478 | * So no need to check for "volatile" here. |
3479 | */ |
3480 | } else if (object->purgable == VM_PURGABLE_VOLATILE) { |
3481 | /* |
3482 | * Avoid cleaning a "volatile" page which might |
3483 | * be purged soon. |
3484 | */ |
3485 | |
3486 | /* if it's wired, we can't put it on our queue */ |
3487 | assert(!VM_PAGE_WIRED(m)); |
3488 | |
3489 | /* just stick it back on! */ |
3490 | reactivated_this_call++; |
3491 | |
3492 | if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { |
3493 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_volatile_reactivated, 1); |
3494 | } |
3495 | |
3496 | goto reactivate_page; |
3497 | } |
3498 | } |
3499 | /* |
3500 | * If it's being used, reactivate. |
3501 | * (Fictitious pages are either busy or absent.) |
3502 | * First, update the reference and dirty bits |
3503 | * to make sure the page is unreferenced. |
3504 | */ |
3505 | refmod_state = -1; |
3506 | |
3507 | if (m->vmp_reference == FALSE && m->vmp_pmapped == TRUE) { |
refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3509 | |
3510 | if (refmod_state & VM_MEM_REFERENCED) { |
3511 | m->vmp_reference = TRUE; |
3512 | } |
3513 | if (refmod_state & VM_MEM_MODIFIED) { |
3514 | SET_PAGE_DIRTY(m, FALSE); |
3515 | } |
3516 | } |
3517 | |
3518 | if (m->vmp_reference || m->vmp_dirty) { |
3519 | /* deal with a rogue "reusable" page */ |
3520 | VM_PAGEOUT_SCAN_HANDLE_REUSABLE_PAGE(m, m_object); |
3521 | } |
3522 | |
3523 | if (vm_pageout_state.vm_page_xpmapped_min_divisor == 0) { |
3524 | vm_pageout_state.vm_page_xpmapped_min = 0; |
3525 | } else { |
3526 | vm_pageout_state.vm_page_xpmapped_min = (vm_page_external_count * 10) / vm_pageout_state.vm_page_xpmapped_min_divisor; |
3527 | } |
3528 | |
3529 | if (!m->vmp_no_cache && |
3530 | page_from_bg_q == FALSE && |
3531 | (m->vmp_reference || (m->vmp_xpmapped && !object->internal && |
3532 | (vm_page_xpmapped_external_count < vm_pageout_state.vm_page_xpmapped_min)))) { |
3533 | /* |
3534 | * The page we pulled off the inactive list has |
3535 | * been referenced. It is possible for other |
3536 | * processors to be touching pages faster than we |
3537 | * can clear the referenced bit and traverse the |
3538 | * inactive queue, so we limit the number of |
3539 | * reactivations. |
3540 | */ |
3541 | if (++reactivated_this_call >= reactivate_limit && |
3542 | !object->object_is_shared_cache && |
3543 | !((m->vmp_realtime || |
3544 | object->for_realtime) && |
3545 | vm_pageout_protect_realtime)) { |
3546 | vm_pageout_vminfo.vm_pageout_reactivation_limit_exceeded++; |
3547 | } else if (++inactive_reclaim_run >= VM_PAGEOUT_INACTIVE_FORCE_RECLAIM) { |
3548 | vm_pageout_vminfo.vm_pageout_inactive_force_reclaim++; |
3549 | if (object->object_is_shared_cache) { |
3550 | vm_pageout_vminfo.vm_pageout_forcereclaimed_sharedcache++; |
3551 | } else if (m->vmp_realtime || |
3552 | object->for_realtime) { |
3553 | vm_pageout_vminfo.vm_pageout_forcereclaimed_realtime++; |
3554 | } |
3555 | } else { |
3556 | uint32_t isinuse; |
3557 | |
3558 | if (reactivated_this_call >= reactivate_limit) { |
3559 | if (object->object_is_shared_cache) { |
3560 | vm_pageout_vminfo.vm_pageout_protected_sharedcache++; |
3561 | } else if ((m->vmp_realtime || |
3562 | object->for_realtime) && |
3563 | vm_pageout_protect_realtime) { |
3564 | vm_pageout_vminfo.vm_pageout_protected_realtime++; |
3565 | } |
3566 | } |
3567 | if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { |
3568 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reference_reactivated, 1); |
3569 | } |
3570 | |
3571 | vm_pageout_vminfo.vm_pageout_inactive_referenced++; |
3572 | reactivate_page: |
3573 | if (!object->internal && object->pager != MEMORY_OBJECT_NULL && |
3574 | vnode_pager_get_isinuse(object->pager, &isinuse) == KERN_SUCCESS && !isinuse) { |
3575 | /* |
* no explicit mappings of this object exist
3577 | * and it's not open via the filesystem |
3578 | */ |
vm_page_deactivate(m);
3580 | VM_PAGEOUT_DEBUG(vm_pageout_inactive_deactivated, 1); |
3581 | } else { |
3582 | /* |
3583 | * The page was/is being used, so put back on active list. |
3584 | */ |
vm_page_activate(m);
3586 | counter_inc(&vm_statistics_reactivations); |
3587 | inactive_burst_count = 0; |
3588 | } |
3589 | #if DEVELOPMENT || DEBUG |
3590 | if (page_from_bg_q == TRUE) { |
3591 | if (m_object->internal) { |
3592 | vm_pageout_rejected_bq_internal++; |
3593 | } else { |
3594 | vm_pageout_rejected_bq_external++; |
3595 | } |
3596 | } |
3597 | #endif /* DEVELOPMENT || DEBUG */ |
3598 | |
3599 | if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { |
3600 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); |
3601 | } |
3602 | vm_pageout_state.vm_pageout_inactive_used++; |
3603 | |
3604 | lock_yield_check = TRUE; |
3605 | continue; |
3606 | } |
3607 | /* |
3608 | * Make sure we call pmap_get_refmod() if it |
3609 | * wasn't already called just above, to update |
3610 | * the dirty bit. |
3611 | */ |
3612 | if ((refmod_state == -1) && !m->vmp_dirty && m->vmp_pmapped) { |
refmod_state = pmap_get_refmod(VM_PAGE_GET_PHYS_PAGE(m));
3614 | if (refmod_state & VM_MEM_MODIFIED) { |
3615 | SET_PAGE_DIRTY(m, FALSE); |
3616 | } |
3617 | } |
3618 | } |
3619 | |
3620 | /* |
3621 | * we've got a candidate page to steal... |
3622 | * |
3623 | * m->vmp_dirty is up to date courtesy of the |
3624 | * preceding check for m->vmp_reference... if |
3625 | * we get here, then m->vmp_reference had to be |
3626 | * FALSE (or possibly "reactivate_limit" was |
3627 | * exceeded), but in either case we called |
3628 | * pmap_get_refmod() and updated both |
3629 | * m->vmp_reference and m->vmp_dirty |
3630 | * |
3631 | * if it's dirty or precious we need to |
* see if the target queue is throttled...
* if it is, we need to skip over it by moving it back
3634 | * to the end of the inactive queue |
3635 | */ |
3636 | |
3637 | inactive_throttled = FALSE; |
3638 | |
3639 | if (m->vmp_dirty || m->vmp_precious) { |
3640 | if (object->internal) { |
3641 | if (VM_PAGE_Q_THROTTLED(iq)) { |
3642 | inactive_throttled = TRUE; |
3643 | } |
3644 | } else if (VM_PAGE_Q_THROTTLED(eq)) { |
3645 | inactive_throttled = TRUE; |
3646 | } |
3647 | } |
3648 | throttle_inactive: |
3649 | if (!VM_DYNAMIC_PAGING_ENABLED() && |
3650 | object->internal && m->vmp_dirty && |
3651 | (object->purgable == VM_PURGABLE_DENY || |
3652 | object->purgable == VM_PURGABLE_NONVOLATILE || |
3653 | object->purgable == VM_PURGABLE_VOLATILE)) { |
vm_page_check_pageable_safe(m);
3655 | assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); |
3656 | vm_page_queue_enter(&vm_page_queue_throttled, m, vmp_pageq); |
3657 | m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; |
3658 | vm_page_throttled_count++; |
3659 | |
3660 | VM_PAGEOUT_DEBUG(vm_pageout_scan_reclaimed_throttled, 1); |
3661 | |
3662 | inactive_burst_count = 0; |
3663 | |
3664 | lock_yield_check = TRUE; |
3665 | continue; |
3666 | } |
3667 | if (inactive_throttled == TRUE) { |
vps_deal_with_throttled_queues(m, &object, &vm_pageout_inactive_external_forced_reactivate_limit,
    &force_anonymous, page_from_bg_q);
3670 | |
3671 | inactive_burst_count = 0; |
3672 | |
3673 | if (page_prev_q_state == VM_PAGE_ON_INACTIVE_CLEANED_Q) { |
3674 | VM_PAGEOUT_DEBUG(vm_pageout_cleaned_reactivated, 1); |
3675 | } |
3676 | |
3677 | lock_yield_check = TRUE; |
3678 | continue; |
3679 | } |
3680 | |
3681 | /* |
3682 | * we've got a page that we can steal... |
3683 | * eliminate all mappings and make sure |
3684 | * we have the up-to-date modified state |
3685 | * |
3686 | * if we need to do a pmap_disconnect then we |
3687 | * need to re-evaluate m->vmp_dirty since the pmap_disconnect |
3688 | * provides the true state atomically... the |
3689 | * page was still mapped up to the pmap_disconnect |
3690 | * and may have been dirtied at the last microsecond |
3691 | * |
3692 | * Note that if 'pmapped' is FALSE then the page is not |
3693 | * and has not been in any map, so there is no point calling |
3694 | * pmap_disconnect(). m->vmp_dirty could have been set in anticipation |
3695 | * of likely usage of the page. |
3696 | */ |
3697 | if (m->vmp_pmapped == TRUE) { |
3698 | int pmap_options; |
3699 | |
3700 | /* |
3701 | * Don't count this page as going into the compressor |
3702 | * if any of these are true: |
3703 | * 1) compressed pager isn't enabled |
3704 | * 2) Freezer enabled device with compressed pager |
3705 | * backend (exclusive use) i.e. most of the VM system |
3706 | * (including vm_pageout_scan) has no knowledge of |
3707 | * the compressor |
3708 | * 3) This page belongs to a file and hence will not be |
3709 | * sent into the compressor |
3710 | */ |
3711 | if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE || |
3712 | object->internal == FALSE) { |
3713 | pmap_options = 0; |
3714 | } else if (m->vmp_dirty || m->vmp_precious) { |
3715 | /* |
3716 | * VM knows that this page is dirty (or |
3717 | * precious) and needs to be compressed |
3718 | * rather than freed. |
3719 | * Tell the pmap layer to count this page |
3720 | * as "compressed". |
3721 | */ |
3722 | pmap_options = PMAP_OPTIONS_COMPRESSOR; |
3723 | } else { |
3724 | /* |
3725 | * VM does not know if the page needs to |
3726 | * be preserved but the pmap layer might tell |
3727 | * us if any mapping has "modified" it. |
* Let the pmap layer count this page
3729 | * as compressed if and only if it has been |
3730 | * modified. |
3731 | */ |
3732 | pmap_options = |
3733 | PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED; |
3734 | } |
refmod_state = pmap_disconnect_options(VM_PAGE_GET_PHYS_PAGE(m),
    pmap_options,
    NULL);
3738 | if (refmod_state & VM_MEM_MODIFIED) { |
3739 | SET_PAGE_DIRTY(m, FALSE); |
3740 | } |
3741 | } |
3742 | |
3743 | /* |
3744 | * reset our count of pages that have been reclaimed |
3745 | * since the last page was 'stolen' |
3746 | */ |
3747 | inactive_reclaim_run = 0; |
3748 | |
3749 | /* |
3750 | * If it's clean and not precious, we can free the page. |
3751 | */ |
3752 | if (!m->vmp_dirty && !m->vmp_precious) { |
3753 | vm_pageout_state.vm_pageout_inactive_clean++; |
3754 | |
3755 | /* |
3756 | * OK, at this point we have found a page we are going to free. |
3757 | */ |
3758 | #if CONFIG_PHANTOM_CACHE |
3759 | if (!object->internal) { |
3760 | vm_phantom_cache_add_ghost(m); |
3761 | } |
3762 | #endif |
3763 | goto reclaim_page; |
3764 | } |
3765 | |
3766 | /* |
3767 | * The page may have been dirtied since the last check |
3768 | * for a throttled target queue (which may have been skipped |
3769 | * if the page was clean then). With the dirty page |
3770 | * disconnected here, we can make one final check. |
3771 | */ |
3772 | if (object->internal) { |
3773 | if (VM_PAGE_Q_THROTTLED(iq)) { |
3774 | inactive_throttled = TRUE; |
3775 | } |
3776 | } else if (VM_PAGE_Q_THROTTLED(eq)) { |
3777 | inactive_throttled = TRUE; |
3778 | } |
3779 | |
3780 | if (inactive_throttled == TRUE) { |
3781 | goto throttle_inactive; |
3782 | } |
3783 | |
3784 | #if VM_PRESSURE_EVENTS |
3785 | #if CONFIG_JETSAM |
3786 | |
3787 | /* |
3788 | * If Jetsam is enabled, then the sending |
3789 | * of memory pressure notifications is handled |
3790 | * from the same thread that takes care of high-water |
3791 | * and other jetsams i.e. the memorystatus_thread. |
3792 | */ |
3793 | |
3794 | #else /* CONFIG_JETSAM */ |
3795 | |
3796 | vm_pressure_response(); |
3797 | |
3798 | #endif /* CONFIG_JETSAM */ |
3799 | #endif /* VM_PRESSURE_EVENTS */ |
3800 | |
3801 | if (page_prev_q_state == VM_PAGE_ON_SPECULATIVE_Q) { |
3802 | VM_PAGEOUT_DEBUG(vm_pageout_speculative_dirty, 1); |
3803 | } |
3804 | |
3805 | if (object->internal) { |
3806 | vm_pageout_vminfo.vm_pageout_inactive_dirty_internal++; |
3807 | } else { |
3808 | vm_pageout_vminfo.vm_pageout_inactive_dirty_external++; |
3809 | } |
3810 | |
3811 | /* |
3812 | * internal pages will go to the compressor... |
3813 | * external pages will go to the appropriate pager to be cleaned |
3814 | * and upon completion will end up on 'vm_page_queue_cleaned' which |
3815 | * is a preferred queue to steal from |
3816 | */ |
3817 | vm_pageout_cluster(m); |
3818 | inactive_burst_count = 0; |
3819 | |
3820 | /* |
3821 | * back to top of pageout scan loop |
3822 | */ |
3823 | } |
3824 | } |
3825 | |
3826 | |
3827 | void |
3828 | vm_page_free_reserve( |
3829 | int pages) |
3830 | { |
3831 | int free_after_reserve; |
3832 | |
3833 | if (VM_CONFIG_COMPRESSOR_IS_PRESENT) { |
3834 | if ((vm_page_free_reserved + pages + COMPRESSOR_FREE_RESERVED_LIMIT) >= (VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT)) { |
3835 | vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT + COMPRESSOR_FREE_RESERVED_LIMIT; |
3836 | } else { |
3837 | vm_page_free_reserved += (pages + COMPRESSOR_FREE_RESERVED_LIMIT); |
3838 | } |
3839 | } else { |
3840 | if ((vm_page_free_reserved + pages) >= VM_PAGE_FREE_RESERVED_LIMIT) { |
3841 | vm_page_free_reserved = VM_PAGE_FREE_RESERVED_LIMIT; |
3842 | } else { |
3843 | vm_page_free_reserved += pages; |
3844 | } |
3845 | } |
3846 | free_after_reserve = vm_pageout_state.vm_page_free_count_init - vm_page_free_reserved; |
3847 | |
3848 | vm_page_free_min = vm_page_free_reserved + |
3849 | VM_PAGE_FREE_MIN(free_after_reserve); |
3850 | |
3851 | if (vm_page_free_min > VM_PAGE_FREE_MIN_LIMIT) { |
3852 | vm_page_free_min = VM_PAGE_FREE_MIN_LIMIT; |
3853 | } |
3854 | |
3855 | vm_page_free_target = vm_page_free_reserved + |
3856 | VM_PAGE_FREE_TARGET(free_after_reserve); |
3857 | |
3858 | if (vm_page_free_target > VM_PAGE_FREE_TARGET_LIMIT) { |
3859 | vm_page_free_target = VM_PAGE_FREE_TARGET_LIMIT; |
3860 | } |
3861 | |
3862 | if (vm_page_free_target < vm_page_free_min + 5) { |
3863 | vm_page_free_target = vm_page_free_min + 5; |
3864 | } |
3865 | |
3866 | vm_page_throttle_limit = vm_page_free_target - (vm_page_free_target / 2); |
3867 | } |
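/*
 * Worked example for the computation above.  The macro ratios and limits
 * are configuration dependent; the 10% / 15% figures and the page counts
 * below are assumptions for illustration only.
 *
 *	vm_page_free_reserved  = 100
 *	free_after_reserve     = 10000
 *	vm_page_free_min       = 100 + (10% of 10000) = 1100
 *	vm_page_free_target    = 100 + (15% of 10000) = 1600
 *	vm_page_throttle_limit = 1600 - (1600 / 2)    = 800
 *
 * The only hard relationship enforced here is
 * vm_page_free_target >= vm_page_free_min + 5 (after clamping each value
 * to its respective limit).
 */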
3868 | |
3869 | /* |
3870 | * vm_pageout is the high level pageout daemon. |
3871 | */ |
3872 | |
3873 | void |
3874 | vm_pageout_continue(void) |
3875 | { |
3876 | DTRACE_VM2(pgrrun, int, 1, (uint64_t *), NULL); |
3877 | VM_PAGEOUT_DEBUG(vm_pageout_scan_event_counter, 1); |
3878 | |
3879 | vm_free_page_lock(); |
3880 | vm_pageout_running = TRUE; |
3881 | vm_free_page_unlock(); |
3882 | |
3883 | vm_pageout_scan(); |
3884 | /* |
3885 | * we hold both the vm_page_queue_free_lock |
3886 | * and the vm_page_queues_lock at this point |
3887 | */ |
3888 | assert(vm_page_free_wanted == 0); |
3889 | assert(vm_page_free_wanted_privileged == 0); |
assert_wait((event_t) &vm_page_free_wanted, THREAD_UNINT);
3891 | |
3892 | vm_pageout_running = FALSE; |
3893 | #if XNU_TARGET_OS_OSX |
3894 | if (vm_pageout_waiter) { |
3895 | vm_pageout_waiter = FALSE; |
3896 | thread_wakeup((event_t)&vm_pageout_waiter); |
3897 | } |
3898 | #endif /* XNU_TARGET_OS_OSX */ |
3899 | |
3900 | vm_free_page_unlock(); |
3901 | vm_page_unlock_queues(); |
3902 | |
thread_block((thread_continue_t)vm_pageout_continue);
3904 | /*NOTREACHED*/ |
3905 | } |
3906 | |
3907 | #if XNU_TARGET_OS_OSX |
3908 | kern_return_t |
3909 | vm_pageout_wait(uint64_t deadline) |
3910 | { |
3911 | kern_return_t kr; |
3912 | |
3913 | vm_free_page_lock(); |
3914 | for (kr = KERN_SUCCESS; vm_pageout_running && (KERN_SUCCESS == kr);) { |
3915 | vm_pageout_waiter = TRUE; |
if (THREAD_AWAKENED != lck_mtx_sleep_deadline(
        &vm_page_queue_free_lock, LCK_SLEEP_DEFAULT,
        (event_t) &vm_pageout_waiter, THREAD_UNINT, deadline)) {
3919 | kr = KERN_OPERATION_TIMED_OUT; |
3920 | } |
3921 | } |
3922 | vm_free_page_unlock(); |
3923 | |
3924 | return kr; |
3925 | } |
3926 | #endif /* XNU_TARGET_OS_OSX */ |
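/*
 * Hedged usage sketch for vm_pageout_wait(): a caller that wants to block
 * until the current pageout pass completes, with a timeout.  The 10ms
 * deadline and the caller itself are hypothetical; only vm_pageout_wait()
 * is defined above.
 *
 *	uint64_t deadline;
 *
 *	clock_interval_to_deadline(10, NSEC_PER_MSEC, &deadline);
 *	if (vm_pageout_wait(deadline) == KERN_OPERATION_TIMED_OUT) {
 *		// pageout was still running when the deadline expired
 *	}
 */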
3927 | |
3928 | OS_NORETURN |
3929 | static void |
3930 | vm_pageout_iothread_external_continue(struct pgo_iothread_state *ethr, __unused wait_result_t w) |
3931 | { |
3932 | vm_page_t m = NULL; |
3933 | vm_object_t object; |
3934 | vm_object_offset_t offset; |
memory_object_t pager;
3936 | struct vm_pageout_queue *q = ethr->q; |
3937 | |
3938 | /* On systems with a compressor, the external IO thread clears its |
3939 | * VM privileged bit to accommodate large allocations (e.g. bulk UPL |
3940 | * creation) |
3941 | */ |
3942 | if (VM_CONFIG_COMPRESSOR_IS_PRESENT) { |
3943 | current_thread()->options &= ~TH_OPT_VMPRIV; |
3944 | } |
3945 | |
sched_cond_ack(&(ethr->pgo_wakeup));
3947 | |
3948 | while (true) { |
3949 | vm_page_lockspin_queues(); |
3950 | |
3951 | while (!vm_page_queue_empty(&q->pgo_pending)) { |
3952 | q->pgo_busy = TRUE; |
3953 | vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq); |
3954 | |
3955 | assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q); |
3956 | VM_PAGE_CHECK(m); |
3957 | /* |
3958 | * grab a snapshot of the object and offset this |
3959 | * page is tabled in so that we can relookup this |
3960 | * page after we've taken the object lock - these |
3961 | * fields are stable while we hold the page queues lock |
3962 | * but as soon as we drop it, there is nothing to keep |
3963 | * this page in this object... we hold an activity_in_progress |
3964 | * on this object which will keep it from terminating |
3965 | */ |
3966 | object = VM_PAGE_OBJECT(m); |
3967 | offset = m->vmp_offset; |
3968 | |
3969 | m->vmp_q_state = VM_PAGE_NOT_ON_Q; |
3970 | VM_PAGE_ZERO_PAGEQ_ENTRY(m); |
3971 | |
3972 | vm_page_unlock_queues(); |
3973 | |
3974 | vm_object_lock(object); |
3975 | |
3976 | m = vm_page_lookup(object, offset); |
3977 | |
3978 | if (m == NULL || m->vmp_busy || m->vmp_cleaning || |
3979 | !m->vmp_laundry || (m->vmp_q_state != VM_PAGE_NOT_ON_Q)) { |
3980 | /* |
3981 | * it's either the same page that someone else has |
3982 | * started cleaning (or it's finished cleaning or |
3983 | * been put back on the pageout queue), or |
3984 | * the page has been freed or we have found a |
3985 | * new page at this offset... in all of these cases |
3986 | * we merely need to release the activity_in_progress |
3987 | * we took when we put the page on the pageout queue |
3988 | */ |
3989 | vm_object_activity_end(object); |
3990 | vm_object_unlock(object); |
3991 | |
3992 | vm_page_lockspin_queues(); |
3993 | continue; |
3994 | } |
3995 | pager = object->pager; |
3996 | |
3997 | if (pager == MEMORY_OBJECT_NULL) { |
3998 | /* |
3999 | * This pager has been destroyed by either |
4000 | * memory_object_destroy or vm_object_destroy, and |
4001 | * so there is nowhere for the page to go. |
4002 | */ |
4003 | if (m->vmp_free_when_done) { |
4004 | /* |
4005 | * Just free the page... VM_PAGE_FREE takes |
4006 | * care of cleaning up all the state... |
4007 | * including doing the vm_pageout_throttle_up |
4008 | */ |
4009 | VM_PAGE_FREE(m); |
4010 | } else { |
4011 | vm_page_lockspin_queues(); |
4012 | |
4013 | vm_pageout_throttle_up(m); |
vm_page_activate(m);
4015 | |
4016 | vm_page_unlock_queues(); |
4017 | |
4018 | /* |
4019 | * And we are done with it. |
4020 | */ |
4021 | } |
4022 | vm_object_activity_end(object); |
4023 | vm_object_unlock(object); |
4024 | |
4025 | vm_page_lockspin_queues(); |
4026 | continue; |
4027 | } |
4028 | #if 0 |
4029 | /* |
4030 | * we don't hold the page queue lock |
4031 | * so this check isn't safe to make |
4032 | */ |
4033 | VM_PAGE_CHECK(m); |
4034 | #endif |
4035 | /* |
4036 | * give back the activity_in_progress reference we |
4037 | * took when we queued up this page and replace it |
* with a paging_in_progress reference that will
4039 | * also hold the paging offset from changing and |
4040 | * prevent the object from terminating |
4041 | */ |
4042 | vm_object_activity_end(object); |
4043 | vm_object_paging_begin(object); |
4044 | vm_object_unlock(object); |
4045 | |
4046 | /* |
4047 | * Send the data to the pager. |
4048 | * any pageout clustering happens there |
4049 | */ |
memory_object_data_return(pager,
    m->vmp_offset + object->paging_offset,
    PAGE_SIZE,
    NULL,
    NULL,
    FALSE,
    FALSE,
    0);
4058 | |
4059 | vm_object_lock(object); |
4060 | vm_object_paging_end(object); |
4061 | vm_object_unlock(object); |
4062 | |
4063 | vm_pageout_io_throttle(); |
4064 | |
4065 | vm_page_lockspin_queues(); |
4066 | } |
4067 | q->pgo_busy = FALSE; |
4068 | |
4069 | vm_page_unlock_queues(); |
sched_cond_wait_parameter(&(ethr->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_external_continue, ethr);
4071 | } |
4072 | /*NOTREACHED*/ |
4073 | } |
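/*
 * The loop above relies on a snapshot-and-revalidate pattern that recurs
 * throughout this file: capture (object, offset) while the page queues
 * lock keeps them stable, drop that lock, take the object lock, then look
 * the page up again and bail out if anything changed underneath us.
 * Minimal sketch (error paths elided):
 *
 *	object = VM_PAGE_OBJECT(m);
 *	offset = m->vmp_offset;
 *	vm_page_unlock_queues();
 *	vm_object_lock(object);
 *	m = vm_page_lookup(object, offset);
 *	if (m == NULL || m->vmp_busy || m->vmp_cleaning) {
 *		// someone else claimed the page; release our
 *		// activity_in_progress reference and move on
 *	}
 */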
4074 | |
4075 | |
4076 | #define MAX_FREE_BATCH 32 |
4077 | uint32_t vm_compressor_time_thread; /* Set via sysctl to record time accrued by |
4078 | * this thread. |
4079 | */ |
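/*
 * The compressor thread below batches freshly compressed pages on a local
 * singly linked list (vmp_snext) and hands them to vm_page_free_list() in
 * groups of up to MAX_FREE_BATCH, amortizing the cost of taking the free
 * list lock.  Minimal sketch of the pattern (local_freeq/local_freed mirror
 * the locals used in vm_pageout_iothread_internal_continue):
 *
 *	m->vmp_snext = local_freeq;
 *	local_freeq = m;
 *	if (++local_freed >= MAX_FREE_BATCH) {
 *		vm_page_free_list(local_freeq, TRUE);
 *		local_freeq = NULL;
 *		local_freed = 0;
 *	}
 */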
4080 | |
4081 | |
4082 | OS_NORETURN |
4083 | static void |
4084 | vm_pageout_iothread_internal_continue(struct pgo_iothread_state *cq, __unused wait_result_t w) |
4085 | { |
4086 | struct vm_pageout_queue *q; |
4087 | vm_page_t m = NULL; |
4088 | boolean_t pgo_draining; |
4089 | vm_page_t local_q; |
4090 | int local_cnt; |
4091 | vm_page_t local_freeq = NULL; |
4092 | int local_freed = 0; |
4093 | int local_batch_size; |
4094 | #if DEVELOPMENT || DEBUG |
4095 | int ncomps = 0; |
4096 | boolean_t marked_active = FALSE; |
4097 | int num_pages_processed = 0; |
4098 | #endif |
4099 | void *chead = NULL; |
4100 | |
4101 | KERNEL_DEBUG(0xe040000c | DBG_FUNC_END, 0, 0, 0, 0, 0); |
4102 | |
sched_cond_ack(&(cq->pgo_wakeup));
4104 | |
4105 | q = cq->q; |
4106 | |
4107 | while (true) { |
4108 | #if DEVELOPMENT || DEBUG |
4109 | bool benchmark_accounting = false; |
4110 | /* |
4111 | * If we're running the compressor perf test, only process the benchmark pages. |
4112 | * We'll get back to our regular queue once the benchmark is done |
4113 | */ |
4114 | if (compressor_running_perf_test) { |
4115 | q = cq->benchmark_q; |
4116 | if (!vm_page_queue_empty(&q->pgo_pending)) { |
4117 | benchmark_accounting = true; |
4118 | } else { |
4119 | q = cq->q; |
4120 | benchmark_accounting = false; |
4121 | } |
4122 | } |
4123 | #endif /* DEVELOPMENT || DEBUG */ |
4124 | |
4125 | #if __AMP__ |
4126 | if (vm_compressor_ebound && (vm_pageout_state.vm_compressor_thread_count > 1)) { |
4127 | local_batch_size = (q->pgo_maxlaundry >> 3); |
4128 | local_batch_size = MAX(local_batch_size, 16); |
4129 | } else { |
4130 | local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2); |
4131 | } |
4132 | #else |
4133 | local_batch_size = q->pgo_maxlaundry / (vm_pageout_state.vm_compressor_thread_count * 2); |
4134 | #endif |
4135 | |
4136 | #if RECORD_THE_COMPRESSED_DATA |
4137 | if (q->pgo_laundry) { |
4138 | c_compressed_record_init(); |
4139 | } |
4140 | #endif |
4141 | while (true) { |
4142 | int pages_left_on_q = 0; |
4143 | |
4144 | local_cnt = 0; |
4145 | local_q = NULL; |
4146 | |
4147 | KERNEL_DEBUG(0xe0400014 | DBG_FUNC_START, 0, 0, 0, 0, 0); |
4148 | |
4149 | vm_page_lock_queues(); |
4150 | #if DEVELOPMENT || DEBUG |
4151 | if (marked_active == FALSE) { |
4152 | vmct_active++; |
4153 | vmct_state[cq->id] = VMCT_ACTIVE; |
4154 | marked_active = TRUE; |
4155 | if (vmct_active == 1) { |
4156 | vm_compressor_epoch_start = mach_absolute_time(); |
4157 | } |
4158 | } |
4159 | #endif |
4160 | KERNEL_DEBUG(0xe0400014 | DBG_FUNC_END, 0, 0, 0, 0, 0); |
4161 | |
4162 | KERNEL_DEBUG(0xe0400018 | DBG_FUNC_START, q->pgo_laundry, 0, 0, 0, 0); |
4163 | |
4164 | while (!vm_page_queue_empty(&q->pgo_pending) && local_cnt < local_batch_size) { |
4165 | vm_page_queue_remove_first(&q->pgo_pending, m, vmp_pageq); |
4166 | assert(m->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q); |
4167 | VM_PAGE_CHECK(m); |
4168 | |
4169 | m->vmp_q_state = VM_PAGE_NOT_ON_Q; |
4170 | VM_PAGE_ZERO_PAGEQ_ENTRY(m); |
4171 | m->vmp_laundry = FALSE; |
4172 | |
4173 | m->vmp_snext = local_q; |
4174 | local_q = m; |
4175 | local_cnt++; |
4176 | } |
4177 | if (local_q == NULL) { |
4178 | break; |
4179 | } |
4180 | |
4181 | q->pgo_busy = TRUE; |
4182 | |
4183 | if ((pgo_draining = q->pgo_draining) == FALSE) { |
vm_pageout_throttle_up_batch(q, local_cnt);
4185 | pages_left_on_q = q->pgo_laundry; |
4186 | } else { |
4187 | pages_left_on_q = q->pgo_laundry - local_cnt; |
4188 | } |
4189 | |
4190 | vm_page_unlock_queues(); |
4191 | |
4192 | #if !RECORD_THE_COMPRESSED_DATA |
4193 | if (pages_left_on_q >= local_batch_size && cq->id < (vm_pageout_state.vm_compressor_thread_count - 1)) { |
4194 | // wake up the next compressor thread |
sched_cond_signal(&pgo_iothread_internal_state[cq->id + 1].pgo_wakeup,
    pgo_iothread_internal_state[cq->id + 1].pgo_iothread);
4197 | } |
4198 | #endif |
4199 | KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, q->pgo_laundry, 0, 0, 0, 0); |
4200 | |
4201 | while (local_q) { |
4202 | KERNEL_DEBUG(0xe0400024 | DBG_FUNC_START, local_cnt, 0, 0, 0, 0); |
4203 | |
4204 | m = local_q; |
4205 | local_q = m->vmp_snext; |
4206 | m->vmp_snext = NULL; |
4207 | |
4208 | /* |
4209 | * Technically we need the pageq locks to manipulate this field. |
4210 | * However, this page has been removed from all queues and is only |
4211 | * known to this compressor thread dealing with this local queue. |
4212 | * |
4213 | * TODO LIONEL: Add a second localq that is the early localq and |
4214 | * put special pages like this one on that queue in the block above |
4215 | * under the pageq lock to avoid this 'works but not clean' logic. |
4216 | */ |
4217 | void *donate_queue_head; |
4218 | #if XNU_TARGET_OS_OSX |
4219 | donate_queue_head = &cq->current_early_swapout_chead; |
4220 | #else /* XNU_TARGET_OS_OSX */ |
4221 | donate_queue_head = &cq->current_late_swapout_chead; |
4222 | #endif /* XNU_TARGET_OS_OSX */ |
4223 | if (m->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE) { |
4224 | m->vmp_on_specialq = VM_PAGE_SPECIAL_Q_EMPTY; |
4225 | chead = donate_queue_head; |
4226 | } else { |
4227 | chead = &cq->current_regular_swapout_chead; |
4228 | } |
4229 | |
4230 | if (vm_pageout_compress_page(chead, cq->scratch_buf, m) == KERN_SUCCESS) { |
4231 | #if DEVELOPMENT || DEBUG |
4232 | ncomps++; |
4233 | #endif |
4234 | KERNEL_DEBUG(0xe0400024 | DBG_FUNC_END, local_cnt, 0, 0, 0, 0); |
4235 | |
4236 | m->vmp_snext = local_freeq; |
4237 | local_freeq = m; |
4238 | local_freed++; |
4239 | |
4240 | if (local_freed >= MAX_FREE_BATCH) { |
4241 | OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); |
4242 | |
vm_page_free_list(local_freeq, TRUE);
4244 | |
4245 | local_freeq = NULL; |
4246 | local_freed = 0; |
4247 | } |
4248 | } |
4249 | #if DEVELOPMENT || DEBUG |
4250 | num_pages_processed++; |
4251 | #endif /* DEVELOPMENT || DEBUG */ |
4252 | #if !CONFIG_JETSAM |
4253 | while (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) { |
4254 | kern_return_t wait_result; |
4255 | int need_wakeup = 0; |
4256 | |
4257 | if (local_freeq) { |
4258 | OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); |
4259 | |
vm_page_free_list(local_freeq, TRUE);
4261 | local_freeq = NULL; |
4262 | local_freed = 0; |
4263 | |
4264 | continue; |
4265 | } |
4266 | vm_free_page_lock_spin(); |
4267 | |
4268 | if (vm_page_free_count < COMPRESSOR_FREE_RESERVED_LIMIT) { |
4269 | if (vm_page_free_wanted_privileged++ == 0) { |
4270 | need_wakeup = 1; |
4271 | } |
wait_result = assert_wait((event_t)&vm_page_free_wanted_privileged, THREAD_UNINT);
4273 | |
4274 | vm_free_page_unlock(); |
4275 | |
4276 | if (need_wakeup) { |
4277 | thread_wakeup((event_t)&vm_page_free_wanted); |
4278 | } |
4279 | |
4280 | if (wait_result == THREAD_WAITING) { |
4281 | thread_block(THREAD_CONTINUE_NULL); |
4282 | } |
4283 | } else { |
4284 | vm_free_page_unlock(); |
4285 | } |
4286 | } |
4287 | #endif |
4288 | } |
4289 | if (local_freeq) { |
4290 | OSAddAtomic64(local_freed, &vm_pageout_vminfo.vm_pageout_compressions); |
4291 | |
vm_page_free_list(local_freeq, TRUE);
4293 | local_freeq = NULL; |
4294 | local_freed = 0; |
4295 | } |
4296 | if (pgo_draining == TRUE) { |
4297 | vm_page_lockspin_queues(); |
vm_pageout_throttle_up_batch(q, local_cnt);
4299 | vm_page_unlock_queues(); |
4300 | } |
4301 | } |
4302 | KERNEL_DEBUG(0xe040000c | DBG_FUNC_START, 0, 0, 0, 0, 0); |
4303 | |
4304 | /* |
4305 | * queue lock is held and our q is empty |
4306 | */ |
4307 | q->pgo_busy = FALSE; |
4308 | #if DEVELOPMENT || DEBUG |
4309 | if (marked_active == TRUE) { |
4310 | vmct_active--; |
4311 | vmct_state[cq->id] = VMCT_IDLE; |
4312 | |
4313 | if (vmct_active == 0) { |
4314 | vm_compressor_epoch_stop = mach_absolute_time(); |
4315 | assertf(vm_compressor_epoch_stop >= vm_compressor_epoch_start, |
"Compressor epoch non-monotonic: 0x%llx -> 0x%llx",
4317 | vm_compressor_epoch_start, vm_compressor_epoch_stop); |
4318 | /* This interval includes intervals where one or more |
4319 | * compressor threads were pre-empted |
4320 | */ |
4321 | vmct_stats.vmct_cthreads_total += vm_compressor_epoch_stop - vm_compressor_epoch_start; |
4322 | } |
4323 | } |
4324 | if (compressor_running_perf_test && benchmark_accounting) { |
4325 | /* |
4326 | * We could turn ON compressor_running_perf_test while still processing |
4327 | * regular non-benchmark pages. We shouldn't count them here else we |
4328 | * could overshoot. We might also still be populating that benchmark Q |
4329 | * and be under pressure. So we will go back to the regular queues. And |
4330 | * benchmark accounting will be off for that case too. |
4331 | */ |
4332 | compressor_perf_test_pages_processed += num_pages_processed; |
4333 | thread_wakeup(&compressor_perf_test_pages_processed); |
4334 | } |
4335 | #endif |
4336 | vm_page_unlock_queues(); |
4337 | #if DEVELOPMENT || DEBUG |
4338 | if (__improbable(vm_compressor_time_thread)) { |
4339 | vmct_stats.vmct_runtimes[cq->id] = thread_get_runtime_self(); |
4340 | vmct_stats.vmct_pages[cq->id] += ncomps; |
4341 | vmct_stats.vmct_iterations[cq->id]++; |
4342 | if (ncomps > vmct_stats.vmct_maxpages[cq->id]) { |
4343 | vmct_stats.vmct_maxpages[cq->id] = ncomps; |
4344 | } |
4345 | if (ncomps < vmct_stats.vmct_minpages[cq->id]) { |
4346 | vmct_stats.vmct_minpages[cq->id] = ncomps; |
4347 | } |
4348 | } |
4349 | #endif |
4350 | |
4351 | KERNEL_DEBUG(0xe0400018 | DBG_FUNC_END, 0, 0, 0, 0, 0); |
4352 | #if DEVELOPMENT || DEBUG |
4353 | if (compressor_running_perf_test && benchmark_accounting) { |
4354 | /* |
4355 | * We've been exclusively compressing pages from the benchmark queue, |
4356 | * do 1 pass over the internal queue before blocking. |
4357 | */ |
4358 | continue; |
4359 | } |
4360 | #endif |
4361 | |
sched_cond_wait_parameter(&(cq->pgo_wakeup), THREAD_UNINT, (thread_continue_t)vm_pageout_iothread_internal_continue, (void *) cq);
4363 | } |
4364 | /*NOTREACHED*/ |
4365 | } |
4366 | |
4367 | |
4368 | kern_return_t |
4369 | vm_pageout_compress_page(void **current_chead, char *scratch_buf, vm_page_t m) |
4370 | { |
4371 | vm_object_t object; |
memory_object_t pager;
4373 | int compressed_count_delta; |
4374 | kern_return_t retval; |
4375 | |
4376 | object = VM_PAGE_OBJECT(m); |
4377 | |
4378 | assert(!m->vmp_free_when_done); |
4379 | assert(!m->vmp_laundry); |
4380 | |
4381 | pager = object->pager; |
4382 | |
4383 | if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) { |
4384 | KERNEL_DEBUG(0xe0400010 | DBG_FUNC_START, object, pager, 0, 0, 0); |
4385 | |
4386 | vm_object_lock(object); |
4387 | |
4388 | /* |
4389 | * If there is no memory object for the page, create |
4390 | * one and hand it to the compression pager. |
4391 | */ |
4392 | |
4393 | if (!object->pager_initialized) { |
vm_object_collapse(object, (vm_object_offset_t) 0, TRUE);
4395 | } |
4396 | if (!object->pager_initialized) { |
4397 | vm_object_compressor_pager_create(object); |
4398 | } |
4399 | |
4400 | pager = object->pager; |
4401 | |
4402 | if (!object->pager_initialized || pager == MEMORY_OBJECT_NULL) { |
4403 | /* |
4404 | * Still no pager for the object, |
4405 | * or the pager has been destroyed. |
4406 | * Reactivate the page. |
4407 | * |
4408 | * Should only happen if there is no |
4409 | * compression pager |
4410 | */ |
4411 | PAGE_WAKEUP_DONE(m); |
4412 | |
4413 | vm_page_lockspin_queues(); |
vm_page_activate(m);
4415 | VM_PAGEOUT_DEBUG(vm_pageout_dirty_no_pager, 1); |
4416 | vm_page_unlock_queues(); |
4417 | |
4418 | /* |
4419 | * And we are done with it. |
4420 | */ |
4421 | vm_object_activity_end(object); |
4422 | vm_object_unlock(object); |
4423 | |
4424 | return KERN_FAILURE; |
4425 | } |
4426 | vm_object_unlock(object); |
4427 | |
4428 | KERNEL_DEBUG(0xe0400010 | DBG_FUNC_END, object, pager, 0, 0, 0); |
4429 | } |
4430 | assert(object->pager_initialized && pager != MEMORY_OBJECT_NULL); |
4431 | assert(object->activity_in_progress > 0); |
4432 | |
4433 | #if CONFIG_TRACK_UNMODIFIED_ANON_PAGES |
4434 | if (m->vmp_unmodified_ro == true) { |
4435 | os_atomic_inc(&compressor_ro_uncompressed_total_returned, relaxed); |
4436 | } |
4437 | #endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */ |
4438 | |
retval = vm_compressor_pager_put(
    pager,
    m->vmp_offset + object->paging_offset,
    VM_PAGE_GET_PHYS_PAGE(m),
#if CONFIG_TRACK_UNMODIFIED_ANON_PAGES
    m->vmp_unmodified_ro,
#else /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
    false,
#endif /* CONFIG_TRACK_UNMODIFIED_ANON_PAGES */
    current_chead,
    scratch_buf,
    &compressed_count_delta);
4451 | |
4452 | vm_object_lock(object); |
4453 | |
4454 | assert(object->activity_in_progress > 0); |
4455 | assert(VM_PAGE_OBJECT(m) == object); |
4456 | assert( !VM_PAGE_WIRED(m)); |
4457 | |
vm_compressor_pager_count(pager,
4459 | compressed_count_delta, |
4460 | FALSE, /* shared_lock */ |
4461 | object); |
4462 | |
4463 | if (retval == KERN_SUCCESS) { |
4464 | /* |
4465 | * If the object is purgeable, its owner's |
4466 | * purgeable ledgers will be updated in |
4467 | * vm_page_remove() but the page still |
4468 | * contributes to the owner's memory footprint, |
4469 | * so account for it as such. |
4470 | */ |
4471 | if ((object->purgable != VM_PURGABLE_DENY || |
4472 | object->vo_ledger_tag) && |
4473 | object->vo_owner != NULL) { |
4474 | /* one more compressed purgeable/tagged page */ |
vm_object_owner_compressed_update(object,
    compressed_count_delta);
4477 | } |
4478 | counter_inc(&vm_statistics_compressions); |
4479 | |
4480 | if (m->vmp_tabled) { |
vm_page_remove(m, TRUE);
4482 | } |
4483 | } else { |
4484 | PAGE_WAKEUP_DONE(m); |
4485 | |
4486 | vm_page_lockspin_queues(); |
4487 | |
vm_page_activate(m);
4489 | vm_pageout_vminfo.vm_compressor_failed++; |
4490 | |
4491 | vm_page_unlock_queues(); |
4492 | } |
4493 | vm_object_activity_end(object); |
4494 | vm_object_unlock(object); |
4495 | |
4496 | return retval; |
4497 | } |
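/*
 * Caller contract for vm_pageout_compress_page(), as exercised by the
 * compressor threads above: the page must hold an activity_in_progress
 * reference on its object and must already be off all page queues.  A
 * hedged sketch of a caller (variable names are illustrative):
 *
 *	if (vm_pageout_compress_page(current_chead, scratch_buf, m) == KERN_SUCCESS) {
 *		// the contents now live in the compressor; 'm' can be
 *		// batched up for vm_page_free_list()
 *	} else {
 *		// the page was reactivated on our behalf; nothing more to do
 *	}
 */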
4498 | |
4499 | |
4500 | static void |
4501 | vm_pageout_adjust_eq_iothrottle(struct pgo_iothread_state *ethr, boolean_t req_lowpriority) |
4502 | { |
4503 | uint32_t policy; |
4504 | |
4505 | if (hibernate_cleaning_in_progress == TRUE) { |
4506 | req_lowpriority = FALSE; |
4507 | } |
4508 | |
4509 | if (ethr->q->pgo_inited == TRUE && ethr->q->pgo_lowpriority != req_lowpriority) { |
4510 | vm_page_unlock_queues(); |
4511 | |
4512 | if (req_lowpriority == TRUE) { |
4513 | policy = THROTTLE_LEVEL_PAGEOUT_THROTTLED; |
4514 | DTRACE_VM(laundrythrottle); |
4515 | } else { |
4516 | policy = THROTTLE_LEVEL_PAGEOUT_UNTHROTTLED; |
4517 | DTRACE_VM(laundryunthrottle); |
4518 | } |
proc_set_thread_policy(ethr->pgo_iothread,
    TASK_POLICY_EXTERNAL, TASK_POLICY_IO, policy);
4521 | |
4522 | vm_page_lock_queues(); |
4523 | ethr->q->pgo_lowpriority = req_lowpriority; |
4524 | } |
4525 | } |
4526 | |
4527 | OS_NORETURN |
4528 | static void |
4529 | vm_pageout_iothread_external(struct pgo_iothread_state *ethr, __unused wait_result_t w) |
4530 | { |
4531 | thread_t self = current_thread(); |
4532 | |
4533 | self->options |= TH_OPT_VMPRIV; |
4534 | |
4535 | DTRACE_VM2(laundrythrottle, int, 1, (uint64_t *), NULL); |
4536 | |
proc_set_thread_policy(self, TASK_POLICY_EXTERNAL,
4538 | TASK_POLICY_IO, THROTTLE_LEVEL_PAGEOUT_THROTTLED); |
4539 | |
4540 | vm_page_lock_queues(); |
4541 | |
4542 | vm_pageout_queue_external.pgo_lowpriority = TRUE; |
4543 | vm_pageout_queue_external.pgo_inited = TRUE; |
4544 | |
4545 | vm_page_unlock_queues(); |
4546 | |
4547 | #if CONFIG_THREAD_GROUPS |
4548 | thread_group_vm_add(); |
4549 | #endif /* CONFIG_THREAD_GROUPS */ |
4550 | |
vm_pageout_iothread_external_continue(ethr, 0);
4552 | /*NOTREACHED*/ |
4553 | } |
4554 | |
4555 | |
4556 | OS_NORETURN |
4557 | static void |
4558 | vm_pageout_iothread_internal(struct pgo_iothread_state *cthr, __unused wait_result_t w) |
4559 | { |
4560 | thread_t self = current_thread(); |
4561 | |
4562 | self->options |= TH_OPT_VMPRIV; |
4563 | |
4564 | vm_page_lock_queues(); |
4565 | |
4566 | vm_pageout_queue_internal.pgo_lowpriority = TRUE; |
4567 | vm_pageout_queue_internal.pgo_inited = TRUE; |
4568 | |
4569 | #if DEVELOPMENT || DEBUG |
4570 | vm_pageout_queue_benchmark.pgo_lowpriority = vm_pageout_queue_internal.pgo_lowpriority; |
4571 | vm_pageout_queue_benchmark.pgo_inited = vm_pageout_queue_internal.pgo_inited; |
4572 | vm_pageout_queue_benchmark.pgo_busy = FALSE; |
4573 | #endif /* DEVELOPMENT || DEBUG */ |
4574 | |
4575 | vm_page_unlock_queues(); |
4576 | |
4577 | if (vm_pageout_state.vm_restricted_to_single_processor == TRUE) { |
4578 | thread_vm_bind_group_add(); |
4579 | } |
4580 | |
4581 | #if CONFIG_THREAD_GROUPS |
4582 | thread_group_vm_add(); |
4583 | #endif /* CONFIG_THREAD_GROUPS */ |
4584 | |
4585 | #if __AMP__ |
4586 | if (vm_compressor_ebound) { |
4587 | /* |
4588 | * Use the soft bound option for vm_compressor to allow it to run on |
4589 | * P-cores if E-cluster is unavailable. |
4590 | */ |
4591 | thread_bind_cluster_type(self, 'E', true); |
4592 | } |
4593 | #endif /* __AMP__ */ |
4594 | |
	thread_set_thread_name(current_thread(), "VM_compressor");
4596 | #if DEVELOPMENT || DEBUG |
4597 | vmct_stats.vmct_minpages[cthr->id] = INT32_MAX; |
4598 | #endif |
	vm_pageout_iothread_internal_continue(cthr, 0);
4600 | |
4601 | /*NOTREACHED*/ |
4602 | } |
4603 | |
4604 | kern_return_t |
4605 | vm_set_buffer_cleanup_callout(boolean_t (*func)(int)) |
4606 | { |
4607 | if (OSCompareAndSwapPtr(NULL, ptrauth_nop_cast(void *, func), (void * volatile *) &consider_buffer_cache_collect)) { |
4608 | return KERN_SUCCESS; |
4609 | } else { |
4610 | return KERN_FAILURE; /* Already set */ |
4611 | } |
4612 | } |
4613 | |
4614 | extern boolean_t memorystatus_manual_testing_on; |
4615 | extern unsigned int memorystatus_level; |
4616 | |
4617 | |
4618 | #if VM_PRESSURE_EVENTS |
4619 | |
4620 | boolean_t vm_pressure_events_enabled = FALSE; |
4621 | |
4622 | extern uint64_t next_warning_notification_sent_at_ts; |
4623 | extern uint64_t next_critical_notification_sent_at_ts; |
4624 | |
4625 | #define PRESSURE_LEVEL_STUCK_THRESHOLD_MINS (30) /* 30 minutes. */ |
4626 | |
4627 | /* |
4628 | * The last time there was change in pressure level OR we forced a check |
4629 | * because the system is stuck in a non-normal pressure level. |
4630 | */ |
4631 | uint64_t vm_pressure_last_level_transition_abs = 0; |
4632 | |
4633 | /* |
 * This is how long the system waits 'stuck' in an unchanged non-normal pressure
 * level before resending notifications for that level again.
4636 | */ |
4637 | int vm_pressure_level_transition_threshold = PRESSURE_LEVEL_STUCK_THRESHOLD_MINS; |
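
/*
 * Worked example (assuming the default 30-minute threshold): if the last
 * level transition happened at absolute time T, vm_pressure_response()
 * decides whether to force a re-notification for a stuck non-normal level
 * roughly as follows:
 *
 *	absolutetime_to_nanoseconds(mach_absolute_time() - T, &ns);
 *	mins = (int)((ns / NSEC_PER_SEC) / 60);
 *	force_check = (mins >= vm_pressure_level_transition_threshold);
 */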
4638 | |
4639 | void |
4640 | vm_pressure_response(void) |
4641 | { |
4642 | vm_pressure_level_t old_level = kVMPressureNormal; |
4643 | int new_level = -1; |
4644 | unsigned int total_pages; |
4645 | uint64_t available_memory = 0; |
4646 | uint64_t curr_ts, abs_time_since_level_transition, time_in_ns; |
4647 | bool force_check = false; |
4648 | int time_in_mins; |
4649 | |
4650 | |
4651 | if (vm_pressure_events_enabled == FALSE) { |
4652 | return; |
4653 | } |
4654 | |
4655 | #if !XNU_TARGET_OS_OSX |
4656 | |
4657 | available_memory = (uint64_t) memorystatus_available_pages; |
4658 | |
4659 | #else /* !XNU_TARGET_OS_OSX */ |
4660 | |
4661 | available_memory = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY; |
4662 | memorystatus_available_pages = (uint64_t) AVAILABLE_NON_COMPRESSED_MEMORY; |
4663 | |
4664 | #endif /* !XNU_TARGET_OS_OSX */ |
4665 | |
4666 | total_pages = (unsigned int) atop_64(max_mem); |
4667 | #if CONFIG_SECLUDED_MEMORY |
4668 | total_pages -= vm_page_secluded_count; |
4669 | #endif /* CONFIG_SECLUDED_MEMORY */ |
4670 | memorystatus_level = (unsigned int) ((available_memory * 100) / total_pages); |
4671 | |
4672 | if (memorystatus_manual_testing_on) { |
4673 | return; |
4674 | } |
4675 | |
4676 | curr_ts = mach_absolute_time(); |
4677 | abs_time_since_level_transition = curr_ts - vm_pressure_last_level_transition_abs; |
4678 | |
	absolutetime_to_nanoseconds(abs_time_since_level_transition, &time_in_ns);
4680 | time_in_mins = (int) ((time_in_ns / NSEC_PER_SEC) / 60); |
4681 | force_check = (time_in_mins >= vm_pressure_level_transition_threshold); |
4682 | |
4683 | old_level = memorystatus_vm_pressure_level; |
4684 | |
4685 | switch (memorystatus_vm_pressure_level) { |
4686 | case kVMPressureNormal: |
4687 | { |
4688 | if (VM_PRESSURE_WARNING_TO_CRITICAL()) { |
4689 | new_level = kVMPressureCritical; |
4690 | } else if (VM_PRESSURE_NORMAL_TO_WARNING()) { |
4691 | new_level = kVMPressureWarning; |
4692 | } |
4693 | break; |
4694 | } |
4695 | |
4696 | case kVMPressureWarning: |
4697 | case kVMPressureUrgent: |
4698 | { |
4699 | if (VM_PRESSURE_WARNING_TO_NORMAL()) { |
4700 | new_level = kVMPressureNormal; |
4701 | } else if (VM_PRESSURE_WARNING_TO_CRITICAL()) { |
4702 | new_level = kVMPressureCritical; |
4703 | } else if (force_check) { |
4704 | new_level = kVMPressureWarning; |
4705 | next_warning_notification_sent_at_ts = curr_ts; |
4706 | } |
4707 | break; |
4708 | } |
4709 | |
4710 | case kVMPressureCritical: |
4711 | { |
4712 | if (VM_PRESSURE_WARNING_TO_NORMAL()) { |
4713 | new_level = kVMPressureNormal; |
4714 | } else if (VM_PRESSURE_CRITICAL_TO_WARNING()) { |
4715 | new_level = kVMPressureWarning; |
4716 | } else if (force_check) { |
4717 | new_level = kVMPressureCritical; |
4718 | next_critical_notification_sent_at_ts = curr_ts; |
4719 | } |
4720 | break; |
4721 | } |
4722 | |
4723 | default: |
4724 | return; |
4725 | } |
4726 | |
4727 | if (new_level != -1 || force_check) { |
4728 | if (new_level != -1) { |
4729 | memorystatus_vm_pressure_level = (vm_pressure_level_t) new_level; |
4730 | |
4731 | if (new_level != (int) old_level) { |
4732 | VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE, |
4733 | new_level, old_level, 0, 0); |
4734 | } |
4735 | } else { |
4736 | VM_DEBUG_CONSTANT_EVENT(vm_pressure_level_change, VM_PRESSURE_LEVEL_CHANGE, DBG_FUNC_NONE, |
4737 | new_level, old_level, force_check, 0); |
4738 | } |
4739 | |
4740 | if (hibernation_vmqueues_inspection || hibernate_cleaning_in_progress) { |
4741 | /* |
4742 | * We don't want to schedule a wakeup while hibernation is in progress |
4743 | * because that could collide with checks for non-monotonicity in the scheduler. |
4744 | * We do however do all the updates to memorystatus_vm_pressure_level because |
4745 | * we _might_ want to use that for decisions regarding which pages or how |
4746 | * many pages we want to dump in hibernation. |
4747 | */ |
4748 | return; |
4749 | } |
4750 | |
4751 | if ((memorystatus_vm_pressure_level != kVMPressureNormal) || (old_level != memorystatus_vm_pressure_level) || force_check) { |
4752 | if (vm_pageout_state.vm_pressure_thread_running == FALSE) { |
4753 | thread_wakeup(&vm_pressure_thread); |
4754 | } |
4755 | |
4756 | if (old_level != memorystatus_vm_pressure_level) { |
4757 | thread_wakeup(&vm_pageout_state.vm_pressure_changed); |
4758 | } |
4759 | vm_pressure_last_level_transition_abs = curr_ts; /* renew the window of observation for a stuck pressure level */ |
4760 | } |
4761 | } |
4762 | } |
4763 | #endif /* VM_PRESSURE_EVENTS */ |
4764 | |
4765 | |
4766 | /** |
4767 | * Called by a kernel thread to ask if a number of pages may be wired. |
4768 | */ |
4769 | kern_return_t |
4770 | mach_vm_wire_level_monitor(int64_t requested_pages) |
4771 | { |
4772 | if (requested_pages <= 0) { |
4773 | return KERN_INVALID_ARGUMENT; |
4774 | } |
4775 | |
4776 | const int64_t max_wire_pages = atop_64(vm_global_user_wire_limit); |
4777 | /** |
4778 | * Available pages can be negative in the case where more system memory is |
4779 | * wired than the threshold, so we must use a signed integer. |
4780 | */ |
4781 | const int64_t available_pages = max_wire_pages - vm_page_wire_count; |
4782 | |
4783 | if (requested_pages > available_pages) { |
4784 | return KERN_RESOURCE_SHORTAGE; |
4785 | } |
4786 | return KERN_SUCCESS; |
4787 | } |
4788 | |
4789 | /* |
4790 | * Function called by a kernel thread to either get the current pressure level or |
4791 | * wait until memory pressure changes from a given level. |
4792 | */ |
4793 | kern_return_t |
4794 | mach_vm_pressure_level_monitor(__unused boolean_t wait_for_pressure, __unused unsigned int *pressure_level) |
4795 | { |
4796 | #if !VM_PRESSURE_EVENTS |
4797 | |
4798 | return KERN_FAILURE; |
4799 | |
4800 | #else /* VM_PRESSURE_EVENTS */ |
4801 | |
4802 | wait_result_t wr = 0; |
4803 | vm_pressure_level_t old_level = memorystatus_vm_pressure_level; |
4804 | |
4805 | if (pressure_level == NULL) { |
4806 | return KERN_INVALID_ARGUMENT; |
4807 | } |
4808 | |
4809 | if (*pressure_level == kVMPressureJetsam) { |
4810 | if (!wait_for_pressure) { |
4811 | return KERN_INVALID_ARGUMENT; |
4812 | } |
4813 | |
		lck_mtx_lock(&memorystatus_jetsam_fg_band_lock);
		wr = assert_wait((event_t)&memorystatus_jetsam_fg_band_waiters,
		    THREAD_INTERRUPTIBLE);
		if (wr == THREAD_WAITING) {
			++memorystatus_jetsam_fg_band_waiters;
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
			wr = thread_block(THREAD_CONTINUE_NULL);
		} else {
			lck_mtx_unlock(&memorystatus_jetsam_fg_band_lock);
4823 | } |
4824 | if (wr != THREAD_AWAKENED) { |
4825 | return KERN_ABORTED; |
4826 | } |
4827 | *pressure_level = kVMPressureJetsam; |
4828 | return KERN_SUCCESS; |
4829 | } |
4830 | |
4831 | if (wait_for_pressure == TRUE) { |
4832 | while (old_level == *pressure_level) { |
			wr = assert_wait((event_t) &vm_pageout_state.vm_pressure_changed,
4834 | THREAD_INTERRUPTIBLE); |
4835 | if (wr == THREAD_WAITING) { |
4836 | wr = thread_block(THREAD_CONTINUE_NULL); |
4837 | } |
4838 | if (wr == THREAD_INTERRUPTED) { |
4839 | return KERN_ABORTED; |
4840 | } |
4841 | |
4842 | if (wr == THREAD_AWAKENED) { |
4843 | old_level = memorystatus_vm_pressure_level; |
4844 | } |
4845 | } |
4846 | } |
4847 | |
4848 | *pressure_level = old_level; |
4849 | return KERN_SUCCESS; |
4850 | #endif /* VM_PRESSURE_EVENTS */ |
4851 | } |
4852 | |
4853 | #if VM_PRESSURE_EVENTS |
4854 | void |
4855 | vm_pressure_thread(void) |
4856 | { |
4857 | static boolean_t thread_initialized = FALSE; |
4858 | |
4859 | if (thread_initialized == TRUE) { |
4860 | vm_pageout_state.vm_pressure_thread_running = TRUE; |
4861 | consider_vm_pressure_events(); |
4862 | vm_pageout_state.vm_pressure_thread_running = FALSE; |
4863 | } |
4864 | |
4865 | #if CONFIG_THREAD_GROUPS |
4866 | thread_group_vm_add(); |
4867 | #endif /* CONFIG_THREAD_GROUPS */ |
4868 | |
	thread_set_thread_name(current_thread(), "VM_pressure");
	thread_initialized = TRUE;
	assert_wait((event_t) &vm_pressure_thread, THREAD_UNINT);
	thread_block((thread_continue_t)vm_pressure_thread);
4873 | } |
4874 | #endif /* VM_PRESSURE_EVENTS */ |
4875 | |
4876 | |
4877 | /* |
4878 | * called once per-second via "compute_averages" |
4879 | */ |
4880 | void |
4881 | compute_pageout_gc_throttle(__unused void *arg) |
4882 | { |
4883 | if (vm_pageout_vminfo.vm_pageout_considered_page != vm_pageout_state.vm_pageout_considered_page_last) { |
4884 | vm_pageout_state.vm_pageout_considered_page_last = vm_pageout_vminfo.vm_pageout_considered_page; |
4885 | |
4886 | thread_wakeup(VM_PAGEOUT_GC_EVENT); |
4887 | } |
4888 | } |
4889 | |
4890 | /* |
4891 | * vm_pageout_garbage_collect can also be called when the zone allocator needs |
4892 | * to call zone_gc on a different thread in order to trigger zone-map-exhaustion |
4893 | * jetsams. We need to check if the zone map size is above its jetsam limit to |
4894 | * decide if this was indeed the case. |
4895 | * |
 * We need to do this on a different thread for the following reasons:
4897 | * |
4898 | * 1. In the case of synchronous jetsams, the leaking process can try to jetsam |
4899 | * itself causing the system to hang. We perform synchronous jetsams if we're |
4900 | * leaking in the VM map entries zone, so the leaking process could be doing a |
4901 | * zalloc for a VM map entry while holding its vm_map lock, when it decides to |
4902 | * jetsam itself. We also need the vm_map lock on the process termination path, |
4903 | * which would now lead the dying process to deadlock against itself. |
4904 | * |
4905 | * 2. The jetsam path might need to allocate zone memory itself. We could try |
4906 | * using the non-blocking variant of zalloc for this path, but we can still |
4907 | * end up trying to do a kmem_alloc when the zone maps are almost full. |
4908 | */ |
4909 | __dead2 |
4910 | void |
4911 | vm_pageout_garbage_collect(void *step, wait_result_t wr __unused) |
4912 | { |
4913 | assert(step == VM_PAGEOUT_GC_INIT || step == VM_PAGEOUT_GC_COLLECT); |
4914 | |
4915 | if (step == VM_PAGEOUT_GC_INIT) { |
4916 | /* first time being called is not about GC */ |
4917 | #if CONFIG_THREAD_GROUPS |
4918 | thread_group_vm_add(); |
4919 | #endif /* CONFIG_THREAD_GROUPS */ |
4920 | } else if (zone_map_nearing_exhaustion()) { |
4921 | /* |
4922 | * Woken up by the zone allocator for zone-map-exhaustion jetsams. |
4923 | * |
4924 | * Bail out after calling zone_gc (which triggers the |
4925 | * zone-map-exhaustion jetsams). If we fall through, the subsequent |
4926 | * operations that clear out a bunch of caches might allocate zone |
		 * memory themselves (e.g. vm_map operations would need VM map
4928 | * entries). Since the zone map is almost full at this point, we |
4929 | * could end up with a panic. We just need to quickly jetsam a |
4930 | * process and exit here. |
4931 | * |
4932 | * It could so happen that we were woken up to relieve memory |
4933 | * pressure and the zone map also happened to be near its limit at |
4934 | * the time, in which case we'll skip out early. But that should be |
4935 | * ok; if memory pressure persists, the thread will simply be woken |
4936 | * up again. |
4937 | */ |
		zone_gc(ZONE_GC_JETSAM);
4939 | } else { |
4940 | /* Woken up by vm_pageout_scan or compute_pageout_gc_throttle. */ |
4941 | boolean_t buf_large_zfree = FALSE; |
4942 | boolean_t first_try = TRUE; |
4943 | |
4944 | stack_collect(); |
4945 | |
4946 | consider_machine_collect(); |
4947 | #if CONFIG_MBUF_MCACHE |
4948 | mbuf_drain(FALSE); |
4949 | #endif /* CONFIG_MBUF_MCACHE */ |
4950 | |
4951 | do { |
4952 | if (consider_buffer_cache_collect != NULL) { |
4953 | buf_large_zfree = (*consider_buffer_cache_collect)(0); |
4954 | } |
4955 | if (first_try == TRUE || buf_large_zfree == TRUE) { |
4956 | /* |
4957 | * zone_gc should be last, because the other operations |
4958 | * might return memory to zones. |
4959 | */ |
				zone_gc(ZONE_GC_TRIM);
4961 | } |
4962 | first_try = FALSE; |
4963 | } while (buf_large_zfree == TRUE && vm_page_free_count < vm_page_free_target); |
4964 | |
4965 | consider_machine_adjust(); |
4966 | } |
4967 | |
4968 | assert_wait(VM_PAGEOUT_GC_EVENT, THREAD_UNINT); |
4969 | |
	thread_block_parameter(vm_pageout_garbage_collect, VM_PAGEOUT_GC_COLLECT);
4971 | __builtin_unreachable(); |
4972 | } |
4973 | |
4974 | |
4975 | #if VM_PAGE_BUCKETS_CHECK |
4976 | #if VM_PAGE_FAKE_BUCKETS |
4977 | extern vm_map_offset_t vm_page_fake_buckets_start, vm_page_fake_buckets_end; |
4978 | #endif /* VM_PAGE_FAKE_BUCKETS */ |
4979 | #endif /* VM_PAGE_BUCKETS_CHECK */ |
4980 | |
4981 | |
4982 | |
4983 | void |
4984 | vm_set_restrictions(unsigned int num_cpus) |
4985 | { |
4986 | int vm_restricted_to_single_processor = 0; |
4987 | |
	if (PE_parse_boot_argn("vm_restricted_to_single_processor", &vm_restricted_to_single_processor, sizeof(vm_restricted_to_single_processor))) {
		kprintf("Overriding vm_restricted_to_single_processor to %d\n", vm_restricted_to_single_processor);
4990 | vm_pageout_state.vm_restricted_to_single_processor = (vm_restricted_to_single_processor ? TRUE : FALSE); |
4991 | } else { |
4992 | assert(num_cpus > 0); |
4993 | |
4994 | if (num_cpus <= 3) { |
4995 | /* |
4996 | * on systems with a limited number of CPUS, bind the |
4997 | * 4 major threads that can free memory and that tend to use |
4998 | * a fair bit of CPU under pressured conditions to a single processor. |
			 * This ensures that these threads don't hog all of the available CPUs
5000 | * (important for camera launch), while allowing them to run independently |
5001 | * w/r to locks... the 4 threads are |
5002 | * vm_pageout_scan, vm_pageout_iothread_internal (compressor), |
5003 | * vm_compressor_swap_trigger_thread (minor and major compactions), |
5004 | * memorystatus_thread (jetsams). |
5005 | * |
5006 | * the first time the thread is run, it is responsible for checking the |
5007 | * state of vm_restricted_to_single_processor, and if TRUE it calls |
5008 | * thread_bind_master... someday this should be replaced with a group |
5009 | * scheduling mechanism and KPI. |
5010 | */ |
5011 | vm_pageout_state.vm_restricted_to_single_processor = TRUE; |
5012 | } else { |
5013 | vm_pageout_state.vm_restricted_to_single_processor = FALSE; |
5014 | } |
5015 | } |
5016 | } |
5017 | |
5018 | /* |
5019 | * Set up vm_config based on the vm_compressor_mode. |
5020 | * Must run BEFORE the pageout thread starts up. |
5021 | */ |
5022 | __startup_func |
5023 | void |
5024 | vm_config_init(void) |
5025 | { |
	bzero(&vm_config, sizeof(vm_config));
5027 | |
5028 | switch (vm_compressor_mode) { |
5029 | case VM_PAGER_DEFAULT: |
5030 | printf(format: "mapping deprecated VM_PAGER_DEFAULT to VM_PAGER_COMPRESSOR_WITH_SWAP\n" ); |
5031 | OS_FALLTHROUGH; |
5032 | |
5033 | case VM_PAGER_COMPRESSOR_WITH_SWAP: |
5034 | vm_config.compressor_is_present = TRUE; |
5035 | vm_config.swap_is_present = TRUE; |
5036 | vm_config.compressor_is_active = TRUE; |
5037 | vm_config.swap_is_active = TRUE; |
5038 | break; |
5039 | |
5040 | case VM_PAGER_COMPRESSOR_NO_SWAP: |
5041 | vm_config.compressor_is_present = TRUE; |
5042 | vm_config.swap_is_present = TRUE; |
5043 | vm_config.compressor_is_active = TRUE; |
5044 | break; |
5045 | |
5046 | case VM_PAGER_FREEZER_DEFAULT: |
5047 | printf(format: "mapping deprecated VM_PAGER_FREEZER_DEFAULT to VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP\n" ); |
5048 | OS_FALLTHROUGH; |
5049 | |
5050 | case VM_PAGER_FREEZER_COMPRESSOR_NO_SWAP: |
5051 | vm_config.compressor_is_present = TRUE; |
5052 | vm_config.swap_is_present = TRUE; |
5053 | break; |
5054 | |
5055 | case VM_PAGER_COMPRESSOR_NO_SWAP_PLUS_FREEZER_COMPRESSOR_WITH_SWAP: |
5056 | vm_config.compressor_is_present = TRUE; |
5057 | vm_config.swap_is_present = TRUE; |
5058 | vm_config.compressor_is_active = TRUE; |
5059 | vm_config.freezer_swap_is_active = TRUE; |
5060 | break; |
5061 | |
5062 | case VM_PAGER_NOT_CONFIGURED: |
5063 | break; |
5064 | |
5065 | default: |
5066 | printf(format: "unknown compressor mode - %x\n" , vm_compressor_mode); |
5067 | break; |
5068 | } |
5069 | } |
5070 | |
5071 | __startup_func |
5072 | static void |
5073 | vm_pageout_create_gc_thread(void) |
5074 | { |
5075 | thread_t thread; |
5076 | |
	if (kernel_thread_create(vm_pageout_garbage_collect,
	    VM_PAGEOUT_GC_INIT, BASEPRI_DEFAULT, &thread) != KERN_SUCCESS) {
		panic("vm_pageout_garbage_collect: create failed");
	}
	thread_set_thread_name(thread, "VM_pageout_garbage_collect");
5082 | if (thread->reserved_stack == 0) { |
5083 | assert(thread->kernel_stack); |
5084 | thread->reserved_stack = thread->kernel_stack; |
5085 | } |
5086 | |
5087 | /* thread is started in vm_pageout() */ |
5088 | vm_pageout_gc_thread = thread; |
5089 | } |
5090 | STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_pageout_create_gc_thread); |
5091 | |
5092 | void |
5093 | vm_pageout(void) |
5094 | { |
5095 | thread_t self = current_thread(); |
5096 | thread_t thread; |
5097 | kern_return_t result; |
5098 | spl_t s; |
5099 | |
5100 | /* |
5101 | * Set thread privileges. |
5102 | */ |
5103 | s = splsched(); |
5104 | |
5105 | #if CONFIG_VPS_DYNAMIC_PRIO |
5106 | if (vps_dynamic_priority_enabled) { |
5107 | sched_set_kernel_thread_priority(self, MAXPRI_THROTTLE); |
5108 | thread_set_eager_preempt(self); |
5109 | } else { |
5110 | sched_set_kernel_thread_priority(self, BASEPRI_VM); |
5111 | } |
5112 | #else /* CONFIG_VPS_DYNAMIC_PRIO */ |
	sched_set_kernel_thread_priority(self, BASEPRI_VM);
5114 | #endif /* CONFIG_VPS_DYNAMIC_PRIO */ |
5115 | |
5116 | thread_lock(self); |
5117 | self->options |= TH_OPT_VMPRIV; |
5118 | thread_unlock(self); |
5119 | |
5120 | if (!self->reserved_stack) { |
5121 | self->reserved_stack = self->kernel_stack; |
5122 | } |
5123 | |
5124 | if (vm_pageout_state.vm_restricted_to_single_processor == TRUE && |
5125 | !vps_dynamic_priority_enabled) { |
5126 | thread_vm_bind_group_add(); |
5127 | } |
5128 | |
5129 | |
5130 | #if CONFIG_THREAD_GROUPS |
5131 | thread_group_vm_add(); |
5132 | #endif /* CONFIG_THREAD_GROUPS */ |
5133 | |
5134 | #if __AMP__ |
5135 | PE_parse_boot_argn("vmpgo_pcluster" , &vm_pgo_pbound, sizeof(vm_pgo_pbound)); |
5136 | if (vm_pgo_pbound) { |
5137 | /* |
5138 | * Use the soft bound option for vm pageout to allow it to run on |
5139 | * E-cores if P-cluster is unavailable. |
5140 | */ |
5141 | thread_bind_cluster_type(self, 'P', true); |
5142 | } |
5143 | #endif /* __AMP__ */ |
5144 | |
5145 | PE_parse_boot_argn(arg_string: "vmpgo_protect_realtime" , |
5146 | arg_ptr: &vm_pageout_protect_realtime, |
5147 | max_arg: sizeof(vm_pageout_protect_realtime)); |
5148 | splx(s); |
5149 | |
	thread_set_thread_name(current_thread(), "VM_pageout_scan");
5151 | |
5152 | /* |
5153 | * Initialize some paging parameters. |
5154 | */ |
5155 | |
5156 | vm_pageout_state.vm_pressure_thread_running = FALSE; |
5157 | vm_pageout_state.vm_pressure_changed = FALSE; |
5158 | vm_pageout_state.memorystatus_purge_on_warning = 2; |
5159 | vm_pageout_state.memorystatus_purge_on_urgent = 5; |
5160 | vm_pageout_state.memorystatus_purge_on_critical = 8; |
5161 | vm_pageout_state.vm_page_speculative_q_age_ms = VM_PAGE_SPECULATIVE_Q_AGE_MS; |
5162 | vm_pageout_state.vm_page_speculative_percentage = 5; |
5163 | vm_pageout_state.vm_page_speculative_target = 0; |
5164 | |
5165 | vm_pageout_state.vm_pageout_swap_wait = 0; |
5166 | vm_pageout_state.vm_pageout_idle_wait = 0; |
5167 | vm_pageout_state.vm_pageout_empty_wait = 0; |
5168 | vm_pageout_state.vm_pageout_burst_wait = 0; |
5169 | vm_pageout_state.vm_pageout_deadlock_wait = 0; |
5170 | vm_pageout_state.vm_pageout_deadlock_relief = 0; |
5171 | vm_pageout_state.vm_pageout_burst_inactive_throttle = 0; |
5172 | |
5173 | vm_pageout_state.vm_pageout_inactive = 0; |
5174 | vm_pageout_state.vm_pageout_inactive_used = 0; |
5175 | vm_pageout_state.vm_pageout_inactive_clean = 0; |
5176 | |
5177 | vm_pageout_state.vm_memory_pressure = 0; |
5178 | vm_pageout_state.vm_page_filecache_min = 0; |
5179 | #if CONFIG_JETSAM |
5180 | vm_pageout_state.vm_page_filecache_min_divisor = 70; |
5181 | vm_pageout_state.vm_page_xpmapped_min_divisor = 40; |
5182 | #else |
5183 | vm_pageout_state.vm_page_filecache_min_divisor = 27; |
5184 | vm_pageout_state.vm_page_xpmapped_min_divisor = 36; |
5185 | #endif |
5186 | vm_pageout_state.vm_page_free_count_init = vm_page_free_count; |
5187 | |
5188 | vm_pageout_state.vm_pageout_considered_page_last = 0; |
5189 | |
5190 | if (vm_pageout_state.vm_pageout_swap_wait == 0) { |
5191 | vm_pageout_state.vm_pageout_swap_wait = VM_PAGEOUT_SWAP_WAIT; |
5192 | } |
5193 | |
5194 | if (vm_pageout_state.vm_pageout_idle_wait == 0) { |
5195 | vm_pageout_state.vm_pageout_idle_wait = VM_PAGEOUT_IDLE_WAIT; |
5196 | } |
5197 | |
5198 | if (vm_pageout_state.vm_pageout_burst_wait == 0) { |
5199 | vm_pageout_state.vm_pageout_burst_wait = VM_PAGEOUT_BURST_WAIT; |
5200 | } |
5201 | |
5202 | if (vm_pageout_state.vm_pageout_empty_wait == 0) { |
5203 | vm_pageout_state.vm_pageout_empty_wait = VM_PAGEOUT_EMPTY_WAIT; |
5204 | } |
5205 | |
5206 | if (vm_pageout_state.vm_pageout_deadlock_wait == 0) { |
5207 | vm_pageout_state.vm_pageout_deadlock_wait = VM_PAGEOUT_DEADLOCK_WAIT; |
5208 | } |
5209 | |
5210 | if (vm_pageout_state.vm_pageout_deadlock_relief == 0) { |
5211 | vm_pageout_state.vm_pageout_deadlock_relief = VM_PAGEOUT_DEADLOCK_RELIEF; |
5212 | } |
5213 | |
5214 | if (vm_pageout_state.vm_pageout_burst_inactive_throttle == 0) { |
5215 | vm_pageout_state.vm_pageout_burst_inactive_throttle = VM_PAGEOUT_BURST_INACTIVE_THROTTLE; |
5216 | } |
5217 | /* |
5218 | * even if we've already called vm_page_free_reserve |
	 * call it again here to ensure that the targets are
5220 | * accurately calculated (it uses vm_page_free_count_init) |
5221 | * calling it with an arg of 0 will not change the reserve |
5222 | * but will re-calculate free_min and free_target |
5223 | */ |
5224 | if (vm_page_free_reserved < VM_PAGE_FREE_RESERVED(processor_count)) { |
		vm_page_free_reserve((VM_PAGE_FREE_RESERVED(processor_count)) - vm_page_free_reserved);
	} else {
		vm_page_free_reserve(0);
5228 | } |
5229 | |
	bzero(&vm_pageout_queue_external, sizeof(struct vm_pageout_queue));
	bzero(&vm_pageout_queue_internal, sizeof(struct vm_pageout_queue));
5232 | |
5233 | vm_page_queue_init(&vm_pageout_queue_external.pgo_pending); |
5234 | vm_pageout_queue_external.pgo_maxlaundry = VM_PAGE_LAUNDRY_MAX; |
5235 | |
5236 | vm_page_queue_init(&vm_pageout_queue_internal.pgo_pending); |
5237 | |
5238 | #if DEVELOPMENT || DEBUG |
5239 | bzero(&vm_pageout_queue_benchmark, sizeof(struct vm_pageout_queue)); |
5240 | vm_page_queue_init(&vm_pageout_queue_benchmark.pgo_pending); |
5241 | #endif /* DEVELOPMENT || DEBUG */ |
5242 | |
5243 | |
5244 | /* internal pageout thread started when default pager registered first time */ |
5245 | /* external pageout and garbage collection threads started here */ |
5246 | struct pgo_iothread_state *ethr = &pgo_iothread_external_state; |
5247 | ethr->id = 0; |
5248 | ethr->q = &vm_pageout_queue_external; |
5249 | ethr->current_early_swapout_chead = NULL; |
5250 | ethr->current_regular_swapout_chead = NULL; |
5251 | ethr->current_late_swapout_chead = NULL; |
5252 | ethr->scratch_buf = NULL; |
5253 | #if DEVELOPMENT || DEBUG |
5254 | ethr->benchmark_q = NULL; |
5255 | #endif /* DEVELOPMENT || DEBUG */ |
	sched_cond_init(&(ethr->pgo_wakeup));
5257 | |
	result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_external,
	    (void *)ethr, BASEPRI_VM,
	    &(ethr->pgo_iothread));
	if (result != KERN_SUCCESS) {
		panic("vm_pageout: Unable to create external thread (%d)\n", result);
	}
	thread_set_thread_name(ethr->pgo_iothread, "VM_pageout_external_iothread");

	thread_mtx_lock(vm_pageout_gc_thread);
	thread_start(vm_pageout_gc_thread);
	thread_mtx_unlock(vm_pageout_gc_thread);
5269 | |
5270 | #if VM_PRESSURE_EVENTS |
	result = kernel_thread_start_priority((thread_continue_t)vm_pressure_thread, NULL,
	    BASEPRI_DEFAULT,
	    &thread);

	if (result != KERN_SUCCESS) {
		panic("vm_pressure_thread: create failed");
5277 | } |
5278 | |
5279 | thread_deallocate(thread); |
5280 | #endif |
5281 | |
5282 | vm_object_reaper_init(); |
5283 | |
5284 | |
5285 | if (VM_CONFIG_COMPRESSOR_IS_PRESENT) { |
5286 | vm_compressor_init(); |
5287 | } |
5288 | |
5289 | #if VM_PRESSURE_EVENTS |
5290 | vm_pressure_events_enabled = TRUE; |
5291 | #endif /* VM_PRESSURE_EVENTS */ |
5292 | |
5293 | #if CONFIG_PHANTOM_CACHE |
5294 | vm_phantom_cache_init(); |
5295 | #endif |
5296 | #if VM_PAGE_BUCKETS_CHECK |
5297 | #if VM_PAGE_FAKE_BUCKETS |
5298 | printf("**** DEBUG: protecting fake buckets [0x%llx:0x%llx]\n" , |
5299 | (uint64_t) vm_page_fake_buckets_start, |
5300 | (uint64_t) vm_page_fake_buckets_end); |
5301 | pmap_protect(kernel_pmap, |
5302 | vm_page_fake_buckets_start, |
5303 | vm_page_fake_buckets_end, |
5304 | VM_PROT_READ); |
5305 | // *(char *) vm_page_fake_buckets_start = 'x'; /* panic! */ |
5306 | #endif /* VM_PAGE_FAKE_BUCKETS */ |
5307 | #endif /* VM_PAGE_BUCKETS_CHECK */ |
5308 | |
5309 | #if VM_OBJECT_TRACKING |
5310 | vm_object_tracking_init(); |
5311 | #endif /* VM_OBJECT_TRACKING */ |
5312 | |
5313 | #if __arm64__ |
5314 | // vm_tests(); |
5315 | #endif /* __arm64__ */ |
5316 | |
5317 | vm_pageout_continue(); |
5318 | |
5319 | /* |
5320 | * Unreached code! |
5321 | * |
5322 | * The vm_pageout_continue() call above never returns, so the code below is never |
5323 | * executed. We take advantage of this to declare several DTrace VM related probe |
5324 | * points that our kernel doesn't have an analog for. These are probe points that |
5325 | * exist in Solaris and are in the DTrace documentation, so people may have written |
5326 | * scripts that use them. Declaring the probe points here means their scripts will |
5327 | * compile and execute which we want for portability of the scripts, but since this |
5328 | * section of code is never reached, the probe points will simply never fire. Yes, |
5329 | * this is basically a hack. The problem is the DTrace probe points were chosen with |
5330 | * Solaris specific VM events in mind, not portability to different VM implementations. |
5331 | */ |
5332 | |
5333 | DTRACE_VM2(execfree, int, 1, (uint64_t *), NULL); |
5334 | DTRACE_VM2(execpgin, int, 1, (uint64_t *), NULL); |
5335 | DTRACE_VM2(execpgout, int, 1, (uint64_t *), NULL); |
5336 | DTRACE_VM2(pgswapin, int, 1, (uint64_t *), NULL); |
5337 | DTRACE_VM2(pgswapout, int, 1, (uint64_t *), NULL); |
5338 | DTRACE_VM2(swapin, int, 1, (uint64_t *), NULL); |
5339 | DTRACE_VM2(swapout, int, 1, (uint64_t *), NULL); |
5340 | /*NOTREACHED*/ |
5341 | } |
5342 | |
5343 | |
5344 | |
5345 | kern_return_t |
5346 | vm_pageout_internal_start(void) |
5347 | { |
5348 | kern_return_t result = KERN_SUCCESS; |
5349 | host_basic_info_data_t hinfo; |
5350 | vm_offset_t buf, bufsize; |
5351 | |
5352 | assert(VM_CONFIG_COMPRESSOR_IS_PRESENT); |
5353 | |
5354 | mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT; |
5355 | #define BSD_HOST 1 |
	host_info((host_t)BSD_HOST, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
5357 | |
5358 | assert(hinfo.max_cpus > 0); |
5359 | |
5360 | #if !XNU_TARGET_OS_OSX |
5361 | vm_pageout_state.vm_compressor_thread_count = 1; |
5362 | #else /* !XNU_TARGET_OS_OSX */ |
5363 | if (hinfo.max_cpus > 4) { |
5364 | vm_pageout_state.vm_compressor_thread_count = 2; |
5365 | } else { |
5366 | vm_pageout_state.vm_compressor_thread_count = 1; |
5367 | } |
5368 | #endif /* !XNU_TARGET_OS_OSX */ |
5369 | #if __AMP__ |
5370 | if (vm_compressor_ebound) { |
5371 | vm_pageout_state.vm_compressor_thread_count = 2; |
5372 | } |
5373 | #endif |
5374 | PE_parse_boot_argn(arg_string: "vmcomp_threads" , arg_ptr: &vm_pageout_state.vm_compressor_thread_count, |
5375 | max_arg: sizeof(vm_pageout_state.vm_compressor_thread_count)); |
5376 | |
5377 | if (vm_pageout_state.vm_compressor_thread_count >= hinfo.max_cpus) { |
5378 | vm_pageout_state.vm_compressor_thread_count = hinfo.max_cpus - 1; |
5379 | } |
5380 | if (vm_pageout_state.vm_compressor_thread_count <= 0) { |
5381 | vm_pageout_state.vm_compressor_thread_count = 1; |
5382 | } else if (vm_pageout_state.vm_compressor_thread_count > MAX_COMPRESSOR_THREAD_COUNT) { |
5383 | vm_pageout_state.vm_compressor_thread_count = MAX_COMPRESSOR_THREAD_COUNT; |
5384 | } |
5385 | |
5386 | vm_pageout_queue_internal.pgo_maxlaundry = |
5387 | (vm_pageout_state.vm_compressor_thread_count * 4) * VM_PAGE_LAUNDRY_MAX; |
5388 | |
5389 | PE_parse_boot_argn(arg_string: "vmpgoi_maxlaundry" , |
5390 | arg_ptr: &vm_pageout_queue_internal.pgo_maxlaundry, |
5391 | max_arg: sizeof(vm_pageout_queue_internal.pgo_maxlaundry)); |
5392 | |
5393 | #if DEVELOPMENT || DEBUG |
5394 | // Note: this will be modified at enqueue-time such that the benchmark queue is never throttled |
5395 | vm_pageout_queue_benchmark.pgo_maxlaundry = vm_pageout_queue_internal.pgo_maxlaundry; |
5396 | #endif /* DEVELOPMENT || DEBUG */ |
5397 | |
5398 | bufsize = COMPRESSOR_SCRATCH_BUF_SIZE; |
5399 | |
	kmem_alloc(kernel_map, &buf,
	    bufsize * vm_pageout_state.vm_compressor_thread_count,
	    KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT,
	    VM_KERN_MEMORY_COMPRESSOR);
5404 | |
5405 | for (int i = 0; i < vm_pageout_state.vm_compressor_thread_count; i++) { |
5406 | struct pgo_iothread_state *iq = &pgo_iothread_internal_state[i]; |
5407 | iq->id = i; |
5408 | iq->q = &vm_pageout_queue_internal; |
5409 | iq->current_early_swapout_chead = NULL; |
5410 | iq->current_regular_swapout_chead = NULL; |
5411 | iq->current_late_swapout_chead = NULL; |
5412 | iq->scratch_buf = (char *)(buf + i * bufsize); |
5413 | #if DEVELOPMENT || DEBUG |
5414 | iq->benchmark_q = &vm_pageout_queue_benchmark; |
5415 | #endif /* DEVELOPMENT || DEBUG */ |
		sched_cond_init(&(iq->pgo_wakeup));
		result = kernel_thread_start_priority((thread_continue_t)vm_pageout_iothread_internal,
		    (void *)iq, BASEPRI_VM,
		    &(iq->pgo_iothread));

		if (result != KERN_SUCCESS) {
			panic("vm_pageout: Unable to create compressor thread no. %d (%d)\n", i, result);
5423 | } |
5424 | } |
5425 | return result; |
5426 | } |
5427 | |
5428 | #if CONFIG_IOSCHED |
5429 | /* |
5430 | * To support I/O Expedite for compressed files we mark the upls with special flags. |
5431 | * The way decmpfs works is that we create a big upl which marks all the pages needed to |
5432 | * represent the compressed file as busy. We tag this upl with the flag UPL_DECMP_REQ. Decmpfs |
 * then issues smaller I/Os for the compressed data, inflates (decompresses) them and puts the data into the pages
 * being held in the big original UPL. We mark each of these smaller UPLs with the flag
 * UPL_DECMP_REAL_IO. Any outstanding real I/O UPL is tracked by the big req upl using the
 * decmp_io_upl field (in the upl structure). This link is protected in the forward direction
 * by the req upl lock (the reverse link doesn't need synchronization since we never inspect this link
5438 | * unless the real I/O upl is being destroyed). |
5439 | */ |
5440 | |
5441 | |
5442 | static void |
5443 | upl_set_decmp_info(upl_t upl, upl_t src_upl) |
5444 | { |
5445 | assert((src_upl->flags & UPL_DECMP_REQ) != 0); |
5446 | |
5447 | upl_lock(src_upl); |
5448 | if (src_upl->decmp_io_upl) { |
5449 | /* |
5450 | * If there is already an alive real I/O UPL, ignore this new UPL. |
5451 | * This case should rarely happen and even if it does, it just means |
5452 | * that we might issue a spurious expedite which the driver is expected |
5453 | * to handle. |
5454 | */ |
5455 | upl_unlock(src_upl); |
5456 | return; |
5457 | } |
5458 | src_upl->decmp_io_upl = (void *)upl; |
5459 | src_upl->ref_count++; |
5460 | |
5461 | upl->flags |= UPL_DECMP_REAL_IO; |
5462 | upl->decmp_io_upl = (void *)src_upl; |
5463 | upl_unlock(src_upl); |
5464 | } |
5465 | #endif /* CONFIG_IOSCHED */ |
5466 | |
5467 | #if UPL_DEBUG |
5468 | int upl_debug_enabled = 1; |
5469 | #else |
5470 | int upl_debug_enabled = 0; |
5471 | #endif |
5472 | |
5473 | static upl_t |
5474 | upl_create(int type, int flags, upl_size_t size) |
5475 | { |
5476 | uint32_t pages = (uint32_t)atop(round_page_32(size)); |
5477 | upl_t upl; |
5478 | |
5479 | assert(page_aligned(size)); |
5480 | |
5481 | /* |
5482 | * FIXME: this code assumes the allocation always succeeds, |
5483 | * however `pages` can be up to MAX_UPL_SIZE. |
5484 | * |
5485 | * The allocation size is above 32k (resp. 128k) |
5486 | * on 16k pages (resp. 4k), which kalloc might fail |
5487 | * to allocate. |
5488 | */ |
5489 | upl = kalloc_type(struct upl, struct upl_page_info, |
5490 | (type & UPL_CREATE_INTERNAL) ? pages : 0, Z_WAITOK | Z_ZERO); |
5491 | if (type & UPL_CREATE_INTERNAL) { |
5492 | flags |= UPL_INTERNAL; |
5493 | } |
5494 | |
5495 | if (type & UPL_CREATE_LITE) { |
5496 | flags |= UPL_LITE; |
5497 | if (pages) { |
			upl->lite_list = bitmap_alloc(pages);
5499 | } |
5500 | } |
5501 | |
5502 | upl->flags = flags; |
5503 | upl->ref_count = 1; |
5504 | upl_lock_init(upl); |
5505 | #if CONFIG_IOSCHED |
5506 | if (type & UPL_CREATE_IO_TRACKING) { |
		upl->upl_priority = proc_get_effective_thread_policy(current_thread(), TASK_POLICY_IO);
5508 | } |
5509 | |
5510 | if ((type & UPL_CREATE_INTERNAL) && (type & UPL_CREATE_EXPEDITE_SUP)) { |
5511 | /* Only support expedite on internal UPLs */ |
5512 | thread_t curthread = current_thread(); |
5513 | upl->upl_reprio_info = kalloc_data(sizeof(uint64_t) * pages, |
5514 | Z_WAITOK | Z_ZERO); |
5515 | upl->flags |= UPL_EXPEDITE_SUPPORTED; |
5516 | if (curthread->decmp_upl != NULL) { |
			upl_set_decmp_info(upl, curthread->decmp_upl);
5518 | } |
5519 | } |
5520 | #endif |
5521 | #if CONFIG_IOSCHED || UPL_DEBUG |
5522 | if ((type & UPL_CREATE_IO_TRACKING) || upl_debug_enabled) { |
5523 | upl->upl_creator = current_thread(); |
5524 | upl->flags |= UPL_TRACKED_BY_OBJECT; |
5525 | } |
5526 | #endif |
5527 | |
5528 | #if UPL_DEBUG |
5529 | upl->uple_create_btref = btref_get(__builtin_frame_address(0), 0); |
5530 | #endif /* UPL_DEBUG */ |
5531 | |
5532 | return upl; |
5533 | } |
5534 | |
5535 | static void |
5536 | upl_destroy(upl_t upl) |
5537 | { |
5538 | uint32_t pages; |
5539 | |
5540 | // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object); |
5541 | |
5542 | if (upl->ext_ref_count) { |
5543 | panic("upl(%p) ext_ref_count" , upl); |
5544 | } |
5545 | |
5546 | #if CONFIG_IOSCHED |
5547 | if ((upl->flags & UPL_DECMP_REAL_IO) && upl->decmp_io_upl) { |
5548 | upl_t src_upl; |
5549 | src_upl = upl->decmp_io_upl; |
5550 | assert((src_upl->flags & UPL_DECMP_REQ) != 0); |
5551 | upl_lock(src_upl); |
5552 | src_upl->decmp_io_upl = NULL; |
5553 | upl_unlock(src_upl); |
		upl_deallocate(src_upl);
5555 | } |
5556 | #endif /* CONFIG_IOSCHED */ |
5557 | |
5558 | #if CONFIG_IOSCHED || UPL_DEBUG |
5559 | if (((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) && |
5560 | !(upl->flags & UPL_VECTOR)) { |
5561 | vm_object_t object; |
5562 | |
5563 | if (upl->flags & UPL_SHADOWED) { |
5564 | object = upl->map_object->shadow; |
5565 | } else { |
5566 | object = upl->map_object; |
5567 | } |
5568 | |
5569 | vm_object_lock(object); |
5570 | queue_remove(&object->uplq, upl, upl_t, uplq); |
5571 | vm_object_activity_end(object); |
		vm_object_collapse(object, 0, TRUE);
5573 | vm_object_unlock(object); |
5574 | } |
5575 | #endif |
5576 | /* |
5577 | * drop a reference on the map_object whether or |
5578 | * not a pageout object is inserted |
5579 | */ |
5580 | if (upl->flags & UPL_SHADOWED) { |
		vm_object_deallocate(upl->map_object);
5582 | } |
5583 | |
5584 | if (upl->flags & UPL_DEVICE_MEMORY) { |
5585 | pages = 1; |
5586 | } else { |
5587 | pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK)); |
5588 | } |
5589 | |
5590 | upl_lock_destroy(upl); |
5591 | |
5592 | #if CONFIG_IOSCHED |
5593 | if (upl->flags & UPL_EXPEDITE_SUPPORTED) { |
5594 | kfree_data(upl->upl_reprio_info, sizeof(uint64_t) * pages); |
5595 | } |
5596 | #endif |
5597 | |
5598 | #if UPL_DEBUG |
5599 | for (int i = 0; i < upl->upl_commit_index; i++) { |
5600 | btref_put(upl->upl_commit_records[i].c_btref); |
5601 | } |
5602 | btref_put(upl->uple_create_btref); |
5603 | #endif /* UPL_DEBUG */ |
5604 | |
5605 | if ((upl->flags & UPL_LITE) && pages) { |
		bitmap_free(upl->lite_list, pages);
5607 | } |
5608 | kfree_type(struct upl, struct upl_page_info, |
5609 | (upl->flags & UPL_INTERNAL) ? pages : 0, upl); |
5610 | } |
5611 | |
5612 | void |
5613 | upl_deallocate(upl_t upl) |
5614 | { |
5615 | upl_lock(upl); |
5616 | |
5617 | if (--upl->ref_count == 0) { |
5618 | if (vector_upl_is_valid(upl)) { |
5619 | vector_upl_deallocate(upl); |
5620 | } |
5621 | upl_unlock(upl); |
5622 | |
5623 | if (upl->upl_iodone) { |
5624 | upl_callout_iodone(upl); |
5625 | } |
5626 | |
5627 | upl_destroy(upl); |
5628 | } else { |
5629 | upl_unlock(upl); |
5630 | } |
5631 | } |
5632 | |
5633 | #if CONFIG_IOSCHED |
5634 | void |
5635 | upl_mark_decmp(upl_t upl) |
5636 | { |
5637 | if (upl->flags & UPL_TRACKED_BY_OBJECT) { |
5638 | upl->flags |= UPL_DECMP_REQ; |
5639 | upl->upl_creator->decmp_upl = (void *)upl; |
5640 | } |
5641 | } |
5642 | |
5643 | void |
5644 | upl_unmark_decmp(upl_t upl) |
5645 | { |
5646 | if (upl && (upl->flags & UPL_DECMP_REQ)) { |
5647 | upl->upl_creator->decmp_upl = NULL; |
5648 | } |
5649 | } |
5650 | |
5651 | #endif /* CONFIG_IOSCHED */ |
5652 | |
5653 | #define VM_PAGE_Q_BACKING_UP(q) \ |
5654 | ((q)->pgo_laundry >= (((q)->pgo_maxlaundry * 8) / 10)) |
5655 | |
5656 | boolean_t must_throttle_writes(void); |
5657 | |
5658 | boolean_t |
5659 | must_throttle_writes() |
5660 | { |
5661 | if (VM_PAGE_Q_BACKING_UP(&vm_pageout_queue_external) && |
5662 | vm_page_pageable_external_count > (AVAILABLE_NON_COMPRESSED_MEMORY * 6) / 10) { |
5663 | return TRUE; |
5664 | } |
5665 | |
5666 | return FALSE; |
5667 | } |
5668 | |
5669 | int vm_page_delayed_work_ctx_needed = 0; |
5670 | KALLOC_TYPE_DEFINE(dw_ctx_zone, struct vm_page_delayed_work_ctx, KT_PRIV_ACCT); |
5671 | |
5672 | __startup_func |
5673 | static void |
5674 | vm_page_delayed_work_init_ctx(void) |
5675 | { |
5676 | uint16_t min_delayed_work_ctx_allocated = 16; |
5677 | |
5678 | /* |
5679 | * try really hard to always keep NCPU elements around in the zone |
5680 | * in order for the UPL code to almost always get an element. |
5681 | */ |
5682 | if (min_delayed_work_ctx_allocated < zpercpu_count()) { |
5683 | min_delayed_work_ctx_allocated = (uint16_t)zpercpu_count(); |
5684 | } |
5685 | |
	zone_raise_reserve(dw_ctx_zone, min_delayed_work_ctx_allocated);
5687 | } |
5688 | STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_page_delayed_work_init_ctx); |
5689 | |
5690 | struct vm_page_delayed_work* |
5691 | vm_page_delayed_work_get_ctx(void) |
5692 | { |
5693 | struct vm_page_delayed_work_ctx * dw_ctx = NULL; |
5694 | |
5695 | dw_ctx = zalloc_flags(dw_ctx_zone, Z_ZERO | Z_NOWAIT); |
5696 | |
5697 | if (__probable(dw_ctx)) { |
5698 | dw_ctx->delayed_owner = current_thread(); |
5699 | } else { |
5700 | vm_page_delayed_work_ctx_needed++; |
5701 | } |
5702 | return dw_ctx ? dw_ctx->dwp : NULL; |
5703 | } |
5704 | |
5705 | void |
5706 | vm_page_delayed_work_finish_ctx(struct vm_page_delayed_work* dwp) |
5707 | { |
5708 | struct vm_page_delayed_work_ctx *ldw_ctx; |
5709 | |
5710 | ldw_ctx = (struct vm_page_delayed_work_ctx *)dwp; |
5711 | ldw_ctx->delayed_owner = NULL; |
5712 | |
5713 | zfree(dw_ctx_zone, ldw_ctx); |
5714 | } |
5715 | |
5716 | /* |
5717 | * Routine: vm_object_upl_request |
5718 | * Purpose: |
5719 | * Cause the population of a portion of a vm_object. |
5720 | * Depending on the nature of the request, the pages |
 *		returned may contain valid data or be uninitialized.
5722 | * A page list structure, listing the physical pages |
5723 | * will be returned upon request. |
5724 | * This function is called by the file system or any other |
5725 | * supplier of backing store to a pager. |
5726 | * IMPORTANT NOTE: The caller must still respect the relationship |
5727 | * between the vm_object and its backing memory object. The |
5728 | * caller MUST NOT substitute changes in the backing file |
5729 | * without first doing a memory_object_lock_request on the |
 *		target range unless it is known that the pages are not
5731 | * shared with another entity at the pager level. |
5732 | * Copy_in_to: |
5733 | * if a page list structure is present |
5734 | * return the mapped physical pages, where a |
5735 | * page is not present, return a non-initialized |
5736 | * one. If the no_sync bit is turned on, don't |
5737 | * call the pager unlock to synchronize with other |
5738 | * possible copies of the page. Leave pages busy |
5739 | * in the original object, if a page list structure |
5740 | * was specified. When a commit of the page list |
5741 | * pages is done, the dirty bit will be set for each one. |
5742 | * Copy_out_from: |
5743 | * If a page list structure is present, return |
5744 | * all mapped pages. Where a page does not exist |
5745 | * map a zero filled one. Leave pages busy in |
5746 | * the original object. If a page list structure |
5747 | * is not specified, this call is a no-op. |
5748 | * |
5749 | * Note: access of default pager objects has a rather interesting |
5750 | * twist. The caller of this routine, presumably the file system |
5751 | * page cache handling code, will never actually make a request |
5752 | * against a default pager backed object. Only the default |
 * pager will make requests on backing store related vm_objects.
 * In this way the default pager can maintain the relationship
 * between backing store files (abstract memory objects) and
 * the vm_objects (cache objects) they support.
5757 | * |
5758 | */ |
5759 | |
5760 | __private_extern__ kern_return_t |
5761 | vm_object_upl_request( |
5762 | vm_object_t object, |
5763 | vm_object_offset_t offset, |
5764 | upl_size_t size, |
5765 | upl_t *upl_ptr, |
5766 | upl_page_info_array_t user_page_list, |
5767 | unsigned int *page_list_count, |
5768 | upl_control_flags_t cntrl_flags, |
5769 | vm_tag_t tag) |
5770 | { |
5771 | vm_page_t dst_page = VM_PAGE_NULL; |
5772 | vm_object_offset_t dst_offset; |
5773 | upl_size_t xfer_size; |
5774 | unsigned int size_in_pages; |
5775 | boolean_t dirty; |
5776 | boolean_t hw_dirty; |
5777 | upl_t upl = NULL; |
5778 | unsigned int entry; |
5779 | vm_page_t alias_page = NULL; |
5780 | int refmod_state = 0; |
5781 | vm_object_t last_copy_object; |
5782 | uint32_t last_copy_version; |
5783 | struct vm_page_delayed_work dw_array; |
5784 | struct vm_page_delayed_work *dwp, *dwp_start; |
5785 | bool dwp_finish_ctx = TRUE; |
5786 | int dw_count; |
5787 | int dw_limit; |
5788 | int io_tracking_flag = 0; |
5789 | int grab_options; |
5790 | int page_grab_count = 0; |
5791 | ppnum_t phys_page; |
5792 | pmap_flush_context pmap_flush_context_storage; |
5793 | boolean_t pmap_flushes_delayed = FALSE; |
5794 | #if DEVELOPMENT || DEBUG |
5795 | task_t task = current_task(); |
5796 | #endif /* DEVELOPMENT || DEBUG */ |
5797 | |
5798 | dwp_start = dwp = NULL; |
5799 | |
5800 | if (cntrl_flags & ~UPL_VALID_FLAGS) { |
5801 | /* |
5802 | * For forward compatibility's sake, |
5803 | * reject any unknown flag. |
5804 | */ |
5805 | return KERN_INVALID_VALUE; |
5806 | } |
5807 | if ((!object->internal) && (object->paging_offset != 0)) { |
5808 | panic("vm_object_upl_request: external object with non-zero paging offset" ); |
5809 | } |
5810 | if (object->phys_contiguous) { |
5811 | panic("vm_object_upl_request: contiguous object specified" ); |
5812 | } |
5813 | |
5814 | assertf(page_aligned(offset) && page_aligned(size), |
5815 | "offset 0x%llx size 0x%x" , |
5816 | offset, size); |
5817 | |
5818 | VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, 0, 0); |
5819 | |
5820 | dw_count = 0; |
5821 | dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); |
5822 | dwp_start = vm_page_delayed_work_get_ctx(); |
5823 | if (dwp_start == NULL) { |
5824 | dwp_start = &dw_array; |
5825 | dw_limit = 1; |
5826 | dwp_finish_ctx = FALSE; |
5827 | } |
5828 | |
5829 | dwp = dwp_start; |
5830 | |
5831 | if (size > MAX_UPL_SIZE_BYTES) { |
5832 | size = MAX_UPL_SIZE_BYTES; |
5833 | } |
5834 | |
5835 | if ((cntrl_flags & UPL_SET_INTERNAL) && page_list_count != NULL) { |
5836 | *page_list_count = MAX_UPL_SIZE_BYTES >> PAGE_SHIFT; |
5837 | } |
5838 | |
5839 | #if CONFIG_IOSCHED || UPL_DEBUG |
5840 | if (object->io_tracking || upl_debug_enabled) { |
5841 | io_tracking_flag |= UPL_CREATE_IO_TRACKING; |
5842 | } |
5843 | #endif |
5844 | #if CONFIG_IOSCHED |
5845 | if (object->io_tracking) { |
5846 | io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP; |
5847 | } |
5848 | #endif |
5849 | |
5850 | if (cntrl_flags & UPL_SET_INTERNAL) { |
5851 | if (cntrl_flags & UPL_SET_LITE) { |
			upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
		} else {
			upl = upl_create(UPL_CREATE_INTERNAL | io_tracking_flag, 0, size);
		}
		user_page_list = size ? upl->page_list : NULL;
	} else {
		if (cntrl_flags & UPL_SET_LITE) {
			upl = upl_create(UPL_CREATE_EXTERNAL | UPL_CREATE_LITE | io_tracking_flag, 0, size);
		} else {
			upl = upl_create(UPL_CREATE_EXTERNAL | io_tracking_flag, 0, size);
5862 | } |
5863 | } |
5864 | *upl_ptr = upl; |
5865 | |
5866 | if (user_page_list) { |
5867 | user_page_list[0].device = FALSE; |
5868 | } |
5869 | |
5870 | if (cntrl_flags & UPL_SET_LITE) { |
5871 | upl->map_object = object; |
5872 | } else { |
5873 | upl->map_object = vm_object_allocate(size); |
5874 | vm_object_lock(upl->map_object); |
5875 | /* |
		 * No need to lock the new object: nobody else knows
5877 | * about it yet, so it's all ours so far. |
5878 | */ |
5879 | upl->map_object->shadow = object; |
		VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
		VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
5882 | upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; |
5883 | upl->map_object->vo_shadow_offset = offset; |
5884 | upl->map_object->wimg_bits = object->wimg_bits; |
5885 | assertf(page_aligned(upl->map_object->vo_shadow_offset), |
5886 | "object %p shadow_offset 0x%llx" , |
5887 | upl->map_object, upl->map_object->vo_shadow_offset); |
5888 | vm_object_unlock(upl->map_object); |
5889 | |
5890 | alias_page = vm_page_grab_fictitious(TRUE); |
5891 | |
5892 | upl->flags |= UPL_SHADOWED; |
5893 | } |
5894 | if (cntrl_flags & UPL_FOR_PAGEOUT) { |
5895 | upl->flags |= UPL_PAGEOUT; |
5896 | } |
5897 | |
5898 | vm_object_lock(object); |
5899 | vm_object_activity_begin(object); |
5900 | |
5901 | grab_options = 0; |
5902 | #if CONFIG_SECLUDED_MEMORY |
5903 | if (object->can_grab_secluded) { |
5904 | grab_options |= VM_PAGE_GRAB_SECLUDED; |
5905 | } |
5906 | #endif /* CONFIG_SECLUDED_MEMORY */ |
5907 | |
5908 | /* |
5909 | * we can lock in the paging_offset once paging_in_progress is set |
5910 | */ |
5911 | upl->u_size = size; |
5912 | upl->u_offset = offset + object->paging_offset; |
5913 | |
5914 | #if CONFIG_IOSCHED || UPL_DEBUG |
5915 | if (object->io_tracking || upl_debug_enabled) { |
5916 | vm_object_activity_begin(object); |
5917 | queue_enter(&object->uplq, upl, upl_t, uplq); |
5918 | } |
5919 | #endif |
5920 | if ((cntrl_flags & UPL_WILL_MODIFY) && object->vo_copy != VM_OBJECT_NULL) { |
5921 | /* |
5922 | * Honor copy-on-write obligations |
5923 | * |
5924 | * The caller is gathering these pages and |
5925 | * might modify their contents. We need to |
5926 | * make sure that the copy object has its own |
5927 | * private copies of these pages before we let |
5928 | * the caller modify them. |
5929 | */ |
5930 | vm_object_update(object, |
5931 | offset, |
5932 | size, |
5933 | NULL, |
5934 | NULL, |
5935 | FALSE, /* should_return */ |
5936 | MEMORY_OBJECT_COPY_SYNC, |
5937 | VM_PROT_NO_CHANGE); |
5938 | |
5939 | VM_PAGEOUT_DEBUG(upl_cow, 1); |
5940 | VM_PAGEOUT_DEBUG(upl_cow_pages, (size >> PAGE_SHIFT)); |
5941 | } |
5942 | /* |
5943 | * remember which copy object we synchronized with |
5944 | */ |
5945 | last_copy_object = object->vo_copy; |
5946 | last_copy_version = object->vo_copy_version; |
5947 | entry = 0; |
5948 | |
5949 | xfer_size = size; |
5950 | dst_offset = offset; |
5951 | size_in_pages = size / PAGE_SIZE; |
5952 | |
5953 | if (vm_page_free_count > (vm_page_free_target + size_in_pages) || |
5954 | object->resident_page_count < ((MAX_UPL_SIZE_BYTES * 2) >> PAGE_SHIFT)) { |
5955 | object->scan_collisions = 0; |
5956 | } |
5957 | |
5958 | if ((cntrl_flags & UPL_WILL_MODIFY) && must_throttle_writes() == TRUE) { |
5959 | boolean_t isSSD = FALSE; |
5960 | |
5961 | #if !XNU_TARGET_OS_OSX |
5962 | isSSD = TRUE; |
5963 | #else /* !XNU_TARGET_OS_OSX */ |
5964 | vnode_pager_get_isSSD(object->pager, &isSSD); |
5965 | #endif /* !XNU_TARGET_OS_OSX */ |
5966 | vm_object_unlock(object); |
5967 | |
5968 | OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages); |
5969 | |
5970 | if (isSSD == TRUE) { |
			delay(1000 * size_in_pages);
		} else {
			delay(5000 * size_in_pages);
5974 | } |
5975 | OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages); |
5976 | |
5977 | vm_object_lock(object); |
5978 | } |
5979 | |
5980 | while (xfer_size) { |
5981 | dwp->dw_mask = 0; |
5982 | |
5983 | if ((alias_page == NULL) && !(cntrl_flags & UPL_SET_LITE)) { |
5984 | vm_object_unlock(object); |
5985 | alias_page = vm_page_grab_fictitious(TRUE); |
5986 | vm_object_lock(object); |
5987 | } |
5988 | if (cntrl_flags & UPL_COPYOUT_FROM) { |
5989 | upl->flags |= UPL_PAGE_SYNC_DONE; |
5990 | |
			if (((dst_page = vm_page_lookup(object, dst_offset)) == VM_PAGE_NULL) ||
5992 | dst_page->vmp_fictitious || |
5993 | dst_page->vmp_absent || |
5994 | VMP_ERROR_GET(dst_page) || |
5995 | dst_page->vmp_cleaning || |
5996 | (VM_PAGE_WIRED(dst_page))) { |
5997 | if (user_page_list) { |
5998 | user_page_list[entry].phys_addr = 0; |
5999 | } |
6000 | |
6001 | goto try_next_page; |
6002 | } |
			phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6004 | |
6005 | /* |
6006 | * grab this up front... |
			 * a high percentage of the time we're going to
6008 | * need the hardware modification state a bit later |
6009 | * anyway... so we can eliminate an extra call into |
6010 | * the pmap layer by grabbing it here and recording it |
6011 | */ |
6012 | if (dst_page->vmp_pmapped) { |
				refmod_state = pmap_get_refmod(phys_page);
6014 | } else { |
6015 | refmod_state = 0; |
6016 | } |
6017 | |
6018 | if ((refmod_state & VM_MEM_REFERENCED) && VM_PAGE_INACTIVE(dst_page)) { |
6019 | /* |
6020 | * page is on inactive list and referenced... |
6021 | * reactivate it now... this gets it out of the |
6022 | * way of vm_pageout_scan which would have to |
6023 | * reactivate it upon tripping over it |
6024 | */ |
6025 | dwp->dw_mask |= DW_vm_page_activate; |
6026 | } |
6027 | if (cntrl_flags & UPL_RET_ONLY_DIRTY) { |
6028 | /* |
6029 | * we're only asking for DIRTY pages to be returned |
6030 | */ |
6031 | if (dst_page->vmp_laundry || !(cntrl_flags & UPL_FOR_PAGEOUT)) { |
6032 | /* |
					 * if we were the page stolen by vm_pageout_scan to be
					 * cleaned (as opposed to a buddy being clustered in),
					 * or this request is not being driven by a PAGEOUT cluster,
					 * then we only need to check for the page being dirty or
					 * precious to decide whether to return it
6038 | */ |
6039 | if (dst_page->vmp_dirty || dst_page->vmp_precious || (refmod_state & VM_MEM_MODIFIED)) { |
6040 | goto check_busy; |
6041 | } |
6042 | goto dont_return; |
6043 | } |
6044 | /* |
6045 | * this is a request for a PAGEOUT cluster and this page |
6046 | * is merely along for the ride as a 'buddy'... not only |
6047 | * does it have to be dirty to be returned, but it also |
6048 | * can't have been referenced recently... |
6049 | */ |
6050 | if ((hibernate_cleaning_in_progress == TRUE || |
6051 | (!((refmod_state & VM_MEM_REFERENCED) || dst_page->vmp_reference) || |
6052 | (dst_page->vmp_q_state == VM_PAGE_ON_THROTTLED_Q))) && |
6053 | ((refmod_state & VM_MEM_MODIFIED) || dst_page->vmp_dirty || dst_page->vmp_precious)) { |
6054 | goto check_busy; |
6055 | } |
6056 | dont_return: |
6057 | /* |
6058 | * if we reach here, we're not to return |
6059 | * the page... go on to the next one |
6060 | */ |
6061 | if (dst_page->vmp_laundry == TRUE) { |
6062 | /* |
6063 | * if we get here, the page is not 'cleaning' (filtered out above). |
6064 | * since it has been referenced, remove it from the laundry |
6065 | * so we don't pay the cost of an I/O to clean a page |
6066 | * we're just going to take back |
6067 | */ |
6068 | vm_page_lockspin_queues(); |
6069 | |
					vm_pageout_steal_laundry(dst_page, TRUE);
					vm_page_activate(dst_page);
6072 | |
6073 | vm_page_unlock_queues(); |
6074 | } |
6075 | if (user_page_list) { |
6076 | user_page_list[entry].phys_addr = 0; |
6077 | } |
6078 | |
6079 | goto try_next_page; |
6080 | } |
6081 | check_busy: |
6082 | if (dst_page->vmp_busy) { |
6083 | if (cntrl_flags & UPL_NOBLOCK) { |
6084 | if (user_page_list) { |
6085 | user_page_list[entry].phys_addr = 0; |
6086 | } |
6087 | dwp->dw_mask = 0; |
6088 | |
6089 | goto try_next_page; |
6090 | } |
6091 | /* |
6092 | * someone else is playing with the |
6093 | * page. We will have to wait. |
6094 | */ |
6095 | PAGE_SLEEP(object, dst_page, THREAD_UNINT); |
6096 | |
6097 | continue; |
6098 | } |
6099 | if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { |
6100 | vm_page_lockspin_queues(); |
6101 | |
6102 | if (dst_page->vmp_q_state == VM_PAGE_ON_PAGEOUT_Q) { |
6103 | /* |
6104 | * we've buddied up a page for a clustered pageout |
6105 | * that has already been moved to the pageout |
6106 | * queue by pageout_scan... we need to remove |
6107 | * it from the queue and drop the laundry count |
6108 | * on that queue |
6109 | */ |
vm_pageout_throttle_up(dst_page);
6111 | } |
6112 | vm_page_unlock_queues(); |
6113 | } |
6114 | hw_dirty = refmod_state & VM_MEM_MODIFIED; |
6115 | dirty = hw_dirty ? TRUE : dst_page->vmp_dirty; |
6116 | |
6117 | if (phys_page > upl->highest_page) { |
6118 | upl->highest_page = phys_page; |
6119 | } |
6120 | |
6121 | assert(!pmap_is_noencrypt(phys_page)); |
6122 | |
6123 | if (cntrl_flags & UPL_SET_LITE) { |
6124 | unsigned int pg_num; |
6125 | |
6126 | pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE); |
6127 | assert(pg_num == (dst_offset - offset) / PAGE_SIZE); |
bitmap_set(upl->lite_list, pg_num);
6129 | |
6130 | if (hw_dirty) { |
6131 | if (pmap_flushes_delayed == FALSE) { |
6132 | pmap_flush_context_init(&pmap_flush_context_storage); |
6133 | pmap_flushes_delayed = TRUE; |
6134 | } |
pmap_clear_refmod_options(phys_page,
6136 | VM_MEM_MODIFIED, |
6137 | PMAP_OPTIONS_NOFLUSH | PMAP_OPTIONS_CLEAR_WRITE, |
6138 | &pmap_flush_context_storage); |
6139 | } |
6140 | |
6141 | /* |
6142 | * Mark original page as cleaning |
6143 | * in place. |
6144 | */ |
6145 | dst_page->vmp_cleaning = TRUE; |
6146 | dst_page->vmp_precious = FALSE; |
6147 | } else { |
6148 | /* |
6149 | * use pageclean setup, it is more |
6150 | * convenient even for the pageout |
6151 | * cases here |
6152 | */ |
6153 | vm_object_lock(upl->map_object); |
vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6155 | vm_object_unlock(upl->map_object); |
6156 | |
6157 | alias_page->vmp_absent = FALSE; |
6158 | alias_page = NULL; |
6159 | } |
6160 | if (dirty) { |
6161 | SET_PAGE_DIRTY(dst_page, FALSE); |
6162 | } else { |
6163 | dst_page->vmp_dirty = FALSE; |
6164 | } |
6165 | |
6166 | if (!dirty) { |
6167 | dst_page->vmp_precious = TRUE; |
6168 | } |
6169 | |
6170 | if (!(cntrl_flags & UPL_CLEAN_IN_PLACE)) { |
6171 | if (!VM_PAGE_WIRED(dst_page)) { |
6172 | dst_page->vmp_free_when_done = TRUE; |
6173 | } |
6174 | } |
6175 | } else { |
6176 | if ((cntrl_flags & UPL_WILL_MODIFY) && |
6177 | (object->vo_copy != last_copy_object || |
6178 | object->vo_copy_version != last_copy_version)) { |
6179 | /* |
6180 | * Honor copy-on-write obligations |
6181 | * |
6182 | * The copy object has changed since we |
6183 | * last synchronized for copy-on-write. |
6184 | * Another copy object might have been |
6185 | * inserted while we released the object's |
6186 | * lock. Since someone could have seen the |
6187 | * original contents of the remaining pages |
6188 | * through that new object, we have to |
6189 | * synchronize with it again for the remaining |
6190 | * pages only. The previous pages are "busy" |
6191 | * so they can not be seen through the new |
6192 | * mapping. The new mapping will see our |
6193 | * upcoming changes for those previous pages, |
6194 | * but that's OK since they couldn't see what |
6195 | * was there before. It's just a race anyway |
6196 | * and there's no guarantee of consistency or |
6197 | * atomicity. We just don't want new mappings |
6198 | * to see both the *before* and *after* pages. |
6199 | */ |
6200 | if (object->vo_copy != VM_OBJECT_NULL) { |
6201 | vm_object_update( |
6202 | object, |
dst_offset,/* current offset */
xfer_size, /* remaining size */
6205 | NULL, |
6206 | NULL, |
6207 | FALSE, /* should_return */ |
6208 | MEMORY_OBJECT_COPY_SYNC, |
6209 | VM_PROT_NO_CHANGE); |
6210 | |
6211 | VM_PAGEOUT_DEBUG(upl_cow_again, 1); |
6212 | VM_PAGEOUT_DEBUG(upl_cow_again_pages, (xfer_size >> PAGE_SHIFT)); |
6213 | } |
6214 | /* |
6215 | * remember the copy object we synced with |
6216 | */ |
6217 | last_copy_object = object->vo_copy; |
6218 | last_copy_version = object->vo_copy_version; |
6219 | } |
dst_page = vm_page_lookup(object, dst_offset);
6221 | |
6222 | if (dst_page != VM_PAGE_NULL) { |
6223 | if ((cntrl_flags & UPL_RET_ONLY_ABSENT)) { |
6224 | /* |
6225 | * skip over pages already present in the cache |
6226 | */ |
6227 | if (user_page_list) { |
6228 | user_page_list[entry].phys_addr = 0; |
6229 | } |
6230 | |
6231 | goto try_next_page; |
6232 | } |
6233 | if (dst_page->vmp_fictitious) { |
6234 | panic("need corner case for fictitious page" ); |
6235 | } |
6236 | |
6237 | if (dst_page->vmp_busy || dst_page->vmp_cleaning) { |
6238 | /* |
6239 | * someone else is playing with the |
6240 | * page. We will have to wait. |
6241 | */ |
6242 | PAGE_SLEEP(object, dst_page, THREAD_UNINT); |
6243 | |
6244 | continue; |
6245 | } |
6246 | if (dst_page->vmp_laundry) { |
vm_pageout_steal_laundry(dst_page, FALSE);
6248 | } |
6249 | } else { |
6250 | if (object->private) { |
6251 | /* |
* This is a nasty wrinkle for users
* of upl who encounter device or
* private memory. However, it is
* unavoidable: only a fault can
* resolve the actual backing
* physical page by asking the
* backing device.
6259 | */ |
6260 | if (user_page_list) { |
6261 | user_page_list[entry].phys_addr = 0; |
6262 | } |
6263 | |
6264 | goto try_next_page; |
6265 | } |
6266 | if (object->scan_collisions) { |
6267 | /* |
6268 | * the pageout_scan thread is trying to steal |
6269 | * pages from this object, but has run into our |
6270 | * lock... grab 2 pages from the head of the object... |
6271 | * the first is freed on behalf of pageout_scan, the |
6272 | * 2nd is for our own use... we use vm_object_page_grab |
6273 | * in both cases to avoid taking pages from the free |
6274 | * list since we are under memory pressure and our |
6275 | * lock on this object is getting in the way of |
6276 | * relieving it |
6277 | */ |
6278 | dst_page = vm_object_page_grab(object); |
6279 | |
6280 | if (dst_page != VM_PAGE_NULL) { |
vm_page_release(dst_page,
6282 | FALSE); |
6283 | } |
6284 | |
6285 | dst_page = vm_object_page_grab(object); |
6286 | } |
6287 | if (dst_page == VM_PAGE_NULL) { |
6288 | /* |
6289 | * need to allocate a page |
6290 | */ |
dst_page = vm_page_grab_options(grab_options);
6292 | if (dst_page != VM_PAGE_NULL) { |
6293 | page_grab_count++; |
6294 | } |
6295 | } |
6296 | if (dst_page == VM_PAGE_NULL) { |
6297 | if ((cntrl_flags & (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) == (UPL_RET_ONLY_ABSENT | UPL_NOBLOCK)) { |
6298 | /* |
* we don't want to stall waiting for pages to come onto the free list
* while we're already holding absent pages in this UPL...
* the caller will deal with the empty slots
6302 | */ |
6303 | if (user_page_list) { |
6304 | user_page_list[entry].phys_addr = 0; |
6305 | } |
6306 | |
6307 | goto try_next_page; |
6308 | } |
6309 | /* |
6310 | * no pages available... wait |
6311 | * then try again for the same |
6312 | * offset... |
6313 | */ |
6314 | vm_object_unlock(object); |
6315 | |
6316 | OSAddAtomic(size_in_pages, &vm_upl_wait_for_pages); |
6317 | |
6318 | VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0); |
6319 | |
6320 | VM_PAGE_WAIT(); |
6321 | OSAddAtomic(-size_in_pages, &vm_upl_wait_for_pages); |
6322 | |
6323 | VM_DEBUG_EVENT(vm_upl_page_wait, VM_UPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0); |
6324 | |
6325 | vm_object_lock(object); |
6326 | |
6327 | continue; |
6328 | } |
vm_page_insert(dst_page, object, dst_offset);
6330 | |
6331 | dst_page->vmp_absent = TRUE; |
6332 | dst_page->vmp_busy = FALSE; |
6333 | |
6334 | if (cntrl_flags & UPL_RET_ONLY_ABSENT) { |
6335 | /* |
6336 | * if UPL_RET_ONLY_ABSENT was specified, |
* then we're definitely setting up an
* upl for a clustered read/pagein
6339 | * operation... mark the pages as clustered |
6340 | * so upl_commit_range can put them on the |
6341 | * speculative list |
6342 | */ |
6343 | dst_page->vmp_clustered = TRUE; |
6344 | |
6345 | if (!(cntrl_flags & UPL_FILE_IO)) { |
6346 | counter_inc(&vm_statistics_pageins); |
6347 | } |
6348 | } |
6349 | } |
phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
6351 | |
6352 | dst_page->vmp_overwriting = TRUE; |
6353 | |
6354 | if (dst_page->vmp_pmapped) { |
6355 | if (!(cntrl_flags & UPL_FILE_IO)) { |
6356 | /* |
6357 | * eliminate all mappings from the |
* original object and its progeny
6359 | */ |
refmod_state = pmap_disconnect(phys_page);
} else {
refmod_state = pmap_get_refmod(phys_page);
6363 | } |
6364 | } else { |
6365 | refmod_state = 0; |
6366 | } |
6367 | |
6368 | hw_dirty = refmod_state & VM_MEM_MODIFIED; |
6369 | dirty = hw_dirty ? TRUE : dst_page->vmp_dirty; |
6370 | |
6371 | if (cntrl_flags & UPL_SET_LITE) { |
6372 | unsigned int pg_num; |
6373 | |
6374 | pg_num = (unsigned int) ((dst_offset - offset) / PAGE_SIZE); |
6375 | assert(pg_num == (dst_offset - offset) / PAGE_SIZE); |
bitmap_set(upl->lite_list, pg_num);

if (hw_dirty) {
pmap_clear_modify(phys_page);
6380 | } |
6381 | |
6382 | /* |
6383 | * Mark original page as cleaning |
6384 | * in place. |
6385 | */ |
6386 | dst_page->vmp_cleaning = TRUE; |
6387 | dst_page->vmp_precious = FALSE; |
6388 | } else { |
6389 | /* |
6390 | * use pageclean setup, it is more |
6391 | * convenient even for the pageout |
6392 | * cases here |
6393 | */ |
6394 | vm_object_lock(upl->map_object); |
vm_pageclean_setup(dst_page, alias_page, upl->map_object, size - xfer_size);
6396 | vm_object_unlock(upl->map_object); |
6397 | |
6398 | alias_page->vmp_absent = FALSE; |
6399 | alias_page = NULL; |
6400 | } |
6401 | |
6402 | if (cntrl_flags & UPL_REQUEST_SET_DIRTY) { |
6403 | upl->flags &= ~UPL_CLEAR_DIRTY; |
6404 | upl->flags |= UPL_SET_DIRTY; |
6405 | dirty = TRUE; |
6406 | /* |
6407 | * Page belonging to a code-signed object is about to |
6408 | * be written. Mark it tainted and disconnect it from |
6409 | * all pmaps so processes have to fault it back in and |
6410 | * deal with the tainted bit. |
6411 | */ |
6412 | if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) { |
6413 | dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE; |
6414 | vm_page_upl_tainted++; |
6415 | if (dst_page->vmp_pmapped) { |
refmod_state = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
6417 | if (refmod_state & VM_MEM_REFERENCED) { |
6418 | dst_page->vmp_reference = TRUE; |
6419 | } |
6420 | } |
6421 | } |
6422 | } else if (cntrl_flags & UPL_CLEAN_IN_PLACE) { |
6423 | /* |
6424 | * clean in place for read implies |
6425 | * that a write will be done on all |
6426 | * the pages that are dirty before |
6427 | * a upl commit is done. The caller |
6428 | * is obligated to preserve the |
6429 | * contents of all pages marked dirty |
6430 | */ |
6431 | upl->flags |= UPL_CLEAR_DIRTY; |
6432 | } |
6433 | dst_page->vmp_dirty = dirty; |
6434 | |
6435 | if (!dirty) { |
6436 | dst_page->vmp_precious = TRUE; |
6437 | } |
6438 | |
6439 | if (!VM_PAGE_WIRED(dst_page)) { |
6440 | /* |
6441 | * deny access to the target page while |
6442 | * it is being worked on |
6443 | */ |
6444 | dst_page->vmp_busy = TRUE; |
6445 | } else { |
6446 | dwp->dw_mask |= DW_vm_page_wire; |
6447 | } |
6448 | |
6449 | /* |
6450 | * We might be about to satisfy a fault which has been |
6451 | * requested. So no need for the "restart" bit. |
6452 | */ |
6453 | dst_page->vmp_restart = FALSE; |
6454 | if (!dst_page->vmp_absent && !(cntrl_flags & UPL_WILL_MODIFY)) { |
6455 | /* |
6456 | * expect the page to be used |
6457 | */ |
6458 | dwp->dw_mask |= DW_set_reference; |
6459 | } |
6460 | if (cntrl_flags & UPL_PRECIOUS) { |
6461 | if (object->internal) { |
6462 | SET_PAGE_DIRTY(dst_page, FALSE); |
6463 | dst_page->vmp_precious = FALSE; |
6464 | } else { |
6465 | dst_page->vmp_precious = TRUE; |
6466 | } |
6467 | } else { |
6468 | dst_page->vmp_precious = FALSE; |
6469 | } |
6470 | } |
6471 | if (dst_page->vmp_busy) { |
6472 | upl->flags |= UPL_HAS_BUSY; |
6473 | } |
6474 | |
6475 | if (phys_page > upl->highest_page) { |
6476 | upl->highest_page = phys_page; |
6477 | } |
6478 | assert(!pmap_is_noencrypt(phys_page)); |
6479 | if (user_page_list) { |
6480 | user_page_list[entry].phys_addr = phys_page; |
6481 | user_page_list[entry].free_when_done = dst_page->vmp_free_when_done; |
6482 | user_page_list[entry].absent = dst_page->vmp_absent; |
6483 | user_page_list[entry].dirty = dst_page->vmp_dirty; |
6484 | user_page_list[entry].precious = dst_page->vmp_precious; |
6485 | user_page_list[entry].device = FALSE; |
6486 | user_page_list[entry].needed = FALSE; |
6487 | if (dst_page->vmp_clustered == TRUE) { |
6488 | user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE; |
6489 | } else { |
6490 | user_page_list[entry].speculative = FALSE; |
6491 | } |
6492 | user_page_list[entry].cs_validated = dst_page->vmp_cs_validated; |
6493 | user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted; |
6494 | user_page_list[entry].cs_nx = dst_page->vmp_cs_nx; |
6495 | user_page_list[entry].mark = FALSE; |
6496 | } |
6497 | /* |
6498 | * if UPL_RET_ONLY_ABSENT is set, then |
6499 | * we are working with a fresh page and we've |
6500 | * just set the clustered flag on it to |
* indicate that it was dragged in as part of a
6502 | * speculative cluster... so leave it alone |
6503 | */ |
6504 | if (!(cntrl_flags & UPL_RET_ONLY_ABSENT)) { |
6505 | /* |
6506 | * someone is explicitly grabbing this page... |
6507 | * update clustered and speculative state |
6508 | * |
6509 | */ |
6510 | if (dst_page->vmp_clustered) { |
6511 | VM_PAGE_CONSUME_CLUSTERED(dst_page); |
6512 | } |
6513 | } |
6514 | try_next_page: |
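/*
 * queue manipulations for this page are batched on the
 * delayed work list and flushed once dw_limit entries have
 * accumulated (and once more after the loop completes)
 */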
6515 | if (dwp->dw_mask) { |
6516 | if (dwp->dw_mask & DW_vm_page_activate) { |
6517 | counter_inc(&vm_statistics_reactivations); |
6518 | } |
6519 | |
6520 | VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count); |
6521 | |
6522 | if (dw_count >= dw_limit) { |
vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6524 | |
6525 | dwp = dwp_start; |
6526 | dw_count = 0; |
6527 | } |
6528 | } |
6529 | entry++; |
6530 | dst_offset += PAGE_SIZE_64; |
6531 | xfer_size -= PAGE_SIZE; |
6532 | } |
6533 | if (dw_count) { |
vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
6535 | dwp = dwp_start; |
6536 | dw_count = 0; |
6537 | } |
6538 | |
6539 | if (alias_page != NULL) { |
6540 | VM_PAGE_FREE(alias_page); |
6541 | } |
6542 | if (pmap_flushes_delayed == TRUE) { |
6543 | pmap_flush(&pmap_flush_context_storage); |
6544 | } |
6545 | |
6546 | if (page_list_count != NULL) { |
6547 | if (upl->flags & UPL_INTERNAL) { |
6548 | *page_list_count = 0; |
6549 | } else if (*page_list_count > entry) { |
6550 | *page_list_count = entry; |
6551 | } |
6552 | } |
6553 | #if UPL_DEBUG |
6554 | upl->upl_state = 1; |
6555 | #endif |
6556 | vm_object_unlock(object); |
6557 | |
6558 | VM_DEBUG_CONSTANT_EVENT(vm_object_upl_request, VM_UPL_REQUEST, DBG_FUNC_END, page_grab_count, 0, 0, 0); |
6559 | #if DEVELOPMENT || DEBUG |
6560 | if (task != NULL) { |
6561 | ledger_credit(task->ledger, task_ledgers.pages_grabbed_upl, page_grab_count); |
6562 | } |
6563 | #endif /* DEVELOPMENT || DEBUG */ |
6564 | |
6565 | if (dwp_start && dwp_finish_ctx) { |
vm_page_delayed_work_finish_ctx(dwp_start);
6567 | dwp_start = dwp = NULL; |
6568 | } |
6569 | |
6570 | return KERN_SUCCESS; |
6571 | } |
6572 | |
6573 | /* |
6574 | * Routine: vm_object_super_upl_request |
6575 | * Purpose: |
6576 | * Cause the population of a portion of a vm_object |
6577 | * in much the same way as memory_object_upl_request. |
6578 | * Depending on the nature of the request, the pages |
* returned may contain valid data or be uninitialized.
6580 | * However, the region may be expanded up to the super |
6581 | * cluster size provided. |
6582 | */ |
6583 | |
6584 | __private_extern__ kern_return_t |
6585 | vm_object_super_upl_request( |
6586 | vm_object_t object, |
6587 | vm_object_offset_t offset, |
6588 | upl_size_t size, |
6589 | upl_size_t super_cluster, |
6590 | upl_t *upl, |
6591 | upl_page_info_t *user_page_list, |
6592 | unsigned int *page_list_count, |
6593 | upl_control_flags_t cntrl_flags, |
6594 | vm_tag_t tag) |
6595 | { |
6596 | if (object->paging_offset > offset || ((cntrl_flags & UPL_VECTOR) == UPL_VECTOR)) { |
6597 | return KERN_FAILURE; |
6598 | } |
6599 | |
6600 | assert(object->paging_in_progress); |
6601 | offset = offset - object->paging_offset; |
6602 | |
6603 | if (super_cluster > size) { |
6604 | vm_object_offset_t base_offset; |
6605 | upl_size_t super_size; |
6606 | vm_object_size_t super_size_64; |
6607 | |
6608 | base_offset = (offset & ~((vm_object_offset_t) super_cluster - 1)); |
6609 | super_size = (offset + size) > (base_offset + super_cluster) ? super_cluster << 1 : super_cluster; |
6610 | super_size_64 = ((base_offset + super_size) > object->vo_size) ? (object->vo_size - base_offset) : super_size; |
6611 | super_size = (upl_size_t) super_size_64; |
6612 | assert(super_size == super_size_64); |
6613 | |
6614 | if (offset > (base_offset + super_size)) { |
6615 | panic("vm_object_super_upl_request: Missed target pageout" |
6616 | " %#llx,%#llx, %#x, %#x, %#x, %#llx\n" , |
6617 | offset, base_offset, super_size, super_cluster, |
6618 | size, object->paging_offset); |
6619 | } |
6620 | /* |
6621 | * apparently there is a case where the vm requests a |
* page to be written out whose offset is beyond the
6623 | * object size |
6624 | */ |
6625 | if ((offset + size) > (base_offset + super_size)) { |
6626 | super_size_64 = (offset + size) - base_offset; |
6627 | super_size = (upl_size_t) super_size_64; |
6628 | assert(super_size == super_size_64); |
6629 | } |
6630 | |
6631 | offset = base_offset; |
6632 | size = super_size; |
6633 | } |
return vm_object_upl_request(object, offset, size, upl, user_page_list, page_list_count, cntrl_flags, tag);
6635 | } |
6636 | |
6637 | int cs_executable_create_upl = 0; |
6638 | extern int proc_selfpid(void); |
6639 | extern char *proc_name_address(void *p); |
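
/*
 * Routine:	vm_map_create_upl
 * Purpose:
 *	Look up the VM object backing [offset, offset + *upl_size) in
 *	"map" and build a UPL against it, resolving submaps, copy
 *	strategy and page-size mismatches along the way; *upl_size may
 *	be trimmed to what the map entry actually covers.
 */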
6640 | |
6641 | kern_return_t |
6642 | vm_map_create_upl( |
6643 | vm_map_t map, |
6644 | vm_map_address_t offset, |
6645 | upl_size_t *upl_size, |
6646 | upl_t *upl, |
6647 | upl_page_info_array_t page_list, |
6648 | unsigned int *count, |
6649 | upl_control_flags_t *flags, |
6650 | vm_tag_t tag) |
6651 | { |
6652 | vm_map_entry_t entry; |
6653 | upl_control_flags_t caller_flags; |
6654 | int force_data_sync; |
6655 | int sync_cow_data; |
6656 | vm_object_t local_object; |
6657 | vm_map_offset_t local_offset; |
6658 | vm_map_offset_t local_start; |
6659 | kern_return_t ret; |
6660 | vm_map_address_t original_offset; |
6661 | vm_map_size_t original_size, adjusted_size; |
6662 | vm_map_offset_t local_entry_start; |
6663 | vm_object_offset_t local_entry_offset; |
6664 | vm_object_offset_t offset_in_mapped_page; |
6665 | boolean_t release_map = FALSE; |
6666 | |
6667 | |
6668 | start_with_map: |
6669 | |
6670 | original_offset = offset; |
6671 | original_size = *upl_size; |
6672 | adjusted_size = original_size; |
6673 | |
6674 | caller_flags = *flags; |
6675 | |
6676 | if (caller_flags & ~UPL_VALID_FLAGS) { |
6677 | /* |
6678 | * For forward compatibility's sake, |
6679 | * reject any unknown flag. |
6680 | */ |
6681 | ret = KERN_INVALID_VALUE; |
6682 | goto done; |
6683 | } |
6684 | force_data_sync = (caller_flags & UPL_FORCE_DATA_SYNC); |
6685 | sync_cow_data = !(caller_flags & UPL_COPYOUT_FROM); |
6686 | |
6687 | if (upl == NULL) { |
6688 | ret = KERN_INVALID_ARGUMENT; |
6689 | goto done; |
6690 | } |
6691 | |
6692 | REDISCOVER_ENTRY: |
6693 | vm_map_lock_read(map); |
6694 | |
if (!vm_map_lookup_entry(map, offset, &entry)) {
6696 | vm_map_unlock_read(map); |
6697 | ret = KERN_FAILURE; |
6698 | goto done; |
6699 | } |
6700 | |
6701 | local_entry_start = entry->vme_start; |
6702 | local_entry_offset = VME_OFFSET(entry); |
6703 | |
6704 | if (VM_MAP_PAGE_SHIFT(map) < PAGE_SHIFT) { |
6705 | DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%x flags 0x%llx\n" , map, VM_MAP_PAGE_SHIFT(map), (uint64_t)offset, *upl_size, *flags); |
6706 | } |
6707 | |
6708 | if (entry->vme_end - original_offset < adjusted_size) { |
6709 | adjusted_size = entry->vme_end - original_offset; |
6710 | assert(adjusted_size > 0); |
6711 | *upl_size = (upl_size_t) adjusted_size; |
6712 | assert(*upl_size == adjusted_size); |
6713 | } |
6714 | |
6715 | if (caller_flags & UPL_QUERY_OBJECT_TYPE) { |
6716 | *flags = 0; |
6717 | |
6718 | if (!entry->is_sub_map && |
6719 | VME_OBJECT(entry) != VM_OBJECT_NULL) { |
6720 | if (VME_OBJECT(entry)->private) { |
6721 | *flags = UPL_DEV_MEMORY; |
6722 | } |
6723 | |
6724 | if (VME_OBJECT(entry)->phys_contiguous) { |
6725 | *flags |= UPL_PHYS_CONTIG; |
6726 | } |
6727 | } |
6728 | vm_map_unlock_read(map); |
6729 | ret = KERN_SUCCESS; |
6730 | goto done; |
6731 | } |
6732 | |
6733 | offset_in_mapped_page = 0; |
6734 | if (VM_MAP_PAGE_SIZE(map) < PAGE_SIZE) { |
6735 | offset = vm_map_trunc_page(original_offset, VM_MAP_PAGE_MASK(map)); |
6736 | *upl_size = (upl_size_t) |
6737 | (vm_map_round_page(original_offset + adjusted_size, |
6738 | VM_MAP_PAGE_MASK(map)) |
6739 | - offset); |
6740 | |
6741 | offset_in_mapped_page = original_offset - offset; |
6742 | assert(offset_in_mapped_page < VM_MAP_PAGE_SIZE(map)); |
6743 | |
6744 | DEBUG4K_UPL("map %p (%d) offset 0x%llx size 0x%llx flags 0x%llx -> offset 0x%llx adjusted_size 0x%llx *upl_size 0x%x offset_in_mapped_page 0x%llx\n" , map, VM_MAP_PAGE_SHIFT(map), (uint64_t)original_offset, (uint64_t)original_size, *flags, (uint64_t)offset, (uint64_t)adjusted_size, *upl_size, offset_in_mapped_page); |
6745 | } |
6746 | |
6747 | if (!entry->is_sub_map) { |
6748 | if (VME_OBJECT(entry) == VM_OBJECT_NULL || |
6749 | !VME_OBJECT(entry)->phys_contiguous) { |
6750 | if (*upl_size > MAX_UPL_SIZE_BYTES) { |
6751 | *upl_size = MAX_UPL_SIZE_BYTES; |
6752 | } |
6753 | } |
6754 | |
6755 | /* |
6756 | * Create an object if necessary. |
6757 | */ |
6758 | if (VME_OBJECT(entry) == VM_OBJECT_NULL) { |
6759 | if (vm_map_lock_read_to_write(map)) { |
6760 | goto REDISCOVER_ENTRY; |
6761 | } |
6762 | |
VME_OBJECT_SET(entry,
vm_object_allocate((vm_size_t)
vm_object_round_page((entry->vme_end - entry->vme_start))),
false, 0);
VME_OFFSET_SET(entry, 0);
6768 | assert(entry->use_pmap); |
6769 | |
6770 | vm_map_lock_write_to_read(map); |
6771 | } |
6772 | |
6773 | if (!(caller_flags & UPL_COPYOUT_FROM) && |
6774 | !(entry->protection & VM_PROT_WRITE)) { |
6775 | vm_map_unlock_read(map); |
6776 | ret = KERN_PROTECTION_FAILURE; |
6777 | goto done; |
6778 | } |
6779 | } |
6780 | |
6781 | #if !XNU_TARGET_OS_OSX |
6782 | if (map->pmap != kernel_pmap && |
6783 | (caller_flags & UPL_COPYOUT_FROM) && |
6784 | (entry->protection & VM_PROT_EXECUTE) && |
6785 | !(entry->protection & VM_PROT_WRITE)) { |
6786 | vm_offset_t kaddr; |
6787 | vm_size_t ksize; |
6788 | |
6789 | /* |
6790 | * We're about to create a read-only UPL backed by |
6791 | * memory from an executable mapping. |
6792 | * Wiring the pages would result in the pages being copied |
6793 | * (due to the "MAP_PRIVATE" mapping) and no longer |
6794 | * code-signed, so no longer eligible for execution. |
6795 | * Instead, let's copy the data into a kernel buffer and |
6796 | * create the UPL from this kernel buffer. |
6797 | * The kernel buffer is then freed, leaving the UPL holding |
6798 | * the last reference on the VM object, so the memory will |
6799 | * be released when the UPL is committed. |
6800 | */ |
6801 | |
6802 | vm_map_unlock_read(map); |
6803 | entry = VM_MAP_ENTRY_NULL; |
6804 | /* allocate kernel buffer */ |
6805 | ksize = round_page(*upl_size); |
6806 | kaddr = 0; |
6807 | ret = kmem_alloc(kernel_map, &kaddr, ksize, |
6808 | KMA_PAGEABLE | KMA_DATA, tag); |
6809 | if (ret == KERN_SUCCESS) { |
6810 | /* copyin the user data */ |
6811 | ret = copyinmap(map, offset, (void *)kaddr, *upl_size); |
6812 | } |
6813 | if (ret == KERN_SUCCESS) { |
6814 | if (ksize > *upl_size) { |
6815 | /* zero out the extra space in kernel buffer */ |
6816 | memset((void *)(kaddr + *upl_size), |
6817 | 0, |
6818 | ksize - *upl_size); |
6819 | } |
6820 | /* create the UPL from the kernel buffer */ |
6821 | vm_object_offset_t offset_in_object; |
6822 | vm_object_offset_t offset_in_object_page; |
6823 | |
6824 | offset_in_object = offset - local_entry_start + local_entry_offset; |
6825 | offset_in_object_page = offset_in_object - vm_object_trunc_page(offset_in_object); |
6826 | assert(offset_in_object_page < PAGE_SIZE); |
6827 | assert(offset_in_object_page + offset_in_mapped_page < PAGE_SIZE); |
6828 | *upl_size -= offset_in_object_page + offset_in_mapped_page; |
6829 | ret = vm_map_create_upl(kernel_map, |
6830 | (vm_map_address_t)(kaddr + offset_in_object_page + offset_in_mapped_page), |
6831 | upl_size, upl, page_list, count, flags, tag); |
6832 | } |
6833 | if (kaddr != 0) { |
6834 | /* free the kernel buffer */ |
6835 | kmem_free(kernel_map, kaddr, ksize); |
6836 | kaddr = 0; |
6837 | ksize = 0; |
6838 | } |
6839 | #if DEVELOPMENT || DEBUG |
6840 | DTRACE_VM4(create_upl_from_executable, |
6841 | vm_map_t, map, |
6842 | vm_map_address_t, offset, |
6843 | upl_size_t, *upl_size, |
6844 | kern_return_t, ret); |
6845 | #endif /* DEVELOPMENT || DEBUG */ |
6846 | goto done; |
6847 | } |
6848 | #endif /* !XNU_TARGET_OS_OSX */ |
6849 | |
6850 | if (!entry->is_sub_map) { |
6851 | local_object = VME_OBJECT(entry); |
6852 | assert(local_object != VM_OBJECT_NULL); |
6853 | } |
6854 | |
6855 | if (!entry->is_sub_map && |
6856 | !entry->needs_copy && |
6857 | *upl_size != 0 && |
6858 | local_object->vo_size > *upl_size && /* partial UPL */ |
6859 | entry->wired_count == 0 && /* No COW for entries that are wired */ |
6860 | (map->pmap != kernel_pmap) && /* alias checks */ |
6861 | (vm_map_entry_should_cow_for_true_share(entry) /* case 1 */ |
6862 | || |
6863 | ( /* case 2 */ |
6864 | local_object->internal && |
6865 | (local_object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) && |
6866 | local_object->ref_count > 1))) { |
6867 | vm_prot_t prot; |
6868 | |
6869 | /* |
6870 | * Case 1: |
6871 | * Set up the targeted range for copy-on-write to avoid |
6872 | * applying true_share/copy_delay to the entire object. |
6873 | * |
6874 | * Case 2: |
6875 | * This map entry covers only part of an internal |
6876 | * object. There could be other map entries covering |
6877 | * other areas of this object and some of these map |
6878 | * entries could be marked as "needs_copy", which |
6879 | * assumes that the object is COPY_SYMMETRIC. |
6880 | * To avoid marking this object as COPY_DELAY and |
6881 | * "true_share", let's shadow it and mark the new |
6882 | * (smaller) object as "true_share" and COPY_DELAY. |
6883 | */ |
6884 | |
6885 | if (vm_map_lock_read_to_write(map)) { |
6886 | goto REDISCOVER_ENTRY; |
6887 | } |
6888 | vm_map_lock_assert_exclusive(map); |
6889 | assert(VME_OBJECT(entry) == local_object); |
6890 | |
6891 | vm_map_clip_start(map, |
6892 | entry, |
6893 | vm_map_trunc_page(offset, |
6894 | VM_MAP_PAGE_MASK(map))); |
6895 | vm_map_clip_end(map, |
6896 | entry, |
6897 | vm_map_round_page(offset + *upl_size, |
6898 | VM_MAP_PAGE_MASK(map))); |
6899 | if ((entry->vme_end - offset) < *upl_size) { |
6900 | *upl_size = (upl_size_t) (entry->vme_end - offset); |
6901 | assert(*upl_size == entry->vme_end - offset); |
6902 | } |
6903 | |
6904 | prot = entry->protection & ~VM_PROT_WRITE; |
6905 | if (override_nx(map, VME_ALIAS(entry)) && prot) { |
6906 | prot |= VM_PROT_EXECUTE; |
6907 | } |
vm_object_pmap_protect(local_object,
VME_OFFSET(entry),
entry->vme_end - entry->vme_start,
((entry->is_shared ||
map->mapped_in_other_pmaps)
? PMAP_NULL
: map->pmap),
VM_MAP_PAGE_SIZE(map),
entry->vme_start,
prot);
6918 | |
6919 | assert(entry->wired_count == 0); |
6920 | |
6921 | /* |
6922 | * Lock the VM object and re-check its status: if it's mapped |
6923 | * in another address space, we could still be racing with |
6924 | * another thread holding that other VM map exclusively. |
6925 | */ |
6926 | vm_object_lock(local_object); |
6927 | if (local_object->true_share) { |
6928 | /* object is already in proper state: no COW needed */ |
6929 | assert(local_object->copy_strategy != |
6930 | MEMORY_OBJECT_COPY_SYMMETRIC); |
6931 | } else { |
6932 | /* not true_share: ask for copy-on-write below */ |
6933 | assert(local_object->copy_strategy == |
6934 | MEMORY_OBJECT_COPY_SYMMETRIC); |
6935 | entry->needs_copy = TRUE; |
6936 | } |
6937 | vm_object_unlock(local_object); |
6938 | |
6939 | vm_map_lock_write_to_read(map); |
6940 | } |
6941 | |
6942 | if (entry->needs_copy) { |
6943 | /* |
6944 | * Honor copy-on-write for COPY_SYMMETRIC |
6945 | * strategy. |
6946 | */ |
6947 | vm_map_t local_map; |
6948 | vm_object_t object; |
6949 | vm_object_offset_t new_offset; |
6950 | vm_prot_t prot; |
6951 | boolean_t wired; |
6952 | vm_map_version_t version; |
6953 | vm_map_t real_map; |
6954 | vm_prot_t fault_type; |
6955 | |
6956 | local_map = map; |
6957 | |
6958 | if (caller_flags & UPL_COPYOUT_FROM) { |
6959 | fault_type = VM_PROT_READ | VM_PROT_COPY; |
6960 | vm_counters.create_upl_extra_cow++; |
6961 | vm_counters.create_upl_extra_cow_pages += |
6962 | (entry->vme_end - entry->vme_start) / PAGE_SIZE; |
6963 | } else { |
6964 | fault_type = VM_PROT_WRITE; |
6965 | } |
if (vm_map_lookup_and_lock_object(&local_map,
offset, fault_type,
OBJECT_LOCK_EXCLUSIVE,
&version, &object,
&new_offset, &prot, &wired,
NULL,
&real_map, NULL) != KERN_SUCCESS) {
6973 | if (fault_type == VM_PROT_WRITE) { |
6974 | vm_counters.create_upl_lookup_failure_write++; |
6975 | } else { |
6976 | vm_counters.create_upl_lookup_failure_copy++; |
6977 | } |
6978 | vm_map_unlock_read(local_map); |
6979 | ret = KERN_FAILURE; |
6980 | goto done; |
6981 | } |
6982 | if (real_map != local_map) { |
6983 | vm_map_unlock(real_map); |
6984 | } |
6985 | vm_map_unlock_read(local_map); |
6986 | |
6987 | vm_object_unlock(object); |
6988 | |
6989 | goto REDISCOVER_ENTRY; |
6990 | } |
6991 | |
6992 | if (entry->is_sub_map) { |
6993 | vm_map_t submap; |
6994 | |
6995 | submap = VME_SUBMAP(entry); |
6996 | local_start = entry->vme_start; |
6997 | local_offset = (vm_map_offset_t)VME_OFFSET(entry); |
6998 | |
vm_map_reference(submap);
7000 | vm_map_unlock_read(map); |
7001 | |
7002 | DEBUG4K_UPL("map %p offset 0x%llx (0x%llx) size 0x%x (adjusted 0x%llx original 0x%llx) offset_in_mapped_page 0x%llx submap %p\n" , map, (uint64_t)offset, (uint64_t)original_offset, *upl_size, (uint64_t)adjusted_size, (uint64_t)original_size, offset_in_mapped_page, submap); |
7003 | offset += offset_in_mapped_page; |
7004 | *upl_size -= offset_in_mapped_page; |
7005 | |
7006 | if (release_map) { |
7007 | vm_map_deallocate(map); |
7008 | } |
7009 | map = submap; |
7010 | release_map = TRUE; |
7011 | offset = local_offset + (offset - local_start); |
7012 | goto start_with_map; |
7013 | } |
7014 | |
7015 | if (sync_cow_data && |
7016 | (VME_OBJECT(entry)->shadow || |
7017 | VME_OBJECT(entry)->vo_copy)) { |
7018 | local_object = VME_OBJECT(entry); |
7019 | local_start = entry->vme_start; |
7020 | local_offset = (vm_map_offset_t)VME_OFFSET(entry); |
7021 | |
7022 | vm_object_reference(local_object); |
7023 | vm_map_unlock_read(map); |
7024 | |
7025 | if (local_object->shadow && local_object->vo_copy) { |
vm_object_lock_request(local_object->shadow,
((vm_object_offset_t)
((offset - local_start) +
local_offset) +
local_object->vo_shadow_offset),
*upl_size, FALSE,
7032 | MEMORY_OBJECT_DATA_SYNC, |
7033 | VM_PROT_NO_CHANGE); |
7034 | } |
7035 | sync_cow_data = FALSE; |
vm_object_deallocate(local_object);
7037 | |
7038 | goto REDISCOVER_ENTRY; |
7039 | } |
7040 | if (force_data_sync) { |
7041 | local_object = VME_OBJECT(entry); |
7042 | local_start = entry->vme_start; |
7043 | local_offset = (vm_map_offset_t)VME_OFFSET(entry); |
7044 | |
7045 | vm_object_reference(local_object); |
7046 | vm_map_unlock_read(map); |
7047 | |
vm_object_lock_request(local_object,
((vm_object_offset_t)
((offset - local_start) +
local_offset)),
(vm_object_size_t)*upl_size,
7053 | FALSE, |
7054 | MEMORY_OBJECT_DATA_SYNC, |
7055 | VM_PROT_NO_CHANGE); |
7056 | |
7057 | force_data_sync = FALSE; |
vm_object_deallocate(local_object);
7059 | |
7060 | goto REDISCOVER_ENTRY; |
7061 | } |
7062 | if (VME_OBJECT(entry)->private) { |
7063 | *flags = UPL_DEV_MEMORY; |
7064 | } else { |
7065 | *flags = 0; |
7066 | } |
7067 | |
7068 | if (VME_OBJECT(entry)->phys_contiguous) { |
7069 | *flags |= UPL_PHYS_CONTIG; |
7070 | } |
7071 | |
7072 | local_object = VME_OBJECT(entry); |
7073 | local_offset = (vm_map_offset_t)VME_OFFSET(entry); |
7074 | local_start = entry->vme_start; |
7075 | |
7076 | /* |
7077 | * Wiring will copy the pages to the shadow object. |
7078 | * The shadow object will not be code-signed so |
7079 | * attempting to execute code from these copied pages |
7080 | * would trigger a code-signing violation. |
7081 | */ |
7082 | if (entry->protection & VM_PROT_EXECUTE) { |
7083 | #if MACH_ASSERT |
7084 | printf("pid %d[%s] create_upl out of executable range from " |
7085 | "0x%llx to 0x%llx: side effects may include " |
7086 | "code-signing violations later on\n" , |
7087 | proc_selfpid(), |
7088 | (get_bsdtask_info(current_task()) |
7089 | ? proc_name_address(get_bsdtask_info(current_task())) |
7090 | : "?" ), |
7091 | (uint64_t) entry->vme_start, |
7092 | (uint64_t) entry->vme_end); |
7093 | #endif /* MACH_ASSERT */ |
7094 | DTRACE_VM2(cs_executable_create_upl, |
7095 | uint64_t, (uint64_t)entry->vme_start, |
7096 | uint64_t, (uint64_t)entry->vme_end); |
7097 | cs_executable_create_upl++; |
7098 | } |
7099 | |
7100 | vm_object_lock(local_object); |
7101 | |
7102 | /* |
7103 | * Ensure that this object is "true_share" and "copy_delay" now, |
7104 | * while we're still holding the VM map lock. After we unlock the map, |
7105 | * anything could happen to that mapping, including some copy-on-write |
7106 | * activity. We need to make sure that the IOPL will point at the |
7107 | * same memory as the mapping. |
7108 | */ |
7109 | if (local_object->true_share) { |
7110 | assert(local_object->copy_strategy != |
7111 | MEMORY_OBJECT_COPY_SYMMETRIC); |
7112 | } else if (!is_kernel_object(local_object) && |
7113 | local_object != compressor_object && |
7114 | !local_object->phys_contiguous) { |
7115 | #if VM_OBJECT_TRACKING_OP_TRUESHARE |
7116 | if (!local_object->true_share && |
7117 | vm_object_tracking_btlog) { |
7118 | btlog_record(vm_object_tracking_btlog, local_object, |
7119 | VM_OBJECT_TRACKING_OP_TRUESHARE, |
7120 | btref_get(__builtin_frame_address(0), 0)); |
7121 | } |
7122 | #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */ |
VM_OBJECT_SET_TRUE_SHARE(local_object, TRUE);
7124 | if (local_object->copy_strategy == |
7125 | MEMORY_OBJECT_COPY_SYMMETRIC) { |
7126 | local_object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; |
7127 | } |
7128 | } |
7129 | |
7130 | vm_object_reference_locked(local_object); |
7131 | vm_object_unlock(local_object); |
7132 | |
7133 | vm_map_unlock_read(map); |
7134 | |
7135 | offset += offset_in_mapped_page; |
7136 | assert(*upl_size > offset_in_mapped_page); |
7137 | *upl_size -= offset_in_mapped_page; |
7138 | |
ret = vm_object_iopl_request(local_object,
((vm_object_offset_t)
((offset - local_start) + local_offset)),
*upl_size,
upl,
page_list,
count,
caller_flags,
tag);
vm_object_deallocate(local_object);
7149 | |
7150 | done: |
7151 | if (release_map) { |
7152 | vm_map_deallocate(map); |
7153 | } |
7154 | |
7155 | return ret; |
7156 | } |
7157 | |
7158 | /* |
7159 | * Internal routine to enter a UPL into a VM map. |
7160 | * |
7161 | * JMM - This should just be doable through the standard |
7162 | * vm_map_enter() API. |
7163 | */ |
7164 | kern_return_t |
7165 | vm_map_enter_upl_range( |
7166 | vm_map_t map, |
7167 | upl_t upl, |
7168 | vm_object_offset_t offset_to_map, |
7169 | upl_size_t size_to_map, |
7170 | vm_prot_t prot_to_map, |
7171 | vm_map_offset_t *dst_addr) |
7172 | { |
7173 | vm_map_size_t size; |
7174 | vm_object_offset_t offset; |
7175 | vm_map_offset_t addr; |
7176 | vm_page_t m; |
7177 | kern_return_t kr; |
7178 | int isVectorUPL = 0, curr_upl = 0; |
7179 | upl_t vector_upl = NULL; |
7180 | mach_vm_offset_t vector_upl_dst_addr = 0; |
7181 | vm_map_t vector_upl_submap = NULL; |
7182 | upl_offset_t subupl_offset = 0; |
7183 | upl_size_t subupl_size = 0; |
7184 | |
7185 | if (upl == UPL_NULL) { |
7186 | return KERN_INVALID_ARGUMENT; |
7187 | } |
7188 | |
7189 | DEBUG4K_UPL("map %p upl %p flags 0x%x object %p offset 0x%llx (uploff: 0x%llx) size 0x%x (uplsz: 0x%x) \n" , map, upl, upl->flags, upl->map_object, offset_to_map, upl->u_offset, size_to_map, upl->u_size); |
7190 | assert(map == kernel_map); |
7191 | |
7192 | if ((isVectorUPL = vector_upl_is_valid(upl))) { |
7193 | int mapped = 0, valid_upls = 0; |
7194 | vector_upl = upl; |
7195 | |
7196 | upl_lock(vector_upl); |
for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
upl = vector_upl_subupl_byindex(vector_upl, curr_upl);
7199 | if (upl == NULL) { |
7200 | continue; |
7201 | } |
7202 | valid_upls++; |
7203 | if (UPL_PAGE_LIST_MAPPED & upl->flags) { |
7204 | mapped++; |
7205 | } |
7206 | } |
7207 | |
7208 | if (mapped) { |
7209 | if (mapped != valid_upls) { |
7210 | panic("Only %d of the %d sub-upls within the Vector UPL are alread mapped" , mapped, valid_upls); |
7211 | } else { |
7212 | upl_unlock(vector_upl); |
7213 | return KERN_FAILURE; |
7214 | } |
7215 | } |
7216 | |
7217 | if (VM_MAP_PAGE_MASK(map) < PAGE_MASK) { |
7218 | panic("TODO4K: vector UPL not implemented" ); |
7219 | } |
7220 | |
vector_upl_submap = kmem_suballoc(map, &vector_upl_dst_addr,
vector_upl->u_size, VM_MAP_CREATE_DEFAULT,
VM_FLAGS_ANYWHERE, KMS_NOFAIL | KMS_DATA,
VM_KERN_MEMORY_NONE).kmr_submap;
7225 | map = vector_upl_submap; |
7226 | vector_upl_set_submap(vector_upl, vector_upl_submap, vector_upl_dst_addr); |
7227 | curr_upl = 0; |
7228 | } else { |
7229 | upl_lock(upl); |
7230 | } |
7231 | |
7232 | process_upl_to_enter: |
7233 | if (isVectorUPL) { |
if (curr_upl == vector_upl_max_upls(vector_upl)) {
7235 | *dst_addr = vector_upl_dst_addr; |
7236 | upl_unlock(vector_upl); |
7237 | return KERN_SUCCESS; |
7238 | } |
upl = vector_upl_subupl_byindex(vector_upl, curr_upl++);
7240 | if (upl == NULL) { |
7241 | goto process_upl_to_enter; |
7242 | } |
7243 | |
7244 | vector_upl_get_iostate(vector_upl, upl, &subupl_offset, &subupl_size); |
7245 | *dst_addr = (vm_map_offset_t)(vector_upl_dst_addr + (vm_map_offset_t)subupl_offset); |
7246 | } else { |
7247 | /* |
7248 | * check to see if already mapped |
7249 | */ |
7250 | if (UPL_PAGE_LIST_MAPPED & upl->flags) { |
7251 | upl_unlock(upl); |
7252 | return KERN_FAILURE; |
7253 | } |
7254 | } |
7255 | |
7256 | if ((!(upl->flags & UPL_SHADOWED)) && |
7257 | ((upl->flags & UPL_HAS_BUSY) || |
7258 | !((upl->flags & (UPL_DEVICE_MEMORY | UPL_IO_WIRE)) || (upl->map_object->phys_contiguous)))) { |
7259 | vm_object_t object; |
7260 | vm_page_t alias_page; |
7261 | vm_object_offset_t new_offset; |
7262 | unsigned int pg_num; |
7263 | |
7264 | size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)); |
7265 | object = upl->map_object; |
7266 | upl->map_object = vm_object_allocate(vm_object_round_page(size)); |
7267 | |
7268 | vm_object_lock(upl->map_object); |
7269 | |
7270 | upl->map_object->shadow = object; |
VM_OBJECT_SET_PAGEOUT(upl->map_object, TRUE);
VM_OBJECT_SET_CAN_PERSIST(upl->map_object, FALSE);
7273 | upl->map_object->copy_strategy = MEMORY_OBJECT_COPY_NONE; |
7274 | upl->map_object->vo_shadow_offset = upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset; |
7275 | assertf(page_aligned(upl->map_object->vo_shadow_offset), |
7276 | "object %p shadow_offset 0x%llx" , |
7277 | upl->map_object, |
7278 | (uint64_t)upl->map_object->vo_shadow_offset); |
7279 | upl->map_object->wimg_bits = object->wimg_bits; |
7280 | offset = upl->map_object->vo_shadow_offset; |
7281 | new_offset = 0; |
7282 | |
7283 | upl->flags |= UPL_SHADOWED; |
7284 | |
7285 | while (size) { |
7286 | pg_num = (unsigned int) (new_offset / PAGE_SIZE); |
7287 | assert(pg_num == new_offset / PAGE_SIZE); |
7288 | |
if (bitmap_test(upl->lite_list, pg_num)) {
7290 | alias_page = vm_page_grab_fictitious(TRUE); |
7291 | |
7292 | vm_object_lock(object); |
7293 | |
7294 | m = vm_page_lookup(object, offset); |
7295 | if (m == VM_PAGE_NULL) { |
7296 | panic("vm_upl_map: page missing" ); |
7297 | } |
7298 | |
7299 | /* |
7300 | * Convert the fictitious page to a private |
7301 | * shadow of the real page. |
7302 | */ |
7303 | assert(alias_page->vmp_fictitious); |
7304 | alias_page->vmp_fictitious = FALSE; |
7305 | alias_page->vmp_private = TRUE; |
7306 | alias_page->vmp_free_when_done = TRUE; |
7307 | /* |
7308 | * since m is a page in the upl it must |
7309 | * already be wired or BUSY, so it's |
7310 | * safe to assign the underlying physical |
7311 | * page to the alias |
7312 | */ |
7313 | VM_PAGE_SET_PHYS_PAGE(alias_page, VM_PAGE_GET_PHYS_PAGE(m)); |
7314 | |
7315 | vm_object_unlock(object); |
7316 | |
7317 | vm_page_lockspin_queues(); |
vm_page_wire(alias_page, VM_KERN_MEMORY_NONE, TRUE);
vm_page_unlock_queues();

vm_page_insert_wired(alias_page, upl->map_object, new_offset, VM_KERN_MEMORY_NONE);
7322 | |
7323 | assert(!alias_page->vmp_wanted); |
7324 | alias_page->vmp_busy = FALSE; |
7325 | alias_page->vmp_absent = FALSE; |
7326 | } |
7327 | size -= PAGE_SIZE; |
7328 | offset += PAGE_SIZE_64; |
7329 | new_offset += PAGE_SIZE_64; |
7330 | } |
7331 | vm_object_unlock(upl->map_object); |
7332 | } |
7333 | if (upl->flags & UPL_SHADOWED) { |
7334 | if (isVectorUPL) { |
7335 | offset = 0; |
7336 | } else { |
7337 | offset = offset_to_map; |
7338 | } |
7339 | } else { |
7340 | offset = upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map)) - upl->map_object->paging_offset; |
7341 | if (!isVectorUPL) { |
7342 | offset += offset_to_map; |
7343 | } |
7344 | } |
7345 | |
7346 | if (isVectorUPL) { |
7347 | size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)); |
7348 | } else { |
7349 | size = MIN(upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)), size_to_map); |
7350 | } |
7351 | |
7352 | vm_object_reference(upl->map_object); |
7353 | |
7354 | if (!isVectorUPL) { |
7355 | *dst_addr = 0; |
7356 | /* |
7357 | * NEED A UPL_MAP ALIAS |
7358 | */ |
kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(.vm_tag = VM_KERN_MEMORY_OSFMK),
upl->map_object, offset, FALSE,
prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);

if (kr != KERN_SUCCESS) {
vm_object_deallocate(upl->map_object);
7366 | upl_unlock(upl); |
7367 | return kr; |
7368 | } |
7369 | } else { |
kr = vm_map_enter(map, dst_addr, (vm_map_size_t)size, (vm_map_offset_t) 0,
VM_MAP_KERNEL_FLAGS_FIXED(.vm_tag = VM_KERN_MEMORY_OSFMK),
upl->map_object, offset, FALSE,
prot_to_map, VM_PROT_ALL, VM_INHERIT_DEFAULT);
if (kr) {
panic("vm_map_enter failed for a Vector UPL");
7376 | } |
7377 | } |
7378 | upl->u_mapped_size = (upl_size_t) size; /* When we allow multiple submappings of the UPL */ |
7379 | /* this will have to be an increment rather than */ |
7380 | /* an assignment. */ |
7381 | vm_object_lock(upl->map_object); |
7382 | |
7383 | for (addr = *dst_addr; size > 0; size -= PAGE_SIZE, addr += PAGE_SIZE) { |
m = vm_page_lookup(upl->map_object, offset);
7385 | |
7386 | if (m) { |
7387 | m->vmp_pmapped = TRUE; |
7388 | |
7389 | /* |
7390 | * CODE SIGNING ENFORCEMENT: page has been wpmapped, |
7391 | * but only in kernel space. If this was on a user map, |
7392 | * we'd have to set the wpmapped bit. |
7393 | */ |
7394 | /* m->vmp_wpmapped = TRUE; */ |
7395 | assert(map->pmap == kernel_pmap); |
7396 | |
kr = pmap_enter_check(map->pmap, addr, m, prot_to_map, VM_PROT_NONE, 0, TRUE);
7398 | |
7399 | assert(kr == KERN_SUCCESS); |
7400 | #if KASAN |
7401 | kasan_notify_address(addr, PAGE_SIZE_64); |
7402 | #endif |
7403 | } |
7404 | offset += PAGE_SIZE_64; |
7405 | } |
7406 | vm_object_unlock(upl->map_object); |
7407 | |
7408 | /* |
7409 | * hold a reference for the mapping |
7410 | */ |
7411 | upl->ref_count++; |
7412 | upl->flags |= UPL_PAGE_LIST_MAPPED; |
7413 | upl->kaddr = (vm_offset_t) *dst_addr; |
7414 | assert(upl->kaddr == *dst_addr); |
7415 | |
7416 | if (isVectorUPL) { |
7417 | goto process_upl_to_enter; |
7418 | } |
7419 | |
7420 | if (!isVectorUPL) { |
7421 | vm_map_offset_t addr_adjustment; |
7422 | |
7423 | addr_adjustment = (vm_map_offset_t)(upl->u_offset - upl_adjusted_offset(upl, VM_MAP_PAGE_MASK(map))); |
7424 | if (addr_adjustment) { |
7425 | assert(VM_MAP_PAGE_MASK(map) != PAGE_MASK); |
7426 | DEBUG4K_UPL("dst_addr 0x%llx (+ 0x%llx) -> 0x%llx\n" , (uint64_t)*dst_addr, (uint64_t)addr_adjustment, (uint64_t)(*dst_addr + addr_adjustment)); |
7427 | *dst_addr += addr_adjustment; |
7428 | } |
7429 | } |
7430 | |
7431 | upl_unlock(upl); |
7432 | |
7433 | return KERN_SUCCESS; |
7434 | } |
7435 | |
7436 | kern_return_t |
7437 | vm_map_enter_upl( |
7438 | vm_map_t map, |
7439 | upl_t upl, |
7440 | vm_map_offset_t *dst_addr) |
7441 | { |
7442 | upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)); |
return vm_map_enter_upl_range(map, upl, 0, upl_size, VM_PROT_DEFAULT, dst_addr);
7444 | } |
7445 | |
7446 | /* |
7447 | * Internal routine to remove a UPL mapping from a VM map. |
7448 | * |
7449 | * XXX - This should just be doable through a standard |
7450 | * vm_map_remove() operation. Otherwise, implicit clean-up |
7451 | * of the target map won't be able to correctly remove |
7452 | * these (and release the reference on the UPL). Having |
7453 | * to do this means we can't map these into user-space |
7454 | * maps yet. |
7455 | */ |
7456 | kern_return_t |
7457 | vm_map_remove_upl_range( |
7458 | vm_map_t map, |
7459 | upl_t upl, |
7460 | __unused vm_object_offset_t offset_to_unmap, |
7461 | __unused upl_size_t size_to_unmap) |
7462 | { |
7463 | vm_address_t addr; |
7464 | upl_size_t size; |
7465 | int isVectorUPL = 0, curr_upl = 0; |
7466 | upl_t vector_upl = NULL; |
7467 | |
7468 | if (upl == UPL_NULL) { |
7469 | return KERN_INVALID_ARGUMENT; |
7470 | } |
7471 | |
7472 | if ((isVectorUPL = vector_upl_is_valid(upl))) { |
7473 | int unmapped = 0, valid_upls = 0; |
7474 | vector_upl = upl; |
7475 | upl_lock(vector_upl); |
for (curr_upl = 0; curr_upl < vector_upl_max_upls(vector_upl); curr_upl++) {
upl = vector_upl_subupl_byindex(vector_upl, curr_upl);
7478 | if (upl == NULL) { |
7479 | continue; |
7480 | } |
7481 | valid_upls++; |
7482 | if (!(UPL_PAGE_LIST_MAPPED & upl->flags)) { |
7483 | unmapped++; |
7484 | } |
7485 | } |
7486 | |
7487 | if (unmapped) { |
7488 | if (unmapped != valid_upls) { |
7489 | panic("%d of the %d sub-upls within the Vector UPL is/are not mapped" , unmapped, valid_upls); |
7490 | } else { |
7491 | upl_unlock(vector_upl); |
7492 | return KERN_FAILURE; |
7493 | } |
7494 | } |
7495 | curr_upl = 0; |
7496 | } else { |
7497 | upl_lock(upl); |
7498 | } |
7499 | |
7500 | process_upl_to_remove: |
7501 | if (isVectorUPL) { |
if (curr_upl == vector_upl_max_upls(vector_upl)) {
7503 | vm_map_t v_upl_submap; |
7504 | vm_offset_t v_upl_submap_dst_addr; |
7505 | vector_upl_get_submap(vector_upl, &v_upl_submap, &v_upl_submap_dst_addr); |
7506 | |
kmem_free_guard(map, v_upl_submap_dst_addr,
vector_upl->u_size, KMF_NONE, KMEM_GUARD_SUBMAP);
vm_map_deallocate(v_upl_submap);
7510 | upl_unlock(vector_upl); |
7511 | return KERN_SUCCESS; |
7512 | } |
7513 | |
upl = vector_upl_subupl_byindex(vector_upl, curr_upl++);
7515 | if (upl == NULL) { |
7516 | goto process_upl_to_remove; |
7517 | } |
7518 | } |
7519 | |
7520 | if (upl->flags & UPL_PAGE_LIST_MAPPED) { |
7521 | addr = upl->kaddr; |
7522 | size = upl->u_mapped_size; |
7523 | |
7524 | assert(upl->ref_count > 1); |
7525 | upl->ref_count--; /* removing mapping ref */ |
7526 | |
7527 | upl->flags &= ~UPL_PAGE_LIST_MAPPED; |
7528 | upl->kaddr = (vm_offset_t) 0; |
7529 | upl->u_mapped_size = 0; |
7530 | |
7531 | if (isVectorUPL) { |
7532 | /* |
7533 | * If it's a Vectored UPL, we'll be removing the entire |
7534 | * submap anyways, so no need to remove individual UPL |
7535 | * element mappings from within the submap |
7536 | */ |
7537 | goto process_upl_to_remove; |
7538 | } |
7539 | |
7540 | upl_unlock(upl); |
7541 | |
7542 | vm_map_remove(map, |
7543 | vm_map_trunc_page(addr, VM_MAP_PAGE_MASK(map)), |
7544 | vm_map_round_page(addr + size, VM_MAP_PAGE_MASK(map))); |
7545 | return KERN_SUCCESS; |
7546 | } |
7547 | upl_unlock(upl); |
7548 | |
7549 | return KERN_FAILURE; |
7550 | } |
7551 | |
7552 | kern_return_t |
7553 | vm_map_remove_upl( |
7554 | vm_map_t map, |
7555 | upl_t upl) |
7556 | { |
7557 | upl_size_t upl_size = upl_adjusted_size(upl, VM_MAP_PAGE_MASK(map)); |
return vm_map_remove_upl_range(map, upl, 0, upl_size);
7559 | } |
7560 | |
7561 | kern_return_t |
7562 | upl_commit_range( |
7563 | upl_t upl, |
7564 | upl_offset_t offset, |
7565 | upl_size_t size, |
7566 | int flags, |
7567 | upl_page_info_t *page_list, |
7568 | mach_msg_type_number_t count, |
7569 | boolean_t *empty) |
7570 | { |
7571 | upl_size_t xfer_size, subupl_size; |
7572 | vm_object_t shadow_object; |
7573 | vm_object_t object; |
7574 | vm_object_t m_object; |
7575 | vm_object_offset_t target_offset; |
7576 | upl_offset_t subupl_offset = offset; |
7577 | int entry; |
7578 | int occupied; |
7579 | int clear_refmod = 0; |
7580 | int pgpgout_count = 0; |
7581 | struct vm_page_delayed_work dw_array; |
7582 | struct vm_page_delayed_work *dwp, *dwp_start; |
7583 | bool dwp_finish_ctx = TRUE; |
7584 | int dw_count; |
7585 | int dw_limit; |
7586 | int isVectorUPL = 0; |
7587 | upl_t vector_upl = NULL; |
7588 | boolean_t should_be_throttled = FALSE; |
7589 | |
7590 | vm_page_t nxt_page = VM_PAGE_NULL; |
7591 | int fast_path_possible = 0; |
7592 | int fast_path_full_commit = 0; |
7593 | int throttle_page = 0; |
7594 | int unwired_count = 0; |
7595 | int local_queue_count = 0; |
7596 | vm_page_t first_local, last_local; |
7597 | vm_object_offset_t obj_start, obj_end, obj_offset; |
7598 | kern_return_t kr = KERN_SUCCESS; |
7599 | |
7600 | // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx flags 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, flags); |
7601 | |
7602 | dwp_start = dwp = NULL; |
7603 | |
7604 | subupl_size = size; |
7605 | *empty = FALSE; |
7606 | |
7607 | if (upl == UPL_NULL) { |
7608 | return KERN_INVALID_ARGUMENT; |
7609 | } |
7610 | |
7611 | dw_count = 0; |
7612 | dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); |
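/*
 * grab a delayed work context if one is available; otherwise
 * fall back to a single on-stack entry so the commit can still
 * make progress
 */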
7613 | dwp_start = vm_page_delayed_work_get_ctx(); |
7614 | if (dwp_start == NULL) { |
7615 | dwp_start = &dw_array; |
7616 | dw_limit = 1; |
7617 | dwp_finish_ctx = FALSE; |
7618 | } |
7619 | |
7620 | dwp = dwp_start; |
7621 | |
7622 | if (count == 0) { |
7623 | page_list = NULL; |
7624 | } |
7625 | |
7626 | if ((isVectorUPL = vector_upl_is_valid(upl))) { |
7627 | vector_upl = upl; |
7628 | upl_lock(vector_upl); |
7629 | } else { |
7630 | upl_lock(upl); |
7631 | } |
7632 | |
7633 | process_upl_to_commit: |
7634 | |
7635 | if (isVectorUPL) { |
7636 | size = subupl_size; |
7637 | offset = subupl_offset; |
7638 | if (size == 0) { |
7639 | upl_unlock(vector_upl); |
7640 | kr = KERN_SUCCESS; |
7641 | goto done; |
7642 | } |
7643 | upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size); |
7644 | if (upl == NULL) { |
7645 | upl_unlock(vector_upl); |
7646 | kr = KERN_FAILURE; |
7647 | goto done; |
7648 | } |
7649 | page_list = upl->page_list; |
7650 | subupl_size -= size; |
7651 | subupl_offset += size; |
7652 | } |
7653 | |
7654 | #if UPL_DEBUG |
7655 | if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) { |
7656 | upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0); |
7657 | upl->upl_commit_records[upl->upl_commit_index].c_beg = offset; |
7658 | upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size); |
7659 | |
7660 | upl->upl_commit_index++; |
7661 | } |
7662 | #endif |
7663 | if (upl->flags & UPL_DEVICE_MEMORY) { |
7664 | xfer_size = 0; |
7665 | } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) { |
7666 | xfer_size = size; |
7667 | } else { |
7668 | if (!isVectorUPL) { |
7669 | upl_unlock(upl); |
7670 | } else { |
7671 | upl_unlock(vector_upl); |
7672 | } |
7673 | DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n" , upl, upl->u_offset, upl->u_size, offset, size); |
7674 | kr = KERN_FAILURE; |
7675 | goto done; |
7676 | } |
7677 | if (upl->flags & UPL_SET_DIRTY) { |
7678 | flags |= UPL_COMMIT_SET_DIRTY; |
7679 | } |
7680 | if (upl->flags & UPL_CLEAR_DIRTY) { |
7681 | flags |= UPL_COMMIT_CLEAR_DIRTY; |
7682 | } |
7683 | |
7684 | object = upl->map_object; |
7685 | |
7686 | if (upl->flags & UPL_SHADOWED) { |
7687 | vm_object_lock(object); |
7688 | shadow_object = object->shadow; |
7689 | } else { |
7690 | shadow_object = object; |
7691 | } |
7692 | entry = offset / PAGE_SIZE; |
7693 | target_offset = (vm_object_offset_t)offset; |
7694 | |
7695 | if (upl->flags & UPL_KERNEL_OBJECT) { |
7696 | vm_object_lock_shared(shadow_object); |
7697 | } else { |
7698 | vm_object_lock(shadow_object); |
7699 | } |
7700 | |
7701 | VM_OBJECT_WIRED_PAGE_UPDATE_START(shadow_object); |
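
/*
 * if this UPL was created with access blocked, committing it
 * unblocks the object and wakes up anyone waiting on it
 */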
7702 | |
7703 | if (upl->flags & UPL_ACCESS_BLOCKED) { |
7704 | assert(shadow_object->blocked_access); |
7705 | shadow_object->blocked_access = FALSE; |
7706 | vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED); |
7707 | } |
7708 | |
7709 | if (shadow_object->code_signed) { |
7710 | /* |
7711 | * CODE SIGNING: |
7712 | * If the object is code-signed, do not let this UPL tell |
7713 | * us if the pages are valid or not. Let the pages be |
7714 | * validated by VM the normal way (when they get mapped or |
7715 | * copied). |
7716 | */ |
7717 | flags &= ~UPL_COMMIT_CS_VALIDATED; |
7718 | } |
7719 | if (!page_list) { |
7720 | /* |
7721 | * No page list to get the code-signing info from !? |
7722 | */ |
7723 | flags &= ~UPL_COMMIT_CS_VALIDATED; |
7724 | } |
7725 | if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal) { |
7726 | should_be_throttled = TRUE; |
7727 | } |
7728 | |
7729 | if ((upl->flags & UPL_IO_WIRE) && |
7730 | !(flags & UPL_COMMIT_FREE_ABSENT) && |
7731 | !isVectorUPL && |
7732 | shadow_object->purgable != VM_PURGABLE_VOLATILE && |
7733 | shadow_object->purgable != VM_PURGABLE_EMPTY) { |
7734 | if (!vm_page_queue_empty(&shadow_object->memq)) { |
7735 | if (shadow_object->internal && size == shadow_object->vo_size) { |
7736 | nxt_page = (vm_page_t)vm_page_queue_first(&shadow_object->memq); |
7737 | fast_path_full_commit = 1; |
7738 | } |
7739 | fast_path_possible = 1; |
7740 | |
7741 | if (!VM_DYNAMIC_PAGING_ENABLED() && shadow_object->internal && |
7742 | (shadow_object->purgable == VM_PURGABLE_DENY || |
7743 | shadow_object->purgable == VM_PURGABLE_NONVOLATILE || |
7744 | shadow_object->purgable == VM_PURGABLE_VOLATILE)) { |
7745 | throttle_page = 1; |
7746 | } |
7747 | } |
7748 | } |
7749 | first_local = VM_PAGE_NULL; |
7750 | last_local = VM_PAGE_NULL; |
7751 | |
7752 | obj_start = target_offset + upl->u_offset - shadow_object->paging_offset; |
7753 | obj_end = obj_start + xfer_size; |
7754 | obj_start = vm_object_trunc_page(obj_start); |
7755 | obj_end = vm_object_round_page(obj_end); |
7756 | for (obj_offset = obj_start; |
7757 | obj_offset < obj_end; |
7758 | obj_offset += PAGE_SIZE) { |
7759 | vm_page_t t, m; |
7760 | |
7761 | dwp->dw_mask = 0; |
7762 | clear_refmod = 0; |
7763 | |
7764 | m = VM_PAGE_NULL; |
7765 | |
7766 | if (upl->flags & UPL_LITE) { |
7767 | unsigned int pg_num; |
7768 | |
7769 | if (nxt_page != VM_PAGE_NULL) { |
7770 | m = nxt_page; |
7771 | nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq); |
7772 | target_offset = m->vmp_offset; |
7773 | } |
7774 | pg_num = (unsigned int) (target_offset / PAGE_SIZE); |
7775 | assert(pg_num == target_offset / PAGE_SIZE); |
7776 | |
			if (bitmap_test(upl->lite_list, pg_num)) {
				bitmap_clear(upl->lite_list, pg_num);

				if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) {
					m = vm_page_lookup(shadow_object, obj_offset);
				}
7782 | } |
7783 | } else { |
7784 | m = NULL; |
7785 | } |
7786 | } |
7787 | if (upl->flags & UPL_SHADOWED) { |
			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
7789 | t->vmp_free_when_done = FALSE; |
7790 | |
7791 | VM_PAGE_FREE(t); |
7792 | |
7793 | if (!(upl->flags & UPL_KERNEL_OBJECT) && m == VM_PAGE_NULL) { |
					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
7795 | } |
7796 | } |
7797 | } |
7798 | if (m == VM_PAGE_NULL) { |
7799 | goto commit_next_page; |
7800 | } |
7801 | |
7802 | m_object = VM_PAGE_OBJECT(m); |
7803 | |
7804 | if (m->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { |
7805 | assert(m->vmp_busy); |
7806 | |
7807 | dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); |
7808 | goto commit_next_page; |
7809 | } |
7810 | |
7811 | if (flags & UPL_COMMIT_CS_VALIDATED) { |
7812 | /* |
7813 | * CODE SIGNING: |
7814 | * Set the code signing bits according to |
7815 | * what the UPL says they should be. |
7816 | */ |
7817 | m->vmp_cs_validated |= page_list[entry].cs_validated; |
7818 | m->vmp_cs_tainted |= page_list[entry].cs_tainted; |
7819 | m->vmp_cs_nx |= page_list[entry].cs_nx; |
7820 | } |
7821 | if (flags & UPL_COMMIT_WRITTEN_BY_KERNEL) { |
7822 | m->vmp_written_by_kernel = TRUE; |
7823 | } |
7824 | |
7825 | if (upl->flags & UPL_IO_WIRE) { |
7826 | if (page_list) { |
7827 | page_list[entry].phys_addr = 0; |
7828 | } |
7829 | |
7830 | if (flags & UPL_COMMIT_SET_DIRTY) { |
7831 | SET_PAGE_DIRTY(m, FALSE); |
7832 | } else if (flags & UPL_COMMIT_CLEAR_DIRTY) { |
7833 | m->vmp_dirty = FALSE; |
7834 | |
7835 | if (!(flags & UPL_COMMIT_CS_VALIDATED) && |
7836 | m->vmp_cs_validated && |
7837 | m->vmp_cs_tainted != VMP_CS_ALL_TRUE) { |
7838 | /* |
7839 | * CODE SIGNING: |
7840 | * This page is no longer dirty |
7841 | * but could have been modified, |
7842 | * so it will need to be |
7843 | * re-validated. |
7844 | */ |
7845 | m->vmp_cs_validated = VMP_CS_ALL_FALSE; |
7846 | |
7847 | VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1); |
7848 | |
					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7850 | } |
7851 | clear_refmod |= VM_MEM_MODIFIED; |
7852 | } |
7853 | if (upl->flags & UPL_ACCESS_BLOCKED) { |
7854 | /* |
7855 | * We blocked access to the pages in this UPL. |
7856 | * Clear the "busy" bit and wake up any waiter |
7857 | * for this page. |
7858 | */ |
7859 | dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); |
7860 | } |
7861 | if (fast_path_possible) { |
7862 | assert(m_object->purgable != VM_PURGABLE_EMPTY); |
7863 | assert(m_object->purgable != VM_PURGABLE_VOLATILE); |
7864 | if (m->vmp_absent) { |
7865 | assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); |
7866 | assert(m->vmp_wire_count == 0); |
7867 | assert(m->vmp_busy); |
7868 | |
7869 | m->vmp_absent = FALSE; |
7870 | dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); |
7871 | } else { |
7872 | if (m->vmp_wire_count == 0) { |
7873 | panic("wire_count == 0, m = %p, obj = %p" , m, shadow_object); |
7874 | } |
7875 | assert(m->vmp_q_state == VM_PAGE_IS_WIRED); |
7876 | |
7877 | /* |
7878 | * XXX FBDP need to update some other |
7879 | * counters here (purgeable_wired_count) |
7880 | * (ledgers), ... |
7881 | */ |
7882 | assert(m->vmp_wire_count > 0); |
7883 | m->vmp_wire_count--; |
7884 | |
7885 | if (m->vmp_wire_count == 0) { |
7886 | m->vmp_q_state = VM_PAGE_NOT_ON_Q; |
7887 | unwired_count++; |
7888 | } |
7889 | } |
7890 | if (m->vmp_wire_count == 0) { |
7891 | assert(m->vmp_pageq.next == 0 && m->vmp_pageq.prev == 0); |
7892 | |
7893 | if (last_local == VM_PAGE_NULL) { |
7894 | assert(first_local == VM_PAGE_NULL); |
7895 | |
7896 | last_local = m; |
7897 | first_local = m; |
7898 | } else { |
7899 | assert(first_local != VM_PAGE_NULL); |
7900 | |
7901 | m->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local); |
7902 | first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(m); |
7903 | first_local = m; |
7904 | } |
7905 | local_queue_count++; |
7906 | |
7907 | if (throttle_page) { |
7908 | m->vmp_q_state = VM_PAGE_ON_THROTTLED_Q; |
7909 | } else { |
7910 | if (flags & UPL_COMMIT_INACTIVATE) { |
7911 | if (shadow_object->internal) { |
7912 | m->vmp_q_state = VM_PAGE_ON_INACTIVE_INTERNAL_Q; |
7913 | } else { |
7914 | m->vmp_q_state = VM_PAGE_ON_INACTIVE_EXTERNAL_Q; |
7915 | } |
7916 | } else { |
7917 | m->vmp_q_state = VM_PAGE_ON_ACTIVE_Q; |
7918 | } |
7919 | } |
7920 | } |
7921 | } else { |
7922 | if (flags & UPL_COMMIT_INACTIVATE) { |
7923 | dwp->dw_mask |= DW_vm_page_deactivate_internal; |
7924 | clear_refmod |= VM_MEM_REFERENCED; |
7925 | } |
7926 | if (m->vmp_absent) { |
7927 | if (flags & UPL_COMMIT_FREE_ABSENT) { |
7928 | dwp->dw_mask |= DW_vm_page_free; |
7929 | } else { |
7930 | m->vmp_absent = FALSE; |
7931 | dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); |
7932 | |
7933 | if (!(dwp->dw_mask & DW_vm_page_deactivate_internal)) { |
7934 | dwp->dw_mask |= DW_vm_page_activate; |
7935 | } |
7936 | } |
7937 | } else { |
7938 | dwp->dw_mask |= DW_vm_page_unwire; |
7939 | } |
7940 | } |
7941 | goto commit_next_page; |
7942 | } |
7943 | assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); |
7944 | |
7945 | if (page_list) { |
7946 | page_list[entry].phys_addr = 0; |
7947 | } |
7948 | |
7949 | /* |
7950 | * make sure to clear the hardware |
7951 | * modify or reference bits before |
7952 | * releasing the BUSY bit on this page |
7953 | * otherwise we risk losing a legitimate |
7954 | * change of state |
7955 | */ |
7956 | if (flags & UPL_COMMIT_CLEAR_DIRTY) { |
7957 | m->vmp_dirty = FALSE; |
7958 | |
7959 | clear_refmod |= VM_MEM_MODIFIED; |
7960 | } |
7961 | if (m->vmp_laundry) { |
7962 | dwp->dw_mask |= DW_vm_pageout_throttle_up; |
7963 | } |
7964 | |
7965 | if (VM_PAGE_WIRED(m)) { |
7966 | m->vmp_free_when_done = FALSE; |
7967 | } |
7968 | |
7969 | if (!(flags & UPL_COMMIT_CS_VALIDATED) && |
7970 | m->vmp_cs_validated && |
7971 | m->vmp_cs_tainted != VMP_CS_ALL_TRUE) { |
7972 | /* |
7973 | * CODE SIGNING: |
7974 | * This page is no longer dirty |
7975 | * but could have been modified, |
7976 | * so it will need to be |
7977 | * re-validated. |
7978 | */ |
7979 | m->vmp_cs_validated = VMP_CS_ALL_FALSE; |
7980 | |
7981 | VM_PAGEOUT_DEBUG(vm_cs_validated_resets, 1); |
7982 | |
			pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
7984 | } |
7985 | if (m->vmp_overwriting) { |
7986 | /* |
7987 | * the (COPY_OUT_FROM == FALSE) request_page_list case |
7988 | */ |
7989 | if (m->vmp_busy) { |
7990 | #if CONFIG_PHANTOM_CACHE |
7991 | if (m->vmp_absent && !m_object->internal) { |
7992 | dwp->dw_mask |= DW_vm_phantom_cache_update; |
7993 | } |
7994 | #endif |
7995 | m->vmp_absent = FALSE; |
7996 | |
7997 | dwp->dw_mask |= DW_clear_busy; |
7998 | } else { |
7999 | /* |
8000 | * alternate (COPY_OUT_FROM == FALSE) page_list case |
8001 | * Occurs when the original page was wired |
8002 | * at the time of the list request |
8003 | */ |
8004 | assert(VM_PAGE_WIRED(m)); |
8005 | |
8006 | dwp->dw_mask |= DW_vm_page_unwire; /* reactivates */ |
8007 | } |
8008 | m->vmp_overwriting = FALSE; |
8009 | } |
8010 | m->vmp_cleaning = FALSE; |
8011 | |
8012 | if (m->vmp_free_when_done) { |
8013 | /* |
8014 | * With the clean queue enabled, UPL_PAGEOUT should |
8015 | * no longer set the pageout bit. Its pages now go |
8016 | * to the clean queue. |
8017 | * |
8018 | * We don't use the cleaned Q anymore and so this |
8019 | * assert isn't correct. The code for the clean Q |
8020 | * still exists and might be used in the future. If we |
8021 | * go back to the cleaned Q, we will re-enable this |
8022 | * assert. |
8023 | * |
8024 | * assert(!(upl->flags & UPL_PAGEOUT)); |
8025 | */ |
8026 | assert(!m_object->internal); |
8027 | |
8028 | m->vmp_free_when_done = FALSE; |
8029 | |
8030 | if ((flags & UPL_COMMIT_SET_DIRTY) || |
			    (m->vmp_pmapped && (pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m)) & VM_MEM_MODIFIED))) {
8032 | /* |
8033 | * page was re-dirtied after we started |
8034 | * the pageout... reactivate it since |
8035 | * we don't know whether the on-disk |
8036 | * copy matches what is now in memory |
8037 | */ |
8038 | SET_PAGE_DIRTY(m, FALSE); |
8039 | |
8040 | dwp->dw_mask |= DW_vm_page_activate | DW_PAGE_WAKEUP; |
8041 | |
8042 | if (upl->flags & UPL_PAGEOUT) { |
8043 | counter_inc(&vm_statistics_reactivations); |
8044 | DTRACE_VM2(pgrec, int, 1, (uint64_t *), NULL); |
8045 | } |
8046 | } else if (m->vmp_busy && !(upl->flags & UPL_HAS_BUSY)) { |
8047 | /* |
8048 | * Someone else might still be handling this |
8049 | * page (vm_fault() for example), so let's not |
8050 | * free it or "un-busy" it! |
8051 | * Put that page in the "speculative" queue |
8052 | * for now (since we would otherwise have freed |
8053 | * it) and let whoever is keeping the page |
8054 | * "busy" move it if needed when they're done |
8055 | * with it. |
8056 | */ |
8057 | dwp->dw_mask |= DW_vm_page_speculate; |
8058 | } else { |
8059 | /* |
8060 | * page has been successfully cleaned |
8061 | * go ahead and free it for other use |
8062 | */ |
8063 | if (m_object->internal) { |
8064 | DTRACE_VM2(anonpgout, int, 1, (uint64_t *), NULL); |
8065 | } else { |
8066 | DTRACE_VM2(fspgout, int, 1, (uint64_t *), NULL); |
8067 | } |
8068 | m->vmp_dirty = FALSE; |
8069 | if (!(upl->flags & UPL_HAS_BUSY)) { |
8070 | assert(!m->vmp_busy); |
8071 | } |
8072 | m->vmp_busy = TRUE; |
8073 | |
8074 | dwp->dw_mask |= DW_vm_page_free; |
8075 | } |
8076 | goto commit_next_page; |
8077 | } |
		/*
		 * It is part of the semantics of COPYOUT_FROM UPLs that a
		 * commit implies a cache sync between the vm page and the
		 * backing store; this can be used to strip the precious bit
		 * as well as clean the page.
		 */
8085 | if ((upl->flags & UPL_PAGE_SYNC_DONE) || (flags & UPL_COMMIT_CLEAR_PRECIOUS)) { |
8086 | m->vmp_precious = FALSE; |
8087 | } |
8088 | |
8089 | if (flags & UPL_COMMIT_SET_DIRTY) { |
8090 | SET_PAGE_DIRTY(m, FALSE); |
8091 | } else { |
8092 | m->vmp_dirty = FALSE; |
8093 | } |
8094 | |
8095 | /* with the clean queue on, move *all* cleaned pages to the clean queue */ |
8096 | if (hibernate_cleaning_in_progress == FALSE && !m->vmp_dirty && (upl->flags & UPL_PAGEOUT)) { |
8097 | pgpgout_count++; |
8098 | |
8099 | counter_inc(&vm_statistics_pageouts); |
8100 | DTRACE_VM2(pgout, int, 1, (uint64_t *), NULL); |
8101 | |
8102 | dwp->dw_mask |= DW_enqueue_cleaned; |
8103 | } else if (should_be_throttled == TRUE && (m->vmp_q_state == VM_PAGE_NOT_ON_Q)) { |
8104 | /* |
8105 | * page coming back in from being 'frozen'... |
8106 | * it was dirty before it was frozen, so keep it so |
8107 | * the vm_page_activate will notice that it really belongs |
8108 | * on the throttle queue and put it there |
8109 | */ |
8110 | SET_PAGE_DIRTY(m, FALSE); |
8111 | dwp->dw_mask |= DW_vm_page_activate; |
8112 | } else { |
8113 | if ((flags & UPL_COMMIT_INACTIVATE) && !m->vmp_clustered && (m->vmp_q_state != VM_PAGE_ON_SPECULATIVE_Q)) { |
8114 | dwp->dw_mask |= DW_vm_page_deactivate_internal; |
8115 | clear_refmod |= VM_MEM_REFERENCED; |
8116 | } else if (!VM_PAGE_PAGEABLE(m)) { |
8117 | if (m->vmp_clustered || (flags & UPL_COMMIT_SPECULATE)) { |
8118 | dwp->dw_mask |= DW_vm_page_speculate; |
8119 | } else if (m->vmp_reference) { |
8120 | dwp->dw_mask |= DW_vm_page_activate; |
8121 | } else { |
8122 | dwp->dw_mask |= DW_vm_page_deactivate_internal; |
8123 | clear_refmod |= VM_MEM_REFERENCED; |
8124 | } |
8125 | } |
8126 | } |
8127 | if (upl->flags & UPL_ACCESS_BLOCKED) { |
8128 | /* |
			 * We blocked access to the pages in this UPL.
8130 | * Clear the "busy" bit on this page before we |
8131 | * wake up any waiter. |
8132 | */ |
8133 | dwp->dw_mask |= DW_clear_busy; |
8134 | } |
8135 | /* |
		 * Wake up any thread waiting for the page to be done cleaning.
8137 | */ |
8138 | dwp->dw_mask |= DW_PAGE_WAKEUP; |
8139 | |
8140 | commit_next_page: |
8141 | if (clear_refmod) { |
			pmap_clear_refmod(VM_PAGE_GET_PHYS_PAGE(m), clear_refmod);
8143 | } |
8144 | |
8145 | target_offset += PAGE_SIZE_64; |
8146 | xfer_size -= PAGE_SIZE; |
8147 | entry++; |
8148 | |
8149 | if (dwp->dw_mask) { |
8150 | if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) { |
8151 | VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count); |
8152 | |
8153 | if (dw_count >= dw_limit) { |
					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8155 | |
8156 | dwp = dwp_start; |
8157 | dw_count = 0; |
8158 | } |
8159 | } else { |
8160 | if (dwp->dw_mask & DW_clear_busy) { |
8161 | m->vmp_busy = FALSE; |
8162 | } |
8163 | |
8164 | if (dwp->dw_mask & DW_PAGE_WAKEUP) { |
8165 | PAGE_WAKEUP(m); |
8166 | } |
8167 | } |
8168 | } |
8169 | } |
8170 | if (dw_count) { |
		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8172 | dwp = dwp_start; |
8173 | dw_count = 0; |
8174 | } |
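
	/*
	 * Fast path post-processing: pages that were unwired above were
	 * collected on a private local list (first_local/last_local), so they
	 * can now be spliced onto the appropriate global page queue in one
	 * operation and the global counters adjusted once.
	 */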
8175 | |
8176 | if (fast_path_possible) { |
8177 | assert(shadow_object->purgable != VM_PURGABLE_VOLATILE); |
8178 | assert(shadow_object->purgable != VM_PURGABLE_EMPTY); |
8179 | |
8180 | if (local_queue_count || unwired_count) { |
8181 | if (local_queue_count) { |
8182 | vm_page_t first_target; |
8183 | vm_page_queue_head_t *target_queue; |
8184 | |
8185 | if (throttle_page) { |
8186 | target_queue = &vm_page_queue_throttled; |
8187 | } else { |
8188 | if (flags & UPL_COMMIT_INACTIVATE) { |
8189 | if (shadow_object->internal) { |
8190 | target_queue = &vm_page_queue_anonymous; |
8191 | } else { |
8192 | target_queue = &vm_page_queue_inactive; |
8193 | } |
8194 | } else { |
8195 | target_queue = &vm_page_queue_active; |
8196 | } |
8197 | } |
8198 | /* |
				 * Transfer the entire local queue to a regular LRU page queue.
8200 | */ |
8201 | vm_page_lockspin_queues(); |
8202 | |
8203 | first_target = (vm_page_t) vm_page_queue_first(target_queue); |
8204 | |
8205 | if (vm_page_queue_empty(target_queue)) { |
8206 | target_queue->prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); |
8207 | } else { |
8208 | first_target->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(last_local); |
8209 | } |
8210 | |
8211 | target_queue->next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_local); |
8212 | first_local->vmp_pageq.prev = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(target_queue); |
8213 | last_local->vmp_pageq.next = VM_PAGE_CONVERT_TO_QUEUE_ENTRY(first_target); |
8214 | |
8215 | /* |
8216 | * Adjust the global page counts. |
8217 | */ |
8218 | if (throttle_page) { |
8219 | vm_page_throttled_count += local_queue_count; |
8220 | } else { |
8221 | if (flags & UPL_COMMIT_INACTIVATE) { |
8222 | if (shadow_object->internal) { |
8223 | vm_page_anonymous_count += local_queue_count; |
8224 | } |
8225 | vm_page_inactive_count += local_queue_count; |
8226 | |
8227 | token_new_pagecount += local_queue_count; |
8228 | } else { |
8229 | vm_page_active_count += local_queue_count; |
8230 | } |
8231 | |
8232 | if (shadow_object->internal) { |
8233 | vm_page_pageable_internal_count += local_queue_count; |
8234 | } else { |
8235 | vm_page_pageable_external_count += local_queue_count; |
8236 | } |
8237 | } |
8238 | } else { |
8239 | vm_page_lockspin_queues(); |
8240 | } |
8241 | if (unwired_count) { |
8242 | vm_page_wire_count -= unwired_count; |
8243 | VM_CHECK_MEMORYSTATUS; |
8244 | } |
8245 | vm_page_unlock_queues(); |
8246 | |
8247 | VM_OBJECT_WIRED_PAGE_COUNT(shadow_object, -unwired_count); |
8248 | } |
8249 | } |
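
	/*
	 * Determine whether any pages remain associated with this UPL:
	 * device-memory UPLs never hold pages, lite UPLs track theirs in the
	 * lite_list bitmap, and shadowed UPLs keep them on the map object's
	 * memq.
	 */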
8250 | |
8251 | if (upl->flags & UPL_DEVICE_MEMORY) { |
8252 | occupied = 0; |
8253 | } else if (upl->flags & UPL_LITE) { |
8254 | uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK)); |
8255 | |
8256 | occupied = !fast_path_full_commit && |
		    !bitmap_is_empty(upl->lite_list, pages);
8258 | } else { |
8259 | occupied = !vm_page_queue_empty(&upl->map_object->memq); |
8260 | } |
8261 | if (occupied == 0) { |
8262 | /* |
8263 | * If this UPL element belongs to a Vector UPL and is |
8264 | * empty, then this is the right function to deallocate |
		 * it. So go ahead and set the *empty variable. The flag
8266 | * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view |
8267 | * should be considered relevant for the Vector UPL and not |
8268 | * the internal UPLs. |
8269 | */ |
8270 | if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) { |
8271 | *empty = TRUE; |
8272 | } |
8273 | |
8274 | if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) { |
8275 | /* |
8276 | * this is not a paging object |
8277 | * so we need to drop the paging reference |
8278 | * that was taken when we created the UPL |
8279 | * against this object |
8280 | */ |
8281 | vm_object_activity_end(shadow_object); |
			vm_object_collapse(shadow_object, 0, TRUE);
8283 | } else { |
8284 | /* |
			 * we donated the paging reference to
8286 | * the map object... vm_pageout_object_terminate |
8287 | * will drop this reference |
8288 | */ |
8289 | } |
8290 | } |
8291 | VM_OBJECT_WIRED_PAGE_UPDATE_END(shadow_object, shadow_object->wire_tag); |
8292 | vm_object_unlock(shadow_object); |
8293 | if (object != shadow_object) { |
8294 | vm_object_unlock(object); |
8295 | } |
8296 | |
8297 | if (!isVectorUPL) { |
8298 | upl_unlock(upl); |
8299 | } else { |
8300 | /* |
8301 | * If we completed our operations on an UPL that is |
8302 | * part of a Vectored UPL and if empty is TRUE, then |
8303 | * we should go ahead and deallocate this UPL element. |
8304 | * Then we check if this was the last of the UPL elements |
8305 | * within that Vectored UPL. If so, set empty to TRUE |
8306 | * so that in ubc_upl_commit_range or ubc_upl_commit, we |
8307 | * can go ahead and deallocate the Vector UPL too. |
8308 | */ |
8309 | if (*empty == TRUE) { |
8310 | *empty = vector_upl_set_subupl(vector_upl, upl, 0); |
8311 | upl_deallocate(upl); |
8312 | } |
8313 | goto process_upl_to_commit; |
8314 | } |
8315 | if (pgpgout_count) { |
8316 | DTRACE_VM2(pgpgout, int, pgpgout_count, (uint64_t *), NULL); |
8317 | } |
8318 | |
8319 | kr = KERN_SUCCESS; |
8320 | done: |
8321 | if (dwp_start && dwp_finish_ctx) { |
		vm_page_delayed_work_finish_ctx(dwp_start);
8323 | dwp_start = dwp = NULL; |
8324 | } |
8325 | |
8326 | return kr; |
8327 | } |
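
/*
 * upl_abort_range:
 *
 * Abort the specified range of the given UPL.  Absent pages are freed or
 * marked restart/unavailable/error according to the UPL_ABORT_* flags;
 * resident pages are unbusied, optionally dumped or unwired, and returned
 * to the page queues via the delayed-work mechanism.  On return, *empty
 * reflects whether the UPL no longer holds any pages.
 */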
8328 | |
8329 | kern_return_t |
8330 | upl_abort_range( |
8331 | upl_t upl, |
8332 | upl_offset_t offset, |
8333 | upl_size_t size, |
8334 | int error, |
8335 | boolean_t *empty) |
8336 | { |
8337 | upl_size_t xfer_size, subupl_size; |
8338 | vm_object_t shadow_object; |
8339 | vm_object_t object; |
8340 | vm_object_offset_t target_offset; |
8341 | upl_offset_t subupl_offset = offset; |
8342 | int occupied; |
8343 | struct vm_page_delayed_work dw_array; |
8344 | struct vm_page_delayed_work *dwp, *dwp_start; |
8345 | bool dwp_finish_ctx = TRUE; |
8346 | int dw_count; |
8347 | int dw_limit; |
8348 | int isVectorUPL = 0; |
8349 | upl_t vector_upl = NULL; |
8350 | vm_object_offset_t obj_start, obj_end, obj_offset; |
8351 | kern_return_t kr = KERN_SUCCESS; |
8352 | |
8353 | // DEBUG4K_UPL("upl %p (u_offset 0x%llx u_size 0x%llx) object %p offset 0x%llx size 0x%llx error 0x%x\n", upl, (uint64_t)upl->u_offset, (uint64_t)upl->u_size, upl->map_object, (uint64_t)offset, (uint64_t)size, error); |
8354 | |
8355 | dwp_start = dwp = NULL; |
8356 | |
8357 | subupl_size = size; |
8358 | *empty = FALSE; |
8359 | |
8360 | if (upl == UPL_NULL) { |
8361 | return KERN_INVALID_ARGUMENT; |
8362 | } |
8363 | |
8364 | if ((upl->flags & UPL_IO_WIRE) && !(error & UPL_ABORT_DUMP_PAGES)) { |
		return upl_commit_range(upl, offset, size, UPL_COMMIT_FREE_ABSENT, NULL, 0, empty);
8366 | } |
8367 | |
8368 | dw_count = 0; |
8369 | dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); |
8370 | dwp_start = vm_page_delayed_work_get_ctx(); |
8371 | if (dwp_start == NULL) { |
8372 | dwp_start = &dw_array; |
8373 | dw_limit = 1; |
8374 | dwp_finish_ctx = FALSE; |
8375 | } |
8376 | |
8377 | dwp = dwp_start; |
8378 | |
8379 | if ((isVectorUPL = vector_upl_is_valid(upl))) { |
8380 | vector_upl = upl; |
8381 | upl_lock(vector_upl); |
8382 | } else { |
8383 | upl_lock(upl); |
8384 | } |
8385 | |
8386 | process_upl_to_abort: |
8387 | if (isVectorUPL) { |
8388 | size = subupl_size; |
8389 | offset = subupl_offset; |
8390 | if (size == 0) { |
8391 | upl_unlock(vector_upl); |
8392 | kr = KERN_SUCCESS; |
8393 | goto done; |
8394 | } |
8395 | upl = vector_upl_subupl_byoffset(vector_upl, &offset, &size); |
8396 | if (upl == NULL) { |
8397 | upl_unlock(vector_upl); |
8398 | kr = KERN_FAILURE; |
8399 | goto done; |
8400 | } |
8401 | subupl_size -= size; |
8402 | subupl_offset += size; |
8403 | } |
8404 | |
8405 | *empty = FALSE; |
8406 | |
8407 | #if UPL_DEBUG |
8408 | if (upl->upl_commit_index < UPL_DEBUG_COMMIT_RECORDS) { |
8409 | upl->upl_commit_records[upl->upl_commit_index].c_btref = btref_get(__builtin_frame_address(0), 0); |
8410 | upl->upl_commit_records[upl->upl_commit_index].c_beg = offset; |
8411 | upl->upl_commit_records[upl->upl_commit_index].c_end = (offset + size); |
8412 | upl->upl_commit_records[upl->upl_commit_index].c_aborted = 1; |
8413 | |
8414 | upl->upl_commit_index++; |
8415 | } |
8416 | #endif |
8417 | if (upl->flags & UPL_DEVICE_MEMORY) { |
8418 | xfer_size = 0; |
8419 | } else if ((offset + size) <= upl_adjusted_size(upl, PAGE_MASK)) { |
8420 | xfer_size = size; |
8421 | } else { |
8422 | if (!isVectorUPL) { |
8423 | upl_unlock(upl); |
8424 | } else { |
8425 | upl_unlock(vector_upl); |
8426 | } |
		DEBUG4K_ERROR("upl %p (u_offset 0x%llx u_size 0x%x) offset 0x%x size 0x%x\n", upl, upl->u_offset, upl->u_size, offset, size);
8428 | kr = KERN_FAILURE; |
8429 | goto done; |
8430 | } |
8431 | object = upl->map_object; |
8432 | |
8433 | if (upl->flags & UPL_SHADOWED) { |
8434 | vm_object_lock(object); |
8435 | shadow_object = object->shadow; |
8436 | } else { |
8437 | shadow_object = object; |
8438 | } |
8439 | |
8440 | target_offset = (vm_object_offset_t)offset; |
8441 | |
8442 | if (upl->flags & UPL_KERNEL_OBJECT) { |
8443 | vm_object_lock_shared(shadow_object); |
8444 | } else { |
8445 | vm_object_lock(shadow_object); |
8446 | } |
8447 | |
8448 | if (upl->flags & UPL_ACCESS_BLOCKED) { |
8449 | assert(shadow_object->blocked_access); |
8450 | shadow_object->blocked_access = FALSE; |
8451 | vm_object_wakeup(object, VM_OBJECT_EVENT_UNBLOCKED); |
8452 | } |
8453 | |
8454 | if ((error & UPL_ABORT_DUMP_PAGES) && (upl->flags & UPL_KERNEL_OBJECT)) { |
8455 | panic("upl_abort_range: kernel_object being DUMPED" ); |
8456 | } |
8457 | |
8458 | obj_start = target_offset + upl->u_offset - shadow_object->paging_offset; |
8459 | obj_end = obj_start + xfer_size; |
8460 | obj_start = vm_object_trunc_page(obj_start); |
8461 | obj_end = vm_object_round_page(obj_end); |
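
	/*
	 * Walk the aborted range one page at a time, undoing the state that
	 * was set up when the UPL was created and queueing any page frees,
	 * unwires or queue changes as delayed work.
	 */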
8462 | for (obj_offset = obj_start; |
8463 | obj_offset < obj_end; |
8464 | obj_offset += PAGE_SIZE) { |
8465 | vm_page_t t, m; |
8466 | unsigned int pg_num; |
8467 | boolean_t needed; |
8468 | |
8469 | pg_num = (unsigned int) (target_offset / PAGE_SIZE); |
8470 | assert(pg_num == target_offset / PAGE_SIZE); |
8471 | |
8472 | needed = FALSE; |
8473 | |
8474 | if (upl->flags & UPL_INTERNAL) { |
8475 | needed = upl->page_list[pg_num].needed; |
8476 | } |
8477 | |
8478 | dwp->dw_mask = 0; |
8479 | m = VM_PAGE_NULL; |
8480 | |
8481 | if (upl->flags & UPL_LITE) { |
			if (bitmap_test(upl->lite_list, pg_num)) {
				bitmap_clear(upl->lite_list, pg_num);

				if (!(upl->flags & UPL_KERNEL_OBJECT)) {
					m = vm_page_lookup(shadow_object, obj_offset);
				}
8487 | } |
8488 | } |
8489 | } |
8490 | if (upl->flags & UPL_SHADOWED) { |
			if ((t = vm_page_lookup(object, target_offset)) != VM_PAGE_NULL) {
8492 | t->vmp_free_when_done = FALSE; |
8493 | |
8494 | VM_PAGE_FREE(t); |
8495 | |
8496 | if (m == VM_PAGE_NULL) { |
					m = vm_page_lookup(shadow_object, target_offset + object->vo_shadow_offset);
8498 | } |
8499 | } |
8500 | } |
8501 | if ((upl->flags & UPL_KERNEL_OBJECT)) { |
8502 | goto abort_next_page; |
8503 | } |
8504 | |
8505 | if (m != VM_PAGE_NULL) { |
8506 | assert(m->vmp_q_state != VM_PAGE_USED_BY_COMPRESSOR); |
8507 | |
8508 | if (m->vmp_absent) { |
8509 | boolean_t must_free = TRUE; |
8510 | |
8511 | /* |
8512 | * COPYOUT = FALSE case |
8513 | * check for error conditions which must |
				 * be passed back to the page's customer
8515 | */ |
8516 | if (error & UPL_ABORT_RESTART) { |
8517 | m->vmp_restart = TRUE; |
8518 | m->vmp_absent = FALSE; |
8519 | m->vmp_unusual = TRUE; |
8520 | must_free = FALSE; |
8521 | } else if (error & UPL_ABORT_UNAVAILABLE) { |
8522 | m->vmp_restart = FALSE; |
8523 | m->vmp_unusual = TRUE; |
8524 | must_free = FALSE; |
8525 | } else if (error & UPL_ABORT_ERROR) { |
8526 | m->vmp_restart = FALSE; |
8527 | m->vmp_absent = FALSE; |
8528 | m->vmp_error = TRUE; |
8529 | m->vmp_unusual = TRUE; |
8530 | must_free = FALSE; |
8531 | } |
8532 | if (m->vmp_clustered && needed == FALSE) { |
8533 | /* |
8534 | * This page was a part of a speculative |
8535 | * read-ahead initiated by the kernel |
8536 | * itself. No one is expecting this |
8537 | * page and no one will clean up its |
8538 | * error state if it ever becomes valid |
8539 | * in the future. |
8540 | * We have to free it here. |
8541 | */ |
8542 | must_free = TRUE; |
8543 | } |
8544 | m->vmp_cleaning = FALSE; |
8545 | |
8546 | if (m->vmp_overwriting && !m->vmp_busy) { |
8547 | /* |
8548 | * this shouldn't happen since |
8549 | * this is an 'absent' page, but |
8550 | * it doesn't hurt to check for |
8551 | * the 'alternate' method of |
8552 | * stabilizing the page... |
8553 | * we will mark 'busy' to be cleared |
8554 | * in the following code which will |
					 * take care of the primary stabilization
8556 | * method (i.e. setting 'busy' to TRUE) |
8557 | */ |
8558 | dwp->dw_mask |= DW_vm_page_unwire; |
8559 | } |
8560 | m->vmp_overwriting = FALSE; |
8561 | |
8562 | dwp->dw_mask |= (DW_clear_busy | DW_PAGE_WAKEUP); |
8563 | |
8564 | if (must_free == TRUE) { |
8565 | dwp->dw_mask |= DW_vm_page_free; |
8566 | } else { |
8567 | dwp->dw_mask |= DW_vm_page_activate; |
8568 | } |
8569 | } else { |
8570 | /* |
8571 | * Handle the trusted pager throttle. |
8572 | */ |
8573 | if (m->vmp_laundry) { |
8574 | dwp->dw_mask |= DW_vm_pageout_throttle_up; |
8575 | } |
8576 | |
8577 | if (upl->flags & UPL_ACCESS_BLOCKED) { |
8578 | /* |
8579 | * We blocked access to the pages in this UPL. |
8580 | * Clear the "busy" bit and wake up any waiter |
8581 | * for this page. |
8582 | */ |
8583 | dwp->dw_mask |= DW_clear_busy; |
8584 | } |
8585 | if (m->vmp_overwriting) { |
8586 | if (m->vmp_busy) { |
8587 | dwp->dw_mask |= DW_clear_busy; |
8588 | } else { |
8589 | /* |
8590 | * deal with the 'alternate' method |
8591 | * of stabilizing the page... |
8592 | * we will either free the page |
8593 | * or mark 'busy' to be cleared |
8594 | * in the following code which will |
						 * take care of the primary stabilization
8596 | * method (i.e. setting 'busy' to TRUE) |
8597 | */ |
8598 | dwp->dw_mask |= DW_vm_page_unwire; |
8599 | } |
8600 | m->vmp_overwriting = FALSE; |
8601 | } |
8602 | m->vmp_free_when_done = FALSE; |
8603 | m->vmp_cleaning = FALSE; |
8604 | |
8605 | if (error & UPL_ABORT_DUMP_PAGES) { |
					pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(m));
8607 | |
8608 | dwp->dw_mask |= DW_vm_page_free; |
8609 | } else { |
8610 | if (!(dwp->dw_mask & DW_vm_page_unwire)) { |
8611 | if (error & UPL_ABORT_REFERENCE) { |
8612 | /* |
							 * we've been told to explicitly
8614 | * reference this page... for |
8615 | * file I/O, this is done by |
8616 | * implementing an LRU on the inactive q |
8617 | */ |
8618 | dwp->dw_mask |= DW_vm_page_lru; |
8619 | } else if (!VM_PAGE_PAGEABLE(m)) { |
8620 | dwp->dw_mask |= DW_vm_page_deactivate_internal; |
8621 | } |
8622 | } |
8623 | dwp->dw_mask |= DW_PAGE_WAKEUP; |
8624 | } |
8625 | } |
8626 | } |
8627 | abort_next_page: |
8628 | target_offset += PAGE_SIZE_64; |
8629 | xfer_size -= PAGE_SIZE; |
8630 | |
8631 | if (dwp->dw_mask) { |
8632 | if (dwp->dw_mask & ~(DW_clear_busy | DW_PAGE_WAKEUP)) { |
8633 | VM_PAGE_ADD_DELAYED_WORK(dwp, m, dw_count); |
8634 | |
8635 | if (dw_count >= dw_limit) { |
					vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8637 | |
8638 | dwp = dwp_start; |
8639 | dw_count = 0; |
8640 | } |
8641 | } else { |
8642 | if (dwp->dw_mask & DW_clear_busy) { |
8643 | m->vmp_busy = FALSE; |
8644 | } |
8645 | |
8646 | if (dwp->dw_mask & DW_PAGE_WAKEUP) { |
8647 | PAGE_WAKEUP(m); |
8648 | } |
8649 | } |
8650 | } |
8651 | } |
8652 | if (dw_count) { |
		vm_page_do_delayed_work(shadow_object, VM_KERN_MEMORY_NONE, dwp_start, dw_count);
8654 | dwp = dwp_start; |
8655 | dw_count = 0; |
8656 | } |
8657 | |
8658 | if (upl->flags & UPL_DEVICE_MEMORY) { |
8659 | occupied = 0; |
8660 | } else if (upl->flags & UPL_LITE) { |
8661 | uint32_t pages = (uint32_t)atop(upl_adjusted_size(upl, PAGE_MASK)); |
8662 | |
		occupied = !bitmap_is_empty(upl->lite_list, pages);
8664 | } else { |
8665 | occupied = !vm_page_queue_empty(&upl->map_object->memq); |
8666 | } |
8667 | if (occupied == 0) { |
8668 | /* |
8669 | * If this UPL element belongs to a Vector UPL and is |
8670 | * empty, then this is the right function to deallocate |
		 * it. So go ahead and set the *empty variable. The flag
8672 | * UPL_COMMIT_NOTIFY_EMPTY, from the caller's point of view |
8673 | * should be considered relevant for the Vector UPL and |
8674 | * not the internal UPLs. |
8675 | */ |
8676 | if ((upl->flags & UPL_COMMIT_NOTIFY_EMPTY) || isVectorUPL) { |
8677 | *empty = TRUE; |
8678 | } |
8679 | |
8680 | if (object == shadow_object && !(upl->flags & UPL_KERNEL_OBJECT)) { |
8681 | /* |
8682 | * this is not a paging object |
8683 | * so we need to drop the paging reference |
8684 | * that was taken when we created the UPL |
8685 | * against this object |
8686 | */ |
8687 | vm_object_activity_end(shadow_object); |
			vm_object_collapse(shadow_object, 0, TRUE);
8689 | } else { |
8690 | /* |
			 * we donated the paging reference to
8692 | * the map object... vm_pageout_object_terminate |
8693 | * will drop this reference |
8694 | */ |
8695 | } |
8696 | } |
8697 | vm_object_unlock(shadow_object); |
8698 | if (object != shadow_object) { |
8699 | vm_object_unlock(object); |
8700 | } |
8701 | |
8702 | if (!isVectorUPL) { |
8703 | upl_unlock(upl); |
8704 | } else { |
8705 | /* |
8706 | * If we completed our operations on an UPL that is |
8707 | * part of a Vectored UPL and if empty is TRUE, then |
8708 | * we should go ahead and deallocate this UPL element. |
8709 | * Then we check if this was the last of the UPL elements |
8710 | * within that Vectored UPL. If so, set empty to TRUE |
8711 | * so that in ubc_upl_abort_range or ubc_upl_abort, we |
8712 | * can go ahead and deallocate the Vector UPL too. |
8713 | */ |
8714 | if (*empty == TRUE) { |
8715 | *empty = vector_upl_set_subupl(vector_upl, upl, 0); |
8716 | upl_deallocate(upl); |
8717 | } |
8718 | goto process_upl_to_abort; |
8719 | } |
8720 | |
8721 | kr = KERN_SUCCESS; |
8722 | |
8723 | done: |
8724 | if (dwp_start && dwp_finish_ctx) { |
		vm_page_delayed_work_finish_ctx(dwp_start);
8726 | dwp_start = dwp = NULL; |
8727 | } |
8728 | |
8729 | return kr; |
8730 | } |
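
/*
 * upl_abort:
 *
 * Convenience wrapper that aborts the entire UPL by calling
 * upl_abort_range() over [0, u_size).
 */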
8731 | |
8732 | |
8733 | kern_return_t |
8734 | upl_abort( |
8735 | upl_t upl, |
8736 | int error) |
8737 | { |
8738 | boolean_t empty; |
8739 | |
8740 | if (upl == UPL_NULL) { |
8741 | return KERN_INVALID_ARGUMENT; |
8742 | } |
8743 | |
	return upl_abort_range(upl, 0, upl->u_size, error, &empty);
8745 | } |
8746 | |
8747 | |
8748 | /* an option on commit should be wire */ |
8749 | kern_return_t |
8750 | upl_commit( |
8751 | upl_t upl, |
8752 | upl_page_info_t *page_list, |
8753 | mach_msg_type_number_t count) |
8754 | { |
8755 | boolean_t empty; |
8756 | |
8757 | if (upl == UPL_NULL) { |
8758 | return KERN_INVALID_ARGUMENT; |
8759 | } |
8760 | |
	return upl_commit_range(upl, 0, upl->u_size, 0,
	    page_list, count, &empty);
8763 | } |
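
/*
 * iopl_valid_data:
 *
 * Mark the absent (busy) pages of an IOPL's object as valid and wired:
 * each such page has its absent state cleared, is dirtied, wired and
 * woken up, and the object's and global wired-page counts are updated.
 */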
8764 | |
8765 | |
8766 | void |
8767 | iopl_valid_data( |
8768 | upl_t upl, |
8769 | vm_tag_t tag) |
8770 | { |
8771 | vm_object_t object; |
8772 | vm_offset_t offset; |
8773 | vm_page_t m, nxt_page = VM_PAGE_NULL; |
8774 | upl_size_t size; |
8775 | int wired_count = 0; |
8776 | |
8777 | if (upl == NULL) { |
8778 | panic("iopl_valid_data: NULL upl" ); |
8779 | } |
8780 | if (vector_upl_is_valid(upl)) { |
8781 | panic("iopl_valid_data: vector upl" ); |
8782 | } |
8783 | if ((upl->flags & (UPL_DEVICE_MEMORY | UPL_SHADOWED | UPL_ACCESS_BLOCKED | UPL_IO_WIRE | UPL_INTERNAL)) != UPL_IO_WIRE) { |
8784 | panic("iopl_valid_data: unsupported upl, flags = %x" , upl->flags); |
8785 | } |
8786 | |
8787 | object = upl->map_object; |
8788 | |
8789 | if (is_kernel_object(object) || object == compressor_object) { |
8790 | panic("iopl_valid_data: object == kernel or compressor" ); |
8791 | } |
8792 | |
8793 | if (object->purgable == VM_PURGABLE_VOLATILE || |
8794 | object->purgable == VM_PURGABLE_EMPTY) { |
8795 | panic("iopl_valid_data: object %p purgable %d" , |
8796 | object, object->purgable); |
8797 | } |
8798 | |
8799 | size = upl_adjusted_size(upl, PAGE_MASK); |
8800 | |
8801 | vm_object_lock(object); |
8802 | VM_OBJECT_WIRED_PAGE_UPDATE_START(object); |
8803 | |
8804 | bool whole_object; |
8805 | |
8806 | if (object->vo_size == size && object->resident_page_count == (size / PAGE_SIZE)) { |
8807 | nxt_page = (vm_page_t)vm_page_queue_first(&object->memq); |
8808 | whole_object = true; |
8809 | } else { |
8810 | offset = (vm_offset_t)(upl_adjusted_offset(upl, PAGE_MASK) - object->paging_offset); |
8811 | whole_object = false; |
8812 | } |
8813 | |
8814 | while (size) { |
8815 | if (whole_object) { |
8816 | if (nxt_page != VM_PAGE_NULL) { |
8817 | m = nxt_page; |
8818 | nxt_page = (vm_page_t)vm_page_queue_next(&nxt_page->vmp_listq); |
8819 | } |
8820 | } else { |
8821 | m = vm_page_lookup(object, offset); |
8822 | offset += PAGE_SIZE; |
8823 | |
8824 | if (m == VM_PAGE_NULL) { |
8825 | panic("iopl_valid_data: missing expected page at offset %lx" , (long)offset); |
8826 | } |
8827 | } |
8828 | if (m->vmp_busy) { |
8829 | if (!m->vmp_absent) { |
8830 | panic("iopl_valid_data: busy page w/o absent" ); |
8831 | } |
8832 | |
8833 | if (m->vmp_pageq.next || m->vmp_pageq.prev) { |
8834 | panic("iopl_valid_data: busy+absent page on page queue" ); |
8835 | } |
8836 | if (m->vmp_reusable) { |
8837 | panic("iopl_valid_data: %p is reusable" , m); |
8838 | } |
8839 | |
8840 | m->vmp_absent = FALSE; |
8841 | m->vmp_dirty = TRUE; |
8842 | assert(m->vmp_q_state == VM_PAGE_NOT_ON_Q); |
8843 | assert(m->vmp_wire_count == 0); |
8844 | m->vmp_wire_count++; |
8845 | assert(m->vmp_wire_count); |
8846 | if (m->vmp_wire_count == 1) { |
8847 | m->vmp_q_state = VM_PAGE_IS_WIRED; |
8848 | wired_count++; |
8849 | } else { |
8850 | panic("iopl_valid_data: %p already wired" , m); |
8851 | } |
8852 | |
8853 | PAGE_WAKEUP_DONE(m); |
8854 | } |
8855 | size -= PAGE_SIZE; |
8856 | } |
8857 | if (wired_count) { |
8858 | VM_OBJECT_WIRED_PAGE_COUNT(object, wired_count); |
8859 | assert(object->resident_page_count >= object->wired_page_count); |
8860 | |
8861 | /* no need to adjust purgeable accounting for this object: */ |
8862 | assert(object->purgable != VM_PURGABLE_VOLATILE); |
8863 | assert(object->purgable != VM_PURGABLE_EMPTY); |
8864 | |
8865 | vm_page_lockspin_queues(); |
8866 | vm_page_wire_count += wired_count; |
8867 | vm_page_unlock_queues(); |
8868 | } |
8869 | VM_OBJECT_WIRED_PAGE_UPDATE_END(object, tag); |
8870 | vm_object_unlock(object); |
8871 | } |
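
/*
 * vm_object_set_pmap_cache_attr:
 *
 * Propagate the object's WIMG cache attributes to the physical pages
 * described in the page list, unless the object uses the default setting.
 */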
8872 | |
8873 | |
8874 | void |
8875 | vm_object_set_pmap_cache_attr( |
8876 | vm_object_t object, |
8877 | upl_page_info_array_t user_page_list, |
8878 | unsigned int num_pages, |
8879 | boolean_t batch_pmap_op) |
8880 | { |
8881 | unsigned int cache_attr = 0; |
8882 | |
8883 | cache_attr = object->wimg_bits & VM_WIMG_MASK; |
8884 | assert(user_page_list); |
8885 | if (cache_attr != VM_WIMG_USE_DEFAULT) { |
8886 | PMAP_BATCH_SET_CACHE_ATTR(object, user_page_list, cache_attr, num_pages, batch_pmap_op); |
8887 | } |
8888 | } |
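
/*
 * vm_object_iopl_wire_full:
 *
 * Fast path used when every page of the object is already resident: walk
 * the object's page list, wire each page and record it in the UPL's lite
 * list and (optionally) the caller's page list.  Returns FALSE if any page
 * is in a state (busy, absent, error, cleaning, ...) that this path cannot
 * handle, in which case the caller falls back to the slow path.
 */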
8889 | |
8890 | |
8891 | static bool |
8892 | vm_object_iopl_wire_full( |
8893 | vm_object_t object, |
8894 | upl_t upl, |
8895 | upl_page_info_array_t user_page_list, |
8896 | upl_control_flags_t cntrl_flags, |
8897 | vm_tag_t tag) |
8898 | { |
8899 | vm_page_t dst_page; |
8900 | unsigned int entry; |
8901 | int page_count; |
8902 | int delayed_unlock = 0; |
8903 | boolean_t retval = TRUE; |
8904 | ppnum_t phys_page; |
8905 | |
8906 | vm_object_lock_assert_exclusive(object); |
8907 | assert(object->purgable != VM_PURGABLE_VOLATILE); |
8908 | assert(object->purgable != VM_PURGABLE_EMPTY); |
8909 | assert(object->pager == NULL); |
8910 | assert(object->vo_copy == NULL); |
8911 | assert(object->shadow == NULL); |
8912 | |
8913 | page_count = object->resident_page_count; |
8914 | dst_page = (vm_page_t)vm_page_queue_first(&object->memq); |
8915 | |
8916 | vm_page_lock_queues(); |
8917 | |
8918 | while (page_count--) { |
8919 | if (dst_page->vmp_busy || |
8920 | dst_page->vmp_fictitious || |
8921 | dst_page->vmp_absent || |
8922 | VMP_ERROR_GET(dst_page) || |
8923 | dst_page->vmp_cleaning || |
8924 | dst_page->vmp_restart || |
8925 | dst_page->vmp_laundry) { |
8926 | retval = FALSE; |
8927 | goto done; |
8928 | } |
8929 | if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) { |
8930 | retval = FALSE; |
8931 | goto done; |
8932 | } |
8933 | dst_page->vmp_reference = TRUE; |
8934 | |
		vm_page_wire(dst_page, tag, FALSE);
8936 | |
8937 | if (!(cntrl_flags & UPL_COPYOUT_FROM)) { |
8938 | SET_PAGE_DIRTY(dst_page, FALSE); |
8939 | } |
8940 | entry = (unsigned int)(dst_page->vmp_offset / PAGE_SIZE); |
8941 | assert(entry >= 0 && entry < object->resident_page_count); |
		bitmap_set(upl->lite_list, entry);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
8945 | |
8946 | if (phys_page > upl->highest_page) { |
8947 | upl->highest_page = phys_page; |
8948 | } |
8949 | |
8950 | if (user_page_list) { |
8951 | user_page_list[entry].phys_addr = phys_page; |
8952 | user_page_list[entry].absent = dst_page->vmp_absent; |
8953 | user_page_list[entry].dirty = dst_page->vmp_dirty; |
8954 | user_page_list[entry].free_when_done = dst_page->vmp_free_when_done; |
8955 | user_page_list[entry].precious = dst_page->vmp_precious; |
8956 | user_page_list[entry].device = FALSE; |
8957 | user_page_list[entry].speculative = FALSE; |
8958 | user_page_list[entry].cs_validated = FALSE; |
8959 | user_page_list[entry].cs_tainted = FALSE; |
8960 | user_page_list[entry].cs_nx = FALSE; |
8961 | user_page_list[entry].needed = FALSE; |
8962 | user_page_list[entry].mark = FALSE; |
8963 | } |
8964 | if (delayed_unlock++ > 256) { |
8965 | delayed_unlock = 0; |
			lck_mtx_yield(&vm_page_queue_lock);
8967 | |
8968 | VM_CHECK_MEMORYSTATUS; |
8969 | } |
8970 | dst_page = (vm_page_t)vm_page_queue_next(&dst_page->vmp_listq); |
8971 | } |
8972 | done: |
8973 | vm_page_unlock_queues(); |
8974 | |
8975 | VM_CHECK_MEMORYSTATUS; |
8976 | |
8977 | return retval; |
8978 | } |
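
/*
 * vm_object_iopl_wire_empty:
 *
 * Fast path used when the object has no resident pages: grab fresh pages
 * (zero-filled unless UPL_NOZEROFILL* is set), insert them into the object,
 * wire them and record them in the UPL.  Ledger updates for the newly
 * inserted pages are applied in one shot at the end.
 */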
8979 | |
8980 | |
8981 | static kern_return_t |
8982 | vm_object_iopl_wire_empty( |
8983 | vm_object_t object, |
8984 | upl_t upl, |
8985 | upl_page_info_array_t user_page_list, |
8986 | upl_control_flags_t cntrl_flags, |
8987 | vm_tag_t tag, |
8988 | vm_object_offset_t *dst_offset, |
8989 | int page_count, |
8990 | int *page_grab_count) |
8991 | { |
8992 | vm_page_t dst_page; |
8993 | boolean_t no_zero_fill = FALSE; |
8994 | int interruptible; |
8995 | int pages_wired = 0; |
8996 | int pages_inserted = 0; |
8997 | int entry = 0; |
8998 | uint64_t delayed_ledger_update = 0; |
8999 | kern_return_t ret = KERN_SUCCESS; |
9000 | int grab_options; |
9001 | ppnum_t phys_page; |
9002 | |
9003 | vm_object_lock_assert_exclusive(object); |
9004 | assert(object->purgable != VM_PURGABLE_VOLATILE); |
9005 | assert(object->purgable != VM_PURGABLE_EMPTY); |
9006 | assert(object->pager == NULL); |
9007 | assert(object->vo_copy == NULL); |
9008 | assert(object->shadow == NULL); |
9009 | |
9010 | if (cntrl_flags & UPL_SET_INTERRUPTIBLE) { |
9011 | interruptible = THREAD_ABORTSAFE; |
9012 | } else { |
9013 | interruptible = THREAD_UNINT; |
9014 | } |
9015 | |
9016 | if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) { |
9017 | no_zero_fill = TRUE; |
9018 | } |
9019 | |
9020 | grab_options = 0; |
9021 | #if CONFIG_SECLUDED_MEMORY |
9022 | if (object->can_grab_secluded) { |
9023 | grab_options |= VM_PAGE_GRAB_SECLUDED; |
9024 | } |
9025 | #endif /* CONFIG_SECLUDED_MEMORY */ |
9026 | |
9027 | while (page_count--) { |
		while ((dst_page = vm_page_grab_options(grab_options))
9029 | == VM_PAGE_NULL) { |
9030 | OSAddAtomic(page_count, &vm_upl_wait_for_pages); |
9031 | |
9032 | VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0); |
9033 | |
9034 | if (vm_page_wait(interruptible) == FALSE) { |
9035 | /* |
9036 | * interrupted case |
9037 | */ |
9038 | OSAddAtomic(-page_count, &vm_upl_wait_for_pages); |
9039 | |
9040 | VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1); |
9041 | |
9042 | ret = MACH_SEND_INTERRUPTED; |
9043 | goto done; |
9044 | } |
9045 | OSAddAtomic(-page_count, &vm_upl_wait_for_pages); |
9046 | |
9047 | VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0); |
9048 | } |
9049 | if (no_zero_fill == FALSE) { |
			vm_page_zero_fill(dst_page);
9051 | } else { |
9052 | dst_page->vmp_absent = TRUE; |
9053 | } |
9054 | |
9055 | dst_page->vmp_reference = TRUE; |
9056 | |
9057 | if (!(cntrl_flags & UPL_COPYOUT_FROM)) { |
9058 | SET_PAGE_DIRTY(dst_page, FALSE); |
9059 | } |
9060 | if (dst_page->vmp_absent == FALSE) { |
9061 | assert(dst_page->vmp_q_state == VM_PAGE_NOT_ON_Q); |
9062 | assert(dst_page->vmp_wire_count == 0); |
9063 | dst_page->vmp_wire_count++; |
9064 | dst_page->vmp_q_state = VM_PAGE_IS_WIRED; |
9065 | assert(dst_page->vmp_wire_count); |
9066 | pages_wired++; |
9067 | PAGE_WAKEUP_DONE(dst_page); |
9068 | } |
9069 | pages_inserted++; |
9070 | |
		vm_page_insert_internal(dst_page, object, *dst_offset, tag, FALSE, TRUE, TRUE, TRUE, &delayed_ledger_update);

		bitmap_set(upl->lite_list, entry);

		phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9076 | |
9077 | if (phys_page > upl->highest_page) { |
9078 | upl->highest_page = phys_page; |
9079 | } |
9080 | |
9081 | if (user_page_list) { |
9082 | user_page_list[entry].phys_addr = phys_page; |
9083 | user_page_list[entry].absent = dst_page->vmp_absent; |
9084 | user_page_list[entry].dirty = dst_page->vmp_dirty; |
9085 | user_page_list[entry].free_when_done = FALSE; |
9086 | user_page_list[entry].precious = FALSE; |
9087 | user_page_list[entry].device = FALSE; |
9088 | user_page_list[entry].speculative = FALSE; |
9089 | user_page_list[entry].cs_validated = FALSE; |
9090 | user_page_list[entry].cs_tainted = FALSE; |
9091 | user_page_list[entry].cs_nx = FALSE; |
9092 | user_page_list[entry].needed = FALSE; |
9093 | user_page_list[entry].mark = FALSE; |
9094 | } |
9095 | entry++; |
9096 | *dst_offset += PAGE_SIZE_64; |
9097 | } |
9098 | done: |
9099 | if (pages_wired) { |
9100 | vm_page_lockspin_queues(); |
9101 | vm_page_wire_count += pages_wired; |
9102 | vm_page_unlock_queues(); |
9103 | } |
9104 | if (pages_inserted) { |
9105 | if (object->internal) { |
9106 | OSAddAtomic(pages_inserted, &vm_page_internal_count); |
9107 | } else { |
9108 | OSAddAtomic(pages_inserted, &vm_page_external_count); |
9109 | } |
9110 | } |
9111 | if (delayed_ledger_update) { |
9112 | task_t owner; |
9113 | int ledger_idx_volatile; |
9114 | int ledger_idx_nonvolatile; |
9115 | int ledger_idx_volatile_compressed; |
9116 | int ledger_idx_nonvolatile_compressed; |
		boolean_t do_footprint;
9118 | |
9119 | owner = VM_OBJECT_OWNER(object); |
9120 | assert(owner); |
9121 | |
		vm_object_ledger_tag_ledgers(object,
		    &ledger_idx_volatile,
		    &ledger_idx_nonvolatile,
		    &ledger_idx_volatile_compressed,
		    &ledger_idx_nonvolatile_compressed,
		    &do_footprint);
9128 | |
9129 | /* more non-volatile bytes */ |
		ledger_credit(owner->ledger,
		    ledger_idx_nonvolatile,
		    delayed_ledger_update);
9133 | if (do_footprint) { |
9134 | /* more footprint */ |
			ledger_credit(owner->ledger,
			    task_ledgers.phys_footprint,
			    delayed_ledger_update);
9138 | } |
9139 | } |
9140 | |
9141 | assert(page_grab_count); |
9142 | *page_grab_count = pages_inserted; |
9143 | |
9144 | return ret; |
9145 | } |
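
/*
 * vm_object_iopl_request:
 *
 * Create an I/O UPL against the given object/offset/size: the covered pages
 * are faulted in if necessary, wired, and described in the returned UPL
 * (and optional page list) so they can be used for I/O.  Device
 * (phys_contiguous) objects are handled without touching any vm_page
 * structures.
 */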
9146 | |
9147 | |
9148 | |
9149 | kern_return_t |
9150 | vm_object_iopl_request( |
9151 | vm_object_t object, |
9152 | vm_object_offset_t offset, |
9153 | upl_size_t size, |
9154 | upl_t *upl_ptr, |
9155 | upl_page_info_array_t user_page_list, |
9156 | unsigned int *page_list_count, |
9157 | upl_control_flags_t cntrl_flags, |
9158 | vm_tag_t tag) |
9159 | { |
9160 | vm_page_t dst_page; |
9161 | vm_object_offset_t dst_offset; |
9162 | upl_size_t xfer_size; |
9163 | upl_t upl = NULL; |
9164 | unsigned int entry; |
9165 | int no_zero_fill = FALSE; |
9166 | unsigned int size_in_pages; |
9167 | int page_grab_count = 0; |
9168 | u_int32_t psize; |
9169 | kern_return_t ret; |
9170 | vm_prot_t prot; |
9171 | struct vm_object_fault_info fault_info = {}; |
9172 | struct vm_page_delayed_work dw_array; |
9173 | struct vm_page_delayed_work *dwp, *dwp_start; |
9174 | bool dwp_finish_ctx = TRUE; |
9175 | int dw_count; |
9176 | int dw_limit; |
9177 | int dw_index; |
9178 | boolean_t caller_lookup; |
9179 | int io_tracking_flag = 0; |
9180 | int interruptible; |
9181 | ppnum_t phys_page; |
9182 | |
9183 | boolean_t set_cache_attr_needed = FALSE; |
9184 | boolean_t free_wired_pages = FALSE; |
9185 | boolean_t fast_path_empty_req = FALSE; |
9186 | boolean_t fast_path_full_req = FALSE; |
9187 | |
9188 | #if DEVELOPMENT || DEBUG |
9189 | task_t task = current_task(); |
9190 | #endif /* DEVELOPMENT || DEBUG */ |
9191 | |
9192 | dwp_start = dwp = NULL; |
9193 | |
9194 | vm_object_offset_t original_offset = offset; |
9195 | upl_size_t original_size = size; |
9196 | |
9197 | // DEBUG4K_UPL("object %p offset 0x%llx size 0x%llx cntrl_flags 0x%llx\n", object, (uint64_t)offset, (uint64_t)size, cntrl_flags); |
9198 | |
9199 | size = (upl_size_t)(vm_object_round_page(offset + size) - vm_object_trunc_page(offset)); |
9200 | offset = vm_object_trunc_page(offset); |
9201 | if (size != original_size || offset != original_offset) { |
		DEBUG4K_IOKIT("flags 0x%llx object %p offset 0x%llx size 0x%x -> offset 0x%llx size 0x%x\n", cntrl_flags, object, original_offset, original_size, offset, size);
9203 | } |
9204 | |
9205 | if (cntrl_flags & ~UPL_VALID_FLAGS) { |
9206 | /* |
9207 | * For forward compatibility's sake, |
9208 | * reject any unknown flag. |
9209 | */ |
9210 | return KERN_INVALID_VALUE; |
9211 | } |
9212 | if (vm_lopage_needed == FALSE) { |
9213 | cntrl_flags &= ~UPL_NEED_32BIT_ADDR; |
9214 | } |
9215 | |
9216 | if (cntrl_flags & UPL_NEED_32BIT_ADDR) { |
9217 | if ((cntrl_flags & (UPL_SET_IO_WIRE | UPL_SET_LITE)) != (UPL_SET_IO_WIRE | UPL_SET_LITE)) { |
9218 | return KERN_INVALID_VALUE; |
9219 | } |
9220 | |
9221 | if (object->phys_contiguous) { |
9222 | if ((offset + object->vo_shadow_offset) >= (vm_object_offset_t)max_valid_dma_address) { |
9223 | return KERN_INVALID_ADDRESS; |
9224 | } |
9225 | |
9226 | if (((offset + object->vo_shadow_offset) + size) >= (vm_object_offset_t)max_valid_dma_address) { |
9227 | return KERN_INVALID_ADDRESS; |
9228 | } |
9229 | } |
9230 | } |
9231 | if (cntrl_flags & (UPL_NOZEROFILL | UPL_NOZEROFILLIO)) { |
9232 | no_zero_fill = TRUE; |
9233 | } |
9234 | |
9235 | if (cntrl_flags & UPL_COPYOUT_FROM) { |
9236 | prot = VM_PROT_READ; |
9237 | } else { |
9238 | prot = VM_PROT_READ | VM_PROT_WRITE; |
9239 | } |
9240 | |
9241 | if ((!object->internal) && (object->paging_offset != 0)) { |
9242 | panic("vm_object_iopl_request: external object with non-zero paging offset" ); |
9243 | } |
9244 | |
9245 | |
9246 | VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_START, size, cntrl_flags, prot, 0); |
9247 | |
9248 | #if CONFIG_IOSCHED || UPL_DEBUG |
9249 | if ((object->io_tracking && !is_kernel_object(object)) || upl_debug_enabled) { |
9250 | io_tracking_flag |= UPL_CREATE_IO_TRACKING; |
9251 | } |
9252 | #endif |
9253 | |
9254 | #if CONFIG_IOSCHED |
9255 | if (object->io_tracking) { |
9256 | /* Check if we're dealing with the kernel object. We do not support expedite on kernel object UPLs */ |
9257 | if (!is_kernel_object(object)) { |
9258 | io_tracking_flag |= UPL_CREATE_EXPEDITE_SUP; |
9259 | } |
9260 | } |
9261 | #endif |
9262 | |
9263 | if (object->phys_contiguous) { |
9264 | psize = PAGE_SIZE; |
9265 | } else { |
9266 | psize = size; |
9267 | |
9268 | dw_count = 0; |
9269 | dw_limit = DELAYED_WORK_LIMIT(DEFAULT_DELAYED_WORK_LIMIT); |
9270 | dwp_start = vm_page_delayed_work_get_ctx(); |
9271 | if (dwp_start == NULL) { |
9272 | dwp_start = &dw_array; |
9273 | dw_limit = 1; |
9274 | dwp_finish_ctx = FALSE; |
9275 | } |
9276 | |
9277 | dwp = dwp_start; |
9278 | } |
9279 | |
9280 | if (cntrl_flags & UPL_SET_INTERNAL) { |
		upl = upl_create(UPL_CREATE_INTERNAL | UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9282 | user_page_list = size ? upl->page_list : NULL; |
9283 | } else { |
		upl = upl_create(UPL_CREATE_LITE | io_tracking_flag, UPL_IO_WIRE, psize);
9285 | } |
9286 | if (user_page_list) { |
9287 | user_page_list[0].device = FALSE; |
9288 | } |
9289 | *upl_ptr = upl; |
9290 | |
9291 | if (cntrl_flags & UPL_NOZEROFILLIO) { |
9292 | DTRACE_VM4(upl_nozerofillio, |
9293 | vm_object_t, object, |
9294 | vm_object_offset_t, offset, |
9295 | upl_size_t, size, |
9296 | upl_t, upl); |
9297 | } |
9298 | |
9299 | upl->map_object = object; |
9300 | upl->u_offset = original_offset; |
9301 | upl->u_size = original_size; |
9302 | |
9303 | size_in_pages = size / PAGE_SIZE; |
9304 | |
9305 | if (is_kernel_object(object) && |
9306 | !(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS))) { |
9307 | upl->flags |= UPL_KERNEL_OBJECT; |
9308 | #if UPL_DEBUG |
9309 | vm_object_lock(object); |
9310 | #else |
9311 | vm_object_lock_shared(object); |
9312 | #endif |
9313 | } else { |
9314 | vm_object_lock(object); |
9315 | vm_object_activity_begin(object); |
9316 | } |
9317 | /* |
9318 | * paging in progress also protects the paging_offset |
9319 | */ |
9320 | upl->u_offset = original_offset + object->paging_offset; |
9321 | |
9322 | if (cntrl_flags & UPL_BLOCK_ACCESS) { |
9323 | /* |
9324 | * The user requested that access to the pages in this UPL |
		 * be blocked until the UPL is committed or aborted.
9326 | */ |
9327 | upl->flags |= UPL_ACCESS_BLOCKED; |
9328 | } |
9329 | |
9330 | #if CONFIG_IOSCHED || UPL_DEBUG |
9331 | if ((upl->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) { |
9332 | vm_object_activity_begin(object); |
9333 | queue_enter(&object->uplq, upl, upl_t, uplq); |
9334 | } |
9335 | #endif |
9336 | |
9337 | if (object->phys_contiguous) { |
9338 | if (upl->flags & UPL_ACCESS_BLOCKED) { |
9339 | assert(!object->blocked_access); |
9340 | object->blocked_access = TRUE; |
9341 | } |
9342 | |
9343 | vm_object_unlock(object); |
9344 | |
9345 | /* |
9346 | * don't need any shadow mappings for this one |
9347 | * since it is already I/O memory |
9348 | */ |
9349 | upl->flags |= UPL_DEVICE_MEMORY; |
9350 | |
9351 | upl->highest_page = (ppnum_t) ((offset + object->vo_shadow_offset + size - 1) >> PAGE_SHIFT); |
9352 | |
9353 | if (user_page_list) { |
9354 | user_page_list[0].phys_addr = (ppnum_t) ((offset + object->vo_shadow_offset) >> PAGE_SHIFT); |
9355 | user_page_list[0].device = TRUE; |
9356 | } |
9357 | if (page_list_count != NULL) { |
9358 | if (upl->flags & UPL_INTERNAL) { |
9359 | *page_list_count = 0; |
9360 | } else { |
9361 | *page_list_count = 1; |
9362 | } |
9363 | } |
9364 | |
9365 | VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0); |
9366 | #if DEVELOPMENT || DEBUG |
9367 | if (task != NULL) { |
9368 | ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count); |
9369 | } |
9370 | #endif /* DEVELOPMENT || DEBUG */ |
9371 | return KERN_SUCCESS; |
9372 | } |
9373 | if (!is_kernel_object(object) && object != compressor_object) { |
9374 | /* |
9375 | * Protect user space from future COW operations |
9376 | */ |
9377 | #if VM_OBJECT_TRACKING_OP_TRUESHARE |
9378 | if (!object->true_share && |
9379 | vm_object_tracking_btlog) { |
9380 | btlog_record(vm_object_tracking_btlog, object, |
9381 | VM_OBJECT_TRACKING_OP_TRUESHARE, |
9382 | btref_get(__builtin_frame_address(0), 0)); |
9383 | } |
9384 | #endif /* VM_OBJECT_TRACKING_OP_TRUESHARE */ |
9385 | |
9386 | vm_object_lock_assert_exclusive(object); |
9387 | VM_OBJECT_SET_TRUE_SHARE(object, TRUE); |
9388 | |
9389 | if (object->copy_strategy == MEMORY_OBJECT_COPY_SYMMETRIC) { |
9390 | object->copy_strategy = MEMORY_OBJECT_COPY_DELAY; |
9391 | } |
9392 | } |
9393 | |
9394 | if (!(cntrl_flags & UPL_COPYOUT_FROM) && |
9395 | object->vo_copy != VM_OBJECT_NULL) { |
9396 | /* |
9397 | * Honor copy-on-write obligations |
9398 | * |
9399 | * The caller is gathering these pages and |
9400 | * might modify their contents. We need to |
9401 | * make sure that the copy object has its own |
9402 | * private copies of these pages before we let |
9403 | * the caller modify them. |
9404 | * |
9405 | * NOTE: someone else could map the original object |
9406 | * after we've done this copy-on-write here, and they |
9407 | * could then see an inconsistent picture of the memory |
9408 | * while it's being modified via the UPL. To prevent this, |
9409 | * we would have to block access to these pages until the |
9410 | * UPL is released. We could use the UPL_BLOCK_ACCESS |
9411 | * code path for that... |
9412 | */ |
9413 | vm_object_update(object, |
9414 | offset, |
9415 | size, |
9416 | NULL, |
9417 | NULL, |
9418 | FALSE, /* should_return */ |
9419 | MEMORY_OBJECT_COPY_SYNC, |
9420 | VM_PROT_NO_CHANGE); |
9421 | VM_PAGEOUT_DEBUG(iopl_cow, 1); |
9422 | VM_PAGEOUT_DEBUG(iopl_cow_pages, (size >> PAGE_SHIFT)); |
9423 | } |
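	/*
	 * See if one of the two fast paths can be used: the object must be
	 * wholly covered by the request, unshadowed, pager-less and not
	 * volatile.  "Full" means every page is already resident; "empty"
	 * means none are.
	 */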
9424 | if (!(cntrl_flags & (UPL_NEED_32BIT_ADDR | UPL_BLOCK_ACCESS)) && |
9425 | object->purgable != VM_PURGABLE_VOLATILE && |
9426 | object->purgable != VM_PURGABLE_EMPTY && |
9427 | object->vo_copy == NULL && |
9428 | size == object->vo_size && |
9429 | offset == 0 && |
9430 | object->shadow == NULL && |
9431 | object->pager == NULL) { |
9432 | if (object->resident_page_count == size_in_pages) { |
9433 | assert(object != compressor_object); |
9434 | assert(!is_kernel_object(object)); |
9435 | fast_path_full_req = TRUE; |
9436 | } else if (object->resident_page_count == 0) { |
9437 | assert(object != compressor_object); |
9438 | assert(!is_kernel_object(object)); |
9439 | fast_path_empty_req = TRUE; |
9440 | set_cache_attr_needed = TRUE; |
9441 | } |
9442 | } |
9443 | |
9444 | if (cntrl_flags & UPL_SET_INTERRUPTIBLE) { |
9445 | interruptible = THREAD_ABORTSAFE; |
9446 | } else { |
9447 | interruptible = THREAD_UNINT; |
9448 | } |
9449 | |
9450 | entry = 0; |
9451 | |
9452 | xfer_size = size; |
9453 | dst_offset = offset; |
9454 | |
9455 | if (fast_path_full_req) { |
9456 | if (vm_object_iopl_wire_full(object, upl, user_page_list, cntrl_flags, tag) == TRUE) { |
9457 | goto finish; |
9458 | } |
9459 | /* |
9460 | * we couldn't complete the processing of this request on the fast path |
9461 | * so fall through to the slow path and finish up |
9462 | */ |
9463 | } else if (fast_path_empty_req) { |
9464 | if (cntrl_flags & UPL_REQUEST_NO_FAULT) { |
9465 | ret = KERN_MEMORY_ERROR; |
9466 | goto return_err; |
9467 | } |
9468 | ret = vm_object_iopl_wire_empty(object, upl, user_page_list, |
cntrl_flags, tag, &dst_offset, size_in_pages, &page_grab_count);
9470 | |
9471 | if (ret) { |
9472 | free_wired_pages = TRUE; |
9473 | goto return_err; |
9474 | } |
9475 | goto finish; |
9476 | } |
9477 | |
9478 | fault_info.behavior = VM_BEHAVIOR_SEQUENTIAL; |
9479 | fault_info.lo_offset = offset; |
9480 | fault_info.hi_offset = offset + xfer_size; |
9481 | fault_info.mark_zf_absent = TRUE; |
9482 | fault_info.interruptible = interruptible; |
9483 | fault_info.batch_pmap_op = TRUE; |
9484 | |
9485 | while (xfer_size) { |
9486 | vm_fault_return_t result; |
9487 | |
9488 | dwp->dw_mask = 0; |
9489 | |
9490 | if (fast_path_full_req) { |
9491 | /* |
9492 | * if we get here, it means that we ran into a page |
9493 | * state we couldn't handle in the fast path and |
9494 | * bailed out to the slow path... since the order |
9495 | * we look at pages is different between the 2 paths, |
9496 | * the following check is needed to determine whether |
9497 | * this page was already processed in the fast path |
9498 | */ |
if (bitmap_test(upl->lite_list, entry)) {
9500 | goto skip_page; |
9501 | } |
9502 | } |
dst_page = vm_page_lookup(object, dst_offset);
9504 | |
9505 | if (dst_page == VM_PAGE_NULL || |
9506 | dst_page->vmp_busy || |
9507 | VMP_ERROR_GET(dst_page) || |
9508 | dst_page->vmp_restart || |
9509 | dst_page->vmp_absent || |
9510 | dst_page->vmp_fictitious) { |
9511 | if (is_kernel_object(object)) { |
panic("vm_object_iopl_request: missing/bad page in kernel object");
}
if (object == compressor_object) {
panic("vm_object_iopl_request: missing/bad page in compressor object");
9516 | } |
9517 | |
9518 | if (cntrl_flags & UPL_REQUEST_NO_FAULT) { |
9519 | ret = KERN_MEMORY_ERROR; |
9520 | goto return_err; |
9521 | } |
9522 | set_cache_attr_needed = TRUE; |
9523 | |
9524 | /* |
9525 | * We just looked up the page and the result remains valid |
* until the object lock is released, so send it to
9527 | * vm_fault_page() (as "dst_page"), to avoid having to |
9528 | * look it up again there. |
9529 | */ |
9530 | caller_lookup = TRUE; |
9531 | |
9532 | do { |
9533 | vm_page_t top_page; |
9534 | kern_return_t error_code; |
9535 | |
9536 | fault_info.cluster_size = xfer_size; |
9537 | |
9538 | vm_object_paging_begin(object); |
9539 | |
result = vm_fault_page(object, dst_offset,
prot | VM_PROT_WRITE, FALSE,
caller_lookup,
&prot, &dst_page, &top_page,
(int *)0,
&error_code, no_zero_fill,
&fault_info);
9547 | |
9548 | /* our lookup is no longer valid at this point */ |
9549 | caller_lookup = FALSE; |
9550 | |
9551 | switch (result) { |
9552 | case VM_FAULT_SUCCESS: |
9553 | page_grab_count++; |
9554 | |
9555 | if (!dst_page->vmp_absent) { |
9556 | PAGE_WAKEUP_DONE(dst_page); |
9557 | } else { |
9558 | /* |
9559 | * we only get back an absent page if we |
9560 | * requested that it not be zero-filled |
9561 | * because we are about to fill it via I/O |
9562 | * |
9563 | * absent pages should be left BUSY |
9564 | * to prevent them from being faulted |
9565 | * into an address space before we've |
9566 | * had a chance to complete the I/O on |
9567 | * them since they may contain info that |
9568 | * shouldn't be seen by the faulting task |
9569 | */ |
9570 | } |
9571 | /* |
9572 | * Release paging references and |
9573 | * top-level placeholder page, if any. |
9574 | */ |
9575 | if (top_page != VM_PAGE_NULL) { |
9576 | vm_object_t local_object; |
9577 | |
9578 | local_object = VM_PAGE_OBJECT(top_page); |
9579 | |
9580 | /* |
9581 | * comparing 2 packed pointers |
9582 | */ |
9583 | if (top_page->vmp_object != dst_page->vmp_object) { |
9584 | vm_object_lock(local_object); |
9585 | VM_PAGE_FREE(top_page); |
9586 | vm_object_paging_end(local_object); |
9587 | vm_object_unlock(local_object); |
9588 | } else { |
9589 | VM_PAGE_FREE(top_page); |
9590 | vm_object_paging_end(local_object); |
9591 | } |
9592 | } |
9593 | vm_object_paging_end(object); |
9594 | break; |
9595 | |
9596 | case VM_FAULT_RETRY: |
9597 | vm_object_lock(object); |
9598 | break; |
9599 | |
9600 | case VM_FAULT_MEMORY_SHORTAGE: |
9601 | OSAddAtomic((size_in_pages - entry), &vm_upl_wait_for_pages); |
9602 | |
9603 | VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_START, vm_upl_wait_for_pages, 0, 0, 0); |
9604 | |
9605 | if (vm_page_wait(interruptible)) { |
9606 | OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages); |
9607 | |
9608 | VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, 0); |
9609 | vm_object_lock(object); |
9610 | |
9611 | break; |
9612 | } |
9613 | OSAddAtomic(-(size_in_pages - entry), &vm_upl_wait_for_pages); |
9614 | |
9615 | VM_DEBUG_EVENT(vm_iopl_page_wait, VM_IOPL_PAGE_WAIT, DBG_FUNC_END, vm_upl_wait_for_pages, 0, 0, -1); |
ktriage_record(thread_tid(current_thread()), KDBG_TRIAGE_EVENTID(KDBG_TRIAGE_SUBSYS_VM, KDBG_TRIAGE_RESERVED, KDBG_TRIAGE_VM_FAULT_OBJIOPLREQ_MEMORY_SHORTAGE), 0 /* arg */);
9617 | OS_FALLTHROUGH; |
9618 | |
9619 | case VM_FAULT_INTERRUPTED: |
9620 | error_code = MACH_SEND_INTERRUPTED; |
9621 | OS_FALLTHROUGH; |
9622 | case VM_FAULT_MEMORY_ERROR: |
9623 | memory_error: |
ret = (error_code ? error_code : KERN_MEMORY_ERROR);
9625 | |
9626 | vm_object_lock(object); |
9627 | goto return_err; |
9628 | |
9629 | case VM_FAULT_SUCCESS_NO_VM_PAGE: |
9630 | /* success but no page: fail */ |
9631 | vm_object_paging_end(object); |
9632 | vm_object_unlock(object); |
9633 | goto memory_error; |
9634 | |
9635 | default: |
panic("vm_object_iopl_request: unexpected error"
" 0x%x from vm_fault_page()\n", result);
9638 | } |
9639 | } while (result != VM_FAULT_SUCCESS); |
9640 | } |
phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9642 | |
9643 | if (upl->flags & UPL_KERNEL_OBJECT) { |
9644 | goto record_phys_addr; |
9645 | } |
9646 | |
9647 | if (dst_page->vmp_q_state == VM_PAGE_USED_BY_COMPRESSOR) { |
9648 | dst_page->vmp_busy = TRUE; |
9649 | goto record_phys_addr; |
9650 | } |
9651 | |
9652 | if (dst_page->vmp_cleaning) { |
9653 | /* |
9654 | * Someone else is cleaning this page in place. |
* In theory, we could proceed and use this page, but
* they will probably clear its "busy" bit in
* upl_commit_range() even though they did not set it,
* which would clear our "busy" bit and expose us to
* race conditions.
9660 | * We'd better wait for the cleaning to complete and |
9661 | * then try again. |
9662 | */ |
9663 | VM_PAGEOUT_DEBUG(vm_object_iopl_request_sleep_for_cleaning, 1); |
9664 | PAGE_SLEEP(object, dst_page, THREAD_UNINT); |
9665 | continue; |
9666 | } |
9667 | if (dst_page->vmp_laundry) { |
vm_pageout_steal_laundry(dst_page, FALSE);
9669 | } |
9670 | |
9671 | if ((cntrl_flags & UPL_NEED_32BIT_ADDR) && |
9672 | phys_page >= (max_valid_dma_address >> PAGE_SHIFT)) { |
9673 | vm_page_t low_page; |
9674 | int refmod; |
9675 | |
9676 | /* |
9677 | * support devices that can't DMA above 32 bits |
9678 | * by substituting pages from a pool of low address |
* memory for any pages we find above the 4G mark...
* we can't substitute if the page is already wired because
9681 | * we don't know whether that physical address has been |
9682 | * handed out to some other 64 bit capable DMA device to use |
9683 | */ |
9684 | if (VM_PAGE_WIRED(dst_page)) { |
9685 | ret = KERN_PROTECTION_FAILURE; |
9686 | goto return_err; |
9687 | } |
9688 | low_page = vm_page_grablo(); |
9689 | |
9690 | if (low_page == VM_PAGE_NULL) { |
9691 | ret = KERN_RESOURCE_SHORTAGE; |
9692 | goto return_err; |
9693 | } |
9694 | /* |
9695 | * from here until the vm_page_replace completes |
* we mustn't drop the object lock... we don't
9697 | * want anyone refaulting this page in and using |
9698 | * it after we disconnect it... we want the fault |
9699 | * to find the new page being substituted. |
9700 | */ |
9701 | if (dst_page->vmp_pmapped) { |
refmod = pmap_disconnect(phys_page);
9703 | } else { |
9704 | refmod = 0; |
9705 | } |
9706 | |
9707 | if (!dst_page->vmp_absent) { |
vm_page_copy(dst_page, low_page);
9709 | } |
9710 | |
9711 | low_page->vmp_reference = dst_page->vmp_reference; |
9712 | low_page->vmp_dirty = dst_page->vmp_dirty; |
9713 | low_page->vmp_absent = dst_page->vmp_absent; |
9714 | |
9715 | if (refmod & VM_MEM_REFERENCED) { |
9716 | low_page->vmp_reference = TRUE; |
9717 | } |
9718 | if (refmod & VM_MEM_MODIFIED) { |
9719 | SET_PAGE_DIRTY(low_page, FALSE); |
9720 | } |
9721 | |
vm_page_replace(low_page, object, dst_offset);
9723 | |
9724 | dst_page = low_page; |
9725 | /* |
9726 | * vm_page_grablo returned the page marked |
9727 | * BUSY... we don't need a PAGE_WAKEUP_DONE |
9728 | * here, because we've never dropped the object lock |
9729 | */ |
9730 | if (!dst_page->vmp_absent) { |
9731 | dst_page->vmp_busy = FALSE; |
9732 | } |
9733 | |
phys_page = VM_PAGE_GET_PHYS_PAGE(dst_page);
9735 | } |
9736 | if (!dst_page->vmp_busy) { |
9737 | dwp->dw_mask |= DW_vm_page_wire; |
9738 | } |
9739 | |
9740 | if (cntrl_flags & UPL_BLOCK_ACCESS) { |
9741 | /* |
9742 | * Mark the page "busy" to block any future page fault |
9743 | * on this page in addition to wiring it. |
9744 | * We'll also remove the mapping |
9745 | * of all these pages before leaving this routine. |
9746 | */ |
9747 | assert(!dst_page->vmp_fictitious); |
9748 | dst_page->vmp_busy = TRUE; |
9749 | } |
9750 | /* |
9751 | * expect the page to be used |
9752 | * page queues lock must be held to set 'reference' |
9753 | */ |
9754 | dwp->dw_mask |= DW_set_reference; |
9755 | |
9756 | if (!(cntrl_flags & UPL_COPYOUT_FROM)) { |
9757 | SET_PAGE_DIRTY(dst_page, TRUE); |
9758 | /* |
9759 | * Page belonging to a code-signed object is about to |
9760 | * be written. Mark it tainted and disconnect it from |
9761 | * all pmaps so processes have to fault it back in and |
9762 | * deal with the tainted bit. |
9763 | */ |
9764 | if (object->code_signed && dst_page->vmp_cs_tainted != VMP_CS_ALL_TRUE) { |
9765 | dst_page->vmp_cs_tainted = VMP_CS_ALL_TRUE; |
9766 | vm_page_iopl_tainted++; |
9767 | if (dst_page->vmp_pmapped) { |
int refmod = pmap_disconnect(VM_PAGE_GET_PHYS_PAGE(dst_page));
9769 | if (refmod & VM_MEM_REFERENCED) { |
9770 | dst_page->vmp_reference = TRUE; |
9771 | } |
9772 | } |
9773 | } |
9774 | } |
9775 | if ((cntrl_flags & UPL_REQUEST_FORCE_COHERENCY) && dst_page->vmp_written_by_kernel == TRUE) { |
pmap_sync_page_attributes_phys(phys_page);
9777 | dst_page->vmp_written_by_kernel = FALSE; |
9778 | } |
9779 | |
9780 | record_phys_addr: |
9781 | if (dst_page->vmp_busy) { |
9782 | upl->flags |= UPL_HAS_BUSY; |
9783 | } |
9784 | |
bitmap_set(upl->lite_list, entry);
9786 | |
9787 | if (phys_page > upl->highest_page) { |
9788 | upl->highest_page = phys_page; |
9789 | } |
9790 | |
9791 | if (user_page_list) { |
9792 | user_page_list[entry].phys_addr = phys_page; |
9793 | user_page_list[entry].free_when_done = dst_page->vmp_free_when_done; |
9794 | user_page_list[entry].absent = dst_page->vmp_absent; |
9795 | user_page_list[entry].dirty = dst_page->vmp_dirty; |
9796 | user_page_list[entry].precious = dst_page->vmp_precious; |
9797 | user_page_list[entry].device = FALSE; |
9798 | user_page_list[entry].needed = FALSE; |
9799 | if (dst_page->vmp_clustered == TRUE) { |
9800 | user_page_list[entry].speculative = (dst_page->vmp_q_state == VM_PAGE_ON_SPECULATIVE_Q) ? TRUE : FALSE; |
9801 | } else { |
9802 | user_page_list[entry].speculative = FALSE; |
9803 | } |
9804 | user_page_list[entry].cs_validated = dst_page->vmp_cs_validated; |
9805 | user_page_list[entry].cs_tainted = dst_page->vmp_cs_tainted; |
9806 | user_page_list[entry].cs_nx = dst_page->vmp_cs_nx; |
9807 | user_page_list[entry].mark = FALSE; |
9808 | } |
9809 | if (!is_kernel_object(object) && object != compressor_object) { |
9810 | /* |
9811 | * someone is explicitly grabbing this page... |
9812 | * update clustered and speculative state |
9813 | * |
9814 | */ |
9815 | if (dst_page->vmp_clustered) { |
9816 | VM_PAGE_CONSUME_CLUSTERED(dst_page); |
9817 | } |
9818 | } |
9819 | skip_page: |
9820 | entry++; |
9821 | dst_offset += PAGE_SIZE_64; |
9822 | xfer_size -= PAGE_SIZE; |
9823 | |
9824 | if (dwp->dw_mask) { |
9825 | VM_PAGE_ADD_DELAYED_WORK(dwp, dst_page, dw_count); |
9826 | |
9827 | if (dw_count >= dw_limit) { |
vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9829 | |
9830 | dwp = dwp_start; |
9831 | dw_count = 0; |
9832 | } |
9833 | } |
9834 | } |
9835 | assert(entry == size_in_pages); |
9836 | |
9837 | if (dw_count) { |
vm_page_do_delayed_work(object, tag, dwp_start, dw_count);
9839 | dwp = dwp_start; |
9840 | dw_count = 0; |
9841 | } |
9842 | finish: |
9843 | if (user_page_list && set_cache_attr_needed == TRUE) { |
vm_object_set_pmap_cache_attr(object, user_page_list, size_in_pages, TRUE);
9845 | } |
9846 | |
9847 | if (page_list_count != NULL) { |
9848 | if (upl->flags & UPL_INTERNAL) { |
9849 | *page_list_count = 0; |
9850 | } else if (*page_list_count > size_in_pages) { |
9851 | *page_list_count = size_in_pages; |
9852 | } |
9853 | } |
9854 | vm_object_unlock(object); |
9855 | |
9856 | if (cntrl_flags & UPL_BLOCK_ACCESS) { |
9857 | /* |
9858 | * We've marked all the pages "busy" so that future |
9859 | * page faults will block. |
9860 | * Now remove the mapping for these pages, so that they |
9861 | * can't be accessed without causing a page fault. |
9862 | */ |
vm_object_pmap_protect(object, offset, (vm_object_size_t)size,
PMAP_NULL,
PAGE_SIZE,
0, VM_PROT_NONE);
9867 | assert(!object->blocked_access); |
9868 | object->blocked_access = TRUE; |
9869 | } |
9870 | |
9871 | VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, KERN_SUCCESS, 0, 0); |
9872 | #if DEVELOPMENT || DEBUG |
9873 | if (task != NULL) { |
9874 | ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count); |
9875 | } |
9876 | #endif /* DEVELOPMENT || DEBUG */ |
9877 | |
9878 | if (dwp_start && dwp_finish_ctx) { |
vm_page_delayed_work_finish_ctx(dwp_start);
9880 | dwp_start = dwp = NULL; |
9881 | } |
9882 | |
9883 | return KERN_SUCCESS; |
9884 | |
9885 | return_err: |
9886 | dw_index = 0; |
9887 | |
9888 | for (; offset < dst_offset; offset += PAGE_SIZE) { |
9889 | boolean_t need_unwire; |
9890 | |
9891 | dst_page = vm_page_lookup(object, offset); |
9892 | |
9893 | if (dst_page == VM_PAGE_NULL) { |
panic("vm_object_iopl_request: Wired page missing.");
9895 | } |
9896 | |
9897 | /* |
9898 | * if we've already processed this page in an earlier |
9899 | * dw_do_work, we need to undo the wiring... we will |
9900 | * leave the dirty and reference bits on if they |
9901 | * were set, since we don't have a good way of knowing |
9902 | * what the previous state was and we won't get here |
9903 | * under any normal circumstances... we will always |
9904 | * clear BUSY and wakeup any waiters via vm_page_free |
9905 | * or PAGE_WAKEUP_DONE |
9906 | */ |
9907 | need_unwire = TRUE; |
9908 | |
9909 | if (dw_count) { |
9910 | if ((dwp_start)[dw_index].dw_m == dst_page) { |
9911 | /* |
9912 | * still in the deferred work list |
9913 | * which means we haven't yet called |
9914 | * vm_page_wire on this page |
9915 | */ |
9916 | need_unwire = FALSE; |
9917 | |
9918 | dw_index++; |
9919 | dw_count--; |
9920 | } |
9921 | } |
9922 | vm_page_lock_queues(); |
9923 | |
9924 | if (dst_page->vmp_absent || free_wired_pages == TRUE) { |
vm_page_free(dst_page);
9926 | |
9927 | need_unwire = FALSE; |
9928 | } else { |
9929 | if (need_unwire == TRUE) { |
vm_page_unwire(dst_page, TRUE);
9931 | } |
9932 | |
9933 | PAGE_WAKEUP_DONE(dst_page); |
9934 | } |
9935 | vm_page_unlock_queues(); |
9936 | |
9937 | if (need_unwire == TRUE) { |
9938 | counter_inc(&vm_statistics_reactivations); |
9939 | } |
9940 | } |
9941 | #if UPL_DEBUG |
9942 | upl->upl_state = 2; |
9943 | #endif |
9944 | if (!(upl->flags & UPL_KERNEL_OBJECT)) { |
9945 | vm_object_activity_end(object); |
vm_object_collapse(object, 0, TRUE);
9947 | } |
9948 | vm_object_unlock(object); |
9949 | upl_destroy(upl); |
9950 | |
9951 | VM_DEBUG_CONSTANT_EVENT(vm_object_iopl_request, VM_IOPL_REQUEST, DBG_FUNC_END, page_grab_count, ret, 0, 0); |
9952 | #if DEVELOPMENT || DEBUG |
9953 | if (task != NULL) { |
9954 | ledger_credit(task->ledger, task_ledgers.pages_grabbed_iopl, page_grab_count); |
9955 | } |
9956 | #endif /* DEVELOPMENT || DEBUG */ |
9957 | |
9958 | if (dwp_start && dwp_finish_ctx) { |
vm_page_delayed_work_finish_ctx(dwp_start);
9960 | dwp_start = dwp = NULL; |
9961 | } |
9962 | return ret; |
9963 | } |
9964 | |
9965 | kern_return_t |
9966 | upl_transpose( |
9967 | upl_t upl1, |
9968 | upl_t upl2) |
9969 | { |
9970 | kern_return_t retval; |
9971 | boolean_t upls_locked; |
9972 | vm_object_t object1, object2; |
9973 | |
9974 | /* LD: Should mapped UPLs be eligible for a transpose? */ |
9975 | if (upl1 == UPL_NULL || upl2 == UPL_NULL || upl1 == upl2 || ((upl1->flags & UPL_VECTOR) == UPL_VECTOR) || ((upl2->flags & UPL_VECTOR) == UPL_VECTOR)) { |
9976 | return KERN_INVALID_ARGUMENT; |
9977 | } |
9978 | |
9979 | upls_locked = FALSE; |
9980 | |
9981 | /* |
9982 | * Since we need to lock both UPLs at the same time, |
9983 | * avoid deadlocks by always taking locks in the same order. |
9984 | */ |
9985 | if (upl1 < upl2) { |
9986 | upl_lock(upl1); |
9987 | upl_lock(upl2); |
9988 | } else { |
9989 | upl_lock(upl2); |
9990 | upl_lock(upl1); |
9991 | } |
9992 | upls_locked = TRUE; /* the UPLs will need to be unlocked */ |
9993 | |
9994 | object1 = upl1->map_object; |
9995 | object2 = upl2->map_object; |
9996 | |
9997 | if (upl1->u_offset != 0 || upl2->u_offset != 0 || |
9998 | upl1->u_size != upl2->u_size) { |
9999 | /* |
10000 | * We deal only with full objects, not subsets. |
10001 | * That's because we exchange the entire backing store info |
10002 | * for the objects: pager, resident pages, etc... We can't do |
10003 | * only part of it. |
10004 | */ |
10005 | retval = KERN_INVALID_VALUE; |
10006 | goto done; |
10007 | } |
10008 | |
10009 | /* |
* Transpose the VM objects' backing store.
10011 | */ |
10012 | retval = vm_object_transpose(object1, object2, |
upl_adjusted_size(upl1, PAGE_MASK));
10014 | |
10015 | if (retval == KERN_SUCCESS) { |
10016 | /* |
10017 | * Make each UPL point to the correct VM object, i.e. the |
10018 | * object holding the pages that the UPL refers to... |
10019 | */ |
10020 | #if CONFIG_IOSCHED || UPL_DEBUG |
10021 | if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) { |
10022 | vm_object_lock(object1); |
10023 | vm_object_lock(object2); |
10024 | } |
10025 | if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) { |
10026 | queue_remove(&object1->uplq, upl1, upl_t, uplq); |
10027 | } |
10028 | if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) { |
10029 | queue_remove(&object2->uplq, upl2, upl_t, uplq); |
10030 | } |
10031 | #endif |
10032 | upl1->map_object = object2; |
10033 | upl2->map_object = object1; |
10034 | |
10035 | #if CONFIG_IOSCHED || UPL_DEBUG |
10036 | if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) { |
10037 | queue_enter(&object2->uplq, upl1, upl_t, uplq); |
10038 | } |
10039 | if ((upl2->flags & UPL_TRACKED_BY_OBJECT) || upl_debug_enabled) { |
10040 | queue_enter(&object1->uplq, upl2, upl_t, uplq); |
10041 | } |
10042 | if ((upl1->flags & UPL_TRACKED_BY_OBJECT) || (upl2->flags & UPL_TRACKED_BY_OBJECT)) { |
10043 | vm_object_unlock(object2); |
10044 | vm_object_unlock(object1); |
10045 | } |
10046 | #endif |
10047 | } |
10048 | |
10049 | done: |
10050 | /* |
10051 | * Cleanup. |
10052 | */ |
10053 | if (upls_locked) { |
10054 | upl_unlock(upl1); |
10055 | upl_unlock(upl2); |
10056 | upls_locked = FALSE; |
10057 | } |
10058 | |
10059 | return retval; |
10060 | } |
10061 | |
10062 | void |
10063 | upl_range_needed( |
10064 | upl_t upl, |
10065 | int index, |
10066 | int count) |
10067 | { |
10068 | int size_in_pages; |
10069 | |
10070 | if (!(upl->flags & UPL_INTERNAL) || count <= 0) { |
10071 | return; |
10072 | } |
10073 | |
10074 | size_in_pages = upl_adjusted_size(upl, PAGE_MASK) / PAGE_SIZE; |
10075 | |
10076 | while (count-- && index < size_in_pages) { |
10077 | upl->page_list[index++].needed = TRUE; |
10078 | } |
10079 | } |
10080 | |
10081 | |
10082 | /* |
10083 | * Reserve of virtual addresses in the kernel address space. |
10084 | * We need to map the physical pages in the kernel, so that we |
10085 | * can call the code-signing or slide routines with a kernel |
10086 | * virtual address. We keep this pool of pre-allocated kernel |
10087 | * virtual addresses so that we don't have to scan the kernel's |
* virtual address space each time we need to work with
10089 | * a physical page. |
10090 | */ |
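/*
 * Illustrative sketch of the pool layout (not additional kernel code;
 * the base address below is hypothetical): slot i of the pool covers
 * the kernel virtual range
 *     [vm_paging_base_address + i * PAGE_SIZE,
 *      vm_paging_base_address + (i + 1) * PAGE_SIZE)
 * so, with 16KB pages and a base of 0xffffff8000400000, slot 3 would
 * map 0xffffff800040c000..0xffffff800040ffff.  vm_paging_unmap_object()
 * recovers the slot index as
 *     i = (start - vm_paging_base_address) >> PAGE_SHIFT;
 * which matches the lookup loop in vm_paging_map_object() below.
 */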
10091 | SIMPLE_LOCK_DECLARE(vm_paging_lock, 0); |
10092 | #define VM_PAGING_NUM_PAGES 64 |
10093 | SECURITY_READ_ONLY_LATE(vm_offset_t) vm_paging_base_address = 0; |
10094 | bool vm_paging_page_inuse[VM_PAGING_NUM_PAGES] = { FALSE, }; |
10095 | int vm_paging_max_index = 0; |
10096 | int vm_paging_page_waiter = 0; |
10097 | int vm_paging_page_waiter_total = 0; |
10098 | |
10099 | unsigned long vm_paging_no_kernel_page = 0; |
10100 | unsigned long vm_paging_objects_mapped = 0; |
10101 | unsigned long vm_paging_pages_mapped = 0; |
10102 | unsigned long vm_paging_objects_mapped_slow = 0; |
10103 | unsigned long vm_paging_pages_mapped_slow = 0; |
10104 | |
10105 | __startup_func |
10106 | static void |
10107 | vm_paging_map_init(void) |
10108 | { |
kmem_alloc(kernel_map, &vm_paging_base_address,
ptoa(VM_PAGING_NUM_PAGES),
KMA_DATA | KMA_NOFAIL | KMA_KOBJECT | KMA_PERMANENT | KMA_PAGEABLE,
10112 | VM_KERN_MEMORY_NONE); |
10113 | } |
10114 | STARTUP(ZALLOC, STARTUP_RANK_LAST, vm_paging_map_init); |
10115 | |
10116 | /* |
10117 | * vm_paging_map_object: |
10118 | * Maps part of a VM object's pages in the kernel |
10119 | * virtual address space, using the pre-allocated |
10120 | * kernel virtual addresses, if possible. |
10121 | * Context: |
10122 | * The VM object is locked. This lock will get |
10123 | * dropped and re-acquired though, so the caller |
10124 | * must make sure the VM object is kept alive |
10125 | * (by holding a VM map that has a reference |
10126 | * on it, for example, or taking an extra reference). |
10127 | * The page should also be kept busy to prevent |
10128 | * it from being reclaimed. |
10129 | */ |
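/*
 * Minimal usage sketch (illustrative only, under the assumptions stated
 * above: the object is locked and referenced, the page is busy; "kva"
 * and "map_size" are hypothetical local names, not kernel API):
 *
 *	vm_map_size_t   map_size = PAGE_SIZE;
 *	vm_map_offset_t kva;
 *	boolean_t       need_unmap;
 *
 *	kr = vm_paging_map_object(page, object, offset, VM_PROT_READ,
 *	    FALSE, &map_size, &kva, &need_unmap);
 *	if (kr == KERN_SUCCESS) {
 *		... access the page contents through "kva" ...
 *		if (need_unmap) {
 *			vm_paging_unmap_object(object, kva, kva + map_size);
 *		}
 *	}
 */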
10130 | kern_return_t |
10131 | vm_paging_map_object( |
10132 | vm_page_t page, |
10133 | vm_object_t object, |
10134 | vm_object_offset_t offset, |
10135 | vm_prot_t protection, |
10136 | boolean_t can_unlock_object, |
10137 | vm_map_size_t *size, /* IN/OUT */ |
10138 | vm_map_offset_t *address, /* OUT */ |
10139 | boolean_t *need_unmap) /* OUT */ |
10140 | { |
10141 | kern_return_t kr; |
10142 | vm_map_offset_t page_map_offset; |
10143 | vm_map_size_t map_size; |
10144 | vm_object_offset_t object_offset; |
10145 | int i; |
10146 | |
10147 | if (page != VM_PAGE_NULL && *size == PAGE_SIZE) { |
10148 | /* use permanent 1-to-1 kernel mapping of physical memory ? */ |
10149 | *address = (vm_map_offset_t) |
phystokv((pmap_paddr_t)VM_PAGE_GET_PHYS_PAGE(page) << PAGE_SHIFT);
10151 | *need_unmap = FALSE; |
10152 | return KERN_SUCCESS; |
10153 | |
10154 | assert(page->vmp_busy); |
10155 | /* |
10156 | * Use one of the pre-allocated kernel virtual addresses |
10157 | * and just enter the VM page in the kernel address space |
10158 | * at that virtual address. |
10159 | */ |
10160 | simple_lock(&vm_paging_lock, &vm_pageout_lck_grp); |
10161 | |
10162 | /* |
10163 | * Try and find an available kernel virtual address |
10164 | * from our pre-allocated pool. |
10165 | */ |
10166 | page_map_offset = 0; |
10167 | for (;;) { |
10168 | for (i = 0; i < VM_PAGING_NUM_PAGES; i++) { |
10169 | if (vm_paging_page_inuse[i] == FALSE) { |
10170 | page_map_offset = |
10171 | vm_paging_base_address + |
10172 | (i * PAGE_SIZE); |
10173 | break; |
10174 | } |
10175 | } |
10176 | if (page_map_offset != 0) { |
10177 | /* found a space to map our page ! */ |
10178 | break; |
10179 | } |
10180 | |
10181 | if (can_unlock_object) { |
10182 | /* |
10183 | * If we can afford to unlock the VM object, |
10184 | * let's take the slow path now... |
10185 | */ |
10186 | break; |
10187 | } |
10188 | /* |
10189 | * We can't afford to unlock the VM object, so |
10190 | * let's wait for a space to become available... |
10191 | */ |
10192 | vm_paging_page_waiter_total++; |
10193 | vm_paging_page_waiter++; |
kr = assert_wait((event_t)&vm_paging_page_waiter, THREAD_UNINT);
10195 | if (kr == THREAD_WAITING) { |
10196 | simple_unlock(&vm_paging_lock); |
10197 | kr = thread_block(THREAD_CONTINUE_NULL); |
10198 | simple_lock(&vm_paging_lock, &vm_pageout_lck_grp); |
10199 | } |
10200 | vm_paging_page_waiter--; |
10201 | /* ... and try again */ |
10202 | } |
10203 | |
10204 | if (page_map_offset != 0) { |
10205 | /* |
10206 | * We found a kernel virtual address; |
10207 | * map the physical page to that virtual address. |
10208 | */ |
10209 | if (i > vm_paging_max_index) { |
10210 | vm_paging_max_index = i; |
10211 | } |
10212 | vm_paging_page_inuse[i] = TRUE; |
10213 | simple_unlock(&vm_paging_lock); |
10214 | |
10215 | page->vmp_pmapped = TRUE; |
10216 | |
10217 | /* |
10218 | * Keep the VM object locked over the PMAP_ENTER |
10219 | * and the actual use of the page by the kernel, |
10220 | * or this pmap mapping might get undone by a |
10221 | * vm_object_pmap_protect() call... |
10222 | */ |
kr = pmap_enter_check(kernel_pmap,
page_map_offset,
page,
protection,
VM_PROT_NONE,
0,
10229 | TRUE); |
10230 | assert(kr == KERN_SUCCESS); |
10231 | vm_paging_objects_mapped++; |
10232 | vm_paging_pages_mapped++; |
10233 | *address = page_map_offset; |
10234 | *need_unmap = TRUE; |
10235 | |
10236 | #if KASAN |
10237 | kasan_notify_address(page_map_offset, PAGE_SIZE); |
10238 | #endif |
10239 | |
10240 | /* all done and mapped, ready to use ! */ |
10241 | return KERN_SUCCESS; |
10242 | } |
10243 | |
10244 | /* |
10245 | * We ran out of pre-allocated kernel virtual |
10246 | * addresses. Just map the page in the kernel |
10247 | * the slow and regular way. |
10248 | */ |
10249 | vm_paging_no_kernel_page++; |
10250 | simple_unlock(&vm_paging_lock); |
10251 | } |
10252 | |
10253 | if (!can_unlock_object) { |
10254 | *address = 0; |
10255 | *size = 0; |
10256 | *need_unmap = FALSE; |
10257 | return KERN_NOT_SUPPORTED; |
10258 | } |
10259 | |
10260 | object_offset = vm_object_trunc_page(offset); |
10261 | map_size = vm_map_round_page(*size, |
10262 | VM_MAP_PAGE_MASK(kernel_map)); |
10263 | |
10264 | /* |
10265 | * Try and map the required range of the object |
10266 | * in the kernel_map. Given that allocation is |
10267 | * for pageable memory, it shouldn't contain |
10268 | * pointers and is mapped into the data range. |
10269 | */ |
10270 | |
10271 | vm_object_reference_locked(object); /* for the map entry */ |
10272 | vm_object_unlock(object); |
10273 | |
kr = vm_map_enter(kernel_map,
address,
map_size,
0,
VM_MAP_KERNEL_FLAGS_DATA_ANYWHERE(),
object,
object_offset,
FALSE,
protection,
10283 | VM_PROT_ALL, |
10284 | VM_INHERIT_NONE); |
10285 | if (kr != KERN_SUCCESS) { |
10286 | *address = 0; |
10287 | *size = 0; |
10288 | *need_unmap = FALSE; |
10289 | vm_object_deallocate(object); /* for the map entry */ |
10290 | vm_object_lock(object); |
10291 | return kr; |
10292 | } |
10293 | |
10294 | *size = map_size; |
10295 | |
10296 | /* |
10297 | * Enter the mapped pages in the page table now. |
10298 | */ |
10299 | vm_object_lock(object); |
10300 | /* |
10301 | * VM object must be kept locked from before PMAP_ENTER() |
10302 | * until after the kernel is done accessing the page(s). |
10303 | * Otherwise, the pmap mappings in the kernel could be |
10304 | * undone by a call to vm_object_pmap_protect(). |
10305 | */ |
10306 | |
10307 | for (page_map_offset = 0; |
10308 | map_size != 0; |
10309 | map_size -= PAGE_SIZE_64, page_map_offset += PAGE_SIZE_64) { |
page = vm_page_lookup(object, offset + page_map_offset);
10311 | if (page == VM_PAGE_NULL) { |
printf("vm_paging_map_object: no page !?");
10313 | vm_object_unlock(object); |
vm_map_remove(kernel_map, *address, *size);
10315 | *address = 0; |
10316 | *size = 0; |
10317 | *need_unmap = FALSE; |
10318 | vm_object_lock(object); |
10319 | return KERN_MEMORY_ERROR; |
10320 | } |
10321 | page->vmp_pmapped = TRUE; |
10322 | |
kr = pmap_enter_check(kernel_pmap,
*address + page_map_offset,
page,
protection,
VM_PROT_NONE,
0,
10329 | TRUE); |
10330 | assert(kr == KERN_SUCCESS); |
10331 | #if KASAN |
10332 | kasan_notify_address(*address + page_map_offset, PAGE_SIZE); |
10333 | #endif |
10334 | } |
10335 | |
10336 | vm_paging_objects_mapped_slow++; |
10337 | vm_paging_pages_mapped_slow += (unsigned long) (map_size / PAGE_SIZE_64); |
10338 | |
10339 | *need_unmap = TRUE; |
10340 | |
10341 | return KERN_SUCCESS; |
10342 | } |
10343 | |
10344 | /* |
10345 | * vm_paging_unmap_object: |
10346 | * Unmaps part of a VM object's pages from the kernel |
10347 | * virtual address space. |
10348 | * Context: |
10349 | * The VM object is locked. This lock will get |
10350 | * dropped and re-acquired though. |
10351 | */ |
10352 | void |
10353 | vm_paging_unmap_object( |
10354 | vm_object_t object, |
10355 | vm_map_offset_t start, |
10356 | vm_map_offset_t end) |
10357 | { |
10358 | int i; |
10359 | |
10360 | if ((vm_paging_base_address == 0) || |
10361 | (start < vm_paging_base_address) || |
10362 | (end > (vm_paging_base_address |
10363 | + (VM_PAGING_NUM_PAGES * PAGE_SIZE)))) { |
10364 | /* |
10365 | * We didn't use our pre-allocated pool of |
* kernel virtual addresses. Deallocate the
10367 | * virtual memory. |
10368 | */ |
10369 | if (object != VM_OBJECT_NULL) { |
10370 | vm_object_unlock(object); |
10371 | } |
vm_map_remove(kernel_map, start, end);
10373 | if (object != VM_OBJECT_NULL) { |
10374 | vm_object_lock(object); |
10375 | } |
10376 | } else { |
10377 | /* |
10378 | * We used a kernel virtual address from our |
10379 | * pre-allocated pool. Put it back in the pool |
10380 | * for next time. |
10381 | */ |
10382 | assert(end - start == PAGE_SIZE); |
10383 | i = (int) ((start - vm_paging_base_address) >> PAGE_SHIFT); |
10384 | assert(i >= 0 && i < VM_PAGING_NUM_PAGES); |
10385 | |
10386 | /* undo the pmap mapping */ |
pmap_remove(kernel_pmap, start, end);
10388 | |
10389 | simple_lock(&vm_paging_lock, &vm_pageout_lck_grp); |
10390 | vm_paging_page_inuse[i] = FALSE; |
10391 | if (vm_paging_page_waiter) { |
10392 | thread_wakeup(&vm_paging_page_waiter); |
10393 | } |
10394 | simple_unlock(&vm_paging_lock); |
10395 | } |
10396 | } |
10397 | |
10398 | |
10399 | /* |
10400 | * page->vmp_object must be locked |
10401 | */ |
10402 | void |
10403 | vm_pageout_steal_laundry(vm_page_t page, boolean_t queues_locked) |
10404 | { |
10405 | if (!queues_locked) { |
10406 | vm_page_lockspin_queues(); |
10407 | } |
10408 | |
10409 | page->vmp_free_when_done = FALSE; |
10410 | /* |
10411 | * need to drop the laundry count... |
10412 | * we may also need to remove it |
10413 | * from the I/O paging queue... |
10414 | * vm_pageout_throttle_up handles both cases |
10415 | * |
10416 | * the laundry and pageout_queue flags are cleared... |
10417 | */ |
vm_pageout_throttle_up(page);
10419 | |
10420 | if (!queues_locked) { |
10421 | vm_page_unlock_queues(); |
10422 | } |
10423 | } |
10424 | |
10425 | #define VECTOR_UPL_ELEMENTS_UPPER_LIMIT 64 |
10426 | |
10427 | upl_t |
10428 | vector_upl_create(vm_offset_t upl_offset, uint32_t max_upls) |
10429 | { |
10430 | int i = 0; |
10431 | upl_t upl; |
10432 | |
10433 | assert(max_upls > 0); |
10434 | if (max_upls == 0) { |
10435 | return NULL; |
10436 | } |
10437 | |
10438 | if (max_upls > VECTOR_UPL_ELEMENTS_UPPER_LIMIT) { |
10439 | max_upls = VECTOR_UPL_ELEMENTS_UPPER_LIMIT; |
10440 | } |
10441 | vector_upl_t vector_upl = kalloc_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, Z_WAITOK | Z_NOFAIL); |
10442 | |
upl = upl_create(0, UPL_VECTOR, 0);
10444 | upl->vector_upl = vector_upl; |
10445 | upl->u_offset = upl_offset; |
10446 | vector_upl->size = 0; |
10447 | vector_upl->offset = upl_offset; |
10448 | vector_upl->invalid_upls = 0; |
10449 | vector_upl->num_upls = 0; |
10450 | vector_upl->pagelist = NULL; |
10451 | vector_upl->max_upls = max_upls; |
10452 | |
10453 | for (i = 0; i < max_upls; i++) { |
10454 | vector_upl->upls[i].iostate.size = 0; |
10455 | vector_upl->upls[i].iostate.offset = 0; |
10456 | } |
10457 | return upl; |
10458 | } |
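/*
 * Illustrative sketch of how a caller might assemble a vectored UPL
 * (the names "io_upl", "sub_upl", "io_size" and "cur_offset" are
 * hypothetical; the cluster I/O layer is the intended real caller).
 * It only shows the expected ordering of the vector_upl_* calls
 * defined below:
 *
 *	upl_t io_upl = vector_upl_create(0, 8);
 *	for each sub-request {
 *		... build "sub_upl" covering "io_size" bytes ...
 *		vector_upl_set_subupl(io_upl, sub_upl, io_size);
 *		vector_upl_set_iostate(io_upl, sub_upl, cur_offset, io_size);
 *	}
 *	vector_upl_set_pagelist(io_upl);   * after all sub-UPLs are added *
 */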
10459 | |
10460 | uint32_t |
10461 | vector_upl_max_upls(const upl_t upl) |
10462 | { |
10463 | if (!vector_upl_is_valid(upl)) { |
10464 | return 0; |
10465 | } |
10466 | return ((vector_upl_t)(upl->vector_upl))->max_upls; |
10467 | } |
10468 | |
10469 | void |
10470 | vector_upl_deallocate(upl_t upl) |
10471 | { |
10472 | vector_upl_t vector_upl = upl->vector_upl; |
10473 | |
10474 | assert(vector_upl_is_valid(upl)); |
10475 | |
10476 | if (vector_upl->invalid_upls != vector_upl->num_upls) { |
panic("Deallocating non-empty Vectored UPL");
10478 | } |
10479 | uint32_t max_upls = vector_upl->max_upls; |
10480 | kfree_type(struct upl_page_info, atop(vector_upl->size), vector_upl->pagelist); |
10481 | kfree_type(struct _vector_upl, typeof(vector_upl->upls[0]), max_upls, vector_upl); |
10482 | upl->vector_upl = NULL; |
10483 | } |
10484 | |
10485 | boolean_t |
10486 | vector_upl_is_valid(upl_t upl) |
10487 | { |
10488 | return upl && (upl->flags & UPL_VECTOR) && upl->vector_upl; |
10489 | } |
10490 | |
10491 | boolean_t |
10492 | vector_upl_set_subupl(upl_t upl, upl_t subupl, uint32_t io_size) |
10493 | { |
10494 | if (vector_upl_is_valid(upl)) { |
10495 | vector_upl_t vector_upl = upl->vector_upl; |
10496 | |
10497 | if (vector_upl) { |
10498 | if (subupl) { |
10499 | if (io_size) { |
10500 | if (io_size < PAGE_SIZE) { |
10501 | io_size = PAGE_SIZE; |
10502 | } |
10503 | subupl->vector_upl = (void*)vector_upl; |
10504 | vector_upl->upls[vector_upl->num_upls++].elem = subupl; |
10505 | vector_upl->size += io_size; |
10506 | upl->u_size += io_size; |
10507 | } else { |
10508 | uint32_t i = 0, invalid_upls = 0; |
10509 | for (i = 0; i < vector_upl->num_upls; i++) { |
10510 | if (vector_upl->upls[i].elem == subupl) { |
10511 | break; |
10512 | } |
10513 | } |
10514 | if (i == vector_upl->num_upls) { |
panic("Trying to remove sub-upl when none exists");
10516 | } |
10517 | |
10518 | vector_upl->upls[i].elem = NULL; |
10519 | invalid_upls = os_atomic_inc(&(vector_upl)->invalid_upls, |
10520 | relaxed); |
10521 | if (invalid_upls == vector_upl->num_upls) { |
10522 | return TRUE; |
10523 | } else { |
10524 | return FALSE; |
10525 | } |
10526 | } |
10527 | } else { |
panic("vector_upl_set_subupl was passed a NULL upl element");
}
} else {
panic("vector_upl_set_subupl was passed a non-vectored upl");
}
} else {
panic("vector_upl_set_subupl was passed a NULL upl");
10535 | } |
10536 | |
10537 | return FALSE; |
10538 | } |
10539 | |
10540 | void |
10541 | vector_upl_set_pagelist(upl_t upl) |
10542 | { |
10543 | if (vector_upl_is_valid(upl)) { |
10544 | uint32_t i = 0; |
10545 | vector_upl_t vector_upl = upl->vector_upl; |
10546 | |
10547 | if (vector_upl) { |
10548 | vm_offset_t pagelist_size = 0, cur_upl_pagelist_size = 0; |
10549 | |
10550 | vector_upl->pagelist = kalloc_type(struct upl_page_info, |
10551 | atop(vector_upl->size), Z_WAITOK); |
10552 | |
10553 | for (i = 0; i < vector_upl->num_upls; i++) { |
cur_upl_pagelist_size = sizeof(struct upl_page_info) * upl_adjusted_size(vector_upl->upls[i].elem, PAGE_MASK) / PAGE_SIZE;
bcopy(vector_upl->upls[i].elem->page_list, (char*)vector_upl->pagelist + pagelist_size, cur_upl_pagelist_size);
10556 | pagelist_size += cur_upl_pagelist_size; |
10557 | if (vector_upl->upls[i].elem->highest_page > upl->highest_page) { |
10558 | upl->highest_page = vector_upl->upls[i].elem->highest_page; |
10559 | } |
10560 | } |
10561 | assert( pagelist_size == (sizeof(struct upl_page_info) * (vector_upl->size / PAGE_SIZE))); |
10562 | } else { |
panic("vector_upl_set_pagelist was passed a non-vectored upl");
}
} else {
panic("vector_upl_set_pagelist was passed a NULL upl");
10567 | } |
10568 | } |
10569 | |
10570 | upl_t |
10571 | vector_upl_subupl_byindex(upl_t upl, uint32_t index) |
10572 | { |
10573 | if (vector_upl_is_valid(upl)) { |
10574 | vector_upl_t vector_upl = upl->vector_upl; |
10575 | if (vector_upl) { |
10576 | if (index < vector_upl->num_upls) { |
10577 | return vector_upl->upls[index].elem; |
10578 | } |
10579 | } else { |
panic("vector_upl_subupl_byindex was passed a non-vectored upl");
10581 | } |
10582 | } |
10583 | return NULL; |
10584 | } |
10585 | |
10586 | upl_t |
10587 | vector_upl_subupl_byoffset(upl_t upl, upl_offset_t *upl_offset, upl_size_t *upl_size) |
10588 | { |
10589 | if (vector_upl_is_valid(upl)) { |
10590 | uint32_t i = 0; |
10591 | vector_upl_t vector_upl = upl->vector_upl; |
10592 | |
10593 | if (vector_upl) { |
10594 | upl_t subupl = NULL; |
10595 | vector_upl_iostates_t subupl_state; |
10596 | |
10597 | for (i = 0; i < vector_upl->num_upls; i++) { |
10598 | subupl = vector_upl->upls[i].elem; |
10599 | subupl_state = vector_upl->upls[i].iostate; |
10600 | if (*upl_offset <= (subupl_state.offset + subupl_state.size - 1)) { |
10601 | /* We could have been passed an offset/size pair that belongs |
* to a UPL element that has already been committed/aborted.
10603 | * If so, return NULL. |
10604 | */ |
10605 | if (subupl == NULL) { |
10606 | return NULL; |
10607 | } |
10608 | if ((subupl_state.offset + subupl_state.size) < (*upl_offset + *upl_size)) { |
10609 | *upl_size = (subupl_state.offset + subupl_state.size) - *upl_offset; |
10610 | if (*upl_size > subupl_state.size) { |
10611 | *upl_size = subupl_state.size; |
10612 | } |
10613 | } |
10614 | if (*upl_offset >= subupl_state.offset) { |
10615 | *upl_offset -= subupl_state.offset; |
10616 | } else if (i) { |
panic("Vector UPL offset miscalculation");
10618 | } |
10619 | return subupl; |
10620 | } |
10621 | } |
10622 | } else { |
panic("vector_upl_subupl_byoffset was passed a non-vectored UPL");
10624 | } |
10625 | } |
10626 | return NULL; |
10627 | } |
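/*
 * Worked example for vector_upl_subupl_byoffset() (hypothetical numbers):
 * with two sub-UPLs whose iostates are {offset 0x0000, size 0x4000} and
 * {offset 0x4000, size 0x4000}, a request with *upl_offset == 0x5000
 * falls within the second entry, so the function returns that sub-UPL,
 * rewrites *upl_offset to 0x1000 (0x5000 - 0x4000), and clamps *upl_size
 * so it does not run past the end of that sub-UPL's iostate.
 */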
10628 | |
10629 | void |
10630 | vector_upl_get_submap(upl_t upl, vm_map_t *v_upl_submap, vm_offset_t *submap_dst_addr) |
10631 | { |
10632 | *v_upl_submap = NULL; |
10633 | |
10634 | if (vector_upl_is_valid(upl)) { |
10635 | vector_upl_t vector_upl = upl->vector_upl; |
10636 | if (vector_upl) { |
10637 | *v_upl_submap = vector_upl->submap; |
10638 | *submap_dst_addr = vector_upl->submap_dst_addr; |
10639 | } else { |
panic("vector_upl_get_submap was passed a non-vectored UPL");
}
} else {
panic("vector_upl_get_submap was passed a NULL UPL");
10644 | } |
10645 | } |
10646 | |
10647 | void |
10648 | vector_upl_set_submap(upl_t upl, vm_map_t submap, vm_offset_t submap_dst_addr) |
10649 | { |
10650 | if (vector_upl_is_valid(upl)) { |
10651 | vector_upl_t vector_upl = upl->vector_upl; |
10652 | if (vector_upl) { |
10653 | vector_upl->submap = submap; |
10654 | vector_upl->submap_dst_addr = submap_dst_addr; |
10655 | } else { |
panic("vector_upl_set_submap was passed a non-vectored UPL");
}
} else {
panic("vector_upl_set_submap was passed a NULL UPL");
10660 | } |
10661 | } |
10662 | |
10663 | void |
10664 | vector_upl_set_iostate(upl_t upl, upl_t subupl, upl_offset_t offset, upl_size_t size) |
10665 | { |
10666 | if (vector_upl_is_valid(upl)) { |
10667 | uint32_t i = 0; |
10668 | vector_upl_t vector_upl = upl->vector_upl; |
10669 | |
10670 | if (vector_upl) { |
10671 | for (i = 0; i < vector_upl->num_upls; i++) { |
10672 | if (vector_upl->upls[i].elem == subupl) { |
10673 | break; |
10674 | } |
10675 | } |
10676 | |
10677 | if (i == vector_upl->num_upls) { |
panic("setting sub-upl iostate when none exists");
10679 | } |
10680 | |
10681 | vector_upl->upls[i].iostate.offset = offset; |
10682 | if (size < PAGE_SIZE) { |
10683 | size = PAGE_SIZE; |
10684 | } |
10685 | vector_upl->upls[i].iostate.size = size; |
10686 | } else { |
panic("vector_upl_set_iostate was passed a non-vectored UPL");
}
} else {
panic("vector_upl_set_iostate was passed a NULL UPL");
10691 | } |
10692 | } |
10693 | |
10694 | void |
10695 | vector_upl_get_iostate(upl_t upl, upl_t subupl, upl_offset_t *offset, upl_size_t *size) |
10696 | { |
10697 | if (vector_upl_is_valid(upl)) { |
10698 | uint32_t i = 0; |
10699 | vector_upl_t vector_upl = upl->vector_upl; |
10700 | |
10701 | if (vector_upl) { |
10702 | for (i = 0; i < vector_upl->num_upls; i++) { |
10703 | if (vector_upl->upls[i].elem == subupl) { |
10704 | break; |
10705 | } |
10706 | } |
10707 | |
10708 | if (i == vector_upl->num_upls) { |
panic("getting sub-upl iostate when none exists");
10710 | } |
10711 | |
10712 | *offset = vector_upl->upls[i].iostate.offset; |
10713 | *size = vector_upl->upls[i].iostate.size; |
10714 | } else { |
panic("vector_upl_get_iostate was passed a non-vectored UPL");
}
} else {
panic("vector_upl_get_iostate was passed a NULL UPL");
10719 | } |
10720 | } |
10721 | |
10722 | void |
10723 | vector_upl_get_iostate_byindex(upl_t upl, uint32_t index, upl_offset_t *offset, upl_size_t *size) |
10724 | { |
10725 | if (vector_upl_is_valid(upl)) { |
10726 | vector_upl_t vector_upl = upl->vector_upl; |
10727 | if (vector_upl) { |
10728 | if (index < vector_upl->num_upls) { |
10729 | *offset = vector_upl->upls[index].iostate.offset; |
10730 | *size = vector_upl->upls[index].iostate.size; |
10731 | } else { |
10732 | *offset = *size = 0; |
10733 | } |
10734 | } else { |
panic("vector_upl_get_iostate_byindex was passed a non-vectored UPL");
}
} else {
panic("vector_upl_get_iostate_byindex was passed a NULL UPL");
10739 | } |
10740 | } |
10741 | |
10742 | void * |
10743 | upl_get_internal_vectorupl(upl_t upl) |
10744 | { |
10745 | return upl->vector_upl; |
10746 | } |
10747 | |
10748 | upl_page_info_t * |
10749 | upl_get_internal_vectorupl_pagelist(upl_t upl) |
10750 | { |
10751 | return upl->vector_upl->pagelist; |
10752 | } |
10753 | |
10754 | upl_page_info_t * |
10755 | upl_get_internal_page_list(upl_t upl) |
10756 | { |
10757 | return upl->vector_upl ? upl->vector_upl->pagelist : upl->page_list; |
10758 | } |
10759 | |
10760 | void |
10761 | upl_clear_dirty( |
10762 | upl_t upl, |
10763 | boolean_t value) |
10764 | { |
10765 | if (value) { |
10766 | upl->flags |= UPL_CLEAR_DIRTY; |
10767 | } else { |
10768 | upl->flags &= ~UPL_CLEAR_DIRTY; |
10769 | } |
10770 | } |
10771 | |
10772 | void |
10773 | upl_set_referenced( |
10774 | upl_t upl, |
10775 | boolean_t value) |
10776 | { |
10777 | upl_lock(upl); |
10778 | if (value) { |
10779 | upl->ext_ref_count++; |
10780 | } else { |
10781 | if (!upl->ext_ref_count) { |
panic("upl_set_referenced not %p", upl);
10783 | } |
10784 | upl->ext_ref_count--; |
10785 | } |
10786 | upl_unlock(upl); |
10787 | } |
10788 | |
10789 | #if CONFIG_IOSCHED |
10790 | void |
10791 | upl_set_blkno( |
10792 | upl_t upl, |
10793 | vm_offset_t upl_offset, |
10794 | int io_size, |
10795 | int64_t blkno) |
10796 | { |
10797 | int i, j; |
10798 | if ((upl->flags & UPL_EXPEDITE_SUPPORTED) == 0) { |
10799 | return; |
10800 | } |
10801 | |
10802 | assert(upl->upl_reprio_info != 0); |
10803 | for (i = (int)(upl_offset / PAGE_SIZE), j = 0; j < io_size; i++, j += PAGE_SIZE) { |
10804 | UPL_SET_REPRIO_INFO(upl, i, blkno, io_size); |
10805 | } |
10806 | } |
10807 | #endif |
10808 | |
10809 | void inline |
10810 | memoryshot(unsigned int event, unsigned int control) |
10811 | { |
10812 | if (vm_debug_events) { |
10813 | KERNEL_DEBUG_CONSTANT1((MACHDBG_CODE(DBG_MACH_VM_PRESSURE, event)) | control, |
10814 | vm_page_active_count, vm_page_inactive_count, |
10815 | vm_page_free_count, vm_page_speculative_count, |
10816 | vm_page_throttled_count); |
10817 | } else { |
10818 | (void) event; |
10819 | (void) control; |
10820 | } |
10821 | } |
10822 | |
10823 | #ifdef MACH_BSD |
10824 | |
10825 | boolean_t |
10826 | upl_device_page(upl_page_info_t *upl) |
10827 | { |
10828 | return UPL_DEVICE_PAGE(upl); |
10829 | } |
10830 | boolean_t |
10831 | upl_page_present(upl_page_info_t *upl, int index) |
10832 | { |
10833 | return UPL_PAGE_PRESENT(upl, index); |
10834 | } |
10835 | boolean_t |
10836 | upl_speculative_page(upl_page_info_t *upl, int index) |
10837 | { |
10838 | return UPL_SPECULATIVE_PAGE(upl, index); |
10839 | } |
10840 | boolean_t |
10841 | upl_dirty_page(upl_page_info_t *upl, int index) |
10842 | { |
10843 | return UPL_DIRTY_PAGE(upl, index); |
10844 | } |
10845 | boolean_t |
10846 | upl_valid_page(upl_page_info_t *upl, int index) |
10847 | { |
10848 | return UPL_VALID_PAGE(upl, index); |
10849 | } |
10850 | ppnum_t |
10851 | upl_phys_page(upl_page_info_t *upl, int index) |
10852 | { |
10853 | return UPL_PHYS_PAGE(upl, index); |
10854 | } |
10855 | |
10856 | void |
10857 | upl_page_set_mark(upl_page_info_t *upl, int index, boolean_t v) |
10858 | { |
10859 | upl[index].mark = v; |
10860 | } |
10861 | |
10862 | boolean_t |
10863 | upl_page_get_mark(upl_page_info_t *upl, int index) |
10864 | { |
10865 | return upl[index].mark; |
10866 | } |
10867 | |
10868 | void |
10869 | vm_countdirtypages(void) |
10870 | { |
10871 | vm_page_t m; |
10872 | int dpages; |
10873 | int pgopages; |
10874 | int precpages; |
10875 | |
10876 | |
10877 | dpages = 0; |
10878 | pgopages = 0; |
10879 | precpages = 0; |
10880 | |
10881 | vm_page_lock_queues(); |
10882 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_inactive); |
10883 | do { |
10884 | if (m == (vm_page_t)0) { |
10885 | break; |
10886 | } |
10887 | |
10888 | if (m->vmp_dirty) { |
10889 | dpages++; |
10890 | } |
10891 | if (m->vmp_free_when_done) { |
10892 | pgopages++; |
10893 | } |
10894 | if (m->vmp_precious) { |
10895 | precpages++; |
10896 | } |
10897 | |
10898 | assert(!is_kernel_object(VM_PAGE_OBJECT(m))); |
10899 | m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq); |
10900 | if (m == (vm_page_t)0) { |
10901 | break; |
10902 | } |
10903 | } while (!vm_page_queue_end(&vm_page_queue_inactive, (vm_page_queue_entry_t) m)); |
10904 | vm_page_unlock_queues(); |
10905 | |
10906 | vm_page_lock_queues(); |
10907 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_throttled); |
10908 | do { |
10909 | if (m == (vm_page_t)0) { |
10910 | break; |
10911 | } |
10912 | |
10913 | dpages++; |
10914 | assert(m->vmp_dirty); |
10915 | assert(!m->vmp_free_when_done); |
10916 | assert(!is_kernel_object(VM_PAGE_OBJECT(m))); |
10917 | m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq); |
10918 | if (m == (vm_page_t)0) { |
10919 | break; |
10920 | } |
10921 | } while (!vm_page_queue_end(&vm_page_queue_throttled, (vm_page_queue_entry_t) m)); |
10922 | vm_page_unlock_queues(); |
10923 | |
10924 | vm_page_lock_queues(); |
10925 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_anonymous); |
10926 | do { |
10927 | if (m == (vm_page_t)0) { |
10928 | break; |
10929 | } |
10930 | |
10931 | if (m->vmp_dirty) { |
10932 | dpages++; |
10933 | } |
10934 | if (m->vmp_free_when_done) { |
10935 | pgopages++; |
10936 | } |
10937 | if (m->vmp_precious) { |
10938 | precpages++; |
10939 | } |
10940 | |
10941 | assert(!is_kernel_object(VM_PAGE_OBJECT(m))); |
10942 | m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq); |
10943 | if (m == (vm_page_t)0) { |
10944 | break; |
10945 | } |
10946 | } while (!vm_page_queue_end(&vm_page_queue_anonymous, (vm_page_queue_entry_t) m)); |
10947 | vm_page_unlock_queues(); |
10948 | |
printf("IN Q: %d : %d : %d\n", dpages, pgopages, precpages);
10950 | |
10951 | dpages = 0; |
10952 | pgopages = 0; |
10953 | precpages = 0; |
10954 | |
10955 | vm_page_lock_queues(); |
10956 | m = (vm_page_t) vm_page_queue_first(&vm_page_queue_active); |
10957 | |
10958 | do { |
10959 | if (m == (vm_page_t)0) { |
10960 | break; |
10961 | } |
10962 | if (m->vmp_dirty) { |
10963 | dpages++; |
10964 | } |
10965 | if (m->vmp_free_when_done) { |
10966 | pgopages++; |
10967 | } |
10968 | if (m->vmp_precious) { |
10969 | precpages++; |
10970 | } |
10971 | |
10972 | assert(!is_kernel_object(VM_PAGE_OBJECT(m))); |
10973 | m = (vm_page_t) vm_page_queue_next(&m->vmp_pageq); |
10974 | if (m == (vm_page_t)0) { |
10975 | break; |
10976 | } |
10977 | } while (!vm_page_queue_end(&vm_page_queue_active, (vm_page_queue_entry_t) m)); |
10978 | vm_page_unlock_queues(); |
10979 | |
printf("AC Q: %d : %d : %d\n", dpages, pgopages, precpages);
10981 | } |
10982 | #endif /* MACH_BSD */ |
10983 | |
10984 | |
10985 | #if CONFIG_IOSCHED |
10986 | int |
10987 | upl_get_cached_tier(upl_t upl) |
10988 | { |
10989 | assert(upl); |
10990 | if (upl->flags & UPL_TRACKED_BY_OBJECT) { |
10991 | return upl->upl_priority; |
10992 | } |
10993 | return -1; |
10994 | } |
10995 | #endif /* CONFIG_IOSCHED */ |
10996 | |
10997 | |
10998 | void |
10999 | upl_callout_iodone(upl_t upl) |
11000 | { |
11001 | struct upl_io_completion *upl_ctx = upl->upl_iodone; |
11002 | |
11003 | if (upl_ctx) { |
11004 | void (*iodone_func)(void *, int) = upl_ctx->io_done; |
11005 | |
11006 | assert(upl_ctx->io_done); |
11007 | |
11008 | (*iodone_func)(upl_ctx->io_context, upl_ctx->io_error); |
11009 | } |
11010 | } |
11011 | |
11012 | void |
11013 | upl_set_iodone(upl_t upl, void *upl_iodone) |
11014 | { |
11015 | upl->upl_iodone = (struct upl_io_completion *)upl_iodone; |
11016 | } |
11017 | |
11018 | void |
11019 | upl_set_iodone_error(upl_t upl, int error) |
11020 | { |
11021 | struct upl_io_completion *upl_ctx = upl->upl_iodone; |
11022 | |
11023 | if (upl_ctx) { |
11024 | upl_ctx->io_error = error; |
11025 | } |
11026 | } |
11027 | |
11028 | |
11029 | ppnum_t |
11030 | upl_get_highest_page( |
11031 | upl_t upl) |
11032 | { |
11033 | return upl->highest_page; |
11034 | } |
11035 | |
11036 | upl_size_t |
11037 | upl_get_size( |
11038 | upl_t upl) |
11039 | { |
11040 | return upl_adjusted_size(upl, PAGE_MASK); |
11041 | } |
11042 | |
11043 | upl_size_t |
11044 | upl_adjusted_size( |
11045 | upl_t upl, |
11046 | vm_map_offset_t pgmask) |
11047 | { |
11048 | vm_object_offset_t start_offset, end_offset; |
11049 | |
11050 | start_offset = trunc_page_mask_64(upl->u_offset, pgmask); |
11051 | end_offset = round_page_mask_64(upl->u_offset + upl->u_size, pgmask); |
11052 | |
11053 | return (upl_size_t)(end_offset - start_offset); |
11054 | } |
11055 | |
11056 | vm_object_offset_t |
11057 | upl_adjusted_offset( |
11058 | upl_t upl, |
11059 | vm_map_offset_t pgmask) |
11060 | { |
11061 | return trunc_page_mask_64(upl->u_offset, pgmask); |
11062 | } |
11063 | |
11064 | vm_object_offset_t |
11065 | upl_get_data_offset( |
11066 | upl_t upl) |
11067 | { |
11068 | return upl->u_offset - upl_adjusted_offset(upl, PAGE_MASK); |
11069 | } |
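/*
 * Worked example for the three accessors above (hypothetical values):
 * with u_offset == 0x1800 and u_size == 0x2000 on a 4KB-page
 * configuration (PAGE_MASK == 0xfff), upl_adjusted_offset() truncates
 * to 0x1000, upl_adjusted_size() spans round_page(0x3800) -
 * trunc_page(0x1800) == 0x3000, and upl_get_data_offset() reports the
 * intra-page start 0x800 (0x1800 - 0x1000).
 */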
11070 | |
11071 | upl_t |
11072 | upl_associated_upl(upl_t upl) |
11073 | { |
11074 | return upl->associated_upl; |
11075 | } |
11076 | |
11077 | void |
11078 | upl_set_associated_upl(upl_t upl, upl_t associated_upl) |
11079 | { |
11080 | upl->associated_upl = associated_upl; |
11081 | } |
11082 | |
11083 | struct vnode * |
11084 | upl_lookup_vnode(upl_t upl) |
11085 | { |
11086 | if (!upl->map_object->internal) { |
11087 | return vnode_pager_lookup_vnode(upl->map_object->pager); |
11088 | } else { |
11089 | return NULL; |
11090 | } |
11091 | } |
11092 | |
11093 | #if UPL_DEBUG |
11094 | kern_return_t |
11095 | upl_ubc_alias_set(upl_t upl, uintptr_t alias1, uintptr_t alias2) |
11096 | { |
11097 | upl->ubc_alias1 = alias1; |
11098 | upl->ubc_alias2 = alias2; |
11099 | return KERN_SUCCESS; |
11100 | } |
11101 | int |
11102 | upl_ubc_alias_get(upl_t upl, uintptr_t * al, uintptr_t * al2) |
11103 | { |
11104 | if (al) { |
11105 | *al = upl->ubc_alias1; |
11106 | } |
11107 | if (al2) { |
11108 | *al2 = upl->ubc_alias2; |
11109 | } |
11110 | return KERN_SUCCESS; |
11111 | } |
11112 | #endif /* UPL_DEBUG */ |
11113 | |
11114 | #if VM_PRESSURE_EVENTS |
11115 | /* |
11116 | * Upward trajectory. |
11117 | */ |
11118 | extern boolean_t vm_compressor_low_on_space(void); |
11119 | |
11120 | boolean_t |
11121 | VM_PRESSURE_NORMAL_TO_WARNING(void) |
11122 | { |
11123 | if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) { |
11124 | /* Available pages below our threshold */ |
11125 | if (memorystatus_available_pages < memorystatus_available_pages_pressure) { |
11126 | /* No frozen processes to kill */ |
11127 | if (memorystatus_frozen_count == 0) { |
11128 | /* Not enough suspended processes available. */ |
11129 | if (memorystatus_suspended_count < MEMORYSTATUS_SUSPENDED_THRESHOLD) { |
11130 | return TRUE; |
11131 | } |
11132 | } |
11133 | } |
11134 | return FALSE; |
11135 | } else { |
11136 | return (AVAILABLE_NON_COMPRESSED_MEMORY < VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) ? 1 : 0; |
11137 | } |
11138 | } |
11139 | |
11140 | boolean_t |
11141 | VM_PRESSURE_WARNING_TO_CRITICAL(void) |
11142 | { |
11143 | if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) { |
11144 | /* Available pages below our threshold */ |
11145 | if (memorystatus_available_pages < memorystatus_available_pages_critical) { |
11146 | return TRUE; |
11147 | } |
11148 | return FALSE; |
11149 | } else { |
11150 | return vm_compressor_low_on_space() || (AVAILABLE_NON_COMPRESSED_MEMORY < ((12 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0; |
11151 | } |
11152 | } |
11153 | |
11154 | /* |
11155 | * Downward trajectory. |
11156 | */ |
11157 | boolean_t |
11158 | VM_PRESSURE_WARNING_TO_NORMAL(void) |
11159 | { |
11160 | if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) { |
11161 | /* Available pages above our threshold */ |
11162 | unsigned int target_threshold = (unsigned int) (memorystatus_available_pages_pressure + ((15 * memorystatus_available_pages_pressure) / 100)); |
11163 | if (memorystatus_available_pages > target_threshold) { |
11164 | return TRUE; |
11165 | } |
11166 | return FALSE; |
11167 | } else { |
11168 | return (AVAILABLE_NON_COMPRESSED_MEMORY > ((12 * VM_PAGE_COMPRESSOR_COMPACT_THRESHOLD) / 10)) ? 1 : 0; |
11169 | } |
11170 | } |
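/*
 * Worked example of the 15% hysteresis above (hypothetical numbers,
 * non-compressor path): if memorystatus_available_pages_pressure is
 * 10000 pages, VM_PRESSURE_NORMAL_TO_WARNING() can fire once the
 * available count drops below 10000 (subject to its frozen/suspended
 * checks), but VM_PRESSURE_WARNING_TO_NORMAL() waits until it climbs
 * back above 10000 + 1500 == 11500, so the system does not oscillate
 * between the two states around a single threshold.
 */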
11171 | |
11172 | boolean_t |
11173 | VM_PRESSURE_CRITICAL_TO_WARNING(void) |
11174 | { |
11175 | if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) { |
11176 | /* Available pages above our threshold */ |
11177 | unsigned int target_threshold = (unsigned int)(memorystatus_available_pages_critical + ((15 * memorystatus_available_pages_critical) / 100)); |
11178 | if (memorystatus_available_pages > target_threshold) { |
11179 | return TRUE; |
11180 | } |
11181 | return FALSE; |
11182 | } else { |
11183 | return (AVAILABLE_NON_COMPRESSED_MEMORY > ((14 * VM_PAGE_COMPRESSOR_SWAP_UNTHROTTLE_THRESHOLD) / 10)) ? 1 : 0; |
11184 | } |
11185 | } |
11186 | #endif /* VM_PRESSURE_EVENTS */ |
11187 | |
11188 | #if DEVELOPMENT || DEBUG |
11189 | bool compressor_running_perf_test; |
11190 | uint64_t compressor_perf_test_pages_processed; |
11191 | |
11192 | kern_return_t |
11193 | run_compressor_perf_test( |
11194 | user_addr_t buf, |
11195 | size_t buffer_size, |
11196 | uint64_t *time, |
11197 | uint64_t *bytes_compressed, |
11198 | uint64_t *compressor_growth); |
11199 | |
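/*
 * Walk [start_addr, start_addr + buffer_size) in `map', pull every resident
 * page in that range off its paging queue and collect it on `queue' for the
 * benchmark. Only top-level, unwired, anonymous memory is accepted; anything
 * else fails with KERN_INVALID_ARGUMENT.
 */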
11200 | static kern_return_t |
11201 | move_pages_to_queue( |
11202 | vm_map_t map, |
11203 | user_addr_t start_addr, |
11204 | size_t buffer_size, |
11205 | vm_page_queue_head_t *queue, |
11206 | size_t *pages_moved) |
11207 | { |
11208 | kern_return_t err = KERN_SUCCESS; |
11209 | vm_map_entry_t curr_entry = VM_MAP_ENTRY_NULL; |
11210 | boolean_t addr_in_map = FALSE; |
11211 | user_addr_t end_addr = USER_ADDR_NULL, curr_addr = USER_ADDR_NULL; |
11212 | vm_object_t curr_object = VM_OBJECT_NULL; |
11213 | *pages_moved = 0; |
11214 | |
11215 | |
11216 | if (VM_MAP_PAGE_SIZE(map) != PAGE_SIZE_64) { |
11217 | /* |
11218 | * We don't currently support benchmarking maps with a different page size |
11219 | * than the kernel. |
11220 | */ |
11221 | return KERN_INVALID_ARGUMENT; |
11222 | } |
11223 | |
11224 | if (os_add_overflow(start_addr, buffer_size, &end_addr)) { |
11225 | return KERN_INVALID_ARGUMENT; |
11226 | } |
11227 | |
11228 | vm_map_lock_read(map); |
11229 | curr_addr = vm_map_trunc_page_mask(start_addr, VM_MAP_PAGE_MASK(map)); |
11230 | end_addr = vm_map_round_page_mask(start_addr + buffer_size, VM_MAP_PAGE_MASK(map)); |
11231 | |
11232 | |
11233 | while (curr_addr < end_addr) { |
11234 | addr_in_map = vm_map_lookup_entry(map, curr_addr, &curr_entry); |
11235 | if (!addr_in_map) { |
11236 | err = KERN_INVALID_ARGUMENT; |
11237 | break; |
11238 | } |
11239 | curr_object = VME_OBJECT(curr_entry); |
11240 | if (curr_object) { |
11241 | vm_object_lock(curr_object); |
11242 | /* We really only want anonymous memory that's in the top level map and object here. */ |
11243 | if (curr_entry->is_sub_map || curr_entry->wired_count != 0 || |
11244 | curr_object->shadow != VM_OBJECT_NULL || !curr_object->internal) { |
11245 | err = KERN_INVALID_ARGUMENT; |
11246 | vm_object_unlock(curr_object); |
11247 | break; |
11248 | } |
11249 | vm_map_offset_t start_offset = (curr_addr - curr_entry->vme_start) + VME_OFFSET(curr_entry); |
			/*
			 * Mirror the start_offset computation above: the object offset
			 * corresponding to the end of this entry's portion of the range.
			 */
			vm_map_offset_t end_offset = (MIN(curr_entry->vme_end, end_addr) -
			    curr_entry->vme_start) + VME_OFFSET(curr_entry);
11252 | vm_map_offset_t curr_offset = start_offset; |
11253 | vm_page_t curr_page; |
11254 | while (curr_offset < end_offset) { |
11255 | curr_page = vm_page_lookup(curr_object, vm_object_trunc_page(curr_offset)); |
11256 | if (curr_page != VM_PAGE_NULL) { |
11257 | vm_page_lock_queues(); |
11258 | if (curr_page->vmp_laundry) { |
11259 | vm_pageout_steal_laundry(curr_page, TRUE); |
11260 | } |
11261 | /* |
11262 | * we've already factored out pages in the laundry which |
11263 | * means this page can't be on the pageout queue so it's |
11264 | * safe to do the vm_page_queues_remove |
11265 | */ |
11266 | bool donate = (curr_page->vmp_on_specialq == VM_PAGE_SPECIAL_Q_DONATE); |
11267 | vm_page_queues_remove(curr_page, TRUE); |
11268 | if (donate) { |
11269 | /* |
11270 | * The compressor needs to see this bit to know |
11271 | * where this page needs to land. Also if stolen, |
11272 | * this bit helps put the page back in the right |
11273 | * special queue where it belongs. |
11274 | */ |
11275 | curr_page->vmp_on_specialq = VM_PAGE_SPECIAL_Q_DONATE; |
11276 | } |
11277 | // Clear the referenced bit so we ensure this gets paged out |
11278 | curr_page->vmp_reference = false; |
11279 | if (curr_page->vmp_pmapped) { |
11280 | pmap_clear_refmod_options(VM_PAGE_GET_PHYS_PAGE(curr_page), |
11281 | VM_MEM_REFERENCED, PMAP_OPTIONS_NOFLUSH, (void*)NULL); |
11282 | } |
11283 | vm_page_queue_enter(queue, curr_page, vmp_pageq); |
11284 | vm_page_unlock_queues(); |
11285 | *pages_moved += 1; |
11286 | } |
11287 | curr_offset += PAGE_SIZE_64; |
11288 | curr_addr += PAGE_SIZE_64; |
11289 | } |
11290 | } |
11291 | vm_object_unlock(curr_object); |
11292 | } |
11293 | vm_map_unlock_read(map); |
11294 | return err; |
11295 | } |
11296 | |
11297 | /* |
11298 | * Local queue for processing benchmark pages. |
11299 | * Can't be allocated on the stack because the pointer has to |
11300 | * be packable. |
11301 | */ |
11302 | vm_page_queue_head_t compressor_perf_test_queue VM_PAGE_PACKED_ALIGNED; |
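/*
 * Push the caller's buffer through the compressor via the normal pageout
 * path and report the elapsed time, the number of bytes submitted, and how
 * much the compressor pool grew. DEVELOPMENT/DEBUG kernels only, and only
 * for user tasks with the compressor active.
 */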
11303 | kern_return_t |
11304 | run_compressor_perf_test( |
11305 | user_addr_t buf, |
11306 | size_t buffer_size, |
11307 | uint64_t *time, |
11308 | uint64_t *bytes_compressed, |
11309 | uint64_t *compressor_growth) |
11310 | { |
11311 | kern_return_t err = KERN_SUCCESS; |
11312 | if (!VM_CONFIG_COMPRESSOR_IS_ACTIVE) { |
11313 | return KERN_NOT_SUPPORTED; |
11314 | } |
11315 | if (current_task() == kernel_task) { |
11316 | return KERN_INVALID_ARGUMENT; |
11317 | } |
11318 | vm_page_lock_queues(); |
11319 | if (compressor_running_perf_test) { |
11320 | /* Only run one instance of the benchmark at a time. */ |
11321 | vm_page_unlock_queues(); |
11322 | return KERN_RESOURCE_SHORTAGE; |
11323 | } |
11324 | vm_page_unlock_queues(); |
11325 | size_t page_count = 0; |
11326 | vm_map_t map; |
11327 | vm_page_t p, next; |
11328 | uint64_t compressor_perf_test_start = 0, compressor_perf_test_end = 0; |
11329 | uint64_t compressed_bytes_start = 0, compressed_bytes_end = 0; |
11330 | *bytes_compressed = *compressor_growth = 0; |
11331 | |
11332 | vm_page_queue_init(&compressor_perf_test_queue); |
11333 | map = current_task()->map; |
11334 | err = move_pages_to_queue(map, buf, buffer_size, &compressor_perf_test_queue, &page_count); |
11335 | if (err != KERN_SUCCESS) { |
11336 | goto out; |
11337 | } |
11338 | |
11339 | vm_page_lock_queues(); |
11340 | compressor_running_perf_test = true; |
11341 | compressor_perf_test_pages_processed = 0; |
11342 | /* |
11343 | * At this point the compressor threads should only process the benchmark queue |
11344 | * so we can look at the difference in c_segment_compressed_bytes while the perf test is running |
11345 | * to determine how many compressed bytes we ended up using. |
11346 | */ |
11347 | compressed_bytes_start = c_segment_compressed_bytes; |
11348 | vm_page_unlock_queues(); |
11349 | |
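	/*
	 * Hand the collected pages to the pageout/compressor path. The returned
	 * count (which may differ from the number we collected) is what the wait
	 * loop below compares against.
	 */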
11350 | page_count = vm_pageout_page_queue(&compressor_perf_test_queue, page_count, true); |
11351 | |
11352 | vm_page_lock_queues(); |
11353 | compressor_perf_test_start = mach_absolute_time(); |
11354 | |
11355 | // Wake up the compressor thread(s) |
11356 | sched_cond_signal(&pgo_iothread_internal_state[0].pgo_wakeup, |
11357 | pgo_iothread_internal_state[0].pgo_iothread); |
11358 | |
11359 | /* |
11360 | * Depending on when this test is run we could overshoot or be right on the mark |
11361 | * with our page_count. So the comparison is of the _less than_ variety. |
11362 | */ |
11363 | while (compressor_perf_test_pages_processed < page_count) { |
11364 | assert_wait((event_t) &compressor_perf_test_pages_processed, THREAD_UNINT); |
11365 | vm_page_unlock_queues(); |
11366 | thread_block(THREAD_CONTINUE_NULL); |
11367 | vm_page_lock_queues(); |
11368 | } |
11369 | compressor_perf_test_end = mach_absolute_time(); |
11370 | compressed_bytes_end = c_segment_compressed_bytes; |
11371 | vm_page_unlock_queues(); |
11372 | |
11373 | |
11374 | out: |
11375 | /* |
11376 | * If we errored out above, then we could still have some pages |
11377 | * on the local queue. Make sure to put them back on the active queue before |
11378 | * returning so they're not orphaned. |
11379 | */ |
11380 | vm_page_lock_queues(); |
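	/*
	 * On the error path both timestamps are still zero, so this simply
	 * reports an elapsed time of zero.
	 */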
11381 | absolutetime_to_nanoseconds(compressor_perf_test_end - compressor_perf_test_start, time); |
11382 | p = (vm_page_t) vm_page_queue_first(&compressor_perf_test_queue); |
11383 | while (p && !vm_page_queue_end(&compressor_perf_test_queue, (vm_page_queue_entry_t)p)) { |
11384 | next = (vm_page_t)VM_PAGE_UNPACK_PTR(p->vmp_pageq.next); |
11385 | |
11386 | vm_page_enqueue_active(p, FALSE); |
11387 | p = next; |
11388 | } |
11389 | |
11390 | compressor_running_perf_test = false; |
11391 | vm_page_unlock_queues(); |
11392 | if (err == KERN_SUCCESS) { |
11393 | *bytes_compressed = page_count * PAGE_SIZE_64; |
11394 | *compressor_growth = compressed_bytes_end - compressed_bytes_start; |
11395 | } |
11396 | |
11397 | /* |
11398 | * pageout_scan will consider waking the compactor swapper |
11399 | * before it blocks. Do the same thing here before we return |
11400 | * to ensure that back to back benchmark runs can't overly fragment the |
11401 | * compressor pool. |
11402 | */ |
11403 | vm_consider_waking_compactor_swapper(); |
11404 | return err; |
11405 | } |
11406 | #endif /* DEVELOPMENT || DEBUG */ |
11407 | |