/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/mach_vm.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <sys/kdebug.h>
#include <sys/queue.h>
#include <os/atomic_private.h>

#pragma mark Tunables
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults", "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);

#pragma mark Declarations
typedef struct proc *proc_t;
extern char *proc_best_name(proc_t proc);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

struct vm_deferred_reclamation_metadata_s {
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation
	decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */
	/*
	 * The task owns this structure, but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	vm_map_t vdrm_map;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values are cumulative running sums of the bytes placed in the
	 * buffer and the bytes reclaimed out of the buffer. Both values are in
	 * terms of virtual memory, so they give an upper bound on the amount of
	 * physical memory that can be reclaimed.
	 * To estimate the current amount of VA in the buffer, compute
	 * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
};
static void process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);
#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static os_log_t vm_reclaim_log_handle;

/*
 * The ringbuffer must contain at least 2 entries to distinguish between empty
 * (head == tail) and full (head == tail + 1).
 */
#define BUFFER_MIN_ENTRY_COUNT 2

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static size_t reclamation_buffers_length;
static uint64_t reclamation_counter; // generation count for global reclaims

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);

#pragma mark Implementation

/*
 * The current design is not tolerant to faulting on the buffer under the
 * metadata lock. Wire the buffer as a stop-gap solution for now; in the
 * future, the synchronization scheme should be revised to allow the buffer
 * to be pageable (rdar://112039103).
 */

static kern_return_t
vmdr_metadata_wire(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr;
	vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries));
	vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer +
	    metadata->vdrm_buffer_size);
	kr = vm_map_wire_kernel(metadata->vdrm_map, buffer_start, buffer_end,
	    VM_PROT_NONE, VM_KERN_MEMORY_OSFMK, TRUE);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to wire userspace reclaim buffer for pid %d (%d)",
		    task_pid(metadata->vdrm_task), kr);
	}
	return kr;
}

static kern_return_t
vmdr_metadata_unwire(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr;
	vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries));
	vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer +
	    metadata->vdrm_buffer_size);
	kr = vm_map_unwire(metadata->vdrm_map, buffer_start, buffer_end, TRUE);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: unable to un-wire buffer %p (%llu) for pid %d (%d)",
		    (void *)buffer_start, (buffer_end - buffer_start),
		    task_pid(metadata->vdrm_task), kr);
	}
	return kr;
}

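/*
 * Allocate and initialize the metadata for a reclamation buffer in `task`.
 * The returned metadata holds a reference on the task's map but has not yet
 * been wired or published on any list.
 */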
static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t task,
	user_addr_t buffer,
	mach_vm_size_t size,
	user_addr_t indices)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_reclaim_buffer = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_reclaim_indices = indices;

	/*
	 * We do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
}

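/*
 * Initialize the deferred reclamation buffer that userspace has mapped at
 * `address`. Validates the buffer's size and alignment, wires it, checks
 * that the initial indices are zero, and publishes the metadata to the task
 * and to the global buffer list.
 */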
kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t task,
	mach_vm_offset_t address,
	mach_vm_size_t size)
{
	kern_return_t kr = KERN_FAILURE, tmp_kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	bool success;
	uint64_t head = 0, tail = 0, busy = 0;

	if (address == 0 ||
	    size < (sizeof(struct mach_vm_reclaim_buffer_v1_s) +
	    BUFFER_MIN_ENTRY_COUNT * sizeof(mach_vm_reclaim_entry_v1_t)) ||
	    !VM_MAP_PAGE_ALIGNED(address, VM_MAP_PAGE_MASK(task->map)) ||
	    !VM_MAP_PAGE_ALIGNED((address + size), VM_MAP_PAGE_MASK(task->map))) {
		return KERN_INVALID_ARGUMENT;
	}

	/* vm_reclaim is disabled */
	if (vm_reclaim_max_threshold == 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)",
		    vm_reclaim_max_threshold);
		return KERN_NOT_SUPPORTED;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), address, size);

	user_addr_t buffer = address +
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	mach_vm_size_t buffer_size = size -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	user_addr_t indices = address +
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, indices);

	metadata = vmdr_metadata_alloc(task, buffer, buffer_size, indices);

	kr = vmdr_metadata_wire(metadata);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	/*
	 * Validate the starting indices.
	 *
	 * NB: At this point it is impossible for another thread to hold a
	 * reference to this metadata. However, reclaim_copyin may call reclaim_kill
	 * on failure, which assumes the metadata lock is held.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		/* metadata lock has been dropped and exception delivered to task */
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		/* metadata lock has been dropped and exception delivered to task */
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		/* metadata lock has been dropped and exception delivered to task */
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}

	lck_mtx_unlock(&metadata->vdrm_lock);

	if (head != 0 || tail != 0 || busy != 0) {
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize buffer on dying task (pid %d)", task_pid(task));
		kr = KERN_TERMINATED;
		goto fail_task;
	} else if (task->deferred_reclamation_metadata != NULL) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for pid %d", task_pid(task));
		kr = KERN_INVALID_ARGUMENT;
		goto fail_task;
	}

	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;

	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS);
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

fail_wired:
	tmp_kr = vmdr_metadata_unwire(metadata);
	assert3u(tmp_kr, ==, KERN_SUCCESS);

out:
	vmdr_metadata_free(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), kr);
	return kr;
}

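/*
 * Remove a reclamation buffer from the global and async lists and un-wire
 * its backing pages. The metadata itself is freed separately via
 * vm_deferred_reclamation_buffer_deallocate().
 */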
void
vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length--;
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present)
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	// A kernel thread may have grabbed the lock for this buffer before we had
	// a chance to remove it from the queues. Take the metadata lock to ensure
	// any such workers are finished operating on the buffer.
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);

	vmdr_metadata_unwire(metadata);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	vmdr_metadata_free(metadata);
}

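/*
 * Compute the userspace addresses of the head, tail, and busy indices within
 * the shared indices structure.
 */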
static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}

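/*
 * Deliver a guard exception to the task that owns `metadata`, killing it if
 * the exception is fatal. Expects the metadata lock to be held on entry; the
 * lock is dropped before the exception is delivered.
 */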
static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task = metadata->vdrm_task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self = false;
	pid_t pid;
	int err;

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	assert(metadata->vdrm_task != kernel_task);
	killing_self = task == current_task();
	if (!killing_self) {
		/*
		 * Grab a reference on the task to make sure it doesn't go away
		 * after we drop the metadata lock
		 */
		task_reference(task);
	}
	/*
	 * We need to issue a wakeup in case this kill is coming from the async path.
	 * Once we drop the lock the caller can no longer do this wakeup, but
	 * if there's someone blocked on this reclaim they hold a map reference
	 * and thus need to be woken up so the map can be freed.
	 */
	thread_wakeup(&metadata->vdrm_async_list);
	lck_mtx_unlock(&metadata->vdrm_lock);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_info(vm_reclaim_log_handle,
		    "vm_reclaim: Skipping non fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}

		task_deallocate(task);
		task = NULL;
	}

	if (!p) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	err = exit_with_guard_exception(p, code, subcode);
	if (err != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
	}
out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

static void
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */

static bool
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

/*
 * Reclaim a chunk (kReclaimChunkSize entries) from the buffer.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`. Note that
 * there may be zero reclaimable entries in the chunk (they have all been
 * re-used by userspace).
 *
 * Returns:
 * - KERN_NOT_FOUND if the buffer has been exhausted (head == tail)
 * - KERN_FAILURE on failure to reclaim -- metadata lock will be dropped
 *   before returning
 */
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, size_t *num_reclaimed_out)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	int result = 0;
	size_t num_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map, old_map;
	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
	bool success;

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
	    task_pid(metadata->vdrm_task), kReclaimChunkSize);

	buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t);

	memset(reclaim_entries, 0, sizeof(reclaim_entries));

	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
	old_map = vm_map_switch(map);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		goto fail;
	}

	if (busy != head) {
		// Userspace overwrote one of the pointers
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu (0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
		goto fail;
	}

	if (tail < head) {
		// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
		lck_mtx_unlock(&metadata->vdrm_lock);
		goto fail;
	}

	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		success = reclaim_copyout_busy(metadata, busy);
		if (!success) {
			goto fail;
		}
		os_atomic_thread_fence(seq_cst);
		success = reclaim_copyin_tail(metadata, &new_tail);
		if (!success) {
			goto fail;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
			lck_mtx_unlock(&metadata->vdrm_lock);
			goto fail;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			success = reclaim_copyout_busy(metadata, head);
			if (!success) {
				goto fail;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}

	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_start_idx = (head % buffer_len);
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t);
		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

		result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t));

		if (result != 0) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n",
			    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			reclaim_handle_copyio_error(metadata, result);
			goto fail;
		}

		num_copied += num_to_copy;
		head += num_to_copy;
	}

	for (size_t i = 0; i < num_to_reclaim; i++) {
		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
		    task_pid(metadata->vdrm_task), entry->address, entry->size,
		    entry->behavior);
		DTRACE_VM4(vm_reclaim_chunk,
		    int, task_pid(metadata->vdrm_task),
		    mach_vm_address_t, entry->address,
		    size_t, entry->size,
		    mach_vm_reclaim_behavior_v1_t, entry->behavior);
		if (entry->address != 0 && entry->size != 0) {
			kern_return_t kr;
			switch (entry->behavior) {
			case MACH_VM_RECLAIM_DEALLOCATE:
				kr = vm_map_remove_guard(map,
				    vm_map_trunc_page(entry->address,
				    VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size,
				    VM_MAP_PAGE_MASK(map)),
				    VM_MAP_REMOVE_GAPS_FAIL,
				    KMEM_GUARD_NONE).kmr_return;
				if (kr == KERN_INVALID_VALUE) {
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
					goto fail;
				} else if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx err=%d\n",
					    entry->address, entry->size, (uint64_t) map, kr);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
					goto fail;
				}
				break;
			case MACH_VM_RECLAIM_REUSABLE:
				kr = vm_map_behavior_set(map,
				    vm_map_trunc_page(entry->address, VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size, VM_MAP_PAGE_MASK(map)),
				    VM_BEHAVIOR_REUSABLE);
				if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: unable to free(reusable) 0x%llx (%u) for pid %d err=%d\n",
					    entry->address, entry->size, task_pid(metadata->vdrm_task), kr);
				}
				break;
			default:
				os_log_error(vm_reclaim_log_handle,
				    "vm_reclaim: attempted to reclaim entry with unsupported behavior %uh",
				    entry->behavior);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				goto fail;
			}
			num_reclaimed++;
			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
			    task_pid(metadata->vdrm_task), entry->address);
		}
	}

	success = reclaim_copyout_head(metadata, head);
	if (!success) {
		goto fail;
	}

	vm_map_switch(old_map);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, true);
	*num_reclaimed_out = num_reclaimed;
	if (num_to_reclaim == 0) {
		// We have exhausted the reclaimable portion of the buffer
		return KERN_NOT_FOUND;
	}
	return KERN_SUCCESS;
fail:
	vm_map_switch(old_map);
	*num_reclaimed_out = num_reclaimed;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, false);
	return KERN_FAILURE;
}

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold. The metadata buffer lock should be
 * held by the caller.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`.
 */
static kern_return_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    size_t num_bytes_reclaimable_threshold, size_t *num_reclaimed_out)
{
	assert(metadata != NULL);
	assert(num_reclaimed_out != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	if (!task_is_active(metadata->vdrm_task)) {
		/*
		 * If the task is exiting, the reclaim below will likely fail and fall through
		 * to the (slower) error path.
		 * So as an optimization, we bail out early here.
		 */
		return KERN_SUCCESS;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_START, task_pid(metadata->vdrm_task));

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		kern_return_t kr;
		size_t curr_entries_reclaimed = 0;
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
		if (num_bytes_reclaimed > reclaimable_bytes) {
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		if (reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		kr = reclaim_chunk(metadata, &curr_entries_reclaimed);
		if (kr == KERN_NOT_FOUND) {
			break;
		} else if (kr != KERN_SUCCESS) {
			KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
			    task_pid(metadata->vdrm_task), num_entries_reclaimed,
			    estimated_reclaimable_bytes, kr);
			*num_reclaimed_out = num_entries_reclaimed;
			return kr;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
	    task_pid(metadata->vdrm_task), num_entries_reclaimed,
	    estimated_reclaimable_bytes, KERN_SUCCESS);
	*num_reclaimed_out = num_entries_reclaimed;
	return KERN_SUCCESS;
}

/*
 * Get the reclamation metadata buffer for the given task.
 * If the buffer exists it is returned locked.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
	}
	task_unlock(task);
	return metadata;
}

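/*
 * Synchronously reclaim up to `num_entries_to_reclaim` entries from the
 * task's reclamation buffer on behalf of userspace.
 */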
kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed;
		kr = reclaim_chunk(metadata, &num_reclaimed);
		if (kr == KERN_NOT_FOUND) {
			/* buffer has been fully reclaimed from */
			break;
		} else if (kr != KERN_SUCCESS) {
			/* Lock has already been released and task is being killed. */
			return kr;
		}

		total_reclaimed += num_reclaimed;
	}
	lck_mtx_unlock(&metadata->vdrm_lock);

	return KERN_SUCCESS;
}

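/*
 * Update the task's accounting of how many virtual bytes currently sit in
 * its reclamation buffer. If the estimated reclaimable amount exceeds
 * vm_reclaim_max_threshold, reclaim entries until it drops back below the
 * threshold.
 */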
kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer, num_reclaimed = 0;
	bool success;
	kern_return_t kr = KERN_SUCCESS;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
	    task_pid(task), reclaimable_bytes);

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	    {
		if (num_bytes_in_buffer > reclaimable_bytes) {
		        os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim */
		goto done;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			lck_mtx_lock(&metadata->vdrm_lock);
			kr = reclaim_entries_from_buffer(metadata,
			    vm_reclaim_max_threshold, &num_reclaimed);
			if (kr != KERN_SUCCESS) {
				/* Lock has already been released & task is in the process of getting killed. */
				goto done;
			}
			lck_mtx_unlock(&metadata->vdrm_lock);
		}
	}

done:
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
	    task_pid(task), reclaimable_bytes, num_bytes_reclaimed, num_reclaimed);

	return kr;
}

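/*
 * Translate a reclamation action into the byte threshold to reclaim down to.
 */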
static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
	switch (action) {
	case RECLAIM_FULL:
		return 0;
	case RECLAIM_TRIM:
		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
	case RECLAIM_ASYNC:
		return 0;
	}
}

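/*
 * Reclaim from every buffer in the system. RECLAIM_ASYNC drains the async
 * list; RECLAIM_FULL and RECLAIM_TRIM walk the global buffer list and
 * reclaim each buffer down to the threshold chosen by pick_reclaim_threshold().
 */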
void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action)
{
	kern_return_t kr;
	size_t num_reclaimed;

	if (action == RECLAIM_ASYNC) {
		lck_mtx_lock(&async_reclamation_buffers_lock);

		process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
	} else {
		size_t reclaim_threshold = pick_reclaim_threshold(action);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_START,
		    action, reclaim_threshold);
		lck_mtx_lock(&reclamation_buffers_lock);
		reclamation_counter++;
		while (true) {
			vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
			if (metadata == NULL) {
				break;
			}
			lck_mtx_lock(&metadata->vdrm_lock);
			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done
				lck_mtx_unlock(&metadata->vdrm_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
			TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
			lck_mtx_unlock(&reclamation_buffers_lock);

			kr = reclaim_entries_from_buffer(metadata,
			    reclaim_threshold, &num_reclaimed);
			if (kr == KERN_SUCCESS) {
				lck_mtx_unlock(&metadata->vdrm_lock);
			}

			lck_mtx_lock(&reclamation_buffers_lock);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_END,
		    reclamation_counter);
	}
}

void
vm_deferred_reclamation_reclaim_all_memory(void)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
}

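/*
 * Queue a task's reclamation buffer for asynchronous reclamation and wake
 * the reclaim thread. Returns true if the buffer was queued.
 */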
bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		lck_mtx_lock(&async_reclamation_buffers_lock);
		if (metadata->vdrm_async_list.tqe_next != NULL ||
		    metadata->vdrm_async_list.tqe_prev != NULL) {
			// move this buffer to the tail if still on the async list
			TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		}
		TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup(&vm_reclaim_thread);
	}

	return queued;
}

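/*
 * Synchronously reclaim up to `max_entries_to_reclaim` entries from a task's
 * buffer. Returns true if at least one entry was reclaimed.
 */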
bool
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	kern_return_t kr;
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task)) {
		return false;
	}

	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now;
			kr = reclaim_chunk(metadata, &num_reclaimed_now);
			if (kr == KERN_NOT_FOUND) {
				// Nothing left to reclaim
				break;
			} else if (kr != KERN_SUCCESS) {
				/* Lock has already been released and task is being killed. */
				return false;
			}
			num_reclaimed += num_reclaimed_now;
		}
		lck_mtx_unlock(&metadata->vdrm_lock);
	}

	return num_reclaimed > 0;
}

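/*
 * Duplicate the parent's reclamation buffer metadata for a newly forked
 * child task. Called with the parent's metadata lock held; the lock is
 * dropped before the new buffer is wired and published.
 */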
vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;

	LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	assert(task->deferred_reclamation_metadata == NULL);
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	lck_mtx_unlock(&parent->vdrm_lock);

	kr = vmdr_metadata_wire(metadata);
	if (kr != KERN_SUCCESS) {
		vmdr_metadata_free(metadata);
		return NULL;
	}

	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}

void
vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

void
vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}


static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}


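/*
 * Drain the async reclamation list, fully reclaiming each queued buffer.
 * Called with the async_reclamation_buffers_lock held; the lock is dropped
 * and re-acquired around each buffer.
 */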
static void
process_async_reclamation_list(void)
{
	kern_return_t kr;
	size_t total_entries_reclaimed = 0;
	size_t num_tasks_reclaimed = 0;
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_START);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		size_t num_reclaimed;
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
		lck_mtx_lock(&metadata->vdrm_lock);
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		// NB: Currently the async reclaim thread fully reclaims the buffer.
		kr = reclaim_entries_from_buffer(metadata, 0, &num_reclaimed);
		total_entries_reclaimed += num_reclaimed;
		if (kr != KERN_SUCCESS) {
			/* Lock has already been released & task is in the process of getting killed. */
			goto next;
		}
		num_tasks_reclaimed++;
		/* Wakeup anyone waiting on this buffer getting processed */
		thread_wakeup(&metadata->vdrm_async_list);
		assert(current_thread()->map == kernel_map);
		lck_mtx_unlock(&metadata->vdrm_lock);

next:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_END,
	    num_tasks_reclaimed, total_entries_reclaimed);
}

__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});

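/*
 * Body of the reclaim thread. It runs as a continuation: each pass drains
 * the async list, then blocks until vm_deferred_reclamation_reclaim_from_task_async()
 * issues a wakeup on &vm_reclaim_thread.
 */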
static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void *) RECLAIM_THREAD_CONT);
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	vm_reclaim_log_handle = os_log_create("com.apple.mach.vm", "reclaim");

	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#if DEVELOPMENT || DEBUG

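/*
 * DEVELOPMENT || DEBUG only helper: block until the given pid's reclamation
 * buffer has been drained by the async reclaim thread.
 */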
bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	vm_map_t map = NULL;
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (t->map) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			map = t->map;
			vm_map_reference(t->map);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		assert_wait(&metadata->vdrm_async_list, THREAD_UNINT);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&async_reclamation_buffers_lock);
	}

	/*
	 * The async reclaim thread first removes the buffer from the list
	 * and then reclaims it (while holding its lock).
	 * So grab the metadata buffer's lock here to ensure the
	 * reclaim is done.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vm_map_deallocate(map);
	return true;
}

#endif /* DEVELOPMENT || DEBUG */