/*
 * Copyright (c) 2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/exc_guard.h>
#include <kern/locks.h>
#include <kern/task.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/startup.h>
#include <kern/sched.h>
#include <libkern/OSAtomic.h>
#include <mach/kern_return.h>
#include <mach/mach_types.h>
#include <mach/mach_vm.h>
#include <mach/vm_reclaim.h>
#include <os/log.h>
#include <pexpert/pexpert.h>
#include <vm/vm_map.h>
#include <vm/vm_map_internal.h>
#include <vm/vm_reclaim_internal.h>
#include <sys/kdebug.h>
#include <sys/queue.h>
#include <os/atomic_private.h>

#pragma mark Tunables
TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size", 16);
static integer_t kReclaimThreadPriority = BASEPRI_VM;
// Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation
TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor", 2);
TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults", "kern.vm_reclaim_max_threshold", "vm_reclaim_max_threshold", 0, TUNABLE_DT_NONE);
// Used to debug vm_reclaim kills
TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill", false);

#pragma mark Declarations
typedef struct proc *proc_t;
extern char *proc_best_name(proc_t proc);
extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode);
struct proc *proc_ref(struct proc *p, int locked);
int proc_rele(proc_t p);
static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head);
static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail);
static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy);

struct vm_deferred_reclamation_metadata_s {
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer
	TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation
	decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */
	/*
	 * The task owns this structure, but we maintain a backpointer here
	 * so that we can send an exception if we hit an error.
	 * Since this is a backpointer we don't hold a reference (it's a weak pointer).
	 */
	task_t vdrm_task;
	vm_map_t vdrm_map;
	user_addr_t vdrm_reclaim_buffer;
	mach_vm_size_t vdrm_buffer_size;
	user_addr_t vdrm_reclaim_indices;
	uint64_t vdrm_reclaimed_at;
	/*
	 * These two values are cumulative running sums of the bytes placed in the
	 * buffer and the bytes reclaimed out of the buffer. Both values are in
	 * terms of virtual memory, so they give an upper bound on the amount of
	 * physical memory that can be reclaimed.
	 * To estimate the current amount of VA in the buffer, compute
	 * vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
	 * Note that neither value is protected by the vdrm_lock.
	 */
	_Atomic size_t vdrm_num_bytes_put_in_buffer;
	_Atomic size_t vdrm_num_bytes_reclaimed;
};
static void process_async_reclamation_list(void);

extern void *proc_find(int pid);
extern task_t proc_task(proc_t);
#pragma mark Globals
static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT);
static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim");
static os_log_t vm_reclaim_log_handle;

/*
 * The ringbuffer must contain at least 2 entries to distinguish between empty
 * (head == tail) and full (head == tail + 1).
 */
#define BUFFER_MIN_ENTRY_COUNT 2

/*
 * We maintain two lists of reclamation buffers.
 * The reclamation_buffers list contains every buffer in the system.
 * The async_reclamation_buffers list contains buffers that are ripe for reclamation.
 * Each list has its own lock.
 */
static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers);

static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers);
/*
 * The reclamation_buffers_lock protects the reclamation_buffers list.
 * It must be held when iterating over the list or manipulating the list.
 * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock.
 */
LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp);
LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp);
static size_t reclamation_buffers_length;
static uint64_t reclamation_counter; // generation count for global reclaims

static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread;
static void reclaim_thread(void *param __unused, wait_result_t wr __unused);

#pragma mark Implementation

/*
 * The current design is not tolerant to faulting on the buffer under the
 * metadata lock. Wire the buffer as a stop-gap solution for now; in the
 * future, the synchronization scheme should be revised to allow the buffer
 * to be pageable (rdar://112039103).
 */

static kern_return_t
vmdr_metadata_wire(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr;
	vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries));
	vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer +
	    metadata->vdrm_buffer_size);
	kr = vm_map_wire_kernel(metadata->vdrm_map, buffer_start, buffer_end,
	    VM_PROT_NONE, VM_KERN_MEMORY_OSFMK, TRUE);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to wire userspace reclaim buffer for pid %d (%d)",
		    task_pid(metadata->vdrm_task), kr);
	}
	return kr;
}

static kern_return_t
vmdr_metadata_unwire(vm_deferred_reclamation_metadata_t metadata)
{
	kern_return_t kr;
	vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries));
	vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer +
	    metadata->vdrm_buffer_size);
	kr = vm_map_unwire(metadata->vdrm_map, buffer_start, buffer_end, TRUE);
	if (kr != KERN_SUCCESS) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: unable to un-wire buffer %p (%llu) for pid %d (%d)",
		    (void *)buffer_start, (buffer_end - buffer_start),
		    task_pid(metadata->vdrm_task), kr);
	}
	return kr;
}

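/*
 * Allocate and initialize the metadata for a reclamation buffer in `task`.
 * The returned metadata holds a reference on the task's map but has not yet
 * been wired or published on any list.
 */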
static vm_deferred_reclamation_metadata_t
vmdr_metadata_alloc(
	task_t task,
	user_addr_t buffer,
	mach_vm_size_t size,
	user_addr_t indices)
{
	vm_deferred_reclamation_metadata_t metadata;
	vm_map_t map = task->map;

	assert(!map->is_nested_map);

	metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO);
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
	metadata->vdrm_task = task;
	metadata->vdrm_map = map;
	metadata->vdrm_reclaim_buffer = buffer;
	metadata->vdrm_buffer_size = size;
	metadata->vdrm_reclaim_indices = indices;

	/*
	 * We do not need to hold a lock on `task` because this is called
	 * either at fork() time or from the context of current_task().
	 */
	vm_map_reference(map);
	return metadata;
}

static void
vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata)
{
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
	zfree(vm_reclaim_metadata_zone, metadata);
}

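/*
 * Initialize the deferred reclamation buffer that userspace has mapped at
 * `address`. Validates the buffer's size and alignment, wires it, checks
 * that the initial indices are zero, and publishes the metadata to the task
 * and to the global buffer list.
 */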
kern_return_t
vm_deferred_reclamation_buffer_init_internal(
	task_t task,
	mach_vm_offset_t address,
	mach_vm_size_t size)
{
	kern_return_t kr = KERN_FAILURE, tmp_kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	bool success;
	uint64_t head = 0, tail = 0, busy = 0;

	if (address == 0 ||
	    size < (sizeof(struct mach_vm_reclaim_buffer_v1_s) +
	    BUFFER_MIN_ENTRY_COUNT * sizeof(mach_vm_reclaim_entry_v1_t)) ||
	    !VM_MAP_PAGE_ALIGNED(address, VM_MAP_PAGE_MASK(task->map)) ||
	    !VM_MAP_PAGE_ALIGNED((address + size), VM_MAP_PAGE_MASK(task->map))) {
		return KERN_INVALID_ARGUMENT;
	}

	/* vm_reclaim is disabled */
	if (vm_reclaim_max_threshold == 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)",
		    vm_reclaim_max_threshold);
		return KERN_NOT_SUPPORTED;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START,
	    task_pid(task), address, size);

	user_addr_t buffer = address +
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	mach_vm_size_t buffer_size = size -
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, entries);
	user_addr_t indices = address +
	    offsetof(struct mach_vm_reclaim_buffer_v1_s, indices);

	metadata = vmdr_metadata_alloc(task, buffer, buffer_size, indices);

	kr = vmdr_metadata_wire(metadata);
	if (kr != KERN_SUCCESS) {
		goto out;
	}

	/*
	 * Validate the starting indices.
	 *
	 * NB: At this point it is impossible for another thread to hold a
	 * reference to this metadata. However, reclaim_copyin may call reclaim_kill
	 * on failure, which assumes the metadata lock is held.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		/* metadata lock has been dropped and exception delivered to task */
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		/* metadata lock has been dropped and exception delivered to task */
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		/* metadata lock has been dropped and exception delivered to task */
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}

	lck_mtx_unlock(&metadata->vdrm_lock);

	if (head != 0 || tail != 0 || busy != 0) {
		kr = KERN_INVALID_ARGUMENT;
		goto fail_wired;
	}

	/*
	 * Publish the metadata to the task & global buffer list. This must be
	 * done under the task lock to synchronize with task termination - i.e.
	 * task_terminate_internal is guaranteed to see the published metadata and
	 * tear it down.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	task_lock(task);

	if (!task_is_active(task) || task_is_halting(task)) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: failed to initialize buffer on dying task (pid %d)", task_pid(task));
		kr = KERN_TERMINATED;
		goto fail_task;
	} else if (task->deferred_reclamation_metadata != NULL) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: tried to overwrite existing reclaim buffer for pid %d", task_pid(task));
		kr = KERN_INVALID_ARGUMENT;
		goto fail_task;
	}

	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;

	task->deferred_reclamation_metadata = metadata;

	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), KERN_SUCCESS);
	return KERN_SUCCESS;

fail_task:
	task_unlock(task);
	lck_mtx_unlock(&reclamation_buffers_lock);

fail_wired:
	tmp_kr = vmdr_metadata_unwire(metadata);
	assert3u(tmp_kr, ==, KERN_SUCCESS);

out:
	vmdr_metadata_free(metadata);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END,
	    task_pid(task), kr);
	return kr;
}

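/*
 * Remove a reclamation buffer from the global and async lists and un-wire
 * its backing pages. The metadata itself is freed separately via
 * vm_deferred_reclamation_buffer_deallocate().
 */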
void
vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	/*
	 * First remove the buffer from the global list so no one else can get access to it.
	 */
	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length--;
	lck_mtx_unlock(&reclamation_buffers_lock);

	/*
	 * Now remove it from the async list (if present)
	 */
	lck_mtx_lock(&async_reclamation_buffers_lock);
	if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
	}
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	// A kernel thread may have grabbed the lock for this buffer before we had
	// a chance to remove it from the queues. Take the metadata lock to ensure
	// any such workers are finished operating on the buffer.
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);

	vmdr_metadata_unwire(metadata);
}

void
vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata)
{
	assert(metadata != NULL);
	vmdr_metadata_free(metadata);
}

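/*
 * Compute the userspace addresses of the head, tail, and busy indices within
 * the shared indices structure.
 */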
static user_addr_t
get_head_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, head);
}

static user_addr_t
get_tail_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail);
}

static user_addr_t
get_busy_ptr(user_addr_t indices)
{
	return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy);
}

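/*
 * Deliver a guard exception to the task that owns `metadata`, killing it if
 * the exception is fatal. Expects the metadata lock to be held on entry; the
 * lock is dropped before the exception is delivered.
 */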
static void
reclaim_kill_with_reason(
	vm_deferred_reclamation_metadata_t metadata,
	unsigned reason,
	mach_exception_data_type_t subcode)
{
	unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY;
	mach_exception_code_t code = 0;
	task_t task = metadata->vdrm_task;
	proc_t p = NULL;
	boolean_t fatal = TRUE;
	bool killing_self = false;
	pid_t pid;
	int err;

	if (panic_on_kill) {
		panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n", task, reason, subcode);
	}

	EXC_GUARD_ENCODE_TYPE(code, guard_type);
	EXC_GUARD_ENCODE_FLAVOR(code, reason);
	EXC_GUARD_ENCODE_TARGET(code, 0);

	assert(metadata->vdrm_task != kernel_task);
	killing_self = task == current_task();
	if (!killing_self) {
		/*
		 * Grab a reference on the task to make sure it doesn't go away
		 * after we drop the metadata lock
		 */
		task_reference(task);
	}
	/*
	 * We need to issue a wakeup in case this kill is coming from the async path.
	 * Once we drop the lock the caller can no longer do this wakeup, but
	 * if there's someone blocked on this reclaim they hold a map reference
	 * and thus need to be woken up so the map can be freed.
	 */
	thread_wakeup(&metadata->vdrm_async_list);
	lck_mtx_unlock(&metadata->vdrm_lock);

	if (reason == kGUARD_EXC_DEALLOC_GAP) {
		task_lock(task);
		fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL);
		task_unlock(task);
	}

	if (!fatal) {
		os_log_info(vm_reclaim_log_handle,
		    "vm_reclaim: Skipping non fatal guard exception.\n");
		goto out;
	}

	pid = task_pid(task);
	if (killing_self) {
		p = get_bsdtask_info(task);
	} else {
		p = proc_find(pid);
		if (p && proc_task(p) != task) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n");
			goto out;
		}

		task_deallocate(task);
		task = NULL;
	}

	if (!p) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n");
		goto out;
	}

	err = exit_with_guard_exception(p, code, subcode);
	if (err != 0) {
		os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n", p, err);
	}
out:
	if (!killing_self) {
		if (p) {
			proc_rele(p);
			p = NULL;
		}
		if (task) {
			task_deallocate(task);
			task = NULL;
		}
	}
}

static void
reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result)
{
	reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
}

/*
 * Helper functions to do copyio on the head, tail, and busy pointers.
 * Note that the kernel will only write to the busy and head pointers.
 * Userspace is not supposed to write to the head or busy pointers, but the kernel
 * must be resilient to that kind of bug in userspace.
 */

static bool
reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyin_atomic64(head_ptr, head);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n", head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t tail_ptr = get_tail_ptr(indices);

	result = copyin_atomic64(tail_ptr, tail);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n", tail_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyin_atomic64(busy_ptr, busy);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n", busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t busy_ptr = get_busy_ptr(indices);

	result = copyout_atomic64(value, busy_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n", value, busy_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

static bool
reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value)
{
	int result;
	user_addr_t indices = metadata->vdrm_reclaim_indices;
	user_addr_t head_ptr = get_head_ptr(indices);

	result = copyout_atomic64(value, head_ptr);

	if (result != 0) {
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n", value, head_ptr, result);
		reclaim_handle_copyio_error(metadata, result);
		return false;
	}
	return true;
}

/*
 * Reclaim a chunk (kReclaimChunkSize entries) from the buffer.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`. Note that
 * there may be zero reclaimable entries in the chunk (they have all been
 * re-used by userspace).
 *
 * Returns:
 * - KERN_NOT_FOUND if the buffer has been exhausted (head == tail)
 * - KERN_FAILURE on failure to reclaim -- metadata lock will be dropped
 *   before returning
 */
static kern_return_t
reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, size_t *num_reclaimed_out)
{
	assert(metadata != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	int result = 0;
	size_t num_reclaimed = 0;
	uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0;
	user_addr_t indices;
	vm_map_t map = metadata->vdrm_map, old_map;
	mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize];
	bool success;

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START,
	    task_pid(metadata->vdrm_task), kReclaimChunkSize);

	buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t);

	memset(reclaim_entries, 0, sizeof(reclaim_entries));

	indices = (user_addr_t) metadata->vdrm_reclaim_indices;
	old_map = vm_map_switch(map);

	success = reclaim_copyin_busy(metadata, &busy);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_head(metadata, &head);
	if (!success) {
		goto fail;
	}
	success = reclaim_copyin_tail(metadata, &tail);
	if (!success) {
		goto fail;
	}

	if (busy != head) {
		// Userspace overwrote one of the pointers
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or busy pointer! head: %llu (0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices));
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
		goto fail;
	}

	if (tail < head) {
		// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
		os_log_error(vm_reclaim_log_handle,
		    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
		    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
		lck_mtx_unlock(&metadata->vdrm_lock);
		goto fail;
	}

	num_to_reclaim = tail - head;
	while (true) {
		num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize);
		if (num_to_reclaim == 0) {
			break;
		}
		busy = head + num_to_reclaim;
		success = reclaim_copyout_busy(metadata, busy);
		if (!success) {
			goto fail;
		}
		os_atomic_thread_fence(seq_cst);
		success = reclaim_copyin_tail(metadata, &new_tail);
		if (!success) {
			goto fail;
		}

		if (new_tail >= busy) {
			/* Got num_to_reclaim entries */
			break;
		}
		tail = new_tail;
		if (tail < head) {
			// Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n",
			    head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices));
			lck_mtx_unlock(&metadata->vdrm_lock);
			goto fail;
		}
		/* Can't reclaim these entries. Try again */
		num_to_reclaim = tail - head;
		if (num_to_reclaim == 0) {
			/* Nothing left to reclaim. Reset busy to head. */
			success = reclaim_copyout_busy(metadata, head);
			if (!success) {
				goto fail;
			}
			break;
		}
		/*
		 * Note that num_to_reclaim must have gotten smaller since tail got smaller,
		 * so this is guaranteed to converge.
		 */
	}

	while (num_copied < num_to_reclaim) {
		uint64_t memcpy_start_idx = (head % buffer_len);
		uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied;
		// Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop.
		memcpy_end_idx = MIN(memcpy_end_idx, buffer_len);
		uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx;

		assert(num_to_copy + num_copied <= kReclaimChunkSize);
		user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t);
		mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied;

		result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t));

		if (result != 0) {
			os_log_error(vm_reclaim_log_handle,
			    "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n",
			    num_to_copy, src_ptr, (uint64_t) dst_ptr, result);
			reclaim_handle_copyio_error(metadata, result);
			goto fail;
		}

		num_copied += num_to_copy;
		head += num_to_copy;
	}

	for (size_t i = 0; i < num_to_reclaim; i++) {
		mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i];
		KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START,
		    task_pid(metadata->vdrm_task), entry->address, entry->size,
		    entry->behavior);
		DTRACE_VM4(vm_reclaim_chunk,
		    int, task_pid(metadata->vdrm_task),
		    mach_vm_address_t, entry->address,
		    size_t, entry->size,
		    mach_vm_reclaim_behavior_v1_t, entry->behavior);
		if (entry->address != 0 && entry->size != 0) {
			kern_return_t kr;
			switch (entry->behavior) {
			case MACH_VM_RECLAIM_DEALLOCATE:
				kr = vm_map_remove_guard(map,
				    vm_map_trunc_page(entry->address,
				    VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size,
				    VM_MAP_PAGE_MASK(map)),
				    VM_MAP_REMOVE_GAPS_FAIL,
				    KMEM_GUARD_NONE).kmr_return;
				if (kr == KERN_INVALID_VALUE) {
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
					goto fail;
				} else if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx err=%d\n",
					    entry->address, entry->size, (uint64_t) map, kr);
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
					goto fail;
				}
				break;
			case MACH_VM_RECLAIM_REUSABLE:
				kr = vm_map_behavior_set(map,
				    vm_map_trunc_page(entry->address, VM_MAP_PAGE_MASK(map)),
				    vm_map_round_page(entry->address + entry->size, VM_MAP_PAGE_MASK(map)),
				    VM_BEHAVIOR_REUSABLE);
				if (kr != KERN_SUCCESS) {
					os_log_error(vm_reclaim_log_handle,
					    "vm_reclaim: unable to free(reusable) 0x%llx (%u) for pid %d err=%d\n",
					    entry->address, entry->size, task_pid(metadata->vdrm_task), kr);
				}
				break;
			default:
				os_log_error(vm_reclaim_log_handle,
				    "vm_reclaim: attempted to reclaim entry with unsupported behavior %uh",
				    entry->behavior);
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
				goto fail;
			}
			num_reclaimed++;
			os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed);
			KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END,
			    task_pid(metadata->vdrm_task), entry->address);
		}
	}

	success = reclaim_copyout_head(metadata, head);
	if (!success) {
		goto fail;
	}

	vm_map_switch(old_map);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, true);
	*num_reclaimed_out = num_reclaimed;
	if (num_to_reclaim == 0) {
		// We have exhausted the reclaimable portion of the buffer
		return KERN_NOT_FOUND;
	}
	return KERN_SUCCESS;
fail:
	vm_map_switch(old_map);
	*num_reclaimed_out = num_reclaimed;
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END,
	    task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, false);
	return KERN_FAILURE;
}

/*
 * Attempts to reclaim until the buffer's estimated number of available bytes
 * is <= num_bytes_reclaimable_threshold. The metadata buffer lock should be
 * held by the caller.
 *
 * Writes the number of entries reclaimed to `num_reclaimed_out`.
 */
static kern_return_t
reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata,
    size_t num_bytes_reclaimable_threshold, size_t *num_reclaimed_out)
{
	assert(metadata != NULL);
	assert(num_reclaimed_out != NULL);
	LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED);
	if (!task_is_active(metadata->vdrm_task)) {
		/*
		 * If the task is exiting, the reclaim below will likely fail and fall through
		 * to the (slower) error path.
		 * So as an optimization, we bail out early here.
		 */
		return KERN_SUCCESS;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_START, task_pid(metadata->vdrm_task));

	size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes;
	while (true) {
		kern_return_t kr;
		size_t curr_entries_reclaimed = 0;
		num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);
		reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed);
		if (num_bytes_reclaimed > reclaimable_bytes) {
			estimated_reclaimable_bytes = 0;
		} else {
			estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		}
		if (reclaimable_bytes <= num_bytes_reclaimable_threshold) {
			break;
		}
		kr = reclaim_chunk(metadata, &curr_entries_reclaimed);
		if (kr == KERN_NOT_FOUND) {
			break;
		} else if (kr != KERN_SUCCESS) {
			KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
			    task_pid(metadata->vdrm_task), num_entries_reclaimed,
			    estimated_reclaimable_bytes, kr);
			*num_reclaimed_out = num_entries_reclaimed;
			return kr;
		}
		num_entries_reclaimed += curr_entries_reclaimed;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END,
	    task_pid(metadata->vdrm_task), num_entries_reclaimed,
	    estimated_reclaimable_bytes, KERN_SUCCESS);
	*num_reclaimed_out = num_entries_reclaimed;
	return KERN_SUCCESS;
}

/*
 * Get the reclamation metadata buffer for the given task.
 * If the buffer exists it is returned locked.
 */
static vm_deferred_reclamation_metadata_t
get_task_reclaim_metadata(task_t task)
{
	assert(task != NULL);
	vm_deferred_reclamation_metadata_t metadata = NULL;
	task_lock(task);
	metadata = task->deferred_reclamation_metadata;
	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
	}
	task_unlock(task);
	return metadata;
}

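/*
 * Synchronously reclaim up to `num_entries_to_reclaim` entries from the
 * task's reclamation buffer on behalf of userspace.
 */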
kern_return_t
vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;
	size_t total_reclaimed = 0;

	if (!task_is_active(task)) {
		return KERN_FAILURE;
	}

	metadata = get_task_reclaim_metadata(task);
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	while (total_reclaimed < num_entries_to_reclaim) {
		size_t num_reclaimed;
		kr = reclaim_chunk(metadata, &num_reclaimed);
		if (kr == KERN_NOT_FOUND) {
			/* buffer has been fully reclaimed from */
			break;
		} else if (kr != KERN_SUCCESS) {
			/* Lock has already been released and task is being killed. */
			return kr;
		}

		total_reclaimed += num_reclaimed;
	}
	lck_mtx_unlock(&metadata->vdrm_lock);

	return KERN_SUCCESS;
}

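/*
 * Update the task's accounting of how many virtual bytes currently sit in
 * its reclamation buffer. If the estimated reclaimable amount exceeds
 * vm_reclaim_max_threshold, reclaim entries until it drops back below the
 * threshold.
 */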
kern_return_t
vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes)
{
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;
	size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer, num_reclaimed = 0;
	bool success;
	kern_return_t kr = KERN_SUCCESS;
	if (metadata == NULL) {
		return KERN_INVALID_ARGUMENT;
	}

	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START,
	    task_pid(task), reclaimable_bytes);

	/*
	 * The client is allowed to make this call in parallel from multiple threads.
	 * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer.
	 * If the client's value is smaller than what we've stored, another thread
	 * raced ahead of them and we've already acted on that accounting so this
	 * call should be a no-op.
	 */
	success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer,
	    reclaimable_bytes, acquire,
	    {
		if (num_bytes_in_buffer > reclaimable_bytes) {
		        os_atomic_rmw_loop_give_up(break);
		}
	});
	if (!success) {
		/* Stale value. Nothing new to reclaim */
		goto done;
	}
	num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed);

	if (reclaimable_bytes > num_bytes_reclaimed) {
		estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed;
		if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) {
			lck_mtx_lock(&metadata->vdrm_lock);
			kr = reclaim_entries_from_buffer(metadata,
			    vm_reclaim_max_threshold, &num_reclaimed);
			if (kr != KERN_SUCCESS) {
				/* Lock has already been released & task is in the process of getting killed. */
				goto done;
			}
			lck_mtx_unlock(&metadata->vdrm_lock);
		}
	}

done:
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END,
	    task_pid(task), reclaimable_bytes, num_bytes_reclaimed, num_reclaimed);

	return kr;
}

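/*
 * Translate a reclamation action into the byte threshold to reclaim down to.
 */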
static inline size_t
pick_reclaim_threshold(vm_deferred_reclamation_action_t action)
{
	switch (action) {
	case RECLAIM_FULL:
		return 0;
	case RECLAIM_TRIM:
		return vm_reclaim_max_threshold / vm_reclaim_trim_divisor;
	case RECLAIM_ASYNC:
		return 0;
	}
}

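/*
 * Reclaim from every buffer in the system. RECLAIM_ASYNC drains the async
 * list; RECLAIM_FULL and RECLAIM_TRIM walk the global buffer list and
 * reclaim each buffer down to the threshold chosen by pick_reclaim_threshold().
 */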
void
vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action)
{
	kern_return_t kr;
	size_t num_reclaimed;

	if (action == RECLAIM_ASYNC) {
		lck_mtx_lock(&async_reclamation_buffers_lock);

		process_async_reclamation_list();
		lck_mtx_unlock(&async_reclamation_buffers_lock);
	} else {
		size_t reclaim_threshold = pick_reclaim_threshold(action);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_START,
		    action, reclaim_threshold);
		lck_mtx_lock(&reclamation_buffers_lock);
		reclamation_counter++;
		while (true) {
			vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers);
			if (metadata == NULL) {
				break;
			}
			lck_mtx_lock(&metadata->vdrm_lock);
			if (metadata->vdrm_reclaimed_at >= reclamation_counter) {
				// We've already seen this one. We're done
				lck_mtx_unlock(&metadata->vdrm_lock);
				break;
			}
			metadata->vdrm_reclaimed_at = reclamation_counter;

			TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list);
			TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
			lck_mtx_unlock(&reclamation_buffers_lock);

			kr = reclaim_entries_from_buffer(metadata,
			    reclaim_threshold, &num_reclaimed);
			if (kr == KERN_SUCCESS) {
				lck_mtx_unlock(&metadata->vdrm_lock);
			}

			lck_mtx_lock(&reclamation_buffers_lock);
		}
		lck_mtx_unlock(&reclamation_buffers_lock);
		KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_END,
		    reclamation_counter);
	}
}

void
vm_deferred_reclamation_reclaim_all_memory(void)
{
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
}

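/*
 * Queue a task's reclamation buffer for asynchronous reclamation and wake
 * the reclaim thread. Returns true if the buffer was queued.
 */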
bool
vm_deferred_reclamation_reclaim_from_task_async(task_t task)
{
	bool queued = false;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (metadata != NULL) {
		lck_mtx_lock(&async_reclamation_buffers_lock);
		if (metadata->vdrm_async_list.tqe_next != NULL ||
		    metadata->vdrm_async_list.tqe_prev != NULL) {
			// move this buffer to the tail if still on the async list
			TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		}
		TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		queued = true;
		thread_wakeup(&vm_reclaim_thread);
	}

	return queued;
}

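/*
 * Synchronously reclaim up to `max_entries_to_reclaim` entries from a task's
 * buffer. Returns true if at least one entry was reclaimed.
 */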
bool
vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim)
{
	kern_return_t kr;
	size_t num_reclaimed = 0;
	vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata;

	if (!task_is_active(task)) {
		return false;
	}

	if (metadata != NULL) {
		lck_mtx_lock(&metadata->vdrm_lock);
		while (num_reclaimed < max_entries_to_reclaim) {
			size_t num_reclaimed_now;
			kr = reclaim_chunk(metadata, &num_reclaimed_now);
			if (kr == KERN_NOT_FOUND) {
				// Nothing left to reclaim
				break;
			} else if (kr != KERN_SUCCESS) {
				/* Lock has already been released and task is being killed. */
				return false;
			}
			num_reclaimed += num_reclaimed_now;
		}
		lck_mtx_unlock(&metadata->vdrm_lock);
	}

	return num_reclaimed > 0;
}

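/*
 * Duplicate the parent's reclamation buffer metadata for a newly forked
 * child task. Called with the parent's metadata lock held; the lock is
 * dropped before the new buffer is wired and published.
 */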
vm_deferred_reclamation_metadata_t
vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent)
{
	kern_return_t kr;
	vm_deferred_reclamation_metadata_t metadata = NULL;

	LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED);

	assert(task->deferred_reclamation_metadata == NULL);
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	lck_mtx_unlock(&parent->vdrm_lock);

	kr = vmdr_metadata_wire(metadata);
	if (kr != KERN_SUCCESS) {
		vmdr_metadata_free(metadata);
		return NULL;
	}

	lck_mtx_lock(&reclamation_buffers_lock);
	TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list);
	reclamation_buffers_length++;
	lck_mtx_unlock(&reclamation_buffers_lock);

	return metadata;
}

void
vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_lock(&metadata->vdrm_lock);
}

void
vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata)
{
	lck_mtx_unlock(&metadata->vdrm_lock);
}


static void
reclaim_thread_init(void)
{
#if CONFIG_THREAD_GROUPS
	thread_group_vm_add();
#endif
	thread_set_thread_name(current_thread(), "VM_reclaim");
}


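/*
 * Drain the async reclamation list, fully reclaiming each queued buffer.
 * Called with the async_reclamation_buffers_lock held; the lock is dropped
 * and re-acquired around each buffer.
 */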
static void
process_async_reclamation_list(void)
{
	kern_return_t kr;
	size_t total_entries_reclaimed = 0;
	size_t num_tasks_reclaimed = 0;
	LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED);
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_START);

	vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers);
	while (metadata != NULL) {
		size_t num_reclaimed;
		TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list);
		metadata->vdrm_async_list.tqe_next = NULL;
		metadata->vdrm_async_list.tqe_prev = NULL;
		lck_mtx_lock(&metadata->vdrm_lock);
		lck_mtx_unlock(&async_reclamation_buffers_lock);

		// NB: Currently the async reclaim thread fully reclaims the buffer.
		kr = reclaim_entries_from_buffer(metadata, 0, &num_reclaimed);
		total_entries_reclaimed += num_reclaimed;
		if (kr != KERN_SUCCESS) {
			/* Lock has already been released & task is in the process of getting killed. */
			goto next;
		}
		num_tasks_reclaimed++;
		/* Wakeup anyone waiting on this buffer getting processed */
		thread_wakeup(&metadata->vdrm_async_list);
		assert(current_thread()->map == kernel_map);
		lck_mtx_unlock(&metadata->vdrm_lock);

next:
		lck_mtx_lock(&async_reclamation_buffers_lock);
		metadata = TAILQ_FIRST(&async_reclamation_buffers);
	}
	KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_END,
	    num_tasks_reclaimed, total_entries_reclaimed);
}

__enum_decl(reclaim_thread_state, uint32_t, {
	RECLAIM_THREAD_INIT = 0,
	RECLAIM_THREAD_CONT = 1,
});

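/*
 * Body of the reclaim thread. It runs as a continuation: each pass drains
 * the async list, then blocks until vm_deferred_reclamation_reclaim_from_task_async()
 * issues a wakeup on &vm_reclaim_thread.
 */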
static void
reclaim_thread_continue(void)
{
	lck_mtx_lock(&async_reclamation_buffers_lock);

	process_async_reclamation_list();
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);

	lck_mtx_unlock(&async_reclamation_buffers_lock);
}

void
reclaim_thread(void *param, wait_result_t wr __unused)
{
	if (param == (void *) RECLAIM_THREAD_INIT) {
		reclaim_thread_init();
	} else {
		assert(param == (void *) RECLAIM_THREAD_CONT);
	}

	reclaim_thread_continue();

	(void) thread_block_parameter(reclaim_thread, (void *) RECLAIM_THREAD_CONT);
}

__startup_func
static void
vm_deferred_reclamation_init(void)
{
	// Note: no-op pending rdar://27006343 (Custom kernel log handles)
	vm_reclaim_log_handle = os_log_create("com.apple.mach.vm", "reclaim");

	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
}

STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init);

#if DEVELOPMENT || DEBUG

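/*
 * DEVELOPMENT || DEBUG only helper: block until the given pid's reclamation
 * buffer has been drained by the async reclaim thread.
 */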
bool
vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid)
{
	vm_deferred_reclamation_metadata_t metadata = NULL;
	proc_t p = proc_find(pid);
	vm_map_t map = NULL;
	if (p == NULL) {
		return false;
	}
	task_t t = proc_task(p);
	if (t == NULL) {
		proc_rele(p);
		return false;
	}

	task_lock(t);
	if (t->map) {
		metadata = t->deferred_reclamation_metadata;
		if (metadata != NULL) {
			map = t->map;
			vm_map_reference(t->map);
		}
	}
	task_unlock(t);
	proc_rele(p);
	if (metadata == NULL) {
		return false;
	}

	lck_mtx_lock(&async_reclamation_buffers_lock);
	while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) {
		assert_wait(&metadata->vdrm_async_list, THREAD_UNINT);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
		thread_block(THREAD_CONTINUE_NULL);
		lck_mtx_lock(&async_reclamation_buffers_lock);
	}

	/*
	 * The async reclaim thread first removes the buffer from the list
	 * and then reclaims it (while holding its lock).
	 * So grab the metadata buffer's lock here to ensure the
	 * reclaim is done.
	 */
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
	lck_mtx_unlock(&async_reclamation_buffers_lock);

	vm_map_deallocate(map);
	return true;
}

#endif /* DEVELOPMENT || DEBUG */