1 | /* |
2 | * Copyright (c) 2021 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #include <kern/exc_guard.h> |
30 | #include <kern/locks.h> |
31 | #include <kern/task.h> |
32 | #include <kern/zalloc.h> |
33 | #include <kern/misc_protos.h> |
34 | #include <kern/startup.h> |
35 | #include <kern/sched.h> |
36 | #include <libkern/OSAtomic.h> |
37 | #include <mach/kern_return.h> |
38 | #include <mach/mach_types.h> |
39 | #include <mach/mach_vm.h> |
40 | #include <mach/vm_reclaim.h> |
41 | #include <os/log.h> |
42 | #include <pexpert/pexpert.h> |
43 | #include <vm/vm_map.h> |
44 | #include <vm/vm_map_internal.h> |
45 | #include <vm/vm_reclaim_internal.h> |
46 | #include <sys/kdebug.h> |
47 | #include <sys/queue.h> |
48 | #include <os/atomic_private.h> |
49 | |
50 | #pragma mark Tunables |
51 | TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size" , 16); |
52 | static integer_t kReclaimThreadPriority = BASEPRI_VM; |
53 | // Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation |
54 | TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor" , 2); |
55 | TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults" , "kern.vm_reclaim_max_threshold" , "vm_reclaim_max_threshold" , 0, TUNABLE_DT_NONE); |
56 | // Used to debug vm_reclaim kills |
57 | TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill" , false); |
58 | |
59 | #pragma mark Declarations |
60 | typedef struct proc *proc_t; |
61 | extern char *proc_best_name(proc_t proc); |
62 | extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode); |
63 | struct proc *proc_ref(struct proc *p, int locked); |
64 | int proc_rele(proc_t p); |
65 | static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head); |
66 | static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail); |
67 | static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy); |
68 | |
69 | struct vm_deferred_reclamation_metadata_s { |
70 | TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer |
71 | TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation |
72 | decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */ |
73 | /* |
74 | * The task owns this structure but we maintain a backpointer here |
75 | * so that we can send an exception if we hit an error. |
76 | * Since this is a backpointer we don't hold a reference (it's a weak pointer). |
77 | */ |
78 | task_t vdrm_task; |
79 | vm_map_t vdrm_map; |
80 | user_addr_t vdrm_reclaim_buffer; |
81 | mach_vm_size_t vdrm_buffer_size; |
82 | user_addr_t vdrm_reclaim_indices; |
83 | uint64_t vdrm_reclaimed_at; |
84 | /* |
85 | * These two values represent running sums of bytes placed in the buffer and bytes reclaimed out of the buffer |
86 | * cumulatively. Both values are in terms of virtual memory, so they give an upper bound |
87 | * on the amount of physical memory that can be reclaimed. |
 * To get an estimate of the current amount of VA in the buffer, compute vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed.
89 | * Note that neither value is protected by the vdrm_lock. |
90 | */ |
91 | _Atomic size_t vdrm_num_bytes_put_in_buffer; |
92 | _Atomic size_t vdrm_num_bytes_reclaimed; |
93 | }; |
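/*
 * Example of the accounting above: if vdrm_num_bytes_put_in_buffer is 10MB
 * and vdrm_num_bytes_reclaimed is 6MB, roughly 4MB of reclaimable VA is
 * still queued in the buffer. Since neither counter is synchronized, treat
 * the difference as an estimate only (see reclaim_entries_from_buffer()).
 */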
94 | static void process_async_reclamation_list(void); |
95 | |
96 | extern void *proc_find(int pid); |
97 | extern task_t proc_task(proc_t); |
98 | |
99 | #pragma mark Globals |
100 | static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT); |
101 | static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim" ); |
102 | static os_log_t vm_reclaim_log_handle; |
103 | |
104 | /* |
105 | * The ringbuffer must contain at least 2 entries to distinguish between empty |
106 | * (head == tail) and full (head == tail + 1). |
107 | */ |
108 | #define BUFFER_MIN_ENTRY_COUNT 2 |
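/*
 * Index convention used throughout this file: userspace advances tail as it
 * queues entries and the kernel advances head (and busy) as it reclaims, so
 * entries in [head, tail) are reclaimable and ring slots are addressed
 * modulo the buffer length. For example, in a 4-entry buffer, head == 2 and
 * tail == 5 means slots 2, 3 and 0 are waiting to be reclaimed, while
 * head == tail means the buffer is empty.
 */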
109 | |
110 | /* |
111 | * We maintain two lists of reclamation buffers. |
112 | * The reclamation_buffers list contains every buffer in the system. |
113 | * The async_reclamation_buffers_list contains buffers that are ripe for reclamation. |
114 | * Each list has its own lock. |
115 | */ |
116 | static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers); |
117 | |
118 | static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers); |
119 | /* |
120 | * The reclamation_buffers_lock protects the reclamation_buffers list. |
121 | * It must be held when iterating over the list or manipulating the list. |
122 | * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock. |
123 | */ |
124 | LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp); |
125 | LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp); |
126 | static size_t reclamation_buffers_length; |
127 | static uint64_t reclamation_counter; // generation count for global reclaims |
128 | |
129 | static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread; |
130 | static void reclaim_thread(void *param __unused, wait_result_t wr __unused); |
131 | |
132 | #pragma mark Implementation |
133 | |
134 | /* |
135 | * The current design is not tolerant to faulting on the buffer under the |
136 | * metadata lock. Wire the buffer as a stop-gap solution for now; in the |
137 | * future, the synchronization scheme should be revised to allow the buffer |
138 | * to be pageable (rdar://112039103). |
139 | */ |
140 | |
141 | static kern_return_t |
142 | vmdr_metadata_wire(vm_deferred_reclamation_metadata_t metadata) |
143 | { |
144 | kern_return_t kr; |
145 | vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer - |
146 | offsetof(struct mach_vm_reclaim_buffer_v1_s, entries)); |
147 | vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer + |
148 | metadata->vdrm_buffer_size); |
	kr = vm_map_wire_kernel(metadata->vdrm_map, buffer_start, buffer_end,
150 | VM_PROT_NONE, VM_KERN_MEMORY_OSFMK, TRUE); |
151 | if (kr != KERN_SUCCESS) { |
152 | os_log_error(vm_reclaim_log_handle, |
153 | "vm_reclaim: failed to wire userspace reclaim buffer for pid %d (%d)" , |
154 | task_pid(metadata->vdrm_task), kr); |
155 | } |
156 | return kr; |
157 | } |
158 | |
159 | static kern_return_t |
160 | vmdr_metadata_unwire(vm_deferred_reclamation_metadata_t metadata) |
161 | { |
162 | kern_return_t kr; |
163 | vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer - |
164 | offsetof(struct mach_vm_reclaim_buffer_v1_s, entries)); |
165 | vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer + |
166 | metadata->vdrm_buffer_size); |
	kr = vm_map_unwire(metadata->vdrm_map, buffer_start, buffer_end, TRUE);
168 | if (kr != KERN_SUCCESS) { |
169 | os_log_error(vm_reclaim_log_handle, |
170 | "vm_reclaim: unable to un-wire buffer %p (%llu) for pid %d (%d)" , |
171 | (void *)buffer_start, (buffer_end - buffer_start), |
172 | task_pid(metadata->vdrm_task), kr); |
173 | } |
174 | return kr; |
175 | } |
176 | |
177 | static vm_deferred_reclamation_metadata_t |
178 | vmdr_metadata_alloc( |
179 | task_t task, |
180 | user_addr_t buffer, |
181 | mach_vm_size_t size, |
182 | user_addr_t indices) |
183 | { |
184 | vm_deferred_reclamation_metadata_t metadata; |
185 | vm_map_t map = task->map; |
186 | |
187 | assert(!map->is_nested_map); |
188 | |
189 | metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO); |
	lck_mtx_init(&metadata->vdrm_lock, &vm_reclaim_lock_grp, LCK_ATTR_NULL);
191 | metadata->vdrm_task = task; |
192 | metadata->vdrm_map = map; |
193 | metadata->vdrm_reclaim_buffer = buffer; |
194 | metadata->vdrm_buffer_size = size; |
195 | metadata->vdrm_reclaim_indices = indices; |
196 | |
197 | /* |
198 | * we do not need to hold a lock on `task` because this is called |
199 | * either at fork() time or from the context of current_task(). |
200 | */ |
201 | vm_map_reference(map); |
202 | return metadata; |
203 | } |
204 | |
205 | static void |
206 | vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata) |
207 | { |
	vm_map_deallocate(metadata->vdrm_map);
	lck_mtx_destroy(&metadata->vdrm_lock, &vm_reclaim_lock_grp);
210 | zfree(vm_reclaim_metadata_zone, metadata); |
211 | } |
212 | |
213 | kern_return_t |
214 | vm_deferred_reclamation_buffer_init_internal( |
215 | task_t task, |
216 | mach_vm_offset_t address, |
217 | mach_vm_size_t size) |
218 | { |
219 | kern_return_t kr = KERN_FAILURE, tmp_kr; |
220 | vm_deferred_reclamation_metadata_t metadata = NULL; |
221 | bool success; |
222 | uint64_t head = 0, tail = 0, busy = 0; |
223 | |
224 | if (address == 0 || |
225 | size < (sizeof(struct mach_vm_reclaim_buffer_v1_s) + |
226 | BUFFER_MIN_ENTRY_COUNT * sizeof(mach_vm_reclaim_entry_v1_t)) || |
227 | !VM_MAP_PAGE_ALIGNED(address, VM_MAP_PAGE_MASK(task->map)) || |
228 | !VM_MAP_PAGE_ALIGNED((address + size), VM_MAP_PAGE_MASK(task->map))) { |
229 | return KERN_INVALID_ARGUMENT; |
230 | } |
231 | |
232 | /* vm_reclaim is disabled */ |
233 | if (vm_reclaim_max_threshold == 0) { |
234 | os_log_error(vm_reclaim_log_handle, |
235 | "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)" , |
236 | vm_reclaim_max_threshold); |
237 | return KERN_NOT_SUPPORTED; |
238 | } |
239 | |
240 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START, |
241 | task_pid(task), address, size); |
242 | |
243 | user_addr_t buffer = address + \ |
244 | offsetof(struct mach_vm_reclaim_buffer_v1_s, entries); |
245 | mach_vm_size_t buffer_size = size - \ |
246 | offsetof(struct mach_vm_reclaim_buffer_v1_s, entries); |
247 | user_addr_t indices = address + \ |
248 | offsetof(struct mach_vm_reclaim_buffer_v1_s, indices); |
249 | |
	metadata = vmdr_metadata_alloc(task, buffer, buffer_size, indices);
251 | |
252 | kr = vmdr_metadata_wire(metadata); |
253 | if (kr != KERN_SUCCESS) { |
254 | goto out; |
255 | } |
256 | |
257 | /* |
258 | * Validate the starting indices. |
259 | * |
260 | * NB: At this point it is impossible for another thread to hold a |
261 | * reference to this metadata. However, reclaim_copyin may call reclaim_kill |
262 | * on failure, which assumes the metadata lock is held. |
263 | */ |
	lck_mtx_lock(&metadata->vdrm_lock);
265 | |
	success = reclaim_copyin_busy(metadata, &busy);
267 | if (!success) { |
268 | /* metadata lock has been dropped and exception delivered to task */ |
269 | kr = KERN_INVALID_ARGUMENT; |
270 | goto fail_wired; |
271 | } |
	success = reclaim_copyin_head(metadata, &head);
273 | if (!success) { |
274 | /* metadata lock has been dropped and exception delivered to task */ |
275 | kr = KERN_INVALID_ARGUMENT; |
276 | goto fail_wired; |
277 | } |
	success = reclaim_copyin_tail(metadata, &tail);
279 | if (!success) { |
280 | /* metadata lock has been dropped and exception delivered to task */ |
281 | kr = KERN_INVALID_ARGUMENT; |
282 | goto fail_wired; |
283 | } |
284 | |
	lck_mtx_unlock(&metadata->vdrm_lock);
286 | |
287 | if (head != 0 || tail != 0 || busy != 0) { |
288 | kr = KERN_INVALID_ARGUMENT; |
289 | goto fail_wired; |
290 | } |
291 | |
292 | /* |
293 | * Publish the metadata to the task & global buffer list. This must be |
294 | * done under the task lock to synchronize with task termination - i.e. |
295 | * task_terminate_internal is guaranteed to see the published metadata and |
296 | * tear it down. |
297 | */ |
	lck_mtx_lock(&reclamation_buffers_lock);
299 | task_lock(task); |
300 | |
301 | if (!task_is_active(task) || task_is_halting(task)) { |
302 | os_log_error(vm_reclaim_log_handle, |
303 | "vm_reclaim: failed to initialize buffer on dying task (pid %d)" , task_pid(task)); |
304 | kr = KERN_TERMINATED; |
305 | goto fail_task; |
306 | } else if (task->deferred_reclamation_metadata != NULL) { |
307 | os_log_error(vm_reclaim_log_handle, |
308 | "vm_reclaim: tried to overwrite existing reclaim buffer for pid %d" , task_pid(task)); |
309 | kr = KERN_INVALID_ARGUMENT; |
310 | goto fail_task; |
311 | } |
312 | |
313 | TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list); |
314 | reclamation_buffers_length++; |
315 | |
316 | task->deferred_reclamation_metadata = metadata; |
317 | |
318 | task_unlock(task); |
	lck_mtx_unlock(&reclamation_buffers_lock);
320 | |
321 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END, |
322 | task_pid(task), KERN_SUCCESS); |
323 | return KERN_SUCCESS; |
324 | |
325 | fail_task: |
326 | task_unlock(task); |
	lck_mtx_unlock(&reclamation_buffers_lock);
328 | |
329 | fail_wired: |
330 | tmp_kr = vmdr_metadata_unwire(metadata); |
331 | assert3u(tmp_kr, ==, KERN_SUCCESS); |
332 | |
333 | out: |
334 | vmdr_metadata_free(metadata); |
335 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END, |
336 | task_pid(task), kr); |
337 | return kr; |
338 | } |
339 | |
340 | void |
341 | vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata) |
342 | { |
343 | assert(metadata != NULL); |
344 | /* |
345 | * First remove the buffer from the global list so no one else can get access to it. |
346 | */ |
	lck_mtx_lock(&reclamation_buffers_lock);
348 | TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list); |
349 | reclamation_buffers_length--; |
	lck_mtx_unlock(&reclamation_buffers_lock);
351 | |
352 | /* |
353 | * Now remove it from the async list (if present) |
354 | */ |
	lck_mtx_lock(&async_reclamation_buffers_lock);
356 | if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) { |
357 | TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list); |
358 | metadata->vdrm_async_list.tqe_next = NULL; |
359 | metadata->vdrm_async_list.tqe_prev = NULL; |
360 | } |
	lck_mtx_unlock(&async_reclamation_buffers_lock);
362 | |
363 | // A kernel thread may have grabbed the lock for this buffer before we had |
364 | // a chance to remove it from the queues. Take the metadata lock to ensure |
365 | // any such workers are finished operating on the buffer. |
	lck_mtx_lock(&metadata->vdrm_lock);
	lck_mtx_unlock(&metadata->vdrm_lock);
368 | |
369 | vmdr_metadata_unwire(metadata); |
370 | } |
371 | |
372 | void |
373 | vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata) |
374 | { |
375 | assert(metadata != NULL); |
376 | vmdr_metadata_free(metadata); |
377 | } |
378 | |
379 | static user_addr_t |
380 | get_head_ptr(user_addr_t indices) |
381 | { |
382 | return indices + offsetof(mach_vm_reclaim_indices_v1_t, head); |
383 | } |
384 | |
385 | static user_addr_t |
386 | get_tail_ptr(user_addr_t indices) |
387 | { |
388 | return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail); |
389 | } |
390 | |
391 | static user_addr_t |
392 | get_busy_ptr(user_addr_t indices) |
393 | { |
394 | return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy); |
395 | } |
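/*
 * The helpers above rely on the mach_vm_reclaim_indices_v1_t layout shared
 * with userspace (from the mach/vm_reclaim interface): three 64-bit
 * counters named head, tail, and busy. That is why every access below goes
 * through the 64-bit atomic copyio routines.
 */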
396 | |
397 | static void |
398 | reclaim_kill_with_reason( |
399 | vm_deferred_reclamation_metadata_t metadata, |
400 | unsigned reason, |
401 | mach_exception_data_type_t subcode) |
402 | { |
403 | unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY; |
404 | mach_exception_code_t code = 0; |
405 | task_t task = metadata->vdrm_task; |
406 | proc_t p = NULL; |
407 | boolean_t fatal = TRUE; |
408 | bool killing_self = false; |
409 | pid_t pid; |
410 | int err; |
411 | |
412 | if (panic_on_kill) { |
413 | panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n" , task, reason, subcode); |
414 | } |
415 | |
416 | EXC_GUARD_ENCODE_TYPE(code, guard_type); |
417 | EXC_GUARD_ENCODE_FLAVOR(code, reason); |
418 | EXC_GUARD_ENCODE_TARGET(code, 0); |
419 | |
420 | assert(metadata->vdrm_task != kernel_task); |
421 | killing_self = task == current_task(); |
422 | if (!killing_self) { |
423 | /* |
424 | * Grab a reference on the task to make sure it doesn't go away |
425 | * after we drop the metadata lock |
426 | */ |
427 | task_reference(task); |
428 | } |
429 | /* |
430 | * We need to issue a wakeup in case this kill is coming from the async path. |
431 | * Once we drop the lock the caller can no longer do this wakeup, but |
432 | * if there's someone blocked on this reclaim they hold a map reference |
433 | * and thus need to be woken up so the map can be freed. |
434 | */ |
435 | thread_wakeup(&metadata->vdrm_async_list); |
	lck_mtx_unlock(&metadata->vdrm_lock);
437 | |
438 | if (reason == kGUARD_EXC_DEALLOC_GAP) { |
439 | task_lock(task); |
440 | fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL); |
441 | task_unlock(task); |
442 | } |
443 | |
444 | if (!fatal) { |
445 | os_log_info(vm_reclaim_log_handle, |
446 | "vm_reclaim: Skipping non fatal guard exception.\n" ); |
447 | goto out; |
448 | } |
449 | |
450 | pid = task_pid(task); |
451 | if (killing_self) { |
452 | p = get_bsdtask_info(task); |
453 | } else { |
454 | p = proc_find(pid); |
455 | if (p && proc_task(p) != task) { |
456 | os_log_error(vm_reclaim_log_handle, |
457 | "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n" ); |
458 | goto out; |
459 | } |
460 | |
461 | task_deallocate(task); |
462 | task = NULL; |
463 | } |
464 | |
465 | if (!p) { |
466 | os_log_error(vm_reclaim_log_handle, |
467 | "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n" ); |
468 | goto out; |
469 | } |
470 | |
471 | err = exit_with_guard_exception(p, code, subcode); |
472 | if (err != 0) { |
473 | os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n" , p, err); |
474 | } |
475 | out: |
476 | if (!killing_self) { |
477 | if (p) { |
478 | proc_rele(p); |
479 | p = NULL; |
480 | } |
481 | if (task) { |
482 | task_deallocate(task); |
483 | task = NULL; |
484 | } |
485 | } |
486 | } |
487 | |
488 | static void |
489 | reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result) |
490 | { |
	reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_COPYIO_FAILURE, result);
492 | } |
493 | |
494 | /* |
495 | * Helper functions to do copyio on the head, tail, and busy pointers. |
496 | * Note that the kernel will only write to the busy and head pointers. |
497 | * Userspace is not supposed to write to the head or busy pointers, but the kernel |
498 | * must be resilient to that kind of bug in userspace. |
499 | */ |
500 | |
501 | |
502 | static bool |
503 | reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head) |
504 | { |
505 | int result; |
506 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
507 | user_addr_t head_ptr = get_head_ptr(indices); |
508 | |
	result = copyin_atomic64(head_ptr, head);
510 | |
511 | if (result != 0) { |
512 | os_log_error(vm_reclaim_log_handle, |
513 | "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n" , head_ptr, result); |
514 | reclaim_handle_copyio_error(metadata, result); |
515 | return false; |
516 | } |
517 | return true; |
518 | } |
519 | |
520 | static bool |
521 | reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail) |
522 | { |
523 | int result; |
524 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
525 | user_addr_t tail_ptr = get_tail_ptr(indices); |
526 | |
	result = copyin_atomic64(tail_ptr, tail);
528 | |
529 | if (result != 0) { |
530 | os_log_error(vm_reclaim_log_handle, |
531 | "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n" , tail_ptr, result); |
532 | reclaim_handle_copyio_error(metadata, result); |
533 | return false; |
534 | } |
535 | return true; |
536 | } |
537 | |
538 | static bool |
539 | reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy) |
540 | { |
541 | int result; |
542 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
543 | user_addr_t busy_ptr = get_busy_ptr(indices); |
544 | |
	result = copyin_atomic64(busy_ptr, busy);
546 | |
547 | if (result != 0) { |
548 | os_log_error(vm_reclaim_log_handle, |
549 | "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n" , busy_ptr, result); |
550 | reclaim_handle_copyio_error(metadata, result); |
551 | return false; |
552 | } |
553 | return true; |
554 | } |
555 | |
556 | static bool |
557 | reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value) |
558 | { |
559 | int result; |
560 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
561 | user_addr_t busy_ptr = get_busy_ptr(indices); |
562 | |
	result = copyout_atomic64(value, busy_ptr);
564 | |
565 | if (result != 0) { |
566 | os_log_error(vm_reclaim_log_handle, |
567 | "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n" , value, busy_ptr, result); |
568 | reclaim_handle_copyio_error(metadata, result); |
569 | return false; |
570 | } |
571 | return true; |
572 | } |
573 | |
574 | static bool |
575 | reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value) |
576 | { |
577 | int result; |
578 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
579 | user_addr_t head_ptr = get_head_ptr(indices); |
580 | |
	result = copyout_atomic64(value, head_ptr);
582 | |
583 | if (result != 0) { |
584 | os_log_error(vm_reclaim_log_handle, |
585 | "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n" , value, head_ptr, result); |
586 | reclaim_handle_copyio_error(metadata, result); |
587 | return false; |
588 | } |
589 | return true; |
590 | } |
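/*
 * reclaim_chunk() below combines these helpers into a simple claim/publish
 * protocol on the ring: the kernel advertises the range it intends to take
 * by storing busy = head + n, issues a full memory fence, and re-reads tail
 * to detect entries that userspace may be concurrently reusing; only after
 * the claimed entries have been copied in and acted upon does it publish
 * the new head.
 */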
591 | |
592 | /* |
593 | * Reclaim a chunk (kReclaimChunkSize entries) from the buffer. |
594 | * |
595 | * Writes the number of entries reclaimed to `num_reclaimed_out`. Note that |
596 | * there may be zero reclaimable entries in the chunk (they have all been |
597 | * re-used by userspace). |
598 | * |
599 | * Returns: |
600 | * - KERN_NOT_FOUND if the buffer has been exhausted (head == tail) |
601 | * - KERN_FAILURE on failure to reclaim -- metadata lock will be dropped |
602 | * before returning |
603 | */ |
604 | static kern_return_t |
605 | reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, size_t *num_reclaimed_out) |
606 | { |
607 | assert(metadata != NULL); |
608 | LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED); |
609 | int result = 0; |
610 | size_t num_reclaimed = 0; |
611 | uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0; |
612 | user_addr_t indices; |
613 | vm_map_t map = metadata->vdrm_map, old_map; |
614 | mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize]; |
615 | bool success; |
616 | |
617 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START, |
618 | task_pid(metadata->vdrm_task), kReclaimChunkSize); |
619 | |
620 | buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t); |
621 | |
	memset(reclaim_entries, 0, sizeof(reclaim_entries));
623 | |
624 | indices = (user_addr_t) metadata->vdrm_reclaim_indices; |
625 | old_map = vm_map_switch(map); |
626 | |
	success = reclaim_copyin_busy(metadata, &busy);
628 | if (!success) { |
629 | goto fail; |
630 | } |
	success = reclaim_copyin_head(metadata, &head);
632 | if (!success) { |
633 | goto fail; |
634 | } |
	success = reclaim_copyin_tail(metadata, &tail);
636 | if (!success) { |
637 | goto fail; |
638 | } |
639 | |
640 | if (busy != head) { |
641 | // Userspace overwrote one of the pointers |
642 | os_log_error(vm_reclaim_log_handle, |
643 | "vm_reclaim: Userspace modified head or busy pointer! head: %llu (0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n" , |
644 | head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices)); |
		reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_INDEX_FAILURE, busy);
646 | goto fail; |
647 | } |
648 | |
649 | if (tail < head) { |
650 | // Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation |
651 | os_log_error(vm_reclaim_log_handle, |
652 | "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n" , |
653 | head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices)); |
		lck_mtx_unlock(&metadata->vdrm_lock);
655 | goto fail; |
656 | } |
657 | |
658 | num_to_reclaim = tail - head; |
659 | while (true) { |
660 | num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize); |
661 | if (num_to_reclaim == 0) { |
662 | break; |
663 | } |
664 | busy = head + num_to_reclaim; |
		success = reclaim_copyout_busy(metadata, busy);
666 | if (!success) { |
667 | goto fail; |
668 | } |
669 | os_atomic_thread_fence(seq_cst); |
		success = reclaim_copyin_tail(metadata, &new_tail);
671 | if (!success) { |
672 | goto fail; |
673 | } |
674 | |
675 | if (new_tail >= busy) { |
676 | /* Got num_to_reclaim entries */ |
677 | break; |
678 | } |
679 | tail = new_tail; |
680 | if (tail < head) { |
681 | // Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation |
682 | os_log_error(vm_reclaim_log_handle, |
683 | "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n" , |
684 | head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices)); |
			lck_mtx_unlock(&metadata->vdrm_lock);
686 | goto fail; |
687 | } |
688 | /* Can't reclaim these entries. Try again */ |
689 | num_to_reclaim = tail - head; |
690 | if (num_to_reclaim == 0) { |
691 | /* Nothing left to reclaim. Reset busy to head. */ |
			success = reclaim_copyout_busy(metadata, head);
693 | if (!success) { |
694 | goto fail; |
695 | } |
696 | break; |
697 | } |
698 | /* |
699 | * Note that num_to_reclaim must have gotten smaller since tail got smaller, |
		 * so this is guaranteed to converge.
701 | */ |
702 | } |
703 | |
704 | while (num_copied < num_to_reclaim) { |
705 | uint64_t memcpy_start_idx = (head % buffer_len); |
706 | uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied; |
707 | // Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop. |
708 | memcpy_end_idx = MIN(memcpy_end_idx, buffer_len); |
709 | uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx; |
710 | |
711 | assert(num_to_copy + num_copied <= kReclaimChunkSize); |
712 | user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t); |
713 | mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied; |
714 | |
715 | result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t)); |
716 | |
717 | if (result != 0) { |
718 | os_log_error(vm_reclaim_log_handle, |
719 | "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n" , |
720 | num_to_copy, src_ptr, (uint64_t) dst_ptr, result); |
721 | reclaim_handle_copyio_error(metadata, result); |
722 | goto fail; |
723 | } |
724 | |
725 | num_copied += num_to_copy; |
726 | head += num_to_copy; |
727 | } |
728 | |
729 | for (size_t i = 0; i < num_to_reclaim; i++) { |
730 | mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i]; |
731 | KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START, |
732 | task_pid(metadata->vdrm_task), entry->address, entry->size, |
733 | entry->behavior); |
734 | DTRACE_VM4(vm_reclaim_chunk, |
735 | int, task_pid(metadata->vdrm_task), |
736 | mach_vm_address_t, entry->address, |
737 | size_t, entry->size, |
738 | mach_vm_reclaim_behavior_v1_t, entry->behavior); |
739 | if (entry->address != 0 && entry->size != 0) { |
740 | kern_return_t kr; |
741 | switch (entry->behavior) { |
742 | case MACH_VM_RECLAIM_DEALLOCATE: |
743 | kr = vm_map_remove_guard(map, |
744 | vm_map_trunc_page(entry->address, |
745 | VM_MAP_PAGE_MASK(map)), |
746 | vm_map_round_page(entry->address + entry->size, |
747 | VM_MAP_PAGE_MASK(map)), |
				    VM_MAP_REMOVE_GAPS_FAIL,
749 | KMEM_GUARD_NONE).kmr_return; |
750 | if (kr == KERN_INVALID_VALUE) { |
					reclaim_kill_with_reason(metadata, kGUARD_EXC_DEALLOC_GAP, entry->address);
752 | goto fail; |
753 | } else if (kr != KERN_SUCCESS) { |
754 | os_log_error(vm_reclaim_log_handle, |
755 | "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx err=%d\n" , |
756 | entry->address, entry->size, (uint64_t) map, kr); |
					reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
758 | goto fail; |
759 | } |
760 | break; |
761 | case MACH_VM_RECLAIM_REUSABLE: |
762 | kr = vm_map_behavior_set(map, |
763 | vm_map_trunc_page(entry->address, VM_MAP_PAGE_MASK(map)), |
764 | vm_map_round_page(entry->address + entry->size, VM_MAP_PAGE_MASK(map)), |
765 | VM_BEHAVIOR_REUSABLE); |
766 | if (kr != KERN_SUCCESS) { |
767 | os_log_error(vm_reclaim_log_handle, |
768 | "vm_reclaim: unable to free(reusable) 0x%llx (%u) for pid %d err=%d\n" , |
769 | entry->address, entry->size, task_pid(metadata->vdrm_task), kr); |
770 | } |
771 | break; |
772 | default: |
773 | os_log_error(vm_reclaim_log_handle, |
774 | "vm_reclaim: attempted to reclaim entry with unsupported behavior %uh" , |
775 | entry->behavior); |
				reclaim_kill_with_reason(metadata, kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, kr);
777 | goto fail; |
778 | } |
779 | num_reclaimed++; |
780 | os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed); |
781 | KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END, |
782 | task_pid(metadata->vdrm_task), entry->address); |
783 | } |
784 | } |
785 | |
	success = reclaim_copyout_head(metadata, head);
787 | if (!success) { |
788 | goto fail; |
789 | } |
790 | |
	vm_map_switch(old_map);
792 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END, |
793 | task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, true); |
794 | *num_reclaimed_out = num_reclaimed; |
795 | if (num_to_reclaim == 0) { |
796 | // We have exhausted the reclaimable portion of the buffer |
797 | return KERN_NOT_FOUND; |
798 | } |
799 | return KERN_SUCCESS; |
800 | fail: |
	vm_map_switch(old_map);
802 | *num_reclaimed_out = num_reclaimed; |
803 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END, |
804 | task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, false); |
805 | return KERN_FAILURE; |
806 | } |
807 | |
808 | /* |
809 | * Attempts to reclaim until the buffer's estimated number of available bytes |
810 | * is <= num_bytes_reclaimable_threshold. The metadata buffer lock should be |
811 | * held by the caller. |
812 | * |
813 | * Writes the number of entries reclaimed to `num_reclaimed_out`. |
814 | */ |
815 | static kern_return_t |
816 | reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata, |
817 | size_t num_bytes_reclaimable_threshold, size_t *num_reclaimed_out) |
818 | { |
819 | assert(metadata != NULL); |
820 | assert(num_reclaimed_out != NULL); |
821 | LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED); |
	if (!task_is_active(metadata->vdrm_task)) {
823 | /* |
824 | * If the task is exiting, the reclaim below will likely fail and fall through |
825 | * to the (slower) error path. |
826 | * So as an optimization, we bail out early here. |
827 | */ |
		*num_reclaimed_out = 0;
		return KERN_SUCCESS;
829 | } |
830 | |
831 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_START, task_pid(metadata->vdrm_task)); |
832 | |
833 | size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes; |
834 | while (true) { |
835 | kern_return_t kr; |
836 | size_t curr_entries_reclaimed = 0; |
837 | num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed); |
838 | reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed); |
839 | if (num_bytes_reclaimed > reclaimable_bytes) { |
840 | estimated_reclaimable_bytes = 0; |
841 | } else { |
842 | estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed; |
843 | } |
		if (estimated_reclaimable_bytes <= num_bytes_reclaimable_threshold) {
845 | break; |
846 | } |
		kr = reclaim_chunk(metadata, &curr_entries_reclaimed);
848 | if (kr == KERN_NOT_FOUND) { |
849 | break; |
850 | } else if (kr != KERN_SUCCESS) { |
851 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END, |
852 | task_pid(metadata->vdrm_task), num_entries_reclaimed, |
853 | estimated_reclaimable_bytes, kr); |
854 | *num_reclaimed_out = num_entries_reclaimed; |
855 | return kr; |
856 | } |
857 | num_entries_reclaimed += curr_entries_reclaimed; |
858 | } |
859 | |
860 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END, |
861 | task_pid(metadata->vdrm_task), num_entries_reclaimed, |
862 | estimated_reclaimable_bytes, KERN_SUCCESS); |
863 | *num_reclaimed_out = num_entries_reclaimed; |
864 | return KERN_SUCCESS; |
865 | } |
866 | |
867 | /* |
868 | * Get the reclamation metadata buffer for the given map. |
869 | * If the buffer exists it is returned locked. |
870 | */ |
871 | static vm_deferred_reclamation_metadata_t |
872 | get_task_reclaim_metadata(task_t task) |
873 | { |
874 | assert(task != NULL); |
875 | vm_deferred_reclamation_metadata_t metadata = NULL; |
876 | task_lock(task); |
877 | metadata = task->deferred_reclamation_metadata; |
878 | if (metadata != NULL) { |
		lck_mtx_lock(&metadata->vdrm_lock);
880 | } |
881 | task_unlock(task); |
882 | return metadata; |
883 | } |
884 | |
885 | kern_return_t |
886 | vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim) |
887 | { |
888 | kern_return_t kr; |
889 | vm_deferred_reclamation_metadata_t metadata = NULL; |
890 | size_t total_reclaimed = 0; |
891 | |
892 | if (!task_is_active(task)) { |
893 | return KERN_FAILURE; |
894 | } |
895 | |
896 | metadata = get_task_reclaim_metadata(task); |
897 | if (metadata == NULL) { |
898 | return KERN_INVALID_ARGUMENT; |
899 | } |
900 | |
901 | while (total_reclaimed < num_entries_to_reclaim) { |
902 | size_t num_reclaimed; |
		kr = reclaim_chunk(metadata, &num_reclaimed);
904 | if (kr == KERN_NOT_FOUND) { |
905 | /* buffer has been fully reclaimed from */ |
906 | break; |
907 | } else if (kr != KERN_SUCCESS) { |
908 | /* Lock has already been released and task is being killed. */ |
909 | return kr; |
910 | } |
911 | |
912 | total_reclaimed += num_reclaimed; |
913 | } |
	lck_mtx_unlock(&metadata->vdrm_lock);
915 | |
916 | return KERN_SUCCESS; |
917 | } |
918 | |
919 | kern_return_t |
920 | vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes) |
921 | { |
922 | vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata; |
923 | size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer, num_reclaimed = 0; |
924 | bool success; |
925 | kern_return_t kr = KERN_SUCCESS; |
926 | if (metadata == NULL) { |
927 | return KERN_INVALID_ARGUMENT; |
928 | } |
929 | |
930 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START, |
931 | task_pid(task), reclaimable_bytes); |
932 | |
933 | /* |
934 | * The client is allowed to make this call in parallel from multiple threads. |
935 | * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer. |
936 | * If the client's value is smaller than what we've stored, another thread |
937 | * raced ahead of them and we've already acted on that accounting so this |
938 | * call should be a no-op. |
939 | */ |
940 | success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer, |
941 | reclaimable_bytes, acquire, |
942 | { |
943 | if (num_bytes_in_buffer > reclaimable_bytes) { |
944 | os_atomic_rmw_loop_give_up(break); |
945 | } |
946 | }); |
947 | if (!success) { |
948 | /* Stale value. Nothing new to reclaim */ |
949 | goto done; |
950 | } |
951 | num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed); |
952 | |
953 | if (reclaimable_bytes > num_bytes_reclaimed) { |
954 | estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed; |
955 | if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) { |
			lck_mtx_lock(&metadata->vdrm_lock);
			kr = reclaim_entries_from_buffer(metadata,
			    vm_reclaim_max_threshold, &num_reclaimed);
959 | if (kr != KERN_SUCCESS) { |
960 | /* Lock has already been released & task is in the process of getting killed. */ |
961 | goto done; |
962 | } |
			lck_mtx_unlock(&metadata->vdrm_lock);
964 | } |
965 | } |
966 | |
967 | done: |
968 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END, |
969 | task_pid(task), reclaimable_bytes, num_bytes_reclaimed, num_reclaimed); |
970 | |
971 | return kr; |
972 | } |
973 | |
974 | static inline size_t |
975 | pick_reclaim_threshold(vm_deferred_reclamation_action_t action) |
976 | { |
977 | switch (action) { |
978 | case RECLAIM_FULL: |
979 | return 0; |
980 | case RECLAIM_TRIM: |
981 | return vm_reclaim_max_threshold / vm_reclaim_trim_divisor; |
982 | case RECLAIM_ASYNC: |
983 | return 0; |
984 | } |
985 | } |
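/*
 * With the default tunables (vm_reclaim_trim_divisor == 2), a RECLAIM_TRIM
 * pass stops once the estimated reclaimable bytes drop to half of
 * vm_reclaim_max_threshold, whereas RECLAIM_FULL and RECLAIM_ASYNC use a
 * threshold of zero and drain the buffer completely.
 */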
986 | |
987 | void |
988 | vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action) |
989 | { |
990 | kern_return_t kr; |
991 | size_t num_reclaimed; |
992 | |
993 | if (action == RECLAIM_ASYNC) { |
		lck_mtx_lock(&async_reclamation_buffers_lock);
995 | |
996 | process_async_reclamation_list(); |
		lck_mtx_unlock(&async_reclamation_buffers_lock);
998 | } else { |
999 | size_t reclaim_threshold = pick_reclaim_threshold(action); |
1000 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_START, |
1001 | action, reclaim_threshold); |
		lck_mtx_lock(&reclamation_buffers_lock);
1003 | reclamation_counter++; |
1004 | while (true) { |
1005 | vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers); |
1006 | if (metadata == NULL) { |
1007 | break; |
1008 | } |
			lck_mtx_lock(&metadata->vdrm_lock);
1010 | if (metadata->vdrm_reclaimed_at >= reclamation_counter) { |
1011 | // We've already seen this one. We're done |
				lck_mtx_unlock(&metadata->vdrm_lock);
1013 | break; |
1014 | } |
1015 | metadata->vdrm_reclaimed_at = reclamation_counter; |
1016 | |
1017 | TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list); |
1018 | TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list); |
			lck_mtx_unlock(&reclamation_buffers_lock);
1020 | |
			kr = reclaim_entries_from_buffer(metadata,
			    reclaim_threshold, &num_reclaimed);
1023 | if (kr == KERN_SUCCESS) { |
				lck_mtx_unlock(&metadata->vdrm_lock);
1025 | } |
1026 | |
			lck_mtx_lock(&reclamation_buffers_lock);
1028 | } |
		lck_mtx_unlock(&reclamation_buffers_lock);
1030 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_END, |
1031 | reclamation_counter); |
1032 | } |
1033 | } |
1034 | |
1035 | void |
1036 | vm_deferred_reclamation_reclaim_all_memory(void) |
1037 | { |
	vm_deferred_reclamation_reclaim_memory(RECLAIM_FULL);
1039 | } |
1040 | |
1041 | bool |
1042 | vm_deferred_reclamation_reclaim_from_task_async(task_t task) |
1043 | { |
1044 | bool queued = false; |
1045 | vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata; |
1046 | |
1047 | if (metadata != NULL) { |
		lck_mtx_lock(&async_reclamation_buffers_lock);
1049 | if (metadata->vdrm_async_list.tqe_next != NULL || |
1050 | metadata->vdrm_async_list.tqe_prev != NULL) { |
1051 | // move this buffer to the tail if still on the async list |
1052 | TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list); |
1053 | } |
1054 | TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list); |
		lck_mtx_unlock(&async_reclamation_buffers_lock);
1056 | queued = true; |
1057 | thread_wakeup(&vm_reclaim_thread); |
1058 | } |
1059 | |
1060 | return queued; |
1061 | } |
1062 | |
1063 | bool |
1064 | vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim) |
1065 | { |
1066 | kern_return_t kr; |
1067 | size_t num_reclaimed = 0; |
1068 | vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata; |
1069 | |
1070 | if (!task_is_active(task)) { |
1071 | return false; |
1072 | } |
1073 | |
1074 | if (metadata != NULL) { |
		lck_mtx_lock(&metadata->vdrm_lock);
1076 | while (num_reclaimed < max_entries_to_reclaim) { |
1077 | size_t num_reclaimed_now; |
			kr = reclaim_chunk(metadata, &num_reclaimed_now);
1079 | if (kr == KERN_NOT_FOUND) { |
1080 | // Nothing left to reclaim |
1081 | break; |
1082 | } else if (kr != KERN_SUCCESS) { |
1083 | /* Lock has already been released and task is being killed. */ |
1084 | return false; |
1085 | } |
1086 | num_reclaimed += num_reclaimed_now; |
1087 | } |
		lck_mtx_unlock(&metadata->vdrm_lock);
1089 | } |
1090 | |
1091 | return num_reclaimed > 0; |
1092 | } |
1093 | |
1094 | vm_deferred_reclamation_metadata_t |
1095 | vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent) |
1096 | { |
1097 | kern_return_t kr; |
1098 | vm_deferred_reclamation_metadata_t metadata = NULL; |
1099 | |
1100 | LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED); |
1101 | |
1102 | assert(task->deferred_reclamation_metadata == NULL); |
	metadata = vmdr_metadata_alloc(task, parent->vdrm_reclaim_buffer,
	    parent->vdrm_buffer_size, parent->vdrm_reclaim_indices);
	lck_mtx_unlock(&parent->vdrm_lock);
1106 | |
1107 | kr = vmdr_metadata_wire(metadata); |
1108 | if (kr != KERN_SUCCESS) { |
1109 | vmdr_metadata_free(metadata); |
1110 | return NULL; |
1111 | } |
1112 | |
	lck_mtx_lock(&reclamation_buffers_lock);
1114 | TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list); |
1115 | reclamation_buffers_length++; |
	lck_mtx_unlock(&reclamation_buffers_lock);
1117 | |
1118 | return metadata; |
1119 | } |
1120 | |
1121 | void |
1122 | vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata) |
1123 | { |
	lck_mtx_lock(&metadata->vdrm_lock);
1125 | } |
1126 | |
1127 | void |
1128 | vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata) |
1129 | { |
	lck_mtx_unlock(&metadata->vdrm_lock);
1131 | } |
1132 | |
1133 | |
1134 | static void |
1135 | reclaim_thread_init(void) |
1136 | { |
1137 | #if CONFIG_THREAD_GROUPS |
1138 | thread_group_vm_add(); |
1139 | #endif |
	thread_set_thread_name(current_thread(), "VM_reclaim");
1141 | } |
1142 | |
1143 | |
1144 | static void |
1145 | process_async_reclamation_list(void) |
1146 | { |
1147 | kern_return_t kr; |
1148 | size_t total_entries_reclaimed = 0; |
1149 | size_t num_tasks_reclaimed = 0; |
1150 | LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED); |
1151 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_START); |
1152 | |
1153 | vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers); |
1154 | while (metadata != NULL) { |
1155 | size_t num_reclaimed; |
1156 | TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list); |
1157 | metadata->vdrm_async_list.tqe_next = NULL; |
1158 | metadata->vdrm_async_list.tqe_prev = NULL; |
		lck_mtx_lock(&metadata->vdrm_lock);
		lck_mtx_unlock(&async_reclamation_buffers_lock);
1161 | |
1162 | // NB: Currently the async reclaim thread fully reclaims the buffer. |
		kr = reclaim_entries_from_buffer(metadata, 0, &num_reclaimed);
1164 | total_entries_reclaimed += num_reclaimed; |
1165 | if (kr != KERN_SUCCESS) { |
1166 | /* Lock has already been released & task is in the process of getting killed. */ |
1167 | goto next; |
1168 | } |
1169 | num_tasks_reclaimed++; |
1170 | /* Wakeup anyone waiting on this buffer getting processed */ |
1171 | thread_wakeup(&metadata->vdrm_async_list); |
1172 | assert(current_thread()->map == kernel_map); |
		lck_mtx_unlock(&metadata->vdrm_lock);
1174 | |
1175 | next: |
		lck_mtx_lock(&async_reclamation_buffers_lock);
1177 | metadata = TAILQ_FIRST(&async_reclamation_buffers); |
1178 | } |
1179 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_END, |
1180 | num_tasks_reclaimed, total_entries_reclaimed); |
1181 | } |
1182 | |
1183 | __enum_decl(reclaim_thread_state, uint32_t, { |
1184 | RECLAIM_THREAD_INIT = 0, |
1185 | RECLAIM_THREAD_CONT = 1, |
1186 | }); |
1187 | |
1188 | static void |
1189 | reclaim_thread_continue(void) |
1190 | { |
	lck_mtx_lock(&async_reclamation_buffers_lock);
1192 | |
1193 | process_async_reclamation_list(); |
	assert_wait(&vm_reclaim_thread, THREAD_UNINT);
1195 | |
	lck_mtx_unlock(&async_reclamation_buffers_lock);
1197 | } |
1198 | |
1199 | void |
1200 | reclaim_thread(void *param, wait_result_t wr __unused) |
1201 | { |
1202 | if (param == (void *) RECLAIM_THREAD_INIT) { |
1203 | reclaim_thread_init(); |
1204 | } else { |
1205 | assert(param == (void *) RECLAIM_THREAD_CONT); |
1206 | } |
1207 | |
1208 | reclaim_thread_continue(); |
1209 | |
	(void) thread_block_parameter(reclaim_thread, (void *) RECLAIM_THREAD_CONT);
1211 | } |
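/*
 * The reclaim thread is continuation-based: each pass drains the async list,
 * queues an assert_wait() on &vm_reclaim_thread, and blocks with
 * reclaim_thread as its own continuation, so the wakeup issued by
 * vm_deferred_reclamation_reclaim_from_task_async() restarts it at the top
 * of reclaim_thread() with RECLAIM_THREAD_CONT.
 */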
1212 | |
1213 | __startup_func |
1214 | static void |
1215 | vm_deferred_reclamation_init(void) |
1216 | { |
1217 | // Note: no-op pending rdar://27006343 (Custom kernel log handles) |
	vm_reclaim_log_handle = os_log_create("com.apple.mach.vm", "reclaim");
1219 | |
	(void)kernel_thread_start_priority(reclaim_thread,
	    (void *)RECLAIM_THREAD_INIT, kReclaimThreadPriority,
	    &vm_reclaim_thread);
1223 | } |
1224 | |
1225 | STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init); |
1226 | |
1227 | #if DEVELOPMENT || DEBUG |
1228 | |
1229 | bool |
1230 | vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid) |
1231 | { |
1232 | vm_deferred_reclamation_metadata_t metadata = NULL; |
1233 | proc_t p = proc_find(pid); |
1234 | vm_map_t map = NULL; |
1235 | if (p == NULL) { |
1236 | return false; |
1237 | } |
1238 | task_t t = proc_task(p); |
1239 | if (t == NULL) { |
1240 | proc_rele(p); |
1241 | return false; |
1242 | } |
1243 | |
1244 | task_lock(t); |
1245 | if (t->map) { |
1246 | metadata = t->deferred_reclamation_metadata; |
1247 | if (metadata != NULL) { |
1248 | map = t->map; |
1249 | vm_map_reference(t->map); |
1250 | } |
1251 | } |
1252 | task_unlock(t); |
1253 | proc_rele(p); |
1254 | if (metadata == NULL) { |
1255 | return false; |
1256 | } |
1257 | |
1258 | lck_mtx_lock(&async_reclamation_buffers_lock); |
1259 | while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) { |
1260 | assert_wait(&metadata->vdrm_async_list, THREAD_UNINT); |
1261 | lck_mtx_unlock(&async_reclamation_buffers_lock); |
1262 | thread_block(THREAD_CONTINUE_NULL); |
1263 | lck_mtx_lock(&async_reclamation_buffers_lock); |
1264 | } |
1265 | |
1266 | /* |
1267 | * The async reclaim thread first removes the buffer from the list |
1268 | * and then reclaims it (while holding its lock). |
1269 | * So grab the metadata buffer's lock here to ensure the |
1270 | * reclaim is done. |
1271 | */ |
1272 | lck_mtx_lock(&metadata->vdrm_lock); |
1273 | lck_mtx_unlock(&metadata->vdrm_lock); |
1274 | lck_mtx_unlock(&async_reclamation_buffers_lock); |
1275 | |
1276 | vm_map_deallocate(map); |
1277 | return true; |
1278 | } |
1279 | |
1280 | #endif /* DEVELOPMENT || DEBUG */ |
1281 | |