| 1 | /* |
| 2 | * Copyright (c) 2021 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | |
| 29 | #include <kern/exc_guard.h> |
| 30 | #include <kern/locks.h> |
| 31 | #include <kern/task.h> |
| 32 | #include <kern/zalloc.h> |
| 33 | #include <kern/misc_protos.h> |
| 34 | #include <kern/startup.h> |
| 35 | #include <kern/sched.h> |
| 36 | #include <libkern/OSAtomic.h> |
| 37 | #include <mach/kern_return.h> |
| 38 | #include <mach/mach_types.h> |
| 39 | #include <mach/mach_vm.h> |
| 40 | #include <mach/vm_reclaim.h> |
| 41 | #include <os/log.h> |
| 42 | #include <pexpert/pexpert.h> |
| 43 | #include <vm/vm_map.h> |
| 44 | #include <vm/vm_map_internal.h> |
| 45 | #include <vm/vm_reclaim_internal.h> |
| 46 | #include <sys/kdebug.h> |
| 47 | #include <sys/queue.h> |
| 48 | #include <os/atomic_private.h> |
| 49 | |
| 50 | #pragma mark Tunables |
| 51 | TUNABLE(uint32_t, kReclaimChunkSize, "vm_reclaim_chunk_size" , 16); |
| 52 | static integer_t kReclaimThreadPriority = BASEPRI_VM; |
| 53 | // Reclaim down to vm_reclaim_max_threshold / vm_reclaim_trim_divisor when doing a trim reclaim operation |
| 54 | TUNABLE_DEV_WRITEABLE(uint64_t, vm_reclaim_trim_divisor, "vm_reclaim_trim_divisor" , 2); |
| 55 | TUNABLE_DT_DEV_WRITEABLE(uint64_t, vm_reclaim_max_threshold, "/defaults" , "kern.vm_reclaim_max_threshold" , "vm_reclaim_max_threshold" , 0, TUNABLE_DT_NONE); |
| 56 | // Used to debug vm_reclaim kills |
| 57 | TUNABLE(bool, panic_on_kill, "vm_reclaim_panic_on_kill" , false); |
| 58 | |
| 59 | #pragma mark Declarations |
| 60 | typedef struct proc *proc_t; |
| 61 | extern char *proc_best_name(proc_t proc); |
| 62 | extern int exit_with_guard_exception(void *p, mach_exception_data_type_t code, mach_exception_data_type_t subcode); |
| 63 | struct proc *proc_ref(struct proc *p, int locked); |
| 64 | int proc_rele(proc_t p); |
| 65 | static bool reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head); |
| 66 | static bool reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail); |
| 67 | static bool reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy); |
| 68 | |
| 69 | struct vm_deferred_reclamation_metadata_s { |
| 70 | TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_list; // Global list containing every reclamation buffer |
| 71 | TAILQ_ENTRY(vm_deferred_reclamation_metadata_s) vdrm_async_list; // A list containing buffers that are ripe for reclamation |
| 72 | decl_lck_mtx_data(, vdrm_lock); /* Held when reclaiming from the buffer */ |
| 73 | /* |
| 74 | * The task owns this structure but we maintain a backpointer here |
| 75 | * so that we can send an exception if we hit an error. |
| 76 | * Since this is a backpointer we don't hold a reference (it's a weak pointer). |
| 77 | */ |
| 78 | task_t vdrm_task; |
| 79 | vm_map_t vdrm_map; |
| 80 | user_addr_t vdrm_reclaim_buffer; |
| 81 | mach_vm_size_t vdrm_buffer_size; |
| 82 | user_addr_t vdrm_reclaim_indices; |
| 83 | uint64_t vdrm_reclaimed_at; |
| 84 | /* |
| 85 | * These two values are cumulative running sums: bytes placed in the buffer and bytes reclaimed out of it. |
| 86 | * Both values are in terms of virtual memory, so they give an upper bound |
| 87 | * on the amount of physical memory that can be reclaimed. |
| 88 | * To estimate the amount of VA currently in the buffer, compute vdrm_num_bytes_put_in_buffer - vdrm_num_bytes_reclaimed |
| 89 | * (see the illustrative helper after this struct). Note that neither value is protected by the vdrm_lock. |
| 90 | */ |
| 91 | _Atomic size_t vdrm_num_bytes_put_in_buffer; |
| 92 | _Atomic size_t vdrm_num_bytes_reclaimed; |
| 93 | }; |
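| | |
| | /* |
| | * Illustrative sketch (not used by the implementation): because the two counters |
| | * above are updated outside the vdrm_lock, readers sample both and clamp at zero |
| | * to estimate how much VA is still sitting in the buffer, mirroring the logic in |
| | * reclaim_entries_from_buffer() below. |
| | */ |
| | static inline size_t |
| | vmdr_estimated_reclaimable_bytes(vm_deferred_reclamation_metadata_t metadata) |
| | { |
| | size_t put_in_buffer = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed); |
| | size_t reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed); |
| | /* reclaimed may transiently exceed put_in_buffer; treat that as nothing left */ |
| | return (reclaimed > put_in_buffer) ? 0 : (put_in_buffer - reclaimed); |
| | } |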
| 94 | static void process_async_reclamation_list(void); |
| 95 | |
| 96 | extern void *proc_find(int pid); |
| 97 | extern task_t proc_task(proc_t); |
| 98 | |
| 99 | #pragma mark Globals |
| 100 | static KALLOC_TYPE_DEFINE(vm_reclaim_metadata_zone, struct vm_deferred_reclamation_metadata_s, KT_DEFAULT); |
| 101 | static LCK_GRP_DECLARE(vm_reclaim_lock_grp, "vm_reclaim" ); |
| 102 | static os_log_t vm_reclaim_log_handle; |
| 103 | |
| 104 | /* |
| 105 | * The ringbuffer must contain at least 2 entries to distinguish between empty |
| 106 | * (head == tail) and full (head == tail + 1). |
| 107 | */ |
| 108 | #define BUFFER_MIN_ENTRY_COUNT 2 |
| 109 | |
| 110 | /* |
| 111 | * We maintain two lists of reclamation buffers. |
| 112 | * The reclamation_buffers list contains every buffer in the system. |
| 113 | * The async_reclamation_buffers list contains buffers that are ripe for reclamation. |
| 114 | * Each list has its own lock. |
| 115 | */ |
| 116 | static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) reclamation_buffers = TAILQ_HEAD_INITIALIZER(reclamation_buffers); |
| 117 | |
| 118 | static TAILQ_HEAD(, vm_deferred_reclamation_metadata_s) async_reclamation_buffers = TAILQ_HEAD_INITIALIZER(async_reclamation_buffers); |
| 119 | /* |
| 120 | * The reclamation_buffers_lock protects the reclamation_buffers list. |
| 121 | * It must be held when iterating over the list or manipulating the list. |
| 122 | * It should be dropped when acting on a specific metadata entry after acquiring the vdrm_lock. |
| 123 | */ |
| 124 | LCK_MTX_DECLARE(reclamation_buffers_lock, &vm_reclaim_lock_grp); |
| 125 | LCK_MTX_DECLARE(async_reclamation_buffers_lock, &vm_reclaim_lock_grp); |
| 126 | static size_t reclamation_buffers_length; |
| 127 | static uint64_t reclamation_counter; // generation count for global reclaims |
| 128 | |
| 129 | static SECURITY_READ_ONLY_LATE(thread_t) vm_reclaim_thread; |
| 130 | static void reclaim_thread(void *param __unused, wait_result_t wr __unused); |
| 131 | |
| 132 | #pragma mark Implementation |
| 133 | |
| 134 | /* |
| 135 | * The current design is not tolerant to faulting on the buffer under the |
| 136 | * metadata lock. Wire the buffer as a stop-gap solution for now; in the |
| 137 | * future, the synchronization scheme should be revised to allow the buffer |
| 138 | * to be pageable (rdar://112039103). |
| 139 | */ |
| 140 | |
| 141 | static kern_return_t |
| 142 | vmdr_metadata_wire(vm_deferred_reclamation_metadata_t metadata) |
| 143 | { |
| 144 | kern_return_t kr; |
| 145 | vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer - |
| 146 | offsetof(struct mach_vm_reclaim_buffer_v1_s, entries)); |
| 147 | vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer + |
| 148 | metadata->vdrm_buffer_size); |
| 149 | kr = vm_map_wire_kernel(map: metadata->vdrm_map, start: buffer_start, end: buffer_end, |
| 150 | VM_PROT_NONE, VM_KERN_MEMORY_OSFMK, TRUE); |
| 151 | if (kr != KERN_SUCCESS) { |
| 152 | os_log_error(vm_reclaim_log_handle, |
| 153 | "vm_reclaim: failed to wire userspace reclaim buffer for pid %d (%d)" , |
| 154 | task_pid(metadata->vdrm_task), kr); |
| 155 | } |
| 156 | return kr; |
| 157 | } |
| 158 | |
| 159 | static kern_return_t |
| 160 | vmdr_metadata_unwire(vm_deferred_reclamation_metadata_t metadata) |
| 161 | { |
| 162 | kern_return_t kr; |
| 163 | vm_map_offset_t buffer_start = (metadata->vdrm_reclaim_buffer - |
| 164 | offsetof(struct mach_vm_reclaim_buffer_v1_s, entries)); |
| 165 | vm_map_offset_t buffer_end = (metadata->vdrm_reclaim_buffer + |
| 166 | metadata->vdrm_buffer_size); |
| 167 | kr = vm_map_unwire(map: metadata->vdrm_map, start: buffer_start, end: buffer_end, TRUE); |
| 168 | if (kr != KERN_SUCCESS) { |
| 169 | os_log_error(vm_reclaim_log_handle, |
| 170 | "vm_reclaim: unable to un-wire buffer %p (%llu) for pid %d (%d)" , |
| 171 | (void *)buffer_start, (buffer_end - buffer_start), |
| 172 | task_pid(metadata->vdrm_task), kr); |
| 173 | } |
| 174 | return kr; |
| 175 | } |
| 176 | |
| 177 | static vm_deferred_reclamation_metadata_t |
| 178 | vmdr_metadata_alloc( |
| 179 | task_t task, |
| 180 | user_addr_t buffer, |
| 181 | mach_vm_size_t size, |
| 182 | user_addr_t indices) |
| 183 | { |
| 184 | vm_deferred_reclamation_metadata_t metadata; |
| 185 | vm_map_t map = task->map; |
| 186 | |
| 187 | assert(!map->is_nested_map); |
| 188 | |
| 189 | metadata = zalloc_flags(vm_reclaim_metadata_zone, Z_WAITOK | Z_ZERO); |
| 190 | lck_mtx_init(lck: &metadata->vdrm_lock, grp: &vm_reclaim_lock_grp, LCK_ATTR_NULL); |
| 191 | metadata->vdrm_task = task; |
| 192 | metadata->vdrm_map = map; |
| 193 | metadata->vdrm_reclaim_buffer = buffer; |
| 194 | metadata->vdrm_buffer_size = size; |
| 195 | metadata->vdrm_reclaim_indices = indices; |
| 196 | |
| 197 | /* |
| 198 | * We do not need to hold a lock on `task` because this is called |
| 199 | * either at fork() time or from the context of current_task(). |
| 200 | */ |
| 201 | vm_map_reference(map); |
| 202 | return metadata; |
| 203 | } |
| 204 | |
| 205 | static void |
| 206 | vmdr_metadata_free(vm_deferred_reclamation_metadata_t metadata) |
| 207 | { |
| 208 | vm_map_deallocate(map: metadata->vdrm_map); |
| 209 | lck_mtx_destroy(lck: &metadata->vdrm_lock, grp: &vm_reclaim_lock_grp); |
| 210 | zfree(vm_reclaim_metadata_zone, metadata); |
| 211 | } |
| 212 | |
| 213 | kern_return_t |
| 214 | vm_deferred_reclamation_buffer_init_internal( |
| 215 | task_t task, |
| 216 | mach_vm_offset_t address, |
| 217 | mach_vm_size_t size) |
| 218 | { |
| 219 | kern_return_t kr = KERN_FAILURE, tmp_kr; |
| 220 | vm_deferred_reclamation_metadata_t metadata = NULL; |
| 221 | bool success; |
| 222 | uint64_t head = 0, tail = 0, busy = 0; |
| 223 | |
| 224 | if (address == 0 || |
| 225 | size < (sizeof(struct mach_vm_reclaim_buffer_v1_s) + |
| 226 | BUFFER_MIN_ENTRY_COUNT * sizeof(mach_vm_reclaim_entry_v1_t)) || |
| 227 | !VM_MAP_PAGE_ALIGNED(address, VM_MAP_PAGE_MASK(task->map)) || |
| 228 | !VM_MAP_PAGE_ALIGNED((address + size), VM_MAP_PAGE_MASK(task->map))) { |
| 229 | return KERN_INVALID_ARGUMENT; |
| 230 | } |
| 231 | |
| 232 | /* vm_reclaim is disabled */ |
| 233 | if (vm_reclaim_max_threshold == 0) { |
| 234 | os_log_error(vm_reclaim_log_handle, |
| 235 | "vm_reclaim: failed to initialize vmdr buffer - reclaim is disabled (%llu)" , |
| 236 | vm_reclaim_max_threshold); |
| 237 | return KERN_NOT_SUPPORTED; |
| 238 | } |
| 239 | |
| 240 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_START, |
| 241 | task_pid(task), address, size); |
| 242 | |
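| | /* |
| | * Carve the user allocation into the entries ring and the indices block, |
| | * following the field offsets of struct mach_vm_reclaim_buffer_v1_s. |
| | */ |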
| 243 | user_addr_t buffer = address + \ |
| 244 | offsetof(struct mach_vm_reclaim_buffer_v1_s, entries); |
| 245 | mach_vm_size_t buffer_size = size - \ |
| 246 | offsetof(struct mach_vm_reclaim_buffer_v1_s, entries); |
| 247 | user_addr_t indices = address + \ |
| 248 | offsetof(struct mach_vm_reclaim_buffer_v1_s, indices); |
| 249 | |
| 250 | metadata = vmdr_metadata_alloc(task, buffer, size: buffer_size, indices); |
| 251 | |
| 252 | kr = vmdr_metadata_wire(metadata); |
| 253 | if (kr != KERN_SUCCESS) { |
| 254 | goto out; |
| 255 | } |
| 256 | |
| 257 | /* |
| 258 | * Validate the starting indices. |
| 259 | * |
| 260 | * NB: At this point it is impossible for another thread to hold a |
| 261 | * reference to this metadata. However, the reclaim_copyin_* helpers may call |
| 262 | * reclaim_kill_with_reason() on failure, which assumes the metadata lock is held (and drops it). |
| 263 | */ |
| 264 | lck_mtx_lock(lck: &metadata->vdrm_lock); |
| 265 | |
| 266 | success = reclaim_copyin_busy(metadata, busy: &busy); |
| 267 | if (!success) { |
| 268 | /* metadata lock has been dropped and exception delivered to task */ |
| 269 | kr = KERN_INVALID_ARGUMENT; |
| 270 | goto fail_wired; |
| 271 | } |
| 272 | success = reclaim_copyin_head(metadata, head: &head); |
| 273 | if (!success) { |
| 274 | /* metadata lock has been dropped and exception delivered to task */ |
| 275 | kr = KERN_INVALID_ARGUMENT; |
| 276 | goto fail_wired; |
| 277 | } |
| 278 | success = reclaim_copyin_tail(metadata, tail: &tail); |
| 279 | if (!success) { |
| 280 | /* metadata lock has been dropped and exception delivered to task */ |
| 281 | kr = KERN_INVALID_ARGUMENT; |
| 282 | goto fail_wired; |
| 283 | } |
| 284 | |
| 285 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 286 | |
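| | /* A freshly registered buffer must start with all indices at zero */ |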
| 287 | if (head != 0 || tail != 0 || busy != 0) { |
| 288 | kr = KERN_INVALID_ARGUMENT; |
| 289 | goto fail_wired; |
| 290 | } |
| 291 | |
| 292 | /* |
| 293 | * Publish the metadata to the task & global buffer list. This must be |
| 294 | * done under the task lock to synchronize with task termination - i.e. |
| 295 | * task_terminate_internal is guaranteed to see the published metadata and |
| 296 | * tear it down. |
| 297 | */ |
| 298 | lck_mtx_lock(lck: &reclamation_buffers_lock); |
| 299 | task_lock(task); |
| 300 | |
| 301 | if (!task_is_active(task) || task_is_halting(task)) { |
| 302 | os_log_error(vm_reclaim_log_handle, |
| 303 | "vm_reclaim: failed to initialize buffer on dying task (pid %d)" , task_pid(task)); |
| 304 | kr = KERN_TERMINATED; |
| 305 | goto fail_task; |
| 306 | } else if (task->deferred_reclamation_metadata != NULL) { |
| 307 | os_log_error(vm_reclaim_log_handle, |
| 308 | "vm_reclaim: tried to overwrite existing reclaim buffer for pid %d" , task_pid(task)); |
| 309 | kr = KERN_INVALID_ARGUMENT; |
| 310 | goto fail_task; |
| 311 | } |
| 312 | |
| 313 | TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list); |
| 314 | reclamation_buffers_length++; |
| 315 | |
| 316 | task->deferred_reclamation_metadata = metadata; |
| 317 | |
| 318 | task_unlock(task); |
| 319 | lck_mtx_unlock(lck: &reclamation_buffers_lock); |
| 320 | |
| 321 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END, |
| 322 | task_pid(task), KERN_SUCCESS); |
| 323 | return KERN_SUCCESS; |
| 324 | |
| 325 | fail_task: |
| 326 | task_unlock(task); |
| 327 | lck_mtx_unlock(lck: &reclamation_buffers_lock); |
| 328 | |
| 329 | fail_wired: |
| 330 | tmp_kr = vmdr_metadata_unwire(metadata); |
| 331 | assert3u(tmp_kr, ==, KERN_SUCCESS); |
| 332 | |
| 333 | out: |
| 334 | vmdr_metadata_free(metadata); |
| 335 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_INIT) | DBG_FUNC_END, |
| 336 | task_pid(task), kr); |
| 337 | return kr; |
| 338 | } |
| 339 | |
| 340 | void |
| 341 | vm_deferred_reclamation_buffer_uninstall(vm_deferred_reclamation_metadata_t metadata) |
| 342 | { |
| 343 | assert(metadata != NULL); |
| 344 | /* |
| 345 | * First remove the buffer from the global list so no one else can get access to it. |
| 346 | */ |
| 347 | lck_mtx_lock(lck: &reclamation_buffers_lock); |
| 348 | TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list); |
| 349 | reclamation_buffers_length--; |
| 350 | lck_mtx_unlock(lck: &reclamation_buffers_lock); |
| 351 | |
| 352 | /* |
| 353 | * Now remove it from the async list (if present) |
| 354 | */ |
| 355 | lck_mtx_lock(lck: &async_reclamation_buffers_lock); |
| 356 | if (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) { |
| 357 | TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list); |
| 358 | metadata->vdrm_async_list.tqe_next = NULL; |
| 359 | metadata->vdrm_async_list.tqe_prev = NULL; |
| 360 | } |
| 361 | lck_mtx_unlock(lck: &async_reclamation_buffers_lock); |
| 362 | |
| 363 | // A kernel thread may have grabbed the lock for this buffer before we had |
| 364 | // a chance to remove it from the queues. Take the metadata lock to ensure |
| 365 | // any such workers are finished operating on the buffer. |
| 366 | lck_mtx_lock(lck: &metadata->vdrm_lock); |
| 367 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 368 | |
| 369 | vmdr_metadata_unwire(metadata); |
| 370 | } |
| 371 | |
| 372 | void |
| 373 | vm_deferred_reclamation_buffer_deallocate(vm_deferred_reclamation_metadata_t metadata) |
| 374 | { |
| 375 | assert(metadata != NULL); |
| 376 | vmdr_metadata_free(metadata); |
| 377 | } |
| 378 | |
| 379 | static user_addr_t |
| 380 | get_head_ptr(user_addr_t indices) |
| 381 | { |
| 382 | return indices + offsetof(mach_vm_reclaim_indices_v1_t, head); |
| 383 | } |
| 384 | |
| 385 | static user_addr_t |
| 386 | get_tail_ptr(user_addr_t indices) |
| 387 | { |
| 388 | return indices + offsetof(mach_vm_reclaim_indices_v1_t, tail); |
| 389 | } |
| 390 | |
| 391 | static user_addr_t |
| 392 | get_busy_ptr(user_addr_t indices) |
| 393 | { |
| 394 | return indices + offsetof(mach_vm_reclaim_indices_v1_t, busy); |
| 395 | } |
| 396 | |
| 397 | static void |
| 398 | reclaim_kill_with_reason( |
| 399 | vm_deferred_reclamation_metadata_t metadata, |
| 400 | unsigned reason, |
| 401 | mach_exception_data_type_t subcode) |
| 402 | { |
| 403 | unsigned int guard_type = GUARD_TYPE_VIRT_MEMORY; |
| 404 | mach_exception_code_t code = 0; |
| 405 | task_t task = metadata->vdrm_task; |
| 406 | proc_t p = NULL; |
| 407 | boolean_t fatal = TRUE; |
| 408 | bool killing_self = false; |
| 409 | pid_t pid; |
| 410 | int err; |
| 411 | |
| 412 | if (panic_on_kill) { |
| 413 | panic("vm_reclaim: About to kill %p due to %d with subcode %lld\n" , task, reason, subcode); |
| 414 | } |
| 415 | |
| 416 | EXC_GUARD_ENCODE_TYPE(code, guard_type); |
| 417 | EXC_GUARD_ENCODE_FLAVOR(code, reason); |
| 418 | EXC_GUARD_ENCODE_TARGET(code, 0); |
| 419 | |
| 420 | assert(metadata->vdrm_task != kernel_task); |
| 421 | killing_self = task == current_task(); |
| 422 | if (!killing_self) { |
| 423 | /* |
| 424 | * Grab a reference on the task to make sure it doesn't go away |
| 425 | * after we drop the metadata lock |
| 426 | */ |
| 427 | task_reference(task); |
| 428 | } |
| 429 | /* |
| 430 | * We need to issue a wakeup in case this kill is coming from the async path. |
| 431 | * Once we drop the lock the caller can no longer do this wakeup, but |
| 432 | * if there's someone blocked on this reclaim they hold a map reference |
| 433 | * and thus need to be woken up so the map can be freed. |
| 434 | */ |
| 435 | thread_wakeup(&metadata->vdrm_async_list); |
| 436 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 437 | |
| 438 | if (reason == kGUARD_EXC_DEALLOC_GAP) { |
| 439 | task_lock(task); |
| 440 | fatal = (task->task_exc_guard & TASK_EXC_GUARD_VM_FATAL); |
| 441 | task_unlock(task); |
| 442 | } |
| 443 | |
| 444 | if (!fatal) { |
| 445 | os_log_info(vm_reclaim_log_handle, |
| 446 | "vm_reclaim: Skipping non fatal guard exception.\n" ); |
| 447 | goto out; |
| 448 | } |
| 449 | |
| 450 | pid = task_pid(task); |
| 451 | if (killing_self) { |
| 452 | p = get_bsdtask_info(task); |
| 453 | } else { |
| 454 | p = proc_find(pid); |
| 455 | if (p && proc_task(p) != task) { |
| 456 | os_log_error(vm_reclaim_log_handle, |
| 457 | "vm_reclaim: Unable to deliver guard exception because proc is gone & pid rolled over.\n" ); |
| 458 | goto out; |
| 459 | } |
| 460 | |
| 461 | task_deallocate(task); |
| 462 | task = NULL; |
| 463 | } |
| 464 | |
| 465 | if (!p) { |
| 466 | os_log_error(vm_reclaim_log_handle, |
| 467 | "vm_reclaim: Unable to deliver guard exception because task does not have a proc.\n" ); |
| 468 | goto out; |
| 469 | } |
| 470 | |
| 471 | err = exit_with_guard_exception(p, code, subcode); |
| 472 | if (err != 0) { |
| 473 | os_log_error(vm_reclaim_log_handle, "vm_reclaim: Unable to deliver guard exception to %p: %d\n" , p, err); |
| 474 | } |
| 475 | out: |
| 476 | if (!killing_self) { |
| 477 | if (p) { |
| 478 | proc_rele(p); |
| 479 | p = NULL; |
| 480 | } |
| 481 | if (task) { |
| 482 | task_deallocate(task); |
| 483 | task = NULL; |
| 484 | } |
| 485 | } |
| 486 | } |
| 487 | |
| 488 | static void |
| 489 | reclaim_handle_copyio_error(vm_deferred_reclamation_metadata_t metadata, int result) |
| 490 | { |
| 491 | reclaim_kill_with_reason(metadata, reason: kGUARD_EXC_RECLAIM_COPYIO_FAILURE, subcode: result); |
| 492 | } |
| 493 | |
| 494 | /* |
| 495 | * Helper functions to do copyio on the head, tail, and busy pointers. |
| 496 | * Note that the kernel will only write to the busy and head pointers. |
| 497 | * Userspace is not supposed to write to the head or busy pointers, but the kernel |
| 498 | * must be resilient to that kind of bug in userspace. |
| 499 | */ |
| 500 | |
| 501 | |
| 502 | static bool |
| 503 | reclaim_copyin_head(vm_deferred_reclamation_metadata_t metadata, uint64_t *head) |
| 504 | { |
| 505 | int result; |
| 506 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
| 507 | user_addr_t head_ptr = get_head_ptr(indices); |
| 508 | |
| 509 | result = copyin_atomic64(user_addr: head_ptr, kernel_addr: head); |
| 510 | |
| 511 | if (result != 0) { |
| 512 | os_log_error(vm_reclaim_log_handle, |
| 513 | "vm_reclaim: Unable to copy head ptr from 0x%llx: err=%d\n" , head_ptr, result); |
| 514 | reclaim_handle_copyio_error(metadata, result); |
| 515 | return false; |
| 516 | } |
| 517 | return true; |
| 518 | } |
| 519 | |
| 520 | static bool |
| 521 | reclaim_copyin_tail(vm_deferred_reclamation_metadata_t metadata, uint64_t *tail) |
| 522 | { |
| 523 | int result; |
| 524 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
| 525 | user_addr_t tail_ptr = get_tail_ptr(indices); |
| 526 | |
| 527 | result = copyin_atomic64(user_addr: tail_ptr, kernel_addr: tail); |
| 528 | |
| 529 | if (result != 0) { |
| 530 | os_log_error(vm_reclaim_log_handle, |
| 531 | "vm_reclaim: Unable to copy tail ptr from 0x%llx: err=%d\n" , tail_ptr, result); |
| 532 | reclaim_handle_copyio_error(metadata, result); |
| 533 | return false; |
| 534 | } |
| 535 | return true; |
| 536 | } |
| 537 | |
| 538 | static bool |
| 539 | reclaim_copyin_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t *busy) |
| 540 | { |
| 541 | int result; |
| 542 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
| 543 | user_addr_t busy_ptr = get_busy_ptr(indices); |
| 544 | |
| 545 | result = copyin_atomic64(user_addr: busy_ptr, kernel_addr: busy); |
| 546 | |
| 547 | if (result != 0) { |
| 548 | os_log_error(vm_reclaim_log_handle, |
| 549 | "vm_reclaim: Unable to copy busy ptr from 0x%llx: err=%d\n" , busy_ptr, result); |
| 550 | reclaim_handle_copyio_error(metadata, result); |
| 551 | return false; |
| 552 | } |
| 553 | return true; |
| 554 | } |
| 555 | |
| 556 | static bool |
| 557 | reclaim_copyout_busy(vm_deferred_reclamation_metadata_t metadata, uint64_t value) |
| 558 | { |
| 559 | int result; |
| 560 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
| 561 | user_addr_t busy_ptr = get_busy_ptr(indices); |
| 562 | |
| 563 | result = copyout_atomic64(u64: value, user_addr: busy_ptr); |
| 564 | |
| 565 | if (result != 0) { |
| 566 | os_log_error(vm_reclaim_log_handle, |
| 567 | "vm_reclaim: Unable to copy %llu to busy ptr at 0x%llx: err=%d\n" , value, busy_ptr, result); |
| 568 | reclaim_handle_copyio_error(metadata, result); |
| 569 | return false; |
| 570 | } |
| 571 | return true; |
| 572 | } |
| 573 | |
| 574 | static bool |
| 575 | reclaim_copyout_head(vm_deferred_reclamation_metadata_t metadata, uint64_t value) |
| 576 | { |
| 577 | int result; |
| 578 | user_addr_t indices = metadata->vdrm_reclaim_indices; |
| 579 | user_addr_t head_ptr = get_head_ptr(indices); |
| 580 | |
| 581 | result = copyout_atomic64(u64: value, user_addr: head_ptr); |
| 582 | |
| 583 | if (result != 0) { |
| 584 | os_log_error(vm_reclaim_log_handle, |
| 585 | "vm_reclaim: Unable to copy %llu to head ptr at 0x%llx: err=%d\n" , value, head_ptr, result); |
| 586 | reclaim_handle_copyio_error(metadata, result); |
| 587 | return false; |
| 588 | } |
| 589 | return true; |
| 590 | } |
| 591 | |
| 592 | /* |
| 593 | * Reclaim a chunk (kReclaimChunkSize entries) from the buffer. |
| 594 | * |
| 595 | * Writes the number of entries reclaimed to `num_reclaimed_out`. Note that |
| 596 | * there may be zero reclaimable entries in the chunk (they have all been |
| 597 | * re-used by userspace). |
| 598 | * |
| 599 | * Returns: |
| 600 | * - KERN_SUCCESS if a chunk was processed (possibly reclaiming zero entries) |
| 601 | * - KERN_NOT_FOUND if the buffer has been exhausted (head == tail) |
| 602 | * - KERN_FAILURE on failure to reclaim -- the metadata lock will be dropped before returning |
| 603 | */ |
| 604 | static kern_return_t |
| 605 | reclaim_chunk(vm_deferred_reclamation_metadata_t metadata, size_t *num_reclaimed_out) |
| 606 | { |
| 607 | assert(metadata != NULL); |
| 608 | LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED); |
| 609 | int result = 0; |
| 610 | size_t num_reclaimed = 0; |
| 611 | uint64_t head = 0, tail = 0, busy = 0, num_to_reclaim = 0, new_tail = 0, num_copied = 0, buffer_len = 0; |
| 612 | user_addr_t indices; |
| 613 | vm_map_t map = metadata->vdrm_map, old_map; |
| 614 | mach_vm_reclaim_entry_v1_t reclaim_entries[kReclaimChunkSize]; |
| 615 | bool success; |
| 616 | |
| 617 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_START, |
| 618 | task_pid(metadata->vdrm_task), kReclaimChunkSize); |
| 619 | |
| 620 | buffer_len = metadata->vdrm_buffer_size / sizeof(mach_vm_reclaim_entry_v1_t); |
| 621 | |
| 622 | memset(s: reclaim_entries, c: 0, n: sizeof(reclaim_entries)); |
| 623 | |
| 624 | indices = (user_addr_t) metadata->vdrm_reclaim_indices; |
| 625 | old_map = vm_map_switch(map); |
| 626 | |
| 627 | success = reclaim_copyin_busy(metadata, busy: &busy); |
| 628 | if (!success) { |
| 629 | goto fail; |
| 630 | } |
| 631 | success = reclaim_copyin_head(metadata, head: &head); |
| 632 | if (!success) { |
| 633 | goto fail; |
| 634 | } |
| 635 | success = reclaim_copyin_tail(metadata, tail: &tail); |
| 636 | if (!success) { |
| 637 | goto fail; |
| 638 | } |
| 639 | |
| 640 | if (busy != head) { |
| 641 | // Userspace overwrote one of the pointers |
| 642 | os_log_error(vm_reclaim_log_handle, |
| 643 | "vm_reclaim: Userspace modified head or busy pointer! head: %llu (0x%llx) != busy: %llu (0x%llx) | tail = %llu (0x%llx)\n" , |
| 644 | head, get_head_ptr(indices), busy, get_busy_ptr(indices), tail, get_tail_ptr(indices)); |
| 645 | reclaim_kill_with_reason(metadata, reason: kGUARD_EXC_RECLAIM_INDEX_FAILURE, subcode: busy); |
| 646 | goto fail; |
| 647 | } |
| 648 | |
| 649 | if (tail < head) { |
| 650 | // Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation |
| 651 | os_log_error(vm_reclaim_log_handle, |
| 652 | "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n" , |
| 653 | head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices)); |
| 654 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 655 | goto fail; |
| 656 | } |
| 657 | |
| 658 | num_to_reclaim = tail - head; |
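| | /* |
| | * Negotiate with userspace over how many entries we may take: publish |
| | * busy = head + num_to_reclaim, then re-read tail. If the re-read tail |
| | * has fallen below busy, userspace has taken back some of those entries |
| | * for re-use; shrink the claim and retry until both sides agree or |
| | * nothing is left to reclaim. |
| | */ |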
| 659 | while (true) { |
| 660 | num_to_reclaim = MIN(num_to_reclaim, kReclaimChunkSize); |
| 661 | if (num_to_reclaim == 0) { |
| 662 | break; |
| 663 | } |
| 664 | busy = head + num_to_reclaim; |
| 665 | success = reclaim_copyout_busy(metadata, value: busy); |
| 666 | if (!success) { |
| 667 | goto fail; |
| 668 | } |
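| | /* Make sure the busy update is visible before re-reading the tail */ |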
| 669 | os_atomic_thread_fence(seq_cst); |
| 670 | success = reclaim_copyin_tail(metadata, tail: &new_tail); |
| 671 | if (!success) { |
| 672 | goto fail; |
| 673 | } |
| 674 | |
| 675 | if (new_tail >= busy) { |
| 676 | /* Got num_to_reclaim entries */ |
| 677 | break; |
| 678 | } |
| 679 | tail = new_tail; |
| 680 | if (tail < head) { |
| 681 | // Userspace is likely in the middle of trying to re-use an entry, bail on this reclamation |
| 682 | os_log_error(vm_reclaim_log_handle, |
| 683 | "vm_reclaim: Userspace modified head or tail pointer! head: %llu (0x%llx) > tail: %llu (0x%llx) | busy = %llu (0x%llx)\n" , |
| 684 | head, get_head_ptr(indices), tail, get_tail_ptr(indices), busy, get_busy_ptr(indices)); |
| 685 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 686 | goto fail; |
| 687 | } |
| 688 | /* Can't reclaim these entries. Try again */ |
| 689 | num_to_reclaim = tail - head; |
| 690 | if (num_to_reclaim == 0) { |
| 691 | /* Nothing left to reclaim. Reset busy to head. */ |
| 692 | success = reclaim_copyout_busy(metadata, value: head); |
| 693 | if (!success) { |
| 694 | goto fail; |
| 695 | } |
| 696 | break; |
| 697 | } |
| 698 | /* |
| 699 | * Note that num_to_reclaim must have gotten smaller since tail got smaller, |
| 700 | * so this is guaranteed to converge. |
| 701 | */ |
| 702 | } |
| 703 | |
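| | /* |
| | * Copy the claimed entries out of the ring. Each pass copies one |
| | * contiguous run; a range that wraps around the end of the buffer is |
| | * handled by a subsequent pass. |
| | */ |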
| 704 | while (num_copied < num_to_reclaim) { |
| 705 | uint64_t memcpy_start_idx = (head % buffer_len); |
| 706 | uint64_t memcpy_end_idx = memcpy_start_idx + num_to_reclaim - num_copied; |
| 707 | // Clamp the end idx to the buffer. We'll handle wrap-around in our next go around the loop. |
| 708 | memcpy_end_idx = MIN(memcpy_end_idx, buffer_len); |
| 709 | uint64_t num_to_copy = memcpy_end_idx - memcpy_start_idx; |
| 710 | |
| 711 | assert(num_to_copy + num_copied <= kReclaimChunkSize); |
| 712 | user_addr_t src_ptr = metadata->vdrm_reclaim_buffer + memcpy_start_idx * sizeof(mach_vm_reclaim_entry_v1_t); |
| 713 | mach_vm_reclaim_entry_v1_t *dst_ptr = reclaim_entries + num_copied; |
| 714 | |
| 715 | result = copyin(src_ptr, dst_ptr, num_to_copy * sizeof(mach_vm_reclaim_entry_v1_t)); |
| 716 | |
| 717 | if (result != 0) { |
| 718 | os_log_error(vm_reclaim_log_handle, |
| 719 | "vm_reclaim: Unable to copyin %llu entries in reclaim buffer at 0x%llx to 0x%llx: err=%d\n" , |
| 720 | num_to_copy, src_ptr, (uint64_t) dst_ptr, result); |
| 721 | reclaim_handle_copyio_error(metadata, result); |
| 722 | goto fail; |
| 723 | } |
| 724 | |
| 725 | num_copied += num_to_copy; |
| 726 | head += num_to_copy; |
| 727 | } |
| 728 | |
| 729 | for (size_t i = 0; i < num_to_reclaim; i++) { |
| 730 | mach_vm_reclaim_entry_v1_t *entry = &reclaim_entries[i]; |
| 731 | KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_START, |
| 732 | task_pid(metadata->vdrm_task), entry->address, entry->size, |
| 733 | entry->behavior); |
| 734 | DTRACE_VM4(vm_reclaim_chunk, |
| 735 | int, task_pid(metadata->vdrm_task), |
| 736 | mach_vm_address_t, entry->address, |
| 737 | size_t, entry->size, |
| 738 | mach_vm_reclaim_behavior_v1_t, entry->behavior); |
| 739 | if (entry->address != 0 && entry->size != 0) { |
| 740 | kern_return_t kr; |
| 741 | switch (entry->behavior) { |
| 742 | case MACH_VM_RECLAIM_DEALLOCATE: |
| 743 | kr = vm_map_remove_guard(map, |
| 744 | vm_map_trunc_page(entry->address, |
| 745 | VM_MAP_PAGE_MASK(map)), |
| 746 | vm_map_round_page(entry->address + entry->size, |
| 747 | VM_MAP_PAGE_MASK(map)), |
| 748 | flags: VM_MAP_REMOVE_GAPS_FAIL, |
| 749 | KMEM_GUARD_NONE).kmr_return; |
| 750 | if (kr == KERN_INVALID_VALUE) { |
| 751 | reclaim_kill_with_reason(metadata, reason: kGUARD_EXC_DEALLOC_GAP, subcode: entry->address); |
| 752 | goto fail; |
| 753 | } else if (kr != KERN_SUCCESS) { |
| 754 | os_log_error(vm_reclaim_log_handle, |
| 755 | "vm_reclaim: Unable to deallocate 0x%llx (%u) from 0x%llx err=%d\n" , |
| 756 | entry->address, entry->size, (uint64_t) map, kr); |
| 757 | reclaim_kill_with_reason(metadata, reason: kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, subcode: kr); |
| 758 | goto fail; |
| 759 | } |
| 760 | break; |
| 761 | case MACH_VM_RECLAIM_REUSABLE: |
| 762 | kr = vm_map_behavior_set(map, |
| 763 | vm_map_trunc_page(entry->address, VM_MAP_PAGE_MASK(map)), |
| 764 | vm_map_round_page(entry->address + entry->size, VM_MAP_PAGE_MASK(map)), |
| 765 | VM_BEHAVIOR_REUSABLE); |
| 766 | if (kr != KERN_SUCCESS) { |
| 767 | os_log_error(vm_reclaim_log_handle, |
| 768 | "vm_reclaim: unable to free(reusable) 0x%llx (%u) for pid %d err=%d\n" , |
| 769 | entry->address, entry->size, task_pid(metadata->vdrm_task), kr); |
| 770 | } |
| 771 | break; |
| 772 | default: |
| 773 | os_log_error(vm_reclaim_log_handle, |
| 774 | "vm_reclaim: attempted to reclaim entry with unsupported behavior %uh" , |
| 775 | entry->behavior); |
| 776 | reclaim_kill_with_reason(metadata, reason: kGUARD_EXC_RECLAIM_DEALLOCATE_FAILURE, subcode: kr); |
| 777 | goto fail; |
| 778 | } |
| 779 | num_reclaimed++; |
| 780 | os_atomic_add(&metadata->vdrm_num_bytes_reclaimed, entry->size, relaxed); |
| 781 | KDBG_FILTERED(VM_RECLAIM_CODE(VM_RECLAIM_ENTRY) | DBG_FUNC_END, |
| 782 | task_pid(metadata->vdrm_task), entry->address); |
| 783 | } |
| 784 | } |
| 785 | |
| 786 | success = reclaim_copyout_head(metadata, value: head); |
| 787 | if (!success) { |
| 788 | goto fail; |
| 789 | } |
| 790 | |
| 791 | vm_map_switch(map: old_map); |
| 792 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END, |
| 793 | task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, true); |
| 794 | *num_reclaimed_out = num_reclaimed; |
| 795 | if (num_to_reclaim == 0) { |
| 796 | // We have exhausted the reclaimable portion of the buffer |
| 797 | return KERN_NOT_FOUND; |
| 798 | } |
| 799 | return KERN_SUCCESS; |
| 800 | fail: |
| 801 | vm_map_switch(map: old_map); |
| 802 | *num_reclaimed_out = num_reclaimed; |
| 803 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_CHUNK) | DBG_FUNC_END, |
| 804 | task_pid(metadata->vdrm_task), num_to_reclaim, num_reclaimed, false); |
| 805 | return KERN_FAILURE; |
| 806 | } |
| 807 | |
| 808 | /* |
| 809 | * Attempts to reclaim until the buffer's estimated number of available bytes |
| 810 | * is <= num_bytes_reclaimable_threshold. The metadata buffer lock should be |
| 811 | * held by the caller. |
| 812 | * |
| 813 | * Writes the number of entries reclaimed to `num_reclaimed_out`. |
| 814 | */ |
| 815 | static kern_return_t |
| 816 | reclaim_entries_from_buffer(vm_deferred_reclamation_metadata_t metadata, |
| 817 | size_t num_bytes_reclaimable_threshold, size_t *num_reclaimed_out) |
| 818 | { |
| 819 | assert(metadata != NULL); |
| 820 | assert(num_reclaimed_out != NULL); |
| 821 | LCK_MTX_ASSERT(&metadata->vdrm_lock, LCK_MTX_ASSERT_OWNED); |
| 822 | if (!task_is_active(task: metadata->vdrm_task)) { |
| 823 | /* |
| 824 | * If the task is exiting, the reclaim below will likely fail and fall through |
| 825 | * to the (slower) error path. |
| 826 | * So as an optimization, we bail out early here. |
| 827 | */ |
| 828 | *num_reclaimed_out = 0; |
| | return KERN_SUCCESS; |
| 829 | } |
| 830 | |
| 831 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_START, task_pid(metadata->vdrm_task)); |
| 832 | |
| 833 | size_t num_entries_reclaimed = 0, num_bytes_reclaimed, estimated_reclaimable_bytes, reclaimable_bytes; |
| 834 | while (true) { |
| 835 | kern_return_t kr; |
| 836 | size_t curr_entries_reclaimed = 0; |
| 837 | num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed); |
| 838 | reclaimable_bytes = os_atomic_load(&metadata->vdrm_num_bytes_put_in_buffer, relaxed); |
| 839 | if (num_bytes_reclaimed > reclaimable_bytes) { |
| 840 | estimated_reclaimable_bytes = 0; |
| 841 | } else { |
| 842 | estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed; |
| 843 | } |
| 844 | if (reclaimable_bytes <= num_bytes_reclaimable_threshold) { |
| 845 | break; |
| 846 | } |
| 847 | kr = reclaim_chunk(metadata, num_reclaimed_out: &curr_entries_reclaimed); |
| 848 | if (kr == KERN_NOT_FOUND) { |
| 849 | break; |
| 850 | } else if (kr != KERN_SUCCESS) { |
| 851 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END, |
| 852 | task_pid(metadata->vdrm_task), num_entries_reclaimed, |
| 853 | estimated_reclaimable_bytes, kr); |
| 854 | *num_reclaimed_out = num_entries_reclaimed; |
| 855 | return kr; |
| 856 | } |
| 857 | num_entries_reclaimed += curr_entries_reclaimed; |
| 858 | } |
| 859 | |
| 860 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ENTRIES) | DBG_FUNC_END, |
| 861 | task_pid(metadata->vdrm_task), num_entries_reclaimed, |
| 862 | estimated_reclaimable_bytes, KERN_SUCCESS); |
| 863 | *num_reclaimed_out = num_entries_reclaimed; |
| 864 | return KERN_SUCCESS; |
| 865 | } |
| 866 | |
| 867 | /* |
| 868 | * Get the reclamation metadata buffer for the given task. |
| 869 | * If the buffer exists it is returned locked. |
| 870 | */ |
| 871 | static vm_deferred_reclamation_metadata_t |
| 872 | get_task_reclaim_metadata(task_t task) |
| 873 | { |
| 874 | assert(task != NULL); |
| 875 | vm_deferred_reclamation_metadata_t metadata = NULL; |
| 876 | task_lock(task); |
| 877 | metadata = task->deferred_reclamation_metadata; |
| 878 | if (metadata != NULL) { |
| 879 | lck_mtx_lock(lck: &metadata->vdrm_lock); |
| 880 | } |
| 881 | task_unlock(task); |
| 882 | return metadata; |
| 883 | } |
| 884 | |
| 885 | kern_return_t |
| 886 | vm_deferred_reclamation_buffer_synchronize_internal(task_t task, size_t num_entries_to_reclaim) |
| 887 | { |
| 888 | kern_return_t kr; |
| 889 | vm_deferred_reclamation_metadata_t metadata = NULL; |
| 890 | size_t total_reclaimed = 0; |
| 891 | |
| 892 | if (!task_is_active(task)) { |
| 893 | return KERN_FAILURE; |
| 894 | } |
| 895 | |
| 896 | metadata = get_task_reclaim_metadata(task); |
| 897 | if (metadata == NULL) { |
| 898 | return KERN_INVALID_ARGUMENT; |
| 899 | } |
| 900 | |
| 901 | while (total_reclaimed < num_entries_to_reclaim) { |
| 902 | size_t num_reclaimed; |
| 903 | kr = reclaim_chunk(metadata, num_reclaimed_out: &num_reclaimed); |
| 904 | if (kr == KERN_NOT_FOUND) { |
| 905 | /* buffer has been fully reclaimed from */ |
| 906 | break; |
| 907 | } else if (kr != KERN_SUCCESS) { |
| 908 | /* Lock has already been released and task is being killed. */ |
| 909 | return kr; |
| 910 | } |
| 911 | |
| 912 | total_reclaimed += num_reclaimed; |
| 913 | } |
| 914 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 915 | |
| 916 | return KERN_SUCCESS; |
| 917 | } |
| 918 | |
| 919 | kern_return_t |
| 920 | vm_deferred_reclamation_buffer_update_reclaimable_bytes_internal(task_t task, size_t reclaimable_bytes) |
| 921 | { |
| 922 | vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata; |
| 923 | size_t num_bytes_reclaimed, estimated_reclaimable_bytes, num_bytes_in_buffer, num_reclaimed = 0; |
| 924 | bool success; |
| 925 | kern_return_t kr = KERN_SUCCESS; |
| 926 | if (metadata == NULL) { |
| 927 | return KERN_INVALID_ARGUMENT; |
| 928 | } |
| 929 | |
| 930 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_START, |
| 931 | task_pid(task), reclaimable_bytes); |
| 932 | |
| 933 | /* |
| 934 | * The client is allowed to make this call in parallel from multiple threads. |
| 935 | * Ensure we only ever increase the value of vdrm_num_bytes_put_in_buffer. |
| 936 | * If the client's value is smaller than what we've stored, another thread |
| 937 | * raced ahead of them and we've already acted on that accounting so this |
| 938 | * call should be a no-op. |
| 939 | */ |
| 940 | success = os_atomic_rmw_loop(&metadata->vdrm_num_bytes_put_in_buffer, num_bytes_in_buffer, |
| 941 | reclaimable_bytes, acquire, |
| 942 | { |
| 943 | if (num_bytes_in_buffer > reclaimable_bytes) { |
| 944 | os_atomic_rmw_loop_give_up(break); |
| 945 | } |
| 946 | }); |
| 947 | if (!success) { |
| 948 | /* Stale value. Nothing new to reclaim */ |
| 949 | goto done; |
| 950 | } |
| 951 | num_bytes_reclaimed = os_atomic_load(&metadata->vdrm_num_bytes_reclaimed, relaxed); |
| 952 | |
| 953 | if (reclaimable_bytes > num_bytes_reclaimed) { |
| 954 | estimated_reclaimable_bytes = reclaimable_bytes - num_bytes_reclaimed; |
| 955 | if (estimated_reclaimable_bytes > vm_reclaim_max_threshold) { |
| 956 | lck_mtx_lock(lck: &metadata->vdrm_lock); |
| 957 | kr = reclaim_entries_from_buffer(metadata, |
| 958 | num_bytes_reclaimable_threshold: vm_reclaim_max_threshold, num_reclaimed_out: &num_reclaimed); |
| 959 | if (kr != KERN_SUCCESS) { |
| 960 | /* Lock has already been released & task is in the process of getting killed. */ |
| 961 | goto done; |
| 962 | } |
| 963 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 964 | } |
| 965 | } |
| 966 | |
| 967 | done: |
| 968 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_UPDATE_ACCOUNTING) | DBG_FUNC_END, |
| 969 | task_pid(task), reclaimable_bytes, num_bytes_reclaimed, num_reclaimed); |
| 970 | |
| 971 | return kr; |
| 972 | } |
| 973 | |
| 974 | static inline size_t |
| 975 | pick_reclaim_threshold(vm_deferred_reclamation_action_t action) |
| 976 | { |
| 977 | switch (action) { |
| 978 | case RECLAIM_FULL: |
| 979 | return 0; |
| 980 | case RECLAIM_TRIM: |
| 981 | return vm_reclaim_max_threshold / vm_reclaim_trim_divisor; |
| 982 | case RECLAIM_ASYNC: |
| 983 | return 0; |
| 984 | } |
| 985 | } |
| 986 | |
| 987 | void |
| 988 | vm_deferred_reclamation_reclaim_memory(vm_deferred_reclamation_action_t action) |
| 989 | { |
| 990 | kern_return_t kr; |
| 991 | size_t num_reclaimed; |
| 992 | |
| 993 | if (action == RECLAIM_ASYNC) { |
| 994 | lck_mtx_lock(lck: &async_reclamation_buffers_lock); |
| 995 | |
| 996 | process_async_reclamation_list(); |
| 997 | lck_mtx_unlock(lck: &async_reclamation_buffers_lock); |
| 998 | } else { |
| 999 | size_t reclaim_threshold = pick_reclaim_threshold(action); |
| 1000 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_START, |
| 1001 | action, reclaim_threshold); |
| 1002 | lck_mtx_lock(lck: &reclamation_buffers_lock); |
| 1003 | reclamation_counter++; |
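| | /* |
| | * Walk the global list round-robin: each buffer visited in this pass is |
| | * stamped with the new generation and rotated to the tail, so encountering |
| | * an already-stamped buffer at the head means a full lap has been made. |
| | */ |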
| 1004 | while (true) { |
| 1005 | vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&reclamation_buffers); |
| 1006 | if (metadata == NULL) { |
| 1007 | break; |
| 1008 | } |
| 1009 | lck_mtx_lock(lck: &metadata->vdrm_lock); |
| 1010 | if (metadata->vdrm_reclaimed_at >= reclamation_counter) { |
| 1011 | // We've already seen this one. We're done |
| 1012 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 1013 | break; |
| 1014 | } |
| 1015 | metadata->vdrm_reclaimed_at = reclamation_counter; |
| 1016 | |
| 1017 | TAILQ_REMOVE(&reclamation_buffers, metadata, vdrm_list); |
| 1018 | TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list); |
| 1019 | lck_mtx_unlock(lck: &reclamation_buffers_lock); |
| 1020 | |
| 1021 | kr = reclaim_entries_from_buffer(metadata, |
| 1022 | num_bytes_reclaimable_threshold: reclaim_threshold, num_reclaimed_out: &num_reclaimed); |
| 1023 | if (kr == KERN_SUCCESS) { |
| 1024 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 1025 | } |
| 1026 | |
| 1027 | lck_mtx_lock(lck: &reclamation_buffers_lock); |
| 1028 | } |
| 1029 | lck_mtx_unlock(lck: &reclamation_buffers_lock); |
| 1030 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ALL_MEMORY) | DBG_FUNC_END, |
| 1031 | reclamation_counter); |
| 1032 | } |
| 1033 | } |
| 1034 | |
| 1035 | void |
| 1036 | vm_deferred_reclamation_reclaim_all_memory(void) |
| 1037 | { |
| 1038 | vm_deferred_reclamation_reclaim_memory(action: RECLAIM_FULL); |
| 1039 | } |
| 1040 | |
| 1041 | bool |
| 1042 | vm_deferred_reclamation_reclaim_from_task_async(task_t task) |
| 1043 | { |
| 1044 | bool queued = false; |
| 1045 | vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata; |
| 1046 | |
| 1047 | if (metadata != NULL) { |
| 1048 | lck_mtx_lock(lck: &async_reclamation_buffers_lock); |
| 1049 | if (metadata->vdrm_async_list.tqe_next != NULL || |
| 1050 | metadata->vdrm_async_list.tqe_prev != NULL) { |
| 1051 | // if the buffer is already on the async list, remove it so it is re-queued at the tail |
| 1052 | TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list); |
| 1053 | } |
| 1054 | TAILQ_INSERT_TAIL(&async_reclamation_buffers, metadata, vdrm_async_list); |
| 1055 | lck_mtx_unlock(lck: &async_reclamation_buffers_lock); |
| 1056 | queued = true; |
| 1057 | thread_wakeup(&vm_reclaim_thread); |
| 1058 | } |
| 1059 | |
| 1060 | return queued; |
| 1061 | } |
| 1062 | |
| 1063 | bool |
| 1064 | vm_deferred_reclamation_reclaim_from_task_sync(task_t task, size_t max_entries_to_reclaim) |
| 1065 | { |
| 1066 | kern_return_t kr; |
| 1067 | size_t num_reclaimed = 0; |
| 1068 | vm_deferred_reclamation_metadata_t metadata = task->deferred_reclamation_metadata; |
| 1069 | |
| 1070 | if (!task_is_active(task)) { |
| 1071 | return false; |
| 1072 | } |
| 1073 | |
| 1074 | if (metadata != NULL) { |
| 1075 | lck_mtx_lock(lck: &metadata->vdrm_lock); |
| 1076 | while (num_reclaimed < max_entries_to_reclaim) { |
| 1077 | size_t num_reclaimed_now; |
| 1078 | kr = reclaim_chunk(metadata, num_reclaimed_out: &num_reclaimed_now); |
| 1079 | if (kr == KERN_NOT_FOUND) { |
| 1080 | // Nothing left to reclaim |
| 1081 | break; |
| 1082 | } else if (kr != KERN_SUCCESS) { |
| 1083 | /* Lock has already been released and task is being killed. */ |
| 1084 | return false; |
| 1085 | } |
| 1086 | num_reclaimed += num_reclaimed_now; |
| 1087 | } |
| 1088 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 1089 | } |
| 1090 | |
| 1091 | return num_reclaimed > 0; |
| 1092 | } |
| 1093 | |
| 1094 | vm_deferred_reclamation_metadata_t |
| 1095 | vm_deferred_reclamation_buffer_fork(task_t task, vm_deferred_reclamation_metadata_t parent) |
| 1096 | { |
| 1097 | kern_return_t kr; |
| 1098 | vm_deferred_reclamation_metadata_t metadata = NULL; |
| 1099 | |
| 1100 | LCK_MTX_ASSERT(&parent->vdrm_lock, LCK_MTX_ASSERT_OWNED); |
| 1101 | |
| 1102 | assert(task->deferred_reclamation_metadata == NULL); |
| 1103 | metadata = vmdr_metadata_alloc(task, buffer: parent->vdrm_reclaim_buffer, |
| 1104 | size: parent->vdrm_buffer_size, indices: parent->vdrm_reclaim_indices); |
| 1105 | lck_mtx_unlock(lck: &parent->vdrm_lock); |
| 1106 | |
| 1107 | kr = vmdr_metadata_wire(metadata); |
| 1108 | if (kr != KERN_SUCCESS) { |
| 1109 | vmdr_metadata_free(metadata); |
| 1110 | return NULL; |
| 1111 | } |
| 1112 | |
| 1113 | lck_mtx_lock(lck: &reclamation_buffers_lock); |
| 1114 | TAILQ_INSERT_TAIL(&reclamation_buffers, metadata, vdrm_list); |
| 1115 | reclamation_buffers_length++; |
| 1116 | lck_mtx_unlock(lck: &reclamation_buffers_lock); |
| 1117 | |
| 1118 | return metadata; |
| 1119 | } |
| 1120 | |
| 1121 | void |
| 1122 | vm_deferred_reclamation_buffer_lock(vm_deferred_reclamation_metadata_t metadata) |
| 1123 | { |
| 1124 | lck_mtx_lock(lck: &metadata->vdrm_lock); |
| 1125 | } |
| 1126 | |
| 1127 | void |
| 1128 | vm_deferred_reclamation_buffer_unlock(vm_deferred_reclamation_metadata_t metadata) |
| 1129 | { |
| 1130 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 1131 | } |
| 1132 | |
| 1133 | |
| 1134 | static void |
| 1135 | reclaim_thread_init(void) |
| 1136 | { |
| 1137 | #if CONFIG_THREAD_GROUPS |
| 1138 | thread_group_vm_add(); |
| 1139 | #endif |
| 1140 | thread_set_thread_name(th: current_thread(), name: "VM_reclaim" ); |
| 1141 | } |
| 1142 | |
| 1143 | |
| 1144 | static void |
| 1145 | process_async_reclamation_list(void) |
| 1146 | { |
| 1147 | kern_return_t kr; |
| 1148 | size_t total_entries_reclaimed = 0; |
| 1149 | size_t num_tasks_reclaimed = 0; |
| 1150 | LCK_MTX_ASSERT(&async_reclamation_buffers_lock, LCK_MTX_ASSERT_OWNED); |
| 1151 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_START); |
| 1152 | |
| 1153 | vm_deferred_reclamation_metadata_t metadata = TAILQ_FIRST(&async_reclamation_buffers); |
| 1154 | while (metadata != NULL) { |
| 1155 | size_t num_reclaimed; |
| 1156 | TAILQ_REMOVE(&async_reclamation_buffers, metadata, vdrm_async_list); |
| 1157 | metadata->vdrm_async_list.tqe_next = NULL; |
| 1158 | metadata->vdrm_async_list.tqe_prev = NULL; |
| 1159 | lck_mtx_lock(lck: &metadata->vdrm_lock); |
| 1160 | lck_mtx_unlock(lck: &async_reclamation_buffers_lock); |
| 1161 | |
| 1162 | // NB: Currently the async reclaim thread fully reclaims the buffer. |
| 1163 | kr = reclaim_entries_from_buffer(metadata, num_bytes_reclaimable_threshold: 0, num_reclaimed_out: &num_reclaimed); |
| 1164 | total_entries_reclaimed += num_reclaimed; |
| 1165 | if (kr != KERN_SUCCESS) { |
| 1166 | /* Lock has already been released & task is in the process of getting killed. */ |
| 1167 | goto next; |
| 1168 | } |
| 1169 | num_tasks_reclaimed++; |
| 1170 | /* Wakeup anyone waiting on this buffer getting processed */ |
| 1171 | thread_wakeup(&metadata->vdrm_async_list); |
| 1172 | assert(current_thread()->map == kernel_map); |
| 1173 | lck_mtx_unlock(lck: &metadata->vdrm_lock); |
| 1174 | |
| 1175 | next: |
| 1176 | lck_mtx_lock(lck: &async_reclamation_buffers_lock); |
| 1177 | metadata = TAILQ_FIRST(&async_reclamation_buffers); |
| 1178 | } |
| 1179 | KDBG(VM_RECLAIM_CODE(VM_RECLAIM_ASYNC_MEMORY) | DBG_FUNC_END, |
| 1180 | num_tasks_reclaimed, total_entries_reclaimed); |
| 1181 | } |
| 1182 | |
| 1183 | __enum_decl(reclaim_thread_state, uint32_t, { |
| 1184 | RECLAIM_THREAD_INIT = 0, |
| 1185 | RECLAIM_THREAD_CONT = 1, |
| 1186 | }); |
| 1187 | |
| 1188 | static void |
| 1189 | reclaim_thread_continue(void) |
| 1190 | { |
| 1191 | lck_mtx_lock(lck: &async_reclamation_buffers_lock); |
| 1192 | |
| 1193 | process_async_reclamation_list(); |
| 1194 | assert_wait(event: &vm_reclaim_thread, THREAD_UNINT); |
| 1195 | |
| 1196 | lck_mtx_unlock(lck: &async_reclamation_buffers_lock); |
| 1197 | } |
| 1198 | |
| 1199 | void |
| 1200 | reclaim_thread(void *param, wait_result_t wr __unused) |
| 1201 | { |
| 1202 | if (param == (void *) RECLAIM_THREAD_INIT) { |
| 1203 | reclaim_thread_init(); |
| 1204 | } else { |
| 1205 | assert(param == (void *) RECLAIM_THREAD_CONT); |
| 1206 | } |
| 1207 | |
| 1208 | reclaim_thread_continue(); |
| 1209 | |
| 1210 | (void) thread_block_parameter(continuation: reclaim_thread, parameter: (void*) RECLAIM_THREAD_CONT); |
| 1211 | } |
| 1212 | |
| 1213 | __startup_func |
| 1214 | static void |
| 1215 | vm_deferred_reclamation_init(void) |
| 1216 | { |
| 1217 | // Note: no-op pending rdar://27006343 (Custom kernel log handles) |
| 1218 | vm_reclaim_log_handle = os_log_create(subsystem: "com.apple.mach.vm" , category: "reclaim" ); |
| 1219 | |
| 1220 | (void)kernel_thread_start_priority(continuation: reclaim_thread, |
| 1221 | parameter: (void *)RECLAIM_THREAD_INIT, priority: kReclaimThreadPriority, |
| 1222 | new_thread: &vm_reclaim_thread); |
| 1223 | } |
| 1224 | |
| 1225 | STARTUP(EARLY_BOOT, STARTUP_RANK_MIDDLE, vm_deferred_reclamation_init); |
| 1226 | |
| 1227 | #if DEVELOPMENT || DEBUG |
| 1228 | |
| 1229 | bool |
| 1230 | vm_deferred_reclamation_block_until_pid_has_been_reclaimed(int pid) |
| 1231 | { |
| 1232 | vm_deferred_reclamation_metadata_t metadata = NULL; |
| 1233 | proc_t p = proc_find(pid); |
| 1234 | vm_map_t map = NULL; |
| 1235 | if (p == NULL) { |
| 1236 | return false; |
| 1237 | } |
| 1238 | task_t t = proc_task(p); |
| 1239 | if (t == NULL) { |
| 1240 | proc_rele(p); |
| 1241 | return false; |
| 1242 | } |
| 1243 | |
| 1244 | task_lock(t); |
| 1245 | if (t->map) { |
| 1246 | metadata = t->deferred_reclamation_metadata; |
| 1247 | if (metadata != NULL) { |
| 1248 | map = t->map; |
| 1249 | vm_map_reference(t->map); |
| 1250 | } |
| 1251 | } |
| 1252 | task_unlock(t); |
| 1253 | proc_rele(p); |
| 1254 | if (metadata == NULL) { |
| 1255 | return false; |
| 1256 | } |
| 1257 | |
| 1258 | lck_mtx_lock(&async_reclamation_buffers_lock); |
| 1259 | while (metadata->vdrm_async_list.tqe_next != NULL || metadata->vdrm_async_list.tqe_prev != NULL) { |
| 1260 | assert_wait(&metadata->vdrm_async_list, THREAD_UNINT); |
| 1261 | lck_mtx_unlock(&async_reclamation_buffers_lock); |
| 1262 | thread_block(THREAD_CONTINUE_NULL); |
| 1263 | lck_mtx_lock(&async_reclamation_buffers_lock); |
| 1264 | } |
| 1265 | |
| 1266 | /* |
| 1267 | * The async reclaim thread first removes the buffer from the list |
| 1268 | * and then reclaims it (while holding its lock). |
| 1269 | * So grab the metadata buffer's lock here to ensure the |
| 1270 | * reclaim is done. |
| 1271 | */ |
| 1272 | lck_mtx_lock(&metadata->vdrm_lock); |
| 1273 | lck_mtx_unlock(&metadata->vdrm_lock); |
| 1274 | lck_mtx_unlock(&async_reclamation_buffers_lock); |
| 1275 | |
| 1276 | vm_map_deallocate(map); |
| 1277 | return true; |
| 1278 | } |
| 1279 | |
| 1280 | #endif /* DEVELOPMENT || DEBUG */ |
| 1281 | |